corp-extractor 0.2.3__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.2.3.dist-info → corp_extractor-0.2.11.dist-info}/METADATA +108 -6
- corp_extractor-0.2.11.dist-info/RECORD +11 -0
- corp_extractor-0.2.11.dist-info/entry_points.txt +3 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +245 -0
- statement_extractor/extractor.py +77 -5
- statement_extractor/models.py +6 -0
- statement_extractor/predicate_comparer.py +23 -1
- statement_extractor/scoring.py +32 -10
- corp_extractor-0.2.3.dist-info/RECORD +0 -9
- {corp_extractor-0.2.3.dist-info → corp_extractor-0.2.11.dist-info}/WHEEL +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: corp-extractor
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.11
|
|
4
4
|
Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
|
|
6
6
|
Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
|
|
@@ -23,10 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
24
24
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
25
25
|
Requires-Python: >=3.10
|
|
26
|
+
Requires-Dist: click>=8.0.0
|
|
26
27
|
Requires-Dist: numpy>=1.24.0
|
|
27
28
|
Requires-Dist: pydantic>=2.0.0
|
|
28
29
|
Requires-Dist: torch>=2.0.0
|
|
29
|
-
Requires-Dist: transformers>=
|
|
30
|
+
Requires-Dist: transformers>=5.0.0rc3
|
|
30
31
|
Provides-Extra: all
|
|
31
32
|
Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
|
|
32
33
|
Provides-Extra: dev
|
|
@@ -57,22 +58,33 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
57
58
|
- **Contextualized Matching** *(v0.2.2)*: Compares full "Subject Predicate Object" against source text for better accuracy
|
|
58
59
|
- **Entity Type Merging** *(v0.2.3)*: Automatically merges UNKNOWN entity types with specific types during deduplication
|
|
59
60
|
- **Reversal Detection** *(v0.2.3)*: Detects and corrects subject-object reversals using embedding comparison
|
|
61
|
+
- **Command Line Interface** *(v0.2.4)*: Full-featured CLI for terminal usage
|
|
60
62
|
- **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
|
|
61
63
|
|
|
62
64
|
## Installation
|
|
63
65
|
|
|
64
66
|
```bash
|
|
65
67
|
# Recommended: include embedding support for smart deduplication
|
|
66
|
-
pip install corp-extractor[embeddings]
|
|
68
|
+
pip install "corp-extractor[embeddings]"
|
|
67
69
|
|
|
68
70
|
# Minimal installation (no embedding features)
|
|
69
71
|
pip install corp-extractor
|
|
70
72
|
```
|
|
71
73
|
|
|
72
|
-
**Note**:
|
|
74
|
+
**Note**: This package requires `transformers>=5.0.0` (pre-release) for T5-Gemma2 model support. Install with `--pre` flag if needed:
|
|
75
|
+
```bash
|
|
76
|
+
pip install --pre "corp-extractor[embeddings]"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**For GPU support**, install PyTorch with CUDA first:
|
|
73
80
|
```bash
|
|
74
81
|
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
|
75
|
-
pip install corp-extractor[embeddings]
|
|
82
|
+
pip install "corp-extractor[embeddings]"
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**For Apple Silicon (M1/M2/M3)**, MPS acceleration is automatically detected:
|
|
86
|
+
```bash
|
|
87
|
+
pip install "corp-extractor[embeddings]" # MPS used automatically
|
|
76
88
|
```
|
|
77
89
|
|
|
78
90
|
## Quick Start
|
|
@@ -91,6 +103,96 @@ for stmt in result:
|
|
|
91
103
|
print(f" Confidence: {stmt.confidence_score:.2f}") # NEW in v0.2.0
|
|
92
104
|
```
|
|
93
105
|
|
|
106
|
+
## Command Line Interface
|
|
107
|
+
|
|
108
|
+
The library includes a CLI for quick extraction from the terminal.
|
|
109
|
+
|
|
110
|
+
### Install Globally (Recommended)
|
|
111
|
+
|
|
112
|
+
For best results, install globally first:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Using uv (recommended)
|
|
116
|
+
uv tool install "corp-extractor[embeddings]"
|
|
117
|
+
|
|
118
|
+
# Using pipx
|
|
119
|
+
pipx install "corp-extractor[embeddings]"
|
|
120
|
+
|
|
121
|
+
# Using pip
|
|
122
|
+
pip install "corp-extractor[embeddings]"
|
|
123
|
+
|
|
124
|
+
# Then use anywhere
|
|
125
|
+
corp-extractor "Your text here"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Quick Run with uvx
|
|
129
|
+
|
|
130
|
+
Run directly without installing using [uv](https://docs.astral.sh/uv/):
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
uvx corp-extractor "Apple announced a new iPhone."
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
**Note**: First run downloads the model (~1.5GB) which may take a few minutes.
|
|
137
|
+
|
|
138
|
+
### Usage Examples
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Extract from text argument
|
|
142
|
+
corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
|
|
143
|
+
|
|
144
|
+
# Extract from file
|
|
145
|
+
corp-extractor -f article.txt
|
|
146
|
+
|
|
147
|
+
# Pipe from stdin
|
|
148
|
+
cat article.txt | corp-extractor -
|
|
149
|
+
|
|
150
|
+
# Output as JSON
|
|
151
|
+
corp-extractor "Tim Cook is CEO of Apple." --json
|
|
152
|
+
|
|
153
|
+
# Output as XML
|
|
154
|
+
corp-extractor -f article.txt --xml
|
|
155
|
+
|
|
156
|
+
# Verbose output with confidence scores
|
|
157
|
+
corp-extractor -f article.txt --verbose
|
|
158
|
+
|
|
159
|
+
# Use more beams for better quality
|
|
160
|
+
corp-extractor -f article.txt --beams 8
|
|
161
|
+
|
|
162
|
+
# Use custom predicate taxonomy
|
|
163
|
+
corp-extractor -f article.txt --taxonomy predicates.txt
|
|
164
|
+
|
|
165
|
+
# Use GPU explicitly
|
|
166
|
+
corp-extractor -f article.txt --device cuda
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### CLI Options
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
Usage: corp-extractor [OPTIONS] [TEXT]
|
|
173
|
+
|
|
174
|
+
Options:
|
|
175
|
+
-f, --file PATH Read input from file
|
|
176
|
+
-o, --output [table|json|xml] Output format (default: table)
|
|
177
|
+
--json Output as JSON (shortcut)
|
|
178
|
+
--xml Output as XML (shortcut)
|
|
179
|
+
-b, --beams INTEGER Number of beams (default: 4)
|
|
180
|
+
--diversity FLOAT Diversity penalty (default: 1.0)
|
|
181
|
+
--max-tokens INTEGER Max tokens to generate (default: 2048)
|
|
182
|
+
--no-dedup Disable deduplication
|
|
183
|
+
--no-embeddings Disable embedding-based dedup (faster)
|
|
184
|
+
--no-merge Disable beam merging
|
|
185
|
+
--dedup-threshold FLOAT Deduplication threshold (default: 0.65)
|
|
186
|
+
--min-confidence FLOAT Min confidence filter (default: 0)
|
|
187
|
+
--taxonomy PATH Load predicate taxonomy from file
|
|
188
|
+
--taxonomy-threshold FLOAT Taxonomy matching threshold (default: 0.5)
|
|
189
|
+
--device [auto|cuda|mps|cpu] Device to use (default: auto)
|
|
190
|
+
-v, --verbose Show confidence scores and metadata
|
|
191
|
+
-q, --quiet Suppress progress messages
|
|
192
|
+
--version Show version
|
|
193
|
+
--help Show this message
|
|
194
|
+
```
|
|
195
|
+
|
|
94
196
|
## New in v0.2.0: Quality Scoring & Beam Merging
|
|
95
197
|
|
|
96
198
|
By default, the library now:
|
|
@@ -220,7 +322,7 @@ dict_output = extract_statements_as_dict(text)
|
|
|
220
322
|
```python
|
|
221
323
|
from statement_extractor import StatementExtractor
|
|
222
324
|
|
|
223
|
-
extractor = StatementExtractor(device="cuda") # or "cpu"
|
|
325
|
+
extractor = StatementExtractor(device="cuda") # or "mps" (Apple Silicon) or "cpu"
|
|
224
326
|
|
|
225
327
|
texts = ["Text 1...", "Text 2...", "Text 3..."]
|
|
226
328
|
for text in texts:
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
|
|
2
|
+
statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
|
|
3
|
+
statement_extractor/cli.py,sha256=NIGCpqcnzF42B16RCiSu4kN0RlnVne2ZAT8341Znt1g,8558
|
|
4
|
+
statement_extractor/extractor.py,sha256=r2gcCfZT43Q8STPuzaXmhbjWXTAs4JwMeAtCjQxlsIQ,25870
|
|
5
|
+
statement_extractor/models.py,sha256=IE3TyIiOl2CINPMroQnGT12rSeQFR0bV3y4BJ79wLmI,10877
|
|
6
|
+
statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
|
|
7
|
+
statement_extractor/scoring.py,sha256=xs0SxrV42QNBULQguU1-HhcCc-HnS-ekbcdx7FqWGVk,15663
|
|
8
|
+
corp_extractor-0.2.11.dist-info/METADATA,sha256=D-fs9i9kn4v5bRAHCHxI3cq_6vosNgDCN7uuYwVZztM,13775
|
|
9
|
+
corp_extractor-0.2.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
+
corp_extractor-0.2.11.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
|
|
11
|
+
corp_extractor-0.2.11.dist-info/RECORD,,
|
statement_extractor/__init__.py
CHANGED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for statement extraction.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
corp-extractor "Your text here"
|
|
6
|
+
corp-extractor -f input.txt
|
|
7
|
+
cat input.txt | corp-extractor -
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import sys
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
import click
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _configure_logging(verbose: bool) -> None:
|
|
18
|
+
"""Configure logging for the extraction pipeline."""
|
|
19
|
+
level = logging.DEBUG if verbose else logging.WARNING
|
|
20
|
+
|
|
21
|
+
# Configure root logger for statement_extractor package
|
|
22
|
+
logging.basicConfig(
|
|
23
|
+
level=level,
|
|
24
|
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
|
25
|
+
datefmt="%H:%M:%S",
|
|
26
|
+
stream=sys.stderr,
|
|
27
|
+
force=True,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Set level for all statement_extractor loggers
|
|
31
|
+
for logger_name in [
|
|
32
|
+
"statement_extractor",
|
|
33
|
+
"statement_extractor.extractor",
|
|
34
|
+
"statement_extractor.scoring",
|
|
35
|
+
"statement_extractor.predicate_comparer",
|
|
36
|
+
"statement_extractor.canonicalization",
|
|
37
|
+
]:
|
|
38
|
+
logging.getLogger(logger_name).setLevel(level)
|
|
39
|
+
|
|
40
|
+
from . import __version__
|
|
41
|
+
from .models import (
|
|
42
|
+
ExtractionOptions,
|
|
43
|
+
PredicateComparisonConfig,
|
|
44
|
+
PredicateTaxonomy,
|
|
45
|
+
ScoringConfig,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@click.command()
@click.argument("text", required=False)
@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
@click.option(
    "-o", "--output",
    type=click.Choice(["table", "json", "xml"], case_sensitive=False),
    default="table",
    help="Output format (default: table)"
)
@click.option("--json", "output_json", is_flag=True, help="Output as JSON (shortcut for -o json)")
@click.option("--xml", "output_xml", is_flag=True, help="Output as XML (shortcut for -o xml)")
# Beam search options
@click.option("-b", "--beams", type=int, default=4, help="Number of beams for diverse beam search (default: 4)")
@click.option("--diversity", type=float, default=1.0, help="Diversity penalty for beam search (default: 1.0)")
@click.option("--max-tokens", type=int, default=2048, help="Maximum tokens to generate (default: 2048)")
# Deduplication options
@click.option("--no-dedup", is_flag=True, help="Disable deduplication")
@click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
@click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
@click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
# Quality options
@click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
# Taxonomy options
@click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
@click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
# Device options
@click.option("--device", type=click.Choice(["auto", "cuda", "mps", "cpu"]), default="auto", help="Device to use (default: auto)")
# Output options
@click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
@click.version_option(version=__version__)
def main(
    text: Optional[str],
    input_file: Optional[str],
    output: str,
    output_json: bool,
    output_xml: bool,
    beams: int,
    diversity: float,
    max_tokens: int,
    no_dedup: bool,
    no_embeddings: bool,
    no_merge: bool,
    dedup_threshold: float,
    min_confidence: float,
    taxonomy: Optional[str],
    taxonomy_threshold: float,
    device: str,
    verbose: bool,
    quiet: bool,
):
    """
    Extract structured statements from text.

    TEXT can be provided as an argument, read from a file with -f, or piped via stdin.

    \b
    Examples:
        corp-extractor "Apple announced a new iPhone."
        corp-extractor -f article.txt --json
        corp-extractor -f article.txt -o json --beams 8
        cat article.txt | corp-extractor -
        echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose

    \b
    Output formats:
        table   Human-readable table (default)
        json    JSON with full metadata
        xml     Raw XML from model
    """
    # Configure logging based on verbose flag
    _configure_logging(verbose)

    # The --json/--xml flags are shortcuts that override -o/--output
    if output_json:
        output = "json"
    elif output_xml:
        output = "xml"

    # Get input text from argument, file, or stdin
    input_text = _get_input_text(text, input_file)
    if not input_text:
        # NOTE: the installed console script is `corp-extractor` (see
        # entry_points.txt); the previous message referred to the
        # non-existent `statement-extractor` command.
        raise click.UsageError(
            "No input provided. Use: corp-extractor \"text\", "
            "corp-extractor -f file.txt, or pipe via stdin."
        )

    if not quiet:
        click.echo(f"Processing {len(input_text)} characters...", err=True)

    # Load taxonomy if provided
    predicate_taxonomy = None
    if taxonomy:
        predicate_taxonomy = PredicateTaxonomy.from_file(taxonomy)
        if not quiet:
            click.echo(f"Loaded taxonomy with {len(predicate_taxonomy.predicates)} predicates", err=True)

    # Configure predicate comparison
    predicate_config = PredicateComparisonConfig(
        similarity_threshold=taxonomy_threshold,
        dedup_threshold=dedup_threshold,
    )

    # Configure scoring
    scoring_config = ScoringConfig(min_confidence=min_confidence)

    # Configure extraction options (negative CLI flags map to positive options)
    options = ExtractionOptions(
        num_beams=beams,
        diversity_penalty=diversity,
        max_new_tokens=max_tokens,
        deduplicate=not no_dedup,
        embedding_dedup=not no_embeddings,
        merge_beams=not no_merge,
        predicate_taxonomy=predicate_taxonomy,
        predicate_config=predicate_config,
        scoring_config=scoring_config,
        verbose=verbose,
    )

    # Import here to allow --help without loading torch
    from .extractor import StatementExtractor

    # Create extractor with specified device ("auto" defers detection to the extractor)
    device_arg = None if device == "auto" else device
    extractor = StatementExtractor(device=device_arg)

    if not quiet:
        click.echo(f"Using device: {extractor.device}", err=True)

    # Run extraction in the requested output format
    try:
        if output == "xml":
            result = extractor.extract_as_xml(input_text, options)
            click.echo(result)
        elif output == "json":
            result = extractor.extract_as_json(input_text, options)
            click.echo(result)
        else:
            # Table format
            result = extractor.extract(input_text, options)
            _print_table(result, verbose)
    except Exception as e:
        logging.exception("Error extracting statements:")
        # Chain the cause so tracebacks keep the original failure context.
        raise click.ClickException(f"Extraction failed: {e}") from e
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
|
|
197
|
+
"""Get input text from argument, file, or stdin."""
|
|
198
|
+
if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):
|
|
199
|
+
# Read from stdin
|
|
200
|
+
return sys.stdin.read().strip()
|
|
201
|
+
elif input_file:
|
|
202
|
+
# Read from file
|
|
203
|
+
with open(input_file, "r", encoding="utf-8") as f:
|
|
204
|
+
return f.read().strip()
|
|
205
|
+
elif text:
|
|
206
|
+
return text.strip()
|
|
207
|
+
return None
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _print_table(result, verbose: bool):
    """Print extracted statements in a human-readable table format.

    Args:
        result: Extraction result whose ``statements`` attribute is rendered.
            (Assumed to be an ExtractionResult — confirm against extract().)
        verbose: When True, also show confidence score, canonical predicate,
            reversal flag, and a truncated source-text snippet per statement.
    """
    if not result.statements:
        click.echo("No statements extracted.")
        return

    click.echo(f"\nExtracted {len(result.statements)} statement(s):\n")
    click.echo("-" * 80)

    for i, stmt in enumerate(result.statements, 1):
        # Hide the entity-type annotation when the model could not classify it.
        subject_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
        object_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""

        click.echo(f"{i}. {stmt.subject.text}{subject_type}")
        click.echo(f" --[{stmt.predicate}]-->")
        click.echo(f" {stmt.object.text}{object_type}")

        if verbose:
            if stmt.confidence_score is not None:
                click.echo(f" Confidence: {stmt.confidence_score:.2f}")

            if stmt.canonical_predicate:
                click.echo(f" Canonical: {stmt.canonical_predicate}")

            if stmt.was_reversed:
                # Plain string: the previous f-string had no placeholders (F541).
                click.echo(" (subject/object were swapped)")

            if stmt.source_text:
                # Truncate long source snippets so rows stay on one line.
                source = stmt.source_text[:60] + "..." if len(stmt.source_text) > 60 else stmt.source_text
                click.echo(f" Source: \"{source}\"")

        click.echo("-" * 80)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# Allow running this module directly (e.g. `python -m statement_extractor.cli`)
# in addition to the `corp-extractor` console-script entry point.
if __name__ == "__main__":
    main()
|
statement_extractor/extractor.py
CHANGED
|
@@ -80,11 +80,16 @@ class StatementExtractor:
|
|
|
80
80
|
|
|
81
81
|
# Auto-detect device
|
|
82
82
|
if device is None:
|
|
83
|
-
|
|
83
|
+
if torch.cuda.is_available():
|
|
84
|
+
self.device = "cuda"
|
|
85
|
+
elif torch.backends.mps.is_available():
|
|
86
|
+
self.device = "mps"
|
|
87
|
+
else:
|
|
88
|
+
self.device = "cpu"
|
|
84
89
|
else:
|
|
85
90
|
self.device = device
|
|
86
91
|
|
|
87
|
-
# Auto-detect dtype
|
|
92
|
+
# Auto-detect dtype (bfloat16 only for CUDA, float32 for MPS/CPU)
|
|
88
93
|
if torch_dtype is None:
|
|
89
94
|
self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
|
|
90
95
|
else:
|
|
@@ -175,6 +180,14 @@ class StatementExtractor:
|
|
|
175
180
|
if options is None:
|
|
176
181
|
options = ExtractionOptions()
|
|
177
182
|
|
|
183
|
+
logger.debug("=" * 60)
|
|
184
|
+
logger.debug("EXTRACTION STARTED")
|
|
185
|
+
logger.debug("=" * 60)
|
|
186
|
+
logger.debug(f"Input text length: {len(text)} chars")
|
|
187
|
+
logger.debug(f"Options: num_beams={options.num_beams}, diversity={options.diversity_penalty}")
|
|
188
|
+
logger.debug(f" merge_beams={options.merge_beams}, embedding_dedup={options.embedding_dedup}")
|
|
189
|
+
logger.debug(f" deduplicate={options.deduplicate}, max_new_tokens={options.max_new_tokens}")
|
|
190
|
+
|
|
178
191
|
# Store original text for scoring
|
|
179
192
|
original_text = text
|
|
180
193
|
|
|
@@ -185,6 +198,10 @@ class StatementExtractor:
|
|
|
185
198
|
# Run extraction with retry logic
|
|
186
199
|
statements = self._extract_with_scoring(text, original_text, options)
|
|
187
200
|
|
|
201
|
+
logger.debug("=" * 60)
|
|
202
|
+
logger.debug(f"EXTRACTION COMPLETE: {len(statements)} statements")
|
|
203
|
+
logger.debug("=" * 60)
|
|
204
|
+
|
|
188
205
|
return ExtractionResult(
|
|
189
206
|
statements=statements,
|
|
190
207
|
source_text=original_text,
|
|
@@ -270,6 +287,10 @@ class StatementExtractor:
|
|
|
270
287
|
4. Merges top beams or selects best beam
|
|
271
288
|
5. Deduplicates using embeddings (if enabled)
|
|
272
289
|
"""
|
|
290
|
+
logger.debug("-" * 40)
|
|
291
|
+
logger.debug("PHASE 1: Tokenization")
|
|
292
|
+
logger.debug("-" * 40)
|
|
293
|
+
|
|
273
294
|
# Tokenize input
|
|
274
295
|
inputs = self.tokenizer(
|
|
275
296
|
text,
|
|
@@ -278,48 +299,77 @@ class StatementExtractor:
|
|
|
278
299
|
truncation=True,
|
|
279
300
|
).to(self.device)
|
|
280
301
|
|
|
302
|
+
input_ids = inputs["input_ids"]
|
|
303
|
+
logger.debug(f"Tokenized: {input_ids.shape[1]} tokens")
|
|
304
|
+
|
|
281
305
|
# Count sentences for quality check
|
|
282
306
|
num_sentences = self._count_sentences(text)
|
|
283
307
|
min_expected = int(num_sentences * options.min_statement_ratio)
|
|
284
308
|
|
|
285
|
-
logger.
|
|
309
|
+
logger.debug(f"Input has ~{num_sentences} sentences, min expected: {min_expected}")
|
|
286
310
|
|
|
287
311
|
# Get beam scorer
|
|
288
312
|
beam_scorer = self._get_beam_scorer(options)
|
|
289
313
|
|
|
314
|
+
logger.debug("-" * 40)
|
|
315
|
+
logger.debug("PHASE 2: Diverse Beam Search Generation")
|
|
316
|
+
logger.debug("-" * 40)
|
|
317
|
+
|
|
290
318
|
all_candidates: list[list[Statement]] = []
|
|
291
319
|
|
|
292
320
|
for attempt in range(options.max_attempts):
|
|
321
|
+
logger.debug(f"Attempt {attempt + 1}/{options.max_attempts}: Generating {options.num_beams} beams...")
|
|
322
|
+
|
|
293
323
|
# Generate candidate beams
|
|
294
324
|
candidates = self._generate_candidate_beams(inputs, options)
|
|
325
|
+
logger.debug(f" Generated {len(candidates)} valid XML outputs")
|
|
295
326
|
|
|
296
327
|
# Parse each candidate to statements
|
|
297
328
|
parsed_candidates = []
|
|
298
|
-
for xml_output in candidates:
|
|
329
|
+
for i, xml_output in enumerate(candidates):
|
|
299
330
|
statements = self._parse_xml_to_statements(xml_output)
|
|
300
331
|
if statements:
|
|
301
332
|
parsed_candidates.append(statements)
|
|
333
|
+
logger.debug(f" Beam {i}: {len(statements)} statements parsed")
|
|
334
|
+
else:
|
|
335
|
+
logger.debug(f" Beam {i}: 0 statements (parse failed)")
|
|
302
336
|
|
|
303
337
|
all_candidates.extend(parsed_candidates)
|
|
304
338
|
|
|
305
339
|
# Check if we have enough statements
|
|
306
340
|
total_stmts = sum(len(c) for c in parsed_candidates)
|
|
307
|
-
logger.
|
|
341
|
+
logger.debug(f" Total: {len(parsed_candidates)} beams, {total_stmts} statements")
|
|
308
342
|
|
|
309
343
|
if total_stmts >= min_expected:
|
|
344
|
+
logger.debug(f" Sufficient statements ({total_stmts} >= {min_expected}), stopping")
|
|
310
345
|
break
|
|
311
346
|
|
|
312
347
|
if not all_candidates:
|
|
348
|
+
logger.debug("No valid candidates generated, returning empty result")
|
|
313
349
|
return []
|
|
314
350
|
|
|
351
|
+
logger.debug("-" * 40)
|
|
352
|
+
logger.debug("PHASE 3: Beam Selection/Merging")
|
|
353
|
+
logger.debug("-" * 40)
|
|
354
|
+
|
|
315
355
|
# Select or merge beams
|
|
316
356
|
if options.merge_beams:
|
|
357
|
+
logger.debug(f"Merging {len(all_candidates)} beams...")
|
|
317
358
|
statements = beam_scorer.merge_beams(all_candidates, original_text)
|
|
359
|
+
logger.debug(f" After merge: {len(statements)} statements")
|
|
318
360
|
else:
|
|
361
|
+
logger.debug(f"Selecting best beam from {len(all_candidates)} candidates...")
|
|
319
362
|
statements = beam_scorer.select_best_beam(all_candidates, original_text)
|
|
363
|
+
logger.debug(f" Selected beam has {len(statements)} statements")
|
|
364
|
+
|
|
365
|
+
logger.debug("-" * 40)
|
|
366
|
+
logger.debug("PHASE 4: Deduplication")
|
|
367
|
+
logger.debug("-" * 40)
|
|
320
368
|
|
|
321
369
|
# Apply embedding-based deduplication if enabled
|
|
322
370
|
if options.embedding_dedup and options.deduplicate:
|
|
371
|
+
logger.debug("Using embedding-based deduplication...")
|
|
372
|
+
pre_dedup_count = len(statements)
|
|
323
373
|
try:
|
|
324
374
|
comparer = self._get_predicate_comparer(options)
|
|
325
375
|
if comparer:
|
|
@@ -327,14 +377,32 @@ class StatementExtractor:
|
|
|
327
377
|
statements,
|
|
328
378
|
entity_canonicalizer=options.entity_canonicalizer
|
|
329
379
|
)
|
|
380
|
+
logger.debug(f" After embedding dedup: {len(statements)} statements (removed {pre_dedup_count - len(statements)})")
|
|
381
|
+
|
|
330
382
|
# Also normalize predicates if taxonomy provided
|
|
331
383
|
if options.predicate_taxonomy or self._predicate_taxonomy:
|
|
384
|
+
logger.debug("Normalizing predicates to taxonomy...")
|
|
332
385
|
statements = comparer.normalize_predicates(statements)
|
|
333
386
|
except Exception as e:
|
|
334
387
|
logger.warning(f"Embedding deduplication failed, falling back to exact match: {e}")
|
|
335
388
|
statements = self._deduplicate_statements_exact(statements, options)
|
|
389
|
+
logger.debug(f" After exact dedup: {len(statements)} statements")
|
|
336
390
|
elif options.deduplicate:
|
|
391
|
+
logger.debug("Using exact text deduplication...")
|
|
392
|
+
pre_dedup_count = len(statements)
|
|
337
393
|
statements = self._deduplicate_statements_exact(statements, options)
|
|
394
|
+
logger.debug(f" After exact dedup: {len(statements)} statements (removed {pre_dedup_count - len(statements)})")
|
|
395
|
+
else:
|
|
396
|
+
logger.debug("Deduplication disabled")
|
|
397
|
+
|
|
398
|
+
# Log final statements
|
|
399
|
+
logger.debug("-" * 40)
|
|
400
|
+
logger.debug("FINAL STATEMENTS:")
|
|
401
|
+
logger.debug("-" * 40)
|
|
402
|
+
for i, stmt in enumerate(statements):
|
|
403
|
+
conf = f" (conf={stmt.confidence_score:.2f})" if stmt.confidence_score else ""
|
|
404
|
+
canonical = f" -> {stmt.canonical_predicate}" if stmt.canonical_predicate else ""
|
|
405
|
+
logger.debug(f" {i+1}. {stmt.subject.text} --[{stmt.predicate}{canonical}]--> {stmt.object.text}{conf}")
|
|
338
406
|
|
|
339
407
|
return statements
|
|
340
408
|
|
|
@@ -350,12 +418,16 @@ class StatementExtractor:
|
|
|
350
418
|
outputs = self.model.generate(
|
|
351
419
|
**inputs,
|
|
352
420
|
max_new_tokens=options.max_new_tokens,
|
|
421
|
+
max_length=None, # Override model default, use max_new_tokens only
|
|
353
422
|
num_beams=num_seqs,
|
|
354
423
|
num_beam_groups=num_seqs,
|
|
355
424
|
num_return_sequences=num_seqs,
|
|
356
425
|
diversity_penalty=options.diversity_penalty,
|
|
357
426
|
do_sample=False,
|
|
427
|
+
top_p=None, # Override model config to suppress warning
|
|
428
|
+
top_k=None, # Override model config to suppress warning
|
|
358
429
|
trust_remote_code=True,
|
|
430
|
+
custom_generate="transformers-community/group-beam-search",
|
|
359
431
|
)
|
|
360
432
|
|
|
361
433
|
# Decode and process candidates
|
statement_extractor/models.py
CHANGED
|
@@ -280,5 +280,11 @@ class ExtractionOptions(BaseModel):
|
|
|
280
280
|
description="Use embedding similarity for predicate deduplication"
|
|
281
281
|
)
|
|
282
282
|
|
|
283
|
+
# Verbose logging
|
|
284
|
+
verbose: bool = Field(
|
|
285
|
+
default=False,
|
|
286
|
+
description="Enable verbose logging for debugging"
|
|
287
|
+
)
|
|
288
|
+
|
|
283
289
|
class Config:
|
|
284
290
|
arbitrary_types_allowed = True # Allow Callable type
|
|
@@ -83,7 +83,12 @@ class PredicateComparer:
|
|
|
83
83
|
# Auto-detect device
|
|
84
84
|
if device is None:
|
|
85
85
|
import torch
|
|
86
|
-
|
|
86
|
+
if torch.cuda.is_available():
|
|
87
|
+
self.device = "cuda"
|
|
88
|
+
elif torch.backends.mps.is_available():
|
|
89
|
+
self.device = "mps"
|
|
90
|
+
else:
|
|
91
|
+
self.device = "cpu"
|
|
87
92
|
else:
|
|
88
93
|
self.device = device
|
|
89
94
|
|
|
@@ -289,6 +294,8 @@ class PredicateComparer:
|
|
|
289
294
|
Returns:
|
|
290
295
|
Deduplicated list of statements (keeps best contextualized match)
|
|
291
296
|
"""
|
|
297
|
+
logger.debug(f"Embedding deduplication: {len(statements)} statements, detect_reversals={detect_reversals}")
|
|
298
|
+
|
|
292
299
|
if len(statements) <= 1:
|
|
293
300
|
return statements
|
|
294
301
|
|
|
@@ -297,27 +304,33 @@ class PredicateComparer:
|
|
|
297
304
|
return entity_canonicalizer(text)
|
|
298
305
|
return text.lower().strip()
|
|
299
306
|
|
|
307
|
+
logger.debug(" Computing predicate embeddings...")
|
|
300
308
|
# Compute all predicate embeddings at once for efficiency
|
|
301
309
|
predicates = [s.predicate for s in statements]
|
|
302
310
|
pred_embeddings = self._compute_embeddings(predicates)
|
|
311
|
+
logger.debug(f" Computed {len(pred_embeddings)} predicate embeddings")
|
|
303
312
|
|
|
313
|
+
logger.debug(" Computing contextualized embeddings (S P O)...")
|
|
304
314
|
# Compute contextualized embeddings: "Subject Predicate Object" for each statement
|
|
305
315
|
contextualized_texts = [
|
|
306
316
|
f"{s.subject.text} {s.predicate} {s.object.text}" for s in statements
|
|
307
317
|
]
|
|
308
318
|
contextualized_embeddings = self._compute_embeddings(contextualized_texts)
|
|
309
319
|
|
|
320
|
+
logger.debug(" Computing reversed embeddings (O P S)...")
|
|
310
321
|
# Compute reversed contextualized embeddings: "Object Predicate Subject"
|
|
311
322
|
reversed_texts = [
|
|
312
323
|
f"{s.object.text} {s.predicate} {s.subject.text}" for s in statements
|
|
313
324
|
]
|
|
314
325
|
reversed_embeddings = self._compute_embeddings(reversed_texts)
|
|
315
326
|
|
|
327
|
+
logger.debug(" Computing source text embeddings...")
|
|
316
328
|
# Compute source text embeddings for scoring which duplicate to keep
|
|
317
329
|
source_embeddings = []
|
|
318
330
|
for stmt in statements:
|
|
319
331
|
source_text = stmt.source_text or f"{stmt.subject.text} {stmt.predicate} {stmt.object.text}"
|
|
320
332
|
source_embeddings.append(self._compute_embeddings([source_text])[0])
|
|
333
|
+
logger.debug(" All embeddings computed, starting comparison loop...")
|
|
321
334
|
|
|
322
335
|
unique_statements: list[Statement] = []
|
|
323
336
|
unique_pred_embeddings: list[np.ndarray] = []
|
|
@@ -358,9 +371,17 @@ class PredicateComparer:
|
|
|
358
371
|
if similarity >= self.config.dedup_threshold:
|
|
359
372
|
duplicate_idx = j
|
|
360
373
|
is_reversed_match = reversed_match and not direct_match
|
|
374
|
+
match_type = "reversed" if is_reversed_match else "direct"
|
|
375
|
+
logger.debug(
|
|
376
|
+
f" [{i}] DUPLICATE of [{unique_indices[j]}] ({match_type}, sim={similarity:.3f}): "
|
|
377
|
+
f"'{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
|
|
378
|
+
)
|
|
361
379
|
break
|
|
362
380
|
|
|
363
381
|
if duplicate_idx is None:
|
|
382
|
+
logger.debug(
|
|
383
|
+
f" [{i}] UNIQUE: '{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
|
|
384
|
+
)
|
|
364
385
|
# Not a duplicate - add to unique list
|
|
365
386
|
unique_statements.append(stmt)
|
|
366
387
|
unique_pred_embeddings.append(pred_embeddings[i])
|
|
@@ -451,6 +472,7 @@ class PredicateComparer:
|
|
|
451
472
|
merged_stmt = existing_stmt.merge_entity_types_from(stmt)
|
|
452
473
|
unique_statements[duplicate_idx] = merged_stmt
|
|
453
474
|
|
|
475
|
+
logger.debug(f" Deduplication complete: {len(statements)} -> {len(unique_statements)} statements")
|
|
454
476
|
return unique_statements
|
|
455
477
|
|
|
456
478
|
def normalize_predicates(
|
statement_extractor/scoring.py
CHANGED
|
@@ -6,10 +6,13 @@ Provides:
|
|
|
6
6
|
- BeamScorer: Score and select/merge beams based on quality metrics
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
import logging
|
|
9
10
|
from typing import Optional
|
|
10
11
|
|
|
11
12
|
from .models import ScoringConfig, Statement
|
|
12
13
|
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
13
16
|
|
|
14
17
|
class TripleScorer:
|
|
15
18
|
"""
|
|
@@ -32,6 +35,7 @@ class TripleScorer:
|
|
|
32
35
|
Higher scores indicate better grounding in source text.
|
|
33
36
|
"""
|
|
34
37
|
if not source_text:
|
|
38
|
+
logger.debug(f" No source text, returning neutral score 0.5")
|
|
35
39
|
return 0.5 # Neutral score if no source text
|
|
36
40
|
|
|
37
41
|
score = 0.0
|
|
@@ -53,6 +57,7 @@ class TripleScorer:
|
|
|
53
57
|
weights_sum += 0.2
|
|
54
58
|
|
|
55
59
|
# Check proximity - subject and object in same/nearby region (weight: 0.2)
|
|
60
|
+
proximity_score = 0.0
|
|
56
61
|
if subject_found and object_found:
|
|
57
62
|
proximity_score = self._compute_proximity(
|
|
58
63
|
statement.subject.text,
|
|
@@ -62,7 +67,14 @@ class TripleScorer:
|
|
|
62
67
|
score += 0.2 * proximity_score
|
|
63
68
|
weights_sum += 0.2
|
|
64
69
|
|
|
65
|
-
|
|
70
|
+
final_score = score / weights_sum if weights_sum > 0 else 0.0
|
|
71
|
+
|
|
72
|
+
logger.debug(
|
|
73
|
+
f" Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
|
|
74
|
+
f"{final_score:.2f} (subj={subject_found}, obj={object_found}, pred={predicate_grounded}, prox={proximity_score:.2f})"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return final_score
|
|
66
78
|
|
|
67
79
|
def find_evidence_span(
|
|
68
80
|
self,
|
|
@@ -347,10 +359,12 @@ class BeamScorer:
|
|
|
347
359
|
return []
|
|
348
360
|
|
|
349
361
|
top_n = top_n or self.config.merge_top_n
|
|
362
|
+
logger.debug(f"Merging beams: {len(candidates)} candidates, selecting top {top_n}")
|
|
350
363
|
|
|
351
364
|
# Score each beam
|
|
352
365
|
scored_beams = []
|
|
353
|
-
for beam in candidates:
|
|
366
|
+
for i, beam in enumerate(candidates):
|
|
367
|
+
logger.debug(f" Scoring beam {i} ({len(beam)} statements)...")
|
|
354
368
|
for stmt in beam:
|
|
355
369
|
if stmt.confidence_score is None:
|
|
356
370
|
stmt.confidence_score = self.triple_scorer.score_triple(stmt, source_text)
|
|
@@ -359,31 +373,36 @@ class BeamScorer:
|
|
|
359
373
|
|
|
360
374
|
beam_score = self.score_beam(beam, source_text)
|
|
361
375
|
scored_beams.append((beam_score, beam))
|
|
376
|
+
logger.debug(f" Beam {i} score: {beam_score:.3f}")
|
|
362
377
|
|
|
363
378
|
# Sort and take top N
|
|
364
379
|
scored_beams.sort(key=lambda x: x[0], reverse=True)
|
|
365
380
|
top_beams = [beam for _, beam in scored_beams[:top_n]]
|
|
381
|
+
logger.debug(f" Selected top {len(top_beams)} beams")
|
|
366
382
|
|
|
367
383
|
# Pool all triples
|
|
368
384
|
all_statements: list[Statement] = []
|
|
369
385
|
for beam in top_beams:
|
|
370
386
|
all_statements.extend(beam)
|
|
387
|
+
logger.debug(f" Pooled {len(all_statements)} statements from top beams")
|
|
371
388
|
|
|
372
389
|
# Filter by confidence threshold
|
|
373
390
|
min_conf = self.config.min_confidence
|
|
374
391
|
filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
|
|
392
|
+
logger.debug(f" After confidence filter (>={min_conf}): {len(filtered)} statements")
|
|
375
393
|
|
|
376
|
-
# Filter out statements where source_text doesn't support the predicate
|
|
377
|
-
# This catches model hallucinations where predicate doesn't match the evidence
|
|
378
|
-
consistent = [
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
]
|
|
394
|
+
# # Filter out statements where source_text doesn't support the predicate
|
|
395
|
+
# # This catches model hallucinations where predicate doesn't match the evidence
|
|
396
|
+
# consistent = [
|
|
397
|
+
# s for s in filtered
|
|
398
|
+
# if self._source_text_supports_predicate(s)
|
|
399
|
+
# ]
|
|
400
|
+
# logger.debug(f" After predicate consistency filter: {len(consistent)} statements")
|
|
382
401
|
|
|
383
402
|
# Deduplicate - keep highest confidence for each (subject, predicate, object)
|
|
384
403
|
# Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
|
|
385
404
|
seen: dict[tuple[str, str, str], Statement] = {}
|
|
386
|
-
for stmt in
|
|
405
|
+
for stmt in all_statements:
|
|
387
406
|
key = (
|
|
388
407
|
stmt.subject.text.lower(),
|
|
389
408
|
stmt.predicate.lower(),
|
|
@@ -392,7 +411,10 @@ class BeamScorer:
|
|
|
392
411
|
if key not in seen or (stmt.confidence_score or 0) > (seen[key].confidence_score or 0):
|
|
393
412
|
seen[key] = stmt
|
|
394
413
|
|
|
395
|
-
|
|
414
|
+
result = list(seen.values())
|
|
415
|
+
logger.debug(f" After deduplication: {len(result)} unique statements")
|
|
416
|
+
|
|
417
|
+
return result
|
|
396
418
|
|
|
397
419
|
def _source_text_supports_predicate(self, stmt: Statement) -> bool:
|
|
398
420
|
"""
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
statement_extractor/__init__.py,sha256=4Ht8GJdgik_iti7zpG71Oi5EEAnck6AYDvy7soRqIOg,2967
|
|
2
|
-
statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
|
|
3
|
-
statement_extractor/extractor.py,sha256=PX0SiJnYUnh06seyH5W77FcPpcvLXwEM8IGsuVuRh0Q,22158
|
|
4
|
-
statement_extractor/models.py,sha256=xDF3pDPhIiqiMwFMPV94aBEgZGbSe-x2TkshahOiCog,10739
|
|
5
|
-
statement_extractor/predicate_comparer.py,sha256=iwBfNJFNOFv8ODKN9F9EtmknpCeSThOpnu6P_PJSmgE,24898
|
|
6
|
-
statement_extractor/scoring.py,sha256=Wa1BW6jXtHD7dZkUXwdwE39hwFo2ko6BuIogBc4E2Lk,14493
|
|
7
|
-
corp_extractor-0.2.3.dist-info/METADATA,sha256=dCJbLWIj7hgzpkC4zYvNmnEAhNnizUEq_caea6AamIU,10724
|
|
8
|
-
corp_extractor-0.2.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
-
corp_extractor-0.2.3.dist-info/RECORD,,
|
|
File without changes
|