corp-extractor 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of corp-extractor has been flagged as potentially problematic; see the registry's advisory page for more details.
- corp_extractor-0.2.7.dist-info/METADATA +377 -0
- corp_extractor-0.2.7.dist-info/RECORD +11 -0
- corp_extractor-0.2.7.dist-info/WHEEL +4 -0
- corp_extractor-0.2.7.dist-info/entry_points.txt +3 -0
- statement_extractor/__init__.py +110 -0
- statement_extractor/canonicalization.py +196 -0
- statement_extractor/cli.py +215 -0
- statement_extractor/extractor.py +649 -0
- statement_extractor/models.py +284 -0
- statement_extractor/predicate_comparer.py +611 -0
- statement_extractor/scoring.py +419 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entity canonicalization for statement deduplication.
|
|
3
|
+
|
|
4
|
+
Provides default canonicalization functions and a Canonicalizer class
|
|
5
|
+
for normalizing entity text before comparison.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from typing import Callable, Optional
|
|
10
|
+
|
|
11
|
+
from .models import Statement
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Common determiners to remove from the start of entity text.
# frozenset: immutable, O(1) membership checks.
DETERMINERS = frozenset(["the", "a", "an", "this", "that", "these", "those"])


def default_entity_canonicalizer(text: str) -> str:
    """
    Default entity canonicalization function.

    Applies, in order:
    - trim leading/trailing whitespace
    - lowercase
    - collapse internal whitespace runs to single spaces
    - drop a single leading determiner (the, a, an, ...)

    Args:
        text: The entity text to canonicalize

    Returns:
        Canonicalized text

    Example:
        >>> default_entity_canonicalizer("  The Apple Inc.  ")
        'apple inc.'
        >>> default_entity_canonicalizer("A new product")
        'new product'
    """
    # Lowercasing first, then splitting on whitespace, both trims the
    # ends and collapses internal runs in a single pass.
    tokens = text.lower().split()

    # Remove at most one leading determiner.
    if tokens and tokens[0] in DETERMINERS:
        tokens = tokens[1:]

    return " ".join(tokens)
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Canonicalizer:
    """
    Canonicalize entities for deduplication.

    Entity text is normalized through a pluggable function; predicate
    comparison is handled separately via embeddings (see PredicateComparer).

    Example:
        >>> canon = Canonicalizer()
        >>> canon.canonicalize_entity("The Apple Inc.")
        'apple inc.'

        >>> # With custom function
        >>> canon = Canonicalizer(entity_fn=lambda x: x.upper())
        >>> canon.canonicalize_entity("Apple Inc.")
        'APPLE INC.'
    """

    def __init__(
        self,
        entity_fn: Optional[Callable[[str], str]] = None,
    ):
        """
        Initialize the canonicalizer.

        Args:
            entity_fn: Custom function to canonicalize entity text.
                If None, uses default_entity_canonicalizer.
        """
        # Fall back to the package default when no custom function is given.
        self.entity_fn = default_entity_canonicalizer if entity_fn is None else entity_fn

    def canonicalize_entity(self, text: str) -> str:
        """
        Canonicalize an entity string.

        Args:
            text: Entity text to canonicalize

        Returns:
            Canonicalized text
        """
        return self.entity_fn(text)

    def canonicalize_statement_entities(
        self,
        statement: Statement
    ) -> tuple[str, str]:
        """
        Return canonicalized (subject, object) tuple.

        Note: Predicate comparison uses embeddings, not text canonicalization.

        Args:
            statement: Statement to canonicalize

        Returns:
            Tuple of (canonicalized_subject, canonicalized_object)
        """
        subject_text = self.canonicalize_entity(statement.subject.text)
        object_text = self.canonicalize_entity(statement.object.text)
        return subject_text, object_text

    def create_dedup_key(
        self,
        statement: Statement,
        predicate_canonical: Optional[str] = None
    ) -> tuple[str, str, str]:
        """
        Create a deduplication key for a statement.

        For exact-match deduplication (when not using embedding-based comparison).

        Args:
            statement: Statement to create key for
            predicate_canonical: Optional canonical predicate (if taxonomy was used)

        Returns:
            Tuple of (subject, predicate, object) for deduplication
        """
        # Prefer the taxonomy-resolved predicate when one was supplied;
        # otherwise normalize the raw predicate text.
        if predicate_canonical:
            predicate = predicate_canonical
        else:
            predicate = statement.predicate.lower().strip()
        return (
            self.canonicalize_entity(statement.subject.text),
            predicate,
            self.canonicalize_entity(statement.object.text),
        )
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def deduplicate_statements_exact(
    statements: list[Statement],
    entity_canonicalizer: Optional[Callable[[str], str]] = None,
    detect_reversals: bool = True,
) -> list[Statement]:
    """
    Deduplicate statements using exact text matching.

    Use this when embedding-based deduplication is disabled.
    When duplicates are found, entity types are merged - specific types
    (ORG, PERSON, etc.) take precedence over UNKNOWN.

    When detect_reversals=True, also detects reversed duplicates where
    subject and object are swapped. The first occurrence determines the
    canonical orientation.

    Args:
        statements: List of statements to deduplicate
        entity_canonicalizer: Optional custom canonicalization function
        detect_reversals: Whether to detect reversed duplicates (default True)

    Returns:
        Deduplicated list with merged entity types
    """
    # Nothing to deduplicate for zero or one statements.
    if len(statements) <= 1:
        return statements

    canonicalizer = Canonicalizer(entity_fn=entity_canonicalizer)

    # Maps a (subject, predicate, object) key to its position in `deduped`.
    key_to_index: dict[tuple[str, str, str], int] = {}
    deduped: list[Statement] = []

    for candidate in statements:
        forward = canonicalizer.create_dedup_key(candidate)
        # Reversed orientation: (object, predicate, subject).
        backward = (forward[2], forward[1], forward[0])

        if forward in key_to_index:
            # Direct duplicate - fold this statement's entity types into
            # the first occurrence.
            idx = key_to_index[forward]
            deduped[idx] = deduped[idx].merge_entity_types_from(candidate)
        elif detect_reversals and backward in key_to_index:
            # Reversed duplicate - swap subject/object before merging so
            # the entity types line up with the canonical orientation.
            idx = key_to_index[backward]
            deduped[idx] = deduped[idx].merge_entity_types_from(candidate.reversed())
        else:
            # First time this key is seen: keep it and remember its slot.
            key_to_index[forward] = len(deduped)
            deduped.append(candidate)

    return deduped
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for statement extraction.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
corp-extractor "Your text here"
|
|
6
|
+
corp-extractor -f input.txt
|
|
7
|
+
cat input.txt | corp-extractor -
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
import click
|
|
14
|
+
|
|
15
|
+
from . import __version__
|
|
16
|
+
from .models import (
|
|
17
|
+
ExtractionOptions,
|
|
18
|
+
PredicateComparisonConfig,
|
|
19
|
+
PredicateTaxonomy,
|
|
20
|
+
ScoringConfig,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@click.command()
@click.argument("text", required=False)
@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
@click.option(
    "-o", "--output",
    type=click.Choice(["table", "json", "xml"], case_sensitive=False),
    default="table",
    help="Output format (default: table)"
)
@click.option("--json", "output_json", is_flag=True, help="Output as JSON (shortcut for -o json)")
@click.option("--xml", "output_xml", is_flag=True, help="Output as XML (shortcut for -o xml)")
# Beam search options
@click.option("-b", "--beams", type=int, default=4, help="Number of beams for diverse beam search (default: 4)")
@click.option("--diversity", type=float, default=1.0, help="Diversity penalty for beam search (default: 1.0)")
@click.option("--max-tokens", type=int, default=2048, help="Maximum tokens to generate (default: 2048)")
# Deduplication options
@click.option("--no-dedup", is_flag=True, help="Disable deduplication")
@click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
@click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
@click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
# Quality options
@click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
# Taxonomy options
@click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
@click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
# Device options
@click.option("--device", type=click.Choice(["auto", "cuda", "cpu"]), default="auto", help="Device to use (default: auto)")
# Output options
@click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
@click.version_option(version=__version__)
def main(
    text: Optional[str],
    input_file: Optional[str],
    output: str,
    output_json: bool,
    output_xml: bool,
    beams: int,
    diversity: float,
    max_tokens: int,
    no_dedup: bool,
    no_embeddings: bool,
    no_merge: bool,
    dedup_threshold: float,
    min_confidence: float,
    taxonomy: Optional[str],
    taxonomy_threshold: float,
    device: str,
    verbose: bool,
    quiet: bool,
):
    """
    Extract structured statements from text.

    TEXT can be provided as an argument, read from a file with -f, or piped via stdin.

    \b
    Examples:
        corp-extractor "Apple announced a new iPhone."
        corp-extractor -f article.txt --json
        corp-extractor -f article.txt -o json --beams 8
        cat article.txt | corp-extractor -
        echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose

    \b
    Output formats:
        table    Human-readable table (default)
        json     JSON with full metadata
        xml      Raw XML from model
    """
    # Determine output format: the --json/--xml shortcut flags override -o.
    if output_json:
        output = "json"
    elif output_xml:
        output = "xml"

    # Get input text (argument, file, or stdin).
    input_text = _get_input_text(text, input_file)
    if not input_text:
        # Bug fix: the installed command is "corp-extractor" (see the
        # docstring examples and the package entry point), not
        # "statement-extractor" — the old message pointed users at a
        # command that does not exist.
        raise click.UsageError(
            "No input provided. Use: corp-extractor \"text\", "
            "corp-extractor -f file.txt, or pipe via stdin."
        )

    if not quiet:
        # Progress goes to stderr so stdout stays clean for piped output.
        click.echo(f"Processing {len(input_text)} characters...", err=True)

    # Load taxonomy if provided.
    predicate_taxonomy = None
    if taxonomy:
        predicate_taxonomy = PredicateTaxonomy.from_file(taxonomy)
        if not quiet:
            click.echo(f"Loaded taxonomy with {len(predicate_taxonomy.predicates)} predicates", err=True)

    # Configure predicate comparison.
    predicate_config = PredicateComparisonConfig(
        similarity_threshold=taxonomy_threshold,
        dedup_threshold=dedup_threshold,
    )

    # Configure scoring.
    scoring_config = ScoringConfig(min_confidence=min_confidence)

    # Configure extraction options; the --no-* flags are inverted here.
    options = ExtractionOptions(
        num_beams=beams,
        diversity_penalty=diversity,
        max_new_tokens=max_tokens,
        deduplicate=not no_dedup,
        embedding_dedup=not no_embeddings,
        merge_beams=not no_merge,
        predicate_taxonomy=predicate_taxonomy,
        predicate_config=predicate_config,
        scoring_config=scoring_config,
    )

    # Import here to allow --help without loading torch.
    from .extractor import StatementExtractor

    # Create extractor with specified device ("auto" lets it pick).
    device_arg = None if device == "auto" else device
    extractor = StatementExtractor(device=device_arg)

    if not quiet:
        click.echo(f"Using device: {extractor.device}", err=True)

    # Run extraction in the requested output format.
    try:
        if output == "xml":
            result = extractor.extract_as_xml(input_text, options)
            click.echo(result)
        elif output == "json":
            result = extractor.extract_as_json(input_text, options)
            click.echo(result)
        else:
            # Table format
            result = extractor.extract(input_text, options)
            _print_table(result, verbose)
    except Exception as e:
        # CLI boundary: surface any extraction failure as a clean click
        # error instead of a traceback.
        raise click.ClickException(f"Extraction failed: {e}")
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
|
|
167
|
+
"""Get input text from argument, file, or stdin."""
|
|
168
|
+
if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):
|
|
169
|
+
# Read from stdin
|
|
170
|
+
return sys.stdin.read().strip()
|
|
171
|
+
elif input_file:
|
|
172
|
+
# Read from file
|
|
173
|
+
with open(input_file, "r", encoding="utf-8") as f:
|
|
174
|
+
return f.read().strip()
|
|
175
|
+
elif text:
|
|
176
|
+
return text.strip()
|
|
177
|
+
return None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _print_table(result, verbose: bool):
    """Print statements in a human-readable table format.

    Args:
        result: Extraction result exposing a ``statements`` sequence.
        verbose: When True, also print confidence, canonical predicate,
            the reversal flag, and a truncated source snippet.
    """
    if not result.statements:
        click.echo("No statements extracted.")
        return

    # Hoisted: the same separator is reused after every statement.
    separator = "-" * 80

    click.echo(f"\nExtracted {len(result.statements)} statement(s):\n")
    click.echo(separator)

    for i, stmt in enumerate(result.statements, 1):
        # Only show entity types that were actually resolved.
        subject_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
        object_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""

        click.echo(f"{i}. {stmt.subject.text}{subject_type}")
        click.echo(f" --[{stmt.predicate}]-->")
        click.echo(f" {stmt.object.text}{object_type}")

        if verbose:
            if stmt.confidence_score is not None:
                click.echo(f" Confidence: {stmt.confidence_score:.2f}")

            if stmt.canonical_predicate:
                click.echo(f" Canonical: {stmt.canonical_predicate}")

            if stmt.was_reversed:
                # Fixed: was an f-string with no placeholders (ruff F541).
                click.echo(" (subject/object were swapped)")

            if stmt.source_text:
                source = stmt.source_text[:60] + "..." if len(stmt.source_text) > 60 else stmt.source_text
                click.echo(f" Source: \"{source}\"")

        click.echo(separator)
|
212
|
+
|
|
213
|
+
|
|
214
|
+
if __name__ == "__main__":
|
|
215
|
+
main()
|