corp_extractor-0.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,196 @@
+ """
+ Entity canonicalization for statement deduplication.
+
+ Provides default canonicalization functions and a Canonicalizer class
+ for normalizing entity text before comparison.
+ """
+
+ import re
+ from typing import Callable, Optional
+
+ from .models import Statement
+
+
+ # Common determiners to remove from the start of entity text
+ DETERMINERS = frozenset(["the", "a", "an", "this", "that", "these", "those"])
+
+
+ def default_entity_canonicalizer(text: str) -> str:
+     """
+     Default entity canonicalization function.
+
+     Transformations:
+     - Trim leading/trailing whitespace
+     - Convert to lowercase
+     - Remove leading determiners (the, a, an, etc.)
+     - Normalize internal whitespace (multiple spaces -> single)
+
+     Args:
+         text: The entity text to canonicalize
+
+     Returns:
+         Canonicalized text
+
+     Example:
+         >>> default_entity_canonicalizer(" The Apple Inc. ")
+         'apple inc.'
+         >>> default_entity_canonicalizer("A new product")
+         'new product'
+     """
+     # Trim and lowercase
+     result = text.strip().lower()
+
+     # Normalize internal whitespace
+     result = re.sub(r'\s+', ' ', result)
+
+     # Remove leading determiners
+     words = result.split()
+     if words and words[0] in DETERMINERS:
+         result = ' '.join(words[1:])
+
+     return result.strip()
+
+
+ class Canonicalizer:
+     """
+     Canonicalize entities for deduplication.
+
+     Supports custom canonicalization functions for entities.
+     Predicate comparison uses embeddings (see PredicateComparer).
+
+     Example:
+         >>> canon = Canonicalizer()
+         >>> canon.canonicalize_entity("The Apple Inc.")
+         'apple inc.'
+
+         >>> # With custom function
+         >>> canon = Canonicalizer(entity_fn=lambda x: x.upper())
+         >>> canon.canonicalize_entity("Apple Inc.")
+         'APPLE INC.'
+     """
+
+     def __init__(
+         self,
+         entity_fn: Optional[Callable[[str], str]] = None,
+     ):
+         """
+         Initialize the canonicalizer.
+
+         Args:
+             entity_fn: Custom function to canonicalize entity text.
+                 If None, uses default_entity_canonicalizer.
+         """
+         self.entity_fn = entity_fn or default_entity_canonicalizer
+
+     def canonicalize_entity(self, text: str) -> str:
+         """
+         Canonicalize an entity string.
+
+         Args:
+             text: Entity text to canonicalize
+
+         Returns:
+             Canonicalized text
+         """
+         return self.entity_fn(text)
+
+     def canonicalize_statement_entities(
+         self,
+         statement: Statement
+     ) -> tuple[str, str]:
+         """
+         Return canonicalized (subject, object) tuple.
+
+         Note: Predicate comparison uses embeddings, not text canonicalization.
+
+         Args:
+             statement: Statement to canonicalize
+
+         Returns:
+             Tuple of (canonicalized_subject, canonicalized_object)
+         """
+         return (
+             self.canonicalize_entity(statement.subject.text),
+             self.canonicalize_entity(statement.object.text),
+         )
+
+     def create_dedup_key(
+         self,
+         statement: Statement,
+         predicate_canonical: Optional[str] = None
+     ) -> tuple[str, str, str]:
+         """
+         Create a deduplication key for a statement.
+
+         For exact-match deduplication (when not using embedding-based comparison).
+
+         Args:
+             statement: Statement to create key for
+             predicate_canonical: Optional canonical predicate (if taxonomy was used)
+
+         Returns:
+             Tuple of (subject, predicate, object) for deduplication
+         """
+         subj = self.canonicalize_entity(statement.subject.text)
+         obj = self.canonicalize_entity(statement.object.text)
+         pred = predicate_canonical or statement.predicate.lower().strip()
+         return (subj, pred, obj)
+
+
+ def deduplicate_statements_exact(
+     statements: list[Statement],
+     entity_canonicalizer: Optional[Callable[[str], str]] = None,
+     detect_reversals: bool = True,
+ ) -> list[Statement]:
+     """
+     Deduplicate statements using exact text matching.
+
+     Use this when embedding-based deduplication is disabled.
+     When duplicates are found, entity types are merged - specific types
+     (ORG, PERSON, etc.) take precedence over UNKNOWN.
+
+     When detect_reversals=True, also detects reversed duplicates where
+     subject and object are swapped. The first occurrence determines the
+     canonical orientation.
+
+     Args:
+         statements: List of statements to deduplicate
+         entity_canonicalizer: Optional custom canonicalization function
+         detect_reversals: Whether to detect reversed duplicates (default True)
+
+     Returns:
+         Deduplicated list with merged entity types
+     """
+     if len(statements) <= 1:
+         return statements
+
+     canonicalizer = Canonicalizer(entity_fn=entity_canonicalizer)
+
+     # Map from dedup key to index in unique list
+     seen: dict[tuple[str, str, str], int] = {}
+     unique: list[Statement] = []
+
+     for stmt in statements:
+         key = canonicalizer.create_dedup_key(stmt)
+         # Also compute reversed key (object, predicate, subject)
+         reversed_key = (key[2], key[1], key[0])
+
+         if key in seen:
+             # Direct duplicate found - merge entity types
+             existing_idx = seen[key]
+             existing_stmt = unique[existing_idx]
+             merged_stmt = existing_stmt.merge_entity_types_from(stmt)
+             unique[existing_idx] = merged_stmt
+         elif detect_reversals and reversed_key in seen:
+             # Reversed duplicate found - merge entity types (accounting for reversal)
+             existing_idx = seen[reversed_key]
+             existing_stmt = unique[existing_idx]
+             # Merge types from the reversed statement
+             merged_stmt = existing_stmt.merge_entity_types_from(stmt.reversed())
+             unique[existing_idx] = merged_stmt
+         else:
+             # New unique statement
+             seen[key] = len(unique)
+             unique.append(stmt)
+
+     return unique
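
For orientation, a minimal usage sketch of the exact-match path above. The Entity and Statement constructor signatures and the canonicalize module path are assumed here for illustration and may not match the real .models API:

    from corp_extractor.canonicalize import deduplicate_statements_exact  # module path assumed
    from corp_extractor.models import Entity, Statement  # constructor signatures assumed

    # Both statements canonicalize to the key ('apple inc.', 'acquired', 'startup'):
    # determiners, case, and predicate whitespace differ, but the triples match exactly.
    stmts = [
        Statement(subject=Entity(text="The Apple Inc."), predicate="Acquired",
                  object=Entity(text="a startup")),
        Statement(subject=Entity(text="apple inc."), predicate="acquired ",
                  object=Entity(text="A Startup")),
    ]
    unique = deduplicate_statements_exact(stmts)
    assert len(unique) == 1
    # With detect_reversals=True (the default), a third statement with subject and
    # object swapped would also fold into the first occurrence.
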
@@ -0,0 +1,215 @@
+ """
+ Command-line interface for statement extraction.
+
+ Usage:
+     corp-extractor "Your text here"
+     corp-extractor -f input.txt
+     cat input.txt | corp-extractor -
+ """
+
+ import sys
+ from typing import Optional
+
+ import click
+
+ from . import __version__
+ from .models import (
+     ExtractionOptions,
+     PredicateComparisonConfig,
+     PredicateTaxonomy,
+     ScoringConfig,
+ )
+
+
+ @click.command()
+ @click.argument("text", required=False)
+ @click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
+ @click.option(
+     "-o", "--output",
+     type=click.Choice(["table", "json", "xml"], case_sensitive=False),
+     default="table",
+     help="Output format (default: table)"
+ )
+ @click.option("--json", "output_json", is_flag=True, help="Output as JSON (shortcut for -o json)")
+ @click.option("--xml", "output_xml", is_flag=True, help="Output as XML (shortcut for -o xml)")
+ # Beam search options
+ @click.option("-b", "--beams", type=int, default=4, help="Number of beams for diverse beam search (default: 4)")
+ @click.option("--diversity", type=float, default=1.0, help="Diversity penalty for beam search (default: 1.0)")
+ @click.option("--max-tokens", type=int, default=2048, help="Maximum tokens to generate (default: 2048)")
+ # Deduplication options
+ @click.option("--no-dedup", is_flag=True, help="Disable deduplication")
+ @click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
+ @click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
+ @click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
+ # Quality options
+ @click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
+ # Taxonomy options
+ @click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
+ @click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
+ # Device options
+ @click.option("--device", type=click.Choice(["auto", "cuda", "cpu"]), default="auto", help="Device to use (default: auto)")
+ # Output options
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
+ @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
+ @click.version_option(version=__version__)
+ def main(
+     text: Optional[str],
+     input_file: Optional[str],
+     output: str,
+     output_json: bool,
+     output_xml: bool,
+     beams: int,
+     diversity: float,
+     max_tokens: int,
+     no_dedup: bool,
+     no_embeddings: bool,
+     no_merge: bool,
+     dedup_threshold: float,
+     min_confidence: float,
+     taxonomy: Optional[str],
+     taxonomy_threshold: float,
+     device: str,
+     verbose: bool,
+     quiet: bool,
+ ):
+     """
+     Extract structured statements from text.
+
+     TEXT can be provided as an argument, read from a file with -f, or piped via stdin.
+
+     \b
+     Examples:
+         corp-extractor "Apple announced a new iPhone."
+         corp-extractor -f article.txt --json
+         corp-extractor -f article.txt -o json --beams 8
+         cat article.txt | corp-extractor -
+         echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose
+
+     \b
+     Output formats:
+         table   Human-readable table (default)
+         json    JSON with full metadata
+         xml     Raw XML from model
+     """
+     # Determine output format
+     if output_json:
+         output = "json"
+     elif output_xml:
+         output = "xml"
+
+     # Get input text
+     input_text = _get_input_text(text, input_file)
+     if not input_text:
+         raise click.UsageError(
+             "No input provided. Use: corp-extractor \"text\", "
+             "corp-extractor -f file.txt, or pipe via stdin."
+         )
+
+     if not quiet:
+         click.echo(f"Processing {len(input_text)} characters...", err=True)
+
+     # Load taxonomy if provided
+     predicate_taxonomy = None
+     if taxonomy:
+         predicate_taxonomy = PredicateTaxonomy.from_file(taxonomy)
+         if not quiet:
+             click.echo(f"Loaded taxonomy with {len(predicate_taxonomy.predicates)} predicates", err=True)
+
+     # Configure predicate comparison
+     predicate_config = PredicateComparisonConfig(
+         similarity_threshold=taxonomy_threshold,
+         dedup_threshold=dedup_threshold,
+     )
+
+     # Configure scoring
+     scoring_config = ScoringConfig(min_confidence=min_confidence)
+
+     # Configure extraction options
+     options = ExtractionOptions(
+         num_beams=beams,
+         diversity_penalty=diversity,
+         max_new_tokens=max_tokens,
+         deduplicate=not no_dedup,
+         embedding_dedup=not no_embeddings,
+         merge_beams=not no_merge,
+         predicate_taxonomy=predicate_taxonomy,
+         predicate_config=predicate_config,
+         scoring_config=scoring_config,
+     )
+
+     # Import here to allow --help without loading torch
+     from .extractor import StatementExtractor
+
+     # Create extractor with specified device
+     device_arg = None if device == "auto" else device
+     extractor = StatementExtractor(device=device_arg)
+
+     if not quiet:
+         click.echo(f"Using device: {extractor.device}", err=True)
+
+     # Run extraction
+     try:
+         if output == "xml":
+             result = extractor.extract_as_xml(input_text, options)
+             click.echo(result)
+         elif output == "json":
+             result = extractor.extract_as_json(input_text, options)
+             click.echo(result)
+         else:
+             # Table format
+             result = extractor.extract(input_text, options)
+             _print_table(result, verbose)
+     except Exception as e:
+         raise click.ClickException(f"Extraction failed: {e}")
+
+
+ def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
+     """Get input text from argument, file, or stdin."""
+     if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):
+         # Read from stdin
+         return sys.stdin.read().strip()
+     elif input_file:
+         # Read from file
+         with open(input_file, "r", encoding="utf-8") as f:
+             return f.read().strip()
+     elif text:
+         return text.strip()
+     return None
+
+
+ def _print_table(result, verbose: bool):
+     """Print statements in a human-readable table format."""
+     if not result.statements:
+         click.echo("No statements extracted.")
+         return
+
+     click.echo(f"\nExtracted {len(result.statements)} statement(s):\n")
+     click.echo("-" * 80)
+
+     for i, stmt in enumerate(result.statements, 1):
+         subject_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
+         object_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""
+
+         click.echo(f"{i}. {stmt.subject.text}{subject_type}")
+         click.echo(f" --[{stmt.predicate}]-->")
+         click.echo(f" {stmt.object.text}{object_type}")
+
+         if verbose:
+             if stmt.confidence_score is not None:
+                 click.echo(f" Confidence: {stmt.confidence_score:.2f}")
+
+             if stmt.canonical_predicate:
+                 click.echo(f" Canonical: {stmt.canonical_predicate}")
+
+             if stmt.was_reversed:
+                 click.echo(" (subject/object were swapped)")
+
+             if stmt.source_text:
+                 source = stmt.source_text[:60] + "..." if len(stmt.source_text) > 60 else stmt.source_text
+                 click.echo(f" Source: \"{source}\"")
+
+     click.echo("-" * 80)
+
+
+ if __name__ == "__main__":
+     main()
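
Because main is a standard click command, the CLI wiring can be exercised in-process with click's test runner rather than a shell. A small sketch follows; the corp_extractor.cli import path is assumed, and a real invocation still loads the extraction model on first use:

    from click.testing import CliRunner

    from corp_extractor.cli import main  # import path assumed

    runner = CliRunner()
    result = runner.invoke(
        main,
        ["Apple announced a new iPhone.", "-o", "json", "--no-embeddings", "-q"],
    )
    print(result.exit_code)  # 0 on success; click reports usage errors with code 2
    print(result.output)     # the JSON payload printed by extract_as_json
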