corp-extractor 0.2.3__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corp-extractor
3
- Version: 0.2.3
3
+ Version: 0.2.11
4
4
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
5
5
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
6
6
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -23,10 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
24
  Classifier: Topic :: Text Processing :: Linguistic
25
25
  Requires-Python: >=3.10
26
+ Requires-Dist: click>=8.0.0
26
27
  Requires-Dist: numpy>=1.24.0
27
28
  Requires-Dist: pydantic>=2.0.0
28
29
  Requires-Dist: torch>=2.0.0
29
- Requires-Dist: transformers>=4.35.0
30
+ Requires-Dist: transformers>=5.0.0rc3
30
31
  Provides-Extra: all
31
32
  Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
32
33
  Provides-Extra: dev
@@ -57,22 +58,33 @@ Extract structured subject-predicate-object statements from unstructured text us
57
58
  - **Contextualized Matching** *(v0.2.2)*: Compares full "Subject Predicate Object" against source text for better accuracy
58
59
  - **Entity Type Merging** *(v0.2.3)*: Automatically merges UNKNOWN entity types with specific types during deduplication
59
60
  - **Reversal Detection** *(v0.2.3)*: Detects and corrects subject-object reversals using embedding comparison (see the sketch after this list)
61
+ - **Command Line Interface** *(v0.2.4)*: Full-featured CLI for terminal usage
60
62
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
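
For illustration, here is a minimal sketch of the idea behind contextualized matching and reversal detection, using `sentence-transformers` and a hypothetical embedding model; the library's actual logic lives in `predicate_comparer.py` and differs in detail:

```python
# Minimal sketch (not the library's implementation): embed "S P O" and "O P S"
# and compare each against the source sentence to spot a likely reversal.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # hypothetical embedding model choice

source = "The iPhone 15 was announced by Apple."
forward = "Apple announced iPhone 15"    # Subject Predicate Object
backward = "iPhone 15 announced Apple"   # Object Predicate Subject

src, fwd, rev = model.encode([source, forward, backward])
if util.cos_sim(rev, src) > util.cos_sim(fwd, src):
    print("Reversed reading matches the source better: swap subject and object")
```

If the reversed "Object Predicate Subject" reading is closer to the source text than the forward reading, the statement is likely reversed and the entities are swapped.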
61
63
 
62
64
  ## Installation
63
65
 
64
66
  ```bash
65
67
  # Recommended: include embedding support for smart deduplication
66
- pip install corp-extractor[embeddings]
68
+ pip install "corp-extractor[embeddings]"
67
69
 
68
70
  # Minimal installation (no embedding features)
69
71
  pip install corp-extractor
70
72
  ```
71
73
 
72
- **Note**: For GPU support, install PyTorch with CUDA first:
74
+ **Note**: This package requires `transformers>=5.0.0` (currently a pre-release) for T5-Gemma 2 model support. Install with the `--pre` flag if needed:
75
+ ```bash
76
+ pip install --pre "corp-extractor[embeddings]"
77
+ ```
78
+
79
+ **For GPU support**, install PyTorch with CUDA first:
73
80
  ```bash
74
81
  pip install torch --index-url https://download.pytorch.org/whl/cu121
75
- pip install corp-extractor[embeddings]
82
+ pip install "corp-extractor[embeddings]"
83
+ ```
84
+
85
+ **For Apple Silicon (M1/M2/M3)**, MPS acceleration is automatically detected:
86
+ ```bash
87
+ pip install "corp-extractor[embeddings]" # MPS used automatically
76
88
  ```
77
89
 
78
90
  ## Quick Start
@@ -91,6 +103,96 @@ for stmt in result:
91
103
  print(f" Confidence: {stmt.confidence_score:.2f}") # NEW in v0.2.0
92
104
  ```
93
105
 
106
+ ## Command Line Interface
107
+
108
+ The library includes a CLI for quick extraction from the terminal.
109
+
110
+ ### Install Globally (Recommended)
111
+
112
+ For regular use, install the tool globally:
113
+
114
+ ```bash
115
+ # Using uv (recommended)
116
+ uv tool install "corp-extractor[embeddings]"
117
+
118
+ # Using pipx
119
+ pipx install "corp-extractor[embeddings]"
120
+
121
+ # Using pip
122
+ pip install "corp-extractor[embeddings]"
123
+
124
+ # Then use anywhere
125
+ corp-extractor "Your text here"
126
+ ```
127
+
128
+ ### Quick Run with uvx
129
+
130
+ Run it directly, without installing, using [uv](https://docs.astral.sh/uv/):
131
+
132
+ ```bash
133
+ uvx corp-extractor "Apple announced a new iPhone."
134
+ ```
135
+
136
+ **Note**: The first run downloads the model (~1.5 GB), which may take a few minutes.
137
+
138
+ ### Usage Examples
139
+
140
+ ```bash
141
+ # Extract from text argument
142
+ corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
143
+
144
+ # Extract from file
145
+ corp-extractor -f article.txt
146
+
147
+ # Pipe from stdin
148
+ cat article.txt | corp-extractor -
149
+
150
+ # Output as JSON
151
+ corp-extractor "Tim Cook is CEO of Apple." --json
152
+
153
+ # Output as XML
154
+ corp-extractor -f article.txt --xml
155
+
156
+ # Verbose output with confidence scores
157
+ corp-extractor -f article.txt --verbose
158
+
159
+ # Use more beams for better quality
160
+ corp-extractor -f article.txt --beams 8
161
+
162
+ # Use custom predicate taxonomy
163
+ corp-extractor -f article.txt --taxonomy predicates.txt
164
+
165
+ # Use GPU explicitly
166
+ corp-extractor -f article.txt --device cuda
167
+ ```
168
+
169
+ ### CLI Options
170
+
171
+ ```
172
+ Usage: corp-extractor [OPTIONS] [TEXT]
173
+
174
+ Options:
175
+ -f, --file PATH Read input from file
176
+ -o, --output [table|json|xml] Output format (default: table)
177
+ --json Output as JSON (shortcut)
178
+ --xml Output as XML (shortcut)
179
+ -b, --beams INTEGER Number of beams (default: 4)
180
+ --diversity FLOAT Diversity penalty (default: 1.0)
181
+ --max-tokens INTEGER Max tokens to generate (default: 2048)
182
+ --no-dedup Disable deduplication
183
+ --no-embeddings Disable embedding-based dedup (faster)
184
+ --no-merge Disable beam merging
185
+ --dedup-threshold FLOAT Deduplication threshold (default: 0.65)
186
+ --min-confidence FLOAT Min confidence filter (default: 0)
187
+ --taxonomy PATH Load predicate taxonomy from file
188
+ --taxonomy-threshold FLOAT Taxonomy matching threshold (default: 0.5)
189
+ --device [auto|cuda|mps|cpu] Device to use (default: auto)
190
+ -v, --verbose Show confidence scores and metadata
191
+ -q, --quiet Suppress progress messages
192
+ --version Show version
193
+ --help Show this message
194
+ ```
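
For illustration, the `--taxonomy` option expects a plain-text file with one canonical predicate per line; the predicate names below are hypothetical examples, not values shipped with the package:

```bash
# Example taxonomy file: one canonical predicate per line (illustrative values)
cat > predicates.txt <<'EOF'
announced
acquired
is CEO of
is headquartered in
EOF

# Match extracted predicates against the taxonomy
corp-extractor -f article.txt --taxonomy predicates.txt --taxonomy-threshold 0.5
```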
195
+
94
196
  ## New in v0.2.0: Quality Scoring & Beam Merging
95
197
 
96
198
  By default, the library now:
@@ -220,7 +322,7 @@ dict_output = extract_statements_as_dict(text)
220
322
  ```python
221
323
  from statement_extractor import StatementExtractor
222
324
 
223
- extractor = StatementExtractor(device="cuda") # or "cpu"
325
+ extractor = StatementExtractor(device="cuda") # or "mps" (Apple Silicon) or "cpu"
224
326
 
225
327
  texts = ["Text 1...", "Text 2...", "Text 3..."]
226
328
  for text in texts:
@@ -0,0 +1,11 @@
1
+ statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
2
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
+ statement_extractor/cli.py,sha256=NIGCpqcnzF42B16RCiSu4kN0RlnVne2ZAT8341Znt1g,8558
4
+ statement_extractor/extractor.py,sha256=r2gcCfZT43Q8STPuzaXmhbjWXTAs4JwMeAtCjQxlsIQ,25870
5
+ statement_extractor/models.py,sha256=IE3TyIiOl2CINPMroQnGT12rSeQFR0bV3y4BJ79wLmI,10877
6
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
7
+ statement_extractor/scoring.py,sha256=xs0SxrV42QNBULQguU1-HhcCc-HnS-ekbcdx7FqWGVk,15663
8
+ corp_extractor-0.2.11.dist-info/METADATA,sha256=D-fs9i9kn4v5bRAHCHxI3cq_6vosNgDCN7uuYwVZztM,13775
9
+ corp_extractor-0.2.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ corp_extractor-0.2.11.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
11
+ corp_extractor-0.2.11.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ corp-extractor = statement_extractor.cli:main
3
+ statement-extractor = statement_extractor.cli:main
@@ -29,7 +29,7 @@ Example:
29
29
  >>> data = extract_statements_as_dict("Some text...")
30
30
  """
31
31
 
32
- __version__ = "0.2.2"
32
+ __version__ = "0.2.5"
33
33
 
34
34
  # Core models
35
35
  from .models import (
@@ -0,0 +1,245 @@
1
+ """
2
+ Command-line interface for statement extraction.
3
+
4
+ Usage:
5
+ corp-extractor "Your text here"
6
+ corp-extractor -f input.txt
7
+ cat input.txt | corp-extractor -
8
+ """
9
+
10
+ import logging
11
+ import sys
12
+ from typing import Optional
13
+
14
+ import click
15
+
16
+
17
+ def _configure_logging(verbose: bool) -> None:
18
+ """Configure logging for the extraction pipeline."""
19
+ level = logging.DEBUG if verbose else logging.WARNING
20
+
21
+ # Configure root logger for statement_extractor package
22
+ logging.basicConfig(
23
+ level=level,
24
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
25
+ datefmt="%H:%M:%S",
26
+ stream=sys.stderr,
27
+ force=True,
28
+ )
29
+
30
+ # Set level for all statement_extractor loggers
31
+ for logger_name in [
32
+ "statement_extractor",
33
+ "statement_extractor.extractor",
34
+ "statement_extractor.scoring",
35
+ "statement_extractor.predicate_comparer",
36
+ "statement_extractor.canonicalization",
37
+ ]:
38
+ logging.getLogger(logger_name).setLevel(level)
39
+
40
+ from . import __version__
41
+ from .models import (
42
+ ExtractionOptions,
43
+ PredicateComparisonConfig,
44
+ PredicateTaxonomy,
45
+ ScoringConfig,
46
+ )
47
+
48
+
49
+ @click.command()
50
+ @click.argument("text", required=False)
51
+ @click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
52
+ @click.option(
53
+ "-o", "--output",
54
+ type=click.Choice(["table", "json", "xml"], case_sensitive=False),
55
+ default="table",
56
+ help="Output format (default: table)"
57
+ )
58
+ @click.option("--json", "output_json", is_flag=True, help="Output as JSON (shortcut for -o json)")
59
+ @click.option("--xml", "output_xml", is_flag=True, help="Output as XML (shortcut for -o xml)")
60
+ # Beam search options
61
+ @click.option("-b", "--beams", type=int, default=4, help="Number of beams for diverse beam search (default: 4)")
62
+ @click.option("--diversity", type=float, default=1.0, help="Diversity penalty for beam search (default: 1.0)")
63
+ @click.option("--max-tokens", type=int, default=2048, help="Maximum tokens to generate (default: 2048)")
64
+ # Deduplication options
65
+ @click.option("--no-dedup", is_flag=True, help="Disable deduplication")
66
+ @click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
67
+ @click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
68
+ @click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
69
+ # Quality options
70
+ @click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
71
+ # Taxonomy options
72
+ @click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
73
+ @click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
74
+ # Device options
75
+ @click.option("--device", type=click.Choice(["auto", "cuda", "mps", "cpu"]), default="auto", help="Device to use (default: auto)")
76
+ # Output options
77
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
78
+ @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
79
+ @click.version_option(version=__version__)
80
+ def main(
81
+ text: Optional[str],
82
+ input_file: Optional[str],
83
+ output: str,
84
+ output_json: bool,
85
+ output_xml: bool,
86
+ beams: int,
87
+ diversity: float,
88
+ max_tokens: int,
89
+ no_dedup: bool,
90
+ no_embeddings: bool,
91
+ no_merge: bool,
92
+ dedup_threshold: float,
93
+ min_confidence: float,
94
+ taxonomy: Optional[str],
95
+ taxonomy_threshold: float,
96
+ device: str,
97
+ verbose: bool,
98
+ quiet: bool,
99
+ ):
100
+ """
101
+ Extract structured statements from text.
102
+
103
+ TEXT can be provided as an argument, read from a file with -f, or piped via stdin.
104
+
105
+ \b
106
+ Examples:
107
+ corp-extractor "Apple announced a new iPhone."
108
+ corp-extractor -f article.txt --json
109
+ corp-extractor -f article.txt -o json --beams 8
110
+ cat article.txt | corp-extractor -
111
+ echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose
112
+
113
+ \b
114
+ Output formats:
115
+ table Human-readable table (default)
116
+ json JSON with full metadata
117
+ xml Raw XML from model
118
+ """
119
+ # Configure logging based on verbose flag
120
+ _configure_logging(verbose)
121
+
122
+ # Determine output format
123
+ if output_json:
124
+ output = "json"
125
+ elif output_xml:
126
+ output = "xml"
127
+
128
+ # Get input text
129
+ input_text = _get_input_text(text, input_file)
130
+ if not input_text:
131
+ raise click.UsageError(
132
+ "No input provided. Use: statement-extractor \"text\", "
133
+ "statement-extractor -f file.txt, or pipe via stdin."
134
+ )
135
+
136
+ if not quiet:
137
+ click.echo(f"Processing {len(input_text)} characters...", err=True)
138
+
139
+ # Load taxonomy if provided
140
+ predicate_taxonomy = None
141
+ if taxonomy:
142
+ predicate_taxonomy = PredicateTaxonomy.from_file(taxonomy)
143
+ if not quiet:
144
+ click.echo(f"Loaded taxonomy with {len(predicate_taxonomy.predicates)} predicates", err=True)
145
+
146
+ # Configure predicate comparison
147
+ predicate_config = PredicateComparisonConfig(
148
+ similarity_threshold=taxonomy_threshold,
149
+ dedup_threshold=dedup_threshold,
150
+ )
151
+
152
+ # Configure scoring
153
+ scoring_config = ScoringConfig(min_confidence=min_confidence)
154
+
155
+ # Configure extraction options
156
+ options = ExtractionOptions(
157
+ num_beams=beams,
158
+ diversity_penalty=diversity,
159
+ max_new_tokens=max_tokens,
160
+ deduplicate=not no_dedup,
161
+ embedding_dedup=not no_embeddings,
162
+ merge_beams=not no_merge,
163
+ predicate_taxonomy=predicate_taxonomy,
164
+ predicate_config=predicate_config,
165
+ scoring_config=scoring_config,
166
+ verbose=verbose,
167
+ )
168
+
169
+ # Import here to allow --help without loading torch
170
+ from .extractor import StatementExtractor
171
+
172
+ # Create extractor with specified device
173
+ device_arg = None if device == "auto" else device
174
+ extractor = StatementExtractor(device=device_arg)
175
+
176
+ if not quiet:
177
+ click.echo(f"Using device: {extractor.device}", err=True)
178
+
179
+ # Run extraction
180
+ try:
181
+ if output == "xml":
182
+ result = extractor.extract_as_xml(input_text, options)
183
+ click.echo(result)
184
+ elif output == "json":
185
+ result = extractor.extract_as_json(input_text, options)
186
+ click.echo(result)
187
+ else:
188
+ # Table format
189
+ result = extractor.extract(input_text, options)
190
+ _print_table(result, verbose)
191
+ except Exception as e:
192
+ logging.exception("Error extracting statements:")
193
+ raise click.ClickException(f"Extraction failed: {e}")
194
+
195
+
196
+ def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
197
+ """Get input text from argument, file, or stdin."""
198
+ if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):
199
+ # Read from stdin
200
+ return sys.stdin.read().strip()
201
+ elif input_file:
202
+ # Read from file
203
+ with open(input_file, "r", encoding="utf-8") as f:
204
+ return f.read().strip()
205
+ elif text:
206
+ return text.strip()
207
+ return None
208
+
209
+
210
+ def _print_table(result, verbose: bool):
211
+ """Print statements in a human-readable table format."""
212
+ if not result.statements:
213
+ click.echo("No statements extracted.")
214
+ return
215
+
216
+ click.echo(f"\nExtracted {len(result.statements)} statement(s):\n")
217
+ click.echo("-" * 80)
218
+
219
+ for i, stmt in enumerate(result.statements, 1):
220
+ subject_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
221
+ object_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""
222
+
223
+ click.echo(f"{i}. {stmt.subject.text}{subject_type}")
224
+ click.echo(f" --[{stmt.predicate}]-->")
225
+ click.echo(f" {stmt.object.text}{object_type}")
226
+
227
+ if verbose:
228
+ if stmt.confidence_score is not None:
229
+ click.echo(f" Confidence: {stmt.confidence_score:.2f}")
230
+
231
+ if stmt.canonical_predicate:
232
+ click.echo(f" Canonical: {stmt.canonical_predicate}")
233
+
234
+ if stmt.was_reversed:
235
+ click.echo(f" (subject/object were swapped)")
236
+
237
+ if stmt.source_text:
238
+ source = stmt.source_text[:60] + "..." if len(stmt.source_text) > 60 else stmt.source_text
239
+ click.echo(f" Source: \"{source}\"")
240
+
241
+ click.echo("-" * 80)
242
+
243
+
244
+ if __name__ == "__main__":
245
+ main()
@@ -80,11 +80,16 @@ class StatementExtractor:
80
80
 
81
81
  # Auto-detect device
82
82
  if device is None:
83
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
83
+ if torch.cuda.is_available():
84
+ self.device = "cuda"
85
+ elif torch.backends.mps.is_available():
86
+ self.device = "mps"
87
+ else:
88
+ self.device = "cpu"
84
89
  else:
85
90
  self.device = device
86
91
 
87
- # Auto-detect dtype
92
+ # Auto-detect dtype (bfloat16 only for CUDA, float32 for MPS/CPU)
88
93
  if torch_dtype is None:
89
94
  self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
90
95
  else:
@@ -175,6 +180,14 @@ class StatementExtractor:
175
180
  if options is None:
176
181
  options = ExtractionOptions()
177
182
 
183
+ logger.debug("=" * 60)
184
+ logger.debug("EXTRACTION STARTED")
185
+ logger.debug("=" * 60)
186
+ logger.debug(f"Input text length: {len(text)} chars")
187
+ logger.debug(f"Options: num_beams={options.num_beams}, diversity={options.diversity_penalty}")
188
+ logger.debug(f" merge_beams={options.merge_beams}, embedding_dedup={options.embedding_dedup}")
189
+ logger.debug(f" deduplicate={options.deduplicate}, max_new_tokens={options.max_new_tokens}")
190
+
178
191
  # Store original text for scoring
179
192
  original_text = text
180
193
 
@@ -185,6 +198,10 @@ class StatementExtractor:
185
198
  # Run extraction with retry logic
186
199
  statements = self._extract_with_scoring(text, original_text, options)
187
200
 
201
+ logger.debug("=" * 60)
202
+ logger.debug(f"EXTRACTION COMPLETE: {len(statements)} statements")
203
+ logger.debug("=" * 60)
204
+
188
205
  return ExtractionResult(
189
206
  statements=statements,
190
207
  source_text=original_text,
@@ -270,6 +287,10 @@ class StatementExtractor:
270
287
  4. Merges top beams or selects best beam
271
288
  5. Deduplicates using embeddings (if enabled)
272
289
  """
290
+ logger.debug("-" * 40)
291
+ logger.debug("PHASE 1: Tokenization")
292
+ logger.debug("-" * 40)
293
+
273
294
  # Tokenize input
274
295
  inputs = self.tokenizer(
275
296
  text,
@@ -278,48 +299,77 @@ class StatementExtractor:
278
299
  truncation=True,
279
300
  ).to(self.device)
280
301
 
302
+ input_ids = inputs["input_ids"]
303
+ logger.debug(f"Tokenized: {input_ids.shape[1]} tokens")
304
+
281
305
  # Count sentences for quality check
282
306
  num_sentences = self._count_sentences(text)
283
307
  min_expected = int(num_sentences * options.min_statement_ratio)
284
308
 
285
- logger.info(f"Input has ~{num_sentences} sentences, expecting >= {min_expected} statements")
309
+ logger.debug(f"Input has ~{num_sentences} sentences, min expected: {min_expected}")
286
310
 
287
311
  # Get beam scorer
288
312
  beam_scorer = self._get_beam_scorer(options)
289
313
 
314
+ logger.debug("-" * 40)
315
+ logger.debug("PHASE 2: Diverse Beam Search Generation")
316
+ logger.debug("-" * 40)
317
+
290
318
  all_candidates: list[list[Statement]] = []
291
319
 
292
320
  for attempt in range(options.max_attempts):
321
+ logger.debug(f"Attempt {attempt + 1}/{options.max_attempts}: Generating {options.num_beams} beams...")
322
+
293
323
  # Generate candidate beams
294
324
  candidates = self._generate_candidate_beams(inputs, options)
325
+ logger.debug(f" Generated {len(candidates)} valid XML outputs")
295
326
 
296
327
  # Parse each candidate to statements
297
328
  parsed_candidates = []
298
- for xml_output in candidates:
329
+ for i, xml_output in enumerate(candidates):
299
330
  statements = self._parse_xml_to_statements(xml_output)
300
331
  if statements:
301
332
  parsed_candidates.append(statements)
333
+ logger.debug(f" Beam {i}: {len(statements)} statements parsed")
334
+ else:
335
+ logger.debug(f" Beam {i}: 0 statements (parse failed)")
302
336
 
303
337
  all_candidates.extend(parsed_candidates)
304
338
 
305
339
  # Check if we have enough statements
306
340
  total_stmts = sum(len(c) for c in parsed_candidates)
307
- logger.info(f"Attempt {attempt + 1}/{options.max_attempts}: {len(parsed_candidates)} beams, {total_stmts} total statements")
341
+ logger.debug(f" Total: {len(parsed_candidates)} beams, {total_stmts} statements")
308
342
 
309
343
  if total_stmts >= min_expected:
344
+ logger.debug(f" Sufficient statements ({total_stmts} >= {min_expected}), stopping")
310
345
  break
311
346
 
312
347
  if not all_candidates:
348
+ logger.debug("No valid candidates generated, returning empty result")
313
349
  return []
314
350
 
351
+ logger.debug("-" * 40)
352
+ logger.debug("PHASE 3: Beam Selection/Merging")
353
+ logger.debug("-" * 40)
354
+
315
355
  # Select or merge beams
316
356
  if options.merge_beams:
357
+ logger.debug(f"Merging {len(all_candidates)} beams...")
317
358
  statements = beam_scorer.merge_beams(all_candidates, original_text)
359
+ logger.debug(f" After merge: {len(statements)} statements")
318
360
  else:
361
+ logger.debug(f"Selecting best beam from {len(all_candidates)} candidates...")
319
362
  statements = beam_scorer.select_best_beam(all_candidates, original_text)
363
+ logger.debug(f" Selected beam has {len(statements)} statements")
364
+
365
+ logger.debug("-" * 40)
366
+ logger.debug("PHASE 4: Deduplication")
367
+ logger.debug("-" * 40)
320
368
 
321
369
  # Apply embedding-based deduplication if enabled
322
370
  if options.embedding_dedup and options.deduplicate:
371
+ logger.debug("Using embedding-based deduplication...")
372
+ pre_dedup_count = len(statements)
323
373
  try:
324
374
  comparer = self._get_predicate_comparer(options)
325
375
  if comparer:
@@ -327,14 +377,32 @@ class StatementExtractor:
327
377
  statements,
328
378
  entity_canonicalizer=options.entity_canonicalizer
329
379
  )
380
+ logger.debug(f" After embedding dedup: {len(statements)} statements (removed {pre_dedup_count - len(statements)})")
381
+
330
382
  # Also normalize predicates if taxonomy provided
331
383
  if options.predicate_taxonomy or self._predicate_taxonomy:
384
+ logger.debug("Normalizing predicates to taxonomy...")
332
385
  statements = comparer.normalize_predicates(statements)
333
386
  except Exception as e:
334
387
  logger.warning(f"Embedding deduplication failed, falling back to exact match: {e}")
335
388
  statements = self._deduplicate_statements_exact(statements, options)
389
+ logger.debug(f" After exact dedup: {len(statements)} statements")
336
390
  elif options.deduplicate:
391
+ logger.debug("Using exact text deduplication...")
392
+ pre_dedup_count = len(statements)
337
393
  statements = self._deduplicate_statements_exact(statements, options)
394
+ logger.debug(f" After exact dedup: {len(statements)} statements (removed {pre_dedup_count - len(statements)})")
395
+ else:
396
+ logger.debug("Deduplication disabled")
397
+
398
+ # Log final statements
399
+ logger.debug("-" * 40)
400
+ logger.debug("FINAL STATEMENTS:")
401
+ logger.debug("-" * 40)
402
+ for i, stmt in enumerate(statements):
403
+ conf = f" (conf={stmt.confidence_score:.2f})" if stmt.confidence_score else ""
404
+ canonical = f" -> {stmt.canonical_predicate}" if stmt.canonical_predicate else ""
405
+ logger.debug(f" {i+1}. {stmt.subject.text} --[{stmt.predicate}{canonical}]--> {stmt.object.text}{conf}")
338
406
 
339
407
  return statements
340
408
 
@@ -350,12 +418,16 @@ class StatementExtractor:
350
418
  outputs = self.model.generate(
351
419
  **inputs,
352
420
  max_new_tokens=options.max_new_tokens,
421
+ max_length=None, # Override model default, use max_new_tokens only
353
422
  num_beams=num_seqs,
354
423
  num_beam_groups=num_seqs,
355
424
  num_return_sequences=num_seqs,
356
425
  diversity_penalty=options.diversity_penalty,
357
426
  do_sample=False,
427
+ top_p=None, # Override model config to suppress warning
428
+ top_k=None, # Override model config to suppress warning
358
429
  trust_remote_code=True,
430
+ custom_generate="transformers-community/group-beam-search",
359
431
  )
360
432
 
361
433
  # Decode and process candidates
@@ -280,5 +280,11 @@ class ExtractionOptions(BaseModel):
280
280
  description="Use embedding similarity for predicate deduplication"
281
281
  )
282
282
 
283
+ # Verbose logging
284
+ verbose: bool = Field(
285
+ default=False,
286
+ description="Enable verbose logging for debugging"
287
+ )
288
+
283
289
  class Config:
284
290
  arbitrary_types_allowed = True # Allow Callable type
@@ -83,7 +83,12 @@ class PredicateComparer:
83
83
  # Auto-detect device
84
84
  if device is None:
85
85
  import torch
86
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
86
+ if torch.cuda.is_available():
87
+ self.device = "cuda"
88
+ elif torch.backends.mps.is_available():
89
+ self.device = "mps"
90
+ else:
91
+ self.device = "cpu"
87
92
  else:
88
93
  self.device = device
89
94
 
@@ -289,6 +294,8 @@ class PredicateComparer:
289
294
  Returns:
290
295
  Deduplicated list of statements (keeps best contextualized match)
291
296
  """
297
+ logger.debug(f"Embedding deduplication: {len(statements)} statements, detect_reversals={detect_reversals}")
298
+
292
299
  if len(statements) <= 1:
293
300
  return statements
294
301
 
@@ -297,27 +304,33 @@ class PredicateComparer:
297
304
  return entity_canonicalizer(text)
298
305
  return text.lower().strip()
299
306
 
307
+ logger.debug(" Computing predicate embeddings...")
300
308
  # Compute all predicate embeddings at once for efficiency
301
309
  predicates = [s.predicate for s in statements]
302
310
  pred_embeddings = self._compute_embeddings(predicates)
311
+ logger.debug(f" Computed {len(pred_embeddings)} predicate embeddings")
303
312
 
313
+ logger.debug(" Computing contextualized embeddings (S P O)...")
304
314
  # Compute contextualized embeddings: "Subject Predicate Object" for each statement
305
315
  contextualized_texts = [
306
316
  f"{s.subject.text} {s.predicate} {s.object.text}" for s in statements
307
317
  ]
308
318
  contextualized_embeddings = self._compute_embeddings(contextualized_texts)
309
319
 
320
+ logger.debug(" Computing reversed embeddings (O P S)...")
310
321
  # Compute reversed contextualized embeddings: "Object Predicate Subject"
311
322
  reversed_texts = [
312
323
  f"{s.object.text} {s.predicate} {s.subject.text}" for s in statements
313
324
  ]
314
325
  reversed_embeddings = self._compute_embeddings(reversed_texts)
315
326
 
327
+ logger.debug(" Computing source text embeddings...")
316
328
  # Compute source text embeddings for scoring which duplicate to keep
317
329
  source_embeddings = []
318
330
  for stmt in statements:
319
331
  source_text = stmt.source_text or f"{stmt.subject.text} {stmt.predicate} {stmt.object.text}"
320
332
  source_embeddings.append(self._compute_embeddings([source_text])[0])
333
+ logger.debug(" All embeddings computed, starting comparison loop...")
321
334
 
322
335
  unique_statements: list[Statement] = []
323
336
  unique_pred_embeddings: list[np.ndarray] = []
@@ -358,9 +371,17 @@ class PredicateComparer:
358
371
  if similarity >= self.config.dedup_threshold:
359
372
  duplicate_idx = j
360
373
  is_reversed_match = reversed_match and not direct_match
374
+ match_type = "reversed" if is_reversed_match else "direct"
375
+ logger.debug(
376
+ f" [{i}] DUPLICATE of [{unique_indices[j]}] ({match_type}, sim={similarity:.3f}): "
377
+ f"'{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
378
+ )
361
379
  break
362
380
 
363
381
  if duplicate_idx is None:
382
+ logger.debug(
383
+ f" [{i}] UNIQUE: '{stmt.subject.text}' --[{stmt.predicate}]--> '{stmt.object.text}'"
384
+ )
364
385
  # Not a duplicate - add to unique list
365
386
  unique_statements.append(stmt)
366
387
  unique_pred_embeddings.append(pred_embeddings[i])
@@ -451,6 +472,7 @@ class PredicateComparer:
451
472
  merged_stmt = existing_stmt.merge_entity_types_from(stmt)
452
473
  unique_statements[duplicate_idx] = merged_stmt
453
474
 
475
+ logger.debug(f" Deduplication complete: {len(statements)} -> {len(unique_statements)} statements")
454
476
  return unique_statements
455
477
 
456
478
  def normalize_predicates(
@@ -6,10 +6,13 @@ Provides:
6
6
  - BeamScorer: Score and select/merge beams based on quality metrics
7
7
  """
8
8
 
9
+ import logging
9
10
  from typing import Optional
10
11
 
11
12
  from .models import ScoringConfig, Statement
12
13
 
14
+ logger = logging.getLogger(__name__)
15
+
13
16
 
14
17
  class TripleScorer:
15
18
  """
@@ -32,6 +35,7 @@ class TripleScorer:
32
35
  Higher scores indicate better grounding in source text.
33
36
  """
34
37
  if not source_text:
38
+ logger.debug(f" No source text, returning neutral score 0.5")
35
39
  return 0.5 # Neutral score if no source text
36
40
 
37
41
  score = 0.0
@@ -53,6 +57,7 @@ class TripleScorer:
53
57
  weights_sum += 0.2
54
58
 
55
59
  # Check proximity - subject and object in same/nearby region (weight: 0.2)
60
+ proximity_score = 0.0
56
61
  if subject_found and object_found:
57
62
  proximity_score = self._compute_proximity(
58
63
  statement.subject.text,
@@ -62,7 +67,14 @@ class TripleScorer:
62
67
  score += 0.2 * proximity_score
63
68
  weights_sum += 0.2
64
69
 
65
- return score / weights_sum if weights_sum > 0 else 0.0
70
+ final_score = score / weights_sum if weights_sum > 0 else 0.0
71
+
72
+ logger.debug(
73
+ f" Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
74
+ f"{final_score:.2f} (subj={subject_found}, obj={object_found}, pred={predicate_grounded}, prox={proximity_score:.2f})"
75
+ )
76
+
77
+ return final_score
66
78
 
67
79
  def find_evidence_span(
68
80
  self,
@@ -347,10 +359,12 @@ class BeamScorer:
347
359
  return []
348
360
 
349
361
  top_n = top_n or self.config.merge_top_n
362
+ logger.debug(f"Merging beams: {len(candidates)} candidates, selecting top {top_n}")
350
363
 
351
364
  # Score each beam
352
365
  scored_beams = []
353
- for beam in candidates:
366
+ for i, beam in enumerate(candidates):
367
+ logger.debug(f" Scoring beam {i} ({len(beam)} statements)...")
354
368
  for stmt in beam:
355
369
  if stmt.confidence_score is None:
356
370
  stmt.confidence_score = self.triple_scorer.score_triple(stmt, source_text)
@@ -359,31 +373,36 @@ class BeamScorer:
359
373
 
360
374
  beam_score = self.score_beam(beam, source_text)
361
375
  scored_beams.append((beam_score, beam))
376
+ logger.debug(f" Beam {i} score: {beam_score:.3f}")
362
377
 
363
378
  # Sort and take top N
364
379
  scored_beams.sort(key=lambda x: x[0], reverse=True)
365
380
  top_beams = [beam for _, beam in scored_beams[:top_n]]
381
+ logger.debug(f" Selected top {len(top_beams)} beams")
366
382
 
367
383
  # Pool all triples
368
384
  all_statements: list[Statement] = []
369
385
  for beam in top_beams:
370
386
  all_statements.extend(beam)
387
+ logger.debug(f" Pooled {len(all_statements)} statements from top beams")
371
388
 
372
389
  # Filter by confidence threshold
373
390
  min_conf = self.config.min_confidence
374
391
  filtered = [s for s in all_statements if (s.confidence_score or 0) >= min_conf]
392
+ logger.debug(f" After confidence filter (>={min_conf}): {len(filtered)} statements")
375
393
 
376
- # Filter out statements where source_text doesn't support the predicate
377
- # This catches model hallucinations where predicate doesn't match the evidence
378
- consistent = [
379
- s for s in filtered
380
- if self._source_text_supports_predicate(s)
381
- ]
394
+ # # Filter out statements where source_text doesn't support the predicate
395
+ # # This catches model hallucinations where predicate doesn't match the evidence
396
+ # consistent = [
397
+ # s for s in filtered
398
+ # if self._source_text_supports_predicate(s)
399
+ # ]
400
+ # logger.debug(f" After predicate consistency filter: {len(consistent)} statements")
382
401
 
383
402
  # Deduplicate - keep highest confidence for each (subject, predicate, object)
384
403
  # Note: Same subject+predicate with different objects is valid (e.g., "Apple announced X and Y")
385
404
  seen: dict[tuple[str, str, str], Statement] = {}
386
- for stmt in consistent:
405
+ for stmt in all_statements:
387
406
  key = (
388
407
  stmt.subject.text.lower(),
389
408
  stmt.predicate.lower(),
@@ -392,7 +411,10 @@ class BeamScorer:
392
411
  if key not in seen or (stmt.confidence_score or 0) > (seen[key].confidence_score or 0):
393
412
  seen[key] = stmt
394
413
 
395
- return list(seen.values())
414
+ result = list(seen.values())
415
+ logger.debug(f" After deduplication: {len(result)} unique statements")
416
+
417
+ return result
396
418
 
397
419
  def _source_text_supports_predicate(self, stmt: Statement) -> bool:
398
420
  """
@@ -1,9 +0,0 @@
1
- statement_extractor/__init__.py,sha256=4Ht8GJdgik_iti7zpG71Oi5EEAnck6AYDvy7soRqIOg,2967
2
- statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
- statement_extractor/extractor.py,sha256=PX0SiJnYUnh06seyH5W77FcPpcvLXwEM8IGsuVuRh0Q,22158
4
- statement_extractor/models.py,sha256=xDF3pDPhIiqiMwFMPV94aBEgZGbSe-x2TkshahOiCog,10739
5
- statement_extractor/predicate_comparer.py,sha256=iwBfNJFNOFv8ODKN9F9EtmknpCeSThOpnu6P_PJSmgE,24898
6
- statement_extractor/scoring.py,sha256=Wa1BW6jXtHD7dZkUXwdwE39hwFo2ko6BuIogBc4E2Lk,14493
7
- corp_extractor-0.2.3.dist-info/METADATA,sha256=dCJbLWIj7hgzpkC4zYvNmnEAhNnizUEq_caea6AamIU,10724
8
- corp_extractor-0.2.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
9
- corp_extractor-0.2.3.dist-info/RECORD,,