corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +460 -21
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +32 -47
  8. statement_extractor/gliner_extraction.py +218 -0
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +15 -6
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. statement_extractor/scoring.py +17 -69
  52. corp_extractor-0.3.0.dist-info/RECORD +0 -12
  53. statement_extractor/spacy_extraction.py +0 -386
  54. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  55. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: corp-extractor
- Version: 0.3.0
+ Version: 0.5.0
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -23,11 +23,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Text Processing :: Linguistic
  Requires-Python: >=3.10
+ Requires-Dist: accelerate>=1.12.0
  Requires-Dist: click>=8.0.0
+ Requires-Dist: gguf>=0.17.1
+ Requires-Dist: gliner2
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: pydantic>=2.0.0
  Requires-Dist: sentence-transformers>=2.2.0
- Requires-Dist: spacy>=3.5.0
  Requires-Dist: torch>=2.0.0
  Requires-Dist: transformers>=5.0.0rc3
  Provides-Extra: dev
@@ -35,6 +37,8 @@ Requires-Dist: mypy>=1.0.0; extra == 'dev'
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
  Requires-Dist: pytest>=7.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
+ Provides-Extra: llm
+ Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
  Description-Content-Type: text/markdown

  # Corp Extractor
@@ -47,20 +51,18 @@ Extract structured subject-predicate-object statements from unstructured text us

  ## Features

+ - **6-Stage Pipeline** *(v0.5.0)*: Modular plugin-based architecture for full entity resolution
  - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
  - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
- - **Combined Quality Scoring** *(v0.3.0)*: Confidence combines semantic similarity (50%) + subject/object noun scores (25% each)
- - **spaCy-First Predicates** *(v0.3.0)*: Always uses spaCy for predicate extraction (model predicates are unreliable)
- - **Multi-Candidate Extraction** *(v0.3.0)*: Generates 3 candidates per statement (hybrid, spaCy-only, predicate-split)
- - **Best Triple Selection** *(v0.3.0)*: Keeps only highest-scoring triple per source (use `--all-triples` to keep all)
- - **Extraction Method Tracking** *(v0.3.0)*: Each statement includes `extraction_method` field (hybrid, spacy, split, model)
- - **Beam Merging** *(v0.2.0)*: Combines top beams for better coverage instead of picking one
- - **Embedding-based Dedup** *(v0.2.0)*: Uses semantic similarity to detect near-duplicate predicates
- - **Predicate Taxonomies** *(v0.2.0)*: Normalize predicates to canonical forms via embeddings
- - **Contextualized Matching** *(v0.2.2)*: Compares full "Subject Predicate Object" against source text for better accuracy
- - **Entity Type Merging** *(v0.2.3)*: Automatically merges UNKNOWN entity types with specific types during deduplication
- - **Reversal Detection** *(v0.2.3)*: Detects and corrects subject-object reversals using embedding comparison
- - **Command Line Interface** *(v0.2.4)*: Full-featured CLI for terminal usage
+ - **Entity Qualification** *(v0.5.0)*: Adds roles, identifiers (LEI, ticker, company numbers) via external APIs
+ - **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+ - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
+ - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
+ - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
+ - **Beam Merging**: Combines top beams for better coverage instead of picking one
+ - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
+ - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
+ - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `plugins` commands
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries

  ## Installation
@@ -69,7 +71,7 @@ Extract structured subject-predicate-object statements from unstructured text us
  pip install corp-extractor
  ```

- The spaCy model for predicate inference is downloaded automatically on first use.
+ The GLiNER2 model (205M params) is downloaded automatically on first use.

  **Note**: This package requires `transformers>=5.0.0` for T5-Gemma2 model support.

@@ -134,62 +136,74 @@ uvx corp-extractor "Apple announced a new iPhone."

  ### Usage Examples

- ```bash
- # Extract from text argument
- corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
-
- # Extract from file
- corp-extractor -f article.txt
+ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.

- # Pipe from stdin
- cat article.txt | corp-extractor -
+ ```bash
+ # Simple extraction (Stage 1 only, fast)
+ corp-extractor split "Apple Inc. announced the iPhone 15."
+ corp-extractor split -f article.txt --json
+
+ # Full 6-stage pipeline (entity resolution, canonicalization, labeling, taxonomy)
+ corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
+ corp-extractor pipeline -f article.txt --stages 1-3
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
+
+ # Plugin management
+ corp-extractor plugins list
+ corp-extractor plugins list --stage 3
+ corp-extractor plugins info gleif_qualifier
+ ```

- # Output as JSON
- corp-extractor "Tim Cook is CEO of Apple." --json
+ ### Split Command (Simple Extraction)

- # Output as XML
- corp-extractor -f article.txt --xml
+ ```bash
+ corp-extractor split "Tim Cook is CEO of Apple." --json
+ corp-extractor split -f article.txt --beams 8 --verbose
+ cat article.txt | corp-extractor split -
+ ```

- # Verbose output with confidence scores
- corp-extractor -f article.txt --verbose
+ ### Pipeline Command (Full Entity Resolution)

- # Use more beams for better quality
- corp-extractor -f article.txt --beams 8
+ ```bash
+ # Run all 6 stages
+ corp-extractor pipeline "Apple CEO Tim Cook announced..."

- # Use custom predicate taxonomy
- corp-extractor -f article.txt --taxonomy predicates.txt
+ # Run specific stages
+ corp-extractor pipeline "..." --stages 1-3       # Stages 1, 2, 3
+ corp-extractor pipeline "..." --stages 1,2,5     # Stages 1, 2, 5
+ corp-extractor pipeline "..." --skip-stages 4,5  # Skip stages 4 and 5

- # Use GPU explicitly
- corp-extractor -f article.txt --device cuda
+ # Plugin selection
+ corp-extractor pipeline "..." --plugins gleif,companies_house
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
  ```

- ### CLI Options
+ ### CLI Reference

  ```
- Usage: corp-extractor [OPTIONS] [TEXT]
+ Usage: corp-extractor [COMMAND] [OPTIONS]
+
+ Commands:
+   split     Simple extraction (T5-Gemma only)
+   pipeline  Full 6-stage pipeline with entity resolution
+   plugins   List or inspect available plugins

- Options:
+ Split Options:
    -f, --file PATH               Read input from file
    -o, --output [table|json|xml]  Output format (default: table)
-   --json                        Output as JSON (shortcut)
-   --xml                         Output as XML (shortcut)
+   --json / --xml                Output format shortcuts
    -b, --beams INTEGER           Number of beams (default: 4)
-   --diversity FLOAT             Diversity penalty (default: 1.0)
-   --max-tokens INTEGER          Max tokens to generate (default: 2048)
-   --no-dedup                    Disable deduplication
-   --no-embeddings               Disable embedding-based dedup (faster)
-   --no-merge                    Disable beam merging
-   --no-spacy                    Disable spaCy extraction (use raw model output)
-   --all-triples                 Keep all candidate triples (default: best per source)
-   --dedup-threshold FLOAT       Deduplication threshold (default: 0.65)
-   --min-confidence FLOAT        Min confidence filter (default: 0)
-   --taxonomy PATH               Load predicate taxonomy from file
-   --taxonomy-threshold FLOAT    Taxonomy matching threshold (default: 0.5)
+   --no-gliner                   Disable GLiNER2 extraction
+   --predicates TEXT             Comma-separated predicates for relation extraction
    --device [auto|cuda|mps|cpu]  Device to use (default: auto)
    -v, --verbose                 Show confidence scores and metadata
-   -q, --quiet                   Suppress progress messages
-   --version                     Show version
-   --help                        Show this message
+
+ Pipeline Options:
+   --stages TEXT                 Stages to run (e.g., '1-3' or '1,2,5')
+   --skip-stages TEXT            Stages to skip (e.g., '4,5')
+   --plugins TEXT                Enable only these plugins (comma-separated)
+   --disable-plugins TEXT        Disable these plugins (comma-separated)
+   -o, --output [table|json|yaml|triples]  Output format
  ```

  ## New in v0.2.0: Quality Scoring & Beam Merging
@@ -283,69 +297,194 @@ for stmt in fixed_statements:

  During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.

- ## New in v0.3.0: spaCy-First Extraction & Semantic Scoring
+ ## New in v0.5.0: Pipeline Architecture
+
+ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
+
+ ### Pipeline Stages
+
+ | Stage | Name | Input | Output | Key Tech |
+ |-------|------|-------|--------|----------|
+ | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
+ | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
+ | 3 | Qualification | Entities | `QualifiedEntity[]` | Gemma3, APIs |
+ | 4 | Canonicalization | `QualifiedEntity[]` | `CanonicalEntity[]` | Fuzzy matching |
+ | 5 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+ | 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+
+ ### Pipeline Python API
+
+ ```python
+ from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig
+
+ # Run full pipeline
+ pipeline = ExtractionPipeline()
+ ctx = pipeline.process("Amazon CEO Andy Jassy announced plans to hire workers.")
+
+ # Access results at each stage
+ print(f"Raw triples: {len(ctx.raw_triples)}")
+ print(f"Statements: {len(ctx.statements)}")
+ print(f"Labeled: {len(ctx.labeled_statements)}")
+
+ # Output with fully qualified names
+ for stmt in ctx.labeled_statements:
+     print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+     # e.g., "Andy Jassy (CEO, Amazon) --[announced]--> plans to hire workers"
+ ```
+
+ ### Pipeline Configuration
+
+ ```python
+ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
+
+ # Run only specific stages
+ config = PipelineConfig(
+     enabled_stages={1, 2, 3},  # Skip canonicalization and labeling
+     disabled_plugins={"sec_edgar_qualifier"},  # Disable specific plugins
+ )
+ pipeline = ExtractionPipeline(config)
+ ctx = pipeline.process(text)
+
+ # Alternative: create config from stage string
+ config = PipelineConfig.from_stage_string("1-3")  # Stages 1, 2, 3
+ ```
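
For reference, a minimal sketch of how a stage string such as "1-3" or "1,2,5" can expand into a set of stage numbers; the helper below is illustrative and is not the package's actual `PipelineConfig.from_stage_string` implementation.

```python
def parse_stage_string(spec: str) -> set[int]:
    """Expand a stage spec like '1-3' or '1,2,5' into a set of stage numbers."""
    stages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        if "-" in part:
            # Range form, e.g. "1-3" -> {1, 2, 3}
            start, end = part.split("-", 1)
            stages.update(range(int(start), int(end) + 1))
        elif part:
            # Single stage, e.g. "5"
            stages.add(int(part))
    return stages

assert parse_stage_string("1-3") == {1, 2, 3}
assert parse_stage_string("1,2,5") == {1, 2, 5}
```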
+
+ ### Built-in Plugins
+
+ **Splitters (Stage 1):**
+ - t5_gemma_splitter - T5-Gemma2 statement extraction
+
+ **Extractors (Stage 2):**
+ - gliner2_extractor - GLiNER2 entity recognition and relation extraction
+
+ **Qualifiers (Stage 3):**
+ - person_qualifier - PERSON → role, org (uses Gemma3)
+ - gleif_qualifier - ORG → LEI, jurisdiction (GLEIF API)
+ - companies_house_qualifier - ORG → UK company number
+ - sec_edgar_qualifier - ORG → SEC CIK, ticker
+
+ **Canonicalizers (Stage 4):**
+ - organization_canonicalizer - ORG canonical names
+ - person_canonicalizer - PERSON name variants

- v0.3.0 introduces significant improvements to extraction quality:
+ **Labelers (Stage 5):**
+ - sentiment_labeler - Statement sentiment analysis

- ### spaCy-First Predicate Extraction
+ **Taxonomy Classifiers (Stage 6):**
+ - mnli_taxonomy_classifier - MNLI zero-shot classification against ESG taxonomy
+ - embedding_taxonomy_classifier - Embedding similarity-based taxonomy classification

- The T5-Gemma model is excellent at:
+ Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
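
As an illustration of the embedding-based, multi-label behaviour described above, the sketch below returns every taxonomy label whose similarity to a statement clears a threshold. It uses sentence-transformers (a declared dependency of the package), but the encoder name, labels, and threshold are placeholders, not the plugin's actual configuration.

```python
# Illustrative multi-label taxonomy classification via embedding similarity.
# Every label above the threshold is kept, not just the single best match.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder encoder choice
taxonomy_labels = ["employee relations", "emissions reduction", "executive change"]

def classify(statement: str, threshold: float = 0.4) -> list[tuple[str, float]]:
    stmt_emb = model.encode(statement, convert_to_tensor=True)
    label_embs = model.encode(taxonomy_labels, convert_to_tensor=True)
    scores = util.cos_sim(stmt_emb, label_embs)[0]
    return [
        (label, float(score))
        for label, score in zip(taxonomy_labels, scores)
        if float(score) >= threshold
    ]

print(classify("Amazon announced plans to hire 100,000 workers"))
```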
+
+ ## New in v0.4.0: GLiNER2 Integration
+
+ v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
+
+ ### Why GLiNER2?
+
+ The T5-Gemma model excels at:
  - **Triple isolation** - identifying that a relationship exists
  - **Coreference resolution** - resolving pronouns to named entities

- But unreliable at:
- - **Predicate extraction** - often returns empty or wrong predicates
+ GLiNER2 now handles:
+ - **Entity recognition** - refining subject/object boundaries
+ - **Relation extraction** - using 324 default predicates across 21 categories
+ - **Entity scoring** - scoring how "entity-like" subjects/objects are
+ - **Confidence scoring** - real confidence values via `include_confidence=True`
+
+ ### Default Predicates
+
+ GLiNER2 uses **324 predicates** organized into 21 categories (ownership, employment, funding, etc.). These are loaded from `default_predicates.json` and include descriptions and confidence thresholds.
+
+ **Key features:**
+ - **All matches returned** - Every matching relation is returned, not just the best one
+ - **Category-based extraction** - Iterates through categories to stay under GLiNER2's ~25 label limit
+ - **Custom predicate files** - Provide your own JSON file with custom predicates
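
The category-based batching described above can be pictured with a short, illustrative sketch. The JSON shape and the `run_relation_extraction` callable are assumptions (the real `default_predicates.json` also carries descriptions and confidence thresholds); only the batching idea is the point.

```python
# Illustrative sketch: group predicates by category and submit them in batches
# so each call stays under GLiNER2's ~25 label limit. Every match from every
# batch is kept, not just the best one.
import json

MAX_LABELS = 25

def iter_predicate_batches(predicates_path: str):
    with open(predicates_path) as f:
        categories = json.load(f)  # assumed, simplified shape: {category: [predicate, ...]}
    for category, predicates in categories.items():
        for i in range(0, len(predicates), MAX_LABELS):
            yield category, predicates[i : i + MAX_LABELS]

def extract_all_relations(text: str, predicates_path: str, run_relation_extraction):
    """run_relation_extraction is a hypothetical stand-in for the real GLiNER2 call."""
    matches = []
    for _category, batch in iter_predicate_batches(predicates_path):
        matches.extend(run_relation_extraction(text, labels=batch))
    return matches
```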
+
+ ### Extraction Modes
+
+ **Mode 1: Default Predicates** (recommended)
+ ```python
+ from statement_extractor import extract_statements
+
+ # Uses 324 built-in predicates automatically
+ result = extract_statements("John works for Apple Inc. in Cupertino.")
+ # Returns ALL matching relations
+ ```
+
+ **Mode 2: Custom Predicate List**
+ ```python
+ from statement_extractor import extract_statements, ExtractionOptions
+
+ options = ExtractionOptions(predicates=["works_for", "founded", "acquired", "headquartered_in"])
+ result = extract_statements("John works for Apple Inc. in Cupertino.", options)
+ ```

- **Solution:** v0.3.0 always uses spaCy for predicate extraction. The model provides subject, object, entity types, and source text; spaCy provides the predicate.
+ Or via CLI:
+ ```bash
+ corp-extractor "John works for Apple Inc." --predicates "works_for,founded,acquired"
+ ```

- ### Three Candidate Extraction Methods
+ **Mode 3: Custom Predicate File**
+ ```python
+ from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig

- For each statement, three candidates are generated and the best is selected:
+ config = PipelineConfig(
+     extractor_options={"predicates_file": "/path/to/custom_predicates.json"}
+ )
+ pipeline = ExtractionPipeline(config)
+ ctx = pipeline.process("John works for Apple Inc.")
+ ```
+
+ Or via CLI:
+ ```bash
+ corp-extractor pipeline "John works for Apple Inc." --predicates-file custom_predicates.json
+ ```
+
+ ### Two Candidate Extraction Methods
+
+ For each statement, two candidates are generated and the best is selected:

  | Method | Description |
  |--------|-------------|
- | `hybrid` | Model subject/object + spaCy predicate |
- | `spacy` | All components from spaCy dependency parsing |
- | `split` | Source text split around the predicate |
+ | `hybrid` | Model subject/object + GLiNER2/extracted predicate |
+ | `gliner` | All components refined by GLiNER2 entity recognition |

  ```python
  for stmt in result:
      print(f"{stmt.subject.text} --[{stmt.predicate}]--> {stmt.object.text}")
-     print(f"  Method: {stmt.extraction_method}")  # hybrid, spacy, split, or model
+     print(f"  Method: {stmt.extraction_method}")  # hybrid, gliner, or model
      print(f"  Confidence: {stmt.confidence_score:.2f}")
  ```

  ### Combined Quality Scoring

- Confidence scores combine **semantic similarity** and **grammatical accuracy**:
+ Confidence scores combine **semantic similarity** and **entity recognition**:

  | Component | Weight | Description |
  |-----------|--------|-------------|
  | Semantic similarity | 50% | Cosine similarity between source text and reassembled triple |
- | Subject noun score | 25% | How noun-like the subject is |
- | Object noun score | 25% | How noun-like the object is |
-
- **Noun scoring:**
- - Proper noun(s) only: 1.0
- - Common noun(s) only: 0.8
- - Contains noun + other words: 0.4-0.8 (based on ratio)
- - No nouns: 0.2
+ | Subject entity score | 25% | How entity-like the subject is (via GLiNER2) |
+ | Object entity score | 25% | How entity-like the object is (via GLiNER2) |

- This ensures extracted subjects and objects are grammatically valid entities, not fragments or verb phrases.
+ **Entity scoring (via GLiNER2):**
+ - Recognized entity with high confidence: 1.0
+ - Recognized entity with moderate confidence: 0.8
+ - Partially recognized: 0.6
+ - Not recognized: 0.2
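
A minimal sketch of the combined score using the weights from the table above (50% semantic similarity, 25% subject entity score, 25% object entity score); inputs are assumed to already be normalised to the 0-1 range, and the function name is illustrative.

```python
def combined_confidence(
    semantic_similarity: float,
    subject_entity_score: float,
    object_entity_score: float,
) -> float:
    # Weights taken from the scoring table above.
    return (
        0.50 * semantic_similarity
        + 0.25 * subject_entity_score
        + 0.25 * object_entity_score
    )

# e.g. a well-grounded triple with two confidently recognized entities
print(combined_confidence(0.9, 1.0, 0.8))  # 0.90
```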

  ### Extraction Method Tracking

- Each statement now includes an `extraction_method` field:
- - `hybrid` - Model subject/object + spaCy predicate
- - `spacy` - All components from spaCy dependency parsing
- - `split` - Subject/object from splitting source text around predicate
- - `model` - All components from T5-Gemma model (only when `--no-spacy`)
+ Each statement includes an `extraction_method` field:
+ - `hybrid` - Model subject/object + GLiNER2 predicate
+ - `gliner` - All components refined by GLiNER2 entity recognition
+ - `model` - All components from T5-Gemma model (only when `--no-gliner`)

  ### Best Triple Selection

- By default, only the **highest-scoring triple** is kept for each source sentence. This ensures clean output without redundant candidates.
+ By default, only the **highest-scoring triple** is kept for each source sentence.

- To keep all candidate triples (for debugging or analysis):
+ To keep all candidate triples:
  ```python
  options = ExtractionOptions(all_triples=True)
  result = extract_statements(text, options)
@@ -356,15 +495,15 @@ Or via CLI:
  corp-extractor "Your text" --all-triples --verbose
  ```

- **Disable spaCy extraction** to use only model output:
+ **Disable GLiNER2 extraction** to use only model output:
  ```python
- options = ExtractionOptions(use_spacy_extraction=False)
+ options = ExtractionOptions(use_gliner_extraction=False)
  result = extract_statements(text, options)
  ```

  Or via CLI:
  ```bash
- corp-extractor "Your text" --no-spacy
+ corp-extractor "Your text" --no-gliner
  ```

  ## Disable Embeddings
@@ -436,14 +575,14 @@ for text in texts:
  This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):

  1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with diversity penalty
- 2. **Quality Scoring** *(v0.2.0)*: Each triple scored for groundedness in source text
- 3. **Beam Merging** *(v0.2.0)*: Top beams combined for better coverage
- 4. **Embedding Dedup** *(v0.2.0)*: Semantic similarity removes near-duplicate predicates
- 5. **Predicate Normalization** *(v0.2.0)*: Optional taxonomy matching via embeddings
- 6. **Contextualized Matching** *(v0.2.2)*: Full statement context used for canonicalization and dedup
- 7. **Entity Type Merging** *(v0.2.3)*: UNKNOWN types merged with specific types during dedup
- 8. **Reversal Detection** *(v0.2.3)*: Subject-object reversals detected and corrected via embedding comparison
- 9. **Hybrid spaCy** *(v0.2.12)*: spaCy candidates added to pool alongside model output for better coverage
+ 2. **Quality Scoring**: Each triple scored for groundedness in source text
+ 3. **Beam Merging**: Top beams combined for better coverage
+ 4. **Embedding Dedup**: Semantic similarity removes near-duplicate predicates
+ 5. **Predicate Normalization**: Optional taxonomy matching via embeddings
+ 6. **Contextualized Matching**: Full statement context used for canonicalization and dedup
+ 7. **Entity Type Merging**: UNKNOWN types merged with specific types during dedup
+ 8. **Reversal Detection**: Subject-object reversals detected and corrected via embedding comparison
+ 9. **GLiNER2 Extraction** *(v0.4.0)*: Entity recognition and relation extraction for improved accuracy
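
Step 4 (embedding dedup) can be sketched as a greedy filter over predicate embeddings using sentence-transformers (a declared dependency). The encoder, the 0.65 threshold (the former CLI default), and the greedy strategy are illustrative, not the library's exact implementation.

```python
# Illustrative greedy dedup: a predicate is dropped when its embedding is
# closer than the threshold to any predicate already kept.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder encoder choice

def dedup_predicates(predicates: list[str], threshold: float = 0.65) -> list[str]:
    kept: list[str] = []
    kept_embs = []
    for pred in predicates:
        emb = model.encode(pred, convert_to_tensor=True)
        if any(float(util.cos_sim(emb, k)[0][0]) >= threshold for k in kept_embs):
            continue  # near-duplicate of a predicate we already kept
        kept.append(pred)
        kept_embs.append(emb)
    return kept

print(dedup_predicates(["announced", "revealed", "acquired"]))
```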

  ## Requirements

@@ -452,7 +591,7 @@ This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam
  - Transformers 5.0+
  - Pydantic 2.0+
  - sentence-transformers 2.2+
- - spaCy 3.5+ (model downloaded automatically on first use)
+ - GLiNER2 (model downloaded automatically on first use)
  - ~2GB VRAM (GPU) or ~4GB RAM (CPU)

  ## Links
@@ -0,0 +1,55 @@
+ statement_extractor/__init__.py,sha256=Lmgw3jtwrfu09mXSfNFCB5AN0J6tsEQ2uOrrQciMrtI,3215
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+ statement_extractor/cli.py,sha256=iqsqvLAN0FMRoE4KskEoW-4DE5_7Tll8xeHA1t04KJg,25028
+ statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
+ statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+ statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+ statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+ statement_extractor/scoring.py,sha256=s_8nhavBNzPPFmGf2FyBummH4tgP7YGpXoMhl2Jh3Xw,16650
+ statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+ statement_extractor/data/statement_taxonomy.json,sha256=XhCeVBC4aQB-7NR40Niu4yN2BmL0c2Gd-RKkUpsYK24,37981
+ statement_extractor/models/__init__.py,sha256=gjTu450FPe9dvhIVQXqBwF8u0hgSnPORGXzxmSEuCnM,2564
+ statement_extractor/models/canonical.py,sha256=ld6z6RtK03iOs_aUk8Rftcm0pUoaFpLUfyfbKI26N_o,4354
+ statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+ statement_extractor/models/labels.py,sha256=e-mFDuzb42oJ69gLZTWCdg5_MNqRftQ2La5x8y9Cv-Y,6236
+ statement_extractor/models/qualifiers.py,sha256=YkvyWh2p1fK5iMRDC2Dq1r-XJOmJ1rvWFTFUIkQ9zcc,3495
+ statement_extractor/models/statement.py,sha256=cOgabA7IJxHYjlH5AksJRNf2Rv5VScMPqZdfjQyXRN0,2733
+ statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+ statement_extractor/pipeline/config.py,sha256=rxZN27OWp05F-NaatwrYkjp56zbzHZ0hMtNU1mvBxgw,4130
+ statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
+ statement_extractor/pipeline/orchestrator.py,sha256=oHegnsDzXj87q8iAoi-QZj2ZyB1rX5qmg57BdIjvKo0,17617
+ statement_extractor/pipeline/registry.py,sha256=qj5M5tMm9GmNCguy8dWBXMT8XmhemiZjJMktZsRlevw,11415
+ statement_extractor/plugins/__init__.py,sha256=8k3lQGQNQSMUzxCmk4nAH8dIc1DqEnMyiqHlZZv81q0,1099
+ statement_extractor/plugins/base.py,sha256=GZ4WT5S2mH3C_uN6nyBz-nGlAn_Z2o2A51FSRu6gCEo,12797
+ statement_extractor/plugins/canonicalizers/__init__.py,sha256=LDb9NodyuLSoLzrLnNzMeviK79GHnyaLGU0J_02BBgM,421
+ statement_extractor/plugins/canonicalizers/base.py,sha256=dbreQuEPB48eBJmah7hpl67azVU4QLhbvSrjXr0vT88,195
+ statement_extractor/plugins/canonicalizers/location.py,sha256=Rz5SCM4bb0p0gsnHPzsQJv-RN59yoj9Z1NmF8yLQNv0,6590
+ statement_extractor/plugins/canonicalizers/organization.py,sha256=L-mhdctkRXuu84RsNHp80M_tDIiMumYaHAG6WfxpH4c,7482
+ statement_extractor/plugins/canonicalizers/person.py,sha256=Nw8FuJOBmg-cTaOTd2BJ1TZtydprfzIKL25wJa_VJek,6944
+ statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+ statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+ statement_extractor/plugins/extractors/gliner2.py,sha256=rgfY8l9v8EWCxfB3g6hLnmLCIekTBkfWMG8dgSAZu-E,21627
+ statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+ statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+ statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+ statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+ statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+ statement_extractor/plugins/labelers/taxonomy.py,sha256=jQp5emgWf6XgmOx7arh-owF_-TjVxiPKSJ2OGkTPbBs,12427
+ statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=grvC_R_sg05hR6l0DgaELy2wmf6OkbvV1pRuNU0FVk4,16027
+ statement_extractor/plugins/qualifiers/__init__.py,sha256=kefjGunlVDKLy2NXmtr5ZXyYi-swyQdPLkB-tHV_0vk,495
+ statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+ statement_extractor/plugins/qualifiers/companies_house.py,sha256=_6ExJCjD0V4eZNYXtfBY99obqLpRaSv-G-V7N6R1wLg,5376
+ statement_extractor/plugins/qualifiers/gleif.py,sha256=WZqcNT_Yq4yVe4rdkWO59C9yZ4geV2ZTDk9wxLlOeTg,5645
+ statement_extractor/plugins/qualifiers/person.py,sha256=si_9CLjHsH9jYFugej4t0HMnsivclh-Yi70U6NglfIU,7101
+ statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=3XDbizlR9YQgLrC7p-owV8Td-3TYaJlMb4B7saha3vw,6288
+ statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+ statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+ statement_extractor/plugins/splitters/t5_gemma.py,sha256=8joOzlMKXhSyJaq5c3F8t-gdPcZEDiVAzNcMlgJAqsE,6733
+ statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+ statement_extractor/plugins/taxonomy/embedding.py,sha256=QW1RR07JoE8Ah97gDZ_w_ATEe6-z2t2nl1zeTDAgFjM,11347
+ statement_extractor/plugins/taxonomy/mnli.py,sha256=IzLjHXUFgVAgEvYI5EzOBs19UxvpcbJa8HjqI__tYII,8905
+ corp_extractor-0.5.0.dist-info/METADATA,sha256=H4Z8ExZFdbknpHg-EZ1P9B137hCPwKXBezHSF7X9EOE,21567
+ corp_extractor-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ corp_extractor-0.5.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+ corp_extractor-0.5.0.dist-info/RECORD,,
@@ -97,6 +97,9 @@ __all__ = [
      # Scoring
      "BeamScorer",
      "TripleScorer",
+     # LLM (lazy import)
+     "LLM",
+     "get_llm",
  ]


@@ -109,4 +112,10 @@ def __getattr__(name: str):
      if name == "EmbeddingDependencyError":
          from .predicate_comparer import EmbeddingDependencyError
          return EmbeddingDependencyError
+     if name == "LLM":
+         from .llm import LLM
+         return LLM
+     if name == "get_llm":
+         from .llm import get_llm
+         return get_llm
      raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
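
The `LLM` / `get_llm` entries above rely on the module-level `__getattr__` (PEP 562 lazy import), so `statement_extractor.llm` and the optional `llm` extra (llama-cpp-python, per the metadata diff) are only pulled in when those names are first accessed. A minimal usage sketch; `get_llm`'s signature is not shown in this diff, so the attribute is only accessed here, not called.

```python
import statement_extractor  # does not import statement_extractor.llm yet

# Accessing the attribute triggers the lazy import in __getattr__ above.
get_llm = statement_extractor.get_llm
```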