corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corp-extractor
3
- Version: 0.4.0
3
+ Version: 0.9.0
4
4
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
5
5
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
6
6
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -23,18 +23,35 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
24
  Classifier: Topic :: Text Processing :: Linguistic
25
25
  Requires-Python: >=3.10
26
+ Requires-Dist: accelerate>=1.12.0
27
+ Requires-Dist: beautifulsoup4>=4.12.0
26
28
  Requires-Dist: click>=8.0.0
29
+ Requires-Dist: gguf>=0.17.1
27
30
  Requires-Dist: gliner2
31
+ Requires-Dist: httpx>=0.25.0
32
+ Requires-Dist: huggingface-hub>=0.20.0
33
+ Requires-Dist: llama-cpp-python>=0.3.16
28
34
  Requires-Dist: numpy>=1.24.0
29
35
  Requires-Dist: pydantic>=2.0.0
36
+ Requires-Dist: pymupdf>=1.23.0
30
37
  Requires-Dist: sentence-transformers>=2.2.0
38
+ Requires-Dist: sqlite-vec>=0.1.6
31
39
  Requires-Dist: torch>=2.0.0
32
40
  Requires-Dist: transformers>=5.0.0rc3
41
+ Provides-Extra: all
42
+ Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
43
+ Requires-Dist: pillow>=10.0.0; extra == 'all'
44
+ Requires-Dist: pytesseract>=0.3.10; extra == 'all'
33
45
  Provides-Extra: dev
34
46
  Requires-Dist: mypy>=1.0.0; extra == 'dev'
35
47
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
36
48
  Requires-Dist: pytest>=7.0.0; extra == 'dev'
37
49
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
50
+ Provides-Extra: llm
51
+ Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
52
+ Provides-Extra: ocr
53
+ Requires-Dist: pillow>=10.0.0; extra == 'ocr'
54
+ Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
38
55
  Description-Content-Type: text/markdown
39
56
 
40
57
  # Corp Extractor
@@ -47,21 +64,20 @@ Extract structured subject-predicate-object statements from unstructured text us
47
64
 
48
65
  ## Features
49
66
 
67
+ - **Person Database** *(v0.9.0)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
68
+ - **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
69
+ - **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
70
+ - **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
50
71
  - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
51
72
  - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
73
+ - **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
74
+ - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
52
75
  - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
53
- - **Predefined Predicates** *(v0.4.0)*: Optional `--predicates` list for GLiNER2 relation extraction mode
54
- - **Entity-based Scoring** *(v0.4.0)*: Confidence combines semantic similarity (50%) + entity recognition scores (25% each)
55
- - **Multi-Candidate Extraction**: Generates 3 candidates per statement (hybrid, GLiNER2-only, predicate-split)
56
- - **Best Triple Selection**: Keeps only highest-scoring triple per source (use `--all-triples` to keep all)
57
- - **Extraction Method Tracking**: Each statement includes `extraction_method` field (hybrid, gliner, split, model)
76
+ - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
58
77
  - **Beam Merging**: Combines top beams for better coverage instead of picking one
59
78
  - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
60
79
  - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
61
- - **Contextualized Matching**: Compares full "Subject Predicate Object" against source text for better accuracy
62
- - **Entity Type Merging**: Automatically merges UNKNOWN entity types with specific types during deduplication
63
- - **Reversal Detection**: Detects and corrects subject-object reversals using embedding comparison
64
- - **Command Line Interface**: Full-featured CLI for terminal usage
80
+ - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
65
81
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
66
82
 
67
83
  ## Installation
@@ -135,63 +151,74 @@ uvx corp-extractor "Apple announced a new iPhone."
135
151
 
136
152
  ### Usage Examples
137
153
 
138
- ```bash
139
- # Extract from text argument
140
- corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
141
-
142
- # Extract from file
143
- corp-extractor -f article.txt
154
+ The CLI provides three main commands: `split`, `pipeline`, and `plugins`. The `document` and `db` commands are covered in later sections.
144
155
 
145
- # Pipe from stdin
146
- cat article.txt | corp-extractor -
156
+ ```bash
157
+ # Simple extraction (Stage 1 only, fast)
158
+ corp-extractor split "Apple Inc. announced the iPhone 15."
159
+ corp-extractor split -f article.txt --json
160
+
161
+ # Full 5-stage pipeline (entity resolution, labeling, taxonomy)
162
+ corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
163
+ corp-extractor pipeline -f article.txt --stages 1-3
164
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
165
+
166
+ # Plugin management
167
+ corp-extractor plugins list
168
+ corp-extractor plugins list --stage 3
169
+ corp-extractor plugins info gleif_qualifier
170
+ ```
147
171
 
148
- # Output as JSON
149
- corp-extractor "Tim Cook is CEO of Apple." --json
172
+ ### Split Command (Simple Extraction)
150
173
 
151
- # Output as XML
152
- corp-extractor -f article.txt --xml
174
+ ```bash
175
+ corp-extractor split "Tim Cook is CEO of Apple." --json
176
+ corp-extractor split -f article.txt --beams 8 --verbose
177
+ cat article.txt | corp-extractor split -
178
+ ```
153
179
 
154
- # Verbose output with confidence scores
155
- corp-extractor -f article.txt --verbose
180
+ ### Pipeline Command (Full Entity Resolution)
156
181
 
157
- # Use more beams for better quality
158
- corp-extractor -f article.txt --beams 8
182
+ ```bash
183
+ # Run all 5 stages
184
+ corp-extractor pipeline "Apple CEO Tim Cook announced..."
159
185
 
160
- # Use custom predicate taxonomy
161
- corp-extractor -f article.txt --taxonomy predicates.txt
186
+ # Run specific stages
187
+ corp-extractor pipeline "..." --stages 1-3 # Stages 1, 2, 3
188
+ corp-extractor pipeline "..." --stages 1,2,5 # Stages 1, 2, 5
189
+ corp-extractor pipeline "..." --skip-stages 4,5 # Skip stages 4 and 5
162
190
 
163
- # Use GPU explicitly
164
- corp-extractor -f article.txt --device cuda
191
+ # Plugin selection
192
+ corp-extractor pipeline "..." --plugins gleif,companies_house
193
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
165
194
  ```
166
195
 
167
- ### CLI Options
196
+ ### CLI Reference
168
197
 
169
198
  ```
170
- Usage: corp-extractor [OPTIONS] [TEXT]
199
+ Usage: corp-extractor [COMMAND] [OPTIONS]
171
200
 
172
- Options:
201
+ Commands:
202
+ split Simple extraction (T5-Gemma only)
203
+ pipeline Full 5-stage pipeline with entity resolution
204
+ plugins List or inspect available plugins
205
+
206
+ Split Options:
173
207
  -f, --file PATH Read input from file
174
208
  -o, --output [table|json|xml] Output format (default: table)
175
- --json Output as JSON (shortcut)
176
- --xml Output as XML (shortcut)
209
+ --json / --xml Output format shortcuts
177
210
  -b, --beams INTEGER Number of beams (default: 4)
178
- --diversity FLOAT Diversity penalty (default: 1.0)
179
- --max-tokens INTEGER Max tokens to generate (default: 2048)
180
- --no-dedup Disable deduplication
181
- --no-embeddings Disable embedding-based dedup (faster)
182
- --no-merge Disable beam merging
183
- --no-gliner Disable GLiNER2 extraction (use raw model output)
184
- --predicates TEXT Comma-separated predicate types for GLiNER2 relation extraction
185
- --all-triples Keep all candidate triples (default: best per source)
186
- --dedup-threshold FLOAT Deduplication threshold (default: 0.65)
187
- --min-confidence FLOAT Min confidence filter (default: 0)
188
- --taxonomy PATH Load predicate taxonomy from file
189
- --taxonomy-threshold FLOAT Taxonomy matching threshold (default: 0.5)
211
+ --no-gliner Disable GLiNER2 extraction
212
+ --predicates TEXT Comma-separated predicates for relation extraction
190
213
  --device [auto|cuda|mps|cpu] Device to use (default: auto)
191
214
  -v, --verbose Show confidence scores and metadata
192
- -q, --quiet Suppress progress messages
193
- --version Show version
194
- --help Show this message
215
+
216
+ Pipeline Options:
217
+ --stages TEXT Stages to run (e.g., '1-3' or '1,2,5')
218
+ --skip-stages TEXT Stages to skip (e.g., '4,5')
219
+ --plugins TEXT Enable only these plugins (comma-separated)
220
+ --disable-plugins TEXT Disable these plugins (comma-separated)
221
+ -o, --output [table|json|yaml|triples] Output format
195
222
  ```
196
223
 
197
224
  ## New in v0.2.0: Quality Scoring & Beam Merging
@@ -285,6 +312,237 @@ for stmt in fixed_statements:
285
312
 
286
313
  During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
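For intuition, the orientation check can be pictured as a plain embedding comparison: embed both orderings and keep the one closer to the source sentence. A minimal sketch with sentence-transformers (illustrative only; the model name and the library's internal logic are assumptions):

```python
from sentence_transformers import SentenceTransformer, util

# Assumed model, not necessarily the one the library uses.
model = SentenceTransformer("all-MiniLM-L6-v2")

source = "Activision Blizzard was acquired by Microsoft in 2023."
forward = "Microsoft acquired Activision Blizzard"    # A -> P -> B
reverse = "Activision Blizzard acquired Microsoft"    # B -> P -> A

src, fwd, rev = model.encode([source, forward, reverse])

# Keep whichever orientation is more similar to the source text.
keep_forward = util.cos_sim(src, fwd) >= util.cos_sim(src, rev)
print("forward" if keep_forward else "reversed")
```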
287
314
 
315
+ ## Pipeline Architecture
316
+
317
+ The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
318
+
319
+ ### Pipeline Stages
320
+
321
+ | Stage | Name | Input | Output | Key Tech |
322
+ |-------|------|-------|--------|----------|
323
+ | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
324
+ | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
325
+ | 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
326
+ | 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
327
+ | 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
328
+
329
+ ### Pipeline Python API
330
+
331
+ ```python
332
+ from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig
333
+
334
+ # Run full pipeline
335
+ pipeline = ExtractionPipeline()
336
+ ctx = pipeline.process("Amazon CEO Andy Jassy announced plans to hire workers.")
337
+
338
+ # Access results at each stage
339
+ print(f"Raw triples: {len(ctx.raw_triples)}")
340
+ print(f"Statements: {len(ctx.statements)}")
341
+ print(f"Labeled: {len(ctx.labeled_statements)}")
342
+
343
+ # Output with fully qualified names
344
+ for stmt in ctx.labeled_statements:
345
+ print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
346
+ # e.g., "Andy Jassy (CEO, Amazon) --[announced]--> plans to hire workers"
347
+ ```
348
+
349
+ ### Pipeline Configuration
350
+
351
+ ```python
352
+ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
353
+
354
+ # Run only specific stages
355
+ config = PipelineConfig(
356
+ enabled_stages={1, 2, 3}, # Skip labeling and taxonomy
357
+ disabled_plugins={"person_qualifier"}, # Disable specific plugins
358
+ )
359
+ pipeline = ExtractionPipeline(config)
360
+ ctx = pipeline.process(text)
361
+
362
+ # Alternative: create config from stage string
363
+ config = PipelineConfig.from_stage_string("1-3") # Stages 1, 2, 3
364
+ ```
365
+
366
+ ### Built-in Plugins
367
+
368
+ **Splitters (Stage 1):**
369
+ - `t5_gemma_splitter` - T5-Gemma2 statement extraction
370
+
371
+ **Extractors (Stage 2):**
372
+ - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction
373
+
374
+ **Qualifiers (Stage 3):**
375
+ - `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
376
+ - `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database
377
+
378
+ **Labelers (Stage 4):**
379
+ - `sentiment_labeler` - Statement sentiment analysis
380
+ - `confidence_labeler` - Confidence scoring
381
+ - `relation_type_labeler` - Relation type classification
382
+
383
+ **Taxonomy Classifiers (Stage 5):**
384
+ - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
385
+ - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification
386
+
387
+ Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
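For a feel of what the MNLI-based classifier does, here is the generic zero-shot multi-label pattern from the transformers pipeline; the model name, candidate labels, and threshold are illustrative stand-ins, not the plugin's actual configuration:

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

statement = "Amazon announced plans to hire 5,000 warehouse workers."
labels = ["employment", "environmental impact", "corporate governance"]

result = classifier(statement, candidate_labels=labels, multi_label=True)

# Multi-label behaviour: keep every label whose score clears the threshold.
threshold = 0.5
print([(l, round(s, 2)) for l, s in zip(result["labels"], result["scores"]) if s >= threshold])
```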
388
+
389
+ ## New in v0.6.0: Entity Embedding Database
390
+
391
+ v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
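Conceptually, qualification is a nearest-neighbour lookup: embed the mention and compare it against embeddings of canonical entity names. A toy sketch with sentence-transformers and NumPy (the real database is backed by sqlite-vec with its own schema; the model name and identifiers below are illustrative):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model

# Toy "database" of canonical names with identifiers.
entities = [
    ("Microsoft Corporation", "sec_edgar:0000789019"),
    ("Apple Inc.", "sec_edgar:0000320193"),
]
db = model.encode([name for name, _ in entities], normalize_embeddings=True)

# Qualify a raw mention via cosine similarity against the stored names.
mention = model.encode(["Microsoft"], normalize_embeddings=True)[0]
print(entities[int(np.argmax(db @ mention))])  # ('Microsoft Corporation', 'sec_edgar:0000789019')
```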
392
+
393
+ ### Data Sources
394
+
395
+ **Organizations:**
396
+
397
+ | Source | Records | Identifier | EntityType Mapping |
398
+ |--------|---------|------------|-------------------|
399
+ | GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
400
+ | SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
401
+ | Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
402
+ | Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
403
+
404
+ **People** *(v0.9.0)*:
405
+
406
+ | Source | Records | Identifier | PersonType Classification |
407
+ |--------|---------|------------|--------------------------|
408
+ | Wikidata | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
409
+
410
+ ### EntityType Classification
411
+
412
+ Each organization record is classified with an `entity_type` field:
413
+
414
+ | Category | Types |
415
+ |----------|-------|
416
+ | Business | `business`, `fund`, `branch` |
417
+ | Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
418
+ | Government | `government`, `international_org`, `political_party` |
419
+ | Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
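To make the Companies House row above concrete, that source's classification step boils down to a lookup from the registry's company_type strings into this taxonomy. A hypothetical fragment (the real mapping lives in `importers/companies_house.py` and will differ in detail):

```python
# Hypothetical mapping; values are illustrative, not the importer's actual table.
COMPANY_TYPE_TO_ENTITY_TYPE = {
    "ltd": "business",
    "plc": "business",
    "llp": "business",
    "charitable-incorporated-organisation": "nonprofit",
}

def classify(company_type: str) -> str:
    return COMPANY_TYPE_TO_ENTITY_TYPE.get(company_type, "unknown")
```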
420
+
421
+ ### Building the Database
422
+
423
+ ```bash
424
+ # Import organizations from authoritative sources
425
+ corp-extractor db import-gleif --download
426
+ corp-extractor db import-sec --download # Bulk submissions.zip (~100K+ filers)
427
+ corp-extractor db import-companies-house --download
428
+ corp-extractor db import-wikidata --limit 50000
429
+
430
+ # Import notable people (v0.9.0)
431
+ corp-extractor db import-people --type executive --limit 5000
432
+ corp-extractor db import-people --all --limit 10000 # All person types
433
+
434
+ # Check status
435
+ corp-extractor db status
436
+
437
+ # Search for an organization
438
+ corp-extractor db search "Microsoft"
439
+
440
+ # Search for a person (v0.9.0)
441
+ corp-extractor db search-people "Tim Cook"
442
+ ```
443
+
444
+ ### Using in Pipeline
445
+
446
+ The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
447
+
448
+ ```python
449
+ from statement_extractor.pipeline import ExtractionPipeline
450
+
451
+ pipeline = ExtractionPipeline()
452
+ ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
453
+
454
+ for stmt in ctx.labeled_statements:
455
+ print(f"{stmt.subject_fqn}") # e.g., "Microsoft (sec_edgar:0000789019)"
456
+ ```
457
+
458
+ ### Publishing to HuggingFace
459
+
460
+ ```bash
461
+ # Upload database with all variants (full, lite, compressed)
462
+ export HF_TOKEN="hf_..."
463
+ corp-extractor db upload # Uses default cache location
464
+ corp-extractor db upload entities.db # Or specify path
465
+ corp-extractor db upload --no-lite # Skip lite version
466
+ corp-extractor db upload --no-compress # Skip compressed versions
467
+
468
+ # Download pre-built database (lite version by default)
469
+ corp-extractor db download # Lite version (smaller, faster)
470
+ corp-extractor db download --full # Full version with all metadata
471
+
472
+ # Local database management
473
+ corp-extractor db create-lite entities.db # Create lite version
474
+ corp-extractor db compress entities.db # Compress with gzip
475
+ ```
476
+
477
+ See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
478
+
479
+ ## New in v0.7.0: Document Processing
480
+
481
+ v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
482
+
483
+ ### Document CLI
484
+
485
+ ```bash
486
+ # Process local files
487
+ corp-extractor document process article.txt
488
+ corp-extractor document process report.txt --title "Annual Report" --year 2024
489
+
490
+ # Process URLs (web pages and PDFs)
491
+ corp-extractor document process https://example.com/article
492
+ corp-extractor document process https://example.com/report.pdf --use-ocr
493
+
494
+ # Configure chunking
495
+ corp-extractor document process article.txt --max-tokens 500 --overlap 50
496
+
497
+ # Preview chunking without extraction
498
+ corp-extractor document chunk article.txt --max-tokens 500
499
+ ```
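The `--max-tokens` and `--overlap` flags above control how long documents are windowed before extraction; overlapping windows keep statements that straddle a chunk boundary from being lost. A simplified sketch of the idea, using whitespace tokens as a stand-in for real tokenization (the library configures this via `ChunkingConfig`):

```python
def chunk_tokens(text: str, max_tokens: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping windows of roughly max_tokens words."""
    words = text.split()  # crude stand-in for a real tokenizer
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_tokens]))
        if start + max_tokens >= len(words):
            break
    return chunks

print(len(chunk_tokens("word " * 1200)))  # 3 overlapping chunks
```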
500
+
501
+ ### Document Python API
502
+
503
+ ```python
504
+ from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
505
+ from statement_extractor.models.document import ChunkingConfig
506
+
507
+ # Configure document processing
508
+ config = DocumentPipelineConfig(
509
+ chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
510
+ generate_summary=True,
511
+ deduplicate_across_chunks=True,
512
+ )
513
+
514
+ pipeline = DocumentPipeline(config)
515
+
516
+ # Process text
517
+ document = Document.from_text("Your long document text...", title="My Document")
518
+ ctx = pipeline.process(document)
519
+
520
+ # Process URL (async)
521
+ ctx = await pipeline.process_url("https://example.com/article")
522
+
523
+ # Access results
524
+ print(f"Chunks: {ctx.chunk_count}")
525
+ print(f"Statements: {ctx.statement_count}")
526
+ print(f"Duplicates removed: {ctx.duplicates_removed}")
527
+
528
+ for stmt in ctx.labeled_statements:
529
+ print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
530
+ if stmt.citation:
531
+ print(f" Citation: {stmt.citation}")
532
+ ```
533
+
534
+ ### PDF Processing
535
+
536
+ PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
537
+
538
+ ```bash
539
+ # Install OCR dependencies
540
+ pip install "corp-extractor[ocr]"
541
+
542
+ # Process with OCR
543
+ corp-extractor document process scanned.pdf --use-ocr
544
+ ```
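Under the hood this combination amounts to: read the embedded text layer with PyMuPDF, and fall back to rasterising and OCR-ing pages that have none. A hedged sketch of that pattern (not the plugin's actual code):

```python
import io

import fitz  # PyMuPDF
import pytesseract
from PIL import Image

def page_text(page: "fitz.Page") -> str:
    text = page.get_text().strip()
    if text:
        return text
    # No text layer: rasterise the page and OCR it with Tesseract.
    pix = page.get_pixmap(dpi=200)
    return pytesseract.image_to_string(Image.open(io.BytesIO(pix.tobytes("png"))))

doc = fitz.open("scanned.pdf")
full_text = "\n".join(page_text(page) for page in doc)
```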
545
+
288
546
  ## New in v0.4.0: GLiNER2 Integration
289
547
 
290
548
  v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
@@ -297,12 +555,31 @@ The T5-Gemma model excels at:
297
555
 
298
556
  GLiNER2 now handles:
299
557
  - **Entity recognition** - refining subject/object boundaries
300
- - **Relation extraction** - when predefined predicates are provided
558
+ - **Relation extraction** - using 324 default predicates across 21 categories
301
559
  - **Entity scoring** - scoring how "entity-like" subjects/objects are
560
+ - **Confidence scoring** - real confidence values via `include_confidence=True`
561
+
562
+ ### Default Predicates
563
+
564
+ GLiNER2 uses **324 predicates** organized into 21 categories (ownership, employment, funding, etc.). These are loaded from `default_predicates.json` and include descriptions and confidence thresholds.
302
565
 
303
- ### Two Extraction Modes
566
+ **Key features:**
567
+ - **All matches returned** - Every matching relation is returned, not just the best one
568
+ - **Category-based extraction** - Iterates through categories to stay under GLiNER2's ~25 label limit
569
+ - **Custom predicate files** - Provide your own JSON file with custom predicates
304
570
 
305
- **Mode 1: With Predicate List** (GLiNER2 relation extraction)
571
+ ### Extraction Modes
572
+
573
+ **Mode 1: Default Predicates** (recommended)
574
+ ```python
575
+ from statement_extractor import extract_statements
576
+
577
+ # Uses 324 built-in predicates automatically
578
+ result = extract_statements("John works for Apple Inc. in Cupertino.")
579
+ # Returns ALL matching relations
580
+ ```
581
+
582
+ **Mode 2: Custom Predicate List**
306
583
  ```python
307
584
  from statement_extractor import extract_statements, ExtractionOptions
308
585
 
@@ -315,27 +592,35 @@ Or via CLI:
315
592
  corp-extractor "John works for Apple Inc." --predicates "works_for,founded,acquired"
316
593
  ```
317
594
 
318
- **Mode 2: Without Predicate List** (entity-refined extraction)
595
+ **Mode 3: Custom Predicate File**
319
596
  ```python
320
- result = extract_statements("Apple announced a new iPhone.")
321
- # Uses GLiNER2 for entity extraction to refine boundaries
322
- # Extracts predicate from source text using T5-Gemma's hint
597
+ from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig
598
+
599
+ config = PipelineConfig(
600
+ extractor_options={"predicates_file": "/path/to/custom_predicates.json"}
601
+ )
602
+ pipeline = ExtractionPipeline(config)
603
+ ctx = pipeline.process("John works for Apple Inc.")
604
+ ```
605
+
606
+ Or via CLI:
607
+ ```bash
608
+ corp-extractor pipeline "John works for Apple Inc." --predicates-file custom_predicates.json
323
609
  ```
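The schema of the predicates JSON isn't reproduced in this README, so rather than guessing it, a safe way to build a custom file is to copy and edit the bundled `default_predicates.json` (its in-package path appears in the file list above). A small sketch for locating and inspecting it:

```python
import json
from importlib import resources

# Locate the predicate definitions shipped inside the installed package.
path = resources.files("statement_extractor") / "data" / "default_predicates.json"
with path.open(encoding="utf-8") as f:
    predicates = json.load(f)

# Inspect the top-level structure before writing your own file.
print(type(predicates))
print(str(predicates)[:200])
```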
324
610
 
325
- ### Three Candidate Extraction Methods
611
+ ### Two Candidate Extraction Methods
326
612
 
327
- For each statement, three candidates are generated and the best is selected:
613
+ For each statement, two candidates are generated and the best is selected:
328
614
 
329
615
  | Method | Description |
330
616
  |--------|-------------|
331
617
  | `hybrid` | Model subject/object + GLiNER2/extracted predicate |
332
- | `gliner` | All components refined by GLiNER2 |
333
- | `split` | Source text split around the predicate |
618
+ | `gliner` | All components refined by GLiNER2 entity recognition |
334
619
 
335
620
  ```python
336
621
  for stmt in result:
337
622
  print(f"{stmt.subject.text} --[{stmt.predicate}]--> {stmt.object.text}")
338
- print(f" Method: {stmt.extraction_method}") # hybrid, gliner, split, or model
623
+ print(f" Method: {stmt.extraction_method}") # hybrid, gliner, or model
339
624
  print(f" Confidence: {stmt.confidence_score:.2f}")
340
625
  ```
341
626
 
@@ -359,8 +644,7 @@ Confidence scores combine **semantic similarity** and **entity recognition**:
359
644
 
360
645
  Each statement includes an `extraction_method` field:
361
646
  - `hybrid` - Model subject/object + GLiNER2 predicate
362
- - `gliner` - All components refined by GLiNER2
363
- - `split` - Subject/object from splitting source text around predicate
647
+ - `gliner` - All components refined by GLiNER2 entity recognition
364
648
  - `model` - All components from T5-Gemma model (only when `--no-gliner`)
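Both fields are convenient for post-filtering. A usage sketch built from the fields shown above (the 0.6 cut-off is arbitrary):

```python
from statement_extractor import extract_statements

result = extract_statements("Apple Inc. announced the iPhone 15 at their September event.")

for stmt in result:
    if stmt.confidence_score >= 0.6:  # arbitrary threshold
        print(f"[{stmt.extraction_method}] "
              f"{stmt.subject.text} --[{stmt.predicate}]--> {stmt.object.text} "
              f"({stmt.confidence_score:.2f})")
```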
365
649
 
366
650
  ### Best Triple Selection
@@ -0,0 +1,76 @@
1
+ statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
2
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
+ statement_extractor/cli.py,sha256=BTFLIBZoNa2ADrYVslbXiZGrzhRWmi7ppbnAPV3xUyg,71191
4
+ statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
5
+ statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
6
+ statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
7
+ statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
8
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
9
+ statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
10
+ statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
11
+ statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
12
+ statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
13
+ statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
14
+ statement_extractor/database/hub.py,sha256=HOnRp62RnkXvk2KgwqOLVpEkXwy0LS0n3tIJrkYCo2c,16842
15
+ statement_extractor/database/models.py,sha256=ke4byqJiiBlZfRhxqoC0nsdDhb6YSG2I4S5W5BRBNY4,8813
16
+ statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
17
+ statement_extractor/database/store.py,sha256=1qdRZ7q5nTLUYbtUC9cWSLey_GVf5kAQ6dTF9EEwDXY,56735
18
+ statement_extractor/database/importers/__init__.py,sha256=0CPqafekQpqxFmZhe3uMJLNssqxGzEniZlArGyl8QKU,668
19
+ statement_extractor/database/importers/companies_house.py,sha256=G0DZAs_9RM7uTwY7imt70IXUVvhntoO-xXnJ0o6jjGw,19635
20
+ statement_extractor/database/importers/gleif.py,sha256=MTFuksVf83Barn1c6JvBLBouxXbzogWulKb8oqEODAk,18948
21
+ statement_extractor/database/importers/sec_edgar.py,sha256=_B4QcXhZ_5ulXTSVW9dKKAzFIVwn-VIh_X39jcUhqsg,12923
22
+ statement_extractor/database/importers/wikidata.py,sha256=ZZYHiqSlYlco1TSzCLUKqdT-i5X1cUSK1EnsfWWwPAc,33770
23
+ statement_extractor/database/importers/wikidata_people.py,sha256=loqyf5sbtBqCITiTxqV3PLyx3SefmVefhZE0Y-cRoC4,22205
24
+ statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
25
+ statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
26
+ statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
27
+ statement_extractor/document/deduplicator.py,sha256=8tPKWAGqNfjteOdnk7B82izyfIpvOebirZ-OIQKixwU,4821
28
+ statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
29
+ statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
30
+ statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
31
+ statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
32
+ statement_extractor/models/__init__.py,sha256=9FxKkJ4EIPXmSkMo_j9jqAKH6jTkvz5Tzk2YvQL7UVk,2884
33
+ statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
34
+ statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
35
+ statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
36
+ statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
37
+ statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
38
+ statement_extractor/models/statement.py,sha256=agC4jcP9ospbZC91J6c0UgLAmfsg1tnqNcSvkqOtqBQ,3629
39
+ statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
40
+ statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
41
+ statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
42
+ statement_extractor/pipeline/orchestrator.py,sha256=1pe6hyEtd495LJrfH3QgxQadNqERmehQEs5LHsAVIxM,16580
43
+ statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
44
+ statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
45
+ statement_extractor/plugins/base.py,sha256=ItqJZ5rH65gW4-pXpraRb45y7F3lXqsKECumhV3tDyk,21516
46
+ statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
47
+ statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
48
+ statement_extractor/plugins/extractors/gliner2.py,sha256=ObEQMNE6ArjRl2s4x3lkOSPs03cmtTYFlppnbhtkI7A,21876
49
+ statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
50
+ statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
51
+ statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
52
+ statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
53
+ statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
54
+ statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
55
+ statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
56
+ statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
57
+ statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
58
+ statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
59
+ statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
60
+ statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
61
+ statement_extractor/plugins/qualifiers/embedding_company.py,sha256=EmCxImdXBCA7zxM1stAVeAYlzeNPC_jSlyVN5q1XEJA,14567
62
+ statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
63
+ statement_extractor/plugins/qualifiers/person.py,sha256=GZCUJaQncC_wB4nBQ4RLY5dJ-CdARMLpByc_Nn09wj8,28461
64
+ statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
65
+ statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
66
+ statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
67
+ statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
68
+ statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
69
+ statement_extractor/plugins/splitters/t5_gemma.py,sha256=AwYYKQrAmiue5IK9bbJ-Uhfl9oCZTX1X_tmKguKIdjU,9982
70
+ statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
71
+ statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
72
+ statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
73
+ corp_extractor-0.9.0.dist-info/METADATA,sha256=9pWemKEWyeEqW92sRd4SqdMykO-92kl5UIrs-P2xAn0,27553
74
+ corp_extractor-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
75
+ corp_extractor-0.9.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
76
+ corp_extractor-0.9.0.dist-info/RECORD,,
@@ -29,7 +29,7 @@ Example:
29
29
  >>> data = extract_statements_as_dict("Some text...")
30
30
  """
31
31
 
32
- __version__ = "0.3.0"
32
+ __version__ = "0.6.0"
33
33
 
34
34
  # Core models
35
35
  from .models import (
@@ -97,6 +97,9 @@ __all__ = [
97
97
  # Scoring
98
98
  "BeamScorer",
99
99
  "TripleScorer",
100
+ # LLM (lazy import)
101
+ "LLM",
102
+ "get_llm",
100
103
  ]
101
104
 
102
105
 
@@ -109,4 +112,10 @@ def __getattr__(name: str):
109
112
  if name == "EmbeddingDependencyError":
110
113
  from .predicate_comparer import EmbeddingDependencyError
111
114
  return EmbeddingDependencyError
115
+ if name == "LLM":
116
+ from .llm import LLM
117
+ return LLM
118
+ if name == "get_llm":
119
+ from .llm import get_llm
120
+ return get_llm
112
121
  raise AttributeError(f"module {__name__!r} has no attribute {name!r}")