corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,7 @@
  Metadata-Version: 2.4
  Name: corp-extractor
- Version: 0.5.0
- Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
+ Version: 0.9.3
+ Summary: Extract structured entity and relationship information from text
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
  Project-URL: Repository, https://github.com/corp-o-rate/statement-extractor
@@ -9,7 +9,7 @@ Project-URL: Issues, https://github.com/corp-o-rate/statement-extractor/issues
  Author-email: Corp-o-Rate <neil@corp-o-rate.com>
  Maintainer-email: Corp-o-Rate <neil@corp-o-rate.com>
  License: MIT
- Keywords: diverse-beam-search,embeddings,gemma,information-extraction,knowledge-graph,nlp,statement-extraction,subject-predicate-object,t5,transformers,triples
+ Keywords: diverse-beam-search,embeddings,entities,entity-linking,entity-resolution,gemma,information-extraction,knowledge-graph,nlp,semantic-parsing,statement-extraction,subject-predicate-object,t5gemma2,transformers,triples
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Developers
  Classifier: Intended Audience :: Science/Research
@@ -24,14 +24,25 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Text Processing :: Linguistic
  Requires-Python: >=3.10
  Requires-Dist: accelerate>=1.12.0
+ Requires-Dist: beautifulsoup4>=4.12.0
  Requires-Dist: click>=8.0.0
  Requires-Dist: gguf>=0.17.1
  Requires-Dist: gliner2
+ Requires-Dist: httpx>=0.25.0
+ Requires-Dist: huggingface-hub>=0.20.0
+ Requires-Dist: llama-cpp-python>=0.3.16
  Requires-Dist: numpy>=1.24.0
+ Requires-Dist: pycountry>=24.6.1
  Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: pymupdf>=1.23.0
  Requires-Dist: sentence-transformers>=2.2.0
+ Requires-Dist: sqlite-vec>=0.1.6
  Requires-Dist: torch>=2.0.0
  Requires-Dist: transformers>=5.0.0rc3
+ Provides-Extra: all
+ Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
+ Requires-Dist: pillow>=10.0.0; extra == 'all'
+ Requires-Dist: pytesseract>=0.3.10; extra == 'all'
  Provides-Extra: dev
  Requires-Dist: mypy>=1.0.0; extra == 'dev'
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
@@ -39,6 +50,9 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
  Provides-Extra: llm
  Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
+ Provides-Extra: ocr
+ Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+ Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
  Description-Content-Type: text/markdown

  # Corp Extractor
@@ -51,18 +65,21 @@ Extract structured subject-predicate-object statements from unstructured text us

  ## Features

- - **6-Stage Pipeline** *(v0.5.0)*: Modular plugin-based architecture for full entity resolution
+ - **Person Database** *(v0.9.2)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
+ - **Organization Canonicalization** *(v0.9.2)*: Link equivalent records across sources (LEI, ticker, CIK, name matching)
+ - **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
+ - **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
+ - **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
  - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
  - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
- - **Entity Qualification** *(v0.5.0)*: Adds roles, identifiers (LEI, ticker, company numbers) via external APIs
- - **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+ - **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
  - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
  - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
  - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
  - **Beam Merging**: Combines top beams for better coverage instead of picking one
  - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
  - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
- - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `plugins` commands
+ - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries

  ## Installation
@@ -143,7 +160,7 @@ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
  corp-extractor split "Apple Inc. announced the iPhone 15."
  corp-extractor split -f article.txt --json

- # Full 6-stage pipeline (entity resolution, canonicalization, labeling, taxonomy)
+ # Full 5-stage pipeline (entity resolution, labeling, taxonomy)
  corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
  corp-extractor pipeline -f article.txt --stages 1-3
  corp-extractor pipeline "..." --disable-plugins sec_edgar
@@ -206,10 +223,10 @@ Pipeline Options:
  -o, --output [table|json|yaml|triples] Output format
  ```

- ## New in v0.2.0: Quality Scoring & Beam Merging
+ ## Quality Scoring & Beam Merging

- By default, the library now:
- - **Scores each triple** for groundedness based on whether entities appear in source text
+ By default, the library:
+ - **Scores each triple** using semantic similarity (50%) + GLiNER2 entity recognition (50%)
  - **Merges top beams** instead of selecting one, improving coverage
  - **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")

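As a rough illustration of that 50/50 split, the sketch below combines a sentence-embedding similarity with an entity score using `sentence-transformers`; the model name and the `entity_score` input (standing in for the GLiNER2 entity check) are assumptions, not the library's actual implementation.

```python
# Illustrative sketch only: 50% semantic similarity + 50% entity-recognition score.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

def score_triple(source_text: str, triple_text: str, entity_score: float) -> float:
    """entity_score stands in for the GLiNER2 entity check, already scaled to [0, 1]."""
    embeddings = model.encode([source_text, triple_text], convert_to_tensor=True)
    similarity = max(0.0, float(util.cos_sim(embeddings[0], embeddings[1])))
    return 0.5 * similarity + 0.5 * entity_score

print(score_triple("Acme Corp bought Widget Inc in 2021.",
                   "Acme Corp | acquired | Widget Inc",
                   entity_score=1.0))
```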
@@ -297,9 +314,9 @@ for stmt in fixed_statements:

  During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.

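A minimal sketch of that orientation-by-source-similarity idea (the embedding model and the way triples are rendered as sentences are assumptions for illustration, not the library's internals):

```python
# Illustrative sketch only: keep the orientation whose rendering is closer to the source text.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

def resolve_orientation(source: str, subj: str, pred: str, obj: str) -> tuple[str, str, str]:
    candidates = [f"{subj} {pred} {obj}", f"{obj} {pred} {subj}"]
    embeddings = model.encode([source, *candidates], convert_to_tensor=True)
    forward_sim = float(util.cos_sim(embeddings[0], embeddings[1]))
    reverse_sim = float(util.cos_sim(embeddings[0], embeddings[2]))
    return (subj, pred, obj) if forward_sim >= reverse_sim else (obj, pred, subj)

print(resolve_orientation("Globex acquired Initech in 2023.", "Initech", "acquired", "Globex"))
```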
- ## New in v0.5.0: Pipeline Architecture
+ ## Pipeline Architecture

- v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
+ The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.

  ### Pipeline Stages

@@ -307,10 +324,9 @@ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
  |-------|------|-------|--------|----------|
  | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
  | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
- | 3 | Qualification | Entities | `QualifiedEntity[]` | Gemma3, APIs |
- | 4 | Canonicalization | `QualifiedEntity[]` | `CanonicalEntity[]` | Fuzzy matching |
- | 5 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
- | 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+ | 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
+ | 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+ | 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |

  ### Pipeline Python API

@@ -339,8 +355,8 @@ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline

  # Run only specific stages
  config = PipelineConfig(
-     enabled_stages={1, 2, 3}, # Skip canonicalization and labeling
-     disabled_plugins={"sec_edgar_qualifier"}, # Disable specific plugins
+     enabled_stages={1, 2, 3}, # Skip labeling and taxonomy
+     disabled_plugins={"person_qualifier"}, # Disable specific plugins
  )
  pipeline = ExtractionPipeline(config)
  ctx = pipeline.process(text)
@@ -358,24 +374,206 @@ config = PipelineConfig.from_stage_string("1-3") # Stages 1, 2, 3
  - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction

  **Qualifiers (Stage 3):**
- - `person_qualifier` - PERSON → role, org (uses Gemma3)
- - `gleif_qualifier` - ORG → LEI, jurisdiction (GLEIF API)
- - `companies_house_qualifier` - ORG → UK company number
- - `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+ - `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
+ - `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database

- **Canonicalizers (Stage 4):**
- - `organization_canonicalizer` - ORG canonical names
- - `person_canonicalizer` - PERSON name variants
-
- **Labelers (Stage 5):**
+ **Labelers (Stage 4):**
  - `sentiment_labeler` - Statement sentiment analysis
+ - `confidence_labeler` - Confidence scoring
+ - `relation_type_labeler` - Relation type classification

- **Taxonomy Classifiers (Stage 6):**
+ **Taxonomy Classifiers (Stage 5):**
  - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
  - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification

  Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.

+ ## Entity Database
+
+ The library includes an **entity embedding database** for fast entity qualification using vector similarity search. It stores records from authoritative sources (GLEIF, SEC, Companies House, Wikidata) with 768-dimensional embeddings for semantic matching.
+
+ **Quick start:**
+ ```bash
+ corp-extractor db download # Download pre-built database
+ corp-extractor db search "Microsoft" # Search organizations
+ corp-extractor db search-people "Tim Cook" # Search people
+ ```
+
+ For comprehensive documentation including schema, CLI reference, Python API, and build instructions, see **[ENTITY_DATABASE.md](./ENTITY_DATABASE.md)**.
+
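To make the vector-similarity idea concrete, here is a small self-contained sketch built directly on `sqlite-vec` and `sentence-transformers`; the table name, schema, and embedding model are illustrative assumptions, not the package's actual database layout.

```python
# Illustrative sketch only: a toy 768-dim similarity lookup, not the real entity database schema.
import sqlite3
import sqlite_vec
from sentence_transformers import SentenceTransformer

db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

db.execute("CREATE VIRTUAL TABLE org_vectors USING vec0(embedding float[768])")  # hypothetical table

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")  # assumed 768-dim model
names = ["Microsoft Corporation", "Apple Inc.", "GlaxoSmithKline plc"]
for rowid, name in enumerate(names, start=1):
    db.execute("INSERT INTO org_vectors(rowid, embedding) VALUES (?, ?)",
               (rowid, sqlite_vec.serialize_float32(model.encode(name).tolist())))

query = sqlite_vec.serialize_float32(model.encode("Microsoft").tolist())
rows = db.execute(
    "SELECT rowid, distance FROM org_vectors WHERE embedding MATCH ? AND k = 3 ORDER BY distance",
    (query,),
).fetchall()
print(rows)  # nearest organizations first
```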
+ ## New in v0.6.0: Entity Embedding Database
+
+ v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
+
+ ### Data Sources
+
+ **Organizations:**
+
+ | Source | Records | Identifier | EntityType Mapping |
+ |--------|---------|------------|-------------------|
+ | GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
+ | SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
+ | Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
+ | Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
+
+ **People** *(v0.9.0)*:
+
+ | Source | Records | Identifier | PersonType Classification |
+ |--------|---------|------------|--------------------------|
+ | Wikidata (SPARQL) | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
+ | Wikidata (Dump) | All humans with enwiki | Wikidata QID | Classified from positions (P39) and occupations (P106) |
+
+ **Date Fields**: All importers now include `from_date` and `to_date` where available:
+ - **GLEIF**: LEI registration date
+ - **SEC Edgar**: First SEC filing date
+ - **Companies House**: Incorporation and dissolution dates
+ - **Wikidata Orgs**: Inception (P571) and dissolution (P576) dates
+ - **Wikidata People**: Position start (P580) and end (P582) dates
+
+ **Note**: The same person can have multiple records with different role/org combinations (unique on `source_id + role + org`). Organizations discovered during people import are automatically inserted into the organizations table with `known_for_org_id` foreign key linking people to their organizations.
+
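A toy schema expressing that uniqueness rule and the `known_for_org_id` link; the table and column names beyond those mentioned above are assumptions for illustration, not the package's actual schema.

```python
# Illustrative sketch only: the unique (source_id, role, org) rule and the org link as a toy schema.
import sqlite3

db = sqlite3.connect(":memory:")
db.executescript("""
CREATE TABLE organizations (
    id   INTEGER PRIMARY KEY,
    name TEXT NOT NULL
);
CREATE TABLE people (
    id               INTEGER PRIMARY KEY,
    source_id        TEXT NOT NULL,              -- e.g. a Wikidata QID
    name             TEXT NOT NULL,
    role             TEXT,
    org              TEXT,
    known_for_org_id INTEGER REFERENCES organizations(id),
    UNIQUE (source_id, role, org)                -- same person, one record per role/org pair
);
""")
```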
+ ### EntityType Classification
+
+ Each organization record is classified with an `entity_type` field:
+
+ | Category | Types |
+ |----------|-------|
+ | Business | `business`, `fund`, `branch` |
+ | Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
+ | Government | `government`, `international_org`, `political_party` |
+ | Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
+
+ ### Building the Database
+
+ ```bash
+ # Import organizations from authoritative sources
+ corp-extractor db import-gleif --download
+ corp-extractor db import-sec --download # Bulk submissions.zip (~100K+ filers)
+ corp-extractor db import-companies-house --download
+ corp-extractor db import-wikidata --limit 50000
+
+ # Import notable people (v0.9.0)
+ corp-extractor db import-people --type executive --limit 5000
+ corp-extractor db import-people --all --limit 10000 # All person types
+ corp-extractor db import-people --type executive --skip-existing # Skip existing records
+ corp-extractor db import-people --type executive --enrich-dates # Fetch role start/end dates
+
+ # Import from Wikidata dump (v0.9.1) - avoids SPARQL timeouts
+ corp-extractor db import-wikidata-dump --download --limit 50000 # Downloads ~100GB dump
+ corp-extractor db import-wikidata-dump --dump /path/to/dump.bz2 --people --no-orgs # Local dump
+
+ # Check status
+ corp-extractor db status
+
+ # Search for an organization
+ corp-extractor db search "Microsoft"
+
+ # Search for a person (v0.9.0)
+ corp-extractor db search-people "Tim Cook"
+ ```
+
+ ### Using in Pipeline
+
+ The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
+
+ ```python
+ from statement_extractor.pipeline import ExtractionPipeline
+
+ pipeline = ExtractionPipeline()
+ ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
+
+ for stmt in ctx.labeled_statements:
+     print(f"{stmt.subject_fqn}") # e.g., "Microsoft (sec_edgar:0000789019)"
+ ```
+
+ ### Publishing to HuggingFace
+
+ ```bash
+ # Upload database with all variants (full, lite, compressed)
+ export HF_TOKEN="hf_..."
+ corp-extractor db upload # Uses default cache location
+ corp-extractor db upload entities.db # Or specify path
+ corp-extractor db upload --no-lite # Skip lite version
+ corp-extractor db upload --no-compress # Skip compressed versions
+
+ # Download pre-built database (lite version by default)
+ corp-extractor db download # Lite version (smaller, faster)
+ corp-extractor db download --full # Full version with all metadata
+
+ # Local database management
+ corp-extractor db create-lite entities.db # Create lite version
+ corp-extractor db compress entities.db # Compress with gzip
+ ```
+
+ See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
+
+ ## New in v0.7.0: Document Processing
+
+ v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
+
+ ### Document CLI
+
+ ```bash
+ # Process local files
+ corp-extractor document process article.txt
+ corp-extractor document process report.txt --title "Annual Report" --year 2024
+
+ # Process URLs (web pages and PDFs)
+ corp-extractor document process https://example.com/article
+ corp-extractor document process https://example.com/report.pdf --use-ocr
+
+ # Configure chunking
+ corp-extractor document process article.txt --max-tokens 500 --overlap 50
+
+ # Preview chunking without extraction
+ corp-extractor document chunk article.txt --max-tokens 500
+ ```
+
+ ### Document Python API
+
+ ```python
+ from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
+ from statement_extractor.models.document import ChunkingConfig
+
+ # Configure document processing
+ config = DocumentPipelineConfig(
+     chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
+     generate_summary=True,
+     deduplicate_across_chunks=True,
+ )
+
+ pipeline = DocumentPipeline(config)
+
+ # Process text
+ document = Document.from_text("Your long document text...", title="My Document")
+ ctx = pipeline.process(document)
+
+ # Process URL (async)
+ ctx = await pipeline.process_url("https://example.com/article")
+
+ # Access results
+ print(f"Chunks: {ctx.chunk_count}")
+ print(f"Statements: {ctx.statement_count}")
+ print(f"Duplicates removed: {ctx.duplicates_removed}")
+
+ for stmt in ctx.labeled_statements:
+     print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+     if stmt.citation:
+         print(f" Citation: {stmt.citation}")
+ ```
+
+ ### PDF Processing
+
+ PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
+
+ ```bash
+ # Install OCR dependencies
+ pip install "corp-extractor[ocr]"
+
+ # Process with OCR
+ corp-extractor document process scanned.pdf --use-ocr
+ ```
+
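For reference, plain (non-OCR) extraction with PyMuPDF looks roughly like the snippet below; this is a generic PyMuPDF example, not the PDF plugin's actual code.

```python
# Illustrative sketch only: basic PyMuPDF text extraction (no OCR).
import fitz  # PyMuPDF

def extract_pdf_text(path: str) -> str:
    # Concatenate the plain text of every page in the document.
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)

print(extract_pdf_text("report.pdf")[:500])
```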
  ## New in v0.4.0: GLiNER2 Integration

  v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
@@ -575,7 +773,7 @@ for text in texts:
  This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):

  1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with diversity penalty
- 2. **Quality Scoring**: Each triple scored for groundedness in source text
+ 2. **Quality Scoring**: Each triple scored via semantic similarity + GLiNER2 entity recognition
  3. **Beam Merging**: Top beams combined for better coverage
  4. **Embedding Dedup**: Semantic similarity removes near-duplicate predicates
  5. **Predicate Normalization**: Optional taxonomy matching via embeddings
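A minimal sketch of step 1 using the Hugging Face `generate()` API as exposed in transformers 4.x; the checkpoint name, beam counts, and diversity penalty are placeholders, not the library's exact settings.

```python
# Illustrative sketch only: group ("diverse") beam search via transformers' generate().
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "your-org/t5gemma2-statement-extractor"  # placeholder checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

inputs = tokenizer("Apple Inc. announced the iPhone 15.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=8,
    num_beam_groups=4,       # beam groups enable diverse beam search
    diversity_penalty=1.0,   # pushes groups toward different candidates
    num_return_sequences=8,  # keep several beams so they can be merged later
    max_new_tokens=256,
)
for sequence in outputs:
    print(tokenizer.decode(sequence, skip_special_tokens=True))
```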
@@ -0,0 +1,79 @@
+ statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+ statement_extractor/cli.py,sha256=l4YcqKmtks6exMAIHSUw_ukWGZ4x-v_V_Gnm-wOGc3g,106464
+ statement_extractor/extractor.py,sha256=m10na6I2iU1GwokQTxodePttYgigHykoss5LWrE8JOQ,38418
+ statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+ statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+ statement_extractor/models.py,sha256=rBotCX2hRTMW4MXXkkWYv4JctP0HQR0NSJSlBcNhsF0,12302
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+ statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
+ statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+ statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
+ statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
+ statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
+ statement_extractor/database/hub.py,sha256=3xCvbCeqC6GR3XgVow7MAXg46ZPDYfDX8it93Xikw5w,13295
+ statement_extractor/database/models.py,sha256=4aLs5tp2QTAd9vAyPf80EUoHypd_K0jELcY4J51iaNw,10563
+ statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
+ statement_extractor/database/store.py,sha256=FPyfC6KjD6pjfU2jccVEvsAcCtfqMdwVJuS7aqCNrKA,112320
+ statement_extractor/database/importers/__init__.py,sha256=acIoX_BPdXv2DOMFyVbFZPDGNWp2s1FpC774loTqL5I,1121
+ statement_extractor/database/importers/companies_house.py,sha256=b5OMFtoHhkPgoGK08ThQn9BtTu9uC_dYzBVpC10xT4U,20252
+ statement_extractor/database/importers/companies_house_officers.py,sha256=QDFA0FzqDx9p6VjRrB7o4BE3e30l7i0ML_ktntsB-kA,15565
+ statement_extractor/database/importers/gleif.py,sha256=sw4YYROD6wi7IbBEKGCn8kko0nOYbKOyukDJKGQp17Q,20200
+ statement_extractor/database/importers/sec_edgar.py,sha256=0nnhnOrf5d1wR9PGjl8AuNOnp4mfmEtopjkgUY_PLQc,13738
+ statement_extractor/database/importers/sec_form4.py,sha256=ZoV-oyNhG5AOUm4u9hemmRI5KnpNs3Gw_dfisjkD3zU,18234
+ statement_extractor/database/importers/wikidata.py,sha256=tRj4kEMVIq7sRXxjyxj-scl8eXybkrLVvyNDYV2T5lg,39572
+ statement_extractor/database/importers/wikidata_dump.py,sha256=GSLn_BV4h-Efms2tp_eYyhqSJsRFjnZzyqgaUCDmyVY,77903
+ statement_extractor/database/importers/wikidata_people.py,sha256=s4AB2pQLK2qHK9X5BLoW-II3qZBbJG4zbU3Ro4FBT9o,43157
+ statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
+ statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
+ statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
+ statement_extractor/document/deduplicator.py,sha256=R_RwEdVeVQBYZHvjkVA0ShAWr8x618VrO9dkYWXvifI,4771
+ statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
+ statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
+ statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
+ statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
+ statement_extractor/models/__init__.py,sha256=OJOK0ral_jskrSxx6nCc3TB6JlVYaC5HI2eYXr9dhMQ,2971
+ statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
+ statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
+ statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+ statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
+ statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
+ statement_extractor/models/statement.py,sha256=Wpp2OtZ5inhqbtEcblWdcES7g7lA-FVjqjz6Jq7hqzo,3329
+ statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+ statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
+ statement_extractor/pipeline/context.py,sha256=evAdyH5oOCNM_ILGZNS1mov3lM4D3mCvr5hzsjaB0Bs,6136
+ statement_extractor/pipeline/orchestrator.py,sha256=qH6rD4_wI_kZ_e8NeIv2XYHUA07ldogFewFsZeRQVxw,16687
+ statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
+ statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
+ statement_extractor/plugins/base.py,sha256=xC661iFtnhIxtZLTwuCc-0rFV1q2V3hCTV-uOaILsOA,21622
+ statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+ statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+ statement_extractor/plugins/extractors/gliner2.py,sha256=yDwKJVniMj4YwjR4Rm6MALDk633H5qcKcxa2xOLh9LI,21999
+ statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+ statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+ statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+ statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+ statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+ statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
+ statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
+ statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
+ statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
+ statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
+ statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+ statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
+ statement_extractor/plugins/qualifiers/embedding_company.py,sha256=nc7oTFjEBuPiprjXKeFRiMYM6tNicMNum_xQ9LSgEOg,14756
+ statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
+ statement_extractor/plugins/qualifiers/person.py,sha256=EN1T0G9NT6wOeIGljzZql11o63BujaHzK44yRqMTiRk,29034
+ statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
+ statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
+ statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
+ statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+ statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+ statement_extractor/plugins/splitters/t5_gemma.py,sha256=5qjxeHznuAA9hL8EbUDDGQ3N2gYLmtg0hv9BsLWzfMk,9971
+ statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+ statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
+ statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
+ corp_extractor-0.9.3.dist-info/METADATA,sha256=Ps8LucareMigmuhXiPIDUXPgsWp5F7noVYT7VbTrSZA,29633
+ corp_extractor-0.9.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ corp_extractor-0.9.3.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+ corp_extractor-0.9.3.dist-info/RECORD,,
@@ -29,7 +29,7 @@ Example:
  >>> data = extract_statements_as_dict("Some text...")
  """

- __version__ = "0.3.0"
+ __version__ = "0.6.0"

  # Core models
  from .models import (