corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
**{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA**

````diff
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: corp-extractor
-Version: 0.5.0
-Summary: Extract structured
+Version: 0.9.3
+Summary: Extract structured entity and relationship information from text
 Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
 Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
 Project-URL: Repository, https://github.com/corp-o-rate/statement-extractor
@@ -9,7 +9,7 @@ Project-URL: Issues, https://github.com/corp-o-rate/statement-extractor/issues
 Author-email: Corp-o-Rate <neil@corp-o-rate.com>
 Maintainer-email: Corp-o-Rate <neil@corp-o-rate.com>
 License: MIT
-Keywords: diverse-beam-search,embeddings,gemma,information-extraction,knowledge-graph,nlp,statement-extraction,subject-predicate-object,
+Keywords: diverse-beam-search,embeddings,entities,entity-linking,entity-resolution,gemma,information-extraction,knowledge-graph,nlp,semantic-parsing,statement-extraction,subject-predicate-object,t5gemma2,transformers,triples
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -24,14 +24,25 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Requires-Dist: accelerate>=1.12.0
+Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: click>=8.0.0
 Requires-Dist: gguf>=0.17.1
 Requires-Dist: gliner2
+Requires-Dist: httpx>=0.25.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: llama-cpp-python>=0.3.16
 Requires-Dist: numpy>=1.24.0
+Requires-Dist: pycountry>=24.6.1
 Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pymupdf>=1.23.0
 Requires-Dist: sentence-transformers>=2.2.0
+Requires-Dist: sqlite-vec>=0.1.6
 Requires-Dist: torch>=2.0.0
 Requires-Dist: transformers>=5.0.0rc3
+Provides-Extra: all
+Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
+Requires-Dist: pillow>=10.0.0; extra == 'all'
+Requires-Dist: pytesseract>=0.3.10; extra == 'all'
 Provides-Extra: dev
 Requires-Dist: mypy>=1.0.0; extra == 'dev'
 Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
@@ -39,6 +50,9 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Requires-Dist: ruff>=0.1.0; extra == 'dev'
 Provides-Extra: llm
 Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
+Provides-Extra: ocr
+Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
 Description-Content-Type: text/markdown
 
 # Corp Extractor
@@ -51,18 +65,21 @@ Extract structured subject-predicate-object statements from unstructured text us
 
 ## Features
 
-- **
+- **Person Database** *(v0.9.2)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
+- **Organization Canonicalization** *(v0.9.2)*: Link equivalent records across sources (LEI, ticker, CIK, name matching)
+- **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
+- **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
+- **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
 - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
 - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
-- **Entity Qualification** *(v0.
-- **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+- **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
 - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
 - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
 - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
 - **Beam Merging**: Combines top beams for better coverage instead of picking one
 - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
 - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
-- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `
+- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
 - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
 
 ## Installation
@@ -143,7 +160,7 @@ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
 corp-extractor split "Apple Inc. announced the iPhone 15."
 corp-extractor split -f article.txt --json
 
-# Full
+# Full 5-stage pipeline (entity resolution, labeling, taxonomy)
 corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
 corp-extractor pipeline -f article.txt --stages 1-3
 corp-extractor pipeline "..." --disable-plugins sec_edgar
@@ -206,10 +223,10 @@ Pipeline Options:
   -o, --output [table|json|yaml|triples]  Output format
 ```
 
-##
+## Quality Scoring & Beam Merging
 
-By default, the library
-- **Scores each triple**
+By default, the library:
+- **Scores each triple** using semantic similarity (50%) + GLiNER2 entity recognition (50%)
 - **Merges top beams** instead of selecting one, improving coverage
 - **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")
````
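The new "Quality Scoring & Beam Merging" section above describes a 50/50 blend of semantic similarity and GLiNER2 entity recognition. A minimal sketch of that blend (illustrative only: the model choice, function shape, and the stand-in `entity_confidence` input are assumptions, not the package's internals):

```python
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

def score_triple(source_text: str, triple: tuple[str, str, str],
                 entity_confidence: float) -> float:
    """Blend semantic similarity (50%) with entity-recognition confidence (50%),
    per the README's description. `entity_confidence` stands in for GLiNER2's
    score of the triple's subject/object spans."""
    restated = " ".join(triple)  # "subject predicate object"
    semantic = util.cos_sim(
        embedder.encode(source_text, convert_to_tensor=True),
        embedder.encode(restated, convert_to_tensor=True),
    ).item()
    return 0.5 * semantic + 0.5 * entity_confidence
```

The same kind of embedder can back the "bought" ≈ "acquired" dedup bullet: near-duplicate predicates are those whose embeddings exceed a cosine-similarity threshold.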
````diff
@@ -297,9 +314,9 @@ for stmt in fixed_statements:
 
 During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
 
-##
+## Pipeline Architecture
 
-
+The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
 
 ### Pipeline Stages
 
````
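The hunk above documents reversed-duplicate merging, with orientation picked by source-text similarity. A sketch of that behaviour (illustrative; the package's actual logic lives in its deduplicator/predicate-comparer modules and may differ):

```python
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model

def merge_reversed(triples: list[tuple[str, str, str]], source: str):
    """Collapse (A, P, B) / (B, P, A) pairs, keeping the orientation whose
    restatement reads most like the source text."""
    src = embedder.encode(source, convert_to_tensor=True)

    def fit(t: tuple[str, str, str]) -> float:
        # How closely "subject predicate object" matches the source sentence.
        return util.cos_sim(src, embedder.encode(" ".join(t), convert_to_tensor=True)).item()

    kept: dict[tuple[frozenset[str], str], tuple[str, str, str]] = {}
    for s, p, o in triples:
        key = (frozenset((s, o)), p)  # same entity pair + predicate, either direction
        cand = (s, p, o)
        kept[key] = cand if key not in kept else max(kept[key], cand, key=fit)
    return list(kept.values())
```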
````diff
@@ -307,10 +324,9 @@ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
 |-------|------|-------|--------|----------|
 | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
 | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
-| 3 | Qualification | Entities | `
-| 4 |
-| 5 |
-| 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+| 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
+| 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+| 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
 
 ### Pipeline Python API
 
@@ -339,8 +355,8 @@ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
 
 # Run only specific stages
 config = PipelineConfig(
-    enabled_stages={1, 2, 3},  # Skip
-    disabled_plugins={"
+    enabled_stages={1, 2, 3},  # Skip labeling and taxonomy
+    disabled_plugins={"person_qualifier"},  # Disable specific plugins
 )
 pipeline = ExtractionPipeline(config)
 ctx = pipeline.process(text)
````
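The hunk header above shows `PipelineConfig.from_stage_string("1-3")`. The stage-string form it implies is a small range grammar; a hedged sketch of such a parser (illustrative only — the package's actual method may accept more forms):

```python
def parse_stage_string(spec: str) -> set[int]:
    """Parse "1-3", "1,3,5", or a mix like "1-2,5" into stage numbers."""
    stages: set[int] = set()
    for part in spec.split(","):
        if "-" in part:
            lo, hi = part.split("-", 1)
            stages.update(range(int(lo), int(hi) + 1))
        else:
            stages.add(int(part))
    return stages

assert parse_stage_string("1-3") == {1, 2, 3}
```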
````diff
@@ -358,24 +374,206 @@ config = PipelineConfig.from_stage_string("1-3")  # Stages 1, 2, 3
 - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction
 
 **Qualifiers (Stage 3):**
-- `person_qualifier` - PERSON → role, org (
-- `
-- `companies_house_qualifier` - ORG → UK company number
-- `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+- `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
+- `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database
 
-**
-- `organization_canonicalizer` - ORG canonical names
-- `person_canonicalizer` - PERSON name variants
-
-**Labelers (Stage 5):**
+**Labelers (Stage 4):**
 - `sentiment_labeler` - Statement sentiment analysis
+- `confidence_labeler` - Confidence scoring
+- `relation_type_labeler` - Relation type classification
 
-**Taxonomy Classifiers (Stage
+**Taxonomy Classifiers (Stage 5):**
 - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
 - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification
 
 Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
 
+## Entity Database
+
+The library includes an **entity embedding database** for fast entity qualification using vector similarity search. It stores records from authoritative sources (GLEIF, SEC, Companies House, Wikidata) with 768-dimensional embeddings for semantic matching.
+
+**Quick start:**
+```bash
+corp-extractor db download                  # Download pre-built database
+corp-extractor db search "Microsoft"        # Search organizations
+corp-extractor db search-people "Tim Cook"  # Search people
+```
+
+For comprehensive documentation including schema, CLI reference, Python API, and build instructions, see **[ENTITY_DATABASE.md](./ENTITY_DATABASE.md)**.
+
+## New in v0.6.0: Entity Embedding Database
+
+v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
+
+### Data Sources
+
+**Organizations:**
+
+| Source | Records | Identifier | EntityType Mapping |
+|--------|---------|------------|-------------------|
+| GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
+| SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
+| Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
+| Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
+
+**People** *(v0.9.0)*:
+
+| Source | Records | Identifier | PersonType Classification |
+|--------|---------|------------|--------------------------|
+| Wikidata (SPARQL) | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
+| Wikidata (Dump) | All humans with enwiki | Wikidata QID | Classified from positions (P39) and occupations (P106) |
+
+**Date Fields**: All importers now include `from_date` and `to_date` where available:
+- **GLEIF**: LEI registration date
+- **SEC Edgar**: First SEC filing date
+- **Companies House**: Incorporation and dissolution dates
+- **Wikidata Orgs**: Inception (P571) and dissolution (P576) dates
+- **Wikidata People**: Position start (P580) and end (P582) dates
+
+**Note**: The same person can have multiple records with different role/org combinations (unique on `source_id + role + org`). Organizations discovered during people import are automatically inserted into the organizations table with `known_for_org_id` foreign key linking people to their organizations.
+
+### EntityType Classification
+
+Each organization record is classified with an `entity_type` field:
+
+| Category | Types |
+|----------|-------|
+| Business | `business`, `fund`, `branch` |
+| Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
+| Government | `government`, `international_org`, `political_party` |
+| Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
+
+### Building the Database
+
+```bash
+# Import organizations from authoritative sources
+corp-extractor db import-gleif --download
+corp-extractor db import-sec --download              # Bulk submissions.zip (~100K+ filers)
+corp-extractor db import-companies-house --download
+corp-extractor db import-wikidata --limit 50000
+
+# Import notable people (v0.9.0)
+corp-extractor db import-people --type executive --limit 5000
+corp-extractor db import-people --all --limit 10000                 # All person types
+corp-extractor db import-people --type executive --skip-existing    # Skip existing records
+corp-extractor db import-people --type executive --enrich-dates     # Fetch role start/end dates
+
+# Import from Wikidata dump (v0.9.1) - avoids SPARQL timeouts
+corp-extractor db import-wikidata-dump --download --limit 50000                      # Downloads ~100GB dump
+corp-extractor db import-wikidata-dump --dump /path/to/dump.bz2 --people --no-orgs   # Local dump
+
+# Check status
+corp-extractor db status
+
+# Search for an organization
+corp-extractor db search "Microsoft"
+
+# Search for a person (v0.9.0)
+corp-extractor db search-people "Tim Cook"
+```
+
+### Using in Pipeline
+
+The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
+
+```python
+from statement_extractor.pipeline import ExtractionPipeline
+
+pipeline = ExtractionPipeline()
+ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn}")  # e.g., "Microsoft (sec_edgar:0000789019)"
+```
+
+### Publishing to HuggingFace
+
+```bash
+# Upload database with all variants (full, lite, compressed)
+export HF_TOKEN="hf_..."
+corp-extractor db upload                  # Uses default cache location
+corp-extractor db upload entities.db      # Or specify path
+corp-extractor db upload --no-lite        # Skip lite version
+corp-extractor db upload --no-compress    # Skip compressed versions
+
+# Download pre-built database (lite version by default)
+corp-extractor db download          # Lite version (smaller, faster)
+corp-extractor db download --full   # Full version with all metadata
+
+# Local database management
+corp-extractor db create-lite entities.db   # Create lite version
+corp-extractor db compress entities.db      # Compress with gzip
+```
+
+See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
+
+## New in v0.7.0: Document Processing
+
+v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
+
+### Document CLI
+
+```bash
+# Process local files
+corp-extractor document process article.txt
+corp-extractor document process report.txt --title "Annual Report" --year 2024
+
+# Process URLs (web pages and PDFs)
+corp-extractor document process https://example.com/article
+corp-extractor document process https://example.com/report.pdf --use-ocr
+
+# Configure chunking
+corp-extractor document process article.txt --max-tokens 500 --overlap 50
+
+# Preview chunking without extraction
+corp-extractor document chunk article.txt --max-tokens 500
+```
+
+### Document Python API
+
+```python
+from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
+from statement_extractor.models.document import ChunkingConfig
+
+# Configure document processing
+config = DocumentPipelineConfig(
+    chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
+    generate_summary=True,
+    deduplicate_across_chunks=True,
+)
+
+pipeline = DocumentPipeline(config)
+
+# Process text
+document = Document.from_text("Your long document text...", title="My Document")
+ctx = pipeline.process(document)
+
+# Process URL (async)
+ctx = await pipeline.process_url("https://example.com/article")
+
+# Access results
+print(f"Chunks: {ctx.chunk_count}")
+print(f"Statements: {ctx.statement_count}")
+print(f"Duplicates removed: {ctx.duplicates_removed}")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+    if stmt.citation:
+        print(f"  Citation: {stmt.citation}")
+```
+
+### PDF Processing
+
+PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
+
+```bash
+# Install OCR dependencies
+pip install "corp-extractor[ocr]"
+
+# Process with OCR
+corp-extractor document process scanned.pdf --use-ocr
+```
+
 ## New in v0.4.0: GLiNER2 Integration
 
 v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
````
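Given the new `sqlite-vec>=0.1.6` dependency and the README's mention of 768-dimensional embeddings, the lookup behind `db search` plausibly runs a KNN query over a vec0 virtual table. A sketch under those assumptions (the table name, column name, and embedding model below are hypothetical, not the package's actual schema):

```python
import sqlite3

import sqlite_vec
from sqlite_vec import serialize_float32
from sentence_transformers import SentenceTransformer

db = sqlite3.connect("entities.db")
db.enable_load_extension(True)
sqlite_vec.load(db)  # registers the vec0 virtual-table module
db.enable_load_extension(False)

model = SentenceTransformer("all-mpnet-base-v2")  # assumed 768-dim model
query = model.encode("Microsoft").tolist()

# Hypothetical table: one float[768] embedding per organization record.
rows = db.execute(
    "SELECT rowid, distance FROM org_embeddings "
    "WHERE embedding MATCH ? ORDER BY distance LIMIT 5",
    (serialize_float32(query),),
).fetchall()
```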
````diff
@@ -575,7 +773,7 @@ for text in texts:
 This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):
 
 1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with diversity penalty
-2. **Quality Scoring**: Each triple scored
+2. **Quality Scoring**: Each triple scored via semantic similarity + GLiNER2 entity recognition
 3. **Beam Merging**: Top beams combined for better coverage
 4. **Embedding Dedup**: Semantic similarity removes near-duplicate predicates
 5. **Predicate Normalization**: Optional taxonomy matching via embeddings
````
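Steps 1-2 above map onto Hugging Face transformers' group ("diverse") beam search. A minimal sketch, assuming a seq2seq checkpoint and a transformers version that exposes these generation options (the model id is a placeholder, not the package's T5-Gemma 2 checkpoint):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/flan-t5-small")  # placeholder model
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

inputs = tok("Amazon CEO Andy Jassy announced plans to hire workers.",
             return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=8,
    num_beam_groups=4,       # beams split into groups...
    diversity_penalty=1.0,   # ...penalized for repeating earlier groups' tokens
    num_return_sequences=4,  # keep 4 candidates for scoring/merging
    max_new_tokens=128,
)
candidates = tok.batch_decode(outputs, skip_special_tokens=True)
```

Each candidate is then scored, merged, and deduplicated as in steps 2-4.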
**corp_extractor-0.9.3.dist-info/RECORD**

````diff
@@ -0,0 +1,79 @@
+statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
+statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+statement_extractor/cli.py,sha256=l4YcqKmtks6exMAIHSUw_ukWGZ4x-v_V_Gnm-wOGc3g,106464
+statement_extractor/extractor.py,sha256=m10na6I2iU1GwokQTxodePttYgigHykoss5LWrE8JOQ,38418
+statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+statement_extractor/models.py,sha256=rBotCX2hRTMW4MXXkkWYv4JctP0HQR0NSJSlBcNhsF0,12302
+statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
+statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
+statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
+statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
+statement_extractor/database/hub.py,sha256=3xCvbCeqC6GR3XgVow7MAXg46ZPDYfDX8it93Xikw5w,13295
+statement_extractor/database/models.py,sha256=4aLs5tp2QTAd9vAyPf80EUoHypd_K0jELcY4J51iaNw,10563
+statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
+statement_extractor/database/store.py,sha256=FPyfC6KjD6pjfU2jccVEvsAcCtfqMdwVJuS7aqCNrKA,112320
+statement_extractor/database/importers/__init__.py,sha256=acIoX_BPdXv2DOMFyVbFZPDGNWp2s1FpC774loTqL5I,1121
+statement_extractor/database/importers/companies_house.py,sha256=b5OMFtoHhkPgoGK08ThQn9BtTu9uC_dYzBVpC10xT4U,20252
+statement_extractor/database/importers/companies_house_officers.py,sha256=QDFA0FzqDx9p6VjRrB7o4BE3e30l7i0ML_ktntsB-kA,15565
+statement_extractor/database/importers/gleif.py,sha256=sw4YYROD6wi7IbBEKGCn8kko0nOYbKOyukDJKGQp17Q,20200
+statement_extractor/database/importers/sec_edgar.py,sha256=0nnhnOrf5d1wR9PGjl8AuNOnp4mfmEtopjkgUY_PLQc,13738
+statement_extractor/database/importers/sec_form4.py,sha256=ZoV-oyNhG5AOUm4u9hemmRI5KnpNs3Gw_dfisjkD3zU,18234
+statement_extractor/database/importers/wikidata.py,sha256=tRj4kEMVIq7sRXxjyxj-scl8eXybkrLVvyNDYV2T5lg,39572
+statement_extractor/database/importers/wikidata_dump.py,sha256=GSLn_BV4h-Efms2tp_eYyhqSJsRFjnZzyqgaUCDmyVY,77903
+statement_extractor/database/importers/wikidata_people.py,sha256=s4AB2pQLK2qHK9X5BLoW-II3qZBbJG4zbU3Ro4FBT9o,43157
+statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
+statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
+statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
+statement_extractor/document/deduplicator.py,sha256=R_RwEdVeVQBYZHvjkVA0ShAWr8x618VrO9dkYWXvifI,4771
+statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
+statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
+statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
+statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
+statement_extractor/models/__init__.py,sha256=OJOK0ral_jskrSxx6nCc3TB6JlVYaC5HI2eYXr9dhMQ,2971
+statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
+statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
+statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
+statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
+statement_extractor/models/statement.py,sha256=Wpp2OtZ5inhqbtEcblWdcES7g7lA-FVjqjz6Jq7hqzo,3329
+statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
+statement_extractor/pipeline/context.py,sha256=evAdyH5oOCNM_ILGZNS1mov3lM4D3mCvr5hzsjaB0Bs,6136
+statement_extractor/pipeline/orchestrator.py,sha256=qH6rD4_wI_kZ_e8NeIv2XYHUA07ldogFewFsZeRQVxw,16687
+statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
+statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
+statement_extractor/plugins/base.py,sha256=xC661iFtnhIxtZLTwuCc-0rFV1q2V3hCTV-uOaILsOA,21622
+statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+statement_extractor/plugins/extractors/gliner2.py,sha256=yDwKJVniMj4YwjR4Rm6MALDk633H5qcKcxa2xOLh9LI,21999
+statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
+statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
+statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
+statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
+statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
+statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
+statement_extractor/plugins/qualifiers/embedding_company.py,sha256=nc7oTFjEBuPiprjXKeFRiMYM6tNicMNum_xQ9LSgEOg,14756
+statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
+statement_extractor/plugins/qualifiers/person.py,sha256=EN1T0G9NT6wOeIGljzZql11o63BujaHzK44yRqMTiRk,29034
+statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
+statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
+statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
+statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+statement_extractor/plugins/splitters/t5_gemma.py,sha256=5qjxeHznuAA9hL8EbUDDGQ3N2gYLmtg0hv9BsLWzfMk,9971
+statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
+statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
+corp_extractor-0.9.3.dist-info/METADATA,sha256=Ps8LucareMigmuhXiPIDUXPgsWp5F7noVYT7VbTrSZA,29633
+corp_extractor-0.9.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+corp_extractor-0.9.3.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+corp_extractor-0.9.3.dist-info/RECORD,,
````