corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
- statement_extractor/cli.py +472 -45
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +51 -9
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/wikidata_dump.py +334 -3
- statement_extractor/database/importers/wikidata_people.py +44 -0
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +125 -1
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +2113 -322
- statement_extractor/plugins/qualifiers/person.py +109 -52
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: corp-extractor
|
|
3
|
-
Version: 0.9.3
|
|
3
|
+
Version: 0.9.4
|
|
4
4
|
Summary: Extract structured entity and relationship information from text
|
|
5
5
|
Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
|
|
6
6
|
Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
|
|
@@ -57,7 +57,7 @@ Description-Content-Type: text/markdown
|
|
|
57
57
|
|
|
58
58
|
# Corp Extractor
|
|
59
59
|
|
|
60
|
-
|
|
60
|
+
Analyze complex text to extract relationship information about people and organizations. Runs entirely on your hardware (RTX 4090+, Apple M1 16GB+) with no external API dependencies. Uses fine-tuned T5-Gemma 2 for statement splitting and coreference resolution, plus GLiNER2 for entity extraction. Includes a database of 10M+ organizations and 40M+ people with quantized embeddings for fast entity qualification.
|
|
61
61
|
|
|
62
62
|
[](https://pypi.org/project/corp-extractor/)
|
|
63
63
|
[](https://pypi.org/project/corp-extractor/)
|
|
@@ -65,6 +65,7 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
65
65
|
|
|
66
66
|
## Features
|
|
67
67
|
|
|
68
|
+
- **Database v2 Schema** *(v0.9.4)*: Normalized schema with INTEGER FK references, new roles/locations tables, int8 scalar embeddings (75% smaller)
|
|
68
69
|
- **Person Database** *(v0.9.2)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
|
|
69
70
|
- **Organization Canonicalization** *(v0.9.2)*: Link equivalent records across sources (LEI, ticker, CIK, name matching)
|
|
70
71
|
- **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
|
|
@@ -397,10 +398,34 @@ The library includes an **entity embedding database** for fast entity qualificat
|
|
|
397
398
|
corp-extractor db download # Download pre-built database
|
|
398
399
|
corp-extractor db search "Microsoft" # Search organizations
|
|
399
400
|
corp-extractor db search-people "Tim Cook" # Search people
|
|
401
|
+
corp-extractor db search-roles "CEO" # Search roles (v0.9.4)
|
|
402
|
+
corp-extractor db search-locations "California" # Search locations (v0.9.4)
|
|
400
403
|
```
|
|
401
404
|
|
|
402
405
|
For comprehensive documentation including schema, CLI reference, Python API, and build instructions, see **[ENTITY_DATABASE.md](./ENTITY_DATABASE.md)**.
|
|
403
406
|
|
|
407
|
+
## New in v0.9.4: Database v2 Schema
|
|
408
|
+
|
|
409
|
+
v0.9.4 introduces a **normalized v2 schema** with significant improvements:
|
|
410
|
+
|
|
411
|
+
- **INTEGER FK references** replace TEXT enum columns for better query performance
|
|
412
|
+
- **New enum lookup tables**: `source_types`, `people_types`, `organization_types`, `location_types`
|
|
413
|
+
- **New tables**: `roles` (job titles with Wikidata QID), `locations` (countries/states/cities with hierarchy)
|
|
414
|
+
- **Scalar (int8) embeddings**: 75% storage reduction with ~92% recall at top-100
|
|
415
|
+
- **QID as integers**: Wikidata QIDs stored as integers (Q prefix stripped)
|
|
416
|
+
- **Human-readable views**: `organizations_view`, `people_view`, `roles_view`, `locations_view`
|
|
417
|
+
|
|
418
|
+
**Migration:**
|
|
419
|
+
```bash
|
|
420
|
+
# Migrate existing v1 database to v2
|
|
421
|
+
corp-extractor db migrate-v2 entities.db entities-v2.db
|
|
422
|
+
|
|
423
|
+
# Generate int8 scalar embeddings
|
|
424
|
+
corp-extractor db backfill-scalar
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
**Default database path**: `~/.cache/corp-extractor/entities-v2.db`
|
|
428
|
+
|
|
404
429
|
## New in v0.6.0: Entity Embedding Database
|
|
405
430
|
|
|
406
431
|
v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
|
|
@@ -461,6 +486,11 @@ corp-extractor db import-people --type executive --enrich-dates # Fetch role s
|
|
|
461
486
|
# Import from Wikidata dump (v0.9.1) - avoids SPARQL timeouts
|
|
462
487
|
corp-extractor db import-wikidata-dump --download --limit 50000 # Downloads ~100GB dump
|
|
463
488
|
corp-extractor db import-wikidata-dump --dump /path/to/dump.bz2 --people --no-orgs # Local dump
|
|
489
|
+
corp-extractor db import-wikidata-dump --dump dump.bz2 --locations --no-people --no-orgs # Locations only (v0.9.4)
|
|
490
|
+
|
|
491
|
+
# Migrate to v2 schema (v0.9.4)
|
|
492
|
+
corp-extractor db migrate-v2 entities.db entities-v2.db
|
|
493
|
+
corp-extractor db backfill-scalar # Generate int8 embeddings (75% smaller)
|
|
464
494
|
|
|
465
495
|
# Check status
|
|
466
496
|
corp-extractor db status
|
|
@@ -505,7 +535,7 @@ corp-extractor db create-lite entities.db # Create lite version
|
|
|
505
535
|
corp-extractor db compress entities.db # Compress with gzip
|
|
506
536
|
```
|
|
507
537
|
|
|
508
|
-
See [
|
|
538
|
+
See [ENTITY_DATABASE.md](./ENTITY_DATABASE.md) for complete build and publish instructions.
|
|
509
539
|
|
|
510
540
|
## New in v0.7.0: Document Processing
|
|
511
541
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
|
|
2
2
|
statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
|
|
3
|
-
statement_extractor/cli.py,sha256=
|
|
3
|
+
statement_extractor/cli.py,sha256=2c3K5wUWL03xRndkvNI1rzFGkcYXJYzTxX4wVIP1O3I,125325
|
|
4
4
|
statement_extractor/extractor.py,sha256=m10na6I2iU1GwokQTxodePttYgigHykoss5LWrE8JOQ,38418
|
|
5
5
|
statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
|
|
6
6
|
statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
|
|
@@ -10,20 +10,24 @@ statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wV
|
|
|
10
10
|
statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
|
|
11
11
|
statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
|
|
12
12
|
statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
|
|
13
|
-
statement_extractor/database/embeddings.py,sha256=
|
|
14
|
-
statement_extractor/database/hub.py,sha256=
|
|
15
|
-
statement_extractor/database/
|
|
13
|
+
statement_extractor/database/embeddings.py,sha256=VT49amsNyCuhnoGFfYsSii8bPIrnatzvzmQhoq_wlxQ,6965
|
|
14
|
+
statement_extractor/database/hub.py,sha256=3T3yooMI2kpk-SnjSxxglKEVsckC_dGDUEWjnKEJWBk,15128
|
|
15
|
+
statement_extractor/database/migrate_v2.py,sha256=I3zHEMPD5q2dTzLIxrqc6Fxj3y0XHe28UWKOd6CLD3g,29789
|
|
16
|
+
statement_extractor/database/models.py,sha256=GSyZZUPjIWLY9V3l-Fi44dnc9SgD61mhuYZUEZEiDV0,15913
|
|
16
17
|
statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
|
|
17
|
-
statement_extractor/database/
|
|
18
|
+
statement_extractor/database/schema_v2.py,sha256=QUxBp6-X2hM3DRY52vxYN0DRpDG0d1abXJ4uoWPYApA,13330
|
|
19
|
+
statement_extractor/database/seed_data.py,sha256=z_F73_LfZxAoW3fg2Or-oRBjpD-9mn5TSwhkL2D4dWE,10030
|
|
20
|
+
statement_extractor/database/store.py,sha256=fEAm4KWfJ0Z6ZzlVq6jmZKoVy8GlAGa5wgn2vV1jGDk,180742
|
|
18
21
|
statement_extractor/database/importers/__init__.py,sha256=acIoX_BPdXv2DOMFyVbFZPDGNWp2s1FpC774loTqL5I,1121
|
|
19
22
|
statement_extractor/database/importers/companies_house.py,sha256=b5OMFtoHhkPgoGK08ThQn9BtTu9uC_dYzBVpC10xT4U,20252
|
|
20
23
|
statement_extractor/database/importers/companies_house_officers.py,sha256=QDFA0FzqDx9p6VjRrB7o4BE3e30l7i0ML_ktntsB-kA,15565
|
|
21
24
|
statement_extractor/database/importers/gleif.py,sha256=sw4YYROD6wi7IbBEKGCn8kko0nOYbKOyukDJKGQp17Q,20200
|
|
25
|
+
statement_extractor/database/importers/import_utils.py,sha256=2nVsUelN4_mKQ08qfzpeJsxkA9piyANznnmRs50Qt0w,6335
|
|
22
26
|
statement_extractor/database/importers/sec_edgar.py,sha256=0nnhnOrf5d1wR9PGjl8AuNOnp4mfmEtopjkgUY_PLQc,13738
|
|
23
27
|
statement_extractor/database/importers/sec_form4.py,sha256=ZoV-oyNhG5AOUm4u9hemmRI5KnpNs3Gw_dfisjkD3zU,18234
|
|
24
28
|
statement_extractor/database/importers/wikidata.py,sha256=tRj4kEMVIq7sRXxjyxj-scl8eXybkrLVvyNDYV2T5lg,39572
|
|
25
|
-
statement_extractor/database/importers/wikidata_dump.py,sha256=
|
|
26
|
-
statement_extractor/database/importers/wikidata_people.py,sha256=
|
|
29
|
+
statement_extractor/database/importers/wikidata_dump.py,sha256=6vTluVuXm5INq5urhnd_es5i4mzE3HM0cEKJIblGTbU,93101
|
|
30
|
+
statement_extractor/database/importers/wikidata_people.py,sha256=vrEFGvMdXUT3Fz_diJxQrR0qch7P-rAElKeBRnssSG0,44964
|
|
27
31
|
statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
|
|
28
32
|
statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
|
|
29
33
|
statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
|
|
@@ -63,7 +67,7 @@ statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekIt
|
|
|
63
67
|
statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
|
|
64
68
|
statement_extractor/plugins/qualifiers/embedding_company.py,sha256=nc7oTFjEBuPiprjXKeFRiMYM6tNicMNum_xQ9LSgEOg,14756
|
|
65
69
|
statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
|
|
66
|
-
statement_extractor/plugins/qualifiers/person.py,sha256=
|
|
70
|
+
statement_extractor/plugins/qualifiers/person.py,sha256=SKBCFnIKCJJt77qKyPi_kla7DDZl-n64FcU7txMKs9U,32154
|
|
67
71
|
statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
|
|
68
72
|
statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
|
|
69
73
|
statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
|
|
@@ -73,7 +77,7 @@ statement_extractor/plugins/splitters/t5_gemma.py,sha256=5qjxeHznuAA9hL8EbUDDGQ3
|
|
|
73
77
|
statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
|
|
74
78
|
statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
|
|
75
79
|
statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
|
|
76
|
-
corp_extractor-0.9.
|
|
77
|
-
corp_extractor-0.9.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
78
|
-
corp_extractor-0.9.3.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
|
|
79
|
-
corp_extractor-0.9.3.dist-info/RECORD,,
|
|
80
|
+
corp_extractor-0.9.4.dist-info/METADATA,sha256=a5pkoSpziVKqggeeSX_TGfSKk67GtB8ywpl6YzOdX6c,31449
|
|
81
|
+
corp_extractor-0.9.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
82
|
+
corp_extractor-0.9.4.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
|
|
83
|
+
corp_extractor-0.9.4.dist-info/RECORD,,
|