corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corp-extractor
3
- Version: 0.9.0
4
- Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
3
+ Version: 0.9.4
4
+ Summary: Extract structured entity and relationship information from text
5
5
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
6
6
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
7
7
  Project-URL: Repository, https://github.com/corp-o-rate/statement-extractor
@@ -9,7 +9,7 @@ Project-URL: Issues, https://github.com/corp-o-rate/statement-extractor/issues
9
9
  Author-email: Corp-o-Rate <neil@corp-o-rate.com>
10
10
  Maintainer-email: Corp-o-Rate <neil@corp-o-rate.com>
11
11
  License: MIT
12
- Keywords: diverse-beam-search,embeddings,gemma,information-extraction,knowledge-graph,nlp,statement-extraction,subject-predicate-object,t5,transformers,triples
12
+ Keywords: diverse-beam-search,embeddings,entities,entity-linking,entity-resolution,gemma,information-extraction,knowledge-graph,nlp,semantic-parsing,statement-extraction,subject-predicate-object,t5gemma2,transformers,triples
13
13
  Classifier: Development Status :: 4 - Beta
14
14
  Classifier: Intended Audience :: Developers
15
15
  Classifier: Intended Audience :: Science/Research
@@ -32,6 +32,7 @@ Requires-Dist: httpx>=0.25.0
32
32
  Requires-Dist: huggingface-hub>=0.20.0
33
33
  Requires-Dist: llama-cpp-python>=0.3.16
34
34
  Requires-Dist: numpy>=1.24.0
35
+ Requires-Dist: pycountry>=24.6.1
35
36
  Requires-Dist: pydantic>=2.0.0
36
37
  Requires-Dist: pymupdf>=1.23.0
37
38
  Requires-Dist: sentence-transformers>=2.2.0
@@ -56,7 +57,7 @@ Description-Content-Type: text/markdown
56
57
 
57
58
  # Corp Extractor
58
59
 
59
- Extract structured subject-predicate-object statements from unstructured text using the T5-Gemma 2 model.
60
+ Analyze complex text to extract relationship information about people and organizations. Runs entirely on your hardware (RTX 4090+, Apple M1 16GB+) with no external API dependencies. Uses fine-tuned T5-Gemma 2 for statement splitting and coreference resolution, plus GLiNER2 for entity extraction. Includes a database of 10M+ organizations and 40M+ people with quantized embeddings for fast entity qualification.
60
61
 
61
62
  [![PyPI version](https://img.shields.io/pypi/v/corp-extractor.svg)](https://pypi.org/project/corp-extractor/)
62
63
  [![Python 3.10+](https://img.shields.io/pypi/pyversions/corp-extractor.svg)](https://pypi.org/project/corp-extractor/)
@@ -64,7 +65,9 @@ Extract structured subject-predicate-object statements from unstructured text us
64
65
 
65
66
  ## Features
66
67
 
67
- - **Person Database** *(v0.9.0)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
68
+ - **Database v2 Schema** *(v0.9.4)*: Normalized schema with INTEGER FK references, new roles/locations tables, int8 scalar embeddings (75% smaller)
69
+ - **Person Database** *(v0.9.2)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
70
+ - **Organization Canonicalization** *(v0.9.2)*: Link equivalent records across sources (LEI, ticker, CIK, name matching)
68
71
  - **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
69
72
  - **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
70
73
  - **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
@@ -221,10 +224,10 @@ Pipeline Options:
221
224
  -o, --output [table|json|yaml|triples] Output format
222
225
  ```
223
226
 
224
- ## New in v0.2.0: Quality Scoring & Beam Merging
227
+ ## Quality Scoring & Beam Merging
225
228
 
226
- By default, the library now:
227
- - **Scores each triple** for groundedness based on whether entities appear in source text
229
+ By default, the library:
230
+ - **Scores each triple** using semantic similarity (50%) + GLiNER2 entity recognition (50%)
228
231
  - **Merges top beams** instead of selecting one, improving coverage
229
232
  - **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")
230
233
 
@@ -386,6 +389,43 @@ config = PipelineConfig.from_stage_string("1-3") # Stages 1, 2, 3
386
389
 
387
390
  Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
388
391
 
392
+ ## Entity Database
393
+
394
+ The library includes an **entity embedding database** for fast entity qualification using vector similarity search. It stores records from authoritative sources (GLEIF, SEC, Companies House, Wikidata) with 768-dimensional embeddings for semantic matching.
395
+
396
+ **Quick start:**
397
+ ```bash
398
+ corp-extractor db download # Download pre-built database
399
+ corp-extractor db search "Microsoft" # Search organizations
400
+ corp-extractor db search-people "Tim Cook" # Search people
401
+ corp-extractor db search-roles "CEO" # Search roles (v0.9.4)
402
+ corp-extractor db search-locations "California" # Search locations (v0.9.4)
403
+ ```
404
+
405
+ For comprehensive documentation including schema, CLI reference, Python API, and build instructions, see **[ENTITY_DATABASE.md](./ENTITY_DATABASE.md)**.
406
+
407
+ ## New in v0.9.4: Database v2 Schema
408
+
409
+ v0.9.4 introduces a **normalized v2 schema** with significant improvements:
410
+
411
+ - **INTEGER FK references** replace TEXT enum columns for better query performance
412
+ - **New enum lookup tables**: `source_types`, `people_types`, `organization_types`, `location_types`
413
+ - **New tables**: `roles` (job titles with Wikidata QID), `locations` (countries/states/cities with hierarchy)
414
+ - **Scalar (int8) embeddings**: 75% storage reduction with ~92% recall at top-100
415
+ - **QID as integers**: Wikidata QIDs stored as integers (Q prefix stripped)
416
+ - **Human-readable views**: `organizations_view`, `people_view`, `roles_view`, `locations_view`
417
+
418
+ **Migration:**
419
+ ```bash
420
+ # Migrate existing v1 database to v2
421
+ corp-extractor db migrate-v2 entities.db entities-v2.db
422
+
423
+ # Generate int8 scalar embeddings
424
+ corp-extractor db backfill-scalar
425
+ ```
426
+
427
+ **Default database path**: `~/.cache/corp-extractor/entities-v2.db`
428
+
389
429
  ## New in v0.6.0: Entity Embedding Database
390
430
 
391
431
  v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
@@ -405,7 +445,17 @@ v0.6.0 introduces an **entity embedding database** for fast entity qualification
405
445
 
406
446
  | Source | Records | Identifier | PersonType Classification |
407
447
  |--------|---------|------------|--------------------------|
408
- | Wikidata | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
448
+ | Wikidata (SPARQL) | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
449
+ | Wikidata (Dump) | All humans with enwiki | Wikidata QID | Classified from positions (P39) and occupations (P106) |
450
+
451
+ **Date Fields**: All importers now include `from_date` and `to_date` where available:
452
+ - **GLEIF**: LEI registration date
453
+ - **SEC Edgar**: First SEC filing date
454
+ - **Companies House**: Incorporation and dissolution dates
455
+ - **Wikidata Orgs**: Inception (P571) and dissolution (P576) dates
456
+ - **Wikidata People**: Position start (P580) and end (P582) dates
457
+
458
+ **Note**: The same person can have multiple records with different role/org combinations (unique on `source_id + role + org`). Organizations discovered during people import are automatically inserted into the organizations table, with a `known_for_org_id` foreign key linking people to their organizations.
409
459
 
410
460
  ### EntityType Classification
411
461
 
@@ -430,6 +480,17 @@ corp-extractor db import-wikidata --limit 50000
430
480
  # Import notable people (v0.9.0)
431
481
  corp-extractor db import-people --type executive --limit 5000
432
482
  corp-extractor db import-people --all --limit 10000 # All person types
483
+ corp-extractor db import-people --type executive --skip-existing # Skip existing records
484
+ corp-extractor db import-people --type executive --enrich-dates # Fetch role start/end dates
485
+
486
+ # Import from Wikidata dump (v0.9.1) - avoids SPARQL timeouts
487
+ corp-extractor db import-wikidata-dump --download --limit 50000 # Downloads ~100GB dump
488
+ corp-extractor db import-wikidata-dump --dump /path/to/dump.bz2 --people --no-orgs # Local dump
489
+ corp-extractor db import-wikidata-dump --dump dump.bz2 --locations --no-people --no-orgs # Locations only (v0.9.4)
490
+
491
+ # Migrate to v2 schema (v0.9.4)
492
+ corp-extractor db migrate-v2 entities.db entities-v2.db
493
+ corp-extractor db backfill-scalar # Generate int8 embeddings (75% smaller)
433
494
 
434
495
  # Check status
435
496
  corp-extractor db status
@@ -474,7 +535,7 @@ corp-extractor db create-lite entities.db # Create lite version
474
535
  corp-extractor db compress entities.db # Compress with gzip
475
536
  ```
476
537
 
477
- See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
538
+ See [ENTITY_DATABASE.md](./ENTITY_DATABASE.md) for complete build and publish instructions.
478
539
 
479
540
  ## New in v0.7.0: Document Processing
480
541
 
@@ -742,7 +803,7 @@ for text in texts:
742
803
  This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):
743
804
 
744
805
  1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with diversity penalty
745
- 2. **Quality Scoring**: Each triple scored for groundedness in source text
806
+ 2. **Quality Scoring**: Each triple scored via semantic similarity + GLiNER2 entity recognition
746
807
  3. **Beam Merging**: Top beams combined for better coverage
747
808
  4. **Embedding Dedup**: Semantic similarity removes near-duplicate predicates
748
809
  5. **Predicate Normalization**: Optional taxonomy matching via embeddings
@@ -1,51 +1,58 @@
1
1
  statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
2
2
  statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
- statement_extractor/cli.py,sha256=BTFLIBZoNa2ADrYVslbXiZGrzhRWmi7ppbnAPV3xUyg,71191
4
- statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
3
+ statement_extractor/cli.py,sha256=2c3K5wUWL03xRndkvNI1rzFGkcYXJYzTxX4wVIP1O3I,125325
4
+ statement_extractor/extractor.py,sha256=m10na6I2iU1GwokQTxodePttYgigHykoss5LWrE8JOQ,38418
5
5
  statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
6
6
  statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
7
- statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
7
+ statement_extractor/models.py,sha256=rBotCX2hRTMW4MXXkkWYv4JctP0HQR0NSJSlBcNhsF0,12302
8
8
  statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
9
9
  statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
10
10
  statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
11
11
  statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
12
12
  statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
13
- statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
14
- statement_extractor/database/hub.py,sha256=HOnRp62RnkXvk2KgwqOLVpEkXwy0LS0n3tIJrkYCo2c,16842
15
- statement_extractor/database/models.py,sha256=ke4byqJiiBlZfRhxqoC0nsdDhb6YSG2I4S5W5BRBNY4,8813
13
+ statement_extractor/database/embeddings.py,sha256=VT49amsNyCuhnoGFfYsSii8bPIrnatzvzmQhoq_wlxQ,6965
14
+ statement_extractor/database/hub.py,sha256=3T3yooMI2kpk-SnjSxxglKEVsckC_dGDUEWjnKEJWBk,15128
15
+ statement_extractor/database/migrate_v2.py,sha256=I3zHEMPD5q2dTzLIxrqc6Fxj3y0XHe28UWKOd6CLD3g,29789
16
+ statement_extractor/database/models.py,sha256=GSyZZUPjIWLY9V3l-Fi44dnc9SgD61mhuYZUEZEiDV0,15913
16
17
  statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
17
- statement_extractor/database/store.py,sha256=1qdRZ7q5nTLUYbtUC9cWSLey_GVf5kAQ6dTF9EEwDXY,56735
18
- statement_extractor/database/importers/__init__.py,sha256=0CPqafekQpqxFmZhe3uMJLNssqxGzEniZlArGyl8QKU,668
19
- statement_extractor/database/importers/companies_house.py,sha256=G0DZAs_9RM7uTwY7imt70IXUVvhntoO-xXnJ0o6jjGw,19635
20
- statement_extractor/database/importers/gleif.py,sha256=MTFuksVf83Barn1c6JvBLBouxXbzogWulKb8oqEODAk,18948
21
- statement_extractor/database/importers/sec_edgar.py,sha256=_B4QcXhZ_5ulXTSVW9dKKAzFIVwn-VIh_X39jcUhqsg,12923
22
- statement_extractor/database/importers/wikidata.py,sha256=ZZYHiqSlYlco1TSzCLUKqdT-i5X1cUSK1EnsfWWwPAc,33770
23
- statement_extractor/database/importers/wikidata_people.py,sha256=loqyf5sbtBqCITiTxqV3PLyx3SefmVefhZE0Y-cRoC4,22205
18
+ statement_extractor/database/schema_v2.py,sha256=QUxBp6-X2hM3DRY52vxYN0DRpDG0d1abXJ4uoWPYApA,13330
19
+ statement_extractor/database/seed_data.py,sha256=z_F73_LfZxAoW3fg2Or-oRBjpD-9mn5TSwhkL2D4dWE,10030
20
+ statement_extractor/database/store.py,sha256=fEAm4KWfJ0Z6ZzlVq6jmZKoVy8GlAGa5wgn2vV1jGDk,180742
21
+ statement_extractor/database/importers/__init__.py,sha256=acIoX_BPdXv2DOMFyVbFZPDGNWp2s1FpC774loTqL5I,1121
22
+ statement_extractor/database/importers/companies_house.py,sha256=b5OMFtoHhkPgoGK08ThQn9BtTu9uC_dYzBVpC10xT4U,20252
23
+ statement_extractor/database/importers/companies_house_officers.py,sha256=QDFA0FzqDx9p6VjRrB7o4BE3e30l7i0ML_ktntsB-kA,15565
24
+ statement_extractor/database/importers/gleif.py,sha256=sw4YYROD6wi7IbBEKGCn8kko0nOYbKOyukDJKGQp17Q,20200
25
+ statement_extractor/database/importers/import_utils.py,sha256=2nVsUelN4_mKQ08qfzpeJsxkA9piyANznnmRs50Qt0w,6335
26
+ statement_extractor/database/importers/sec_edgar.py,sha256=0nnhnOrf5d1wR9PGjl8AuNOnp4mfmEtopjkgUY_PLQc,13738
27
+ statement_extractor/database/importers/sec_form4.py,sha256=ZoV-oyNhG5AOUm4u9hemmRI5KnpNs3Gw_dfisjkD3zU,18234
28
+ statement_extractor/database/importers/wikidata.py,sha256=tRj4kEMVIq7sRXxjyxj-scl8eXybkrLVvyNDYV2T5lg,39572
29
+ statement_extractor/database/importers/wikidata_dump.py,sha256=6vTluVuXm5INq5urhnd_es5i4mzE3HM0cEKJIblGTbU,93101
30
+ statement_extractor/database/importers/wikidata_people.py,sha256=vrEFGvMdXUT3Fz_diJxQrR0qch7P-rAElKeBRnssSG0,44964
24
31
  statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
25
32
  statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
26
33
  statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
27
- statement_extractor/document/deduplicator.py,sha256=8tPKWAGqNfjteOdnk7B82izyfIpvOebirZ-OIQKixwU,4821
34
+ statement_extractor/document/deduplicator.py,sha256=R_RwEdVeVQBYZHvjkVA0ShAWr8x618VrO9dkYWXvifI,4771
28
35
  statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
29
36
  statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
30
37
  statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
31
38
  statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
32
- statement_extractor/models/__init__.py,sha256=9FxKkJ4EIPXmSkMo_j9jqAKH6jTkvz5Tzk2YvQL7UVk,2884
39
+ statement_extractor/models/__init__.py,sha256=OJOK0ral_jskrSxx6nCc3TB6JlVYaC5HI2eYXr9dhMQ,2971
33
40
  statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
34
41
  statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
35
42
  statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
36
43
  statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
37
44
  statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
38
- statement_extractor/models/statement.py,sha256=agC4jcP9ospbZC91J6c0UgLAmfsg1tnqNcSvkqOtqBQ,3629
45
+ statement_extractor/models/statement.py,sha256=Wpp2OtZ5inhqbtEcblWdcES7g7lA-FVjqjz6Jq7hqzo,3329
39
46
  statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
40
47
  statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
41
- statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
42
- statement_extractor/pipeline/orchestrator.py,sha256=1pe6hyEtd495LJrfH3QgxQadNqERmehQEs5LHsAVIxM,16580
48
+ statement_extractor/pipeline/context.py,sha256=evAdyH5oOCNM_ILGZNS1mov3lM4D3mCvr5hzsjaB0Bs,6136
49
+ statement_extractor/pipeline/orchestrator.py,sha256=qH6rD4_wI_kZ_e8NeIv2XYHUA07ldogFewFsZeRQVxw,16687
43
50
  statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
44
51
  statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
45
- statement_extractor/plugins/base.py,sha256=ItqJZ5rH65gW4-pXpraRb45y7F3lXqsKECumhV3tDyk,21516
52
+ statement_extractor/plugins/base.py,sha256=xC661iFtnhIxtZLTwuCc-0rFV1q2V3hCTV-uOaILsOA,21622
46
53
  statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
47
54
  statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
48
- statement_extractor/plugins/extractors/gliner2.py,sha256=ObEQMNE6ArjRl2s4x3lkOSPs03cmtTYFlppnbhtkI7A,21876
55
+ statement_extractor/plugins/extractors/gliner2.py,sha256=yDwKJVniMj4YwjR4Rm6MALDk633H5qcKcxa2xOLh9LI,21999
49
56
  statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
50
57
  statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
51
58
  statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
@@ -58,19 +65,19 @@ statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8
58
65
  statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
59
66
  statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
60
67
  statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
61
- statement_extractor/plugins/qualifiers/embedding_company.py,sha256=EmCxImdXBCA7zxM1stAVeAYlzeNPC_jSlyVN5q1XEJA,14567
68
+ statement_extractor/plugins/qualifiers/embedding_company.py,sha256=nc7oTFjEBuPiprjXKeFRiMYM6tNicMNum_xQ9LSgEOg,14756
62
69
  statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
63
- statement_extractor/plugins/qualifiers/person.py,sha256=GZCUJaQncC_wB4nBQ4RLY5dJ-CdARMLpByc_Nn09wj8,28461
70
+ statement_extractor/plugins/qualifiers/person.py,sha256=SKBCFnIKCJJt77qKyPi_kla7DDZl-n64FcU7txMKs9U,32154
64
71
  statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
65
72
  statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
66
73
  statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
67
74
  statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
68
75
  statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
69
- statement_extractor/plugins/splitters/t5_gemma.py,sha256=AwYYKQrAmiue5IK9bbJ-Uhfl9oCZTX1X_tmKguKIdjU,9982
76
+ statement_extractor/plugins/splitters/t5_gemma.py,sha256=5qjxeHznuAA9hL8EbUDDGQ3N2gYLmtg0hv9BsLWzfMk,9971
70
77
  statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
71
78
  statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
72
79
  statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
73
- corp_extractor-0.9.0.dist-info/METADATA,sha256=9pWemKEWyeEqW92sRd4SqdMykO-92kl5UIrs-P2xAn0,27553
74
- corp_extractor-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
75
- corp_extractor-0.9.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
76
- corp_extractor-0.9.0.dist-info/RECORD,,
80
+ corp_extractor-0.9.4.dist-info/METADATA,sha256=a5pkoSpziVKqggeeSX_TGfSKk67GtB8ywpl6YzOdX6c,31449
81
+ corp_extractor-0.9.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
82
+ corp_extractor-0.9.4.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
83
+ corp_extractor-0.9.4.dist-info/RECORD,,