corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: corp-extractor
|
|
3
|
-
Version: 0.9.0
|
|
4
|
-
Summary: Extract structured
|
|
3
|
+
Version: 0.9.4
|
|
4
|
+
Summary: Extract structured entity and relationship information from text
|
|
5
5
|
Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
|
|
6
6
|
Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
|
|
7
7
|
Project-URL: Repository, https://github.com/corp-o-rate/statement-extractor
|
|
@@ -9,7 +9,7 @@ Project-URL: Issues, https://github.com/corp-o-rate/statement-extractor/issues
|
|
|
9
9
|
Author-email: Corp-o-Rate <neil@corp-o-rate.com>
|
|
10
10
|
Maintainer-email: Corp-o-Rate <neil@corp-o-rate.com>
|
|
11
11
|
License: MIT
|
|
12
|
-
Keywords: diverse-beam-search,embeddings,gemma,information-extraction,knowledge-graph,nlp,statement-extraction,subject-predicate-object,
|
|
12
|
+
Keywords: diverse-beam-search,embeddings,entities,entity-linking,entity-resolution,gemma,information-extraction,knowledge-graph,nlp,semantic-parsing,statement-extraction,subject-predicate-object,t5gemma2,transformers,triples
|
|
13
13
|
Classifier: Development Status :: 4 - Beta
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
15
15
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -32,6 +32,7 @@ Requires-Dist: httpx>=0.25.0
|
|
|
32
32
|
Requires-Dist: huggingface-hub>=0.20.0
|
|
33
33
|
Requires-Dist: llama-cpp-python>=0.3.16
|
|
34
34
|
Requires-Dist: numpy>=1.24.0
|
|
35
|
+
Requires-Dist: pycountry>=24.6.1
|
|
35
36
|
Requires-Dist: pydantic>=2.0.0
|
|
36
37
|
Requires-Dist: pymupdf>=1.23.0
|
|
37
38
|
Requires-Dist: sentence-transformers>=2.2.0
|
|
@@ -56,7 +57,7 @@ Description-Content-Type: text/markdown
|
|
|
56
57
|
|
|
57
58
|
# Corp Extractor
|
|
58
59
|
|
|
59
|
-
|
|
60
|
+
Analyze complex text to extract relationship information about people and organizations. Runs entirely on your hardware (RTX 4090+, Apple M1 16GB+) with no external API dependencies. Uses fine-tuned T5-Gemma 2 for statement splitting and coreference resolution, plus GLiNER2 for entity extraction. Includes a database of 10M+ organizations and 40M+ people with quantized embeddings for fast entity qualification.
|
|
60
61
|
|
|
61
62
|
[](https://pypi.org/project/corp-extractor/)
|
|
62
63
|
[](https://pypi.org/project/corp-extractor/)
|
|
@@ -64,7 +65,9 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
64
65
|
|
|
65
66
|
## Features
|
|
66
67
|
|
|
67
|
-
- **
|
|
68
|
+
- **Database v2 Schema** *(v0.9.4)*: Normalized schema with INTEGER FK references, new roles/locations tables, int8 scalar embeddings (75% smaller)
|
|
69
|
+
- **Person Database** *(v0.9.2)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
|
|
70
|
+
- **Organization Canonicalization** *(v0.9.2)*: Link equivalent records across sources (LEI, ticker, CIK, name matching)
|
|
68
71
|
- **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
|
|
69
72
|
- **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
|
|
70
73
|
- **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
|
|
@@ -221,10 +224,10 @@ Pipeline Options:
|
|
|
221
224
|
-o, --output [table|json|yaml|triples] Output format
|
|
222
225
|
```
|
|
223
226
|
|
|
224
|
-
##
|
|
227
|
+
## Quality Scoring & Beam Merging
|
|
225
228
|
|
|
226
|
-
By default, the library
|
|
227
|
-
- **Scores each triple**
|
|
229
|
+
By default, the library:
|
|
230
|
+
- **Scores each triple** using semantic similarity (50%) + GLiNER2 entity recognition (50%)
|
|
228
231
|
- **Merges top beams** instead of selecting one, improving coverage
|
|
229
232
|
- **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")
|
|
230
233
|
|
|
@@ -386,6 +389,43 @@ config = PipelineConfig.from_stage_string("1-3") # Stages 1, 2, 3
|
|
|
386
389
|
|
|
387
390
|
Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
|
|
388
391
|
|
|
392
|
+
## Entity Database
|
|
393
|
+
|
|
394
|
+
The library includes an **entity embedding database** for fast entity qualification using vector similarity search. It stores records from authoritative sources (GLEIF, SEC, Companies House, Wikidata) with 768-dimensional embeddings for semantic matching.
|
|
395
|
+
|
|
396
|
+
**Quick start:**
|
|
397
|
+
```bash
|
|
398
|
+
corp-extractor db download # Download pre-built database
|
|
399
|
+
corp-extractor db search "Microsoft" # Search organizations
|
|
400
|
+
corp-extractor db search-people "Tim Cook" # Search people
|
|
401
|
+
corp-extractor db search-roles "CEO" # Search roles (v0.9.4)
|
|
402
|
+
corp-extractor db search-locations "California" # Search locations (v0.9.4)
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
For comprehensive documentation including schema, CLI reference, Python API, and build instructions, see **[ENTITY_DATABASE.md](./ENTITY_DATABASE.md)**.
|
|
406
|
+
|
|
407
|
+
## New in v0.9.4: Database v2 Schema
|
|
408
|
+
|
|
409
|
+
v0.9.4 introduces a **normalized v2 schema** with significant improvements:
|
|
410
|
+
|
|
411
|
+
- **INTEGER FK references** replace TEXT enum columns for better query performance
|
|
412
|
+
- **New enum lookup tables**: `source_types`, `people_types`, `organization_types`, `location_types`
|
|
413
|
+
- **New tables**: `roles` (job titles with Wikidata QID), `locations` (countries/states/cities with hierarchy)
|
|
414
|
+
- **Scalar (int8) embeddings**: 75% storage reduction with ~92% recall at top-100
|
|
415
|
+
- **QID as integers**: Wikidata QIDs stored as integers (Q prefix stripped)
|
|
416
|
+
- **Human-readable views**: `organizations_view`, `people_view`, `roles_view`, `locations_view`
|
|
417
|
+
|
|
418
|
+
**Migration:**
|
|
419
|
+
```bash
|
|
420
|
+
# Migrate existing v1 database to v2
|
|
421
|
+
corp-extractor db migrate-v2 entities.db entities-v2.db
|
|
422
|
+
|
|
423
|
+
# Generate int8 scalar embeddings
|
|
424
|
+
corp-extractor db backfill-scalar
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
**Default database path**: `~/.cache/corp-extractor/entities-v2.db`
|
|
428
|
+
|
|
389
429
|
## New in v0.6.0: Entity Embedding Database
|
|
390
430
|
|
|
391
431
|
v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
|
|
@@ -405,7 +445,17 @@ v0.6.0 introduces an **entity embedding database** for fast entity qualification
|
|
|
405
445
|
|
|
406
446
|
| Source | Records | Identifier | PersonType Classification |
|
|
407
447
|
|--------|---------|------------|--------------------------|
|
|
408
|
-
| Wikidata | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
|
|
448
|
+
| Wikidata (SPARQL) | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
|
|
449
|
+
| Wikidata (Dump) | All humans with enwiki | Wikidata QID | Classified from positions (P39) and occupations (P106) |
|
|
450
|
+
|
|
451
|
+
**Date Fields**: All importers now include `from_date` and `to_date` where available:
|
|
452
|
+
- **GLEIF**: LEI registration date
|
|
453
|
+
- **SEC Edgar**: First SEC filing date
|
|
454
|
+
- **Companies House**: Incorporation and dissolution dates
|
|
455
|
+
- **Wikidata Orgs**: Inception (P571) and dissolution (P576) dates
|
|
456
|
+
- **Wikidata People**: Position start (P580) and end (P582) dates
|
|
457
|
+
|
|
458
|
+
**Note**: The same person can have multiple records with different role/org combinations (unique on `source_id + role + org`). Organizations discovered during people import are automatically inserted into the organizations table, with a `known_for_org_id` foreign key linking people to their organizations.
|
|
409
459
|
|
|
410
460
|
### EntityType Classification
|
|
411
461
|
|
|
@@ -430,6 +480,17 @@ corp-extractor db import-wikidata --limit 50000
|
|
|
430
480
|
# Import notable people (v0.9.0)
|
|
431
481
|
corp-extractor db import-people --type executive --limit 5000
|
|
432
482
|
corp-extractor db import-people --all --limit 10000 # All person types
|
|
483
|
+
corp-extractor db import-people --type executive --skip-existing # Skip existing records
|
|
484
|
+
corp-extractor db import-people --type executive --enrich-dates # Fetch role start/end dates
|
|
485
|
+
|
|
486
|
+
# Import from Wikidata dump (v0.9.1) - avoids SPARQL timeouts
|
|
487
|
+
corp-extractor db import-wikidata-dump --download --limit 50000 # Downloads ~100GB dump
|
|
488
|
+
corp-extractor db import-wikidata-dump --dump /path/to/dump.bz2 --people --no-orgs # Local dump
|
|
489
|
+
corp-extractor db import-wikidata-dump --dump dump.bz2 --locations --no-people --no-orgs # Locations only (v0.9.4)
|
|
490
|
+
|
|
491
|
+
# Migrate to v2 schema (v0.9.4)
|
|
492
|
+
corp-extractor db migrate-v2 entities.db entities-v2.db
|
|
493
|
+
corp-extractor db backfill-scalar # Generate int8 embeddings (75% smaller)
|
|
433
494
|
|
|
434
495
|
# Check status
|
|
435
496
|
corp-extractor db status
|
|
@@ -474,7 +535,7 @@ corp-extractor db create-lite entities.db # Create lite version
|
|
|
474
535
|
corp-extractor db compress entities.db # Compress with gzip
|
|
475
536
|
```
|
|
476
537
|
|
|
477
|
-
See [
|
|
538
|
+
See [ENTITY_DATABASE.md](./ENTITY_DATABASE.md) for complete build and publish instructions.
|
|
478
539
|
|
|
479
540
|
## New in v0.7.0: Document Processing
|
|
480
541
|
|
|
@@ -742,7 +803,7 @@ for text in texts:
|
|
|
742
803
|
This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):
|
|
743
804
|
|
|
744
805
|
1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with diversity penalty
|
|
745
|
-
2. **Quality Scoring**: Each triple scored
|
|
806
|
+
2. **Quality Scoring**: Each triple scored via semantic similarity + GLiNER2 entity recognition
|
|
746
807
|
3. **Beam Merging**: Top beams combined for better coverage
|
|
747
808
|
4. **Embedding Dedup**: Semantic similarity removes near-duplicate predicates
|
|
748
809
|
5. **Predicate Normalization**: Optional taxonomy matching via embeddings
|
|
@@ -1,51 +1,58 @@
|
|
|
1
1
|
statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
|
|
2
2
|
statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
|
|
3
|
-
statement_extractor/cli.py,sha256=
|
|
4
|
-
statement_extractor/extractor.py,sha256=
|
|
3
|
+
statement_extractor/cli.py,sha256=2c3K5wUWL03xRndkvNI1rzFGkcYXJYzTxX4wVIP1O3I,125325
|
|
4
|
+
statement_extractor/extractor.py,sha256=m10na6I2iU1GwokQTxodePttYgigHykoss5LWrE8JOQ,38418
|
|
5
5
|
statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
|
|
6
6
|
statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
|
|
7
|
-
statement_extractor/models.py,sha256=
|
|
7
|
+
statement_extractor/models.py,sha256=rBotCX2hRTMW4MXXkkWYv4JctP0HQR0NSJSlBcNhsF0,12302
|
|
8
8
|
statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
|
|
9
9
|
statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
|
|
10
10
|
statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
|
|
11
11
|
statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
|
|
12
12
|
statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
|
|
13
|
-
statement_extractor/database/embeddings.py,sha256=
|
|
14
|
-
statement_extractor/database/hub.py,sha256=
|
|
15
|
-
statement_extractor/database/
|
|
13
|
+
statement_extractor/database/embeddings.py,sha256=VT49amsNyCuhnoGFfYsSii8bPIrnatzvzmQhoq_wlxQ,6965
|
|
14
|
+
statement_extractor/database/hub.py,sha256=3T3yooMI2kpk-SnjSxxglKEVsckC_dGDUEWjnKEJWBk,15128
|
|
15
|
+
statement_extractor/database/migrate_v2.py,sha256=I3zHEMPD5q2dTzLIxrqc6Fxj3y0XHe28UWKOd6CLD3g,29789
|
|
16
|
+
statement_extractor/database/models.py,sha256=GSyZZUPjIWLY9V3l-Fi44dnc9SgD61mhuYZUEZEiDV0,15913
|
|
16
17
|
statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
|
|
17
|
-
statement_extractor/database/
|
|
18
|
-
statement_extractor/database/
|
|
19
|
-
statement_extractor/database/
|
|
20
|
-
statement_extractor/database/importers/
|
|
21
|
-
statement_extractor/database/importers/
|
|
22
|
-
statement_extractor/database/importers/
|
|
23
|
-
statement_extractor/database/importers/
|
|
18
|
+
statement_extractor/database/schema_v2.py,sha256=QUxBp6-X2hM3DRY52vxYN0DRpDG0d1abXJ4uoWPYApA,13330
|
|
19
|
+
statement_extractor/database/seed_data.py,sha256=z_F73_LfZxAoW3fg2Or-oRBjpD-9mn5TSwhkL2D4dWE,10030
|
|
20
|
+
statement_extractor/database/store.py,sha256=fEAm4KWfJ0Z6ZzlVq6jmZKoVy8GlAGa5wgn2vV1jGDk,180742
|
|
21
|
+
statement_extractor/database/importers/__init__.py,sha256=acIoX_BPdXv2DOMFyVbFZPDGNWp2s1FpC774loTqL5I,1121
|
|
22
|
+
statement_extractor/database/importers/companies_house.py,sha256=b5OMFtoHhkPgoGK08ThQn9BtTu9uC_dYzBVpC10xT4U,20252
|
|
23
|
+
statement_extractor/database/importers/companies_house_officers.py,sha256=QDFA0FzqDx9p6VjRrB7o4BE3e30l7i0ML_ktntsB-kA,15565
|
|
24
|
+
statement_extractor/database/importers/gleif.py,sha256=sw4YYROD6wi7IbBEKGCn8kko0nOYbKOyukDJKGQp17Q,20200
|
|
25
|
+
statement_extractor/database/importers/import_utils.py,sha256=2nVsUelN4_mKQ08qfzpeJsxkA9piyANznnmRs50Qt0w,6335
|
|
26
|
+
statement_extractor/database/importers/sec_edgar.py,sha256=0nnhnOrf5d1wR9PGjl8AuNOnp4mfmEtopjkgUY_PLQc,13738
|
|
27
|
+
statement_extractor/database/importers/sec_form4.py,sha256=ZoV-oyNhG5AOUm4u9hemmRI5KnpNs3Gw_dfisjkD3zU,18234
|
|
28
|
+
statement_extractor/database/importers/wikidata.py,sha256=tRj4kEMVIq7sRXxjyxj-scl8eXybkrLVvyNDYV2T5lg,39572
|
|
29
|
+
statement_extractor/database/importers/wikidata_dump.py,sha256=6vTluVuXm5INq5urhnd_es5i4mzE3HM0cEKJIblGTbU,93101
|
|
30
|
+
statement_extractor/database/importers/wikidata_people.py,sha256=vrEFGvMdXUT3Fz_diJxQrR0qch7P-rAElKeBRnssSG0,44964
|
|
24
31
|
statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
|
|
25
32
|
statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
|
|
26
33
|
statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
|
|
27
|
-
statement_extractor/document/deduplicator.py,sha256=
|
|
34
|
+
statement_extractor/document/deduplicator.py,sha256=R_RwEdVeVQBYZHvjkVA0ShAWr8x618VrO9dkYWXvifI,4771
|
|
28
35
|
statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
|
|
29
36
|
statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
|
|
30
37
|
statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
|
|
31
38
|
statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
|
|
32
|
-
statement_extractor/models/__init__.py,sha256=
|
|
39
|
+
statement_extractor/models/__init__.py,sha256=OJOK0ral_jskrSxx6nCc3TB6JlVYaC5HI2eYXr9dhMQ,2971
|
|
33
40
|
statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
|
|
34
41
|
statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
|
|
35
42
|
statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
|
|
36
43
|
statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
|
|
37
44
|
statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
|
|
38
|
-
statement_extractor/models/statement.py,sha256=
|
|
45
|
+
statement_extractor/models/statement.py,sha256=Wpp2OtZ5inhqbtEcblWdcES7g7lA-FVjqjz6Jq7hqzo,3329
|
|
39
46
|
statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
|
|
40
47
|
statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
|
|
41
|
-
statement_extractor/pipeline/context.py,sha256=
|
|
42
|
-
statement_extractor/pipeline/orchestrator.py,sha256=
|
|
48
|
+
statement_extractor/pipeline/context.py,sha256=evAdyH5oOCNM_ILGZNS1mov3lM4D3mCvr5hzsjaB0Bs,6136
|
|
49
|
+
statement_extractor/pipeline/orchestrator.py,sha256=qH6rD4_wI_kZ_e8NeIv2XYHUA07ldogFewFsZeRQVxw,16687
|
|
43
50
|
statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
|
|
44
51
|
statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
|
|
45
|
-
statement_extractor/plugins/base.py,sha256=
|
|
52
|
+
statement_extractor/plugins/base.py,sha256=xC661iFtnhIxtZLTwuCc-0rFV1q2V3hCTV-uOaILsOA,21622
|
|
46
53
|
statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
|
|
47
54
|
statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
|
|
48
|
-
statement_extractor/plugins/extractors/gliner2.py,sha256=
|
|
55
|
+
statement_extractor/plugins/extractors/gliner2.py,sha256=yDwKJVniMj4YwjR4Rm6MALDk633H5qcKcxa2xOLh9LI,21999
|
|
49
56
|
statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
|
|
50
57
|
statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
|
|
51
58
|
statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
|
|
@@ -58,19 +65,19 @@ statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8
|
|
|
58
65
|
statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
|
|
59
66
|
statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
|
|
60
67
|
statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
|
|
61
|
-
statement_extractor/plugins/qualifiers/embedding_company.py,sha256=
|
|
68
|
+
statement_extractor/plugins/qualifiers/embedding_company.py,sha256=nc7oTFjEBuPiprjXKeFRiMYM6tNicMNum_xQ9LSgEOg,14756
|
|
62
69
|
statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
|
|
63
|
-
statement_extractor/plugins/qualifiers/person.py,sha256=
|
|
70
|
+
statement_extractor/plugins/qualifiers/person.py,sha256=SKBCFnIKCJJt77qKyPi_kla7DDZl-n64FcU7txMKs9U,32154
|
|
64
71
|
statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
|
|
65
72
|
statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
|
|
66
73
|
statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
|
|
67
74
|
statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
|
|
68
75
|
statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
|
|
69
|
-
statement_extractor/plugins/splitters/t5_gemma.py,sha256=
|
|
76
|
+
statement_extractor/plugins/splitters/t5_gemma.py,sha256=5qjxeHznuAA9hL8EbUDDGQ3N2gYLmtg0hv9BsLWzfMk,9971
|
|
70
77
|
statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
|
|
71
78
|
statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
|
|
72
79
|
statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
|
|
73
|
-
corp_extractor-0.9.
|
|
74
|
-
corp_extractor-0.9.
|
|
75
|
-
corp_extractor-0.9.
|
|
76
|
-
corp_extractor-0.9.
|
|
80
|
+
corp_extractor-0.9.4.dist-info/METADATA,sha256=a5pkoSpziVKqggeeSX_TGfSKk67GtB8ywpl6YzOdX6c,31449
|
|
81
|
+
corp_extractor-0.9.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
82
|
+
corp_extractor-0.9.4.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
|
|
83
|
+
corp_extractor-0.9.4.dist-info/RECORD,,
|