corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: corp-extractor
-Version: 0.5.0
+Version: 0.9.0
 Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
 Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
 Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -24,14 +24,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Requires-Dist: accelerate>=1.12.0
+Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: click>=8.0.0
 Requires-Dist: gguf>=0.17.1
 Requires-Dist: gliner2
+Requires-Dist: httpx>=0.25.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: llama-cpp-python>=0.3.16
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pymupdf>=1.23.0
 Requires-Dist: sentence-transformers>=2.2.0
+Requires-Dist: sqlite-vec>=0.1.6
 Requires-Dist: torch>=2.0.0
 Requires-Dist: transformers>=5.0.0rc3
+Provides-Extra: all
+Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
+Requires-Dist: pillow>=10.0.0; extra == 'all'
+Requires-Dist: pytesseract>=0.3.10; extra == 'all'
 Provides-Extra: dev
 Requires-Dist: mypy>=1.0.0; extra == 'dev'
 Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
@@ -39,6 +49,9 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Requires-Dist: ruff>=0.1.0; extra == 'dev'
 Provides-Extra: llm
 Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
+Provides-Extra: ocr
+Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
 Description-Content-Type: text/markdown

 # Corp Extractor
@@ -51,18 +64,20 @@ Extract structured subject-predicate-object statements from unstructured text us
 
 ## Features
 
-- **
+- **Person Database** *(v0.9.0)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
+- **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
+- **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
+- **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
 - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
 - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
-- **Entity Qualification** *(v0.
-- **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+- **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
 - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
 - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
 - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
 - **Beam Merging**: Combines top beams for better coverage instead of picking one
 - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
 - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
-- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `plugins` commands
+- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
 - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
 
 ## Installation
@@ -143,7 +158,7 @@ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
 corp-extractor split "Apple Inc. announced the iPhone 15."
 corp-extractor split -f article.txt --json
 
-# Full
+# Full 5-stage pipeline (entity resolution, labeling, taxonomy)
 corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
 corp-extractor pipeline -f article.txt --stages 1-3
 corp-extractor pipeline "..." --disable-plugins sec_edgar
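The next hunk notes that reversed duplicates ("A -> P -> B" vs. "B -> P -> A") are merged with the orientation chosen by source-text similarity. A minimal sketch of that idea, using sentence-transformers (already a dependency); the `pick_orientation` helper and the model choice are illustrative assumptions, not the package's internals:

```python
# Illustrative sketch, not the package's actual implementation:
# keep the orientation whose flat rendering best matches the source.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice

def pick_orientation(source: str, forward: tuple, reverse: tuple) -> tuple:
    renderings = [" ".join(forward), " ".join(reverse)]
    embeddings = model.encode(renderings + [source])
    # cosine similarity of each rendering against the source sentence
    similarities = util.cos_sim(embeddings[:2], embeddings[2:3])
    return (forward, reverse)[int(similarities.argmax())]

kept = pick_orientation(
    "Amazon CEO Andy Jassy announced plans to hire workers.",
    ("Andy Jassy", "announced", "plans to hire workers"),
    ("plans to hire workers", "announced", "Andy Jassy"),
)
```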
@@ -297,9 +312,9 @@ for stmt in fixed_statements:
 
 During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
 
-##
+## Pipeline Architecture
 
-v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
+The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
 
 ### Pipeline Stages
 
@@ -307,10 +322,9 @@ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
 |-------|------|-------|--------|----------|
 | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
 | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
-| 3 | Qualification | Entities | `
-| 4 |
-| 5 |
-| 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+| 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
+| 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+| 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
 
 ### Pipeline Python API
 
@@ -339,8 +353,8 @@ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
 
 # Run only specific stages
 config = PipelineConfig(
-    enabled_stages={1, 2, 3},  # Skip
-    disabled_plugins={"
+    enabled_stages={1, 2, 3},  # Skip labeling and taxonomy
+    disabled_plugins={"person_qualifier"},  # Disable specific plugins
 )
 pipeline = ExtractionPipeline(config)
 ctx = pipeline.process(text)
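The next hunk lists the Stage 5 taxonomy classifiers, which return multiple labels per statement above a confidence threshold. A hedged sketch of how MNLI zero-shot classification yields such multi-label output, via the standard transformers pipeline; the model and candidate labels here are placeholders, not the package's actual ESG taxonomy or configuration:

```python
# Sketch of MNLI-style zero-shot multi-label classification; the model
# and label set are placeholders, not the package's ESG taxonomy.
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier(
    "Amazon announced plans to hire 100,000 workers.",
    candidate_labels=["employment", "emissions", "governance"],
    multi_label=True,  # score each label independently
)
# keep every label above the confidence threshold, not just the top one
labels = [l for l, s in zip(result["labels"], result["scores"]) if s > 0.5]
```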
@@ -358,24 +372,177 @@ config = PipelineConfig.from_stage_string("1-3")  # Stages 1, 2, 3
 - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction
 
 **Qualifiers (Stage 3):**
-- `person_qualifier` - PERSON → role, org (
-- `
-- `companies_house_qualifier` - ORG → UK company number
-- `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+- `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
+- `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database
 
-**
-- `organization_canonicalizer` - ORG canonical names
-- `person_canonicalizer` - PERSON name variants
-
-**Labelers (Stage 5):**
+**Labelers (Stage 4):**
 - `sentiment_labeler` - Statement sentiment analysis
+- `confidence_labeler` - Confidence scoring
+- `relation_type_labeler` - Relation type classification
 
-**Taxonomy Classifiers (Stage
+**Taxonomy Classifiers (Stage 5):**
 - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
 - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification
 
 Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
 
+## New in v0.6.0: Entity Embedding Database
+
+v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
+
+### Data Sources
+
+**Organizations:**
+
+| Source | Records | Identifier | EntityType Mapping |
+|--------|---------|------------|-------------------|
+| GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
+| SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
+| Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
+| Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
+
+**People** *(v0.9.0)*:
+
+| Source | Records | Identifier | PersonType Classification |
+|--------|---------|------------|--------------------------|
+| Wikidata | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
+
+### EntityType Classification
+
+Each organization record is classified with an `entity_type` field:
+
+| Category | Types |
+|----------|-------|
+| Business | `business`, `fund`, `branch` |
+| Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
+| Government | `government`, `international_org`, `political_party` |
+| Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
+
+### Building the Database
+
+```bash
+# Import organizations from authoritative sources
+corp-extractor db import-gleif --download
+corp-extractor db import-sec --download  # Bulk submissions.zip (~100K+ filers)
+corp-extractor db import-companies-house --download
+corp-extractor db import-wikidata --limit 50000
+
+# Import notable people (v0.9.0)
+corp-extractor db import-people --type executive --limit 5000
+corp-extractor db import-people --all --limit 10000  # All person types
+
+# Check status
+corp-extractor db status
+
+# Search for an organization
+corp-extractor db search "Microsoft"
+
+# Search for a person (v0.9.0)
+corp-extractor db search-people "Tim Cook"
+```
+
+### Using in Pipeline
+
+The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
+
+```python
+from statement_extractor.pipeline import ExtractionPipeline
+
+pipeline = ExtractionPipeline()
+ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn}")  # e.g., "Microsoft (sec_edgar:0000789019)"
+```
+
+### Publishing to HuggingFace
+
+```bash
+# Upload database with all variants (full, lite, compressed)
+export HF_TOKEN="hf_..."
+corp-extractor db upload  # Uses default cache location
+corp-extractor db upload entities.db  # Or specify path
+corp-extractor db upload --no-lite  # Skip lite version
+corp-extractor db upload --no-compress  # Skip compressed versions
+
+# Download pre-built database (lite version by default)
+corp-extractor db download  # Lite version (smaller, faster)
+corp-extractor db download --full  # Full version with all metadata
+
+# Local database management
+corp-extractor db create-lite entities.db  # Create lite version
+corp-extractor db compress entities.db  # Compress with gzip
+```
+
+See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
+
+## New in v0.7.0: Document Processing
+
+v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
+
+### Document CLI
+
+```bash
+# Process local files
+corp-extractor document process article.txt
+corp-extractor document process report.txt --title "Annual Report" --year 2024
+
+# Process URLs (web pages and PDFs)
+corp-extractor document process https://example.com/article
+corp-extractor document process https://example.com/report.pdf --use-ocr
+
+# Configure chunking
+corp-extractor document process article.txt --max-tokens 500 --overlap 50
+
+# Preview chunking without extraction
+corp-extractor document chunk article.txt --max-tokens 500
+```
+
+### Document Python API
+
+```python
+from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
+from statement_extractor.models.document import ChunkingConfig
+
+# Configure document processing
+config = DocumentPipelineConfig(
+    chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
+    generate_summary=True,
+    deduplicate_across_chunks=True,
+)
+
+pipeline = DocumentPipeline(config)
+
+# Process text
+document = Document.from_text("Your long document text...", title="My Document")
+ctx = pipeline.process(document)
+
+# Process URL (async)
+ctx = await pipeline.process_url("https://example.com/article")
+
+# Access results
+print(f"Chunks: {ctx.chunk_count}")
+print(f"Statements: {ctx.statement_count}")
+print(f"Duplicates removed: {ctx.duplicates_removed}")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+    if stmt.citation:
+        print(f"  Citation: {stmt.citation}")
+```
+
+### PDF Processing
+
+PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
+
+```bash
+# Install OCR dependencies
+pip install "corp-extractor[ocr]"
+
+# Process with OCR
+corp-extractor document process scanned.pdf --use-ocr
+```
+
 ## New in v0.4.0: GLiNER2 Integration
 
 v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
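The v0.6.0 entity database described above pairs sqlite-vec (a new dependency in this release) with sentence-transformers embeddings for its similarity search. A minimal sketch of the KNN query shape against a vec0 virtual table; the `org_embeddings` table, its columns, and the model choice are hypothetical stand-ins, not the package's actual schema or API:

```python
# Hypothetical sketch of a sqlite-vec KNN lookup; table and column
# names are illustrative, not the package's actual schema.
import sqlite3

import sqlite_vec
from sentence_transformers import SentenceTransformer

db = sqlite3.connect("entities.db")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
query = model.encode("Microsoft").tolist()

# nearest-neighbor search ordered by vector distance
rows = db.execute(
    "SELECT name, identifier, distance FROM org_embeddings "
    "WHERE embedding MATCH ? ORDER BY distance LIMIT 5",
    [sqlite_vec.serialize_float32(query)],
).fetchall()
```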
corp_extractor-0.9.0.dist-info/RECORD

@@ -0,0 +1,76 @@
+statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
+statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+statement_extractor/cli.py,sha256=BTFLIBZoNa2ADrYVslbXiZGrzhRWmi7ppbnAPV3xUyg,71191
+statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
+statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
+statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
+statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
+statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
+statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
+statement_extractor/database/hub.py,sha256=HOnRp62RnkXvk2KgwqOLVpEkXwy0LS0n3tIJrkYCo2c,16842
+statement_extractor/database/models.py,sha256=ke4byqJiiBlZfRhxqoC0nsdDhb6YSG2I4S5W5BRBNY4,8813
+statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
+statement_extractor/database/store.py,sha256=1qdRZ7q5nTLUYbtUC9cWSLey_GVf5kAQ6dTF9EEwDXY,56735
+statement_extractor/database/importers/__init__.py,sha256=0CPqafekQpqxFmZhe3uMJLNssqxGzEniZlArGyl8QKU,668
+statement_extractor/database/importers/companies_house.py,sha256=G0DZAs_9RM7uTwY7imt70IXUVvhntoO-xXnJ0o6jjGw,19635
+statement_extractor/database/importers/gleif.py,sha256=MTFuksVf83Barn1c6JvBLBouxXbzogWulKb8oqEODAk,18948
+statement_extractor/database/importers/sec_edgar.py,sha256=_B4QcXhZ_5ulXTSVW9dKKAzFIVwn-VIh_X39jcUhqsg,12923
+statement_extractor/database/importers/wikidata.py,sha256=ZZYHiqSlYlco1TSzCLUKqdT-i5X1cUSK1EnsfWWwPAc,33770
+statement_extractor/database/importers/wikidata_people.py,sha256=loqyf5sbtBqCITiTxqV3PLyx3SefmVefhZE0Y-cRoC4,22205
+statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
+statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
+statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
+statement_extractor/document/deduplicator.py,sha256=8tPKWAGqNfjteOdnk7B82izyfIpvOebirZ-OIQKixwU,4821
+statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
+statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
+statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
+statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
+statement_extractor/models/__init__.py,sha256=9FxKkJ4EIPXmSkMo_j9jqAKH6jTkvz5Tzk2YvQL7UVk,2884
+statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
+statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
+statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
+statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
+statement_extractor/models/statement.py,sha256=agC4jcP9ospbZC91J6c0UgLAmfsg1tnqNcSvkqOtqBQ,3629
+statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
+statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
+statement_extractor/pipeline/orchestrator.py,sha256=1pe6hyEtd495LJrfH3QgxQadNqERmehQEs5LHsAVIxM,16580
+statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
+statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
+statement_extractor/plugins/base.py,sha256=ItqJZ5rH65gW4-pXpraRb45y7F3lXqsKECumhV3tDyk,21516
+statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+statement_extractor/plugins/extractors/gliner2.py,sha256=ObEQMNE6ArjRl2s4x3lkOSPs03cmtTYFlppnbhtkI7A,21876
+statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
+statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
+statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
+statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
+statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
+statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
+statement_extractor/plugins/qualifiers/embedding_company.py,sha256=EmCxImdXBCA7zxM1stAVeAYlzeNPC_jSlyVN5q1XEJA,14567
+statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
+statement_extractor/plugins/qualifiers/person.py,sha256=GZCUJaQncC_wB4nBQ4RLY5dJ-CdARMLpByc_Nn09wj8,28461
+statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
+statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
+statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
+statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+statement_extractor/plugins/splitters/t5_gemma.py,sha256=AwYYKQrAmiue5IK9bbJ-Uhfl9oCZTX1X_tmKguKIdjU,9982
+statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
+statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
+corp_extractor-0.9.0.dist-info/METADATA,sha256=9pWemKEWyeEqW92sRd4SqdMykO-92kl5UIrs-P2xAn0,27553
+corp_extractor-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+corp_extractor-0.9.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+corp_extractor-0.9.0.dist-info/RECORD,,
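The RECORD entries above use the standard wheel format (PEP 376/427): comma-separated path, `sha256=` digest in padding-free urlsafe base64, and size in bytes, with RECORD listing itself without hash or size. A minimal sketch for verifying an unpacked wheel against these entries:

```python
# Sketch: verify files in an unpacked wheel against RECORD entries of
# the form "path,sha256=<urlsafe b64 digest, no padding>,size".
import base64
import csv
import hashlib
from pathlib import Path

def verify_record(root: Path, dist_info: str = "corp_extractor-0.9.0.dist-info") -> None:
    with (root / dist_info / "RECORD").open(newline="") as f:
        for path, hash_spec, size in csv.reader(f):
            if not hash_spec:  # RECORD lists itself with empty fields
                continue
            algo, _, expected = hash_spec.partition("=")
            data = (root / path).read_bytes()
            actual = base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
            assert actual.rstrip(b"=").decode() == expected, f"hash mismatch: {path}"
            assert int(size) == len(data), f"size mismatch: {path}"
```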