corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: corp-extractor
- Version: 0.5.0
+ Version: 0.9.0
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -24,14 +24,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Text Processing :: Linguistic
  Requires-Python: >=3.10
  Requires-Dist: accelerate>=1.12.0
+ Requires-Dist: beautifulsoup4>=4.12.0
  Requires-Dist: click>=8.0.0
  Requires-Dist: gguf>=0.17.1
  Requires-Dist: gliner2
+ Requires-Dist: httpx>=0.25.0
+ Requires-Dist: huggingface-hub>=0.20.0
+ Requires-Dist: llama-cpp-python>=0.3.16
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: pymupdf>=1.23.0
  Requires-Dist: sentence-transformers>=2.2.0
+ Requires-Dist: sqlite-vec>=0.1.6
  Requires-Dist: torch>=2.0.0
  Requires-Dist: transformers>=5.0.0rc3
+ Provides-Extra: all
+ Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
+ Requires-Dist: pillow>=10.0.0; extra == 'all'
+ Requires-Dist: pytesseract>=0.3.10; extra == 'all'
  Provides-Extra: dev
  Requires-Dist: mypy>=1.0.0; extra == 'dev'
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
@@ -39,6 +49,9 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
  Provides-Extra: llm
  Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
+ Provides-Extra: ocr
+ Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+ Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
  Description-Content-Type: text/markdown

  # Corp Extractor
@@ -51,18 +64,20 @@ Extract structured subject-predicate-object statements from unstructured text us

  ## Features

- - **6-Stage Pipeline** *(v0.5.0)*: Modular plugin-based architecture for full entity resolution
+ - **Person Database** *(v0.9.0)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
+ - **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
+ - **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
+ - **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
  - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
  - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
- - **Entity Qualification** *(v0.5.0)*: Adds roles, identifiers (LEI, ticker, company numbers) via external APIs
- - **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+ - **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
  - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
  - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
  - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
  - **Beam Merging**: Combines top beams for better coverage instead of picking one
  - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
  - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
- - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `plugins` commands
+ - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries

  ## Installation
@@ -143,7 +158,7 @@ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
  corp-extractor split "Apple Inc. announced the iPhone 15."
  corp-extractor split -f article.txt --json

- # Full 6-stage pipeline (entity resolution, canonicalization, labeling, taxonomy)
+ # Full 5-stage pipeline (entity resolution, labeling, taxonomy)
  corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
  corp-extractor pipeline -f article.txt --stages 1-3
  corp-extractor pipeline "..." --disable-plugins sec_edgar
@@ -297,9 +312,9 @@ for stmt in fixed_statements:

  During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
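
To make that orientation rule concrete, here is a minimal sketch of reversed-duplicate merging. It is an illustration only, not the library's actual implementation, and `merge_reversed` is a hypothetical helper:

```python
# Detect "A -> P -> B" vs "B -> P -> A" and keep the orientation whose
# surface form is closer to the source sentence (hypothetical helper).
from difflib import SequenceMatcher

def merge_reversed(a, b, source_text):
    """a and b are (subject, predicate, object) tuples."""
    same_predicate = a[1] == b[1]
    swapped_endpoints = {a[0], a[2]} == {b[0], b[2]} and a[0] != b[0]
    if same_predicate and swapped_endpoints:
        def similarity(triple):
            return SequenceMatcher(None, " ".join(triple), source_text).ratio()
        return max(a, b, key=similarity)
    return None  # not a reversed pair

kept = merge_reversed(
    ("Microsoft", "acquired", "Activision Blizzard"),
    ("Activision Blizzard", "acquired", "Microsoft"),
    "Microsoft acquired Activision Blizzard.",
)
print(kept)  # ('Microsoft', 'acquired', 'Activision Blizzard')
```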

- ## New in v0.5.0: Pipeline Architecture
+ ## Pipeline Architecture

- v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
+ The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.

  ### Pipeline Stages
@@ -307,10 +322,9 @@ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
  |-------|------|-------|--------|----------|
  | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
  | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
- | 3 | Qualification | Entities | `QualifiedEntity[]` | Gemma3, APIs |
- | 4 | Canonicalization | `QualifiedEntity[]` | `CanonicalEntity[]` | Fuzzy matching |
- | 5 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
- | 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+ | 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
+ | 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+ | 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |

  ### Pipeline Python API
@@ -339,8 +353,8 @@ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline

  # Run only specific stages
  config = PipelineConfig(
-     enabled_stages={1, 2, 3},  # Skip canonicalization and labeling
-     disabled_plugins={"sec_edgar_qualifier"},  # Disable specific plugins
+     enabled_stages={1, 2, 3},  # Skip labeling and taxonomy
+     disabled_plugins={"person_qualifier"},  # Disable specific plugins
  )
  pipeline = ExtractionPipeline(config)
  ctx = pipeline.process(text)
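
As a usage sketch, the stage-string helper shown in the next hunk can be combined with plugin disabling; note that assigning `disabled_plugins` after construction is an assumption here, not documented API:

```python
from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig

# Same stage selection as enabled_stages={1, 2, 3} above; mutating the
# config after from_stage_string() is assumed to work, not documented.
config = PipelineConfig.from_stage_string("1-3")
config.disabled_plugins = {"person_qualifier"}

pipeline = ExtractionPipeline(config)
ctx = pipeline.process("Amazon CEO Andy Jassy announced plans to hire workers.")
```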
@@ -358,24 +372,177 @@ config = PipelineConfig.from_stage_string("1-3") # Stages 1, 2, 3
  - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction

  **Qualifiers (Stage 3):**
- - `person_qualifier` - PERSON → role, org (uses Gemma3)
- - `gleif_qualifier` - ORG → LEI, jurisdiction (GLEIF API)
- - `companies_house_qualifier` - ORG → UK company number
- - `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+ - `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
+ - `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database

- **Canonicalizers (Stage 4):**
- - `organization_canonicalizer` - ORG canonical names
- - `person_canonicalizer` - PERSON name variants
-
- **Labelers (Stage 5):**
+ **Labelers (Stage 4):**
  - `sentiment_labeler` - Statement sentiment analysis
+ - `confidence_labeler` - Confidence scoring
+ - `relation_type_labeler` - Relation type classification

- **Taxonomy Classifiers (Stage 6):**
+ **Taxonomy Classifiers (Stage 5):**
  - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
  - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification

  Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
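
To make the MNLI route concrete, here is a minimal zero-shot sketch using the `transformers` pipeline; the model name and candidate labels are illustrative stand-ins, not the plugin's actual configuration (the real plugin presumably draws on `statement_extractor/data/statement_taxonomy.json`):

```python
# Zero-shot classification in the style of mnli_taxonomy_classifier.
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

statement = "Amazon announced plans to hire 100,000 warehouse workers."
labels = ["job creation", "emissions reduction", "board diversity"]  # stand-ins

result = classifier(statement, candidate_labels=labels, multi_label=True)

# multi_label=True scores each label independently, which is how a classifier
# can return several labels per statement above a confidence threshold.
for label, score in zip(result["labels"], result["scores"]):
    if score >= 0.5:
        print(f"{label}: {score:.3f}")
```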

+ ## New in v0.6.0: Entity Embedding Database
+
+ v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
+
+ ### Data Sources
+
+ **Organizations:**
+
+ | Source | Records | Identifier | EntityType Mapping |
+ |--------|---------|------------|-------------------|
+ | GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
+ | SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
+ | Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
+ | Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
+
+ **People** *(v0.9.0)*:
+
+ | Source | Records | Identifier | PersonType Classification |
+ |--------|---------|------------|--------------------------|
+ | Wikidata | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
+
+ ### EntityType Classification
+
+ Each organization record is classified with an `entity_type` field:
+
+ | Category | Types |
+ |----------|-------|
+ | Business | `business`, `fund`, `branch` |
+ | Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
+ | Government | `government`, `international_org`, `political_party` |
+ | Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
+
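The lookup this enables can be sketched with the newly declared `sqlite-vec` and `sentence-transformers` dependencies. The table name, embedding model, and vector dimension below are illustrative assumptions, not the actual schema in `statement_extractor/database/store.py`:

```python
# Minimal vector-similarity lookup in the spirit of the entity database.
import sqlite3

import sqlite_vec  # declared dependency: sqlite-vec>=0.1.6
from sentence_transformers import SentenceTransformer

db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

# 384 dims matches the illustrative MiniLM model below
db.execute("CREATE VIRTUAL TABLE org_vecs USING vec0(embedding float[384])")

model = SentenceTransformer("all-MiniLM-L6-v2")
names = ["Microsoft Corporation", "Apple Inc.", "Activision Blizzard, Inc."]
for i, name in enumerate(names):
    db.execute(
        "INSERT INTO org_vecs(rowid, embedding) VALUES (?, ?)",
        (i, sqlite_vec.serialize_float32(model.encode(name).tolist())),
    )

# Nearest neighbours for a raw mention, smallest distance first
query = sqlite_vec.serialize_float32(model.encode("Microsoft").tolist())
for rowid, distance in db.execute(
    "SELECT rowid, distance FROM org_vecs WHERE embedding MATCH ? "
    "ORDER BY distance LIMIT 2",
    (query,),
):
    print(names[rowid], round(distance, 3))
```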
+ ### Building the Database
+
+ ```bash
+ # Import organizations from authoritative sources
+ corp-extractor db import-gleif --download
+ corp-extractor db import-sec --download  # Bulk submissions.zip (~100K+ filers)
+ corp-extractor db import-companies-house --download
+ corp-extractor db import-wikidata --limit 50000
+
+ # Import notable people (v0.9.0)
+ corp-extractor db import-people --type executive --limit 5000
+ corp-extractor db import-people --all --limit 10000  # All person types
+
+ # Check status
+ corp-extractor db status
+
+ # Search for an organization
+ corp-extractor db search "Microsoft"
+
+ # Search for a person (v0.9.0)
+ corp-extractor db search-people "Tim Cook"
+ ```
+
+ ### Using in Pipeline
+
+ The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
+
+ ```python
+ from statement_extractor.pipeline import ExtractionPipeline
+
+ pipeline = ExtractionPipeline()
+ ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
+
+ for stmt in ctx.labeled_statements:
+     print(f"{stmt.subject_fqn}")  # e.g., "Microsoft (sec_edgar:0000789019)"
+ ```
+
+ ### Publishing to HuggingFace
+
+ ```bash
+ # Upload database with all variants (full, lite, compressed)
+ export HF_TOKEN="hf_..."
+ corp-extractor db upload  # Uses default cache location
+ corp-extractor db upload entities.db  # Or specify path
+ corp-extractor db upload --no-lite  # Skip lite version
+ corp-extractor db upload --no-compress  # Skip compressed versions
+
+ # Download pre-built database (lite version by default)
+ corp-extractor db download  # Lite version (smaller, faster)
+ corp-extractor db download --full  # Full version with all metadata
+
+ # Local database management
+ corp-extractor db create-lite entities.db  # Create lite version
+ corp-extractor db compress entities.db  # Compress with gzip
+ ```
+
+ See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
+
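Programmatically, the download step can presumably be reproduced with the declared `huggingface-hub` dependency; the repo id and filename here are hypothetical placeholders, and COMPANY_DB.md remains the authoritative reference:

```python
from huggingface_hub import hf_hub_download

# Hypothetical repo id and filename; substitute the values documented in
# COMPANY_DB.md for the published database.
path = hf_hub_download(
    repo_id="corp-o-rate/entity-db",
    filename="entities-lite.db",
    repo_type="dataset",
)
print(f"Database cached at {path}")
```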
+ ## New in v0.7.0: Document Processing
+
+ v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
+
+ ### Document CLI
+
+ ```bash
+ # Process local files
+ corp-extractor document process article.txt
+ corp-extractor document process report.txt --title "Annual Report" --year 2024
+
+ # Process URLs (web pages and PDFs)
+ corp-extractor document process https://example.com/article
+ corp-extractor document process https://example.com/report.pdf --use-ocr
+
+ # Configure chunking
+ corp-extractor document process article.txt --max-tokens 500 --overlap 50
+
+ # Preview chunking without extraction
+ corp-extractor document chunk article.txt --max-tokens 500
+ ```
+
+ ### Document Python API
+
+ ```python
+ from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
+ from statement_extractor.models.document import ChunkingConfig
+
+ # Configure document processing
+ config = DocumentPipelineConfig(
+     chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
+     generate_summary=True,
+     deduplicate_across_chunks=True,
+ )
+
+ pipeline = DocumentPipeline(config)
+
+ # Process text
+ document = Document.from_text("Your long document text...", title="My Document")
+ ctx = pipeline.process(document)
+
+ # Process URL (async)
+ ctx = await pipeline.process_url("https://example.com/article")
+
+ # Access results
+ print(f"Chunks: {ctx.chunk_count}")
+ print(f"Statements: {ctx.statement_count}")
+ print(f"Duplicates removed: {ctx.duplicates_removed}")
+
+ for stmt in ctx.labeled_statements:
+     print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+     if stmt.citation:
+         print(f"  Citation: {stmt.citation}")
+ ```
+
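To illustrate what `target_tokens` and `overlap_tokens` control, here is a minimal windowed-chunking sketch. It splits on whitespace for brevity, whereas the library's chunker (`statement_extractor/document/chunker.py`) likely counts model tokens and respects sentence boundaries:

```python
# Overlapping fixed-size windows: each chunk repeats the tail of the
# previous one, so statements spanning a boundary are still seen whole.
def chunk_tokens(text: str, target_tokens: int = 1000, overlap_tokens: int = 100):
    tokens = text.split()
    step = target_tokens - overlap_tokens
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start : start + target_tokens]))
        if start + target_tokens >= len(tokens):
            break
    return chunks

chunks = chunk_tokens("tok " * 2500, target_tokens=1000, overlap_tokens=100)
print(len(chunks))  # 3 windows: tokens 0-999, 900-1899, 1800-2499
```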
+ ### PDF Processing
+
+ PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
+
+ ```bash
+ # Install OCR dependencies
+ pip install "corp-extractor[ocr]"
+
+ # Process with OCR
+ corp-extractor document process scanned.pdf --use-ocr
+ ```
+
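A minimal sketch of those two paths: PyMuPDF text extraction for born-digital pages, with a pytesseract fallback (the `ocr` extra) for pages that have no text layer. The actual plugin lives in `statement_extractor/plugins/pdf/pypdf.py` and will differ in its details:

```python
import io

import fitz  # PyMuPDF

def pdf_text(path: str, use_ocr: bool = False) -> str:
    pages = []
    with fitz.open(path) as doc:
        for page in doc:
            text = page.get_text()
            if not text.strip() and use_ocr:
                # No text layer: rasterize the page and OCR it
                # (requires the pillow + pytesseract extras).
                import pytesseract
                from PIL import Image

                pix = page.get_pixmap(dpi=300)
                img = Image.open(io.BytesIO(pix.tobytes("png")))
                text = pytesseract.image_to_string(img)
            pages.append(text)
    return "\n".join(pages)

print(pdf_text("scanned.pdf", use_ocr=True)[:200])
```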
  ## New in v0.4.0: GLiNER2 Integration

  v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
corp_extractor-0.9.0.dist-info/RECORD

@@ -0,0 +1,76 @@
+ statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+ statement_extractor/cli.py,sha256=BTFLIBZoNa2ADrYVslbXiZGrzhRWmi7ppbnAPV3xUyg,71191
+ statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
+ statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+ statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+ statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+ statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
+ statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+ statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
+ statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
+ statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
+ statement_extractor/database/hub.py,sha256=HOnRp62RnkXvk2KgwqOLVpEkXwy0LS0n3tIJrkYCo2c,16842
+ statement_extractor/database/models.py,sha256=ke4byqJiiBlZfRhxqoC0nsdDhb6YSG2I4S5W5BRBNY4,8813
+ statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
+ statement_extractor/database/store.py,sha256=1qdRZ7q5nTLUYbtUC9cWSLey_GVf5kAQ6dTF9EEwDXY,56735
+ statement_extractor/database/importers/__init__.py,sha256=0CPqafekQpqxFmZhe3uMJLNssqxGzEniZlArGyl8QKU,668
+ statement_extractor/database/importers/companies_house.py,sha256=G0DZAs_9RM7uTwY7imt70IXUVvhntoO-xXnJ0o6jjGw,19635
+ statement_extractor/database/importers/gleif.py,sha256=MTFuksVf83Barn1c6JvBLBouxXbzogWulKb8oqEODAk,18948
+ statement_extractor/database/importers/sec_edgar.py,sha256=_B4QcXhZ_5ulXTSVW9dKKAzFIVwn-VIh_X39jcUhqsg,12923
+ statement_extractor/database/importers/wikidata.py,sha256=ZZYHiqSlYlco1TSzCLUKqdT-i5X1cUSK1EnsfWWwPAc,33770
+ statement_extractor/database/importers/wikidata_people.py,sha256=loqyf5sbtBqCITiTxqV3PLyx3SefmVefhZE0Y-cRoC4,22205
+ statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
+ statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
+ statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
+ statement_extractor/document/deduplicator.py,sha256=8tPKWAGqNfjteOdnk7B82izyfIpvOebirZ-OIQKixwU,4821
+ statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
+ statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
+ statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
+ statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
+ statement_extractor/models/__init__.py,sha256=9FxKkJ4EIPXmSkMo_j9jqAKH6jTkvz5Tzk2YvQL7UVk,2884
+ statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
+ statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
+ statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+ statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
+ statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
+ statement_extractor/models/statement.py,sha256=agC4jcP9ospbZC91J6c0UgLAmfsg1tnqNcSvkqOtqBQ,3629
+ statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+ statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
+ statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
+ statement_extractor/pipeline/orchestrator.py,sha256=1pe6hyEtd495LJrfH3QgxQadNqERmehQEs5LHsAVIxM,16580
+ statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
+ statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
+ statement_extractor/plugins/base.py,sha256=ItqJZ5rH65gW4-pXpraRb45y7F3lXqsKECumhV3tDyk,21516
+ statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+ statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+ statement_extractor/plugins/extractors/gliner2.py,sha256=ObEQMNE6ArjRl2s4x3lkOSPs03cmtTYFlppnbhtkI7A,21876
+ statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+ statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+ statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+ statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+ statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+ statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
+ statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
+ statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
+ statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
+ statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
+ statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+ statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
+ statement_extractor/plugins/qualifiers/embedding_company.py,sha256=EmCxImdXBCA7zxM1stAVeAYlzeNPC_jSlyVN5q1XEJA,14567
+ statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
+ statement_extractor/plugins/qualifiers/person.py,sha256=GZCUJaQncC_wB4nBQ4RLY5dJ-CdARMLpByc_Nn09wj8,28461
+ statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
+ statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
+ statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
+ statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+ statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+ statement_extractor/plugins/splitters/t5_gemma.py,sha256=AwYYKQrAmiue5IK9bbJ-Uhfl9oCZTX1X_tmKguKIdjU,9982
+ statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+ statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
+ statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
+ corp_extractor-0.9.0.dist-info/METADATA,sha256=9pWemKEWyeEqW92sRd4SqdMykO-92kl5UIrs-P2xAn0,27553
+ corp_extractor-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ corp_extractor-0.9.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+ corp_extractor-0.9.0.dist-info/RECORD,,
statement_extractor/__init__.py

@@ -29,7 +29,7 @@ Example:
  >>> data = extract_statements_as_dict("Some text...")
  """

- __version__ = "0.3.0"
+ __version__ = "0.6.0"

  # Core models
  from .models import (