corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: corp-extractor
-Version: 0.5.0
+Version: 0.9.0
 Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
 Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
 Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -24,14 +24,24 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Requires-Dist: accelerate>=1.12.0
+Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: click>=8.0.0
 Requires-Dist: gguf>=0.17.1
 Requires-Dist: gliner2
+Requires-Dist: httpx>=0.25.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: llama-cpp-python>=0.3.16
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pymupdf>=1.23.0
 Requires-Dist: sentence-transformers>=2.2.0
+Requires-Dist: sqlite-vec>=0.1.6
 Requires-Dist: torch>=2.0.0
 Requires-Dist: transformers>=5.0.0rc3
+Provides-Extra: all
+Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
+Requires-Dist: pillow>=10.0.0; extra == 'all'
+Requires-Dist: pytesseract>=0.3.10; extra == 'all'
 Provides-Extra: dev
 Requires-Dist: mypy>=1.0.0; extra == 'dev'
 Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
@@ -39,6 +49,9 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Requires-Dist: ruff>=0.1.0; extra == 'dev'
 Provides-Extra: llm
 Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
+Provides-Extra: ocr
+Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
 Description-Content-Type: text/markdown

 # Corp Extractor
@@ -51,18 +64,20 @@ Extract structured subject-predicate-object statements from unstructured text us
 
 ## Features
 
-- **
+- **Person Database** *(v0.9.0)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
+- **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
+- **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
+- **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
 - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
 - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
-- **Entity Qualification** *(v0.
-- **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+- **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
 - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
 - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
 - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
 - **Beam Merging**: Combines top beams for better coverage instead of picking one
 - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
 - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
-- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `plugins` commands
+- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
 - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
 
 ## Installation
@@ -143,7 +158,7 @@ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
 corp-extractor split "Apple Inc. announced the iPhone 15."
 corp-extractor split -f article.txt --json
 
-# Full
+# Full 5-stage pipeline (entity resolution, labeling, taxonomy)
 corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
 corp-extractor pipeline -f article.txt --stages 1-3
 corp-extractor pipeline "..." --disable-plugins sec_edgar
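The next hunk notes that reversed duplicates ("A -> P -> B" vs. "B -> P -> A") are merged with the orientation chosen by source-text similarity. A minimal sketch of that idea, using sentence-transformers (already a dependency); the `pick_orientation` helper and the model choice are illustrative assumptions, not the package's internals:

```python
# Illustrative sketch, not the package's actual implementation:
# keep the orientation whose flat rendering best matches the source.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice

def pick_orientation(source: str, forward: tuple, reverse: tuple) -> tuple:
    renderings = [" ".join(forward), " ".join(reverse)]
    embeddings = model.encode(renderings + [source])
    # cosine similarity of each rendering against the source sentence
    similarities = util.cos_sim(embeddings[:2], embeddings[2:3])
    return (forward, reverse)[int(similarities.argmax())]

kept = pick_orientation(
    "Amazon CEO Andy Jassy announced plans to hire workers.",
    ("Andy Jassy", "announced", "plans to hire workers"),
    ("plans to hire workers", "announced", "Andy Jassy"),
)
```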
@@ -297,9 +312,9 @@ for stmt in fixed_statements:
 
 During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
 
-##
+## Pipeline Architecture
 
-v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
+The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
 
 ### Pipeline Stages
 
@@ -307,10 +322,9 @@ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
 |-------|------|-------|--------|----------|
 | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
 | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
-| 3 | Qualification | Entities | `
-| 4 |
-| 5 |
-| 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+| 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
+| 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+| 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
 
 ### Pipeline Python API
 
@@ -339,8 +353,8 @@ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
 
 # Run only specific stages
 config = PipelineConfig(
-    enabled_stages={1, 2, 3},  # Skip
-    disabled_plugins={"
+    enabled_stages={1, 2, 3},  # Skip labeling and taxonomy
+    disabled_plugins={"person_qualifier"},  # Disable specific plugins
 )
 pipeline = ExtractionPipeline(config)
 ctx = pipeline.process(text)
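The next hunk lists the Stage 5 taxonomy classifiers, which return multiple labels per statement above a confidence threshold. A hedged sketch of how MNLI zero-shot classification yields such multi-label output, via the standard transformers pipeline; the model and candidate labels here are placeholders, not the package's actual ESG taxonomy or configuration:

```python
# Sketch of MNLI-style zero-shot multi-label classification; the model
# and label set are placeholders, not the package's ESG taxonomy.
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier(
    "Amazon announced plans to hire 100,000 workers.",
    candidate_labels=["employment", "emissions", "governance"],
    multi_label=True,  # score each label independently
)
# keep every label above the confidence threshold, not just the top one
labels = [l for l, s in zip(result["labels"], result["scores"]) if s > 0.5]
```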
@@ -358,24 +372,177 @@ config = PipelineConfig.from_stage_string("1-3")  # Stages 1, 2, 3
 - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction
 
 **Qualifiers (Stage 3):**
-- `person_qualifier` - PERSON → role, org (
-- `
-- `companies_house_qualifier` - ORG → UK company number
-- `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+- `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
+- `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database
 
-**
-- `organization_canonicalizer` - ORG canonical names
-- `person_canonicalizer` - PERSON name variants
-
-**Labelers (Stage 5):**
+**Labelers (Stage 4):**
 - `sentiment_labeler` - Statement sentiment analysis
+- `confidence_labeler` - Confidence scoring
+- `relation_type_labeler` - Relation type classification
 
-**Taxonomy Classifiers (Stage
+**Taxonomy Classifiers (Stage 5):**
 - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
 - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification
 
 Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
 
+## New in v0.6.0: Entity Embedding Database
+
+v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
+
+### Data Sources
+
+**Organizations:**
+
+| Source | Records | Identifier | EntityType Mapping |
+|--------|---------|------------|-------------------|
+| GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
+| SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
+| Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
+| Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
+
+**People** *(v0.9.0)*:
+
+| Source | Records | Identifier | PersonType Classification |
+|--------|---------|------------|--------------------------|
+| Wikidata | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
+
+### EntityType Classification
+
+Each organization record is classified with an `entity_type` field:
+
+| Category | Types |
+|----------|-------|
+| Business | `business`, `fund`, `branch` |
+| Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
+| Government | `government`, `international_org`, `political_party` |
+| Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
+
+### Building the Database
+
+```bash
+# Import organizations from authoritative sources
+corp-extractor db import-gleif --download
+corp-extractor db import-sec --download  # Bulk submissions.zip (~100K+ filers)
+corp-extractor db import-companies-house --download
+corp-extractor db import-wikidata --limit 50000
+
+# Import notable people (v0.9.0)
+corp-extractor db import-people --type executive --limit 5000
+corp-extractor db import-people --all --limit 10000  # All person types
+
+# Check status
+corp-extractor db status
+
+# Search for an organization
+corp-extractor db search "Microsoft"
+
+# Search for a person (v0.9.0)
+corp-extractor db search-people "Tim Cook"
+```
+
+### Using in Pipeline
+
+The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
+
+```python
+from statement_extractor.pipeline import ExtractionPipeline
+
+pipeline = ExtractionPipeline()
+ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn}")  # e.g., "Microsoft (sec_edgar:0000789019)"
+```
+
+### Publishing to HuggingFace
+
+```bash
+# Upload database with all variants (full, lite, compressed)
+export HF_TOKEN="hf_..."
+corp-extractor db upload  # Uses default cache location
+corp-extractor db upload entities.db  # Or specify path
+corp-extractor db upload --no-lite  # Skip lite version
+corp-extractor db upload --no-compress  # Skip compressed versions
+
+# Download pre-built database (lite version by default)
+corp-extractor db download  # Lite version (smaller, faster)
+corp-extractor db download --full  # Full version with all metadata
+
+# Local database management
+corp-extractor db create-lite entities.db  # Create lite version
+corp-extractor db compress entities.db  # Compress with gzip
+```
+
+See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
+
+## New in v0.7.0: Document Processing
+
+v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
+
+### Document CLI
+
+```bash
+# Process local files
+corp-extractor document process article.txt
+corp-extractor document process report.txt --title "Annual Report" --year 2024
+
+# Process URLs (web pages and PDFs)
+corp-extractor document process https://example.com/article
+corp-extractor document process https://example.com/report.pdf --use-ocr
+
+# Configure chunking
+corp-extractor document process article.txt --max-tokens 500 --overlap 50
+
+# Preview chunking without extraction
+corp-extractor document chunk article.txt --max-tokens 500
+```
+
+### Document Python API
+
+```python
+from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
+from statement_extractor.models.document import ChunkingConfig
+
+# Configure document processing
+config = DocumentPipelineConfig(
+    chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
+    generate_summary=True,
+    deduplicate_across_chunks=True,
+)
+
+pipeline = DocumentPipeline(config)
+
+# Process text
+document = Document.from_text("Your long document text...", title="My Document")
+ctx = pipeline.process(document)
+
+# Process URL (async)
+ctx = await pipeline.process_url("https://example.com/article")
+
+# Access results
+print(f"Chunks: {ctx.chunk_count}")
+print(f"Statements: {ctx.statement_count}")
+print(f"Duplicates removed: {ctx.duplicates_removed}")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+    if stmt.citation:
+        print(f"  Citation: {stmt.citation}")
+```
+
+### PDF Processing
+
+PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
+
+```bash
+# Install OCR dependencies
+pip install "corp-extractor[ocr]"
+
+# Process with OCR
+corp-extractor document process scanned.pdf --use-ocr
+```
+
 ## New in v0.4.0: GLiNER2 Integration
 
 v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
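The v0.6.0 entity database described above pairs sqlite-vec (a new dependency in this release) with sentence-transformers embeddings for its similarity search. A minimal sketch of the KNN query shape against a vec0 virtual table; the `org_embeddings` table, its columns, and the model choice are hypothetical stand-ins, not the package's actual schema or API:

```python
# Hypothetical sketch of a sqlite-vec KNN lookup; table and column
# names are illustrative, not the package's actual schema.
import sqlite3

import sqlite_vec
from sentence_transformers import SentenceTransformer

db = sqlite3.connect("entities.db")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
query = model.encode("Microsoft").tolist()

# nearest-neighbor search ordered by vector distance
rows = db.execute(
    "SELECT name, identifier, distance FROM org_embeddings "
    "WHERE embedding MATCH ? ORDER BY distance LIMIT 5",
    [sqlite_vec.serialize_float32(query)],
).fetchall()
```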
corp_extractor-0.9.0.dist-info/RECORD

@@ -0,0 +1,76 @@
+statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
+statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+statement_extractor/cli.py,sha256=BTFLIBZoNa2ADrYVslbXiZGrzhRWmi7ppbnAPV3xUyg,71191
+statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
+statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
+statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
+statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
+statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
+statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
+statement_extractor/database/hub.py,sha256=HOnRp62RnkXvk2KgwqOLVpEkXwy0LS0n3tIJrkYCo2c,16842
+statement_extractor/database/models.py,sha256=ke4byqJiiBlZfRhxqoC0nsdDhb6YSG2I4S5W5BRBNY4,8813
+statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
+statement_extractor/database/store.py,sha256=1qdRZ7q5nTLUYbtUC9cWSLey_GVf5kAQ6dTF9EEwDXY,56735
+statement_extractor/database/importers/__init__.py,sha256=0CPqafekQpqxFmZhe3uMJLNssqxGzEniZlArGyl8QKU,668
+statement_extractor/database/importers/companies_house.py,sha256=G0DZAs_9RM7uTwY7imt70IXUVvhntoO-xXnJ0o6jjGw,19635
+statement_extractor/database/importers/gleif.py,sha256=MTFuksVf83Barn1c6JvBLBouxXbzogWulKb8oqEODAk,18948
+statement_extractor/database/importers/sec_edgar.py,sha256=_B4QcXhZ_5ulXTSVW9dKKAzFIVwn-VIh_X39jcUhqsg,12923
+statement_extractor/database/importers/wikidata.py,sha256=ZZYHiqSlYlco1TSzCLUKqdT-i5X1cUSK1EnsfWWwPAc,33770
+statement_extractor/database/importers/wikidata_people.py,sha256=loqyf5sbtBqCITiTxqV3PLyx3SefmVefhZE0Y-cRoC4,22205
+statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
+statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
+statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
+statement_extractor/document/deduplicator.py,sha256=8tPKWAGqNfjteOdnk7B82izyfIpvOebirZ-OIQKixwU,4821
+statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
+statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
+statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
+statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
+statement_extractor/models/__init__.py,sha256=9FxKkJ4EIPXmSkMo_j9jqAKH6jTkvz5Tzk2YvQL7UVk,2884
+statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
+statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
+statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
+statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
+statement_extractor/models/statement.py,sha256=agC4jcP9ospbZC91J6c0UgLAmfsg1tnqNcSvkqOtqBQ,3629
+statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
+statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
+statement_extractor/pipeline/orchestrator.py,sha256=1pe6hyEtd495LJrfH3QgxQadNqERmehQEs5LHsAVIxM,16580
+statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
+statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
+statement_extractor/plugins/base.py,sha256=ItqJZ5rH65gW4-pXpraRb45y7F3lXqsKECumhV3tDyk,21516
+statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+statement_extractor/plugins/extractors/gliner2.py,sha256=ObEQMNE6ArjRl2s4x3lkOSPs03cmtTYFlppnbhtkI7A,21876
+statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
+statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
+statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
+statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
+statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
+statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
+statement_extractor/plugins/qualifiers/embedding_company.py,sha256=EmCxImdXBCA7zxM1stAVeAYlzeNPC_jSlyVN5q1XEJA,14567
+statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
+statement_extractor/plugins/qualifiers/person.py,sha256=GZCUJaQncC_wB4nBQ4RLY5dJ-CdARMLpByc_Nn09wj8,28461
+statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
+statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
+statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
+statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+statement_extractor/plugins/splitters/t5_gemma.py,sha256=AwYYKQrAmiue5IK9bbJ-Uhfl9oCZTX1X_tmKguKIdjU,9982
+statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
+statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
+corp_extractor-0.9.0.dist-info/METADATA,sha256=9pWemKEWyeEqW92sRd4SqdMykO-92kl5UIrs-P2xAn0,27553
+corp_extractor-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+corp_extractor-0.9.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+corp_extractor-0.9.0.dist-info/RECORD,,
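The RECORD entries above use the standard wheel format (PEP 376/427): comma-separated path, `sha256=` digest in padding-free urlsafe base64, and size in bytes, with RECORD listing itself without hash or size. A minimal sketch for verifying an unpacked wheel against these entries:

```python
# Sketch: verify files in an unpacked wheel against RECORD entries of
# the form "path,sha256=<urlsafe b64 digest, no padding>,size".
import base64
import csv
import hashlib
from pathlib import Path

def verify_record(root: Path, dist_info: str = "corp_extractor-0.9.0.dist-info") -> None:
    with (root / dist_info / "RECORD").open(newline="") as f:
        for path, hash_spec, size in csv.reader(f):
            if not hash_spec:  # RECORD lists itself with empty fields
                continue
            algo, _, expected = hash_spec.partition("=")
            data = (root / path).read_bytes()
            actual = base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
            assert actual.rstrip(b"=").decode() == expected, f"hash mismatch: {path}"
            assert int(size) == len(data), f"size mismatch: {path}"
```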