corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
**{corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA**

````diff
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: corp-extractor
-Version: 0.5.0
-Summary: Extract structured
+Version: 0.9.3
+Summary: Extract structured entity and relationship information from text
 Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
 Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
 Project-URL: Repository, https://github.com/corp-o-rate/statement-extractor
@@ -9,7 +9,7 @@ Project-URL: Issues, https://github.com/corp-o-rate/statement-extractor/issues
 Author-email: Corp-o-Rate <neil@corp-o-rate.com>
 Maintainer-email: Corp-o-Rate <neil@corp-o-rate.com>
 License: MIT
-Keywords: diverse-beam-search,embeddings,gemma,information-extraction,knowledge-graph,nlp,statement-extraction,subject-predicate-object,
+Keywords: diverse-beam-search,embeddings,entities,entity-linking,entity-resolution,gemma,information-extraction,knowledge-graph,nlp,semantic-parsing,statement-extraction,subject-predicate-object,t5gemma2,transformers,triples
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
@@ -24,14 +24,25 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Requires-Dist: accelerate>=1.12.0
+Requires-Dist: beautifulsoup4>=4.12.0
 Requires-Dist: click>=8.0.0
 Requires-Dist: gguf>=0.17.1
 Requires-Dist: gliner2
+Requires-Dist: httpx>=0.25.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: llama-cpp-python>=0.3.16
 Requires-Dist: numpy>=1.24.0
+Requires-Dist: pycountry>=24.6.1
 Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pymupdf>=1.23.0
 Requires-Dist: sentence-transformers>=2.2.0
+Requires-Dist: sqlite-vec>=0.1.6
 Requires-Dist: torch>=2.0.0
 Requires-Dist: transformers>=5.0.0rc3
+Provides-Extra: all
+Requires-Dist: llama-cpp-python>=0.2.0; extra == 'all'
+Requires-Dist: pillow>=10.0.0; extra == 'all'
+Requires-Dist: pytesseract>=0.3.10; extra == 'all'
 Provides-Extra: dev
 Requires-Dist: mypy>=1.0.0; extra == 'dev'
 Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
@@ -39,6 +50,9 @@ Requires-Dist: pytest>=7.0.0; extra == 'dev'
 Requires-Dist: ruff>=0.1.0; extra == 'dev'
 Provides-Extra: llm
 Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
+Provides-Extra: ocr
+Requires-Dist: pillow>=10.0.0; extra == 'ocr'
+Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
 Description-Content-Type: text/markdown
 
 # Corp Extractor
@@ -51,18 +65,21 @@ Extract structured subject-predicate-object statements from unstructured text us
 
 ## Features
 
-- **
+- **Person Database** *(v0.9.2)*: Qualify notable people (executives, politicians, athletes, etc.) against Wikidata with canonical IDs
+- **Organization Canonicalization** *(v0.9.2)*: Link equivalent records across sources (LEI, ticker, CIK, name matching)
+- **5-Stage Pipeline** *(v0.8.0)*: Modular plugin-based architecture for full entity resolution
+- **Document Processing** *(v0.7.0)*: Process documents, URLs, and PDFs with chunking and deduplication
+- **Entity Embedding Database** *(v0.6.0)*: Fast entity qualification using vector similarity (~100K+ SEC, ~3M GLEIF, ~5M UK organizations)
 - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
 - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
-- **Entity Qualification** *(v0.
-- **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+- **Entity Qualification** *(v0.8.0)*: Adds identifiers (LEI, ticker, company numbers), canonical names, and FQN via embedding database
 - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
 - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
 - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
 - **Beam Merging**: Combines top beams for better coverage instead of picking one
 - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
 - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
-- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `
+- **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, `document`, and `db` commands
 - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
 
 ## Installation
@@ -143,7 +160,7 @@ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
 corp-extractor split "Apple Inc. announced the iPhone 15."
 corp-extractor split -f article.txt --json
 
-# Full
+# Full 5-stage pipeline (entity resolution, labeling, taxonomy)
 corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
 corp-extractor pipeline -f article.txt --stages 1-3
 corp-extractor pipeline "..." --disable-plugins sec_edgar
@@ -206,10 +223,10 @@ Pipeline Options:
   -o, --output [table|json|yaml|triples]  Output format
 ```
 
-##
+## Quality Scoring & Beam Merging
 
-By default, the library
-- **Scores each triple**
+By default, the library:
+- **Scores each triple** using semantic similarity (50%) + GLiNER2 entity recognition (50%)
 - **Merges top beams** instead of selecting one, improving coverage
 - **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")
````
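The new "Quality Scoring & Beam Merging" section above describes a 50/50 blend of semantic similarity and GLiNER2 entity recognition. A minimal sketch of that blend (illustrative only: the model choice, function shape, and the stand-in `entity_confidence` input are assumptions, not the package's internals):

```python
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

def score_triple(source_text: str, triple: tuple[str, str, str],
                 entity_confidence: float) -> float:
    """Blend semantic similarity (50%) with entity-recognition confidence (50%),
    per the README's description. `entity_confidence` stands in for GLiNER2's
    score of the triple's subject/object spans."""
    restated = " ".join(triple)  # "subject predicate object"
    semantic = util.cos_sim(
        embedder.encode(source_text, convert_to_tensor=True),
        embedder.encode(restated, convert_to_tensor=True),
    ).item()
    return 0.5 * semantic + 0.5 * entity_confidence
```

The same kind of embedder can back the "bought" ≈ "acquired" dedup bullet: near-duplicate predicates are those whose embeddings exceed a cosine-similarity threshold.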
````diff
@@ -297,9 +314,9 @@ for stmt in fixed_statements:
 
 During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
 
-##
+## Pipeline Architecture
 
-
+The library uses a **5-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
 
 ### Pipeline Stages
 
````
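The hunk above documents reversed-duplicate merging, with orientation picked by source-text similarity. A sketch of that behaviour (illustrative; the package's actual logic lives in its deduplicator/predicate-comparer modules and may differ):

```python
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model

def merge_reversed(triples: list[tuple[str, str, str]], source: str):
    """Collapse (A, P, B) / (B, P, A) pairs, keeping the orientation whose
    restatement reads most like the source text."""
    src = embedder.encode(source, convert_to_tensor=True)

    def fit(t: tuple[str, str, str]) -> float:
        # How closely "subject predicate object" matches the source sentence.
        return util.cos_sim(src, embedder.encode(" ".join(t), convert_to_tensor=True)).item()

    kept: dict[tuple[frozenset[str], str], tuple[str, str, str]] = {}
    for s, p, o in triples:
        key = (frozenset((s, o)), p)  # same entity pair + predicate, either direction
        cand = (s, p, o)
        kept[key] = cand if key not in kept else max(kept[key], cand, key=fit)
    return list(kept.values())
```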
````diff
@@ -307,10 +324,9 @@ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity r
 |-------|------|-------|--------|----------|
 | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
 | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
-| 3 | Qualification | Entities | `
-| 4 |
-| 5 |
-| 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+| 3 | Qualification | Entities | `CanonicalEntity[]` | Embedding DB |
+| 4 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+| 5 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
 
 ### Pipeline Python API
 
@@ -339,8 +355,8 @@ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
 
 # Run only specific stages
 config = PipelineConfig(
-    enabled_stages={1, 2, 3},  # Skip
-    disabled_plugins={"
+    enabled_stages={1, 2, 3},  # Skip labeling and taxonomy
+    disabled_plugins={"person_qualifier"},  # Disable specific plugins
 )
 pipeline = ExtractionPipeline(config)
 ctx = pipeline.process(text)
````
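The hunk header above shows `PipelineConfig.from_stage_string("1-3")`. The stage-string form it implies is a small range grammar; a hedged sketch of such a parser (illustrative only — the package's actual method may accept more forms):

```python
def parse_stage_string(spec: str) -> set[int]:
    """Parse "1-3", "1,3,5", or a mix like "1-2,5" into stage numbers."""
    stages: set[int] = set()
    for part in spec.split(","):
        if "-" in part:
            lo, hi = part.split("-", 1)
            stages.update(range(int(lo), int(hi) + 1))
        else:
            stages.add(int(part))
    return stages

assert parse_stage_string("1-3") == {1, 2, 3}
```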
````diff
@@ -358,24 +374,206 @@ config = PipelineConfig.from_stage_string("1-3")  # Stages 1, 2, 3
 - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction
 
 **Qualifiers (Stage 3):**
-- `person_qualifier` - PERSON → role, org (
-- `
-- `companies_house_qualifier` - ORG → UK company number
-- `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+- `person_qualifier` - PERSON → role, org, canonical ID via Wikidata person database *(enhanced in v0.9.0)*
+- `embedding_company_qualifier` - ORG → canonical name, identifiers (LEI, CIK, company number), and FQN via embedding database
 
-**
-- `organization_canonicalizer` - ORG canonical names
-- `person_canonicalizer` - PERSON name variants
-
-**Labelers (Stage 5):**
+**Labelers (Stage 4):**
 - `sentiment_labeler` - Statement sentiment analysis
+- `confidence_labeler` - Confidence scoring
+- `relation_type_labeler` - Relation type classification
 
-**Taxonomy Classifiers (Stage
+**Taxonomy Classifiers (Stage 5):**
 - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
 - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification
 
 Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
 
+## Entity Database
+
+The library includes an **entity embedding database** for fast entity qualification using vector similarity search. It stores records from authoritative sources (GLEIF, SEC, Companies House, Wikidata) with 768-dimensional embeddings for semantic matching.
+
+**Quick start:**
+```bash
+corp-extractor db download                  # Download pre-built database
+corp-extractor db search "Microsoft"        # Search organizations
+corp-extractor db search-people "Tim Cook"  # Search people
+```
+
+For comprehensive documentation including schema, CLI reference, Python API, and build instructions, see **[ENTITY_DATABASE.md](./ENTITY_DATABASE.md)**.
+
+## New in v0.6.0: Entity Embedding Database
+
+v0.6.0 introduces an **entity embedding database** for fast entity qualification using vector similarity search.
+
+### Data Sources
+
+**Organizations:**
+
+| Source | Records | Identifier | EntityType Mapping |
+|--------|---------|------------|-------------------|
+| GLEIF | ~3.2M | LEI (Legal Entity Identifier) | GENERAL→business, FUND→fund, BRANCH→branch, INTERNATIONAL_ORGANIZATION→international_org |
+| SEC Edgar | ~100K+ | CIK (Central Index Key) | business (or fund via SIC codes) |
+| Companies House | ~5M | UK Company Number | Maps company_type to business/nonprofit |
+| Wikidata | Variable | Wikidata QID | 35+ query types mapped to EntityType |
+
+**People** *(v0.9.0)*:
+
+| Source | Records | Identifier | PersonType Classification |
+|--------|---------|------------|--------------------------|
+| Wikidata (SPARQL) | Variable | Wikidata QID | executive, politician, athlete, artist, academic, scientist, journalist, entrepreneur, activist |
+| Wikidata (Dump) | All humans with enwiki | Wikidata QID | Classified from positions (P39) and occupations (P106) |
+
+**Date Fields**: All importers now include `from_date` and `to_date` where available:
+- **GLEIF**: LEI registration date
+- **SEC Edgar**: First SEC filing date
+- **Companies House**: Incorporation and dissolution dates
+- **Wikidata Orgs**: Inception (P571) and dissolution (P576) dates
+- **Wikidata People**: Position start (P580) and end (P582) dates
+
+**Note**: The same person can have multiple records with different role/org combinations (unique on `source_id + role + org`). Organizations discovered during people import are automatically inserted into the organizations table with `known_for_org_id` foreign key linking people to their organizations.
+
+### EntityType Classification
+
+Each organization record is classified with an `entity_type` field:
+
+| Category | Types |
+|----------|-------|
+| Business | `business`, `fund`, `branch` |
+| Non-profit | `nonprofit`, `ngo`, `foundation`, `trade_union` |
+| Government | `government`, `international_org`, `political_party` |
+| Other | `educational`, `research`, `healthcare`, `media`, `sports`, `religious`, `unknown` |
+
+### Building the Database
+
+```bash
+# Import organizations from authoritative sources
+corp-extractor db import-gleif --download
+corp-extractor db import-sec --download              # Bulk submissions.zip (~100K+ filers)
+corp-extractor db import-companies-house --download
+corp-extractor db import-wikidata --limit 50000
+
+# Import notable people (v0.9.0)
+corp-extractor db import-people --type executive --limit 5000
+corp-extractor db import-people --all --limit 10000                 # All person types
+corp-extractor db import-people --type executive --skip-existing    # Skip existing records
+corp-extractor db import-people --type executive --enrich-dates     # Fetch role start/end dates
+
+# Import from Wikidata dump (v0.9.1) - avoids SPARQL timeouts
+corp-extractor db import-wikidata-dump --download --limit 50000                      # Downloads ~100GB dump
+corp-extractor db import-wikidata-dump --dump /path/to/dump.bz2 --people --no-orgs   # Local dump
+
+# Check status
+corp-extractor db status
+
+# Search for an organization
+corp-extractor db search "Microsoft"
+
+# Search for a person (v0.9.0)
+corp-extractor db search-people "Tim Cook"
+```
+
+### Using in Pipeline
+
+The database is automatically used by the `embedding_company_qualifier` plugin for Stage 3 (Qualification):
+
+```python
+from statement_extractor.pipeline import ExtractionPipeline
+
+pipeline = ExtractionPipeline()
+ctx = pipeline.process("Microsoft acquired Activision Blizzard.")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn}")  # e.g., "Microsoft (sec_edgar:0000789019)"
+```
+
+### Publishing to HuggingFace
+
+```bash
+# Upload database with all variants (full, lite, compressed)
+export HF_TOKEN="hf_..."
+corp-extractor db upload                  # Uses default cache location
+corp-extractor db upload entities.db      # Or specify path
+corp-extractor db upload --no-lite        # Skip lite version
+corp-extractor db upload --no-compress    # Skip compressed versions
+
+# Download pre-built database (lite version by default)
+corp-extractor db download          # Lite version (smaller, faster)
+corp-extractor db download --full   # Full version with all metadata
+
+# Local database management
+corp-extractor db create-lite entities.db   # Create lite version
+corp-extractor db compress entities.db      # Compress with gzip
+```
+
+See [COMPANY_DB.md](../COMPANY_DB.md) for complete build and publish instructions.
+
+## New in v0.7.0: Document Processing
+
+v0.7.0 introduces **document-level processing** for handling files, URLs, and PDFs with automatic chunking, deduplication, and citation tracking.
+
+### Document CLI
+
+```bash
+# Process local files
+corp-extractor document process article.txt
+corp-extractor document process report.txt --title "Annual Report" --year 2024
+
+# Process URLs (web pages and PDFs)
+corp-extractor document process https://example.com/article
+corp-extractor document process https://example.com/report.pdf --use-ocr
+
+# Configure chunking
+corp-extractor document process article.txt --max-tokens 500 --overlap 50
+
+# Preview chunking without extraction
+corp-extractor document chunk article.txt --max-tokens 500
+```
+
+### Document Python API
+
+```python
+from statement_extractor.document import DocumentPipeline, DocumentPipelineConfig, Document
+from statement_extractor.models.document import ChunkingConfig
+
+# Configure document processing
+config = DocumentPipelineConfig(
+    chunking=ChunkingConfig(target_tokens=1000, overlap_tokens=100),
+    generate_summary=True,
+    deduplicate_across_chunks=True,
+)
+
+pipeline = DocumentPipeline(config)
+
+# Process text
+document = Document.from_text("Your long document text...", title="My Document")
+ctx = pipeline.process(document)
+
+# Process URL (async)
+ctx = await pipeline.process_url("https://example.com/article")
+
+# Access results
+print(f"Chunks: {ctx.chunk_count}")
+print(f"Statements: {ctx.statement_count}")
+print(f"Duplicates removed: {ctx.duplicates_removed}")
+
+for stmt in ctx.labeled_statements:
+    print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+    if stmt.citation:
+        print(f"  Citation: {stmt.citation}")
+```
+
+### PDF Processing
+
+PDFs are automatically parsed using PyMuPDF. For scanned PDFs, use OCR:
+
+```bash
+# Install OCR dependencies
+pip install "corp-extractor[ocr]"
+
+# Process with OCR
+corp-extractor document process scanned.pdf --use-ocr
+```
+
 ## New in v0.4.0: GLiNER2 Integration
 
 v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
````
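Given the new `sqlite-vec>=0.1.6` dependency and the README's mention of 768-dimensional embeddings, the lookup behind `db search` plausibly runs a KNN query over a vec0 virtual table. A sketch under those assumptions (the table name, column name, and embedding model below are hypothetical, not the package's actual schema):

```python
import sqlite3

import sqlite_vec
from sqlite_vec import serialize_float32
from sentence_transformers import SentenceTransformer

db = sqlite3.connect("entities.db")
db.enable_load_extension(True)
sqlite_vec.load(db)  # registers the vec0 virtual-table module
db.enable_load_extension(False)

model = SentenceTransformer("all-mpnet-base-v2")  # assumed 768-dim model
query = model.encode("Microsoft").tolist()

# Hypothetical table: one float[768] embedding per organization record.
rows = db.execute(
    "SELECT rowid, distance FROM org_embeddings "
    "WHERE embedding MATCH ? ORDER BY distance LIMIT 5",
    (serialize_float32(query),),
).fetchall()
```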
````diff
@@ -575,7 +773,7 @@ for text in texts:
 This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):
 
 1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with diversity penalty
-2. **Quality Scoring**: Each triple scored
+2. **Quality Scoring**: Each triple scored via semantic similarity + GLiNER2 entity recognition
 3. **Beam Merging**: Top beams combined for better coverage
 4. **Embedding Dedup**: Semantic similarity removes near-duplicate predicates
 5. **Predicate Normalization**: Optional taxonomy matching via embeddings
````
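Steps 1-2 above map onto Hugging Face transformers' group ("diverse") beam search. A minimal sketch, assuming a seq2seq checkpoint and a transformers version that exposes these generation options (the model id is a placeholder, not the package's T5-Gemma 2 checkpoint):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/flan-t5-small")  # placeholder model
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

inputs = tok("Amazon CEO Andy Jassy announced plans to hire workers.",
             return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=8,
    num_beam_groups=4,       # beams split into groups...
    diversity_penalty=1.0,   # ...penalized for repeating earlier groups' tokens
    num_return_sequences=4,  # keep 4 candidates for scoring/merging
    max_new_tokens=128,
)
candidates = tok.batch_decode(outputs, skip_special_tokens=True)
```

Each candidate is then scored, merged, and deduplicated as in steps 2-4.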
**corp_extractor-0.9.3.dist-info/RECORD**

````diff
@@ -0,0 +1,79 @@
+statement_extractor/__init__.py,sha256=vOJFsK6wNOoBvGYOvIKsseaqpFR8vNg_XPH-r8SmLas,3215
+statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+statement_extractor/cli.py,sha256=l4YcqKmtks6exMAIHSUw_ukWGZ4x-v_V_Gnm-wOGc3g,106464
+statement_extractor/extractor.py,sha256=m10na6I2iU1GwokQTxodePttYgigHykoss5LWrE8JOQ,38418
+statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+statement_extractor/models.py,sha256=rBotCX2hRTMW4MXXkkWYv4JctP0HQR0NSJSlBcNhsF0,12302
+statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+statement_extractor/scoring.py,sha256=V9WHQ-QCAoycnnaTHydWkFo-48_lcS6Mkztxjfi4wVg,16632
+statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+statement_extractor/data/statement_taxonomy.json,sha256=LI9RWthuJTFCcuaIbh6h3FEu8EJpejiKjAtNM_y1s8A,336543
+statement_extractor/database/__init__.py,sha256=1eScQOm7866v9xndaqCK-xsXDUhKBSj5YGtGoQ80LgU,1548
+statement_extractor/database/embeddings.py,sha256=j_gUTEdRyyQCPcx9imsOh1nVDPjeiRXXG22OZ7KIO4w,5535
+statement_extractor/database/hub.py,sha256=3xCvbCeqC6GR3XgVow7MAXg46ZPDYfDX8it93Xikw5w,13295
+statement_extractor/database/models.py,sha256=4aLs5tp2QTAd9vAyPf80EUoHypd_K0jELcY4J51iaNw,10563
+statement_extractor/database/resolver.py,sha256=_fTITarFmAYOtuRbOos48ou_aqX4yJC0K2csdLbIktI,7202
+statement_extractor/database/store.py,sha256=FPyfC6KjD6pjfU2jccVEvsAcCtfqMdwVJuS7aqCNrKA,112320
+statement_extractor/database/importers/__init__.py,sha256=acIoX_BPdXv2DOMFyVbFZPDGNWp2s1FpC774loTqL5I,1121
+statement_extractor/database/importers/companies_house.py,sha256=b5OMFtoHhkPgoGK08ThQn9BtTu9uC_dYzBVpC10xT4U,20252
+statement_extractor/database/importers/companies_house_officers.py,sha256=QDFA0FzqDx9p6VjRrB7o4BE3e30l7i0ML_ktntsB-kA,15565
+statement_extractor/database/importers/gleif.py,sha256=sw4YYROD6wi7IbBEKGCn8kko0nOYbKOyukDJKGQp17Q,20200
+statement_extractor/database/importers/sec_edgar.py,sha256=0nnhnOrf5d1wR9PGjl8AuNOnp4mfmEtopjkgUY_PLQc,13738
+statement_extractor/database/importers/sec_form4.py,sha256=ZoV-oyNhG5AOUm4u9hemmRI5KnpNs3Gw_dfisjkD3zU,18234
+statement_extractor/database/importers/wikidata.py,sha256=tRj4kEMVIq7sRXxjyxj-scl8eXybkrLVvyNDYV2T5lg,39572
+statement_extractor/database/importers/wikidata_dump.py,sha256=GSLn_BV4h-Efms2tp_eYyhqSJsRFjnZzyqgaUCDmyVY,77903
+statement_extractor/database/importers/wikidata_people.py,sha256=s4AB2pQLK2qHK9X5BLoW-II3qZBbJG4zbU3Ro4FBT9o,43157
+statement_extractor/document/__init__.py,sha256=csbUUjxaZay-0WXtjZmULjDfL9VNxhOlePyKTMdRDYo,1714
+statement_extractor/document/chunker.py,sha256=I76p6Qwujk2kkN7GJ1sMwbQNOfEpbt29u-RxJdt15oE,14020
+statement_extractor/document/context.py,sha256=9DvyguwCjlef2MeNWZMgydvD54FPiOppjdvamQnrKzM,5450
+statement_extractor/document/deduplicator.py,sha256=R_RwEdVeVQBYZHvjkVA0ShAWr8x618VrO9dkYWXvifI,4771
+statement_extractor/document/html_extractor.py,sha256=YRhaIsurBJTeECLkL2YJsSv8gDJJN33fS-ESkGvDBGs,6600
+statement_extractor/document/loader.py,sha256=Ygund7bz4EVcwsFsxkrrgSjOCK4tbb_sqkMlzK_oEKM,8996
+statement_extractor/document/pipeline.py,sha256=h4q-CG_WtBLibkTXCFhfTizMme8bJS5f6ZWOECqhRYU,13675
+statement_extractor/document/summarizer.py,sha256=DOF6qPw0oWEtLSt97oXOFyzb0jGWZZ7frDFp11rL3is,5853
+statement_extractor/models/__init__.py,sha256=OJOK0ral_jskrSxx6nCc3TB6JlVYaC5HI2eYXr9dhMQ,2971
+statement_extractor/models/canonical.py,sha256=LaSU3CUJZOtBM1SpRTAmK-3N7QnYmxZYJvQE1NVIjLY,6003
+statement_extractor/models/document.py,sha256=McCyXz88YtJtlsfiFzagjRAhY32ovpIDKXQI_eV_DZI,9203
+statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+statement_extractor/models/labels.py,sha256=NUcjFDuGUOM82mgsaWOdoIVbRNiQ6TdN-imNuTograo,7326
+statement_extractor/models/qualifiers.py,sha256=l--khVzt-N6jgibZ-MSSl-3SdQUZJN9dGoxdNhRmM_I,5926
+statement_extractor/models/statement.py,sha256=Wpp2OtZ5inhqbtEcblWdcES7g7lA-FVjqjz6Jq7hqzo,3329
+statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+statement_extractor/pipeline/config.py,sha256=FXtqMMpRmdeuHB86D6YrFx5A36XHVg5GlBBZuPEn4JA,3957
+statement_extractor/pipeline/context.py,sha256=evAdyH5oOCNM_ILGZNS1mov3lM4D3mCvr5hzsjaB0Bs,6136
+statement_extractor/pipeline/orchestrator.py,sha256=qH6rD4_wI_kZ_e8NeIv2XYHUA07ldogFewFsZeRQVxw,16687
+statement_extractor/pipeline/registry.py,sha256=yBybhRd1HU2Y75TebLGBzF6nbPiHKZ0cHkyj-3CVnhg,11390
+statement_extractor/plugins/__init__.py,sha256=pIcPeoMFd-56jOM_kGrUWvPuwqN6vFJ-oUbu130-tzI,1345
+statement_extractor/plugins/base.py,sha256=xC661iFtnhIxtZLTwuCc-0rFV1q2V3hCTV-uOaILsOA,21622
+statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+statement_extractor/plugins/extractors/gliner2.py,sha256=yDwKJVniMj4YwjR4Rm6MALDk633H5qcKcxa2xOLh9LI,21999
+statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+statement_extractor/plugins/labelers/taxonomy.py,sha256=u_TQVCTOZCtZis5ZP0xvxh5Ehc0fCJ-DG6E86GxjNcs,12725
+statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=NsSls2jkWm8LyNNuDkG2Rs4PYKQQxeMUDLTRrvSNk_A,16305
+statement_extractor/plugins/pdf/__init__.py,sha256=QLbgg3lgpwUKR1EGmzhbOJh5IB4-3rpWen9c75YNLtM,220
+statement_extractor/plugins/pdf/pypdf.py,sha256=JgmWa1-6tiATbPvhONMqRd5kAXJ--tb8rlEcR3u73sk,8612
+statement_extractor/plugins/qualifiers/__init__.py,sha256=H4FEZSw1GWBQB-Y79nQnLwhZ3okKQJqgJHGEA0Zp8pA,951
+statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+statement_extractor/plugins/qualifiers/companies_house.py,sha256=6TlK6Zebb5wDJ9GGO3FvM9zOh27TWpio5BX9k7lBr7U,5854
+statement_extractor/plugins/qualifiers/embedding_company.py,sha256=nc7oTFjEBuPiprjXKeFRiMYM6tNicMNum_xQ9LSgEOg,14756
+statement_extractor/plugins/qualifiers/gleif.py,sha256=zHzC9eOt0R9Z56n0CXgTF7POJqu6v03SRmiJLmv8OGE,6104
+statement_extractor/plugins/qualifiers/person.py,sha256=EN1T0G9NT6wOeIGljzZql11o63BujaHzK44yRqMTiRk,29034
+statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=d7QqGiE-3lFDQiXkYmNQU62K4oP2XYK6NzV6LNKPC5k,6754
+statement_extractor/plugins/scrapers/__init__.py,sha256=mh1nmPtcsewrYeW5oELeke6DSzL8jsGOJ2OcH-A4-eo,208
+statement_extractor/plugins/scrapers/http.py,sha256=igoB1JN7U-FPdBFmNfrdZV-Ho4JQ3RXniLz17SmQx8I,7778
+statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+statement_extractor/plugins/splitters/t5_gemma.py,sha256=5qjxeHznuAA9hL8EbUDDGQ3N2gYLmtg0hv9BsLWzfMk,9971
+statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+statement_extractor/plugins/taxonomy/embedding.py,sha256=yCuNE8UeY8tH2dHGRKL3hmRQBmdz9_9YQ0t5_VTCf7E,16349
+statement_extractor/plugins/taxonomy/mnli.py,sha256=zPZlpAHQqdnwH7fXS_CSY0HCMnaSrrk-fDQb1ZIqqPc,9163
+corp_extractor-0.9.3.dist-info/METADATA,sha256=Ps8LucareMigmuhXiPIDUXPgsWp5F7noVYT7VbTrSZA,29633
+corp_extractor-0.9.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+corp_extractor-0.9.3.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+corp_extractor-0.9.3.dist-info/RECORD,,
````