corp-extractor 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +446 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +1 -23
  8. statement_extractor/gliner_extraction.py +4 -74
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +4 -1
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  52. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  53. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
{corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: corp-extractor
- Version: 0.4.0
+ Version: 0.5.0
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -23,7 +23,9 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Classifier: Topic :: Text Processing :: Linguistic
  Requires-Python: >=3.10
+ Requires-Dist: accelerate>=1.12.0
  Requires-Dist: click>=8.0.0
+ Requires-Dist: gguf>=0.17.1
  Requires-Dist: gliner2
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: pydantic>=2.0.0
@@ -35,6 +37,8 @@ Requires-Dist: mypy>=1.0.0; extra == 'dev'
  Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
  Requires-Dist: pytest>=7.0.0; extra == 'dev'
  Requires-Dist: ruff>=0.1.0; extra == 'dev'
+ Provides-Extra: llm
+ Requires-Dist: llama-cpp-python>=0.2.0; extra == 'llm'
  Description-Content-Type: text/markdown
  
  # Corp Extractor
@@ -47,21 +51,18 @@ Extract structured subject-predicate-object statements from unstructured text us
  
  ## Features
  
+ - **6-Stage Pipeline** *(v0.5.0)*: Modular plugin-based architecture for full entity resolution
  - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
  - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
+ - **Entity Qualification** *(v0.5.0)*: Adds roles, identifiers (LEI, ticker, company numbers) via external APIs
+ - **Canonicalization** *(v0.5.0)*: Resolves entities to canonical forms with fuzzy matching
+ - **Statement Labeling** *(v0.5.0)*: Sentiment analysis, relation type classification, confidence scoring
  - **GLiNER2 Integration** *(v0.4.0)*: Uses GLiNER2 (205M params) for entity recognition and relation extraction
- - **Predefined Predicates** *(v0.4.0)*: Optional `--predicates` list for GLiNER2 relation extraction mode
- - **Entity-based Scoring** *(v0.4.0)*: Confidence combines semantic similarity (50%) + entity recognition scores (25% each)
- - **Multi-Candidate Extraction**: Generates 3 candidates per statement (hybrid, GLiNER2-only, predicate-split)
- - **Best Triple Selection**: Keeps only highest-scoring triple per source (use `--all-triples` to keep all)
- - **Extraction Method Tracking**: Each statement includes `extraction_method` field (hybrid, gliner, split, model)
+ - **Predefined Predicates**: Optional `--predicates` list for GLiNER2 relation extraction mode
  - **Beam Merging**: Combines top beams for better coverage instead of picking one
  - **Embedding-based Dedup**: Uses semantic similarity to detect near-duplicate predicates
  - **Predicate Taxonomies**: Normalize predicates to canonical forms via embeddings
- - **Contextualized Matching**: Compares full "Subject Predicate Object" against source text for better accuracy
- - **Entity Type Merging**: Automatically merges UNKNOWN entity types with specific types during deduplication
- - **Reversal Detection**: Detects and corrects subject-object reversals using embedding comparison
- - **Command Line Interface**: Full-featured CLI for terminal usage
+ - **Command Line Interface**: Full-featured CLI with `split`, `pipeline`, and `plugins` commands
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
  
  ## Installation
@@ -135,63 +136,74 @@ uvx corp-extractor "Apple announced a new iPhone."
  
  ### Usage Examples
  
- ```bash
- # Extract from text argument
- corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
-
- # Extract from file
- corp-extractor -f article.txt
+ The CLI provides three main commands: `split`, `pipeline`, and `plugins`.
  
- # Pipe from stdin
- cat article.txt | corp-extractor -
+ ```bash
+ # Simple extraction (Stage 1 only, fast)
+ corp-extractor split "Apple Inc. announced the iPhone 15."
+ corp-extractor split -f article.txt --json
+
+ # Full 6-stage pipeline (entity resolution, canonicalization, labeling, taxonomy)
+ corp-extractor pipeline "Amazon CEO Andy Jassy announced plans to hire workers."
+ corp-extractor pipeline -f article.txt --stages 1-3
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
+
+ # Plugin management
+ corp-extractor plugins list
+ corp-extractor plugins list --stage 3
+ corp-extractor plugins info gleif_qualifier
+ ```
  
- # Output as JSON
- corp-extractor "Tim Cook is CEO of Apple." --json
+ ### Split Command (Simple Extraction)
  
- # Output as XML
- corp-extractor -f article.txt --xml
+ ```bash
+ corp-extractor split "Tim Cook is CEO of Apple." --json
+ corp-extractor split -f article.txt --beams 8 --verbose
+ cat article.txt | corp-extractor split -
+ ```
  
- # Verbose output with confidence scores
- corp-extractor -f article.txt --verbose
+ ### Pipeline Command (Full Entity Resolution)
  
- # Use more beams for better quality
- corp-extractor -f article.txt --beams 8
+ ```bash
+ # Run all 6 stages
+ corp-extractor pipeline "Apple CEO Tim Cook announced..."
  
- # Use custom predicate taxonomy
- corp-extractor -f article.txt --taxonomy predicates.txt
+ # Run specific stages
+ corp-extractor pipeline "..." --stages 1-3       # Stages 1, 2, 3
+ corp-extractor pipeline "..." --stages 1,2,5     # Stages 1, 2, 5
+ corp-extractor pipeline "..." --skip-stages 4,5  # Skip stages 4 and 5
  
- # Use GPU explicitly
- corp-extractor -f article.txt --device cuda
+ # Plugin selection
+ corp-extractor pipeline "..." --plugins gleif,companies_house
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
  ```
  
- ### CLI Options
+ ### CLI Reference
  
  ```
- Usage: corp-extractor [OPTIONS] [TEXT]
+ Usage: corp-extractor [COMMAND] [OPTIONS]
  
- Options:
+ Commands:
+   split     Simple extraction (T5-Gemma only)
+   pipeline  Full 6-stage pipeline with entity resolution
+   plugins   List or inspect available plugins
+
+ Split Options:
  -f, --file PATH                Read input from file
  -o, --output [table|json|xml]  Output format (default: table)
- --json                         Output as JSON (shortcut)
- --xml                          Output as XML (shortcut)
+ --json / --xml                 Output format shortcuts
  -b, --beams INTEGER            Number of beams (default: 4)
- --diversity FLOAT              Diversity penalty (default: 1.0)
- --max-tokens INTEGER           Max tokens to generate (default: 2048)
- --no-dedup                     Disable deduplication
- --no-embeddings                Disable embedding-based dedup (faster)
- --no-merge                     Disable beam merging
- --no-gliner                    Disable GLiNER2 extraction (use raw model output)
- --predicates TEXT              Comma-separated predicate types for GLiNER2 relation extraction
- --all-triples                  Keep all candidate triples (default: best per source)
- --dedup-threshold FLOAT        Deduplication threshold (default: 0.65)
- --min-confidence FLOAT         Min confidence filter (default: 0)
- --taxonomy PATH                Load predicate taxonomy from file
- --taxonomy-threshold FLOAT     Taxonomy matching threshold (default: 0.5)
+ --no-gliner                    Disable GLiNER2 extraction
+ --predicates TEXT              Comma-separated predicates for relation extraction
  --device [auto|cuda|mps|cpu]   Device to use (default: auto)
  -v, --verbose                  Show confidence scores and metadata
- -q, --quiet                    Suppress progress messages
- --version                      Show version
- --help                         Show this message
+
+ Pipeline Options:
+ --stages TEXT                  Stages to run (e.g., '1-3' or '1,2,5')
+ --skip-stages TEXT             Stages to skip (e.g., '4,5')
+ --plugins TEXT                 Enable only these plugins (comma-separated)
+ --disable-plugins TEXT         Disable these plugins (comma-separated)
+ -o, --output [table|json|yaml|triples]  Output format
  ```
  
  ## New in v0.2.0: Quality Scoring & Beam Merging
@@ -285,6 +297,85 @@ for stmt in fixed_statements:
  
  During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
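The orientation check above can be pictured as an embedding comparison of both readings against the source sentence. Here is a minimal sketch of the idea, not the package's internal code; the sentence-transformers model choice and the `orient` helper name are illustrative assumptions.

```python
# Sketch of reversal detection via embedding comparison; illustrative only
# (model choice and helper name are not from the package).
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

def orient(subj: str, pred: str, obj: str, source: str) -> tuple[str, str, str]:
    """Keep whichever "S P O" reading sits closer to the source text."""
    forward = f"{subj} {pred} {obj}"
    reverse = f"{obj} {pred} {subj}"
    emb = model.encode([forward, reverse, source], convert_to_tensor=True)
    fwd = util.cos_sim(emb[0], emb[2]).item()
    rev = util.cos_sim(emb[1], emb[2]).item()
    return (subj, pred, obj) if fwd >= rev else (obj, pred, subj)
```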
 
+ ## New in v0.5.0: Pipeline Architecture
+ 
+ v0.5.0 introduces a **6-stage plugin-based pipeline** for comprehensive entity resolution, statement enrichment, and taxonomy classification.
+ 
+ ### Pipeline Stages
+ 
+ | Stage | Name | Input | Output | Key Tech |
+ |-------|------|-------|--------|----------|
+ | 1 | Splitting | Text | `RawTriple[]` | T5-Gemma2 |
+ | 2 | Extraction | `RawTriple[]` | `PipelineStatement[]` | GLiNER2 |
+ | 3 | Qualification | Entities | `QualifiedEntity[]` | Gemma3, APIs |
+ | 4 | Canonicalization | `QualifiedEntity[]` | `CanonicalEntity[]` | Fuzzy matching |
+ | 5 | Labeling | Statements | `LabeledStatement[]` | Sentiment, etc. |
+ | 6 | Taxonomy | Statements | `TaxonomyResult[]` | MNLI, Embeddings |
+ 
+ ### Pipeline Python API
+ 
+ ```python
+ from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig
+ 
+ # Run full pipeline
+ pipeline = ExtractionPipeline()
+ ctx = pipeline.process("Amazon CEO Andy Jassy announced plans to hire workers.")
+ 
+ # Access results at each stage
+ print(f"Raw triples: {len(ctx.raw_triples)}")
+ print(f"Statements: {len(ctx.statements)}")
+ print(f"Labeled: {len(ctx.labeled_statements)}")
+ 
+ # Output with fully qualified names
+ for stmt in ctx.labeled_statements:
+     print(f"{stmt.subject_fqn} --[{stmt.statement.predicate}]--> {stmt.object_fqn}")
+     # e.g., "Andy Jassy (CEO, Amazon) --[announced]--> plans to hire workers"
+ ```
+ 
+ ### Pipeline Configuration
+ 
+ ```python
+ from statement_extractor.pipeline import PipelineConfig, ExtractionPipeline
+ 
+ # Run only specific stages
+ config = PipelineConfig(
+     enabled_stages={1, 2, 3},                  # Skip canonicalization, labeling, and taxonomy
+     disabled_plugins={"sec_edgar_qualifier"},  # Disable specific plugins
+ )
+ pipeline = ExtractionPipeline(config)
+ ctx = pipeline.process(text)
+ 
+ # Alternative: create config from stage string
+ config = PipelineConfig.from_stage_string("1-3")  # Stages 1, 2, 3
+ ```
+ 
+ ### Built-in Plugins
+ 
+ **Splitters (Stage 1):**
+ - `t5_gemma_splitter` - T5-Gemma2 statement extraction
+ 
+ **Extractors (Stage 2):**
+ - `gliner2_extractor` - GLiNER2 entity recognition and relation extraction
+ 
+ **Qualifiers (Stage 3):**
+ - `person_qualifier` - PERSON → role, org (uses Gemma3)
+ - `gleif_qualifier` - ORG → LEI, jurisdiction (GLEIF API)
+ - `companies_house_qualifier` - ORG → UK company number
+ - `sec_edgar_qualifier` - ORG → SEC CIK, ticker
+ 
+ **Canonicalizers (Stage 4):**
+ - `organization_canonicalizer` - ORG canonical names
+ - `person_canonicalizer` - PERSON name variants
+ 
+ **Labelers (Stage 5):**
+ - `sentiment_labeler` - Statement sentiment analysis
+ 
+ **Taxonomy Classifiers (Stage 6):**
+ - `mnli_taxonomy_classifier` - MNLI zero-shot classification against ESG taxonomy
+ - `embedding_taxonomy_classifier` - Embedding similarity-based taxonomy classification
+ 
+ Taxonomy classifiers return **multiple labels** per statement above the confidence threshold.
+ 
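The multi-label behaviour is easiest to see with the underlying zero-shot pattern run directly through Hugging Face's pipeline; the model name, candidate labels, and 0.5 threshold below are illustrative assumptions, not the plugin's actual configuration.

```python
# Multi-label MNLI zero-shot classification: every label is scored
# independently, so several can clear the threshold at once.
from transformers import pipeline

clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = clf(
    "Amazon announced plans to hire 100,000 warehouse workers.",
    candidate_labels=["employment", "funding", "ownership", "environment"],
    multi_label=True,  # score each label on its own, not softmaxed together
)
# Keep all labels above the threshold, not just the top one
labels = [l for l, s in zip(result["labels"], result["scores"]) if s >= 0.5]
```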
  ## New in v0.4.0: GLiNER2 Integration
  
  v0.4.0 replaces spaCy with **GLiNER2** (205M params) for entity recognition and relation extraction. GLiNER2 is a unified model that handles NER, text classification, structured data extraction, and relation extraction with CPU-optimized inference.
@@ -297,12 +388,31 @@ The T5-Gemma model excels at:
  
  GLiNER2 now handles:
  - **Entity recognition** - refining subject/object boundaries
- - **Relation extraction** - when predefined predicates are provided
+ - **Relation extraction** - using 324 default predicates across 21 categories
  - **Entity scoring** - scoring how "entity-like" subjects/objects are
+ - **Confidence scoring** - real confidence values via `include_confidence=True`
+ 
+ ### Default Predicates
+ 
+ GLiNER2 uses **324 predicates** organized into 21 categories (ownership, employment, funding, etc.). These are loaded from `default_predicates.json` and include descriptions and confidence thresholds.
  
- ### Two Extraction Modes
+ **Key features:**
+ - **All matches returned** - Every matching relation is returned, not just the best one
+ - **Category-based extraction** - Iterates through categories to stay under GLiNER2's ~25 label limit
+ - **Custom predicate files** - Provide your own JSON file with custom predicates
  
- **Mode 1: With Predicate List** (GLiNER2 relation extraction)
+ ### Extraction Modes
+ 
+ **Mode 1: Default Predicates** (recommended)
+ ```python
+ from statement_extractor import extract_statements
+ 
+ # Uses 324 built-in predicates automatically
+ result = extract_statements("John works for Apple Inc. in Cupertino.")
+ # Returns ALL matching relations
+ ```
+ 
+ **Mode 2: Custom Predicate List**
  ```python
  from statement_extractor import extract_statements, ExtractionOptions
  
@@ -315,27 +425,35 @@ Or via CLI:
  corp-extractor "John works for Apple Inc." --predicates "works_for,founded,acquired"
  ```
  
- **Mode 2: Without Predicate List** (entity-refined extraction)
+ **Mode 3: Custom Predicate File**
  ```python
- result = extract_statements("Apple announced a new iPhone.")
- # Uses GLiNER2 for entity extraction to refine boundaries
- # Extracts predicate from source text using T5-Gemma's hint
+ from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig
+ 
+ config = PipelineConfig(
+     extractor_options={"predicates_file": "/path/to/custom_predicates.json"}
+ )
+ pipeline = ExtractionPipeline(config)
+ ctx = pipeline.process("John works for Apple Inc.")
+ ```
+ 
+ Or via CLI:
+ ```bash
+ corp-extractor pipeline "John works for Apple Inc." --predicates-file custom_predicates.json
  ```
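The README does not spell out the JSON schema for custom predicate files, so the safest starting point is to inspect the bundled `default_predicates.json` (shipped in `statement_extractor/data/`, per the file list above) and mirror its shape. A small sketch:

```python
# Load the bundled predicate file to learn its schema before writing a
# custom one; mirror whatever structure you see here.
import json
from importlib import resources

path = resources.files("statement_extractor.data") / "default_predicates.json"
data = json.loads(path.read_text())
print(type(data))  # per the README: 324 predicates across 21 categories
```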
 
- ### Three Candidate Extraction Methods
+ ### Two Candidate Extraction Methods
  
- For each statement, three candidates are generated and the best is selected:
+ For each statement, two candidates are generated and the best is selected:
  
  | Method | Description |
  |--------|-------------|
  | `hybrid` | Model subject/object + GLiNER2/extracted predicate |
- | `gliner` | All components refined by GLiNER2 |
- | `split` | Source text split around the predicate |
+ | `gliner` | All components refined by GLiNER2 entity recognition |
  
  ```python
  for stmt in result:
      print(f"{stmt.subject.text} --[{stmt.predicate}]--> {stmt.object.text}")
-     print(f"  Method: {stmt.extraction_method}")  # hybrid, gliner, split, or model
+     print(f"  Method: {stmt.extraction_method}")  # hybrid, gliner, or model
      print(f"  Confidence: {stmt.confidence_score:.2f}")
  ```
  
@@ -359,8 +477,7 @@ Confidence scores combine **semantic similarity** and **entity recognition**:
  
  Each statement includes an `extraction_method` field:
  - `hybrid` - Model subject/object + GLiNER2 predicate
- - `gliner` - All components refined by GLiNER2
- - `split` - Subject/object from splitting source text around predicate
+ - `gliner` - All components refined by GLiNER2 entity recognition
  - `model` - All components from T5-Gemma model (only when `--no-gliner`)
  
  ### Best Triple Selection
corp_extractor-0.5.0.dist-info/RECORD
@@ -0,0 +1,55 @@
+ statement_extractor/__init__.py,sha256=Lmgw3jtwrfu09mXSfNFCB5AN0J6tsEQ2uOrrQciMrtI,3215
+ statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
+ statement_extractor/cli.py,sha256=iqsqvLAN0FMRoE4KskEoW-4DE5_7Tll8xeHA1t04KJg,25028
+ statement_extractor/extractor.py,sha256=CGJCmAMiIoDsPtjIdvOHYBcz8058eYpfLMngjELMJhI,38403
+ statement_extractor/gliner_extraction.py,sha256=OL4w-0_rZc6XCojaVsbGY4VdIXRJ6j8ZmeUeTOL0Ue0,8118
+ statement_extractor/llm.py,sha256=1eBrYs-bUPqzbpiiif_hH-gE_DeM-l3cmddrIoLHFXo,8010
+ statement_extractor/models.py,sha256=fXTT7qxPqynnrrpb77nCgs3K2yn_YgbSugSXv12boX4,12312
+ statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
+ statement_extractor/scoring.py,sha256=s_8nhavBNzPPFmGf2FyBummH4tgP7YGpXoMhl2Jh3Xw,16650
+ statement_extractor/data/default_predicates.json,sha256=7rhFXWHvStDj4sLYfBXKS50xGChPaqMKUaKTkMEJRGk,32870
+ statement_extractor/data/statement_taxonomy.json,sha256=XhCeVBC4aQB-7NR40Niu4yN2BmL0c2Gd-RKkUpsYK24,37981
+ statement_extractor/models/__init__.py,sha256=gjTu450FPe9dvhIVQXqBwF8u0hgSnPORGXzxmSEuCnM,2564
+ statement_extractor/models/canonical.py,sha256=ld6z6RtK03iOs_aUk8Rftcm0pUoaFpLUfyfbKI26N_o,4354
+ statement_extractor/models/entity.py,sha256=l2ny91BnnWwPo9zx1_Fb8WMKPNuIQFN0H7ILncylmcY,3214
+ statement_extractor/models/labels.py,sha256=e-mFDuzb42oJ69gLZTWCdg5_MNqRftQ2La5x8y9Cv-Y,6236
+ statement_extractor/models/qualifiers.py,sha256=YkvyWh2p1fK5iMRDC2Dq1r-XJOmJ1rvWFTFUIkQ9zcc,3495
+ statement_extractor/models/statement.py,sha256=cOgabA7IJxHYjlH5AksJRNf2Rv5VScMPqZdfjQyXRN0,2733
+ statement_extractor/pipeline/__init__.py,sha256=Q3M2Arx9BWH_APZxM-P0G-C3ISguG1whiA5QhxDHQCA,1071
+ statement_extractor/pipeline/config.py,sha256=rxZN27OWp05F-NaatwrYkjp56zbzHZ0hMtNU1mvBxgw,4130
+ statement_extractor/pipeline/context.py,sha256=wURDYtzDrmbHu40Af_C_oTtN55wnULKHNZjUx6O8t-0,6126
+ statement_extractor/pipeline/orchestrator.py,sha256=oHegnsDzXj87q8iAoi-QZj2ZyB1rX5qmg57BdIjvKo0,17617
+ statement_extractor/pipeline/registry.py,sha256=qj5M5tMm9GmNCguy8dWBXMT8XmhemiZjJMktZsRlevw,11415
+ statement_extractor/plugins/__init__.py,sha256=8k3lQGQNQSMUzxCmk4nAH8dIc1DqEnMyiqHlZZv81q0,1099
+ statement_extractor/plugins/base.py,sha256=GZ4WT5S2mH3C_uN6nyBz-nGlAn_Z2o2A51FSRu6gCEo,12797
+ statement_extractor/plugins/canonicalizers/__init__.py,sha256=LDb9NodyuLSoLzrLnNzMeviK79GHnyaLGU0J_02BBgM,421
+ statement_extractor/plugins/canonicalizers/base.py,sha256=dbreQuEPB48eBJmah7hpl67azVU4QLhbvSrjXr0vT88,195
+ statement_extractor/plugins/canonicalizers/location.py,sha256=Rz5SCM4bb0p0gsnHPzsQJv-RN59yoj9Z1NmF8yLQNv0,6590
+ statement_extractor/plugins/canonicalizers/organization.py,sha256=L-mhdctkRXuu84RsNHp80M_tDIiMumYaHAG6WfxpH4c,7482
+ statement_extractor/plugins/canonicalizers/person.py,sha256=Nw8FuJOBmg-cTaOTd2BJ1TZtydprfzIKL25wJa_VJek,6944
+ statement_extractor/plugins/extractors/__init__.py,sha256=sqxTI7WwDLVQKwOiQXqWS72gjJnwb76Gs9N3LGetBnI,253
+ statement_extractor/plugins/extractors/base.py,sha256=kNRsQ7BL84lXPXREm7CihrprDUaFwDDvMpBcbZlwSGA,179
+ statement_extractor/plugins/extractors/gliner2.py,sha256=rgfY8l9v8EWCxfB3g6hLnmLCIekTBkfWMG8dgSAZu-E,21627
+ statement_extractor/plugins/labelers/__init__.py,sha256=flHEoBvnzQ3vAKkIUHyezpYi2H3KJvYGRerCVnc80r0,965
+ statement_extractor/plugins/labelers/base.py,sha256=hIgJKq2LU00OcL0Zjy1L9hP8K2onlM_xtZ63XcH8qDE,171
+ statement_extractor/plugins/labelers/confidence.py,sha256=XiXjBYe-8ch_SCKnz0sAwTT1mJ_XKMsuzXBbwAW_OK0,4083
+ statement_extractor/plugins/labelers/relation_type.py,sha256=e5ASwVqJGMSCrx5GtyNk85q_-19D7W_4jI-J-Pv_kxY,2506
+ statement_extractor/plugins/labelers/sentiment.py,sha256=nlWv9ymb7hlDIcFa-gjbIvZlJY1VrHrXhKMD-udmIzM,5027
+ statement_extractor/plugins/labelers/taxonomy.py,sha256=jQp5emgWf6XgmOx7arh-owF_-TjVxiPKSJ2OGkTPbBs,12427
+ statement_extractor/plugins/labelers/taxonomy_embedding.py,sha256=grvC_R_sg05hR6l0DgaELy2wmf6OkbvV1pRuNU0FVk4,16027
+ statement_extractor/plugins/qualifiers/__init__.py,sha256=kefjGunlVDKLy2NXmtr5ZXyYi-swyQdPLkB-tHV_0vk,495
+ statement_extractor/plugins/qualifiers/base.py,sha256=Kx--OdIh77mnjSkTl1NvUeekItRiG8AnBUcuznOZeBI,179
+ statement_extractor/plugins/qualifiers/companies_house.py,sha256=_6ExJCjD0V4eZNYXtfBY99obqLpRaSv-G-V7N6R1wLg,5376
+ statement_extractor/plugins/qualifiers/gleif.py,sha256=WZqcNT_Yq4yVe4rdkWO59C9yZ4geV2ZTDk9wxLlOeTg,5645
+ statement_extractor/plugins/qualifiers/person.py,sha256=si_9CLjHsH9jYFugej4t0HMnsivclh-Yi70U6NglfIU,7101
+ statement_extractor/plugins/qualifiers/sec_edgar.py,sha256=3XDbizlR9YQgLrC7p-owV8Td-3TYaJlMb4B7saha3vw,6288
+ statement_extractor/plugins/splitters/__init__.py,sha256=05CYeAEO0lZsapK5pjxZJbOCLI1kjeK6IQjftxqqg5g,224
+ statement_extractor/plugins/splitters/base.py,sha256=GeIBchFTr8icRSfYR8bGSb4-GoEZ1N0IGN6Kl5W2mL0,175
+ statement_extractor/plugins/splitters/t5_gemma.py,sha256=8joOzlMKXhSyJaq5c3F8t-gdPcZEDiVAzNcMlgJAqsE,6733
+ statement_extractor/plugins/taxonomy/__init__.py,sha256=8N0tW7pm95DSCqM-s99ea0Tigbi9bZMyTkKblR1qmLw,307
+ statement_extractor/plugins/taxonomy/embedding.py,sha256=QW1RR07JoE8Ah97gDZ_w_ATEe6-z2t2nl1zeTDAgFjM,11347
+ statement_extractor/plugins/taxonomy/mnli.py,sha256=IzLjHXUFgVAgEvYI5EzOBs19UxvpcbJa8HjqI__tYII,8905
+ corp_extractor-0.5.0.dist-info/METADATA,sha256=H4Z8ExZFdbknpHg-EZ1P9B137hCPwKXBezHSF7X9EOE,21567
+ corp_extractor-0.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ corp_extractor-0.5.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
+ corp_extractor-0.5.0.dist-info/RECORD,,
statement_extractor/__init__.py
@@ -97,6 +97,9 @@ __all__ = [
      # Scoring
      "BeamScorer",
      "TripleScorer",
+     # LLM (lazy import)
+     "LLM",
+     "get_llm",
  ]
  
@@ -109,4 +112,10 @@ def __getattr__(name: str):
      if name == "EmbeddingDependencyError":
          from .predicate_comparer import EmbeddingDependencyError
          return EmbeddingDependencyError
+     if name == "LLM":
+         from .llm import LLM
+         return LLM
+     if name == "get_llm":
+         from .llm import get_llm
+         return get_llm
      raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
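For context, the hunk above is the standard PEP 562 lazy-import pattern: `LLM` and `get_llm` are advertised in `__all__`, but `statement_extractor.llm` (and with it the optional `llama-cpp-python` dependency from the new `llm` extra) is only imported on first attribute access. An illustrative usage sketch:

```python
# PEP 562 lazy import in action (illustrative usage, not package code).
import statement_extractor

# Nothing LLM-related is imported yet; the first attribute access below
# triggers the module-level __getattr__, which imports .llm on demand.
LLM = statement_extractor.LLM
get_llm = statement_extractor.get_llm

# Unknown names still fail loudly, courtesy of the final raise:
# statement_extractor.does_not_exist  ->  AttributeError
```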