corp-extractor 0.2.11__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/METADATA +104 -19
- corp_extractor-0.3.0.dist-info/RECORD +12 -0
- statement_extractor/__init__.py +3 -1
- statement_extractor/cli.py +10 -0
- statement_extractor/extractor.py +305 -22
- statement_extractor/models.py +27 -1
- statement_extractor/scoring.py +160 -90
- statement_extractor/spacy_extraction.py +386 -0
- corp_extractor-0.2.11.dist-info/RECORD +0 -11
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.2.11.dist-info → corp_extractor-0.3.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: corp-extractor
|
|
3
|
-
Version: 0.2.11
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
|
|
6
6
|
Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
|
|
@@ -26,17 +26,15 @@ Requires-Python: >=3.10
|
|
|
26
26
|
Requires-Dist: click>=8.0.0
|
|
27
27
|
Requires-Dist: numpy>=1.24.0
|
|
28
28
|
Requires-Dist: pydantic>=2.0.0
|
|
29
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
30
|
+
Requires-Dist: spacy>=3.5.0
|
|
29
31
|
Requires-Dist: torch>=2.0.0
|
|
30
32
|
Requires-Dist: transformers>=5.0.0rc3
|
|
31
|
-
Provides-Extra: all
|
|
32
|
-
Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
|
|
33
33
|
Provides-Extra: dev
|
|
34
34
|
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
35
35
|
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
36
36
|
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
37
37
|
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
38
|
-
Provides-Extra: embeddings
|
|
39
|
-
Requires-Dist: sentence-transformers>=2.2.0; extra == 'embeddings'
|
|
40
38
|
Description-Content-Type: text/markdown
|
|
41
39
|
|
|
42
40
|
# Corp Extractor
|
|
@@ -51,7 +49,11 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
51
49
|
|
|
52
50
|
- **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
|
|
53
51
|
- **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
|
|
54
|
-
- **Quality Scoring** *(v0.
|
|
52
|
+
- **Combined Quality Scoring** *(v0.3.0)*: Confidence combines semantic similarity (50%) + subject/object noun scores (25% each)
|
|
53
|
+
- **spaCy-First Predicates** *(v0.3.0)*: Always uses spaCy for predicate extraction (model predicates are unreliable)
|
|
54
|
+
- **Multi-Candidate Extraction** *(v0.3.0)*: Generates 3 candidates per statement (hybrid, spaCy-only, predicate-split)
|
|
55
|
+
- **Best Triple Selection** *(v0.3.0)*: Keeps only highest-scoring triple per source (use `--all-triples` to keep all)
|
|
56
|
+
- **Extraction Method Tracking** *(v0.3.0)*: Each statement includes `extraction_method` field (hybrid, spacy, split, model)
|
|
55
57
|
- **Beam Merging** *(v0.2.0)*: Combines top beams for better coverage instead of picking one
|
|
56
58
|
- **Embedding-based Dedup** *(v0.2.0)*: Uses semantic similarity to detect near-duplicate predicates
|
|
57
59
|
- **Predicate Taxonomies** *(v0.2.0)*: Normalize predicates to canonical forms via embeddings
|
|
@@ -64,27 +66,22 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
64
66
|
## Installation
|
|
65
67
|
|
|
66
68
|
```bash
|
|
67
|
-
# Recommended: include embedding support for smart deduplication
|
|
68
|
-
pip install "corp-extractor[embeddings]"
|
|
69
|
-
|
|
70
|
-
# Minimal installation (no embedding features)
|
|
71
69
|
pip install corp-extractor
|
|
72
70
|
```
|
|
73
71
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
```
|
|
72
|
+
The spaCy model for predicate inference is downloaded automatically on first use.
|
|
73
|
+
|
|
74
|
+
**Note**: This package requires `transformers>=5.0.0` for T5-Gemma2 model support.
|
|
78
75
|
|
|
79
76
|
**For GPU support**, install PyTorch with CUDA first:
|
|
80
77
|
```bash
|
|
81
78
|
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
|
82
|
-
pip install
|
|
79
|
+
pip install corp-extractor
|
|
83
80
|
```
|
|
84
81
|
|
|
85
82
|
**For Apple Silicon (M1/M2/M3)**, MPS acceleration is automatically detected:
|
|
86
83
|
```bash
|
|
87
|
-
pip install
|
|
84
|
+
pip install corp-extractor # MPS used automatically
|
|
88
85
|
```
|
|
89
86
|
|
|
90
87
|
## Quick Start
|
|
@@ -182,6 +179,8 @@ Options:
|
|
|
182
179
|
--no-dedup Disable deduplication
|
|
183
180
|
--no-embeddings Disable embedding-based dedup (faster)
|
|
184
181
|
--no-merge Disable beam merging
|
|
182
|
+
--no-spacy Disable spaCy extraction (use raw model output)
|
|
183
|
+
--all-triples Keep all candidate triples (default: best per source)
|
|
185
184
|
--dedup-threshold FLOAT Deduplication threshold (default: 0.65)
|
|
186
185
|
--min-confidence FLOAT Min confidence filter (default: 0)
|
|
187
186
|
--taxonomy PATH Load predicate taxonomy from file
|
|
@@ -284,7 +283,91 @@ for stmt in fixed_statements:
|
|
|
284
283
|
|
|
285
284
|
During deduplication, reversed duplicates (e.g., "A -> P -> B" and "B -> P -> A") are now detected and merged, with the correct orientation determined by source text similarity.
|
|
286
285
|
|
|
287
|
-
## Disable Embeddings
|
|
286
|
+
## New in v0.3.0: spaCy-First Extraction & Semantic Scoring
|
|
287
|
+
|
|
288
|
+
v0.3.0 introduces significant improvements to extraction quality:
|
|
289
|
+
|
|
290
|
+
### spaCy-First Predicate Extraction
|
|
291
|
+
|
|
292
|
+
The T5-Gemma model is excellent at:
|
|
293
|
+
- **Triple isolation** - identifying that a relationship exists
|
|
294
|
+
- **Coreference resolution** - resolving pronouns to named entities
|
|
295
|
+
|
|
296
|
+
But unreliable at:
|
|
297
|
+
- **Predicate extraction** - often returns empty or wrong predicates
|
|
298
|
+
|
|
299
|
+
**Solution:** v0.3.0 always uses spaCy for predicate extraction. The model provides subject, object, entity types, and source text; spaCy provides the predicate.
|
|
300
|
+
|
|
301
|
+
### Three Candidate Extraction Methods
|
|
302
|
+
|
|
303
|
+
For each statement, three candidates are generated and the best is selected:
|
|
304
|
+
|
|
305
|
+
| Method | Description |
|
|
306
|
+
|--------|-------------|
|
|
307
|
+
| `hybrid` | Model subject/object + spaCy predicate |
|
|
308
|
+
| `spacy` | All components from spaCy dependency parsing |
|
|
309
|
+
| `split` | Source text split around the predicate |
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
for stmt in result:
|
|
313
|
+
print(f"{stmt.subject.text} --[{stmt.predicate}]--> {stmt.object.text}")
|
|
314
|
+
print(f" Method: {stmt.extraction_method}") # hybrid, spacy, split, or model
|
|
315
|
+
print(f" Confidence: {stmt.confidence_score:.2f}")
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### Combined Quality Scoring
|
|
319
|
+
|
|
320
|
+
Confidence scores combine **semantic similarity** and **grammatical accuracy**:
|
|
321
|
+
|
|
322
|
+
| Component | Weight | Description |
|
|
323
|
+
|-----------|--------|-------------|
|
|
324
|
+
| Semantic similarity | 50% | Cosine similarity between source text and reassembled triple |
|
|
325
|
+
| Subject noun score | 25% | How noun-like the subject is |
|
|
326
|
+
| Object noun score | 25% | How noun-like the object is |
|
|
327
|
+
|
|
328
|
+
**Noun scoring:**
|
|
329
|
+
- Proper noun(s) only: 1.0
|
|
330
|
+
- Common noun(s) only: 0.8
|
|
331
|
+
- Contains noun + other words: 0.4-0.8 (based on ratio)
|
|
332
|
+
- No nouns: 0.2
|
|
333
|
+
|
|
334
|
+
This ensures extracted subjects and objects are grammatically valid entities, not fragments or verb phrases.
|
|
335
|
+
|
|
336
|
+
### Extraction Method Tracking
|
|
337
|
+
|
|
338
|
+
Each statement now includes an `extraction_method` field:
|
|
339
|
+
- `hybrid` - Model subject/object + spaCy predicate
|
|
340
|
+
- `spacy` - All components from spaCy dependency parsing
|
|
341
|
+
- `split` - Subject/object from splitting source text around predicate
|
|
342
|
+
- `model` - All components from T5-Gemma model (only when `--no-spacy`)
|
|
343
|
+
|
|
344
|
+
### Best Triple Selection
|
|
345
|
+
|
|
346
|
+
By default, only the **highest-scoring triple** is kept for each source sentence. This ensures clean output without redundant candidates.
|
|
347
|
+
|
|
348
|
+
To keep all candidate triples (for debugging or analysis):
|
|
349
|
+
```python
|
|
350
|
+
options = ExtractionOptions(all_triples=True)
|
|
351
|
+
result = extract_statements(text, options)
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
Or via CLI:
|
|
355
|
+
```bash
|
|
356
|
+
corp-extractor "Your text" --all-triples --verbose
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
**Disable spaCy extraction** to use only model output:
|
|
360
|
+
```python
|
|
361
|
+
options = ExtractionOptions(use_spacy_extraction=False)
|
|
362
|
+
result = extract_statements(text, options)
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
Or via CLI:
|
|
366
|
+
```bash
|
|
367
|
+
corp-extractor "Your text" --no-spacy
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
## Disable Embeddings
|
|
288
371
|
|
|
289
372
|
```python
|
|
290
373
|
options = ExtractionOptions(
|
|
@@ -360,14 +443,16 @@ This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam
|
|
|
360
443
|
6. **Contextualized Matching** *(v0.2.2)*: Full statement context used for canonicalization and dedup
|
|
361
444
|
7. **Entity Type Merging** *(v0.2.3)*: UNKNOWN types merged with specific types during dedup
|
|
362
445
|
8. **Reversal Detection** *(v0.2.3)*: Subject-object reversals detected and corrected via embedding comparison
|
|
446
|
+
9. **Hybrid spaCy** *(v0.2.12)*: spaCy candidates added to pool alongside model output for better coverage
|
|
363
447
|
|
|
364
448
|
## Requirements
|
|
365
449
|
|
|
366
450
|
- Python 3.10+
|
|
367
451
|
- PyTorch 2.0+
|
|
368
|
-
- Transformers
|
|
452
|
+
- Transformers 5.0+
|
|
369
453
|
- Pydantic 2.0+
|
|
370
|
-
- sentence-transformers 2.2+
|
|
454
|
+
- sentence-transformers 2.2+
|
|
455
|
+
- spaCy 3.5+ (model downloaded automatically on first use)
|
|
371
456
|
- ~2GB VRAM (GPU) or ~4GB RAM (CPU)
|
|
372
457
|
|
|
373
458
|
## Links
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
statement_extractor/__init__.py,sha256=KwZfWnTB9oevTLw0TrNlYFu67qIYO-34JqDtcpjOhZI,3013
|
|
2
|
+
statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
|
|
3
|
+
statement_extractor/cli.py,sha256=JMEXiT2xwmW1J8JmJliQh32AT-7bTAtAscPx1AGRfPg,9054
|
|
4
|
+
statement_extractor/extractor.py,sha256=vS8UCgE8uITt_28PwCh4WCqOjWLpfrJcN3fh1YPBcjA,39657
|
|
5
|
+
statement_extractor/models.py,sha256=FxLj2fIodX317XVIJLZ0GFNahm_VV07KzdoLSSjoVD4,11952
|
|
6
|
+
statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
|
|
7
|
+
statement_extractor/scoring.py,sha256=pdNgyLHmlk-npISzm4nycK9G4wM2nztg5KTG7piFACI,18135
|
|
8
|
+
statement_extractor/spacy_extraction.py,sha256=ACvIB-Ag7H7h_Gb0cdypIr8fnf3A-UjyJnqqjWD5Ccs,12320
|
|
9
|
+
corp_extractor-0.3.0.dist-info/METADATA,sha256=eu8b7R_FQxFyc_9FSocy078TTyB7BwvGX-YAS79hKgg,17042
|
|
10
|
+
corp_extractor-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
11
|
+
corp_extractor-0.3.0.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
|
|
12
|
+
corp_extractor-0.3.0.dist-info/RECORD,,
|
statement_extractor/__init__.py
CHANGED
|
@@ -29,12 +29,13 @@ Example:
|
|
|
29
29
|
>>> data = extract_statements_as_dict("Some text...")
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
|
-
__version__ = "0.2.11"
|
|
32
|
+
__version__ = "0.3.0"
|
|
33
33
|
|
|
34
34
|
# Core models
|
|
35
35
|
from .models import (
|
|
36
36
|
Entity,
|
|
37
37
|
EntityType,
|
|
38
|
+
ExtractionMethod,
|
|
38
39
|
ExtractionOptions,
|
|
39
40
|
ExtractionResult,
|
|
40
41
|
Statement,
|
|
@@ -73,6 +74,7 @@ __all__ = [
|
|
|
73
74
|
# Core models
|
|
74
75
|
"Entity",
|
|
75
76
|
"EntityType",
|
|
77
|
+
"ExtractionMethod",
|
|
76
78
|
"ExtractionOptions",
|
|
77
79
|
"ExtractionResult",
|
|
78
80
|
"Statement",
|
statement_extractor/cli.py
CHANGED
|
@@ -34,6 +34,7 @@ def _configure_logging(verbose: bool) -> None:
|
|
|
34
34
|
"statement_extractor.scoring",
|
|
35
35
|
"statement_extractor.predicate_comparer",
|
|
36
36
|
"statement_extractor.canonicalization",
|
|
37
|
+
"statement_extractor.spacy_extraction",
|
|
37
38
|
]:
|
|
38
39
|
logging.getLogger(logger_name).setLevel(level)
|
|
39
40
|
|
|
@@ -65,6 +66,8 @@ from .models import (
|
|
|
65
66
|
@click.option("--no-dedup", is_flag=True, help="Disable deduplication")
|
|
66
67
|
@click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
|
|
67
68
|
@click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
|
|
69
|
+
@click.option("--no-spacy", is_flag=True, help="Disable spaCy extraction (use raw model output)")
|
|
70
|
+
@click.option("--all-triples", is_flag=True, help="Keep all candidate triples instead of selecting best per source")
|
|
68
71
|
@click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
|
|
69
72
|
# Quality options
|
|
70
73
|
@click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
|
|
@@ -89,6 +92,8 @@ def main(
|
|
|
89
92
|
no_dedup: bool,
|
|
90
93
|
no_embeddings: bool,
|
|
91
94
|
no_merge: bool,
|
|
95
|
+
no_spacy: bool,
|
|
96
|
+
all_triples: bool,
|
|
92
97
|
dedup_threshold: float,
|
|
93
98
|
min_confidence: float,
|
|
94
99
|
taxonomy: Optional[str],
|
|
@@ -160,6 +165,8 @@ def main(
|
|
|
160
165
|
deduplicate=not no_dedup,
|
|
161
166
|
embedding_dedup=not no_embeddings,
|
|
162
167
|
merge_beams=not no_merge,
|
|
168
|
+
use_spacy_extraction=not no_spacy,
|
|
169
|
+
all_triples=all_triples,
|
|
163
170
|
predicate_taxonomy=predicate_taxonomy,
|
|
164
171
|
predicate_config=predicate_config,
|
|
165
172
|
scoring_config=scoring_config,
|
|
@@ -225,6 +232,9 @@ def _print_table(result, verbose: bool):
|
|
|
225
232
|
click.echo(f" {stmt.object.text}{object_type}")
|
|
226
233
|
|
|
227
234
|
if verbose:
|
|
235
|
+
# Always show extraction method
|
|
236
|
+
click.echo(f" Method: {stmt.extraction_method.value}")
|
|
237
|
+
|
|
228
238
|
if stmt.confidence_score is not None:
|
|
229
239
|
click.echo(f" Confidence: {stmt.confidence_score:.2f}")
|
|
230
240
|
|