corp_extractor-0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,62 @@
+ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+ # dependencies
+ /node_modules
+ /.pnp
+ .pnp.*
+ .yarn/*
+ !.yarn/patches
+ !.yarn/plugins
+ !.yarn/releases
+ !.yarn/versions
+
+ # testing
+ /coverage
+
+ # next.js
+ /.next/
+ /out/
+
+ # production
+ /build
+
+ # misc
+ .DS_Store
+ *.pem
+
+ # debug
+ npm-debug.log*
+ yarn-debug.log*
+ yarn-error.log*
+ .pnpm-debug.log*
+
+ # env files (can opt-in for committing if needed)
+ .env*
+ !.env.example
+
+ # vercel
+ .vercel
+
+ # typescript
+ *.tsbuildinfo
+ next-env.d.ts
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ .venv/
+ venv/
+ *.egg-info/
+
+ # uv
+ uv.lock
+
+ # IDE
+ .idea/
+ *.iml
+ .vscode/
+
+ # Model files (too large for git)
+ /model/
+ /.claude/settings.local.json
@@ -0,0 +1,224 @@
+ Metadata-Version: 2.4
+ Name: corp-extractor
+ Version: 0.2.0
+ Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
+ Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
+ Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
+ Project-URL: Repository, https://github.com/corp-o-rate/statement-extractor
+ Project-URL: Issues, https://github.com/corp-o-rate/statement-extractor/issues
+ Author-email: Corp-o-Rate <neil@corp-o-rate.com>
+ Maintainer-email: Corp-o-Rate <neil@corp-o-rate.com>
+ License: MIT
+ Keywords: diverse-beam-search,embeddings,gemma,information-extraction,knowledge-graph,nlp,statement-extraction,subject-predicate-object,t5,transformers,triples
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.10
+ Requires-Dist: numpy>=1.24.0
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: torch>=2.0.0
+ Requires-Dist: transformers>=4.35.0
+ Provides-Extra: all
+ Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
+ Provides-Extra: dev
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
+ Provides-Extra: embeddings
+ Requires-Dist: sentence-transformers>=2.2.0; extra == 'embeddings'
+ Description-Content-Type: text/markdown
+
+ # Corp Extractor
+
+ Extract structured subject-predicate-object statements from unstructured text using the T5-Gemma 2 model.
+
+ [![PyPI version](https://badge.fury.io/py/corp-extractor.svg)](https://badge.fury.io/py/corp-extractor)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ ## Features
+
+ - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
+ - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
+ - **Quality Scoring** *(v0.2.0)*: Each triple is scored for groundedness (0-1) against the source text
+ - **Beam Merging** *(v0.2.0)*: Combines top beams for better coverage instead of picking one
+ - **Embedding-based Dedup** *(v0.2.0)*: Uses semantic similarity to detect near-duplicate predicates
+ - **Predicate Taxonomies** *(v0.2.0)*: Normalize predicates to canonical forms via embeddings
+ - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
+
+ ## Installation
+
+ ```bash
+ # Recommended: include embedding support for smart deduplication
+ pip install corp-extractor[embeddings]
+
+ # Minimal installation (no embedding features)
+ pip install corp-extractor
+ ```
+
+ **Note**: For GPU support, install PyTorch with CUDA first:
+ ```bash
+ pip install torch --index-url https://download.pytorch.org/whl/cu121
+ pip install corp-extractor[embeddings]
+ ```
+
+ ## Quick Start
+
+ ```python
+ from statement_extractor import extract_statements
+
+ result = extract_statements("""
+ Apple Inc. announced the iPhone 15 at their September event.
+ Tim Cook presented the new features to customers worldwide.
+ """)
+
+ for stmt in result:
+     print(f"{stmt.subject.text} ({stmt.subject.type})")
+     print(f" --[{stmt.predicate}]--> {stmt.object.text}")
+     print(f" Confidence: {stmt.confidence_score:.2f}")  # NEW in v0.2.0
+ ```
+
+ ## New in v0.2.0: Quality Scoring & Beam Merging
+
+ By default, the library now:
+ - **Scores each triple** for groundedness, based on whether its entities appear in the source text
+ - **Merges top beams** instead of selecting a single one, improving coverage
+ - **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")
+
+ ```python
+ from statement_extractor import ExtractionOptions, ScoringConfig, extract_statements
+
+ # Precision mode: filter out low-confidence triples
+ scoring = ScoringConfig(min_confidence=0.7)
+ options = ExtractionOptions(scoring_config=scoring)
+ result = extract_statements(text, options)
+
+ # Access confidence scores
+ for stmt in result:
+     print(f"{stmt} (confidence: {stmt.confidence_score:.2f})")
+ ```
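+
+ The groundedness check itself is easy to picture: a triple whose subject and object both appear verbatim in the source text is well grounded. Below is a minimal sketch of that idea; the `groundedness` helper is hypothetical, not the library's internal scorer:
+
+ ```python
+ def groundedness(subject: str, obj: str, source: str) -> float:
+     """Toy groundedness score: the fraction of the triple's entities
+     found verbatim in the source text (0.0-1.0)."""
+     source_lower = source.lower()
+     hits = sum(term.lower() in source_lower for term in (subject, obj))
+     return hits / 2
+
+ # Both entities appear in the text, so the triple scores 1.0
+ print(groundedness("Apple Inc.", "iPhone 15", "Apple Inc. announced the iPhone 15."))
+ ```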
+
+ ## New in v0.2.0: Predicate Taxonomies
+
+ Normalize predicates to canonical forms using embedding similarity:
+
+ ```python
+ from statement_extractor import PredicateTaxonomy, ExtractionOptions, extract_statements
+
+ taxonomy = PredicateTaxonomy(predicates=[
+     "acquired", "founded", "works_for", "announced",
+     "invested_in", "partnered_with"
+ ])
+
+ options = ExtractionOptions(predicate_taxonomy=taxonomy)
+ result = extract_statements(text, options)
+
+ # "bought" -> "acquired" via embedding similarity
+ for stmt in result:
+     if stmt.canonical_predicate:
+         print(f"{stmt.predicate} -> {stmt.canonical_predicate}")
+ ```
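+
+ Under the hood, taxonomy matching reduces to comparing predicate embeddings. Here is a sketch of that comparison using sentence-transformers directly; the embedding model named below is illustrative, not necessarily the one the library loads:
+
+ ```python
+ from sentence_transformers import SentenceTransformer, util
+
+ # Illustrative embedding model choice
+ model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ embeddings = model.encode(["bought", "acquired"], convert_to_tensor=True)
+
+ # Cosine similarity near 1.0 indicates near-synonymous predicates
+ score = util.cos_sim(embeddings[0], embeddings[1]).item()
+ print(f"similarity('bought', 'acquired') = {score:.2f}")
+ ```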
+
+ ## Disable Embeddings (Faster, No Extra Dependencies)
+
+ ```python
+ from statement_extractor import ExtractionOptions, extract_statements
+
+ options = ExtractionOptions(
+     embedding_dedup=False,  # Use exact text matching
+     merge_beams=False,      # Select the single best beam
+ )
+ result = extract_statements(text, options)
+ ```
+
+ ## Output Formats
+
+ ```python
+ from statement_extractor import (
+     extract_statements,
+     extract_statements_as_json,
+     extract_statements_as_xml,
+     extract_statements_as_dict,
+ )
+
+ # Pydantic models (default)
+ result = extract_statements(text)
+
+ # JSON string
+ json_output = extract_statements_as_json(text)
+
+ # Raw XML (model's native format)
+ xml_output = extract_statements_as_xml(text)
+
+ # Python dictionary
+ dict_output = extract_statements_as_dict(text)
+ ```
+
+ ## Batch Processing
+
+ ```python
+ from statement_extractor import StatementExtractor
+
+ extractor = StatementExtractor(device="cuda")  # or "cpu"
+
+ texts = ["Text 1...", "Text 2...", "Text 3..."]
+ for text in texts:
+     result = extractor.extract(text)
+     print(f"Found {len(result)} statements")
+ ```
+
+ ## Entity Types
+
+ | Type | Description | Example |
+ |------|-------------|---------|
+ | `ORG` | Organizations | Apple Inc., United Nations |
+ | `PERSON` | People | Tim Cook, Elon Musk |
+ | `GPE` | Geopolitical entities | USA, California, Paris |
+ | `LOC` | Non-GPE locations | Mount Everest, Pacific Ocean |
+ | `PRODUCT` | Products | iPhone, Model S |
+ | `EVENT` | Events | World Cup, CES 2024 |
+ | `WORK_OF_ART` | Creative works | Mona Lisa, Game of Thrones |
+ | `LAW` | Legal documents | GDPR, Clean Air Act |
+ | `DATE` | Dates | 2024, January 15 |
+ | `MONEY` | Monetary values | $50 million, €100 |
+ | `PERCENT` | Percentages | 25%, 0.5% |
+ | `QUANTITY` | Quantities | 500 employees, 1.5 tons |
+ | `UNKNOWN` | Unrecognized entities | (fallback) |
+
+ ## How It Works
+
+ This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):
+
+ 1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with a diversity penalty *(sketched below)*
+ 2. **Quality Scoring** *(v0.2.0)*: Each triple is scored for groundedness in the source text
+ 3. **Beam Merging** *(v0.2.0)*: Top beams are combined for better coverage
+ 4. **Embedding Dedup** *(v0.2.0)*: Semantic similarity removes near-duplicate predicates
+ 5. **Predicate Normalization** *(v0.2.0)*: Optional taxonomy matching via embeddings
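+
+ For reference, diverse beam search is exposed directly by transformers' `generate()`. A minimal sketch of step 1 follows; the generation parameters are illustrative, and the checkpoint is assumed to load as a seq2seq model:
+
+ ```python
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ model_id = "Corp-o-Rate-Community/statement-extractor"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+
+ inputs = tokenizer("Apple Inc. announced the iPhone 15.", return_tensors="pt")
+ outputs = model.generate(
+     **inputs,
+     num_beams=8,             # total beams
+     num_beam_groups=4,       # split into 4 groups pushed apart...
+     diversity_penalty=1.0,   # ...by penalizing tokens other groups already chose
+     num_return_sequences=4,  # keep one candidate per group
+     max_new_tokens=256,
+ )
+ for seq in outputs:
+     print(tokenizer.decode(seq, skip_special_tokens=True))
+ ```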
+
+ ## Requirements
+
+ - Python 3.10+
+ - PyTorch 2.0+
+ - Transformers 4.35+
+ - Pydantic 2.0+
+ - sentence-transformers 2.2+ *(optional, for embedding features)*
+ - ~2GB VRAM (GPU) or ~4GB RAM (CPU)
+
+ ## Links
+
+ - [Model on HuggingFace](https://huggingface.co/Corp-o-Rate-Community/statement-extractor)
+ - [Web Demo](https://statement-extractor.corp-o-rate.com)
+ - [Diverse Beam Search Paper](https://arxiv.org/abs/1610.02424)
+ - [Corp-o-Rate](https://corp-o-rate.com)
+
+ ## License
+
+ MIT License - see LICENSE file for details.
@@ -0,0 +1,184 @@
+ # Corp Extractor
+
+ Extract structured subject-predicate-object statements from unstructured text using the T5-Gemma 2 model.
+
+ [![PyPI version](https://badge.fury.io/py/corp-extractor.svg)](https://badge.fury.io/py/corp-extractor)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ ## Features
+
+ - **Structured Extraction**: Converts unstructured text into subject-predicate-object triples
+ - **Entity Type Recognition**: Identifies 12 entity types (ORG, PERSON, GPE, LOC, PRODUCT, EVENT, etc.)
+ - **Quality Scoring** *(v0.2.0)*: Each triple is scored for groundedness (0-1) against the source text
+ - **Beam Merging** *(v0.2.0)*: Combines top beams for better coverage instead of picking one
+ - **Embedding-based Dedup** *(v0.2.0)*: Uses semantic similarity to detect near-duplicate predicates
+ - **Predicate Taxonomies** *(v0.2.0)*: Normalize predicates to canonical forms via embeddings
+ - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
+
+ ## Installation
+
+ ```bash
+ # Recommended: include embedding support for smart deduplication
+ pip install corp-extractor[embeddings]
+
+ # Minimal installation (no embedding features)
+ pip install corp-extractor
+ ```
+
+ **Note**: For GPU support, install PyTorch with CUDA first:
+ ```bash
+ pip install torch --index-url https://download.pytorch.org/whl/cu121
+ pip install corp-extractor[embeddings]
+ ```
+
+ ## Quick Start
+
+ ```python
+ from statement_extractor import extract_statements
+
+ result = extract_statements("""
+ Apple Inc. announced the iPhone 15 at their September event.
+ Tim Cook presented the new features to customers worldwide.
+ """)
+
+ for stmt in result:
+     print(f"{stmt.subject.text} ({stmt.subject.type})")
+     print(f" --[{stmt.predicate}]--> {stmt.object.text}")
+     print(f" Confidence: {stmt.confidence_score:.2f}")  # NEW in v0.2.0
+ ```
+
+ ## New in v0.2.0: Quality Scoring & Beam Merging
+
+ By default, the library now:
+ - **Scores each triple** for groundedness, based on whether its entities appear in the source text
+ - **Merges top beams** instead of selecting a single one, improving coverage
+ - **Uses embeddings** to detect semantically similar predicates ("bought" ≈ "acquired")
+
+ ```python
+ from statement_extractor import ExtractionOptions, ScoringConfig, extract_statements
+
+ # Precision mode: filter out low-confidence triples
+ scoring = ScoringConfig(min_confidence=0.7)
+ options = ExtractionOptions(scoring_config=scoring)
+ result = extract_statements(text, options)
+
+ # Access confidence scores
+ for stmt in result:
+     print(f"{stmt} (confidence: {stmt.confidence_score:.2f})")
+ ```
+
+ ## New in v0.2.0: Predicate Taxonomies
+
+ Normalize predicates to canonical forms using embedding similarity:
+
+ ```python
+ from statement_extractor import PredicateTaxonomy, ExtractionOptions, extract_statements
+
+ taxonomy = PredicateTaxonomy(predicates=[
+     "acquired", "founded", "works_for", "announced",
+     "invested_in", "partnered_with"
+ ])
+
+ options = ExtractionOptions(predicate_taxonomy=taxonomy)
+ result = extract_statements(text, options)
+
+ # "bought" -> "acquired" via embedding similarity
+ for stmt in result:
+     if stmt.canonical_predicate:
+         print(f"{stmt.predicate} -> {stmt.canonical_predicate}")
+ ```
+
+ ## Disable Embeddings (Faster, No Extra Dependencies)
+
+ ```python
+ from statement_extractor import ExtractionOptions, extract_statements
+
+ options = ExtractionOptions(
+     embedding_dedup=False,  # Use exact text matching
+     merge_beams=False,      # Select the single best beam
+ )
+ result = extract_statements(text, options)
+ ```
+
+ ## Output Formats
+
+ ```python
+ from statement_extractor import (
+     extract_statements,
+     extract_statements_as_json,
+     extract_statements_as_xml,
+     extract_statements_as_dict,
+ )
+
+ # Pydantic models (default)
+ result = extract_statements(text)
+
+ # JSON string
+ json_output = extract_statements_as_json(text)
+
+ # Raw XML (model's native format)
+ xml_output = extract_statements_as_xml(text)
+
+ # Python dictionary
+ dict_output = extract_statements_as_dict(text)
+ ```
+
+ ## Batch Processing
+
+ ```python
+ from statement_extractor import StatementExtractor
+
+ extractor = StatementExtractor(device="cuda")  # or "cpu"
+
+ texts = ["Text 1...", "Text 2...", "Text 3..."]
+ for text in texts:
+     result = extractor.extract(text)
+     print(f"Found {len(result)} statements")
+ ```
+
+ ## Entity Types
+
+ | Type | Description | Example |
+ |------|-------------|---------|
+ | `ORG` | Organizations | Apple Inc., United Nations |
+ | `PERSON` | People | Tim Cook, Elon Musk |
+ | `GPE` | Geopolitical entities | USA, California, Paris |
+ | `LOC` | Non-GPE locations | Mount Everest, Pacific Ocean |
+ | `PRODUCT` | Products | iPhone, Model S |
+ | `EVENT` | Events | World Cup, CES 2024 |
+ | `WORK_OF_ART` | Creative works | Mona Lisa, Game of Thrones |
+ | `LAW` | Legal documents | GDPR, Clean Air Act |
+ | `DATE` | Dates | 2024, January 15 |
+ | `MONEY` | Monetary values | $50 million, €100 |
+ | `PERCENT` | Percentages | 25%, 0.5% |
+ | `QUANTITY` | Quantities | 500 employees, 1.5 tons |
+ | `UNKNOWN` | Unrecognized entities | (fallback) |
+
+ ## How It Works
+
+ This library uses the T5-Gemma 2 statement extraction model with **Diverse Beam Search** ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)):
+
+ 1. **Diverse Beam Search**: Generates 4+ candidate outputs using beam groups with a diversity penalty
+ 2. **Quality Scoring** *(v0.2.0)*: Each triple is scored for groundedness in the source text
+ 3. **Beam Merging** *(v0.2.0)*: Top beams are combined for better coverage *(see the sketch after this list)*
+ 4. **Embedding Dedup** *(v0.2.0)*: Semantic similarity removes near-duplicate predicates
+ 5. **Predicate Normalization** *(v0.2.0)*: Optional taxonomy matching via embeddings
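+
+ Steps 3 and 4 can be pictured as a union over beams followed by duplicate removal. A minimal sketch with exact matching only (`merge_beams` is a hypothetical helper; the library additionally uses embedding similarity, as described above):
+
+ ```python
+ Triple = tuple[str, str, str]  # (subject, predicate, object)
+
+ def merge_beams(beams: list[list[Triple]]) -> list[Triple]:
+     """Toy beam merge: union triples across beams, dropping exact
+     duplicates while preserving order of first appearance."""
+     seen: set[Triple] = set()
+     merged: list[Triple] = []
+     for beam in beams:
+         for triple in beam:
+             if triple not in seen:
+                 seen.add(triple)
+                 merged.append(triple)
+     return merged
+
+ beams = [
+     [("Apple Inc.", "announced", "iPhone 15")],
+     [("Apple Inc.", "announced", "iPhone 15"), ("Tim Cook", "presented", "new features")],
+ ]
+ print(merge_beams(beams))  # two unique triples survive the merge
+ ```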
+
+ ## Requirements
+
+ - Python 3.10+
+ - PyTorch 2.0+
+ - Transformers 4.35+
+ - Pydantic 2.0+
+ - sentence-transformers 2.2+ *(optional, for embedding features)*
+ - ~2GB VRAM (GPU) or ~4GB RAM (CPU)
+
+ ## Links
+
+ - [Model on HuggingFace](https://huggingface.co/Corp-o-Rate-Community/statement-extractor)
+ - [Web Demo](https://statement-extractor.corp-o-rate.com)
+ - [Diverse Beam Search Paper](https://arxiv.org/abs/1610.02424)
+ - [Corp-o-Rate](https://corp-o-rate.com)
+
+ ## License
+
+ MIT License - see LICENSE file for details.
@@ -0,0 +1,96 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "corp-extractor"
+ version = "0.2.0"
+ description = "Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = { text = "MIT" }
+ authors = [
+     { name = "Corp-o-Rate", email = "neil@corp-o-rate.com" }
+ ]
+ maintainers = [
+     { name = "Corp-o-Rate", email = "neil@corp-o-rate.com" }
+ ]
+ keywords = [
+     "nlp",
+     "statement-extraction",
+     "information-extraction",
+     "knowledge-graph",
+     "t5",
+     "gemma",
+     "transformers",
+     "diverse-beam-search",
+     "subject-predicate-object",
+     "triples",
+     "embeddings",
+ ]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Scientific/Engineering :: Information Analysis",
+     "Topic :: Text Processing :: Linguistic",
+ ]
+
+ dependencies = [
+     "pydantic>=2.0.0",
+     "torch>=2.0.0",
+     "transformers>=4.35.0",
+     "numpy>=1.24.0",
+ ]
+
+ [project.optional-dependencies]
+ # Embedding-based predicate comparison (enabled by default in ExtractionOptions)
+ embeddings = [
+     "sentence-transformers>=2.2.0",
+ ]
+ dev = [
+     "pytest>=7.0.0",
+     "pytest-cov>=4.0.0",
+     "ruff>=0.1.0",
+     "mypy>=1.0.0",
+ ]
+ # Full installation with all optional features
+ all = [
+     "sentence-transformers>=2.2.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/corp-o-rate/statement-extractor"
+ Documentation = "https://github.com/corp-o-rate/statement-extractor#readme"
+ Repository = "https://github.com/corp-o-rate/statement-extractor"
+ Issues = "https://github.com/corp-o-rate/statement-extractor/issues"
+
+ [tool.hatch.build.targets.sdist]
+ include = [
+     "/src",
+ ]
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/statement_extractor"]
+
+ [tool.ruff]
+ line-length = 100
+ target-version = "py310"
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I", "W"]
+
+ [tool.mypy]
+ python_version = "3.10"
+ warn_return_any = true
+ warn_unused_configs = true
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
@@ -0,0 +1,110 @@
+ """
+ Statement Extractor - Extract structured statements from text using T5-Gemma 2.
+
+ A Python library for extracting subject-predicate-object triples from unstructured text.
+ Uses Diverse Beam Search (Vijayakumar et al., 2016) for high-quality extraction.
+
+ Paper: https://arxiv.org/abs/1610.02424
+
+ Features:
+ - Quality-based beam scoring and merging
+ - Embedding-based predicate comparison for smart deduplication
+ - Configurable precision/recall tradeoff
+ - Support for predicate taxonomies
+
+ Example:
+     >>> from statement_extractor import extract_statements
+     >>> result = extract_statements("Apple Inc. announced a new iPhone today.")
+     >>> for stmt in result:
+     ...     print(f"{stmt.subject.text} -> {stmt.predicate} -> {stmt.object.text}")
+     Apple Inc. -> announced -> a new iPhone
+
+     >>> # Access confidence scores
+     >>> for stmt in result:
+     ...     print(f"{stmt} (confidence: {stmt.confidence_score:.2f})")
+
+     >>> # Get as different formats
+     >>> xml = extract_statements_as_xml("Some text...")
+     >>> json_str = extract_statements_as_json("Some text...")
+     >>> data = extract_statements_as_dict("Some text...")
+ """
+
+ __version__ = "0.2.0"
+
+ # Core models
+ from .models import (
+     Entity,
+     EntityType,
+     ExtractionOptions,
+     ExtractionResult,
+     Statement,
+     # New in 0.2.0
+     PredicateMatch,
+     PredicateTaxonomy,
+     PredicateComparisonConfig,
+     ScoringConfig,
+ )
+
+ # Main extractor
+ from .extractor import (
+     StatementExtractor,
+     extract_statements,
+     extract_statements_as_dict,
+     extract_statements_as_json,
+     extract_statements_as_xml,
+ )
+
+ # Canonicalization utilities
+ from .canonicalization import (
+     Canonicalizer,
+     default_entity_canonicalizer,
+     deduplicate_statements_exact,
+ )
+
+ # Scoring utilities
+ from .scoring import (
+     BeamScorer,
+     TripleScorer,
+ )
+
+ __all__ = [
+     # Version
+     "__version__",
+     # Core models
+     "Entity",
+     "EntityType",
+     "ExtractionOptions",
+     "ExtractionResult",
+     "Statement",
+     # Configuration models (new in 0.2.0)
+     "PredicateMatch",
+     "PredicateTaxonomy",
+     "PredicateComparisonConfig",
+     "ScoringConfig",
+     # Extractor class
+     "StatementExtractor",
+     # Convenience functions
+     "extract_statements",
+     "extract_statements_as_dict",
+     "extract_statements_as_json",
+     "extract_statements_as_xml",
+     # Canonicalization
+     "Canonicalizer",
+     "default_entity_canonicalizer",
+     "deduplicate_statements_exact",
+     # Scoring
+     "BeamScorer",
+     "TripleScorer",
+ ]
+
+
+ # Lazy imports for optional dependencies
+ def __getattr__(name: str):
+     """Lazy import for optional modules."""
+     if name == "PredicateComparer":
+         from .predicate_comparer import PredicateComparer
+         return PredicateComparer
+     if name == "EmbeddingDependencyError":
+         from .predicate_comparer import EmbeddingDependencyError
+         return EmbeddingDependencyError
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")