corp-extractor 0.2.3__tar.gz → 0.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/PKG-INFO +102 -5
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/README.md +99 -3
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/pyproject.toml +7 -2
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/src/statement_extractor/__init__.py +1 -1
- corp_extractor-0.2.8/src/statement_extractor/cli.py +215 -0
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/src/statement_extractor/extractor.py +11 -2
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/src/statement_extractor/predicate_comparer.py +6 -1
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/.gitignore +0 -0
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/src/statement_extractor/canonicalization.py +0 -0
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/src/statement_extractor/models.py +0 -0
- {corp_extractor-0.2.3 → corp_extractor-0.2.8}/src/statement_extractor/scoring.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: corp-extractor
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.8
|
|
4
4
|
Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
|
|
5
5
|
Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
|
|
6
6
|
Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
|
|
@@ -23,10 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
24
24
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
25
25
|
Requires-Python: >=3.10
|
|
26
|
+
Requires-Dist: click>=8.0.0
|
|
26
27
|
Requires-Dist: numpy>=1.24.0
|
|
27
28
|
Requires-Dist: pydantic>=2.0.0
|
|
28
29
|
Requires-Dist: torch>=2.0.0
|
|
29
|
-
Requires-Dist: transformers>=
|
|
30
|
+
Requires-Dist: transformers>=5.0.0rc3
|
|
30
31
|
Provides-Extra: all
|
|
31
32
|
Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
|
|
32
33
|
Provides-Extra: dev
|
|
@@ -57,22 +58,28 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
57
58
|
- **Contextualized Matching** *(v0.2.2)*: Compares full "Subject Predicate Object" against source text for better accuracy
|
|
58
59
|
- **Entity Type Merging** *(v0.2.3)*: Automatically merges UNKNOWN entity types with specific types during deduplication
|
|
59
60
|
- **Reversal Detection** *(v0.2.3)*: Detects and corrects subject-object reversals using embedding comparison
|
|
61
|
+
- **Command Line Interface** *(v0.2.4)*: Full-featured CLI for terminal usage
|
|
60
62
|
- **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
|
|
61
63
|
|
|
62
64
|
## Installation
|
|
63
65
|
|
|
64
66
|
```bash
|
|
65
67
|
# Recommended: include embedding support for smart deduplication
|
|
66
|
-
pip install corp-extractor[embeddings]
|
|
68
|
+
pip install "corp-extractor[embeddings]"
|
|
67
69
|
|
|
68
70
|
# Minimal installation (no embedding features)
|
|
69
71
|
pip install corp-extractor
|
|
70
72
|
```
|
|
71
73
|
|
|
72
|
-
**Note**:
|
|
74
|
+
**Note**: This package requires `transformers>=5.0.0rc3` (a pre-release) for T5-Gemma2 model support. Install with the `--pre` flag if needed:
|
|
75
|
+
```bash
|
|
76
|
+
pip install --pre "corp-extractor[embeddings]"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**For GPU support**, install PyTorch with CUDA first:
|
|
73
80
|
```bash
|
|
74
81
|
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
|
75
|
-
pip install corp-extractor[embeddings]
|
|
82
|
+
pip install "corp-extractor[embeddings]"
|
|
76
83
|
```
|
|
77
84
|
|
|
78
85
|
## Quick Start
|
|
@@ -91,6 +98,96 @@ for stmt in result:
|
|
|
91
98
|
print(f" Confidence: {stmt.confidence_score:.2f}") # NEW in v0.2.0
|
|
92
99
|
```
|
|
93
100
|
|
|
101
|
+
## Command Line Interface
|
|
102
|
+
|
|
103
|
+
The library includes a CLI for quick extraction from the terminal.
|
|
104
|
+
|
|
105
|
+
### Install Globally (Recommended)
|
|
106
|
+
|
|
107
|
+
For best results, install globally first:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
# Using uv (recommended)
|
|
111
|
+
uv tool install "corp-extractor[embeddings]"
|
|
112
|
+
|
|
113
|
+
# Using pipx
|
|
114
|
+
pipx install "corp-extractor[embeddings]"
|
|
115
|
+
|
|
116
|
+
# Using pip
|
|
117
|
+
pip install "corp-extractor[embeddings]"
|
|
118
|
+
|
|
119
|
+
# Then use anywhere
|
|
120
|
+
corp-extractor "Your text here"
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Quick Run with uvx
|
|
124
|
+
|
|
125
|
+
Run directly without installing using [uv](https://docs.astral.sh/uv/):
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
uvx corp-extractor "Apple announced a new iPhone."
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**Note**: First run downloads the model (~1.5GB) which may take a few minutes.
|
|
132
|
+
|
|
133
|
+
### Usage Examples
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
# Extract from text argument
|
|
137
|
+
corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
|
|
138
|
+
|
|
139
|
+
# Extract from file
|
|
140
|
+
corp-extractor -f article.txt
|
|
141
|
+
|
|
142
|
+
# Pipe from stdin
|
|
143
|
+
cat article.txt | corp-extractor -
|
|
144
|
+
|
|
145
|
+
# Output as JSON
|
|
146
|
+
corp-extractor "Tim Cook is CEO of Apple." --json
|
|
147
|
+
|
|
148
|
+
# Output as XML
|
|
149
|
+
corp-extractor -f article.txt --xml
|
|
150
|
+
|
|
151
|
+
# Verbose output with confidence scores
|
|
152
|
+
corp-extractor -f article.txt --verbose
|
|
153
|
+
|
|
154
|
+
# Use more beams for better quality
|
|
155
|
+
corp-extractor -f article.txt --beams 8
|
|
156
|
+
|
|
157
|
+
# Use custom predicate taxonomy
|
|
158
|
+
corp-extractor -f article.txt --taxonomy predicates.txt
|
|
159
|
+
|
|
160
|
+
# Use GPU explicitly
|
|
161
|
+
corp-extractor -f article.txt --device cuda
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### CLI Options
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
Usage: corp-extractor [OPTIONS] [TEXT]
|
|
168
|
+
|
|
169
|
+
Options:
|
|
170
|
+
-f, --file PATH Read input from file
|
|
171
|
+
-o, --output [table|json|xml] Output format (default: table)
|
|
172
|
+
--json Output as JSON (shortcut)
|
|
173
|
+
--xml Output as XML (shortcut)
|
|
174
|
+
-b, --beams INTEGER Number of beams (default: 4)
|
|
175
|
+
--diversity FLOAT Diversity penalty (default: 1.0)
|
|
176
|
+
--max-tokens INTEGER Max tokens to generate (default: 2048)
|
|
177
|
+
--no-dedup Disable deduplication
|
|
178
|
+
--no-embeddings Disable embedding-based dedup (faster)
|
|
179
|
+
--no-merge Disable beam merging
|
|
180
|
+
--dedup-threshold FLOAT Deduplication threshold (default: 0.65)
|
|
181
|
+
--min-confidence FLOAT Min confidence filter (default: 0)
|
|
182
|
+
--taxonomy PATH Load predicate taxonomy from file
|
|
183
|
+
--taxonomy-threshold FLOAT Taxonomy matching threshold (default: 0.5)
|
|
184
|
+
--device [auto|cuda|mps|cpu] Device to use (default: auto)
|
|
185
|
+
-v, --verbose Show confidence scores and metadata
|
|
186
|
+
-q, --quiet Suppress progress messages
|
|
187
|
+
--version Show version
|
|
188
|
+
--help Show this message
|
|
189
|
+
```
|
|
190
|
+
|
|
94
191
|
## New in v0.2.0: Quality Scoring & Beam Merging
|
|
95
192
|
|
|
96
193
|
By default, the library now:
|
|
@@ -17,22 +17,28 @@ Extract structured subject-predicate-object statements from unstructured text us
|
|
|
17
17
|
- **Contextualized Matching** *(v0.2.2)*: Compares full "Subject Predicate Object" against source text for better accuracy
|
|
18
18
|
- **Entity Type Merging** *(v0.2.3)*: Automatically merges UNKNOWN entity types with specific types during deduplication
|
|
19
19
|
- **Reversal Detection** *(v0.2.3)*: Detects and corrects subject-object reversals using embedding comparison
|
|
20
|
+
- **Command Line Interface** *(v0.2.4)*: Full-featured CLI for terminal usage
|
|
20
21
|
- **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
|
|
21
22
|
|
|
22
23
|
## Installation
|
|
23
24
|
|
|
24
25
|
```bash
|
|
25
26
|
# Recommended: include embedding support for smart deduplication
|
|
26
|
-
pip install corp-extractor[embeddings]
|
|
27
|
+
pip install "corp-extractor[embeddings]"
|
|
27
28
|
|
|
28
29
|
# Minimal installation (no embedding features)
|
|
29
30
|
pip install corp-extractor
|
|
30
31
|
```
|
|
31
32
|
|
|
32
|
-
**Note**:
|
|
33
|
+
**Note**: This package requires `transformers>=5.0.0rc3` (a pre-release) for T5-Gemma2 model support. Install with the `--pre` flag if needed:
|
|
34
|
+
```bash
|
|
35
|
+
pip install --pre "corp-extractor[embeddings]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
**For GPU support**, install PyTorch with CUDA first:
|
|
33
39
|
```bash
|
|
34
40
|
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
|
35
|
-
pip install corp-extractor[embeddings]
|
|
41
|
+
pip install "corp-extractor[embeddings]"
|
|
36
42
|
```
|
|
37
43
|
|
|
38
44
|
## Quick Start
|
|
@@ -51,6 +57,96 @@ for stmt in result:
|
|
|
51
57
|
print(f" Confidence: {stmt.confidence_score:.2f}") # NEW in v0.2.0
|
|
52
58
|
```
|
|
53
59
|
|
|
60
|
+
## Command Line Interface
|
|
61
|
+
|
|
62
|
+
The library includes a CLI for quick extraction from the terminal.
|
|
63
|
+
|
|
64
|
+
### Install Globally (Recommended)
|
|
65
|
+
|
|
66
|
+
For best results, install globally first:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Using uv (recommended)
|
|
70
|
+
uv tool install "corp-extractor[embeddings]"
|
|
71
|
+
|
|
72
|
+
# Using pipx
|
|
73
|
+
pipx install "corp-extractor[embeddings]"
|
|
74
|
+
|
|
75
|
+
# Using pip
|
|
76
|
+
pip install "corp-extractor[embeddings]"
|
|
77
|
+
|
|
78
|
+
# Then use anywhere
|
|
79
|
+
corp-extractor "Your text here"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Quick Run with uvx
|
|
83
|
+
|
|
84
|
+
Run directly without installing using [uv](https://docs.astral.sh/uv/):
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
uvx corp-extractor "Apple announced a new iPhone."
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Note**: First run downloads the model (~1.5GB) which may take a few minutes.
|
|
91
|
+
|
|
92
|
+
### Usage Examples
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Extract from text argument
|
|
96
|
+
corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
|
|
97
|
+
|
|
98
|
+
# Extract from file
|
|
99
|
+
corp-extractor -f article.txt
|
|
100
|
+
|
|
101
|
+
# Pipe from stdin
|
|
102
|
+
cat article.txt | corp-extractor -
|
|
103
|
+
|
|
104
|
+
# Output as JSON
|
|
105
|
+
corp-extractor "Tim Cook is CEO of Apple." --json
|
|
106
|
+
|
|
107
|
+
# Output as XML
|
|
108
|
+
corp-extractor -f article.txt --xml
|
|
109
|
+
|
|
110
|
+
# Verbose output with confidence scores
|
|
111
|
+
corp-extractor -f article.txt --verbose
|
|
112
|
+
|
|
113
|
+
# Use more beams for better quality
|
|
114
|
+
corp-extractor -f article.txt --beams 8
|
|
115
|
+
|
|
116
|
+
# Use custom predicate taxonomy
|
|
117
|
+
corp-extractor -f article.txt --taxonomy predicates.txt
|
|
118
|
+
|
|
119
|
+
# Use GPU explicitly
|
|
120
|
+
corp-extractor -f article.txt --device cuda
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### CLI Options
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
Usage: corp-extractor [OPTIONS] [TEXT]
|
|
127
|
+
|
|
128
|
+
Options:
|
|
129
|
+
-f, --file PATH Read input from file
|
|
130
|
+
-o, --output [table|json|xml] Output format (default: table)
|
|
131
|
+
--json Output as JSON (shortcut)
|
|
132
|
+
--xml Output as XML (shortcut)
|
|
133
|
+
-b, --beams INTEGER Number of beams (default: 4)
|
|
134
|
+
--diversity FLOAT Diversity penalty (default: 1.0)
|
|
135
|
+
--max-tokens INTEGER Max tokens to generate (default: 2048)
|
|
136
|
+
--no-dedup Disable deduplication
|
|
137
|
+
--no-embeddings Disable embedding-based dedup (faster)
|
|
138
|
+
--no-merge Disable beam merging
|
|
139
|
+
--dedup-threshold FLOAT Deduplication threshold (default: 0.65)
|
|
140
|
+
--min-confidence FLOAT Min confidence filter (default: 0)
|
|
141
|
+
--taxonomy PATH Load predicate taxonomy from file
|
|
142
|
+
--taxonomy-threshold FLOAT Taxonomy matching threshold (default: 0.5)
|
|
143
|
+
--device [auto|cuda|mps|cpu] Device to use (default: auto)
|
|
144
|
+
-v, --verbose Show confidence scores and metadata
|
|
145
|
+
-q, --quiet Suppress progress messages
|
|
146
|
+
--version Show version
|
|
147
|
+
--help Show this message
|
|
148
|
+
```
|
|
149
|
+
|
|
54
150
|
## New in v0.2.0: Quality Scoring & Beam Merging
|
|
55
151
|
|
|
56
152
|
By default, the library now:
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "corp-extractor"
|
|
7
|
-
version = "0.2.3"
|
|
7
|
+
version = "0.2.8"
|
|
8
8
|
description = "Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -46,8 +46,9 @@ classifiers = [
|
|
|
46
46
|
dependencies = [
|
|
47
47
|
"pydantic>=2.0.0",
|
|
48
48
|
"torch>=2.0.0",
|
|
49
|
-
"transformers>=
|
|
49
|
+
"transformers>=5.0.0rc3",
|
|
50
50
|
"numpy>=1.24.0",
|
|
51
|
+
"click>=8.0.0",
|
|
51
52
|
]
|
|
52
53
|
|
|
53
54
|
[project.optional-dependencies]
|
|
@@ -66,6 +67,10 @@ all = [
|
|
|
66
67
|
"sentence-transformers>=2.2.0",
|
|
67
68
|
]
|
|
68
69
|
|
|
70
|
+
[project.scripts]
|
|
71
|
+
statement-extractor = "statement_extractor.cli:main"
|
|
72
|
+
corp-extractor = "statement_extractor.cli:main"
|
|
73
|
+
|
|
69
74
|
[project.urls]
|
|
70
75
|
Homepage = "https://github.com/corp-o-rate/statement-extractor"
|
|
71
76
|
Documentation = "https://github.com/corp-o-rate/statement-extractor#readme"
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for statement extraction.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
corp-extractor "Your text here"
|
|
6
|
+
corp-extractor -f input.txt
|
|
7
|
+
cat input.txt | corp-extractor -
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
import click
|
|
14
|
+
|
|
15
|
+
from . import __version__
|
|
16
|
+
from .models import (
|
|
17
|
+
ExtractionOptions,
|
|
18
|
+
PredicateComparisonConfig,
|
|
19
|
+
PredicateTaxonomy,
|
|
20
|
+
ScoringConfig,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@click.command()
|
|
25
|
+
@click.argument("text", required=False)
|
|
26
|
+
@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
|
|
27
|
+
@click.option(
|
|
28
|
+
"-o", "--output",
|
|
29
|
+
type=click.Choice(["table", "json", "xml"], case_sensitive=False),
|
|
30
|
+
default="table",
|
|
31
|
+
help="Output format (default: table)"
|
|
32
|
+
)
|
|
33
|
+
@click.option("--json", "output_json", is_flag=True, help="Output as JSON (shortcut for -o json)")
|
|
34
|
+
@click.option("--xml", "output_xml", is_flag=True, help="Output as XML (shortcut for -o xml)")
|
|
35
|
+
# Beam search options
|
|
36
|
+
@click.option("-b", "--beams", type=int, default=4, help="Number of beams for diverse beam search (default: 4)")
|
|
37
|
+
@click.option("--diversity", type=float, default=1.0, help="Diversity penalty for beam search (default: 1.0)")
|
|
38
|
+
@click.option("--max-tokens", type=int, default=2048, help="Maximum tokens to generate (default: 2048)")
|
|
39
|
+
# Deduplication options
|
|
40
|
+
@click.option("--no-dedup", is_flag=True, help="Disable deduplication")
|
|
41
|
+
@click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
|
|
42
|
+
@click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
|
|
43
|
+
@click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
|
|
44
|
+
# Quality options
|
|
45
|
+
@click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
|
|
46
|
+
# Taxonomy options
|
|
47
|
+
@click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
|
|
48
|
+
@click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
|
|
49
|
+
# Device options
|
|
50
|
+
@click.option("--device", type=click.Choice(["auto", "cuda", "mps", "cpu"]), default="auto", help="Device to use (default: auto)")
|
|
51
|
+
# Output options
|
|
52
|
+
@click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
|
|
53
|
+
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
|
|
54
|
+
@click.version_option(version=__version__)
|
|
55
|
+
def main(
|
|
56
|
+
text: Optional[str],
|
|
57
|
+
input_file: Optional[str],
|
|
58
|
+
output: str,
|
|
59
|
+
output_json: bool,
|
|
60
|
+
output_xml: bool,
|
|
61
|
+
beams: int,
|
|
62
|
+
diversity: float,
|
|
63
|
+
max_tokens: int,
|
|
64
|
+
no_dedup: bool,
|
|
65
|
+
no_embeddings: bool,
|
|
66
|
+
no_merge: bool,
|
|
67
|
+
dedup_threshold: float,
|
|
68
|
+
min_confidence: float,
|
|
69
|
+
taxonomy: Optional[str],
|
|
70
|
+
taxonomy_threshold: float,
|
|
71
|
+
device: str,
|
|
72
|
+
verbose: bool,
|
|
73
|
+
quiet: bool,
|
|
74
|
+
):
|
|
75
|
+
"""
|
|
76
|
+
Extract structured statements from text.
|
|
77
|
+
|
|
78
|
+
TEXT can be provided as an argument, read from a file with -f, or piped via stdin.
|
|
79
|
+
|
|
80
|
+
\b
|
|
81
|
+
Examples:
|
|
82
|
+
corp-extractor "Apple announced a new iPhone."
|
|
83
|
+
corp-extractor -f article.txt --json
|
|
84
|
+
corp-extractor -f article.txt -o json --beams 8
|
|
85
|
+
cat article.txt | corp-extractor -
|
|
86
|
+
echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose
|
|
87
|
+
|
|
88
|
+
\b
|
|
89
|
+
Output formats:
|
|
90
|
+
table Human-readable table (default)
|
|
91
|
+
json JSON with full metadata
|
|
92
|
+
xml Raw XML from model
|
|
93
|
+
"""
|
|
94
|
+
# Determine output format
|
|
95
|
+
if output_json:
|
|
96
|
+
output = "json"
|
|
97
|
+
elif output_xml:
|
|
98
|
+
output = "xml"
|
|
99
|
+
|
|
100
|
+
# Get input text
|
|
101
|
+
input_text = _get_input_text(text, input_file)
|
|
102
|
+
if not input_text:
|
|
103
|
+
raise click.UsageError(
|
|
104
|
+
"No input provided. Use: statement-extractor \"text\", "
|
|
105
|
+
"statement-extractor -f file.txt, or pipe via stdin."
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
if not quiet:
|
|
109
|
+
click.echo(f"Processing {len(input_text)} characters...", err=True)
|
|
110
|
+
|
|
111
|
+
# Load taxonomy if provided
|
|
112
|
+
predicate_taxonomy = None
|
|
113
|
+
if taxonomy:
|
|
114
|
+
predicate_taxonomy = PredicateTaxonomy.from_file(taxonomy)
|
|
115
|
+
if not quiet:
|
|
116
|
+
click.echo(f"Loaded taxonomy with {len(predicate_taxonomy.predicates)} predicates", err=True)
|
|
117
|
+
|
|
118
|
+
# Configure predicate comparison
|
|
119
|
+
predicate_config = PredicateComparisonConfig(
|
|
120
|
+
similarity_threshold=taxonomy_threshold,
|
|
121
|
+
dedup_threshold=dedup_threshold,
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# Configure scoring
|
|
125
|
+
scoring_config = ScoringConfig(min_confidence=min_confidence)
|
|
126
|
+
|
|
127
|
+
# Configure extraction options
|
|
128
|
+
options = ExtractionOptions(
|
|
129
|
+
num_beams=beams,
|
|
130
|
+
diversity_penalty=diversity,
|
|
131
|
+
max_new_tokens=max_tokens,
|
|
132
|
+
deduplicate=not no_dedup,
|
|
133
|
+
embedding_dedup=not no_embeddings,
|
|
134
|
+
merge_beams=not no_merge,
|
|
135
|
+
predicate_taxonomy=predicate_taxonomy,
|
|
136
|
+
predicate_config=predicate_config,
|
|
137
|
+
scoring_config=scoring_config,
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
# Import here to allow --help without loading torch
|
|
141
|
+
from .extractor import StatementExtractor
|
|
142
|
+
|
|
143
|
+
# Create extractor with specified device
|
|
144
|
+
device_arg = None if device == "auto" else device
|
|
145
|
+
extractor = StatementExtractor(device=device_arg)
|
|
146
|
+
|
|
147
|
+
if not quiet:
|
|
148
|
+
click.echo(f"Using device: {extractor.device}", err=True)
|
|
149
|
+
|
|
150
|
+
# Run extraction
|
|
151
|
+
try:
|
|
152
|
+
if output == "xml":
|
|
153
|
+
result = extractor.extract_as_xml(input_text, options)
|
|
154
|
+
click.echo(result)
|
|
155
|
+
elif output == "json":
|
|
156
|
+
result = extractor.extract_as_json(input_text, options)
|
|
157
|
+
click.echo(result)
|
|
158
|
+
else:
|
|
159
|
+
# Table format
|
|
160
|
+
result = extractor.extract(input_text, options)
|
|
161
|
+
_print_table(result, verbose)
|
|
162
|
+
except Exception as e:
|
|
163
|
+
raise click.ClickException(f"Extraction failed: {e}")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
|
|
167
|
+
"""Get input text from argument, file, or stdin."""
|
|
168
|
+
if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):
|
|
169
|
+
# Read from stdin
|
|
170
|
+
return sys.stdin.read().strip()
|
|
171
|
+
elif input_file:
|
|
172
|
+
# Read from file
|
|
173
|
+
with open(input_file, "r", encoding="utf-8") as f:
|
|
174
|
+
return f.read().strip()
|
|
175
|
+
elif text:
|
|
176
|
+
return text.strip()
|
|
177
|
+
return None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _print_table(result, verbose: bool):
|
|
181
|
+
"""Print statements in a human-readable table format."""
|
|
182
|
+
if not result.statements:
|
|
183
|
+
click.echo("No statements extracted.")
|
|
184
|
+
return
|
|
185
|
+
|
|
186
|
+
click.echo(f"\nExtracted {len(result.statements)} statement(s):\n")
|
|
187
|
+
click.echo("-" * 80)
|
|
188
|
+
|
|
189
|
+
for i, stmt in enumerate(result.statements, 1):
|
|
190
|
+
subject_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
|
|
191
|
+
object_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""
|
|
192
|
+
|
|
193
|
+
click.echo(f"{i}. {stmt.subject.text}{subject_type}")
|
|
194
|
+
click.echo(f" --[{stmt.predicate}]-->")
|
|
195
|
+
click.echo(f" {stmt.object.text}{object_type}")
|
|
196
|
+
|
|
197
|
+
if verbose:
|
|
198
|
+
if stmt.confidence_score is not None:
|
|
199
|
+
click.echo(f" Confidence: {stmt.confidence_score:.2f}")
|
|
200
|
+
|
|
201
|
+
if stmt.canonical_predicate:
|
|
202
|
+
click.echo(f" Canonical: {stmt.canonical_predicate}")
|
|
203
|
+
|
|
204
|
+
if stmt.was_reversed:
|
|
205
|
+
click.echo(f" (subject/object were swapped)")
|
|
206
|
+
|
|
207
|
+
if stmt.source_text:
|
|
208
|
+
source = stmt.source_text[:60] + "..." if len(stmt.source_text) > 60 else stmt.source_text
|
|
209
|
+
click.echo(f" Source: \"{source}\"")
|
|
210
|
+
|
|
211
|
+
click.echo("-" * 80)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
if __name__ == "__main__":
|
|
215
|
+
main()
|
|
@@ -80,11 +80,16 @@ class StatementExtractor:
|
|
|
80
80
|
|
|
81
81
|
# Auto-detect device
|
|
82
82
|
if device is None:
|
|
83
|
-
|
|
83
|
+
if torch.cuda.is_available():
|
|
84
|
+
self.device = "cuda"
|
|
85
|
+
elif torch.backends.mps.is_available():
|
|
86
|
+
self.device = "mps"
|
|
87
|
+
else:
|
|
88
|
+
self.device = "cpu"
|
|
84
89
|
else:
|
|
85
90
|
self.device = device
|
|
86
91
|
|
|
87
|
-
# Auto-detect dtype
|
|
92
|
+
# Auto-detect dtype (bfloat16 only for CUDA, float32 for MPS/CPU)
|
|
88
93
|
if torch_dtype is None:
|
|
89
94
|
self.torch_dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
|
|
90
95
|
else:
|
|
@@ -350,12 +355,16 @@ class StatementExtractor:
|
|
|
350
355
|
outputs = self.model.generate(
|
|
351
356
|
**inputs,
|
|
352
357
|
max_new_tokens=options.max_new_tokens,
|
|
358
|
+
max_length=None, # Override model default, use max_new_tokens only
|
|
353
359
|
num_beams=num_seqs,
|
|
354
360
|
num_beam_groups=num_seqs,
|
|
355
361
|
num_return_sequences=num_seqs,
|
|
356
362
|
diversity_penalty=options.diversity_penalty,
|
|
357
363
|
do_sample=False,
|
|
364
|
+
top_p=None, # Override model config to suppress warning
|
|
365
|
+
top_k=None, # Override model config to suppress warning
|
|
358
366
|
trust_remote_code=True,
|
|
367
|
+
custom_generate="transformers-community/group-beam-search",
|
|
359
368
|
)
|
|
360
369
|
|
|
361
370
|
# Decode and process candidates
|
|
@@ -83,7 +83,12 @@ class PredicateComparer:
|
|
|
83
83
|
# Auto-detect device
|
|
84
84
|
if device is None:
|
|
85
85
|
import torch
|
|
86
|
-
|
|
86
|
+
if torch.cuda.is_available():
|
|
87
|
+
self.device = "cuda"
|
|
88
|
+
elif torch.backends.mps.is_available():
|
|
89
|
+
self.device = "mps"
|
|
90
|
+
else:
|
|
91
|
+
self.device = "cpu"
|
|
87
92
|
else:
|
|
88
93
|
self.device = device
|
|
89
94
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|