corp-extractor 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: corp-extractor
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Extract structured statements from text using T5-Gemma 2 and Diverse Beam Search
5
5
  Project-URL: Homepage, https://github.com/corp-o-rate/statement-extractor
6
6
  Project-URL: Documentation, https://github.com/corp-o-rate/statement-extractor#readme
@@ -23,10 +23,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
24
  Classifier: Topic :: Text Processing :: Linguistic
25
25
  Requires-Python: >=3.10
26
+ Requires-Dist: click>=8.0.0
26
27
  Requires-Dist: numpy>=1.24.0
27
28
  Requires-Dist: pydantic>=2.0.0
28
29
  Requires-Dist: torch>=2.0.0
29
- Requires-Dist: transformers>=4.35.0
30
+ Requires-Dist: transformers>=5.0.0
30
31
  Provides-Extra: all
31
32
  Requires-Dist: sentence-transformers>=2.2.0; extra == 'all'
32
33
  Provides-Extra: dev
@@ -57,6 +58,7 @@ Extract structured subject-predicate-object statements from unstructured text us
57
58
  - **Contextualized Matching** *(v0.2.2)*: Compares full "Subject Predicate Object" against source text for better accuracy
58
59
  - **Entity Type Merging** *(v0.2.3)*: Automatically merges UNKNOWN entity types with specific types during deduplication
59
60
  - **Reversal Detection** *(v0.2.3)*: Detects and corrects subject-object reversals using embedding comparison
61
+ - **Command Line Interface** *(v0.2.4)*: Full-featured CLI for terminal usage
60
62
  - **Multiple Output Formats**: Get results as Pydantic models, JSON, XML, or dictionaries
61
63
 
62
64
  ## Installation
@@ -69,7 +71,9 @@ pip install corp-extractor[embeddings]
69
71
  pip install corp-extractor
70
72
  ```
71
73
 
72
- **Note**: For GPU support, install PyTorch with CUDA first:
74
+ **Note**: This package requires the development version of `transformers` from GitHub (for T5-Gemma 2 support). That dependency is installed automatically when you install the package.
75
+
76
+ **For GPU support**, install PyTorch with CUDA first:
73
77
  ```bash
74
78
  pip install torch --index-url https://download.pytorch.org/whl/cu121
75
79
  pip install corp-extractor[embeddings]
@@ -91,6 +95,96 @@ for stmt in result:
91
95
  print(f" Confidence: {stmt.confidence_score:.2f}") # NEW in v0.2.0
92
96
  ```
93
97
 
98
+ ## Command Line Interface
99
+
100
+ The library includes a CLI for quick extraction from the terminal.
101
+
102
+ ### Install Globally (Recommended)
103
+
104
+ For best results, install globally first:
105
+
106
+ ```bash
107
+ # Using uv (recommended)
108
+ uv tool install corp-extractor[embeddings]
109
+
110
+ # Using pipx
111
+ pipx install corp-extractor[embeddings]
112
+
113
+ # Using pip
114
+ pip install corp-extractor[embeddings]
115
+
116
+ # Then use anywhere
117
+ corp-extractor "Your text here"
118
+ ```
119
+
120
+ ### Quick Run with uvx
121
+
122
+ Run directly without installing using [uv](https://docs.astral.sh/uv/):
123
+
124
+ ```bash
125
+ uvx corp-extractor "Apple announced a new iPhone."
126
+ ```
127
+
128
+ **Note**: The first `uvx` run may be slow because it has to install `transformers` from git.
129
+
130
+ ### Usage Examples
131
+
132
+ ```bash
133
+ # Extract from text argument
134
+ corp-extractor "Apple Inc. announced the iPhone 15 at their September event."
135
+
136
+ # Extract from file
137
+ corp-extractor -f article.txt
138
+
139
+ # Pipe from stdin
140
+ cat article.txt | corp-extractor -
141
+
142
+ # Output as JSON
143
+ corp-extractor "Tim Cook is CEO of Apple." --json
144
+
145
+ # Output as XML
146
+ corp-extractor -f article.txt --xml
147
+
148
+ # Verbose output with confidence scores
149
+ corp-extractor -f article.txt --verbose
150
+
151
+ # Use more beams for better quality
152
+ corp-extractor -f article.txt --beams 8
153
+
154
+ # Use custom predicate taxonomy
155
+ corp-extractor -f article.txt --taxonomy predicates.txt
156
+
157
+ # Use GPU explicitly
158
+ corp-extractor -f article.txt --device cuda
159
+ ```
160
+
161
+ ### CLI Options
162
+
163
+ ```
164
+ Usage: corp-extractor [OPTIONS] [TEXT]
165
+
166
+ Options:
167
+ -f, --file PATH Read input from file
168
+ -o, --output [table|json|xml] Output format (default: table)
169
+ --json Output as JSON (shortcut)
170
+ --xml Output as XML (shortcut)
171
+ -b, --beams INTEGER Number of beams (default: 4)
172
+ --diversity FLOAT Diversity penalty (default: 1.0)
173
+ --max-tokens INTEGER Max tokens to generate (default: 2048)
174
+ --no-dedup Disable deduplication
175
+ --no-embeddings Disable embedding-based dedup (faster)
176
+ --no-merge Disable beam merging
177
+ --dedup-threshold FLOAT Deduplication threshold (default: 0.65)
178
+ --min-confidence FLOAT Min confidence filter (default: 0)
179
+ --taxonomy PATH Load predicate taxonomy from file
180
+ --taxonomy-threshold FLOAT Taxonomy matching threshold (default: 0.5)
181
+ --device [auto|cuda|cpu] Device to use (default: auto)
182
+ -v, --verbose Show confidence scores and metadata
183
+ -q, --quiet Suppress progress messages
184
+ --version Show version
185
+ --help Show this message
186
+ ```
187
+
94
188
  ## New in v0.2.0: Quality Scoring & Beam Merging
95
189
 
96
190
  By default, the library now:
@@ -1,9 +1,11 @@
1
- statement_extractor/__init__.py,sha256=4Ht8GJdgik_iti7zpG71Oi5EEAnck6AYDvy7soRqIOg,2967
1
+ statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
2
2
  statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
+ statement_extractor/cli.py,sha256=kJnZm_mbq4np1vTxSjczMZM5zGuDlC8Z5xLJd8O3xZ4,7605
3
4
  statement_extractor/extractor.py,sha256=PX0SiJnYUnh06seyH5W77FcPpcvLXwEM8IGsuVuRh0Q,22158
4
5
  statement_extractor/models.py,sha256=xDF3pDPhIiqiMwFMPV94aBEgZGbSe-x2TkshahOiCog,10739
5
6
  statement_extractor/predicate_comparer.py,sha256=iwBfNJFNOFv8ODKN9F9EtmknpCeSThOpnu6P_PJSmgE,24898
6
7
  statement_extractor/scoring.py,sha256=Wa1BW6jXtHD7dZkUXwdwE39hwFo2ko6BuIogBc4E2Lk,14493
7
- corp_extractor-0.2.3.dist-info/METADATA,sha256=dCJbLWIj7hgzpkC4zYvNmnEAhNnizUEq_caea6AamIU,10724
8
- corp_extractor-0.2.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
9
- corp_extractor-0.2.3.dist-info/RECORD,,
8
+ corp_extractor-0.2.5.dist-info/METADATA,sha256=iN_MPbqHhizaFAGJKzR5JNSbDivrS133oSTiYWrFht4,13552
9
+ corp_extractor-0.2.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ corp_extractor-0.2.5.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
11
+ corp_extractor-0.2.5.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ corp-extractor = statement_extractor.cli:main
3
+ statement-extractor = statement_extractor.cli:main
@@ -29,7 +29,7 @@ Example:
29
29
  >>> data = extract_statements_as_dict("Some text...")
30
30
  """
31
31
 
32
- __version__ = "0.2.2"
32
+ __version__ = "0.2.5"
33
33
 
34
34
  # Core models
35
35
  from .models import (
@@ -0,0 +1,215 @@
1
+ """
2
+ Command-line interface for statement extraction.
3
+
4
+ Usage:
5
+ corp-extractor "Your text here"
6
+ corp-extractor -f input.txt
7
+ cat input.txt | corp-extractor -
8
+ """
9
+
10
+ import sys
11
+ from typing import Optional
12
+
13
+ import click
14
+
15
+ from . import __version__
16
+ from .models import (
17
+ ExtractionOptions,
18
+ PredicateComparisonConfig,
19
+ PredicateTaxonomy,
20
+ ScoringConfig,
21
+ )
22
+
23
+
24
# CLI entry point (registered as a console script).
#
# NOTE: the function docstring below is rendered by click as the --help text,
# so developer-facing documentation lives in these comments instead.
#
# Flow:
#   1. Resolve the output format (--json / --xml override -o/--output).
#   2. Read the input text from the TEXT argument, -f/--file, or stdin.
#   3. Build the taxonomy, predicate-comparison, scoring and extraction options.
#   4. Lazily import and construct the extractor, then run the extraction and
#      print the result in the requested format.
#
# Raises:
#   click.UsageError      when no input text could be obtained.
#   click.ClickException  when the extraction itself fails; the original
#                         exception is chained as the cause.
@click.command()
@click.argument("text", required=False)
@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
@click.option(
    "-o", "--output",
    type=click.Choice(["table", "json", "xml"], case_sensitive=False),
    default="table",
    help="Output format (default: table)"
)
@click.option("--json", "output_json", is_flag=True, help="Output as JSON (shortcut for -o json)")
@click.option("--xml", "output_xml", is_flag=True, help="Output as XML (shortcut for -o xml)")
# Beam search options
@click.option("-b", "--beams", type=int, default=4, help="Number of beams for diverse beam search (default: 4)")
@click.option("--diversity", type=float, default=1.0, help="Diversity penalty for beam search (default: 1.0)")
@click.option("--max-tokens", type=int, default=2048, help="Maximum tokens to generate (default: 2048)")
# Deduplication options
@click.option("--no-dedup", is_flag=True, help="Disable deduplication")
@click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
@click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
@click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
# Quality options
@click.option("--min-confidence", type=float, default=0.0, help="Minimum confidence threshold 0-1 (default: 0)")
# Taxonomy options
@click.option("--taxonomy", type=click.Path(exists=True), help="Load predicate taxonomy from file (one per line)")
@click.option("--taxonomy-threshold", type=float, default=0.5, help="Similarity threshold for taxonomy matching (default: 0.5)")
# Device options
@click.option("--device", type=click.Choice(["auto", "cuda", "cpu"]), default="auto", help="Device to use (default: auto)")
# Output options
@click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
@click.version_option(version=__version__)
def main(
    text: Optional[str],
    input_file: Optional[str],
    output: str,
    output_json: bool,
    output_xml: bool,
    beams: int,
    diversity: float,
    max_tokens: int,
    no_dedup: bool,
    no_embeddings: bool,
    no_merge: bool,
    dedup_threshold: float,
    min_confidence: float,
    taxonomy: Optional[str],
    taxonomy_threshold: float,
    device: str,
    verbose: bool,
    quiet: bool,
):
    """
    Extract structured statements from text.

    TEXT can be provided as an argument, read from a file with -f, or piped via stdin.

    \b
    Examples:
        corp-extractor "Apple announced a new iPhone."
        corp-extractor -f article.txt --json
        corp-extractor -f article.txt -o json --beams 8
        cat article.txt | corp-extractor -
        echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose

    \b
    Output formats:
        table  Human-readable table (default)
        json   JSON with full metadata
        xml    Raw XML from model
    """
    # Determine output format: the shortcut flags win over -o/--output, and
    # --json takes precedence when both shortcuts are given.
    if output_json:
        output = "json"
    elif output_xml:
        output = "xml"

    # Get input text
    input_text = _get_input_text(text, input_file)
    if not input_text:
        raise click.UsageError(
            "No input provided. Use: statement-extractor \"text\", "
            "statement-extractor -f file.txt, or pipe via stdin."
        )

    # Progress messages go to stderr so stdout carries only the result.
    if not quiet:
        click.echo(f"Processing {len(input_text)} characters...", err=True)

    # Load taxonomy if provided
    predicate_taxonomy = None
    if taxonomy:
        predicate_taxonomy = PredicateTaxonomy.from_file(taxonomy)
        if not quiet:
            click.echo(f"Loaded taxonomy with {len(predicate_taxonomy.predicates)} predicates", err=True)

    # Configure predicate comparison
    predicate_config = PredicateComparisonConfig(
        similarity_threshold=taxonomy_threshold,
        dedup_threshold=dedup_threshold,
    )

    # Configure scoring
    scoring_config = ScoringConfig(min_confidence=min_confidence)

    # Configure extraction options; the --no-* flags are inverted into the
    # positive switches expected by ExtractionOptions.
    options = ExtractionOptions(
        num_beams=beams,
        diversity_penalty=diversity,
        max_new_tokens=max_tokens,
        deduplicate=not no_dedup,
        embedding_dedup=not no_embeddings,
        merge_beams=not no_merge,
        predicate_taxonomy=predicate_taxonomy,
        predicate_config=predicate_config,
        scoring_config=scoring_config,
    )

    # Import here to allow --help without loading torch
    from .extractor import StatementExtractor

    # Create extractor with specified device ("auto" is passed through as None)
    device_arg = None if device == "auto" else device
    extractor = StatementExtractor(device=device_arg)

    if not quiet:
        click.echo(f"Using device: {extractor.device}", err=True)

    # Run extraction
    try:
        if output == "xml":
            result = extractor.extract_as_xml(input_text, options)
            click.echo(result)
        elif output == "json":
            result = extractor.extract_as_json(input_text, options)
            click.echo(result)
        else:
            # Table format
            result = extractor.extract(input_text, options)
            _print_table(result, verbose)
    except Exception as exc:
        # Chain the cause explicitly so the original failure stays attached to
        # the user-facing error.
        raise click.ClickException(f"Extraction failed: {exc}") from exc
164
+
165
+
166
def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
    """Resolve the text to process from stdin, a file, or the CLI argument.

    Sources are checked in this order:

    1. stdin, when ``text`` is ``"-"`` or when neither ``text`` nor
       ``input_file`` was given and stdin is not a terminal;
    2. the file at ``input_file`` (read as UTF-8);
    3. the ``text`` argument itself.

    The chosen text is returned with surrounding whitespace stripped.
    ``None`` is returned when no source applies.
    """
    nothing_given = text is None and input_file is None
    wants_stdin = text == "-" or (nothing_given and not sys.stdin.isatty())

    if wants_stdin:
        piped = sys.stdin.read()
        return piped.strip()

    if input_file:
        with open(input_file, "r", encoding="utf-8") as handle:
            contents = handle.read()
        return contents.strip()

    if text:
        return text.strip()

    return None
178
+
179
+
180
def _print_table(result, verbose: bool):
    """Write the extracted statements to stdout as a readable listing.

    ``result`` must expose a ``statements`` sequence. Each statement is
    printed as a numbered subject / predicate / object triple, and entity
    types are shown in parentheses unless they are ``UNKNOWN``. When
    ``verbose`` is true, any available confidence score, canonical
    predicate, reversal marker and a source excerpt (truncated to 60
    characters) are printed as well.
    """
    statements = result.statements
    if not statements:
        click.echo("No statements extracted.")
        return

    divider = "-" * 80
    click.echo(f"\nExtracted {len(statements)} statement(s):\n")
    click.echo(divider)

    for number, stmt in enumerate(statements, 1):
        subject_kind = stmt.subject.type.value
        object_kind = stmt.object.type.value
        subject_type = "" if subject_kind == "UNKNOWN" else f" ({subject_kind})"
        object_type = "" if object_kind == "UNKNOWN" else f" ({object_kind})"

        click.echo(f"{number}. {stmt.subject.text}{subject_type}")
        click.echo(f" --[{stmt.predicate}]-->")
        click.echo(f" {stmt.object.text}{object_type}")

        if verbose:
            score = stmt.confidence_score
            if score is not None:
                click.echo(f" Confidence: {score:.2f}")

            if stmt.canonical_predicate:
                click.echo(f" Canonical: {stmt.canonical_predicate}")

            if stmt.was_reversed:
                click.echo(" (subject/object were swapped)")

            if stmt.source_text:
                # Keep long source excerpts on a single short line.
                if len(stmt.source_text) > 60:
                    source = stmt.source_text[:60] + "..."
                else:
                    source = stmt.source_text
                click.echo(f" Source: \"{source}\"")

        click.echo(divider)
212
+
213
+
214
# Run the click command when this module is executed as a script.
if __name__ == "__main__":
    main()