corp-extractor 0.3.0-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
- corp_extractor-0.5.0.dist-info/RECORD +55 -0
- statement_extractor/__init__.py +9 -0
- statement_extractor/cli.py +460 -21
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +1182 -0
- statement_extractor/extractor.py +32 -47
- statement_extractor/gliner_extraction.py +218 -0
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +74 -0
- statement_extractor/models/canonical.py +139 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +191 -0
- statement_extractor/models/qualifiers.py +91 -0
- statement_extractor/models/statement.py +75 -0
- statement_extractor/models.py +15 -6
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +134 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +447 -0
- statement_extractor/pipeline/registry.py +297 -0
- statement_extractor/plugins/__init__.py +43 -0
- statement_extractor/plugins/base.py +446 -0
- statement_extractor/plugins/canonicalizers/__init__.py +17 -0
- statement_extractor/plugins/canonicalizers/base.py +9 -0
- statement_extractor/plugins/canonicalizers/location.py +219 -0
- statement_extractor/plugins/canonicalizers/organization.py +230 -0
- statement_extractor/plugins/canonicalizers/person.py +242 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +536 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +373 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
- statement_extractor/plugins/qualifiers/__init__.py +19 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +174 -0
- statement_extractor/plugins/qualifiers/gleif.py +186 -0
- statement_extractor/plugins/qualifiers/person.py +221 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +188 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +337 -0
- statement_extractor/plugins/taxonomy/mnli.py +279 -0
- statement_extractor/scoring.py +17 -69
- corp_extractor-0.3.0.dist-info/RECORD +0 -12
- statement_extractor/spacy_extraction.py +0 -386
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
statement_extractor/cli.py
CHANGED
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
Command-line interface for statement extraction.
|
|
3
3
|
|
|
4
4
|
Usage:
|
|
5
|
-
corp-extractor "Your text here"
|
|
6
|
-
corp-extractor -f input.txt
|
|
7
|
-
|
|
5
|
+
corp-extractor split "Your text here"
|
|
6
|
+
corp-extractor split -f input.txt
|
|
7
|
+
corp-extractor pipeline "Your text here" --stages 1-5
|
|
8
|
+
corp-extractor plugins list
|
|
8
9
|
"""
|
|
9
10
|
|
|
11
|
+
import json
|
|
10
12
|
import logging
|
|
11
13
|
import sys
|
|
12
14
|
from typing import Optional
|
|
@@ -34,10 +36,26 @@ def _configure_logging(verbose: bool) -> None:
|
|
|
34
36
|
"statement_extractor.scoring",
|
|
35
37
|
"statement_extractor.predicate_comparer",
|
|
36
38
|
"statement_extractor.canonicalization",
|
|
37
|
-
"statement_extractor.
|
|
39
|
+
"statement_extractor.gliner_extraction",
|
|
40
|
+
"statement_extractor.pipeline",
|
|
41
|
+
"statement_extractor.plugins",
|
|
42
|
+
"statement_extractor.plugins.extractors.gliner2",
|
|
43
|
+
"statement_extractor.plugins.splitters",
|
|
44
|
+
"statement_extractor.plugins.labelers",
|
|
38
45
|
]:
|
|
39
46
|
logging.getLogger(logger_name).setLevel(level)
|
|
40
47
|
|
|
48
|
+
# Suppress noisy third-party loggers
|
|
49
|
+
for noisy_logger in [
|
|
50
|
+
"httpcore.http11",
|
|
51
|
+
"httpcore.connection",
|
|
52
|
+
"httpx",
|
|
53
|
+
"urllib3",
|
|
54
|
+
"huggingface_hub",
|
|
55
|
+
]:
|
|
56
|
+
logging.getLogger(noisy_logger).setLevel(logging.WARNING)
|
|
57
|
+
|
|
58
|
+
|
|
41
59
|
from . import __version__
|
|
42
60
|
from .models import (
|
|
43
61
|
ExtractionOptions,
|
|
@@ -47,7 +65,33 @@ from .models import (
|
|
|
47
65
|
)
|
|
48
66
|
|
|
49
67
|
|
|
50
|
-
@click.
|
|
68
|
+
@click.group()
|
|
69
|
+
@click.version_option(version=__version__)
|
|
70
|
+
def main():
|
|
71
|
+
"""
|
|
72
|
+
Extract structured statements from text.
|
|
73
|
+
|
|
74
|
+
\b
|
|
75
|
+
Commands:
|
|
76
|
+
split Extract sub-statements from text (simple, fast)
|
|
77
|
+
pipeline Run the full 5-stage extraction pipeline
|
|
78
|
+
plugins List or inspect available plugins
|
|
79
|
+
|
|
80
|
+
\b
|
|
81
|
+
Examples:
|
|
82
|
+
corp-extractor split "Apple announced a new iPhone."
|
|
83
|
+
corp-extractor split -f article.txt --json
|
|
84
|
+
corp-extractor pipeline "Apple CEO Tim Cook announced..." --stages 1-3
|
|
85
|
+
corp-extractor plugins list
|
|
86
|
+
"""
|
|
87
|
+
pass
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# =============================================================================
|
|
91
|
+
# Split command (simple extraction)
|
|
92
|
+
# =============================================================================
|
|
93
|
+
|
|
94
|
+
@main.command("split")
|
|
51
95
|
@click.argument("text", required=False)
|
|
52
96
|
@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
|
|
53
97
|
@click.option(
|
|
@@ -66,7 +110,8 @@ from .models import (
|
|
|
66
110
|
@click.option("--no-dedup", is_flag=True, help="Disable deduplication")
|
|
67
111
|
@click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
|
|
68
112
|
@click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
|
|
69
|
-
@click.option("--no-
|
|
113
|
+
@click.option("--no-gliner", is_flag=True, help="Disable GLiNER2 extraction (use raw model output)")
|
|
114
|
+
@click.option("--predicates", type=str, help="Comma-separated list of predicate types for GLiNER2 relation extraction")
|
|
70
115
|
@click.option("--all-triples", is_flag=True, help="Keep all candidate triples instead of selecting best per source")
|
|
71
116
|
@click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
|
|
72
117
|
# Quality options
|
|
@@ -79,8 +124,7 @@ from .models import (
|
|
|
79
124
|
# Output options
|
|
80
125
|
@click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
|
|
81
126
|
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
|
|
82
|
-
|
|
83
|
-
def main(
|
|
127
|
+
def split_cmd(
|
|
84
128
|
text: Optional[str],
|
|
85
129
|
input_file: Optional[str],
|
|
86
130
|
output: str,
|
|
@@ -92,7 +136,8 @@ def main(
|
|
|
92
136
|
no_dedup: bool,
|
|
93
137
|
no_embeddings: bool,
|
|
94
138
|
no_merge: bool,
|
|
95
|
-
|
|
139
|
+
no_gliner: bool,
|
|
140
|
+
predicates: Optional[str],
|
|
96
141
|
all_triples: bool,
|
|
97
142
|
dedup_threshold: float,
|
|
98
143
|
min_confidence: float,
|
|
@@ -103,17 +148,18 @@ def main(
|
|
|
103
148
|
quiet: bool,
|
|
104
149
|
):
|
|
105
150
|
"""
|
|
106
|
-
Extract
|
|
151
|
+
Extract sub-statements from text using T5-Gemma model.
|
|
107
152
|
|
|
108
|
-
|
|
153
|
+
This command splits text into structured subject-predicate-object triples.
|
|
154
|
+
It's fast and simple - use 'pipeline' for full entity resolution.
|
|
109
155
|
|
|
110
156
|
\b
|
|
111
157
|
Examples:
|
|
112
|
-
corp-extractor "Apple announced a new iPhone."
|
|
113
|
-
corp-extractor -f article.txt --json
|
|
114
|
-
corp-extractor -f article.txt -o json --beams 8
|
|
115
|
-
cat article.txt | corp-extractor -
|
|
116
|
-
echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose
|
|
158
|
+
corp-extractor split "Apple announced a new iPhone."
|
|
159
|
+
corp-extractor split -f article.txt --json
|
|
160
|
+
corp-extractor split -f article.txt -o json --beams 8
|
|
161
|
+
cat article.txt | corp-extractor split -
|
|
162
|
+
echo "Tim Cook is CEO of Apple." | corp-extractor split - --verbose
|
|
117
163
|
|
|
118
164
|
\b
|
|
119
165
|
Output formats:
|
|
@@ -133,10 +179,7 @@ def main(
|
|
|
133
179
|
# Get input text
|
|
134
180
|
input_text = _get_input_text(text, input_file)
|
|
135
181
|
if not input_text:
|
|
136
|
-
raise click.UsageError(
|
|
137
|
-
"No input provided. Use: statement-extractor \"text\", "
|
|
138
|
-
"statement-extractor -f file.txt, or pipe via stdin."
|
|
139
|
-
)
|
|
182
|
+
raise click.UsageError("No input provided. Provide text argument or use -f file.txt")
|
|
140
183
|
|
|
141
184
|
if not quiet:
|
|
142
185
|
click.echo(f"Processing {len(input_text)} characters...", err=True)
|
|
@@ -157,6 +200,13 @@ def main(
|
|
|
157
200
|
# Configure scoring
|
|
158
201
|
scoring_config = ScoringConfig(min_confidence=min_confidence)
|
|
159
202
|
|
|
203
|
+
# Parse predicates if provided
|
|
204
|
+
predicate_list = None
|
|
205
|
+
if predicates:
|
|
206
|
+
predicate_list = [p.strip() for p in predicates.split(",") if p.strip()]
|
|
207
|
+
if not quiet:
|
|
208
|
+
click.echo(f"Using predicate list: {predicate_list}", err=True)
|
|
209
|
+
|
|
160
210
|
# Configure extraction options
|
|
161
211
|
options = ExtractionOptions(
|
|
162
212
|
num_beams=beams,
|
|
@@ -165,7 +215,8 @@ def main(
|
|
|
165
215
|
deduplicate=not no_dedup,
|
|
166
216
|
embedding_dedup=not no_embeddings,
|
|
167
217
|
merge_beams=not no_merge,
|
|
168
|
-
|
|
218
|
+
use_gliner_extraction=not no_gliner,
|
|
219
|
+
predicates=predicate_list,
|
|
169
220
|
all_triples=all_triples,
|
|
170
221
|
predicate_taxonomy=predicate_taxonomy,
|
|
171
222
|
predicate_config=predicate_config,
|
|
@@ -200,6 +251,394 @@ def main(
|
|
|
200
251
|
raise click.ClickException(f"Extraction failed: {e}")
|
|
201
252
|
|
|
202
253
|
|
|
254
|
+
# =============================================================================
|
|
255
|
+
# Pipeline command
|
|
256
|
+
# =============================================================================
|
|
257
|
+
|
|
258
|
+
@main.command("pipeline")
|
|
259
|
+
@click.argument("text", required=False)
|
|
260
|
+
@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
|
|
261
|
+
@click.option(
|
|
262
|
+
"--stages",
|
|
263
|
+
type=str,
|
|
264
|
+
default="1-6",
|
|
265
|
+
help="Stages to run (e.g., '1,2,3' or '1-3' or '1-6')"
|
|
266
|
+
)
|
|
267
|
+
@click.option(
|
|
268
|
+
"--skip-stages",
|
|
269
|
+
type=str,
|
|
270
|
+
default=None,
|
|
271
|
+
help="Stages to skip (e.g., '4,5')"
|
|
272
|
+
)
|
|
273
|
+
@click.option(
|
|
274
|
+
"--plugins",
|
|
275
|
+
"enabled_plugins",
|
|
276
|
+
type=str,
|
|
277
|
+
default=None,
|
|
278
|
+
help="Plugins to enable (comma-separated names)"
|
|
279
|
+
)
|
|
280
|
+
@click.option(
|
|
281
|
+
"--disable-plugins",
|
|
282
|
+
type=str,
|
|
283
|
+
default=None,
|
|
284
|
+
help="Plugins to disable (comma-separated names)"
|
|
285
|
+
)
|
|
286
|
+
@click.option(
|
|
287
|
+
"--no-default-predicates",
|
|
288
|
+
is_flag=True,
|
|
289
|
+
help="Disable default predicate taxonomy (GLiNER2 will only use entity extraction)"
|
|
290
|
+
)
|
|
291
|
+
@click.option(
|
|
292
|
+
"-o", "--output",
|
|
293
|
+
type=click.Choice(["table", "json", "yaml", "triples"], case_sensitive=False),
|
|
294
|
+
default="table",
|
|
295
|
+
help="Output format (default: table)"
|
|
296
|
+
)
|
|
297
|
+
@click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
|
|
298
|
+
@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
|
|
299
|
+
def pipeline_cmd(
|
|
300
|
+
text: Optional[str],
|
|
301
|
+
input_file: Optional[str],
|
|
302
|
+
stages: str,
|
|
303
|
+
skip_stages: Optional[str],
|
|
304
|
+
enabled_plugins: Optional[str],
|
|
305
|
+
disable_plugins: Optional[str],
|
|
306
|
+
no_default_predicates: bool,
|
|
307
|
+
output: str,
|
|
308
|
+
verbose: bool,
|
|
309
|
+
quiet: bool,
|
|
310
|
+
):
|
|
311
|
+
"""
|
|
312
|
+
Run the full 5-stage extraction pipeline.
|
|
313
|
+
|
|
314
|
+
\b
|
|
315
|
+
Stages:
|
|
316
|
+
1. Splitting - Text → Raw triples (T5-Gemma)
|
|
317
|
+
2. Extraction - Raw triples → Typed statements (GLiNER2)
|
|
318
|
+
3. Qualification - Add qualifiers and identifiers
|
|
319
|
+
4. Canonicalization - Resolve to canonical forms
|
|
320
|
+
5. Labeling - Apply sentiment, relation type, confidence
|
|
321
|
+
|
|
322
|
+
\b
|
|
323
|
+
Examples:
|
|
324
|
+
corp-extractor pipeline "Apple CEO Tim Cook announced..."
|
|
325
|
+
corp-extractor pipeline -f article.txt --stages 1-3
|
|
326
|
+
corp-extractor pipeline "..." --plugins gleif,companies_house
|
|
327
|
+
corp-extractor pipeline "..." --disable-plugins sec_edgar
|
|
328
|
+
"""
|
|
329
|
+
_configure_logging(verbose)
|
|
330
|
+
|
|
331
|
+
# Get input text
|
|
332
|
+
input_text = _get_input_text(text, input_file)
|
|
333
|
+
if not input_text:
|
|
334
|
+
raise click.UsageError("No input provided. Provide text argument or use -f file.txt")
|
|
335
|
+
|
|
336
|
+
if not quiet:
|
|
337
|
+
click.echo(f"Processing {len(input_text)} characters through pipeline...", err=True)
|
|
338
|
+
|
|
339
|
+
# Import pipeline components (also loads plugins)
|
|
340
|
+
from .pipeline import ExtractionPipeline, PipelineConfig
|
|
341
|
+
_load_all_plugins()
|
|
342
|
+
|
|
343
|
+
# Parse stages
|
|
344
|
+
enabled_stages = _parse_stages(stages)
|
|
345
|
+
if skip_stages:
|
|
346
|
+
skip_set = _parse_stages(skip_stages)
|
|
347
|
+
enabled_stages = enabled_stages - skip_set
|
|
348
|
+
|
|
349
|
+
if not quiet:
|
|
350
|
+
click.echo(f"Running stages: {sorted(enabled_stages)}", err=True)
|
|
351
|
+
|
|
352
|
+
# Parse plugin selection
|
|
353
|
+
enabled_plugin_set = None
|
|
354
|
+
if enabled_plugins:
|
|
355
|
+
enabled_plugin_set = {p.strip() for p in enabled_plugins.split(",") if p.strip()}
|
|
356
|
+
|
|
357
|
+
disabled_plugin_set = set()
|
|
358
|
+
if disable_plugins:
|
|
359
|
+
disabled_plugin_set = {p.strip() for p in disable_plugins.split(",") if p.strip()}
|
|
360
|
+
|
|
361
|
+
# Build extractor options
|
|
362
|
+
extractor_options = {}
|
|
363
|
+
if no_default_predicates:
|
|
364
|
+
extractor_options["use_default_predicates"] = False
|
|
365
|
+
if not quiet:
|
|
366
|
+
click.echo("Default predicates disabled - using entity extraction only", err=True)
|
|
367
|
+
|
|
368
|
+
# Create config
|
|
369
|
+
config = PipelineConfig(
|
|
370
|
+
enabled_stages=enabled_stages,
|
|
371
|
+
enabled_plugins=enabled_plugin_set,
|
|
372
|
+
disabled_plugins=disabled_plugin_set,
|
|
373
|
+
extractor_options=extractor_options,
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
# Run pipeline
|
|
377
|
+
try:
|
|
378
|
+
pipeline = ExtractionPipeline(config)
|
|
379
|
+
ctx = pipeline.process(input_text)
|
|
380
|
+
|
|
381
|
+
# Output results
|
|
382
|
+
if output == "json":
|
|
383
|
+
_print_pipeline_json(ctx)
|
|
384
|
+
elif output == "yaml":
|
|
385
|
+
_print_pipeline_yaml(ctx)
|
|
386
|
+
elif output == "triples":
|
|
387
|
+
_print_pipeline_triples(ctx)
|
|
388
|
+
else:
|
|
389
|
+
_print_pipeline_table(ctx, verbose)
|
|
390
|
+
|
|
391
|
+
# Report errors/warnings
|
|
392
|
+
if ctx.processing_errors and not quiet:
|
|
393
|
+
click.echo(f"\nErrors: {len(ctx.processing_errors)}", err=True)
|
|
394
|
+
for error in ctx.processing_errors:
|
|
395
|
+
click.echo(f" - {error}", err=True)
|
|
396
|
+
|
|
397
|
+
if ctx.processing_warnings and verbose:
|
|
398
|
+
click.echo(f"\nWarnings: {len(ctx.processing_warnings)}", err=True)
|
|
399
|
+
for warning in ctx.processing_warnings:
|
|
400
|
+
click.echo(f" - {warning}", err=True)
|
|
401
|
+
|
|
402
|
+
except Exception as e:
|
|
403
|
+
logging.exception("Pipeline error:")
|
|
404
|
+
raise click.ClickException(f"Pipeline failed: {e}")
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _parse_stages(stages_str: str) -> set[int]:
|
|
408
|
+
"""Parse stage string like '1,2,3' or '1-3' into a set of ints."""
|
|
409
|
+
result = set()
|
|
410
|
+
for part in stages_str.split(","):
|
|
411
|
+
part = part.strip()
|
|
412
|
+
if "-" in part:
|
|
413
|
+
start, end = part.split("-", 1)
|
|
414
|
+
for i in range(int(start), int(end) + 1):
|
|
415
|
+
result.add(i)
|
|
416
|
+
else:
|
|
417
|
+
result.add(int(part))
|
|
418
|
+
return result
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _print_pipeline_json(ctx):
|
|
422
|
+
"""Print pipeline results as JSON."""
|
|
423
|
+
output = {
|
|
424
|
+
"statement_count": ctx.statement_count,
|
|
425
|
+
"raw_triples": [t.model_dump() for t in ctx.raw_triples],
|
|
426
|
+
"statements": [s.model_dump() for s in ctx.statements],
|
|
427
|
+
"labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
|
|
428
|
+
"timings": ctx.stage_timings,
|
|
429
|
+
"warnings": ctx.processing_warnings,
|
|
430
|
+
"errors": ctx.processing_errors,
|
|
431
|
+
}
|
|
432
|
+
click.echo(json.dumps(output, indent=2, default=str))
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _print_pipeline_yaml(ctx):
|
|
436
|
+
"""Print pipeline results as YAML."""
|
|
437
|
+
try:
|
|
438
|
+
import yaml
|
|
439
|
+
output = {
|
|
440
|
+
"statement_count": ctx.statement_count,
|
|
441
|
+
"statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
|
|
442
|
+
"timings": ctx.stage_timings,
|
|
443
|
+
}
|
|
444
|
+
click.echo(yaml.dump(output, default_flow_style=False))
|
|
445
|
+
except ImportError:
|
|
446
|
+
click.echo("YAML output requires PyYAML: pip install pyyaml", err=True)
|
|
447
|
+
_print_pipeline_json(ctx)
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def _print_pipeline_triples(ctx):
|
|
451
|
+
"""Print pipeline results as simple triples."""
|
|
452
|
+
if ctx.labeled_statements:
|
|
453
|
+
for stmt in ctx.labeled_statements:
|
|
454
|
+
click.echo(f"{stmt.subject_fqn}\t{stmt.statement.predicate}\t{stmt.object_fqn}")
|
|
455
|
+
elif ctx.statements:
|
|
456
|
+
for stmt in ctx.statements:
|
|
457
|
+
click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
|
|
458
|
+
elif ctx.raw_triples:
|
|
459
|
+
for triple in ctx.raw_triples:
|
|
460
|
+
click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def _print_pipeline_table(ctx, verbose: bool):
|
|
464
|
+
"""Print pipeline results in table format."""
|
|
465
|
+
# Try labeled statements first, then statements, then raw triples
|
|
466
|
+
if ctx.labeled_statements:
|
|
467
|
+
click.echo(f"\nExtracted {len(ctx.labeled_statements)} statement(s):\n")
|
|
468
|
+
click.echo("-" * 80)
|
|
469
|
+
|
|
470
|
+
for i, stmt in enumerate(ctx.labeled_statements, 1):
|
|
471
|
+
click.echo(f"{i}. {stmt.subject_fqn}")
|
|
472
|
+
click.echo(f" --[{stmt.statement.predicate}]-->")
|
|
473
|
+
click.echo(f" {stmt.object_fqn}")
|
|
474
|
+
|
|
475
|
+
# Show labels (always in recent versions, not just verbose)
|
|
476
|
+
for label in stmt.labels:
|
|
477
|
+
if isinstance(label.label_value, float):
|
|
478
|
+
click.echo(f" {label.label_type}: {label.label_value:.3f}")
|
|
479
|
+
else:
|
|
480
|
+
click.echo(f" {label.label_type}: {label.label_value}")
|
|
481
|
+
|
|
482
|
+
# Show top taxonomy results (sorted by confidence)
|
|
483
|
+
if stmt.taxonomy_results:
|
|
484
|
+
sorted_taxonomy = sorted(stmt.taxonomy_results, key=lambda t: t.confidence, reverse=True)
|
|
485
|
+
top_taxonomy = sorted_taxonomy[:5] # Show top 5
|
|
486
|
+
taxonomy_strs = [f"{t.category}:{t.label} ({t.confidence:.2f})" for t in top_taxonomy]
|
|
487
|
+
click.echo(f" topics: {', '.join(taxonomy_strs)}")
|
|
488
|
+
if len(sorted_taxonomy) > 5:
|
|
489
|
+
click.echo(f" ... and {len(sorted_taxonomy) - 5} more topics")
|
|
490
|
+
|
|
491
|
+
if verbose and stmt.statement.source_text:
|
|
492
|
+
source = stmt.statement.source_text[:60] + "..." if len(stmt.statement.source_text) > 60 else stmt.statement.source_text
|
|
493
|
+
click.echo(f" Source: \"{source}\"")
|
|
494
|
+
|
|
495
|
+
click.echo("-" * 80)
|
|
496
|
+
|
|
497
|
+
elif ctx.statements:
|
|
498
|
+
click.echo(f"\nExtracted {len(ctx.statements)} statement(s):\n")
|
|
499
|
+
click.echo("-" * 80)
|
|
500
|
+
|
|
501
|
+
for i, stmt in enumerate(ctx.statements, 1):
|
|
502
|
+
subj_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
|
|
503
|
+
obj_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""
|
|
504
|
+
|
|
505
|
+
click.echo(f"{i}. {stmt.subject.text}{subj_type}")
|
|
506
|
+
click.echo(f" --[{stmt.predicate}]-->")
|
|
507
|
+
click.echo(f" {stmt.object.text}{obj_type}")
|
|
508
|
+
|
|
509
|
+
if verbose and stmt.confidence_score is not None:
|
|
510
|
+
click.echo(f" Confidence: {stmt.confidence_score:.2f}")
|
|
511
|
+
|
|
512
|
+
click.echo("-" * 80)
|
|
513
|
+
|
|
514
|
+
elif ctx.raw_triples:
|
|
515
|
+
click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
|
|
516
|
+
click.echo("-" * 80)
|
|
517
|
+
|
|
518
|
+
for i, triple in enumerate(ctx.raw_triples, 1):
|
|
519
|
+
click.echo(f"{i}. {triple.subject_text}")
|
|
520
|
+
click.echo(f" --[{triple.predicate_text}]-->")
|
|
521
|
+
click.echo(f" {triple.object_text}")
|
|
522
|
+
|
|
523
|
+
if verbose:
|
|
524
|
+
click.echo(f" Confidence: {triple.confidence:.2f}")
|
|
525
|
+
if triple.source_sentence:
|
|
526
|
+
source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
|
|
527
|
+
click.echo(f" Source: \"{source}\"")
|
|
528
|
+
|
|
529
|
+
click.echo("-" * 80)
|
|
530
|
+
|
|
531
|
+
else:
|
|
532
|
+
click.echo("No statements extracted.")
|
|
533
|
+
return
|
|
534
|
+
|
|
535
|
+
# Show timings in verbose mode
|
|
536
|
+
if verbose and ctx.stage_timings:
|
|
537
|
+
click.echo("\nStage timings:")
|
|
538
|
+
for stage, duration in ctx.stage_timings.items():
|
|
539
|
+
click.echo(f" {stage}: {duration:.3f}s")
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
# =============================================================================
|
|
543
|
+
# Plugins command
|
|
544
|
+
# =============================================================================
|
|
545
|
+
|
|
546
|
+
@main.command("plugins")
|
|
547
|
+
@click.argument("action", type=click.Choice(["list", "info"]))
|
|
548
|
+
@click.argument("plugin_name", required=False)
|
|
549
|
+
@click.option("--stage", type=int, help="Filter by stage number (1-5)")
|
|
550
|
+
def plugins_cmd(action: str, plugin_name: Optional[str], stage: Optional[int]):
|
|
551
|
+
"""
|
|
552
|
+
List or inspect available plugins.
|
|
553
|
+
|
|
554
|
+
\b
|
|
555
|
+
Actions:
|
|
556
|
+
list List all available plugins
|
|
557
|
+
info Show details about a specific plugin
|
|
558
|
+
|
|
559
|
+
\b
|
|
560
|
+
Examples:
|
|
561
|
+
corp-extractor plugins list
|
|
562
|
+
corp-extractor plugins list --stage 3
|
|
563
|
+
corp-extractor plugins info gleif_qualifier
|
|
564
|
+
"""
|
|
565
|
+
# Import and load plugins
|
|
566
|
+
_load_all_plugins()
|
|
567
|
+
|
|
568
|
+
from .pipeline.registry import PluginRegistry
|
|
569
|
+
|
|
570
|
+
if action == "list":
|
|
571
|
+
plugins = PluginRegistry.list_plugins(stage=stage)
|
|
572
|
+
if not plugins:
|
|
573
|
+
click.echo("No plugins registered.")
|
|
574
|
+
return
|
|
575
|
+
|
|
576
|
+
# Group by stage
|
|
577
|
+
by_stage: dict[int, list] = {}
|
|
578
|
+
for plugin in plugins:
|
|
579
|
+
stage_num = plugin["stage"]
|
|
580
|
+
if stage_num not in by_stage:
|
|
581
|
+
by_stage[stage_num] = []
|
|
582
|
+
by_stage[stage_num].append(plugin)
|
|
583
|
+
|
|
584
|
+
for stage_num in sorted(by_stage.keys()):
|
|
585
|
+
stage_plugins = by_stage[stage_num]
|
|
586
|
+
stage_name = stage_plugins[0]["stage_name"]
|
|
587
|
+
click.echo(f"\nStage {stage_num}: {stage_name.title()}")
|
|
588
|
+
click.echo("-" * 40)
|
|
589
|
+
|
|
590
|
+
for p in stage_plugins:
|
|
591
|
+
entity_types = p.get("entity_types", [])
|
|
592
|
+
types_str = f" ({', '.join(entity_types)})" if entity_types else ""
|
|
593
|
+
click.echo(f" {p['name']}{types_str} [priority: {p['priority']}]")
|
|
594
|
+
|
|
595
|
+
elif action == "info":
|
|
596
|
+
if not plugin_name:
|
|
597
|
+
raise click.UsageError("Plugin name required for 'info' action")
|
|
598
|
+
|
|
599
|
+
plugin = PluginRegistry.get_plugin(plugin_name)
|
|
600
|
+
if not plugin:
|
|
601
|
+
raise click.ClickException(f"Plugin not found: {plugin_name}")
|
|
602
|
+
|
|
603
|
+
click.echo(f"\nPlugin: {plugin.name}")
|
|
604
|
+
click.echo(f"Priority: {plugin.priority}")
|
|
605
|
+
click.echo(f"Capabilities: {plugin.capabilities.name if plugin.capabilities else 'NONE'}")
|
|
606
|
+
|
|
607
|
+
if plugin.description:
|
|
608
|
+
click.echo(f"Description: {plugin.description}")
|
|
609
|
+
|
|
610
|
+
if hasattr(plugin, "supported_entity_types"):
|
|
611
|
+
types = [t.value for t in plugin.supported_entity_types]
|
|
612
|
+
click.echo(f"Entity types: {', '.join(types)}")
|
|
613
|
+
|
|
614
|
+
if hasattr(plugin, "label_type"):
|
|
615
|
+
click.echo(f"Label type: {plugin.label_type}")
|
|
616
|
+
|
|
617
|
+
if hasattr(plugin, "supported_identifier_types"):
|
|
618
|
+
ids = plugin.supported_identifier_types
|
|
619
|
+
if ids:
|
|
620
|
+
click.echo(f"Supported identifiers: {', '.join(ids)}")
|
|
621
|
+
|
|
622
|
+
if hasattr(plugin, "provided_identifier_types"):
|
|
623
|
+
ids = plugin.provided_identifier_types
|
|
624
|
+
if ids:
|
|
625
|
+
click.echo(f"Provided identifiers: {', '.join(ids)}")
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def _load_all_plugins():
|
|
629
|
+
"""Load all plugins by importing their modules."""
|
|
630
|
+
# Import all plugin modules to trigger registration
|
|
631
|
+
try:
|
|
632
|
+
from .plugins import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
|
|
633
|
+
# The @PluginRegistry decorators will register plugins on import
|
|
634
|
+
except ImportError as e:
|
|
635
|
+
logging.debug(f"Some plugins failed to load: {e}")
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
# =============================================================================
|
|
639
|
+
# Helper functions
|
|
640
|
+
# =============================================================================
|
|
641
|
+
|
|
203
642
|
def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
|
|
204
643
|
"""Get input text from argument, file, or stdin."""
|
|
205
644
|
if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):
|