corp-extractor: corp_extractor-0.3.0-py3-none-any.whl → corp_extractor-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +460 -21
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +32 -47
  8. statement_extractor/gliner_extraction.py +218 -0
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +15 -6
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. statement_extractor/scoring.py +17 -69
  52. corp_extractor-0.3.0.dist-info/RECORD +0 -12
  53. statement_extractor/spacy_extraction.py +0 -386
  54. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  55. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -2,11 +2,13 @@
2
2
  Command-line interface for statement extraction.
3
3
 
4
4
  Usage:
5
- corp-extractor "Your text here"
6
- corp-extractor -f input.txt
7
- cat input.txt | corp-extractor -
5
+ corp-extractor split "Your text here"
6
+ corp-extractor split -f input.txt
7
+ corp-extractor pipeline "Your text here" --stages 1-5
8
+ corp-extractor plugins list
8
9
  """
9
10
 
11
+ import json
10
12
  import logging
11
13
  import sys
12
14
  from typing import Optional
@@ -34,10 +36,26 @@ def _configure_logging(verbose: bool) -> None:
34
36
  "statement_extractor.scoring",
35
37
  "statement_extractor.predicate_comparer",
36
38
  "statement_extractor.canonicalization",
37
- "statement_extractor.spacy_extraction",
39
+ "statement_extractor.gliner_extraction",
40
+ "statement_extractor.pipeline",
41
+ "statement_extractor.plugins",
42
+ "statement_extractor.plugins.extractors.gliner2",
43
+ "statement_extractor.plugins.splitters",
44
+ "statement_extractor.plugins.labelers",
38
45
  ]:
39
46
  logging.getLogger(logger_name).setLevel(level)
40
47
 
48
+ # Suppress noisy third-party loggers
49
+ for noisy_logger in [
50
+ "httpcore.http11",
51
+ "httpcore.connection",
52
+ "httpx",
53
+ "urllib3",
54
+ "huggingface_hub",
55
+ ]:
56
+ logging.getLogger(noisy_logger).setLevel(logging.WARNING)
57
+
58
+
41
59
  from . import __version__
42
60
  from .models import (
43
61
  ExtractionOptions,
@@ -47,7 +65,33 @@ from .models import (
47
65
  )
48
66
 
49
67
 
50
- @click.command()
68
+ @click.group()
69
+ @click.version_option(version=__version__)
70
+ def main():
71
+ """
72
+ Extract structured statements from text.
73
+
74
+ \b
75
+ Commands:
76
+ split Extract sub-statements from text (simple, fast)
77
+ pipeline Run the full 5-stage extraction pipeline
78
+ plugins List or inspect available plugins
79
+
80
+ \b
81
+ Examples:
82
+ corp-extractor split "Apple announced a new iPhone."
83
+ corp-extractor split -f article.txt --json
84
+ corp-extractor pipeline "Apple CEO Tim Cook announced..." --stages 1-3
85
+ corp-extractor plugins list
86
+ """
87
+ pass
88
+
89
+
90
+ # =============================================================================
91
+ # Split command (simple extraction)
92
+ # =============================================================================
93
+
94
+ @main.command("split")
51
95
  @click.argument("text", required=False)
52
96
  @click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
53
97
  @click.option(
@@ -66,7 +110,8 @@ from .models import (
66
110
  @click.option("--no-dedup", is_flag=True, help="Disable deduplication")
67
111
  @click.option("--no-embeddings", is_flag=True, help="Disable embedding-based deduplication (faster)")
68
112
  @click.option("--no-merge", is_flag=True, help="Disable beam merging (select single best beam)")
69
- @click.option("--no-spacy", is_flag=True, help="Disable spaCy extraction (use raw model output)")
113
+ @click.option("--no-gliner", is_flag=True, help="Disable GLiNER2 extraction (use raw model output)")
114
+ @click.option("--predicates", type=str, help="Comma-separated list of predicate types for GLiNER2 relation extraction")
70
115
  @click.option("--all-triples", is_flag=True, help="Keep all candidate triples instead of selecting best per source")
71
116
  @click.option("--dedup-threshold", type=float, default=0.65, help="Similarity threshold for deduplication (default: 0.65)")
72
117
  # Quality options
@@ -79,8 +124,7 @@ from .models import (
79
124
  # Output options
80
125
  @click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
81
126
  @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
82
- @click.version_option(version=__version__)
83
- def main(
127
+ def split_cmd(
84
128
  text: Optional[str],
85
129
  input_file: Optional[str],
86
130
  output: str,
@@ -92,7 +136,8 @@ def main(
92
136
  no_dedup: bool,
93
137
  no_embeddings: bool,
94
138
  no_merge: bool,
95
- no_spacy: bool,
139
+ no_gliner: bool,
140
+ predicates: Optional[str],
96
141
  all_triples: bool,
97
142
  dedup_threshold: float,
98
143
  min_confidence: float,
@@ -103,17 +148,18 @@ def main(
103
148
  quiet: bool,
104
149
  ):
105
150
  """
106
- Extract structured statements from text.
151
+ Extract sub-statements from text using T5-Gemma model.
107
152
 
108
- TEXT can be provided as an argument, read from a file with -f, or piped via stdin.
153
+ This command splits text into structured subject-predicate-object triples.
154
+ It's fast and simple - use 'pipeline' for full entity resolution.
109
155
 
110
156
  \b
111
157
  Examples:
112
- corp-extractor "Apple announced a new iPhone."
113
- corp-extractor -f article.txt --json
114
- corp-extractor -f article.txt -o json --beams 8
115
- cat article.txt | corp-extractor -
116
- echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose
158
+ corp-extractor split "Apple announced a new iPhone."
159
+ corp-extractor split -f article.txt --json
160
+ corp-extractor split -f article.txt -o json --beams 8
161
+ cat article.txt | corp-extractor split -
162
+ echo "Tim Cook is CEO of Apple." | corp-extractor split - --verbose
117
163
 
118
164
  \b
119
165
  Output formats:
@@ -133,10 +179,7 @@ def main(
133
179
  # Get input text
134
180
  input_text = _get_input_text(text, input_file)
135
181
  if not input_text:
136
- raise click.UsageError(
137
- "No input provided. Use: statement-extractor \"text\", "
138
- "statement-extractor -f file.txt, or pipe via stdin."
139
- )
182
+ raise click.UsageError("No input provided. Provide text argument or use -f file.txt")
140
183
 
141
184
  if not quiet:
142
185
  click.echo(f"Processing {len(input_text)} characters...", err=True)
@@ -157,6 +200,13 @@ def main(
157
200
  # Configure scoring
158
201
  scoring_config = ScoringConfig(min_confidence=min_confidence)
159
202
 
203
+ # Parse predicates if provided
204
+ predicate_list = None
205
+ if predicates:
206
+ predicate_list = [p.strip() for p in predicates.split(",") if p.strip()]
207
+ if not quiet:
208
+ click.echo(f"Using predicate list: {predicate_list}", err=True)
209
+
160
210
  # Configure extraction options
161
211
  options = ExtractionOptions(
162
212
  num_beams=beams,
@@ -165,7 +215,8 @@ def main(
165
215
  deduplicate=not no_dedup,
166
216
  embedding_dedup=not no_embeddings,
167
217
  merge_beams=not no_merge,
168
- use_spacy_extraction=not no_spacy,
218
+ use_gliner_extraction=not no_gliner,
219
+ predicates=predicate_list,
169
220
  all_triples=all_triples,
170
221
  predicate_taxonomy=predicate_taxonomy,
171
222
  predicate_config=predicate_config,
@@ -200,6 +251,394 @@ def main(
200
251
  raise click.ClickException(f"Extraction failed: {e}")
201
252
 
202
253
 
254
+ # =============================================================================
255
+ # Pipeline command
256
+ # =============================================================================
257
+
258
+ @main.command("pipeline")
259
+ @click.argument("text", required=False)
260
+ @click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
261
+ @click.option(
262
+ "--stages",
263
+ type=str,
264
+ default="1-6",
265
+ help="Stages to run (e.g., '1,2,3' or '1-3' or '1-6')"
266
+ )
267
+ @click.option(
268
+ "--skip-stages",
269
+ type=str,
270
+ default=None,
271
+ help="Stages to skip (e.g., '4,5')"
272
+ )
273
+ @click.option(
274
+ "--plugins",
275
+ "enabled_plugins",
276
+ type=str,
277
+ default=None,
278
+ help="Plugins to enable (comma-separated names)"
279
+ )
280
+ @click.option(
281
+ "--disable-plugins",
282
+ type=str,
283
+ default=None,
284
+ help="Plugins to disable (comma-separated names)"
285
+ )
286
+ @click.option(
287
+ "--no-default-predicates",
288
+ is_flag=True,
289
+ help="Disable default predicate taxonomy (GLiNER2 will only use entity extraction)"
290
+ )
291
+ @click.option(
292
+ "-o", "--output",
293
+ type=click.Choice(["table", "json", "yaml", "triples"], case_sensitive=False),
294
+ default="table",
295
+ help="Output format (default: table)"
296
+ )
297
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
298
+ @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
299
+ def pipeline_cmd(
300
+ text: Optional[str],
301
+ input_file: Optional[str],
302
+ stages: str,
303
+ skip_stages: Optional[str],
304
+ enabled_plugins: Optional[str],
305
+ disable_plugins: Optional[str],
306
+ no_default_predicates: bool,
307
+ output: str,
308
+ verbose: bool,
309
+ quiet: bool,
310
+ ):
311
+ """
312
+ Run the full 5-stage extraction pipeline.
313
+
314
+ \b
315
+ Stages:
316
+ 1. Splitting - Text → Raw triples (T5-Gemma)
317
+ 2. Extraction - Raw triples → Typed statements (GLiNER2)
318
+ 3. Qualification - Add qualifiers and identifiers
319
+ 4. Canonicalization - Resolve to canonical forms
320
+ 5. Labeling - Apply sentiment, relation type, confidence
321
+
322
+ \b
323
+ Examples:
324
+ corp-extractor pipeline "Apple CEO Tim Cook announced..."
325
+ corp-extractor pipeline -f article.txt --stages 1-3
326
+ corp-extractor pipeline "..." --plugins gleif,companies_house
327
+ corp-extractor pipeline "..." --disable-plugins sec_edgar
328
+ """
329
+ _configure_logging(verbose)
330
+
331
+ # Get input text
332
+ input_text = _get_input_text(text, input_file)
333
+ if not input_text:
334
+ raise click.UsageError("No input provided. Provide text argument or use -f file.txt")
335
+
336
+ if not quiet:
337
+ click.echo(f"Processing {len(input_text)} characters through pipeline...", err=True)
338
+
339
+ # Import pipeline components (also loads plugins)
340
+ from .pipeline import ExtractionPipeline, PipelineConfig
341
+ _load_all_plugins()
342
+
343
+ # Parse stages
344
+ enabled_stages = _parse_stages(stages)
345
+ if skip_stages:
346
+ skip_set = _parse_stages(skip_stages)
347
+ enabled_stages = enabled_stages - skip_set
348
+
349
+ if not quiet:
350
+ click.echo(f"Running stages: {sorted(enabled_stages)}", err=True)
351
+
352
+ # Parse plugin selection
353
+ enabled_plugin_set = None
354
+ if enabled_plugins:
355
+ enabled_plugin_set = {p.strip() for p in enabled_plugins.split(",") if p.strip()}
356
+
357
+ disabled_plugin_set = set()
358
+ if disable_plugins:
359
+ disabled_plugin_set = {p.strip() for p in disable_plugins.split(",") if p.strip()}
360
+
361
+ # Build extractor options
362
+ extractor_options = {}
363
+ if no_default_predicates:
364
+ extractor_options["use_default_predicates"] = False
365
+ if not quiet:
366
+ click.echo("Default predicates disabled - using entity extraction only", err=True)
367
+
368
+ # Create config
369
+ config = PipelineConfig(
370
+ enabled_stages=enabled_stages,
371
+ enabled_plugins=enabled_plugin_set,
372
+ disabled_plugins=disabled_plugin_set,
373
+ extractor_options=extractor_options,
374
+ )
375
+
376
+ # Run pipeline
377
+ try:
378
+ pipeline = ExtractionPipeline(config)
379
+ ctx = pipeline.process(input_text)
380
+
381
+ # Output results
382
+ if output == "json":
383
+ _print_pipeline_json(ctx)
384
+ elif output == "yaml":
385
+ _print_pipeline_yaml(ctx)
386
+ elif output == "triples":
387
+ _print_pipeline_triples(ctx)
388
+ else:
389
+ _print_pipeline_table(ctx, verbose)
390
+
391
+ # Report errors/warnings
392
+ if ctx.processing_errors and not quiet:
393
+ click.echo(f"\nErrors: {len(ctx.processing_errors)}", err=True)
394
+ for error in ctx.processing_errors:
395
+ click.echo(f" - {error}", err=True)
396
+
397
+ if ctx.processing_warnings and verbose:
398
+ click.echo(f"\nWarnings: {len(ctx.processing_warnings)}", err=True)
399
+ for warning in ctx.processing_warnings:
400
+ click.echo(f" - {warning}", err=True)
401
+
402
+ except Exception as e:
403
+ logging.exception("Pipeline error:")
404
+ raise click.ClickException(f"Pipeline failed: {e}")
405
+
406
+
407
+ def _parse_stages(stages_str: str) -> set[int]:
408
+ """Parse stage string like '1,2,3' or '1-3' into a set of ints."""
409
+ result = set()
410
+ for part in stages_str.split(","):
411
+ part = part.strip()
412
+ if "-" in part:
413
+ start, end = part.split("-", 1)
414
+ for i in range(int(start), int(end) + 1):
415
+ result.add(i)
416
+ else:
417
+ result.add(int(part))
418
+ return result
419
+
420
+
421
+ def _print_pipeline_json(ctx):
422
+ """Print pipeline results as JSON."""
423
+ output = {
424
+ "statement_count": ctx.statement_count,
425
+ "raw_triples": [t.model_dump() for t in ctx.raw_triples],
426
+ "statements": [s.model_dump() for s in ctx.statements],
427
+ "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
428
+ "timings": ctx.stage_timings,
429
+ "warnings": ctx.processing_warnings,
430
+ "errors": ctx.processing_errors,
431
+ }
432
+ click.echo(json.dumps(output, indent=2, default=str))
433
+
434
+
435
+ def _print_pipeline_yaml(ctx):
436
+ """Print pipeline results as YAML."""
437
+ try:
438
+ import yaml
439
+ output = {
440
+ "statement_count": ctx.statement_count,
441
+ "statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
442
+ "timings": ctx.stage_timings,
443
+ }
444
+ click.echo(yaml.dump(output, default_flow_style=False))
445
+ except ImportError:
446
+ click.echo("YAML output requires PyYAML: pip install pyyaml", err=True)
447
+ _print_pipeline_json(ctx)
448
+
449
+
450
+ def _print_pipeline_triples(ctx):
451
+ """Print pipeline results as simple triples."""
452
+ if ctx.labeled_statements:
453
+ for stmt in ctx.labeled_statements:
454
+ click.echo(f"{stmt.subject_fqn}\t{stmt.statement.predicate}\t{stmt.object_fqn}")
455
+ elif ctx.statements:
456
+ for stmt in ctx.statements:
457
+ click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
458
+ elif ctx.raw_triples:
459
+ for triple in ctx.raw_triples:
460
+ click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
461
+
462
+
463
+ def _print_pipeline_table(ctx, verbose: bool):
464
+ """Print pipeline results in table format."""
465
+ # Try labeled statements first, then statements, then raw triples
466
+ if ctx.labeled_statements:
467
+ click.echo(f"\nExtracted {len(ctx.labeled_statements)} statement(s):\n")
468
+ click.echo("-" * 80)
469
+
470
+ for i, stmt in enumerate(ctx.labeled_statements, 1):
471
+ click.echo(f"{i}. {stmt.subject_fqn}")
472
+ click.echo(f" --[{stmt.statement.predicate}]-->")
473
+ click.echo(f" {stmt.object_fqn}")
474
+
475
+ # Show labels (always in recent versions, not just verbose)
476
+ for label in stmt.labels:
477
+ if isinstance(label.label_value, float):
478
+ click.echo(f" {label.label_type}: {label.label_value:.3f}")
479
+ else:
480
+ click.echo(f" {label.label_type}: {label.label_value}")
481
+
482
+ # Show top taxonomy results (sorted by confidence)
483
+ if stmt.taxonomy_results:
484
+ sorted_taxonomy = sorted(stmt.taxonomy_results, key=lambda t: t.confidence, reverse=True)
485
+ top_taxonomy = sorted_taxonomy[:5] # Show top 5
486
+ taxonomy_strs = [f"{t.category}:{t.label} ({t.confidence:.2f})" for t in top_taxonomy]
487
+ click.echo(f" topics: {', '.join(taxonomy_strs)}")
488
+ if len(sorted_taxonomy) > 5:
489
+ click.echo(f" ... and {len(sorted_taxonomy) - 5} more topics")
490
+
491
+ if verbose and stmt.statement.source_text:
492
+ source = stmt.statement.source_text[:60] + "..." if len(stmt.statement.source_text) > 60 else stmt.statement.source_text
493
+ click.echo(f" Source: \"{source}\"")
494
+
495
+ click.echo("-" * 80)
496
+
497
+ elif ctx.statements:
498
+ click.echo(f"\nExtracted {len(ctx.statements)} statement(s):\n")
499
+ click.echo("-" * 80)
500
+
501
+ for i, stmt in enumerate(ctx.statements, 1):
502
+ subj_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
503
+ obj_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""
504
+
505
+ click.echo(f"{i}. {stmt.subject.text}{subj_type}")
506
+ click.echo(f" --[{stmt.predicate}]-->")
507
+ click.echo(f" {stmt.object.text}{obj_type}")
508
+
509
+ if verbose and stmt.confidence_score is not None:
510
+ click.echo(f" Confidence: {stmt.confidence_score:.2f}")
511
+
512
+ click.echo("-" * 80)
513
+
514
+ elif ctx.raw_triples:
515
+ click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
516
+ click.echo("-" * 80)
517
+
518
+ for i, triple in enumerate(ctx.raw_triples, 1):
519
+ click.echo(f"{i}. {triple.subject_text}")
520
+ click.echo(f" --[{triple.predicate_text}]-->")
521
+ click.echo(f" {triple.object_text}")
522
+
523
+ if verbose:
524
+ click.echo(f" Confidence: {triple.confidence:.2f}")
525
+ if triple.source_sentence:
526
+ source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
527
+ click.echo(f" Source: \"{source}\"")
528
+
529
+ click.echo("-" * 80)
530
+
531
+ else:
532
+ click.echo("No statements extracted.")
533
+ return
534
+
535
+ # Show timings in verbose mode
536
+ if verbose and ctx.stage_timings:
537
+ click.echo("\nStage timings:")
538
+ for stage, duration in ctx.stage_timings.items():
539
+ click.echo(f" {stage}: {duration:.3f}s")
540
+
541
+
542
+ # =============================================================================
543
+ # Plugins command
544
+ # =============================================================================
545
+
546
+ @main.command("plugins")
547
+ @click.argument("action", type=click.Choice(["list", "info"]))
548
+ @click.argument("plugin_name", required=False)
549
+ @click.option("--stage", type=int, help="Filter by stage number (1-5)")
550
+ def plugins_cmd(action: str, plugin_name: Optional[str], stage: Optional[int]):
551
+ """
552
+ List or inspect available plugins.
553
+
554
+ \b
555
+ Actions:
556
+ list List all available plugins
557
+ info Show details about a specific plugin
558
+
559
+ \b
560
+ Examples:
561
+ corp-extractor plugins list
562
+ corp-extractor plugins list --stage 3
563
+ corp-extractor plugins info gleif_qualifier
564
+ """
565
+ # Import and load plugins
566
+ _load_all_plugins()
567
+
568
+ from .pipeline.registry import PluginRegistry
569
+
570
+ if action == "list":
571
+ plugins = PluginRegistry.list_plugins(stage=stage)
572
+ if not plugins:
573
+ click.echo("No plugins registered.")
574
+ return
575
+
576
+ # Group by stage
577
+ by_stage: dict[int, list] = {}
578
+ for plugin in plugins:
579
+ stage_num = plugin["stage"]
580
+ if stage_num not in by_stage:
581
+ by_stage[stage_num] = []
582
+ by_stage[stage_num].append(plugin)
583
+
584
+ for stage_num in sorted(by_stage.keys()):
585
+ stage_plugins = by_stage[stage_num]
586
+ stage_name = stage_plugins[0]["stage_name"]
587
+ click.echo(f"\nStage {stage_num}: {stage_name.title()}")
588
+ click.echo("-" * 40)
589
+
590
+ for p in stage_plugins:
591
+ entity_types = p.get("entity_types", [])
592
+ types_str = f" ({', '.join(entity_types)})" if entity_types else ""
593
+ click.echo(f" {p['name']}{types_str} [priority: {p['priority']}]")
594
+
595
+ elif action == "info":
596
+ if not plugin_name:
597
+ raise click.UsageError("Plugin name required for 'info' action")
598
+
599
+ plugin = PluginRegistry.get_plugin(plugin_name)
600
+ if not plugin:
601
+ raise click.ClickException(f"Plugin not found: {plugin_name}")
602
+
603
+ click.echo(f"\nPlugin: {plugin.name}")
604
+ click.echo(f"Priority: {plugin.priority}")
605
+ click.echo(f"Capabilities: {plugin.capabilities.name if plugin.capabilities else 'NONE'}")
606
+
607
+ if plugin.description:
608
+ click.echo(f"Description: {plugin.description}")
609
+
610
+ if hasattr(plugin, "supported_entity_types"):
611
+ types = [t.value for t in plugin.supported_entity_types]
612
+ click.echo(f"Entity types: {', '.join(types)}")
613
+
614
+ if hasattr(plugin, "label_type"):
615
+ click.echo(f"Label type: {plugin.label_type}")
616
+
617
+ if hasattr(plugin, "supported_identifier_types"):
618
+ ids = plugin.supported_identifier_types
619
+ if ids:
620
+ click.echo(f"Supported identifiers: {', '.join(ids)}")
621
+
622
+ if hasattr(plugin, "provided_identifier_types"):
623
+ ids = plugin.provided_identifier_types
624
+ if ids:
625
+ click.echo(f"Provided identifiers: {', '.join(ids)}")
626
+
627
+
628
+ def _load_all_plugins():
629
+ """Load all plugins by importing their modules."""
630
+ # Import all plugin modules to trigger registration
631
+ try:
632
+ from .plugins import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
633
+ # The @PluginRegistry decorators will register plugins on import
634
+ except ImportError as e:
635
+ logging.debug(f"Some plugins failed to load: {e}")
636
+
637
+
638
+ # =============================================================================
639
+ # Helper functions
640
+ # =============================================================================
641
+
203
642
  def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
204
643
  """Get input text from argument, file, or stdin."""
205
644
  if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):