corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -2,13 +2,16 @@
 Command-line interface for statement extraction.
 
 Usage:
-    corp-extractor "Your text here"
-    corp-extractor -f input.txt
-    cat input.txt | corp-extractor -
+    corp-extractor split "Your text here"
+    corp-extractor split -f input.txt
+    corp-extractor pipeline "Your text here" --stages 1-5
+    corp-extractor plugins list
 """
 
+import json
 import logging
 import sys
+from pathlib import Path
 from typing import Optional
 
 import click
@@ -35,9 +38,36 @@ def _configure_logging(verbose: bool) -> None:
         "statement_extractor.predicate_comparer",
         "statement_extractor.canonicalization",
         "statement_extractor.gliner_extraction",
+        "statement_extractor.pipeline",
+        "statement_extractor.plugins",
+        "statement_extractor.plugins.extractors.gliner2",
+        "statement_extractor.plugins.splitters",
+        "statement_extractor.plugins.labelers",
+        "statement_extractor.plugins.scrapers",
+        "statement_extractor.plugins.scrapers.http",
+        "statement_extractor.plugins.pdf",
+        "statement_extractor.plugins.pdf.pypdf",
+        "statement_extractor.document",
+        "statement_extractor.document.loader",
+        "statement_extractor.document.html_extractor",
+        "statement_extractor.document.pipeline",
+        "statement_extractor.document.chunker",
     ]:
         logging.getLogger(logger_name).setLevel(level)
 
+    # Suppress noisy third-party loggers
+    for noisy_logger in [
+        "httpcore",
+        "httpcore.http11",
+        "httpcore.connection",
+        "httpx",
+        "urllib3",
+        "huggingface_hub",
+        "asyncio",
+    ]:
+        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
+
+
 from . import __version__
 from .models import (
     ExtractionOptions,
@@ -47,7 +77,36 @@ from .models import (
 )
 
 
-@click.command()
+@click.group()
+@click.version_option(version=__version__)
+def main():
+    """
+    Extract structured statements from text.
+
+    \b
+    Commands:
+      split     Extract sub-statements from text (simple, fast)
+      pipeline  Run the full 6-stage extraction pipeline
+      document  Process documents with chunking and citations
+      plugins   List or inspect available plugins
+      db        Manage entity/organization embedding database
+
+    \b
+    Examples:
+      corp-extractor split "Apple announced a new iPhone."
+      corp-extractor split -f article.txt --json
+      corp-extractor pipeline "Apple CEO Tim Cook announced..." --stages 1-3
+      corp-extractor document process report.txt --title "Annual Report"
+      corp-extractor plugins list
+    """
+    pass
+
+
+# =============================================================================
+# Split command (simple extraction)
+# =============================================================================
+
+@main.command("split")
 @click.argument("text", required=False)
 @click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
 @click.option(
@@ -80,8 +139,7 @@ from .models import (
 # Output options
 @click.option("-v", "--verbose", is_flag=True, help="Show verbose output with confidence scores")
 @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
-@click.version_option(version=__version__)
-def main(
+def split_cmd(
     text: Optional[str],
     input_file: Optional[str],
     output: str,
@@ -105,17 +163,18 @@ def main(
     quiet: bool,
 ):
     """
-    Extract structured statements from text.
+    Extract sub-statements from text using T5-Gemma model.
 
-    TEXT can be provided as an argument, read from a file with -f, or piped via stdin.
+    This command splits text into structured subject-predicate-object triples.
+    It's fast and simple - use 'pipeline' for full entity resolution.
 
     \b
    Examples:
-      corp-extractor "Apple announced a new iPhone."
-      corp-extractor -f article.txt --json
-      corp-extractor -f article.txt -o json --beams 8
-      cat article.txt | corp-extractor -
-      echo "Tim Cook is CEO of Apple." | corp-extractor - --verbose
+      corp-extractor split "Apple announced a new iPhone."
+      corp-extractor split -f article.txt --json
+      corp-extractor split -f article.txt -o json --beams 8
+      cat article.txt | corp-extractor split -
+      echo "Tim Cook is CEO of Apple." | corp-extractor split - --verbose
 
     \b
     Output formats:
@@ -135,10 +194,7 @@ def main(
     # Get input text
     input_text = _get_input_text(text, input_file)
     if not input_text:
-        raise click.UsageError(
-            "No input provided. Use: statement-extractor \"text\", "
-            "statement-extractor -f file.txt, or pipe via stdin."
-        )
+        raise click.UsageError("No input provided. Provide text argument or use -f file.txt")
 
     if not quiet:
         click.echo(f"Processing {len(input_text)} characters...", err=True)
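
The structural change running through these hunks is the promotion of `main` from a single `@click.command()` to a `@click.group()` that subcommands attach to. A minimal sketch of the pattern (plain click, not this package's code):

    import click

    @click.group()
    def main():
        """Top-level entry point; subcommands register against this group."""

    @main.command("split")
    @click.argument("text", required=False)
    def split_cmd(text):
        # Invoked as: corp-extractor split "some text"
        click.echo(f"splitting: {text}")

    if __name__ == "__main__":
        main()
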
@@ -210,6 +266,1596 @@ def main(
         raise click.ClickException(f"Extraction failed: {e}")
 
 
+# =============================================================================
+# Pipeline command
+# =============================================================================
+
+@main.command("pipeline")
+@click.argument("text", required=False)
+@click.option("-f", "--file", "input_file", type=click.Path(exists=True), help="Read input from file")
+@click.option(
+    "--stages",
+    type=str,
+    default="1-6",
+    help="Stages to run (e.g., '1,2,3' or '1-3' or '1-6')"
+)
+@click.option(
+    "--skip-stages",
+    type=str,
+    default=None,
+    help="Stages to skip (e.g., '4,5')"
+)
+@click.option(
+    "--plugins",
+    "enabled_plugins",
+    type=str,
+    default=None,
+    help="Plugins to enable (comma-separated names)"
+)
+@click.option(
+    "--disable-plugins",
+    type=str,
+    default=None,
+    help="Plugins to disable (comma-separated names)"
+)
+@click.option(
+    "--no-default-predicates",
+    is_flag=True,
+    help="Disable default predicate taxonomy (GLiNER2 will only use entity extraction)"
+)
+@click.option(
+    "-o", "--output",
+    type=click.Choice(["table", "json", "yaml", "triples"], case_sensitive=False),
+    default="table",
+    help="Output format (default: table)"
+)
+@click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
+@click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
+def pipeline_cmd(
+    text: Optional[str],
+    input_file: Optional[str],
+    stages: str,
+    skip_stages: Optional[str],
+    enabled_plugins: Optional[str],
+    disable_plugins: Optional[str],
+    no_default_predicates: bool,
+    output: str,
+    verbose: bool,
+    quiet: bool,
+):
+    """
+    Run the full 5-stage extraction pipeline.
+
+    \b
+    Stages:
+      1. Splitting        - Text → Raw triples (T5-Gemma)
+      2. Extraction       - Raw triples → Typed statements (GLiNER2)
+      3. Qualification    - Add qualifiers and identifiers
+      4. Canonicalization - Resolve to canonical forms
+      5. Labeling         - Apply sentiment, relation type, confidence
+
+    \b
+    Examples:
+      corp-extractor pipeline "Apple CEO Tim Cook announced..."
+      corp-extractor pipeline -f article.txt --stages 1-3
+      corp-extractor pipeline "..." --plugins gleif,companies_house
+      corp-extractor pipeline "..." --disable-plugins sec_edgar
+    """
+    _configure_logging(verbose)
+
+    # Get input text
+    input_text = _get_input_text(text, input_file)
+    if not input_text:
+        raise click.UsageError("No input provided. Provide text argument or use -f file.txt")
+
+    if not quiet:
+        click.echo(f"Processing {len(input_text)} characters through pipeline...", err=True)
+
+    # Import pipeline components (also loads plugins)
+    from .pipeline import ExtractionPipeline, PipelineConfig
+    _load_all_plugins()
+
+    # Parse stages
+    enabled_stages = _parse_stages(stages)
+    if skip_stages:
+        skip_set = _parse_stages(skip_stages)
+        enabled_stages = enabled_stages - skip_set
+
+    if not quiet:
+        click.echo(f"Running stages: {sorted(enabled_stages)}", err=True)
+
+    # Parse plugin selection
+    enabled_plugin_set = None
+    if enabled_plugins:
+        enabled_plugin_set = {p.strip() for p in enabled_plugins.split(",") if p.strip()}
+
+    disabled_plugin_set = None
+    if disable_plugins:
+        disabled_plugin_set = {p.strip() for p in disable_plugins.split(",") if p.strip()}
+
+    # Build extractor options
+    extractor_options = {}
+    if no_default_predicates:
+        extractor_options["use_default_predicates"] = False
+        if not quiet:
+            click.echo("Default predicates disabled - using entity extraction only", err=True)
+
+    # Create config - only pass disabled_plugins if user explicitly specified, otherwise use defaults
+    config_kwargs: dict = {
+        "enabled_stages": enabled_stages,
+        "enabled_plugins": enabled_plugin_set,
+        "extractor_options": extractor_options,
+    }
+    if disabled_plugin_set is not None:
+        config_kwargs["disabled_plugins"] = disabled_plugin_set
+    config = PipelineConfig(**config_kwargs)
+
+    # Run pipeline
+    try:
+        pipeline = ExtractionPipeline(config)
+        ctx = pipeline.process(input_text)
+
+        # Output results
+        if output == "json":
+            _print_pipeline_json(ctx)
+        elif output == "yaml":
+            _print_pipeline_yaml(ctx)
+        elif output == "triples":
+            _print_pipeline_triples(ctx)
+        else:
+            _print_pipeline_table(ctx, verbose)
+
+        # Report errors/warnings
+        if ctx.processing_errors and not quiet:
+            click.echo(f"\nErrors: {len(ctx.processing_errors)}", err=True)
+            for error in ctx.processing_errors:
+                click.echo(f"  - {error}", err=True)
+
+        if ctx.processing_warnings and verbose:
+            click.echo(f"\nWarnings: {len(ctx.processing_warnings)}", err=True)
+            for warning in ctx.processing_warnings:
+                click.echo(f"  - {warning}", err=True)
+
+    except Exception as e:
+        logging.exception("Pipeline error:")
+        raise click.ClickException(f"Pipeline failed: {e}")
+
+
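
The command body above doubles as a recipe for driving the pipeline from Python. A minimal sketch using only names that appear in this diff (`PipelineConfig`, `ExtractionPipeline`, `process`, and the context fields the printers read); treat the exact signatures as assumptions:

    # Sketch: programmatic pipeline use; signatures assumed from the CLI code above
    from statement_extractor.pipeline import ExtractionPipeline, PipelineConfig

    config = PipelineConfig(
        enabled_stages={1, 2, 3},   # splitting, extraction, qualification only
        enabled_plugins=None,       # None = use all registered plugins
        extractor_options={},
    )
    pipeline = ExtractionPipeline(config)
    ctx = pipeline.process("Apple CEO Tim Cook announced a new iPhone.")

    for stmt in ctx.statements:    # typed statements produced by stage 2
        print(stmt.subject.text, stmt.predicate, stmt.object.text)
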
+def _parse_stages(stages_str: str) -> set[int]:
+    """Parse stage string like '1,2,3' or '1-3' into a set of ints."""
+    result = set()
+    for part in stages_str.split(","):
+        part = part.strip()
+        if "-" in part:
+            start, end = part.split("-", 1)
+            for i in range(int(start), int(end) + 1):
+                result.add(i)
+        else:
+            result.add(int(part))
+    return result
+
+
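
`_parse_stages` accepts comma lists, ranges, and mixtures of the two; from the implementation above:

    _parse_stages("1-3")    # {1, 2, 3}
    _parse_stages("1,4-5")  # {1, 4, 5}
    _parse_stages("2")      # {2}

Note that `--skip-stages` is applied as a set difference after this parse, so `--stages 1-6 --skip-stages 4,5` runs {1, 2, 3, 6}.
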
+def _print_pipeline_json(ctx):
+    """Print pipeline results as JSON."""
+    output = {
+        "statement_count": ctx.statement_count,
+        "raw_triples": [t.model_dump() for t in ctx.raw_triples],
+        "statements": [s.model_dump() for s in ctx.statements],
+        "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
+        "timings": ctx.stage_timings,
+        "warnings": ctx.processing_warnings,
+        "errors": ctx.processing_errors,
+    }
+    click.echo(json.dumps(output, indent=2, default=str))
+
+
+def _print_pipeline_yaml(ctx):
+    """Print pipeline results as YAML."""
+    try:
+        import yaml
+        output = {
+            "statement_count": ctx.statement_count,
+            "statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
+            "timings": ctx.stage_timings,
+        }
+        click.echo(yaml.dump(output, default_flow_style=False))
+    except ImportError:
+        click.echo("YAML output requires PyYAML: pip install pyyaml", err=True)
+        _print_pipeline_json(ctx)
+
+
+def _print_pipeline_triples(ctx):
+    """Print pipeline results as simple triples."""
+    if ctx.labeled_statements:
+        for stmt in ctx.labeled_statements:
+            click.echo(f"{stmt.subject_fqn}\t{stmt.statement.predicate}\t{stmt.object_fqn}")
+    elif ctx.statements:
+        for stmt in ctx.statements:
+            click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
+    elif ctx.raw_triples:
+        for triple in ctx.raw_triples:
+            click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
+
+
+def _print_pipeline_table(ctx, verbose: bool):
+    """Print pipeline results in table format."""
+    # Try labeled statements first, then statements, then raw triples
+    if ctx.labeled_statements:
+        click.echo(f"\nExtracted {len(ctx.labeled_statements)} statement(s):\n")
+        click.echo("-" * 80)
+
+        for i, stmt in enumerate(ctx.labeled_statements, 1):
+            click.echo(f"{i}. {stmt.subject_fqn}")
+            click.echo(f"     --[{stmt.statement.predicate}]-->")
+            click.echo(f"   {stmt.object_fqn}")
+
+            # Show labels (always in recent versions, not just verbose)
+            for label in stmt.labels:
+                if isinstance(label.label_value, float):
+                    click.echo(f"   {label.label_type}: {label.label_value:.3f}")
+                else:
+                    click.echo(f"   {label.label_type}: {label.label_value}")
+
+            # Show top taxonomy results (sorted by confidence)
+            if stmt.taxonomy_results:
+                sorted_taxonomy = sorted(stmt.taxonomy_results, key=lambda t: t.confidence, reverse=True)
+                top_taxonomy = sorted_taxonomy[:5]  # Show top 5
+                taxonomy_strs = [f"{t.category}:{t.label} ({t.confidence:.2f})" for t in top_taxonomy]
+                click.echo(f"   topics: {', '.join(taxonomy_strs)}")
+                if len(sorted_taxonomy) > 5:
+                    click.echo(f"   ... and {len(sorted_taxonomy) - 5} more topics")
+
+            if verbose and stmt.statement.source_text:
+                source = stmt.statement.source_text[:60] + "..." if len(stmt.statement.source_text) > 60 else stmt.statement.source_text
+                click.echo(f"   Source: \"{source}\"")
+
+            click.echo("-" * 80)
+
+    elif ctx.statements:
+        click.echo(f"\nExtracted {len(ctx.statements)} statement(s):\n")
+        click.echo("-" * 80)
+
+        for i, stmt in enumerate(ctx.statements, 1):
+            subj_type = f" ({stmt.subject.type.value})" if stmt.subject.type.value != "UNKNOWN" else ""
+            obj_type = f" ({stmt.object.type.value})" if stmt.object.type.value != "UNKNOWN" else ""
+
+            click.echo(f"{i}. {stmt.subject.text}{subj_type}")
+            click.echo(f"     --[{stmt.predicate}]-->")
+            click.echo(f"   {stmt.object.text}{obj_type}")
+
+            if verbose and stmt.confidence_score is not None:
+                click.echo(f"   Confidence: {stmt.confidence_score:.2f}")
+
+            click.echo("-" * 80)
+
+    elif ctx.raw_triples:
+        click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
+        click.echo("-" * 80)
+
+        for i, triple in enumerate(ctx.raw_triples, 1):
+            click.echo(f"{i}. {triple.subject_text}")
+            click.echo(f"     --[{triple.predicate_text}]-->")
+            click.echo(f"   {triple.object_text}")
+
+            if verbose:
+                click.echo(f"   Confidence: {triple.confidence:.2f}")
+                if triple.source_sentence:
+                    source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
+                    click.echo(f"   Source: \"{source}\"")
+
+            click.echo("-" * 80)
+
+    else:
+        click.echo("No statements extracted.")
+        return
+
+    # Show timings in verbose mode
+    if verbose and ctx.stage_timings:
+        click.echo("\nStage timings:")
+        for stage, duration in ctx.stage_timings.items():
+            click.echo(f"  {stage}: {duration:.3f}s")
+
+
+# =============================================================================
+# Plugins command
+# =============================================================================
+
+@main.command("plugins")
+@click.argument("action", type=click.Choice(["list", "info"]))
+@click.argument("plugin_name", required=False)
+@click.option("--stage", type=int, help="Filter by stage number (1-5)")
+def plugins_cmd(action: str, plugin_name: Optional[str], stage: Optional[int]):
+    """
+    List or inspect available plugins.
+
+    \b
+    Actions:
+      list  List all available plugins
+      info  Show details about a specific plugin
+
+    \b
+    Examples:
+      corp-extractor plugins list
+      corp-extractor plugins list --stage 3
+      corp-extractor plugins info gleif_qualifier
+    """
+    # Import and load plugins
+    _load_all_plugins()
+
+    from .pipeline.registry import PluginRegistry
+
+    if action == "list":
+        plugins = PluginRegistry.list_plugins(stage=stage)
+        if not plugins:
+            click.echo("No plugins registered.")
+            return
+
+        # Group by stage
+        by_stage: dict[int, list] = {}
+        for plugin in plugins:
+            stage_num = plugin["stage"]
+            if stage_num not in by_stage:
+                by_stage[stage_num] = []
+            by_stage[stage_num].append(plugin)
+
+        for stage_num in sorted(by_stage.keys()):
+            stage_plugins = by_stage[stage_num]
+            stage_name = stage_plugins[0]["stage_name"]
+            click.echo(f"\nStage {stage_num}: {stage_name.title()}")
+            click.echo("-" * 40)
+
+            for p in stage_plugins:
+                entity_types = p.get("entity_types", [])
+                types_str = f" ({', '.join(entity_types)})" if entity_types else ""
+                click.echo(f"  {p['name']}{types_str} [priority: {p['priority']}]")
+
+    elif action == "info":
+        if not plugin_name:
+            raise click.UsageError("Plugin name required for 'info' action")
+
+        plugin = PluginRegistry.get_plugin(plugin_name)
+        if not plugin:
+            raise click.ClickException(f"Plugin not found: {plugin_name}")
+
+        click.echo(f"\nPlugin: {plugin.name}")
+        click.echo(f"Priority: {plugin.priority}")
+        click.echo(f"Capabilities: {plugin.capabilities.name if plugin.capabilities else 'NONE'}")
+
+        if plugin.description:
+            click.echo(f"Description: {plugin.description}")
+
+        if hasattr(plugin, "supported_entity_types"):
+            types = [t.value for t in plugin.supported_entity_types]
+            click.echo(f"Entity types: {', '.join(types)}")
+
+        if hasattr(plugin, "label_type"):
+            click.echo(f"Label type: {plugin.label_type}")
+
+        if hasattr(plugin, "supported_identifier_types"):
+            ids = plugin.supported_identifier_types
+            if ids:
+                click.echo(f"Supported identifiers: {', '.join(ids)}")
+
+        if hasattr(plugin, "provided_identifier_types"):
+            ids = plugin.provided_identifier_types
+            if ids:
+                click.echo(f"Provided identifiers: {', '.join(ids)}")
+
+
+def _load_all_plugins():
+    """Load all plugins by importing their modules."""
+    # Import all plugin modules to trigger registration
+    try:
+        from .plugins import splitters, extractors, qualifiers, labelers, taxonomy
+        # The @PluginRegistry decorators will register plugins on import
+        _ = splitters, extractors, qualifiers, labelers, taxonomy  # Silence unused warnings
+    except ImportError as e:
+        logging.debug(f"Some plugins failed to load: {e}")
+
+
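
Discovery is import-driven: `_load_all_plugins` only imports the plugin packages, and registration happens as a side effect. A rough sketch of what a self-registering plugin module might look like, inferred from the `PluginRegistry` reads above (the decorator name and fields are assumptions, not this package's confirmed API):

    # Hypothetical plugin module; registration as an import side effect
    from statement_extractor.pipeline.registry import PluginRegistry

    @PluginRegistry.register  # assumed decorator; the diff only shows registry reads
    class MyQualifier:
        name = "my_qualifier"
        priority = 50
        description = "Example qualifier registered on import"
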
+# =============================================================================
+# Database commands
+# =============================================================================
+
+@main.group("db")
+def db_cmd():
+    """
+    Manage entity/organization embedding database.
+
+    \b
+    Commands:
+      import-gleif            Import GLEIF LEI data (~3M records)
+      import-sec              Import SEC Edgar bulk data (~100K+ filers)
+      import-companies-house  Import UK Companies House (~5M records)
+      import-wikidata         Import Wikidata organizations
+      import-people           Import Wikidata notable people
+      status                  Show database status
+      search                  Search for an organization
+      search-people           Search for a person
+      download                Download database from HuggingFace
+      upload                  Upload database with lite/compressed variants
+      create-lite             Create lite version (no record data)
+      compress                Compress database with gzip
+
+    \b
+    Examples:
+      corp-extractor db import-sec --download
+      corp-extractor db import-gleif --download --limit 100000
+      corp-extractor db import-people --all --limit 10000
+      corp-extractor db status
+      corp-extractor db search "Apple Inc"
+      corp-extractor db search-people "Tim Cook"
+      corp-extractor db upload entities.db
+    """
+    pass
+
+
+@db_cmd.command("gleif-info")
+def db_gleif_info():
+    """
+    Show information about the latest available GLEIF data file.
+
+    \b
+    Examples:
+      corp-extractor db gleif-info
+    """
+    from .database.importers import GleifImporter
+
+    importer = GleifImporter()
+
+    try:
+        info = importer.get_latest_file_info()
+        record_count = info.get('record_count')
+
+        click.echo("\nLatest GLEIF Data File")
+        click.echo("=" * 40)
+        click.echo(f"File ID: {info['id']}")
+        click.echo(f"Publish Date: {info['publish_date']}")
+        click.echo(f"Record Count: {record_count:,}" if record_count else "Record Count: unknown")
+
+        delta = info.get("delta_from_last_file", {})
+        if delta:
+            click.echo(f"\nChanges from previous file:")
+            if delta.get('new'):
+                click.echo(f"  New: {delta.get('new'):,}")
+            if delta.get('updated'):
+                click.echo(f"  Updated: {delta.get('updated'):,}")
+            if delta.get('retired'):
+                click.echo(f"  Retired: {delta.get('retired'):,}")
+
+    except Exception as e:
+        raise click.ClickException(f"Failed to get GLEIF info: {e}")
+
+
+@db_cmd.command("import-gleif")
+@click.argument("file_path", type=click.Path(exists=True), required=False)
+@click.option("--download", is_flag=True, help="Download latest GLEIF file before importing")
+@click.option("--force", is_flag=True, help="Force re-download even if cached")
+@click.option("--db", "db_path", type=click.Path(), help="Database path (default: ~/.cache/corp-extractor/entities.db)")
+@click.option("--limit", type=int, help="Limit number of records to import")
+@click.option("--batch-size", type=int, default=50000, help="Batch size for commits (default: 50000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_path: Optional[str], limit: Optional[int], batch_size: int, verbose: bool):
+    """
+    Import GLEIF LEI data into the entity database.
+
+    If no file path is provided and --download is set, downloads the latest
+    GLEIF data file automatically. Downloaded files are cached and reused
+    unless --force is specified.
+
+    \b
+    Examples:
+      corp-extractor db import-gleif /path/to/lei-records.xml
+      corp-extractor db import-gleif --download
+      corp-extractor db import-gleif --download --limit 10000
+      corp-extractor db import-gleif --download --force  # Re-download
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import GleifImporter
+
+    importer = GleifImporter()
+
+    # Handle file path
+    if file_path is None:
+        if not download:
+            raise click.UsageError("Either provide a file path or use --download to fetch the latest GLEIF data")
+        click.echo("Downloading latest GLEIF data...", err=True)
+        file_path = str(importer.download_latest(force=force))
+    elif download:
+        click.echo("Downloading latest GLEIF data (ignoring provided file path)...", err=True)
+        file_path = str(importer.download_latest(force=force))
+
+    click.echo(f"Importing GLEIF data from {file_path}...", err=True)
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in importer.import_from_file(file_path, limit=limit):
+        records.append(record)
+
+        if len(records) >= batch_size:
+            # Embed and insert batch
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} GLEIF records successfully.", err=True)
+    database.close()
+
+
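
The embed-and-insert loop above reappears nearly verbatim in the SEC, Wikidata, and Companies House importers below; only the record iterator and batch size change. Factored out, the shared pattern is roughly this (the helper name is ours, not the package's):

    def _insert_in_batches(record_iter, embedder, database, batch_size, echo):
        """Embed and insert records in fixed-size batches; returns total count."""
        batch, count = [], 0
        for record in record_iter:
            batch.append(record)
            if len(batch) >= batch_size:
                embeddings = embedder.embed_batch([r.name for r in batch])
                database.insert_batch(batch, embeddings)
                count += len(batch)
                echo(f"Imported {count} records...")
                batch = []
        if batch:  # final partial batch
            embeddings = embedder.embed_batch([r.name for r in batch])
            database.insert_batch(batch, embeddings)
            count += len(batch)
        return count
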
+@db_cmd.command("import-sec")
+@click.option("--download", is_flag=True, help="Download bulk submissions.zip (~500MB, ~100K+ filers)")
+@click.option("--file", "file_path", type=click.Path(exists=True), help="Local file (submissions.zip or company_tickers.json)")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=10000, help="Batch size (default: 10000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[str], limit: Optional[int], batch_size: int, verbose: bool):
+    """
+    Import SEC Edgar data into the entity database.
+
+    By default, downloads the bulk submissions.zip file which contains
+    ALL SEC filers (~100K+), not just companies with ticker symbols (~10K).
+
+    \b
+    Examples:
+      corp-extractor db import-sec --download
+      corp-extractor db import-sec --download --limit 50000
+      corp-extractor db import-sec --file /path/to/submissions.zip
+      corp-extractor db import-sec --file /path/to/company_tickers.json  # legacy
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import SecEdgarImporter
+
+    if not download and not file_path:
+        raise click.UsageError("Either --download or --file is required")
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+    importer = SecEdgarImporter()
+
+    # Get records
+    if file_path:
+        click.echo(f"Importing SEC Edgar data from {file_path}...", err=True)
+        record_iter = importer.import_from_file(file_path, limit=limit)
+    else:
+        click.echo("Downloading SEC submissions.zip (~500MB)...", err=True)
+        record_iter = importer.import_from_url(limit=limit)
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in record_iter:
+        records.append(record)
+
+        if len(records) >= batch_size:
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} SEC Edgar records successfully.", err=True)
+    database.close()
+
+
+@db_cmd.command("import-wikidata")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--type", "query_type", type=click.Choice(["lei", "ticker", "public", "business", "organization", "nonprofit", "government"]), default="lei",
+              help="Query type to use for fetching data")
+@click.option("--all", "import_all", is_flag=True, help="Run all query types sequentially")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
+    """
+    Import organization data from Wikidata via SPARQL.
+
+    Uses simplified SPARQL queries that avoid timeouts on Wikidata's endpoint.
+    Query types target different organization categories.
+
+    \b
+    Query types:
+      lei           Companies with LEI codes (fastest, most reliable)
+      ticker        Companies listed on stock exchanges
+      public        Direct instances of "public company" (Q891723)
+      business      Direct instances of "business enterprise" (Q4830453)
+      organization  All organizations (Q43229) - NGOs, associations, etc.
+      nonprofit     Non-profit organizations (Q163740)
+      government    Government agencies (Q327333)
+
+    \b
+    Examples:
+      corp-extractor db import-wikidata --limit 10
+      corp-extractor db import-wikidata --type organization --limit 1000
+      corp-extractor db import-wikidata --type nonprofit --limit 5000
+      corp-extractor db import-wikidata --all --limit 10000
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import WikidataImporter
+
+    click.echo(f"Importing Wikidata organization data via SPARQL (type={query_type}, all={import_all})...", err=True)
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+    importer = WikidataImporter(batch_size=500)  # Smaller SPARQL batch size for reliability
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
+        records.append(record)
+
+        if len(records) >= batch_size:
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} Wikidata records successfully.", err=True)
+    database.close()
+
+
+@db_cmd.command("import-people")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--type", "query_type", type=click.Choice([
+    "executive", "politician", "athlete", "artist",
+    "academic", "scientist", "journalist", "entrepreneur", "activist"
+]), default="executive", help="Person type to import")
+@click.option("--all", "import_all", is_flag=True, help="Run all person type queries sequentially")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
+    """
+    Import notable people data from Wikidata via SPARQL.
+
+    Imports people with English Wikipedia articles (ensures notability).
+    Includes executives, politicians, athletes, artists, academics, and more.
+
+    \b
+    Examples:
+      corp-extractor db import-people --type executive --limit 5000
+      corp-extractor db import-people --all --limit 10000
+      corp-extractor db import-people --type politician -v
+    """
+    _configure_logging(verbose)
+
+    from .database.store import get_person_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.wikidata_people import WikidataPeopleImporter
+
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+
+    click.echo(f"Importing Wikidata people to {db_path_obj}...", err=True)
+
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+    importer = WikidataPeopleImporter(batch_size=batch_size)
+
+    # Batch processing
+    records = []
+    count = 0
+
+    for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
+        records.append(record)
+
+        if len(records) >= batch_size:
+            # Generate embeddings using the combined name|role|org format
+            embedding_texts = [r.get_embedding_text() for r in records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"  Imported {count} people...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        embedding_texts = [r.get_embedding_text() for r in records]
+        embeddings = embedder.embed_batch(embedding_texts)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} people successfully.", err=True)
+    database.close()
+
+
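
Unlike the organization importers, people are embedded on a combined string rather than the bare name. Per the comment in the loop above, `get_embedding_text()` joins name, role, and organization with `|`; an illustrative (not actual) value:

    record.get_embedding_text()  # e.g. "Tim Cook|CEO|Apple Inc" (illustrative)

Embedding the role and organization alongside the name gives the vector search extra signal for disambiguating common names.
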
+@db_cmd.command("search-people")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--top-k", type=int, default=10, help="Number of results")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_search_people(query: str, db_path: Optional[str], top_k: int, verbose: bool):
+    """
+    Search for a person in the database.
+
+    \b
+    Examples:
+      corp-extractor db search-people "Tim Cook"
+      corp-extractor db search-people "Elon Musk" --top-k 5
+    """
+    _configure_logging(verbose)
+
+    from .database.store import get_person_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+
+    click.echo(f"Searching for '{query}' in {db_path_obj}...", err=True)
+
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+
+    # Embed query and search
+    query_embedding = embedder.embed(query)
+    results = database.search(query_embedding, top_k=top_k, query_text=query)
+
+    if not results:
+        click.echo("No results found.", err=True)
+        return
+
+    click.echo(f"\nFound {len(results)} results:\n")
+    for i, (record, similarity) in enumerate(results, 1):
+        role_str = f" ({record.known_for_role})" if record.known_for_role else ""
+        org_str = f" at {record.known_for_org}" if record.known_for_org else ""
+        country_str = f" [{record.country}]" if record.country else ""
+        click.echo(f"  {i}. {record.name}{role_str}{org_str}{country_str}")
+        click.echo(f"     Source: wikidata:{record.source_id}, Type: {record.person_type.value}, Score: {similarity:.3f}")
+        click.echo()
+
+    database.close()
+
+
+@db_cmd.command("import-companies-house")
+@click.option("--download", is_flag=True, help="Download bulk data file (free, no API key needed)")
+@click.option("--force", is_flag=True, help="Force re-download even if cached")
+@click.option("--file", "file_path", type=click.Path(exists=True), help="Local Companies House CSV/JSON file")
+@click.option("--search", "search_terms", type=str, help="Comma-separated search terms (requires API key)")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=50000, help="Batch size for commits (default: 50000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_companies_house(
+    download: bool,
+    force: bool,
+    file_path: Optional[str],
+    search_terms: Optional[str],
+    db_path: Optional[str],
+    limit: Optional[int],
+    batch_size: int,
+    verbose: bool,
+):
+    """
+    Import UK Companies House data into the entity database.
+
+    \b
+    Options:
+      --download  Download free bulk data (all UK companies, ~5M records)
+      --file      Import from local CSV/JSON file
+      --search    Search via API (requires COMPANIES_HOUSE_API_KEY)
+
+    \b
+    Examples:
+      corp-extractor db import-companies-house --download
+      corp-extractor db import-companies-house --download --limit 100000
+      corp-extractor db import-companies-house --file /path/to/companies.csv
+      corp-extractor db import-companies-house --search "bank,insurance"
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import CompaniesHouseImporter
+
+    if not file_path and not search_terms and not download:
+        raise click.UsageError("Either --download, --file, or --search is required")
+
+    click.echo("Importing Companies House data...", err=True)
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+    importer = CompaniesHouseImporter()
+
+    # Get records
+    if download:
+        # Download bulk data file
+        csv_path = importer.download_bulk_data(force=force)
+        click.echo(f"Using bulk data file: {csv_path}", err=True)
+        record_iter = importer.import_from_file(csv_path, limit=limit)
+    elif file_path:
+        record_iter = importer.import_from_file(file_path, limit=limit)
+    else:
+        terms = [t.strip() for t in search_terms.split(",") if t.strip()]
+        click.echo(f"Searching for: {terms}", err=True)
+        record_iter = importer.import_from_search(
+            search_terms=terms,
+            limit_per_term=limit or 100,
+            total_limit=limit,
+        )
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in record_iter:
+        records.append(record)
+
+        if len(records) >= batch_size:
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} Companies House records successfully.", err=True)
+    database.close()
+
+
+@db_cmd.command("status")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+def db_status(db_path: Optional[str]):
+    """
+    Show database status and statistics.
+
+    \b
+    Examples:
+      corp-extractor db status
+      corp-extractor db status --db /path/to/entities.db
+    """
+    from .database import OrganizationDatabase
+
+    try:
+        database = OrganizationDatabase(db_path=db_path)
+        stats = database.get_stats()
+
+        click.echo("\nEntity Database Status")
+        click.echo("=" * 40)
+        click.echo(f"Total records: {stats.total_records:,}")
+        click.echo(f"Embedding dimension: {stats.embedding_dimension}")
+        click.echo(f"Database size: {stats.database_size_bytes / 1024 / 1024:.2f} MB")
+
+        # Check for missing embeddings
+        missing_embeddings = database.get_missing_embedding_count()
+        if missing_embeddings > 0:
+            click.echo(f"\n⚠️  Missing embeddings: {missing_embeddings:,}")
+            click.echo("   Run 'corp-extractor db repair-embeddings' to fix")
+
+        if stats.by_source:
+            click.echo("\nRecords by source:")
+            for source, count in stats.by_source.items():
+                click.echo(f"  {source}: {count:,}")
+
+        database.close()
+
+    except Exception as e:
+        raise click.ClickException(f"Failed to read database: {e}")
+
+
+@db_cmd.command("search")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--top-k", type=int, default=10, help="Number of results")
+@click.option("--source", type=click.Choice(["gleif", "sec_edgar", "companies_house", "wikipedia"]), help="Filter by source")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_search(query: str, db_path: Optional[str], top_k: int, source: Optional[str], verbose: bool):
+    """
+    Search for an organization in the database.
+
+    \b
+    Examples:
+      corp-extractor db search "Apple Inc"
+      corp-extractor db search "Microsoft" --source sec_edgar
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path)
+
+    click.echo(f"Searching for: {query}", err=True)
+
+    # Embed query
+    query_embedding = embedder.embed(query)
+
+    # Search
+    results = database.search(query_embedding, top_k=top_k, source_filter=source)
+
+    if not results:
+        click.echo("No results found.")
+        return
+
+    click.echo(f"\nTop {len(results)} matches:")
+    click.echo("-" * 60)
+
+    for i, (record, similarity) in enumerate(results, 1):
+        click.echo(f"{i}. {record.legal_name}")
+        click.echo(f"   Source: {record.source} | ID: {record.source_id}")
+        click.echo(f"   Canonical ID: {record.canonical_id}")
+        click.echo(f"   Similarity: {similarity:.4f}")
+        if verbose and record.record:
+            if record.record.get("ticker"):
+                click.echo(f"   Ticker: {record.record['ticker']}")
+            if record.record.get("jurisdiction"):
+                click.echo(f"   Jurisdiction: {record.record['jurisdiction']}")
+        click.echo()
+
+    database.close()
+
+
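
Both search commands reduce to the same embed-then-search pair. A minimal programmatic equivalent of `db search`, mirroring the calls exactly as the CLI above makes them (default-path behavior assumed from the option help text):

    # Sketch: programmatic organization lookup, mirroring the CLI code above
    from statement_extractor.database import OrganizationDatabase, CompanyEmbedder

    embedder = CompanyEmbedder()
    database = OrganizationDatabase(db_path=None)  # None falls back to the default cache path
    results = database.search(embedder.embed("Apple Inc"), top_k=5)
    for record, similarity in results:
        print(record.legal_name, record.canonical_id, f"{similarity:.3f}")
    database.close()
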
+@db_cmd.command("download")
+@click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
+@click.option("--db", "db_path", type=click.Path(), help="Output path for database")
+@click.option("--full", is_flag=True, help="Download full version (larger, includes record metadata)")
+@click.option("--no-compress", is_flag=True, help="Download uncompressed version (slower)")
+@click.option("--force", is_flag=True, help="Force re-download")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool, force: bool, verbose: bool):
+    """
+    Download entity database from HuggingFace Hub.
+
+    By default downloads the lite version (smaller, without record metadata).
+    Use --full for the complete database with all source record data.
+
+    \b
+    Examples:
+      corp-extractor db download
+      corp-extractor db download --full
+      corp-extractor db download --repo my-org/my-entity-db
+    """
+    _configure_logging(verbose)
+    from .database.hub import download_database
+
+    filename = "entities.db" if full else "entities-lite.db"
+    click.echo(f"Downloading {'full ' if full else 'lite '}database from {repo}...", err=True)
+
+    try:
+        path = download_database(
+            repo_id=repo,
+            filename=filename,
+            force_download=force,
+            prefer_compressed=not no_compress,
+        )
+        click.echo(f"Database downloaded to: {path}")
+    except Exception as e:
+        raise click.ClickException(f"Download failed: {e}")
+
+
+@db_cmd.command("upload")
+@click.argument("db_path", type=click.Path(exists=True), required=False)
+@click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
+@click.option("--message", type=str, default="Update entity database", help="Commit message")
+@click.option("--no-lite", is_flag=True, help="Skip creating lite version (without record data)")
+@click.option("--no-compress", is_flag=True, help="Skip creating compressed versions")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no_compress: bool, verbose: bool):
+    """
+    Upload entity database to HuggingFace Hub with variants.
+
+    If no path is provided, uploads from the default cache location.
+
+    \b
+    By default uploads:
+      - entities.db (full database)
+      - entities-lite.db (without record data, smaller)
+      - entities.db.gz (compressed full)
+      - entities-lite.db.gz (compressed lite)
+
+    Requires HF_TOKEN environment variable to be set.
+
+    \b
+    Examples:
+      corp-extractor db upload
+      corp-extractor db upload /path/to/entities.db
+      corp-extractor db upload --no-lite --no-compress
+      corp-extractor db upload --repo my-org/my-entity-db
+    """
+    _configure_logging(verbose)
+    from .database.hub import upload_database_with_variants, DEFAULT_CACHE_DIR, DEFAULT_DB_FULL_FILENAME
+
+    # Use default cache location if no path provided
+    if db_path is None:
+        db_path = str(DEFAULT_CACHE_DIR / DEFAULT_DB_FULL_FILENAME)
+        if not Path(db_path).exists():
+            raise click.ClickException(
+                f"Database not found at default location: {db_path}\n"
+                "Build the database first with import commands, or specify a path."
+            )
+
+    click.echo(f"Uploading {db_path} to {repo}...", err=True)
+    if not no_lite:
+        click.echo("  - Creating lite version (without record data)", err=True)
+    if not no_compress:
+        click.echo("  - Creating compressed versions", err=True)
+
+    try:
+        results = upload_database_with_variants(
+            db_path=db_path,
+            repo_id=repo,
+            commit_message=message,
+            include_lite=not no_lite,
+            include_compressed=not no_compress,
+        )
+        click.echo(f"\nUploaded {len(results)} file(s) successfully:")
+        for filename, url in results.items():
+            click.echo(f"  - {filename}")
+    except Exception as e:
+        raise click.ClickException(f"Upload failed: {e}")
+
+
+@db_cmd.command("create-lite")
+@click.argument("db_path", type=click.Path(exists=True))
+@click.option("-o", "--output", type=click.Path(), help="Output path (default: adds -lite suffix)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_create_lite(db_path: str, output: Optional[str], verbose: bool):
+    """
+    Create a lite version of the database without record data.
+
+    The lite version strips the `record` column (full source data),
+    keeping only core fields and embeddings. This significantly
+    reduces file size while maintaining search functionality.
+
+    \b
+    Examples:
+      corp-extractor db create-lite entities.db
+      corp-extractor db create-lite entities.db -o entities-lite.db
+    """
+    _configure_logging(verbose)
+    from .database.hub import create_lite_database
+
+    click.echo(f"Creating lite database from {db_path}...", err=True)
+
+    try:
+        lite_path = create_lite_database(db_path, output)
+        click.echo(f"Lite database created: {lite_path}")
+    except Exception as e:
+        raise click.ClickException(f"Failed to create lite database: {e}")
+
+
+@db_cmd.command("compress")
+@click.argument("db_path", type=click.Path(exists=True))
+@click.option("-o", "--output", type=click.Path(), help="Output path (default: adds .gz suffix)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_compress(db_path: str, output: Optional[str], verbose: bool):
+    """
+    Compress a database file using gzip.
+
+    \b
+    Examples:
+      corp-extractor db compress entities.db
+      corp-extractor db compress entities.db -o entities.db.gz
+    """
+    _configure_logging(verbose)
+    from .database.hub import compress_database
+
+    click.echo(f"Compressing {db_path}...", err=True)
+
+    try:
+        compressed_path = compress_database(db_path, output)
+        click.echo(f"Compressed database created: {compressed_path}")
+    except Exception as e:
+        raise click.ClickException(f"Compression failed: {e}")
+
+
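
`compress_database` itself is not shown in this diff, but gzip-compressing a SQLite file needs only the standard library; roughly (a sketch, not the package's implementation):

    import gzip
    import shutil
    from pathlib import Path

    def compress_file(db_path: str, output: str | None = None) -> Path:
        """Stream-compress a file with gzip, defaulting to a .gz suffix."""
        src = Path(db_path)
        dst = Path(output) if output else Path(str(src) + ".gz")
        with open(src, "rb") as f_in, gzip.open(dst, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)  # stream; avoids loading the whole file
        return dst
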
1399
+ @db_cmd.command("repair-embeddings")
1400
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1401
+ @click.option("--batch-size", type=int, default=1000, help="Batch size for embedding generation (default: 1000)")
1402
+ @click.option("--source", type=str, help="Only repair specific source (gleif, sec_edgar, etc.)")
1403
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1404
+ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Optional[str], verbose: bool):
1405
+ """
1406
+ Generate missing embeddings for organizations in the database.
1407
+
1408
+ This repairs databases where organizations were imported without embeddings
1409
+ being properly stored in the organization_embeddings table.
1410
+
1411
+ \b
1412
+ Examples:
1413
+ corp-extractor db repair-embeddings
1414
+ corp-extractor db repair-embeddings --source wikipedia
1415
+ corp-extractor db repair-embeddings --batch-size 500
1416
+ """
1417
+ _configure_logging(verbose)
1418
+
1419
+ from .database import OrganizationDatabase, CompanyEmbedder
1420
+
1421
+ database = OrganizationDatabase(db_path=db_path)
1422
+ embedder = CompanyEmbedder()
1423
+
1424
+ # Check how many need repair
1425
+ missing_count = database.get_missing_embedding_count()
1426
+ if missing_count == 0:
1427
+ click.echo("All organizations have embeddings. Nothing to repair.")
1428
+ database.close()
1429
+ return
1430
+
1431
+ click.echo(f"Found {missing_count:,} organizations without embeddings.", err=True)
1432
+ click.echo("Generating embeddings...", err=True)
1433
+
1434
+ # Process in batches
1435
+ org_ids = []
1436
+ names = []
1437
+ count = 0
1438
+
1439
+ for org_id, name in database.get_organizations_without_embeddings(batch_size=batch_size, source=source):
1440
+ org_ids.append(org_id)
1441
+ names.append(name)
1442
+
1443
+ if len(names) >= batch_size:
1444
+ # Generate embeddings
1445
+ embeddings = embedder.embed_batch(names)
1446
+ database.insert_embeddings_batch(org_ids, embeddings)
1447
+ count += len(names)
1448
+ click.echo(f"Repaired {count:,} / {missing_count:,} embeddings...", err=True)
1449
+ org_ids = []
1450
+ names = []
1451
+
1452
+ # Final batch
1453
+ if names:
1454
+ embeddings = embedder.embed_batch(names)
1455
+ database.insert_embeddings_batch(org_ids, embeddings)
1456
+ count += len(names)
1457
+
1458
+ click.echo(f"\nRepaired {count:,} embeddings successfully.", err=True)
1459
+ database.close()
1460
+
1461
+
1462
+ @db_cmd.command("migrate")
1463
+ @click.argument("db_path", type=click.Path(exists=True))
1464
+ @click.option("--rename-file", is_flag=True, help="Also rename companies.db to entities.db")
1465
+ @click.option("--yes", is_flag=True, help="Skip confirmation prompt")
1466
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1467
+ def db_migrate(db_path: str, rename_file: bool, yes: bool, verbose: bool):
1468
+ """
1469
+ Migrate database from legacy schema to new schema.
1470
+
1471
+ Migrates from old naming (companies/company_embeddings tables)
1472
+ to new naming (organizations/organization_embeddings tables).
1473
+
1474
+ \b
1475
+ What this does:
1476
+ - Renames 'companies' table to 'organizations'
1477
+ - Renames 'company_embeddings' table to 'organization_embeddings'
1478
+ - Updates all indexes
1479
+
1480
+ \b
1481
+ Examples:
1482
+ corp-extractor db migrate companies.db
1483
+ corp-extractor db migrate companies.db --rename-file
1484
+ corp-extractor db migrate ~/.cache/corp-extractor/companies.db --yes
1485
+ """
1486
+ _configure_logging(verbose)
1487
+
1488
+ from pathlib import Path
1489
+ from .database import OrganizationDatabase
1490
+
1491
+ db_path_obj = Path(db_path)
1492
+
1493
+ if not yes:
1494
+ click.confirm(
1495
+ f"This will migrate {db_path} from legacy schema (companies) to new schema (organizations).\n"
1496
+ "This operation cannot be undone. Continue?",
1497
+ abort=True
1498
+ )
1499
+
1500
+ try:
1501
+ database = OrganizationDatabase(db_path=db_path)
1502
+ migrations = database.migrate_from_legacy_schema()
1503
+ database.close()
1504
+
1505
+ if migrations:
1506
+ click.echo("Migration completed:")
1507
+ for table, action in migrations.items():
1508
+ click.echo(f" {table}: {action}")
1509
+ else:
1510
+ click.echo("No migration needed. Database already uses new schema.")
1511
+
1512
+ # Optionally rename the file
1513
+ if rename_file and db_path_obj.name.startswith("companies"):
1514
+ new_name = db_path_obj.name.replace("companies", "entities")
1515
+ new_path = db_path_obj.parent / new_name
1516
+ db_path_obj.rename(new_path)
1517
+ click.echo(f"Renamed file: {db_path} -> {new_path}")
1518
+
1519
+ except Exception as e:
1520
+ raise click.ClickException(f"Migration failed: {e}")
1521
+
1522
+
1523
+# =============================================================================
+# Document commands
+# =============================================================================
+
+@main.group("document")
+def document_cmd():
+    """
+    Process documents with chunking, deduplication, and citations.
+
+    \b
+    Commands:
+      process  Process a document through the full pipeline
+      chunk    Preview chunking without extraction
+
+    \b
+    Examples:
+        corp-extractor document process article.txt
+        corp-extractor document process report.pdf --no-summary
+        corp-extractor document chunk article.txt --max-tokens 500
+    """
+    pass
+
+
+ @document_cmd.command("process")
1547
+ @click.argument("input_source") # Can be file path or URL
1548
+ @click.option("--title", type=str, help="Document title (for citations)")
1549
+ @click.option("--author", "authors", type=str, multiple=True, help="Document author(s)")
1550
+ @click.option("--year", type=int, help="Publication year")
1551
+ @click.option("--max-tokens", type=int, default=1000, help="Target tokens per chunk (default: 1000)")
1552
+ @click.option("--overlap", type=int, default=100, help="Token overlap between chunks (default: 100)")
1553
+ @click.option("--no-summary", is_flag=True, help="Skip document summarization")
1554
+ @click.option("--no-dedup", is_flag=True, help="Skip deduplication across chunks")
1555
+ @click.option("--use-ocr", is_flag=True, help="Force OCR for PDF parsing")
1556
+ @click.option(
1557
+ "--stages",
1558
+ type=str,
1559
+ default="1-6",
1560
+ help="Pipeline stages to run (e.g., '1-3' or '1,2,5')"
1561
+ )
1562
+ @click.option(
1563
+ "-o", "--output",
1564
+ type=click.Choice(["table", "json", "triples"], case_sensitive=False),
1565
+ default="table",
1566
+ help="Output format (default: table)"
1567
+ )
1568
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
1569
+ @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
1570
+ def document_process(
1571
+ input_source: str,
1572
+ title: Optional[str],
1573
+ authors: tuple[str, ...],
1574
+ year: Optional[int],
1575
+ max_tokens: int,
1576
+ overlap: int,
1577
+ no_summary: bool,
1578
+ no_dedup: bool,
1579
+ use_ocr: bool,
1580
+ stages: str,
1581
+ output: str,
1582
+ verbose: bool,
1583
+ quiet: bool,
1584
+ ):
1585
+ """
1586
+ Process a document or URL through the extraction pipeline with chunking.
1587
+
1588
+ Supports text files, URLs (web pages and PDFs).
1589
+
1590
+ \b
1591
+ Examples:
1592
+ corp-extractor document process article.txt
1593
+ corp-extractor document process report.txt --title "Annual Report" --year 2024
1594
+ corp-extractor document process https://example.com/article
1595
+ corp-extractor document process https://example.com/report.pdf --use-ocr
1596
+ corp-extractor document process doc.txt --no-summary --stages 1-3
1597
+ corp-extractor document process doc.txt -o json
1598
+ """
1599
+ _configure_logging(verbose)
1600
+
1601
+ # Import document pipeline
1602
+ from .document import DocumentPipeline, DocumentPipelineConfig, Document
1603
+ from .models.document import ChunkingConfig
1604
+ from .pipeline import PipelineConfig
1605
+ _load_all_plugins()
1606
+
1607
+ # Parse stages
1608
+ enabled_stages = _parse_stages(stages)
1609
+
1610
+ # Build configs
1611
+ chunking_config = ChunkingConfig(
1612
+ target_tokens=max_tokens,
1613
+ max_tokens=max_tokens * 2,
1614
+ overlap_tokens=overlap,
1615
+ )
1616
+
1617
+ pipeline_config = PipelineConfig(
1618
+ enabled_stages=enabled_stages,
1619
+ )
1620
+
1621
+ doc_config = DocumentPipelineConfig(
1622
+ chunking=chunking_config,
1623
+ generate_summary=not no_summary,
1624
+ deduplicate_across_chunks=not no_dedup,
1625
+ pipeline_config=pipeline_config,
1626
+ )
1627
+
1628
+ # Create pipeline
1629
+ pipeline = DocumentPipeline(doc_config)
1630
+
1631
+ # Detect if input is a URL
1632
+ is_url = input_source.startswith(("http://", "https://"))
1633
+
1634
+ # Process
1635
+ try:
1636
+ if is_url:
1637
+ # Process URL
1638
+ from .document import URLLoaderConfig
1639
+
1640
+ if not quiet:
1641
+ click.echo(f"Fetching URL: {input_source}", err=True)
1642
+
1643
+ loader_config = URLLoaderConfig(use_ocr=use_ocr)
1644
+ ctx = pipeline.process_url_sync(input_source, loader_config)
1645
+
1646
+ if not quiet:
1647
+ click.echo(f"Processed: {ctx.document.metadata.title or 'Untitled'}", err=True)
1648
+
1649
+ else:
1650
+ # Process file
1651
+ from pathlib import Path
1652
+ import os
1653
+
1654
+ if not os.path.exists(input_source):
1655
+ raise click.ClickException(f"File not found: {input_source}")
1656
+
1657
+ # Read input file
1658
+ with open(input_source, "r", encoding="utf-8") as f:
1659
+ text = f.read()
1660
+
1661
+ if not text.strip():
1662
+ raise click.ClickException("Input file is empty")
1663
+
1664
+ if not quiet:
1665
+ click.echo(f"Processing document: {input_source} ({len(text)} chars)", err=True)
1666
+
1667
+ # Create document with metadata
1668
+ doc_title = title or Path(input_source).stem
1669
+ document = Document.from_text(
1670
+ text=text,
1671
+ title=doc_title,
1672
+ source_type="text",
1673
+ authors=list(authors),
1674
+ year=year,
1675
+ )
1676
+
1677
+ ctx = pipeline.process(document)
1678
+
1679
+ # Output results
1680
+ if output == "json":
1681
+ _print_document_json(ctx)
1682
+ elif output == "triples":
1683
+ _print_document_triples(ctx)
1684
+ else:
1685
+ _print_document_table(ctx, verbose)
1686
+
1687
+ # Report stats
1688
+ if not quiet:
1689
+ click.echo(f"\nChunks: {ctx.chunk_count}", err=True)
1690
+ click.echo(f"Statements: {ctx.statement_count}", err=True)
1691
+ if ctx.duplicates_removed > 0:
1692
+ click.echo(f"Duplicates removed: {ctx.duplicates_removed}", err=True)
1693
+
1694
+ if ctx.processing_errors:
1695
+ click.echo(f"\nErrors: {len(ctx.processing_errors)}", err=True)
1696
+ for error in ctx.processing_errors:
1697
+ click.echo(f" - {error}", err=True)
1698
+
1699
+ except Exception as e:
1700
+ logging.exception("Document processing error:")
1701
+ raise click.ClickException(f"Processing failed: {e}")
1702
+
1703
+
1704
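The same flow is available programmatically, bypassing the CLI. A condensed sketch built from the classes this command imports (the field values are illustrative and mirror the CLI defaults above):

    from statement_extractor.document import (
        Document,
        DocumentPipeline,
        DocumentPipelineConfig,
    )
    from statement_extractor.models.document import ChunkingConfig

    config = DocumentPipelineConfig(
        chunking=ChunkingConfig(target_tokens=1000, max_tokens=2000, overlap_tokens=100),
        generate_summary=True,
        deduplicate_across_chunks=True,
    )
    pipeline = DocumentPipeline(config)

    with open("article.txt", encoding="utf-8") as f:
        document = Document.from_text(text=f.read(), title="article", source_type="text")

    ctx = pipeline.process(document)
    print(f"{ctx.statement_count} statements from {ctx.chunk_count} chunks")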
+ @document_cmd.command("chunk")
1705
+ @click.argument("input_path", type=click.Path(exists=True))
1706
+ @click.option("--max-tokens", type=int, default=1000, help="Target tokens per chunk (default: 1000)")
1707
+ @click.option("--overlap", type=int, default=100, help="Token overlap between chunks (default: 100)")
1708
+ @click.option("-o", "--output", type=click.Choice(["table", "json"]), default="table", help="Output format")
1709
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
1710
+ def document_chunk(
1711
+ input_path: str,
1712
+ max_tokens: int,
1713
+ overlap: int,
1714
+ output: str,
1715
+ verbose: bool,
1716
+ ):
1717
+ """
1718
+ Preview document chunking without running extraction.
1719
+
1720
+ Shows how a document would be split into chunks for processing.
1721
+
1722
+ \b
1723
+ Examples:
1724
+ corp-extractor document chunk article.txt
1725
+ corp-extractor document chunk article.txt --max-tokens 500
1726
+ corp-extractor document chunk article.txt -o json
1727
+ """
1728
+ _configure_logging(verbose)
1729
+
1730
+ # Read input file
1731
+ with open(input_path, "r", encoding="utf-8") as f:
1732
+ text = f.read()
1733
+
1734
+ if not text.strip():
1735
+ raise click.ClickException("Input file is empty")
1736
+
1737
+ click.echo(f"Chunking document: {input_path} ({len(text)} chars)", err=True)
1738
+
1739
+ from .document import DocumentChunker, Document
1740
+ from .models.document import ChunkingConfig
1741
+
1742
+ config = ChunkingConfig(
1743
+ target_tokens=max_tokens,
1744
+ max_tokens=max_tokens * 2,
1745
+ overlap_tokens=overlap,
1746
+ )
1747
+
1748
+ from pathlib import Path
1749
+ document = Document.from_text(text, title=Path(input_path).stem)
1750
+ chunker = DocumentChunker(config)
1751
+ chunks = chunker.chunk_document(document)
1752
+
1753
+ if output == "json":
1754
+ import json
1755
+ chunk_data = [
1756
+ {
1757
+ "index": c.chunk_index,
1758
+ "tokens": c.token_count,
1759
+ "chars": len(c.text),
1760
+ "pages": c.page_numbers,
1761
+ "overlap": c.overlap_chars,
1762
+ "preview": c.text[:100] + "..." if len(c.text) > 100 else c.text,
1763
+ }
1764
+ for c in chunks
1765
+ ]
1766
+ click.echo(json.dumps({"chunks": chunk_data, "total": len(chunks)}, indent=2))
1767
+ else:
1768
+ click.echo(f"\nCreated {len(chunks)} chunk(s):\n")
1769
+ click.echo("-" * 80)
1770
+
1771
+ for chunk in chunks:
1772
+ click.echo(f"Chunk {chunk.chunk_index + 1}:")
1773
+ click.echo(f" Tokens: {chunk.token_count}")
1774
+ click.echo(f" Characters: {len(chunk.text)}")
1775
+ if chunk.page_numbers:
1776
+ click.echo(f" Pages: {chunk.page_numbers}")
1777
+ if chunk.overlap_chars > 0:
1778
+ click.echo(f" Overlap: {chunk.overlap_chars} chars")
1779
+
1780
+ preview = chunk.text[:200].replace("\n", " ")
1781
+ if len(chunk.text) > 200:
1782
+ preview += "..."
1783
+ click.echo(f" Preview: {preview}")
1784
+ click.echo("-" * 80)
1785
+
1786
+
1787
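The chunker can also be exercised directly in Python, using the same classes this command does (a sketch; the chunk attributes are the ones printed above):

    from pathlib import Path

    from statement_extractor.document import Document, DocumentChunker
    from statement_extractor.models.document import ChunkingConfig

    config = ChunkingConfig(target_tokens=500, max_tokens=1000, overlap_tokens=50)
    document = Document.from_text(Path("article.txt").read_text(encoding="utf-8"), title="article")

    for chunk in DocumentChunker(config).chunk_document(document):
        print(chunk.chunk_index, chunk.token_count, "tokens,", len(chunk.text), "chars")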
+def _print_document_json(ctx):
+    """Print document context as JSON."""
+    import json
+    click.echo(json.dumps(ctx.as_dict(), indent=2, default=str))
+
+
+def _print_document_triples(ctx):
+    """Print document statements as triples."""
+    for stmt in ctx.labeled_statements:
+        parts = [stmt.subject_fqn, stmt.statement.predicate, stmt.object_fqn]
+        if stmt.page_number:
+            parts.append(f"p.{stmt.page_number}")
+        click.echo("\t".join(parts))
+
+
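Each statement becomes one tab-separated line, with an optional trailing page reference, for example (values illustrative):

    Acme Corp	acquired	Example Ltd	p.12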
+def _print_document_table(ctx, verbose: bool):
+    """Print document context in table format."""
+    # Show summary if available
+    if ctx.document.summary:
+        click.echo("\nDocument Summary:")
+        click.echo("-" * 40)
+        click.echo(ctx.document.summary)
+        click.echo("-" * 40)
+
+    if not ctx.labeled_statements:
+        click.echo("\nNo statements extracted.")
+        return
+
+    click.echo(f"\nExtracted {len(ctx.labeled_statements)} statement(s):\n")
+    click.echo("-" * 80)
+
+    for i, stmt in enumerate(ctx.labeled_statements, 1):
+        click.echo(f"{i}. {stmt.subject_fqn}")
+        click.echo(f" --[{stmt.statement.predicate}]-->")
+        click.echo(f" {stmt.object_fqn}")
+
+        # Show citation
+        if stmt.citation:
+            click.echo(f" Citation: {stmt.citation}")
+        elif stmt.page_number:
+            click.echo(f" Page: {stmt.page_number}")
+
+        # Show labels
+        for label in stmt.labels:
+            if isinstance(label.label_value, float):
+                click.echo(f" {label.label_type}: {label.label_value:.3f}")
+            else:
+                click.echo(f" {label.label_type}: {label.label_value}")
+
+        # Show taxonomy (top 3)
+        if stmt.taxonomy_results:
+            sorted_taxonomy = sorted(stmt.taxonomy_results, key=lambda t: t.confidence, reverse=True)[:3]
+            taxonomy_strs = [f"{t.category}:{t.label}" for t in sorted_taxonomy]
+            click.echo(f" Topics: {', '.join(taxonomy_strs)}")
+
+        if verbose and stmt.statement.source_text:
+            source = stmt.statement.source_text[:60] + "..." if len(stmt.statement.source_text) > 60 else stmt.statement.source_text
+            click.echo(f" Source: \"{source}\"")
+
+        click.echo("-" * 80)
+
+    # Show timings in verbose mode
+    if verbose and ctx.stage_timings:
+        click.echo("\nStage timings:")
+        for stage, duration in ctx.stage_timings.items():
+            click.echo(f" {stage}: {duration:.3f}s")
+
+
+# =============================================================================
+# Helper functions
+# =============================================================================
+
 def _get_input_text(text: Optional[str], input_file: Optional[str]) -> Optional[str]:
     """Get input text from argument, file, or stdin."""
     if text == "-" or (text is None and input_file is None and not sys.stdin.isatty()):