corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/base.py
@@ -0,0 +1,716 @@
+ """
+ Base plugin classes for the extraction pipeline.
+
+ Defines the abstract interfaces for each pipeline stage:
+ - BaseSplitterPlugin: Stage 1 - Text → RawTriple
+ - BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
+ - BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
+ - BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
+ - BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult
+
+ Content acquisition plugins (for URL processing):
+ - BaseScraperPlugin: Fetch content from URLs
+ - BasePDFParserPlugin: Extract text from PDFs
+ """
+
+ from abc import ABC, abstractmethod
+ from enum import Enum, Flag, auto
+ from typing import TYPE_CHECKING, Any, Optional
+
+ from pydantic import BaseModel, Field
+
+ if TYPE_CHECKING:
+     from ..pipeline.context import PipelineContext
+     from ..models import (
+         RawTriple,
+         PipelineStatement,
+         ExtractedEntity,
+         CanonicalEntity,
+         StatementLabel,
+         TaxonomyResult,
+         EntityType,
+     )
+
+
+ class PluginCapability(Flag):
+     """Flags indicating plugin capabilities."""
+     NONE = 0
+     BATCH_PROCESSING = auto()  # Can process multiple items at once
+     ASYNC_PROCESSING = auto()  # Supports async execution
+     EXTERNAL_API = auto()  # Uses external API (may have rate limits)
+     LLM_REQUIRED = auto()  # Requires an LLM model
+     CACHING = auto()  # Supports result caching
+
+
+ def get_available_vram_gb() -> float:
+     """
+     Get available GPU VRAM in GB.
+
+     Returns 0.0 if no GPU is available or VRAM cannot be determined.
+     """
+     try:
+         import torch
+         if torch.cuda.is_available():
+             device = torch.cuda.current_device()
+             total = torch.cuda.get_device_properties(device).total_memory
+             allocated = torch.cuda.memory_allocated(device)
+             available = (total - allocated) / (1024 ** 3)  # Convert to GB
+             return available
+         elif torch.backends.mps.is_available():
+             # MPS doesn't expose VRAM info; estimate based on typical Apple Silicon
+             # Return a conservative estimate
+             return 8.0
+     except ImportError:
+         pass
+     return 0.0
+
+
+ def calculate_batch_size(
+     per_item_vram_gb: float,
+     overhead_gb: float = 2.0,
+     min_batch: int = 1,
+     max_batch: int = 32,
+ ) -> int:
+     """
+     Calculate optimal batch size based on available VRAM.
+
+     Args:
+         per_item_vram_gb: VRAM required per item in GB
+         overhead_gb: Reserved VRAM for model weights and system overhead
+         min_batch: Minimum batch size
+         max_batch: Maximum batch size cap
+
+     Returns:
+         Optimal batch size for the current GPU
+     """
+     available = get_available_vram_gb()
+     if available <= 0 or per_item_vram_gb <= 0:
+         return min_batch
+
+     usable = max(0, available - overhead_gb)
+     batch_size = int(usable / per_item_vram_gb)
+     return max(min_batch, min(batch_size, max_batch))
+
+
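# Usage sketch (illustrative, not part of the diff): on a hypothetical GPU
# with 24 GB free and ~0.5 GB needed per item, the default 2 GB overhead
# leaves 22 GB -> 44 items, which the max_batch cap trims to 32. With no
# GPU, get_available_vram_gb() returns 0.0 and min_batch is returned.
batch = calculate_batch_size(per_item_vram_gb=0.5)  # -> 32 on the 24 GB example
assert 1 <= batch <= 32  # holds on any hardware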
+ class BasePlugin(ABC):
+     """
+     Base class for all pipeline plugins.
+
+     All plugins must implement the name property and can optionally
+     override priority and capabilities.
+     """
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Unique name for this plugin (used for registration and CLI)."""
+         ...
+
+     @property
+     def priority(self) -> int:
+         """
+         Plugin priority (lower = higher priority, runs first).
+
+         Default is 100. Use lower values (e.g., 10, 20) for critical plugins
+         that should run before others.
+         """
+         return 100
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         """Plugin capabilities (flags)."""
+         return PluginCapability.NONE
+
+     @property
+     def description(self) -> str:
+         """Human-readable description of this plugin."""
+         return ""
+
+     @property
+     def model_vram_gb(self) -> float:
+         """
+         Estimated VRAM required for model weights in GB.
+
+         Override this if the plugin loads a GPU model. This is used to
+         reserve memory overhead when calculating batch sizes.
+
+         Default is 0.0 (no GPU model).
+         """
+         return 0.0
+
+     @property
+     def per_item_vram_gb(self) -> float:
+         """
+         Estimated VRAM required per item during batch processing in GB.
+
+         Override this for plugins with BATCH_PROCESSING capability.
+         Used to calculate optimal batch size: batch = (available - overhead) / per_item
+
+         Default is 0.1 GB (100 MB) as a conservative estimate.
+         """
+         return 0.1
+
+     def get_optimal_batch_size(self, max_batch: int = 32) -> int:
+         """
+         Calculate optimal batch size based on available VRAM and plugin requirements.
+
+         Args:
+             max_batch: Maximum batch size cap
+
+         Returns:
+             Optimal batch size for current GPU state
+         """
+         if PluginCapability.BATCH_PROCESSING not in self.capabilities:
+             return 1
+
+         return calculate_batch_size(
+             per_item_vram_gb=self.per_item_vram_gb,
+             overhead_gb=self.model_vram_gb + 1.0,  # Add 1 GB system overhead
+             min_batch=1,
+             max_batch=max_batch,
+         )
+
+
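# Usage sketch (hypothetical plugin, not from the package): `name` is the
# only abstract member, so a minimal concrete plugin needs one property.
class EchoPlugin(BasePlugin):
    @property
    def name(self) -> str:
        return "echo"

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.BATCH_PROCESSING | PluginCapability.CACHING

# With BATCH_PROCESSING set, get_optimal_batch_size() sizes batches from the
# default per_item_vram_gb (0.1 GB) and model_vram_gb (0.0 GB) declared above.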
+ class BaseSplitterPlugin(BasePlugin):
+     """
+     Stage 1 plugin: Split text into atomic triples.
+
+     Takes raw text and produces RawTriple objects containing
+     subject/predicate/object text and source sentence.
+     """
+
+     @abstractmethod
+     def split(
+         self,
+         text: str,
+         context: "PipelineContext",
+     ) -> list["RawTriple"]:
+         """
+         Split text into atomic triples.
+
+         Args:
+             text: Input text to split
+             context: Pipeline context for accessing metadata and config
+
+         Returns:
+             List of RawTriple objects
+         """
+         ...
+
+     def split_batch(
+         self,
+         texts: list[str],
+         context: "PipelineContext",
+     ) -> list[list["RawTriple"]]:
+         """
+         Split multiple texts into atomic triples in a single batch.
+
+         Default implementation calls split() for each text sequentially.
+         Plugins with BATCH_PROCESSING capability should override this
+         for efficient GPU batching using get_optimal_batch_size().
+
+         Args:
+             texts: List of input texts to split
+             context: Pipeline context for accessing metadata and config
+
+         Returns:
+             List of RawTriple lists, one per input text
+         """
+         return [self.split(text, context) for text in texts]
+
+
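# Usage sketch (hypothetical, not from the package): a whitespace-based
# splitter showing the split() contract. RawTriple's field names are assumed
# here from the module docstring (subject/predicate/object text plus source
# sentence); the real model is defined in statement_extractor.models.
class NaiveSplitterPlugin(BaseSplitterPlugin):
    @property
    def name(self) -> str:
        return "naive_splitter"

    def split(self, text: str, context: "PipelineContext") -> list["RawTriple"]:
        from ..models import RawTriple  # runtime import; above it is type-checking only

        triples = []
        for sentence in text.split(". "):
            words = sentence.split()
            if len(words) >= 3:
                # Crude S/P/O guess: first word, second word, the remainder.
                triples.append(RawTriple(
                    subject=words[0],
                    predicate=words[1],
                    object=" ".join(words[2:]),
                    source_sentence=sentence,
                ))
        return triples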
+ class BaseExtractorPlugin(BasePlugin):
+     """
+     Stage 2 plugin: Refine triples into statements with typed entities.
+
+     Takes RawTriple objects and produces PipelineStatement objects
+     with ExtractedEntity subjects/objects that have types, spans,
+     and confidence scores.
+     """
+
+     @abstractmethod
+     def extract(
+         self,
+         raw_triples: list["RawTriple"],
+         context: "PipelineContext",
+     ) -> list["PipelineStatement"]:
+         """
+         Extract statements from raw triples.
+
+         Args:
+             raw_triples: Raw triples from Stage 1
+             context: Pipeline context
+
+         Returns:
+             List of PipelineStatement objects with typed entities
+         """
+         ...
+
+
+ class BaseQualifierPlugin(BasePlugin):
+     """
+     Stage 3 plugin: Qualify entities with identifiers and canonical forms.
+
+     Processes entities of specific types and adds:
+     - Semantic qualifiers (role, org for PERSON entities)
+     - External identifiers (LEI, company number, SEC CIK)
+     - Canonical name and FQN (fully qualified name)
+
+     Returns a CanonicalEntity ready for use in labeled statements.
+     """
+
+     @property
+     @abstractmethod
+     def supported_entity_types(self) -> set["EntityType"]:
+         """Entity types this plugin can qualify (e.g., {ORG, PERSON})."""
+         ...
+
+     @property
+     def supported_identifier_types(self) -> list[str]:
+         """
+         Identifier types this plugin can use for lookup.
+
+         For example, GLEIFQualifier can look up by 'lei'.
+         """
+         return []
+
+     @property
+     def provided_identifier_types(self) -> list[str]:
+         """
+         Identifier types this plugin can provide.
+
+         For example, GLEIFQualifier provides 'lei', 'jurisdiction'.
+         """
+         return []
+
+     @abstractmethod
+     def qualify(
+         self,
+         entity: "ExtractedEntity",
+         context: "PipelineContext",
+     ) -> "CanonicalEntity | None":
+         """
+         Qualify an entity and return its canonical form.
+
+         This method should:
+         1. Look up identifiers (LEI, CIK, company number, etc.)
+         2. Find the canonical name if available
+         3. Generate the FQN (fully qualified name)
+         4. Return a CanonicalEntity with all information
+
+         Args:
+             entity: The entity to qualify
+             context: Pipeline context (for accessing source text, other entities)
+
+         Returns:
+             CanonicalEntity with qualifiers and FQN, or None if entity not found
+         """
+         ...
+
+
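# Usage sketch (hypothetical, not from the package): the qualify() contract,
# returning None when an entity cannot be resolved. The CanonicalEntity
# constructor arguments, the EntityType.ORG member, and entity.text are all
# assumptions for illustration; the LEI below is a dummy value.
class TableLEIQualifier(BaseQualifierPlugin):
    LEIS = {"Acme Corp": "0000000000ACMEDUMMY00"}  # dummy lookup table

    @property
    def name(self) -> str:
        return "table_lei"

    @property
    def supported_entity_types(self) -> set["EntityType"]:
        from ..models import EntityType
        return {EntityType.ORG}

    @property
    def provided_identifier_types(self) -> list[str]:
        return ["lei"]

    def qualify(self, entity, context):
        from ..models import CanonicalEntity
        lei = self.LEIS.get(entity.text)
        if lei is None:
            return None  # per the contract above: None when not found
        return CanonicalEntity(name=entity.text, identifiers={"lei": lei})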
+ class ClassificationSchema:
+     """
+     Schema for simple multi-choice classification (2-20 choices).
+
+     Handled by GLiNER2 `.classification()` in a single pass.
+
+     Examples:
+         - sentiment: ["positive", "negative", "neutral"]
+         - certainty: ["certain", "uncertain", "speculative"]
+         - temporality: ["past", "present", "future"]
+     """
+
+     def __init__(
+         self,
+         label_type: str,
+         choices: list[str],
+         description: str = "",
+         scope: str = "statement",  # "statement", "subject", "object", "predicate"
+     ):
+         self.label_type = label_type
+         self.choices = choices
+         self.description = description
+         self.scope = scope
+
+     def __repr__(self) -> str:
+         return f"ClassificationSchema({self.label_type!r}, choices={self.choices!r})"
+
+
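# Usage sketch: the sentiment example from the class docstring, spelled out
# with the constructor above.
sentiment = ClassificationSchema(
    label_type="sentiment",
    choices=["positive", "negative", "neutral"],
    description="Overall tone of the statement",
    scope="statement",
)
# repr(sentiment) -> ClassificationSchema('sentiment', choices=['positive', 'negative', 'neutral'])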
+ class TaxonomySchema:
+     """
+     Schema for large taxonomy labeling (100s of values).
+
+     Too many choices for GLiNER2 classification. Requires MNLI or similar:
+     - MNLI zero-shot with label descriptions
+     - Embedding-based nearest neighbor search
+     - Hierarchical classification (category → subcategory)
+
+     Examples:
+         - industry_code: NAICS/SIC codes (1000+ values)
+         - relation_type: detailed relation ontology (100+ types)
+         - job_title: standardized job taxonomy
+     """
+
+     def __init__(
+         self,
+         label_type: str,
+         values: list[str] | dict[str, list[str]],  # flat list or hierarchical dict
+         description: str = "",
+         scope: str = "statement",  # "statement", "subject", "object", "predicate"
+         label_descriptions: dict[str, str] | None = None,  # descriptions for MNLI
+     ):
+         self.label_type = label_type
+         self.values = values
+         self.description = description
+         self.scope = scope
+         self.label_descriptions = label_descriptions  # e.g., {"NAICS:5112": "Software Publishers"}
+
+     @property
+     def is_hierarchical(self) -> bool:
+         """Check if taxonomy is hierarchical (dict) vs flat (list)."""
+         return isinstance(self.values, dict)
+
+     @property
+     def all_values(self) -> list[str]:
+         """Get all taxonomy values (flattened if hierarchical)."""
+         if isinstance(self.values, list):
+             return self.values
+         # Flatten hierarchical dict
+         result = []
+         for category, subcategories in self.values.items():
+             result.append(category)
+             result.extend(subcategories)
+         return result
+
+     def __repr__(self) -> str:
+         count = len(self.all_values)
+         return f"TaxonomySchema({self.label_type!r}, {count} values)"
+
+
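# Usage sketch (values invented for illustration): a small hierarchical
# taxonomy. all_values interleaves each category with its subcategories, so
# the flattened list has 5 entries and is_hierarchical is True.
topics = TaxonomySchema(
    label_type="esg_topics",
    values={
        "environment": ["emissions", "water_use"],
        "governance": ["board_independence"],
    },
    label_descriptions={"emissions": "Statements about greenhouse gas output"},
)
assert topics.is_hierarchical
assert topics.all_values == ["environment", "emissions", "water_use",
                             "governance", "board_independence"]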
+ class BaseLabelerPlugin(BasePlugin):
+     """
+     Stage 4 plugin: Apply labels to statements.
+
+     Adds classification labels (sentiment, relation type, confidence)
+     to the final labeled statements.
+
+     Labelers can provide a classification_schema that extractors will use
+     to run classification in a single model pass. The results are stored
+     in the pipeline context for the labeler to retrieve.
+     """
+
+     @property
+     @abstractmethod
+     def label_type(self) -> str:
+         """
+         The type of label this plugin produces.
+
+         Examples: 'sentiment', 'relation_type', 'confidence'
+         """
+         ...
+
+     @property
+     def classification_schema(self) -> ClassificationSchema | None:
+         """
+         Simple multi-choice classification schema (2-20 choices).
+
+         If provided, the GLiNER2 extractor will run `.classification()` and
+         store results in context for this labeler to retrieve.
+
+         Returns:
+             ClassificationSchema or None
+         """
+         return None
+
+     @property
+     def taxonomy_schema(self) -> TaxonomySchema | None:
+         """
+         Large taxonomy schema (100s of values).
+
+         If provided, requires MNLI or embedding-based classification.
+         Results are stored in context for this labeler to retrieve.
+
+         Returns:
+             TaxonomySchema or None
+         """
+         return None
+
+     @abstractmethod
+     def label(
+         self,
+         statement: "PipelineStatement",
+         subject_canonical: "CanonicalEntity",
+         object_canonical: "CanonicalEntity",
+         context: "PipelineContext",
+     ) -> "StatementLabel | None":
+         """
+         Apply a label to a statement.
+
+         Args:
+             statement: The statement to label
+             subject_canonical: Canonicalized subject entity
+             object_canonical: Canonicalized object entity
+             context: Pipeline context (check context.classification_results for pre-computed labels)
+
+         Returns:
+             StatementLabel if applicable, None otherwise
+         """
+         ...
+
+
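# Usage sketch (hypothetical, not from the package): a labeler that declares
# a classification_schema and reads the pre-computed result back out of the
# context, as the docstrings above describe. The exact shape of
# context.classification_results and the StatementLabel constructor are
# assumptions for illustration.
class SentimentLabeler(BaseLabelerPlugin):
    @property
    def name(self) -> str:
        return "sentiment"

    @property
    def label_type(self) -> str:
        return "sentiment"

    @property
    def classification_schema(self) -> ClassificationSchema | None:
        return ClassificationSchema("sentiment", ["positive", "negative", "neutral"])

    def label(self, statement, subject_canonical, object_canonical, context):
        from ..models import StatementLabel
        value = context.classification_results.get("sentiment")  # assumed mapping
        if value is None:
            return None
        return StatementLabel(label_type="sentiment", value=value)  # fields assumed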
+ class BaseTaxonomyPlugin(BasePlugin):
+     """
+     Stage 5 plugin: Classify statements against a taxonomy.
+
+     Taxonomy classification is separate from labeling because:
+     - It operates on large taxonomies (100s-1000s of labels)
+     - It requires specialized models (MNLI, embeddings)
+     - It's computationally heavier than simple labeling
+
+     Taxonomy plugins produce TaxonomyResult objects that are stored
+     in the pipeline context.
+     """
+
+     @property
+     @abstractmethod
+     def taxonomy_name(self) -> str:
+         """
+         Name of the taxonomy this plugin classifies against.
+
+         Examples: 'esg_topics', 'industry_codes', 'relation_types'
+         """
+         ...
+
+     @property
+     def taxonomy_schema(self) -> TaxonomySchema | None:
+         """
+         The taxonomy schema this plugin uses.
+
+         Returns:
+             TaxonomySchema describing the taxonomy structure
+         """
+         return None
+
+     @property
+     def supported_categories(self) -> list[str]:
+         """
+         List of taxonomy categories this plugin supports.
+
+         Returns an empty list if all categories are supported.
+         """
+         return []
+
+     @abstractmethod
+     def classify(
+         self,
+         statement: "PipelineStatement",
+         subject_canonical: "CanonicalEntity",
+         object_canonical: "CanonicalEntity",
+         context: "PipelineContext",
+     ) -> list["TaxonomyResult"]:
+         """
+         Classify a statement against the taxonomy.
+
+         Returns all labels above the confidence threshold. A single statement
+         may have multiple applicable taxonomy labels.
+
+         Args:
+             statement: The statement to classify
+             subject_canonical: Canonicalized subject entity
+             object_canonical: Canonicalized object entity
+             context: Pipeline context
+
+         Returns:
+             List of TaxonomyResult objects (empty if none above threshold)
+         """
+         ...
+
+     def classify_batch(
+         self,
+         items: list[tuple["PipelineStatement", "CanonicalEntity", "CanonicalEntity"]],
+         context: "PipelineContext",
+     ) -> list[list["TaxonomyResult"]]:
+         """
+         Classify multiple statements against the taxonomy in a single batch.
+
+         Default implementation calls classify() for each statement sequentially.
+         Plugins with BATCH_PROCESSING capability should override this
+         for efficient GPU batching using get_optimal_batch_size().
+
+         Args:
+             items: List of (statement, subject_canonical, object_canonical) tuples
+             context: Pipeline context
+
+         Returns:
+             List of TaxonomyResult lists, one per input statement
+         """
+         return [
+             self.classify(stmt, subj, obj, context)
+             for stmt, subj, obj in items
+         ]
+
+
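# Usage sketch (hypothetical, not from the package): the chunking pattern the
# batch docstrings suggest, driving a GPU-sized loop from
# get_optimal_batch_size() inside an overridden classify_batch().
def iter_chunks(seq: list, size: int):
    """Yield consecutive slices of at most `size` items."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

# Inside a BATCH_PROCESSING plugin, an override would look roughly like:
#
#     def classify_batch(self, items, context):
#         results = []
#         for chunk in iter_chunks(items, self.get_optimal_batch_size()):
#             results.extend(self._forward_pass(chunk, context))  # one GPU call (hypothetical helper)
#         return results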
+ # =============================================================================
+ # Content Acquisition Plugins (for URL processing)
+ # =============================================================================
+
+
+ class ContentType(str, Enum):
+     """Content type detected from URL or HTTP response."""
+     HTML = "html"
+     PDF = "pdf"
+     BINARY = "binary"
+     UNKNOWN = "unknown"
+
+
+ class ScraperResult(BaseModel):
+     """Result from a scraper plugin."""
+     url: str = Field(description="Original URL requested")
+     final_url: str = Field(description="Final URL after redirects")
+     content: bytes = Field(description="Raw content bytes")
+     content_type: ContentType = Field(description="Detected content type")
+     headers: dict[str, str] = Field(default_factory=dict, description="Response headers")
+     error: Optional[str] = Field(default=None, description="Error message if fetch failed")
+
+     model_config = {"arbitrary_types_allowed": True}
+
+     @property
+     def ok(self) -> bool:
+         """Check if the fetch was successful."""
+         return self.error is None and len(self.content) > 0
+
+
+ class PDFParseResult(BaseModel):
+     """Result from a PDF parser plugin."""
+     pages: list[str] = Field(description="Extracted text for each page")
+     page_count: int = Field(description="Total number of pages in PDF")
+     metadata: dict[str, Any] = Field(default_factory=dict, description="PDF metadata (title, author, etc.)")
+     error: Optional[str] = Field(default=None, description="Error message if parsing failed")
+
+     @property
+     def ok(self) -> bool:
+         """Check if parsing was successful."""
+         return self.error is None
+
+     @property
+     def full_text(self) -> str:
+         """Get concatenated text from all pages."""
+         return "\n\n".join(self.pages)
+
+
+ class BaseScraperPlugin(BasePlugin):
+     """
+     Plugin for fetching content from URLs.
+
+     Scrapers handle HTTP requests, redirects, retries, and content type detection.
+     They return raw bytes that can be processed by appropriate parsers (HTML, PDF, etc.).
+
+     Example implementation:
+         @PluginRegistry.scraper
+         class MyScraperPlugin(BaseScraperPlugin):
+             @property
+             def name(self) -> str:
+                 return "my_scraper"
+
+             async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
+                 # Implement fetching logic
+                 ...
+     """
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         """Scrapers support async processing by default."""
+         return PluginCapability.ASYNC_PROCESSING | PluginCapability.EXTERNAL_API
+
+     @abstractmethod
+     async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
+         """
+         Fetch content from a URL.
+
+         Args:
+             url: The URL to fetch
+             timeout: Request timeout in seconds
+
+         Returns:
+             ScraperResult with content, content type, and any errors
+         """
+         ...
+
+     async def head(self, url: str, timeout: float = 10.0) -> ScraperResult:
+         """
+         Check content type without downloading the full body.
+
+         Default implementation does a full fetch. Override for efficiency.
+
+         Args:
+             url: The URL to check
+             timeout: Request timeout in seconds
+
+         Returns:
+             ScraperResult with content_type populated (content may be empty)
+         """
+         return await self.fetch(url, timeout)
+
+     def is_supported_url(self, url: str) -> bool:
+         """
+         Check if this scraper can handle the URL.
+
+         Override to restrict to specific URL patterns or domains.
+
+         Args:
+             url: The URL to check
+
+         Returns:
+             True if this scraper can handle the URL
+         """
+         return True
+
+
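# Usage sketch (hypothetical, not from the package): a fetch() built on
# httpx, which is an assumption here; the package's own HTTP scraper lives in
# plugins/scrapers/http.py. Content type is guessed from the response header.
import httpx

class HttpxScraperPlugin(BaseScraperPlugin):
    @property
    def name(self) -> str:
        return "httpx_scraper"

    async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
        try:
            async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
                resp = await client.get(url)
            ct = resp.headers.get("content-type", "").lower()
            if "pdf" in ct:
                content_type = ContentType.PDF
            elif "html" in ct:
                content_type = ContentType.HTML
            else:
                content_type = ContentType.UNKNOWN
            return ScraperResult(
                url=url,
                final_url=str(resp.url),
                content=resp.content,
                content_type=content_type,
                headers=dict(resp.headers),
            )
        except Exception as exc:
            # ScraperResult.ok will be False: error is set and content is empty.
            return ScraperResult(url=url, final_url=url, content=b"",
                                 content_type=ContentType.UNKNOWN, error=str(exc))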
+ class BasePDFParserPlugin(BasePlugin):
+     """
+     Plugin for extracting text from PDF files.
+
+     PDF parsers take raw PDF bytes and extract text content page by page.
+     They may support OCR for image-heavy PDFs.
+
+     Example implementation:
+         @PluginRegistry.pdf_parser
+         class MyPDFParserPlugin(BasePDFParserPlugin):
+             @property
+             def name(self) -> str:
+                 return "my_pdf_parser"
+
+             def parse(self, pdf_bytes: bytes, ...) -> PDFParseResult:
+                 # Implement parsing logic
+                 ...
+     """
+
+     @abstractmethod
+     def parse(
+         self,
+         pdf_bytes: bytes,
+         max_pages: int = 500,
+         use_ocr: bool = False,
+     ) -> PDFParseResult:
+         """
+         Extract text from PDF bytes.
+
+         Args:
+             pdf_bytes: Raw PDF file content
+             max_pages: Maximum number of pages to process
+             use_ocr: Force OCR even for text-extractable PDFs
+
+         Returns:
+             PDFParseResult with extracted text for each page
+         """
+         ...
+
+     @property
+     def supports_ocr(self) -> bool:
+         """
+         Whether this parser supports OCR for image-heavy PDFs.
+
+         Returns:
+             True if OCR is available
+         """
+         return False
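# Usage sketch (hypothetical, not from the package): a parse() on top of
# pypdf. The diff adds plugins/pdf/pypdf.py, so pypdf is the natural backend,
# but this exact code is an illustration, not the package's implementation;
# PDF metadata extraction is omitted for brevity.
from io import BytesIO
from pypdf import PdfReader

class PyPDFParserPlugin(BasePDFParserPlugin):
    @property
    def name(self) -> str:
        return "pypdf_sketch"

    def parse(
        self,
        pdf_bytes: bytes,
        max_pages: int = 500,
        use_ocr: bool = False,  # ignored: supports_ocr stays False
    ) -> PDFParseResult:
        try:
            reader = PdfReader(BytesIO(pdf_bytes))
            pages = []
            for index, page in enumerate(reader.pages):
                if index >= max_pages:
                    break
                pages.append(page.extract_text() or "")
            return PDFParseResult(pages=pages, page_count=len(reader.pages))
        except Exception as exc:
            return PDFParseResult(pages=[], page_count=0, error=str(exc))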