corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/__init__.py
@@ -4,10 +4,9 @@ Plugins module for the extraction pipeline.
  Contains all plugin implementations organized by stage:
  - splitters/: Stage 1 - Text to atomic triples
  - extractors/: Stage 2 - Refine entities and relations
- - qualifiers/: Stage 3 - Add qualifiers and identifiers
- - canonicalizers/: Stage 4 - Resolve canonical forms
- - labelers/: Stage 5 - Classify statements
- - taxonomy/: Stage 6 - Taxonomy classification
+ - qualifiers/: Stage 3 - Qualify entities (add identifiers, canonical names, FQN)
+ - labelers/: Stage 4 - Classify statements
+ - taxonomy/: Stage 5 - Taxonomy classification
  """
  
  from .base import (
@@ -16,13 +15,20 @@ from .base import (
      BaseSplitterPlugin,
      BaseExtractorPlugin,
      BaseQualifierPlugin,
-     BaseCanonicalizerPlugin,
      BaseLabelerPlugin,
      BaseTaxonomyPlugin,
+     # Content acquisition plugins
+     ContentType,
+     ScraperResult,
+     PDFParseResult,
+     BaseScraperPlugin,
+     BasePDFParserPlugin,
  )
  
  # Import plugin modules for auto-registration
- from . import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
+ from . import splitters, extractors, qualifiers, labelers, taxonomy
+ # Content acquisition plugins
+ from . import scrapers, pdf
  
  __all__ = [
      "PluginCapability",
@@ -30,14 +36,20 @@ __all__ = [
      "BaseSplitterPlugin",
      "BaseExtractorPlugin",
      "BaseQualifierPlugin",
-     "BaseCanonicalizerPlugin",
      "BaseLabelerPlugin",
      "BaseTaxonomyPlugin",
+     # Content acquisition plugins
+     "ContentType",
+     "ScraperResult",
+     "PDFParseResult",
+     "BaseScraperPlugin",
+     "BasePDFParserPlugin",
      # Plugin modules
      "splitters",
      "extractors",
      "qualifiers",
-     "canonicalizers",
      "labelers",
      "taxonomy",
+     "scrapers",
+     "pdf",
  ]

statement_extractor/plugins/base.py
@@ -4,15 +4,20 @@ Base plugin classes for the extraction pipeline.
  Defines the abstract interfaces for each pipeline stage:
  - BaseSplitterPlugin: Stage 1 - Text → RawTriple
  - BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
- - BaseQualifierPlugin: Stage 3 - Entity → EntityQualifiers
- - BaseCanonicalizerPlugin: Stage 4 - QualifiedEntity → CanonicalMatch
- - BaseLabelerPlugin: Stage 5 - Statement → StatementLabel
- - BaseTaxonomyPlugin: Stage 6 - Statement → TaxonomyResult
+ - BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
+ - BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
+ - BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult
+
+ Content acquisition plugins (for URL processing):
+ - BaseScraperPlugin: Fetch content from URLs
+ - BasePDFParserPlugin: Extract text from PDFs
  """
  
  from abc import ABC, abstractmethod
- from enum import Flag, auto
- from typing import TYPE_CHECKING
+ from enum import Enum, Flag, auto
+ from typing import TYPE_CHECKING, Any, Optional
+
+ from pydantic import BaseModel, Field
  
  if TYPE_CHECKING:
      from ..pipeline.context import PipelineContext
@@ -20,9 +25,6 @@ if TYPE_CHECKING:
      RawTriple,
      PipelineStatement,
      ExtractedEntity,
-     EntityQualifiers,
-     QualifiedEntity,
-     CanonicalMatch,
      CanonicalEntity,
      StatementLabel,
      TaxonomyResult,
@@ -40,6 +42,56 @@ class PluginCapability(Flag):
      CACHING = auto()  # Supports result caching
  
  
+ def get_available_vram_gb() -> float:
+     """
+     Get available GPU VRAM in GB.
+
+     Returns 0.0 if no GPU is available or VRAM cannot be determined.
+     """
+     try:
+         import torch
+         if torch.cuda.is_available():
+             device = torch.cuda.current_device()
+             total = torch.cuda.get_device_properties(device).total_memory
+             allocated = torch.cuda.memory_allocated(device)
+             available = (total - allocated) / (1024 ** 3)  # Convert to GB
+             return available
+         elif torch.backends.mps.is_available():
+             # MPS doesn't expose VRAM info; estimate based on typical Apple Silicon
+             # Return a conservative estimate
+             return 8.0
+     except ImportError:
+         pass
+     return 0.0
+
+
+ def calculate_batch_size(
+     per_item_vram_gb: float,
+     overhead_gb: float = 2.0,
+     min_batch: int = 1,
+     max_batch: int = 32,
+ ) -> int:
+     """
+     Calculate optimal batch size based on available VRAM.
+
+     Args:
+         per_item_vram_gb: VRAM required per item in GB
+         overhead_gb: Reserved VRAM for model weights and system overhead
+         min_batch: Minimum batch size
+         max_batch: Maximum batch size cap
+
+     Returns:
+         Optimal batch size for the current GPU
+     """
+     available = get_available_vram_gb()
+     if available <= 0 or per_item_vram_gb <= 0:
+         return min_batch
+
+     usable = max(0, available - overhead_gb)
+     batch_size = int(usable / per_item_vram_gb)
+     return max(min_batch, min(batch_size, max_batch))
+
+
  class BasePlugin(ABC):
      """
      Base class for all pipeline plugins.
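
For illustration only (not part of the package diff), a quick sketch of how these helpers behave, with hypothetical numbers:

    from statement_extractor.plugins.base import calculate_batch_size

    # On a GPU reporting ~13 GB free: usable = 13 - 5 = 8 GB, 8 / 0.25 = 32 items.
    # On a CPU-only machine, get_available_vram_gb() returns 0.0, so this falls
    # back to min_batch (1).
    batch = calculate_batch_size(per_item_vram_gb=0.25, overhead_gb=5.0, max_batch=32)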
@@ -74,6 +126,50 @@ class BasePlugin(ABC):
          """Human-readable description of this plugin."""
          return ""
  
+     @property
+     def model_vram_gb(self) -> float:
+         """
+         Estimated VRAM required for model weights in GB.
+
+         Override this if the plugin loads a GPU model. This is used to
+         reserve memory overhead when calculating batch sizes.
+
+         Default is 0.0 (no GPU model).
+         """
+         return 0.0
+
+     @property
+     def per_item_vram_gb(self) -> float:
+         """
+         Estimated VRAM required per item during batch processing in GB.
+
+         Override this for plugins with BATCH_PROCESSING capability.
+         Used to calculate optimal batch size: batch = (available - overhead) / per_item
+
+         Default is 0.1 GB (100MB) as a conservative estimate.
+         """
+         return 0.1
+
+     def get_optimal_batch_size(self, max_batch: int = 32) -> int:
+         """
+         Calculate optimal batch size based on available VRAM and plugin requirements.
+
+         Args:
+             max_batch: Maximum batch size cap
+
+         Returns:
+             Optimal batch size for current GPU state
+         """
+         if not (PluginCapability.BATCH_PROCESSING in self.capabilities):
+             return 1
+
+         return calculate_batch_size(
+             per_item_vram_gb=self.per_item_vram_gb,
+             overhead_gb=self.model_vram_gb + 1.0,  # Add 1GB system overhead
+             min_batch=1,
+             max_batch=max_batch,
+         )
+
  
  class BaseSplitterPlugin(BasePlugin):
      """
@@ -101,6 +197,27 @@ class BaseSplitterPlugin(BasePlugin):
          """
          ...
  
+     def split_batch(
+         self,
+         texts: list[str],
+         context: "PipelineContext",
+     ) -> list[list["RawTriple"]]:
+         """
+         Split multiple texts into atomic triples in a single batch.
+
+         Default implementation calls split() for each text sequentially.
+         Plugins with BATCH_PROCESSING capability should override this
+         for efficient GPU batching using get_optimal_batch_size().
+
+         Args:
+             texts: List of input texts to split
+             context: Pipeline context for accessing metadata and config
+
+         Returns:
+             List of RawTriple lists, one per input text
+         """
+         return [self.split(text, context) for text in texts]
+
  
  class BaseExtractorPlugin(BasePlugin):
      """
@@ -132,10 +249,14 @@ class BaseExtractorPlugin(BasePlugin):
  
  class BaseQualifierPlugin(BasePlugin):
      """
-     Stage 3 plugin: Add qualifiers and identifiers to entities.
+     Stage 3 plugin: Qualify entities with identifiers and canonical forms.
+
+     Processes entities of specific types and adds:
+     - Semantic qualifiers (role, org for PERSON entities)
+     - External identifiers (LEI, company number, SEC CIK)
+     - Canonical name and FQN (fully qualified name)
  
-     Processes entities of specific types and adds semantic qualifiers
-     (role, org) or external identifiers (LEI, company number).
+     Returns a CanonicalEntity ready for use in labeled statements.
      """
  
      @property
@@ -167,67 +288,26 @@ class BaseQualifierPlugin(BasePlugin):
          self,
          entity: "ExtractedEntity",
          context: "PipelineContext",
-     ) -> "EntityQualifiers | None":
+     ) -> "CanonicalEntity | None":
          """
-         Add qualifiers to an entity.
+         Qualify an entity and return its canonical form.
+
+         This method should:
+         1. Look up identifiers (LEI, CIK, company number, etc.)
+         2. Find the canonical name if available
+         3. Generate the FQN (fully qualified name)
+         4. Return a CanonicalEntity with all information
  
          Args:
              entity: The entity to qualify
              context: Pipeline context (for accessing source text, other entities)
  
          Returns:
-             EntityQualifiers with added information, or None if nothing to add
+             CanonicalEntity with qualifiers and FQN, or None if entity not found
          """
          ...
  
  
- class BaseCanonicalizerPlugin(BasePlugin):
-     """
-     Stage 4 plugin: Resolve entities to canonical forms.
-
-     Takes qualified entities and finds their canonical representations
-     using various matching strategies (identifier, name, fuzzy, LLM).
-     """
-
-     @property
-     @abstractmethod
-     def supported_entity_types(self) -> set["EntityType"]:
-         """Entity types this plugin can canonicalize."""
-         ...
-
-     @abstractmethod
-     def find_canonical(
-         self,
-         entity: "QualifiedEntity",
-         context: "PipelineContext",
-     ) -> "CanonicalMatch | None":
-         """
-         Find canonical form for an entity.
-
-         Args:
-             entity: Qualified entity to canonicalize
-             context: Pipeline context
-
-         Returns:
-             CanonicalMatch if found, None otherwise
-         """
-         ...
-
-     def format_fqn(
-         self,
-         entity: "QualifiedEntity",
-         match: "CanonicalMatch | None",
-     ) -> str:
-         """
-         Format the fully qualified name for display.
-
-         Can be overridden by subclasses for custom formatting.
-         Default implementation uses CanonicalEntity._generate_fqn.
-         """
-         from ..models import CanonicalEntity
-         return CanonicalEntity._generate_fqn(entity, match)
-
-
  class ClassificationSchema:
      """
      Schema for simple multi-choice classification (2-20 choices).
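
For illustration only (not part of the package diff): a rough sketch of a Stage 3 qualifier under the new contract. It assumes the abstract method is named qualify() (the hunk starts mid-signature), and the ExtractedEntity and CanonicalEntity field names used here are invented for the sketch.

    from statement_extractor.models import CanonicalEntity, EntityType
    from statement_extractor.plugins.base import BaseQualifierPlugin

    class DemoQualifier(BaseQualifierPlugin):  # hypothetical example
        @property
        def name(self) -> str:
            return "demo_qualifier"

        @property
        def supported_entity_types(self) -> set:
            return {EntityType.ORGANIZATION}

        def qualify(self, entity, context):  # method name assumed
            # 1. Look up identifiers (toy in-memory table standing in for GLEIF etc.)
            lei = {"Acme Corp": "LEI-EXAMPLE-0001"}.get(entity.text)  # .text assumed
            if lei is None:
                return None  # entity not found
            # 2.-4. Canonical name, identifiers and FQN go into the CanonicalEntity.
            return CanonicalEntity(name="Acme Corp", identifiers={"lei": lei})  # fields assumed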
@@ -309,7 +389,7 @@ class TaxonomySchema:
  
  class BaseLabelerPlugin(BasePlugin):
      """
-     Stage 5 plugin: Apply labels to statements.
+     Stage 4 plugin: Apply labels to statements.
  
      Adds classification labels (sentiment, relation type, confidence)
      to the final labeled statements.
@@ -380,7 +460,7 @@ class BaseLabelerPlugin(BasePlugin):
  
  class BaseTaxonomyPlugin(BasePlugin):
      """
-     Stage 6 plugin: Classify statements against a taxonomy.
+     Stage 5 plugin: Classify statements against a taxonomy.
  
      Taxonomy classification is separate from labeling because:
      - It operates on large taxonomies (100s-1000s of labels)
@@ -444,3 +524,193 @@ class BaseTaxonomyPlugin(BasePlugin):
              List of TaxonomyResult objects (empty if none above threshold)
          """
          ...
+
+     def classify_batch(
+         self,
+         items: list[tuple["PipelineStatement", "CanonicalEntity", "CanonicalEntity"]],
+         context: "PipelineContext",
+     ) -> list[list["TaxonomyResult"]]:
+         """
+         Classify multiple statements against the taxonomy in a single batch.
+
+         Default implementation calls classify() for each statement sequentially.
+         Plugins with BATCH_PROCESSING capability should override this
+         for efficient GPU batching using get_optimal_batch_size().
+
+         Args:
+             items: List of (statement, subject_canonical, object_canonical) tuples
+             context: Pipeline context
+
+         Returns:
+             List of TaxonomyResult lists, one per input statement
+         """
+         return [
+             self.classify(stmt, subj, obj, context)
+             for stmt, subj, obj in items
+         ]
+
+
+ # =============================================================================
+ # Content Acquisition Plugins (for URL processing)
+ # =============================================================================
+
+
+ class ContentType(str, Enum):
+     """Content type detected from URL or HTTP response."""
+     HTML = "html"
+     PDF = "pdf"
+     BINARY = "binary"
+     UNKNOWN = "unknown"
+
+
+ class ScraperResult(BaseModel):
+     """Result from a scraper plugin."""
+     url: str = Field(description="Original URL requested")
+     final_url: str = Field(description="Final URL after redirects")
+     content: bytes = Field(description="Raw content bytes")
+     content_type: ContentType = Field(description="Detected content type")
+     headers: dict[str, str] = Field(default_factory=dict, description="Response headers")
+     error: Optional[str] = Field(default=None, description="Error message if fetch failed")
+
+     model_config = {"arbitrary_types_allowed": True}
+
+     @property
+     def ok(self) -> bool:
+         """Check if the fetch was successful."""
+         return self.error is None and len(self.content) > 0
+
+
+ class PDFParseResult(BaseModel):
+     """Result from a PDF parser plugin."""
+     pages: list[str] = Field(description="Extracted text for each page")
+     page_count: int = Field(description="Total number of pages in PDF")
+     metadata: dict[str, Any] = Field(default_factory=dict, description="PDF metadata (title, author, etc)")
+     error: Optional[str] = Field(default=None, description="Error message if parsing failed")
+
+     @property
+     def ok(self) -> bool:
+         """Check if parsing was successful."""
+         return self.error is None
+
+     @property
+     def full_text(self) -> str:
+         """Get concatenated text from all pages."""
+         return "\n\n".join(self.pages)
+
+
+ class BaseScraperPlugin(BasePlugin):
+     """
+     Plugin for fetching content from URLs.
+
+     Scrapers handle HTTP requests, redirects, retries, and content type detection.
+     They return raw bytes that can be processed by appropriate parsers (HTML, PDF, etc).
+
+     Example implementation:
+         @PluginRegistry.scraper
+         class MyScraperPlugin(BaseScraperPlugin):
+             @property
+             def name(self) -> str:
+                 return "my_scraper"
+
+             async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
+                 # Implement fetching logic
+                 ...
+     """
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         """Scrapers support async processing by default."""
+         return PluginCapability.ASYNC_PROCESSING | PluginCapability.EXTERNAL_API
+
+     @abstractmethod
+     async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
+         """
+         Fetch content from a URL.
+
+         Args:
+             url: The URL to fetch
+             timeout: Request timeout in seconds
+
+         Returns:
+             ScraperResult with content, content type, and any errors
+         """
+         ...
+
+     async def head(self, url: str, timeout: float = 10.0) -> ScraperResult:
+         """
+         Check content type without downloading the full body.
+
+         Default implementation does a full fetch. Override for efficiency.
+
+         Args:
+             url: The URL to check
+             timeout: Request timeout in seconds
+
+         Returns:
+             ScraperResult with content_type populated (content may be empty)
+         """
+         return await self.fetch(url, timeout)
+
+     def is_supported_url(self, url: str) -> bool:
+         """
+         Check if this scraper can handle the URL.
+
+         Override to restrict to specific URL patterns or domains.
+
+         Args:
+             url: The URL to check
+
+         Returns:
+             True if this scraper can handle the URL
+         """
+         return True
+
+
+ class BasePDFParserPlugin(BasePlugin):
+     """
+     Plugin for extracting text from PDF files.
+
+     PDF parsers take raw PDF bytes and extract text content page by page.
+     They may support OCR for image-heavy PDFs.
+
+     Example implementation:
+         @PluginRegistry.pdf_parser
+         class MyPDFParserPlugin(BasePDFParserPlugin):
+             @property
+             def name(self) -> str:
+                 return "my_pdf_parser"
+
+             def parse(self, pdf_bytes: bytes, ...) -> PDFParseResult:
+                 # Implement parsing logic
+                 ...
+     """
+
+     @abstractmethod
+     def parse(
+         self,
+         pdf_bytes: bytes,
+         max_pages: int = 500,
+         use_ocr: bool = False,
+     ) -> PDFParseResult:
+         """
+         Extract text from PDF bytes.
+
+         Args:
+             pdf_bytes: Raw PDF file content
+             max_pages: Maximum number of pages to process
+             use_ocr: Force OCR even for text-extractable PDFs
+
+         Returns:
+             PDFParseResult with extracted text for each page
+         """
+         ...
+
+     @property
+     def supports_ocr(self) -> bool:
+         """
+         Whether this parser supports OCR for image-heavy PDFs.
+
+         Returns:
+             True if OCR is available
+         """
+         return False
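
For illustration only (not part of the package diff): a minimal scraper against the new base class, using httpx as an assumed HTTP dependency; the bundled scraper in statement_extractor/plugins/scrapers/http.py may differ.

    import httpx
    from statement_extractor.plugins.base import (
        BaseScraperPlugin, ContentType, ScraperResult,
    )

    class DemoScraper(BaseScraperPlugin):  # hypothetical example
        @property
        def name(self) -> str:
            return "demo_scraper"

        async def fetch(self, url: str, timeout: float = 30.0) -> ScraperResult:
            try:
                async with httpx.AsyncClient(follow_redirects=True) as client:
                    resp = await client.get(url, timeout=timeout)
                mime = resp.headers.get("content-type", "")
                content_type = (
                    ContentType.PDF if "pdf" in mime
                    else ContentType.HTML if "html" in mime
                    else ContentType.UNKNOWN
                )
                return ScraperResult(
                    url=url, final_url=str(resp.url), content=resp.content,
                    content_type=content_type, headers=dict(resp.headers),
                )
            except Exception as exc:
                return ScraperResult(
                    url=url, final_url=url, content=b"",
                    content_type=ContentType.UNKNOWN, error=str(exc),
                )

statement_extractor/plugins/extractors/gliner2.py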
@@ -180,6 +180,16 @@ class GLiNER2Extractor(BaseExtractorPlugin):
      def description(self) -> str:
          return "GLiNER2 model for entity and relation extraction"
  
+     @property
+     def model_vram_gb(self) -> float:
+         """GLiNER2 model weights ~0.8GB."""
+         return 0.8
+
+     @property
+     def per_item_vram_gb(self) -> float:
+         """Each triple during batch processing ~0.1GB."""
+         return 0.1
+
      def _get_model(self):
          """Lazy-load the GLiNER2 model."""
          if self._model is None:
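
For illustration only (not part of the package diff): with these overrides, and assuming the extractor declares the BATCH_PROCESSING capability, get_optimal_batch_size() reserves 0.8 + 1.0 GB and divides the rest by 0.1 GB per triple; the free-VRAM figure is hypothetical.

    from statement_extractor.plugins.extractors.gliner2 import GLiNER2Extractor

    # On a GPU reporting 9.8 GB free: int((9.8 - 1.8) / 0.1) = 80, capped at max_batch=32.
    # With no GPU, it falls back to a batch size of 1.
    extractor = GLiNER2Extractor()  # constructor arguments omitted for the sketch
    batch = extractor.get_optimal_batch_size(max_batch=32)

statement_extractor/plugins/labelers/taxonomy.py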
@@ -8,9 +8,19 @@ there are too many possible values for simple multi-choice classification.
  import json
  import logging
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, TypedDict
  
  from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
+
+
+ class TaxonomyEntry(TypedDict):
+     """Structure for each taxonomy label entry."""
+     description: str
+     id: int
+     mnli_label: str
+     embedding_label: str
+
+
  from ...pipeline.context import PipelineContext
  from ...models import (
      PipelineStatement,
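
For illustration only (not part of the package diff): with this change the taxonomy JSON maps each label to a structured entry rather than a bare integer id. A hypothetical entry (values invented; see statement_extractor/data/statement_taxonomy.json for real ones):

    entry: TaxonomyEntry = {
        "description": "Statements about mergers and acquisitions",
        "id": 42,
        "mnli_label": "This statement is about a merger or acquisition",
        "embedding_label": "merger and acquisition activity",
    }
    label_id = entry.get("id")  # the id lookups below now read ids this way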
@@ -214,7 +224,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
          self._top_k_categories = top_k_categories
          self._min_confidence = min_confidence
  
-         self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+         self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
          self._classifier: Optional[TaxonomyClassifier] = None
  
      @property
@@ -250,7 +260,7 @@ class TaxonomyLabeler(BaseLabelerPlugin):
              scope="statement",
          )
  
-     def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+     def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
          """Load taxonomy from JSON file."""
          if self._taxonomy is not None:
              return self._taxonomy
@@ -358,12 +368,15 @@ class TaxonomyLabeler(BaseLabelerPlugin):
          taxonomy = self._load_taxonomy()
  
          if category and category in taxonomy:
-             return taxonomy[category].get(label)
+             entry = taxonomy[category].get(label)
+             if entry:
+                 return entry.get("id")
  
          # Search all categories for flat classification
          for cat_labels in taxonomy.values():
              if label in cat_labels:
-                 return cat_labels[label]
+                 entry = cat_labels[label]
+                 return entry.get("id")
  
          return None
  

statement_extractor/plugins/labelers/taxonomy_embedding.py
@@ -11,10 +11,19 @@ import json
  import logging
  import time
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, TypedDict
  
  import numpy as np
  
+
+ class TaxonomyEntry(TypedDict):
+     """Structure for each taxonomy label entry."""
+     description: str
+     id: int
+     mnli_label: str
+     embedding_label: str
+
+
  from ..base import BaseLabelerPlugin, TaxonomySchema, PluginCapability
  from ...pipeline.context import PipelineContext
  from ...models import (
@@ -106,14 +115,14 @@ class EmbeddingClassifier:
  
      def precompute_label_embeddings(
          self,
-         taxonomy: dict[str, dict[str, int]],
+         taxonomy: dict[str, dict[str, TaxonomyEntry]],
          categories: Optional[list[str]] = None,
      ) -> None:
          """
          Pre-compute embeddings for all label names.
  
          Args:
-             taxonomy: Taxonomy dict {category: {label: id, ...}, ...}
+             taxonomy: Taxonomy dict {category: {label: TaxonomyEntry, ...}, ...}
              categories: Categories to include (default: all)
          """
          self._load_model()
@@ -314,7 +323,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
          self._top_k_categories = top_k_categories
          self._min_confidence = min_confidence
  
-         self._taxonomy: Optional[dict[str, dict[str, int]]] = None
+         self._taxonomy: Optional[dict[str, dict[str, TaxonomyEntry]]] = None
          self._classifier: Optional[EmbeddingClassifier] = None
          self._embeddings_computed = False
  
@@ -350,7 +359,7 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
              scope="statement",
          )
  
-     def _load_taxonomy(self) -> dict[str, dict[str, int]]:
+     def _load_taxonomy(self) -> dict[str, dict[str, TaxonomyEntry]]:
          """Load taxonomy from JSON file."""
          if self._taxonomy is not None:
              return self._taxonomy
@@ -456,7 +465,9 @@ class EmbeddingTaxonomyLabeler(BaseLabelerPlugin):
          taxonomy = self._load_taxonomy()
  
          if category in taxonomy:
-             return taxonomy[category].get(label)
+             entry = taxonomy[category].get(label)
+             if entry:
+                 return entry.get("id")
  
          return None
  

statement_extractor/plugins/pdf/__init__.py (new file)
@@ -0,0 +1,10 @@
+ """
+ PDF parser plugins for extracting text from PDF files.
+
+ Built-in parsers:
+ - pypdf_parser: Default PDF parser using PyMuPDF with optional OCR
+ """
+
+ from .pypdf import PyPDFParserPlugin
+
+ __all__ = ["PyPDFParserPlugin"]
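
For illustration only (not part of the package diff): how the two content acquisition stages might compose, assuming PyPDFParserPlugin implements parse() as defined in base.py and constructs with no arguments (both assumptions).

    from statement_extractor.plugins.base import ContentType
    from statement_extractor.plugins.pdf import PyPDFParserPlugin

    async def extract_pdf_text(scraper, url: str) -> str:
        # scraper: any BaseScraperPlugin implementation
        result = await scraper.fetch(url)
        if not result.ok or result.content_type != ContentType.PDF:
            return ""
        parsed = PyPDFParserPlugin().parse(result.content, max_pages=50)
        return parsed.full_text if parsed.ok else ""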