corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,303 @@
1
+ """
2
+ PluginRegistry - Registration and discovery of plugins.
3
+
4
+ Provides a central registry for all plugin types with decorator-based
5
+ registration and discovery by entity type.
6
+ """
7
+
8
+ import logging
9
+ from typing import TYPE_CHECKING, Type, TypeVar
10
+
11
+ if TYPE_CHECKING:
12
+ from ..plugins.base import (
13
+ BasePlugin,
14
+ BaseSplitterPlugin,
15
+ BaseExtractorPlugin,
16
+ BaseQualifierPlugin,
17
+ BaseLabelerPlugin,
18
+ BaseTaxonomyPlugin,
19
+ BaseScraperPlugin,
20
+ BasePDFParserPlugin,
21
+ )
22
+ from ..models import EntityType
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ T = TypeVar("T", bound="BasePlugin")
27
+
28
+
29
+ class PluginRegistry:
30
+ """
31
+ Central registry for all pipeline plugins.
32
+
33
+ Supports registration by decorator or explicit method call.
34
+ Plugins are sorted by priority (lower = higher priority).
35
+ """
36
+
37
+ # Class-level storage for registered plugins
38
+ _splitters: list["BaseSplitterPlugin"] = []
39
+ _extractors: list["BaseExtractorPlugin"] = []
40
+ _qualifiers: list["BaseQualifierPlugin"] = []
41
+ _labelers: list["BaseLabelerPlugin"] = []
42
+ _taxonomy_classifiers: list["BaseTaxonomyPlugin"] = []
43
+
44
+ # Content acquisition plugins
45
+ _scrapers: list["BaseScraperPlugin"] = []
46
+ _pdf_parsers: list["BasePDFParserPlugin"] = []
47
+
48
+ # Index by entity type for quick lookup
49
+ _qualifiers_by_type: dict["EntityType", list["BaseQualifierPlugin"]] = {}
50
+
51
+ # Index by name for CLI lookup
52
+ _all_plugins: dict[str, "BasePlugin"] = {}
53
+
54
+ @classmethod
55
+ def clear(cls) -> None:
56
+ """Clear all registered plugins (useful for testing)."""
57
+ cls._splitters = []
58
+ cls._extractors = []
59
+ cls._qualifiers = []
60
+ cls._labelers = []
61
+ cls._taxonomy_classifiers = []
62
+ cls._scrapers = []
63
+ cls._pdf_parsers = []
64
+ cls._qualifiers_by_type = {}
65
+ cls._all_plugins = {}
66
+
67
+ # =========================================================================
68
+ # Registration methods
69
+ # =========================================================================
70
+
71
+ @classmethod
72
+ def register_splitter(cls, plugin: "BaseSplitterPlugin") -> None:
73
+ """Register a splitter plugin."""
74
+ cls._splitters.append(plugin)
75
+ cls._splitters.sort(key=lambda p: p.priority)
76
+ cls._all_plugins[plugin.name] = plugin
77
+ logger.debug(f"Registered splitter: {plugin.name} (priority={plugin.priority})")
78
+
79
+ @classmethod
80
+ def register_extractor(cls, plugin: "BaseExtractorPlugin") -> None:
81
+ """Register an extractor plugin."""
82
+ cls._extractors.append(plugin)
83
+ cls._extractors.sort(key=lambda p: p.priority)
84
+ cls._all_plugins[plugin.name] = plugin
85
+ logger.debug(f"Registered extractor: {plugin.name} (priority={plugin.priority})")
86
+
87
+ @classmethod
88
+ def register_qualifier(cls, plugin: "BaseQualifierPlugin") -> None:
89
+ """Register a qualifier plugin."""
90
+ cls._qualifiers.append(plugin)
91
+ cls._qualifiers.sort(key=lambda p: p.priority)
92
+ cls._all_plugins[plugin.name] = plugin
93
+
94
+ # Index by entity type
95
+ for entity_type in plugin.supported_entity_types:
96
+ if entity_type not in cls._qualifiers_by_type:
97
+ cls._qualifiers_by_type[entity_type] = []
98
+ cls._qualifiers_by_type[entity_type].append(plugin)
99
+ cls._qualifiers_by_type[entity_type].sort(key=lambda p: p.priority)
100
+
101
+ logger.debug(
102
+ f"Registered qualifier: {plugin.name} "
103
+ f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
104
+ )
105
+
106
+ @classmethod
107
+ def register_labeler(cls, plugin: "BaseLabelerPlugin") -> None:
108
+ """Register a labeler plugin."""
109
+ cls._labelers.append(plugin)
110
+ cls._labelers.sort(key=lambda p: p.priority)
111
+ cls._all_plugins[plugin.name] = plugin
112
+ logger.debug(f"Registered labeler: {plugin.name} (priority={plugin.priority})")
113
+
114
+ @classmethod
115
+ def register_taxonomy(cls, plugin: "BaseTaxonomyPlugin") -> None:
116
+ """Register a taxonomy classifier plugin."""
117
+ cls._taxonomy_classifiers.append(plugin)
118
+ cls._taxonomy_classifiers.sort(key=lambda p: p.priority)
119
+ cls._all_plugins[plugin.name] = plugin
120
+ logger.debug(f"Registered taxonomy: {plugin.name} (priority={plugin.priority})")
121
+
122
+ @classmethod
123
+ def register_scraper(cls, plugin: "BaseScraperPlugin") -> None:
124
+ """Register a scraper plugin."""
125
+ cls._scrapers.append(plugin)
126
+ cls._scrapers.sort(key=lambda p: p.priority)
127
+ cls._all_plugins[plugin.name] = plugin
128
+ logger.debug(f"Registered scraper: {plugin.name} (priority={plugin.priority})")
129
+
130
+ @classmethod
131
+ def register_pdf_parser(cls, plugin: "BasePDFParserPlugin") -> None:
132
+ """Register a PDF parser plugin."""
133
+ cls._pdf_parsers.append(plugin)
134
+ cls._pdf_parsers.sort(key=lambda p: p.priority)
135
+ cls._all_plugins[plugin.name] = plugin
136
+ logger.debug(f"Registered PDF parser: {plugin.name} (priority={plugin.priority})")
137
+
138
+ # =========================================================================
139
+ # Decorator registration
140
+ # =========================================================================
141
+
142
+ @classmethod
143
+ def splitter(cls, plugin_class: Type[T]) -> Type[T]:
144
+ """Decorator to register a splitter plugin class."""
145
+ cls.register_splitter(plugin_class())
146
+ return plugin_class
147
+
148
+ @classmethod
149
+ def extractor(cls, plugin_class: Type[T]) -> Type[T]:
150
+ """Decorator to register an extractor plugin class."""
151
+ cls.register_extractor(plugin_class())
152
+ return plugin_class
153
+
154
+ @classmethod
155
+ def qualifier(cls, plugin_class: Type[T]) -> Type[T]:
156
+ """Decorator to register a qualifier plugin class."""
157
+ cls.register_qualifier(plugin_class())
158
+ return plugin_class
159
+
160
+ @classmethod
161
+ def labeler(cls, plugin_class: Type[T]) -> Type[T]:
162
+ """Decorator to register a labeler plugin class."""
163
+ cls.register_labeler(plugin_class())
164
+ return plugin_class
165
+
166
+ @classmethod
167
+ def taxonomy(cls, plugin_class: Type[T]) -> Type[T]:
168
+ """Decorator to register a taxonomy classifier plugin class."""
169
+ cls.register_taxonomy(plugin_class())
170
+ return plugin_class
171
+
172
+ @classmethod
173
+ def scraper(cls, plugin_class: Type[T]) -> Type[T]:
174
+ """Decorator to register a scraper plugin class."""
175
+ cls.register_scraper(plugin_class())
176
+ return plugin_class
177
+
178
+ @classmethod
179
+ def pdf_parser(cls, plugin_class: Type[T]) -> Type[T]:
180
+ """Decorator to register a PDF parser plugin class."""
181
+ cls.register_pdf_parser(plugin_class())
182
+ return plugin_class
183
+
184
+ # =========================================================================
185
+ # Retrieval methods
186
+ # =========================================================================
187
+
188
+ @classmethod
189
+ def get_splitters(cls) -> list["BaseSplitterPlugin"]:
190
+ """Get all registered splitter plugins (sorted by priority)."""
191
+ return cls._splitters.copy()
192
+
193
+ @classmethod
194
+ def get_extractors(cls) -> list["BaseExtractorPlugin"]:
195
+ """Get all registered extractor plugins (sorted by priority)."""
196
+ return cls._extractors.copy()
197
+
198
+ @classmethod
199
+ def get_qualifiers(cls) -> list["BaseQualifierPlugin"]:
200
+ """Get all registered qualifier plugins (sorted by priority)."""
201
+ return cls._qualifiers.copy()
202
+
203
+ @classmethod
204
+ def get_qualifiers_for_type(cls, entity_type: "EntityType") -> list["BaseQualifierPlugin"]:
205
+ """Get qualifier plugins that support a specific entity type."""
206
+ return cls._qualifiers_by_type.get(entity_type, []).copy()
207
+
208
+ @classmethod
209
+ def get_labelers(cls) -> list["BaseLabelerPlugin"]:
210
+ """Get all registered labeler plugins (sorted by priority)."""
211
+ return cls._labelers.copy()
212
+
213
+ @classmethod
214
+ def get_taxonomy_classifiers(cls) -> list["BaseTaxonomyPlugin"]:
215
+ """Get all registered taxonomy classifier plugins (sorted by priority)."""
216
+ return cls._taxonomy_classifiers.copy()
217
+
218
+ @classmethod
219
+ def get_scrapers(cls) -> list["BaseScraperPlugin"]:
220
+ """Get all registered scraper plugins (sorted by priority)."""
221
+ return cls._scrapers.copy()
222
+
223
+ @classmethod
224
+ def get_pdf_parsers(cls) -> list["BasePDFParserPlugin"]:
225
+ """Get all registered PDF parser plugins (sorted by priority)."""
226
+ return cls._pdf_parsers.copy()
227
+
228
+ @classmethod
229
+ def get_plugin(cls, name: str) -> "BasePlugin | None":
230
+ """Get a plugin by name."""
231
+ return cls._all_plugins.get(name)
232
+
233
+ @classmethod
234
+ def get_all_plugins(cls) -> dict[str, "BasePlugin"]:
235
+ """Get all registered plugins by name."""
236
+ return cls._all_plugins.copy()
237
+
238
+ @classmethod
239
+ def get_plugins_for_stage(cls, stage: int) -> list["BasePlugin"]:
240
+ """Get all plugins for a specific stage number."""
241
+ if stage == 1:
242
+ return cls._splitters.copy()
243
+ elif stage == 2:
244
+ return cls._extractors.copy()
245
+ elif stage == 3:
246
+ return cls._qualifiers.copy()
247
+ elif stage == 4:
248
+ return cls._labelers.copy()
249
+ elif stage == 5:
250
+ return cls._taxonomy_classifiers.copy()
251
+ return []
252
+
253
+ # =========================================================================
254
+ # Info methods
255
+ # =========================================================================
256
+
257
+ @classmethod
258
+ def list_plugins(cls, stage: int | None = None) -> list[dict]:
259
+ """
260
+ List all plugins with their info.
261
+
262
+ Args:
263
+ stage: Optional stage number to filter by
264
+
265
+ Returns:
266
+ List of plugin info dicts with name, stage, priority, description
267
+ """
268
+ result = []
269
+
270
+ plugins_by_stage = [
271
+ (1, "splitting", cls._splitters),
272
+ (2, "extraction", cls._extractors),
273
+ (3, "qualification", cls._qualifiers),
274
+ (4, "labeling", cls._labelers),
275
+ (5, "taxonomy", cls._taxonomy_classifiers),
276
+ # Content acquisition plugins (stage 0)
277
+ (0, "scraper", cls._scrapers),
278
+ (-1, "pdf_parser", cls._pdf_parsers),
279
+ ]
280
+
281
+ for stage_num, stage_name, plugins in plugins_by_stage:
282
+ if stage is not None and stage != stage_num:
283
+ continue
284
+ for plugin in plugins:
285
+ info = {
286
+ "name": plugin.name,
287
+ "stage": stage_num,
288
+ "stage_name": stage_name,
289
+ "priority": plugin.priority,
290
+ "capabilities": plugin.capabilities.name if plugin.capabilities else "NONE",
291
+ }
292
+ # Add entity types for qualifiers/canonicalizers
293
+ if hasattr(plugin, "supported_entity_types"):
294
+ info["entity_types"] = [t.value for t in plugin.supported_entity_types]
295
+ # Add label type for labelers
296
+ if hasattr(plugin, "label_type"):
297
+ info["label_type"] = plugin.label_type
298
+ # Add taxonomy name for taxonomy classifiers
299
+ if hasattr(plugin, "taxonomy_name"):
300
+ info["taxonomy_name"] = plugin.taxonomy_name
301
+ result.append(info)
302
+
303
+ return result
@@ -0,0 +1,55 @@
1
+ """
2
+ Plugins module for the extraction pipeline.
3
+
4
+ Contains all plugin implementations organized by stage:
5
+ - splitters/: Stage 1 - Text to atomic triples
6
+ - extractors/: Stage 2 - Refine entities and relations
7
+ - qualifiers/: Stage 3 - Qualify entities (add identifiers, canonical names, FQN)
8
+ - labelers/: Stage 4 - Classify statements
9
+ - taxonomy/: Stage 5 - Taxonomy classification
+ - scrapers/, pdf/: Content acquisition plugins (scrapers and PDF parsers)
10
+ """
11
+
12
+ from .base import (
13
+ PluginCapability,
14
+ BasePlugin,
15
+ BaseSplitterPlugin,
16
+ BaseExtractorPlugin,
17
+ BaseQualifierPlugin,
18
+ BaseLabelerPlugin,
19
+ BaseTaxonomyPlugin,
20
+ # Content acquisition plugins
21
+ ContentType,
22
+ ScraperResult,
23
+ PDFParseResult,
24
+ BaseScraperPlugin,
25
+ BasePDFParserPlugin,
26
+ )
27
+
28
+ # Import plugin modules for auto-registration
29
+ from . import splitters, extractors, qualifiers, labelers, taxonomy
30
+ # Content acquisition plugins
31
+ from . import scrapers, pdf
32
+
33
+ __all__ = [
34
+ "PluginCapability",
35
+ "BasePlugin",
36
+ "BaseSplitterPlugin",
37
+ "BaseExtractorPlugin",
38
+ "BaseQualifierPlugin",
39
+ "BaseLabelerPlugin",
40
+ "BaseTaxonomyPlugin",
41
+ # Content acquisition plugins
42
+ "ContentType",
43
+ "ScraperResult",
44
+ "PDFParseResult",
45
+ "BaseScraperPlugin",
46
+ "BasePDFParserPlugin",
47
+ # Plugin modules
48
+ "splitters",
49
+ "extractors",
50
+ "qualifiers",
51
+ "labelers",
52
+ "taxonomy",
53
+ "scrapers",
54
+ "pdf",
55
+ ]