corp-extractor 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +235 -96
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +460 -21
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +32 -47
  8. statement_extractor/gliner_extraction.py +218 -0
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +15 -6
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. statement_extractor/scoring.py +17 -69
  52. corp_extractor-0.3.0.dist-info/RECORD +0 -12
  53. statement_extractor/spacy_extraction.py +0 -386
  54. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  55. {corp_extractor-0.3.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,297 @@
1
+ """
2
+ PluginRegistry - Registration and discovery of plugins.
3
+
4
+ Provides a central registry for all plugin types with decorator-based
5
+ registration and discovery by entity type.
6
+ """
7
+
8
+ import logging
9
+ from typing import TYPE_CHECKING, Type, TypeVar
10
+
11
+ if TYPE_CHECKING:
12
+ from ..plugins.base import (
13
+ BasePlugin,
14
+ BaseSplitterPlugin,
15
+ BaseExtractorPlugin,
16
+ BaseQualifierPlugin,
17
+ BaseCanonicalizerPlugin,
18
+ BaseLabelerPlugin,
19
+ BaseTaxonomyPlugin,
20
+ )
21
+ from ..models import EntityType
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ T = TypeVar("T", bound="BasePlugin")
26
+
27
+
28
+ class PluginRegistry:
29
+ """
30
+ Central registry for all pipeline plugins.
31
+
32
+ Supports registration by decorator or explicit method call.
33
+ Plugins are sorted by priority (lower = higher priority).
34
+ """
35
+
36
+ # Class-level storage for registered plugins
37
+ _splitters: list["BaseSplitterPlugin"] = []
38
+ _extractors: list["BaseExtractorPlugin"] = []
39
+ _qualifiers: list["BaseQualifierPlugin"] = []
40
+ _canonicalizers: list["BaseCanonicalizerPlugin"] = []
41
+ _labelers: list["BaseLabelerPlugin"] = []
42
+ _taxonomy_classifiers: list["BaseTaxonomyPlugin"] = []
43
+
44
+ # Index by entity type for quick lookup
45
+ _qualifiers_by_type: dict["EntityType", list["BaseQualifierPlugin"]] = {}
46
+ _canonicalizers_by_type: dict["EntityType", list["BaseCanonicalizerPlugin"]] = {}
47
+
48
+ # Index by name for CLI lookup
49
+ _all_plugins: dict[str, "BasePlugin"] = {}
50
+
51
+ @classmethod
52
+ def clear(cls) -> None:
53
+ """Clear all registered plugins (useful for testing)."""
54
+ cls._splitters = []
55
+ cls._extractors = []
56
+ cls._qualifiers = []
57
+ cls._canonicalizers = []
58
+ cls._labelers = []
59
+ cls._taxonomy_classifiers = []
60
+ cls._qualifiers_by_type = {}
61
+ cls._canonicalizers_by_type = {}
62
+ cls._all_plugins = {}
63
+
64
+ # =========================================================================
65
+ # Registration methods
66
+ # =========================================================================
67
+
68
+ @classmethod
69
+ def register_splitter(cls, plugin: "BaseSplitterPlugin") -> None:
70
+ """Register a splitter plugin."""
71
+ cls._splitters.append(plugin)
72
+ cls._splitters.sort(key=lambda p: p.priority)
73
+ cls._all_plugins[plugin.name] = plugin
74
+ logger.debug(f"Registered splitter: {plugin.name} (priority={plugin.priority})")
75
+
76
+ @classmethod
77
+ def register_extractor(cls, plugin: "BaseExtractorPlugin") -> None:
78
+ """Register an extractor plugin."""
79
+ cls._extractors.append(plugin)
80
+ cls._extractors.sort(key=lambda p: p.priority)
81
+ cls._all_plugins[plugin.name] = plugin
82
+ logger.debug(f"Registered extractor: {plugin.name} (priority={plugin.priority})")
83
+
84
+ @classmethod
85
+ def register_qualifier(cls, plugin: "BaseQualifierPlugin") -> None:
86
+ """Register a qualifier plugin."""
87
+ cls._qualifiers.append(plugin)
88
+ cls._qualifiers.sort(key=lambda p: p.priority)
89
+ cls._all_plugins[plugin.name] = plugin
90
+
91
+ # Index by entity type
92
+ for entity_type in plugin.supported_entity_types:
93
+ if entity_type not in cls._qualifiers_by_type:
94
+ cls._qualifiers_by_type[entity_type] = []
95
+ cls._qualifiers_by_type[entity_type].append(plugin)
96
+ cls._qualifiers_by_type[entity_type].sort(key=lambda p: p.priority)
97
+
98
+ logger.debug(
99
+ f"Registered qualifier: {plugin.name} "
100
+ f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
101
+ )
102
+
103
+ @classmethod
104
+ def register_canonicalizer(cls, plugin: "BaseCanonicalizerPlugin") -> None:
105
+ """Register a canonicalizer plugin."""
106
+ cls._canonicalizers.append(plugin)
107
+ cls._canonicalizers.sort(key=lambda p: p.priority)
108
+ cls._all_plugins[plugin.name] = plugin
109
+
110
+ # Index by entity type
111
+ for entity_type in plugin.supported_entity_types:
112
+ if entity_type not in cls._canonicalizers_by_type:
113
+ cls._canonicalizers_by_type[entity_type] = []
114
+ cls._canonicalizers_by_type[entity_type].append(plugin)
115
+ cls._canonicalizers_by_type[entity_type].sort(key=lambda p: p.priority)
116
+
117
+ logger.debug(
118
+ f"Registered canonicalizer: {plugin.name} "
119
+ f"(priority={plugin.priority}, types={[t.value for t in plugin.supported_entity_types]})"
120
+ )
121
+
122
+ @classmethod
123
+ def register_labeler(cls, plugin: "BaseLabelerPlugin") -> None:
124
+ """Register a labeler plugin."""
125
+ cls._labelers.append(plugin)
126
+ cls._labelers.sort(key=lambda p: p.priority)
127
+ cls._all_plugins[plugin.name] = plugin
128
+ logger.debug(f"Registered labeler: {plugin.name} (priority={plugin.priority})")
129
+
130
+ @classmethod
131
+ def register_taxonomy(cls, plugin: "BaseTaxonomyPlugin") -> None:
132
+ """Register a taxonomy classifier plugin."""
133
+ cls._taxonomy_classifiers.append(plugin)
134
+ cls._taxonomy_classifiers.sort(key=lambda p: p.priority)
135
+ cls._all_plugins[plugin.name] = plugin
136
+ logger.debug(f"Registered taxonomy: {plugin.name} (priority={plugin.priority})")
137
+
138
+ # =========================================================================
139
+ # Decorator registration
140
+ # =========================================================================
141
+
142
+ @classmethod
143
+ def splitter(cls, plugin_class: Type[T]) -> Type[T]:
144
+ """Decorator to register a splitter plugin class."""
145
+ cls.register_splitter(plugin_class())
146
+ return plugin_class
147
+
148
+ @classmethod
149
+ def extractor(cls, plugin_class: Type[T]) -> Type[T]:
150
+ """Decorator to register an extractor plugin class."""
151
+ cls.register_extractor(plugin_class())
152
+ return plugin_class
153
+
154
+ @classmethod
155
+ def qualifier(cls, plugin_class: Type[T]) -> Type[T]:
156
+ """Decorator to register a qualifier plugin class."""
157
+ cls.register_qualifier(plugin_class())
158
+ return plugin_class
159
+
160
+ @classmethod
161
+ def canonicalizer(cls, plugin_class: Type[T]) -> Type[T]:
162
+ """Decorator to register a canonicalizer plugin class."""
163
+ cls.register_canonicalizer(plugin_class())
164
+ return plugin_class
165
+
166
+ @classmethod
167
+ def labeler(cls, plugin_class: Type[T]) -> Type[T]:
168
+ """Decorator to register a labeler plugin class."""
169
+ cls.register_labeler(plugin_class())
170
+ return plugin_class
171
+
172
+ @classmethod
173
+ def taxonomy(cls, plugin_class: Type[T]) -> Type[T]:
174
+ """Decorator to register a taxonomy classifier plugin class."""
175
+ cls.register_taxonomy(plugin_class())
176
+ return plugin_class
177
+
178
+ # =========================================================================
179
+ # Retrieval methods
180
+ # =========================================================================
181
+
182
+ @classmethod
183
+ def get_splitters(cls) -> list["BaseSplitterPlugin"]:
184
+ """Get all registered splitter plugins (sorted by priority)."""
185
+ return cls._splitters.copy()
186
+
187
+ @classmethod
188
+ def get_extractors(cls) -> list["BaseExtractorPlugin"]:
189
+ """Get all registered extractor plugins (sorted by priority)."""
190
+ return cls._extractors.copy()
191
+
192
+ @classmethod
193
+ def get_qualifiers(cls) -> list["BaseQualifierPlugin"]:
194
+ """Get all registered qualifier plugins (sorted by priority)."""
195
+ return cls._qualifiers.copy()
196
+
197
+ @classmethod
198
+ def get_qualifiers_for_type(cls, entity_type: "EntityType") -> list["BaseQualifierPlugin"]:
199
+ """Get qualifier plugins that support a specific entity type."""
200
+ return cls._qualifiers_by_type.get(entity_type, []).copy()
201
+
202
+ @classmethod
203
+ def get_canonicalizers(cls) -> list["BaseCanonicalizerPlugin"]:
204
+ """Get all registered canonicalizer plugins (sorted by priority)."""
205
+ return cls._canonicalizers.copy()
206
+
207
+ @classmethod
208
+ def get_canonicalizers_for_type(cls, entity_type: "EntityType") -> list["BaseCanonicalizerPlugin"]:
209
+ """Get canonicalizer plugins that support a specific entity type."""
210
+ return cls._canonicalizers_by_type.get(entity_type, []).copy()
211
+
212
+ @classmethod
213
+ def get_labelers(cls) -> list["BaseLabelerPlugin"]:
214
+ """Get all registered labeler plugins (sorted by priority)."""
215
+ return cls._labelers.copy()
216
+
217
+ @classmethod
218
+ def get_taxonomy_classifiers(cls) -> list["BaseTaxonomyPlugin"]:
219
+ """Get all registered taxonomy classifier plugins (sorted by priority)."""
220
+ return cls._taxonomy_classifiers.copy()
221
+
222
+ @classmethod
223
+ def get_plugin(cls, name: str) -> "BasePlugin | None":
224
+ """Get a plugin by name."""
225
+ return cls._all_plugins.get(name)
226
+
227
+ @classmethod
228
+ def get_all_plugins(cls) -> dict[str, "BasePlugin"]:
229
+ """Get all registered plugins by name."""
230
+ return cls._all_plugins.copy()
231
+
232
+ @classmethod
233
+ def get_plugins_for_stage(cls, stage: int) -> list["BasePlugin"]:
234
+ """Get all plugins for a specific stage number."""
235
+ if stage == 1:
236
+ return cls._splitters.copy()
237
+ elif stage == 2:
238
+ return cls._extractors.copy()
239
+ elif stage == 3:
240
+ return cls._qualifiers.copy()
241
+ elif stage == 4:
242
+ return cls._canonicalizers.copy()
243
+ elif stage == 5:
244
+ return cls._labelers.copy()
245
+ elif stage == 6:
246
+ return cls._taxonomy_classifiers.copy()
247
+ return []
248
+
249
+ # =========================================================================
250
+ # Info methods
251
+ # =========================================================================
252
+
253
+ @classmethod
254
+ def list_plugins(cls, stage: int | None = None) -> list[dict]:
255
+ """
256
+ List all plugins with their info.
257
+
258
+ Args:
259
+ stage: Optional stage number to filter by
260
+
261
+ Returns:
262
+ List of plugin info dicts with name, stage, priority, description
263
+ """
264
+ result = []
265
+
266
+ plugins_by_stage = [
267
+ (1, "splitting", cls._splitters),
268
+ (2, "extraction", cls._extractors),
269
+ (3, "qualification", cls._qualifiers),
270
+ (4, "canonicalization", cls._canonicalizers),
271
+ (5, "labeling", cls._labelers),
272
+ (6, "taxonomy", cls._taxonomy_classifiers),
273
+ ]
274
+
275
+ for stage_num, stage_name, plugins in plugins_by_stage:
276
+ if stage is not None and stage != stage_num:
277
+ continue
278
+ for plugin in plugins:
279
+ info = {
280
+ "name": plugin.name,
281
+ "stage": stage_num,
282
+ "stage_name": stage_name,
283
+ "priority": plugin.priority,
284
+ "capabilities": plugin.capabilities.name if plugin.capabilities else "NONE",
285
+ }
286
+ # Add entity types for qualifiers/canonicalizers
287
+ if hasattr(plugin, "supported_entity_types"):
288
+ info["entity_types"] = [t.value for t in plugin.supported_entity_types]
289
+ # Add label type for labelers
290
+ if hasattr(plugin, "label_type"):
291
+ info["label_type"] = plugin.label_type
292
+ # Add taxonomy name for taxonomy classifiers
293
+ if hasattr(plugin, "taxonomy_name"):
294
+ info["taxonomy_name"] = plugin.taxonomy_name
295
+ result.append(info)
296
+
297
+ return result
@@ -0,0 +1,43 @@
1
+ """
2
+ Plugins module for the extraction pipeline.
3
+
4
+ Contains all plugin implementations organized by stage:
5
+ - splitters/: Stage 1 - Text to atomic triples
6
+ - extractors/: Stage 2 - Refine entities and relations
7
+ - qualifiers/: Stage 3 - Add qualifiers and identifiers
8
+ - canonicalizers/: Stage 4 - Resolve canonical forms
9
+ - labelers/: Stage 5 - Classify statements
10
+ - taxonomy/: Stage 6 - Taxonomy classification
11
+ """
12
+
13
+ from .base import (
14
+ PluginCapability,
15
+ BasePlugin,
16
+ BaseSplitterPlugin,
17
+ BaseExtractorPlugin,
18
+ BaseQualifierPlugin,
19
+ BaseCanonicalizerPlugin,
20
+ BaseLabelerPlugin,
21
+ BaseTaxonomyPlugin,
22
+ )
23
+
24
+ # Import plugin modules for auto-registration
25
+ from . import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
26
+
27
+ __all__ = [
28
+ "PluginCapability",
29
+ "BasePlugin",
30
+ "BaseSplitterPlugin",
31
+ "BaseExtractorPlugin",
32
+ "BaseQualifierPlugin",
33
+ "BaseCanonicalizerPlugin",
34
+ "BaseLabelerPlugin",
35
+ "BaseTaxonomyPlugin",
36
+ # Plugin modules
37
+ "splitters",
38
+ "extractors",
39
+ "qualifiers",
40
+ "canonicalizers",
41
+ "labelers",
42
+ "taxonomy",
43
+ ]