corp-extractor 0.4.0-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +446 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +1 -23
  8. statement_extractor/gliner_extraction.py +4 -74
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +4 -1
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  52. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  53. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,230 @@
+ """
+ OrganizationCanonicalizer - Resolves ORG entities to canonical forms.
+
+ Uses a tiered matching approach:
+ 1. LEI exact match (confidence 1.0)
+ 2. Company number + jurisdiction (confidence 0.95)
+ 3. Trigram fuzzy name match (confidence 0.85+)
+ 4. LLM verification for uncertain matches (confidence 0.6-0.85)
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ from ..base import BaseCanonicalizerPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...pipeline.registry import PluginRegistry
+ from ...models import QualifiedEntity, CanonicalMatch, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # Common organization suffixes to normalize
+ ORG_SUFFIXES = [
+     r'\s+Inc\.?$', r'\s+Corp\.?$', r'\s+Corporation$',
+     r'\s+Ltd\.?$', r'\s+Limited$', r'\s+LLC$',
+     r'\s+LLP$', r'\s+PLC$', r'\s+Co\.?$',
+     r'\s+Company$', r'\s+Group$', r'\s+Holdings$',
+     r'\s+&\s+Co\.?$', r',\s+Inc\.?$',
+ ]
+
+
+ def normalize_org_name(name: str) -> str:
+     """Normalize organization name for matching."""
+     normalized = name.strip()
+     for pattern in ORG_SUFFIXES:
+         normalized = re.sub(pattern, '', normalized, flags=re.IGNORECASE)
+     # Strip trailing punctuation (commas, periods) that may be left after suffix removal
+     normalized = re.sub(r'[,.\s]+$', '', normalized)
+     return normalized.strip().lower()
+
+
+ def trigram_similarity(a: str, b: str) -> float:
+     """Calculate trigram similarity between two strings."""
+     if not a or not b:
+         return 0.0
+
+     def get_trigrams(s: str) -> set:
+         s = s.lower()
+         return {s[i:i+3] for i in range(len(s) - 2)} if len(s) >= 3 else {s}
+
+     trigrams_a = get_trigrams(a)
+     trigrams_b = get_trigrams(b)
+
+     if not trigrams_a or not trigrams_b:
+         return 0.0
+
+     intersection = len(trigrams_a & trigrams_b)
+     union = len(trigrams_a | trigrams_b)
+
+     return intersection / union if union > 0 else 0.0
+
+
+ @PluginRegistry.canonicalizer
+ class OrganizationCanonicalizer(BaseCanonicalizerPlugin):
+     """
+     Canonicalizer for ORG entities.
+
+     Uses tiered matching approach with identifier, name, and fuzzy matching.
+     """
+
+     def __init__(
+         self,
+         fuzzy_threshold: float = 0.75,
+         use_llm_verification: bool = False,
+     ):
+         """
+         Initialize the organization canonicalizer.
+
+         Args:
+             fuzzy_threshold: Minimum trigram similarity for fuzzy matches
+             use_llm_verification: Whether to use LLM for uncertain matches
+         """
+         self._fuzzy_threshold = fuzzy_threshold
+         self._use_llm_verification = use_llm_verification
+
+     @property
+     def name(self) -> str:
+         return "organization_canonicalizer"
+
+     @property
+     def priority(self) -> int:
+         return 10
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         caps = PluginCapability.CACHING
+         if self._use_llm_verification:
+             caps |= PluginCapability.LLM_REQUIRED
+         return caps
+
+     @property
+     def description(self) -> str:
+         return "Resolves ORG entities to canonical forms using identifier and name matching"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.ORG}
+
+     def find_canonical(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """
+         Find canonical form for an ORG entity.
+
+         Args:
+             entity: Qualified entity to canonicalize
+             context: Pipeline context
+
+         Returns:
+             CanonicalMatch if found
+         """
+         qualifiers = entity.qualifiers
+         identifiers = qualifiers.identifiers
+
+         # Tier 1: LEI exact match
+         if "lei" in identifiers:
+             return CanonicalMatch(
+                 canonical_id=identifiers["lei"],
+                 canonical_name=entity.original_text,
+                 match_method="identifier",
+                 match_confidence=1.0,
+                 match_details={"identifier_type": "lei"},
+             )
+
+         # Tier 2: Company number + jurisdiction
+         if "ch_number" in identifiers and qualifiers.jurisdiction:
+             return CanonicalMatch(
+                 canonical_id=f"{qualifiers.jurisdiction}:{identifiers['ch_number']}",
+                 canonical_name=entity.original_text,
+                 match_method="identifier",
+                 match_confidence=0.95,
+                 match_details={"identifier_type": "ch_number", "jurisdiction": qualifiers.jurisdiction},
+             )
+
+         if "sec_cik" in identifiers:
+             ticker = identifiers.get("ticker", "")
+             canonical_name = f"{entity.original_text} ({ticker})" if ticker else entity.original_text
+             return CanonicalMatch(
+                 canonical_id=f"SEC:{identifiers['sec_cik']}",
+                 canonical_name=canonical_name,
+                 match_method="identifier",
+                 match_confidence=0.95,
+                 match_details={"identifier_type": "sec_cik", "ticker": ticker},
+             )
+
+         # Tier 3: Fuzzy name match against other ORG entities in context
+         best_match = self._find_fuzzy_match(entity, context)
+         if best_match:
+             return best_match
+
+         # No canonical match found
+         return None
+
+     def _find_fuzzy_match(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """Find fuzzy matches against other ORG entities in context."""
+         normalized_name = normalize_org_name(entity.original_text)
+
+         best_match = None
+         best_similarity = 0.0
+
+         for other_ref, other_entity in context.qualified_entities.items():
+             if other_ref == entity.entity_ref:
+                 continue
+
+             if other_entity.entity_type != EntityType.ORG:
+                 continue
+
+             other_normalized = normalize_org_name(other_entity.original_text)
+             similarity = trigram_similarity(normalized_name, other_normalized)
+
+             if similarity > best_similarity and similarity >= self._fuzzy_threshold:
+                 best_similarity = similarity
+                 best_match = other_entity
+
+         if best_match and best_similarity >= self._fuzzy_threshold:
+             confidence = 0.85 + (best_similarity - self._fuzzy_threshold) * 0.1
+             confidence = min(confidence, 0.95)
+
+             return CanonicalMatch(
+                 canonical_id=None,
+                 canonical_name=best_match.original_text,
+                 match_method="name_fuzzy",
+                 match_confidence=confidence,
+                 match_details={"similarity": best_similarity, "matched_entity": best_match.entity_ref},
+             )
+
+         return None
+
+     def format_fqn(
+         self,
+         entity: QualifiedEntity,
+         match: Optional[CanonicalMatch],
+     ) -> str:
+         """Format FQN for an organization."""
+         base_name = match.canonical_name if match else entity.original_text
+
+         parts = []
+         identifiers = entity.qualifiers.identifiers
+
+         # Add ticker if available
+         if "ticker" in identifiers:
+             parts.append(identifiers["ticker"])
+
+         # Add jurisdiction
+         if entity.qualifiers.jurisdiction:
+             parts.append(entity.qualifiers.jurisdiction)
+
+         if parts:
+             return f"{base_name} ({', '.join(parts)})"
+         return base_name
+
+
+ # Allow importing without decorator for testing
+ OrganizationCanonicalizerClass = OrganizationCanonicalizer
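For reference, a minimal usage sketch (not part of the wheel) of the name-normalization helpers added in statement_extractor/plugins/canonicalizers/organization.py above. It assumes corp-extractor 0.5.0 is installed so the module can be imported; the example strings and printed values are illustrative only.

    from statement_extractor.plugins.canonicalizers.organization import (
        normalize_org_name,
        trigram_similarity,
    )

    # Suffix stripping maps both spellings to the same lowercase key
    a = normalize_org_name("Acme Corp.")          # "acme"
    b = normalize_org_name("Acme Corporation")    # "acme"
    print(trigram_similarity(a, b))               # 1.0 (identical trigram sets)

    # A near-miss still clears the default fuzzy_threshold of 0.75
    print(trigram_similarity("acme widgets", "acme widget"))  # 0.9

Note that find_canonical only falls through to this fuzzy tier when the entity carries no LEI, company-number, or SEC CIK identifier, mirroring the tiered approach described in the module docstring.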
@@ -0,0 +1,242 @@
+ """
+ PersonCanonicalizer - Resolves PERSON entities to canonical forms.
+
+ Uses:
+ 1. Name variants (Tim vs Timothy)
+ 2. Role + org context matching
+ 3. LLM identity verification for uncertain matches
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ from ..base import BaseCanonicalizerPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...pipeline.registry import PluginRegistry
+ from ...models import QualifiedEntity, CanonicalMatch, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # Common name variations
+ NAME_VARIANTS = {
+     "tim": ["timothy"],
+     "timothy": ["tim"],
+     "mike": ["michael"],
+     "michael": ["mike"],
+     "bob": ["robert"],
+     "robert": ["bob", "rob"],
+     "rob": ["robert"],
+     "bill": ["william"],
+     "william": ["bill", "will"],
+     "will": ["william"],
+     "jim": ["james"],
+     "james": ["jim", "jimmy"],
+     "jimmy": ["james"],
+     "tom": ["thomas"],
+     "thomas": ["tom", "tommy"],
+     "joe": ["joseph"],
+     "joseph": ["joe"],
+     "alex": ["alexander", "alexandra"],
+     "alexander": ["alex"],
+     "alexandra": ["alex"],
+     "dan": ["daniel"],
+     "daniel": ["dan", "danny"],
+     "dave": ["david"],
+     "david": ["dave"],
+     "ed": ["edward", "edwin"],
+     "edward": ["ed", "eddie"],
+     "jen": ["jennifer"],
+     "jennifer": ["jen", "jenny"],
+     "kate": ["katherine", "catherine"],
+     "katherine": ["kate", "kathy"],
+     "catherine": ["kate", "cathy"],
+     "chris": ["christopher", "christine"],
+     "christopher": ["chris"],
+     "christine": ["chris"],
+     "matt": ["matthew"],
+     "matthew": ["matt"],
+     "nick": ["nicholas"],
+     "nicholas": ["nick"],
+     "sam": ["samuel", "samantha"],
+     "samuel": ["sam"],
+     "samantha": ["sam"],
+     "steve": ["steven", "stephen"],
+     "steven": ["steve"],
+     "stephen": ["steve"],
+ }
+
+
+ def normalize_person_name(name: str) -> str:
+     """Normalize a person name for matching."""
+     # Remove titles
+     name = re.sub(r'^(Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Prof\.?)\s+', '', name, flags=re.IGNORECASE)
+     # Remove suffixes
+     name = re.sub(r'\s+(Jr\.?|Sr\.?|III?|IV|V)$', '', name, flags=re.IGNORECASE)
+     return name.strip().lower()
+
+
+ def get_name_parts(name: str) -> tuple[str, str]:
+     """Split name into first and last name."""
+     normalized = normalize_person_name(name)
+     parts = normalized.split()
+     if len(parts) >= 2:
+         return parts[0], parts[-1]
+     elif len(parts) == 1:
+         return parts[0], ""
+     return "", ""
+
+
+ def names_match(name1: str, name2: str) -> tuple[bool, float, bool]:
+     """
+     Check if two names match, considering variants.
+
+     Returns (matches, confidence, is_variant).
+     """
+     first1, last1 = get_name_parts(name1)
+     first2, last2 = get_name_parts(name2)
+
+     # Last names must match (if both present)
+     if last1 and last2 and last1 != last2:
+         return False, 0.0, False
+
+     # Check first name match
+     if first1 == first2:
+         return True, 1.0, False
+
+     # Check variants
+     variants1 = NAME_VARIANTS.get(first1, [])
+     variants2 = NAME_VARIANTS.get(first2, [])
+
+     if first2 in variants1 or first1 in variants2:
+         return True, 0.9, True
+
+     return False, 0.0, False
+
+
+ @PluginRegistry.canonicalizer
+ class PersonCanonicalizer(BaseCanonicalizerPlugin):
+     """
+     Canonicalizer for PERSON entities.
+
+     Uses name variants and context matching.
+     """
+
+     def __init__(
+         self,
+         use_context_matching: bool = True,
+     ):
+         """
+         Initialize the person canonicalizer.
+
+         Args:
+             use_context_matching: Whether to use role+org for disambiguation
+         """
+         self._use_context_matching = use_context_matching
+
+     @property
+     def name(self) -> str:
+         return "person_canonicalizer"
+
+     @property
+     def priority(self) -> int:
+         return 10
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         return PluginCapability.CACHING
+
+     @property
+     def description(self) -> str:
+         return "Resolves PERSON entities using name variants and context"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.PERSON}
+
+     def find_canonical(
+         self,
+         entity: QualifiedEntity,
+         context: PipelineContext,
+     ) -> Optional[CanonicalMatch]:
+         """
+         Find canonical form for a PERSON entity.
+
+         Args:
+             entity: Qualified entity to canonicalize
+             context: Pipeline context
+
+         Returns:
+             CanonicalMatch if found
+         """
+         # Look for matching PERSON entities in context
+         best_match = None
+         best_confidence = 0.0
+         best_is_variant = False
+
+         for other_ref, other_entity in context.qualified_entities.items():
+             if other_ref == entity.entity_ref:
+                 continue
+
+             if other_entity.entity_type != EntityType.PERSON:
+                 continue
+
+             # Check name match
+             matches, confidence, is_variant = names_match(entity.original_text, other_entity.original_text)
+             if not matches:
+                 continue
+
+             # Boost confidence if role+org also match
+             if self._use_context_matching and confidence > 0:
+                 my_qualifiers = entity.qualifiers
+                 other_qualifiers = other_entity.qualifiers
+
+                 if my_qualifiers.role and other_qualifiers.role:
+                     if my_qualifiers.role.lower() == other_qualifiers.role.lower():
+                         confidence = min(confidence + 0.05, 1.0)
+
+                 if my_qualifiers.org and other_qualifiers.org:
+                     if my_qualifiers.org.lower() == other_qualifiers.org.lower():
+                         confidence = min(confidence + 0.05, 1.0)
+
+             if confidence > best_confidence:
+                 best_confidence = confidence
+                 best_match = other_entity
+                 best_is_variant = is_variant
+
+         if best_match and best_confidence >= 0.8:
+             return CanonicalMatch(
+                 canonical_id=None,
+                 canonical_name=best_match.original_text,
+                 match_method="name_variant" if best_is_variant else "name_exact",
+                 match_confidence=best_confidence,
+                 match_details={"matched_entity": best_match.entity_ref},
+             )
+
+         return None
+
+     def format_fqn(
+         self,
+         entity: QualifiedEntity,
+         match: Optional[CanonicalMatch],
+     ) -> str:
+         """Format FQN for a person."""
+         base_name = match.canonical_name if match else entity.original_text
+
+         parts = []
+         qualifiers = entity.qualifiers
+
+         if qualifiers.role:
+             parts.append(qualifiers.role)
+
+         if qualifiers.org:
+             parts.append(qualifiers.org)
+
+         if parts:
+             return f"{base_name} ({', '.join(parts)})"
+         return base_name
+
+
+ # Allow importing without decorator for testing
+ PersonCanonicalizerClass = PersonCanonicalizer
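Similarly, a small sketch (not shipped in the package) of how the variant table and names_match helper added in statement_extractor/plugins/canonicalizers/person.py behave, again assuming the installed module is importable; the names and printed tuples are illustrative only.

    from statement_extractor.plugins.canonicalizers.person import names_match

    # Identical first and last names: exact match at full confidence
    print(names_match("Michael Dell", "Michael Dell"))   # (True, 1.0, False)

    # Nickname resolved through NAME_VARIANTS ("tim" <-> "timothy")
    print(names_match("Tim Cook", "Timothy Cook"))       # (True, 0.9, True)

    # Different last names never match, regardless of first name
    print(names_match("Tim Cook", "Tim Berners-Lee"))    # (False, 0.0, False)

In find_canonical, the 0.9 variant score only clears the 0.8 acceptance threshold on its own; the optional role/org context boosts (+0.05 each) then push agreeing mentions toward 1.0.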
@@ -0,0 +1,13 @@
+ """
+ Extractor plugins for Stage 2 (Extraction).
+
+ Refines raw triples into statements with typed entities.
+ """
+
+ from .base import BaseExtractorPlugin
+ from .gliner2 import GLiNER2Extractor
+
+ __all__ = [
+     "BaseExtractorPlugin",
+     "GLiNER2Extractor",
+ ]
@@ -0,0 +1,9 @@
+ """
+ Base class for extractor plugins.
+
+ Re-exports BaseExtractorPlugin from the main plugins module.
+ """
+
+ from ..base import BaseExtractorPlugin
+
+ __all__ = ["BaseExtractorPlugin"]