corp-extractor 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +446 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +1 -23
  8. statement_extractor/gliner_extraction.py +4 -74
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +4 -1
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  52. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  53. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,186 @@
1
+ """
2
+ GLEIFQualifierPlugin - Qualifies ORG entities with LEI and related data.
3
+
4
+ Uses the GLEIF (Global Legal Entity Identifier Foundation) API to:
5
+ - Look up LEI by organization name
6
+ - Retrieve legal name, jurisdiction, parent company info
7
+ """
8
+
9
+ import logging
10
+ from typing import Optional
11
+ from urllib.parse import quote
12
+
13
+ from ..base import BaseQualifierPlugin, PluginCapability
14
+ from ...pipeline.context import PipelineContext
15
+ from ...pipeline.registry import PluginRegistry
16
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # GLEIF API base URL
21
+ GLEIF_API_BASE = "https://api.gleif.org/api/v1"
22
+
23
+
24
@PluginRegistry.qualifier
class GLEIFQualifierPlugin(BaseQualifierPlugin):
    """
    Qualifier plugin for ORG entities using the GLEIF API.

    Searches the GLEIF ``lei-records`` endpoint by organization name and maps
    the first matching record to ``EntityQualifiers`` (jurisdiction, country,
    city and the ``lei`` identifier).
    """

    def __init__(
        self,
        timeout: int = 10,
        cache_results: bool = True,
    ):
        """
        Initialize the GLEIF qualifier.

        Args:
            timeout: API request timeout in seconds
            cache_results: Whether to cache API results
        """
        self._timeout = timeout
        self._cache_results = cache_results
        # Maps normalised organization name -> parsed record, or None when the
        # API answered but had no match.
        self._cache: dict[str, Optional[dict]] = {}
        # Set by _search_gleif(); True when the last lookup ended in an error
        # rather than a definitive answer.
        self._last_search_failed = False

    @property
    def name(self) -> str:
        return "gleif_qualifier"

    @property
    def priority(self) -> int:
        return 10  # High priority for ORG entities

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Looks up LEI and corporate data from GLEIF API"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.ORG}

    @property
    def supported_identifier_types(self) -> list[str]:
        # NOTE(review): declared as an accepted identifier, but qualify()
        # currently searches by name only and never reads an existing LEI.
        return ["lei"]

    @property
    def provided_identifier_types(self) -> list[str]:
        return ["lei"]  # Provides LEI

    def qualify(
        self,
        entity: ExtractedEntity,
        context: PipelineContext,
    ) -> Optional[EntityQualifiers]:
        """
        Qualify an ORG entity with GLEIF data.

        Args:
            entity: The ORG entity to qualify
            context: Pipeline context (not used by this plugin)

        Returns:
            EntityQualifiers with LEI and jurisdiction, or None if the entity
            is not an ORG, has a blank name, or no record was found
        """
        if entity.type != EntityType.ORG:
            return None

        cache_key = entity.text.lower().strip()
        if not cache_key:
            # Nothing to search for; avoid sending an empty filter to the API.
            return None

        # Check cache first
        if self._cache_results and cache_key in self._cache:
            cached = self._cache[cache_key]
            if cached is None:
                return None
            return self._data_to_qualifiers(cached)

        # Search GLEIF API
        result = self._search_gleif(entity.text)

        # Cache only definitive answers. A failed request (timeout, HTTP
        # error, bad payload) must not be remembered as "not found", otherwise
        # one transient outage hides the organization for the plugin's lifetime.
        if self._cache_results and not self._last_search_failed:
            self._cache[cache_key] = result

        if result:
            return self._data_to_qualifiers(result)

        return None

    def _search_gleif(self, org_name: str) -> Optional[dict]:
        """
        Search the GLEIF API for an organization.

        Tries a legal-name filter first and falls back to a full-text filter
        when that yields no records. The first record returned wins.

        Args:
            org_name: Organization name to search for

        Returns:
            Simplified record dict (see ``_parse_lei_record``), or None when
            nothing matched or the lookup failed. ``self._last_search_failed``
            tells the two cases apart.
        """
        self._last_search_failed = False

        try:
            import requests
        except ImportError:
            # Permanent condition: retrying will not help, so it is not
            # flagged as a transient failure.
            logger.warning("requests library not available for GLEIF API")
            return None

        url = f"{GLEIF_API_BASE}/lei-records"
        try:
            records = []
            for filter_key in ("filter[entity.legalName]", "filter[fulltext]"):
                params = {
                    filter_key: org_name,
                    "page[size]": 5,
                }
                response = requests.get(url, params=params, timeout=self._timeout)
                response.raise_for_status()
                records = response.json().get("data") or []
                if records:
                    break

            if records:
                # Return first match
                return self._parse_lei_record(records[0])

        except Exception as e:
            self._last_search_failed = True
            logger.debug(f"GLEIF API error: {e}")

        return None

    def _parse_lei_record(self, record: dict) -> dict:
        """
        Parse a GLEIF LEI record into a simplified dict.

        Missing or ``null`` sub-objects are treated as empty, so absent
        fields come back as empty strings instead of raising.
        """
        attrs = record.get("attributes") or {}
        entity = attrs.get("entity") or {}
        legal_address = entity.get("legalAddress") or {}
        registration = attrs.get("registration") or {}

        return {
            "lei": record.get("id", ""),
            "legal_name": (entity.get("legalName") or {}).get("name", ""),
            "jurisdiction": entity.get("jurisdiction", ""),
            "country": legal_address.get("country", ""),
            "city": legal_address.get("city", ""),
            "status": registration.get("status", ""),
        }

    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
        """
        Convert GLEIF data to EntityQualifiers.

        Empty strings produced by ``_parse_lei_record`` for missing fields are
        passed on as None so "unknown" is not reported as an empty value.
        """
        identifiers = {}
        if data.get("lei"):
            identifiers["lei"] = data["lei"]

        return EntityQualifiers(
            jurisdiction=data.get("jurisdiction") or None,
            country=data.get("country") or None,
            city=data.get("city") or None,
            identifiers=identifiers,
        )


# Alias bound to the same object as GLEIFQualifierPlugin (i.e. whatever the
# registry decorator returned); kept as an alternative import name for tests.
GLEIFQualifierPluginClass = GLEIFQualifierPlugin
@@ -0,0 +1,221 @@
1
+ """
2
+ PersonQualifierPlugin - Qualifies PERSON entities with role and organization.
3
+
4
+ Uses Gemma3 12B (instruction-tuned) to extract:
5
+ - role: Job title/position (e.g., "CEO", "President")
6
+ - org: Organization/employer (e.g., "Apple Inc", "Microsoft")
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import re
12
+ from typing import Optional
13
+
14
+ from ..base import BaseQualifierPlugin, PluginCapability
15
+ from ...pipeline.context import PipelineContext
16
+ from ...pipeline.registry import PluginRegistry
17
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
18
+ from ...llm import LLM
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@PluginRegistry.qualifier
class PersonQualifierPlugin(BaseQualifierPlugin):
    """
    Qualifier plugin for PERSON entities.

    Uses an LLM (Gemma3 12B by default) to extract role and organization from
    the source text, and falls back to regex pattern matching when the LLM is
    disabled or yields nothing usable.
    """

    # Role patterns for the regex fallback, tried in order; the first pattern
    # that matches anywhere in the text wins. Compound titles must precede
    # the single-word titles they contain ("Vice President" before
    # "President"), otherwise the shorter title is reported.
    ROLE_PATTERNS = [
        r"\b(CEO|CFO|CTO|COO|CMO|CIO|CISO|CSO)\b",
        r"\b(Chief\s+\w+\s+Officer)\b",
        r"\b(Vice\s+President|VP)\b",
        r"\b(President|Chairman|Director|Manager|Executive|Founder|Co-Founder)\b",
        r"\b(Head\s+of\s+\w+)\b",
        r"\b(Senior\s+\w+|Lead\s+\w+|Principal\s+\w+)\b",
    ]

    def __init__(
        self,
        model_id: str = "google/gemma-3-12b-it-qat-q4_0-gguf",
        gguf_file: Optional[str] = None,
        use_llm: bool = True,
        use_4bit: bool = True,
    ):
        """
        Initialize the person qualifier.

        Args:
            model_id: HuggingFace model ID for LLM qualification
            gguf_file: GGUF filename for quantized models (auto-detected if model_id ends with -gguf)
            use_llm: Whether to use LLM
            use_4bit: Use 4-bit quantization (requires bitsandbytes, ignored for GGUF)
        """
        self._use_llm = use_llm
        self._llm: Optional[LLM] = None
        if use_llm:
            self._llm = LLM(
                model_id=model_id,
                gguf_file=gguf_file,
                use_4bit=use_4bit,
            )

    @property
    def name(self) -> str:
        return "person_qualifier"

    @property
    def priority(self) -> int:
        return 10  # High priority for PERSON entities

    @property
    def capabilities(self) -> PluginCapability:
        caps = PluginCapability.NONE
        if self._use_llm:
            caps |= PluginCapability.LLM_REQUIRED
        return caps

    @property
    def description(self) -> str:
        return "Extracts role and organization for PERSON entities using Gemma3"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.PERSON}

    @property
    def provided_identifier_types(self) -> list[str]:
        return []  # Provides qualifiers, not identifiers

    def qualify(
        self,
        entity: ExtractedEntity,
        context: PipelineContext,
    ) -> Optional[EntityQualifiers]:
        """
        Qualify a PERSON entity with role and organization.

        Args:
            entity: The PERSON entity to qualify
            context: Pipeline context for accessing source text

        Returns:
            EntityQualifiers with role and org, or None if nothing found
        """
        if entity.type != EntityType.PERSON:
            return None

        # Use the full source text for qualification: this gives both the LLM
        # and the regex fallback the widest possible context.
        full_text = context.source_text

        # Try LLM extraction first with full text
        if self._llm is not None:
            result = self._extract_with_llm(entity.text, full_text)
            if result and (result.role or result.org):
                return result

        # Fallback to pattern matching with full text
        return self._extract_with_patterns(entity.text, full_text)

    @staticmethod
    def _clean_llm_field(value: object) -> Optional[str]:
        """Return a stripped, non-empty string from an LLM JSON value, else None."""
        if not isinstance(value, str):
            # JSON null, numbers, lists, nested objects: not a usable name.
            return None
        text = value.strip()
        # Models sometimes quote the null they were asked to return.
        if not text or text.lower() in ("null", "none"):
            return None
        return text

    def _extract_with_llm(
        self,
        person_name: str,
        context_text: str,
    ) -> Optional[EntityQualifiers]:
        """
        Extract role and org using the LLM.

        Args:
            person_name: Name of the person to qualify
            context_text: Text in which the person is mentioned

        Returns:
            EntityQualifiers when the model produced a role and/or org, or
            None when no LLM is configured or the response held no usable
            JSON object (the caller then falls back to patterns)

        Raises:
            Exception: any error other than malformed JSON (for example a
                failure inside the model call) is logged and re-raised
        """
        if self._llm is None:
            return None

        try:
            prompt = f"""Extract qualifiers for a person from the given context.
Instructions:
- "role" = job title or position (e.g., "CEO", "President", "Director")
- "org" = company or organization name (e.g., "Amazon", "Apple Inc", "Microsoft")
- These are DIFFERENT things: role is a job title, org is a company name
- Return null for fields not mentioned in the context

Return ONLY valid JSON:

E.g.
<context>We interviewed Big Ducks Quacking Inc team. James is new in the role of the CEO</context>
<person>James</person>

Should return:

{{"role": "CEO", "org": "Big Ducks Quacking Inc"}}

---

<context>{context_text}</context>
<person>{person_name}</person>
"""

            logger.debug(f"LLM request: {prompt}")
            response = self._llm.generate(prompt, max_tokens=100, stop=["\n\n", "</s>"])
            logger.debug(f"LLM response: {response}")

            # Extract the first flat {...} object from the response
            json_match = re.search(r'\{[^}]+\}', response)
            if json_match:
                try:
                    data = json.loads(json_match.group())
                except json.JSONDecodeError as e:
                    # Malformed model output is an expected outcome, not a
                    # failure: report "nothing found" so the regex fallback
                    # still runs.
                    logger.debug(f"LLM returned invalid JSON: {e}")
                    return None

                role = self._clean_llm_field(data.get("role"))
                org = self._clean_llm_field(data.get("org"))

                # Validate: role and org should be different (reject if same)
                if role and org and role.lower() == org.lower():
                    logger.debug(f"Rejected duplicate role/org: {role}")
                    org = None  # Clear org if it's same as role

                if role or org:
                    return EntityQualifiers(role=role, org=org)

        except Exception as e:
            logger.exception(f"LLM extraction failed: {e}")
            raise

        return None

    def _extract_with_patterns(
        self,
        person_name: str,
        context_text: str,
    ) -> Optional[EntityQualifiers]:
        """
        Extract role and org using pattern matching.

        The role is the first ``ROLE_PATTERNS`` hit anywhere in the text
        (case-insensitive); it is not tied to the position of the person's
        name. The org is taken from "<name> ... of/at <Org>" or
        "<Org>'s <name>" constructions.

        Returns:
            EntityQualifiers when a role and/or org was found, else None
        """
        role = None
        org = None

        # Look for role patterns
        for pattern in self.ROLE_PATTERNS:
            match = re.search(pattern, context_text, re.IGNORECASE)
            if match:
                role = match.group(1)
                break

        # Look for "of [Organization]" or "at [Organization]" patterns
        org_patterns = [
            rf'{re.escape(person_name)}[^.]*?\bof\s+([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)',
            rf'{re.escape(person_name)}[^.]*?\bat\s+([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)',
            rf'([A-Z][A-Za-z\s&]+(?:Inc|Corp|Ltd|LLC|Company|Co)?\.?)\s*(?:\'s|s)?\s*{re.escape(person_name)}',
        ]

        for pattern in org_patterns:
            match = re.search(pattern, context_text)
            if match:
                org = match.group(1).strip()
                # Clean up trailing punctuation
                org = org.rstrip('.,;')
                break

        if role or org:
            return EntityQualifiers(role=role, org=org)

        return None


# Alias bound to the same object as PersonQualifierPlugin (i.e. whatever the
# registry decorator returned); kept as an alternative import name for tests.
PersonQualifierPluginClass = PersonQualifierPlugin
@@ -0,0 +1,198 @@
1
+ """
2
+ SECEdgarQualifierPlugin - Qualifies US ORG entities with SEC data.
3
+
4
+ Uses the SEC EDGAR API to:
5
+ - Look up CIK (Central Index Key) by company name
6
+ - Retrieve ticker symbol, exchange, filing history
7
+ """
8
+
9
+ import logging
10
+ from typing import Optional
11
+
12
+ from ..base import BaseQualifierPlugin, PluginCapability
13
+ from ...pipeline.context import PipelineContext
14
+ from ...pipeline.registry import PluginRegistry
15
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # SEC EDGAR API endpoints
20
+ SEC_COMPANY_SEARCH = "https://efts.sec.gov/LATEST/search-index"
21
+ SEC_COMPANY_TICKERS = "https://www.sec.gov/files/company_tickers.json"
22
+
23
+
24
@PluginRegistry.qualifier
class SECEdgarQualifierPlugin(BaseQualifierPlugin):
    """
    Qualifier plugin for US ORG entities using SEC EDGAR.

    Provides CIK and ticker symbol for publicly traded US companies by
    matching the entity name against the SEC company-tickers table.
    """

    # Legal-form suffixes ignored when comparing company names. Each starts
    # with a space so only a separate trailing word is removed.
    _COMPANY_SUFFIXES = (
        " inc", " inc.", " corp", " corp.", " co", " co.", " ltd", " llc",
    )

    def __init__(
        self,
        timeout: int = 10,
        cache_results: bool = True,
        user_agent: Optional[str] = None,
    ):
        """
        Initialize the SEC EDGAR qualifier.

        Args:
            timeout: API request timeout in seconds
            cache_results: Whether to cache API results
            user_agent: Optional User-Agent header for requests to sec.gov.
                SEC's fair-access policy asks automated clients to identify
                themselves; when None no header is added (previous behaviour).
        """
        self._timeout = timeout
        self._cache_results = cache_results
        self._user_agent = user_agent
        # Maps normalised entity name -> matched company data, or None.
        self._cache: dict[str, Optional[dict]] = {}
        # Lower-cased company title -> {"cik", "ticker", "title"}; None until
        # the first load attempt.
        self._ticker_cache: Optional[dict] = None

    @property
    def name(self) -> str:
        return "sec_edgar_qualifier"

    @property
    def priority(self) -> int:
        return 30  # Run after GLEIF and Companies House

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.EXTERNAL_API | PluginCapability.CACHING

    @property
    def description(self) -> str:
        return "Looks up SEC CIK and ticker for US public companies"

    @property
    def supported_entity_types(self) -> set[EntityType]:
        return {EntityType.ORG}

    @property
    def supported_identifier_types(self) -> list[str]:
        # NOTE(review): declared as accepted identifiers, but qualify()
        # currently matches by company name only.
        return ["sec_cik", "ticker"]

    @property
    def provided_identifier_types(self) -> list[str]:
        return ["sec_cik", "ticker"]  # Provides CIK and ticker

    def qualify(
        self,
        entity: ExtractedEntity,
        context: PipelineContext,
    ) -> Optional[EntityQualifiers]:
        """
        Qualify an ORG entity with SEC EDGAR data.

        Args:
            entity: The ORG entity to qualify
            context: Pipeline context (not used by this plugin)

        Returns:
            EntityQualifiers with CIK and ticker, or None if not found
        """
        if entity.type != EntityType.ORG:
            return None

        # Check cache first
        cache_key = entity.text.lower().strip()
        if self._cache_results and cache_key in self._cache:
            cached = self._cache[cache_key]
            if cached is None:
                return None
            return self._data_to_qualifiers(cached)

        # Search SEC
        result = self._search_sec(entity.text)

        # Cache result
        if self._cache_results:
            self._cache[cache_key] = result

        if result:
            return self._data_to_qualifiers(result)

        return None

    def _load_ticker_cache(self) -> dict:
        """
        Load the SEC company tickers JSON (cached).

        The table is downloaded at most once per instance. If the download or
        parsing fails, an empty table is stored, so later calls do not retry.

        Returns:
            Dict keyed by lower-cased company title
        """
        if self._ticker_cache is not None:
            return self._ticker_cache

        try:
            import requests

            headers = {"User-Agent": self._user_agent} if self._user_agent else None
            response = requests.get(
                SEC_COMPANY_TICKERS,
                headers=headers,
                timeout=self._timeout,
            )
            response.raise_for_status()
            data = response.json()

            # Build lookup by company name (lowercase)
            table = {}
            for company in data.values():
                title = company.get("title", "")
                key = title.lower()
                if key:
                    table[key] = {
                        "cik": str(company.get("cik_str", "")),
                        "ticker": company.get("ticker", ""),
                        "title": title,
                    }
            self._ticker_cache = table

            logger.debug(f"Loaded {len(self._ticker_cache)} SEC company tickers")
            return self._ticker_cache

        except Exception as e:
            # This disables SEC lookups for the rest of the instance's life,
            # so make it visible instead of hiding it at debug level.
            logger.warning(f"Failed to load SEC ticker cache: {e}")
            self._ticker_cache = {}
            return self._ticker_cache

    def _strip_company_suffixes(self, name: str) -> str:
        """Remove trailing legal-form suffixes (" inc", " corp.", ...) from a lower-cased name."""
        cleaned = name.strip()
        # Only trailing suffixes are removed; text inside the name such as
        # " co" in "acme computing" is left alone.
        while cleaned.endswith(self._COMPANY_SUFFIXES):
            for suffix in self._COMPANY_SUFFIXES:
                if cleaned.endswith(suffix):
                    cleaned = cleaned[: -len(suffix)].rstrip(" ,")
                    break
        return cleaned

    def _search_sec(self, org_name: str) -> Optional[dict]:
        """
        Search SEC for company information.

        Matching is attempted in three steps: exact lower-cased title,
        substring containment in either direction, then containment after
        trailing legal-form suffixes are removed from both names.

        Args:
            org_name: Organization name to look up

        Returns:
            Dict with "cik", "ticker" and "title", or None if nothing matched
        """
        try:
            org_lower = org_name.lower().strip()
            if not org_lower:
                # An empty string is a substring of every title and would
                # "match" the first company in the table.
                return None

            # Load ticker cache
            ticker_cache = self._load_ticker_cache()

            # Try exact match first
            if org_lower in ticker_cache:
                return ticker_cache[org_lower]

            # Try partial match
            for name, data in ticker_cache.items():
                if org_lower in name or name in org_lower:
                    return data

            # Try matching without common suffixes
            clean_name = self._strip_company_suffixes(org_lower)
            if not clean_name:
                return None

            for name, data in ticker_cache.items():
                clean_cached = self._strip_company_suffixes(name)
                if not clean_cached:
                    continue
                if clean_name in clean_cached or clean_cached in clean_name:
                    return data

        except Exception as e:
            logger.debug(f"SEC search error: {e}")

        return None

    def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
        """
        Convert SEC data to EntityQualifiers.

        Jurisdiction and country are always "US"; ``sec_cik`` and ``ticker``
        identifiers are included when present in ``data``.
        """
        identifiers = {}
        if data.get("cik"):
            identifiers["sec_cik"] = data["cik"]
        if data.get("ticker"):
            identifiers["ticker"] = data["ticker"]

        return EntityQualifiers(
            jurisdiction="US",
            country="US",
            identifiers=identifiers,
        )


# Alias bound to the same object as SECEdgarQualifierPlugin (i.e. whatever the
# registry decorator returned); kept as an alternative import name for tests.
SECEdgarQualifierPluginClass = SECEdgarQualifierPlugin
@@ -0,0 +1,13 @@
1
+ """
2
+ Splitter plugins for Stage 1 (Splitting).
3
+
4
+ Splits text into atomic triples.
5
+ """
6
+
7
+ from .base import BaseSplitterPlugin
8
+ from .t5_gemma import T5GemmaSplitter
9
+
10
# Names exported by the splitters package: the base class and the T5-Gemma
# splitter imported above.
__all__ = ["BaseSplitterPlugin", "T5GemmaSplitter"]
@@ -0,0 +1,9 @@
1
+ """
2
+ Base class for splitter plugins.
3
+
4
+ Re-exports BaseSplitterPlugin from the main plugins module.
5
+ """
6
+
7
+ from ..base import BaseSplitterPlugin
8
+
9
# Re-exported name only; BaseSplitterPlugin itself is imported from the
# parent plugins module (..base).
__all__ = ["BaseSplitterPlugin"]