corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/document/deduplicator.py
@@ -2,6 +2,8 @@
 StatementDeduplicator - Hash-based deduplication for statements.
 
 Removes duplicate statements across chunks using normalized hashing.
+Works with Stage 2+ output (PipelineStatement, LabeledStatement) which
+have subject-predicate-object structure.
 """
 
 import hashlib
@@ -9,12 +11,12 @@ import logging
 from typing import TypeVar, Union
 
 from ..models.labels import LabeledStatement
-from ..models.statement import PipelineStatement, RawTriple
+from ..models.statement import PipelineStatement
 
 logger = logging.getLogger(__name__)
 
 # Type variable for generic deduplication
-T = TypeVar("T", RawTriple, PipelineStatement, LabeledStatement)
+T = TypeVar("T", PipelineStatement, LabeledStatement)
 
 
 class StatementDeduplicator:
@@ -23,6 +25,8 @@ class StatementDeduplicator:
 
     Uses a hash of normalized (subject, predicate, object) to identify
     duplicates. Keeps the first occurrence of each unique statement.
+
+    Works with PipelineStatement (Stage 2) and LabeledStatement (Stage 4).
     """
 
     def __init__(self):
@@ -46,20 +50,14 @@
 
     def _get_triple_parts(
         self,
-        stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> tuple[str, str, str]:
         """
         Extract (subject, predicate, object) from a statement.
 
         Handles different statement types consistently.
         """
-        if isinstance(stmt, RawTriple):
-            return (
-                stmt.subject_text,
-                stmt.predicate_text,
-                stmt.object_text,
-            )
-        elif isinstance(stmt, LabeledStatement):
+        if isinstance(stmt, LabeledStatement):
             return (
                 stmt.statement.subject.text,
                 stmt.statement.predicate,
@@ -75,7 +73,7 @@
 
     def _hash_triple(
         self,
-        stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> str:
         """
         Generate a hash for a statement triple.
@@ -96,7 +94,7 @@
 
     def is_duplicate(
         self,
-        stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> bool:
         """
         Check if a statement is a duplicate.
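With RawTriple dropped from the type set, deduplication now applies only to Stage 2+ objects. A minimal usage sketch, assuming `statements` is a list of PipelineStatement objects from an earlier pipeline run and that is_duplicate() records each hash as it checks (any bulk helper this module offers is not shown in this hunk):

    from statement_extractor.document.deduplicator import StatementDeduplicator

    def drop_duplicates(statements):
        # Keeps the first occurrence of each normalized (subject, predicate, object).
        dedup = StatementDeduplicator()
        return [s for s in statements if not dedup.is_duplicate(s)]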
statement_extractor/extractor.py
@@ -392,7 +392,7 @@ class StatementExtractor:
        This is the new extraction pipeline that:
        1. Generates multiple candidates via DBS
        2. Parses each to statements
-       3. Scores each triple for groundedness
+       3. Scores each triple for quality (semantic + entity)
        4. Merges top beams or selects best beam
        5. Deduplicates using embeddings (if enabled)
        """
statement_extractor/models/__init__.py
@@ -43,7 +43,7 @@ else:
 
 # New pipeline models
 from .entity import ExtractedEntity
-from .statement import RawTriple, PipelineStatement
+from .statement import SplitSentence, RawTriple, PipelineStatement
 from .qualifiers import EntityQualifiers, QualifiedEntity, ResolvedRole, ResolvedOrganization
 from .canonical import CanonicalMatch, CanonicalEntity
 from .labels import StatementLabel, LabeledStatement, TaxonomyResult
@@ -69,7 +69,8 @@ __all__ = [
     "ExtractionOptions",
     # New pipeline models
     "ExtractedEntity",
-    "RawTriple",
+    "SplitSentence",
+    "RawTriple",  # Backwards compatibility alias for SplitSentence
     "PipelineStatement",
     "EntityQualifiers",
     "QualifiedEntity",
statement_extractor/models/statement.py
@@ -1,8 +1,8 @@
 """
 Statement models for the extraction pipeline.
 
-RawTriple: Output of Stage 1 (Splitting)
-PipelineStatement: Output of Stage 2 (Extraction) with refined entities
+SplitSentence: Output of Stage 1 (Splitting) - atomic sentences/statements
+PipelineStatement: Output of Stage 2 (Extraction) with subject-predicate-object triples
 """
 
 from typing import Optional
@@ -12,22 +12,20 @@ from pydantic import BaseModel, Field
 from .entity import ExtractedEntity
 
 
-class RawTriple(BaseModel):
+class SplitSentence(BaseModel):
     """
-    A raw triple from Stage 1 (Splitting).
+    An atomic sentence from Stage 1 (Splitting).
 
-    Contains the basic text components before entity refinement.
-    Generated by T5-Gemma or other splitting plugins.
+    Stage 1 splits text into atomic sentences that can each be converted
+    to subject-predicate-object triples in Stage 2. Generated by T5-Gemma
+    or other splitting plugins.
     """
-    subject_text: str = Field(..., description="Raw subject text")
-    predicate_text: str = Field(..., description="Raw predicate text")
-    object_text: str = Field(..., description="Raw object text")
-    source_sentence: str = Field(..., description="The source sentence this triple was extracted from")
+    text: str = Field(..., description="The atomic sentence text")
     confidence: float = Field(
         default=1.0,
         ge=0.0,
         le=1.0,
-        description="Extraction confidence from the splitter"
+        description="Confidence that this is a valid atomic statement"
     )
     # Document tracking fields
     document_id: Optional[str] = Field(
@@ -36,19 +34,19 @@ class RawTriple(BaseModel):
     )
     page_number: Optional[int] = Field(
         None,
-        description="Page number where this triple was extracted (1-indexed)"
+        description="Page number where this sentence was extracted (1-indexed)"
     )
     chunk_index: Optional[int] = Field(
         None,
-        description="Index of the chunk this triple was extracted from (0-indexed)"
+        description="Index of the chunk this sentence was extracted from (0-indexed)"
     )
 
     def __str__(self) -> str:
-        return f"{self.subject_text} --[{self.predicate_text}]--> {self.object_text}"
+        return self.text
 
-    def as_tuple(self) -> tuple[str, str, str]:
-        """Return as a simple (subject, predicate, object) tuple."""
-        return (self.subject_text, self.predicate_text, self.object_text)
+
+# Backwards compatibility alias
+RawTriple = SplitSentence
 
 
 class PipelineStatement(BaseModel):
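A short construction example based on the fields visible above (text is the only required field; the document-tracking fields stay optional):

    from statement_extractor.models import SplitSentence

    sentence = SplitSentence(
        text="Acme Corp appointed Jane Doe as CFO in 2023.",
        confidence=0.9,         # confidence this is a valid atomic statement
        document_id="doc-001",  # optional document tracking
        page_number=3,
        chunk_index=0,
    )
    print(sentence)  # __str__ now returns just the sentence text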
statement_extractor/models.py
@@ -217,7 +217,7 @@ class ScoringConfig(BaseModel):
     quality_weight: float = Field(
         default=1.0,
         ge=0.0,
-        description="Weight for groundedness/quality scores in beam selection"
+        description="Weight for confidence scores in beam selection"
     )
     coverage_weight: float = Field(
         default=0.5,
statement_extractor/pipeline/context.py
@@ -2,7 +2,7 @@
 PipelineContext - Data container that flows through all pipeline stages.
 
 The context accumulates outputs from each stage:
-- Stage 1 (Splitting): raw_triples
+- Stage 1 (Splitting): split_sentences
 - Stage 2 (Extraction): statements
 - Stage 3 (Qualification): qualified_entities
 - Stage 4 (Canonicalization): canonical_entities
@@ -14,7 +14,7 @@ from typing import Any, Optional
 from pydantic import BaseModel, Field
 
 from ..models import (
-    RawTriple,
+    SplitSentence,
     PipelineStatement,
     QualifiedEntity,
     CanonicalEntity,
@@ -37,10 +37,10 @@ class PipelineContext(BaseModel):
         description="Metadata about the source (e.g., document ID, URL, timestamp)"
     )
 
-    # Stage 1 output: Raw triples from splitting
-    raw_triples: list[RawTriple] = Field(
+    # Stage 1 output: Split sentences
+    split_sentences: list[SplitSentence] = Field(
         default_factory=list,
-        description="Raw triples from Stage 1 (Splitting)"
+        description="Atomic sentences from Stage 1 (Splitting)"
     )
 
     # Stage 2 output: Statements with extracted entities
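Code that inspected `ctx.raw_triples` after a run should now read `ctx.split_sentences`; a minimal sketch assuming a populated context:

    from statement_extractor.pipeline.context import PipelineContext

    def summarize_stage_outputs(ctx: PipelineContext) -> None:
        # Stage 1 output moved from `raw_triples` to `split_sentences`.
        for sentence in ctx.split_sentences:
            print(f"{sentence.confidence:.2f}  {sentence.text}")
        print(f"Stage 2 produced {len(ctx.statements)} statements")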
statement_extractor/pipeline/orchestrator.py
@@ -2,8 +2,8 @@
 ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
 
 Coordinates the flow of data through all pipeline stages:
-1. Splitting: Text → RawTriple
-2. Extraction: RawTriple → PipelineStatement
+1. Splitting: Text → SplitSentence (atomic sentences)
+2. Extraction: SplitSentence → PipelineStatement (subject-predicate-object triples)
 3. Qualification: Entity → CanonicalEntity
 4. Labeling: Statement → LabeledStatement
 5. Taxonomy: Statement → TaxonomyResult
@@ -31,8 +31,8 @@ class ExtractionPipeline:
     Main pipeline orchestrator.
 
     Coordinates the flow of data through all 5 stages:
-    1. Splitting: Text → RawTriple (using splitter plugins)
-    2. Extraction: RawTriple → PipelineStatement (using extractor plugins)
+    1. Splitting: Text → SplitSentence (using splitter plugins)
+    2. Extraction: SplitSentence → PipelineStatement (using extractor plugins)
     3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
     4. Labeling: Statement → LabeledStatement (using labeler plugins)
     5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
@@ -115,7 +115,7 @@ class ExtractionPipeline:
         return ctx
 
     def _run_splitting(self, ctx: PipelineContext) -> PipelineContext:
-        """Stage 1: Split text into raw triples."""
+        """Stage 1: Split text into atomic sentences."""
        stage_name = get_stage_name(1)
        logger.debug(f"Running {stage_name} stage")
        start_time = time.time()
@@ -132,9 +132,9 @@
 
            logger.debug(f"Using splitter: {splitter.name}")
            try:
-                split_result = splitter.split(ctx.source_text, ctx)
+                split_sentences = splitter.split(ctx.source_text, ctx)
-                ctx.raw_triples = raw_triples
-                logger.info(f"Splitting produced {len(raw_triples)} raw triples")
+                ctx.split_sentences = split_sentences
+                logger.info(f"Splitting produced {len(split_sentences)} sentences")
                break
            except Exception as e:
                logger.exception(f"Splitter {splitter.name} failed")
@@ -146,13 +146,13 @@
        return ctx
 
    def _run_extraction(self, ctx: PipelineContext) -> PipelineContext:
-        """Stage 2: Extract statements with typed entities from raw triples."""
+        """Stage 2: Extract subject-predicate-object triples from split sentences."""
        stage_name = get_stage_name(2)
        logger.debug(f"Running {stage_name} stage")
        start_time = time.time()
 
-        if not ctx.raw_triples:
-            logger.debug("No raw triples to extract from")
+        if not ctx.split_sentences:
+            logger.debug("No split sentences to extract from")
            return ctx
 
        extractors = PluginRegistry.get_extractors()
@@ -177,7 +177,7 @@
 
            logger.debug(f"Using extractor: {extractor.name}")
            try:
-                statements = extractor.extract(ctx.raw_triples, ctx)
+                statements = extractor.extract(ctx.split_sentences, ctx)
                ctx.statements = statements
                logger.info(f"Extraction produced {len(statements)} statements")
                break
statement_extractor/plugins/base.py
@@ -2,8 +2,8 @@
 Base plugin classes for the extraction pipeline.
 
 Defines the abstract interfaces for each pipeline stage:
-- BaseSplitterPlugin: Stage 1 - Text → RawTriple
-- BaseExtractorPlugin: Stage 2 - RawTriple → PipelineStatement
+- BaseSplitterPlugin: Stage 1 - Text → SplitSentence (atomic sentences)
+- BaseExtractorPlugin: Stage 2 - SplitSentence → PipelineStatement (triples)
 - BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
 - BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
 - BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult
@@ -22,7 +22,7 @@ from pydantic import BaseModel, Field
 if TYPE_CHECKING:
     from ..pipeline.context import PipelineContext
     from ..models import (
-        RawTriple,
+        SplitSentence,
         PipelineStatement,
         ExtractedEntity,
         CanonicalEntity,
@@ -173,10 +173,10 @@ class BasePlugin(ABC):
 
 
 class BaseSplitterPlugin(BasePlugin):
     """
-    Stage 1 plugin: Split text into atomic triples.
+    Stage 1 plugin: Split text into atomic sentences.
 
-    Takes raw text and produces RawTriple objects containing
-    subject/predicate/object text and source sentence.
+    Takes raw text and produces SplitSentence objects containing
+    atomic statements that can be converted to triples in Stage 2.
     """
 
     @abstractmethod
@@ -184,16 +184,16 @@
         self,
         text: str,
         context: "PipelineContext",
-    ) -> list["RawTriple"]:
+    ) -> list["SplitSentence"]:
         """
-        Split text into atomic triples.
+        Split text into atomic sentences.
 
         Args:
             text: Input text to split
             context: Pipeline context for accessing metadata and config
 
         Returns:
-            List of RawTriple objects
+            List of SplitSentence objects
         """
         ...
 
@@ -201,9 +201,9 @@
         self,
         texts: list[str],
         context: "PipelineContext",
-    ) -> list[list["RawTriple"]]:
+    ) -> list[list["SplitSentence"]]:
         """
-        Split multiple texts into atomic triples in a single batch.
+        Split multiple texts into atomic sentences in a single batch.
 
         Default implementation calls split() for each text sequentially.
         Plugins with BATCH_PROCESSING capability should override this
@@ -214,16 +214,16 @@
             context: Pipeline context for accessing metadata and config
 
         Returns:
-            List of RawTriple lists, one per input text
+            List of SplitSentence lists, one per input text
         """
         return [self.split(text, context) for text in texts]
 
 
 class BaseExtractorPlugin(BasePlugin):
     """
-    Stage 2 plugin: Refine triples into statements with typed entities.
+    Stage 2 plugin: Extract subject-predicate-object triples from sentences.
 
-    Takes RawTriple objects and produces PipelineStatement objects
+    Takes SplitSentence objects and produces PipelineStatement objects
     with ExtractedEntity subjects/objects that have types, spans,
     and confidence scores.
     """
@@ -231,14 +231,14 @@ class BaseExtractorPlugin(BasePlugin):
     @abstractmethod
     def extract(
         self,
-        raw_triples: list["RawTriple"],
+        split_sentences: list["SplitSentence"],
         context: "PipelineContext",
     ) -> list["PipelineStatement"]:
         """
-        Extract statements from raw triples.
+        Extract triples from split sentences.
 
         Args:
-            raw_triples: Raw triples from Stage 1
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
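A minimal custom Stage 1 plugin against the new interface might look like the sketch below. It assumes only what the hunks above show (split() returning SplitSentence objects) and omits plugin naming, capabilities, and PluginRegistry registration, which are not part of this diff:

    import re

    from statement_extractor.models import SplitSentence
    from statement_extractor.plugins.base import BaseSplitterPlugin

    class NaiveSentenceSplitter(BaseSplitterPlugin):
        """Split on sentence-ending punctuation; a toy example only."""

        def split(self, text, context):
            parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", text) if p.strip()]
            return [SplitSentence(text=p, confidence=0.5) for p in parts]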
statement_extractor/plugins/extractors/gliner2.py
@@ -1,9 +1,9 @@
 """
-GLiNER2Extractor - Stage 2 plugin that refines triples using GLiNER2.
+GLiNER2Extractor - Stage 2 plugin that extracts triples from sentences.
 
 Uses GLiNER2 for:
-1. Entity extraction: Refine subject/object boundaries
-2. Relation extraction: When predicate list is provided
+1. Entity extraction: Identify subject/object entities with types
+2. Relation extraction: Extract predicates using predicate list
 3. Entity scoring: Score how entity-like subjects/objects are
 4. Classification: Run labeler classification schemas in single pass
 """
@@ -16,7 +16,7 @@ from typing import Optional
 from ..base import BaseExtractorPlugin, ClassificationSchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import RawTriple, PipelineStatement, ExtractedEntity, EntityType
+from ...models import SplitSentence, PipelineStatement, ExtractedEntity, EntityType
 
 logger = logging.getLogger(__name__)
 
@@ -110,11 +110,11 @@ GLINER_TYPE_MAP = {
 @PluginRegistry.extractor
 class GLiNER2Extractor(BaseExtractorPlugin):
     """
-    Extractor plugin that uses GLiNER2 for entity and relation refinement.
+    Extractor plugin that uses GLiNER2 for entity and relation extraction.
 
-    Processes raw triples from Stage 1 and produces PipelineStatement
-    objects with typed entities. Also runs classification schemas from
-    labeler plugins in a single pass.
+    Processes split sentences from Stage 1 and produces PipelineStatement
+    objects with subject-predicate-object triples and typed entities.
+    Also runs classification schemas from labeler plugins in a single pass.
     """
 
     def __init__(
@@ -209,36 +209,36 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def extract(
         self,
-        raw_triples: list[RawTriple],
+        split_sentences: list[SplitSentence],
         context: PipelineContext,
     ) -> list[PipelineStatement]:
         """
-        Extract statements from raw triples using GLiNER2.
+        Extract subject-predicate-object triples from split sentences using GLiNER2.
 
         Returns ALL matching relations from GLiNER2 (not just the best one).
         Also runs any classification schemas and stores results in context.
 
         Args:
-            raw_triples: Raw triples from Stage 1
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
-            List of PipelineStatement objects (may contain multiple per raw triple)
+            List of PipelineStatement objects (may contain multiple per sentence)
         """
         predicate_categories = self._get_predicate_categories()
-        logger.info(f"GLiNER2Extractor processing {len(raw_triples)} triples")
+        logger.info(f"GLiNER2Extractor processing {len(split_sentences)} sentences")
         logger.info(f"Using {len(predicate_categories)} predicate categories")
 
         statements = []
         model = self._get_model()
         classified_texts: set[str] = set()
 
-        for raw in raw_triples:
+        for sentence in split_sentences:
             try:
                 if model:
                     # Use relation extraction iterating through categories
                     # Returns ALL matches, not just the best one
-                    extracted_stmts = self._extract_with_relations(raw, model, predicate_categories)
+                    extracted_stmts = self._extract_with_relations(sentence, model, predicate_categories)
                 else:
                     # No model available - skip
                     logger.warning("No GLiNER2 model available - skipping extraction")
@@ -253,10 +253,10 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                         classified_texts.add(stmt.source_text)
 
             except Exception as e:
-                logger.warning(f"Error extracting triple: {e}")
-                # No fallback - skip this triple
+                logger.warning(f"Error extracting from sentence: {e}")
+                # No fallback - skip this sentence
 
-        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(raw_triples)} raw triples")
+        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(split_sentences)} sentences")
         return statements
 
     def _run_classifications(
@@ -316,7 +316,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_relations(
         self,
-        raw: RawTriple,
+        sentence: SplitSentence,
         model,
         predicate_categories: dict[str, dict[str, PredicateConfig]],
     ) -> list[PipelineStatement]:
@@ -328,14 +328,14 @@
         Returns ALL matching relations, not just the best one.
 
         Args:
-            raw: Raw triple from Stage 1
+            sentence: Split sentence from Stage 1
             model: GLiNER2 model instance
             predicate_categories: Dict of category -> predicates to use
 
         Returns:
             List of PipelineStatements for all relations found
         """
-        logger.debug(f"Attempting relation extraction for: '{raw.source_sentence[:80]}...'")
+        logger.debug(f"Attempting relation extraction for: '{sentence.text[:80]}...'")
 
         # Iterate through each category separately to stay under GLiNER2's ~25 label limit
         # Use schema API with entities + relations together for better extraction
@@ -355,7 +355,7 @@
                 .entities(self._get_entity_types())
                 .relations(relations_dict)
             )
-            result = model.extract(raw.source_sentence, schema, include_confidence=True)
+            result = model.extract(sentence.text, schema, include_confidence=True)
 
             # Get relations from this category
             relation_data = result.get("relations", result.get("relation_extraction", {}))
@@ -379,7 +379,7 @@
         logger.debug(f" GLiNER2 found {total_found} total relations across all categories")
 
         if not all_relations:
-            logger.debug(f"No GLiNER2 relation match in: '{raw.source_sentence[:60]}...'")
+            logger.debug(f"No GLiNER2 relation match in: '{sentence.text[:60]}...'")
             return []
 
         # Filter by confidence threshold and sort descending
@@ -402,8 +402,8 @@
             )
 
             # Get entity types
-            subj_type = self._infer_entity_type(head, model, raw.source_sentence)
-            obj_type = self._infer_entity_type(tail, model, raw.source_sentence)
+            subj_type = self._infer_entity_type(head, model, sentence.text)
+            obj_type = self._infer_entity_type(tail, model, sentence.text)
             logger.debug(f" Entity types: {subj_type.value}, {obj_type.value}")
 
             stmt = PipelineStatement(
@@ -419,7 +419,7 @@
                     type=obj_type,
                     confidence=confidence,
                 ),
-                source_text=raw.source_sentence,
+                source_text=sentence.text,
                 confidence_score=confidence,
                 extraction_method="gliner_relation",
             )
@@ -429,7 +429,7 @@
 
     def _extract_with_entities(
         self,
-        raw: RawTriple,
+        sentence: SplitSentence,
         model,
     ) -> Optional[PipelineStatement]:
         """
@@ -438,7 +438,7 @@
         This method is called when predicates are disabled. Without GLiNER2 relation
         extraction, we cannot form valid statements.
         """
-        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{raw.source_sentence[:60]}...'")
+        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{sentence.text[:60]}...'")
         return None
 
     def _parse_relation(self, rel) -> tuple[str, str, float]:
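The practical effect for plugin authors: Stage 1 objects no longer carry subject/predicate/object text, so code that read `raw.subject_text` or `raw.source_sentence` should take the sentence from SplitSentence and the triple from the Stage 2 PipelineStatement. A sketch using only the attributes this diff shows (the object-side field is left out because its exact name does not appear here):

    from statement_extractor.models import PipelineStatement, SplitSentence

    def describe(sentence: SplitSentence, statement: PipelineStatement) -> str:
        # 0.9.0: raw_triple.as_tuple() gave (subject, predicate, object) at Stage 1.
        # 0.9.4: the sentence text lives on SplitSentence, the triple on Stage 2 output.
        return f"{statement.subject.text} --[{statement.predicate}]--> ? ({sentence.text})"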
statement_extractor/plugins/qualifiers/embedding_company.py
@@ -60,7 +60,7 @@ class EmbeddingCompanyQualifier(BaseQualifierPlugin):
         self,
         db_path: Optional[str] = None,
         top_k: int = 20,
-        min_similarity: float = 0.5,
+        min_similarity: float = 0.3,
         use_llm_confirmation: bool = True,
         auto_download_db: bool = True,
     ):
@@ -215,11 +215,13 @@ class EmbeddingCompanyQualifier(BaseQualifierPlugin):
             self._cache[cache_key] = None
             return None
 
-        # Log all candidates
-        logger.info(f" Found {len(results)} candidates for '{entity.text}':")
-        for i, (record, sim) in enumerate(results[:10], 1):
+        # Log all candidates (scores are prominence-adjusted)
+        logger.info(f" Found {len(results)} candidates for '{entity.text}' (prominence-adjusted):")
+        for i, (record, score) in enumerate(results[:10], 1):
             region_str = f" [{record.region}]" if record.region else ""
-            logger.info(f" {i}. {record.name}{region_str} (sim={sim:.3f}, source={record.source})")
+            ticker = record.record.get("ticker", "")
+            ticker_str = f" ticker={ticker}" if ticker else ""
+            logger.info(f" {i}. {record.name}{region_str} (score={score:.3f}, source={record.source}{ticker_str})")
 
         # Get best match (optionally with LLM confirmation)
         logger.info(f" Selecting best match (LLM={self._use_llm_confirmation})...")
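Note that the default minimum similarity drops from 0.5 to 0.3, so more (and looser) candidates reach the prominence-adjusted ranking and optional LLM confirmation. To keep the stricter 0.9.0 behaviour, pass the old value explicitly; a sketch based on the constructor arguments shown above:

    from statement_extractor.plugins.qualifiers.embedding_company import EmbeddingCompanyQualifier

    qualifier = EmbeddingCompanyQualifier(
        min_similarity=0.5,       # 0.9.4 default is 0.3
        use_llm_confirmation=True,
    )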