corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/document/deduplicator.py
CHANGED

@@ -2,6 +2,8 @@
 StatementDeduplicator - Hash-based deduplication for statements.
 
 Removes duplicate statements across chunks using normalized hashing.
+Works with Stage 2+ output (PipelineStatement, LabeledStatement) which
+have subject-predicate-object structure.
 """
 
 import hashlib
@@ -9,12 +11,12 @@ import logging
 from typing import TypeVar, Union
 
 from ..models.labels import LabeledStatement
-from ..models.statement import PipelineStatement
+from ..models.statement import PipelineStatement
 
 logger = logging.getLogger(__name__)
 
 # Type variable for generic deduplication
-T = TypeVar("T",
+T = TypeVar("T", PipelineStatement, LabeledStatement)
 
 
 class StatementDeduplicator:
@@ -23,6 +25,8 @@ class StatementDeduplicator:
 
     Uses a hash of normalized (subject, predicate, object) to identify
     duplicates. Keeps the first occurrence of each unique statement.
+
+    Works with PipelineStatement (Stage 2) and LabeledStatement (Stage 4).
     """
 
     def __init__(self):
@@ -46,20 +50,14 @@ class StatementDeduplicator:
 
     def _get_triple_parts(
         self,
-        stmt: Union[
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> tuple[str, str, str]:
         """
        Extract (subject, predicate, object) from a statement.
 
        Handles different statement types consistently.
        """
-        if isinstance(stmt,
-            return (
-                stmt.subject_text,
-                stmt.predicate_text,
-                stmt.object_text,
-            )
-        elif isinstance(stmt, LabeledStatement):
+        if isinstance(stmt, LabeledStatement):
             return (
                 stmt.statement.subject.text,
                 stmt.statement.predicate,
@@ -75,7 +73,7 @@ class StatementDeduplicator:
 
     def _hash_triple(
         self,
-        stmt: Union[
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> str:
         """
         Generate a hash for a statement triple.
@@ -96,7 +94,7 @@ class StatementDeduplicator:
 
     def is_duplicate(
         self,
-        stmt: Union[
+        stmt: Union[PipelineStatement, LabeledStatement],
     ) -> bool:
         """
         Check if a statement is a duplicate.
statement_extractor/extractor.py
CHANGED
@@ -392,7 +392,7 @@ class StatementExtractor:
         This is the new extraction pipeline that:
         1. Generates multiple candidates via DBS
         2. Parses each to statements
-        3. Scores each triple for
+        3. Scores each triple for quality (semantic + entity)
         4. Merges top beams or selects best beam
         5. Deduplicates using embeddings (if enabled)
         """
statement_extractor/models/__init__.py
CHANGED

@@ -43,7 +43,7 @@ else:
 
     # New pipeline models
     from .entity import ExtractedEntity
-    from .statement import RawTriple, PipelineStatement
+    from .statement import SplitSentence, RawTriple, PipelineStatement
     from .qualifiers import EntityQualifiers, QualifiedEntity, ResolvedRole, ResolvedOrganization
     from .canonical import CanonicalMatch, CanonicalEntity
     from .labels import StatementLabel, LabeledStatement, TaxonomyResult
@@ -69,7 +69,8 @@ __all__ = [
     "ExtractionOptions",
     # New pipeline models
     "ExtractedEntity",
-    "
+    "SplitSentence",
+    "RawTriple",  # Backwards compatibility alias for SplitSentence
     "PipelineStatement",
     "EntityQualifiers",
     "QualifiedEntity",
statement_extractor/models/statement.py
CHANGED

@@ -1,8 +1,8 @@
 """
 Statement models for the extraction pipeline.
 
-
-PipelineStatement: Output of Stage 2 (Extraction) with
+SplitSentence: Output of Stage 1 (Splitting) - atomic sentences/statements
+PipelineStatement: Output of Stage 2 (Extraction) with subject-predicate-object triples
 """
 
 from typing import Optional
@@ -12,22 +12,20 @@ from pydantic import BaseModel, Field
 from .entity import ExtractedEntity
 
 
-class
+class SplitSentence(BaseModel):
     """
-
+    An atomic sentence from Stage 1 (Splitting).
 
-
-    Generated by T5-Gemma
+    Stage 1 splits text into atomic sentences that can each be converted
+    to subject-predicate-object triples in Stage 2. Generated by T5-Gemma
+    or other splitting plugins.
     """
-
-    predicate_text: str = Field(..., description="Raw predicate text")
-    object_text: str = Field(..., description="Raw object text")
-    source_sentence: str = Field(..., description="The source sentence this triple was extracted from")
+    text: str = Field(..., description="The atomic sentence text")
     confidence: float = Field(
         default=1.0,
         ge=0.0,
         le=1.0,
-        description="
+        description="Confidence that this is a valid atomic statement"
     )
     # Document tracking fields
     document_id: Optional[str] = Field(
@@ -36,19 +34,19 @@ class RawTriple(BaseModel):
     )
     page_number: Optional[int] = Field(
         None,
-        description="Page number where this
+        description="Page number where this sentence was extracted (1-indexed)"
     )
     chunk_index: Optional[int] = Field(
         None,
-        description="Index of the chunk this
+        description="Index of the chunk this sentence was extracted from (0-indexed)"
    )
 
     def __str__(self) -> str:
-        return
+        return self.text
 
-
-
-
+
+# Backwards compatibility alias
+RawTriple = SplitSentence
 
 
 class PipelineStatement(BaseModel):
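Going by the fields visible in this diff, constructing the new model looks roughly like the following (a hedged sketch; the example values are invented, and the import path is inferred from the package layout):

```python
from statement_extractor.models import RawTriple, SplitSentence

sentence = SplitSentence(
    text="Acme Corp acquired Widget Inc in 2023.",
    confidence=0.95,
    page_number=3,
    chunk_index=0,
)

print(str(sentence))               # prints the sentence text itself
assert RawTriple is SplitSentence  # the old name still resolves via the alias
```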
statement_extractor/models.py
CHANGED
@@ -217,7 +217,7 @@ class ScoringConfig(BaseModel):
     quality_weight: float = Field(
         default=1.0,
         ge=0.0,
-        description="Weight for
+        description="Weight for confidence scores in beam selection"
     )
     coverage_weight: float = Field(
         default=0.5,
statement_extractor/pipeline/context.py
CHANGED

@@ -2,7 +2,7 @@
 PipelineContext - Data container that flows through all pipeline stages.
 
 The context accumulates outputs from each stage:
-- Stage 1 (Splitting):
+- Stage 1 (Splitting): split_sentences
 - Stage 2 (Extraction): statements
 - Stage 3 (Qualification): qualified_entities
 - Stage 4 (Canonicalization): canonical_entities
@@ -14,7 +14,7 @@ from typing import Any, Optional
 from pydantic import BaseModel, Field
 
 from ..models import (
-
+    SplitSentence,
     PipelineStatement,
     QualifiedEntity,
     CanonicalEntity,
@@ -37,10 +37,10 @@ class PipelineContext(BaseModel):
         description="Metadata about the source (e.g., document ID, URL, timestamp)"
     )
 
-    # Stage 1 output:
-
+    # Stage 1 output: Split sentences
+    split_sentences: list[SplitSentence] = Field(
         default_factory=list,
-        description="
+        description="Atomic sentences from Stage 1 (Splitting)"
     )
 
     # Stage 2 output: Statements with extracted entities
statement_extractor/pipeline/orchestrator.py
CHANGED

@@ -2,8 +2,8 @@
 ExtractionPipeline - Main orchestrator for the 5-stage extraction pipeline.
 
 Coordinates the flow of data through all pipeline stages:
-1. Splitting: Text →
-2. Extraction:
+1. Splitting: Text → SplitSentence (atomic sentences)
+2. Extraction: SplitSentence → PipelineStatement (subject-predicate-object triples)
 3. Qualification: Entity → CanonicalEntity
 4. Labeling: Statement → LabeledStatement
 5. Taxonomy: Statement → TaxonomyResult
@@ -31,8 +31,8 @@ class ExtractionPipeline:
     Main pipeline orchestrator.
 
     Coordinates the flow of data through all 5 stages:
-    1. Splitting: Text →
-    2. Extraction:
+    1. Splitting: Text → SplitSentence (using splitter plugins)
+    2. Extraction: SplitSentence → PipelineStatement (using extractor plugins)
     3. Qualification: Entity → CanonicalEntity (using qualifier + canonicalizer plugins)
     4. Labeling: Statement → LabeledStatement (using labeler plugins)
     5. Taxonomy: Statement → TaxonomyResult (using taxonomy plugins)
@@ -115,7 +115,7 @@ class ExtractionPipeline:
         return ctx
 
     def _run_splitting(self, ctx: PipelineContext) -> PipelineContext:
-        """Stage 1: Split text into
+        """Stage 1: Split text into atomic sentences."""
         stage_name = get_stage_name(1)
         logger.debug(f"Running {stage_name} stage")
         start_time = time.time()
@@ -132,9 +132,9 @@ class ExtractionPipeline:
 
             logger.debug(f"Using splitter: {splitter.name}")
             try:
-
-                ctx.
-                logger.info(f"Splitting produced {len(
+                split_sentences = splitter.split(ctx.source_text, ctx)
+                ctx.split_sentences = split_sentences
+                logger.info(f"Splitting produced {len(split_sentences)} sentences")
                 break
             except Exception as e:
                 logger.exception(f"Splitter {splitter.name} failed")
@@ -146,13 +146,13 @@ class ExtractionPipeline:
         return ctx
 
     def _run_extraction(self, ctx: PipelineContext) -> PipelineContext:
-        """Stage 2: Extract
+        """Stage 2: Extract subject-predicate-object triples from split sentences."""
         stage_name = get_stage_name(2)
         logger.debug(f"Running {stage_name} stage")
         start_time = time.time()
 
-        if not ctx.
-            logger.debug("No
+        if not ctx.split_sentences:
+            logger.debug("No split sentences to extract from")
             return ctx
 
         extractors = PluginRegistry.get_extractors()
@@ -177,7 +177,7 @@ class ExtractionPipeline:
 
             logger.debug(f"Using extractor: {extractor.name}")
             try:
-                statements = extractor.extract(ctx.
+                statements = extractor.extract(ctx.split_sentences, ctx)
                 ctx.statements = statements
                 logger.info(f"Extraction produced {len(statements)} statements")
                 break
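Stripped of logging, timing, and plugin-failure handling, the Stage 1 → Stage 2 hand-off shown above boils down to the following (a simplified sketch, not the orchestrator's literal code):

```python
def run_stages_1_and_2(ctx, splitter, extractor):
    """Sketch: splitter output feeds the extractor via the pipeline context."""
    # Stage 1: split raw text into atomic sentences
    ctx.split_sentences = splitter.split(ctx.source_text, ctx)

    # Stage 2: turn split sentences into subject-predicate-object statements
    if ctx.split_sentences:
        ctx.statements = extractor.extract(ctx.split_sentences, ctx)
    return ctx
```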
statement_extractor/plugins/base.py
CHANGED

@@ -2,8 +2,8 @@
 Base plugin classes for the extraction pipeline.
 
 Defines the abstract interfaces for each pipeline stage:
-- BaseSplitterPlugin: Stage 1 - Text →
-- BaseExtractorPlugin: Stage 2 -
+- BaseSplitterPlugin: Stage 1 - Text → SplitSentence (atomic sentences)
+- BaseExtractorPlugin: Stage 2 - SplitSentence → PipelineStatement (triples)
 - BaseQualifierPlugin: Stage 3 - Entity → CanonicalEntity
 - BaseLabelerPlugin: Stage 4 - Statement → StatementLabel
 - BaseTaxonomyPlugin: Stage 5 - Statement → TaxonomyResult
@@ -22,7 +22,7 @@ from pydantic import BaseModel, Field
 if TYPE_CHECKING:
     from ..pipeline.context import PipelineContext
     from ..models import (
-
+        SplitSentence,
         PipelineStatement,
         ExtractedEntity,
         CanonicalEntity,
@@ -173,10 +173,10 @@ class BasePlugin(ABC):
 
 class BaseSplitterPlugin(BasePlugin):
     """
-    Stage 1 plugin: Split text into atomic
+    Stage 1 plugin: Split text into atomic sentences.
 
-    Takes raw text and produces
-
+    Takes raw text and produces SplitSentence objects containing
+    atomic statements that can be converted to triples in Stage 2.
     """
 
     @abstractmethod
@@ -184,16 +184,16 @@ class BaseSplitterPlugin(BasePlugin):
         self,
         text: str,
         context: "PipelineContext",
-    ) -> list["
+    ) -> list["SplitSentence"]:
         """
-        Split text into atomic
+        Split text into atomic sentences.
 
         Args:
             text: Input text to split
             context: Pipeline context for accessing metadata and config
 
         Returns:
-            List of
+            List of SplitSentence objects
         """
         ...
 
@@ -201,9 +201,9 @@ class BaseSplitterPlugin(BasePlugin):
         self,
         texts: list[str],
         context: "PipelineContext",
-    ) -> list[list["
+    ) -> list[list["SplitSentence"]]:
         """
-        Split multiple texts into atomic
+        Split multiple texts into atomic sentences in a single batch.
 
         Default implementation calls split() for each text sequentially.
         Plugins with BATCH_PROCESSING capability should override this
@@ -214,16 +214,16 @@ class BaseSplitterPlugin(BasePlugin):
             context: Pipeline context for accessing metadata and config
 
         Returns:
-            List of
+            List of SplitSentence lists, one per input text
         """
         return [self.split(text, context) for text in texts]
 
 
 class BaseExtractorPlugin(BasePlugin):
     """
-    Stage 2 plugin:
+    Stage 2 plugin: Extract subject-predicate-object triples from sentences.
 
-    Takes
+    Takes SplitSentence objects and produces PipelineStatement objects
     with ExtractedEntity subjects/objects that have types, spans,
     and confidence scores.
     """
@@ -231,14 +231,14 @@ class BaseExtractorPlugin(BasePlugin):
     @abstractmethod
     def extract(
         self,
-
+        split_sentences: list["SplitSentence"],
         context: "PipelineContext",
     ) -> list["PipelineStatement"]:
         """
-        Extract
+        Extract triples from split sentences.
 
         Args:
-
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
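As an illustration of the Stage 1 interface above, a toy splitter plugin could look like this. The class name and regex are mine, and it assumes split() is the only method you must provide; real plugins would also register with PluginRegistry and satisfy any other BasePlugin requirements:

```python
import re

from statement_extractor.models import SplitSentence
from statement_extractor.plugins.base import BaseSplitterPlugin


class NaiveSentenceSplitter(BaseSplitterPlugin):
    """Toy Stage 1 plugin: split on sentence-ending punctuation."""

    def split(self, text, context):
        # Break on whitespace that follows ., !, or ? and drop empty pieces
        pieces = [p.strip() for p in re.split(r"(?<=[.!?])\s+", text) if p.strip()]
        return [SplitSentence(text=piece) for piece in pieces]
```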
statement_extractor/plugins/extractors/gliner2.py
CHANGED

@@ -1,9 +1,9 @@
 """
-GLiNER2Extractor - Stage 2 plugin that
+GLiNER2Extractor - Stage 2 plugin that extracts triples from sentences.
 
 Uses GLiNER2 for:
-1. Entity extraction:
-2. Relation extraction:
+1. Entity extraction: Identify subject/object entities with types
+2. Relation extraction: Extract predicates using predicate list
 3. Entity scoring: Score how entity-like subjects/objects are
 4. Classification: Run labeler classification schemas in single pass
 """
@@ -16,7 +16,7 @@ from typing import Optional
 from ..base import BaseExtractorPlugin, ClassificationSchema, PluginCapability
 from ...pipeline.context import PipelineContext
 from ...pipeline.registry import PluginRegistry
-from ...models import
+from ...models import SplitSentence, PipelineStatement, ExtractedEntity, EntityType
 
 logger = logging.getLogger(__name__)
 
@@ -110,11 +110,11 @@ GLINER_TYPE_MAP = {
 @PluginRegistry.extractor
 class GLiNER2Extractor(BaseExtractorPlugin):
     """
-    Extractor plugin that uses GLiNER2 for entity and relation
+    Extractor plugin that uses GLiNER2 for entity and relation extraction.
 
-    Processes
-    objects with typed entities.
-    labeler plugins in a single pass.
+    Processes split sentences from Stage 1 and produces PipelineStatement
+    objects with subject-predicate-object triples and typed entities.
+    Also runs classification schemas from labeler plugins in a single pass.
     """
 
     def __init__(
@@ -209,36 +209,36 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def extract(
         self,
-
+        split_sentences: list[SplitSentence],
         context: PipelineContext,
     ) -> list[PipelineStatement]:
         """
-        Extract
+        Extract subject-predicate-object triples from split sentences using GLiNER2.
 
         Returns ALL matching relations from GLiNER2 (not just the best one).
         Also runs any classification schemas and stores results in context.
 
         Args:
-
+            split_sentences: Atomic sentences from Stage 1
             context: Pipeline context
 
         Returns:
-            List of PipelineStatement objects (may contain multiple per
+            List of PipelineStatement objects (may contain multiple per sentence)
         """
         predicate_categories = self._get_predicate_categories()
-        logger.info(f"GLiNER2Extractor processing {len(
+        logger.info(f"GLiNER2Extractor processing {len(split_sentences)} sentences")
         logger.info(f"Using {len(predicate_categories)} predicate categories")
 
         statements = []
         model = self._get_model()
         classified_texts: set[str] = set()
 
-        for
+        for sentence in split_sentences:
             try:
                 if model:
                     # Use relation extraction iterating through categories
                     # Returns ALL matches, not just the best one
-                    extracted_stmts = self._extract_with_relations(
+                    extracted_stmts = self._extract_with_relations(sentence, model, predicate_categories)
                 else:
                     # No model available - skip
                     logger.warning("No GLiNER2 model available - skipping extraction")
@@ -253,10 +253,10 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     classified_texts.add(stmt.source_text)
 
             except Exception as e:
-                logger.warning(f"Error extracting
-                # No fallback - skip this
+                logger.warning(f"Error extracting from sentence: {e}")
+                # No fallback - skip this sentence
 
-        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(
+        logger.info(f"GLiNER2Extractor produced {len(statements)} statements from {len(split_sentences)} sentences")
         return statements
 
     def _run_classifications(
@@ -316,7 +316,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_relations(
         self,
-
+        sentence: SplitSentence,
         model,
         predicate_categories: dict[str, dict[str, PredicateConfig]],
     ) -> list[PipelineStatement]:
@@ -328,14 +328,14 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         Returns ALL matching relations, not just the best one.
 
         Args:
-
+            sentence: Split sentence from Stage 1
             model: GLiNER2 model instance
             predicate_categories: Dict of category -> predicates to use
 
         Returns:
             List of PipelineStatements for all relations found
         """
-        logger.debug(f"Attempting relation extraction for: '{
+        logger.debug(f"Attempting relation extraction for: '{sentence.text[:80]}...'")
 
         # Iterate through each category separately to stay under GLiNER2's ~25 label limit
         # Use schema API with entities + relations together for better extraction
@@ -355,7 +355,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                 .entities(self._get_entity_types())
                 .relations(relations_dict)
             )
-            result = model.extract(
+            result = model.extract(sentence.text, schema, include_confidence=True)
 
             # Get relations from this category
             relation_data = result.get("relations", result.get("relation_extraction", {}))
@@ -379,7 +379,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         logger.debug(f"  GLiNER2 found {total_found} total relations across all categories")
 
         if not all_relations:
-            logger.debug(f"No GLiNER2 relation match in: '{
+            logger.debug(f"No GLiNER2 relation match in: '{sentence.text[:60]}...'")
             return []
 
         # Filter by confidence threshold and sort descending
@@ -402,8 +402,8 @@ class GLiNER2Extractor(BaseExtractorPlugin):
             )
 
             # Get entity types
-            subj_type = self._infer_entity_type(head, model,
-            obj_type = self._infer_entity_type(tail, model,
+            subj_type = self._infer_entity_type(head, model, sentence.text)
+            obj_type = self._infer_entity_type(tail, model, sentence.text)
             logger.debug(f"  Entity types: {subj_type.value}, {obj_type.value}")
 
             stmt = PipelineStatement(
@@ -419,7 +419,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
                     type=obj_type,
                     confidence=confidence,
                 ),
-                source_text=
+                source_text=sentence.text,
                 confidence_score=confidence,
                 extraction_method="gliner_relation",
             )
@@ -429,7 +429,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
 
     def _extract_with_entities(
         self,
-
+        sentence: SplitSentence,
         model,
     ) -> Optional[PipelineStatement]:
         """
@@ -438,7 +438,7 @@ class GLiNER2Extractor(BaseExtractorPlugin):
         This method is called when predicates are disabled. Without GLiNER2 relation
         extraction, we cannot form valid statements.
         """
-        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{
+        logger.debug(f"Entity extraction mode (no predicates) - skipping: '{sentence.text[:60]}...'")
         return None
 
     def _parse_relation(self, rel) -> tuple[str, str, float]:
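The control flow of the rewritten extract() loop reduces to roughly this shape; extract_one stands in for the GLiNER2 relation-extraction call, so this is a sketch of the loop rather than the plugin itself:

```python
def extract_per_sentence(split_sentences, extract_one):
    """Run extraction sentence by sentence, skipping sentences that fail."""
    statements = []
    for sentence in split_sentences:
        try:
            # extract_one returns zero or more statements for a single sentence
            statements.extend(extract_one(sentence))
        except Exception:
            # No fallback: a sentence that fails extraction is simply skipped
            continue
    return statements
```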
statement_extractor/plugins/qualifiers/embedding_company.py
CHANGED

@@ -60,7 +60,7 @@ class EmbeddingCompanyQualifier(BaseQualifierPlugin):
         self,
         db_path: Optional[str] = None,
         top_k: int = 20,
-        min_similarity: float = 0.
+        min_similarity: float = 0.3,
         use_llm_confirmation: bool = True,
         auto_download_db: bool = True,
     ):
@@ -215,11 +215,13 @@ class EmbeddingCompanyQualifier(BaseQualifierPlugin):
             self._cache[cache_key] = None
             return None
 
-        # Log all candidates
-        logger.info(f"  Found {len(results)} candidates for '{entity.text}':")
-        for i, (record,
+        # Log all candidates (scores are prominence-adjusted)
+        logger.info(f"  Found {len(results)} candidates for '{entity.text}' (prominence-adjusted):")
+        for i, (record, score) in enumerate(results[:10], 1):
             region_str = f" [{record.region}]" if record.region else ""
-
+            ticker = record.record.get("ticker", "")
+            ticker_str = f" ticker={ticker}" if ticker else ""
+            logger.info(f"    {i}. {record.name}{region_str} (score={score:.3f}, source={record.source}{ticker_str})")
 
         # Get best match (optionally with LLM confirmation)
         logger.info(f"  Selecting best match (LLM={self._use_llm_confirmation})...")