corp-extractor 0.4.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +10 -1
- statement_extractor/cli.py +1663 -17
- statement_extractor/data/default_predicates.json +368 -0
- statement_extractor/data/statement_taxonomy.json +6972 -0
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -23
- statement_extractor/gliner_extraction.py +4 -74
- statement_extractor/llm.py +255 -0
- statement_extractor/models/__init__.py +89 -0
- statement_extractor/models/canonical.py +182 -0
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/entity.py +102 -0
- statement_extractor/models/labels.py +220 -0
- statement_extractor/models/qualifiers.py +139 -0
- statement_extractor/models/statement.py +101 -0
- statement_extractor/models.py +4 -1
- statement_extractor/pipeline/__init__.py +39 -0
- statement_extractor/pipeline/config.py +129 -0
- statement_extractor/pipeline/context.py +177 -0
- statement_extractor/pipeline/orchestrator.py +416 -0
- statement_extractor/pipeline/registry.py +303 -0
- statement_extractor/plugins/__init__.py +55 -0
- statement_extractor/plugins/base.py +716 -0
- statement_extractor/plugins/extractors/__init__.py +13 -0
- statement_extractor/plugins/extractors/base.py +9 -0
- statement_extractor/plugins/extractors/gliner2.py +546 -0
- statement_extractor/plugins/labelers/__init__.py +29 -0
- statement_extractor/plugins/labelers/base.py +9 -0
- statement_extractor/plugins/labelers/confidence.py +138 -0
- statement_extractor/plugins/labelers/relation_type.py +87 -0
- statement_extractor/plugins/labelers/sentiment.py +159 -0
- statement_extractor/plugins/labelers/taxonomy.py +386 -0
- statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +30 -0
- statement_extractor/plugins/qualifiers/base.py +9 -0
- statement_extractor/plugins/qualifiers/companies_house.py +185 -0
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +197 -0
- statement_extractor/plugins/qualifiers/person.py +785 -0
- statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/__init__.py +13 -0
- statement_extractor/plugins/splitters/base.py +9 -0
- statement_extractor/plugins/splitters/t5_gemma.py +293 -0
- statement_extractor/plugins/taxonomy/__init__.py +13 -0
- statement_extractor/plugins/taxonomy/embedding.py +484 -0
- statement_extractor/plugins/taxonomy/mnli.py +291 -0
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.4.0.dist-info/RECORD +0 -12
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ConfidenceLabeler - Aggregates confidence scores from all pipeline stages.
|
|
3
|
+
|
|
4
|
+
Combines:
|
|
5
|
+
- Statement extraction confidence
|
|
6
|
+
- Entity extraction confidence
|
|
7
|
+
- Canonical match confidence
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from ..base import BaseLabelerPlugin, PluginCapability
|
|
14
|
+
from ...pipeline.context import PipelineContext
|
|
15
|
+
from ...pipeline.registry import PluginRegistry
|
|
16
|
+
from ...models import (
|
|
17
|
+
PipelineStatement,
|
|
18
|
+
CanonicalEntity,
|
|
19
|
+
StatementLabel,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Score substituted when an entity has no canonical match, and returned as the
# aggregate when the collected weights do not sum to a positive number.
_FALLBACK_CONFIDENCE = 0.5


def _canonical_confidence(entity: CanonicalEntity):
    """Return the entity's canonical match confidence, or the fallback score.

    The fallback is used whenever ``entity.canonical_match`` is falsy.
    """
    match = entity.canonical_match
    return match.match_confidence if match else _FALLBACK_CONFIDENCE


@PluginRegistry.labeler
class ConfidenceLabeler(BaseLabelerPlugin):
    """
    Labeler that folds per-stage confidence scores into one number.

    For every statement it emits a single "confidence" label whose value is
    the weighted mean of the statement, subject, object and canonical-match
    confidences.
    """

    def __init__(
        self,
        statement_weight: float = 0.4,
        subject_weight: float = 0.2,
        object_weight: float = 0.2,
        canonical_weight: float = 0.2,
    ):
        """
        Store the weights used when averaging the individual scores.

        Args:
            statement_weight: Weight applied to the statement's own confidence
            subject_weight: Weight applied to the subject entity confidence
            object_weight: Weight applied to the object entity confidence
            canonical_weight: Weight applied to the averaged canonical match
                confidence of subject and object
        """
        self._statement_weight = statement_weight
        self._subject_weight = subject_weight
        self._object_weight = object_weight
        self._canonical_weight = canonical_weight

    @property
    def name(self) -> str:
        return "confidence_labeler"

    @property
    def priority(self) -> int:
        # Intended to run after the other labelers (original author's note).
        return 100

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        return "Aggregates confidence scores from all pipeline stages"

    @property
    def label_type(self) -> str:
        return "confidence"

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Compute the weighted-average confidence for one statement.

        Args:
            statement: The statement being labeled
            subject_canonical: Canonicalized subject entity
            object_canonical: Canonicalized object entity
            context: Pipeline context (not read by this labeler)

        Returns:
            StatementLabel whose value is the weighted mean rounded to three
            decimals; the label's own confidence is fixed at 1.0
        """
        # (score, weight) pairs, collected in a fixed order.
        components = []

        # The statement score is optional and is left out entirely when absent,
        # so its weight does not dilute the remaining components.
        statement_score = statement.confidence_score
        if statement_score is not None:
            components.append((statement_score, self._statement_weight))

        components.append((statement.subject.confidence, self._subject_weight))
        components.append((statement.object.confidence, self._object_weight))

        # Subject and object canonical matches share a single weight: their
        # confidences are averaged first.
        subject_match_score = _canonical_confidence(subject_canonical)
        object_match_score = _canonical_confidence(object_canonical)
        components.append(
            ((subject_match_score + object_match_score) / 2, self._canonical_weight)
        )

        weight_sum = sum(weight for _, weight in components)
        if weight_sum > 0:
            weighted_total = sum(score * weight for score, weight in components)
            aggregate = weighted_total / weight_sum
        else:
            # No usable weights: report the fallback score instead of dividing.
            aggregate = _FALLBACK_CONFIDENCE

        return StatementLabel(
            label_type=self.label_type,
            label_value=round(aggregate, 3),
            confidence=1.0,  # the calculation itself is treated as certain
            labeler=self.name,
        )


# Second module-level name for the class, kept for tests. It refers to the
# same, already-decorated class object.
ConfidenceLabelerClass = ConfidenceLabeler
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RelationTypeLabeler - Uses predicate category from GLiNER2 extraction.
|
|
3
|
+
|
|
4
|
+
The relation type comes from the predicate category assigned during
|
|
5
|
+
Stage 2 extraction (GLiNER2). If no category is available, logs an error.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from ..base import BaseLabelerPlugin, PluginCapability
|
|
12
|
+
from ...pipeline.context import PipelineContext
|
|
13
|
+
from ...pipeline.registry import PluginRegistry
|
|
14
|
+
from ...models import (
|
|
15
|
+
PipelineStatement,
|
|
16
|
+
CanonicalEntity,
|
|
17
|
+
StatementLabel,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@PluginRegistry.labeler
class RelationTypeLabeler(BaseLabelerPlugin):
    """
    Labeler that reports a statement's predicate category as its relation type.

    Per the module notes, the category is assigned upstream during Stage 2
    extraction (GLiNER2, predicates from default_predicates.json); this class
    only reads ``statement.predicate_category``.
    """

    @property
    def name(self) -> str:
        return "relation_type_labeler"

    @property
    def priority(self) -> int:
        return 20

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        return "Uses predicate category from GLiNER2 as relation type"

    @property
    def label_type(self) -> str:
        return "relation_type"

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Build the relation-type label from the statement's predicate category.

        Args:
            statement: The statement being labeled
            subject_canonical: Canonicalized subject (not read here)
            object_canonical: Canonicalized object (not read here)
            context: Pipeline context (not read here)

        Returns:
            StatementLabel carrying the category, or None (after logging an
            error) when the statement has no category
        """
        category = statement.predicate_category
        if category:
            return StatementLabel(
                label_type=self.label_type,
                label_value=category,
                # The statement's own confidence is forwarded unchanged.
                # NOTE(review): if confidence_score can be None it is passed
                # through as-is - confirm StatementLabel accepts that.
                confidence=statement.confidence_score,
                labeler=self.name,
            )

        # Missing or empty category: report it and emit no label.
        logger.error(
            f"No predicate_category for statement: "
            f"'{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}'"
        )
        return None


# Second module-level name for the class, kept for tests. It refers to the
# same, already-decorated class object.
RelationTypeLabelerClass = RelationTypeLabeler
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SentimentLabeler - Classifies statement sentiment.
|
|
3
|
+
|
|
4
|
+
Uses GLiNER2 classification when available, falls back to pattern matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import re
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from ..base import BaseLabelerPlugin, ClassificationSchema, PluginCapability
|
|
12
|
+
from ...pipeline.context import PipelineContext
|
|
13
|
+
from ...pipeline.registry import PluginRegistry
|
|
14
|
+
from ...models import (
|
|
15
|
+
PipelineStatement,
|
|
16
|
+
CanonicalEntity,
|
|
17
|
+
StatementLabel,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Positive predicates and patterns
POSITIVE_PATTERNS = [
    r'\b(acquired|announced|launched|released|expanded|grew|increased|improved|won|awarded)\b',
    r'\b(partnered|collaborated|joined|signed|agreed|approved|completed|achieved)\b',
    r'\b(invested|funded|raised|promoted|hired|appointed)\b',
]

# Negative predicates and patterns.
# Multi-word phrases use \s+ so they still match when the text wraps across a
# line break or contains repeated spaces ("laid\noff", "shut  down").
NEGATIVE_PATTERNS = [
    r'\b(lost|declined|decreased|dropped|fell|failed|fired|laid\s+off|resigned)\b',
    r'\b(sued|accused|charged|investigated|fined|penalized|rejected|denied)\b',
    r'\b(closed|shut\s+down|cancelled|terminated|withdrew|abandoned)\b',
]

# Neutral predicates (don't include words that are also in positive/negative lists)
NEUTRAL_PATTERNS = [
    r'\b(said|stated|reported|confirmed|disclosed)\b',
    r'\b(is|was|are|were|has|have|had)\b',
    r'\b(located|based|headquartered|operates|employs)\b',
]


def _count_pattern_hits(patterns, text: str) -> int:
    """Return the total number of matches of every regex in *patterns* in *text*."""
    return sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in patterns)


def classify_sentiment(text: str) -> tuple[str, float]:
    """
    Classify sentiment of text using keyword pattern matching.

    Matches are counted per category (positive, negative, neutral). A category
    wins only when its count is strictly greater than both other counts;
    otherwise, including ties, the result is neutral.

    Args:
        text: Text to classify

    Returns:
        (sentiment, confidence) where sentiment is 'positive', 'negative', or
        'neutral'. Confidence is 0.5 when nothing matched, 0.6 for a neutral
        verdict with matches, and 0.6 plus up to 0.3 (scaled by the winning
        category's share of all matches, capped at 0.9) otherwise.
    """
    text_lower = text.lower()

    # The pattern lists are read at call time, so edits to the module-level
    # lists take effect on the next call.
    positive_matches = _count_pattern_hits(POSITIVE_PATTERNS, text_lower)
    negative_matches = _count_pattern_hits(NEGATIVE_PATTERNS, text_lower)
    neutral_matches = _count_pattern_hits(NEUTRAL_PATTERNS, text_lower)

    total_matches = positive_matches + negative_matches + neutral_matches

    # No keyword evidence at all: neutral with the lowest confidence.
    if total_matches == 0:
        return "neutral", 0.5

    if positive_matches > negative_matches and positive_matches > neutral_matches:
        confidence = min(0.6 + (positive_matches / total_matches) * 0.3, 0.9)
        return "positive", confidence

    if negative_matches > positive_matches and negative_matches > neutral_matches:
        confidence = min(0.6 + (negative_matches / total_matches) * 0.3, 0.9)
        return "negative", confidence

    # Neutral dominates, or the leading categories are tied.
    return "neutral", 0.6
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@PluginRegistry.labeler
class SentimentLabeler(BaseLabelerPlugin):
    """
    Labeler that tags each statement as positive, negative or neutral.

    It exposes a ClassificationSchema (intended for GLiNER2, per the module
    notes) and prefers a classification already stored in the pipeline
    context. When none is stored it falls back to keyword pattern matching.
    """

    SENTIMENT_CHOICES = ["positive", "negative", "neutral"]

    @property
    def name(self) -> str:
        return "sentiment_labeler"

    @property
    def priority(self) -> int:
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        return "Classifies statement sentiment (positive/negative/neutral)"

    @property
    def label_type(self) -> str:
        return "sentiment"

    @property
    def classification_schema(self) -> ClassificationSchema:
        """Schema describing the sentiment classification task and its choices."""
        return ClassificationSchema(
            label_type=self.label_type,
            choices=self.SENTIMENT_CHOICES,
            description="Classify the sentiment of this statement",
            scope="statement",
        )

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Produce the sentiment label for a statement.

        Args:
            statement: The statement being labeled
            subject_canonical: Canonicalized subject (not read here)
            object_canonical: Canonicalized object (not read here)
            context: Pipeline context, queried for a stored classification
                keyed by the statement's source text and this label type

        Returns:
            StatementLabel holding the sentiment and its confidence, taken
            from the stored classification when one exists and from pattern
            matching otherwise
        """
        precomputed = context.get_classification(statement.source_text, self.label_type)

        if precomputed:
            # A stored result is a (value, confidence) pair.
            sentiment_value, score = precomputed
        else:
            # Fallback: keyword matching over the predicate plus source text.
            combined_text = f"{statement.predicate} {statement.source_text}"
            sentiment_value, score = classify_sentiment(combined_text)

        return StatementLabel(
            label_type=self.label_type,
            label_value=sentiment_value,
            confidence=score,
            labeler=self.name,
        )


# Second module-level name for the class, kept for tests. It refers to the
# same, already-decorated class object.
SentimentLabelerClass = SentimentLabeler
|