corp-extractor 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/METADATA +181 -64
  2. corp_extractor-0.5.0.dist-info/RECORD +55 -0
  3. statement_extractor/__init__.py +9 -0
  4. statement_extractor/cli.py +446 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +1182 -0
  7. statement_extractor/extractor.py +1 -23
  8. statement_extractor/gliner_extraction.py +4 -74
  9. statement_extractor/llm.py +255 -0
  10. statement_extractor/models/__init__.py +74 -0
  11. statement_extractor/models/canonical.py +139 -0
  12. statement_extractor/models/entity.py +102 -0
  13. statement_extractor/models/labels.py +191 -0
  14. statement_extractor/models/qualifiers.py +91 -0
  15. statement_extractor/models/statement.py +75 -0
  16. statement_extractor/models.py +4 -1
  17. statement_extractor/pipeline/__init__.py +39 -0
  18. statement_extractor/pipeline/config.py +134 -0
  19. statement_extractor/pipeline/context.py +177 -0
  20. statement_extractor/pipeline/orchestrator.py +447 -0
  21. statement_extractor/pipeline/registry.py +297 -0
  22. statement_extractor/plugins/__init__.py +43 -0
  23. statement_extractor/plugins/base.py +446 -0
  24. statement_extractor/plugins/canonicalizers/__init__.py +17 -0
  25. statement_extractor/plugins/canonicalizers/base.py +9 -0
  26. statement_extractor/plugins/canonicalizers/location.py +219 -0
  27. statement_extractor/plugins/canonicalizers/organization.py +230 -0
  28. statement_extractor/plugins/canonicalizers/person.py +242 -0
  29. statement_extractor/plugins/extractors/__init__.py +13 -0
  30. statement_extractor/plugins/extractors/base.py +9 -0
  31. statement_extractor/plugins/extractors/gliner2.py +536 -0
  32. statement_extractor/plugins/labelers/__init__.py +29 -0
  33. statement_extractor/plugins/labelers/base.py +9 -0
  34. statement_extractor/plugins/labelers/confidence.py +138 -0
  35. statement_extractor/plugins/labelers/relation_type.py +87 -0
  36. statement_extractor/plugins/labelers/sentiment.py +159 -0
  37. statement_extractor/plugins/labelers/taxonomy.py +373 -0
  38. statement_extractor/plugins/labelers/taxonomy_embedding.py +466 -0
  39. statement_extractor/plugins/qualifiers/__init__.py +19 -0
  40. statement_extractor/plugins/qualifiers/base.py +9 -0
  41. statement_extractor/plugins/qualifiers/companies_house.py +174 -0
  42. statement_extractor/plugins/qualifiers/gleif.py +186 -0
  43. statement_extractor/plugins/qualifiers/person.py +221 -0
  44. statement_extractor/plugins/qualifiers/sec_edgar.py +198 -0
  45. statement_extractor/plugins/splitters/__init__.py +13 -0
  46. statement_extractor/plugins/splitters/base.py +9 -0
  47. statement_extractor/plugins/splitters/t5_gemma.py +188 -0
  48. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  49. statement_extractor/plugins/taxonomy/embedding.py +337 -0
  50. statement_extractor/plugins/taxonomy/mnli.py +279 -0
  51. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  52. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/WHEEL +0 -0
  53. {corp_extractor-0.4.0.dist-info → corp_extractor-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,138 @@
1
+ """
2
+ ConfidenceLabeler - Aggregates confidence scores from all pipeline stages.
3
+
4
+ Combines:
5
+ - Statement extraction confidence
6
+ - Entity extraction confidence
7
+ - Canonical match confidence
8
+ """
9
+
10
+ import logging
11
+ from typing import Optional
12
+
13
+ from ..base import BaseLabelerPlugin, PluginCapability
14
+ from ...pipeline.context import PipelineContext
15
+ from ...pipeline.registry import PluginRegistry
16
+ from ...models import (
17
+ PipelineStatement,
18
+ CanonicalEntity,
19
+ StatementLabel,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@PluginRegistry.labeler
class ConfidenceLabeler(BaseLabelerPlugin):
    """
    Labeler that folds per-stage confidences into one score per statement.

    The score is a weighted average of the statement's own confidence
    (when it is set), the subject and object entity confidences, and the
    mean canonical match confidence of subject and object.
    """

    def __init__(
        self,
        statement_weight: float = 0.4,
        subject_weight: float = 0.2,
        object_weight: float = 0.2,
        canonical_weight: float = 0.2,
    ):
        """
        Store the weight given to each confidence component.

        Args:
            statement_weight: Weight for statement extraction confidence
            subject_weight: Weight for subject entity confidence
            object_weight: Weight for object entity confidence
            canonical_weight: Weight for canonical match confidence
        """
        self._statement_weight = statement_weight
        self._subject_weight = subject_weight
        self._object_weight = object_weight
        self._canonical_weight = canonical_weight

    @property
    def name(self) -> str:
        """Identifier recorded in the ``labeler`` field of emitted labels."""
        return "confidence_labeler"

    @property
    def priority(self) -> int:
        """Ordering value; 100 is meant to place this after other labelers."""
        return 100  # Run after other labelers

    @property
    def capabilities(self) -> PluginCapability:
        """This labeler declares no special capabilities."""
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        """Human-readable summary of the plugin."""
        return "Aggregates confidence scores from all pipeline stages"

    @property
    def label_type(self) -> str:
        """Label type written on every label this plugin emits."""
        return "confidence"

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Compute the weighted-average confidence for one statement.

        Args:
            statement: The statement to label
            subject_canonical: Canonicalized subject
            object_canonical: Canonicalized object
            context: Pipeline context (not read by this labeler)

        Returns:
            StatementLabel whose value is the aggregate confidence rounded
            to three decimals; the label's own confidence is fixed at 1.0.
        """
        # (score, weight) pairs, gathered in a fixed order so the float
        # sums below are reproducible.
        components = []

        # The statement-level score is optional; leave it out when unset.
        if statement.confidence_score is not None:
            components.append((statement.confidence_score, self._statement_weight))

        components.append((statement.subject.confidence, self._subject_weight))
        components.append((statement.object.confidence, self._object_weight))

        # An entity without a canonical match contributes a neutral 0.5.
        subject_match, object_match = (
            entity.canonical_match.match_confidence if entity.canonical_match else 0.5
            for entity in (subject_canonical, object_canonical)
        )
        components.append(((subject_match + object_match) / 2, self._canonical_weight))

        weight_sum = sum(weight for _, weight in components)
        # Non-positive total weight cannot be normalised; fall back to 0.5.
        overall = (
            sum(score * weight for score, weight in components) / weight_sum
            if weight_sum > 0
            else 0.5
        )

        return StatementLabel(
            label_type=self.label_type,
            label_value=round(overall, 3),
            confidence=1.0,  # High confidence in our calculation
            labeler=self.name,
        )


# Second module-level name for the labeler, kept for tests to import.
# NOTE(review): this binds the same object as the decorated name above.
ConfidenceLabelerClass = ConfidenceLabeler
@@ -0,0 +1,87 @@
1
+ """
2
+ RelationTypeLabeler - Uses predicate category from GLiNER2 extraction.
3
+
4
+ The relation type comes from the predicate category assigned during
5
+ Stage 2 extraction (GLiNER2). If no category is available, logs an error.
6
+ """
7
+
8
+ import logging
9
+ from typing import Optional
10
+
11
+ from ..base import BaseLabelerPlugin, PluginCapability
12
+ from ...pipeline.context import PipelineContext
13
+ from ...pipeline.registry import PluginRegistry
14
+ from ...models import (
15
+ PipelineStatement,
16
+ CanonicalEntity,
17
+ StatementLabel,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@PluginRegistry.labeler
class RelationTypeLabeler(BaseLabelerPlugin):
    """
    Labeler that uses predicate category from GLiNER2 as relation type.

    The category is set during Stage 2 extraction when GLiNER2 matches
    a predicate from default_predicates.json (organized by category).
    """

    # Confidence reported when the statement itself carries no score.
    # 0.5 is used as a neutral "unknown" value.
    _FALLBACK_CONFIDENCE = 0.5

    @property
    def name(self) -> str:
        """Identifier recorded in the ``labeler`` field of emitted labels."""
        return "relation_type_labeler"

    @property
    def priority(self) -> int:
        """Ordering value used when scheduling labelers."""
        return 20

    @property
    def capabilities(self) -> PluginCapability:
        """This labeler declares no special capabilities."""
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        """Human-readable summary of the plugin."""
        return "Uses predicate category from GLiNER2 as relation type"

    @property
    def label_type(self) -> str:
        """Label type written on every label this plugin emits."""
        return "relation_type"

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Get relation type from statement's predicate category.

        Args:
            statement: The statement to label
            subject_canonical: Canonicalized subject (not read here)
            object_canonical: Canonicalized object (not read here)
            context: Pipeline context (not read here)

        Returns:
            StatementLabel whose value is the predicate category and whose
            confidence is the statement's confidence score (or a neutral
            fallback when that score is None). Returns None, after logging
            an error, when the statement has no predicate category.
        """
        if not statement.predicate_category:
            logger.error(
                f"No predicate_category for statement: "
                f"'{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}'"
            )
            return None

        # Never forward None as the label confidence: substitute the
        # fallback so the label always carries a numeric value.
        # NOTE(review): assumes StatementLabel.confidence expects a float - confirm.
        confidence = statement.confidence_score
        if confidence is None:
            confidence = self._FALLBACK_CONFIDENCE

        return StatementLabel(
            label_type=self.label_type,
            label_value=statement.predicate_category,
            confidence=confidence,  # Use statement's confidence
            labeler=self.name,
        )


# Second module-level name for the labeler, kept for tests to import.
# NOTE(review): this binds the same object as the decorated name above.
RelationTypeLabelerClass = RelationTypeLabeler
@@ -0,0 +1,159 @@
1
+ """
2
+ SentimentLabeler - Classifies statement sentiment.
3
+
4
+ Uses GLiNER2 classification when available, falls back to pattern matching.
5
+ """
6
+
7
+ import logging
8
+ import re
9
+ from typing import Optional
10
+
11
+ from ..base import BaseLabelerPlugin, ClassificationSchema, PluginCapability
12
+ from ...pipeline.context import PipelineContext
13
+ from ...pipeline.registry import PluginRegistry
14
+ from ...models import (
15
+ PipelineStatement,
16
+ CanonicalEntity,
17
+ StatementLabel,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
# Regexes whose hits count as evidence of positive sentiment.
POSITIVE_PATTERNS = [
    r'\b(acquired|announced|launched|released|expanded|grew|increased|improved|won|awarded)\b',
    r'\b(partnered|collaborated|joined|signed|agreed|approved|completed|achieved)\b',
    r'\b(invested|funded|raised|promoted|hired|appointed)\b',
]

# Regexes whose hits count as evidence of negative sentiment.
NEGATIVE_PATTERNS = [
    r'\b(lost|declined|decreased|dropped|fell|failed|fired|laid off|resigned)\b',
    r'\b(sued|accused|charged|investigated|fined|penalized|rejected|denied)\b',
    r'\b(closed|shut down|cancelled|terminated|withdrew|abandoned)\b',
]

# Regexes for neutral wording (kept disjoint from the two lists above).
NEUTRAL_PATTERNS = [
    r'\b(said|stated|reported|confirmed|disclosed)\b',
    r'\b(is|was|are|were|has|have|had)\b',
    r'\b(located|based|headquartered|operates|employs)\b',
]


def classify_sentiment(text: str) -> tuple[str, float]:
    """
    Classify the sentiment of ``text`` by counting keyword pattern hits.

    Hits are counted per polarity over the lower-cased text. A polarity
    wins only when its count is strictly greater than both of the other
    two counts; ties and neutral majorities yield "neutral".

    Returns:
        (sentiment, confidence) where sentiment is 'positive', 'negative',
        or 'neutral'. Confidence is 0.5 when nothing matched, 0.6 for a
        neutral result with matches, and otherwise 0.6 plus 0.3 times the
        winner's share of all hits, capped at 0.9.
    """
    lowered = text.lower()

    def count_hits(patterns):
        # Number of non-overlapping matches across every pattern in the list.
        return sum(
            1
            for regex in patterns
            for _ in re.finditer(regex, lowered, re.IGNORECASE)
        )

    positive_hits = count_hits(POSITIVE_PATTERNS)
    negative_hits = count_hits(NEGATIVE_PATTERNS)
    neutral_hits = count_hits(NEUTRAL_PATTERNS)
    all_hits = positive_hits + negative_hits + neutral_hits

    # No keyword at all: neutral with the lowest confidence.
    if not all_hits:
        return "neutral", 0.5

    # Positive is tested before negative; each must beat both rivals outright.
    for polarity, hits, rival in (
        ("positive", positive_hits, negative_hits),
        ("negative", negative_hits, positive_hits),
    ):
        if hits > rival and hits > neutral_hits:
            return polarity, min(0.6 + (hits / all_hits) * 0.3, 0.9)

    return "neutral", 0.6
79
+
80
+
81
@PluginRegistry.labeler
class SentimentLabeler(BaseLabelerPlugin):
    """
    Labeler that assigns a sentiment to each statement.

    Exposes a ClassificationSchema so GLiNER2 can run classification, and
    uses pattern matching when no pre-computed result is available.
    """

    SENTIMENT_CHOICES = ["positive", "negative", "neutral"]

    @property
    def name(self) -> str:
        """Identifier recorded in the ``labeler`` field of emitted labels."""
        return "sentiment_labeler"

    @property
    def priority(self) -> int:
        """Ordering value used when scheduling labelers."""
        return 10

    @property
    def capabilities(self) -> PluginCapability:
        """This labeler declares no special capabilities."""
        return PluginCapability.NONE

    @property
    def description(self) -> str:
        """Human-readable summary of the plugin."""
        return "Classifies statement sentiment (positive/negative/neutral)"

    @property
    def label_type(self) -> str:
        """Label type written on every label this plugin emits."""
        return "sentiment"

    @property
    def classification_schema(self) -> ClassificationSchema:
        """Schema describing the statement-scoped sentiment classification."""
        return ClassificationSchema(
            label_type=self.label_type,
            choices=self.SENTIMENT_CHOICES,
            description="Classify the sentiment of this statement",
            scope="statement",
        )

    def label(
        self,
        statement: PipelineStatement,
        subject_canonical: CanonicalEntity,
        object_canonical: CanonicalEntity,
        context: PipelineContext,
    ) -> Optional[StatementLabel]:
        """
        Produce the sentiment label for a statement.

        A classification already stored in the context for the statement's
        source text is used when present; otherwise the predicate and
        source text are run through the pattern-based classifier.
        """
        # Prefer the (value, confidence) pair pre-computed by the extractor.
        outcome = context.get_classification(statement.source_text, self.label_type)
        if not outcome:
            # Fallback: keyword patterns over predicate + source text.
            outcome = classify_sentiment(
                f"{statement.predicate} {statement.source_text}"
            )

        sentiment_value, sentiment_confidence = outcome
        return StatementLabel(
            label_type=self.label_type,
            label_value=sentiment_value,
            confidence=sentiment_confidence,
            labeler=self.name,
        )


# Second module-level name for the labeler, kept for tests to import.
# NOTE(review): this binds the same object as the decorated name above.
SentimentLabelerClass = SentimentLabeler