corp-extractor 0.2.11-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statement_extractor/scoring.py
@@ -2,76 +2,194 @@
2
2
  Scoring module for statement extraction quality assessment.
3
3
 
4
4
  Provides:
5
- - TripleScorer: Score individual triples for groundedness
5
+ - TripleScorer: Score individual triples combining semantic similarity and grammatical accuracy
6
6
  - BeamScorer: Score and select/merge beams based on quality metrics
7
7
  """
8
8
 
9
9
  import logging
10
10
  from typing import Optional
11
11
 
12
+ import numpy as np
13
+
12
14
  from .models import ScoringConfig, Statement
13
15
 
14
16
  logger = logging.getLogger(__name__)
15
17
 
18
+ # Lazy-loaded spaCy model for grammatical analysis
19
+ _nlp = None
20
+
21
+
22
+ def _get_nlp():
23
+ """Lazy-load spaCy model for POS tagging."""
24
+ global _nlp
25
+ if _nlp is None:
26
+ import spacy
27
+ try:
28
+ _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
29
+ except OSError:
30
+ # Model not found, try to download
31
+ from .spacy_extraction import _download_model
32
+ if _download_model():
33
+ _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
34
+ else:
35
+ raise
36
+ return _nlp
37
+
16
38
 
17
39
  class TripleScorer:
18
40
  """
19
- Score individual triples for groundedness in source text.
20
-
21
- Groundedness is measured by checking:
22
- - Subject text appears in source
23
- - Object text appears in source
24
- - Subject and object are in proximity (same/nearby sentences)
25
- - Evidence span exists and is valid
41
+ Score individual triples combining semantic similarity and grammatical accuracy.
42
+
43
+ The score is a weighted combination of:
44
+ - Semantic similarity (50%): Cosine similarity between source text and reassembled triple
45
+ - Subject noun score (25%): How noun-like the subject is
46
+ - Object noun score (25%): How noun-like the object is
47
+
48
+ Noun scoring:
49
+ - Proper noun only (PROPN): 1.0
50
+ - Common noun only (NOUN): 0.8
51
+ - Contains noun + other words: 0.4-0.8 (scaled by noun ratio)
52
+ - No noun: 0.2
26
53
  """
27
54
 
28
- def __init__(self, config: Optional[ScoringConfig] = None):
55
+ def __init__(
56
+ self,
57
+ config: Optional[ScoringConfig] = None,
58
+ device: Optional[str] = None,
59
+ ):
29
60
  self.config = config or ScoringConfig()
30
61
 
62
+ # Auto-detect device
63
+ if device is None:
64
+ import torch
65
+ if torch.cuda.is_available():
66
+ self.device = "cuda"
67
+ elif torch.backends.mps.is_available():
68
+ self.device = "mps"
69
+ else:
70
+ self.device = "cpu"
71
+ else:
72
+ self.device = device
73
+
74
+ # Lazy-loaded embedding model
75
+ self._model = None
76
+ self._embedding_model_name = "all-MiniLM-L6-v2"
77
+
78
+ def _load_model(self):
79
+ """Load sentence-transformers model lazily."""
80
+ if self._model is not None:
81
+ return
82
+
83
+ from sentence_transformers import SentenceTransformer
84
+
85
+ logger.debug(f"Loading embedding model: {self._embedding_model_name} on {self.device}")
86
+ self._model = SentenceTransformer(self._embedding_model_name, device=self.device)
87
+ logger.debug(f"Embedding model loaded on {self.device}")
88
+
89
+ def _compute_embeddings(self, texts: list[str]) -> np.ndarray:
90
+ """Compute embeddings for a list of texts."""
91
+ self._load_model()
92
+ return self._model.encode(texts, convert_to_numpy=True)
93
+
94
+ def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
95
+ """Compute cosine similarity between two vectors."""
96
+ dot = np.dot(vec1, vec2)
97
+ norm1 = np.linalg.norm(vec1)
98
+ norm2 = np.linalg.norm(vec2)
99
+ if norm1 == 0 or norm2 == 0:
100
+ return 0.0
101
+ return float(dot / (norm1 * norm2))
102
+
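A quick numeric check of the helper above: colinear vectors score 1.0, orthogonal vectors 0.0, and a zero-norm vector is guarded to 0.0 (values below are illustrative only):

```python
import numpy as np

a = np.array([1.0, 0.0])
b = np.array([2.0, 0.0])   # same direction as a
c = np.array([0.0, 3.0])   # orthogonal to a

cos_ab = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))  # 1.0
cos_ac = float(np.dot(a, c) / (np.linalg.norm(a) * np.linalg.norm(c)))  # 0.0
```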
103
+ def _score_noun_content(self, text: str) -> float:
104
+ """
105
+ Score how noun-like a text is.
106
+
107
+ Returns:
108
+ 1.0 - Entirely proper noun(s)
109
+ 0.8 - Entirely common noun(s)
110
+ 0.4-0.8 - Contains noun(s) but also other words, scaled by noun ratio
111
+ 0.2 - No nouns found
112
+ """
113
+ if not text or not text.strip():
114
+ return 0.2
115
+
116
+ try:
117
+ nlp = _get_nlp()
118
+ doc = nlp(text)
119
+
120
+ # Count token types (excluding punctuation and spaces)
121
+ tokens = [t for t in doc if not t.is_punct and not t.is_space]
122
+ if not tokens:
123
+ return 0.2
124
+
125
+ proper_nouns = sum(1 for t in tokens if t.pos_ == "PROPN")
126
+ common_nouns = sum(1 for t in tokens if t.pos_ == "NOUN")
127
+ total_nouns = proper_nouns + common_nouns
128
+ total_tokens = len(tokens)
129
+
130
+ if total_nouns == 0:
131
+ # No nouns at all
132
+ return 0.2
133
+
134
+ if total_nouns == total_tokens:
135
+ # Entirely nouns
136
+ if proper_nouns == total_tokens:
137
+ # All proper nouns
138
+ return 1.0
139
+ elif common_nouns == total_tokens:
140
+ # All common nouns
141
+ return 0.8
142
+ else:
143
+ # Mix of proper and common nouns
144
+ return 0.9
145
+
146
+ # Contains nouns but also other words
147
+ # Score based on noun ratio
148
+ noun_ratio = total_nouns / total_tokens
149
+ return 0.4 + (noun_ratio * 0.4) # Range: 0.4 to 0.8
150
+
151
+ except Exception as e:
152
+ logger.debug(f"Noun scoring failed for '{text}': {e}")
153
+ return 0.5 # Neutral score on error
154
+
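To make the tiers concrete, here is roughly what `_score_noun_content` returns for a few phrases, assuming `en_core_web_sm` tags them the usual way (PROPN for "Apple"/"Inc.", NOUN for "company"); exact values for mixed phrases depend on the noun ratio:

```python
scorer = TripleScorer()

scorer._score_noun_content("Apple Inc.")       # ~1.0, all proper nouns
scorer._score_noun_content("company")          # ~0.8, all common nouns
scorer._score_noun_content("the new company")  # 0.4-0.8, scaled by noun ratio
scorer._score_noun_content("ran quickly")      # ~0.2, no nouns
```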
31
155
  def score_triple(self, statement: Statement, source_text: str) -> float:
32
156
  """
33
- Score a triple's groundedness (0-1).
157
+ Score a triple's quality (0-1) combining semantic similarity and grammatical accuracy.
34
158
 
35
- Higher scores indicate better grounding in source text.
159
+ The score is a weighted combination of:
160
+ - Semantic similarity (50%): How well the triple captures the source meaning
161
+ - Subject noun score (25%): Grammatical quality of subject
162
+ - Object noun score (25%): Grammatical quality of object
163
+
164
+ Higher scores indicate better overall quality.
36
165
  """
37
- if not source_text:
166
+ # Use statement's source_text if available, otherwise use provided source_text
167
+ reference_text = statement.source_text or source_text
168
+ if not reference_text:
38
169
  logger.debug(f" No source text, returning neutral score 0.5")
39
170
  return 0.5 # Neutral score if no source text
40
171
 
41
- score = 0.0
42
- weights_sum = 0.0
43
-
44
- # Check subject appears in source (weight: 0.3)
45
- subject_found = self._text_appears_in(statement.subject.text, source_text)
46
- score += 0.3 * (1.0 if subject_found else 0.0)
47
- weights_sum += 0.3
48
-
49
- # Check object appears in source (weight: 0.3)
50
- object_found = self._text_appears_in(statement.object.text, source_text)
51
- score += 0.3 * (1.0 if object_found else 0.0)
52
- weights_sum += 0.3
53
-
54
- # Check predicate has lexical trigger (weight: 0.2)
55
- predicate_grounded = self._predicate_has_trigger(statement.predicate, source_text)
56
- score += 0.2 * (1.0 if predicate_grounded else 0.0)
57
- weights_sum += 0.2
58
-
59
- # Check proximity - subject and object in same/nearby region (weight: 0.2)
60
- proximity_score = 0.0
61
- if subject_found and object_found:
62
- proximity_score = self._compute_proximity(
63
- statement.subject.text,
64
- statement.object.text,
65
- source_text
66
- )
67
- score += 0.2 * proximity_score
68
- weights_sum += 0.2
172
+ # Reassemble the triple
173
+ reassembled = f"{statement.subject.text} {statement.predicate} {statement.object.text}"
174
+
175
+ # Compute semantic similarity
176
+ embeddings = self._compute_embeddings([reference_text, reassembled])
177
+ semantic_similarity = self._cosine_similarity(embeddings[0], embeddings[1])
69
178
 
70
- final_score = score / weights_sum if weights_sum > 0 else 0.0
179
+ # Compute grammatical scores for subject and object
180
+ subject_noun_score = self._score_noun_content(statement.subject.text)
181
+ object_noun_score = self._score_noun_content(statement.object.text)
182
+
183
+ # Weighted combination: 50% semantic, 25% subject, 25% object
184
+ final_score = (
185
+ semantic_similarity * 0.5 +
186
+ subject_noun_score * 0.25 +
187
+ object_noun_score * 0.25
188
+ )
71
189
 
72
190
  logger.debug(
73
191
  f" Score for '{statement.subject.text}' --[{statement.predicate}]--> '{statement.object.text}': "
74
- f"{final_score:.2f} (subj={subject_found}, obj={object_found}, pred={predicate_grounded}, prox={proximity_score:.2f})"
192
+ f"{final_score:.3f} (semantic={semantic_similarity:.2f}, subj_noun={subject_noun_score:.2f}, obj_noun={object_noun_score:.2f})"
75
193
  )
76
194
 
77
195
  return final_score
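A worked example of the 50/25/25 combination above, with illustrative component values (not output from the package):

```python
semantic_similarity = 0.84   # cosine similarity, source vs. reassembled triple
subject_noun_score = 1.0     # subject tagged entirely as proper nouns
object_noun_score = 0.8      # object is a single common noun

final_score = (
    semantic_similarity * 0.5 +
    subject_noun_score * 0.25 +
    object_noun_score * 0.25
)
# 0.42 + 0.25 + 0.20 = 0.87
```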
@@ -115,54 +233,6 @@ class TripleScorer:
115
233
 
116
234
  return None
117
235
 
118
- def _text_appears_in(self, text: str, source: str) -> bool:
119
- """Check if text appears in source (case-insensitive)."""
120
- return text.lower() in source.lower()
121
-
122
- def _predicate_has_trigger(self, predicate: str, source: str) -> bool:
123
- """Check if predicate has a lexical trigger in source."""
124
- # Extract main verb/word from predicate
125
- words = predicate.lower().split()
126
- source_lower = source.lower()
127
-
128
- # Check if any predicate word appears in source
129
- for word in words:
130
- if len(word) > 2 and word in source_lower:
131
- return True
132
- return False
133
-
134
- def _compute_proximity(
135
- self,
136
- subject_text: str,
137
- object_text: str,
138
- source: str
139
- ) -> float:
140
- """
141
- Compute proximity score (0-1) based on distance between subject and object.
142
-
143
- Returns 1.0 if same sentence, decreasing with distance.
144
- """
145
- source_lower = source.lower()
146
- subj_pos = source_lower.find(subject_text.lower())
147
- obj_pos = source_lower.find(object_text.lower())
148
-
149
- if subj_pos < 0 or obj_pos < 0:
150
- return 0.0
151
-
152
- # Check if in same sentence
153
- start = min(subj_pos, obj_pos)
154
- end = max(subj_pos, obj_pos)
155
- region = source[start:end]
156
-
157
- # If no sentence boundary between them, high proximity
158
- if '.' not in region and '!' not in region and '?' not in region:
159
- return 1.0
160
-
161
- # Otherwise, score decreases with distance
162
- # Assume ~100 chars per sentence on average
163
- sentence_distance = region.count('.') + region.count('!') + region.count('?')
164
- return max(0.0, 1.0 - (sentence_distance * 0.2))
165
-
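For comparison, the deleted proximity heuristic reduced to a single expression: each sentence boundary between the subject and object mentions cost 0.2, floored at zero. Illustrative arithmetic:

```python
sentence_distance = 2                                 # two boundaries between mentions
proximity = max(0.0, 1.0 - sentence_distance * 0.2)  # 0.6
```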
166
236
  def _extend_to_sentence(
167
237
  self,
168
238
  source: str,
statement_extractor/spacy_extraction.py (new file)
@@ -0,0 +1,386 @@
1
+ """
2
+ spaCy-based triple extraction.
3
+
4
+ Uses spaCy dependency parsing to extract subject, predicate, and object
5
+ from source text. The T5-Gemma model provides triple structure and coreference
6
+ resolution, while spaCy handles linguistic analysis.
7
+
8
+ The spaCy model is downloaded automatically on first use.
9
+ """
10
+
11
+ import logging
12
+ from typing import Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Lazy-loaded spaCy model
17
+ _nlp = None
18
+
19
+
20
+ def _download_model():
21
+ """Download the spaCy model if not present."""
22
+ import shutil
23
+ import subprocess
24
+ import sys
25
+
26
+ # Direct URL to the spaCy model wheel
27
+ MODEL_URL = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
28
+
29
+ logger.info("Downloading spaCy model 'en_core_web_sm'...")
30
+
31
+ # Try uv first (for uv-managed environments)
32
+ uv_path = shutil.which("uv")
33
+ if uv_path:
34
+ try:
35
+ result = subprocess.run(
36
+ [uv_path, "pip", "install", MODEL_URL],
37
+ capture_output=True,
38
+ text=True,
39
+ )
40
+ if result.returncode == 0:
41
+ logger.info("Successfully downloaded spaCy model via uv")
42
+ return True
43
+ logger.debug(f"uv pip install failed: {result.stderr}")
44
+ except Exception as e:
45
+ logger.debug(f"uv pip install failed: {e}")
46
+
47
+ # Try pip directly
48
+ try:
49
+ result = subprocess.run(
50
+ [sys.executable, "-m", "pip", "install", MODEL_URL],
51
+ capture_output=True,
52
+ text=True,
53
+ )
54
+ if result.returncode == 0:
55
+ logger.info("Successfully downloaded spaCy model via pip")
56
+ return True
57
+ logger.debug(f"pip install failed: {result.stderr}")
58
+ except Exception as e:
59
+ logger.debug(f"pip install failed: {e}")
60
+
61
+ # Try spacy's download as last resort
62
+ try:
63
+ from spacy.cli import download
64
+ download("en_core_web_sm")
65
+ # Check if it actually worked
66
+ import spacy
67
+ spacy.load("en_core_web_sm")
68
+ logger.info("Successfully downloaded spaCy model via spacy")
69
+ return True
70
+ except Exception:
71
+ pass
72
+
73
+ logger.warning(
74
+ "Failed to download spaCy model automatically. "
75
+ "Please run: uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
76
+ )
77
+ return False
78
+
79
+
80
+ def _get_nlp():
81
+ """
82
+ Lazy-load the spaCy model.
83
+
84
+ Disables NER and lemmatizer for faster processing since we only
85
+ need dependency parsing. Automatically downloads the model if not present.
86
+ """
87
+ global _nlp
88
+ if _nlp is None:
89
+ import spacy
90
+
91
+ # Try to load the model, download if not present
92
+ try:
93
+ _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
94
+ logger.debug("Loaded spaCy model for extraction")
95
+ except OSError:
96
+ # Model not found, try to download it
97
+ if _download_model():
98
+ _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
99
+ logger.debug("Loaded spaCy model after download")
100
+ else:
101
+ raise OSError(
102
+ "spaCy model not found and automatic download failed. "
103
+ "Please run: python -m spacy download en_core_web_sm"
104
+ )
105
+ return _nlp
106
+
107
+
108
+ def _get_full_noun_phrase(token) -> str:
109
+ """
110
+ Get the full noun phrase for a token, including compounds and modifiers.
111
+ """
112
+ # Get all tokens in the subtree that form the noun phrase
113
+ phrase_tokens = []
114
+
115
+ # Collect compound modifiers and the token itself
116
+ for t in token.subtree:
117
+ # Include compounds, adjectives, determiners, and the head noun
118
+ if t.dep_ in ("compound", "amod", "det", "poss", "nummod", "nmod") or t == token:
119
+ phrase_tokens.append(t)
120
+
121
+ # Sort by position and join
122
+ phrase_tokens.sort(key=lambda x: x.i)
123
+ return " ".join([t.text for t in phrase_tokens])
124
+
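A small illustration of `_get_full_noun_phrase`, assuming `en_core_web_sm` parses "The proposed merger" as det + amod + nsubj (the exact span depends on the parse):

```python
nlp = _get_nlp()
doc = nlp("The proposed merger surprised analysts.")

subject_head = next(t for t in doc if t.dep_ == "nsubj")  # "merger"
print(_get_full_noun_phrase(subject_head))                # expected: "The proposed merger"
```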
125
+
126
+ def _extract_verb_phrase(verb_token) -> str:
127
+ """
128
+ Extract the full verb phrase including auxiliaries and particles.
129
+ """
130
+ parts = []
131
+
132
+ # Collect auxiliaries that come before the verb
133
+ for child in verb_token.children:
134
+ if child.dep_ in ("aux", "auxpass") and child.i < verb_token.i:
135
+ parts.append((child.i, child.text))
136
+
137
+ # Add the main verb
138
+ parts.append((verb_token.i, verb_token.text))
139
+
140
+ # Collect particles and prepositions that are part of phrasal verbs
141
+ for child in verb_token.children:
142
+ if child.dep_ == "prt" and child.i > verb_token.i:
143
+ parts.append((child.i, child.text))
144
+ # Include prepositions for phrasal verbs like "announced by"
145
+ elif child.dep_ == "agent" and child.i > verb_token.i:
146
+ # For passive constructions, include "by"
147
+ parts.append((child.i, child.text))
148
+
149
+ # Sort by position and join
150
+ parts.sort(key=lambda x: x[0])
151
+ return " ".join([p[1] for p in parts])
152
+
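Illustration of `_extract_verb_phrase` on a passive sentence, where the auxiliary ("was") and the agent marker ("by") are folded into the predicate; the expected output assumes a typical `en_core_web_sm` parse:

```python
nlp = _get_nlp()
doc = nlp("The deal was announced by Acme Corp.")

root = next(t for t in doc if t.dep_ == "ROOT")  # "announced"
print(_extract_verb_phrase(root))                # expected: "was announced by"
```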
153
+
154
+ def _match_entity_boundaries(
155
+ spacy_text: str,
156
+ model_text: str,
157
+ source_text: str,
158
+ ) -> str:
159
+ """
160
+ Match entity boundaries between spaCy extraction and model hint.
161
+
162
+ If model text is a superset that includes spaCy text, use model text
163
+ for better entity boundaries (e.g., "Apple" -> "Apple Inc.").
164
+ """
165
+ spacy_lower = spacy_text.lower()
166
+ model_lower = model_text.lower()
167
+
168
+ # If model text contains spaCy text, prefer model text
169
+ if spacy_lower in model_lower:
170
+ return model_text
171
+
172
+ # If spaCy text contains model text, prefer spaCy text
173
+ if model_lower in spacy_lower:
174
+ return spacy_text
175
+
176
+ # If they overlap significantly, prefer the one that appears in source
177
+ if spacy_text in source_text:
178
+ return spacy_text
179
+ if model_text in source_text:
180
+ return model_text
181
+
182
+ # Default to spaCy extraction
183
+ return spacy_text
184
+
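Concrete cases for `_match_entity_boundaries` (inputs are illustrative):

```python
# spaCy span is contained in the model hint -> the longer hint wins
_match_entity_boundaries("Apple", "Apple Inc.", "Apple Inc. unveiled a new chip.")
# -> "Apple Inc."

# model hint is contained in the spaCy span -> keep the spaCy span
_match_entity_boundaries("the Vision Pro headset", "Vision Pro", "The company showed the Vision Pro headset.")
# -> "the Vision Pro headset"
```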
185
+
186
+ def _extract_spacy_triple(doc, model_subject: str, model_object: str, source_text: str) -> tuple[str | None, str | None, str | None]:
187
+ """Extract subject, predicate, object from spaCy doc."""
188
+ # Find the root verb
189
+ root = None
190
+ for token in doc:
191
+ if token.dep_ == "ROOT":
192
+ root = token
193
+ break
194
+
195
+ if root is None:
196
+ return None, None, None
197
+
198
+ # Extract predicate from root verb
199
+ predicate = None
200
+ if root.pos_ == "VERB":
201
+ predicate = _extract_verb_phrase(root)
202
+ elif root.pos_ == "AUX":
203
+ predicate = root.text
204
+
205
+ # Extract subject (nsubj, nsubjpass)
206
+ subject = None
207
+ for child in root.children:
208
+ if child.dep_ in ("nsubj", "nsubjpass"):
209
+ subject = _get_full_noun_phrase(child)
210
+ break
211
+
212
+ # If no direct subject, check parent
213
+ if subject is None and root.head != root:
214
+ for child in root.head.children:
215
+ if child.dep_ in ("nsubj", "nsubjpass"):
216
+ subject = _get_full_noun_phrase(child)
217
+ break
218
+
219
+ # Extract object (dobj, pobj, attr, oprd)
220
+ obj = None
221
+ for child in root.children:
222
+ if child.dep_ in ("dobj", "attr", "oprd"):
223
+ obj = _get_full_noun_phrase(child)
224
+ break
225
+ elif child.dep_ == "prep":
226
+ for pchild in child.children:
227
+ if pchild.dep_ == "pobj":
228
+ obj = _get_full_noun_phrase(pchild)
229
+ break
230
+ if obj:
231
+ break
232
+ elif child.dep_ == "agent":
233
+ for pchild in child.children:
234
+ if pchild.dep_ == "pobj":
235
+ obj = _get_full_noun_phrase(pchild)
236
+ break
237
+ if obj:
238
+ break
239
+
240
+ # Match against model values for better entity boundaries
241
+ if subject:
242
+ subject = _match_entity_boundaries(subject, model_subject, source_text)
243
+ if obj:
244
+ obj = _match_entity_boundaries(obj, model_object, source_text)
245
+
246
+ return subject, predicate, obj
247
+
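Putting the helpers together, `_extract_spacy_triple` walks from the ROOT verb to its nominal subject and object. A hedged sketch of what it yields for a simple transitive sentence (the exact spans depend on the parse):

```python
nlp = _get_nlp()
text = "Acme Corp acquired Widget Works in 2023."
doc = nlp(text)

triple = _extract_spacy_triple(doc, "Acme Corp", "Widget Works", text)
# expected roughly: ("Acme Corp", "acquired", "Widget Works")
```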
248
+
249
+ def extract_triple_from_text(
250
+ source_text: str,
251
+ model_subject: str,
252
+ model_object: str,
253
+ model_predicate: str,
254
+ ) -> tuple[str, str, str] | None:
255
+ """
256
+ Extract subject, predicate, object from source text using spaCy.
257
+
258
+ Returns a spaCy-based triple that can be added to the candidate pool
259
+ alongside the model's triple. The existing scoring/dedup logic will
260
+ pick the best one.
261
+
262
+ Args:
263
+ source_text: The source sentence to analyze
264
+ model_subject: Subject from T5-Gemma (used for entity boundary matching)
265
+ model_object: Object from T5-Gemma (used for entity boundary matching)
266
+ model_predicate: Predicate from T5-Gemma (unused, kept for API compat)
267
+
268
+ Returns:
269
+ Tuple of (subject, predicate, object) from spaCy, or None if extraction fails
270
+ """
271
+ if not source_text:
272
+ return None
273
+
274
+ try:
275
+ nlp = _get_nlp()
276
+ doc = nlp(source_text)
277
+ spacy_subject, spacy_predicate, spacy_object = _extract_spacy_triple(
278
+ doc, model_subject, model_object, source_text
279
+ )
280
+
281
+ # Only return if we got at least a predicate
282
+ if spacy_predicate:
283
+ logger.debug(
284
+ f"spaCy extracted: subj='{spacy_subject}', pred='{spacy_predicate}', obj='{spacy_object}'"
285
+ )
286
+ return (
287
+ spacy_subject or model_subject,
288
+ spacy_predicate,
289
+ spacy_object or model_object,
290
+ )
291
+
292
+ return None
293
+
294
+ except OSError as e:
295
+ logger.debug(f"Cannot load spaCy model: {e}")
296
+ return None
297
+ except Exception as e:
298
+ logger.debug(f"spaCy extraction failed: {e}")
299
+ return None
300
+
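A typical call from the extraction pipeline's side, using the model's subject/object as boundary hints; the result can sit in the candidate pool next to the model's own triple (values are illustrative):

```python
triple = extract_triple_from_text(
    source_text="Apple Inc. announced the Vision Pro in June.",
    model_subject="Apple Inc.",
    model_object="Vision Pro",
    model_predicate="announced",  # unused, kept for API compatibility
)
# expected roughly: ("Apple Inc.", "announced", "the Vision Pro"), or None if
# the parse yields no predicate or the spaCy model cannot be loaded
```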
301
+
302
+ def extract_triple_by_predicate_split(
303
+ source_text: str,
304
+ predicate: str,
305
+ ) -> tuple[str, str, str] | None:
306
+ """
307
+ Extract subject and object by splitting the source text around the predicate.
308
+
309
+ This is useful when the predicate is known but subject/object boundaries
310
+ are uncertain. Uses the predicate as an anchor point.
311
+
312
+ Args:
313
+ source_text: The source sentence
314
+ predicate: The predicate (verb phrase) to split on
315
+
316
+ Returns:
317
+ Tuple of (subject, predicate, object) or None if split fails
318
+ """
319
+ if not source_text or not predicate:
320
+ return None
321
+
322
+ # Find the predicate in the source text (case-insensitive)
323
+ source_lower = source_text.lower()
324
+ pred_lower = predicate.lower()
325
+
326
+ pred_pos = source_lower.find(pred_lower)
327
+ if pred_pos < 0:
328
+ # Try finding just the main verb (first word of predicate)
329
+ main_verb = pred_lower.split()[0] if pred_lower.split() else ""
330
+ if main_verb and len(main_verb) > 2:
331
+ pred_pos = source_lower.find(main_verb)
332
+ if pred_pos >= 0:
333
+ # Fall back to the main verb so the split uses its position and length
334
+ predicate = main_verb
335
+
336
+ if pred_pos < 0:
337
+ return None
338
+
339
+ # Extract subject (text before predicate, trimmed)
340
+ subject = source_text[:pred_pos].strip()
341
+
342
+ # Extract object (text after predicate, trimmed)
343
+ pred_end = pred_pos + len(predicate)
344
+ obj = source_text[pred_end:].strip()
345
+
346
+ # Clean up: remove trailing punctuation from object
347
+ obj = obj.rstrip('.,;:!?')
348
+
349
+ # Clean up: remove leading articles/prepositions from object if very short
350
+ obj_words = obj.split()
351
+ if obj_words and obj_words[0].lower() in ('a', 'an', 'the', 'to', 'of', 'for'):
352
+ if len(obj_words) > 1:
353
+ obj = ' '.join(obj_words[1:])
354
+
355
+ # Validate: both subject and object should have meaningful content
356
+ if len(subject) < 2 or len(obj) < 2:
357
+ return None
358
+
359
+ logger.debug(
360
+ f"Predicate-split extracted: subj='{subject}', pred='{predicate}', obj='{obj}'"
361
+ )
362
+
363
+ return (subject, predicate, obj)
364
+
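Because this helper is pure string manipulation, its behaviour is easy to trace by hand (illustrative input):

```python
extract_triple_by_predicate_split(
    "Acme Corp acquired Widget Works in 2023.",
    "acquired",
)
# subject = text before "acquired", object = text after it, trailing "." stripped
# -> ("Acme Corp", "acquired", "Widget Works in 2023")
```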
365
+
366
+ # Keep old function for backwards compatibility
367
+ def infer_predicate(
368
+ subject: str,
369
+ obj: str,
370
+ source_text: str,
371
+ ) -> Optional[str]:
372
+ """
373
+ Infer the predicate from source text using dependency parsing.
374
+
375
+ DEPRECATED: Use extract_triple_from_text instead.
376
+ """
377
+ result = extract_triple_from_text(
378
+ source_text=source_text,
379
+ model_subject=subject,
380
+ model_object=obj,
381
+ model_predicate="",
382
+ )
383
+ if result:
384
+ _, predicate, _ = result
385
+ return predicate if predicate else None
386
+ return None
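Migration sketch for callers of the deprecated shim: the new function returns the whole triple, so a predicate-only call becomes (example values are illustrative):

```python
# Before (deprecated): predicate only
predicate = infer_predicate("Apple", "Vision Pro", "Apple announced the Vision Pro.")

# After: full triple, with the old strings passed as boundary hints
result = extract_triple_from_text(
    source_text="Apple announced the Vision Pro.",
    model_subject="Apple",
    model_object="Vision Pro",
    model_predicate="",
)
if result:
    subject, predicate, obj = result
```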
corp_extractor-0.2.11.dist-info/RECORD
@@ -1,11 +0,0 @@
1
- statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
2
- statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
3
- statement_extractor/cli.py,sha256=NIGCpqcnzF42B16RCiSu4kN0RlnVne2ZAT8341Znt1g,8558
4
- statement_extractor/extractor.py,sha256=r2gcCfZT43Q8STPuzaXmhbjWXTAs4JwMeAtCjQxlsIQ,25870
5
- statement_extractor/models.py,sha256=IE3TyIiOl2CINPMroQnGT12rSeQFR0bV3y4BJ79wLmI,10877
6
- statement_extractor/predicate_comparer.py,sha256=jcuaBi5BYqD3TKoyj3pR9dxtX5ihfDJvjdhEd2LHCwc,26184
7
- statement_extractor/scoring.py,sha256=xs0SxrV42QNBULQguU1-HhcCc-HnS-ekbcdx7FqWGVk,15663
8
- corp_extractor-0.2.11.dist-info/METADATA,sha256=D-fs9i9kn4v5bRAHCHxI3cq_6vosNgDCN7uuYwVZztM,13775
9
- corp_extractor-0.2.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
- corp_extractor-0.2.11.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
11
- corp_extractor-0.2.11.dist-info/RECORD,,