corp-extractor 0.2.5-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
+ """
+ spaCy-based triple extraction.
+
+ Uses spaCy dependency parsing to extract subject, predicate, and object
+ from source text. T5-Gemma model provides triple structure and coreference
+ resolution, while spaCy handles linguistic analysis.
+
+ The spaCy model is downloaded automatically on first use.
+ """
+
+ import logging
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+ # Lazy-loaded spaCy model
+ _nlp = None
+
+
+ def _download_model():
+     """Download the spaCy model if not present."""
+     import shutil
+     import subprocess
+     import sys
+
+     # Direct URL to the spaCy model wheel
+     MODEL_URL = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
+
+     logger.info("Downloading spaCy model 'en_core_web_sm'...")
+
+     # Try uv first (for uv-managed environments)
+     uv_path = shutil.which("uv")
+     if uv_path:
+         try:
+             result = subprocess.run(
+                 [uv_path, "pip", "install", MODEL_URL],
+                 capture_output=True,
+                 text=True,
+             )
+             if result.returncode == 0:
+                 logger.info("Successfully downloaded spaCy model via uv")
+                 return True
+             logger.debug(f"uv pip install failed: {result.stderr}")
+         except Exception as e:
+             logger.debug(f"uv pip install failed: {e}")
+
+     # Try pip directly
+     try:
+         result = subprocess.run(
+             [sys.executable, "-m", "pip", "install", MODEL_URL],
+             capture_output=True,
+             text=True,
+         )
+         if result.returncode == 0:
+             logger.info("Successfully downloaded spaCy model via pip")
+             return True
+         logger.debug(f"pip install failed: {result.stderr}")
+     except Exception as e:
+         logger.debug(f"pip install failed: {e}")
+
+     # Try spacy's download as last resort
+     try:
+         from spacy.cli import download
+         download("en_core_web_sm")
+         # Check if it actually worked
+         import spacy
+         spacy.load("en_core_web_sm")
+         logger.info("Successfully downloaded spaCy model via spacy")
+         return True
+     except Exception:
+         pass
+
+     logger.warning(
+         "Failed to download spaCy model automatically. "
+         "Please run: uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
+     )
+     return False
+
+
+ def _get_nlp():
+     """
+     Lazy-load the spaCy model.
+
+     Disables NER and lemmatizer for faster processing since we only
+     need dependency parsing. Automatically downloads the model if not present.
+     """
+     global _nlp
+     if _nlp is None:
+         import spacy
+
+         # Try to load the model, download if not present
+         try:
+             _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+             logger.debug("Loaded spaCy model for extraction")
+         except OSError:
+             # Model not found, try to download it
+             if _download_model():
+                 _nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
+                 logger.debug("Loaded spaCy model after download")
+             else:
+                 raise OSError(
+                     "spaCy model not found and automatic download failed. "
+                     "Please run: python -m spacy download en_core_web_sm"
+                 )
+     return _nlp
+
+
+ def _get_full_noun_phrase(token) -> str:
+     """
+     Get the full noun phrase for a token, including compounds and modifiers.
+     """
+     # Get all tokens in the subtree that form the noun phrase
+     phrase_tokens = []
+
+     # Collect compound modifiers and the token itself
+     for t in token.subtree:
+         # Include compounds, adjectives, determiners, and the head noun
+         if t.dep_ in ("compound", "amod", "det", "poss", "nummod", "nmod") or t == token:
+             phrase_tokens.append(t)
+
+     # Sort by position and join
+     phrase_tokens.sort(key=lambda x: x.i)
+     return " ".join([t.text for t in phrase_tokens])
+
+
+ def _extract_verb_phrase(verb_token) -> str:
+     """
+     Extract the full verb phrase including auxiliaries and particles.
+     """
+     parts = []
+
+     # Collect auxiliaries that come before the verb
+     for child in verb_token.children:
+         if child.dep_ in ("aux", "auxpass") and child.i < verb_token.i:
+             parts.append((child.i, child.text))
+
+     # Add the main verb
+     parts.append((verb_token.i, verb_token.text))
+
+     # Collect particles and prepositions that are part of phrasal verbs
+     for child in verb_token.children:
+         if child.dep_ == "prt" and child.i > verb_token.i:
+             parts.append((child.i, child.text))
+         # Include prepositions for phrasal verbs like "announced by"
+         elif child.dep_ == "agent" and child.i > verb_token.i:
+             # For passive constructions, include "by"
+             parts.append((child.i, child.text))
+
+     # Sort by position and join
+     parts.sort(key=lambda x: x[0])
+     return " ".join([p[1] for p in parts])
+
+
+ def _match_entity_boundaries(
+     spacy_text: str,
+     model_text: str,
+     source_text: str,
+ ) -> str:
+     """
+     Match entity boundaries between spaCy extraction and model hint.
+
+     If model text is a superset that includes spaCy text, use model text
+     for better entity boundaries (e.g., "Apple" -> "Apple Inc.").
+     """
+     spacy_lower = spacy_text.lower()
+     model_lower = model_text.lower()
+
+     # If model text contains spaCy text, prefer model text
+     if spacy_lower in model_lower:
+         return model_text
+
+     # If spaCy text contains model text, prefer spaCy text
+     if model_lower in spacy_lower:
+         return spacy_text
+
+     # If they overlap significantly, prefer the one that appears in source
+     if spacy_text in source_text:
+         return spacy_text
+     if model_text in source_text:
+         return model_text
+
+     # Default to spaCy extraction
+     return spacy_text
+
+
+ def _extract_spacy_triple(doc, model_subject: str, model_object: str, source_text: str) -> tuple[str | None, str | None, str | None]:
+     """Extract subject, predicate, object from spaCy doc."""
+     # Find the root verb
+     root = None
+     for token in doc:
+         if token.dep_ == "ROOT":
+             root = token
+             break
+
+     if root is None:
+         return None, None, None
+
+     # Extract predicate from root verb
+     predicate = None
+     if root.pos_ == "VERB":
+         predicate = _extract_verb_phrase(root)
+     elif root.pos_ == "AUX":
+         predicate = root.text
+
+     # Extract subject (nsubj, nsubjpass)
+     subject = None
+     for child in root.children:
+         if child.dep_ in ("nsubj", "nsubjpass"):
+             subject = _get_full_noun_phrase(child)
+             break
+
+     # If no direct subject, check parent
+     if subject is None and root.head != root:
+         for child in root.head.children:
+             if child.dep_ in ("nsubj", "nsubjpass"):
+                 subject = _get_full_noun_phrase(child)
+                 break
+
+     # Extract object (dobj, pobj, attr, oprd)
+     obj = None
+     for child in root.children:
+         if child.dep_ in ("dobj", "attr", "oprd"):
+             obj = _get_full_noun_phrase(child)
+             break
+         elif child.dep_ == "prep":
+             for pchild in child.children:
+                 if pchild.dep_ == "pobj":
+                     obj = _get_full_noun_phrase(pchild)
+                     break
+             if obj:
+                 break
+         elif child.dep_ == "agent":
+             for pchild in child.children:
+                 if pchild.dep_ == "pobj":
+                     obj = _get_full_noun_phrase(pchild)
+                     break
+             if obj:
+                 break
+
+     # Match against model values for better entity boundaries
+     if subject:
+         subject = _match_entity_boundaries(subject, model_subject, source_text)
+     if obj:
+         obj = _match_entity_boundaries(obj, model_object, source_text)
+
+     return subject, predicate, obj
+
+
+ def extract_triple_from_text(
+     source_text: str,
+     model_subject: str,
+     model_object: str,
+     model_predicate: str,
+ ) -> tuple[str, str, str] | None:
+     """
+     Extract subject, predicate, object from source text using spaCy.
+
+     Returns a spaCy-based triple that can be added to the candidate pool
+     alongside the model's triple. The existing scoring/dedup logic will
+     pick the best one.
+
+     Args:
+         source_text: The source sentence to analyze
+         model_subject: Subject from T5-Gemma (used for entity boundary matching)
+         model_object: Object from T5-Gemma (used for entity boundary matching)
+         model_predicate: Predicate from T5-Gemma (unused, kept for API compat)
+
+     Returns:
+         Tuple of (subject, predicate, object) from spaCy, or None if extraction fails
+     """
+     if not source_text:
+         return None
+
+     try:
+         nlp = _get_nlp()
+         doc = nlp(source_text)
+         spacy_subject, spacy_predicate, spacy_object = _extract_spacy_triple(
+             doc, model_subject, model_object, source_text
+         )
+
+         # Only return if we got at least a predicate
+         if spacy_predicate:
+             logger.debug(
+                 f"spaCy extracted: subj='{spacy_subject}', pred='{spacy_predicate}', obj='{spacy_object}'"
+             )
+             return (
+                 spacy_subject or model_subject,
+                 spacy_predicate,
+                 spacy_object or model_object,
+             )
+
+         return None
+
+     except OSError as e:
+         logger.debug(f"Cannot load spaCy model: {e}")
+         return None
+     except Exception as e:
+         logger.debug(f"spaCy extraction failed: {e}")
+         return None
+
+
+ def extract_triple_by_predicate_split(
+     source_text: str,
+     predicate: str,
+ ) -> tuple[str, str, str] | None:
+     """
+     Extract subject and object by splitting the source text around the predicate.
+
+     This is useful when the predicate is known but subject/object boundaries
+     are uncertain. Uses the predicate as an anchor point.
+
+     Args:
+         source_text: The source sentence
+         predicate: The predicate (verb phrase) to split on
+
+     Returns:
+         Tuple of (subject, predicate, object) or None if split fails
+     """
+     if not source_text or not predicate:
+         return None
+
+     # Find the predicate in the source text (case-insensitive)
+     source_lower = source_text.lower()
+     pred_lower = predicate.lower()
+
+     pred_pos = source_lower.find(pred_lower)
+     if pred_pos < 0:
+         # Try finding just the main verb (first word of predicate)
+         main_verb = pred_lower.split()[0] if pred_lower.split() else ""
+         if main_verb and len(main_verb) > 2:
+             pred_pos = source_lower.find(main_verb)
+             if pred_pos >= 0:
+                 # Adjust to use the actual predicate length for splitting
+                 predicate = main_verb
+
+     if pred_pos < 0:
+         return None
+
+     # Extract subject (text before predicate, trimmed)
+     subject = source_text[:pred_pos].strip()
+
+     # Extract object (text after predicate, trimmed)
+     pred_end = pred_pos + len(predicate)
+     obj = source_text[pred_end:].strip()
+
+     # Clean up: remove trailing punctuation from object
+     obj = obj.rstrip('.,;:!?')
+
+     # Clean up: remove leading articles/prepositions from object if very short
+     obj_words = obj.split()
+     if obj_words and obj_words[0].lower() in ('a', 'an', 'the', 'to', 'of', 'for'):
+         if len(obj_words) > 1:
+             obj = ' '.join(obj_words[1:])
+
+     # Validate: both subject and object should have meaningful content
+     if len(subject) < 2 or len(obj) < 2:
+         return None
+
+     logger.debug(
+         f"Predicate-split extracted: subj='{subject}', pred='{predicate}', obj='{obj}'"
+     )
+
+     return (subject, predicate, obj)
+
+
+ # Keep old function for backwards compatibility
+ def infer_predicate(
+     subject: str,
+     obj: str,
+     source_text: str,
+ ) -> Optional[str]:
+     """
+     Infer the predicate from source text using dependency parsing.
+
+     DEPRECATED: Use extract_triple_from_text instead.
+     """
+     result = extract_triple_from_text(
+         source_text=source_text,
+         model_subject=subject,
+         model_object=obj,
+         model_predicate="",
+     )
+     if result:
+         _, predicate, _ = result
+         return predicate if predicate else None
+     return None
@@ -1,11 +0,0 @@
- statement_extractor/__init__.py,sha256=MIZgn-lD9-XGJapzdyYxMhEJFRrTzftbRklrhwA4e8w,2967
- statement_extractor/canonicalization.py,sha256=ZMLs6RLWJa_rOJ8XZ7PoHFU13-zeJkOMDnvK-ZaFa5s,5991
- statement_extractor/cli.py,sha256=kJnZm_mbq4np1vTxSjczMZM5zGuDlC8Z5xLJd8O3xZ4,7605
- statement_extractor/extractor.py,sha256=PX0SiJnYUnh06seyH5W77FcPpcvLXwEM8IGsuVuRh0Q,22158
- statement_extractor/models.py,sha256=xDF3pDPhIiqiMwFMPV94aBEgZGbSe-x2TkshahOiCog,10739
- statement_extractor/predicate_comparer.py,sha256=iwBfNJFNOFv8ODKN9F9EtmknpCeSThOpnu6P_PJSmgE,24898
- statement_extractor/scoring.py,sha256=Wa1BW6jXtHD7dZkUXwdwE39hwFo2ko6BuIogBc4E2Lk,14493
- corp_extractor-0.2.5.dist-info/METADATA,sha256=iN_MPbqHhizaFAGJKzR5JNSbDivrS133oSTiYWrFht4,13552
- corp_extractor-0.2.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- corp_extractor-0.2.5.dist-info/entry_points.txt,sha256=i0iKFqPIusvb-QTQ1zNnFgAqatgVah-jIhahbs5TToQ,115
- corp_extractor-0.2.5.dist-info/RECORD,,
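
For orientation, here is a minimal usage sketch of the two public helpers added in 0.3.0. It is not part of the package: the import path is an assumption (the diff does not show the new file's name inside the statement_extractor package), and the sentence and hint strings are illustrative only.

# Hypothetical usage sketch; the module path below is assumed, not shown in this diff.
from statement_extractor.spacy_extraction import (
    extract_triple_from_text,
    extract_triple_by_predicate_split,
)

sentence = "The expansion was announced by Apple Inc. last quarter."

# spaCy-based candidate triple, using the model's subject/object as boundary hints.
triple = extract_triple_from_text(
    source_text=sentence,
    model_subject="Apple Inc.",
    model_object="the expansion",
    model_predicate="announced",
)

if triple is None:
    # Fall back to splitting the sentence around a known predicate.
    triple = extract_triple_by_predicate_split(sentence, "was announced by")

print(triple)  # a (subject, predicate, object) tuple, or None if extraction fails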