memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
memnex/core/engine.py
ADDED
|
@@ -0,0 +1,667 @@
|
|
|
1
|
+
"""CoreEngine -- pure computation layer for SourceDocument -> ExtractedData.
|
|
2
|
+
|
|
3
|
+
Zero-dependency on Agent platforms. Input and output are data structures only.
|
|
4
|
+
All I/O (storage, network) is handled by callers (MemNexService).
|
|
5
|
+
|
|
6
|
+
Usage::
|
|
7
|
+
|
|
8
|
+
from memnex.core import CoreEngine
|
|
9
|
+
|
|
10
|
+
engine = CoreEngine()
|
|
11
|
+
extracted = engine.extract(source)
|
|
12
|
+
for func in extracted.functions:
|
|
13
|
+
print(func.name)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import hashlib
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
from datetime import datetime
|
|
22
|
+
from typing import List, Optional
|
|
23
|
+
|
|
24
|
+
from memnex.core.associator.domain_classifier import DomainClassifier
|
|
25
|
+
from memnex.core.associator.entity_aligner import EntityAligner
|
|
26
|
+
from memnex.core.associator.ref_linker import RefLinker
|
|
27
|
+
from memnex.core.associator.term_mapper import TermMapper
|
|
28
|
+
from memnex.core.extractors.docx import DOCXExtractor
|
|
29
|
+
from memnex.core.extractors.image import ImageExtractor
|
|
30
|
+
from memnex.core.extractors.markdown import MarkdownExtractor
|
|
31
|
+
from memnex.core.extractors.pdf import PDFExtractor
|
|
32
|
+
from memnex.core.extractors.vision_mapper import VisionMapper
|
|
33
|
+
from memnex.processing.graph_builder import GraphBuilder
|
|
34
|
+
from memnex.core.handlers.clipboard import ClipboardHandler
|
|
35
|
+
from memnex.core.handlers.file_handler import FileHandler
|
|
36
|
+
from memnex.core.handlers.url_handler import URLHandler
|
|
37
|
+
from memnex.processing.merger.confidence_calculator import ConfidenceCalculator
|
|
38
|
+
from memnex.processing.merger.conflict_resolver import ConflictResolver
|
|
39
|
+
from memnex.models import (
|
|
40
|
+
ExtractedData,
|
|
41
|
+
FieldValue,
|
|
42
|
+
Function,
|
|
43
|
+
GraphData,
|
|
44
|
+
SourceDocument,
|
|
45
|
+
)
|
|
46
|
+
from memnex.models.paragraph import Paragraph, ParagraphCollection
|
|
47
|
+
|
|
48
|
+
logger = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ── Helpers ──────────────────────────────────────────────────────────
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _detect_memory_type(text: str) -> str:
|
|
55
|
+
"""Heuristic: classify text into a memory type.
|
|
56
|
+
|
|
57
|
+
Returns one of ``"function"`` | ``"fact"`` | ``"preference"`` |
|
|
58
|
+
``"observation"``.
|
|
59
|
+
"""
|
|
60
|
+
text_lower = text.lower()
|
|
61
|
+
|
|
62
|
+
obs_keywords = [
|
|
63
|
+
"observe", "observed", "noticed", "happened", "occurred",
|
|
64
|
+
"事件", "观察", "发生", "记录",
|
|
65
|
+
]
|
|
66
|
+
if any(k in text_lower for k in obs_keywords):
|
|
67
|
+
return "observation"
|
|
68
|
+
|
|
69
|
+
pref_keywords = [
|
|
70
|
+
"prefer", "like", "dislike", "want", "always", "never",
|
|
71
|
+
"喜欢", "偏好", "讨厌", "倾向", "总是", "从不",
|
|
72
|
+
]
|
|
73
|
+
if any(k in text_lower for k in pref_keywords):
|
|
74
|
+
return "preference"
|
|
75
|
+
|
|
76
|
+
fact_keywords = [
|
|
77
|
+
"is", "are", "means", "defined as", "refers to",
|
|
78
|
+
"是", "意味着", "定义为", "指的是", "事实",
|
|
79
|
+
]
|
|
80
|
+
if any(k in text_lower for k in fact_keywords):
|
|
81
|
+
return "fact"
|
|
82
|
+
|
|
83
|
+
return "function"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _normalize_name(name: str) -> str:
|
|
87
|
+
"""Generate name_normalized from a display name.
|
|
88
|
+
|
|
89
|
+
Rules (per spec SS1.3):
|
|
90
|
+
1. lowercase
|
|
91
|
+
2. strip whitespace
|
|
92
|
+
3. collapse consecutive whitespace to single space
|
|
93
|
+
4. remove punctuation (keep letters, digits, CJK, spaces, underscores, hyphens)
|
|
94
|
+
"""
|
|
95
|
+
normalized = name.lower().strip()
|
|
96
|
+
normalized = re.sub(r"\s+", " ", normalized)
|
|
97
|
+
normalized = re.sub(r"[^a-z0-9一-鿿 _-]", "", normalized)
|
|
98
|
+
return normalized
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ── CoreEngine ───────────────────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class CoreEngine:
    """Extraction engine turning a ``SourceDocument`` into ``ExtractedData``.

    :meth:`extract` runs the following stages in order:

    1. content acquisition through the file / URL / clipboard handlers
    2. paragraph extraction (L1 Paragraphs)
    3. mapping of vision data and embedded images to Functions
    4. Paragraph -> Function conversion with multi-value ``FieldValue`` fields
    5. domain classification (``DomainClassifier``)
    6. cross-reference extraction (``RefLinker``)
    7. duplicate merging (``EntityAligner``)
    8. conflict detection (``ConflictResolver``)
    9. confidence scoring (``ConfidenceCalculator``)
    10. graph-edge construction

    Parameters
    ----------
    store:
        Optional store passed to ``GraphBuilder`` when graph edges are
        built.  With ``None`` the engine falls back to rule-based edges
        derived from the current batch only.
    """

    def __init__(self, store=None) -> None:
        # Format-specific extractors.
        self.markdown_extractor = MarkdownExtractor()
        self.image_extractor = ImageExtractor()
        self.pdf_extractor = PDFExtractor()
        self.docx_extractor = DOCXExtractor()
        self.vision_mapper = VisionMapper()

        # Content handlers, one per source kind.
        self.file_handler = FileHandler()
        self.url_handler = URLHandler()
        self.clipboard_handler = ClipboardHandler()

        # Association helpers.
        self.term_mapper = TermMapper()
        self.ref_linker = RefLinker()
        self.entity_aligner = EntityAligner()
        self.domain_classifier = DomainClassifier()

        # Merge layer.
        self.conflict_resolver = ConflictResolver()
        self.confidence_calculator = ConfidenceCalculator()

        # Kept for graph building; ``None`` selects the stateless path.
        self._store = store
|
|
153
|
+
|
|
154
|
+
# ════════════════════════════════════════════════════════════════
|
|
155
|
+
# Public API
|
|
156
|
+
# ════════════════════════════════════════════════════════════════
|
|
157
|
+
|
|
158
|
+
def extract(self, source: SourceDocument) -> ExtractedData:
    """Run the full extraction pipeline on one source document.

    Parameters
    ----------
    source:
        The source document to process.

    Returns
    -------
    ExtractedData
        Extracted Functions and graph edges.  When nothing could be
        extracted the result carries no functions and an empty graph.
    """
    import os  # function scope: only needed for temp-image cleanup

    # Step 1: Acquire content via handler
    text, extracted_images, source_hint = self._acquire_content(source)

    # Step 2: Extract L1 Paragraphs
    paragraphs = self._extract_paragraphs(text, source_hint)

    # Step 3: Handle vision / image extracted data
    vision_functions = []
    if source.vision:
        vision_functions = self.vision_mapper.vision_to_functions(
            source.vision,
            source_id=source.type,
        )

    # Also process images extracted from a PDF
    image_functions = []
    for img_info in extracted_images:
        img_path = img_info.get("path")
        if not img_path:
            continue
        try:
            full = self.image_extractor.extract_full(img_path)
            if full and full.get("vision"):
                image_functions.extend(
                    self.vision_mapper.vision_to_functions(
                        full["vision"],
                        source_id=f"pdf_img_{img_info.get('page', 0)}_{img_info.get('index', 0)}",
                    )
                )
        finally:
            # Remove the temp file even when extraction raised, so a
            # failing image does not leave it behind.
            if img_info.get("_tmp"):
                try:
                    os.unlink(img_path)
                except OSError as exc:
                    # Best-effort cleanup: never fail extraction over it.
                    logger.debug(
                        "Could not remove temp image %s: %s", img_path, exc
                    )

    # Step 4: Paragraphs -> Functions
    functions = self._paragraphs_to_functions(paragraphs, source)

    # Merge vision/image functions
    functions.extend(vision_functions)
    functions.extend(image_functions)

    if not functions:
        return ExtractedData(
            functions=[],
            graph=GraphData(nodes=[], edges=[]),
            delta=False,
        )

    # Step 5: DomainClassifier
    for func in functions:
        func.domain = self.domain_classifier.classify(func)

    # Step 6: RefLinker -- extract cross-references from raw text
    if text:
        refs = self.ref_linker.extract_references(text)
        for func in functions:
            # Each function gets its own list so a later in-place change
            # on one function cannot leak into the others.
            func.cross_references = list(refs)

    # Step 7: EntityAligner -- deduplicate/merge
    functions = self._deduplicate_functions(functions)

    # Step 8: ConflictResolver -- flag functions involved in conflicts
    # that need a human decision
    conflicts = self.conflict_resolver.detect_conflicts(functions)
    for conflict in conflicts:
        if not conflict.needs_human:
            continue
        for val in conflict.values:
            target_id = val.get("source", "")
            for func in functions:
                if func.id == target_id or (
                    func.source_paragraphs
                    and target_id in func.source_paragraphs
                ):
                    func.needs_review = True

    # Step 9: ConfidenceCalculator -- only functions still at exactly
    # 1.0 are scored; any other value is left untouched
    for func in functions:
        if func.confidence == 1.0:
            func.confidence = self._calculate_function_confidence(
                func, paragraphs, source_hint
            )

    # Step 10: GraphBuilder -- build edges
    graph = self._build_graph(functions)

    return ExtractedData(
        functions=functions,
        graph=graph,
        delta=False,
    )
|
|
256
|
+
|
|
257
|
+
def extract_batch(self, sources: List[SourceDocument]) -> ExtractedData:
    """Batch extraction: process multiple sources and merge results.

    Each source goes through :meth:`extract`; the resulting Functions
    are then deduplicated across the whole batch and the graph is
    rebuilt from the deduplicated set.  Per-source graph edges are
    discarded because they may point at Functions that were merged away.

    Parameters
    ----------
    sources:
        List of source documents to process.

    Returns
    -------
    ExtractedData
        Merged extraction results from all sources.
    """
    all_functions: List[Function] = []
    for source in sources:
        all_functions.extend(self.extract(source).functions)

    # Deduplicate across batch
    all_functions = self._deduplicate_functions(all_functions)

    # Rebuild graph with deduped functions
    graph = self._build_graph(all_functions)

    return ExtractedData(
        functions=all_functions,
        graph=graph,
        delta=False,
    )
|
|
289
|
+
|
|
290
|
+
# ════════════════════════════════════════════════════════════════
|
|
291
|
+
# Internal: content acquisition
|
|
292
|
+
# ════════════════════════════════════════════════════════════════
|
|
293
|
+
|
|
294
|
+
def _acquire_content(self, source: SourceDocument):
|
|
295
|
+
"""Route source to the correct handler and return normalized content.
|
|
296
|
+
|
|
297
|
+
Returns
|
|
298
|
+
-------
|
|
299
|
+
tuple of (text, extracted_images, source_hint)
|
|
300
|
+
text: str -- the textual content to process
|
|
301
|
+
extracted_images: list -- image dicts extracted from PDF etc.
|
|
302
|
+
source_hint: str -- hint for confidence calculation
|
|
303
|
+
"""
|
|
304
|
+
source_type = source.type
|
|
305
|
+
extracted_images = []
|
|
306
|
+
|
|
307
|
+
# Text / clipboard content
|
|
308
|
+
if source_type in ("text", "clipboard"):
|
|
309
|
+
text = source.content or ""
|
|
310
|
+
# Use ClipboardHandler to detect subtype
|
|
311
|
+
if source_type == "clipboard":
|
|
312
|
+
parsed = self.clipboard_handler.parse(text)
|
|
313
|
+
if parsed:
|
|
314
|
+
source_hint = parsed[0][0] # "markdown" or "text"
|
|
315
|
+
else:
|
|
316
|
+
source_hint = "text"
|
|
317
|
+
else:
|
|
318
|
+
source_hint = "text"
|
|
319
|
+
return text, extracted_images, source_hint
|
|
320
|
+
|
|
321
|
+
# File content
|
|
322
|
+
if source_type == "file" and source.source_path:
|
|
323
|
+
result = self.file_handler.read(source.source_path)
|
|
324
|
+
if result is None:
|
|
325
|
+
return "", extracted_images, "text"
|
|
326
|
+
|
|
327
|
+
content_type, content = result
|
|
328
|
+
|
|
329
|
+
if content_type == "image":
|
|
330
|
+
# Process image: OCR + vision
|
|
331
|
+
full = self.image_extractor.extract_full(content)
|
|
332
|
+
combined_text = full.get("combined_text", "")
|
|
333
|
+
return combined_text, extracted_images, "image"
|
|
334
|
+
|
|
335
|
+
if content_type == "pdf":
|
|
336
|
+
full = self.pdf_extractor.extract_full(content)
|
|
337
|
+
if full is None:
|
|
338
|
+
return "", extracted_images, "pdf"
|
|
339
|
+
text = "\n\n".join(full.get("pages", []))
|
|
340
|
+
extracted_images = []
|
|
341
|
+
for page_images in full.get("images", []):
|
|
342
|
+
for img in page_images:
|
|
343
|
+
if img.get("path"):
|
|
344
|
+
img["_tmp"] = True
|
|
345
|
+
extracted_images.append(img)
|
|
346
|
+
return text, extracted_images, "pdf"
|
|
347
|
+
|
|
348
|
+
if content_type == "docx":
|
|
349
|
+
docx_text = self.docx_extractor.extract(content)
|
|
350
|
+
return docx_text or "", extracted_images, "docx"
|
|
351
|
+
|
|
352
|
+
# markdown / text
|
|
353
|
+
return content, extracted_images, content_type
|
|
354
|
+
|
|
355
|
+
# URL content
|
|
356
|
+
if source_type == "url" and source.url:
|
|
357
|
+
result = self.url_handler.fetch(source.url)
|
|
358
|
+
if result is None:
|
|
359
|
+
return "", extracted_images, "url"
|
|
360
|
+
|
|
361
|
+
content_type, content = result
|
|
362
|
+
|
|
363
|
+
if content_type == "image":
|
|
364
|
+
full = self.image_extractor.extract_full(content)
|
|
365
|
+
combined_text = full.get("combined_text", "")
|
|
366
|
+
# Cleanup temp file
|
|
367
|
+
self.url_handler.cleanup_temp_file(content)
|
|
368
|
+
return combined_text, extracted_images, "image"
|
|
369
|
+
|
|
370
|
+
if content_type == "pdf":
|
|
371
|
+
full = self.pdf_extractor.extract_full(content)
|
|
372
|
+
if full is None:
|
|
373
|
+
self.url_handler.cleanup_temp_file(content)
|
|
374
|
+
return "", extracted_images, "pdf"
|
|
375
|
+
text = "\n\n".join(full.get("pages", []))
|
|
376
|
+
for page_images in full.get("images", []):
|
|
377
|
+
for img in page_images:
|
|
378
|
+
if img.get("path"):
|
|
379
|
+
img["_tmp"] = True
|
|
380
|
+
extracted_images.append(img)
|
|
381
|
+
self.url_handler.cleanup_temp_file(content)
|
|
382
|
+
return text, extracted_images, "pdf"
|
|
383
|
+
|
|
384
|
+
# text / markdown / html
|
|
385
|
+
return content, extracted_images, content_type
|
|
386
|
+
|
|
387
|
+
# Fallback: use source.content directly
|
|
388
|
+
return source.content or "", extracted_images, "text"
|
|
389
|
+
|
|
390
|
+
# ════════════════════════════════════════════════════════════════
|
|
391
|
+
# Internal: extraction pipeline steps
|
|
392
|
+
# ════════════════════════════════════════════════════════════════
|
|
393
|
+
|
|
394
|
+
def _extract_paragraphs(
    self, text: str, source_hint: str
) -> ParagraphCollection:
    """Turn raw text into L1 Paragraphs.

    Empty or whitespace-only text yields an empty collection; anything
    else is handed to the Markdown extractor, which is also used for
    plain text, with ``source_hint`` passed as its ``source``.
    """
    has_content = bool(text) and bool(text.strip())
    if has_content:
        return self.markdown_extractor.extract(text, source=source_hint)
    return ParagraphCollection()
|
|
403
|
+
|
|
404
|
+
def _paragraphs_to_functions(
|
|
405
|
+
self,
|
|
406
|
+
paragraphs: ParagraphCollection,
|
|
407
|
+
source: SourceDocument,
|
|
408
|
+
) -> List[Function]:
|
|
409
|
+
"""Convert L1 Paragraphs to L2 Functions with multi-value fields."""
|
|
410
|
+
functions: List[Function] = []
|
|
411
|
+
source_id = source.type
|
|
412
|
+
|
|
413
|
+
for para in paragraphs.paragraphs:
|
|
414
|
+
if not para.raw_text or not para.raw_text.strip():
|
|
415
|
+
continue
|
|
416
|
+
|
|
417
|
+
# Generate stable ID from content hash
|
|
418
|
+
content_hash = hashlib.sha256(
|
|
419
|
+
para.raw_text.encode()
|
|
420
|
+
).hexdigest()[:16]
|
|
421
|
+
func_id = f"func_{content_hash}"
|
|
422
|
+
|
|
423
|
+
name = para.section if para.section else para.raw_text[:50]
|
|
424
|
+
name_normalized = _normalize_name(
|
|
425
|
+
para.section if para.section else para.raw_text[:50]
|
|
426
|
+
)
|
|
427
|
+
|
|
428
|
+
# Classify FieldValues from sentences
|
|
429
|
+
triggers: List[FieldValue] = []
|
|
430
|
+
conditions: List[FieldValue] = []
|
|
431
|
+
actions: List[FieldValue] = []
|
|
432
|
+
benefits: List[FieldValue] = []
|
|
433
|
+
|
|
434
|
+
for sent in para.sentences:
|
|
435
|
+
fv = FieldValue(
|
|
436
|
+
desc=sent.text,
|
|
437
|
+
sources=[f"{source_id}:{para.id}"],
|
|
438
|
+
source_method="rule_based",
|
|
439
|
+
weight=0.7,
|
|
440
|
+
)
|
|
441
|
+
if sent.role == "trigger":
|
|
442
|
+
triggers.append(fv)
|
|
443
|
+
elif sent.role == "condition":
|
|
444
|
+
conditions.append(fv)
|
|
445
|
+
elif sent.role in ("action",):
|
|
446
|
+
actions.append(fv)
|
|
447
|
+
elif sent.role == "result":
|
|
448
|
+
benefits.append(fv)
|
|
449
|
+
else:
|
|
450
|
+
# "statement" -> put as action by default
|
|
451
|
+
if not actions:
|
|
452
|
+
actions.append(fv)
|
|
453
|
+
else:
|
|
454
|
+
actions.append(fv)
|
|
455
|
+
|
|
456
|
+
# If no structured sentences, use raw text as trigger/action
|
|
457
|
+
if not triggers and not actions and para.raw_text:
|
|
458
|
+
sentences_text = [s.strip() for s in re.split(r"[。.!?!?]", para.raw_text) if s.strip()]
|
|
459
|
+
if sentences_text:
|
|
460
|
+
triggers.append(FieldValue(
|
|
461
|
+
desc=sentences_text[0],
|
|
462
|
+
sources=[f"{source_id}:{para.id}"],
|
|
463
|
+
source_method="rule_based",
|
|
464
|
+
weight=0.7,
|
|
465
|
+
))
|
|
466
|
+
for s in sentences_text[1:]:
|
|
467
|
+
actions.append(FieldValue(
|
|
468
|
+
desc=s,
|
|
469
|
+
sources=[f"{source_id}:{para.id}"],
|
|
470
|
+
source_method="rule_based",
|
|
471
|
+
weight=0.7,
|
|
472
|
+
))
|
|
473
|
+
else:
|
|
474
|
+
triggers.append(FieldValue(
|
|
475
|
+
desc=para.raw_text,
|
|
476
|
+
sources=[f"{source_id}:{para.id}"],
|
|
477
|
+
source_method="rule_based",
|
|
478
|
+
weight=0.7,
|
|
479
|
+
))
|
|
480
|
+
|
|
481
|
+
func = Function(
|
|
482
|
+
id=func_id,
|
|
483
|
+
name=name,
|
|
484
|
+
name_normalized=name_normalized,
|
|
485
|
+
trigger=triggers,
|
|
486
|
+
condition=conditions,
|
|
487
|
+
action=actions,
|
|
488
|
+
benefit=benefits,
|
|
489
|
+
source_paragraphs=[para.id],
|
|
490
|
+
source_type=source.source_type,
|
|
491
|
+
content_hash=hashlib.sha256(
|
|
492
|
+
para.raw_text.encode()
|
|
493
|
+
).hexdigest(),
|
|
494
|
+
)
|
|
495
|
+
functions.append(func)
|
|
496
|
+
|
|
497
|
+
return functions
|
|
498
|
+
|
|
499
|
+
def _deduplicate_functions(
|
|
500
|
+
self, functions: List[Function]
|
|
501
|
+
) -> List[Function]:
|
|
502
|
+
"""Use EntityAligner to merge duplicate Functions."""
|
|
503
|
+
if len(functions) <= 1:
|
|
504
|
+
return functions
|
|
505
|
+
|
|
506
|
+
# Build entity dicts for EntityAligner
|
|
507
|
+
entity_dicts = [
|
|
508
|
+
{"id": f.id, "name": f.name, "name_normalized": f.name_normalized}
|
|
509
|
+
for f in functions
|
|
510
|
+
]
|
|
511
|
+
|
|
512
|
+
merge_groups = self.entity_aligner.find_merge_candidates(
|
|
513
|
+
entity_dicts, threshold=0.9
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
if not merge_groups:
|
|
517
|
+
return functions
|
|
518
|
+
|
|
519
|
+
# Build merge map
|
|
520
|
+
merge_map: dict = {} # canonical_id -> list of Function
|
|
521
|
+
func_by_id: dict = {f.id: f for f in functions}
|
|
522
|
+
merged_ids: set = set()
|
|
523
|
+
|
|
524
|
+
for group in merge_groups:
|
|
525
|
+
# Use first entity as canonical
|
|
526
|
+
canonical_id = group[0]["id"]
|
|
527
|
+
for member in group:
|
|
528
|
+
member_id = member["id"]
|
|
529
|
+
merge_map.setdefault(canonical_id, []).append(
|
|
530
|
+
func_by_id[member_id]
|
|
531
|
+
)
|
|
532
|
+
if member_id != canonical_id:
|
|
533
|
+
merged_ids.add(member_id)
|
|
534
|
+
|
|
535
|
+
# Merge fields for grouped functions
|
|
536
|
+
result: List[Function] = []
|
|
537
|
+
for func in functions:
|
|
538
|
+
if func.id in merged_ids:
|
|
539
|
+
continue
|
|
540
|
+
if func.id in merge_map:
|
|
541
|
+
merged = self._merge_function_fields(
|
|
542
|
+
merge_map[func.id]
|
|
543
|
+
)
|
|
544
|
+
result.append(merged)
|
|
545
|
+
else:
|
|
546
|
+
result.append(func)
|
|
547
|
+
|
|
548
|
+
return result
|
|
549
|
+
|
|
550
|
+
def _merge_function_fields(
|
|
551
|
+
self, functions: List[Function]
|
|
552
|
+
) -> Function:
|
|
553
|
+
"""Merge FieldValues from multiple Functions into one."""
|
|
554
|
+
if not functions:
|
|
555
|
+
raise ValueError("Cannot merge empty function list")
|
|
556
|
+
canonical = functions[0]
|
|
557
|
+
|
|
558
|
+
for other in functions[1:]:
|
|
559
|
+
# Merge each role field
|
|
560
|
+
for role in ("trigger", "condition", "action", "benefit"):
|
|
561
|
+
existing_descs = {
|
|
562
|
+
fv.desc for fv in getattr(canonical, role)
|
|
563
|
+
}
|
|
564
|
+
for fv in getattr(other, role):
|
|
565
|
+
if fv.desc not in existing_descs:
|
|
566
|
+
getattr(canonical, role).append(fv)
|
|
567
|
+
existing_descs.add(fv.desc)
|
|
568
|
+
|
|
569
|
+
# Merge source_paragraphs
|
|
570
|
+
for sp in other.source_paragraphs:
|
|
571
|
+
if sp not in canonical.source_paragraphs:
|
|
572
|
+
canonical.source_paragraphs.append(sp)
|
|
573
|
+
|
|
574
|
+
return canonical
|
|
575
|
+
|
|
576
|
+
def _calculate_function_confidence(
    self,
    func: Function,
    paragraphs: ParagraphCollection,
    source_hint: str,
) -> float:
    """Score a Function via the ConfidenceCalculator.

    The first paragraph whose id is listed in ``func.source_paragraphs``
    is scored together with ``source_hint``.  If no paragraph matches,
    the calculator's base confidence for ``source_hint`` is returned.
    """
    matching_para = next(
        (
            candidate
            for candidate in paragraphs.paragraphs
            if candidate.id in func.source_paragraphs
        ),
        None,
    )

    if matching_para:
        return self.confidence_calculator.calculate_paragraph_confidence(
            matching_para, source_hint
        )

    # No paragraph to score: fall back to the source-based baseline.
    return self.confidence_calculator._get_base_confidence(source_hint)
|
|
597
|
+
|
|
598
|
+
def _build_graph(self, functions: List[Function]) -> GraphData:
    """Assemble the graph for ``functions``.

    Without a store, edges come from the rule-based detector.  With a
    store, ``GraphBuilder`` is tried first; if it raises, a warning is
    logged and the rule-based detector is used instead.  The given
    functions become the graph nodes.
    """
    if self._store is None:
        # Stateless: rule-based edge detection only.
        edges = self._build_edges_rule_based(functions)
    else:
        try:
            edges = GraphBuilder(store=self._store).build_from_batch(functions)
        except Exception as exc:
            logger.warning("GraphBuilder failed: %s", exc)
            edges = self._build_edges_rule_based(functions)

    return GraphData(nodes=functions, edges=edges)
|
|
615
|
+
|
|
616
|
+
def _build_edges_rule_based(
    self, functions: List[Function]
) -> list:
    """Derive graph edges from the batch alone (no store involved).

    Two edge kinds are produced per Function, in this order:

    * ``REFERENCES`` (weight 1.0) -- for each cross-reference target
      that occurs, case-insensitively, in another Function's name or in
      its normalized name.
    * ``ASSOCIATED_WITH`` (weight 0.5) -- for each other Function with
      the same non-empty domain; emitted once per unordered pair.
    """
    from memnex.models import GraphEdge

    edges: List[GraphEdge] = []
    emitted: set = set()

    for current in functions:
        # REFERENCES from cross-references
        for ref in current.cross_references:
            target = ref.get("target", "")
            if not target:
                continue
            needle = target.lower()
            for candidate in functions:
                if candidate.id == current.id:
                    continue
                if (
                    needle not in candidate.name.lower()
                    and needle not in candidate.name_normalized
                ):
                    continue
                edge_key = (current.id, candidate.id, "REFERENCES")
                if edge_key in emitted:
                    continue
                emitted.add(edge_key)
                edges.append(GraphEdge(
                    source=current.id,
                    target=candidate.id,
                    edge_type="REFERENCES",
                    weight=1.0,
                    evidence=[f"cross-reference: {current.name} -> {candidate.name}"],
                    created_at=datetime.now(),
                ))

        # ASSOCIATED_WITH: shared domain
        if not current.domain:
            continue
        for candidate in functions:
            if candidate.id == current.id:
                continue
            if candidate.domain != current.domain:
                continue
            forward = (current.id, candidate.id, "ASSOCIATED_WITH")
            backward = (candidate.id, current.id, "ASSOCIATED_WITH")
            if forward in emitted or backward in emitted:
                continue
            emitted.add(forward)
            edges.append(GraphEdge(
                source=current.id,
                target=candidate.id,
                edge_type="ASSOCIATED_WITH",
                weight=0.5,
                evidence=[f"shared domain: {current.domain}"],
                created_at=datetime.now(),
            ))

    return edges
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Extractors for various document formats."""
|
|
2
|
+
|
|
3
|
+
from .markdown import MarkdownExtractor
|
|
4
|
+
from .image import ImageExtractor
|
|
5
|
+
from .pdf import PDFExtractor
|
|
6
|
+
from .docx import DOCXExtractor
|
|
7
|
+
from .vision_mapper import VisionMapper
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"MarkdownExtractor",
|
|
11
|
+
"ImageExtractor",
|
|
12
|
+
"PDFExtractor",
|
|
13
|
+
"DOCXExtractor",
|
|
14
|
+
"VisionMapper",
|
|
15
|
+
]
|