memplex 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. memnex/__init__.py +31 -0
  2. memnex/__main__.py +6 -0
  3. memnex/_plugin/.claude-plugin/plugin.json +24 -0
  4. memnex/_plugin/.mcp.json +9 -0
  5. memnex/_plugin/__init__.py +0 -0
  6. memnex/_plugin/hooks/hooks.json +43 -0
  7. memnex/_plugin/scripts/hook-runner.py +166 -0
  8. memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
  9. memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
  10. memnex/_plugin/skills/mem-search/SKILL.md +85 -0
  11. memnex/_plugin/skills/mem-write/SKILL.md +78 -0
  12. memnex/adapters/__init__.py +14 -0
  13. memnex/adapters/claude_skill.py +169 -0
  14. memnex/adapters/cli.py +525 -0
  15. memnex/adapters/http_api.py +314 -0
  16. memnex/adapters/mcp_server.py +448 -0
  17. memnex/compaction.py +563 -0
  18. memnex/config.py +366 -0
  19. memnex/core/__init__.py +13 -0
  20. memnex/core/associator/__init__.py +8 -0
  21. memnex/core/associator/domain_classifier.py +75 -0
  22. memnex/core/associator/entity_aligner.py +127 -0
  23. memnex/core/associator/ref_linker.py +197 -0
  24. memnex/core/associator/term_mapper.py +77 -0
  25. memnex/core/dictionaries/__init__.py +50 -0
  26. memnex/core/engine.py +667 -0
  27. memnex/core/extractors/__init__.py +15 -0
  28. memnex/core/extractors/docx.py +97 -0
  29. memnex/core/extractors/image.py +233 -0
  30. memnex/core/extractors/markdown.py +139 -0
  31. memnex/core/extractors/pdf.py +133 -0
  32. memnex/core/extractors/vision_mapper.py +131 -0
  33. memnex/core/handlers/__init__.py +7 -0
  34. memnex/core/handlers/clipboard.py +40 -0
  35. memnex/core/handlers/file_handler.py +62 -0
  36. memnex/core/handlers/url_handler.py +132 -0
  37. memnex/llm/__init__.py +25 -0
  38. memnex/llm/enhancer.py +226 -0
  39. memnex/llm/fallback_chain.py +87 -0
  40. memnex/llm/injection_guard.py +178 -0
  41. memnex/llm/provider.py +130 -0
  42. memnex/llm/providers/__init__.py +22 -0
  43. memnex/llm/providers/anthropic.py +135 -0
  44. memnex/llm/providers/local.py +135 -0
  45. memnex/llm/providers/rule_based.py +68 -0
  46. memnex/llm/sanitizer.py +67 -0
  47. memnex/models/__init__.py +68 -0
  48. memnex/models/feedback.py +42 -0
  49. memnex/models/graph.py +33 -0
  50. memnex/models/memory.py +102 -0
  51. memnex/models/misc.py +185 -0
  52. memnex/models/paragraph.py +45 -0
  53. memnex/models/search.py +51 -0
  54. memnex/models/source.py +23 -0
  55. memnex/models/task.py +62 -0
  56. memnex/processing/__init__.py +1 -0
  57. memnex/processing/graph_builder.py +278 -0
  58. memnex/processing/merger/__init__.py +6 -0
  59. memnex/processing/merger/confidence_calculator.py +127 -0
  60. memnex/processing/merger/conflict_resolver.py +116 -0
  61. memnex/retrieval/__init__.py +1 -0
  62. memnex/retrieval/dedup.py +386 -0
  63. memnex/retrieval/embedding.py +289 -0
  64. memnex/retrieval/reranker.py +299 -0
  65. memnex/service.py +902 -0
  66. memnex/storage/__init__.py +65 -0
  67. memnex/storage/base.py +132 -0
  68. memnex/storage/changelog.py +106 -0
  69. memnex/storage/feedback.py +486 -0
  70. memnex/storage/lite/__init__.py +5 -0
  71. memnex/storage/lite/store.py +606 -0
  72. memnex/storage/vector.py +265 -0
  73. memnex/wiki/__init__.py +11 -0
  74. memnex/wiki/community.py +221 -0
  75. memnex/wiki/compiler.py +545 -0
  76. memnex/wiki/generator.py +270 -0
  77. memnex/wiki/search.py +282 -0
  78. memnex/worker.py +412 -0
  79. memplex-3.2.0.dist-info/METADATA +37 -0
  80. memplex-3.2.0.dist-info/RECORD +83 -0
  81. memplex-3.2.0.dist-info/WHEEL +5 -0
  82. memplex-3.2.0.dist-info/entry_points.txt +2 -0
  83. memplex-3.2.0.dist-info/top_level.txt +1 -0
memnex/core/engine.py ADDED
@@ -0,0 +1,667 @@
1
+ """CoreEngine -- pure computation layer for SourceDocument -> ExtractedData.
2
+
3
+ Zero-dependency on Agent platforms. Input and output are data structures only.
4
+ All I/O (storage, network) is handled by callers (MemNexService).
5
+
6
+ Usage::
7
+
8
+ from memnex.core import CoreEngine
9
+
10
+ engine = CoreEngine()
11
+ extracted = engine.extract(source)
12
+ for func in extracted.functions:
13
+ print(func.name)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ import logging
20
+ import re
21
+ from datetime import datetime
22
+ from typing import List, Optional
23
+
24
+ from memnex.core.associator.domain_classifier import DomainClassifier
25
+ from memnex.core.associator.entity_aligner import EntityAligner
26
+ from memnex.core.associator.ref_linker import RefLinker
27
+ from memnex.core.associator.term_mapper import TermMapper
28
+ from memnex.core.extractors.docx import DOCXExtractor
29
+ from memnex.core.extractors.image import ImageExtractor
30
+ from memnex.core.extractors.markdown import MarkdownExtractor
31
+ from memnex.core.extractors.pdf import PDFExtractor
32
+ from memnex.core.extractors.vision_mapper import VisionMapper
33
+ from memnex.processing.graph_builder import GraphBuilder
34
+ from memnex.core.handlers.clipboard import ClipboardHandler
35
+ from memnex.core.handlers.file_handler import FileHandler
36
+ from memnex.core.handlers.url_handler import URLHandler
37
+ from memnex.processing.merger.confidence_calculator import ConfidenceCalculator
38
+ from memnex.processing.merger.conflict_resolver import ConflictResolver
39
+ from memnex.models import (
40
+ ExtractedData,
41
+ FieldValue,
42
+ Function,
43
+ GraphData,
44
+ SourceDocument,
45
+ )
46
+ from memnex.models.paragraph import Paragraph, ParagraphCollection
47
+
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ # ── Helpers ──────────────────────────────────────────────────────────
52
+
53
+
54
+ def _detect_memory_type(text: str) -> str:
55
+ """Heuristic: classify text into a memory type.
56
+
57
+ Returns one of ``"function"`` | ``"fact"`` | ``"preference"`` |
58
+ ``"observation"``.
59
+ """
60
+ text_lower = text.lower()
61
+
62
+ obs_keywords = [
63
+ "observe", "observed", "noticed", "happened", "occurred",
64
+ "事件", "观察", "发生", "记录",
65
+ ]
66
+ if any(k in text_lower for k in obs_keywords):
67
+ return "observation"
68
+
69
+ pref_keywords = [
70
+ "prefer", "like", "dislike", "want", "always", "never",
71
+ "喜欢", "偏好", "讨厌", "倾向", "总是", "从不",
72
+ ]
73
+ if any(k in text_lower for k in pref_keywords):
74
+ return "preference"
75
+
76
+ fact_keywords = [
77
+ "is", "are", "means", "defined as", "refers to",
78
+ "是", "意味着", "定义为", "指的是", "事实",
79
+ ]
80
+ if any(k in text_lower for k in fact_keywords):
81
+ return "fact"
82
+
83
+ return "function"
84
+
85
+
86
+ def _normalize_name(name: str) -> str:
87
+ """Generate name_normalized from a display name.
88
+
89
+ Rules (per spec SS1.3):
90
+ 1. lowercase
91
+ 2. strip whitespace
92
+ 3. collapse consecutive whitespace to single space
93
+ 4. remove punctuation (keep letters, digits, CJK, spaces, underscores, hyphens)
94
+ """
95
+ normalized = name.lower().strip()
96
+ normalized = re.sub(r"\s+", " ", normalized)
97
+ normalized = re.sub(r"[^a-z0-9一-鿿 _-]", "", normalized)
98
+ return normalized
99
+
100
+
101
+ # ── CoreEngine ───────────────────────────────────────────────────────
102
+
103
+
104
+ class CoreEngine:
105
+ """Pure computation engine: ``SourceDocument`` -> ``ExtractedData``.
106
+
107
+ Orchestrates the full extraction pipeline:
108
+
109
+ 1. **Handler** -- acquire raw content from source type
110
+ 2. **Extractor** -- content -> L1 Paragraphs
111
+ 3. **Paragraph -> Function** -- with multi-value ``FieldValue`` fields
112
+ 4. **DomainClassifier** -- assign domain
113
+ 5. **RefLinker** -- resolve cross-references
114
+ 6. **EntityAligner** -- deduplicate/merge
115
+ 7. **ConflictResolver** -- detect conflicts
116
+ 8. **ConfidenceCalculator** -- compute confidence
117
+ 9. **GraphBuilder** -- build relationship edges
118
+
119
+ Parameters
120
+ ----------
121
+ store:
122
+ Optional :class:`MemoryStore`. Required only when
123
+ :meth:`extract` needs to look up existing Functions for
124
+ graph-edge detection. Pass ``None`` for stateless usage
125
+ (graph edges will be built from the batch only).
126
+ """
127
+
128
def __init__(self, store=None) -> None:
    """Instantiate every pipeline component and remember the optional store.

    Parameters
    ----------
    store:
        Optional store handed to ``GraphBuilder`` when building graph
        edges. ``None`` selects the rule-based, batch-only edge detection.
    """
    # Kept for _build_graph; never touched by the component constructors.
    self._store = store

    # Content extractors (text, images, PDF, DOCX, vision results).
    self.markdown_extractor = MarkdownExtractor()
    self.image_extractor = ImageExtractor()
    self.pdf_extractor = PDFExtractor()
    self.docx_extractor = DOCXExtractor()
    self.vision_mapper = VisionMapper()

    # Source handlers (files, URLs, clipboard).
    self.file_handler = FileHandler()
    self.url_handler = URLHandler()
    self.clipboard_handler = ClipboardHandler()

    # Associators.
    self.term_mapper = TermMapper()
    self.ref_linker = RefLinker()
    self.entity_aligner = EntityAligner()
    self.domain_classifier = DomainClassifier()

    # Merge layer.
    self.conflict_resolver = ConflictResolver()
    self.confidence_calculator = ConfidenceCalculator()
153
+
154
+ # ════════════════════════════════════════════════════════════════
155
+ # Public API
156
+ # ════════════════════════════════════════════════════════════════
157
+
158
def extract(self, source: SourceDocument) -> ExtractedData:
    """Main extraction pipeline.

    Runs, in order: content acquisition, paragraph extraction, vision /
    embedded-image mapping, paragraph -> Function conversion, domain
    classification, cross-reference linking, deduplication, conflict
    detection, confidence calculation and graph building.

    Parameters
    ----------
    source:
        The source document to process.

    Returns
    -------
    ExtractedData
        Extracted Functions and graph edges. When nothing could be
        extracted the result carries empty ``functions`` and an empty
        graph.
    """
    # Step 1: Acquire content via handler
    text, extracted_images, source_hint = self._acquire_content(source)

    # Step 2: Extract L1 Paragraphs
    paragraphs = self._extract_paragraphs(text, source_hint)

    # Step 3: Handle vision / image extracted data
    vision_functions = []
    if source.vision:
        vision_functions = self.vision_mapper.vision_to_functions(
            source.vision,
            source_id=source.type,
        )

    # Also process images extracted from PDFs (temp files are removed).
    image_functions = self._functions_from_extracted_images(extracted_images)

    # Step 4: Paragraphs -> Functions, then merge vision/image functions
    functions = self._paragraphs_to_functions(paragraphs, source)
    functions.extend(vision_functions)
    functions.extend(image_functions)

    if not functions:
        return ExtractedData(
            functions=[],
            graph=GraphData(nodes=[], edges=[]),
            delta=False,
        )

    # Step 5: DomainClassifier
    for func in functions:
        func.domain = self.domain_classifier.classify(func)

    # Step 6: RefLinker -- extract cross-references from raw text
    if text:
        refs = self.ref_linker.extract_references(text)
        for func in functions:
            # Give each Function its own list so a later in-place change
            # on one Function cannot leak into all the others.
            func.cross_references = list(refs or [])

    # Step 7: EntityAligner -- deduplicate/merge
    functions = self._deduplicate_functions(functions)

    # Step 8: ConflictResolver -- flag Functions involved in conflicts
    # that need a human decision.
    conflicts = self.conflict_resolver.detect_conflicts(functions)
    for conflict in conflicts:
        if not conflict.needs_human:
            continue
        for val in conflict.values:
            target_id = val.get("source", "")
            for func in functions:
                # A conflict value may point at the Function itself or at
                # one of the paragraphs it was built from.
                if func.id == target_id or (
                    func.source_paragraphs
                    and target_id in func.source_paragraphs
                ):
                    func.needs_review = True

    # Step 9: ConfidenceCalculator -- only for Functions whose confidence
    # is still exactly 1.0.
    # NOTE(review): 1.0 is presumably the model default, meaning "not yet
    # scored" -- confirm against the Function model.
    for func in functions:
        if func.confidence == 1.0:
            func.confidence = self._calculate_function_confidence(
                func, paragraphs, source_hint
            )

    # Step 10: GraphBuilder -- build edges
    graph = self._build_graph(functions)

    return ExtractedData(
        functions=functions,
        graph=graph,
        delta=False,
    )

def _functions_from_extracted_images(self, extracted_images) -> List[Function]:
    """Map images extracted from a document to Functions and clean up.

    Entries without a ``"path"`` are skipped. Entries flagged ``"_tmp"``
    have their file deleted once processed, even when extraction raises,
    so a failing image does not leave its temp file behind.
    """
    import os  # local import: ``os`` is not imported at module level

    collected: List[Function] = []
    for img_info in extracted_images:
        img_path = img_info.get("path")
        if not img_path:
            continue
        try:
            full = self.image_extractor.extract_full(img_path)
            if full and full.get("vision"):
                collected.extend(
                    self.vision_mapper.vision_to_functions(
                        full["vision"],
                        source_id=f"pdf_img_{img_info.get('page', 0)}_{img_info.get('index', 0)}",
                    )
                )
        finally:
            if img_info.get("_tmp"):
                try:
                    os.unlink(img_path)
                except OSError as exc:
                    # Best-effort cleanup: a leftover temp file must not
                    # fail the extraction, but leave a trace for debugging.
                    logger.debug(
                        "Could not remove temp image %s: %s", img_path, exc
                    )
    return collected
256
+
257
def extract_batch(self, sources: List[SourceDocument]) -> ExtractedData:
    """Batch extraction: process multiple sources and merge results.

    Each source goes through :meth:`extract`. The Functions of all
    sources are then deduplicated together and the graph is rebuilt from
    the deduplicated set, so per-source graph edges are intentionally
    not carried over.

    Parameters
    ----------
    sources:
        List of source documents to process.

    Returns
    -------
    ExtractedData
        Merged extraction results from all sources.
    """
    all_functions: List[Function] = []
    for source in sources:
        all_functions.extend(self.extract(source).functions)

    # Deduplicate across batch
    all_functions = self._deduplicate_functions(all_functions)

    # Rebuild graph with deduped functions
    graph = self._build_graph(all_functions)

    return ExtractedData(
        functions=all_functions,
        graph=graph,
        delta=False,
    )
289
+
290
+ # ════════════════════════════════════════════════════════════════
291
+ # Internal: content acquisition
292
+ # ════════════════════════════════════════════════════════════════
293
+
294
+ def _acquire_content(self, source: SourceDocument):
295
+ """Route source to the correct handler and return normalized content.
296
+
297
+ Returns
298
+ -------
299
+ tuple of (text, extracted_images, source_hint)
300
+ text: str -- the textual content to process
301
+ extracted_images: list -- image dicts extracted from PDF etc.
302
+ source_hint: str -- hint for confidence calculation
303
+ """
304
+ source_type = source.type
305
+ extracted_images = []
306
+
307
+ # Text / clipboard content
308
+ if source_type in ("text", "clipboard"):
309
+ text = source.content or ""
310
+ # Use ClipboardHandler to detect subtype
311
+ if source_type == "clipboard":
312
+ parsed = self.clipboard_handler.parse(text)
313
+ if parsed:
314
+ source_hint = parsed[0][0] # "markdown" or "text"
315
+ else:
316
+ source_hint = "text"
317
+ else:
318
+ source_hint = "text"
319
+ return text, extracted_images, source_hint
320
+
321
+ # File content
322
+ if source_type == "file" and source.source_path:
323
+ result = self.file_handler.read(source.source_path)
324
+ if result is None:
325
+ return "", extracted_images, "text"
326
+
327
+ content_type, content = result
328
+
329
+ if content_type == "image":
330
+ # Process image: OCR + vision
331
+ full = self.image_extractor.extract_full(content)
332
+ combined_text = full.get("combined_text", "")
333
+ return combined_text, extracted_images, "image"
334
+
335
+ if content_type == "pdf":
336
+ full = self.pdf_extractor.extract_full(content)
337
+ if full is None:
338
+ return "", extracted_images, "pdf"
339
+ text = "\n\n".join(full.get("pages", []))
340
+ extracted_images = []
341
+ for page_images in full.get("images", []):
342
+ for img in page_images:
343
+ if img.get("path"):
344
+ img["_tmp"] = True
345
+ extracted_images.append(img)
346
+ return text, extracted_images, "pdf"
347
+
348
+ if content_type == "docx":
349
+ docx_text = self.docx_extractor.extract(content)
350
+ return docx_text or "", extracted_images, "docx"
351
+
352
+ # markdown / text
353
+ return content, extracted_images, content_type
354
+
355
+ # URL content
356
+ if source_type == "url" and source.url:
357
+ result = self.url_handler.fetch(source.url)
358
+ if result is None:
359
+ return "", extracted_images, "url"
360
+
361
+ content_type, content = result
362
+
363
+ if content_type == "image":
364
+ full = self.image_extractor.extract_full(content)
365
+ combined_text = full.get("combined_text", "")
366
+ # Cleanup temp file
367
+ self.url_handler.cleanup_temp_file(content)
368
+ return combined_text, extracted_images, "image"
369
+
370
+ if content_type == "pdf":
371
+ full = self.pdf_extractor.extract_full(content)
372
+ if full is None:
373
+ self.url_handler.cleanup_temp_file(content)
374
+ return "", extracted_images, "pdf"
375
+ text = "\n\n".join(full.get("pages", []))
376
+ for page_images in full.get("images", []):
377
+ for img in page_images:
378
+ if img.get("path"):
379
+ img["_tmp"] = True
380
+ extracted_images.append(img)
381
+ self.url_handler.cleanup_temp_file(content)
382
+ return text, extracted_images, "pdf"
383
+
384
+ # text / markdown / html
385
+ return content, extracted_images, content_type
386
+
387
+ # Fallback: use source.content directly
388
+ return source.content or "", extracted_images, "text"
389
+
390
+ # ════════════════════════════════════════════════════════════════
391
+ # Internal: extraction pipeline steps
392
+ # ════════════════════════════════════════════════════════════════
393
+
394
def _extract_paragraphs(
    self, text: str, source_hint: str
) -> ParagraphCollection:
    """Turn raw text into L1 Paragraphs.

    Empty or whitespace-only input produces an empty collection.
    Everything else is passed to the MarkdownExtractor (which is also
    used for plain text), with ``source_hint`` as its ``source``.
    """
    has_content = bool(text) and bool(text.strip())
    if has_content:
        return self.markdown_extractor.extract(text, source=source_hint)
    return ParagraphCollection()
403
+
404
+ def _paragraphs_to_functions(
405
+ self,
406
+ paragraphs: ParagraphCollection,
407
+ source: SourceDocument,
408
+ ) -> List[Function]:
409
+ """Convert L1 Paragraphs to L2 Functions with multi-value fields."""
410
+ functions: List[Function] = []
411
+ source_id = source.type
412
+
413
+ for para in paragraphs.paragraphs:
414
+ if not para.raw_text or not para.raw_text.strip():
415
+ continue
416
+
417
+ # Generate stable ID from content hash
418
+ content_hash = hashlib.sha256(
419
+ para.raw_text.encode()
420
+ ).hexdigest()[:16]
421
+ func_id = f"func_{content_hash}"
422
+
423
+ name = para.section if para.section else para.raw_text[:50]
424
+ name_normalized = _normalize_name(
425
+ para.section if para.section else para.raw_text[:50]
426
+ )
427
+
428
+ # Classify FieldValues from sentences
429
+ triggers: List[FieldValue] = []
430
+ conditions: List[FieldValue] = []
431
+ actions: List[FieldValue] = []
432
+ benefits: List[FieldValue] = []
433
+
434
+ for sent in para.sentences:
435
+ fv = FieldValue(
436
+ desc=sent.text,
437
+ sources=[f"{source_id}:{para.id}"],
438
+ source_method="rule_based",
439
+ weight=0.7,
440
+ )
441
+ if sent.role == "trigger":
442
+ triggers.append(fv)
443
+ elif sent.role == "condition":
444
+ conditions.append(fv)
445
+ elif sent.role in ("action",):
446
+ actions.append(fv)
447
+ elif sent.role == "result":
448
+ benefits.append(fv)
449
+ else:
450
+ # "statement" -> put as action by default
451
+ if not actions:
452
+ actions.append(fv)
453
+ else:
454
+ actions.append(fv)
455
+
456
+ # If no structured sentences, use raw text as trigger/action
457
+ if not triggers and not actions and para.raw_text:
458
+ sentences_text = [s.strip() for s in re.split(r"[。.!?!?]", para.raw_text) if s.strip()]
459
+ if sentences_text:
460
+ triggers.append(FieldValue(
461
+ desc=sentences_text[0],
462
+ sources=[f"{source_id}:{para.id}"],
463
+ source_method="rule_based",
464
+ weight=0.7,
465
+ ))
466
+ for s in sentences_text[1:]:
467
+ actions.append(FieldValue(
468
+ desc=s,
469
+ sources=[f"{source_id}:{para.id}"],
470
+ source_method="rule_based",
471
+ weight=0.7,
472
+ ))
473
+ else:
474
+ triggers.append(FieldValue(
475
+ desc=para.raw_text,
476
+ sources=[f"{source_id}:{para.id}"],
477
+ source_method="rule_based",
478
+ weight=0.7,
479
+ ))
480
+
481
+ func = Function(
482
+ id=func_id,
483
+ name=name,
484
+ name_normalized=name_normalized,
485
+ trigger=triggers,
486
+ condition=conditions,
487
+ action=actions,
488
+ benefit=benefits,
489
+ source_paragraphs=[para.id],
490
+ source_type=source.source_type,
491
+ content_hash=hashlib.sha256(
492
+ para.raw_text.encode()
493
+ ).hexdigest(),
494
+ )
495
+ functions.append(func)
496
+
497
+ return functions
498
+
499
+ def _deduplicate_functions(
500
+ self, functions: List[Function]
501
+ ) -> List[Function]:
502
+ """Use EntityAligner to merge duplicate Functions."""
503
+ if len(functions) <= 1:
504
+ return functions
505
+
506
+ # Build entity dicts for EntityAligner
507
+ entity_dicts = [
508
+ {"id": f.id, "name": f.name, "name_normalized": f.name_normalized}
509
+ for f in functions
510
+ ]
511
+
512
+ merge_groups = self.entity_aligner.find_merge_candidates(
513
+ entity_dicts, threshold=0.9
514
+ )
515
+
516
+ if not merge_groups:
517
+ return functions
518
+
519
+ # Build merge map
520
+ merge_map: dict = {} # canonical_id -> list of Function
521
+ func_by_id: dict = {f.id: f for f in functions}
522
+ merged_ids: set = set()
523
+
524
+ for group in merge_groups:
525
+ # Use first entity as canonical
526
+ canonical_id = group[0]["id"]
527
+ for member in group:
528
+ member_id = member["id"]
529
+ merge_map.setdefault(canonical_id, []).append(
530
+ func_by_id[member_id]
531
+ )
532
+ if member_id != canonical_id:
533
+ merged_ids.add(member_id)
534
+
535
+ # Merge fields for grouped functions
536
+ result: List[Function] = []
537
+ for func in functions:
538
+ if func.id in merged_ids:
539
+ continue
540
+ if func.id in merge_map:
541
+ merged = self._merge_function_fields(
542
+ merge_map[func.id]
543
+ )
544
+ result.append(merged)
545
+ else:
546
+ result.append(func)
547
+
548
+ return result
549
+
550
+ def _merge_function_fields(
551
+ self, functions: List[Function]
552
+ ) -> Function:
553
+ """Merge FieldValues from multiple Functions into one."""
554
+ if not functions:
555
+ raise ValueError("Cannot merge empty function list")
556
+ canonical = functions[0]
557
+
558
+ for other in functions[1:]:
559
+ # Merge each role field
560
+ for role in ("trigger", "condition", "action", "benefit"):
561
+ existing_descs = {
562
+ fv.desc for fv in getattr(canonical, role)
563
+ }
564
+ for fv in getattr(other, role):
565
+ if fv.desc not in existing_descs:
566
+ getattr(canonical, role).append(fv)
567
+ existing_descs.add(fv.desc)
568
+
569
+ # Merge source_paragraphs
570
+ for sp in other.source_paragraphs:
571
+ if sp not in canonical.source_paragraphs:
572
+ canonical.source_paragraphs.append(sp)
573
+
574
+ return canonical
575
+
576
def _calculate_function_confidence(
    self,
    func: Function,
    paragraphs: ParagraphCollection,
    source_hint: str,
) -> float:
    """Score a Function through the ConfidenceCalculator.

    The first paragraph whose id is listed in ``func.source_paragraphs``
    is scored together with ``source_hint``. If there is no such
    paragraph, the calculator's base confidence for ``source_hint`` is
    returned instead.
    """
    matching_para = next(
        (
            candidate
            for candidate in paragraphs.paragraphs
            if candidate.id in func.source_paragraphs
        ),
        None,
    )

    if not matching_para:
        # No originating paragraph (e.g. vision-derived Functions):
        # fall back to the source-based base confidence.
        return self.confidence_calculator._get_base_confidence(source_hint)

    return self.confidence_calculator.calculate_paragraph_confidence(
        matching_para, source_hint
    )
597
+
598
def _build_graph(self, functions: List[Function]) -> GraphData:
    """Assemble the graph for a set of Functions.

    Without a store, edges come from the rule-based detector. With a
    store, a store-aware ``GraphBuilder`` is tried first; if it raises,
    a warning is logged and the rule-based detector is used instead.
    The Functions themselves become the graph nodes.
    """
    if self._store is None:
        # Stateless mode.
        edges = self._build_edges_rule_based(functions)
    else:
        try:
            edges = GraphBuilder(store=self._store).build_from_batch(functions)
        except Exception as exc:
            logger.warning("GraphBuilder failed: %s", exc)
            edges = self._build_edges_rule_based(functions)

    return GraphData(nodes=functions, edges=edges)
615
+
616
def _build_edges_rule_based(
    self, functions: List[Function]
) -> list:
    """Derive edges from the batch alone (no store needed).

    Two kinds of edge are emitted, per Function and in this order:

    * ``REFERENCES`` (weight 1.0) to every other Function whose name or
      normalized name contains the lower-cased ``"target"`` of one of
      this Function's cross-references;
    * ``ASSOCIATED_WITH`` (weight 0.5) to every other Function with the
      same non-empty domain, emitted once per pair.
    """
    from memnex.models import GraphEdge

    edges: List[GraphEdge] = []
    emitted: set = set()  # (source id, target id, edge type)

    for func in functions:
        # REFERENCES from cross-references
        for ref in func.cross_references:
            target = ref.get("target", "")
            if not target:
                continue
            needle = target.lower()
            for other in functions:
                if other.id == func.id:
                    continue
                if not (
                    needle in other.name.lower()
                    or needle in other.name_normalized
                ):
                    continue
                ref_key = (func.id, other.id, "REFERENCES")
                if ref_key in emitted:
                    continue
                emitted.add(ref_key)
                edges.append(GraphEdge(
                    source=func.id,
                    target=other.id,
                    edge_type="REFERENCES",
                    weight=1.0,
                    evidence=[f"cross-reference: {func.name} -> {other.name}"],
                    created_at=datetime.now(),
                ))

        # ASSOCIATED_WITH: shared domain
        if not func.domain:
            continue
        for other in functions:
            if other.id == func.id:
                continue
            if other.domain == func.domain:
                forward = (func.id, other.id, "ASSOCIATED_WITH")
                backward = (other.id, func.id, "ASSOCIATED_WITH")
                # The association is symmetric: skip the pair when either
                # direction has already been recorded.
                if forward in emitted or backward in emitted:
                    continue
                emitted.add(forward)
                edges.append(GraphEdge(
                    source=func.id,
                    target=other.id,
                    edge_type="ASSOCIATED_WITH",
                    weight=0.5,
                    evidence=[f"shared domain: {func.domain}"],
                    created_at=datetime.now(),
                ))

    return edges
@@ -0,0 +1,15 @@
1
+ """Extractors for various document formats."""
2
+
3
+ from .markdown import MarkdownExtractor
4
+ from .image import ImageExtractor
5
+ from .pdf import PDFExtractor
6
+ from .docx import DOCXExtractor
7
+ from .vision_mapper import VisionMapper
8
+
9
+ __all__ = [
10
+ "MarkdownExtractor",
11
+ "ImageExtractor",
12
+ "PDFExtractor",
13
+ "DOCXExtractor",
14
+ "VisionMapper",
15
+ ]