memplex 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. memnex/__init__.py +31 -0
  2. memnex/__main__.py +6 -0
  3. memnex/_plugin/.claude-plugin/plugin.json +24 -0
  4. memnex/_plugin/.mcp.json +9 -0
  5. memnex/_plugin/__init__.py +0 -0
  6. memnex/_plugin/hooks/hooks.json +43 -0
  7. memnex/_plugin/scripts/hook-runner.py +166 -0
  8. memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
  9. memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
  10. memnex/_plugin/skills/mem-search/SKILL.md +85 -0
  11. memnex/_plugin/skills/mem-write/SKILL.md +78 -0
  12. memnex/adapters/__init__.py +14 -0
  13. memnex/adapters/claude_skill.py +169 -0
  14. memnex/adapters/cli.py +525 -0
  15. memnex/adapters/http_api.py +314 -0
  16. memnex/adapters/mcp_server.py +448 -0
  17. memnex/compaction.py +563 -0
  18. memnex/config.py +366 -0
  19. memnex/core/__init__.py +13 -0
  20. memnex/core/associator/__init__.py +8 -0
  21. memnex/core/associator/domain_classifier.py +75 -0
  22. memnex/core/associator/entity_aligner.py +127 -0
  23. memnex/core/associator/ref_linker.py +197 -0
  24. memnex/core/associator/term_mapper.py +77 -0
  25. memnex/core/dictionaries/__init__.py +50 -0
  26. memnex/core/engine.py +667 -0
  27. memnex/core/extractors/__init__.py +15 -0
  28. memnex/core/extractors/docx.py +97 -0
  29. memnex/core/extractors/image.py +233 -0
  30. memnex/core/extractors/markdown.py +139 -0
  31. memnex/core/extractors/pdf.py +133 -0
  32. memnex/core/extractors/vision_mapper.py +131 -0
  33. memnex/core/handlers/__init__.py +7 -0
  34. memnex/core/handlers/clipboard.py +40 -0
  35. memnex/core/handlers/file_handler.py +62 -0
  36. memnex/core/handlers/url_handler.py +132 -0
  37. memnex/llm/__init__.py +25 -0
  38. memnex/llm/enhancer.py +226 -0
  39. memnex/llm/fallback_chain.py +87 -0
  40. memnex/llm/injection_guard.py +178 -0
  41. memnex/llm/provider.py +130 -0
  42. memnex/llm/providers/__init__.py +22 -0
  43. memnex/llm/providers/anthropic.py +135 -0
  44. memnex/llm/providers/local.py +135 -0
  45. memnex/llm/providers/rule_based.py +68 -0
  46. memnex/llm/sanitizer.py +67 -0
  47. memnex/models/__init__.py +68 -0
  48. memnex/models/feedback.py +42 -0
  49. memnex/models/graph.py +33 -0
  50. memnex/models/memory.py +102 -0
  51. memnex/models/misc.py +185 -0
  52. memnex/models/paragraph.py +45 -0
  53. memnex/models/search.py +51 -0
  54. memnex/models/source.py +23 -0
  55. memnex/models/task.py +62 -0
  56. memnex/processing/__init__.py +1 -0
  57. memnex/processing/graph_builder.py +278 -0
  58. memnex/processing/merger/__init__.py +6 -0
  59. memnex/processing/merger/confidence_calculator.py +127 -0
  60. memnex/processing/merger/conflict_resolver.py +116 -0
  61. memnex/retrieval/__init__.py +1 -0
  62. memnex/retrieval/dedup.py +386 -0
  63. memnex/retrieval/embedding.py +289 -0
  64. memnex/retrieval/reranker.py +299 -0
  65. memnex/service.py +902 -0
  66. memnex/storage/__init__.py +65 -0
  67. memnex/storage/base.py +132 -0
  68. memnex/storage/changelog.py +106 -0
  69. memnex/storage/feedback.py +486 -0
  70. memnex/storage/lite/__init__.py +5 -0
  71. memnex/storage/lite/store.py +606 -0
  72. memnex/storage/vector.py +265 -0
  73. memnex/wiki/__init__.py +11 -0
  74. memnex/wiki/community.py +221 -0
  75. memnex/wiki/compiler.py +545 -0
  76. memnex/wiki/generator.py +270 -0
  77. memnex/wiki/search.py +282 -0
  78. memnex/worker.py +412 -0
  79. memplex-3.2.0.dist-info/METADATA +37 -0
  80. memplex-3.2.0.dist-info/RECORD +83 -0
  81. memplex-3.2.0.dist-info/WHEEL +5 -0
  82. memplex-3.2.0.dist-info/entry_points.txt +2 -0
  83. memplex-3.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,97 @@
1
+ """Extract content from DOCX files."""
2
+
3
+ from typing import Optional
4
+
5
+
6
class DOCXExtractor:
    """Extracts text content from .docx files.

    Relies on the optional python-docx package (imported as ``docx``).
    Both extraction methods are best-effort: they return None when the
    package is not installed or when the document cannot be opened or read.
    """

    # Characters assumed per page when estimating page_count in extract_full().
    _CHARS_PER_PAGE = 500

    def is_available(self) -> bool:
        """Check if python-docx is installed."""
        try:
            import docx  # noqa: F401 -- imported only to probe availability
        except ImportError:
            return False
        return True

    @staticmethod
    def _paragraph_texts(doc) -> list:
        """Return the stripped text of every non-empty body paragraph of *doc*."""
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [text for text in stripped if text]

    def extract(self, path: str) -> Optional[str]:
        """
        Extract text from a DOCX file.

        Body paragraphs come first, followed by the text of every non-empty
        table cell; all pieces are joined with a blank line between them.

        Args:
            path: Path to the .docx file

        Returns:
            Extracted text content (possibly an empty string when the document
            has no text), or None if python-docx is missing or extraction fails
        """
        if not self.is_available():
            return None

        try:
            import docx

            doc = docx.Document(path)
            chunks = self._paragraph_texts(doc)

            # Each non-empty table cell becomes its own chunk.
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        text = cell.text.strip()
                        if text:
                            chunks.append(text)

            return "\n\n".join(chunks)

        except Exception:
            # Deliberate best-effort: any read/parse failure is reported as None.
            return None

    def extract_full(self, path: str) -> Optional[dict]:
        """
        Extract full content from DOCX including metadata.

        Args:
            path: Path to the .docx file

        Returns:
            None if python-docx is missing or extraction fails, otherwise a
            dict with keys:
            - text: body paragraphs joined by blank lines (tables excluded)
            - paragraphs: list of non-empty paragraph strings
            - page_count: estimate, one page per 500 paragraph characters,
              never less than 1
            - tables: one string per non-empty table; rows are separated by
              newlines and non-empty cells within a row by " | "
            - metadata: dict with paragraph_count and table_count
        """
        if not self.is_available():
            return None

        try:
            import docx

            doc = docx.Document(path)
            paragraphs = self._paragraph_texts(doc)
            tables_text = []

            for table in doc.tables:
                table_rows = []
                for row in table.rows:
                    cell_texts = [cell.text.strip() for cell in row.cells]
                    row_texts = [text for text in cell_texts if text]
                    if row_texts:
                        table_rows.append(" | ".join(row_texts))
                if table_rows:
                    tables_text.append("\n".join(table_rows))

            # The format stores no page count here, so it is estimated from
            # the amount of paragraph text.
            total_chars = sum(len(p) for p in paragraphs)
            page_count = max(1, total_chars // self._CHARS_PER_PAGE)

            return {
                "text": "\n\n".join(paragraphs),
                "paragraphs": paragraphs,
                "page_count": page_count,
                "tables": tables_text,
                "metadata": {
                    "paragraph_count": len(paragraphs),
                    "table_count": len(tables_text),
                },
            }

        except Exception:
            # Deliberate best-effort: any read/parse failure is reported as None.
            return None
@@ -0,0 +1,233 @@
1
+ """Extract text from images using OCR or external vision providers."""
2
+
3
+ from typing import Optional, Callable
4
+ import os
5
+
6
+
7
class ImageExtractor:
    """
    Extracts text and visual information from images.

    Supports two modes:
    1. Internal OCR: uses pytesseract (if available)
    2. External providers: registered via set_*_provider() methods

    Priority: External provider > Internal OCR
    """

    # Languages requested from the internal Tesseract OCR. "chi" is not a
    # stock Tesseract language code; Simplified Chinese is "chi_sim".
    _OCR_LANGS = "eng+chi_sim"

    def __init__(self):
        self._ocr_available = None  # None = pytesseract/PIL not probed yet
        self._external_ocr: Optional[Callable] = None
        self._external_vision: Optional[Callable] = None
        self._vision_timeout = 60  # seconds; only reported in messages, not enforced
        self._vision_max_retries = 2

    def set_ocr_provider(self, fn: Callable[[str], Optional[str]]) -> None:
        """
        Register an external OCR provider.

        Args:
            fn: Callable that takes (image_path) returns OCR text or None.
                Example: lambda path: subprocess.run(['tesseract', path, 'stdout'])
        """
        self._external_ocr = fn

    def set_vision_provider(self, fn: Callable[[str], Optional[dict]]) -> None:
        """
        Register an external vision/LLM provider.

        Args:
            fn: Callable that takes (image_path) returns vision dict or None.
                The dict should have keys: page_type, components[], layout, design_tools, design_system
        """
        self._external_vision = fn

    @property
    def has_ocr(self) -> bool:
        """Check if any OCR is available (external or internal)."""
        if self._external_ocr is not None:
            return True
        if self._ocr_available is None:
            # Probe the optional dependencies once and cache the outcome.
            try:
                import pytesseract  # noqa: F401
                from PIL import Image  # noqa: F401
                self._ocr_available = True
            except ImportError:
                self._ocr_available = False
        return self._ocr_available

    @property
    def has_vision(self) -> bool:
        """Check if a vision provider has been registered."""
        return self._external_vision is not None

    def extract(self, image_path: str) -> Optional[str]:
        """
        Extract text from image using OCR.

        Priority: External OCR provider > Internal pytesseract. The internal
        OCR is also used when the external provider raises or returns an
        empty result.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text, or None if the file does not exist or OCR is
            unavailable or fails
        """
        if not os.path.exists(image_path):
            return None

        # 1. External OCR provider (priority)
        if self._external_ocr is not None:
            try:
                result = self._external_ocr(image_path)
                if result:
                    return result.strip() if isinstance(result, str) else result
            except Exception as e:
                print(f"External OCR failed: {e}")

        # 2. Internal pytesseract
        try:
            import pytesseract
            from PIL import Image

            # Context manager releases the underlying file handle promptly.
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image, lang=self._OCR_LANGS)
            return text.strip()
        except Exception as e:
            print(f"OCR failed for {image_path}: {e}")
            return None

    def extract_with_vision(self, image_path: str, prompt: Optional[str] = None) -> Optional[dict]:
        """
        Extract visual understanding using the registered vision provider.

        The provider is called up to ``_vision_max_retries`` times, sleeping
        ``2 ** attempt`` seconds between attempts. A falsy result or a raised
        exception both count as a failed attempt.

        NOTE: no timeout is enforced here. ``_vision_timeout`` only appears in
        the message printed when the provider itself raises TimeoutError.

        Args:
            image_path: Path to image file
            prompt: Custom prompt for vision analysis. Currently unused: the
                provider is called with image_path only.

        Returns:
            dict with vision analysis, or None if the file does not exist, no
            provider is registered, or every attempt failed
        """
        import time

        if not os.path.exists(image_path):
            return None

        if self._external_vision is None:
            return None

        last_error = None
        for attempt in range(self._vision_max_retries):
            try:
                result = self._external_vision(image_path)
                if result:
                    return result
            except TimeoutError:
                last_error = f"Timeout after {self._vision_timeout}s (attempt {attempt + 1}/{self._vision_max_retries})"
                print(f"External vision timeout: {last_error}")
            except Exception as e:
                last_error = f"{type(e).__name__}: {e} (attempt {attempt + 1}/{self._vision_max_retries})"
                print(f"External vision failed: {last_error}")

            # Back off before the next attempt, but not after the last one.
            if attempt < self._vision_max_retries - 1:
                time.sleep(2 ** attempt)

        if last_error:
            print(f"Vision exhausted after {self._vision_max_retries} attempts")

        return None

    def extract_full(self, image_path: str, vision_result: Optional[dict] = None) -> dict:
        """
        Extract both OCR text and optionally pre-extracted vision understanding.

        Args:
            image_path: Path to image file
            vision_result: Pre-extracted vision analysis (from LLM MCP).
                           If provided and non-empty, the vision provider is
                           not called.

        Returns:
            dict with:
            - ref: the image path
            - type: always "image"
            - ocr_text: raw text from OCR
            - has_ocr: whether OCR succeeded
            - vision: structured vision analysis
            - has_vision: whether vision was provided or extracted
            - combined_text: OCR + Vision text for paragraph extraction
        """
        result = {
            "ref": image_path,
            "type": "image",
            "ocr_text": None,
            "has_ocr": False,
            "vision": vision_result,
            "has_vision": vision_result is not None,
            "combined_text": "",
        }

        # 1. OCR extraction
        ocr_text = self.extract(image_path)
        if ocr_text:
            result["ocr_text"] = ocr_text
            result["has_ocr"] = True
            result["combined_text"] = ocr_text

        # 2. Vision: prefer the caller-supplied analysis, else ask the provider.
        vision = vision_result if vision_result else self.extract_with_vision(image_path)
        if vision:
            result["vision"] = vision
            result["has_vision"] = True
            vision_text = self._vision_to_text(vision)
            if result["combined_text"]:
                result["combined_text"] += "\n" + vision_text
            else:
                result["combined_text"] = vision_text

        return result

    def _vision_to_text(self, vision: dict) -> str:
        """
        Convert vision result to readable text.

        Only keys with truthy values produce output. Provider output is
        handled defensively: ``design_tools`` may be a single string or a
        list, and component entries may omit ``type``/``label``/``function``.
        """
        lines = []

        if vision.get("page_type"):
            lines.append(f"页面类型: {vision['page_type']}")

        tools = vision.get("design_tools")
        if tools:
            if isinstance(tools, str):
                # A bare string would otherwise be joined character by character.
                tools = [tools]
            lines.append(f"设计工具: {', '.join(str(tool) for tool in tools)}")

        if vision.get("design_system"):
            lines.append(f"设计系统: {vision['design_system']}")

        if vision.get("layout"):
            lines.append(f"布局结构: {vision['layout']}")

        components = vision.get("components")
        if components:
            lines.append("组件列表:")
            for comp in components:
                if not isinstance(comp, dict):
                    lines.append(f"  - {comp}")
                    continue
                data_str = f" (数据: {comp['data']})" if comp.get("data") else ""
                lines.append(
                    f"  - [{comp.get('type', '')}] {comp.get('label', '')}: "
                    f"{comp.get('function', '')}{data_str}"
                )

        return "\n".join(lines)

    def extract_with_metadata(self, image_path: str) -> dict:
        """Extract text and image metadata (legacy method built on extract_full)."""
        full_result = self.extract_full(image_path)
        return {
            "ref": image_path,
            "type": "image",
            "ocr_text": full_result["ocr_text"] or "",
            "has_text": full_result["has_ocr"],
            "vision": full_result["vision"],
            "needs_vision_model": not full_result["has_vision"],
            "visual_note": f"OCR: {len(full_result.get('ocr_text') or '')} chars, Vision: {'yes' if full_result['has_vision'] else 'no'}"
        }
@@ -0,0 +1,139 @@
1
+ """Extract content from Markdown/Text documents."""
2
+
3
+ import re
4
+ from typing import List
5
+ from memnex.models.paragraph import Paragraph, Sentence, SentenceRelation, ParagraphCollection
6
+
7
+
8
class MarkdownExtractor:
    """Extracts L1 paragraphs and L2 structured data from markdown."""

    # Ordered (role, keywords) table used by _infer_role; the first role with
    # a matching keyword wins, so a keyword repeated in a later role (e.g.
    # 'when', '提交') can only ever produce the earlier role.
    # NOTE: avoid overly generic words like "登录" or "用户" that appear in all sentence types
    _ROLE_KEYWORDS = (
        # Trigger indicators — user interaction or event start
        ("trigger", (
            '点击', '选择', '提交', '按下', '输入', '勾选',
            '当', 'when', 'after', 'before',
            '登录时', '登出时', '注册时',
        )),
        # Condition indicators — prerequisite or conditional logic
        ("condition", (
            '如果', '满足', '条件是', '前提是', 'when', 'if', 'unless',
            '条件', 'required', '前提',
        )),
        # Action indicators — system or user performs an operation
        ("action", (
            '自动', '发送', '创建', '更新', '删除', '跳转', '打开', '关闭',
            '显示', '隐藏', '提交', '保存', '取消',
            'action', 'do', 'perform', 'execute',
        )),
        # Result indicators — outcome or benefit
        ("result", (
            '享受', '获得', '收到', '看到', '返回', '跳转到',
            'result', 'then', '因此', '所以', '进入',
        )),
    )

    def extract(self, content: str, source: str = "document.md") -> "ParagraphCollection":
        """
        Extract paragraphs from markdown content.

        The content is split on blank lines. Every non-empty block becomes one
        Paragraph whose id is derived from the block's position in that split,
        so ids may skip numbers when empty blocks are dropped.

        Args:
            content: Markdown text content
            source: Source identifier

        Returns:
            ParagraphCollection with extracted paragraphs
        """
        paragraphs = ParagraphCollection()

        # Split by double newlines (paragraph separation)
        blocks = re.split(r'\n\s*\n', content)

        for i, block in enumerate(blocks):
            block = block.strip()
            if not block:
                continue

            para_id = f"para_{i+1:03d}"

            # A block that starts with a markdown heading names its own section.
            header_match = re.match(r'^(#{1,6})\s+(.+)$', block, re.MULTILINE)
            section = header_match.group(2).strip() if header_match else ""

            # Extract sentences and roles
            sentences = self._extract_sentences(block, para_id)
            relations = self._extract_relations(sentences)

            paragraphs.add(Paragraph(
                id=para_id,
                source=f"{source}#{para_id}",
                section=section,
                raw_text=block,
                semantic_unit=True,
                sentences=sentences,
                sentence_relations=relations,
            ))

        return paragraphs

    def _extract_sentences(self, text: str, para_id: str) -> "List[Sentence]":
        """Extract sentences and their roles from text."""
        # Split on sentence-ending punctuation that is followed by whitespace
        # and then an uppercase letter or CJK character, and on newlines.
        # Requiring the whitespace keeps decimals such as 1.5 intact; an
        # abbreviation followed by a capitalised word is still split.
        sentence_texts = re.split(r'(?<=[.!?。!?])\s+(?=[A-Z一-鿿])|[\n]+', text)
        sentences = []

        for i, sent in enumerate(sentence_texts):
            sent = sent.strip()
            if not sent:
                continue

            sentences.append(Sentence(
                id=f"{para_id}_s{i+1}",
                text=sent,
                role=self._infer_role(sent),
            ))

        return sentences

    @staticmethod
    def _has_keyword(text: str, keyword: str) -> bool:
        """
        Report whether *keyword* occurs in the already-lowercased *text*.

        Pure a-z keywords must appear as a whole word, optionally carrying a
        simple inflection (s, es, d, ed, ing), so that 'do' does not fire on
        "document" nor 'then' on "authentication". All other keywords (the
        Chinese ones) are matched as plain substrings, since that script has
        no word separators.
        """
        if re.fullmatch(r'[a-z]+', keyword):
            pattern = rf'(?<![a-z]){keyword}(?:s|es|d|ed|ing)?(?![a-z])'
            return re.search(pattern, text) is not None
        return keyword in text

    def _infer_role(self, sentence: str) -> str:
        """
        Infer the role of a sentence from its content.

        Roles are tried in the order trigger, condition, action, result; the
        first role with a keyword hit is returned.

        Returns:
            One of "trigger", "condition", "action", "result", or "statement"
            when no keyword matches.
        """
        sentence_lower = sentence.lower()

        for role, keywords in self._ROLE_KEYWORDS:
            if any(self._has_keyword(sentence_lower, kw) for kw in keywords):
                return role

        return "statement"

    def _extract_relations(self, sentences: "List[Sentence]") -> "List[SentenceRelation]":
        """Link each condition sentence to an action sentence that directly follows it."""
        relations = []

        for current, following in zip(sentences, sentences[1:]):
            if current.role == "condition" and following.role == "action":
                relations.append(SentenceRelation(
                    from_id=current.id,
                    to_id=following.id,
                    type="if_then",
                ))

        return relations
@@ -0,0 +1,133 @@
1
+ """Extract text and images from PDF files."""
2
+
3
+ from typing import List, Optional, Dict, Any
4
+ import os
5
+ import tempfile
6
+
7
+
8
class PDFExtractor:
    """Extracts text and images from PDF files.

    Text extraction needs the optional pdfplumber package. Image bytes are
    extracted with PyMuPDF (``fitz``) when it is installed; otherwise only
    image metadata is collected through pdfplumber.
    """

    def __init__(self):
        # Tri-state caches: None = the import has not been probed yet.
        self._available = None
        self._pymupdf_available = None

    def is_available(self) -> bool:
        """Check if PDF extraction is available."""
        if self._available is None:
            try:
                import pdfplumber  # noqa: F401
                self._available = True
            except ImportError:
                self._available = False
        return self._available

    def _is_pymupdf_available(self) -> bool:
        """Check if PyMuPDF is available for image extraction."""
        if self._pymupdf_available is None:
            try:
                import fitz  # noqa: F401
                self._pymupdf_available = True
            except ImportError:
                self._pymupdf_available = False
        return self._pymupdf_available

    def extract(self, pdf_path: str) -> Optional[List[str]]:
        """
        Extract text from PDF file.

        Returns:
            List with the text of each page that has text (pages without text
            are skipped), or None if pdfplumber is missing, the file does not
            exist, no page has text, or extraction fails.
        """
        if not self.is_available():
            return None
        if not os.path.exists(pdf_path):
            return None
        try:
            import pdfplumber

            with pdfplumber.open(pdf_path) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                pages = [text for text in page_texts if text]
            return pages if pages else None
        except Exception:
            # Deliberate best-effort: unreadable PDFs are reported as None.
            return None

    def _extract_images_pymupdf(self, pdf_path: str) -> List[List[Dict[str, Any]]]:
        """
        Extract images from PDF using PyMuPDF.

        Every image is written as a PNG to its own temporary file created with
        ``delete=False``; this method does not remove those files on success,
        so the caller must delete them when done.

        Returns:
            One list per page, each holding dicts with keys path, page
            (0-based), index, width and height. Returns [] on any failure, in
            which case temp files already written by this call are removed.
        """
        written_paths = []
        try:
            import fitz

            doc = fitz.open(pdf_path)
            try:
                images_per_page = []
                for page_num, page in enumerate(doc):
                    page_images = []
                    for img_index, img in enumerate(page.get_images(full=True)):
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)
                        if pix.n - pix.alpha >= 4:
                            # Four or more colour channels: convert to RGB first.
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        img_data = pix.tobytes("png")

                        with tempfile.NamedTemporaryFile(
                            suffix=".png", delete=False
                        ) as temp_file:
                            # Record the path first so a failed write is cleaned up.
                            written_paths.append(temp_file.name)
                            temp_file.write(img_data)

                        page_images.append({
                            "path": temp_file.name,
                            "page": page_num,
                            "index": img_index,
                            "width": pix.width,
                            "height": pix.height,
                        })
                    images_per_page.append(page_images)
                return images_per_page
            finally:
                # Close the document even when extraction fails part-way.
                doc.close()
        except Exception:
            # The caller receives [] and so cannot know about partial output;
            # remove it rather than leaving orphaned temp files behind.
            for leftover in written_paths:
                try:
                    os.remove(leftover)
                except OSError:
                    pass
            return []

    def _extract_images_pdfplumber(self, pdf_path: str) -> List[List[Dict[str, Any]]]:
        """
        Extract image metadata from PDF using pdfplumber (no actual bytes).

        Returns:
            One list per page, each holding dicts with keys path (always
            None), page (0-based), index, width, height and bbox. Returns []
            on any failure.
        """
        try:
            import pdfplumber

            images_per_page = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_images = [
                        {
                            "path": None,
                            "page": page.page_number - 1,
                            "index": index,
                            "width": img.get("width"),
                            "height": img.get("height"),
                            "bbox": (img.get("x0"), img.get("y0"), img.get("x1"), img.get("y1")),
                        }
                        for index, img in enumerate(page.images)
                    ]
                    images_per_page.append(page_images)
            return images_per_page
        except Exception:
            return []

    def extract_full(self, pdf_path: str) -> Optional[dict]:
        """
        Extract full content with text, metadata, and images.

        Returns:
            None if pdfplumber is missing, the file does not exist, or
            extraction fails; otherwise a dict with keys:
            - pages: text of every page ("" for pages without text)
            - metadata: the document metadata reported by pdfplumber
            - page_count: number of pages
            - images: per-page image lists; actual files when PyMuPDF is
              installed, metadata only otherwise
        """
        if not self.is_available():
            return None
        if not os.path.exists(pdf_path):
            return None
        try:
            import pdfplumber

            with pdfplumber.open(pdf_path) as pdf:
                pages_text = [page.extract_text() or "" for page in pdf.pages]
                # Read metadata while the document is still open.
                metadata = pdf.metadata if hasattr(pdf, 'metadata') else {}

            if self._is_pymupdf_available():
                images = self._extract_images_pymupdf(pdf_path)
            else:
                images = self._extract_images_pdfplumber(pdf_path)

            return {
                "pages": pages_text,
                "metadata": metadata,
                "page_count": len(pages_text),
                "images": images,
            }
        except Exception:
            # Deliberate best-effort: unreadable PDFs are reported as None.
            return None