memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Extract content from DOCX files."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DOCXExtractor:
    """Extracts text content from .docx files.

    The ``docx`` module (python-docx) is imported lazily inside the methods,
    so the class can be constructed even when the dependency is missing.
    """

    # Characters-per-page figure used for the rough page_count estimate.
    _CHARS_PER_PAGE = 500

    def is_available(self) -> bool:
        """Check if python-docx is installed (i.e. ``import docx`` succeeds)."""
        try:
            import docx  # noqa: F401 -- imported only to probe availability
        except ImportError:
            return False
        return True

    def _load(self, path: str):
        """Open *path* as a document; return None if unavailable or unreadable."""
        if not self.is_available():
            return None
        try:
            import docx
            return docx.Document(path)
        except Exception:
            # Best-effort API: any failure to open is reported as None.
            return None

    @staticmethod
    def _paragraph_texts(doc) -> list:
        """Return the stripped, non-empty text of every body paragraph."""
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [text for text in stripped if text]

    @staticmethod
    def _row_texts(row) -> list:
        """Return the stripped, non-empty text of every cell in a table row."""
        stripped = (cell.text.strip() for cell in row.cells)
        return [text for text in stripped if text]

    def extract(self, path: str) -> Optional[str]:
        """
        Extract text from a DOCX file.

        Body paragraphs come first, followed by every non-empty table cell;
        all pieces are joined with a blank line.

        Args:
            path: Path to the .docx file

        Returns:
            Extracted text content, or None if python-docx is missing or
            extraction fails for any reason
        """
        doc = self._load(path)
        if doc is None:
            return None

        try:
            pieces = self._paragraph_texts(doc)
            for table in doc.tables:
                for row in table.rows:
                    pieces.extend(self._row_texts(row))
            return "\n\n".join(pieces)
        except Exception:
            return None

    def extract_full(self, path: str) -> Optional[dict]:
        """
        Extract full content from DOCX including metadata.

        Args:
            path: Path to the .docx file

        Returns:
            None if python-docx is missing or extraction fails, otherwise a
            dict with keys:
                - text: body paragraphs joined by blank lines (tables excluded)
                - paragraphs: list of non-empty paragraph strings
                - page_count: estimate, one page per 500 paragraph characters
                  (minimum 1)
                - tables: one string per non-empty table; cells joined with
                  " | ", rows joined with newlines
                - metadata: {"paragraph_count", "table_count"}
        """
        doc = self._load(path)
        if doc is None:
            return None

        try:
            paragraphs = self._paragraph_texts(doc)

            tables_text = []
            for table in doc.tables:
                table_rows = []
                for row in table.rows:
                    row_texts = self._row_texts(row)
                    if row_texts:
                        table_rows.append(" | ".join(row_texts))
                if table_rows:
                    tables_text.append("\n".join(table_rows))

            total_chars = sum(len(p) for p in paragraphs)
            page_count = max(1, total_chars // self._CHARS_PER_PAGE)

            return {
                "text": "\n\n".join(paragraphs),
                "paragraphs": paragraphs,
                "page_count": page_count,
                "tables": tables_text,
                "metadata": {
                    "paragraph_count": len(paragraphs),
                    "table_count": len(tables_text),
                },
            }
        except Exception:
            return None
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""Extract text from images using OCR or external vision providers."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Callable
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ImageExtractor:
    """
    Extracts text and visual information from images.

    Supports two modes:
    1. Internal OCR: uses pytesseract (if available)
    2. External providers: registered via set_*_provider() methods

    Priority: External provider > Internal OCR
    """

    def __init__(self):
        # None means "not probed yet"; has_ocr fills it in lazily.
        self._ocr_available = None
        self._external_ocr: Optional[Callable] = None
        self._external_vision: Optional[Callable] = None
        # Only reported in timeout messages; this class does not enforce it.
        self._vision_timeout = 60  # seconds
        # Total number of provider calls made by extract_with_vision.
        self._vision_max_retries = 2

    def set_ocr_provider(self, fn: Callable[[str], Optional[str]]) -> None:
        """
        Register an external OCR provider.

        Args:
            fn: Callable that takes (image_path) returns OCR text or None.
                Example: lambda path: subprocess.run(['tesseract', path, 'stdout'])
        """
        self._external_ocr = fn

    def set_vision_provider(self, fn: Callable[[str], Optional[dict]]) -> None:
        """
        Register an external vision/LLM provider.

        Args:
            fn: Callable that takes (image_path) returns vision dict or None.
                The dict should have keys: page_type, components[], layout, design_tools, design_system
        """
        self._external_vision = fn

    @property
    def has_ocr(self) -> bool:
        """Check if any OCR is available (external or internal)."""
        if self._external_ocr is not None:
            return True
        if self._ocr_available is None:
            # Probe once and cache the answer.
            try:
                import pytesseract  # noqa: F401
                from PIL import Image  # noqa: F401
                self._ocr_available = True
            except ImportError:
                self._ocr_available = False
        return self._ocr_available

    @property
    def has_vision(self) -> bool:
        """Check if a vision provider has been registered."""
        return self._external_vision is not None

    def extract(self, image_path: str) -> Optional[str]:
        """
        Extract text from image using OCR.

        Priority: External OCR provider > Internal pytesseract. If the
        external provider raises or returns a falsy value, the internal
        pytesseract path is tried next.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text (stripped), or None if the file does not exist or
            no OCR path succeeded
        """
        if not os.path.exists(image_path):
            return None

        # 1. External OCR provider (priority)
        if self._external_ocr is not None:
            try:
                result = self._external_ocr(image_path)
                if result:
                    return result.strip() if isinstance(result, str) else result
            except Exception as e:
                print(f"External OCR failed: {e}")

        # 2. Internal pytesseract
        try:
            import pytesseract
            from PIL import Image
            # Context manager releases the underlying file handle promptly.
            with Image.open(image_path) as image:
                # NOTE(review): 'chi' may not be a stock Tesseract language
                # name (the usual packs are chi_sim / chi_tra) -- confirm.
                text = pytesseract.image_to_string(image, lang='eng+chi')
            return text.strip()
        except Exception as e:
            print(f"OCR failed for {image_path}: {e}")
            return None

    def extract_with_vision(self, image_path: str, prompt: str = None) -> Optional[dict]:
        """
        Extract visual understanding using the registered vision provider.

        The provider is called up to ``_vision_max_retries`` times; between
        attempts the method sleeps ``2 ** attempt`` seconds. The first truthy
        result is returned. No timeout is enforced here: a provider signals a
        timeout by raising ``TimeoutError``.

        Args:
            image_path: Path to image file
            prompt: Custom prompt for vision analysis. Currently unused; the
                provider is called with the image path only.

        Returns:
            dict with vision analysis, or None if the file does not exist, no
            provider is registered, or every attempt failed
        """
        import time

        if not os.path.exists(image_path):
            return None

        # 1. External Vision provider (MCP/LLM)
        if self._external_vision is not None:
            last_error = None
            for attempt in range(self._vision_max_retries):
                try:
                    result = self._external_vision(image_path)
                    if result:
                        return result
                except TimeoutError:
                    last_error = f"Timeout after {self._vision_timeout}s (attempt {attempt + 1}/{self._vision_max_retries})"
                    print(f"External vision timeout: {last_error}")
                except Exception as e:
                    last_error = f"{type(e).__name__}: {e} (attempt {attempt + 1}/{self._vision_max_retries})"
                    print(f"External vision failed: {last_error}")

                # No sleep after the final attempt.
                if attempt < self._vision_max_retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                    time.sleep(wait_time)

            if last_error:
                print(f"Vision exhausted after {self._vision_max_retries} attempts")

        return None

    def extract_full(self, image_path: str, vision_result: dict = None) -> dict:
        """
        Extract both OCR text and optionally pre-extracted vision understanding.

        Args:
            image_path: Path to image file
            vision_result: Pre-extracted vision analysis (from LLM MCP).
                           If provided and non-empty, the vision provider is
                           not called; an empty/None value falls back to
                           extract_with_vision().

        Returns:
            dict with:
                - ref: the image path
                - type: always "image"
                - ocr_text: raw text from OCR (None if OCR produced nothing)
                - has_ocr: whether OCR succeeded
                - vision: structured vision analysis (None if unavailable)
                - has_vision: whether a usable vision result was provided or extracted
                - combined_text: OCR + Vision text for paragraph extraction
        """
        result = {
            "ref": image_path,
            "type": "image",
            "ocr_text": None,
            "has_ocr": False,
            "vision": None,
            "has_vision": False,
            "combined_text": "",
        }

        # 1. OCR extraction
        ocr_text = self.extract(image_path)
        if ocr_text:
            result["ocr_text"] = ocr_text
            result["has_ocr"] = True
            result["combined_text"] = ocr_text

        # 2. Vision result: prefer the pre-extracted one, else ask the provider.
        vision = vision_result if vision_result else self.extract_with_vision(image_path)
        if vision:
            result["vision"] = vision
            result["has_vision"] = True
            vision_text = self._vision_to_text(vision)
            if result["combined_text"]:
                result["combined_text"] += "\n" + vision_text
            else:
                result["combined_text"] = vision_text

        return result

    def _vision_to_text(self, vision: dict) -> str:
        """Convert vision result to readable text.

        Only truthy fields are rendered, one per line. Component entries are
        read defensively: missing keys render as empty strings rather than
        raising, and a non-dict component is rendered with str().
        """
        lines = []

        if vision.get("page_type"):
            lines.append(f"页面类型: {vision['page_type']}")

        design_tools = vision.get("design_tools")
        if design_tools:
            if isinstance(design_tools, str):
                # A bare string would otherwise be joined character by character.
                tools_text = design_tools
            else:
                tools_text = ', '.join(str(tool) for tool in design_tools)
            lines.append(f"设计工具: {tools_text}")

        if vision.get("design_system"):
            lines.append(f"设计系统: {vision['design_system']}")

        if vision.get("layout"):
            lines.append(f"布局结构: {vision['layout']}")

        if vision.get("components"):
            lines.append("组件列表:")
            for comp in vision["components"]:
                if not isinstance(comp, dict):
                    lines.append(f" - {comp}")
                    continue
                data_str = f" (数据: {comp['data']})" if comp.get("data") else ""
                lines.append(f" - [{comp.get('type', '')}] {comp.get('label', '')}: {comp.get('function', '')}{data_str}")

        return "\n".join(lines)

    def extract_with_metadata(self, image_path: str) -> dict:
        """Extract text and image metadata (legacy method).

        Thin wrapper over extract_full() that reshapes the result into the
        older key set (has_text, needs_vision_model, visual_note).
        """
        full_result = self.extract_full(image_path)
        return {
            "ref": image_path,
            "type": "image",
            "ocr_text": full_result["ocr_text"] or "",
            "has_text": full_result["has_ocr"],
            "vision": full_result["vision"],
            "needs_vision_model": not full_result["has_vision"],
            "visual_note": f"OCR: {len(full_result.get('ocr_text') or '')} chars, Vision: {'yes' if full_result['has_vision'] else 'no'}"
        }
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Extract content from Markdown/Text documents."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List
|
|
5
|
+
from memnex.models.paragraph import Paragraph, Sentence, SentenceRelation, ParagraphCollection
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class MarkdownExtractor:
    """Extracts L1 paragraphs and L2 structured data from markdown."""

    # Role -> keyword table, checked in this order; the first role with a hit wins.
    # NOTE: avoid overly generic words like "登录" or "用户" that appear in all sentence types
    _ROLE_KEYWORDS = (
        # Trigger indicators — user interaction or event start
        ("trigger", (
            '点击', '选择', '提交', '按下', '输入', '勾选',
            '当', 'when', 'after', 'before',
            '登录时', '登出时', '注册时',
        )),
        # Condition indicators — prerequisite or conditional logic
        ("condition", (
            '如果', '满足', '条件是', '前提是', 'when', 'if', 'unless',
            '条件', 'required', '前提',
        )),
        # Action indicators — system or user performs an operation
        ("action", (
            '自动', '发送', '创建', '更新', '删除', '跳转', '打开', '关闭',
            '显示', '隐藏', '提交', '保存', '取消',
            'action', 'do', 'perform', 'execute',
        )),
        # Result indicators — outcome or benefit
        ("result", (
            '享受', '获得', '收到', '看到', '返回', '跳转到',
            'result', 'then', '因此', '所以', '进入',
        )),
    )

    def extract(self, content: str, source: str = "document.md") -> "ParagraphCollection":
        """
        Extract paragraphs from markdown content.

        The content is split on blank lines; every non-empty block becomes one
        Paragraph whose id is ``para_NNN`` (NNN = 1-based block position).

        Args:
            content: Markdown text content
            source: Source identifier; each paragraph's source is
                ``"{source}#{para_id}"``

        Returns:
            ParagraphCollection with extracted paragraphs
        """
        paragraphs = ParagraphCollection()

        # Split by double newlines (paragraph separation)
        blocks = re.split(r'\n\s*\n', content)

        for i, block in enumerate(blocks):
            block = block.strip()
            if not block:
                continue

            para_id = f"para_{i+1:03d}"

            # Detect section header. re.match anchors at the block start, so
            # only a block that itself begins with a heading gets a section.
            # NOTE(review): the heading is not carried forward to the blocks
            # that follow it -- confirm this is intended.
            header_match = re.match(r'^(#{1,6})\s+(.+)$', block, re.MULTILINE)
            section = ""
            if header_match:
                section = header_match.group(2).strip()

            # Extract sentences and roles
            sentences = self._extract_sentences(block, para_id)
            relations = self._extract_relations(sentences)

            paragraph = Paragraph(
                id=para_id,
                source=f"{source}#{para_id}",
                section=section,
                raw_text=block,
                semantic_unit=True,
                sentences=sentences,
                sentence_relations=relations
            )

            paragraphs.add(paragraph)

        return paragraphs

    def _extract_sentences(self, text: str, para_id: str) -> List["Sentence"]:
        """Extract sentences and their roles from text.

        Sentence ids are ``"{para_id}_s{k}"`` where k is the 1-based position
        of the fragment in the split output (empty fragments are skipped, so
        ids may have gaps).
        """
        # Split on sentence-ending punctuation only when it is followed by
        # whitespace and then an uppercase ASCII letter or a CJK ideograph,
        # so decimals such as 1.5 are not split. Newlines always split.
        sentence_texts = re.split(r'(?<=[.!?。!?])\s+(?=[A-Z一-鿿])|[\n]+', text)
        sentences = []

        for i, sent in enumerate(sentence_texts):
            sent = sent.strip()
            if not sent:
                continue

            role = self._infer_role(sent)
            sentences.append(Sentence(
                id=f"{para_id}_s{i+1}",
                text=sent,
                role=role
            ))

        return sentences

    @staticmethod
    def _mentions(keyword: str, lowered: str) -> bool:
        """Return True if *keyword* occurs in the already-lowercased text.

        ASCII keywords must stand as a whole word (optionally with a simple
        inflection: s, es, d, ed, ing) so that e.g. 'do' does not fire on
        "document" and 'if' does not fire on "notification". Non-ASCII
        keywords use plain substring search because the text has no word
        separators to anchor on.
        """
        if all(ord(ch) < 128 for ch in keyword):
            pattern = r'(?<![a-z])' + re.escape(keyword) + r'(?:s|es|d|ed|ing)?(?![a-z])'
            return re.search(pattern, lowered) is not None
        return keyword in lowered

    def _infer_role(self, sentence: str) -> str:
        """Infer the role of a sentence from its content.

        Roles are tested in the order trigger, condition, action, result; the
        first role with a matching keyword is returned.

        Returns:
            One of "trigger", "condition", "action", "result", or
            "statement" when no keyword matches.
        """
        sentence_lower = sentence.lower()

        for role, keywords in self._ROLE_KEYWORDS:
            if any(self._mentions(keyword, sentence_lower) for keyword in keywords):
                return role

        return "statement"

    def _extract_relations(self, sentences: List["Sentence"]) -> List["SentenceRelation"]:
        """Extract relations between sentences.

        Emits an "if_then" relation for every condition sentence that is
        immediately followed by an action sentence.
        """
        relations = []

        for i, sent in enumerate(sentences):
            if sent.role == "condition" and i + 1 < len(sentences):
                next_sent = sentences[i + 1]
                if next_sent.role == "action":
                    relations.append(SentenceRelation(
                        from_id=sent.id,
                        to_id=next_sent.id,
                        type="if_then"
                    ))

        return relations
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Extract text and images from PDF files."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional, Dict, Any
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PDFExtractor:
    """Extracts text and images from PDF files.

    Text extraction uses pdfplumber; image extraction uses PyMuPDF (``fitz``)
    when importable and otherwise falls back to pdfplumber image metadata.
    Both libraries are imported lazily.
    """

    def __init__(self):
        # None means "not probed yet"; the probes below cache their answer.
        self._available = None
        self._pymupdf_available = None

    def is_available(self) -> bool:
        """Check if PDF extraction is available (pdfplumber importable)."""
        if self._available is None:
            try:
                import pdfplumber  # noqa: F401
                self._available = True
            except ImportError:
                self._available = False
        return self._available

    def _is_pymupdf_available(self) -> bool:
        """Check if PyMuPDF is available for image extraction."""
        if self._pymupdf_available is None:
            try:
                import fitz  # noqa: F401
                self._pymupdf_available = True
            except ImportError:
                self._pymupdf_available = False
        return self._pymupdf_available

    def extract(self, pdf_path: str) -> Optional[List[str]]:
        """Extract text from PDF file.

        Returns:
            List of page texts; pages with no extractable text are omitted.
            None if pdfplumber is missing, the file does not exist, no page
            yielded text, or extraction raised.
        """
        if not self.is_available():
            return None
        if not os.path.exists(pdf_path):
            return None
        try:
            import pdfplumber
            pages = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        pages.append(text)
            return pages if pages else None
        except Exception:
            return None

    def _extract_images_pymupdf(self, pdf_path: str) -> List[List[Dict[str, Any]]]:
        """Extract images from PDF using PyMuPDF. Returns list of images per page.

        Each image is written to a temporary ``.png`` file created with
        ``delete=False``; nothing in this method removes those files on
        success. On any failure the method returns ``[]`` and removes the
        files it had already written, since no caller would learn their paths.
        """
        try:
            import fitz
            doc = fitz.open(pdf_path)
        except Exception:
            return []

        written_paths = []
        try:
            images_per_page = []
            for page_num, page in enumerate(doc):
                page_images = []
                for img_index, img in enumerate(page.get_images(full=True)):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    # Four or more non-alpha channels (presumably CMYK):
                    # convert to RGB before encoding as PNG.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    img_data = pix.tobytes("png")
                    with tempfile.NamedTemporaryFile(
                        suffix=".png", delete=False
                    ) as temp_file:
                        # Record the path before writing so a failed write is cleaned up too.
                        written_paths.append(temp_file.name)
                        temp_file.write(img_data)
                    page_images.append({
                        "path": temp_file.name,
                        "page": page_num,
                        "index": img_index,
                        "width": pix.width,
                        "height": pix.height,
                    })
                images_per_page.append(page_images)
            return images_per_page
        except Exception:
            for stale_path in written_paths:
                try:
                    os.remove(stale_path)
                except OSError:
                    pass
            return []
        finally:
            # Close the document on both the success and the failure path.
            doc.close()

    def _extract_images_pdfplumber(self, pdf_path: str) -> List[List[Dict[str, Any]]]:
        """Extract image metadata from PDF using pdfplumber (no actual bytes).

        Every entry has ``path`` set to None plus page, index, width, height
        and bbox. Returns ``[]`` on any failure.
        """
        try:
            import pdfplumber
            images_per_page = []
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_images = []
                    for img in page.images:
                        page_images.append({
                            "path": None,
                            # page_number is decremented to match the 0-based
                            # page index used by the PyMuPDF path.
                            "page": page.page_number - 1,
                            "index": len(page_images),
                            "width": img.get("width"),
                            "height": img.get("height"),
                            "bbox": (img.get("x0"), img.get("y0"), img.get("x1"), img.get("y1")),
                        })
                    images_per_page.append(page_images)
            return images_per_page
        except Exception:
            return []

    def extract_full(self, pdf_path: str) -> Optional[dict]:
        """Extract full content with text, metadata, and images.

        Returns:
            None if pdfplumber is missing, the file does not exist, or
            extraction raised; otherwise a dict with keys:
                - pages: text of every page ("" where nothing was extracted)
                - metadata: the pdfplumber document metadata ({} if absent)
                - page_count: number of pages
                - images: per-page image lists (see the _extract_images_* helpers)
        """
        if not self.is_available():
            return None
        if not os.path.exists(pdf_path):
            return None
        try:
            import pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                pages_text = [page.extract_text() or "" for page in pdf.pages]
                # Read metadata while the document is still open.
                metadata = pdf.metadata if hasattr(pdf, 'metadata') else {}

            if self._is_pymupdf_available():
                images = self._extract_images_pymupdf(pdf_path)
            else:
                images = self._extract_images_pdfplumber(pdf_path)

            return {
                "pages": pages_text,
                "metadata": metadata,
                "page_count": len(pages_text),
                "images": images,
            }
        except Exception:
            return None
|