opencode-semantic-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,344 @@
1
+ """Markdown parsing and entity extraction."""
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+
7
+ from markdown_it import MarkdownIt
8
+
9
+ from opencode_memory.models import EntityType, Memory, MemoryCategory
10
+ from opencode_memory.project import detect_project_from_path
11
+
12
+
13
@dataclass
class ParsedDocument:
    """Everything extracted from one markdown document.

    Only ``file_path`` is required. Every other attribute starts out empty
    (``None`` for the title, a fresh list/dict for the collections) so an
    instance can be created first and filled in afterwards.
    """

    # Location of the source document, stored as a plain string.
    file_path: str
    # Document title, or None when no title was found.
    title: str | None = None
    # (entity type, reference text) pairs.
    entities: list[tuple[EntityType, str]] = field(default_factory=list)
    # Memory records derived from the document.
    memories: list[Memory] = field(default_factory=list)
    # Section header -> section body text.
    sections: dict[str, str] = field(default_factory=dict)
    # (entity type, short reference, full URL) triples.
    urls: list[tuple[EntityType, str, str]] = field(default_factory=list)
    # Source-code file paths mentioned in the document.
    file_paths: list[str] = field(default_factory=list)
    # Date strings found in the document.
    dates: list[str] = field(default_factory=list)
25
+
26
+
27
# Short-form references: each entry is (entity type, regex source). The first
# capture group holds the numeric id (or the user name for PERSON).
ENTITY_PATTERNS = [
    (EntityType.MR, r"!(\d+)"),
    (EntityType.ISSUE, r"#(\d+)"),
    (EntityType.EPIC, r"&(\d+)"),
    # ``(?<!\w)`` keeps e-mail addresses such as ``bob@example.com`` from being
    # read as a mention of ``@example.com``. The name must start with a word
    # character and may not end with a dot, so a sentence-ending period
    # ("ping @alice.") is not swallowed into the name.
    (EntityType.PERSON, r"(?<!\w)@(\w[\w.-]*[\w-]|\w)"),
]

# Full gitlab.com URLs; the capture group is the numeric id of the item.
GITLAB_URL_PATTERNS = [
    (EntityType.MR, r"https?://gitlab\.com/[\w\-./]+/-/merge_requests/(\d+)"),
    (EntityType.ISSUE, r"https?://gitlab\.com/[\w\-./]+/-/issues/(\d+)"),
    (EntityType.EPIC, r"https?://gitlab\.com/groups/[\w\-./]+/-/epics/(\d+)"),
]

# Repository-relative source paths (optionally under ``ee/``) that start with
# one of the listed top-level directories and end with a listed extension.
# The path must be at the start of the text or follow whitespace, a backtick
# or a quote; group 1 is the path itself.
FILE_PATH_PATTERN = re.compile(
    r"(?:^|[\s`\"\'])((?:ee/)?(?:app|lib|spec|config|db|scripts)/[\w/\-\.]+\.(?:rb|js|ts|vue|yml|yaml|json|md))"
)

# A level-3 header whose entire text is an ISO-style date: "### YYYY-MM-DD".
DATE_HEADER_PATTERN = re.compile(r"^###\s+(\d{4}-\d{2}-\d{2})\s*$", re.MULTILINE)

# Substring -> category. A section whose (lower-cased) header contains one of
# these keywords is stored under the mapped category.
MEMORY_SECTION_KEYWORDS = {
    "blocker": MemoryCategory.BLOCKER,
    "blocked": MemoryCategory.BLOCKER,
    "decision": MemoryCategory.DECISION,
    "decided": MemoryCategory.DECISION,
    "learned": MemoryCategory.FACT,
    "lesson": MemoryCategory.FACT,
    "history": MemoryCategory.EVENT,
    "event": MemoryCategory.EVENT,
    "procedure": MemoryCategory.PROCEDURE,
    "how to": MemoryCategory.PROCEDURE,
}
58
+
59
+
60
class MarkdownParser:
    """Parse markdown files and extract entities and memories.

    Extraction is regex/line based: the title is the first ``# `` line,
    sections are delimited by ``## `` / ``### `` lines, and references, URLs,
    file paths and dates come from the module-level patterns.
    """

    def __init__(self) -> None:
        # markdown-it parser instance; the extractors in this class do not
        # call it, but it is kept as a public attribute.
        self.md = MarkdownIt()

    def parse_file(self, file_path: Path) -> ParsedDocument:
        """Read ``file_path`` and parse its content.

        The file is decoded as UTF-8 explicitly so the result does not depend
        on the platform's locale encoding.
        """
        content = file_path.read_text(encoding="utf-8")
        return self.parse_content(str(file_path), content)

    def parse_content(self, file_path: str, content: str) -> ParsedDocument:
        """Parse markdown ``content``; ``file_path`` is recorded as its source."""
        doc = ParsedDocument(file_path=file_path)

        lines = content.split("\n")
        doc.title = self._extract_title(lines)
        doc.entities = self._extract_entities(content)
        doc.urls = self._extract_urls(content)
        doc.file_paths = self._extract_file_paths(content)
        doc.dates = self._extract_dates(content)
        doc.sections = self._extract_sections(lines)
        # Hand the in-memory text over so the "no sections" fallback does not
        # need to re-read the file (which may not even exist on disk).
        doc.memories = self._extract_memories(doc, content)

        return doc

    def _extract_title(self, lines: list[str]) -> str | None:
        """Return the text of the first ``# `` line, or None if there is none."""
        return next((line[2:].strip() for line in lines if line.startswith("# ")), None)

    def _extract_entities(self, content: str) -> list[tuple[EntityType, str]]:
        """Collect unique ``(type, reference)`` pairs in first-seen order.

        Patterns are applied in the order they appear in ``ENTITY_PATTERNS``.
        """
        entities: list[tuple[EntityType, str]] = []
        seen: set[tuple[EntityType, str]] = set()

        for entity_type, pattern in ENTITY_PATTERNS:
            for match in re.finditer(pattern, content):
                if entity_type == EntityType.PERSON:
                    # Rebuild from the captured name so the reference is
                    # always "@" + name.
                    ref = f"@{match.group(1)}"
                else:
                    ref = match.group(0)
                key = (entity_type, ref)
                if key not in seen:
                    seen.add(key)
                    entities.append(key)

        return entities

    @staticmethod
    def _store_section(sections: dict[str, str], header: str, body_lines: list[str]) -> None:
        """Record a finished section, merging with an earlier same-named one."""
        body = "\n".join(body_lines).strip()
        previous = sections.get(header)
        if previous and body:
            # Repeated header: keep both bodies instead of dropping the first.
            sections[header] = f"{previous}\n\n{body}"
        elif previous is None or body:
            sections[header] = body

    def _extract_sections(self, lines: list[str]) -> dict[str, str]:
        """Map lower-cased ``##``/``###`` header text to the section body.

        Lines before the first header are not part of any section. When the
        same header text occurs more than once, the bodies are joined with a
        blank line rather than the later one replacing the earlier one.
        """
        sections: dict[str, str] = {}
        current_header: str | None = None
        current_lines: list[str] = []

        for line in lines:
            if line.startswith(("## ", "### ")):
                if current_header is not None:
                    self._store_section(sections, current_header, current_lines)
                # Everything after the first space is the header text.
                current_header = line.partition(" ")[2].strip().lower()
                current_lines = []
            elif current_header is not None:
                current_lines.append(line)

        if current_header is not None:
            self._store_section(sections, current_header, current_lines)

        return sections

    @staticmethod
    def _read_source(file_path: str) -> str:
        """Return the UTF-8 text of ``file_path``, or "" if it is not a file."""
        path = Path(file_path)
        return path.read_text(encoding="utf-8") if path.is_file() else ""

    def _extract_memories(self, doc: ParsedDocument, content: str | None = None) -> list[Memory]:
        """Build memories from a parsed document.

        Strategy:
        1. Non-empty sections whose header contains a category keyword become
           one categorized memory each.
        2. Other non-empty sections are chunked; every chunk becomes a FACT.
        3. If that produced nothing, the whole document text is chunked.

        Args:
            doc: Document whose ``sections``, ``entities``, ``title`` and
                ``file_path`` are already populated.
            content: Full document text used for step 3. When omitted, the
                text is read from ``doc.file_path`` if that file exists.

        Returns:
            Categorized memories first, then the chunked ones.
        """
        project = detect_project_from_path(doc.file_path)
        entity_refs = [ref for _, ref in doc.entities]

        categorized: list[Memory] = []
        chunked: list[Memory] = []
        for section_name, section_content in doc.sections.items():
            if not section_content.strip():
                continue

            category = self._categorize_section(section_name)
            if category:
                categorized.append(
                    Memory(
                        source_file=doc.file_path,
                        project=project,
                        category=category,
                        content=section_content,
                        what=section_name.title(),
                        entities=entity_refs,
                    )
                )
                continue

            # Chunk large sections, keep small ones whole. Only a section
            # that stayed in one piece is labelled with its header.
            chunks = self._chunk_content(section_content, section_name)
            label = section_name.title() if len(chunks) == 1 else None
            for chunk in chunks:
                chunked.append(
                    Memory(
                        source_file=doc.file_path,
                        project=project,
                        category=MemoryCategory.FACT,
                        content=chunk,
                        what=label,
                        entities=entity_refs,
                    )
                )

        memories = categorized + chunked
        if memories:
            return memories

        # Nothing came out of the sections: chunk the entire document.
        full_content = content if content is not None else self._read_source(doc.file_path)
        if not full_content.strip():
            return memories

        for i, chunk in enumerate(self._chunk_content(full_content, doc.title)):
            if doc.title is None:
                # Avoid labels such as "None (part 2)" for untitled documents.
                label = None
            elif i == 0:
                label = doc.title
            else:
                label = f"{doc.title} (part {i + 1})"
            memories.append(
                Memory(
                    source_file=doc.file_path,
                    project=project,
                    category=MemoryCategory.FACT,
                    content=chunk,
                    what=label,
                    entities=entity_refs,
                )
            )

        return memories

    @staticmethod
    def _pack_pieces(pieces: list[str], separator: str, max_size: int) -> list[str]:
        """Greedily join stripped, non-empty pieces without exceeding ``max_size``.

        A returned item is longer than ``max_size`` only when it is a single
        piece that was already too long on its own.
        """
        packed: list[str] = []
        current = ""
        for piece in pieces:
            piece = piece.strip()
            if not piece:
                continue
            if current and len(current) + len(separator) + len(piece) > max_size:
                packed.append(current)
                current = piece
            else:
                current = f"{current}{separator}{piece}" if current else piece
        if current:
            packed.append(current)
        return packed

    def _chunk_content(
        self, content: str, context: str | None = None, max_chunk_size: int = 1500
    ) -> list[str]:
        """Split content into chunks of at most ``max_chunk_size`` characters.

        Splits on ``##``/``###`` headers first, then on blank lines, then on
        sentence ends, and finally at fixed offsets, so that every returned
        chunk respects the limit.

        Args:
            content: Text to split.
            context: Currently unused; accepted for call compatibility.
            max_chunk_size: Upper bound on the length of each chunk.

        Returns:
            Stripped, non-empty chunks in document order.
        """
        if len(content) <= max_chunk_size:
            stripped = content.strip()
            return [stripped] if stripped else []

        # Split before each "##"/"###" header and pack header sections.
        header_sections = re.split(r"\n(?=#{2,3}\s)", content)
        final_chunks: list[str] = []
        for chunk in self._pack_pieces(header_sections, "\n\n", max_chunk_size):
            if len(chunk) <= max_chunk_size:
                final_chunks.append(chunk)
                continue

            # Header section still too large: fall back to paragraphs.
            paragraphs = re.split(r"\n\n+", chunk)
            for packed in self._pack_pieces(paragraphs, "\n\n", max_chunk_size):
                if len(packed) > max_chunk_size:
                    # A single oversized paragraph, wherever it sits in the
                    # section, is broken up by sentences.
                    final_chunks.extend(self._split_by_sentences(packed, max_chunk_size))
                else:
                    final_chunks.append(packed)

        return final_chunks

    def _split_by_sentences(self, text: str, max_size: int) -> list[str]:
        """Split text at sentence ends into pieces of at most ``max_size``.

        A sentence end is ``.``, ``!`` or ``?`` followed by whitespace. A
        sentence that is itself longer than ``max_size`` is cut at fixed
        offsets.

        Raises:
            ValueError: If ``max_size`` is not positive.
        """
        if max_size <= 0:
            raise ValueError("max_size must be positive")

        sentences = re.split(r"(?<=[.!?])\s+", text)

        chunks: list[str] = []
        for packed in self._pack_pieces(sentences, " ", max_size):
            if len(packed) <= max_size:
                chunks.append(packed)
                continue
            # Single very long sentence: hard-split at max_size boundaries.
            for start in range(0, len(packed), max_size):
                piece = packed[start : start + max_size].strip()
                if piece:
                    chunks.append(piece)

        return chunks

    def _categorize_section(self, section_name: str) -> MemoryCategory | None:
        """Return the category of the first keyword contained in the header.

        Matching is a case-insensitive substring test in the order of
        ``MEMORY_SECTION_KEYWORDS``; None means no keyword matched.
        """
        section_lower = section_name.lower()
        return next(
            (
                category
                for keyword, category in MEMORY_SECTION_KEYWORDS.items()
                if keyword in section_lower
            ),
            None,
        )

    def _extract_urls(self, content: str) -> list[tuple[EntityType, str, str]]:
        """Collect unique GitLab URLs as ``(type, short reference, url)``.

        The short reference is the numeric id prefixed with ``!`` for merge
        requests, ``#`` for issues and ``&`` for anything else.
        """
        prefixes = {EntityType.MR: "!", EntityType.ISSUE: "#"}
        urls: list[tuple[EntityType, str, str]] = []
        seen: set[str] = set()

        for entity_type, pattern in GITLAB_URL_PATTERNS:
            prefix = prefixes.get(entity_type, "&")
            for match in re.finditer(pattern, content):
                url = match.group(0)
                if url in seen:
                    continue
                seen.add(url)
                urls.append((entity_type, f"{prefix}{match.group(1)}", url))

        return urls

    def _extract_file_paths(self, content: str) -> list[str]:
        """Return unique file paths like ee/app/models/foo.rb, in first-seen order."""
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(m.group(1) for m in FILE_PATH_PATTERN.finditer(content)))

    def _extract_dates(self, content: str) -> list[str]:
        """Return unique dates from ``### YYYY-MM-DD`` headers, in first-seen order."""
        return list(dict.fromkeys(m.group(1) for m in DATE_HEADER_PATTERN.finditer(content)))
@@ -0,0 +1,88 @@
1
+ """File system watcher for automatic ingestion."""
2
+
3
+ import logging
4
+ import time
5
+ from collections.abc import Callable
6
+ from pathlib import Path
7
+
8
+ from watchdog.events import FileSystemEvent, FileSystemEventHandler
9
+ from watchdog.observers import Observer
10
+
11
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default debounce window in seconds; used as the default value of
# MemoryFileHandler's ``debounce_seconds`` argument.
DEFAULT_DEBOUNCE_SECONDS = 1.0
14
+
15
+
16
class MemoryFileHandler(FileSystemEventHandler):
    """Forward create/modify events for matching files to a callback.

    Events are ignored for directories, for files whose suffix is not in
    ``extensions``, for anything under a ``node_modules`` directory and for
    dot-files. Repeated events for the same path inside the debounce window
    are dropped.
    """

    def __init__(
        self,
        on_file_changed: Callable[[Path], None],
        extensions: set[str] | None = None,
        debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS,
    ):
        """Create a handler.

        Args:
            on_file_changed: Called with the path of each accepted file.
            extensions: File suffixes (including the dot) to accept; defaults
                to ``{".md"}`` when omitted or empty.
            debounce_seconds: Minimum time between two accepted events for
                the same path.
        """
        super().__init__()
        self.on_file_changed = on_file_changed
        self.extensions = extensions or {".md"}
        self.debounce_seconds = debounce_seconds
        # path -> monotonic timestamp of the last accepted event.
        self._last_processed: dict[str, float] = {}

    def _should_process(self, path: str) -> bool:
        """Return True if ``path`` passes the suffix and exclusion filters."""
        p = Path(path)
        if p.suffix not in self.extensions:
            return False
        if "node_modules" in p.parts:
            return False
        return not p.name.startswith(".")

    def _is_debounced(self, path: str) -> bool:
        """Return True if ``path`` was accepted less than the debounce window ago.

        When the event is not debounced, the current time is recorded as the
        path's last accepted time.
        """
        # Monotonic clock: wall-clock adjustments (NTP, DST, manual changes)
        # must not stretch or collapse the debounce window.
        now = time.monotonic()
        last_time = self._last_processed.get(path)
        if last_time is not None and (now - last_time) < self.debounce_seconds:
            return True
        self._last_processed[path] = now
        return False

    def _handle_change(
        self, event: FileSystemEvent, debounce_verb: str, done_verb: str, level: int
    ) -> None:
        """Filter, debounce, log and dispatch one file event."""
        if event.is_directory or not self._should_process(event.src_path):
            return
        if self._is_debounced(event.src_path):
            logger.debug("Debounced file %s: %s", debounce_verb, event.src_path)
            return
        logger.log(level, "File %s: %s", done_verb, event.src_path)
        self.on_file_changed(Path(event.src_path))

    def on_created(self, event: FileSystemEvent) -> None:
        """Handle a file-created event (logged at INFO level)."""
        self._handle_change(event, "create", "created", logging.INFO)

    def on_modified(self, event: FileSystemEvent) -> None:
        """Handle a file-modified event (logged at DEBUG level)."""
        self._handle_change(event, "modify", "modified", logging.DEBUG)
65
+
66
+
67
class FileWatcher:
    """Watch directories for file changes and trigger ingestion.

    Owns one ``Observer`` and one ``MemoryFileHandler``; every watched
    directory is scheduled recursively with that same handler.
    """

    def __init__(self, on_file_changed: Callable[[Path], None]):
        """Create a watcher that reports changed files to ``on_file_changed``."""
        self.on_file_changed = on_file_changed
        self.observer = Observer()
        self.handler = MemoryFileHandler(on_file_changed)

    def add_watch(self, path: Path) -> None:
        """Watch ``path`` recursively; paths that are not directories are skipped."""
        # is_dir() is already False for a path that does not exist.
        if not path.is_dir():
            logger.warning("Not watching %s: not an existing directory", path)
            return
        self.observer.schedule(self.handler, str(path), recursive=True)
        logger.info(f"Watching directory: {path}")

    def start(self) -> None:
        """Start the observer thread."""
        self.observer.start()

    def stop(self) -> None:
        """Stop the observer and wait for its thread to finish.

        Safe to call when ``start()`` was never called: joining a thread that
        has not been started raises ``RuntimeError``, so the join is skipped
        unless the observer thread is actually running.
        """
        self.observer.stop()
        if self.observer.is_alive():
            self.observer.join()
@@ -0,0 +1,5 @@
1
+ """Memory linking module for discovering relationships between memories."""
2
+
3
+ from opencode_memory.linking.linker import MemoryLinker
4
+
5
+ __all__ = ["MemoryLinker"]