PyPI - opencode-semantic-memory - Versions diffs - 0.1.0__py3-none-any.whl - Mend

opencode-semantic-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

opencode_memory/__init__.py +3 -0
opencode_memory/cache.py +261 -0
opencode_memory/cli.py +794 -0
opencode_memory/config.py +89 -0
opencode_memory/daemon.py +879 -0
opencode_memory/enrichment/__init__.py +0 -0
opencode_memory/enrichment/gitlab.py +237 -0
opencode_memory/extraction.py +225 -0
opencode_memory/historical_ingest.py +142 -0
opencode_memory/http_server.py +464 -0
opencode_memory/ingestion/__init__.py +7 -0
opencode_memory/ingestion/embeddings.py +211 -0
opencode_memory/ingestion/extractors.py +287 -0
opencode_memory/ingestion/opencode_db.py +448 -0
opencode_memory/ingestion/parser.py +344 -0
opencode_memory/ingestion/watcher.py +88 -0
opencode_memory/linking/__init__.py +5 -0
opencode_memory/linking/linker.py +323 -0
opencode_memory/metrics.py +273 -0
opencode_memory/models.py +171 -0
opencode_memory/project.py +86 -0
opencode_memory/query/__init__.py +5 -0
opencode_memory/query/hybrid.py +196 -0
opencode_memory/server.py +2795 -0
opencode_memory/session/__init__.py +5 -0
opencode_memory/session/registry.py +57 -0
opencode_memory/storage/__init__.py +6 -0
opencode_memory/storage/sqlite.py +1608 -0
opencode_memory/storage/vectors.py +199 -0
opencode_semantic_memory-0.1.0.dist-info/METADATA +531 -0
opencode_semantic_memory-0.1.0.dist-info/RECORD +33 -0
opencode_semantic_memory-0.1.0.dist-info/WHEEL +4 -0
opencode_semantic_memory-0.1.0.dist-info/entry_points.txt +3 -0

opencode_memory/ingestion/parser.py ADDED Viewed

@@ -0,0 +1,344 @@
+"""Markdown parsing and entity extraction."""
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from markdown_it import MarkdownIt
+from opencode_memory.models import EntityType, Memory, MemoryCategory
+from opencode_memory.project import detect_project_from_path
+@dataclass
+class ParsedDocument:
+    """Result of parsing a markdown document.
+    Only ``file_path`` is required; every other field defaults to ``None``
+    or an empty container and is filled in by ``MarkdownParser.parse_content``.
+    """
+    # Path of the source document, kept as a string.
+    file_path: str
+    # Text of the first "# " heading, or None when the document has none.
+    title: str | None = None
+    # Distinct (entity type, reference) pairs in first-seen order; the
+    # reference keeps its sigil, e.g. "!12", "#34", "&5", "@name".
+    entities: list[tuple[EntityType, str]] = field(default_factory=list)
+    # Memory records built from the sections (or from the whole file).
+    memories: list[Memory] = field(default_factory=list)
+    # Lower-cased "##"/"###" heading text mapped to that section's stripped body.
+    sections: dict[str, str] = field(default_factory=dict)
+    # (entity type, short reference, full URL) for each distinct GitLab URL.
+    urls: list[tuple[EntityType, str, str]] = field(default_factory=list)
+    # Distinct source-file paths mentioned in the text (see FILE_PATH_PATTERN).
+    file_paths: list[str] = field(default_factory=list)
+    # Distinct dates, as "YYYY-MM-DD" strings, taken from "### YYYY-MM-DD" headings.
+    dates: list[str] = field(default_factory=list)
+# Short-form references recognised in text. Each entry is (entity type, regex);
+# the regex's single group captures the numeric id or the handle that follows
+# the sigil.
+# NOTE(review): the patterns are unanchored, so for example the PERSON pattern
+# also matches the domain part of an e-mail address -- confirm this is intended.
+ENTITY_PATTERNS = [
+    (EntityType.MR, r"!(\d+)"),  # "!123"
+    (EntityType.ISSUE, r"#(\d+)"),  # "#123"
+    (EntityType.EPIC, r"&(\d+)"),  # "&123"
+    (EntityType.PERSON, r"@([\w\.-]+)"),  # "@some.user-name"
+]
+# Full gitlab.com URLs. Each entry is (entity type, regex); group 1 is the
+# numeric id that ends the URL. Only the EPIC pattern requires a "groups/" prefix.
+GITLAB_URL_PATTERNS = [
+    (EntityType.MR, r"https?://gitlab\.com/[\w\-./]+/-/merge_requests/(\d+)"),
+    (EntityType.ISSUE, r"https?://gitlab\.com/[\w\-./]+/-/issues/(\d+)"),
+    (EntityType.EPIC, r"https?://gitlab\.com/groups/[\w\-./]+/-/epics/(\d+)"),
+]
+FILE_PATH_PATTERN = re.compile(
+    r"(?:^|[\s`\"\'])((?:ee/)?(?:app|lib|spec|config|db|scripts)/[\w/\-\.]+\.(?:rb|js|ts|vue|yml|yaml|json|md))"
+)
+DATE_HEADER_PATTERN = re.compile(r"^###\s+(\d{4}-\d{2}-\d{2})\s*$", re.MULTILINE)
+# Keyword -> category table consulted by MarkdownParser._categorize_section,
+# which checks these keywords against a lower-cased section heading in the
+# insertion order below and returns the category of the first hit.
+MEMORY_SECTION_KEYWORDS = {
+    "blocker": MemoryCategory.BLOCKER,
+    "blocked": MemoryCategory.BLOCKER,
+    "decision": MemoryCategory.DECISION,
+    "decided": MemoryCategory.DECISION,
+    "learned": MemoryCategory.FACT,
+    "lesson": MemoryCategory.FACT,
+    "history": MemoryCategory.EVENT,
+    "event": MemoryCategory.EVENT,
+    "procedure": MemoryCategory.PROCEDURE,
+    "how to": MemoryCategory.PROCEDURE,
+}
+class MarkdownParser:
+    """Parse markdown files and extract entities and memories."""
+    def __init__(self) -> None:
+        """Create the parser with a default ``MarkdownIt`` instance."""
+        # NOTE(review): no method of this class reads ``self.md``; the
+        # extraction methods work on raw lines and regular expressions.
+        self.md = MarkdownIt()
+    def parse_file(self, file_path: Path) -> ParsedDocument:
+        """Parse a markdown file."""
+        content = file_path.read_text()
+        return self.parse_content(str(file_path), content)
+    def parse_content(self, file_path: str, content: str) -> ParsedDocument:
+        """Parse markdown content."""
+        doc = ParsedDocument(file_path=file_path)
+        lines = content.split("\n")
+        doc.title = self._extract_title(lines)
+        doc.entities = self._extract_entities(content)
+        doc.urls = self._extract_urls(content)
+        doc.file_paths = self._extract_file_paths(content)
+        doc.dates = self._extract_dates(content)
+        doc.sections = self._extract_sections(lines)
+        doc.memories = self._extract_memories(doc)
+        return doc
+    def _extract_title(self, lines: list[str]) -> str | None:
+        """Extract the document title (first H1)."""
+        for line in lines:
+            if line.startswith("# "):
+                return line[2:].strip()
+        return None
+    def _extract_entities(self, content: str) -> list[tuple[EntityType, str]]:
+        """Extract entity references from content."""
+        entities: list[tuple[EntityType, str]] = []
+        seen: set[tuple[EntityType, str]] = set()
+        for entity_type, pattern in ENTITY_PATTERNS:
+            for match in re.finditer(pattern, content):
+                ref = match.group(0) if entity_type == EntityType.PERSON else match.group(0)
+                if entity_type == EntityType.PERSON:
+                    ref = f"@{match.group(1)}"
+                key = (entity_type, ref)
+                if key not in seen:
+                    seen.add(key)
+                    entities.append(key)
+        return entities
+    def _extract_sections(self, lines: list[str]) -> dict[str, str]:
+        """Extract sections by header."""
+        sections: dict[str, str] = {}
+        current_header: str | None = None
+        current_content: list[str] = []
+        for line in lines:
+            if line.startswith("## "):
+                if current_header is not None:
+                    sections[current_header] = "\n".join(current_content).strip()
+                current_header = line[3:].strip().lower()
+                current_content = []
+            elif line.startswith("### "):
+                if current_header is not None:
+                    sections[current_header] = "\n".join(current_content).strip()
+                current_header = line[4:].strip().lower()
+                current_content = []
+            elif current_header is not None:
+                current_content.append(line)
+        if current_header is not None:
+            sections[current_header] = "\n".join(current_content).strip()
+        return sections
+    def _extract_memories(self, doc: ParsedDocument) -> list[Memory]:
+        """Extract memories from parsed document.
+        Strategy:
+        1. If sections have category keywords, extract those as categorized memories
+        2. Otherwise, chunk the full content into semantic units (by headers/paragraphs)
+        3. Each chunk becomes a searchable memory with the file as source
+        """
+        memories: list[Memory] = []
+        project = detect_project_from_path(doc.file_path)
+        entity_refs = [ref for _, ref in doc.entities]
+        # First pass: extract categorized sections
+        categorized_sections = set()
+        for section_name, section_content in doc.sections.items():
+            if not section_content.strip():
+                continue
+            category = self._categorize_section(section_name)
+            if category:
+                categorized_sections.add(section_name)
+                memories.append(
+                    Memory(
+                        source_file=doc.file_path,
+                        project=project,
+                        category=category,
+                        content=section_content,
+                        what=section_name.title(),
+                        entities=entity_refs,
+                    )
+                )
+        # Second pass: chunk remaining content (sections without category keywords)
+        for section_name, section_content in doc.sections.items():
+            if section_name in categorized_sections or not section_content.strip():
+                continue
+            # Chunk large sections, keep small ones whole
+            chunks = self._chunk_content(section_content, section_name)
+            for chunk in chunks:
+                memories.append(
+                    Memory(
+                        source_file=doc.file_path,
+                        project=project,
+                        category=MemoryCategory.FACT,
+                        content=chunk,
+                        what=section_name.title() if len(chunks) == 1 else None,
+                        entities=entity_refs,
+                    )
+                )
+        # If no sections found, chunk the entire file content
+        if not memories:
+            full_content = Path(doc.file_path).read_text() if Path(doc.file_path).exists() else ""
+            if full_content.strip():
+                chunks = self._chunk_content(full_content, doc.title)
+                for i, chunk in enumerate(chunks):
+                    memories.append(
+                        Memory(
+                            source_file=doc.file_path,
+                            project=project,
+                            category=MemoryCategory.FACT,
+                            content=chunk,
+                            what=doc.title if i == 0 else f"{doc.title} (part {i + 1})",
+                            entities=entity_refs,
+                        )
+                    )
+        return memories
+    def _chunk_content(
+        self, content: str, context: str | None = None, max_chunk_size: int = 1500
+    ) -> list[str]:
+        """Split content into semantic chunks.
+        Tries to split on natural boundaries (headers, blank lines, paragraphs)
+        while keeping chunks under max_chunk_size characters.
+        """
+        if len(content) <= max_chunk_size:
+            return [content.strip()] if content.strip() else []
+        chunks: list[str] = []
+        # Try splitting by headers first (## or ###)
+        header_pattern = re.compile(r"\n(?=#{2,3}\s)")
+        sections = header_pattern.split(content)
+        current_chunk = ""
+        for section in sections:
+            section = section.strip()
+            if not section:
+                continue
+            # If adding this section would exceed limit, save current and start new
+            if current_chunk and len(current_chunk) + len(section) + 2 > max_chunk_size:
+                chunks.append(current_chunk.strip())
+                current_chunk = section
+            else:
+                current_chunk = current_chunk + "\n\n" + section if current_chunk else section
+        if current_chunk.strip():
+            chunks.append(current_chunk.strip())
+        # If still too large, split by paragraphs (double newlines)
+        final_chunks: list[str] = []
+        for chunk in chunks:
+            if len(chunk) <= max_chunk_size:
+                final_chunks.append(chunk)
+            else:
+                # Split by paragraphs
+                paragraphs = re.split(r"\n\n+", chunk)
+                current = ""
+                for para in paragraphs:
+                    para = para.strip()
+                    if not para:
+                        continue
+                    if current and len(current) + len(para) + 2 > max_chunk_size:
+                        final_chunks.append(current.strip())
+                        current = para
+                    else:
+                        current = current + "\n\n" + para if current else para
+                if current.strip():
+                    # If a single paragraph is still too large, split by sentences
+                    if len(current) > max_chunk_size:
+                        final_chunks.extend(self._split_by_sentences(current, max_chunk_size))
+                    else:
+                        final_chunks.append(current.strip())
+        return final_chunks
+    def _split_by_sentences(self, text: str, max_size: int) -> list[str]:
+        """Split text by sentences when paragraphs are too large."""
+        # Simple sentence splitting (period/question/exclamation followed by space)
+        sentences = re.split(r"(?<=[.!?])\s+", text)
+        chunks: list[str] = []
+        current = ""
+        for sentence in sentences:
+            if current and len(current) + len(sentence) + 1 > max_size:
+                chunks.append(current.strip())
+                current = sentence
+            else:
+                current = current + " " + sentence if current else sentence
+        if current.strip():
+            # If still too large (single very long sentence), just truncate
+            if len(current) > max_size:
+                # Split at max_size boundaries
+                while current:
+                    chunks.append(current[:max_size].strip())
+                    current = current[max_size:]
+            else:
+                chunks.append(current.strip())
+        return chunks
+    def _categorize_section(self, section_name: str) -> MemoryCategory | None:
+        """Determine the memory category for a section."""
+        section_lower = section_name.lower()
+        for keyword, category in MEMORY_SECTION_KEYWORDS.items():
+            if keyword in section_lower:
+                return category
+        return None
+    def _extract_urls(self, content: str) -> list[tuple[EntityType, str, str]]:
+        """Extract GitLab URLs and their entity references."""
+        urls: list[tuple[EntityType, str, str]] = []
+        seen: set[str] = set()
+        for entity_type, pattern in GITLAB_URL_PATTERNS:
+            for match in re.finditer(pattern, content):
+                url = match.group(0)
+                entity_id = match.group(1)
+                if url not in seen:
+                    seen.add(url)
+                    if entity_type == EntityType.MR:
+                        ref = f"!{entity_id}"
+                    elif entity_type == EntityType.ISSUE:
+                        ref = f"#{entity_id}"
+                    else:
+                        ref = f"&{entity_id}"
+                    urls.append((entity_type, ref, url))
+        return urls
+    def _extract_file_paths(self, content: str) -> list[str]:
+        """Extract file paths like ee/app/models/foo.rb."""
+        paths: list[str] = []
+        seen: set[str] = set()
+        for match in FILE_PATH_PATTERN.finditer(content):
+            path = match.group(1)
+            if path not in seen:
+                seen.add(path)
+                paths.append(path)
+        return paths
+    def _extract_dates(self, content: str) -> list[str]:
+        """Extract dates from ### YYYY-MM-DD style headers."""
+        dates: list[str] = []
+        seen: set[str] = set()
+        for match in DATE_HEADER_PATTERN.finditer(content):
+            date = match.group(1)
+            if date not in seen:
+                seen.add(date)
+                dates.append(date)
+        return dates

opencode_memory/ingestion/watcher.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""File system watcher for automatic ingestion."""
+import logging
+import time
+from collections.abc import Callable
+from pathlib import Path
+from watchdog.events import FileSystemEvent, FileSystemEventHandler
+from watchdog.observers import Observer
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Default minimum interval, in seconds, between two processed events for the
+# same path; used as the default for MemoryFileHandler's debounce_seconds.
+DEFAULT_DEBOUNCE_SECONDS = 1.0
+class MemoryFileHandler(FileSystemEventHandler):
+    """Handle file system events for memory ingestion."""
+    def __init__(
+        self,
+        on_file_changed: Callable[[Path], None],
+        extensions: set[str] | None = None,
+        debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS,
+    ):
+        self.on_file_changed = on_file_changed
+        self.extensions = extensions or {".md"}
+        self.debounce_seconds = debounce_seconds
+        self._last_processed: dict[str, float] = {}
+    def _should_process(self, path: str) -> bool:
+        """Check if we should process this file."""
+        p = Path(path)
+        if p.suffix not in self.extensions:
+            return False
+        if "node_modules" in p.parts:
+            return False
+        if p.name.startswith("."):
+            return False
+        return True
+    def _is_debounced(self, path: str) -> bool:
+        """Check if this file was processed recently (debounce)."""
+        now = time.time()
+        last_time = self._last_processed.get(path)
+        if last_time is not None and (now - last_time) < self.debounce_seconds:
+            return True
+        self._last_processed[path] = now
+        return False
+    def on_created(self, event: FileSystemEvent) -> None:
+        if not event.is_directory and self._should_process(event.src_path):
+            if self._is_debounced(event.src_path):
+                logger.debug(f"Debounced file create: {event.src_path}")
+                return
+            logger.info(f"File created: {event.src_path}")
+            self.on_file_changed(Path(event.src_path))
+    def on_modified(self, event: FileSystemEvent) -> None:
+        if not event.is_directory and self._should_process(event.src_path):
+            if self._is_debounced(event.src_path):
+                logger.debug(f"Debounced file modify: {event.src_path}")
+                return
+            logger.debug(f"File modified: {event.src_path}")
+            self.on_file_changed(Path(event.src_path))
+class FileWatcher:
+    """Watch directories for file changes and trigger ingestion."""
+    def __init__(self, on_file_changed: Callable[[Path], None]):
+        self.on_file_changed = on_file_changed
+        self.observer = Observer()
+        self.handler = MemoryFileHandler(on_file_changed)
+    def add_watch(self, path: Path) -> None:
+        """Add a directory to watch."""
+        if path.exists() and path.is_dir():
+            self.observer.schedule(self.handler, str(path), recursive=True)
+            logger.info(f"Watching directory: {path}")
+    def start(self) -> None:
+        """Start watching."""
+        self.observer.start()
+    def stop(self) -> None:
+        """Stop watching."""
+        self.observer.stop()
+        self.observer.join()

opencode_memory/linking/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Memory linking module for discovering relationships between memories."""
+from opencode_memory.linking.linker import MemoryLinker
+__all__ = ["MemoryLinker"]