PyPI - opencode-semantic-memory - Versions diffs - 0.1.0__py3-none-any.whl - Mend

opencode-semantic-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

opencode_memory/__init__.py +3 -0
opencode_memory/cache.py +261 -0
opencode_memory/cli.py +794 -0
opencode_memory/config.py +89 -0
opencode_memory/daemon.py +879 -0
opencode_memory/enrichment/__init__.py +0 -0
opencode_memory/enrichment/gitlab.py +237 -0
opencode_memory/extraction.py +225 -0
opencode_memory/historical_ingest.py +142 -0
opencode_memory/http_server.py +464 -0
opencode_memory/ingestion/__init__.py +7 -0
opencode_memory/ingestion/embeddings.py +211 -0
opencode_memory/ingestion/extractors.py +287 -0
opencode_memory/ingestion/opencode_db.py +448 -0
opencode_memory/ingestion/parser.py +344 -0
opencode_memory/ingestion/watcher.py +88 -0
opencode_memory/linking/__init__.py +5 -0
opencode_memory/linking/linker.py +323 -0
opencode_memory/metrics.py +273 -0
opencode_memory/models.py +171 -0
opencode_memory/project.py +86 -0
opencode_memory/query/__init__.py +5 -0
opencode_memory/query/hybrid.py +196 -0
opencode_memory/server.py +2795 -0
opencode_memory/session/__init__.py +5 -0
opencode_memory/session/registry.py +57 -0
opencode_memory/storage/__init__.py +6 -0
opencode_memory/storage/sqlite.py +1608 -0
opencode_memory/storage/vectors.py +199 -0
opencode_semantic_memory-0.1.0.dist-info/METADATA +531 -0
opencode_semantic_memory-0.1.0.dist-info/RECORD +33 -0
opencode_semantic_memory-0.1.0.dist-info/WHEEL +4 -0
opencode_semantic_memory-0.1.0.dist-info/entry_points.txt +3 -0

opencode_memory/ingestion/extractors.py ADDED Viewed

@@ -0,0 +1,287 @@
+"""Pattern-based extractors for decisions, blockers, and learnings from text."""
+import re
+from dataclasses import dataclass
+# Regexes that recognise decision statements.
+# Each entry is a (pattern, flag) tuple. Capture group 1 is the text that
+# becomes the insight content. The boolean is unpacked as `_strict` by
+# _extract_with_patterns, which never reads it.
+# The patterns are run with re.IGNORECASE | re.MULTILINE and without DOTALL,
+# so `$` also matches at the end of a line and a capture cannot span lines.
+DECISION_PATTERNS = [
+    (
+        # "<decided|chose|went with|selected|opted for> to <captured text>"
+        r"(?:we |I )?(?:decided|chose|went with|selected|opted for)\s+to\s+(.{20,300}?)(?:\.|$)",
+        True,
+    ),
+    # "decision is to <...>", "the decision: <...>"
+    (r"(?:the )?decision(?:\s+is)?\s*(?:to|:)\s+(.{20,300}?)(?:\.|$)", True),
+    # "we'll / I'll / we will / I will <...>", cut at "because", "since", "." or end of line
+    (r"(?:we'll |I'll |we will |I will )(.{20,250}?)(?:\s+because|\s+since|\.|$)", True),
+    (
+        # "going to <use|implement|...> <captured text>"; the only entry flagged False
+        r"going to\s+(?:use|implement|create|build|add|remove|change|update|fix)\s+(.{15,200}?)(?:\.|$)",
+        False,
+    ),
+]
+# Regexes that recognise blocker statements, as (pattern, flag) tuples.
+# Capture group 1 is the text kept as the insight content; the boolean is
+# unpacked as `_strict` by _extract_with_patterns and not read there.
+# extract_blockers additionally drops any hit for which
+# _is_blocker_false_positive returns True.
+BLOCKER_PATTERNS = [
+    # Explicit blocker declarations - high confidence
+    (r"(?:the )?blocker(?:\s+is)?(?:\s*:\s*|\s+)(.{20,250}?)(?:\.|$)", True),
+    (r"blocking issue(?:\s*:\s*|\s+is\s+)(.{20,250}?)(?:\.|$)", True),
+    # Blocked by pattern - common usage
+    (r"(?:we're|we are|I'm|I am)\s+blocked\s+by\s+(.{20,200}?)(?:\.|$)", True),
+    (r"blocked\s+by\s+(?:the\s+)?(.{20,200}?)(?:\.|$)", True),
+    # Waiting on/for pattern - with human context (capture the whole thing including approval/review)
+    (
+        r"[Ww]aiting\s+(?:on|for)\s+((?:approval|review|feedback|response|sign-off|authorization).{10,200}?)(?:\.|$)",
+        True,
+    ),
+    (r"(?:we're|we are|I'm|I am)\s+waiting\s+(?:on|for)\s+(.{20,200}?)(?:\.|$)", True),
+    # Can't merge/ship patterns
+    (r"[Cc]an't\s+(?:merge|ship|release|deploy)\s+(?:until|because)\s+(.{20,200}?)(?:\.|$)", True),
+    # Needs to wait pattern; the capture stops at "before", "." or end of line
+    (r"need(?:s)?\s+to\s+wait\s+(?:for\s+)?(.{20,200}?)(?:\s+before|\.|$)", True),
+]
+# Patterns that indicate something is NOT a real blocker (technical waits, code behavior).
+# _is_blocker_false_positive searches each one in the lowercased string
+# "<context> <content>" with re.IGNORECASE; a single hit discards the blocker.
+BLOCKER_FALSE_POSITIVE_PATTERNS = [
+    # A program construct (code, function, test, job, ...) is the thing waiting/blocked
+    r"(?:the |a )?(?:code|function|method|test|script|loop|task|job|process)\s+(?:is\s+)?(?:waiting|blocked)",
+    # Waiting for a response, result, callback, promise, event, ...
+    r"wait(?:ing)?\s+(?:for|on)\s+(?:the\s+)?(?:response|result|callback|promise|async|event|signal|input|output)",
+    # Waiting for the UI, DOM, page, component, ... (including render/load)
+    r"(?:need|waiting)\s+(?:for\s+)?(?:the\s+)?(?:UI|DOM|page|component|element|render|load)",
+    # Waiting on a lock, mutex, semaphore, thread or connection
+    r"(?:waiting|blocked)\s+(?:on|for)\s+(?:the\s+)?(?:lock|mutex|semaphore|thread|connection)",
+    # Awaiting a model, embedding, transport, stream or socket
+    r"(?:await|waiting)\s+(?:for\s+)?(?:the\s+)?(?:model|embedding|transport|stream|socket)",
+    # "pipeline to complete/finish/pass"
+    r"pipeline\s+to\s+(?:complete|finish|pass)",
+    # "waiting for it/this/that to complete/finish/load/ready"
+    r"(?:waiting|need)\s+(?:for\s+)?(?:it|this|that)\s+to\s+(?:complete|finish|load|ready)",
+]
+# Regexes that recognise "something was learned" statements, as
+# (pattern, flag) tuples. Capture group 1 is the text kept as the insight
+# content; the boolean is unpacked as `_strict` by _extract_with_patterns
+# and not read there. extract_learnings stores the hits under the
+# category "fact".
+LEARNING_PATTERNS = [
+    # "TIL: <...>", "today I learned <...>"
+    (r"(?:TIL|today I learned)\s*[:\s]+(.{20,250}?)(?:\.|$)", True),
+    # "discovered that / found out that / realized that <...>"
+    (r"(?:discovered|found out|realized)\s+that\s+(.{20,250}?)(?:\.|$)", True),
+    # "turns out (that) <...>"
+    (r"turns out\s+(?:that\s+)?(.{20,250}?)(?:\.|$)", True),
+    (
+        # "the key insight is that <...>", "important takeaway <...>", "key lesson <...>"
+        r"(?:the |a )?(?:key|important)\s+(?:insight|takeaway|lesson)\s+(?:is\s+)?(?:that\s+)?(.{20,250}?)(?:\.|$)",
+        True,
+    ),
+    # "I learned that / we learned that <...>"
+    (r"(?:I |we )?learned\s+that\s+(.{20,250}?)(?:\.|$)", True),
+]
+# Regexes whose presence marks a candidate as code or markup rather than
+# prose. _is_code_fragment searches them with re.MULTILINE | re.IGNORECASE,
+# so `^`/`$` anchor at every line and letter classes such as [A-Z_] also
+# match lowercase text.
+CODE_INDICATORS = [
+    r"^\s*[{}\[\]();,]",  # line starts with a bracket, paren, ';' or ','
+    r"[{}\[\]]",  # any brace or square bracket anywhere
+    r"^\s*(?:def|class|function|const|let|var|import|from|return|if|else|for|while)\s",  # line starts with a programming keyword
+    r"^\s*[A-Z_]{2,}\s*[=:]",  # NAME = ... / NAME: ... at line start
+    r"^\s*\w+\s*=\s*[{\[\(]",  # assignment whose value opens a bracket
+    r"^\s*#\s*\w+",  # '#' followed by a word (presumably a comment or heading)
+    r"^\s*//",  # line starts with '//'
+    r"^\s*\*\s",  # line starts with '* '
+    r"^\s*-\s*\[",  # line starts with '- [' (presumably a markdown checkbox or link item)
+    r"^\s*```",  # line starts with a triple backtick
+    r"^\s*\|",  # line starts with '|'
+    r"^\s*>",  # line starts with '>'
+    r"TEXT,?\s*--",  # 'TEXT' then '--' (looks like a SQL column with a trailing comment; TODO confirm)
+    r"^\s*\d+:\s",  # line starts with '<digits>: '
+    r"\.(?:rb|py|js|ts|go|rs|java|cpp|c|h|yml|yaml|json|md|txt)\s*$",  # line ends with a listed file extension
+    r":\s*\w+\s*}",  # ': word }' sequence
+]
+# Regexes describing candidates too trivial to keep. _is_noise tests each
+# one against the whitespace-stripped text, ignoring case.
+NOISE_PATTERNS = [
+    r"^(?:fix|update|add|remove|change|check|test|run|see|look|try|use)\s+\w+$",  # bare "<verb> <word>"
+    r"^\w+\s+\w+$",  # exactly two words
+    r"^the\s+\w+$",  # "the <word>"
+    r"^\d+",  # starts with a digit
+    r"^[^a-zA-Z]*$",  # contains no ASCII letters at all
+    r'["\']$',  # a quote character at the end (the only entry without a leading '^')
+    r"^\s*$",  # empty or whitespace only
+    r"^it is\s+",  # starts with "it is "
+]
+# Regexes that mark a candidate as a broken fragment. _is_fragment searches
+# them in the stripped text with no regex flags, so the [a-z] classes are
+# case-sensitive and `^` anchors only at the start of the text.
+FRAGMENT_INDICATORS = [
+    r"^\*+\s",  # starts with '*' run and a space
+    r"^-+\s",  # starts with '-' run and a space
+    r"^\|",  # starts with '|'
+    r"^>",  # starts with '>'
+    r"^#",  # starts with '#'
+    r"^\d+\.\s",  # starts with '<digits>. '
+    r"^\d+\)\s",  # starts with '<digits>) '
+    r"^[a-z]+\.\s",  # starts with a lowercase word followed by '. '
+    r"^[a-z]+,\s",  # starts with a lowercase word followed by ', '
+    r"\*\*\s*$",  # ends with '**'
+    r"\*\*\s*-",  # '**' followed by '-'
+    r"\s+\+\s+",  # ' + ' surrounded by whitespace
+    r"\s+-\s+",  # ' - ' surrounded by whitespace
+]
+# Openings that suggest the capture began mid-sentence. _is_fragment applies
+# them with re.match and re.IGNORECASE to the stripped text.
+BROKEN_START_PATTERNS = [
+    r"^and\s+(?:the|a|if|when|also)\b",  # "and the ...", "and if ...", ...
+    r"^or\s+(?:the|a|if|when|also)\b",  # "or the ...", "or when ...", ...
+    r"^but\s+(?:the|a|now|if|when)\b",  # "but the ...", "but now ...", ...
+    r"^[a-z]+\s+and\s+if\b",  # "<word> and if ..."
+]
+# Size limits enforced by _is_valid_insight on the cleaned content.
+# Fewer whitespace-separated words than this is rejected.
+MIN_WORD_COUNT = 4
+# Character-length bounds; content shorter or longer than these is rejected.
+MIN_CONTENT_LENGTH = 25
+MAX_CONTENT_LENGTH = 400
+@dataclass
+class ExtractedInsight:
+    """A single statement pulled out of free text by the pattern extractors."""
+    # Label supplied by the extractor: "decision", "blocker" or "fact".
+    category: str
+    # The cleaned text of the regex capture group.
+    content: str
+    # The match plus up to 100 characters of source text on either side, stripped.
+    context: str
+def _is_code_fragment(text: str) -> bool:
+    """Return True if any CODE_INDICATORS regex is found somewhere in *text*.
+
+    The search is case-insensitive and multi-line, so `^` and `$` in the
+    indicator patterns anchor at every line of *text*.
+    """
+    flags = re.MULTILINE | re.IGNORECASE
+    return any(re.search(indicator, text, flags) for indicator in CODE_INDICATORS)
+def _is_noise(text: str) -> bool:
+    """Return True if the stripped *text* hits any NOISE_PATTERNS regex.
+
+    Matching is case-insensitive and performed on ``text.strip()``.
+    """
+    stripped = text.strip()
+    # re.search, not re.match: re.match only tries position 0, which made the
+    # unanchored trailing-quote pattern fire solely on a one-character
+    # string. Every other pattern starts with `^`, so it behaves the same
+    # under search (no MULTILINE flag is used here).
+    return any(re.search(pattern, stripped, re.IGNORECASE) for pattern in NOISE_PATTERNS)
+def _clean_content(text: str) -> str:
+    """Tidy a raw regex capture into a single-line string.
+
+    Runs of whitespace collapse to one space, leading non-word characters
+    are removed, and trailing characters other than word characters,
+    '.', '!' and '?' are removed.
+    """
+    collapsed = re.sub(r"\s+", " ", text)
+    without_lead = re.sub(r"^[^\w]+", "", collapsed)
+    without_tail = re.sub(r"[^\w.!?]+$", "", without_lead)
+    return without_tail.strip()
+def _is_fragment(text: str) -> bool:
+    """Return True if *text* looks like a broken piece rather than a sentence.
+
+    After stripping, text counts as a fragment when it is empty, contains
+    any FRAGMENT_INDICATORS pattern, begins with any BROKEN_START_PATTERNS
+    pattern (case-insensitive), or ends with a semicolon.
+    """
+    stripped = text.strip()
+    if not stripped:
+        return True
+    if any(re.search(marker, stripped) for marker in FRAGMENT_INDICATORS):
+        return True
+    if any(re.match(opener, stripped, re.IGNORECASE) for opener in BROKEN_START_PATTERNS):
+        return True
+    return stripped.endswith(";")
+def _is_valid_insight(content: str) -> bool:
+    """Return True if *content* passes every quality gate for storage.
+
+    Gates, in order: character length within
+    [MIN_CONTENT_LENGTH, MAX_CONTENT_LENGTH], at least MIN_WORD_COUNT
+    whitespace-separated words, not a code fragment, not noise, not a
+    broken fragment, and at least half of its characters alphabetic.
+    """
+    length = len(content)
+    if not MIN_CONTENT_LENGTH <= length <= MAX_CONTENT_LENGTH:
+        return False
+    if len(content.split()) < MIN_WORD_COUNT:
+        return False
+    if _is_code_fragment(content) or _is_noise(content) or _is_fragment(content):
+        return False
+    letters = sum(map(str.isalpha, content))
+    return letters >= length * 0.5
+def _normalize_for_dedup(text: str) -> str:
+    """Return the distinct lowercase words of *text*, sorted and space-joined."""
+    unique_tokens = {token for token in re.findall(r"\w+", text.lower())}
+    ordered = sorted(unique_tokens)
+    return " ".join(ordered)
+def _is_duplicate(content: str, seen_contents: set[str]) -> bool:
+    """Return True if *content* largely overlaps an entry of *seen_contents*.
+
+    Both sides are reduced to their sets of lowercase words of three or
+    more characters. A previously seen entry counts as a duplicate when the
+    shared words exceed 60% of the smaller of the two sets. Entries (or a
+    candidate) with no such words never match.
+    """
+    candidate = set(re.findall(r"\w{3,}", content.lower()))
+    if not candidate:
+        # Nothing to compare against any earlier entry.
+        return False
+    for earlier in seen_contents:
+        earlier_words = set(re.findall(r"\w{3,}", earlier.lower()))
+        if not earlier_words:
+            continue
+        overlap = len(candidate & earlier_words)
+        if overlap / min(len(candidate), len(earlier_words)) > 0.6:
+            return True
+    return False
+def _extract_with_patterns(
+    text: str, patterns: list[tuple[str, bool]], category: str
+) -> list[ExtractedInsight]:
+    """Run every regex in *patterns* over *text* and collect accepted hits.
+
+    For each match, capture group 1 is cleaned, validated and checked
+    against the contents already accepted during this call. Surviving hits
+    become ExtractedInsight objects tagged with *category*, carrying the
+    match plus up to 100 characters of surrounding text as context. The
+    boolean in each pattern tuple is ignored.
+    """
+    flags = re.IGNORECASE | re.MULTILINE
+    accepted: set[str] = set()
+    found = []
+    for regex, _unused_flag in patterns:
+        for hit in re.finditer(regex, text, flags):
+            cleaned = _clean_content(hit.group(1))
+            if not _is_valid_insight(cleaned) or _is_duplicate(cleaned, accepted):
+                continue
+            accepted.add(cleaned)
+            # Context window: 100 characters either side, clamped to the text.
+            window_start = max(0, hit.start() - 100)
+            window_end = min(len(text), hit.end() + 100)
+            found.append(
+                ExtractedInsight(
+                    category=category,
+                    content=cleaned,
+                    context=text[window_start:window_end].strip(),
+                )
+            )
+    return found
+def extract_decisions(text: str) -> list[ExtractedInsight]:
+    """Return the insights in *text* matched by DECISION_PATTERNS, tagged "decision"."""
+    decisions = _extract_with_patterns(text, DECISION_PATTERNS, "decision")
+    return decisions
+def _is_blocker_false_positive(content: str, context: str) -> bool:
+    """Return True if a blocker hit matches a known false-positive pattern.
+
+    The context and content are joined with a space, lowercased, and
+    searched case-insensitively for each BLOCKER_FALSE_POSITIVE_PATTERNS
+    regex.
+    """
+    haystack = f"{context} {content}".lower()
+    return any(
+        re.search(exclusion, haystack, re.IGNORECASE)
+        for exclusion in BLOCKER_FALSE_POSITIVE_PATTERNS
+    )
+def extract_blockers(text: str) -> list[ExtractedInsight]:
+    """Return the insights in *text* matched by BLOCKER_PATTERNS, tagged "blocker".
+
+    Hits that _is_blocker_false_positive flags are left out.
+    """
+    kept = []
+    for candidate in _extract_with_patterns(text, BLOCKER_PATTERNS, "blocker"):
+        if _is_blocker_false_positive(candidate.content, candidate.context):
+            continue
+        kept.append(candidate)
+    return kept
+def extract_learnings(text: str) -> list[ExtractedInsight]:
+    """Return the insights in *text* matched by LEARNING_PATTERNS, tagged "fact"."""
+    learnings = _extract_with_patterns(text, LEARNING_PATTERNS, "fact")
+    return learnings
+def extract_all_insights(text: str) -> list[ExtractedInsight]:
+    """Return decisions, then blockers, then learnings found in *text*, as one list."""
+    return [
+        *extract_decisions(text),
+        *extract_blockers(text),
+        *extract_learnings(text),
+    ]