@dinasor/mnemo-cli 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/CHANGELOG.md +46 -0
  2. package/LICENSE +21 -0
  3. package/README.md +263 -0
  4. package/VERSION +1 -0
  5. package/bin/mnemo.js +139 -0
  6. package/memory.ps1 +178 -0
  7. package/memory_mac.sh +2447 -0
  8. package/package.json +36 -0
  9. package/scripts/memory/installer/bootstrap.ps1 +21 -0
  10. package/scripts/memory/installer/core/bridge.ps1 +285 -0
  11. package/scripts/memory/installer/core/io.ps1 +110 -0
  12. package/scripts/memory/installer/core/paths.ps1 +83 -0
  13. package/scripts/memory/installer/features/gitignore_setup.ps1 +80 -0
  14. package/scripts/memory/installer/features/hooks_setup.ps1 +157 -0
  15. package/scripts/memory/installer/features/mcp_setup.ps1 +87 -0
  16. package/scripts/memory/installer/features/memory_scaffold.ps1 +541 -0
  17. package/scripts/memory/installer/features/vector_setup.ps1 +103 -0
  18. package/scripts/memory/installer/templates/add-journal-entry.ps1 +122 -0
  19. package/scripts/memory/installer/templates/add-lesson.ps1 +151 -0
  20. package/scripts/memory/installer/templates/autonomy/__init__.py +6 -0
  21. package/scripts/memory/installer/templates/autonomy/context_safety.py +181 -0
  22. package/scripts/memory/installer/templates/autonomy/entity_resolver.py +215 -0
  23. package/scripts/memory/installer/templates/autonomy/ingest_pipeline.py +252 -0
  24. package/scripts/memory/installer/templates/autonomy/lifecycle_engine.py +254 -0
  25. package/scripts/memory/installer/templates/autonomy/policies.yaml +59 -0
  26. package/scripts/memory/installer/templates/autonomy/reranker.py +220 -0
  27. package/scripts/memory/installer/templates/autonomy/retrieval_router.py +148 -0
  28. package/scripts/memory/installer/templates/autonomy/runner.py +272 -0
  29. package/scripts/memory/installer/templates/autonomy/schema.py +150 -0
  30. package/scripts/memory/installer/templates/autonomy/vault_policy.py +205 -0
  31. package/scripts/memory/installer/templates/build-memory-sqlite.py +111 -0
  32. package/scripts/memory/installer/templates/clear-active.ps1 +55 -0
  33. package/scripts/memory/installer/templates/customization.md +84 -0
  34. package/scripts/memory/installer/templates/lint-memory.ps1 +217 -0
  35. package/scripts/memory/installer/templates/mnemo_vector.py +556 -0
  36. package/scripts/memory/installer/templates/query-memory-sqlite.py +95 -0
  37. package/scripts/memory/installer/templates/query-memory.ps1 +122 -0
  38. package/scripts/memory/installer/templates/rebuild-memory-index.ps1 +293 -0
@@ -0,0 +1,122 @@
1
+ <#
2
+ add-journal-entry.ps1
3
+ Adds a journal entry to the current month's journal file.
4
+ Ensures only one heading per date (appends to existing date if present).
5
+ Tags are canonicalized against tag-vocabulary.md.
6
+ BOM-safe file reading.
7
+
8
+ USAGE:
9
+ powershell -File .\scripts\memory\add-journal-entry.ps1 -Tags "UI,Fix" -Title "Fixed button alignment"
10
+ powershell -File .\scripts\memory\add-journal-entry.ps1 -Tags "Build" -Title "Updated dependencies" -Files "package.json"
11
+ #>
12
+
13
[CmdletBinding()]
param(
    # Comma-separated tag list, e.g. "UI,Fix".
    [Parameter(Mandatory=$true)][string]$Tags,
    # One-line title of the journal entry.
    [Parameter(Mandatory=$true)][string]$Title,
    # Optional comma-separated list of key files to mention under the entry.
    [string]$Files = "",
    # Optional rationale, emitted as a "Why:" sub-bullet.
    [string]$Why = "",
    # Entry date. The script slices the first 7 characters to pick the monthly
    # journal file, so anything other than yyyy-MM-dd is rejected up front
    # instead of silently producing a bogus file name.
    [ValidatePattern('^\d{4}-\d{2}-\d{2}$')]
    [string]$Date = (Get-Date -Format "yyyy-MM-dd")
)

Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

# The repository root is two levels above the script's own folder; when the
# script location is unavailable, fall back to the current working directory.
if ($PSScriptRoot) {
    $RepoRoot = (Resolve-Path (Join-Path $PSScriptRoot "..\..")).Path
} else {
    $RepoRoot = (Get-Location).Path
}
30
+
31
function Resolve-MnemoMemoryDir([string]$Root) {
    # Returns the memory directory under $Root: ".mnemo\memory" when it exists,
    # otherwise ".cursor\memory" when that exists. When neither exists yet the
    # ".mnemo\memory" path is returned (it is not created here).
    $primary = Join-Path $Root ".mnemo\memory"
    $secondary = Join-Path $Root ".cursor\memory"

    if (Test-Path -LiteralPath $primary) { return $primary }
    if (Test-Path -LiteralPath $secondary) { return $secondary }
    return $primary
}
41
+
42
# Locate the memory folder and the files this script works with.
$MemoryDir = Resolve-MnemoMemoryDir -Root $RepoRoot
$JournalDir = Join-Path -Path $MemoryDir -ChildPath "journal"
$TagVocabPath = Join-Path -Path $MemoryDir -ChildPath "tag-vocabulary.md"

# Make sure the journal folder exists before any file is written into it.
if (-not (Test-Path $JournalDir)) {
    $null = New-Item -ItemType Directory -Force -Path $JournalDir
}
47
+
48
function ReadText([string]$p) {
    # Reads the whole file at $p as UTF-8 text and strips a leading BOM
    # character (U+FEFF) if one survived decoding.
    # Errors from Get-Content (e.g. missing file) are terminating.
    $t = Get-Content -Raw -Encoding UTF8 -ErrorAction Stop $p

    # Get-Content -Raw produces no output for a zero-byte file, which leaves
    # $t as $null; hand callers an empty string so .Length/.TrimEnd()/regex
    # calls on the result do not fail.
    if ($null -eq $t) { return "" }

    if ($t.Length -gt 0 -and [int]$t[0] -eq 0xFEFF) { $t = $t.Substring(1) }
    return $t
}
53
+
54
# --- Tag vocabulary -----------------------------------------------------------
# Canonical tags are the "- [Tag]" bullets in tag-vocabulary.md. Keys are
# lower-cased for case-insensitive lookup; values keep the canonical spelling.
$canonTags = @{}
if (Test-Path $TagVocabPath) {
    # Cast guards against a $null result for an empty vocabulary file.
    $tv = [string](ReadText $TagVocabPath)
    foreach ($m in [regex]::Matches($tv, '(?m)^\-\s+\[([^\]]+)\]')) {
        $canon = $m.Groups[1].Value.Trim()
        $canonTags[$canon.ToLower()] = $canon
    }
}

# One journal file per month, named after the first 7 characters of the date.
$month = $Date.Substring(0, 7)
$journalFile = Join-Path $JournalDir "$month.md"

# --- Tags ---------------------------------------------------------------------
$rawTags = @($Tags -split ',' | ForEach-Object { $_.Trim() } | Where-Object { $_ -ne "" })
$finalTags = @()
foreach ($t in $rawTags) {
    $k = $t.ToLower()
    if ($canonTags.Count -gt 0) {
        # A vocabulary exists: every tag must be in it.
        if ($canonTags.ContainsKey($k)) { $finalTags += $canonTags[$k] }
        else { throw "Unknown tag '$t'. Add it to tag-vocabulary.md or fix the tag." }
    } else {
        # No vocabulary available: accept tags as typed.
        $finalTags += $t
    }
}
$finalTags = @($finalTags | Select-Object -Unique)
# Input such as "," leaves no tags; without this guard the pipeline below would
# run once on $null and emit a literal "[]" tag.
if ($finalTags.Count -eq 0) { throw "No tags supplied. Pass at least one non-empty tag via -Tags." }
$tagString = ($finalTags | ForEach-Object { "[$_]" }) -join ""

# --- Entry text ---------------------------------------------------------------
$entryLines = @("- $tagString $Title")
if ($Why) { $entryLines += " - Why: $Why" }
# Blank items (e.g. a trailing comma) are dropped so no empty bullets are written.
$keyFiles = @($Files -split ',' | ForEach-Object { $_.Trim() } | Where-Object { $_ -ne "" })
if ($keyFiles.Count -gt 0) {
    $entryLines += " - Key files:"
    foreach ($f in $keyFiles) { $entryLines += " - ``$($f)``" }
}
$entry = $entryLines -join "`r`n"

# --- Write --------------------------------------------------------------------
$enc = New-Object System.Text.UTF8Encoding($false)   # UTF-8 without BOM
$dateHeading = "## $Date"
$safeDate = [regex]::Escape($Date)

$content = ""
if (Test-Path $journalFile) { $content = [string](ReadText $journalFile) }

if ([string]::IsNullOrWhiteSpace($content)) {
    # New (or empty) journal file: write the monthly header plus the first entry.
    $projectName = Split-Path -Leaf $RepoRoot
    $content = "# Development Journal - $projectName ($month)`r`n`r`n$dateHeading`r`n`r`n$entry`r`n"
} else {
    # The section for $Date runs from its heading up to the next "## yyyy-MM-dd"
    # heading or the end of the file. The heading may be the very last line
    # with no trailing newline, hence the \z alternative.
    $sectionPattern = '(?ms)^##\s+' + $safeDate + '[ \t]*(?:\r?\n|\z).*?(?=^##\s+\d{4}-\d{2}-\d{2}\s*$|\z)'
    $section = [regex]::Match($content, $sectionPattern)
    if ($section.Success) {
        # Append to the existing date section so each date has a single heading.
        $sectionEnd = $section.Index + $section.Length
        $head = $content.Substring(0, $sectionEnd).TrimEnd()
        $tail = $content.Substring($sectionEnd)
        $content = $head + "`r`n`r`n" + $entry + "`r`n"
        # Keep a blank line between the new entry and any following heading.
        if ($tail.Length -gt 0) { $content += "`r`n" + $tail }
    } else {
        # No heading for this date yet: add one at the end of the file.
        $content = $content.TrimEnd() + "`r`n`r`n$dateHeading`r`n`r`n$entry`r`n"
    }
}

# Normalise every line ending to CRLF before writing.
[System.IO.File]::WriteAllText($journalFile, ($content -replace "`r?`n", "`r`n"), $enc)

Write-Host "Added journal entry to: $journalFile" -ForegroundColor Green
Write-Host " Date: $Date" -ForegroundColor Gray
Write-Host " Tags: $tagString" -ForegroundColor Gray
Write-Host " Title: $Title" -ForegroundColor Gray
@@ -0,0 +1,151 @@
1
+ <#
2
+ add-lesson.ps1
3
+ Creates a new lesson file with proper ID and YAML frontmatter.
4
+ Automatically assigns the next available lesson ID.
5
+ Tags are canonicalized against tag-vocabulary.md.
6
+
7
+ USAGE:
8
+ powershell -File .\scripts\memory\add-lesson.ps1 -Title "Always validate input" -Tags "Reliability,Data" -Rule "Validate all user input before processing"
9
+ #>
10
+
11
[CmdletBinding()]
param(
    # Lesson title; also used to derive the file-name slug.
    [Parameter(Mandatory=$true)][string]$Title,
    # Comma-separated tag list.
    [Parameter(Mandatory=$true)][string]$Tags,
    # The rule text written to the "rule:" frontmatter field.
    [Parameter(Mandatory=$true)][string]$Rule,
    # Comma-separated values for the "applies_to" list.
    [string]$AppliesTo = "*",
    # Comma-separated values for the "triggers" list.
    [string]$Triggers = ""
)

Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

# Default to the current directory; when the script location is known, use the
# folder two levels above it instead.
$RepoRoot = (Get-Location).Path
if ($PSScriptRoot) {
    $RepoRoot = (Resolve-Path (Join-Path $PSScriptRoot "..\..")).Path
}
28
+
29
function Resolve-MnemoMemoryDir([string]$Root) {
    # Picks the memory directory under $Root. Candidates are probed in order
    # and the first one that exists wins; if none exists, the first candidate
    # (".mnemo\memory") is returned without being created.
    $first = Join-Path $Root ".mnemo\memory"
    $second = Join-Path $Root ".cursor\memory"

    $existing = @($first, $second) | Where-Object { Test-Path -LiteralPath $_ } | Select-Object -First 1
    if ($existing) { return $existing }
    return $first
}
39
+
40
# Locate the memory folder and the files this script works with.
$MemoryDir = Resolve-MnemoMemoryDir -Root $RepoRoot
$LessonsDir = Join-Path -Path $MemoryDir -ChildPath "lessons"
$TagVocabPath = Join-Path -Path $MemoryDir -ChildPath "tag-vocabulary.md"

# Make sure the lessons folder exists before the lesson file is written.
if (-not (Test-Path $LessonsDir)) {
    $null = New-Item -ItemType Directory -Force -Path $LessonsDir
}
45
+
46
function ReadText([string]$p) {
    # Reads the whole file at $p as UTF-8 text and strips a leading BOM
    # character (U+FEFF) if one survived decoding.
    # Errors from Get-Content (e.g. missing file) are terminating.
    $t = Get-Content -Raw -Encoding UTF8 -ErrorAction Stop $p

    # Get-Content -Raw produces no output for a zero-byte file, which leaves
    # $t as $null; hand callers an empty string so .Length and regex calls on
    # the result do not fail.
    if ($null -eq $t) { return "" }

    if ($t.Length -gt 0 -and [int]$t[0] -eq 0xFEFF) { $t = $t.Substring(1) }
    return $t
}
51
+
52
# --- Tag vocabulary -----------------------------------------------------------
# Canonical tags are the "- [Tag]" bullets in tag-vocabulary.md. Keys are
# lower-cased for case-insensitive lookup; values keep the canonical spelling.
$canonTags = @{}
if (Test-Path $TagVocabPath) {
    # Cast guards against a $null result for an empty vocabulary file, which
    # [regex]::Matches would reject.
    $tv = [string](ReadText $TagVocabPath)
    foreach ($m in [regex]::Matches($tv, '(?m)^\-\s+\[([^\]]+)\]')) {
        $canon = $m.Groups[1].Value.Trim()
        $canonTags[$canon.ToLower()] = $canon
    }
}

# --- Next lesson ID -----------------------------------------------------------
# Scan existing L-<number>*.md files for the highest number. The whole digit
# run is captured: capturing only three digits would read "L-1000" as 100 and
# hand out already-used IDs once the counter passes 999.
$existingLessons = @(Get-ChildItem -Path $LessonsDir -Filter "L-*.md" -ErrorAction SilentlyContinue)
$maxId = 0
foreach ($lf in $existingLessons) {
    if ($lf.Name -match '^L-(\d+)') {
        $id = [int]$Matches[1]
        if ($id -gt $maxId) { $maxId = $id }
    }
}

$lessonId = "L-{0:D3}" -f ($maxId + 1)

# --- File name ----------------------------------------------------------------
# Slug: lower-case, runs of non [a-z0-9] collapsed to "-", no leading/trailing "-".
$kebabTitle = ($Title.ToLower() -replace '[^a-z0-9]+', '-' -replace '^-|-$', '')
if ([string]::IsNullOrWhiteSpace($kebabTitle)) { $kebabTitle = "lesson" }
# Cutting at 50 characters can land right after a separator; trim it again.
if ($kebabTitle.Length -gt 50) { $kebabTitle = $kebabTitle.Substring(0, 50).TrimEnd('-') }
$fileName = "$lessonId-$kebabTitle.md"
$filePath = Join-Path $LessonsDir $fileName

# --- Tags ---------------------------------------------------------------------
$rawTags = @($Tags -split ',' | ForEach-Object { $_.Trim() } | Where-Object { $_ -ne "" })
$finalTags = @()
foreach ($t in $rawTags) {
    $k = $t.ToLower()
    if ($canonTags.Count -gt 0) {
        # A vocabulary exists: every tag must be in it.
        if ($canonTags.ContainsKey($k)) { $finalTags += $canonTags[$k] }
        else { throw "Unknown tag '$t'. Add it to tag-vocabulary.md or fix the tag." }
    } else {
        # No vocabulary available: accept tags as typed.
        $finalTags += $t
    }
}
$finalTags = @($finalTags | Select-Object -Unique)
$tagsYaml = "[$($finalTags -join ', ')]"

# --- Frontmatter lists --------------------------------------------------------
$appliesLines = @()
foreach ($a in ($AppliesTo -split ',')) { $appliesLines += " - $($a.Trim())" }
$appliesYaml = $appliesLines -join "`r`n"

if ($Triggers) {
    $triggerLines = @()
    foreach ($t in ($Triggers -split ',')) { $triggerLines += " - $($t.Trim())" }
    $triggersYaml = "triggers:`r`n" + ($triggerLines -join "`r`n")
} else {
    # Placeholder so the author is reminded to fill the list in.
    $triggersYaml = "triggers:`r`n - TODO: add error messages or keywords"
}

$today = Get-Date -Format "yyyy-MM-dd"

# --- Lesson document ----------------------------------------------------------
# Expandable here-string: variables are substituted, and a doubled backtick
# yields a literal backtick in the output.
$content = @"
---
id: $lessonId
title: $Title
status: Active
tags: $tagsYaml
introduced: $today
applies_to:
$appliesYaml
$triggersYaml
rule: $Rule
---

# $lessonId - $Title

## Symptom

TODO: Describe what happened

## Root Cause

TODO: Describe why it happened

## Wrong Approach (DO NOT REPEAT)

- TODO: What not to do

## Correct Approach

- TODO: What to do instead

## References

- Files: ``TODO``
- Journal: ``journal/$($today.Substring(0,7)).md#$today``
"@

# UTF-8 without BOM, with every line ending normalised to CRLF.
$enc = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($filePath, ($content -replace "`r?`n", "`r`n"), $enc)

Write-Host "Created lesson: $filePath" -ForegroundColor Green
Write-Host " ID: $lessonId" -ForegroundColor Gray
Write-Host " Title: $Title" -ForegroundColor Gray
Write-Host " Tags: $tagsYaml" -ForegroundColor Gray
Write-Host ""
Write-Host "Next: run scripts\memory\rebuild-memory-index.ps1" -ForegroundColor Cyan
@@ -0,0 +1,6 @@
1
+ """
2
+ Mnemo Autonomous Memory Runtime.
3
+ Provides no-human-in-the-loop memory ingestion, lifecycle governance,
4
+ retrieval routing, safety filtering, and quality monitoring.
5
+ """
6
+ __version__ = "1.0.0"
@@ -0,0 +1,181 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ context_safety.py - Context safety guard for Mnemo retrieval packs.
4
+
5
+ Runs automatically on every retrieval/context-pack build.
6
+ Checks:
7
+ 1. Duplicate snippet detection (prevents redundant context)
8
+ 2. Contradiction detection (alerts on conflicting facts)
9
+ 3. Low-signal suppression (filters empty/trivial content)
10
+ 4. Token budget enforcement (hard cap on total context chars)
11
+ 5. Sensitivity redaction (vault/secret entries stripped before delivery)
12
+
13
+ No human required: all checks are policy-driven.
14
+ """
15
+ import re
16
+ import sqlite3
17
+ from dataclasses import dataclass, field
18
+ from typing import Optional
19
+
20
+ from autonomy.schema import get_db
21
+ from autonomy.reranker import RankedResult
22
+
23
# Budget for the total content length of a context pack, measured in
# characters (~1500 tokens).
DEFAULT_TOKEN_BUDGET = 6000

# Snippets whose stripped content is shorter than this are suppressed.
MIN_CONTENT_CHARS = 20

# Two snippets with a token Jaccard similarity at or above this value are
# treated as duplicates.
DUPLICATE_JACCARD_THRESHOLD = 0.85

# Keyword pairs whose presence in two different snippets is reported as a
# possible contradiction.
CONTRADICTION_KEYWORD_PAIRS = [
    ("do not", "always"),
    ("never", "must"),
    ("disabled", "enabled"),
    ("false", "true"),
    ("forbidden", "required"),
]
33
+
34
+
35
+ @dataclass
36
+ class SafetyCheckResult:
37
+ passed: bool
38
+ issues: list[str] = field(default_factory=list)
39
+ filtered_results: list[RankedResult] = field(default_factory=list)
40
+ token_budget_used: int = 0
41
+ token_budget_max: int = DEFAULT_TOKEN_BUDGET
42
+
43
+ def summary(self) -> str:
44
+ s = f"safety={'PASS' if self.passed else 'ISSUES'} used={self.token_budget_used}/{self.token_budget_max}"
45
+ if self.issues:
46
+ s += f" issues={len(self.issues)}"
47
+ return s
48
+
49
+
50
+ def _jaccard(a: str, b: str) -> float:
51
+ ta = set(re.findall(r"\w+", a.lower()))
52
+ tb = set(re.findall(r"\w+", b.lower()))
53
+ if not ta or not tb:
54
+ return 0.0
55
+ return len(ta & tb) / len(ta | tb)
56
+
57
+
58
+ def _detect_duplicates(results: list[RankedResult]) -> list[tuple[int, int]]:
59
+ """Return (i, j) pairs where results[i] and results[j] are near-duplicates."""
60
+ pairs: list[tuple[int, int]] = []
61
+ for i in range(len(results)):
62
+ for j in range(i + 1, len(results)):
63
+ if _jaccard(results[i].content, results[j].content) >= DUPLICATE_JACCARD_THRESHOLD:
64
+ pairs.append((i, j))
65
+ return pairs
66
+
67
+
68
+ def _detect_contradictions(results: list[RankedResult]) -> list[tuple[int, int, str]]:
69
+ """Return (i, j, reason) triples indicating contradicting result pairs."""
70
+ contradictions: list[tuple[int, int, str]] = []
71
+ for i in range(len(results)):
72
+ for j in range(i + 1, len(results)):
73
+ a = results[i].content.lower()
74
+ b = results[j].content.lower()
75
+ for neg, pos in CONTRADICTION_KEYWORD_PAIRS:
76
+ if neg in a and pos in b:
77
+ contradictions.append((i, j, f"'{neg}' vs '{pos}'"))
78
+ break
79
+ if pos in a and neg in b:
80
+ contradictions.append((i, j, f"'{pos}' vs '{neg}'"))
81
+ break
82
+ return contradictions
83
+
84
+
85
+ class ContextSafetyGuard:
86
+ def __init__(
87
+ self,
88
+ token_budget: int = DEFAULT_TOKEN_BUDGET,
89
+ db: Optional[sqlite3.Connection] = None,
90
+ ):
91
+ self.token_budget = token_budget
92
+ self.db = db or get_db()
93
+
94
+ def check(self, results: list[RankedResult]) -> SafetyCheckResult:
95
+ """
96
+ Run all safety checks on results, return SafetyCheckResult.
97
+ Modifies results list by removing failing entries.
98
+ """
99
+ issues: list[str] = []
100
+ filtered: list[RankedResult] = []
101
+
102
+ # 1. Sensitivity redaction (vault/secret always stripped first)
103
+ for r in results:
104
+ if r.memory_type == "vault":
105
+ issues.append(f"REDACTED vault entry: {r.ref_path}")
106
+ continue
107
+ # Check DB sensitivity flag
108
+ row = self.db.execute(
109
+ "SELECT sensitivity FROM memory_units WHERE source_ref = ?",
110
+ (r.source_file,),
111
+ ).fetchone()
112
+ if row and row["sensitivity"] == "secret":
113
+ issues.append(f"REDACTED secret entry: {r.ref_path}")
114
+ continue
115
+ filtered.append(r)
116
+
117
+ # 2. Low-signal suppression
118
+ filtered = [r for r in filtered if len(r.content.strip()) >= MIN_CONTENT_CHARS]
119
+ suppressed = len(results) - len(filtered)
120
+ if suppressed > 0:
121
+ issues.append(f"Suppressed {suppressed} low-signal entries (<{MIN_CONTENT_CHARS} chars)")
122
+
123
+ # 3. Duplicate detection (remove lower-scored duplicate)
124
+ dup_pairs = _detect_duplicates(filtered)
125
+ to_remove: set[int] = set()
126
+ for i, j in dup_pairs:
127
+ # Keep higher-scored result
128
+ remove_idx = j if filtered[i].final_score >= filtered[j].final_score else i
129
+ to_remove.add(remove_idx)
130
+ issues.append(f"Duplicate pair removed: {filtered[remove_idx].ref_path}")
131
+ filtered = [r for idx, r in enumerate(filtered) if idx not in to_remove]
132
+
133
+ # 4. Contradiction detection (warn but keep both, lower confidence)
134
+ contradictions = _detect_contradictions(filtered)
135
+ for i, j, reason in contradictions:
136
+ issues.append(
137
+ f"Contradiction detected ({reason}): {filtered[i].ref_path} vs {filtered[j].ref_path}"
138
+ )
139
+ # Reduce score of both to signal uncertainty
140
+ object.__setattr__(filtered[i], "final_score", filtered[i].final_score * 0.9)
141
+ object.__setattr__(filtered[j], "final_score", filtered[j].final_score * 0.9)
142
+
143
+ # 5. Token budget enforcement
144
+ budget_used = 0
145
+ final: list[RankedResult] = []
146
+ for r in sorted(filtered, key=lambda x: x.final_score, reverse=True):
147
+ chars = len(r.content)
148
+ if budget_used + chars > self.token_budget:
149
+ issues.append(f"Token budget exceeded; truncated at {budget_used} chars")
150
+ break
151
+ final.append(r)
152
+ budget_used += chars
153
+
154
+ passed = not any("REDACTED secret" in i or "REDACTED vault" in i for i in issues)
155
+ return SafetyCheckResult(
156
+ passed=passed,
157
+ issues=issues,
158
+ filtered_results=final,
159
+ token_budget_used=budget_used,
160
+ token_budget_max=self.token_budget,
161
+ )
162
+
163
+ def build_context_pack(self, results: list[RankedResult]) -> str:
164
+ """
165
+ Build a formatted context pack string from safety-checked results.
166
+ Safe to pass directly to LLM context.
167
+ """
168
+ check = self.check(results)
169
+ lines: list[str] = []
170
+
171
+ if not check.passed:
172
+ sensitive_warnings = [i for i in check.issues if "REDACTED" in i]
173
+ if sensitive_warnings:
174
+ lines.append(f"[Safety] {'; '.join(sensitive_warnings)}")
175
+
176
+ for r in check.filtered_results:
177
+ lines.append(f"<!-- {r.ref_path} | score={r.final_score:.3f} type={r.memory_type} -->")
178
+ lines.append(r.content.strip())
179
+ lines.append("")
180
+
181
+ return "\n".join(lines).strip()
@@ -0,0 +1,215 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ entity_resolver.py - Stable entity IDs and alias resolution.
4
+
5
+ Extracts named entities from memory units (paths, modules, concepts),
6
+ assigns stable UUIDs, maintains alias mappings with confidence scores,
7
+ and propagates entity_tags back to memory_units.
8
+
9
+ No human required: alias merging is automatic with confidence thresholds.
10
+ """
11
+ import json
12
+ import re
13
+ import sqlite3
14
+ import uuid
15
+ from dataclasses import dataclass
16
+ from typing import Optional
17
+
18
+ from autonomy.schema import get_db
19
+ from autonomy.ingest_pipeline import MemoryUnit
20
+
21
# Minimum name similarity for an unseen name to be recorded as an alias of an
# existing entity; also stored as the confidence of the new alias row.
ALIAS_MERGE_THRESHOLD = 0.85

# Amount subtracted from an entity's confidence on each decay pass.
ENTITY_CONFIDENCE_DECAY = 0.02
23
+
24
+
25
@dataclass
class Entity:
    """A resolved entity, mirroring the columns read from the ``entities`` table."""

    # Identifier of the entity (new entities are given a UUID4 string).
    entity_id: str
    # Name of the entity as stored in the table.
    entity_name: str
    # Kind of entity; the extractor emits "file", "class", "lesson", "function".
    entity_type: str
    # Confidence score; the resolver keeps it within 0.0-1.0.
    confidence: float
+
32
+
33
+ def _extract_entities_from_text(content: str, source_ref: str) -> list[tuple[str, str]]:
34
+ """
35
+ Heuristic entity extraction from markdown content.
36
+ Returns list of (entity_name, entity_type) tuples.
37
+ """
38
+ entities: list[tuple[str, str]] = []
39
+
40
+ # File/path references (e.g., `path/to/file.py`)
41
+ for m in re.finditer(r"`([^`]+\.[a-z]{1,10})`", content):
42
+ p = m.group(1)
43
+ if "/" in p or "\\" in p:
44
+ entities.append((p, "file"))
45
+
46
+ # Module/class names (CamelCase words used multiple times)
47
+ camel_matches = re.findall(r"\b([A-Z][a-zA-Z]{3,}(?:[A-Z][a-z]+)+)\b", content)
48
+ freq: dict[str, int] = {}
49
+ for name in camel_matches:
50
+ freq[name] = freq.get(name, 0) + 1
51
+ for name, count in freq.items():
52
+ if count >= 2:
53
+ entities.append((name, "class"))
54
+
55
+ # Lesson references (L-XXX)
56
+ for m in re.finditer(r"\bL-(\d{3})\b", content):
57
+ entities.append((f"L-{m.group(1)}", "lesson"))
58
+
59
+ # Function/method references (snake_case in backticks)
60
+ for m in re.finditer(r"`([a-z][a-z0-9_]{3,})\(\)`", content):
61
+ entities.append((m.group(1), "function"))
62
+
63
+ return list(set(entities))[:30] # deduplicate + cap
64
+
65
+
66
class EntityResolver:
    """Maps entity names found in memory units to stable entity IDs stored in
    the ``entities`` / ``entity_aliases`` tables."""

    def __init__(self, db: Optional[sqlite3.Connection] = None):
        """
        Args:
            db: Connection to use; ``get_db()`` is called when none is given.
                Rows are accessed by column name, so the connection is
                expected to use a name-addressable row factory.
        """
        self.db = db or get_db()

    def resolve(self, unit: "MemoryUnit") -> list[str]:
        """
        Extract entities from unit content, resolve/create IDs, update unit.

        When at least one entity is found, the IDs are stored as JSON in the
        unit's ``entity_tags`` column, the transaction is committed, and
        ``unit.entity_tags`` is set.

        Returns:
            Unique entity_ids assigned to this unit, in first-seen order.
        """
        raw_entities = _extract_entities_from_text(unit.content, unit.source_ref)

        # Several names can resolve to the same entity (exact name + alias);
        # a dict keeps one copy of each ID in first-seen order.
        seen: dict[str, None] = {}
        for entity_name, entity_type in raw_entities:
            eid = self._get_or_create(entity_name, entity_type)
            if eid:
                seen.setdefault(eid, None)
        entity_ids = list(seen)

        if entity_ids:
            self.db.execute(
                "UPDATE memory_units SET entity_tags=?, updated_at=unixepoch('now') WHERE unit_id=?",
                (json.dumps(entity_ids), unit.unit_id),
            )
            self.db.commit()
            unit.entity_tags = entity_ids

        return entity_ids

    def _get_or_create(self, entity_name: str, entity_type: str) -> Optional[str]:
        """Resolve entity by exact name, exact alias, or fuzzy match; else create it.

        Returns the entity_id, or None if creation hit an integrity error and
        the entity still cannot be found by name.
        """
        # Exact entity name first, then exact alias text.
        for lookup_sql in (
            "SELECT entity_id FROM entities WHERE entity_name = ?",
            "SELECT entity_id FROM entity_aliases WHERE alias_text = ?",
        ):
            row = self.db.execute(lookup_sql, (entity_name,)).fetchone()
            if row:
                self._reinforce(row["entity_id"])
                return row["entity_id"]

        # Fuzzy match: record the new spelling as an alias of the similar entity.
        similar_id = self._find_similar_entity(entity_name)
        if similar_id:
            try:
                self.db.execute(
                    "INSERT INTO entity_aliases(alias_id, entity_id, alias_text, confidence) VALUES (?,?,?,?)",
                    (str(uuid.uuid4()), similar_id, entity_name, ALIAS_MERGE_THRESHOLD),
                )
                self.db.commit()
            except sqlite3.IntegrityError:
                # The alias row could not be inserted (e.g. it already
                # exists); the match itself is still valid.
                pass
            else:
                self._reinforce(similar_id)
            return similar_id

        # Create new entity
        entity_id = str(uuid.uuid4())
        try:
            self.db.execute(
                "INSERT INTO entities(entity_id, entity_name, entity_type, confidence) VALUES (?,?,?,1.0)",
                (entity_id, entity_name, entity_type),
            )
            self.db.commit()
        except sqlite3.IntegrityError:
            # Race condition: entity was created concurrently
            row = self.db.execute(
                "SELECT entity_id FROM entities WHERE entity_name = ?", (entity_name,)
            ).fetchone()
            return row["entity_id"] if row else None
        return entity_id

    def _find_similar_entity(self, name: str) -> Optional[str]:
        """
        Find the entity whose name is most similar to *name* (token Jaccard
        over lower-cased ``\\w+`` tokens).

        Returns its entity_id if the best score reaches
        ``ALIAS_MERGE_THRESHOLD``, else None.
        """
        name_tokens = set(re.findall(r"\w+", name.lower()))
        if not name_tokens:
            # Nothing to compare against; skip the table scan entirely.
            return None

        candidates = self.db.execute(
            "SELECT entity_id, entity_name FROM entities"
        ).fetchall()

        best_id = None
        best_score = 0.0
        for row in candidates:
            cand_tokens = set(re.findall(r"\w+", row["entity_name"].lower()))
            if not cand_tokens:
                continue
            score = len(name_tokens & cand_tokens) / len(name_tokens | cand_tokens)
            if score > best_score:
                best_score = score
                best_id = row["entity_id"]

        return best_id if best_score >= ALIAS_MERGE_THRESHOLD else None

    def _reinforce(self, entity_id: str) -> None:
        """Raise an entity's confidence by 0.01, capped at 1.0 (re-observed).

        Does not commit; the change is persisted by the caller's next commit.
        """
        self.db.execute(
            "UPDATE entities SET confidence=MIN(confidence+0.01, 1.0) WHERE entity_id=?",
            (entity_id,),
        )

    def decay_stale_entities(self, min_confidence: float = 0.2) -> int:
        """
        Lower the confidence of every entity that is not referenced by any
        memory unit's ``entity_tags``, by ``ENTITY_CONFIDENCE_DECAY`` (floored
        at 0.0), then commit.

        Returns:
            The number of entities whose confidence is below *min_confidence*
            after the decay. This method only counts them; it does not modify
            or remove them.
        """
        self.db.execute(
            """
            UPDATE entities SET confidence = MAX(confidence - ?, 0.0)
            WHERE entity_id NOT IN (
                SELECT DISTINCT json_each.value
                FROM memory_units, json_each(memory_units.entity_tags)
            )
            """,
            (ENTITY_CONFIDENCE_DECAY,),
        )
        quarantined = self.db.execute(
            "SELECT COUNT(*) FROM entities WHERE confidence < ?", (min_confidence,)
        ).fetchone()[0]
        self.db.commit()
        return quarantined

    def get_entity_by_alias(self, alias: str) -> "Optional[Entity]":
        """Resolve an entity name or alias text to its Entity, or None if unknown."""
        # Direct name match
        row = self.db.execute(
            "SELECT entity_id, entity_name, entity_type, confidence FROM entities WHERE entity_name = ?",
            (alias,),
        ).fetchone()
        if row:
            return Entity(**dict(row))
        # Alias lookup
        alias_row = self.db.execute(
            """
            SELECT e.entity_id, e.entity_name, e.entity_type, e.confidence
            FROM entity_aliases ea JOIN entities e ON ea.entity_id = e.entity_id
            WHERE ea.alias_text = ?
            """,
            (alias,),
        ).fetchone()
        if alias_row:
            return Entity(**dict(alias_row))
        return None