@dinasor/mnemo-cli 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/LICENSE +21 -0
- package/README.md +263 -0
- package/VERSION +1 -0
- package/bin/mnemo.js +139 -0
- package/memory.ps1 +178 -0
- package/memory_mac.sh +2447 -0
- package/package.json +36 -0
- package/scripts/memory/installer/bootstrap.ps1 +21 -0
- package/scripts/memory/installer/core/bridge.ps1 +285 -0
- package/scripts/memory/installer/core/io.ps1 +110 -0
- package/scripts/memory/installer/core/paths.ps1 +83 -0
- package/scripts/memory/installer/features/gitignore_setup.ps1 +80 -0
- package/scripts/memory/installer/features/hooks_setup.ps1 +157 -0
- package/scripts/memory/installer/features/mcp_setup.ps1 +87 -0
- package/scripts/memory/installer/features/memory_scaffold.ps1 +541 -0
- package/scripts/memory/installer/features/vector_setup.ps1 +103 -0
- package/scripts/memory/installer/templates/add-journal-entry.ps1 +122 -0
- package/scripts/memory/installer/templates/add-lesson.ps1 +151 -0
- package/scripts/memory/installer/templates/autonomy/__init__.py +6 -0
- package/scripts/memory/installer/templates/autonomy/context_safety.py +181 -0
- package/scripts/memory/installer/templates/autonomy/entity_resolver.py +215 -0
- package/scripts/memory/installer/templates/autonomy/ingest_pipeline.py +252 -0
- package/scripts/memory/installer/templates/autonomy/lifecycle_engine.py +254 -0
- package/scripts/memory/installer/templates/autonomy/policies.yaml +59 -0
- package/scripts/memory/installer/templates/autonomy/reranker.py +220 -0
- package/scripts/memory/installer/templates/autonomy/retrieval_router.py +148 -0
- package/scripts/memory/installer/templates/autonomy/runner.py +272 -0
- package/scripts/memory/installer/templates/autonomy/schema.py +150 -0
- package/scripts/memory/installer/templates/autonomy/vault_policy.py +205 -0
- package/scripts/memory/installer/templates/build-memory-sqlite.py +111 -0
- package/scripts/memory/installer/templates/clear-active.ps1 +55 -0
- package/scripts/memory/installer/templates/customization.md +84 -0
- package/scripts/memory/installer/templates/lint-memory.ps1 +217 -0
- package/scripts/memory/installer/templates/mnemo_vector.py +556 -0
- package/scripts/memory/installer/templates/query-memory-sqlite.py +95 -0
- package/scripts/memory/installer/templates/query-memory.ps1 +122 -0
- package/scripts/memory/installer/templates/rebuild-memory-index.ps1 +293 -0
package/scripts/memory/installer/templates/add-journal-entry.ps1
@@ -0,0 +1,122 @@
<#
add-journal-entry.ps1
Adds a journal entry to the current month's journal file.
Ensures only one heading per date (appends to existing date if present).
Tags are canonicalized against tag-vocabulary.md.
BOM-safe file reading.

USAGE:
  powershell -File .\scripts\memory\add-journal-entry.ps1 -Tags "UI,Fix" -Title "Fixed button alignment"
  powershell -File .\scripts\memory\add-journal-entry.ps1 -Tags "Build" -Title "Updated dependencies" -Files "package.json"
#>

[CmdletBinding()]
param(
    [Parameter(Mandatory=$true)][string]$Tags,
    [Parameter(Mandatory=$true)][string]$Title,
    [string]$Files = "",
    [string]$Why = "",
    [string]$Date = (Get-Date -Format "yyyy-MM-dd")
)

Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

if ($PSScriptRoot) {
    $RepoRoot = (Resolve-Path (Join-Path $PSScriptRoot "..\..")).Path
} else {
    $RepoRoot = (Get-Location).Path
}

function Resolve-MnemoMemoryDir([string]$Root) {
    $candidates = @(
        (Join-Path $Root ".mnemo\memory"),
        (Join-Path $Root ".cursor\memory")
    )
    foreach ($candidate in $candidates) {
        if (Test-Path -LiteralPath $candidate) { return $candidate }
    }
    return $candidates[0]
}

$MemoryDir = Resolve-MnemoMemoryDir -Root $RepoRoot
$JournalDir = Join-Path $MemoryDir "journal"
$TagVocabPath = Join-Path $MemoryDir "tag-vocabulary.md"

if (-not (Test-Path $JournalDir)) { New-Item -ItemType Directory -Force -Path $JournalDir | Out-Null }

function ReadText([string]$p) {
    $t = Get-Content -Raw -Encoding UTF8 -ErrorAction Stop $p
    if ($t.Length -gt 0 -and [int]$t[0] -eq 0xFEFF) { $t = $t.Substring(1) }
    return $t
}

$canonTags = @{}
if (Test-Path $TagVocabPath) {
    $tv = ReadText $TagVocabPath
    foreach ($m in [regex]::Matches($tv, '(?m)^\-\s+\[([^\]]+)\]')) {
        $canon = $m.Groups[1].Value.Trim()
        $canonTags[$canon.ToLower()] = $canon
    }
}

$month = $Date.Substring(0, 7)
$journalFile = Join-Path $JournalDir "$month.md"

$rawTags = $Tags -split ',' | ForEach-Object { $_.Trim() } | Where-Object { $_ -ne "" }
$finalTags = @()
foreach ($t in $rawTags) {
    $k = $t.ToLower()
    if ($canonTags.Count -gt 0) {
        if ($canonTags.ContainsKey($k)) { $finalTags += $canonTags[$k] }
        else { throw "Unknown tag '$t'. Add it to tag-vocabulary.md or fix the tag." }
    } else {
        $finalTags += $t
    }
}
$finalTags = $finalTags | Select-Object -Unique
$tagString = ($finalTags | ForEach-Object { "[$_]" }) -join ""

$entryLines = @()
$entryLines += "- $tagString $Title"
if ($Why) { $entryLines += " - Why: $Why" }
if ($Files) {
    $entryLines += " - Key files:"
    foreach ($f in ($Files -split ',')) { $entryLines += " - ``$($f.Trim())``" }
}
$entry = $entryLines -join "`r`n"

$enc = New-Object System.Text.UTF8Encoding($false)
$dateHeading = "## $Date"
$safeDate = [regex]::Escape($Date)

if (Test-Path $journalFile) {
    $content = ReadText $journalFile
    if ($content -match "(?m)^##\s+$safeDate\s*$") {
        $pattern = "(?ms)(^##\s+$safeDate\s*\r?\n)(.*?)(?=^##\s+\d{4}-\d{2}-\d{2}\s*$|\z)"
        $rx = New-Object System.Text.RegularExpressions.Regex($pattern, [System.Text.RegularExpressions.RegexOptions]::Multiline)
        $content = $rx.Replace($content, {
            param($m)
            $block = $m.Value.TrimEnd()
            return $block + "`r`n`r`n" + $entry + "`r`n"
        }, 1)
    } else {
        $content = $content.TrimEnd() + "`r`n`r`n$dateHeading`r`n`r`n$entry`r`n"
    }
    [System.IO.File]::WriteAllText($journalFile, ($content -replace "`r?`n", "`r`n"), $enc)
} else {
    $projectName = Split-Path -Leaf $RepoRoot
    $header = @"
# Development Journal - $projectName ($month)

## $Date

$entry
"@
    [System.IO.File]::WriteAllText($journalFile, ($header -replace "`r?`n", "`r`n"), $enc)
}

Write-Host "Added journal entry to: $journalFile" -ForegroundColor Green
Write-Host " Date: $Date" -ForegroundColor Gray
Write-Host " Tags: $tagString" -ForegroundColor Gray
Write-Host " Title: $Title" -ForegroundColor Gray
package/scripts/memory/installer/templates/add-lesson.ps1
@@ -0,0 +1,151 @@
<#
add-lesson.ps1
Creates a new lesson file with proper ID and YAML frontmatter.
Automatically assigns the next available lesson ID.
Tags are canonicalized against tag-vocabulary.md.

USAGE:
  powershell -File .\scripts\memory\add-lesson.ps1 -Title "Always validate input" -Tags "Reliability,Data" -Rule "Validate all user input before processing"
#>

[CmdletBinding()]
param(
    [Parameter(Mandatory=$true)][string]$Title,
    [Parameter(Mandatory=$true)][string]$Tags,
    [Parameter(Mandatory=$true)][string]$Rule,
    [string]$AppliesTo = "*",
    [string]$Triggers = ""
)

Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

if ($PSScriptRoot) {
    $RepoRoot = (Resolve-Path (Join-Path $PSScriptRoot "..\..")).Path
} else {
    $RepoRoot = (Get-Location).Path
}

function Resolve-MnemoMemoryDir([string]$Root) {
    $candidates = @(
        (Join-Path $Root ".mnemo\memory"),
        (Join-Path $Root ".cursor\memory")
    )
    foreach ($candidate in $candidates) {
        if (Test-Path -LiteralPath $candidate) { return $candidate }
    }
    return $candidates[0]
}

$MemoryDir = Resolve-MnemoMemoryDir -Root $RepoRoot
$LessonsDir = Join-Path $MemoryDir "lessons"
$TagVocabPath = Join-Path $MemoryDir "tag-vocabulary.md"

if (-not (Test-Path $LessonsDir)) { New-Item -ItemType Directory -Force -Path $LessonsDir | Out-Null }

function ReadText([string]$p) {
    $t = Get-Content -Raw -Encoding UTF8 -ErrorAction Stop $p
    if ($t.Length -gt 0 -and [int]$t[0] -eq 0xFEFF) { $t = $t.Substring(1) }
    return $t
}

$canonTags = @{}
if (Test-Path $TagVocabPath) {
    $tv = ReadText $TagVocabPath
    foreach ($m in [regex]::Matches($tv, '(?m)^\-\s+\[([^\]]+)\]')) {
        $canon = $m.Groups[1].Value.Trim()
        $canonTags[$canon.ToLower()] = $canon
    }
}

$existingLessons = Get-ChildItem -Path $LessonsDir -Filter "L-*.md" -ErrorAction SilentlyContinue
$maxId = 0
foreach ($lf in $existingLessons) {
    if ($lf.Name -match '^L-(\d{3})') {
        $id = [int]$Matches[1]
        if ($id -gt $maxId) { $maxId = $id }
    }
}

$lessonId = "L-{0:D3}" -f ($maxId + 1)

$kebabTitle = ($Title.ToLower() -replace '[^a-z0-9]+', '-' -replace '^-|-$', '')
if ([string]::IsNullOrWhiteSpace($kebabTitle)) { $kebabTitle = "lesson" }
if ($kebabTitle.Length -gt 50) { $kebabTitle = $kebabTitle.Substring(0, 50) }
$fileName = "$lessonId-$kebabTitle.md"
$filePath = Join-Path $LessonsDir $fileName

$rawTags = $Tags -split ',' | ForEach-Object { $_.Trim() } | Where-Object { $_ -ne "" }
$finalTags = @()
foreach ($t in $rawTags) {
    $k = $t.ToLower()
    if ($canonTags.Count -gt 0) {
        if ($canonTags.ContainsKey($k)) { $finalTags += $canonTags[$k] }
        else { throw "Unknown tag '$t'. Add it to tag-vocabulary.md or fix the tag." }
    } else {
        $finalTags += $t
    }
}
$finalTags = $finalTags | Select-Object -Unique
$tagsYaml = "[$($finalTags -join ', ')]"

$appliesLines = @()
foreach ($a in ($AppliesTo -split ',')) { $appliesLines += " - $($a.Trim())" }
$appliesYaml = $appliesLines -join "`r`n"

if ($Triggers) {
    $triggerLines = @()
    foreach ($t in ($Triggers -split ',')) { $triggerLines += " - $($t.Trim())" }
    $triggersYaml = "triggers:`r`n" + ($triggerLines -join "`r`n")
} else {
    $triggersYaml = "triggers:`r`n - TODO: add error messages or keywords"
}

$today = Get-Date -Format "yyyy-MM-dd"

$content = @"
---
id: $lessonId
title: $Title
status: Active
tags: $tagsYaml
introduced: $today
applies_to:
$appliesYaml
$triggersYaml
rule: $Rule
---

# $lessonId - $Title

## Symptom

TODO: Describe what happened

## Root Cause

TODO: Describe why it happened

## Wrong Approach (DO NOT REPEAT)

- TODO: What not to do

## Correct Approach

- TODO: What to do instead

## References

- Files: ``TODO``
- Journal: ``journal/$($today.Substring(0,7)).md#$today``
"@

$enc = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($filePath, ($content -replace "`r?`n", "`r`n"), $enc)

Write-Host "Created lesson: $filePath" -ForegroundColor Green
Write-Host " ID: $lessonId" -ForegroundColor Gray
Write-Host " Title: $Title" -ForegroundColor Gray
Write-Host " Tags: $tagsYaml" -ForegroundColor Gray
Write-Host ""
Write-Host "Next: run scripts\memory\rebuild-memory-index.ps1" -ForegroundColor Cyan
package/scripts/memory/installer/templates/autonomy/context_safety.py
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""
context_safety.py - Context safety guard for Mnemo retrieval packs.

Runs automatically on every retrieval/context-pack build.
Checks:
1. Duplicate snippet detection (prevents redundant context)
2. Contradiction detection (alerts on conflicting facts)
3. Low-signal suppression (filters empty/trivial content)
4. Token budget enforcement (hard cap on total context chars)
5. Sensitivity redaction (vault/secret entries stripped before delivery)

No human required: all checks are policy-driven.
"""
import re
import sqlite3
from dataclasses import dataclass, field
from typing import Optional

from autonomy.schema import get_db
from autonomy.reranker import RankedResult

DEFAULT_TOKEN_BUDGET = 6000  # chars (~1500 tokens)
MIN_CONTENT_CHARS = 20  # snippets shorter than this are suppressed
DUPLICATE_JACCARD_THRESHOLD = 0.85  # above = duplicate
CONTRADICTION_KEYWORD_PAIRS = [
    ("do not", "always"),
    ("never", "must"),
    ("disabled", "enabled"),
    ("false", "true"),
    ("forbidden", "required"),
]


@dataclass
class SafetyCheckResult:
    passed: bool
    issues: list[str] = field(default_factory=list)
    filtered_results: list[RankedResult] = field(default_factory=list)
    token_budget_used: int = 0
    token_budget_max: int = DEFAULT_TOKEN_BUDGET

    def summary(self) -> str:
        s = f"safety={'PASS' if self.passed else 'ISSUES'} used={self.token_budget_used}/{self.token_budget_max}"
        if self.issues:
            s += f" issues={len(self.issues)}"
        return s


def _jaccard(a: str, b: str) -> float:
    ta = set(re.findall(r"\w+", a.lower()))
    tb = set(re.findall(r"\w+", b.lower()))
    if not ta or not tb:
        return 0.0
    return len(ta & tb) / len(ta | tb)


def _detect_duplicates(results: list[RankedResult]) -> list[tuple[int, int]]:
    """Return (i, j) pairs where results[i] and results[j] are near-duplicates."""
    pairs: list[tuple[int, int]] = []
    for i in range(len(results)):
        for j in range(i + 1, len(results)):
            if _jaccard(results[i].content, results[j].content) >= DUPLICATE_JACCARD_THRESHOLD:
                pairs.append((i, j))
    return pairs


def _detect_contradictions(results: list[RankedResult]) -> list[tuple[int, int, str]]:
    """Return (i, j, reason) triples indicating contradicting result pairs."""
    contradictions: list[tuple[int, int, str]] = []
    for i in range(len(results)):
        for j in range(i + 1, len(results)):
            a = results[i].content.lower()
            b = results[j].content.lower()
            for neg, pos in CONTRADICTION_KEYWORD_PAIRS:
                if neg in a and pos in b:
                    contradictions.append((i, j, f"'{neg}' vs '{pos}'"))
                    break
                if pos in a and neg in b:
                    contradictions.append((i, j, f"'{pos}' vs '{neg}'"))
                    break
    return contradictions


class ContextSafetyGuard:
    def __init__(
        self,
        token_budget: int = DEFAULT_TOKEN_BUDGET,
        db: Optional[sqlite3.Connection] = None,
    ):
        self.token_budget = token_budget
        self.db = db or get_db()

    def check(self, results: list[RankedResult]) -> SafetyCheckResult:
        """
        Run all safety checks on results, return SafetyCheckResult.
        Modifies results list by removing failing entries.
        """
        issues: list[str] = []
        filtered: list[RankedResult] = []

        # 1. Sensitivity redaction (vault/secret always stripped first)
        for r in results:
            if r.memory_type == "vault":
                issues.append(f"REDACTED vault entry: {r.ref_path}")
                continue
            # Check DB sensitivity flag
            row = self.db.execute(
                "SELECT sensitivity FROM memory_units WHERE source_ref = ?",
                (r.source_file,),
            ).fetchone()
            if row and row["sensitivity"] == "secret":
                issues.append(f"REDACTED secret entry: {r.ref_path}")
                continue
            filtered.append(r)

        # 2. Low-signal suppression
        filtered = [r for r in filtered if len(r.content.strip()) >= MIN_CONTENT_CHARS]
        suppressed = len(results) - len(filtered)
        if suppressed > 0:
            issues.append(f"Suppressed {suppressed} low-signal entries (<{MIN_CONTENT_CHARS} chars)")

        # 3. Duplicate detection (remove lower-scored duplicate)
        dup_pairs = _detect_duplicates(filtered)
        to_remove: set[int] = set()
        for i, j in dup_pairs:
            # Keep higher-scored result
            remove_idx = j if filtered[i].final_score >= filtered[j].final_score else i
            to_remove.add(remove_idx)
            issues.append(f"Duplicate pair removed: {filtered[remove_idx].ref_path}")
        filtered = [r for idx, r in enumerate(filtered) if idx not in to_remove]

        # 4. Contradiction detection (warn but keep both, lower confidence)
        contradictions = _detect_contradictions(filtered)
        for i, j, reason in contradictions:
            issues.append(
                f"Contradiction detected ({reason}): {filtered[i].ref_path} vs {filtered[j].ref_path}"
            )
            # Reduce score of both to signal uncertainty
            object.__setattr__(filtered[i], "final_score", filtered[i].final_score * 0.9)
            object.__setattr__(filtered[j], "final_score", filtered[j].final_score * 0.9)

        # 5. Token budget enforcement
        budget_used = 0
        final: list[RankedResult] = []
        for r in sorted(filtered, key=lambda x: x.final_score, reverse=True):
            chars = len(r.content)
            if budget_used + chars > self.token_budget:
                issues.append(f"Token budget exceeded; truncated at {budget_used} chars")
                break
            final.append(r)
            budget_used += chars

        passed = not any("REDACTED secret" in i or "REDACTED vault" in i for i in issues)
        return SafetyCheckResult(
            passed=passed,
            issues=issues,
            filtered_results=final,
            token_budget_used=budget_used,
            token_budget_max=self.token_budget,
        )

    def build_context_pack(self, results: list[RankedResult]) -> str:
        """
        Build a formatted context pack string from safety-checked results.
        Safe to pass directly to LLM context.
        """
        check = self.check(results)
        lines: list[str] = []

        if not check.passed:
            sensitive_warnings = [i for i in check.issues if "REDACTED" in i]
            if sensitive_warnings:
                lines.append(f"[Safety] {'; '.join(sensitive_warnings)}")

        for r in check.filtered_results:
            lines.append(f"<!-- {r.ref_path} | score={r.final_score:.3f} type={r.memory_type} -->")
            lines.append(r.content.strip())
            lines.append("")

        return "\n".join(lines).strip()
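
Usage note (not part of the package): a minimal sketch of how ContextSafetyGuard might be driven, assuming the module is importable as autonomy.context_safety (consistent with the other autonomy.* imports above) and that `ranked_results` is a list of RankedResult objects produced upstream by the reranker; the 4000-char budget is an illustrative override of DEFAULT_TOKEN_BUDGET.

# Hedged sketch; `ranked_results` (list[RankedResult] from autonomy.reranker) is assumed to exist upstream.
from autonomy.context_safety import ContextSafetyGuard

guard = ContextSafetyGuard(token_budget=4000)     # tighter cap than the 6000-char default
report = guard.check(ranked_results)              # redaction, low-signal, dedup, contradictions, budget
print(report.summary())                           # e.g. "safety=PASS used=3850/4000 issues=2"
pack = guard.build_context_pack(ranked_results)   # re-runs check() and returns the LLM-ready string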
package/scripts/memory/installer/templates/autonomy/entity_resolver.py
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
entity_resolver.py - Stable entity IDs and alias resolution.

Extracts named entities from memory units (paths, modules, concepts),
assigns stable UUIDs, maintains alias mappings with confidence scores,
and propagates entity_tags back to memory_units.

No human required: alias merging is automatic with confidence thresholds.
"""
import json
import re
import sqlite3
import uuid
from dataclasses import dataclass
from typing import Optional

from autonomy.schema import get_db
from autonomy.ingest_pipeline import MemoryUnit

ALIAS_MERGE_THRESHOLD = 0.85  # confidence required to merge aliases
ENTITY_CONFIDENCE_DECAY = 0.02  # decay per cycle without reinforcement


@dataclass
class Entity:
    entity_id: str
    entity_name: str
    entity_type: str
    confidence: float


def _extract_entities_from_text(content: str, source_ref: str) -> list[tuple[str, str]]:
    """
    Heuristic entity extraction from markdown content.
    Returns list of (entity_name, entity_type) tuples.
    """
    entities: list[tuple[str, str]] = []

    # File/path references (e.g., `path/to/file.py`)
    for m in re.finditer(r"`([^`]+\.[a-z]{1,10})`", content):
        p = m.group(1)
        if "/" in p or "\\" in p:
            entities.append((p, "file"))

    # Module/class names (CamelCase words used multiple times)
    camel_matches = re.findall(r"\b([A-Z][a-zA-Z]{3,}(?:[A-Z][a-z]+)+)\b", content)
    freq: dict[str, int] = {}
    for name in camel_matches:
        freq[name] = freq.get(name, 0) + 1
    for name, count in freq.items():
        if count >= 2:
            entities.append((name, "class"))

    # Lesson references (L-XXX)
    for m in re.finditer(r"\bL-(\d{3})\b", content):
        entities.append((f"L-{m.group(1)}", "lesson"))

    # Function/method references (snake_case in backticks)
    for m in re.finditer(r"`([a-z][a-z0-9_]{3,})\(\)`", content):
        entities.append((m.group(1), "function"))

    return list(set(entities))[:30]  # deduplicate + cap


class EntityResolver:
    def __init__(self, db: Optional[sqlite3.Connection] = None):
        self.db = db or get_db()

    def resolve(self, unit: MemoryUnit) -> list[str]:
        """
        Extract entities from unit content, resolve/create IDs, update unit.
        Returns list of entity_ids assigned to this unit.
        """
        raw_entities = _extract_entities_from_text(unit.content, unit.source_ref)
        entity_ids: list[str] = []

        for entity_name, entity_type in raw_entities:
            eid = self._get_or_create(entity_name, entity_type)
            if eid:
                entity_ids.append(eid)

        if entity_ids:
            self.db.execute(
                "UPDATE memory_units SET entity_tags=?, updated_at=unixepoch('now') WHERE unit_id=?",
                (json.dumps(entity_ids), unit.unit_id),
            )
            self.db.commit()
            unit.entity_tags = entity_ids

        return entity_ids

    def _get_or_create(self, entity_name: str, entity_type: str) -> Optional[str]:
        """Resolve entity by exact match or alias, or create new."""
        # Check exact entity name
        row = self.db.execute(
            "SELECT entity_id FROM entities WHERE entity_name = ?", (entity_name,)
        ).fetchone()
        if row:
            self._reinforce(row["entity_id"])
            return row["entity_id"]

        # Check aliases
        alias_row = self.db.execute(
            "SELECT entity_id FROM entity_aliases WHERE alias_text = ?", (entity_name,)
        ).fetchone()
        if alias_row:
            self._reinforce(alias_row["entity_id"])
            return alias_row["entity_id"]

        # Try fuzzy alias match
        similar_id = self._find_similar_entity(entity_name)
        if similar_id:
            # Add as alias
            try:
                self.db.execute(
                    "INSERT INTO entity_aliases(alias_id, entity_id, alias_text, confidence) VALUES (?,?,?,?)",
                    (str(uuid.uuid4()), similar_id, entity_name, ALIAS_MERGE_THRESHOLD),
                )
                self.db.commit()
                self._reinforce(similar_id)
                return similar_id
            except sqlite3.IntegrityError:
                return similar_id

        # Create new entity
        entity_id = str(uuid.uuid4())
        try:
            self.db.execute(
                "INSERT INTO entities(entity_id, entity_name, entity_type, confidence) VALUES (?,?,?,1.0)",
                (entity_id, entity_name, entity_type),
            )
            self.db.commit()
        except sqlite3.IntegrityError:
            # Race condition: entity was created concurrently
            row = self.db.execute(
                "SELECT entity_id FROM entities WHERE entity_name = ?", (entity_name,)
            ).fetchone()
            return row["entity_id"] if row else None
        return entity_id

    def _find_similar_entity(self, name: str) -> Optional[str]:
        """
        Find an entity whose name is highly similar (token Jaccard).
        Returns entity_id if confidence >= threshold, else None.
        """
        candidates = self.db.execute(
            "SELECT entity_id, entity_name FROM entities"
        ).fetchall()

        name_tokens = set(re.findall(r"\w+", name.lower()))
        best_id = None
        best_score = 0.0

        for row in candidates:
            cand_tokens = set(re.findall(r"\w+", row["entity_name"].lower()))
            if not name_tokens or not cand_tokens:
                continue
            score = len(name_tokens & cand_tokens) / len(name_tokens | cand_tokens)
            if score > best_score:
                best_score = score
                best_id = row["entity_id"]

        return best_id if best_score >= ALIAS_MERGE_THRESHOLD else None

    def _reinforce(self, entity_id: str) -> None:
        """Increase confidence for an entity (re-observed)."""
        self.db.execute(
            "UPDATE entities SET confidence=MIN(confidence+0.01, 1.0) WHERE entity_id=?",
            (entity_id,),
        )

    def decay_stale_entities(self, min_confidence: float = 0.2) -> int:
        """
        Decay confidence of entities not reinforced recently.
        Quarantine entities below min_confidence.
        Returns number of quarantined entities.
        """
        self.db.execute(
            """
            UPDATE entities SET confidence = MAX(confidence - ?, 0.0)
            WHERE entity_id NOT IN (
                SELECT DISTINCT json_each.value
                FROM memory_units, json_each(memory_units.entity_tags)
            )
            """,
            (ENTITY_CONFIDENCE_DECAY,),
        )
        quarantined = self.db.execute(
            "SELECT COUNT(*) FROM entities WHERE confidence < ?", (min_confidence,)
        ).fetchone()[0]
        self.db.commit()
        return quarantined

    def get_entity_by_alias(self, alias: str) -> Optional[Entity]:
        """Resolve any alias or name to canonical Entity."""
        # Direct name match
        row = self.db.execute(
            "SELECT entity_id, entity_name, entity_type, confidence FROM entities WHERE entity_name = ?",
            (alias,),
        ).fetchone()
        if row:
            return Entity(**dict(row))
        # Alias lookup
        alias_row = self.db.execute(
            """
            SELECT e.entity_id, e.entity_name, e.entity_type, e.confidence
            FROM entity_aliases ea JOIN entities e ON ea.entity_id = e.entity_id
            WHERE ea.alias_text = ?
            """,
            (alias,),
        ).fetchone()
        if alias_row:
            return Entity(**dict(alias_row))
        return None
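
Usage note (not part of the package): a minimal sketch of how EntityResolver might be invoked during ingest, assuming the module is importable as autonomy.entity_resolver and that `unit` is a MemoryUnit produced by autonomy.ingest_pipeline (its constructor is not shown in this diff; resolve() only reads unit.content, unit.source_ref, and unit.unit_id).

# Hedged sketch; `unit` (a MemoryUnit from autonomy.ingest_pipeline) is assumed to exist upstream.
from autonomy.entity_resolver import EntityResolver

resolver = EntityResolver()               # defaults to the shared SQLite connection from get_db()
entity_ids = resolver.resolve(unit)       # extracts entities, writes entity_tags back to memory_units
print(f"{len(entity_ids)} entities linked to unit {unit.unit_id}")

stale = resolver.decay_stale_entities(min_confidence=0.2)   # periodic maintenance pass
print(f"{stale} entities are below the confidence floor")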