cherry-docs 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/__init__.py +0 -0
- app/repo_scope.py +24 -0
- app/services/__init__.py +0 -0
- app/services/agent_protocol.py +59 -0
- app/services/auto_promote_sessions.py +245 -0
- app/services/capture_adapters.py +89 -0
- app/services/capture_core.py +164 -0
- app/services/internal_memory_agent.py +214 -0
- app/services/memory_evidence.py +89 -0
- app/services/memory_extraction_normalize.py +134 -0
- app/services/memory_lifecycle.py +258 -0
- app/services/memory_profiles.py +88 -0
- app/services/memory_providers.py +113 -0
- app/services/memory_retrieval.py +327 -0
- app/services/memory_retrieval_scoring.py +106 -0
- app/services/memory_retrieval_text.py +113 -0
- app/services/memory_similarity.py +135 -0
- app/services/privacy.py +72 -0
- app/services/promoted_memory_answer.py +157 -0
- app/services/promoted_memory_pipeline.py +194 -0
- app/services/promoted_memory_store.py +57 -0
- cherry_docs-0.2.0.dist-info/METADATA +143 -0
- cherry_docs-0.2.0.dist-info/RECORD +42 -0
- cherry_docs-0.2.0.dist-info/WHEEL +5 -0
- cherry_docs-0.2.0.dist-info/entry_points.txt +4 -0
- cherry_docs-0.2.0.dist-info/top_level.txt +3 -0
- cherrydocs/__init__.py +3 -0
- cherrydocs/cli.py +213 -0
- cherrydocs/hook.py +27 -0
- cherrydocs/mcp.py +22 -0
- scripts/__init__.py +0 -0
- scripts/auto_promote_capture.py +63 -0
- scripts/check_size_limits.py +115 -0
- scripts/ci_auto_capture.py +289 -0
- scripts/claude_hooks/__init__.py +0 -0
- scripts/claude_hooks/state_manager.py +526 -0
- scripts/coverage_regression_gate.py +121 -0
- scripts/eval_projects.py +247 -0
- scripts/install.py +212 -0
- scripts/pr_gate_report.py +282 -0
- scripts/promptfoo_regression_gate.py +176 -0
- scripts/render_agent_prompts.py +57 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""Provider-agnostic internal memory-agent: prompt building, LLM extraction, chunked pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
from typing import Any, Protocol
|
|
8
|
+
|
|
9
|
+
from app.services.memory_extraction_normalize import (
|
|
10
|
+
MemoryCandidate,
|
|
11
|
+
MemoryExtractionResult,
|
|
12
|
+
_is_near_duplicate,
|
|
13
|
+
normalize_memory_candidates,
|
|
14
|
+
)
|
|
15
|
+
from app.services.memory_profiles import (
|
|
16
|
+
MemoryPromptProfile,
|
|
17
|
+
MemoryPromptProfileName,
|
|
18
|
+
resolve_memory_prompt_profile,
|
|
19
|
+
)
|
|
20
|
+
from app.services.memory_providers import AnthropicMemoryProvider, OllamaMemoryProvider
|
|
21
|
+
|
|
22
|
+
# Number of relevant events sent to the model per LLM call; also the default
# window size of build_recent_capture_window.
_CHUNK_SIZE = 25  # events per LLM call
# Upper bound on candidates returned by extract_memory_candidates.
_MAX_TOTAL_CANDIDATES = 12  # cap across all chunks for one session
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class MemoryModelProvider(Protocol):
    """Structural interface for any backend that can run an extraction prompt."""

    def extract(self, prompt: str) -> dict[str, Any]:
        """Run *prompt* against the model and return the parsed JSON-like payload."""
        ...
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _trim_text(value: str, limit: int = 1200) -> str:
    """Collapse whitespace in *value* and cap the result at *limit* characters.

    Runs of whitespace (including newlines) become single spaces and the
    ends are stripped. ``None`` or an empty value yields ``""``. Text longer
    than *limit* is cut and suffixed with ``"..."`` so the total length is
    exactly *limit*.

    For ``limit < 3`` there is no room for the ellipsis, so the text is hard
    cut to ``max(limit, 0)`` characters; this keeps the result within the
    limit instead of letting a negative slice index produce a longer string.
    """
    text = " ".join((value or "").split())
    if len(text) <= limit:
        return text
    if limit < 3:
        # Not enough room for "..."; never return more than `limit` chars.
        return text[: max(limit, 0)]
    return f"{text[: limit - 3]}..."
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def build_recent_capture_window(events: Iterable[dict[str, Any]], limit: int = _CHUNK_SIZE) -> list[dict[str, Any]]:
    """Return the most recent *limit* events that are worth showing to the model.

    Only the capture event types listed below are kept. Result-style events
    (shell/tool/test results) are kept even when their text is empty; every
    other kept type must carry non-blank text. At least one event is returned
    when any qualifies, because the window size is floored at 1.
    """
    captured_types = {
        "user_prompt",
        "assistant_output",
        "shell_result",
        "tool_result",
        "remember",
        "test_result",
    }
    text_optional_types = {"shell_result", "tool_result", "test_result"}

    def _is_relevant(event: dict[str, Any]) -> bool:
        kind = str(event.get("event_type") or "")
        if kind not in captured_types:
            return False
        if kind in text_optional_types:
            return True
        return bool(str(event.get("text") or "").strip())

    kept = [event for event in events if _is_relevant(event)]
    window = max(1, limit)
    return kept[-window:]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def build_internal_memory_prompt(
    events: Iterable[dict[str, Any]],
    *,
    project_hint: str | None = None,
    existing_context: list[str] | None = None,
    profile: MemoryPromptProfileName | str | None = None,
) -> str:
    """Render the extraction prompt sent to the memory model.

    The prompt combines the resolved prompt profile, previously remembered
    context, the recent capture window of *events* (one bullet per event) and
    the JSON schema the model must answer with.

    Args:
        events: Raw capture events; filtered through build_recent_capture_window.
        project_hint: Project label used in the opening line; defaults to
            "current project".
        existing_context: Already-remembered summaries, listed so the model
            can avoid repeating them. Falsy entries are dropped.
        profile: Prompt profile name passed to resolve_memory_prompt_profile.

    Returns:
        The complete prompt text.
    """
    prompt_profile = resolve_memory_prompt_profile(profile)

    def _bullets(entries: Iterable[str]) -> str:
        return "\n".join(f"- {entry}" for entry in entries)

    def _describe(event: dict[str, Any]) -> str:
        # One "key=value | key=value" line per event; optional fields are
        # emitted only when present.
        raw_meta = event.get("metadata")
        meta: dict = raw_meta if isinstance(raw_meta, dict) else {}
        body = _trim_text(str(event.get("text") or ""))
        command = _trim_text(str(event.get("command") or ""), 240)
        exit_code = event.get("exit_code")
        status = str(meta.get("verification_status") or "").strip()
        kind = str(meta.get("verification_kind") or "").strip()

        fields = ["type=" + str(event.get("event_type") or "unknown")]
        if command:
            fields.append(f"command={command}")
        if exit_code is not None:
            fields.append(f"exit_code={exit_code}")
        if kind:
            fields.append(f"verification_kind={kind}")
        if status:
            fields.append(f"verification_status={status}")
        if body:
            fields.append(f"text={body}")
        return " | ".join(fields)

    event_lines = [_describe(event) for event in build_recent_capture_window(events)]

    prior = _bullets(item for item in (existing_context or []) if item) or "- none"
    rendered_events = _bullets(event_lines) or "- none"
    project_label = project_hint or "current project"
    keep_rules = _bullets(prompt_profile.keep_rules)
    extra_rules = _bullets(prompt_profile.extra_rules)
    extraction_bias = _bullets(prompt_profile.extraction_bias)
    schema = {
        "candidates": [
            {
                "memory_type": "episodic|heuristic|procedural|factual|noise",
                "kind": "attempt|decision|conclusion|warning|procedure|pattern|fact",
                "summary": "short useful memory",
                "rationale": "why this matters for a future AI",
                "confidence": 0.0,
                "should_store": True,
                "sensitive": False,
            }
        ],
        "notes": ["optional short notes"],
    }
    return f"""You are CherryDocs' internal memory distiller for {project_label}.

Your job is to turn recent AI work traces into useful shared project memory.

Active profile: {prompt_profile.name}
Profile intent: {prompt_profile.description}

{prompt_profile.system_focus}
{keep_rules}

Memory type rules:
- factual = what is permanently true right now: a command, path, config, API, or project state
- episodic = a concrete attempt, decision, or conclusion from this work
- heuristic = a reusable warning, hidden constraint, or recurring pattern
- procedural = an explicit step-by-step way of doing something repeatedly
- noise = routine chatter, weak summaries, or anything cheap to rediscover

Kind rules:
- fact = stable project truth: entry points, config paths, APIs, project-level invariants. NOT test results or error counts.
- attempt = "we tried X"
- decision = "we chose X over Y because..."
- conclusion = "we learned/found that..."
- warning = "avoid X because..."
- pattern = "X keeps happening when..."
- procedure = a repeatable workflow with actual steps

Important:
- Never store a fact whose value changes daily: test pass counts, lint error totals, CI status, PR review status.
- Use procedure only for explicit repeatable workflows; prefer decision for refactors/pivots.
- If a candidate is noise, set memory_type=noise and should_store=false.
- If no durable memory is present, return an empty candidates list.
- Return at most 3 non-overlapping candidates.
{extra_rules}

Prefer extracting:
{extraction_bias}

Output strict JSON only.

Prior remembered context:
{prior}

Recent captured events:
{rendered_events}

Return JSON matching this schema:
{json.dumps(schema, indent=2)}
"""
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def extract_memory_candidates(
    events: Iterable[dict[str, Any]],
    *,
    provider: MemoryModelProvider,
    project_hint: str | None = None,
    existing_context: list[str] | None = None,
    profile: MemoryPromptProfileName | str | None = None,
) -> MemoryExtractionResult:
    """Run chunked extraction over a whole session and merge the results.

    Relevant events are split into chunks of ``_CHUNK_SIZE``; each chunk gets
    its own prompt and provider call. Summaries accepted from earlier chunks
    are appended to the context given to later chunks. Noise, candidates not
    marked for storage and near-duplicates of already accepted candidates are
    dropped. Processing stops once ``_MAX_TOTAL_CANDIDATES`` have been
    collected, and the result never holds more than that many.

    Returns:
        A MemoryExtractionResult with the accepted candidates and the notes
        of every chunk that was processed; empty when no event is relevant.
    """
    session_events = list(events)
    relevant = build_recent_capture_window(session_events, limit=len(session_events) or 1)
    if not relevant:
        return MemoryExtractionResult()

    kept: list[Any] = []
    context = list(existing_context or [])
    collected_notes: list[str] = []

    for start in range(0, len(relevant), _CHUNK_SIZE):
        prompt = build_internal_memory_prompt(
            relevant[start: start + _CHUNK_SIZE],
            project_hint=project_hint,
            existing_context=context,
            profile=profile,
        )
        extraction = normalize_memory_candidates(provider.extract(prompt))
        collected_notes.extend(extraction.notes)

        for candidate in extraction.candidates:
            if candidate.memory_type == "noise" or not candidate.should_store:
                continue
            if any(_is_near_duplicate(candidate, earlier) for earlier in kept):
                continue
            kept.append(candidate)
            # Later chunks see what has already been remembered in this run.
            context = [*context, candidate.summary]

        if len(kept) >= _MAX_TOTAL_CANDIDATES:
            break

    # A chunk may push the total slightly over the cap; trim to the cap.
    return MemoryExtractionResult(candidates=kept[:_MAX_TOTAL_CANDIDATES], notes=collected_notes)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
__all__ = [
    # Public API of this module. The provider classes, the candidate/result
    # models, the profile helpers and normalize_memory_candidates are
    # re-exported from sub-modules so callers import from here unchanged.
    "AnthropicMemoryProvider",
    "MemoryCandidate",
    "MemoryExtractionResult",
    "MemoryModelProvider",
    "MemoryPromptProfile",
    "MemoryPromptProfileName",
    "OllamaMemoryProvider",
    "build_internal_memory_prompt",
    "build_recent_capture_window",
    "extract_memory_candidates",
    "normalize_memory_candidates",
    "resolve_memory_prompt_profile",
]
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Evidence linking: connect memory candidates to their supporting raw capture events."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from app.services.memory_similarity import _similarity
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from app.services.memory_extraction_normalize import MemoryCandidate
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def build_event_evidence_id(event: dict) -> str:
    """Return the ``"<event_type>:<timestamp>"`` evidence id for *event*.

    A missing or falsy field is rendered as ``"unknown"``.
    """
    kind, stamp = (str(event.get(key) or "unknown") for key in ("event_type", "timestamp"))
    return f"{kind}:{stamp}"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def collect_candidate_evidence_refs(
    candidate: MemoryCandidate,
    events: list[dict],
    *,
    limit: int = 3,
) -> list[str]:
    """Rank capture events by textual overlap with *candidate* and return their ids.

    Only non-empty assistant/remember/tool/shell/test events are considered.
    An event qualifies when its similarity to the summary is at least 0.35,
    or when both the summary and the rationale similarity are at least 0.2.
    The score weights summary overlap most, and assistant output gets a small
    bonus.

    Returns:
        Up to ``max(1, limit)`` evidence ids, best match first.
    """
    evidence_types = {"assistant_output", "remember", "tool_result", "shell_result", "test_result"}
    summary_anchor = candidate.summary.strip()
    rationale_anchor = candidate.rationale.strip()
    combined_anchor = f"{summary_anchor} {rationale_anchor}".strip()

    scored: list[tuple[float, str]] = []
    for event in events:
        kind = str(event.get("event_type") or "")
        if kind not in evidence_types:
            continue
        body = str(event.get("text") or "").strip()
        if not body:
            continue

        summary_overlap = _similarity(summary_anchor, body)
        rationale_overlap = _similarity(rationale_anchor, body) if rationale_anchor else 0.0
        combined_overlap = _similarity(combined_anchor, body)

        # A weaker summary match is accepted only when the rationale agrees too.
        jointly_supported = summary_overlap >= 0.2 and rationale_overlap >= 0.2
        if summary_overlap < 0.35 and not jointly_supported:
            continue

        score = (summary_overlap * 1.7) + rationale_overlap + (combined_overlap * 0.4)
        if kind == "assistant_output":
            score += 0.2
        scored.append((score, build_event_evidence_id(event)))

    best_first = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return [evidence_id for _score, evidence_id in best_first[: max(1, limit)]]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def collect_candidate_context(
    candidate: MemoryCandidate,
    events: list[dict],
    *,
    evidence_refs: list[str] | None = None,
    max_files_per_event: int = 20,
    max_files_total: int = 5,
) -> dict:
    """Derive repo, commit and the most referenced files from a candidate's evidence.

    When *evidence_refs* is empty or omitted, the refs are computed with
    collect_candidate_evidence_refs. Repo and commit come from the first
    matching event that carries a non-blank value. Events listing no files or
    more than *max_files_per_event* files do not contribute to the file tally.

    Returns:
        ``{"repo": ..., "commit": ..., "files": [...]}`` where files are
        ordered by descending mention count, then path, capped at
        *max_files_total*.
    """
    matched_ids = set(evidence_refs or collect_candidate_evidence_refs(candidate, events))
    if not matched_ids:
        return {"repo": None, "commit": None, "files": []}

    repo: str | None = None
    commit: str | None = None
    mention_counts: dict[str, int] = {}

    for event in events:
        if build_event_evidence_id(event) not in matched_ids:
            continue
        repo = repo or (str(event.get("repo") or "").strip() or None)
        commit = commit or (str(event.get("commit") or "").strip() or None)

        listed = event.get("files")
        paths = (
            [cleaned for entry in listed if (cleaned := str(entry).strip())]
            if isinstance(listed, list)
            else []
        )
        if 0 < len(paths) <= max_files_per_event:
            for path in paths:
                mention_counts[path] = mention_counts.get(path, 0) + 1

    ranked_paths = sorted(mention_counts, key=lambda path: (-mention_counts[path], path))
    return {"repo": repo, "commit": commit, "files": ranked_paths[:max_files_total]}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Output normalization and deduplication for LLM-extracted memory candidates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
# Accepted values for MemoryCandidate.memory_type after normalization.
_VALID_MEMORY_TYPES = {"episodic", "heuristic", "procedural", "factual", "noise"}
# Accepted values for MemoryCandidate.kind; candidates with any other kind are dropped.
_VALID_KINDS = {"attempt", "decision", "conclusion", "warning", "procedure", "pattern", "fact"}
# Which kinds are consistent with each memory type ("noise" accepts every kind).
_KINDS_BY_MEMORY_TYPE = {
    "episodic": {"attempt", "decision", "conclusion"},
    "heuristic": {"warning", "pattern"},
    "procedural": {"procedure"},
    "factual": {"fact"},
    "noise": _VALID_KINDS,
}
# Memory type inferred from a kind when the reported type is invalid or inconsistent.
_MEMORY_TYPE_BY_KIND = {
    "attempt": "episodic",
    "decision": "episodic",
    "conclusion": "episodic",
    "warning": "heuristic",
    "pattern": "heuristic",
    "procedure": "procedural",
    "fact": "factual",
}
# Words ignored by _canonical_tokens when comparing candidates.
_STOPWORDS = {
    "a", "an", "and", "are", "as", "at", "be", "because", "for", "from", "how",
    "if", "in", "into", "is", "it", "of", "on", "or", "over", "that", "the",
    "this", "to", "use", "we", "with",
}
# Tokens suggesting a candidate really describes a repeatable procedure.
_PROCEDURE_HINTS = {"step", "steps", "first", "then", "next", "run", "repeat", "workflow", "checklist"}
# Tokens suggesting a "procedure" is actually a decision.
# NOTE(review): "use" is also in _STOPWORDS, so it can never appear in the
# token sets produced by _canonical_tokens and this hint never matches there.
_DECISION_HINTS = {"choose", "decide", "switch", "move", "refactor", "introduce", "keep", "adopt", "use"}
# Summaries matching this pattern (tool runs, test/error counts, PR status)
# are treated as transient and dropped by normalize_memory_candidates.
_TRANSIENT_RE = re.compile(
    r"ruff |pytest |tests? (pass|fail)|\d+ (error|test)|pr (is |was )?(clean|ready)|dead code removed",
    re.IGNORECASE,
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class MemoryCandidate(BaseModel):
    """One memory proposed by the extraction model; unknown fields are ignored."""

    model_config = ConfigDict(extra="ignore")

    # Required fields; normalize_memory_candidates lower-cases and validates
    # memory_type/kind against the module's allowed sets.
    memory_type: str
    kind: str
    summary: str
    rationale: str = ""
    # Clamped to the range [0.0, 1.0] by normalize_memory_candidates.
    confidence: float = 0.0
    # Forced to False for "noise" candidates during normalization.
    should_store: bool = True
    sensitive: bool = False
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class MemoryExtractionResult(BaseModel):
    """Container for extracted candidates plus free-form notes; unknown fields are ignored."""

    model_config = ConfigDict(extra="ignore")

    # Both lists default to empty, so MemoryExtractionResult() is a valid "nothing found" result.
    candidates: list[MemoryCandidate] = Field(default_factory=list)
    notes: list[str] = Field(default_factory=list)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _canonical_tokens(value: str) -> set[str]:
    """Return the distinct lower-case alphanumeric words of *value* used for comparison.

    Words of one or two characters and stopwords are excluded.
    """
    words = re.findall(r"[a-z0-9]+", value.lower())
    return {word for word in words if len(word) > 2 and word not in _STOPWORDS}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _is_near_duplicate(left: MemoryCandidate, right: MemoryCandidate) -> bool:
    """Tell whether two candidates say essentially the same thing.

    Compares the canonical tokens of summary plus rationale: the candidates
    are duplicates when at least 70% of the smaller token set is shared. If
    either side has no usable tokens, fall back to a case-insensitive
    comparison of the stripped summaries.
    """
    left_tokens, right_tokens = (
        _canonical_tokens(f"{item.summary} {item.rationale}") for item in (left, right)
    )
    if not (left_tokens and right_tokens):
        return left.summary.strip().lower() == right.summary.strip().lower()
    shared = len(left_tokens & right_tokens)
    # Both sets are non-empty here, so the denominator is at least 1.
    return shared / min(len(left_tokens), len(right_tokens)) >= 0.7
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def normalize_memory_candidates(payload: dict[str, Any]) -> MemoryExtractionResult:
    """Validate, clean, rank and de-duplicate a raw extraction payload.

    A payload that fails validation yields an empty result. Each candidate
    is dropped when its kind is unknown, its summary is empty, or its summary
    matches the transient-status pattern. The memory type is inferred from
    the kind when it is invalid or inconsistent with it. A "procedure" with
    decision-like wording and no procedure-like wording is reclassified as an
    episodic decision. Confidence is clamped to [0, 1] and noise is never
    stored.

    Returns:
        At most 3 candidates, storable and high-confidence first, with
        near-duplicates removed, plus the non-blank notes of the payload.
    """
    try:
        parsed = MemoryExtractionResult.model_validate(payload)
    except Exception:
        return MemoryExtractionResult()

    def _clean(item: MemoryCandidate) -> MemoryCandidate | None:
        # Returns the normalized copy of `item`, or None when it must be dropped.
        kind = (item.kind or "").strip().lower()
        if kind not in _VALID_KINDS:
            return None

        memory_type = (item.memory_type or "").strip().lower()
        if memory_type not in _VALID_MEMORY_TYPES:
            memory_type = _MEMORY_TYPE_BY_KIND.get(kind, "")
        if not memory_type:
            return None
        if kind not in _KINDS_BY_MEMORY_TYPE.get(memory_type, set()):
            # Type and kind disagree: trust the kind when it maps to a type.
            memory_type = _MEMORY_TYPE_BY_KIND.get(kind) or memory_type

        summary = " ".join(item.summary.split()).strip()
        if not summary or _TRANSIENT_RE.search(summary):
            return None
        rationale = " ".join(item.rationale.split()).strip()

        if memory_type == "procedural" and kind == "procedure":
            words = _canonical_tokens(f"{summary} {rationale}")
            if not (words & _PROCEDURE_HINTS) and words & _DECISION_HINTS:
                memory_type, kind = "episodic", "decision"

        should_store = False if memory_type == "noise" else bool(item.should_store)
        return item.model_copy(
            update={
                "memory_type": memory_type,
                "kind": kind,
                "summary": summary,
                "rationale": rationale,
                "confidence": max(0.0, min(1.0, float(item.confidence or 0.0))),
                "should_store": should_store,
            }
        )

    cleaned = [
        candidate
        for item in parsed.candidates
        if (candidate := _clean(item)) is not None
    ]
    ranked = sorted(
        cleaned,
        key=lambda c: (not c.should_store, c.memory_type == "noise", -c.confidence, len(c.summary)),
    )

    unique: list[MemoryCandidate] = []
    for candidate in ranked:
        if any(_is_near_duplicate(candidate, kept) for kept in unique):
            continue
        unique.append(candidate)
        if len(unique) >= 3:
            break

    return MemoryExtractionResult(
        candidates=unique,
        notes=[note for note in parsed.notes if str(note).strip()],
    )
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Deterministic memory lifecycle helpers for capture-first CherryDocs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from typing import Literal
|
|
7
|
+
from uuid import uuid4
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
from app.services.memory_evidence import (
|
|
12
|
+
build_event_evidence_id,
|
|
13
|
+
collect_candidate_context,
|
|
14
|
+
collect_candidate_evidence_refs,
|
|
15
|
+
)
|
|
16
|
+
from app.services.memory_extraction_normalize import MemoryCandidate
|
|
17
|
+
from app.services.memory_similarity import (
|
|
18
|
+
_best_matching_record,
|
|
19
|
+
_is_conflict,
|
|
20
|
+
_is_repeatable_procedure,
|
|
21
|
+
_should_merge,
|
|
22
|
+
_should_supersede,
|
|
23
|
+
topic_key_for_text,
|
|
24
|
+
)
|
|
25
|
+
from app.services.privacy import detect_blocking_secret_fields, redact_text
|
|
26
|
+
|
|
27
|
+
# Allowed values for MemoryRecord.status.
MemoryStatus = Literal["active", "tentative", "superseded", "stale", "contradicted", "archived"]
# Allowed values for MemoryLifecycleDecision.action.
MemoryAction = Literal["create", "merge", "supersede", "conflict", "ignore"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class MemoryRecord(BaseModel):
    """A stored memory with its lifecycle status and provenance; unknown fields are ignored."""

    model_config = ConfigDict(extra="ignore")

    # Identity and content. _make_record generates ids of the form "mem-<12 hex chars>".
    memory_id: str
    memory_type: str
    kind: str
    summary: str
    rationale: str = ""
    confidence: float = 0.0
    # _make_record starts records as "tentative" below 0.65 confidence, else "active".
    status: MemoryStatus = "active"
    # Set by _make_record from topic_key_for_text(summary, rationale).
    topic_key: str = ""
    # Provenance of the memory.
    project_id: str | None = None
    source: str | None = None
    session_id: str | None = None
    repo: str | None = None
    branch: str | None = None
    commit: str | None = None
    files: list[str] = Field(default_factory=list)
    evidence: list[str] = Field(default_factory=list)
    derived_from_paths: list[str] = Field(default_factory=list)
    # Supersession links: ids this record replaces, and the id that replaced it.
    supersedes: list[str] = Field(default_factory=list)
    superseded_by: str | None = None
    # Usage counters and timestamps; not modified anywhere in this module.
    retrieval_count: int = 0
    evidence_hit_count: int = 0
    last_retrieved_at: str | None = None
    last_evidence_hit_at: str | None = None
    # ISO-8601 strings, defaulting to the current UTC time at construction.
    created_at: str = Field(default_factory=lambda: datetime.now(UTC).isoformat())
    updated_at: str = Field(default_factory=lambda: datetime.now(UTC).isoformat())
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class MemoryLifecycleDecision(BaseModel):
    """What was decided for one candidate during promotion; unknown fields are ignored."""

    model_config = ConfigDict(extra="ignore")

    action: MemoryAction
    # Human-readable explanation, e.g. "merged into <memory_id>".
    reason: str
    candidate: MemoryCandidate
    # Id of the existing record that was matched; left as None for "create" and "ignore".
    target_memory_id: str | None = None
    # The record created or updated by the decision; left as None for "ignore".
    resulting_record: MemoryRecord | None = None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class MemoryPromotionResult(BaseModel):
    """Outcome of promote_memory_candidates; unknown fields are ignored."""

    model_config = ConfigDict(extra="ignore")

    # One decision per input candidate, in input order.
    decisions: list[MemoryLifecycleDecision] = Field(default_factory=list)
    # The full record set after all decisions were applied and aged.
    records: list[MemoryRecord] = Field(default_factory=list)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def build_existing_memory_context(records: list[MemoryRecord], limit: int = 8) -> list[str]:
    """Summarize live memories as one-line strings for use as prompt context.

    Only "active" and "tentative" records are included, ordered with active
    records first, then by descending confidence, then by ``updated_at``.
    At most ``max(1, limit)`` lines are returned. Each line reads
    ``"<memory_type>/<kind>: <summary>"``, followed by ``" | why=..."`` when
    a rationale exists and ``" | status=..."`` for non-active records.
    """
    live = [record for record in records if record.status in {"active", "tentative"}]
    live.sort(key=lambda item: (item.status != "active", -item.confidence, item.updated_at))

    lines: list[str] = []
    for record in live[: max(1, limit)]:
        segments = [f"{record.memory_type}/{record.kind}: {record.summary}"]
        if record.rationale:
            segments.append(f"why={record.rationale}")
        if record.status != "active":
            segments.append(f"status={record.status}")
        lines.append(" | ".join(segments))
    return lines
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def sanitize_candidate(candidate: MemoryCandidate) -> MemoryCandidate:
    """Return a privacy-checked copy of *candidate*.

    When detect_blocking_secret_fields reports findings in the summary or
    rationale, the copy is flagged ``sensitive`` and ``should_store`` is
    cleared. Otherwise the copy carries the summary and rationale passed
    through redact_text. The input object is not modified.
    """
    text_fields = {"summary": candidate.summary, "rationale": candidate.rationale}
    if detect_blocking_secret_fields(text_fields):
        return candidate.model_copy(update={"sensitive": True, "should_store": False})

    redacted = {name: redact_text(text) for name, text in text_fields.items()}
    return candidate.model_copy(update=redacted)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def age_memory_records(
|
|
106
|
+
records: list[MemoryRecord],
|
|
107
|
+
*,
|
|
108
|
+
now: datetime | None = None,
|
|
109
|
+
stale_after_days: int = 180,
|
|
110
|
+
) -> list[MemoryRecord]:
|
|
111
|
+
reference = now or datetime.now(UTC)
|
|
112
|
+
aged: list[MemoryRecord] = []
|
|
113
|
+
for record in records:
|
|
114
|
+
if record.status not in {"active", "tentative"}:
|
|
115
|
+
aged.append(record)
|
|
116
|
+
continue
|
|
117
|
+
try:
|
|
118
|
+
updated_at = datetime.fromisoformat(record.updated_at)
|
|
119
|
+
except Exception:
|
|
120
|
+
aged.append(record)
|
|
121
|
+
continue
|
|
122
|
+
age_days = (reference - updated_at).days
|
|
123
|
+
aged.append(record.model_copy(update={"status": "stale"}) if age_days >= stale_after_days else record)
|
|
124
|
+
return aged
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _make_record(
    candidate: MemoryCandidate,
    *,
    project_id: str | None,
    source: str | None,
    session_id: str | None,
    repo: str | None,
    branch: str | None,
    commit: str | None,
    files: list[str] | None,
    evidence: list[str] | None,
    derived_from_paths: list[str] | None = None,
    supersedes: list[str] | None = None,
) -> MemoryRecord:
    """Build a fresh MemoryRecord from *candidate* and its provenance.

    The record gets a new ``mem-<12 hex chars>`` id and a topic key derived
    from the summary and rationale. It starts as "tentative" when the
    candidate's confidence is below 0.65 and as "active" otherwise. List
    arguments are copied; ``None`` becomes an empty list.
    """
    content = {
        "memory_type": candidate.memory_type,
        "kind": candidate.kind,
        "summary": candidate.summary,
        "rationale": candidate.rationale,
        "confidence": candidate.confidence,
    }
    provenance = {
        "project_id": project_id,
        "source": source,
        "session_id": session_id,
        "repo": repo,
        "branch": branch,
        "commit": commit,
    }
    collections = {
        "files": list(files or []),
        "evidence": list(evidence or []),
        "derived_from_paths": list(derived_from_paths or []),
        "supersedes": list(supersedes or []),
    }
    return MemoryRecord(
        memory_id=f"mem-{uuid4().hex[:12]}",
        status="tentative" if candidate.confidence < 0.65 else "active",
        topic_key=topic_key_for_text(candidate.summary, candidate.rationale),
        **content,
        **provenance,
        **collections,
    )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _decide_for_candidate(
    candidate: MemoryCandidate,
    records: list[MemoryRecord],
    *,
    make_kwargs: dict,
) -> tuple[MemoryLifecycleDecision, list[MemoryRecord]]:
    """Decide how one candidate affects the record set.

    The best matching existing record (if any) is checked in this order:
    supersede, merge, conflict. When none applies, or nothing matches, a new
    record is created.

    Args:
        candidate: The sanitized candidate to place.
        records: Current records; the list itself is not modified.
        make_kwargs: Provenance keyword arguments forwarded to _make_record;
            repo, commit, files and evidence are also used when merging.

    Returns:
        ``(decision, updated_records)`` where updated_records is a new list.
    """
    best = _best_matching_record(candidate, records)

    def _swap_best(new_version: MemoryRecord) -> list[MemoryRecord]:
        # Replace the matched record by id, keeping every other record in place.
        return [new_version if existing.memory_id == best.memory_id else existing for existing in records]

    if best:
        if _should_supersede(candidate, best):
            replacement = _make_record(candidate, supersedes=[best.memory_id], **make_kwargs)
            retired = best.model_copy(update={"status": "superseded", "superseded_by": replacement.memory_id})
            decision = MemoryLifecycleDecision(
                action="supersede",
                reason=f"supersedes {best.memory_id}",
                candidate=candidate,
                target_memory_id=best.memory_id,
                resulting_record=replacement,
            )
            return decision, [*_swap_best(retired), replacement]

        if _should_merge(candidate, best):
            extra_files = make_kwargs.get("files") or []
            extra_evidence = make_kwargs.get("evidence") or []
            merged = best.model_copy(update={
                "confidence": max(best.confidence, candidate.confidence),
                "updated_at": datetime.now(UTC).isoformat(),
                "rationale": candidate.rationale or best.rationale,
                "repo": make_kwargs.get("repo") or best.repo,
                "commit": make_kwargs.get("commit") or best.commit,
                # dict.fromkeys de-duplicates while preserving first-seen order.
                "files": list(dict.fromkeys([*best.files, *extra_files])),
                "evidence": list(dict.fromkeys([*best.evidence, *extra_evidence])),
            })
            decision = MemoryLifecycleDecision(
                action="merge",
                reason=f"merged into {best.memory_id}",
                candidate=candidate,
                target_memory_id=best.memory_id,
                resulting_record=merged,
            )
            return decision, _swap_best(merged)

        if _is_conflict(candidate, best):
            disputed = _make_record(candidate, **make_kwargs)
            disputed.status = "contradicted"
            decision = MemoryLifecycleDecision(
                action="conflict",
                reason=f"conflicts with {best.memory_id}",
                candidate=candidate,
                target_memory_id=best.memory_id,
                resulting_record=disputed,
            )
            return decision, [*records, disputed]

    created = _make_record(candidate, **make_kwargs)
    decision = MemoryLifecycleDecision(
        action="create",
        reason="new durable memory candidate",
        candidate=candidate,
        resulting_record=created,
    )
    return decision, [*records, created]
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def promote_memory_candidates(
    candidates: list[MemoryCandidate],
    *,
    existing_records: list[MemoryRecord] | None = None,
    project_id: str | None = None,
    source: str | None = None,
    session_id: str | None = None,
    repo: str | None = None,
    branch: str | None = None,
    commit: str | None = None,
    files: list[str] | None = None,
    evidence: list[str] | None = None,
) -> MemoryPromotionResult:
    """Apply *candidates* to the existing records and report what happened.

    Each candidate is sanitized first. Sensitive, non-storable and noise
    candidates are recorded as "ignore". A "procedure" that
    _is_repeatable_procedure rejects is reclassified as an episodic decision.
    Every remaining candidate is placed with _decide_for_candidate, so later
    candidates see the effects of earlier ones. The provenance arguments are
    shared by all records created or merged in this call.

    Returns:
        A MemoryPromotionResult holding one decision per candidate and the
        resulting records after aging. *existing_records* is deep-copied and
        never modified.
    """
    working = [existing.model_copy(deep=True) for existing in (existing_records or [])]
    decisions: list[MemoryLifecycleDecision] = []
    make_kwargs = {
        "project_id": project_id,
        "source": source,
        "session_id": session_id,
        "repo": repo,
        "branch": branch,
        "commit": commit,
        "files": files,
        "evidence": evidence,
    }

    for proposed in candidates:
        candidate = sanitize_candidate(proposed)

        if candidate.sensitive or not candidate.should_store or candidate.memory_type == "noise":
            ignored = MemoryLifecycleDecision(
                action="ignore", reason="noise or sensitive", candidate=candidate,
            )
            decisions.append(ignored)
            continue

        claims_procedure = candidate.memory_type == "procedural" and candidate.kind == "procedure"
        if claims_procedure and not _is_repeatable_procedure(candidate.summary, candidate.rationale):
            candidate = candidate.model_copy(update={"memory_type": "episodic", "kind": "decision"})

        decision, working = _decide_for_candidate(candidate, working, make_kwargs=make_kwargs)
        decisions.append(decision)

    return MemoryPromotionResult(decisions=decisions, records=age_memory_records(working))
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# Public API of this module. build_event_evidence_id, collect_candidate_context
# and collect_candidate_evidence_refs are re-exported from
# app.services.memory_evidence so callers can import them from here unchanged.
__all__ = [
    "MemoryLifecycleDecision",
    "MemoryPromotionResult",
    "MemoryRecord",
    "age_memory_records",
    "build_event_evidence_id",
    "build_existing_memory_context",
    "collect_candidate_context",
    "collect_candidate_evidence_refs",
    "promote_memory_candidates",
    "sanitize_candidate",
]
|