cherry-docs 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/__init__.py +0 -0
- app/repo_scope.py +24 -0
- app/services/__init__.py +0 -0
- app/services/agent_protocol.py +59 -0
- app/services/auto_promote_sessions.py +245 -0
- app/services/capture_adapters.py +89 -0
- app/services/capture_core.py +164 -0
- app/services/internal_memory_agent.py +214 -0
- app/services/memory_evidence.py +89 -0
- app/services/memory_extraction_normalize.py +134 -0
- app/services/memory_lifecycle.py +258 -0
- app/services/memory_profiles.py +88 -0
- app/services/memory_providers.py +113 -0
- app/services/memory_retrieval.py +327 -0
- app/services/memory_retrieval_scoring.py +106 -0
- app/services/memory_retrieval_text.py +113 -0
- app/services/memory_similarity.py +135 -0
- app/services/privacy.py +72 -0
- app/services/promoted_memory_answer.py +157 -0
- app/services/promoted_memory_pipeline.py +194 -0
- app/services/promoted_memory_store.py +57 -0
- cherry_docs-0.2.0.dist-info/METADATA +143 -0
- cherry_docs-0.2.0.dist-info/RECORD +42 -0
- cherry_docs-0.2.0.dist-info/WHEEL +5 -0
- cherry_docs-0.2.0.dist-info/entry_points.txt +4 -0
- cherry_docs-0.2.0.dist-info/top_level.txt +3 -0
- cherrydocs/__init__.py +3 -0
- cherrydocs/cli.py +213 -0
- cherrydocs/hook.py +27 -0
- cherrydocs/mcp.py +22 -0
- scripts/__init__.py +0 -0
- scripts/auto_promote_capture.py +63 -0
- scripts/check_size_limits.py +115 -0
- scripts/ci_auto_capture.py +289 -0
- scripts/claude_hooks/__init__.py +0 -0
- scripts/claude_hooks/state_manager.py +526 -0
- scripts/coverage_regression_gate.py +121 -0
- scripts/eval_projects.py +247 -0
- scripts/install.py +212 -0
- scripts/pr_gate_report.py +282 -0
- scripts/promptfoo_regression_gate.py +176 -0
- scripts/render_agent_prompts.py +57 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Text/token helpers for memory retrieval — tokenization, overlap, question mode, file matching."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
_STOPWORDS = {
|
|
8
|
+
"a",
|
|
9
|
+
"an",
|
|
10
|
+
"and",
|
|
11
|
+
"are",
|
|
12
|
+
"as",
|
|
13
|
+
"at",
|
|
14
|
+
"be",
|
|
15
|
+
"because",
|
|
16
|
+
"for",
|
|
17
|
+
"from",
|
|
18
|
+
"how",
|
|
19
|
+
"i",
|
|
20
|
+
"if",
|
|
21
|
+
"in",
|
|
22
|
+
"into",
|
|
23
|
+
"is",
|
|
24
|
+
"it",
|
|
25
|
+
"of",
|
|
26
|
+
"on",
|
|
27
|
+
"or",
|
|
28
|
+
"that",
|
|
29
|
+
"the",
|
|
30
|
+
"this",
|
|
31
|
+
"to",
|
|
32
|
+
"was",
|
|
33
|
+
"what",
|
|
34
|
+
"why",
|
|
35
|
+
"with",
|
|
36
|
+
}
|
|
37
|
+
_RECENCY_QUERY_TERMS = {"latest", "recent", "newest", "changed", "change", "now", "currently", "just"}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _stem(token: str) -> str:
|
|
41
|
+
"""Minimal English stemmer: strip common suffixes so plural/verb forms match."""
|
|
42
|
+
for suffix in ("tions", "tion", "ings", "ing", "ions", "ion", "ies", "ed", "es", "ly", "s"):
|
|
43
|
+
if token.endswith(suffix) and len(token) - len(suffix) > 2:
|
|
44
|
+
return token[: -len(suffix)]
|
|
45
|
+
return token
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _tokens(value: str) -> set[str]:
|
|
49
|
+
raw = {
|
|
50
|
+
token
|
|
51
|
+
for token in re.findall(r"[a-z0-9]+", value.lower())
|
|
52
|
+
if token and token not in _STOPWORDS and len(token) > 2
|
|
53
|
+
}
|
|
54
|
+
return {_stem(t) for t in raw}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _score_overlap(left: str, right: str) -> float:
    """Score token overlap between two texts in the range 0.0 to 1.0.

    The score is the number of shared tokens divided by the size of the
    smaller token set; it is 0.0 when either side has no tokens.
    """
    a, b = _tokens(left), _tokens(right)
    if not (a and b):
        return 0.0
    shared = a & b
    smaller = min(len(a), len(b))
    return len(shared) / max(1, smaller)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _question_mode(question: str) -> str:
|
|
67
|
+
lowered = question.lower()
|
|
68
|
+
if "why" in lowered:
|
|
69
|
+
return "why"
|
|
70
|
+
if "how" in lowered:
|
|
71
|
+
return "how"
|
|
72
|
+
if "fail" in lowered:
|
|
73
|
+
return "what_failed"
|
|
74
|
+
if "what" in lowered or "which" in lowered or "state" in lowered or "current" in lowered:
|
|
75
|
+
return "what"
|
|
76
|
+
return "generic"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _wants_recent_context(question: str) -> bool:
    """Report whether the question contains any recency term.

    Matching is by case-insensitive substring, so a term also matches inside
    a longer word (for example "now" inside "know").
    """
    text = question.lower()
    for term in _RECENCY_QUERY_TERMS:
        if term in text:
            return True
    return False
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _trim_text(value: str, limit: int = 280) -> str:
|
|
85
|
+
text = " ".join((value or "").split())
|
|
86
|
+
if len(text) <= limit:
|
|
87
|
+
return text
|
|
88
|
+
return f"{text[: limit - 3]}..."
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _file_query_hint(file_path: str | None) -> str:
|
|
92
|
+
if not file_path:
|
|
93
|
+
return ""
|
|
94
|
+
normalized = file_path.strip().lower().strip("/")
|
|
95
|
+
if not normalized:
|
|
96
|
+
return ""
|
|
97
|
+
parts = [p for p in re.split(r"[/_.-]+", normalized) if p and len(p) > 2]
|
|
98
|
+
hints = set(parts) | {p[:-1] for p in parts if p.endswith("s") and len(p) > 4}
|
|
99
|
+
return " ".join(sorted(hints))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _file_overlap(file_path: str | None, files: list[str]) -> float:
|
|
103
|
+
if not file_path:
|
|
104
|
+
return 0.0
|
|
105
|
+
normalized = file_path.strip().lower().lstrip("/")
|
|
106
|
+
if not normalized:
|
|
107
|
+
return 0.0
|
|
108
|
+
lowered = [str(path).strip().lower().lstrip("/") for path in files if str(path).strip()]
|
|
109
|
+
if normalized in lowered:
|
|
110
|
+
return 1.0
|
|
111
|
+
if any(path.endswith(normalized) or normalized.endswith(path) for path in lowered):
|
|
112
|
+
return 0.7
|
|
113
|
+
return 0.0
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Similarity and matching heuristics for memory lifecycle decisions.
|
|
2
|
+
|
|
3
|
+
Uses TYPE_CHECKING for MemoryRecord to avoid circular imports at runtime
|
|
4
|
+
(memory_lifecycle imports these helpers, and MemoryRecord lives there).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
from app.services.memory_extraction_normalize import MemoryCandidate
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from app.services.memory_lifecycle import MemoryRecord
|
|
16
|
+
|
|
17
|
+
_STOPWORDS = {
|
|
18
|
+
"a", "an", "and", "are", "as", "at", "be", "because", "for", "from",
|
|
19
|
+
"how", "if", "in", "into", "is", "it", "of", "on", "or", "that", "the",
|
|
20
|
+
"this", "to", "use", "using", "we", "with",
|
|
21
|
+
}
|
|
22
|
+
_NEGATIVE_HINTS = {"avoid", "reject", "deprecated", "legacy", "superseded", "replace", "replaced", "drop"}
|
|
23
|
+
_POSITIVE_HINTS = {"choose", "chosen", "use", "using", "adopt", "keep", "kept", "implement", "switch", "move"}
|
|
24
|
+
_PROCEDURE_HINTS = {"first", "then", "next", "workflow", "runbook", "checklist", "repeatable", "repeat"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _normalize_tokens(value: str) -> list[str]:
|
|
28
|
+
return [
|
|
29
|
+
token
|
|
30
|
+
for token in re.findall(r"[a-z0-9]+", value.lower())
|
|
31
|
+
if token and token not in _STOPWORDS and len(token) > 2
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _raw_tokens(value: str) -> set[str]:
|
|
36
|
+
return {token for token in re.findall(r"[a-z0-9]+", value.lower()) if token}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def topic_key_for_text(*parts: str, limit: int = 4) -> str:
    """Build a short topic key from the first distinct content words of ``parts``.

    Parts are scanned in order and each normalized token is kept the first
    time it appears. Scanning stops as soon as ``limit`` tokens have been
    collected. The tokens are returned joined by single spaces.
    """
    # A dict preserves insertion order and gives O(1) duplicate checks.
    ordered: dict[str, None] = {}
    for part in parts:
        for token in _normalize_tokens(part):
            if token in ordered:
                continue
            ordered[token] = None
            if len(ordered) >= limit:
                return " ".join(ordered)
    return " ".join(ordered)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _similarity(left: str, right: str) -> float:
    """Score the overlap of normalized tokens between two texts (0.0 to 1.0).

    The score is the count of shared distinct tokens divided by the size of
    the smaller token set; it is 0.0 when either side has no tokens.
    """
    a = set(_normalize_tokens(left))
    b = set(_normalize_tokens(right))
    if not (a and b):
        return 0.0
    shared = a & b
    smaller = min(len(a), len(b))
    return len(shared) / max(1, smaller)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _candidate_text(candidate: MemoryCandidate) -> str:
|
|
63
|
+
return f"{candidate.summary} {candidate.rationale}".strip()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _record_text(record: MemoryRecord) -> str:
|
|
67
|
+
return f"{record.summary} {record.rationale}".strip()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _has_any_hint(text: str, hints: set[str]) -> bool:
    """Report whether any whole word of ``text`` appears in ``hints``."""
    words = _raw_tokens(text)
    return not words.isdisjoint(hints)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _is_repeatable_procedure(summary: str, rationale: str) -> bool:
    """Report whether the combined summary and rationale contain a procedure hint word."""
    combined = f"{summary} {rationale}".strip()
    return _has_any_hint(combined, _PROCEDURE_HINTS)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _is_conflict(candidate: MemoryCandidate, record: MemoryRecord) -> bool:
    """Report whether the candidate and record disagree about a shared topic.

    Both must have the same memory type and be at least 0.3 similar (on full
    text or on summaries alone). A conflict is then flagged when one side
    carries a negative hint while the other carries a positive hint.
    """
    if candidate.memory_type != record.memory_type:
        return False

    new_text = _candidate_text(candidate)
    old_text = _record_text(record)
    closest = max(
        _similarity(new_text, old_text),
        _similarity(candidate.summary, record.summary),
    )
    if closest < 0.3:
        return False

    new_negative = _has_any_hint(new_text, _NEGATIVE_HINTS)
    old_negative = _has_any_hint(old_text, _NEGATIVE_HINTS)
    new_positive = _has_any_hint(new_text, _POSITIVE_HINTS)
    old_positive = _has_any_hint(old_text, _POSITIVE_HINTS)
    if new_negative and old_positive:
        return True
    return new_positive and old_negative
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _should_supersede(candidate: MemoryCandidate, record: MemoryRecord) -> bool:
    """Decide whether the candidate should replace an existing record.

    Requires an active record, text similarity of at least 0.45, candidate
    confidence no more than 0.05 below the record's, and a replacement-style
    word in the candidate's text.
    """
    if record.status != "active":
        return False
    text = _candidate_text(candidate)
    too_different = _similarity(text, _record_text(record)) < 0.45
    if too_different or candidate.confidence + 0.05 < record.confidence:
        return False
    replacement_words = {"replace", "replaced", "switch", "superseded", "migrate", "deprecated"}
    return _has_any_hint(text, replacement_words)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _should_merge(candidate: MemoryCandidate, record: MemoryRecord) -> bool:
    """Decide whether the candidate duplicates the record closely enough to merge.

    Memory type and kind must both match. Full-text similarity of 0.7 or
    more always merges. For "decision" candidates a looser rule also
    applies: merge when summary, rationale or topic-key similarity reaches
    0.4.
    """
    if candidate.memory_type != record.memory_type or candidate.kind != record.kind:
        return False
    if _similarity(_candidate_text(candidate), _record_text(record)) >= 0.7:
        return True
    if candidate.kind != "decision":
        return False

    scores = (
        _similarity(candidate.summary, record.summary),
        _similarity(candidate.rationale, record.rationale),
        _similarity(
            topic_key_for_text(candidate.summary, candidate.rationale),
            topic_key_for_text(record.summary, record.rationale),
        ),
    )
    return max(scores) >= 0.4
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _best_matching_record(
    candidate: MemoryCandidate, records: list[MemoryRecord]
) -> MemoryRecord | None:
    """Return the record most similar to the candidate, or ``None``.

    Similarity is measured on combined summary and rationale text. ``None``
    is returned when there are no records or every score is 0.0; on ties the
    earliest record wins.
    """
    query = _candidate_text(candidate)
    scored = [(_similarity(query, _record_text(rec)), rec) for rec in records]
    if not scored:
        return None
    # max() keeps the first element among equals, matching a strict ">" scan.
    top_score, top_record = max(scored, key=lambda pair: pair[0])
    return top_record if top_score > 0.0 else None
|
app/services/privacy.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Privacy and redaction: detect and redact secrets before persistence."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
_REDACT_PATTERNS = {
|
|
8
|
+
"EMAIL": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
|
|
9
|
+
"API_KEY": r"(?i)(api[_-]?key|token|secret|password|auth)['\"]?\s*[:=]\s*['\"]?([a-zA-Z0-9_\-]{20,})['\"]?",
|
|
10
|
+
"JWT": r'ey[A-Za-z0-9-_=]+\.[A-Za-z0-9-_=]+\.?[A-Za-z0-9-_.+/=]*',
|
|
11
|
+
"GROQ": r'gsk_[a-zA-Z0-9]{20,}',
|
|
12
|
+
"OPENAI": r'sk-[a-zA-Z0-9\-_]{20,}',
|
|
13
|
+
"GITHUB": r'gh[pousr]_[a-zA-Z0-9]{36}',
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
# High-confidence blocking patterns — used to fail-closed before persistence.
|
|
17
|
+
_BLOCKING_PATTERNS: dict[str, str] = {
|
|
18
|
+
"openai_like_key": r"\bsk-[A-Za-z0-9_\-]{20,}\b",
|
|
19
|
+
"groq_key": r"\bgsk_[A-Za-z0-9]{20,}\b",
|
|
20
|
+
"github_pat": r"\bgh[pousr]_[A-Za-z0-9]{20,}\b",
|
|
21
|
+
"huggingface_token": r"\bhf_[A-Za-z0-9]{20,}\b",
|
|
22
|
+
"aws_access_key": r"\bAKIA[0-9A-Z]{16}\b",
|
|
23
|
+
"private_key_block": r"-----BEGIN [A-Z ]+PRIVATE KEY-----",
|
|
24
|
+
"credential_assignment": (
|
|
25
|
+
r"(?i)\b(api[_-]?key|access[_-]?token|refresh[_-]?token|secret|password|passwd|authorization)\b"
|
|
26
|
+
r"\s*[:=]\s*[\"']?(?:bearer\s+)?[A-Za-z0-9._\-]{16,}[\"']?"
|
|
27
|
+
),
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _scan_text_for_blocking_secret(text: str) -> list[str]:
    """Return the labels of all blocking patterns found in ``text``, in table order."""
    hits: list[str] = []
    for label, pattern in _BLOCKING_PATTERNS.items():
        if re.search(pattern, text):
            hits.append(label)
    return hits
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _walk_payload_for_secrets(value: Any, path: str, findings: list[str], max_hits: int) -> None:
|
|
36
|
+
if len(findings) >= max_hits:
|
|
37
|
+
return
|
|
38
|
+
if isinstance(value, str):
|
|
39
|
+
labels = _scan_text_for_blocking_secret(value)
|
|
40
|
+
if labels:
|
|
41
|
+
findings.append(f"{path} ({', '.join(sorted(set(labels)))})")
|
|
42
|
+
elif isinstance(value, dict):
|
|
43
|
+
for key, nested in value.items():
|
|
44
|
+
_walk_payload_for_secrets(nested, f"{path}.{key}", findings, max_hits)
|
|
45
|
+
if len(findings) >= max_hits:
|
|
46
|
+
return
|
|
47
|
+
elif isinstance(value, list):
|
|
48
|
+
for idx, nested in enumerate(value):
|
|
49
|
+
_walk_payload_for_secrets(nested, f"{path}[{idx}]", findings, max_hits)
|
|
50
|
+
if len(findings) >= max_hits:
|
|
51
|
+
return
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def detect_blocking_secret_fields(data: dict[str, Any], max_hits: int = 8) -> list[str]:
    """List the payload paths whose string values match a blocking secret pattern.

    Paths are rooted at "payload"; at most ``max_hits`` entries are collected.
    """
    hits: list[str] = []
    _walk_payload_for_secrets(data, "payload", hits, max_hits)
    return hits
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def redact_text(text: str) -> str:
    """Mask secrets and email addresses in ``text`` with placeholder tags.

    Falsy input is returned unchanged. Substitutions are applied in a fixed
    order: provider-specific keys first, then generic credential
    assignments, then emails and JWT-like tokens.
    """
    if not text:
        return text

    steps = (
        ("GROQ", "[REDACTED_GROQ_KEY]"),
        ("OPENAI", "[REDACTED_OPENAI_KEY]"),
        ("GITHUB", "[REDACTED_GITHUB_TOKEN]"),
        # Keep the credential name so the redacted text stays readable.
        ("API_KEY", lambda m: f"{m.group(1)}: [REDACTED_SECRET]"),
        ("EMAIL", "[REDACTED_EMAIL]"),
        ("JWT", "[REDACTED_JWT]"),
    )
    result = text
    for key, replacement in steps:
        result = re.sub(_REDACT_PATTERNS[key], replacement, result)
    return result
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Promoted-memory answer helpers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from app.services.auto_promote_sessions import (
|
|
5
|
+
AutoPromotionPolicy,
|
|
6
|
+
AutoPromotionRunReport,
|
|
7
|
+
auto_promote_captured_sessions,
|
|
8
|
+
)
|
|
9
|
+
from app.services.capture_core import LocalCaptureBuffer
|
|
10
|
+
from app.services.internal_memory_agent import MemoryModelProvider
|
|
11
|
+
from app.services.memory_providers import resolve_provider
|
|
12
|
+
from app.services.memory_lifecycle import build_event_evidence_id
|
|
13
|
+
from app.services.memory_retrieval import (
|
|
14
|
+
RetrievalAnswer,
|
|
15
|
+
apply_retrieval_feedback,
|
|
16
|
+
synthesize_retrieval_answer,
|
|
17
|
+
)
|
|
18
|
+
from app.services.promoted_memory_store import DEFAULT_PROMOTED_ROOT, LocalPromotedMemoryStore
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _compact_files(paths: list[str], *, limit: int = 6) -> list[str]:
|
|
22
|
+
compacted: list[str] = []
|
|
23
|
+
seen: set[str] = set()
|
|
24
|
+
for raw in paths:
|
|
25
|
+
path = str(raw).strip()
|
|
26
|
+
if not path or path in seen:
|
|
27
|
+
continue
|
|
28
|
+
seen.add(path)
|
|
29
|
+
compacted.append(path)
|
|
30
|
+
if len(compacted) >= limit:
|
|
31
|
+
break
|
|
32
|
+
return compacted
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def events_from_memory(
    records,
    *,
    buffer_dir: str = ".cherrydocs/capture",
) -> list[dict]:
    """Collect the raw capture events cited as evidence by ``records``.

    Only evidence ids containing ":" are considered. Each distinct session
    referenced by a record is read once from the capture buffer at
    ``buffer_dir``, and events whose evidence id is cited are returned as
    trimmed-down dicts. Returns an empty list when no record cites such an
    evidence id.
    """
    wanted: set[str] = set()
    for record in records:
        wanted.update(ev_id for ev_id in record.evidence if ":" in ev_id)
    if not wanted:
        return []

    reader = LocalCaptureBuffer(buffer_dir)
    visited: set[str] = set()
    matched: list[dict] = []
    for record in records:
        sid = str(record.session_id or "").strip()
        if not sid or sid in visited:
            continue
        visited.add(sid)
        for event in reader.read(sid):
            if build_event_evidence_id(event) in wanted:
                file_names = [str(p) for p in (event.get("files") or []) if str(p).strip()]
                matched.append(
                    {
                        "event_type": str(event.get("event_type") or "unknown"),
                        "timestamp": event.get("timestamp"),
                        "text": str(event.get("text") or "").strip(),
                        "files": _compact_files(file_names),
                        "branch": event.get("branch"),
                        "commit": event.get("commit"),
                    }
                )
    return matched
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def answer_from_promoted_memory(
    *,
    project_id: str,
    question: str,
    branch: str | None = None,
    file_path: str | None = None,
    buffer_dir: str = ".cherrydocs/capture",
    promoted_root: str = DEFAULT_PROMOTED_ROOT,
) -> RetrievalAnswer:
    """Answer ``question`` from the project's promoted memory records.

    Args:
        project_id: Project whose records are loaded from the store.
        question: Question to answer.
        branch: When given, records tagged with a different branch are
            excluded; records with no branch are always kept.
        file_path: Optional file path forwarded to answer synthesis.
        buffer_dir: Capture buffer directory used to load evidence events.
        promoted_root: Root directory of the promoted memory store.

    Returns:
        The synthesized answer. When it cites memories, retrieval feedback is
        applied to the records and written back on a best-effort basis.
    """
    # Local import keeps this fix self-contained within the function.
    import logging

    store = LocalPromotedMemoryStore(promoted_root)
    all_records = store.load_records(project_id)
    records = [r for r in all_records if not branch or not r.branch or r.branch == branch]
    answer = synthesize_retrieval_answer(
        question,
        records,
        events_from_memory(records, buffer_dir=buffer_dir),
        branch=branch,
        file_path=file_path,
    )
    if answer.memories:
        updated_records = apply_retrieval_feedback(records, answer.memories, answer.evidence)
        try:
            store.upsert_records(project_id, updated_records)
        except Exception:
            # Persisting feedback must never fail the answer itself, but a
            # silent failure here is undiagnosable, so record it.
            logging.getLogger(__name__).warning(
                "Could not persist retrieval feedback for project %s",
                project_id,
                exc_info=True,
            )
    return answer
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _should_refresh_answer(answer: RetrievalAnswer) -> bool:
|
|
105
|
+
"""Only refresh when no memories at all or the top match is inactive."""
|
|
106
|
+
if not answer.memories:
|
|
107
|
+
return True
|
|
108
|
+
return answer.memories[0].memory.status != "active"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def answer_from_promoted_memory_with_refresh(
    *,
    project_id: str,
    question: str,
    branch: str | None = None,
    file_path: str | None = None,
    auto_promote_if_empty: bool = True,
    project_hint: str | None = None,
    buffer_dir: str = ".cherrydocs/capture",
    promoted_root: str = DEFAULT_PROMOTED_ROOT,
    provider: MemoryModelProvider | None = None,
    policy: AutoPromotionPolicy | None = None,
    memory_profile: str | None = None,
) -> tuple[RetrievalAnswer, AutoPromotionRunReport | None]:
    """Answer from promoted memory, auto-promoting captured sessions when needed.

    The question is answered once. If that answer needs a refresh and
    ``auto_promote_if_empty`` is set, captured sessions are auto-promoted and,
    when the run processed anything, the question is answered a second time.

    Returns:
        ``(answer, report)`` where ``report`` is ``None`` when no
        auto-promotion was attempted.
    """
    lookup = {
        "project_id": project_id,
        "question": question,
        "branch": branch,
        "file_path": file_path,
        "buffer_dir": buffer_dir,
        "promoted_root": promoted_root,
    }
    first = answer_from_promoted_memory(**lookup)
    if not (_should_refresh_answer(first) and auto_promote_if_empty):
        return first, None

    effective_provider = provider or resolve_provider()
    effective_policy = policy or AutoPromotionPolicy(
        max_sessions=3, min_event_count=3, min_candidate_confidence=0.8
    )
    report = auto_promote_captured_sessions(
        project_id=project_id,
        buffer_dir=buffer_dir,
        promoted_root=promoted_root,
        provider=effective_provider,
        project_hint=project_hint,
        memory_profile=memory_profile,
        branch=branch,
        policy=effective_policy,
    )
    if report.processed:
        # New records may exist now, so ask again with identical arguments.
        return answer_from_promoted_memory(**lookup), report
    return first, report
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""End-to-end capture -> distill -> promote pipeline for AI-facing session memory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
from app.services.internal_memory_agent import (
|
|
8
|
+
MemoryExtractionResult,
|
|
9
|
+
MemoryModelProvider,
|
|
10
|
+
build_recent_capture_window,
|
|
11
|
+
extract_memory_candidates,
|
|
12
|
+
)
|
|
13
|
+
from app.services.memory_lifecycle import (
|
|
14
|
+
MemoryPromotionResult,
|
|
15
|
+
MemoryRecord,
|
|
16
|
+
build_existing_memory_context,
|
|
17
|
+
collect_candidate_context,
|
|
18
|
+
collect_candidate_evidence_refs,
|
|
19
|
+
promote_memory_candidates,
|
|
20
|
+
)
|
|
21
|
+
from app.services.promoted_memory_store import DEFAULT_PROMOTED_ROOT, LocalPromotedMemoryStore
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DistillationTrace(BaseModel):
    # Counters describing one extraction + promotion run; built by
    # _build_distillation_trace.
    model_config = ConfigDict(extra="ignore")

    # Whitespace-normalized profile name; the builder substitutes "default" when blank.
    memory_profile: str
    # Provider's ``model`` attribute when set, otherwise its class name.
    provider_label: str
    # Number of events passed to the run.
    input_event_count: int = 0
    # Total candidates returned by extraction.
    extracted_candidate_count: int = 0
    # Candidates flagged should_store whose memory_type is not "noise".
    extracted_store_count: int = 0
    # Candidates whose memory_type is "noise".
    extracted_noise_count: int = 0
    # Records produced by promotion decisions in this session.
    promoted_record_count: int = 0
    # Records in the promotion result's full record list.
    stored_record_count: int = 0
    # Candidate counts keyed by normalized ``kind``.
    extracted_kind_counts: dict[str, int] = Field(default_factory=dict)
    # Candidate counts keyed by normalized ``memory_type``.
    extracted_memory_type_counts: dict[str, int] = Field(default_factory=dict)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class SessionPromotionReport(BaseModel):
    # Result bundle returned by run_session_promotion.
    model_config = ConfigDict(extra="ignore")

    session_id: str
    project_id: str
    # Raw extraction output, including candidates that were not promoted.
    extracted: MemoryExtractionResult
    # Promotion result (decisions plus the resulting record list).
    promotion: MemoryPromotionResult
    # Records attached to this run's promotion decisions.
    session_records: list[MemoryRecord] = Field(default_factory=list)
    # The promotion result's ``records`` list.
    stored_records: list[MemoryRecord] = Field(default_factory=list)
    # Summary counters for the run.
    distillation_trace: DistillationTrace
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _increment_count(bucket: dict[str, int], key: str) -> None:
|
|
52
|
+
normalized = " ".join(str(key or "").split()).strip().lower() or "unknown"
|
|
53
|
+
bucket[normalized] = bucket.get(normalized, 0) + 1
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _provider_label(provider: MemoryModelProvider) -> str:
|
|
57
|
+
model = " ".join(str(getattr(provider, "model", "") or "").split()).strip()
|
|
58
|
+
if model:
|
|
59
|
+
return model
|
|
60
|
+
return provider.__class__.__name__
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _build_distillation_trace(
    *,
    memory_profile: str | None,
    provider: MemoryModelProvider,
    events: list[dict],
    extraction: MemoryExtractionResult,
    session_records: list[MemoryRecord],
    stored_records: list[MemoryRecord],
) -> DistillationTrace:
    """Summarize one extraction/promotion run as a ``DistillationTrace``.

    Tallies extracted candidates by kind and memory type, counts noise and
    storable candidates, and records the sizes of the inputs and outputs.
    """
    candidates = extraction.candidates
    by_kind: dict[str, int] = {}
    by_memory_type: dict[str, int] = {}
    for item in candidates:
        _increment_count(by_kind, item.kind)
        _increment_count(by_memory_type, item.memory_type)

    noise_total = sum(1 for item in candidates if item.memory_type == "noise")
    storable_total = sum(
        1 for item in candidates if item.should_store and item.memory_type != "noise"
    )
    profile_name = " ".join(str(memory_profile or "").split()).strip() or "default"

    return DistillationTrace(
        memory_profile=profile_name,
        provider_label=_provider_label(provider),
        input_event_count=len(events),
        extracted_candidate_count=len(candidates),
        extracted_store_count=storable_total,
        extracted_noise_count=noise_total,
        promoted_record_count=len(session_records),
        stored_record_count=len(stored_records),
        extracted_kind_counts=by_kind,
        extracted_memory_type_counts=by_memory_type,
    )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _apply_candidate_context(
    extracted: MemoryExtractionResult,
    promoted: MemoryPromotionResult,
    events: list[dict],
    *,
    candidates: list | None = None,
) -> None:
    """Attach evidence refs and repo/commit/file context to promoted records, in place.

    Evidence and context are computed per candidate and applied to the record
    of the promotion decision at the same index.

    Args:
        extracted: Extraction result; its ``candidates`` are used when
            ``candidates`` is not supplied.
        promoted: Promotion result whose decisions and records are updated.
        events: Session events; the most recent 48 form the evidence window.
        candidates: The exact candidate list that was handed to promotion.
            Pass this whenever promotion ran on a filtered subset, otherwise
            decision indices no longer line up with ``extracted.candidates``.
            NOTE(review): assumes promotion returns decisions in the order of
            the candidates it received — confirm against
            ``promote_memory_candidates``.
    """
    aligned_candidates = extracted.candidates if candidates is None else candidates
    evidence_window = build_recent_capture_window(events, limit=48)
    evidence_refs_by_candidate = {
        index: collect_candidate_evidence_refs(candidate, evidence_window)
        for index, candidate in enumerate(aligned_candidates)
    }
    context_by_candidate = {
        index: collect_candidate_context(
            candidate, evidence_window, evidence_refs=evidence_refs_by_candidate.get(index, [])
        )
        for index, candidate in enumerate(aligned_candidates)
    }

    record_refs_by_id: dict[str, list[str]] = {}
    for index, decision in enumerate(promoted.decisions):
        record = decision.resulting_record
        if not record:
            continue
        refs = evidence_refs_by_candidate.get(index, [])
        context = context_by_candidate.get(index, {})
        # dict.fromkeys de-duplicates while keeping first-seen order.
        merged = list(dict.fromkeys([*record.evidence, *refs]))
        record.evidence = merged
        # Context values win when present; otherwise the record keeps its own.
        record.repo = str(context.get("repo") or record.repo or "") or record.repo
        record.commit = str(context.get("commit") or record.commit or "") or record.commit
        ctx_files = context.get("files")
        if isinstance(ctx_files, list):
            record.files = list(dict.fromkeys([*record.files, *ctx_files]))
        record_refs_by_id[record.memory_id] = merged

    # Mirror the merged evidence onto the stored copies of the same records.
    for record in promoted.records:
        stored_refs = record_refs_by_id.get(record.memory_id)
        if stored_refs:
            record.evidence = stored_refs


def run_session_promotion(
    *,
    events: list[dict],
    session_id: str,
    project_id: str,
    provider: MemoryModelProvider,
    project_hint: str | None = None,
    branch: str | None = None,
    commit: str | None = None,
    existing_records: list[MemoryRecord] | None = None,
    source: str = "capture-session",
    min_confidence: float = 0.0,
    memory_profile: str | None = None,
) -> SessionPromotionReport:
    """Extract memory candidates from a session's events and promote them.

    Args:
        events: Captured session events to distill.
        session_id: Session identifier; also used as the initial evidence ref.
        project_id: Project the resulting records belong to.
        provider: Model provider used for extraction.
        project_hint: Optional hint forwarded to extraction.
        branch: Optional branch recorded on promoted records.
        commit: Optional commit recorded on promoted records.
        existing_records: Records already stored; used as extraction context
            and as the base for promotion.
        source: Source label passed to promotion.
        min_confidence: Candidates below this confidence are not promoted.
        memory_profile: Optional profile name forwarded to extraction.

    Returns:
        A report with the raw extraction, the promotion result, the records
        from this session's decisions, the full record list and a trace.
    """
    prior_records = list(existing_records or [])
    extraction = extract_memory_candidates(
        events,
        provider=provider,
        project_hint=project_hint,
        existing_context=build_existing_memory_context(prior_records),
        profile=memory_profile,
    )
    eligible_candidates = [
        candidate
        for candidate in extraction.candidates
        if candidate.should_store and candidate.memory_type != "noise" and candidate.confidence >= min_confidence
    ]
    promoted = promote_memory_candidates(
        eligible_candidates,
        existing_records=prior_records,
        project_id=project_id,
        source=source,
        session_id=session_id,
        branch=branch,
        commit=commit,
        evidence=[session_id],
    )
    # Promotion only saw the eligible candidates, so context must be aligned
    # to that same list rather than to the unfiltered extraction output.
    _apply_candidate_context(extraction, promoted, events, candidates=eligible_candidates)
    session_records = [decision.resulting_record for decision in promoted.decisions if decision.resulting_record]
    stored_records = promoted.records
    distillation_trace = _build_distillation_trace(
        memory_profile=memory_profile,
        provider=provider,
        events=events,
        extraction=extraction,
        session_records=session_records,
        stored_records=stored_records,
    )
    return SessionPromotionReport(
        session_id=session_id,
        project_id=project_id,
        extracted=extraction,
        promotion=promoted,
        session_records=session_records,
        stored_records=stored_records,
        distillation_trace=distillation_trace,
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# Names exported by ``from ... import *``; underscore-prefixed helpers are internal.
__all__ = ["DistillationTrace", "SessionPromotionReport", "run_session_promotion"]
|