cherry-docs 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. app/__init__.py +0 -0
  2. app/repo_scope.py +24 -0
  3. app/services/__init__.py +0 -0
  4. app/services/agent_protocol.py +59 -0
  5. app/services/auto_promote_sessions.py +245 -0
  6. app/services/capture_adapters.py +89 -0
  7. app/services/capture_core.py +164 -0
  8. app/services/internal_memory_agent.py +214 -0
  9. app/services/memory_evidence.py +89 -0
  10. app/services/memory_extraction_normalize.py +134 -0
  11. app/services/memory_lifecycle.py +258 -0
  12. app/services/memory_profiles.py +88 -0
  13. app/services/memory_providers.py +113 -0
  14. app/services/memory_retrieval.py +327 -0
  15. app/services/memory_retrieval_scoring.py +106 -0
  16. app/services/memory_retrieval_text.py +113 -0
  17. app/services/memory_similarity.py +135 -0
  18. app/services/privacy.py +72 -0
  19. app/services/promoted_memory_answer.py +157 -0
  20. app/services/promoted_memory_pipeline.py +194 -0
  21. app/services/promoted_memory_store.py +57 -0
  22. cherry_docs-0.2.0.dist-info/METADATA +143 -0
  23. cherry_docs-0.2.0.dist-info/RECORD +42 -0
  24. cherry_docs-0.2.0.dist-info/WHEEL +5 -0
  25. cherry_docs-0.2.0.dist-info/entry_points.txt +4 -0
  26. cherry_docs-0.2.0.dist-info/top_level.txt +3 -0
  27. cherrydocs/__init__.py +3 -0
  28. cherrydocs/cli.py +213 -0
  29. cherrydocs/hook.py +27 -0
  30. cherrydocs/mcp.py +22 -0
  31. scripts/__init__.py +0 -0
  32. scripts/auto_promote_capture.py +63 -0
  33. scripts/check_size_limits.py +115 -0
  34. scripts/ci_auto_capture.py +289 -0
  35. scripts/claude_hooks/__init__.py +0 -0
  36. scripts/claude_hooks/state_manager.py +526 -0
  37. scripts/coverage_regression_gate.py +121 -0
  38. scripts/eval_projects.py +247 -0
  39. scripts/install.py +212 -0
  40. scripts/pr_gate_report.py +282 -0
  41. scripts/promptfoo_regression_gate.py +176 -0
  42. scripts/render_agent_prompts.py +57 -0
@@ -0,0 +1,113 @@
1
+ """Text/token helpers for memory retrieval — tokenization, overlap, question mode, file matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ _STOPWORDS = {
8
+ "a",
9
+ "an",
10
+ "and",
11
+ "are",
12
+ "as",
13
+ "at",
14
+ "be",
15
+ "because",
16
+ "for",
17
+ "from",
18
+ "how",
19
+ "i",
20
+ "if",
21
+ "in",
22
+ "into",
23
+ "is",
24
+ "it",
25
+ "of",
26
+ "on",
27
+ "or",
28
+ "that",
29
+ "the",
30
+ "this",
31
+ "to",
32
+ "was",
33
+ "what",
34
+ "why",
35
+ "with",
36
+ }
37
+ _RECENCY_QUERY_TERMS = {"latest", "recent", "newest", "changed", "change", "now", "currently", "just"}
38
+
39
+
40
+ def _stem(token: str) -> str:
41
+ """Minimal English stemmer: strip common suffixes so plural/verb forms match."""
42
+ for suffix in ("tions", "tion", "ings", "ing", "ions", "ion", "ies", "ed", "es", "ly", "s"):
43
+ if token.endswith(suffix) and len(token) - len(suffix) > 2:
44
+ return token[: -len(suffix)]
45
+ return token
46
+
47
+
48
def _tokens(value: str) -> set[str]:
    """Return the set of stemmed tokens found in ``value``.

    The text is lower-cased and split into ``[a-z0-9]+`` runs; stopwords and
    tokens of two characters or fewer are discarded before stemming.
    """
    stems: set[str] = set()
    for word in re.findall(r"[a-z0-9]+", value.lower()):
        if len(word) <= 2 or word in _STOPWORDS:
            continue
        stems.add(_stem(word))
    return stems
55
+
56
+
57
def _score_overlap(left: str, right: str) -> float:
    """Score token overlap between two texts.

    Returns the number of shared tokens divided by the size of the smaller
    token set, or 0.0 when either text produces no tokens.
    """
    first, second = _tokens(left), _tokens(right)
    if not (first and second):
        return 0.0
    smaller_size = min(len(first), len(second))
    return len(first & second) / max(1, smaller_size)
64
+
65
+
66
+ def _question_mode(question: str) -> str:
67
+ lowered = question.lower()
68
+ if "why" in lowered:
69
+ return "why"
70
+ if "how" in lowered:
71
+ return "how"
72
+ if "fail" in lowered:
73
+ return "what_failed"
74
+ if "what" in lowered or "which" in lowered or "state" in lowered or "current" in lowered:
75
+ return "what"
76
+ return "generic"
77
+
78
+
79
def _wants_recent_context(question: str) -> bool:
    """Return True when the question contains a recency term.

    A term from ``_RECENCY_QUERY_TERMS`` only counts when it starts a word
    (it is not preceded by a letter). Plain substring matching treated "know"
    and "unknown" as "now" and "adjust" as "just". Inflected forms such as
    "changes" or "recently" still match.
    """
    lowered = question.lower()
    return any(
        re.search(rf"(?<![a-z]){re.escape(term)}", lowered) is not None
        for term in _RECENCY_QUERY_TERMS
    )
82
+
83
+
84
+ def _trim_text(value: str, limit: int = 280) -> str:
85
+ text = " ".join((value or "").split())
86
+ if len(text) <= limit:
87
+ return text
88
+ return f"{text[: limit - 3]}..."
89
+
90
+
91
+ def _file_query_hint(file_path: str | None) -> str:
92
+ if not file_path:
93
+ return ""
94
+ normalized = file_path.strip().lower().strip("/")
95
+ if not normalized:
96
+ return ""
97
+ parts = [p for p in re.split(r"[/_.-]+", normalized) if p and len(p) > 2]
98
+ hints = set(parts) | {p[:-1] for p in parts if p.endswith("s") and len(p) > 4}
99
+ return " ".join(sorted(hints))
100
+
101
+
102
+ def _file_overlap(file_path: str | None, files: list[str]) -> float:
103
+ if not file_path:
104
+ return 0.0
105
+ normalized = file_path.strip().lower().lstrip("/")
106
+ if not normalized:
107
+ return 0.0
108
+ lowered = [str(path).strip().lower().lstrip("/") for path in files if str(path).strip()]
109
+ if normalized in lowered:
110
+ return 1.0
111
+ if any(path.endswith(normalized) or normalized.endswith(path) for path in lowered):
112
+ return 0.7
113
+ return 0.0
@@ -0,0 +1,135 @@
1
+ """Similarity and matching heuristics for memory lifecycle decisions.
2
+
3
+ Uses TYPE_CHECKING for MemoryRecord to avoid circular imports at runtime
4
+ (memory_lifecycle imports these helpers, and MemoryRecord lives there).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from typing import TYPE_CHECKING
11
+
12
+ from app.services.memory_extraction_normalize import MemoryCandidate
13
+
14
+ if TYPE_CHECKING:
15
+ from app.services.memory_lifecycle import MemoryRecord
16
+
17
+ _STOPWORDS = {
18
+ "a", "an", "and", "are", "as", "at", "be", "because", "for", "from",
19
+ "how", "if", "in", "into", "is", "it", "of", "on", "or", "that", "the",
20
+ "this", "to", "use", "using", "we", "with",
21
+ }
22
+ _NEGATIVE_HINTS = {"avoid", "reject", "deprecated", "legacy", "superseded", "replace", "replaced", "drop"}
23
+ _POSITIVE_HINTS = {"choose", "chosen", "use", "using", "adopt", "keep", "kept", "implement", "switch", "move"}
24
+ _PROCEDURE_HINTS = {"first", "then", "next", "workflow", "runbook", "checklist", "repeatable", "repeat"}
25
+
26
+
27
def _normalize_tokens(value: str) -> list[str]:
    """Return the significant tokens of ``value`` in order of appearance.

    The text is lower-cased and split into ``[a-z0-9]+`` runs; stopwords and
    tokens of two characters or fewer are skipped. Duplicates are kept.
    """
    kept: list[str] = []
    for word in re.findall(r"[a-z0-9]+", value.lower()):
        if len(word) > 2 and word not in _STOPWORDS:
            kept.append(word)
    return kept
33
+
34
+
35
+ def _raw_tokens(value: str) -> set[str]:
36
+ return {token for token in re.findall(r"[a-z0-9]+", value.lower()) if token}
37
+
38
+
39
def topic_key_for_text(*parts: str, limit: int = 4) -> str:
    """Build a short topic key from the first distinct significant tokens.

    Tokens are taken from ``parts`` in order, skipping repeats, and collection
    stops as soon as ``limit`` tokens have been gathered. The tokens are
    joined with single spaces.
    """
    # A dict is used as an insertion-ordered set of the tokens chosen so far.
    chosen: dict[str, None] = {}
    for part in parts:
        for token in _normalize_tokens(part):
            if token not in chosen:
                chosen[token] = None
                if len(chosen) >= limit:
                    return " ".join(chosen)
    return " ".join(chosen)
51
+
52
+
53
def _similarity(left: str, right: str) -> float:
    """Return the overlap coefficient of the two texts' significant tokens.

    This is the count of shared tokens divided by the size of the smaller
    token set, or 0.0 when either side has no significant tokens.
    """
    first = set(_normalize_tokens(left))
    second = set(_normalize_tokens(right))
    if not (first and second):
        return 0.0
    shared = first & second
    return len(shared) / max(1, min(len(first), len(second)))
60
+
61
+
62
+ def _candidate_text(candidate: MemoryCandidate) -> str:
63
+ return f"{candidate.summary} {candidate.rationale}".strip()
64
+
65
+
66
+ def _record_text(record: MemoryRecord) -> str:
67
+ return f"{record.summary} {record.rationale}".strip()
68
+
69
+
70
def _has_any_hint(text: str, hints: set[str]) -> bool:
    """Return True when any whole word of ``text`` appears in ``hints``."""
    words = _raw_tokens(text)
    return not words.isdisjoint(hints)
72
+
73
+
74
def _is_repeatable_procedure(summary: str, rationale: str) -> bool:
    """Return True when the summary or rationale contains a procedure hint word."""
    combined = f"{summary} {rationale}".strip()
    return _has_any_hint(combined, _PROCEDURE_HINTS)
76
+
77
+
78
def _is_conflict(candidate: MemoryCandidate, record: MemoryRecord) -> bool:
    """Decide whether a candidate contradicts an existing record.

    Both must share a memory type and be at least 0.3 similar, on either the
    full text or the summaries alone. A conflict is then reported when one
    side contains a negative hint word and the other a positive one.
    """
    if candidate.memory_type != record.memory_type:
        return False

    candidate_body = _candidate_text(candidate)
    record_body = _record_text(record)
    closeness = max(
        _similarity(candidate_body, record_body),
        _similarity(candidate.summary, record.summary),
    )
    if closeness < 0.3:
        return False

    cand_neg = _has_any_hint(candidate_body, _NEGATIVE_HINTS)
    rec_neg = _has_any_hint(record_body, _NEGATIVE_HINTS)
    cand_pos = _has_any_hint(candidate_body, _POSITIVE_HINTS)
    rec_pos = _has_any_hint(record_body, _POSITIVE_HINTS)
    return (cand_neg and rec_pos) or (cand_pos and rec_neg)
90
+
91
+
92
def _should_supersede(candidate: MemoryCandidate, record: MemoryRecord) -> bool:
    """Decide whether a candidate should supersede an existing record.

    All of the following must hold: the record is "active", the texts are at
    least 0.45 similar, the candidate's confidence is no more than 0.05 below
    the record's, and the candidate text contains a replacement word.
    """
    if record.status != "active":
        return False
    candidate_body = _candidate_text(candidate)
    if (
        _similarity(candidate_body, _record_text(record)) < 0.45
        or candidate.confidence + 0.05 < record.confidence
    ):
        return False
    replacement_words = {"replace", "replaced", "switch", "superseded", "migrate", "deprecated"}
    return _has_any_hint(candidate_body, replacement_words)
103
+
104
+
105
def _should_merge(candidate: MemoryCandidate, record: MemoryRecord) -> bool:
    """Decide whether a candidate should be merged into an existing record.

    Memory type and kind must both match. Any pair merges when the full texts
    are at least 0.7 similar. Candidates of kind "decision" also merge when
    the best of summary, rationale and topic-key similarity reaches 0.4.
    """
    if candidate.memory_type != record.memory_type or candidate.kind != record.kind:
        return False
    if _similarity(_candidate_text(candidate), _record_text(record)) >= 0.7:
        return True
    if candidate.kind != "decision":
        return False

    signals = (
        _similarity(candidate.summary, record.summary),
        _similarity(candidate.rationale, record.rationale),
        _similarity(
            topic_key_for_text(candidate.summary, candidate.rationale),
            topic_key_for_text(record.summary, record.rationale),
        ),
    )
    return max(signals) >= 0.4
122
+
123
+
124
def _best_matching_record(
    candidate: MemoryCandidate, records: list[MemoryRecord]
) -> MemoryRecord | None:
    """Return the record most similar to the candidate, or None.

    Only records with a similarity above zero qualify; when several share the
    top score the earliest one in ``records`` is returned.
    """
    candidate_body = _candidate_text(candidate)
    scored = [(_similarity(candidate_body, _record_text(item)), item) for item in records]
    # max() keeps the first of equal maxima, matching a strict ">" scan.
    top = max(scored, key=lambda pair: pair[0], default=None)
    if top is None or not top[0] > 0.0:
        return None
    return top[1]
@@ -0,0 +1,72 @@
1
+ """Privacy and redaction: detect and redact secrets before persistence."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Any
6
+
7
+ _REDACT_PATTERNS = {
8
+ "EMAIL": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
9
+ "API_KEY": r"(?i)(api[_-]?key|token|secret|password|auth)['\"]?\s*[:=]\s*['\"]?([a-zA-Z0-9_\-]{20,})['\"]?",
10
+ "JWT": r'ey[A-Za-z0-9-_=]+\.[A-Za-z0-9-_=]+\.?[A-Za-z0-9-_.+/=]*',
11
+ "GROQ": r'gsk_[a-zA-Z0-9]{20,}',
12
+ "OPENAI": r'sk-[a-zA-Z0-9\-_]{20,}',
13
+ "GITHUB": r'gh[pousr]_[a-zA-Z0-9]{36}',
14
+ }
15
+
16
+ # High-confidence blocking patterns — used to fail-closed before persistence.
17
+ _BLOCKING_PATTERNS: dict[str, str] = {
18
+ "openai_like_key": r"\bsk-[A-Za-z0-9_\-]{20,}\b",
19
+ "groq_key": r"\bgsk_[A-Za-z0-9]{20,}\b",
20
+ "github_pat": r"\bgh[pousr]_[A-Za-z0-9]{20,}\b",
21
+ "huggingface_token": r"\bhf_[A-Za-z0-9]{20,}\b",
22
+ "aws_access_key": r"\bAKIA[0-9A-Z]{16}\b",
23
+ "private_key_block": r"-----BEGIN [A-Z ]+PRIVATE KEY-----",
24
+ "credential_assignment": (
25
+ r"(?i)\b(api[_-]?key|access[_-]?token|refresh[_-]?token|secret|password|passwd|authorization)\b"
26
+ r"\s*[:=]\s*[\"']?(?:bearer\s+)?[A-Za-z0-9._\-]{16,}[\"']?"
27
+ ),
28
+ }
29
+
30
+
31
def _scan_text_for_blocking_secret(text: str) -> list[str]:
    """Return the label of every blocking pattern that matches somewhere in ``text``."""
    matched: list[str] = []
    for label, pattern in _BLOCKING_PATTERNS.items():
        if re.search(pattern, text):
            matched.append(label)
    return matched
33
+
34
+
35
def _walk_payload_for_secrets(value: Any, path: str, findings: list[str], max_hits: int) -> None:
    """Recursively scan ``value`` and append secret findings to ``findings``.

    Strings are scanned directly; dicts and lists are descended into, with
    ``path`` extended by ``.key`` or ``[index]``. Other value types are
    ignored. Scanning stops once ``findings`` holds ``max_hits`` entries.
    """
    if len(findings) >= max_hits:
        return

    if isinstance(value, str):
        hits = _scan_text_for_blocking_secret(value)
        if hits:
            findings.append(f"{path} ({', '.join(sorted(set(hits)))})")
        return

    if isinstance(value, dict):
        children = ((f"{path}.{key}", nested) for key, nested in value.items())
    elif isinstance(value, list):
        children = ((f"{path}[{idx}]", nested) for idx, nested in enumerate(value))
    else:
        return

    for child_path, nested in children:
        _walk_payload_for_secrets(nested, child_path, findings, max_hits)
        if len(findings) >= max_hits:
            return
52
+
53
+
54
def detect_blocking_secret_fields(data: dict[str, Any], max_hits: int = 8) -> list[str]:
    """Return payload paths containing high-confidence secret material.

    Paths are rooted at "payload"; each entry has the form
    ``"<path> (<matched labels>)"``. At most ``max_hits`` entries are collected.
    """
    hits: list[str] = []
    _walk_payload_for_secrets(data, "payload", hits, max_hits=max_hits)
    return hits
59
+
60
+
61
def redact_text(text: str) -> str:
    """Redact PII and secrets from a string.

    Substitutions run in a fixed order: provider-specific keys first, then
    generic credential assignments, then e-mail addresses and JWTs. Falsy
    input is returned unchanged.
    """
    if not text:
        return text

    substitutions = (
        ("GROQ", "[REDACTED_GROQ_KEY]"),
        ("OPENAI", "[REDACTED_OPENAI_KEY]"),
        ("GITHUB", "[REDACTED_GITHUB_TOKEN]"),
        # Keep the credential name and mask only its value.
        ("API_KEY", lambda m: f"{m.group(1)}: [REDACTED_SECRET]"),
        ("EMAIL", "[REDACTED_EMAIL]"),
        ("JWT", "[REDACTED_JWT]"),
    )
    result = text
    for pattern_name, replacement in substitutions:
        result = re.sub(_REDACT_PATTERNS[pattern_name], replacement, result)
    return result
@@ -0,0 +1,157 @@
1
+ """Promoted-memory answer helpers."""
2
+ from __future__ import annotations
3
+
4
+ from app.services.auto_promote_sessions import (
5
+ AutoPromotionPolicy,
6
+ AutoPromotionRunReport,
7
+ auto_promote_captured_sessions,
8
+ )
9
+ from app.services.capture_core import LocalCaptureBuffer
10
+ from app.services.internal_memory_agent import MemoryModelProvider
11
+ from app.services.memory_providers import resolve_provider
12
+ from app.services.memory_lifecycle import build_event_evidence_id
13
+ from app.services.memory_retrieval import (
14
+ RetrievalAnswer,
15
+ apply_retrieval_feedback,
16
+ synthesize_retrieval_answer,
17
+ )
18
+ from app.services.promoted_memory_store import DEFAULT_PROMOTED_ROOT, LocalPromotedMemoryStore
19
+
20
+
21
+ def _compact_files(paths: list[str], *, limit: int = 6) -> list[str]:
22
+ compacted: list[str] = []
23
+ seen: set[str] = set()
24
+ for raw in paths:
25
+ path = str(raw).strip()
26
+ if not path or path in seen:
27
+ continue
28
+ seen.add(path)
29
+ compacted.append(path)
30
+ if len(compacted) >= limit:
31
+ break
32
+ return compacted
33
+
34
+
35
def events_from_memory(
    records,
    *,
    buffer_dir: str = ".cherrydocs/capture",
) -> list[dict]:
    """Load raw capture events referenced by promoted memory records.

    Args:
        records: Iterable of memory records. Each record's ``evidence`` and
            ``session_id`` attributes are read.
        buffer_dir: Directory handed to ``LocalCaptureBuffer``.

    Returns:
        One compact dict per captured event whose evidence id is referenced by
        a record, with keys ``event_type``, ``timestamp``, ``text``, ``files``,
        ``branch`` and ``commit``. Each session is read at most once. Returns
        an empty list when no record carries an evidence id containing ":".
    """
    # Materialize once: records are walked twice below, so a one-shot iterable
    # such as a generator would otherwise be exhausted before the session loop.
    record_list = list(records)
    evidence_ids = {
        ev_id
        for record in record_list
        # Tolerate a record whose evidence is None.
        for ev_id in (record.evidence or ())
        if ":" in ev_id
    }
    if not evidence_ids:
        return []

    events: list[dict] = []
    capture_buffer = LocalCaptureBuffer(buffer_dir)
    loaded_sessions: set[str] = set()
    for record in record_list:
        session_id = str(record.session_id or "").strip()
        if not session_id or session_id in loaded_sessions:
            continue
        loaded_sessions.add(session_id)
        for event in capture_buffer.read(session_id):
            if build_event_evidence_id(event) not in evidence_ids:
                continue
            events.append(
                {
                    "event_type": str(event.get("event_type") or "unknown"),
                    "timestamp": event.get("timestamp"),
                    "text": str(event.get("text") or "").strip(),
                    "files": _compact_files(
                        [str(p) for p in (event.get("files") or []) if str(p).strip()]
                    ),
                    "branch": event.get("branch"),
                    "commit": event.get("commit"),
                }
            )
    return events
74
+
75
+
76
def answer_from_promoted_memory(
    *,
    project_id: str,
    question: str,
    branch: str | None = None,
    file_path: str | None = None,
    buffer_dir: str = ".cherrydocs/capture",
    promoted_root: str = DEFAULT_PROMOTED_ROOT,
) -> RetrievalAnswer:
    """Answer a question from the project's promoted memory records.

    Records are loaded from the local promoted store. When ``branch`` is
    given, only records on that branch or with no branch are kept. The answer
    is synthesized from those records and the capture events they reference.
    If the answer cites any memories, retrieval feedback is applied and
    written back to the store on a best-effort basis.

    Returns:
        The synthesized ``RetrievalAnswer``. A failure while persisting
        feedback never prevents the answer from being returned.
    """
    import logging

    store = LocalPromotedMemoryStore(promoted_root)
    all_records = store.load_records(project_id)
    records = [r for r in all_records if not branch or not r.branch or r.branch == branch]
    answer = synthesize_retrieval_answer(
        question,
        records,
        events_from_memory(records, buffer_dir=buffer_dir),
        branch=branch,
        file_path=file_path,
    )
    if answer.memories:
        updated_records = apply_retrieval_feedback(records, answer.memories, answer.evidence)
        try:
            store.upsert_records(project_id, updated_records)
        except Exception:
            # Feedback persistence stays best-effort, but the failure is
            # logged with its traceback instead of being dropped silently.
            logging.getLogger(__name__).warning(
                "Failed to persist retrieval feedback for project %s",
                project_id,
                exc_info=True,
            )
    return answer
102
+
103
+
104
+ def _should_refresh_answer(answer: RetrievalAnswer) -> bool:
105
+ """Only refresh when no memories at all or the top match is inactive."""
106
+ if not answer.memories:
107
+ return True
108
+ return answer.memories[0].memory.status != "active"
109
+
110
+
111
def answer_from_promoted_memory_with_refresh(
    *,
    project_id: str,
    question: str,
    branch: str | None = None,
    file_path: str | None = None,
    auto_promote_if_empty: bool = True,
    project_hint: str | None = None,
    buffer_dir: str = ".cherrydocs/capture",
    promoted_root: str = DEFAULT_PROMOTED_ROOT,
    provider: MemoryModelProvider | None = None,
    policy: AutoPromotionPolicy | None = None,
    memory_profile: str | None = None,
) -> tuple[RetrievalAnswer, AutoPromotionRunReport | None]:
    """Answer from promoted memory, auto-promoting captured sessions if needed.

    The question is answered once. If that answer needs a refresh (see
    ``_should_refresh_answer``) and ``auto_promote_if_empty`` is set, captured
    sessions are auto-promoted. The question is asked again only when the
    promotion report shows that something was processed.

    Returns:
        ``(answer, report)``. ``report`` is None when no auto-promotion ran.
        When promotion ran but processed nothing, the first answer is
        returned together with the report.
    """
    # Both lookups use exactly the same arguments.
    lookup_args = {
        "project_id": project_id,
        "question": question,
        "branch": branch,
        "file_path": file_path,
        "buffer_dir": buffer_dir,
        "promoted_root": promoted_root,
    }
    first_answer = answer_from_promoted_memory(**lookup_args)
    needs_refresh = _should_refresh_answer(first_answer)
    if not (needs_refresh and auto_promote_if_empty):
        return first_answer, None

    effective_policy = policy or AutoPromotionPolicy(
        max_sessions=3, min_event_count=3, min_candidate_confidence=0.8
    )
    report = auto_promote_captured_sessions(
        project_id=project_id,
        buffer_dir=buffer_dir,
        promoted_root=promoted_root,
        provider=provider or resolve_provider(),
        project_hint=project_hint,
        memory_profile=memory_profile,
        branch=branch,
        policy=effective_policy,
    )
    if report.processed:
        return answer_from_promoted_memory(**lookup_args), report
    return first_answer, report
@@ -0,0 +1,194 @@
1
+ """End-to-end capture -> distill -> promote pipeline for AI-facing session memory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+ from app.services.internal_memory_agent import (
8
+ MemoryExtractionResult,
9
+ MemoryModelProvider,
10
+ build_recent_capture_window,
11
+ extract_memory_candidates,
12
+ )
13
+ from app.services.memory_lifecycle import (
14
+ MemoryPromotionResult,
15
+ MemoryRecord,
16
+ build_existing_memory_context,
17
+ collect_candidate_context,
18
+ collect_candidate_evidence_refs,
19
+ promote_memory_candidates,
20
+ )
21
+ from app.services.promoted_memory_store import DEFAULT_PROMOTED_ROOT, LocalPromotedMemoryStore
22
+
23
+
24
class DistillationTrace(BaseModel):
    # Summary counters for one capture -> distill -> promote run, filled in
    # by _build_distillation_trace.
    # extra="ignore": unknown keys are dropped rather than rejected on validation.
    model_config = ConfigDict(extra="ignore")

    # Whitespace-normalized profile name; the builder falls back to "default".
    memory_profile: str
    # Provider's model name, or its class name when no model is set.
    provider_label: str
    # Number of capture events given to the run.
    input_event_count: int = 0
    # Total candidates returned by extraction.
    extracted_candidate_count: int = 0
    # Candidates flagged should_store whose memory_type is not "noise".
    extracted_store_count: int = 0
    # Candidates whose memory_type is "noise".
    extracted_noise_count: int = 0
    # Records that came out of this session's promotion decisions.
    promoted_record_count: int = 0
    # Records in the promotion result's full record list.
    stored_record_count: int = 0
    # Candidate counts keyed by normalized kind / memory_type.
    extracted_kind_counts: dict[str, int] = Field(default_factory=dict)
    extracted_memory_type_counts: dict[str, int] = Field(default_factory=dict)
37
+
38
+
39
class SessionPromotionReport(BaseModel):
    # Result bundle returned by run_session_promotion for a single session.
    # extra="ignore": unknown keys are dropped rather than rejected on validation.
    model_config = ConfigDict(extra="ignore")

    session_id: str
    project_id: str
    # Raw extraction output, including candidates that were not promoted.
    extracted: MemoryExtractionResult
    # Promotion output: per-candidate decisions plus the resulting records.
    promotion: MemoryPromotionResult
    # Records attached to this session's promotion decisions.
    session_records: list[MemoryRecord] = Field(default_factory=list)
    # The promotion result's full record list.
    stored_records: list[MemoryRecord] = Field(default_factory=list)
    # Counters describing the run.
    distillation_trace: DistillationTrace
49
+
50
+
51
+ def _increment_count(bucket: dict[str, int], key: str) -> None:
52
+ normalized = " ".join(str(key or "").split()).strip().lower() or "unknown"
53
+ bucket[normalized] = bucket.get(normalized, 0) + 1
54
+
55
+
56
+ def _provider_label(provider: MemoryModelProvider) -> str:
57
+ model = " ".join(str(getattr(provider, "model", "") or "").split()).strip()
58
+ if model:
59
+ return model
60
+ return provider.__class__.__name__
61
+
62
+
63
def _build_distillation_trace(
    *,
    memory_profile: str | None,
    provider: MemoryModelProvider,
    events: list[dict],
    extraction: MemoryExtractionResult,
    session_records: list[MemoryRecord],
    stored_records: list[MemoryRecord],
) -> DistillationTrace:
    """Summarize one promotion run as a ``DistillationTrace``.

    Counts the extracted candidates by kind and memory type, how many are
    "noise", and how many are storable (``should_store`` and not "noise").
    Also records the event, session-record and stored-record totals.
    """
    candidates = extraction.candidates

    by_kind: dict[str, int] = {}
    by_memory_type: dict[str, int] = {}
    for item in candidates:
        _increment_count(by_kind, item.kind)
        _increment_count(by_memory_type, item.memory_type)

    noise_total = sum(1 for item in candidates if item.memory_type == "noise")
    storable_total = sum(
        1 for item in candidates if item.should_store and item.memory_type != "noise"
    )
    profile_label = " ".join(str(memory_profile or "").split()).strip() or "default"

    return DistillationTrace(
        memory_profile=profile_label,
        provider_label=_provider_label(provider),
        input_event_count=len(events),
        extracted_candidate_count=len(candidates),
        extracted_store_count=storable_total,
        extracted_noise_count=noise_total,
        promoted_record_count=len(session_records),
        stored_record_count=len(stored_records),
        extracted_kind_counts=by_kind,
        extracted_memory_type_counts=by_memory_type,
    )
96
+
97
+
98
def _apply_candidate_context(
    extracted: MemoryExtractionResult,
    promoted: MemoryPromotionResult,
    events: list[dict],
    candidates: list | None = None,
) -> None:
    """Attach evidence refs and repo/commit/file context to promoted records.

    Decisions in ``promoted.decisions`` are paired with candidates by
    position, so the candidate list used here must be the one that was
    actually promoted.

    Args:
        extracted: Extraction result. Its ``candidates`` are used only when
            ``candidates`` is not supplied.
        promoted: Promotion result whose records are mutated in place.
        events: Capture events; the most recent 48 form the evidence window.
        candidates: The candidates that were passed to promotion, in the same
            order. Defaults to ``extracted.candidates``, which is only correct
            when nothing was filtered out before promotion.
    """
    # NOTE(review): assumes promotion emits one decision per candidate, in
    # input order. The index-based lookup below has always relied on that.
    aligned = list(extracted.candidates if candidates is None else candidates)
    evidence_window = build_recent_capture_window(events, limit=48)
    evidence_refs_by_candidate = {
        index: collect_candidate_evidence_refs(candidate, evidence_window)
        for index, candidate in enumerate(aligned)
    }
    context_by_candidate = {
        index: collect_candidate_context(
            candidate, evidence_window, evidence_refs=evidence_refs_by_candidate.get(index, [])
        )
        for index, candidate in enumerate(aligned)
    }

    record_refs_by_id: dict[str, list[str]] = {}
    for index, decision in enumerate(promoted.decisions):
        record = decision.resulting_record
        if not record:
            continue
        refs = evidence_refs_by_candidate.get(index, [])
        context = context_by_candidate.get(index, {})
        # Keep existing evidence first, then add new refs without duplicates.
        merged = list(dict.fromkeys([*record.evidence, *refs]))
        record.evidence = merged
        record.repo = str(context.get("repo") or record.repo or "") or record.repo
        record.commit = str(context.get("commit") or record.commit or "") or record.commit
        ctx_files = context.get("files")
        if isinstance(ctx_files, list):
            record.files = list(dict.fromkeys([*record.files, *ctx_files]))
        record_refs_by_id[record.memory_id] = merged

    # Copy the merged evidence onto the stored record with the same memory id.
    for record in promoted.records:
        stored_refs = record_refs_by_id.get(record.memory_id)
        if stored_refs:
            record.evidence = stored_refs


def run_session_promotion(
    *,
    events: list[dict],
    session_id: str,
    project_id: str,
    provider: MemoryModelProvider,
    project_hint: str | None = None,
    branch: str | None = None,
    commit: str | None = None,
    existing_records: list[MemoryRecord] | None = None,
    source: str = "capture-session",
    min_confidence: float = 0.0,
    memory_profile: str | None = None,
) -> SessionPromotionReport:
    """Run capture -> distill -> promote for a single session.

    Candidates are extracted from ``events`` with the given provider. Those
    not flagged ``should_store``, typed "noise", or below ``min_confidence``
    are filtered out, and the rest are promoted against ``existing_records``.
    Evidence and context are then attached to the resulting records.

    Returns:
        A ``SessionPromotionReport`` holding the raw extraction, the promotion
        result, the session's records, the stored records and a trace.
    """
    prior_records = list(existing_records or [])
    extraction = extract_memory_candidates(
        events,
        provider=provider,
        project_hint=project_hint,
        existing_context=build_existing_memory_context(prior_records),
        profile=memory_profile,
    )
    eligible_candidates = [
        candidate
        for candidate in extraction.candidates
        if candidate.should_store and candidate.memory_type != "noise" and candidate.confidence >= min_confidence
    ]
    promoted = promote_memory_candidates(
        eligible_candidates,
        existing_records=prior_records,
        project_id=project_id,
        source=source,
        session_id=session_id,
        branch=branch,
        commit=commit,
        evidence=[session_id],
    )
    # Decisions were produced from eligible_candidates, so context must be
    # looked up in that same list. Indexing the unfiltered extraction would
    # shift evidence onto the wrong record whenever a candidate was dropped.
    _apply_candidate_context(extraction, promoted, events, candidates=eligible_candidates)
    session_records = [decision.resulting_record for decision in promoted.decisions if decision.resulting_record]
    stored_records = promoted.records
    distillation_trace = _build_distillation_trace(
        memory_profile=memory_profile,
        provider=provider,
        events=events,
        extraction=extraction,
        session_records=session_records,
        stored_records=stored_records,
    )
    return SessionPromotionReport(
        session_id=session_id,
        project_id=project_id,
        extracted=extraction,
        promotion=promoted,
        session_records=session_records,
        stored_records=stored_records,
        distillation_trace=distillation_trace,
    )
192
+
193
+
194
+ __all__ = ["DistillationTrace", "SessionPromotionReport", "run_session_promotion"]