@onlooker-community/ecosystem 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +13 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.release-please-manifest.json +3 -2
- package/CHANGELOG.md +15 -0
- package/hooks/hooks.json +4 -0
- package/package.json +2 -2
- package/plugins/historian/.claude-plugin/plugin.json +14 -0
- package/plugins/historian/CHANGELOG.md +17 -0
- package/plugins/historian/README.md +84 -0
- package/plugins/historian/config.json +46 -0
- package/plugins/historian/hooks/hooks.json +26 -0
- package/plugins/historian/scripts/hooks/historian-prompt-submit.sh +269 -0
- package/plugins/historian/scripts/hooks/historian-session-end.sh +235 -0
- package/plugins/historian/scripts/lib/historian-chunker.sh +129 -0
- package/plugins/historian/scripts/lib/historian-config.sh +66 -0
- package/plugins/historian/scripts/lib/historian-embedder.sh +126 -0
- package/plugins/historian/scripts/lib/historian-emit.sh +61 -0
- package/plugins/historian/scripts/lib/historian-project-key.sh +80 -0
- package/plugins/historian/scripts/lib/historian-retriever.sh +191 -0
- package/plugins/historian/scripts/lib/historian-sanitizer.sh +123 -0
- package/plugins/historian/scripts/lib/historian-storage.sh +157 -0
- package/plugins/historian/scripts/lib/historian-transcript.sh +83 -0
- package/plugins/historian/scripts/lib/historian-ulid.sh +43 -0
- package/release-please-config.json +16 -0
- package/scripts/hooks/memory-recall-tracker.sh +206 -0
- package/test/bats/historian-prompt-submit.bats +236 -0
- package/test/bats/historian-session-end.bats +296 -0
- package/test/bats/memory-recall-tracker.bats +189 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Project key derivation for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Historian stores chunk records under the ecosystem-wide 12-char hex
|
|
5
|
+
# project key so state survives clone path changes and is shared across
|
|
6
|
+
# worktrees / clones of the same repo.
|
|
7
|
+
#
|
|
8
|
+
# Resolution order:
|
|
9
|
+
# 1. SHA256(`git remote get-url origin`) — preferred, machine-portable
|
|
10
|
+
# 2. SHA256(realpath of `git rev-parse --show-toplevel`) — fallback for
|
|
11
|
+
# repos without an origin remote
|
|
12
|
+
#
|
|
13
|
+
# Returns the first 12 hex chars. Empty when not in a git repo at all.
|
|
14
|
+
|
|
15
|
+
_historian_sha256_first12() {
|
|
16
|
+
local input="$1"
|
|
17
|
+
if command -v shasum >/dev/null 2>&1; then
|
|
18
|
+
printf '%s' "$input" | shasum -a 256 2>/dev/null | cut -c1-12
|
|
19
|
+
elif command -v sha256sum >/dev/null 2>&1; then
|
|
20
|
+
printf '%s' "$input" | sha256sum 2>/dev/null | cut -c1-12
|
|
21
|
+
else
|
|
22
|
+
return 1
|
|
23
|
+
fi
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
# Print the URL of the `origin` remote for the repository containing $1.
#
# Prints nothing when $1 is empty, is not a directory, or git cannot
# resolve an origin remote. Always returns 0.
historian_project_remote_url() {
  local dir="${1:-}"

  if [[ -n "$dir" && -d "$dir" ]]; then
    # git's own errors are silenced; a missing remote is not a failure here.
    git -C "$dir" remote get-url origin 2>/dev/null || true
  fi

  return 0
}
|
|
31
|
+
|
|
32
|
+
# Print the physical (symlink-resolved) root directory of the repository
# that contains $1.
#
# Resolution order:
#   1. The parent of `git rev-parse --git-common-dir`, but only when that
#      directory is literally named `.git`. Linked worktrees report the main
#      checkout's `.git` here, so they resolve to the same root.
#   2. `git rev-parse --show-toplevel`.
#
# Prints nothing when $1 is empty, is not a directory, or git does not
# report a work tree for it. Always returns 0.
historian_project_repo_root() {
  local cwd="${1:-}"
  [[ -z "$cwd" || ! -d "$cwd" ]] && return 0

  if ! git -C "$cwd" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
    return 0
  fi

  # Initialise explicitly: a bare `local name` leaves the variable unset,
  # and reading it below would abort callers that run under `set -u`.
  local common_dir="" toplevel=""
  common_dir=$(git -C "$cwd" rev-parse --git-common-dir 2>/dev/null) || return 0

  # git may answer with a path relative to $cwd (e.g. ".git"); make it
  # absolute and physical.
  if [[ -n "$common_dir" && "$common_dir" != /* ]]; then
    common_dir="$(cd "$cwd" && cd "$common_dir" 2>/dev/null && pwd -P)" || common_dir=""
  fi

  # Only trust "<common_dir>/.." when the common dir is a real `.git`
  # directory. For submodules (…/.git/modules/<name>), separate git dirs and
  # bare repositories the parent is not the work tree root, so fall through
  # to --show-toplevel instead of returning an unrelated directory.
  if [[ -n "$common_dir" && -d "$common_dir" && "${common_dir##*/}" == ".git" ]]; then
    toplevel="$(cd "$common_dir/.." 2>/dev/null && pwd -P)" || toplevel=""
  fi

  if [[ -z "$toplevel" ]]; then
    toplevel=$(git -C "$cwd" rev-parse --show-toplevel 2>/dev/null || true)
    if [[ -n "$toplevel" ]]; then
      toplevel="$(cd "$toplevel" 2>/dev/null && pwd -P)" || toplevel=""
    fi
  fi

  printf '%s' "$toplevel"
}
|
|
58
|
+
|
|
59
|
+
# Derive the project key for a working directory and print it.
#
# The origin remote URL is tried first (hashed as "remote:<url>"); when
# there is none, the repository root is used (hashed as "root:<path>").
# Prints nothing when neither can be determined. Always returns 0.
# An empty or missing argument means "the current directory".
#
# Usage: key=$(historian_project_key "$CWD")
historian_project_key() {
  local cwd="${1:-}"
  if [[ -z "$cwd" ]]; then
    cwd="$(pwd)"
  fi

  local kind value
  for kind in remote root; do
    case "$kind" in
      remote) value=$(historian_project_remote_url "$cwd") ;;
      root) value=$(historian_project_repo_root "$cwd") ;;
    esac

    # First non-empty identity wins; the prefix keeps the two namespaces
    # from colliding.
    if [[ -n "$value" ]]; then
      _historian_sha256_first12 "${kind}:${value}"
      return 0
    fi
  done

  return 0
}
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Similarity-search retriever for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Given a query embedding and a project key, walks every JSONL chunk
|
|
5
|
+
# record under ~/.onlooker/historian/<key>/sessions/, computes cosine
|
|
6
|
+
# similarity between the query vector and each chunk's `embedding`
|
|
7
|
+
# field, and returns the top-K candidates above a similarity floor.
|
|
8
|
+
#
|
|
9
|
+
# Chunks indexed before the embedder shipped don't have an `embedding`
|
|
10
|
+
# field; the retriever silently skips them rather than treating them as
|
|
11
|
+
# zero-similarity. They'll join the index after the next SessionEnd
|
|
12
|
+
# indexing pass.
|
|
13
|
+
|
|
14
|
+
# Aggregate every chunk record for the project into one JSON array.
#
# Arguments: $1 - project key
# Outputs:   a JSON array on stdout; `[]` when the key is empty or the
#            project's sessions directory does not exist.
#
# Every `*.jsonl` file in the sessions directory is read in sorted name
# order. Blank lines, lines that are not valid JSON, and files that cannot
# be read are skipped. Records are emitted exactly as stored.
historian_retriever_load_all_chunks() {
  local key="$1"
  [[ -z "$key" ]] && { echo '[]'; return 0; }

  local dir
  dir=$(historian_sessions_dir "$key")
  [[ -d "$dir" ]] || { echo '[]'; return 0; }

  # python3 is used instead of `jq -s` to sidestep jq's behaviour on very
  # large inputs.
  python3 - "$dir" <<'PY'
import json, os, sys

dir_path = sys.argv[1]
out = []

# Any listing failure (directory vanished, permissions, not a directory)
# degrades to an empty result instead of a traceback with no JSON output.
try:
    names = sorted(os.listdir(dir_path))
except OSError:
    names = []

for name in names:
    if not name.endswith(".jsonl"):
        continue
    path = os.path.join(dir_path, name)
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue
                out.append(rec)
    except OSError:
        continue

print(json.dumps(out))
PY
}
|
|
54
|
+
|
|
55
|
+
# Compute top-K cosine-similarity matches against the query embedding.
#
# Chunk files are read from disk one line at a time instead of being passed
# to the scorer as a single argv string, so argv stays small however large
# the per-project store grows. Only chunks that pass every filter are kept
# in memory until the final sort.
#
# Usage: historian_retriever_search <sessions_dir>
#                                   <query_embedding_json>
#                                   <top_k> <min_similarity>
#                                   <max_age_days> <current_session_id>
#
# Defaults: top_k=5, min_similarity=0.55, max_age_days=180.
#
# A chunk is skipped when:
#   - its record is not a JSON object,
#   - its session_id equals <current_session_id> (when that is non-empty),
#   - it has no non-empty `embedding` list, or the embedding is not numeric
#     or differs in length from the query,
#   - its similarity is below <min_similarity>,
#   - <max_age_days> is > 0 and the chunk is older than that.
# A chunk whose `created_at` is missing or unparseable gets age_days = -1
# and is therefore never removed by the age filter.
#
# Output: JSON array sorted by similarity descending, length <= top_k.
#         `[]` when the directory is missing or the query is not a
#         non-empty JSON array.
#   Each entry: {
#     chunk_id, session_id, similarity, age_days, body_redacted,
#     chunk_index, start_turn_index, end_turn_index, source
#   }
historian_retriever_search() {
  local sessions_dir="${1:-}"
  local query="${2:-[]}"
  local top_k="${3:-5}"
  local min_sim="${4:-0.55}"
  local max_age_days="${5:-180}"
  local current_session="${6:-}"

  if [[ -z "$sessions_dir" || ! -d "$sessions_dir" ]]; then
    echo '[]'
    return 0
  fi

  python3 - "$sessions_dir" "$top_k" "$min_sim" "$max_age_days" "$current_session" "$query" <<'PY'
import datetime, json, math, os, sys

sessions_dir = sys.argv[1]
# A negative top_k would make the final slice drop items from the end
# instead of limiting the result; clamp it to "return nothing".
top_k = max(0, int(sys.argv[2]))
min_sim = float(sys.argv[3])
max_age_days = int(sys.argv[4])
current_session = sys.argv[5]
try:
    query = json.loads(sys.argv[6] or "null")
except ValueError:
    # Malformed query JSON is treated like "no query".
    query = None


def cosine(a, b):
    """Cosine similarity of two equal-length vectors, or None if undefined."""
    if not a or not b or len(a) != len(b):
        return None
    dot = 0.0
    na = 0.0
    nb = 0.0
    try:
        for x, y in zip(a, b):
            dot += x * y
            na += x * x
            nb += y * y
    except (TypeError, OverflowError):
        # Non-numeric (or absurdly large) components: not a usable vector.
        return None
    if na <= 0.0 or nb <= 0.0:
        return None
    return dot / (math.sqrt(na) * math.sqrt(nb))


def parse_iso(s):
    """Parse a 'YYYY-MM-DDTHH:MM:SSZ' timestamp as UTC; None on failure."""
    if not s or not isinstance(s, str):
        return None
    try:
        return datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%SZ").replace(
            tzinfo=datetime.timezone.utc
        )
    except ValueError:
        return None


if not isinstance(query, list) or not query:
    print("[]")
    sys.exit(0)

now = datetime.datetime.now(datetime.timezone.utc)
scored = []


def consider(chunk):
    """Score one stored record and append it to `scored` if it qualifies."""
    # A JSONL line can hold any JSON value; only objects are chunk records.
    if not isinstance(chunk, dict):
        return
    sid = chunk.get("session_id", "")
    # Exclude chunks from the session that is currently asking for
    # context; a session retrieving its own chunks is a degenerate case.
    if current_session and sid == current_session:
        return
    embedding = chunk.get("embedding")
    if not isinstance(embedding, list) or not embedding:
        return
    sim = cosine(query, embedding)
    # `not sim >= min_sim` also rejects NaN, which `sim < min_sim` lets through.
    if sim is None or not sim >= min_sim:
        return
    created = parse_iso(chunk.get("created_at"))
    if created is None:
        age_days = -1
    else:
        age_days = (now - created).days
    if max_age_days > 0 and age_days > max_age_days:
        return
    scored.append(
        {
            "chunk_id": chunk.get("chunk_id"),
            "session_id": sid,
            "similarity": round(sim, 4),
            "age_days": age_days,
            "body_redacted": chunk.get("body_redacted", ""),
            "chunk_index": chunk.get("chunk_index"),
            "start_turn_index": chunk.get("start_turn_index"),
            "end_turn_index": chunk.get("end_turn_index"),
            "source": chunk.get("source", "local"),
        }
    )


try:
    names = sorted(os.listdir(sessions_dir))
except OSError:
    names = []

for name in names:
    if not name.endswith(".jsonl"):
        continue
    path = os.path.join(sessions_dir, name)
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except json.JSONDecodeError:
                    continue
                consider(chunk)
    except OSError:
        continue

scored.sort(key=lambda c: c["similarity"], reverse=True)
print(json.dumps(scored[:top_k]))
PY
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Sanitizer for Historian chunks.
|
|
3
|
+
#
|
|
4
|
+
# Three layers, in order:
|
|
5
|
+
# 1. Secret-shaped substrings are redacted to "[REDACTED:secret]".
|
|
6
|
+
# Patterns cover AWS access keys, GitHub PATs, Anthropic API keys,
|
|
7
|
+
# bearer tokens, and KEY=value-style env assignments containing
|
|
8
|
+
# key/secret/token in the key name.
|
|
9
|
+
# 2. `[historian:skip]` markers cause the entire chunk to be dropped.
|
|
10
|
+
# 3. Path-deny: if the chunk references any path under
|
|
11
|
+
# `never_index_paths` (substring match against each entry), the
|
|
12
|
+
# chunk is dropped.
|
|
13
|
+
#
|
|
14
|
+
# Input: JSON array of chunk records from the chunker (each with `body`).
|
|
15
|
+
# Output: JSON array of surviving chunk records, each with `body_redacted`
|
|
16
|
+
# (instead of `body`) and a `redaction_count`, plus a sibling
|
|
17
|
+
# array of `dropped` records keyed by reason.
|
|
18
|
+
|
|
19
|
+
# Usage: historian_sanitizer_run <chunks_json> <never_index_paths_json>
|
|
20
|
+
# <redact_secret_patterns> <drop_skip_marker>
|
|
21
|
+
#
|
|
22
|
+
# The two boolean args honor the corresponding config knobs:
|
|
23
|
+
# redact_secret_patterns: false → skip the secret regex substitutions
|
|
24
|
+
# (chunk bodies copy through unchanged)
|
|
25
|
+
# drop_skip_marker: false → keep chunks even when they contain the
|
|
26
|
+
# [historian:skip] marker
|
|
27
|
+
#
|
|
28
|
+
# Output: { "kept": [...], "dropped": [...] }
|
|
29
|
+
historian_sanitizer_run() {
  # Arguments (all optional):
  #   $1 - JSON array of chunk records, each with a `body`   (default [])
  #   $2 - JSON array of path substrings that must never be indexed (default [])
  #   $3 - "false" disables secret redaction                 (default true)
  #   $4 - "false" disables dropping on the skip marker      (default true)
  # Output: { "kept": [...], "dropped": [...] } on stdout. Kept records
  # carry `body_redacted` + `redaction_count` instead of `body`; dropped
  # records carry `chunk_index` and a `reason`.
  local chunks="${1:-[]}"
  local never_index_paths="${2:-[]}"
  local redact_secrets="${3:-true}"
  local drop_skip="${4:-true}"

  python3 - "$chunks" "$never_index_paths" "$redact_secrets" "$drop_skip" <<'PY'
import json, re, sys

chunks = json.loads(sys.argv[1] or "[]")
deny_paths = json.loads(sys.argv[2] or "[]")
redact_secrets = sys.argv[3] != "false"
drop_skip = sys.argv[4] != "false"

# Tolerate unexpected shapes instead of crashing the whole pass.
if not isinstance(chunks, list):
    chunks = []
if not isinstance(deny_paths, list):
    deny_paths = []
# Only non-empty strings can be substring-matched against a body.
deny_paths = [p for p in deny_paths if isinstance(p, str) and p]

REDACTED = "[REDACTED:secret]"

# Secret-shaped patterns. Conservative — false positives are acceptable;
# false negatives are the failure mode we care about. Bearer matches
# case-insensitively because the "Bearer" scheme is case-insensitive per
# RFC 6750 and uppercase / lowercase variants occur in the wild.
TOKEN_PATTERNS = [
    # AWS access keys (AKIA followed by 16 base32-ish chars).
    re.compile(r"\bAKIA[0-9A-Z]{16}\b"),
    # GitHub tokens: classic prefixes plus fine-grained PATs (github_pat_).
    re.compile(r"\bghp_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bgho_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bghs_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bghu_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bghr_[A-Za-z0-9]{20,}\b"),
    re.compile(r"\bgithub_pat_[A-Za-z0-9_]{20,}\b"),
    # Anthropic API keys.
    re.compile(r"\bsk-ant-[A-Za-z0-9_-]{20,}\b"),
    # Bearer tokens in headers. Case-insensitive on the scheme name only.
    re.compile(r"(?i:Bearer)\s+[A-Za-z0-9._\-+/=]{20,}"),
]

# KEY=value where KEY contains key/secret/token/password (case-insensitive).
# The prefix before the keyword is optional, so bare names such as
# PASSWORD=..., TOKEN=... or SECRET=... are caught as well as API_KEY=....
ASSIGNMENT_PATTERN = re.compile(
    r"\b([A-Z0-9_]*(?:KEY|SECRET|TOKEN|PASSWORD|PASSWD)[A-Z0-9_]*)\s*=\s*\S+",
    re.IGNORECASE,
)


def sanitize(body):
    """Return (redacted_body, number_of_redactions)."""
    count = 0
    out = body
    for pat in TOKEN_PATTERNS:
        out, n = pat.subn(REDACTED, out)
        count += n
    # KEY=value form: preserve the key, redact only the value.
    out, n = ASSIGNMENT_PATTERN.subn(lambda m: f"{m.group(1)}={REDACTED}", out)
    count += n
    return out, count


SKIP_MARKER = "[historian:skip]"

kept = []
dropped = []
for chunk in chunks:
    if not isinstance(chunk, dict):
        # Not a chunk record at all; nothing meaningful can be kept.
        dropped.append({"chunk_index": None, "reason": "invalid_record"})
        continue
    body = chunk.get("body", "")
    if body is None:
        body = ""
    elif not isinstance(body, str):
        body = str(body)
    if drop_skip and SKIP_MARKER in body:
        dropped.append({
            "chunk_index": chunk.get("chunk_index"),
            "reason": "skip_marker",
        })
        continue
    if any(p in body for p in deny_paths):
        dropped.append({
            "chunk_index": chunk.get("chunk_index"),
            "reason": "never_index_path",
        })
        continue
    if redact_secrets:
        redacted, count = sanitize(body)
    else:
        redacted, count = body, 0
    new_chunk = dict(chunk)
    new_chunk.pop("body", None)
    new_chunk["body_redacted"] = redacted
    new_chunk["redaction_count"] = count
    kept.append(new_chunk)

print(json.dumps({"kept": kept, "dropped": dropped}))
PY
}
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Storage layout helpers for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Layout (under $ONLOOKER_DIR/historian/<project-key>/):
|
|
5
|
+
# manifest.json project metadata (remote_url, repo_root, last_seen_at)
|
|
6
|
+
# sessions/<session_id>.jsonl append-only chunk records, one per line
|
|
7
|
+
#
|
|
8
|
+
# Chunk record shape:
|
|
9
|
+
# { chunk_id, session_id, chunk_index, start_turn_index, end_turn_index,
|
|
10
|
+
# body_redacted, body_chars, created_at, source, redaction_count }
|
|
11
|
+
#
|
|
12
|
+
# Append-only writes keep the indexing path simple and safe to re-run; if a
|
|
13
|
+
# session is re-indexed (rare; SessionEnd should fire once), callers can
|
|
14
|
+
# truncate the file before appending or accept duplicate chunk records.
|
|
15
|
+
|
|
16
|
+
# Print the directory that holds all Historian state:
# "$ONLOOKER_DIR/historian", or "$HOME/.onlooker/historian" when
# ONLOOKER_DIR is unset or empty.
historian_storage_root() {
  printf '%s/historian' "${ONLOOKER_DIR:-$HOME/.onlooker}"
}
|
|
20
|
+
|
|
21
|
+
# Print the per-project state directory: "<storage root>/<project key>".
# Arguments: $1 - project key
historian_project_dir() {
  local project_key="$1"
  local root
  root="$(historian_storage_root)"
  printf '%s/%s' "$root" "$project_key"
}
|
|
25
|
+
|
|
26
|
+
# Print the directory holding a project's per-session chunk files:
# "<project dir>/sessions".
# Arguments: $1 - project key
historian_sessions_dir() {
  local project_key="$1"
  local project
  project="$(historian_project_dir "$project_key")"
  printf '%s/sessions' "$project"
}
|
|
30
|
+
|
|
31
|
+
# Print the path of the JSONL chunk file for one session:
# "<sessions dir>/<sanitized session id>.jsonl".
#
# Arguments: $1 - project key, $2 - session id
#
# The session id arrives from outside this library, so every character
# outside [A-Za-z0-9._-] is removed before it becomes a file name. An id
# that sanitizes to nothing is stored under "unknown".
historian_session_file() {
  local key="$1"
  local session_id="$2"
  local cleaned sessions

  # tr also discards the newline that the here-string appends.
  cleaned=$(tr -cd '[:alnum:]._-' <<<"$session_id")
  sessions="$(historian_sessions_dir "$key")"

  printf '%s/%s.jsonl' "$sessions" "${cleaned:-unknown}"
}
|
|
43
|
+
|
|
44
|
+
# Create the on-disk layout for a project ("<project dir>/sessions").
# Arguments: $1 - project key
# Returns:   1 when the key is empty, otherwise mkdir's exit status.
historian_storage_init() {
  local key="$1"
  if [[ -z "$key" ]]; then
    return 1
  fi

  mkdir -p "$(historian_project_dir "$key")/sessions" 2>/dev/null
}
|
|
51
|
+
|
|
52
|
+
# Write (replace) the project's manifest.json.
#
# Usage: historian_storage_write_manifest <key> <remote_url> <repo_root>
#
# The manifest records project_key, remote_url, repo_root and a UTC
# last_seen_at timestamp; empty remote_url / repo_root are stored as null.
#
# Returns: 0 on success; 1 when the key is empty, the storage layout cannot
#          be created, jq fails, or the file cannot be written.
#
# The document is rendered first and moved into place afterwards, so a
# failing jq can no longer leave a truncated, empty manifest behind.
historian_storage_write_manifest() {
  local key="$1"
  local remote_url="$2"
  local repo_root="$3"
  [[ -z "$key" ]] && return 1

  historian_storage_init "$key" || return 1
  local manifest_path now doc tmp
  manifest_path="$(historian_project_dir "$key")/manifest.json"
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  doc=$(jq -n \
    --arg key "$key" \
    --arg remote "$remote_url" \
    --arg root "$repo_root" \
    --arg now "$now" \
    '{
      project_key: $key,
      remote_url: (if $remote == "" then null else $remote end),
      repo_root: (if $root == "" then null else $root end),
      last_seen_at: $now
    }' 2>/dev/null) || return 1
  [[ -n "$doc" ]] || return 1

  # Stage next to the target so the final rename stays on one filesystem.
  tmp="${manifest_path}.tmp.$$"
  if ! { printf '%s\n' "$doc" > "$tmp"; } 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
  if ! mv -f -- "$tmp" "$manifest_path" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
}
|
|
76
|
+
|
|
77
|
+
# Append one chunk record, as a single line, to a session's JSONL file.
#
# Usage:   historian_storage_append_chunk <key> <session_id> <chunk_json>
# Returns: 1 when any argument is empty or the storage layout cannot be
#          created; otherwise the status of the append itself.
historian_storage_append_chunk() {
  local key="$1"
  local session_id="$2"
  local chunk_json="$3"

  if [[ -z "$key" ]] || [[ -z "$session_id" ]] || [[ -z "$chunk_json" ]]; then
    return 1
  fi
  if ! historian_storage_init "$key"; then
    return 1
  fi

  local target
  target=$(historian_session_file "$key" "$session_id")
  printf '%s\n' "$chunk_json" >> "$target" 2>/dev/null
}
|
|
90
|
+
|
|
91
|
+
# Print the number of chunk records stored for a session, i.e. the line
# count of its JSONL file. Prints 0 when the file does not exist.
#
# Arguments: $1 - project key, $2 - session id
historian_storage_chunk_count() {
  local key="$1"
  local session_id="$2"
  local file
  file=$(historian_session_file "$key" "$session_id")

  if [[ ! -f "$file" ]]; then
    echo 0
    return 0
  fi

  # Some wc implementations pad the number with spaces; strip them.
  wc -l < "$file" 2>/dev/null | tr -d ' '
}
|
|
100
|
+
|
|
101
|
+
# Empty a session's chunk file without deleting it, so a re-run of
# SessionEnd over an already-indexed transcript starts from a clean file.
# Does nothing when the file does not exist.
#
# Arguments: $1 - project key, $2 - session id
historian_storage_reset_session() {
  local key="$1"
  local session_id="$2"
  local file
  file=$(historian_session_file "$key" "$session_id")

  if [[ -f "$file" ]]; then
    : > "$file"
  fi
}
|
|
111
|
+
|
|
112
|
+
# ============================================================================
|
|
113
|
+
# Retrieval watermarks (per-session, scoped to the project key)
|
|
114
|
+
# ============================================================================
|
|
115
|
+
|
|
116
|
+
# Path used to hold the per-session retrieval state (count + last_ms) so
|
|
117
|
+
# the rate gate persists across UserPromptSubmit invocations within a
|
|
118
|
+
# single session. We key on (project, session) so cross-session retrieval
|
|
119
|
+
# limits don't leak. The state file uses `last_ms` — an epoch-millisecond
|
|
120
|
+
# timestamp of the last retrieval the rate gate let through — and the
|
|
121
|
+
# cooldown gate compares (now_ms - last_ms) against cooldown_seconds.
|
|
122
|
+
historian_retrieval_state_path() {
  # Prints "<project dir>/retrieval-state/<sanitized session id>.json".
  # $1 - project key, $2 - session id.
  local key="$1"
  local session_id="$2"
  local cleaned project

  # Keep only [A-Za-z0-9._-] so the id is safe as a file name; tr also
  # discards the newline that the here-string appends.
  cleaned=$(tr -cd '[:alnum:]._-' <<<"$session_id")
  project="$(historian_project_dir "$key")"

  printf '%s/retrieval-state/%s.json' "$project" "${cleaned:-unknown}"
}
|
|
130
|
+
|
|
131
|
+
# Print the retrieval-state JSON object for (project key, session id).
#
# Arguments: $1 - project key, $2 - session id
# Outputs:   the stored object in compact form, or {"count":0,"last_ms":0}
#            when the file is absent, unreadable, empty, not valid JSON,
#            or does not contain a JSON object.
#
# An empty state file makes jq succeed while printing nothing, so the
# result is checked for emptiness rather than relying on jq's exit status.
historian_retrieval_state_read() {
  local key="$1"
  local session_id="$2"
  local fallback='{"count":0,"last_ms":0}'
  local path doc=""
  path=$(historian_retrieval_state_path "$key" "$session_id")

  if [[ -f "$path" ]]; then
    doc=$(jq -c 'select(type == "object")' "$path" 2>/dev/null) || doc=""
    # Should the file hold several documents, only the first object counts.
    doc="${doc%%$'\n'*}"
  fi

  if [[ -n "$doc" ]]; then
    printf '%s\n' "$doc"
  else
    printf '%s' "$fallback"
  fi
}
|
|
145
|
+
|
|
146
|
+
# Persist the retrieval state for (project key, session id).
#
# Arguments: $1 - project key
#            $2 - session id
#            $3 - count   (must be valid JSON, normally an integer)
#            $4 - last_ms (must be valid JSON, normally epoch milliseconds)
# Returns:   0 on success; 1 when jq rejects the values or the file cannot
#            be written.
#
# The values are stored exactly as given; any incrementing is the caller's
# job. The document is rendered before the target is touched and then moved
# into place, so invalid input no longer wipes the previously stored state.
historian_retrieval_state_write() {
  local key="$1"
  local session_id="$2"
  local count="$3"
  local last_ms="$4"
  local path doc tmp
  path=$(historian_retrieval_state_path "$key" "$session_id")

  doc=$(jq -cn --argjson count "$count" --argjson last_ms "$last_ms" \
    '{ count: $count, last_ms: $last_ms }' 2>/dev/null) || return 1
  [[ -n "$doc" ]] || return 1

  mkdir -p "$(dirname "$path")" 2>/dev/null || return 1

  # Stage next to the target so the final rename stays on one filesystem.
  tmp="${path}.tmp.$$"
  if ! { printf '%s\n' "$doc" > "$tmp"; } 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
  if ! mv -f -- "$tmp" "$path" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Transcript reading for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Claude Code records each session's transcript as JSONL where each line
|
|
5
|
+
# is an entry like { "role": "user"|"assistant"|"system", "content": "...",
|
|
6
|
+
# ... }. Historian only embeds user + assistant turns — tool calls and tool
|
|
7
|
+
# results are dropped at this stage so the chunked content stays
|
|
8
|
+
# semantically focused on the conversation.
|
|
9
|
+
|
|
10
|
+
# Load the transcript and emit a JSON array of normalized turn records:
|
|
11
|
+
# [
|
|
12
|
+
# { "turn_index": 0, "role": "user", "content": "..." },
|
|
13
|
+
# { "turn_index": 1, "role": "assistant", "content": "..." },
|
|
14
|
+
# ...
|
|
15
|
+
# ]
|
|
16
|
+
#
|
|
17
|
+
# Returns an empty array when the transcript is absent or unreadable.
|
|
18
|
+
#
|
|
19
|
+
# Usage: historian_transcript_load <transcript_path>
|
|
20
|
+
historian_transcript_load() {
  # $1 - path to a JSONL transcript. Prints a JSON array of
  # { turn_index, role, content } records for the user / assistant entries
  # that have non-empty text, in file order; prints [] when the path is
  # empty, is not a regular file, or cannot be read.
  local path="${1:-}"
  [[ -z "$path" || ! -f "$path" ]] && { echo '[]'; return 0; }

  python3 - "$path" <<'PY'
import json, sys

path = sys.argv[1]
out = []
turn_index = 0


def flatten(raw):
    """Reduce a `content` value to plain text."""
    if raw is None:
        # JSON null must not turn into the literal text "None".
        return ""
    if isinstance(raw, list):
        # Content-blocks form. Concatenate the text-typed blocks; tool_use /
        # tool_result and any other block types are dropped here.
        parts = []
        for block in raw:
            if not isinstance(block, dict):
                continue
            if block.get("type") in (None, "text"):
                t = block.get("text")
                if isinstance(t, str) and t:
                    parts.append(t)
        return "\n\n".join(parts)
    return str(raw)


try:
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line may hold any JSON value; only objects are entries.
            if not isinstance(rec, dict):
                continue

            # NOTE(review): some transcript writers appear to nest the turn
            # under a "message" object ({"type": ..., "message": {"role",
            # "content"}}) — confirm against real transcripts. Top-level
            # fields keep precedence, so flat entries behave as before.
            message = rec.get("message")
            if not isinstance(message, dict):
                message = {}

            role = rec.get("role") or rec.get("type") or message.get("role")
            if role not in ("user", "assistant"):
                continue

            raw = rec.get("content")
            if raw is None:
                raw = message.get("content")

            content = flatten(raw).strip()
            if not content:
                continue
            out.append({
                "turn_index": turn_index,
                "role": role,
                "content": content,
            })
            turn_index += 1
except OSError:
    pass

print(json.dumps(out))
PY
}
|
|
77
|
+
|
|
78
|
+
# Print the summed length of every turn's `content` in a normalized turns
# array; prints 0 for an empty array. Prints nothing when jq rejects the
# input (its diagnostics are silenced).
#
# Usage: historian_transcript_char_count <turns_json>
historian_transcript_char_count() {
  local turns="${1:-[]}"
  jq '[.[] | (.content | length)] | add // 0' 2>/dev/null <<<"$turns"
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Minimal ULID generator for Historian chunk IDs.
|
|
3
|
+
#
|
|
4
|
+
# Spec: https://github.com/ulid/spec — 48-bit timestamp + 80-bit randomness,
|
|
5
|
+
# lexicographically sortable, Crockford Base32. Monotonicity within a single
|
|
6
|
+
# millisecond is not required at SessionEnd cadence.
|
|
7
|
+
|
|
8
|
+
# Crockford Base32 alphabet (no I, L, O, U), indexed by digit value.
_HISTORIAN_ULID_ALPHABET="0123456789ABCDEFGHJKMNPQRSTVWXYZ"

# Encode a non-negative integer as exactly <len> Base32 characters,
# most significant digit first; the result is zero-padded on the left and
# higher-order digits beyond <len> are discarded.
#
# Arguments: $1 - integer to encode, $2 - number of characters to emit
_historian_ulid_encode() {
  local value="$1"
  local width="$2"
  local encoded=""
  local digit

  # Peel off the least significant base-32 digit each round and prepend it.
  while ((${#encoded} < width)); do
    digit=$((value % 32))
    encoded="${_HISTORIAN_ULID_ALPHABET:digit:1}${encoded}"
    value=$((value / 32))
  done

  printf '%s' "$encoded"
}
|
|
21
|
+
|
|
22
|
+
# Print a 26-character ULID: 10 Base32 characters of millisecond timestamp
# followed by 16 characters (two 40-bit halves) of randomness.
#
# Timestamp sources, first usable one wins:
#   1. `date +%s%3N`  (only where the date implementation supports %N)
#   2. python3
#   3. `date +%s` * 1000 (second resolution)
# A source is "usable" only if it yields at least 13 decimal digits, so a
# date implementation that echoes "%3N" literally, or silently drops it, is
# rejected instead of corrupting the ID.
historian_ulid() {
  local ms_re='^[0-9]{13,}$'
  local now_ms=""

  now_ms=$(date +%s%3N 2>/dev/null) || now_ms=""
  if [[ ! "$now_ms" =~ $ms_re ]]; then
    now_ms=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null) \
      || now_ms=""
  fi
  if [[ ! "$now_ms" =~ $ms_re ]]; then
    now_ms=$(($(date +%s) * 1000))
  fi

  # $RANDOM yields 15 bits per read; three reads give 45 bits, masked down
  # to a full 40 bits per half (80 random bits in total).
  local mask=$(((1 << 40) - 1))
  local rand_hi rand_lo
  rand_hi=$((((RANDOM << 30) | (RANDOM << 15) | RANDOM) & mask))
  rand_lo=$((((RANDOM << 30) | (RANDOM << 15) | RANDOM) & mask))

  local ts_part hi_part lo_part
  ts_part=$(_historian_ulid_encode "$now_ms" 10)
  hi_part=$(_historian_ulid_encode "$rand_hi" 8)
  lo_part=$(_historian_ulid_encode "$rand_lo" 8)

  printf '%s%s%s' "$ts_part" "$hi_part" "$lo_part"
}
|