@onlooker-community/ecosystem 0.21.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +13 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.release-please-manifest.json +3 -2
- package/CHANGELOG.md +15 -0
- package/hooks/hooks.json +4 -0
- package/package.json +2 -2
- package/plugins/historian/.claude-plugin/plugin.json +14 -0
- package/plugins/historian/CHANGELOG.md +17 -0
- package/plugins/historian/README.md +84 -0
- package/plugins/historian/config.json +46 -0
- package/plugins/historian/hooks/hooks.json +26 -0
- package/plugins/historian/scripts/hooks/historian-prompt-submit.sh +269 -0
- package/plugins/historian/scripts/hooks/historian-session-end.sh +235 -0
- package/plugins/historian/scripts/lib/historian-chunker.sh +129 -0
- package/plugins/historian/scripts/lib/historian-config.sh +66 -0
- package/plugins/historian/scripts/lib/historian-embedder.sh +126 -0
- package/plugins/historian/scripts/lib/historian-emit.sh +61 -0
- package/plugins/historian/scripts/lib/historian-project-key.sh +80 -0
- package/plugins/historian/scripts/lib/historian-retriever.sh +191 -0
- package/plugins/historian/scripts/lib/historian-sanitizer.sh +123 -0
- package/plugins/historian/scripts/lib/historian-storage.sh +157 -0
- package/plugins/historian/scripts/lib/historian-transcript.sh +83 -0
- package/plugins/historian/scripts/lib/historian-ulid.sh +43 -0
- package/release-please-config.json +16 -0
- package/scripts/hooks/memory-recall-tracker.sh +206 -0
- package/test/bats/historian-prompt-submit.bats +236 -0
- package/test/bats/historian-session-end.bats +296 -0
- package/test/bats/memory-recall-tracker.bats +189 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Historian SessionEnd indexing pipeline.
|
|
3
|
+
#
|
|
4
|
+
# Reads the session transcript, drops tool calls / tool results, chunks
|
|
5
|
+
# the remaining user + assistant turns at turn boundaries, redacts
|
|
6
|
+
# secret-shaped substrings, and appends one JSONL line per surviving
|
|
7
|
+
# chunk to ~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl.
|
|
8
|
+
#
|
|
9
|
+
# Hook contract:
|
|
10
|
+
# - Always exits 0. Never blocks session shutdown.
|
|
11
|
+
# - No-ops when historian.enabled is not true.
|
|
12
|
+
# - No-ops when there is no project key, no transcript path, or the
|
|
13
|
+
# transcript is shorter than min_transcript_chars_to_index.
|
|
14
|
+
# - Indexing failures are fail-soft: an emitted historian.indexing.complete
|
|
15
|
+
# with outcome "skipped" + a skip_reason is the worst case.
|
|
16
|
+
|
|
17
|
+
set -uo pipefail

# Locate this hook on disk; the historian plugin root sits two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Ecosystem root: an explicit ONLOOKER_ECOSYSTEM_ROOT wins. Otherwise probe the
# directory two levels above the plugin root and accept it only when it
# carries scripts/lib/validate-path.sh.
_ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
  _candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
  [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]] && _ECOSYSTEM_ROOT="$_candidate"
fi
if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
  # shellcheck disable=SC1091
  CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
fi

# Pull in the historian libraries, in the order the pipeline relies on.
for _lib in config project-key ulid storage emit transcript chunker sanitizer embedder; do
  # shellcheck disable=SC1090
  source "${PLUGIN_ROOT}/scripts/lib/historian-${_lib}.sh"
done
unset _lib

# The hook payload arrives as JSON on stdin; every field is optional.
INPUT=$(cat 2>/dev/null || true)
CWD=$(jq -r '.cwd // ""' <<<"$INPUT" 2>/dev/null) || CWD=""
SESSION_ID=$(jq -r '.session_id // ""' <<<"$INPUT" 2>/dev/null) || SESSION_ID=""
TRANSCRIPT_PATH=$(jq -r '.transcript_path // ""' <<<"$INPUT" 2>/dev/null) || TRANSCRIPT_PATH=""
if [[ -z "$CWD" ]]; then
  CWD="$(pwd)"
fi
SESSION_ID="${SESSION_ID:-unknown}"

# Load config for the repository containing CWD; stop quietly when disabled.
REPO_ROOT=$(historian_project_repo_root "$CWD")
historian_config_load "$REPO_ROOT"
if ! historian_config_enabled; then
  exit 0
fi

# No project key means there is nowhere to store an index.
PROJECT_KEY=$(historian_project_key "$CWD")
if [[ -z "$PROJECT_KEY" ]]; then
  exit 0
fi

if ! historian_storage_init "$PROJECT_KEY"; then
  exit 0
fi
REMOTE_URL=$(historian_project_remote_url "$CWD")
# Manifest write is best-effort; a failure must not stop indexing.
historian_storage_write_manifest "$PROJECT_KEY" "$REMOTE_URL" "$REPO_ROOT" || true
|
|
70
|
+
|
|
71
|
+
# ----------------------------------------------------------------------------
# Transcript gating.
#
# A missing transcript produces a single historian.indexing.complete event
# with outcome "skipped" and no historian.indexing.started event. Once the
# transcript has been read, "started" is emitted with the real character
# count (emitting it earlier with a zero count produced misleading
# telemetry), and a too-short transcript is then reported as "skipped".
# ----------------------------------------------------------------------------

# Print the current epoch time in milliseconds. Falls back to whole-second
# resolution (seconds * 1000) when python3 is not usable.
_historian_now_ms() {
  python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null \
    || printf '%s\n' "$(( $(date +%s) * 1000 ))"
}

SCAN_START_MS=$(_historian_now_ms)

# Emit historian.indexing.complete with outcome "skipped".
# Arguments: $1 - skip reason recorded as skip_reason
# Globals:   SESSION_ID, SCAN_START_MS (read)
_emit_skip() {
  local reason="$1"
  local duration_ms
  duration_ms=$(( $(_historian_now_ms) - SCAN_START_MS ))
  historian_emit "historian.indexing.complete" "$SESSION_ID" "$(jq -cn \
    --arg outcome "skipped" \
    --arg skip_reason "$reason" \
    --argjson duration_ms "$duration_ms" \
    '{ outcome: $outcome, skip_reason: $skip_reason, duration_ms: $duration_ms }')"
}

if [[ -z "$TRANSCRIPT_PATH" || ! -f "$TRANSCRIPT_PATH" ]]; then
  _emit_skip "transcript_unavailable"
  exit 0
fi

MIN_CHARS=$(historian_config_get '.historian.indexing.min_transcript_chars_to_index')
# This value is read from layered settings files and is compared inside
# (( )) below. Only plain digits may get that far: bash would evaluate any
# other text as an arithmetic expression, and under `set -u` an unknown name
# would abort the hook with a non-zero status. Anything else falls back to
# the default. The 10# prefix keeps leading zeros from being read as octal.
[[ "$MIN_CHARS" =~ ^[0-9]+$ ]] || MIN_CHARS=1200
MIN_CHARS=$((10#$MIN_CHARS))

TURNS=$(historian_transcript_load "$TRANSCRIPT_PATH")
TRANSCRIPT_CHARS=$(historian_transcript_char_count "$TURNS")
[[ "$TRANSCRIPT_CHARS" =~ ^[0-9]+$ ]] || TRANSCRIPT_CHARS=0
TRANSCRIPT_CHARS=$((10#$TRANSCRIPT_CHARS))

historian_emit "historian.indexing.started" "$SESSION_ID" "$(jq -cn \
  --arg session_id "$SESSION_ID" \
  --argjson transcript_chars "$TRANSCRIPT_CHARS" \
  '{ session_id: $session_id, transcript_chars: $transcript_chars }')"

if (( TRANSCRIPT_CHARS < MIN_CHARS )); then
  _emit_skip "too_short"
  exit 0
fi
|
|
116
|
+
|
|
117
|
+
# ----------------------------------------------------------------------------
# Chunker → sanitizer → JSONL store.
# ----------------------------------------------------------------------------

# Chunk sizes are handed to the chunker, which converts them with int().
# Reject anything that is not an integer here so a malformed setting falls
# back to the default instead of crashing the chunker and producing an empty
# index.
TARGET_CHARS=$(historian_config_get '.historian.indexing.chunk_target_chars')
[[ "$TARGET_CHARS" =~ ^-?[0-9]+$ ]] || TARGET_CHARS=2400
OVERLAP_CHARS=$(historian_config_get '.historian.indexing.chunk_overlap_chars')
[[ "$OVERLAP_CHARS" =~ ^-?[0-9]+$ ]] || OVERLAP_CHARS=400

CHUNKS=$(historian_chunker_split "$TURNS" "$TARGET_CHARS" "$OVERLAP_CHARS")
NEVER_INDEX_PATHS=$(historian_config_get '.historian.sanitization.never_index_paths | tojson')
[[ -z "$NEVER_INDEX_PATHS" || "$NEVER_INDEX_PATHS" == "null" ]] && NEVER_INDEX_PATHS='[]'

# Honor the two on/off knobs from the config block (both default to on).
REDACT_SECRETS=$(historian_config_get '.historian.sanitization.redact_secret_patterns')
[[ -z "$REDACT_SECRETS" || "$REDACT_SECRETS" == "null" ]] && REDACT_SECRETS="true"
DROP_SKIP=$(historian_config_get '.historian.sanitization.drop_skip_marker')
[[ -z "$DROP_SKIP" || "$DROP_SKIP" == "null" ]] && DROP_SKIP="true"

SANITIZED=$(historian_sanitizer_run "$CHUNKS" "$NEVER_INDEX_PATHS" "$REDACT_SECRETS" "$DROP_SKIP")
# Normalize both lists to a JSON array, so later stages never see an empty
# string or null when the sanitizer output is missing or malformed.
KEPT=$(printf '%s' "$SANITIZED" | jq -c '.kept // []' 2>/dev/null) || KEPT='[]'
[[ -n "$KEPT" ]] || KEPT='[]'
DROPPED=$(printf '%s' "$SANITIZED" | jq -c '.dropped // []' 2>/dev/null) || DROPPED='[]'
[[ -n "$DROPPED" ]] || DROPPED='[]'

# Probe the embedder once before the chunk loop. If unavailable we
# index without vectors. The retriever shipped today is embedding-only,
# so chunks written without an `embedding` field are persisted but
# invisible to retrieval until they are re-indexed against a working
# embedder. Chunk bodies stay intact, so re-indexing is a re-embed pass
# rather than a full re-chunk.
EMBEDDER_READY=0
EMBEDDER_BACKEND=$(historian_config_get '.historian.embedder.backend')
[[ -z "$EMBEDDER_BACKEND" || "$EMBEDDER_BACKEND" == "null" ]] && EMBEDDER_BACKEND="none"
if [[ "$EMBEDDER_BACKEND" != "none" ]]; then
  if historian_embedder_available; then
    EMBEDDER_READY=1
  else
    historian_emit "historian.embedder.unavailable" "$SESSION_ID" "$(jq -cn \
      --arg backend "$EMBEDDER_BACKEND" \
      '{ backend: $backend }')"
  fi
fi
|
|
158
|
+
|
|
159
|
+
# Clear whatever was stored for this session before writing, so a SessionEnd
# that fires again for the same id replaces the session file instead of
# appending duplicates.
historian_storage_reset_session "$PROJECT_KEY" "$SESSION_ID"

NOW_TS=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
CHUNKS_INDEXED=0

# Stream the kept chunks as one compact JSON document per line. This parses
# the array once instead of re-parsing it for every index. The stream is read
# on fd 9 so nothing called inside the loop can consume it through stdin.
while IFS= read -r -u 9 CHUNK; do
  [[ -z "$CHUNK" || "$CHUNK" == "null" ]] && continue

  CHUNK_ID=$(historian_ulid)
  REDACTION_COUNT=$(printf '%s' "$CHUNK" | jq -r '.redaction_count // 0' 2>/dev/null) \
    || REDACTION_COUNT=0
  # Only plain digits may reach the (( )) test and --argjson below.
  [[ "$REDACTION_COUNT" =~ ^[0-9]+$ ]] || REDACTION_COUNT=0
  REDACTION_COUNT=$((10#$REDACTION_COUNT))
  BODY=$(printf '%s' "$CHUNK" | jq -r '.body_redacted // ""' 2>/dev/null) || BODY=""

  # Build the base record. The embedding (if any) is added below.
  RECORD=$(jq -cn \
    --arg chunk_id "$CHUNK_ID" \
    --arg session_id "$SESSION_ID" \
    --argjson chunk_input "$CHUNK" \
    --arg created_at "$NOW_TS" \
    --arg source "local" \
    '$chunk_input + {
      chunk_id: $chunk_id,
      session_id: $session_id,
      created_at: $created_at,
      source: $source
    }' 2>/dev/null) || RECORD=""
  # A chunk that could not be turned into a record is skipped rather than
  # appended as an empty line.
  [[ -n "$RECORD" ]] || continue

  if (( EMBEDDER_READY == 1 )) && [[ -n "$BODY" ]]; then
    EMBEDDING=$(historian_embedder_embed "$BODY")
    if [[ -n "$EMBEDDING" ]]; then
      RECORD=$(printf '%s' "$RECORD" | jq -c --argjson v "$EMBEDDING" \
        '. + { embedding: $v }')
    fi
  fi

  if historian_storage_append_chunk "$PROJECT_KEY" "$SESSION_ID" "$RECORD"; then
    CHUNKS_INDEXED=$((CHUNKS_INDEXED + 1))
    if (( REDACTION_COUNT > 0 )); then
      historian_emit "historian.chunk.sanitized" "$SESSION_ID" "$(jq -cn \
        --arg chunk_id "$CHUNK_ID" \
        --argjson redaction_count "$REDACTION_COUNT" \
        '{ chunk_id: $chunk_id, redaction_count: $redaction_count }')"
    fi
  fi
done 9< <(printf '%s' "$KEPT" \
  | jq -c 'if type == "array" then .[] else empty end' 2>/dev/null)
|
|
207
|
+
|
|
208
|
+
# Emit one chunk.dropped event per distinct skip reason (per-chunk emission
# would spam the log).
DROPPED_COUNT=$(printf '%s' "$DROPPED" | jq 'length' 2>/dev/null) || DROPPED_COUNT=0
# jq prints nothing for empty input; keep the count a plain integer so the
# arithmetic test and the --argjson in the final event stay valid.
[[ "$DROPPED_COUNT" =~ ^[0-9]+$ ]] || DROPPED_COUNT=0
if (( DROPPED_COUNT > 0 )); then
  # Read reasons line by line. Expanding them unquoted in a `for` list would
  # split a reason containing spaces into several events and glob-expand
  # wildcard characters. fd 9 keeps the list away from anything that reads
  # stdin inside the loop.
  while IFS= read -r -u 9 reason; do
    [[ -n "$reason" ]] || continue
    historian_emit "historian.chunk.dropped" "$SESSION_ID" "$(jq -cn \
      --arg reason "$reason" \
      '{ reason: $reason }')"
  done 9< <(printf '%s' "$DROPPED" | jq -r '.[].reason' 2>/dev/null | sort -u)
fi

# Wall-clock duration since SCAN_START_MS; falls back to second resolution
# when python3 is not usable.
NOW_MS=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null) \
  || NOW_MS=$(($(date +%s) * 1000))
DURATION_MS=$((NOW_MS - SCAN_START_MS))

historian_emit "historian.indexing.complete" "$SESSION_ID" "$(jq -cn \
  --arg outcome "ok" \
  --argjson chunks_indexed "$CHUNKS_INDEXED" \
  --argjson chunks_dropped "$DROPPED_COUNT" \
  --argjson duration_ms "$DURATION_MS" \
  '{
    outcome: $outcome,
    chunks_indexed: $chunks_indexed,
    chunks_dropped: $chunks_dropped,
    duration_ms: $duration_ms
  }')"

exit 0
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Chunker for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Given a JSON array of normalized turns (from historian-transcript.sh),
|
|
5
|
+
# produces a JSON array of chunk records. Each chunk:
|
|
6
|
+
# - Respects turn boundaries (no mid-turn splits)
|
|
7
|
+
# - Targets `target_chars` characters with `overlap_chars` overlap
|
|
8
|
+
# (carrying the last N chars of one chunk's content as the start of
|
|
9
|
+
# the next)
|
|
10
|
+
# - Records start_turn_index, end_turn_index, body_chars
|
|
11
|
+
#
|
|
12
|
+
# Character-based chunking instead of token-based: tokenizers vary by
|
|
13
|
+
# embedder, and the chunker shouldn't have to know which embedder will
|
|
14
|
+
# run downstream. Char counts approximate token counts at ~4 chars / token
|
|
15
|
+
# for English-ish prose; configs are tunable.
|
|
16
|
+
|
|
17
|
+
# Split normalized turns into chunks at turn boundaries.
#
# Usage:  historian_chunker_split <turns_json> <target_chars> <overlap_chars>
# Arguments:
#   $1 - JSON array of turns; each turn is read for "role", "content" and
#        "turn_index" (default "[]")
#   $2 - target chunk size in characters (default 2400)
#   $3 - characters of the previous chunk carried into the next (default 400)
# Output: JSON array of chunks on stdout. Each chunk has chunk_index,
#         start_turn_index, end_turn_index, body and body_chars.
#
# The turns JSON is handed to python on file descriptor 3 rather than as a
# command-line argument: a long transcript exceeds the operating system's
# argument size limit and python would fail to start. stdin stays reserved
# for the program text.
historian_chunker_split() {
  local turns="${1:-[]}"
  local target_chars="${2:-2400}"
  local overlap_chars="${3:-400}"

  python3 - "$target_chars" "$overlap_chars" 3<<<"$turns" <<'PY'
import json, sys

target = int(sys.argv[1])
overlap = max(0, int(sys.argv[2]))
with open(3, encoding="utf-8") as fh:
    turns = json.loads(fh.read().strip() or "[]")

chunks = []
chunk_index = 0
buf_parts = []
buf_chars = 0
buf_start = None
buf_end = None

# Pending overlap text carried from the previous chunk. It seeds the next
# chunk's body but doesn't get attributed a turn (the overlap is purely
# textual continuity for the embedder).
pending_overlap = ""


def flush(force_text=None):
    """Emit the current buffer as a chunk and reset the buffer.

    force_text overrides the accumulated body and is used when a single
    turn exceeds the target. Whitespace-only bodies are dropped.
    """
    global chunk_index, buf_parts, buf_chars, buf_start, buf_end
    if force_text is None:
        if not buf_parts:
            return
        body = "\n\n".join(buf_parts)
    else:
        body = force_text
    if body.strip():
        chunks.append({
            "chunk_index": chunk_index,
            "start_turn_index": buf_start,
            "end_turn_index": buf_end,
            "body": body,
            "body_chars": len(body),
        })
        chunk_index += 1
    buf_parts = []
    buf_chars = 0
    buf_start = None
    buf_end = None


for turn in turns:
    role = turn.get("role", "")
    content = turn.get("content", "")
    if not content:
        continue
    rendered = f"{role}: {content}"
    rendered_len = len(rendered)

    # A single turn larger than the target becomes its own chunk: flush
    # whatever is pending, emit the oversized turn (prefixed with any pending
    # overlap), and carry its tail forward as the next overlap.
    if rendered_len > target:
        if buf_parts:
            flush()
        body_for_chunk = (pending_overlap + ("\n\n" if pending_overlap else "")) + rendered
        buf_start = turn["turn_index"]
        buf_end = turn["turn_index"]
        flush(force_text=body_for_chunk)
        pending_overlap = body_for_chunk[-overlap:] if overlap > 0 else ""
        continue

    candidate_len = buf_chars + rendered_len + (2 if buf_parts else 0)  # 2 for "\n\n"
    if buf_parts and candidate_len > target:
        # Flush first, THEN take the overlap from the chunk that flush just
        # appended. Reading chunks[-1] before the flush would seed the new
        # chunk with the tail of the chunk before that one (and give the
        # very first boundary no overlap at all).
        emitted_before = len(chunks)
        flush()
        if overlap > 0 and len(chunks) > emitted_before:
            pending_overlap = chunks[-1]["body"][-overlap:]
        else:
            pending_overlap = ""

    if not buf_parts and pending_overlap:
        buf_parts.append(pending_overlap)
        buf_chars += len(pending_overlap)
        pending_overlap = ""

    buf_parts.append(rendered)
    buf_chars += rendered_len + (2 if len(buf_parts) > 1 else 0)
    if buf_start is None:
        buf_start = turn["turn_index"]
    buf_end = turn["turn_index"]

# Final flush.
if buf_parts:
    flush()

print(json.dumps(chunks))
PY
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Config resolution for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Reads three layers, latest wins:
|
|
5
|
+
# 1. plugins/historian/config.json (defaults shipped with the plugin)
|
|
6
|
+
# 2. ~/.claude/settings.json
|
|
7
|
+
# 3. <repo>/.claude/settings.json
|
|
8
|
+
#
|
|
9
|
+
# Exposes:
|
|
10
|
+
# historian_config_load <repo_root> # populates _HISTORIAN_CONFIG (JSON)
|
|
11
|
+
# historian_config_get <jq-path> # echoes string value (empty if unset)
|
|
12
|
+
# historian_config_enabled # 0 if historian.enabled is true
|
|
13
|
+
#
|
|
14
|
+
# Settings overlay only touches the `historian.*` subtree of settings.json.
|
|
15
|
+
|
|
16
|
+
_HISTORIAN_CONFIG="{}"

# Build the effective config and store it in _HISTORIAN_CONFIG (JSON text).
#
# Layers, later wins:
#   1. ${CLAUDE_PLUGIN_ROOT}/config.json   (must be a JSON object)
#   2. ${HOME}/.claude/settings.json       (only its `historian` subtree)
#   3. <repo_root>/.claude/settings.json   (only its `historian` subtree)
#
# Arguments: $1 - repository root (optional; layer 3 is skipped when empty)
# Globals:   CLAUDE_PLUGIN_ROOT, HOME (read); _HISTORIAN_CONFIG (written)
#
# Every layer is best-effort: a missing, unreadable or malformed file is
# ignored and the layers merged so far are kept.
historian_config_load() {
  local repo_root="${1:-}"
  local plugin_root="${CLAUDE_PLUGIN_ROOT:-}"
  local home_dir="${HOME:-}"

  local merged="{}"
  local base file layer combined

  # An unset plugin root must not turn into a probe of /config.json.
  if [[ -n "$plugin_root" && -f "${plugin_root}/config.json" ]]; then
    # Accept only a JSON object. An empty or non-object defaults file would
    # otherwise become the merge base and make every later merge fail.
    layer=$(jq -c 'if type == "object" then . else empty end' \
      "${plugin_root}/config.json" 2>/dev/null) || layer=""
    [[ -n "$layer" ]] && merged="$layer"
  fi

  for base in "$home_dir" "$repo_root"; do
    # Skip empty bases so we never read /.claude/settings.json by accident.
    [[ -n "$base" ]] || continue
    file="${base}/.claude/settings.json"
    [[ -f "$file" ]] || continue

    layer=$(jq -c '{ historian: (.historian | if type == "object" then . else {} end) }' \
      "$file" 2>/dev/null) || continue
    [[ -n "$layer" ]] || continue

    # Merge into a scratch variable: assigning straight to `merged` would
    # replace it with an empty string whenever jq fails.
    combined=$(jq -cn --argjson a "$merged" --argjson b "$layer" '
      def deepmerge($a; $b):
        if ($a|type) == "object" and ($b|type) == "object" then
          reduce (($a|keys) + ($b|keys) | unique)[] as $k
            ({}; .[$k] = deepmerge($a[$k]; $b[$k]))
        elif $b == null then $a
        else $b end;
      deepmerge($a; $b)
    ' 2>/dev/null) || continue
    [[ -n "$combined" ]] && merged="$combined"
  done

  _HISTORIAN_CONFIG="$merged"
}
|
|
52
|
+
|
|
53
|
+
# Look up a jq path in the loaded config and print it raw.
# Null or missing values print nothing. The filter tests for null
# explicitly instead of using `// empty`, because `//` also discards
# boolean `false` and would hide an explicitly disabled setting.
# Arguments: $1 - jq path expression, e.g. '.historian.enabled'
# Globals:   _HISTORIAN_CONFIG (read)
historian_config_get() {
  local jq_path="$1"
  jq -r "${jq_path} | select(. != null)" 2>/dev/null <<<"$_HISTORIAN_CONFIG"
}
|
|
61
|
+
|
|
62
|
+
# Succeeds (status 0) only when historian.enabled in the loaded config is
# exactly `true`; any other value, or no value, is treated as disabled.
historian_config_enabled() {
  [[ "$(historian_config_get '.historian.enabled')" == "true" ]]
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Embedder client for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Per ADR-001, the default backend is local ollama with the
|
|
5
|
+
# `nomic-embed-text` model. The interface is intentionally a single
|
|
6
|
+
# function that takes a string and returns a JSON array of floats, so
|
|
7
|
+
# alternate backends (fastembed sidecar, remote API) can drop in later
|
|
8
|
+
# without changing callers.
|
|
9
|
+
#
|
|
10
|
+
# Fail-soft: returns empty string on any failure (ollama not reachable,
|
|
11
|
+
# JSON decode error, missing curl). Callers treat empty as "skip the
|
|
12
|
+
# embedding and emit historian.embedder.unavailable".
|
|
13
|
+
|
|
14
|
+
# Resolve config (the caller has typically run historian_config_load
|
|
15
|
+
# before invoking us). We re-read the config knobs here so this lib can
|
|
16
|
+
# be sourced and used outside the SessionEnd hook context.
|
|
17
|
+
|
|
18
|
+
# Print the configured embedder backend, or "none" when nothing is set.
_historian_embedder_backend() {
  local backend
  backend=$(historian_config_get '.historian.embedder.backend' 2>/dev/null)
  printf '%s' "${backend:-none}"
}
|
|
24
|
+
|
|
25
|
+
# Print the configured ollama base URL, defaulting to the local daemon
# address http://127.0.0.1:11434.
_historian_embedder_ollama_host() {
  local host
  host=$(historian_config_get '.historian.embedder.ollama.host' 2>/dev/null)
  printf '%s' "${host:-http://127.0.0.1:11434}"
}
|
|
31
|
+
|
|
32
|
+
# Print the configured ollama embedding model, defaulting to
# nomic-embed-text.
_historian_embedder_ollama_model() {
  local model
  model=$(historian_config_get '.historian.embedder.ollama.model' 2>/dev/null)
  printf '%s' "${model:-nomic-embed-text}"
}
|
|
38
|
+
|
|
39
|
+
# Print the per-request timeout in seconds for ollama calls (default 8).
#
# The value is passed to curl's --max-time, so it is validated here:
#   - anything that is not a plain decimal number falls back to the default
#     (curl would reject it and every request would fail);
#   - zero falls back to the default as well, because curl treats a zero
#     --max-time as "no limit" and this client runs inside a hook that must
#     not block.
_historian_embedder_ollama_timeout() {
  local seconds
  seconds=$(historian_config_get '.historian.embedder.ollama.request_timeout_seconds' 2>/dev/null)
  if [[ ! "$seconds" =~ ^[0-9]+([.][0-9]+)?$ || ! "$seconds" =~ [1-9] ]]; then
    seconds=8
  fi
  printf '%s' "$seconds"
}
|
|
45
|
+
|
|
46
|
+
# Probe whether the configured embedder can be used right now.
# Returns: 0 when the backend is "ollama", curl is installed and a request
#          to <host>/api/tags succeeds within the configured timeout;
#          1 for backend "none", an empty backend, any backend without an
#          implementation here, or a missing curl; otherwise curl's status.
historian_embedder_available() {
  local backend host timeout
  backend=$(_historian_embedder_backend)

  # ollama is the only backend with a client in this file; everything else
  # (none, empty, fastembed, remote, ...) is reported as unavailable.
  [[ "$backend" == "ollama" ]] || return 1
  command -v curl >/dev/null 2>&1 || return 1

  host=$(_historian_embedder_ollama_host)
  timeout=$(_historian_embedder_ollama_timeout)
  # GET /api/tags with the response body discarded: only the status matters.
  curl -fsS --max-time "$timeout" -o /dev/null "${host}/api/tags" 2>/dev/null
}
|
|
71
|
+
|
|
72
|
+
# Embed one string with the configured backend.
# Usage:   historian_embedder_embed <text>
# Outputs: a JSON array of floats (e.g. `[0.123,0.456,...]`) on success;
#          nothing for empty text, backend "none", or a backend that has no
#          implementation here.
historian_embedder_embed() {
  local text="${1:-}"
  [[ -n "$text" ]] || return 0

  local backend
  backend=$(_historian_embedder_backend)
  # Only ollama has a client; every other backend is a fail-soft no-op.
  if [[ "$backend" == "ollama" ]]; then
    _historian_embedder_embed_ollama "$text"
  fi
}
|
|
94
|
+
|
|
95
|
+
# Internal: request an embedding from ollama's /api/embeddings endpoint.
# Arguments: $1 - text to embed
# Outputs:   the embedding as a compact JSON array of numbers, or nothing
#            on any failure (curl missing, request failed, unexpected
#            response).
# Returns:   always 0 (fail-soft).
#
# Both the text and the request body are streamed over stdin instead of
# being passed as command-line arguments. A chunk built from one oversized
# turn can be larger than the operating system allows for a single
# argument, which would make jq or curl fail to start.
_historian_embedder_embed_ollama() {
  local text="$1"
  command -v curl >/dev/null 2>&1 || return 0

  local host model timeout payload response vector
  host=$(_historian_embedder_ollama_host)
  model=$(_historian_embedder_ollama_model)
  timeout=$(_historian_embedder_ollama_timeout)

  # -R -s reads all of stdin as one raw string, which becomes the prompt.
  payload=$(printf '%s' "$text" \
    | jq -Rsc --arg model "$model" '{ model: $model, prompt: . }' 2>/dev/null) || return 0
  [[ -n "$payload" ]] || return 0

  # `-d @-` makes curl read the body from stdin. The payload is compact
  # JSON, so the newline added by the here-string is the only one present
  # and curl's -d strips it again.
  response=$(curl -fsS --max-time "$timeout" \
    -H 'Content-Type: application/json' \
    -d @- \
    "${host}/api/embeddings" 2>/dev/null <<<"$payload") || return 0
  [[ -n "$response" ]] || return 0

  # The endpoint returns `{"embedding":[...]}`. Keep the array only when it
  # is a non-empty array of numbers.
  vector=$(printf '%s' "$response" | jq -c '
    .embedding
    | select(type == "array" and length > 0 and all(.[]; type == "number"))
  ' 2>/dev/null) || return 0
  [[ -n "$vector" ]] || return 0

  printf '%s' "$vector"
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Event emission helpers for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Thin wrapper around onlooker-event.mjs `emit` mode for historian.* events.
|
|
5
|
+
# Fail-soft: returns 0 on success or when the substrate is unavailable.
|
|
6
|
+
|
|
7
|
+
# Print the path to onlooker-event.mjs, or nothing when no ecosystem root
# can be determined.
# ONLOOKER_ECOSYSTEM_ROOT wins when set (the file is not checked in that
# case). Otherwise the directory two levels above the plugin root is used,
# but only if it actually contains scripts/lib/onlooker-event.mjs.
_historian_resolve_event_js() {
  local here plugin_dir probe
  local root="${ONLOOKER_ECOSYSTEM_ROOT:-}"

  here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  plugin_dir="$(cd "${here}/../.." && pwd)"

  if [[ -z "$root" ]]; then
    probe="$(cd "${plugin_dir}/../.." 2>/dev/null && pwd)"
    [[ -f "${probe}/scripts/lib/onlooker-event.mjs" ]] && root="$probe"
  fi

  [[ -z "$root" ]] || printf '%s/scripts/lib/onlooker-event.mjs' "$root"
}

# Resolve once at source time unless the caller already provided a path.
if [[ -z "${_HISTORIAN_EVENT_JS:-}" ]]; then
  _HISTORIAN_EVENT_JS="$(_historian_resolve_event_js)"
fi
|
|
26
|
+
|
|
27
|
+
# Emit a historian.* event through onlooker-event.mjs and append the result
# to the events log.
# Usage:     historian_emit <event_type> <session_id> <payload_json>
# Arguments: $1 - event type, e.g. historian.indexing.complete
#            $2 - session id
#            $3 - payload as JSON text (default: {})
# Globals:   _HISTORIAN_EVENT_JS, ONLOOKER_EVENTS_LOG, ONLOOKER_DIR, HOME (read)
# Returns:   always 0. The call is a silent no-op when the event type or
#            session id is empty, the emitter script or node is missing,
#            ONLOOKER_EVENTS_LOG is unset, the payload is not valid JSON, or
#            the log cannot be written.
historian_emit() {
  local event_type="${1:-}"
  local session_id="${2:-}"
  local payload="${3:-}"
  [[ -n "$payload" ]] || payload='{}'

  [[ -n "$event_type" && -n "$session_id" ]] || return 0
  [[ -n "${_HISTORIAN_EVENT_JS:-}" && -f "${_HISTORIAN_EVENT_JS:-}" ]] || return 0
  command -v node >/dev/null 2>&1 || return 0
  [[ -n "${ONLOOKER_EVENTS_LOG:-}" ]] || return 0

  local params event_json
  params=$(jq -cn \
    --arg plugin "historian" \
    --arg session_id "$session_id" \
    --arg event_type "$event_type" \
    --argjson payload "$payload" \
    '{
      plugin: $plugin,
      session_id: $session_id,
      event_type: $event_type,
      payload: $payload
    }' 2>/dev/null) || return 0

  # The environment assignments must prefix `node`, the process that reads
  # them. Placed in front of `printf` they would apply only to printf and
  # node would never see ONLOOKER_DIR or ONLOOKER_PLUGIN_NAME.
  event_json=$(
    printf '%s' "$params" \
      | ONLOOKER_DIR="${ONLOOKER_DIR:-${HOME:-}/.onlooker}" \
        ONLOOKER_PLUGIN_NAME="historian" \
        node "$_HISTORIAN_EVENT_JS" emit 2>/dev/null
  ) || return 0
  [[ -n "$event_json" ]] || return 0

  # Logging is best-effort: a log that cannot be created or written must
  # not surface as a failure to the caller.
  mkdir -p "$(dirname "$ONLOOKER_EVENTS_LOG")" 2>/dev/null || return 0
  printf '%s\n' "$event_json" >>"$ONLOOKER_EVENTS_LOG" 2>/dev/null || return 0
  return 0
}
|