@onlooker-community/ecosystem 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +39 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.release-please-manifest.json +4 -2
- package/CHANGELOG.md +14 -0
- package/package.json +2 -2
- package/plugins/curator/.claude-plugin/plugin.json +14 -0
- package/plugins/curator/CHANGELOG.md +10 -0
- package/plugins/curator/README.md +55 -0
- package/plugins/curator/config.json +41 -0
- package/plugins/curator/hooks/hooks.json +15 -0
- package/plugins/curator/scripts/hooks/curator-session-start.sh +343 -0
- package/plugins/curator/scripts/lib/curator-checks.sh +155 -0
- package/plugins/curator/scripts/lib/curator-config.sh +67 -0
- package/plugins/curator/scripts/lib/curator-emit.sh +61 -0
- package/plugins/curator/scripts/lib/curator-memory-reader.sh +225 -0
- package/plugins/curator/scripts/lib/curator-project-key.sh +82 -0
- package/plugins/curator/scripts/lib/curator-storage.sh +176 -0
- package/plugins/curator/scripts/lib/curator-ulid.sh +43 -0
- package/plugins/historian/.claude-plugin/plugin.json +14 -0
- package/plugins/historian/CHANGELOG.md +10 -0
- package/plugins/historian/README.md +70 -0
- package/plugins/historian/config.json +30 -0
- package/plugins/historian/hooks/hooks.json +26 -0
- package/plugins/historian/scripts/hooks/historian-prompt-submit.sh +15 -0
- package/plugins/historian/scripts/hooks/historian-session-end.sh +204 -0
- package/plugins/historian/scripts/lib/historian-chunker.sh +129 -0
- package/plugins/historian/scripts/lib/historian-config.sh +66 -0
- package/plugins/historian/scripts/lib/historian-emit.sh +61 -0
- package/plugins/historian/scripts/lib/historian-project-key.sh +80 -0
- package/plugins/historian/scripts/lib/historian-sanitizer.sh +123 -0
- package/plugins/historian/scripts/lib/historian-storage.sh +110 -0
- package/plugins/historian/scripts/lib/historian-transcript.sh +83 -0
- package/plugins/historian/scripts/lib/historian-ulid.sh +43 -0
- package/release-please-config.json +32 -0
- package/test/bats/curator-session-start.bats +316 -0
- package/test/bats/historian-session-end.bats +296 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Storage layout helpers for Curator.
|
|
3
|
+
#
|
|
4
|
+
# Layout (under $ONLOOKER_DIR/curator/<project-key>/):
|
|
5
|
+
# manifest.json project metadata (remote_url, repo_root, last_seen_at)
|
|
6
|
+
# last_cheap_scan.json watermark: when cheap-tier last ran
|
|
7
|
+
# last_llm_sweep.json watermark: when LLM sweep last ran
|
|
8
|
+
# findings/<ulid>.json one finding per file (open, acknowledged, or resolved)
|
|
9
|
+
|
|
10
|
+
# ============================================================================
|
|
11
|
+
# Path helpers
|
|
12
|
+
# ============================================================================
|
|
13
|
+
|
|
14
|
+
# Print the directory that holds all Curator state (no trailing newline).
# $ONLOOKER_DIR is used when set and non-empty; otherwise $HOME/.onlooker.
curator_storage_root() {
  printf '%s/curator' "${ONLOOKER_DIR:-$HOME/.onlooker}"
}
|
|
18
|
+
|
|
19
|
+
# Print the storage directory for one project: <storage-root>/<project-key>.
#   $1 - project key
curator_project_dir() {
  local project_key="$1" root
  root="$(curator_storage_root)"
  printf '%s/%s' "$root" "$project_key"
}
|
|
23
|
+
|
|
24
|
+
# Print the findings directory for one project: <project-dir>/findings.
#   $1 - project key
curator_findings_dir() {
  local project_key="$1" project_dir
  project_dir="$(curator_project_dir "$project_key")"
  printf '%s/findings' "$project_dir"
}
|
|
28
|
+
|
|
29
|
+
# Create the on-disk layout for a project (the project directory and its
# findings/ subdirectory).
#   $1 - project key
# Returns 1 for an empty key, otherwise the exit status of mkdir.
curator_storage_init() {
  local project_key="$1"
  if [[ -z "$project_key" ]]; then
    return 1
  fi
  mkdir -p "$(curator_project_dir "$project_key")/findings" 2>/dev/null
}
|
|
36
|
+
|
|
37
|
+
# Write (or refresh) the project's manifest.json.
#
# Arguments:
#   $1 - project key (required)
#   $2 - remote URL; stored as JSON null when empty
#   $3 - repository root; stored as JSON null when empty
# Returns: 0 on success; non-zero when the key is empty, the storage
#   directory cannot be created, jq fails, or the file cannot be written.
curator_storage_write_manifest() {
  local key="${1:-}"
  local remote_url="${2:-}"
  local repo_root="${3:-}"
  [[ -z "$key" ]] && return 1

  curator_storage_init "$key" || return 1
  local manifest_path now manifest_json
  manifest_path="$(curator_project_dir "$key")/manifest.json"
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  # Build the document before touching the file: redirecting jq straight
  # into the manifest truncates the previous manifest even when jq is
  # missing or fails, leaving an empty file behind.
  manifest_json=$(jq -n \
    --arg key "$key" \
    --arg remote "$remote_url" \
    --arg root "$repo_root" \
    --arg now "$now" \
    '{
      project_key: $key,
      remote_url: (if $remote == "" then null else $remote end),
      repo_root: (if $root == "" then null else $root end),
      last_seen_at: $now
    }' 2>/dev/null) || return 1

  { printf '%s\n' "$manifest_json" > "$manifest_path"; } 2>/dev/null
}
|
|
60
|
+
|
|
61
|
+
# ============================================================================
|
|
62
|
+
# Watermarks
|
|
63
|
+
# ============================================================================
|
|
64
|
+
|
|
65
|
+
# Print the path of the cheap-tier scan watermark file for project key $1.
curator_last_cheap_scan_path() {
  local project_dir
  project_dir="$(curator_project_dir "$1")"
  printf '%s/last_cheap_scan.json' "$project_dir"
}
|
|
68
|
+
|
|
69
|
+
# Print the path of the LLM sweep watermark file for project key $1.
curator_last_llm_sweep_path() {
  local project_dir
  project_dir="$(curator_project_dir "$1")"
  printf '%s/last_llm_sweep.json' "$project_dir"
}
|
|
72
|
+
|
|
73
|
+
# Print the scanned_at value stored in a watermark file.
#   $1 - path to the watermark JSON file
# Prints nothing and returns 0 when the file does not exist; otherwise the
# exit status is jq's.
curator_storage_read_watermark() {
  local watermark_file="$1"
  if [[ -f "$watermark_file" ]]; then
    jq -r '.scanned_at // empty' "$watermark_file" 2>/dev/null
  fi
}
|
|
78
|
+
|
|
79
|
+
# Record "now" (UTC, ISO-8601) as the scanned_at watermark in the given file.
#
# Arguments:
#   $1 - path of the watermark JSON file (parent directories are created)
# Returns: 0 on success; non-zero when the path is empty, the parent
#   directory cannot be created, jq fails, or the file cannot be written.
curator_storage_write_watermark() {
  local path="${1:-}"
  [[ -z "$path" ]] && return 1
  mkdir -p -- "$(dirname -- "$path")" 2>/dev/null || return 1

  local now payload
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  # Generate first, write second: piping jq directly into the file would
  # truncate the previous watermark even if jq is missing or fails.
  payload=$(jq -n --arg t "$now" '{ scanned_at: $t }' 2>/dev/null) || return 1
  { printf '%s\n' "$payload" > "$path"; } 2>/dev/null
}
|
|
87
|
+
|
|
88
|
+
# ============================================================================
|
|
89
|
+
# Findings
|
|
90
|
+
# ============================================================================
|
|
91
|
+
|
|
92
|
+
# Write a finding to disk as findings/<ulid>.json and print the written path
# (no trailing newline).
#
# This function does NOT deduplicate. A caller that wants a repeat scan to
# skip an already-recorded fact must check
# curator_storage_has_finding_with_hash before calling it.
#
# Usage: curator_storage_write_finding <key> <ulid> <json>
# Returns: 0 after a successful write; non-zero when any argument is empty,
#   the id contains a path separator, storage cannot be initialised, or the
#   file cannot be written.
curator_storage_write_finding() {
  local key="${1:-}"
  local id="${2:-}"
  local json="${3:-}"
  [[ -z "$key" || -z "$id" || -z "$json" ]] && return 1
  # The id becomes a file name; refuse separators so it cannot point
  # outside the findings directory.
  [[ "$id" == */* ]] && return 1

  curator_storage_init "$key" || return 1
  local path
  path="$(curator_findings_dir "$key")/${id}.json"
  printf '%s\n' "$json" > "$path" 2>/dev/null && printf '%s' "$path"
}
|
|
107
|
+
|
|
108
|
+
# Load all findings for a project key as a JSON array on stdout.
#
# Arguments:
#   $1 - project key
# Output: a JSON array with one element per readable finding file. Prints
#   [] when the key is empty, the findings directory does not exist, or no
#   file holds usable JSON.
# Files that are empty, unparseable, or hold more than one JSON value are
# skipped, so a single bad file cannot invalidate the whole result.
curator_storage_load_findings() {
  local key="${1:-}"
  [[ -z "$key" ]] && { echo '[]'; return 0; }
  local dir
  dir=$(curator_findings_dir "$key")
  [[ -d "$dir" ]] || { echo '[]'; return 0; }

  local file item stream='' all
  for file in "$dir"/*.json; do
    [[ -f "$file" ]] || continue
    # Slurp the file and insist on exactly one JSON value; jq exits
    # non-zero for empty, multi-value, or malformed files.
    item=$(jq -c -s \
      'if length == 1 then .[0] else error("expected exactly one JSON value") end' \
      "$file" 2>/dev/null) || continue
    stream+="${item}"$'\n'
  done

  # Assemble the array in one pass over stdin rather than re-parsing a
  # growing array per file and passing each item on the command line.
  all=$(printf '%s' "$stream" | jq -s '.' 2>/dev/null) || all='[]'
  [[ -n "$all" ]] || all='[]'
  printf '%s' "$all"
}
|
|
125
|
+
|
|
126
|
+
# Return 0 if a finding with the given dedup hash already exists (open).
|
|
127
|
+
# Succeed when the project already has an open finding with this dedup hash.
#   $1 - project key
#   $2 - dedup hash to look for
# A finding without a status field counts as open. Returns 1 when either
# argument is empty; otherwise the exit status is that of `jq -e`.
curator_storage_has_finding_with_hash() {
  local project_key="$1" wanted_hash="$2"
  if [[ -z "$project_key" || -z "$wanted_hash" ]]; then
    return 1
  fi

  local findings_json
  findings_json=$(curator_storage_load_findings "$project_key")
  printf '%s' "$findings_json" | jq -e --arg h "$wanted_hash" '
any(.[]; (.deduped_hash // "") == $h and (.status // "open") == "open")
' >/dev/null 2>&1
}
|
|
137
|
+
|
|
138
|
+
# Print the number of open findings for project key $1.
# A finding without a status field counts as open.
curator_storage_count_open() {
  local project_key="$1" findings_json
  findings_json=$(curator_storage_load_findings "$project_key")
  printf '%s' "$findings_json" \
    | jq '[.[] | select((.status // "open") == "open")] | length' 2>/dev/null
}
|
|
144
|
+
|
|
145
|
+
# Open-finding counts grouped by kind, printed as a compact JSON array of
# { kind, count } objects ordered by descending count. Used by the surfacer
# to render a pointer like "2 path-broken, 1 date-decayed".
#
# jq's group_by sorts its input by the grouping key before grouping, so each
# kind always lands in exactly one group and no explicit pre-sort is needed.
# The array handed to the final sort_by is the same with or without one.
#
# Arguments:
#   $1 - project key
curator_storage_open_counts_by_kind() {
  local key="$1"
  local all
  all=$(curator_storage_load_findings "$key")
  printf '%s' "$all" | jq -c '
    [.[] | select((.status // "open") == "open")]
    | group_by(.kind)
    | map({ kind: .[0].kind, count: length })
    | sort_by(-.count)
  '
}
|
|
163
|
+
|
|
164
|
+
# Hash a finding's identity-relevant fields. Two findings with the same
|
|
165
|
+
# kind + memory_file + matched_phrase (where applicable) share a hash.
|
|
166
|
+
# Plain shasum input — no expensive normalization needed.
|
|
167
|
+
curator_finding_hash() {
|
|
168
|
+
local raw="$1"
|
|
169
|
+
if command -v shasum >/dev/null 2>&1; then
|
|
170
|
+
printf '%s' "$raw" | shasum -a 256 2>/dev/null | cut -c1-16
|
|
171
|
+
elif command -v sha256sum >/dev/null 2>&1; then
|
|
172
|
+
printf '%s' "$raw" | sha256sum 2>/dev/null | cut -c1-16
|
|
173
|
+
else
|
|
174
|
+
return 1
|
|
175
|
+
fi
|
|
176
|
+
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Minimal ULID generator for Curator finding IDs.
|
|
3
|
+
#
|
|
4
|
+
# Spec: https://github.com/ulid/spec — 48-bit timestamp + 80-bit randomness,
|
|
5
|
+
# lexicographically sortable, Crockford Base32. Monotonicity within a single
|
|
6
|
+
# millisecond is not required; findings are written infrequently.
|
|
7
|
+
|
|
8
|
+
# Crockford Base32 alphabet (no I, L, O, U), indexed by 5-bit digit value.
_CURATOR_ULID_ALPHABET="0123456789ABCDEFGHJKMNPQRSTVWXYZ"

# Encode an integer as a fixed-width Base32 string using the alphabet above.
#   $1 - integer to encode
#   $2 - number of characters to emit (left-padded with "0")
# Digits are produced least-significant first and prepended, so higher bits
# that do not fit in the requested width are dropped.
_curator_ulid_encode() {
  local value="$1" width="$2"
  local encoded="" remaining digit
  remaining=$(( width ))
  while (( remaining > 0 )); do
    digit=$(( value % 32 ))
    encoded="${_CURATOR_ULID_ALPHABET:digit:1}${encoded}"
    value=$(( value / 32 ))
    remaining=$(( remaining - 1 ))
  done
  printf '%s' "$encoded"
}
|
|
21
|
+
|
|
22
|
+
# Print a 26-character ULID: 10 Base32 characters of millisecond timestamp
# followed by 16 Base32 characters (80 bits) of randomness.
#
# The timestamp comes from python3 on Darwin and from `date +%s%3N`
# elsewhere. If either source fails or yields anything other than digits,
# whole-second resolution (seconds * 1000) is used instead.
curator_ulid() {
  local now_ms=""
  if [[ "$(uname)" == "Darwin" ]]; then
    now_ms=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null) \
      || now_ms=""
  else
    now_ms=$(date +%s%3N 2>/dev/null) || now_ms=""
  fi
  # A date without %N support can exit 0 yet print non-numeric text, which
  # would break the arithmetic in the encoder; only accept plain digits.
  if [[ ! "$now_ms" =~ ^[0-9]+$ ]]; then
    now_ms=$(( $(date +%s) * 1000 ))
  fi

  # RANDOM yields 15 bits per read. 15 + 15 + 10 bits fill each 40-bit half
  # completely, giving the full 80 random bits.
  local rand_hi rand_lo
  rand_hi=$(( (RANDOM << 25) | (RANDOM << 10) | (RANDOM & 1023) ))
  rand_lo=$(( (RANDOM << 25) | (RANDOM << 10) | (RANDOM & 1023) ))

  local ts_part hi_part lo_part
  ts_part=$(_curator_ulid_encode "$now_ms" 10)
  hi_part=$(_curator_ulid_encode "$rand_hi" 8)
  lo_part=$(_curator_ulid_encode "$rand_lo" 8)

  printf '%s%s%s' "$ts_part" "$hi_part" "$lo_part"
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "historian",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Episodic memory layer. At SessionEnd, chunks and sanitizes the session transcript and stores the chunks locally under ~/.onlooker/historian/<project-key>/sessions/. Retrieval (vector embeddings + UserPromptSubmit similarity surfacer) lands in a follow-up; this release ships the indexing pipeline only. Builds on the Onlooker ecosystem plugin.",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Onlooker Community",
|
|
7
|
+
"url": "https://onlooker.dev"
|
|
8
|
+
},
|
|
9
|
+
"homepage": "https://onlooker.dev",
|
|
10
|
+
"repository": "https://github.com/onlooker-community/ecosystem",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"skills": [],
|
|
13
|
+
"agents": []
|
|
14
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0](https://github.com/onlooker-community/ecosystem/compare/historian-v0.0.1...historian-v0.1.0) (2026-06-04)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* **historian:** introduce SessionEnd indexing :spiral_notepad: ([#59](https://github.com/onlooker-community/ecosystem/issues/59)) ([dd6c7f6](https://github.com/onlooker-community/ecosystem/commit/dd6c7f6ea872437cab6b16de50838dfc72750c7b))
|
|
9
|
+
|
|
10
|
+
## Changelog
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Historian
|
|
2
|
+
|
|
3
|
+
Episodic memory layer for past Claude Code sessions.
|
|
4
|
+
|
|
5
|
+
At every `SessionEnd`, Historian reads the session transcript, splits it into overlapping chunks at turn boundaries, redacts secret-shaped substrings, and persists the chunks under `~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl`. Once the retrieval pipeline lands (see [Status](#status)), future sessions will be able to retrieve relevant past chunks when the user starts on a similar problem.
|
|
6
|
+
|
|
7
|
+
Historian is a sibling plugin to [`ecosystem`](../../) and assumes the Onlooker observability substrate (`~/.onlooker/`) is present. It is parallel to [`librarian`](../librarian) (which consolidates session decisions into the typed memory store) — both turn session-scoped material into something queryable across sessions, but at different levels of distillation. Librarian distills; historian preserves verbatim.
|
|
8
|
+
|
|
9
|
+
See [`docs/design.md`](docs/design.md) and [ADR-001](docs/adr/001-local-embeddings-only.md) for the full design, including the local-embeddings-by-default decision.
|
|
10
|
+
|
|
11
|
+
## How it works
|
|
12
|
+
|
|
13
|
+
| Hook | What Historian does |
|
|
14
|
+
|------|---------------------|
|
|
15
|
+
| `SessionEnd` | Reads the transcript at `transcript_path`, drops tool calls and tool results (keeps user + assistant messages), chunks at turn boundaries inside the configured character target with overlap, runs the sanitizer (secret redaction + `[historian:skip]` markers + path-deny list), and appends one JSONL line per chunk to the session's file. Emits `historian.indexing.*` and `historian.chunk.*` events along the way. |
|
|
16
|
+
| `UserPromptSubmit` | No-op in this release — the rate gate, query embedder, ANN lookup, and surfacer are deferred to a follow-up that ships the retrieval pipeline alongside the first embedder backend. |
|
|
17
|
+
|
|
18
|
+
## Activation
|
|
19
|
+
|
|
20
|
+
Historian is **off by default**. Enable per-project in `.claude/settings.json`:
|
|
21
|
+
|
|
22
|
+
```json
|
|
23
|
+
{
|
|
24
|
+
"historian": {
|
|
25
|
+
"enabled": true
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
See [`config.json`](config.json) for the full set of tunable defaults.
|
|
31
|
+
|
|
32
|
+
## Storage layout
|
|
33
|
+
|
|
34
|
+
```text
|
|
35
|
+
~/.onlooker/historian/<project-key>/
|
|
36
|
+
├── manifest.json # project metadata
|
|
37
|
+
└── sessions/<session-id>.jsonl   # one chunk per line; replaced wholesale when the session is re-indexed
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Each chunk line:
|
|
41
|
+
|
|
42
|
+
```json
|
|
43
|
+
{
|
|
44
|
+
"chunk_id": "01J...",
|
|
45
|
+
"session_id": "...",
|
|
46
|
+
"chunk_index": 0,
|
|
47
|
+
"start_turn_index": 0,
|
|
48
|
+
"end_turn_index": 3,
|
|
49
|
+
"body_redacted": "...",
|
|
50
|
+
"body_chars": 2103,
|
|
51
|
+
"created_at": "2026-06-04T...",
|
|
52
|
+
"source": "local",
|
|
53
|
+
"redaction_count": 0
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Status
|
|
58
|
+
|
|
59
|
+
This plugin ships **scaffolding + the SessionEnd indexing pipeline (transcript reader → chunker → sanitizer → JSONL store)**. Deferred to follow-up landings:
|
|
60
|
+
|
|
61
|
+
- **Retrieval and surfacer** — `UserPromptSubmit` rate gate, query embedding, ANN lookup, and `additionalContext` injection of the top match.
|
|
62
|
+
- **Embedder backends** — ollama (`nomic-embed-text`), fastembed sidecar, and remote (opt-in via the two-key egress affirmation from [ADR-001](docs/adr/001-local-embeddings-only.md)). Chunks are indexed without vectors today; the JSONL records make adding embeddings a future column-add, not a re-index.
|
|
63
|
+
- **Prune (retention sweep) and purge (manual)** skills.
|
|
64
|
+
- **`/historian recall`, `/historian setup`, `/historian stats`, `/historian purge`** slash commands.
|
|
65
|
+
|
|
66
|
+
## Requirements
|
|
67
|
+
|
|
68
|
+
- The `ecosystem` plugin installed (for `~/.onlooker/` substrate).
|
|
69
|
+
- `jq` for JSON manipulation.
|
|
70
|
+
- `python3` for chunking and sanitization (no extra packages — stdlib only).
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"plugin_name": "historian",
|
|
3
|
+
"storage_path": "~/.onlooker",
|
|
4
|
+
"historian": {
|
|
5
|
+
"enabled": false,
|
|
6
|
+
"indexing": {
|
|
7
|
+
"trigger": "SessionEnd",
|
|
8
|
+
"min_transcript_chars_to_index": 1200,
|
|
9
|
+
"chunk_target_chars": 2400,
|
|
10
|
+
"chunk_overlap_chars": 400,
|
|
11
|
+
"retention_days": 365
|
|
12
|
+
},
|
|
13
|
+
"sanitization": {
|
|
14
|
+
"redact_secret_patterns": true,
|
|
15
|
+
"drop_skip_marker": true,
|
|
16
|
+
"never_index_paths": []
|
|
17
|
+
},
|
|
18
|
+
"session_archive": {
|
|
19
|
+
"enabled": false,
|
|
20
|
+
"_note": "When true, the full transcript at SessionEnd is copied alongside the chunks so retrieval can link to the source. When false, only chunk bodies are retained."
|
|
21
|
+
},
|
|
22
|
+
"embedder": {
|
|
23
|
+
"backend": "none",
|
|
24
|
+
"_note": "Embedder backends (ollama, fastembed, remote) are deferred to a follow-up. The current 'none' value indexes chunks without vectors; retrieval is also deferred."
|
|
25
|
+
},
|
|
26
|
+
"retrieval": {
|
|
27
|
+
"_note": "UserPromptSubmit retrieval and surfacer are deferred to a follow-up commit; the hook currently no-ops."
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"hooks": {
|
|
3
|
+
"SessionEnd": [
|
|
4
|
+
{
|
|
5
|
+
"matcher": "*",
|
|
6
|
+
"hooks": [
|
|
7
|
+
{
|
|
8
|
+
"type": "command",
|
|
9
|
+
"command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/historian-session-end.sh"
|
|
10
|
+
}
|
|
11
|
+
]
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"UserPromptSubmit": [
|
|
15
|
+
{
|
|
16
|
+
"matcher": "*",
|
|
17
|
+
"hooks": [
|
|
18
|
+
{
|
|
19
|
+
"type": "command",
|
|
20
|
+
"command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/historian-prompt-submit.sh"
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Historian UserPromptSubmit hook — STUB.
|
|
3
|
+
#
|
|
4
|
+
# The full retrieval pipeline (rate gate → query embedder → ANN lookup →
|
|
5
|
+
# additionalContext surfacer) is deferred to a follow-up landing that ships
|
|
6
|
+
# the first embedder backend. Today the hook is intentionally a no-op so
|
|
7
|
+
# the plugin can be installed and indexing can run without retrieval.
|
|
8
|
+
#
|
|
9
|
+
# Hook contract:
|
|
10
|
+
# - Always exits 0.
|
|
11
|
+
# - Never produces additionalContext while the retrieval pipeline is
|
|
12
|
+
# unimplemented.
|
|
13
|
+
|
|
14
|
+
# Same shell options as the other hooks, then succeed immediately: the stub
# reads nothing and prints nothing.
set -u -o pipefail
exit 0
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Historian SessionEnd indexing pipeline.
|
|
3
|
+
#
|
|
4
|
+
# Reads the session transcript, drops tool calls / tool results, chunks
|
|
5
|
+
# the remaining user + assistant turns at turn boundaries, redacts
|
|
6
|
+
# secret-shaped substrings, and appends one JSONL line per surviving
|
|
7
|
+
# chunk to ~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl.
|
|
8
|
+
#
|
|
9
|
+
# Hook contract:
|
|
10
|
+
# - Always exits 0. Never blocks session shutdown.
|
|
11
|
+
# - No-ops when historian.enabled is not true.
|
|
12
|
+
# - No-ops when there is no project key, no transcript path, or the
|
|
13
|
+
# transcript is shorter than min_transcript_chars_to_index.
|
|
14
|
+
# - Indexing failures are fail-soft: an emitted historian.indexing.complete
|
|
15
|
+
# with outcome "skipped" + a skip_reason is the worst case.
|
|
16
|
+
|
|
17
|
+
set -u -o pipefail

# Directory of this script, and the plugin root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Locate the ecosystem root: an explicit ONLOOKER_ECOSYSTEM_ROOT wins,
# otherwise probe two levels above the plugin root for validate-path.sh.
_ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
  _candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
  if [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
    _ECOSYSTEM_ROOT="$_candidate"
  fi
fi
if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
  # shellcheck disable=SC1091
  CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
fi

# Load the Historian libraries in the same order as before: config,
# project-key, ulid, storage, emit, transcript, chunker, sanitizer.
for _historian_lib in config project-key ulid storage emit transcript chunker sanitizer; do
  # shellcheck disable=SC1090
  source "${PLUGIN_ROOT}/scripts/lib/historian-${_historian_lib}.sh"
done
unset _historian_lib

# Hook input arrives as JSON on stdin; any field that is missing or cannot
# be parsed ends up empty.
INPUT=$(cat 2>/dev/null || true)
CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | jq -r '.transcript_path // ""' 2>/dev/null) || TRANSCRIPT_PATH=""
CWD="${CWD:-$(pwd)}"
SESSION_ID="${SESSION_ID:-unknown}"

REPO_ROOT=$(historian_project_repo_root "$CWD")
historian_config_load "$REPO_ROOT"
if ! historian_config_enabled; then
  exit 0
fi

PROJECT_KEY=$(historian_project_key "$CWD")
if [[ -z "$PROJECT_KEY" ]]; then
  exit 0
fi

if ! historian_storage_init "$PROJECT_KEY"; then
  exit 0
fi
REMOTE_URL=$(historian_project_remote_url "$CWD")
historian_storage_write_manifest "$PROJECT_KEY" "$REMOTE_URL" "$REPO_ROOT" || true

# ----------------------------------------------------------------------------
# The transcript-availability check runs first. When the transcript is
# unavailable, only a historian.indexing.complete with outcome "skipped" is
# emitted (no "started" event), so the timeline reads cleanly. "started" is
# emitted later, once a real char count is known: the schema requires
# transcript_chars on started, and emitting zero before the read produced
# misleading telemetry.
# ----------------------------------------------------------------------------

if ! SCAN_START_MS=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null); then
  SCAN_START_MS=$(($(date +%s) * 1000))
fi
|
|
79
|
+
|
|
80
|
+
# Emit historian.indexing.complete with outcome "skipped".
#   $1 - skip reason recorded as skip_reason
# Reads the globals SCAN_START_MS (to compute duration_ms) and SESSION_ID.
# The clock comes from python3, falling back to whole seconds from date.
_emit_skip() {
  local skip_reason="$1"
  local finished_ms elapsed_ms payload
  if ! finished_ms=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null); then
    finished_ms=$(($(date +%s) * 1000))
  fi
  elapsed_ms=$((finished_ms - SCAN_START_MS))
  payload=$(jq -cn \
    --arg outcome "skipped" \
    --arg skip_reason "$skip_reason" \
    --argjson duration_ms "$elapsed_ms" \
    '{ outcome: $outcome, skip_reason: $skip_reason, duration_ms: $duration_ms }')
  historian_emit "historian.indexing.complete" "$SESSION_ID" "$payload"
}
|
|
92
|
+
|
|
93
|
+
# Print $1 when it is a plain non-negative decimal integer, otherwise $2.
# Values returned by historian_config_get and the helper libraries are used
# below inside (( ... )) and as jq --argjson arguments. Bash evaluates
# arbitrary text in arithmetic context, and jq rejects non-JSON, so anything
# that is not a clean integer is replaced by its default.
_historian_uint_or() {
  if [[ "${1:-}" =~ ^(0|[1-9][0-9]*)$ ]]; then
    printf '%s' "$1"
  else
    printf '%s' "$2"
  fi
}

if [[ -z "$TRANSCRIPT_PATH" || ! -f "$TRANSCRIPT_PATH" ]]; then
  _emit_skip "transcript_unavailable"
  exit 0
fi

MIN_CHARS=$(_historian_uint_or \
  "$(historian_config_get '.historian.indexing.min_transcript_chars_to_index')" 1200)

TURNS=$(historian_transcript_load "$TRANSCRIPT_PATH")
TRANSCRIPT_CHARS=$(_historian_uint_or "$(historian_transcript_char_count "$TURNS")" 0)

historian_emit "historian.indexing.started" "$SESSION_ID" "$(jq -cn \
  --arg session_id "$SESSION_ID" \
  --argjson transcript_chars "$TRANSCRIPT_CHARS" \
  '{ session_id: $session_id, transcript_chars: $transcript_chars }')"

if (( TRANSCRIPT_CHARS < MIN_CHARS )); then
  _emit_skip "too_short"
  exit 0
fi

# ----------------------------------------------------------------------------
# Chunker → sanitizer → JSONL store.
# ----------------------------------------------------------------------------

TARGET_CHARS=$(_historian_uint_or \
  "$(historian_config_get '.historian.indexing.chunk_target_chars')" 2400)
OVERLAP_CHARS=$(_historian_uint_or \
  "$(historian_config_get '.historian.indexing.chunk_overlap_chars')" 400)

CHUNKS=$(historian_chunker_split "$TURNS" "$TARGET_CHARS" "$OVERLAP_CHARS")
NEVER_INDEX_PATHS=$(historian_config_get '.historian.sanitization.never_index_paths | tojson')
[[ -z "$NEVER_INDEX_PATHS" || "$NEVER_INDEX_PATHS" == "null" ]] && NEVER_INDEX_PATHS='[]'

# Honor the two on/off knobs from the config block.
REDACT_SECRETS=$(historian_config_get '.historian.sanitization.redact_secret_patterns')
[[ -z "$REDACT_SECRETS" || "$REDACT_SECRETS" == "null" ]] && REDACT_SECRETS="true"
DROP_SKIP=$(historian_config_get '.historian.sanitization.drop_skip_marker')
[[ -z "$DROP_SKIP" || "$DROP_SKIP" == "null" ]] && DROP_SKIP="true"

SANITIZED=$(historian_sanitizer_run "$CHUNKS" "$NEVER_INDEX_PATHS" "$REDACT_SECRETS" "$DROP_SKIP")

# Fail soft when the sanitizer produced nothing usable. Bail out BEFORE the
# session file is reset, so a failed re-run cannot wipe chunks stored by an
# earlier successful run.
# NOTE(review): confirm "sanitizer_failed" is an accepted skip_reason in the
# historian.indexing.complete event schema.
if ! printf '%s' "$SANITIZED" | jq -e 'type == "object"' >/dev/null 2>&1; then
  _emit_skip "sanitizer_failed"
  exit 0
fi
KEPT=$(printf '%s' "$SANITIZED" | jq -c '.kept // []' 2>/dev/null) || KEPT='[]'
DROPPED=$(printf '%s' "$SANITIZED" | jq -c '.dropped // []' 2>/dev/null) || DROPPED='[]'

# Re-indexing replaces the existing session file rather than appending,
# so SessionEnd is safely idempotent if re-fired against the same id.
historian_storage_reset_session "$PROJECT_KEY" "$SESSION_ID"

NOW_TS=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
CHUNKS_INDEXED=0

# One jq pass streams the kept chunks, one compact JSON value per line.
# The stream is read on fd 3 so helpers called in the loop body cannot
# consume it through stdin.
while IFS= read -r -u 3 CHUNK; do
  [[ -z "$CHUNK" || "$CHUNK" == "null" ]] && continue

  CHUNK_ID=$(historian_ulid)
  REDACTION_COUNT=$(_historian_uint_or \
    "$(printf '%s' "$CHUNK" | jq -r '.redaction_count // 0' 2>/dev/null)" 0)

  RECORD=$(jq -cn \
    --arg chunk_id "$CHUNK_ID" \
    --arg session_id "$SESSION_ID" \
    --argjson chunk_input "$CHUNK" \
    --arg created_at "$NOW_TS" \
    --arg source "local" \
    '$chunk_input + {
      chunk_id: $chunk_id,
      session_id: $session_id,
      created_at: $created_at,
      source: $source
    }' 2>/dev/null) || RECORD=""
  # A chunk that cannot be merged into a record is skipped rather than
  # stored as an empty line.
  [[ -n "$RECORD" ]] || continue

  if historian_storage_append_chunk "$PROJECT_KEY" "$SESSION_ID" "$RECORD"; then
    CHUNKS_INDEXED=$((CHUNKS_INDEXED + 1))
    if (( REDACTION_COUNT > 0 )); then
      historian_emit "historian.chunk.sanitized" "$SESSION_ID" "$(jq -cn \
        --arg chunk_id "$CHUNK_ID" \
        --argjson redaction_count "$REDACTION_COUNT" \
        '{ chunk_id: $chunk_id, redaction_count: $redaction_count }')"
    fi
  fi
done 3< <(printf '%s' "$KEPT" | jq -c 'if type == "array" then .[] else empty end' 2>/dev/null)

# Emit one chunk.dropped event per skip reason summary (caps at the
# number of unique reasons; per-chunk emission would spam the log).
DROPPED_COUNT=$(_historian_uint_or "$(printf '%s' "$DROPPED" | jq 'length' 2>/dev/null)" 0)
if (( DROPPED_COUNT > 0 )); then
  # Read reasons line by line so a reason containing spaces or glob
  # characters is emitted intact instead of being word-split.
  while IFS= read -r -u 3 reason; do
    [[ -n "$reason" ]] || continue
    historian_emit "historian.chunk.dropped" "$SESSION_ID" "$(jq -cn \
      --arg reason "$reason" \
      '{ reason: $reason }')"
  done 3< <(printf '%s' "$DROPPED" | jq -r '.[].reason' 2>/dev/null | sort -u)
fi

NOW_MS=$(python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null) \
  || NOW_MS=$(($(date +%s) * 1000))
DURATION_MS=$((NOW_MS - SCAN_START_MS))

historian_emit "historian.indexing.complete" "$SESSION_ID" "$(jq -cn \
  --arg outcome "ok" \
  --argjson chunks_indexed "$CHUNKS_INDEXED" \
  --argjson chunks_dropped "$DROPPED_COUNT" \
  --argjson duration_ms "$DURATION_MS" \
  '{
    outcome: $outcome,
    chunks_indexed: $chunks_indexed,
    chunks_dropped: $chunks_dropped,
    duration_ms: $duration_ms
  }')"

exit 0
|