@onlooker-community/ecosystem 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +13 -0
- package/.claude-plugin/plugin.json +1 -1
- package/.release-please-manifest.json +3 -2
- package/CHANGELOG.md +7 -0
- package/package.json +2 -2
- package/plugins/historian/.claude-plugin/plugin.json +14 -0
- package/plugins/historian/CHANGELOG.md +10 -0
- package/plugins/historian/README.md +70 -0
- package/plugins/historian/config.json +30 -0
- package/plugins/historian/hooks/hooks.json +26 -0
- package/plugins/historian/scripts/hooks/historian-prompt-submit.sh +15 -0
- package/plugins/historian/scripts/hooks/historian-session-end.sh +204 -0
- package/plugins/historian/scripts/lib/historian-chunker.sh +129 -0
- package/plugins/historian/scripts/lib/historian-config.sh +66 -0
- package/plugins/historian/scripts/lib/historian-emit.sh +61 -0
- package/plugins/historian/scripts/lib/historian-project-key.sh +80 -0
- package/plugins/historian/scripts/lib/historian-sanitizer.sh +123 -0
- package/plugins/historian/scripts/lib/historian-storage.sh +110 -0
- package/plugins/historian/scripts/lib/historian-transcript.sh +83 -0
- package/plugins/historian/scripts/lib/historian-ulid.sh +43 -0
- package/release-please-config.json +16 -0
- package/test/bats/historian-session-end.bats +296 -0
|
@@ -150,6 +150,19 @@
|
|
|
150
150
|
"license": "MIT",
|
|
151
151
|
"keywords": ["memory", "audit", "staleness", "findings", "auto-memory", "decay"],
|
|
152
152
|
"tags": ["memory", "context-engineering"]
|
|
153
|
+
},
|
|
154
|
+
{
|
|
155
|
+
"name": "historian",
|
|
156
|
+
"source": "./plugins/historian",
|
|
157
|
+
"description": "Episodic memory layer for past Claude Code sessions. At SessionEnd, reads the session transcript, drops tool calls and tool results, chunks the remaining user + assistant turns at turn boundaries with overlap, redacts secret-shaped substrings (AWS keys, GitHub PATs, Anthropic API keys, KEY=value env assignments), and appends one JSONL line per surviving chunk to ~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl. Future-tense retrieval (vector embeddings + UserPromptSubmit similarity surfacer) lands in a follow-up; this version ships the indexing pipeline only. Requires the ecosystem plugin.",
|
|
158
|
+
"author": {
|
|
159
|
+
"name": "Onlooker Community"
|
|
160
|
+
},
|
|
161
|
+
"homepage": "https://onlooker.dev",
|
|
162
|
+
"repository": "https://github.com/onlooker-community/ecosystem",
|
|
163
|
+
"license": "MIT",
|
|
164
|
+
"keywords": ["memory", "episodic", "transcript", "indexing", "session", "retrieval"],
|
|
165
|
+
"tags": ["memory", "context-engineering"]
|
|
153
166
|
}
|
|
154
167
|
]
|
|
155
168
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ecosystem",
|
|
3
|
-
"version": "0.21.0",
|
|
3
|
+
"version": "0.22.0",
|
|
4
4
|
"description": "Observability substrate for Claude Code. Provides the shared ~/.onlooker/ storage root, canonical schema-validated event emission, session and tool tracking hooks, and prompt rules. Required by all other Onlooker plugins.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Onlooker Community",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
".": "0.21.0",
|
|
2
|
+
".": "0.22.0",
|
|
3
3
|
"plugins/archivist": "0.1.0",
|
|
4
4
|
"plugins/tribunal": "1.0.1",
|
|
5
5
|
"plugins/echo": "0.2.0",
|
|
@@ -10,5 +10,6 @@
|
|
|
10
10
|
"plugins/counsel": "0.2.0",
|
|
11
11
|
"plugins/warden": "0.2.0",
|
|
12
12
|
"plugins/librarian": "0.1.0",
|
|
13
|
-
"plugins/curator": "0.1.0"
|
|
13
|
+
"plugins/curator": "0.1.0",
|
|
14
|
+
"plugins/historian": "0.1.0"
|
|
14
15
|
}
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.22.0](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.21.0...ecosystem-v0.22.0) (2026-06-04)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* **historian:** introduce SessionEnd indexing :spiral_notepad: ([#59](https://github.com/onlooker-community/ecosystem/issues/59)) ([dd6c7f6](https://github.com/onlooker-community/ecosystem/commit/dd6c7f6ea872437cab6b16de50838dfc72750c7b))
|
|
9
|
+
|
|
3
10
|
## [0.21.0](https://github.com/onlooker-community/ecosystem/compare/ecosystem-v0.20.0...ecosystem-v0.21.0) (2026-06-04)
|
|
4
11
|
|
|
5
12
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@onlooker-community/ecosystem",
|
|
3
|
-
"version": "0.21.0",
|
|
3
|
+
"version": "0.22.0",
|
|
4
4
|
"description": "Agents, skills, hooks, commands, rules, and MCP configurations that power [Onlooker](https://onlooker.dev)",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Onlooker Community",
|
|
@@ -26,7 +26,7 @@
|
|
|
26
26
|
"test": "npm run test:bats && npm run test:schema",
|
|
27
27
|
"test:bats": "bats test/bats",
|
|
28
28
|
"test:schema": "node --test test/node/*.test.mjs",
|
|
29
|
-
"test:shellcheck": "shellcheck -S error -x install.sh scripts/common.sh scripts/hooks/*.sh scripts/lib/*.sh plugins/archivist/scripts/hooks/*.sh plugins/archivist/scripts/lib/*.sh plugins/tribunal/scripts/hooks/*.sh plugins/tribunal/scripts/lib/*.sh plugins/echo/scripts/hooks/*.sh plugins/echo/scripts/lib/*.sh plugins/governor/scripts/hooks/*.sh plugins/governor/scripts/lib/*.sh plugins/compass/scripts/hooks/*.sh plugins/compass/scripts/lib/*.sh plugins/scribe/scripts/hooks/*.sh plugins/scribe/scripts/lib/*.sh plugins/counsel/scripts/hooks/*.sh plugins/counsel/scripts/lib/*.sh plugins/warden/scripts/hooks/*.sh plugins/warden/scripts/lib/*.sh plugins/librarian/scripts/hooks/*.sh plugins/librarian/scripts/lib/*.sh plugins/curator/scripts/hooks/*.sh plugins/curator/scripts/lib/*.sh",
|
|
29
|
+
"test:shellcheck": "shellcheck -S error -x install.sh scripts/common.sh scripts/hooks/*.sh scripts/lib/*.sh plugins/archivist/scripts/hooks/*.sh plugins/archivist/scripts/lib/*.sh plugins/tribunal/scripts/hooks/*.sh plugins/tribunal/scripts/lib/*.sh plugins/echo/scripts/hooks/*.sh plugins/echo/scripts/lib/*.sh plugins/governor/scripts/hooks/*.sh plugins/governor/scripts/lib/*.sh plugins/compass/scripts/hooks/*.sh plugins/compass/scripts/lib/*.sh plugins/scribe/scripts/hooks/*.sh plugins/scribe/scripts/lib/*.sh plugins/counsel/scripts/hooks/*.sh plugins/counsel/scripts/lib/*.sh plugins/warden/scripts/hooks/*.sh plugins/warden/scripts/lib/*.sh plugins/librarian/scripts/hooks/*.sh plugins/librarian/scripts/lib/*.sh plugins/curator/scripts/hooks/*.sh plugins/curator/scripts/lib/*.sh plugins/historian/scripts/hooks/*.sh plugins/historian/scripts/lib/*.sh",
|
|
30
30
|
"lint:references": "node scripts/lint/check-references.mjs",
|
|
31
31
|
"lint:manifests": "node scripts/lint/check-manifests.mjs",
|
|
32
32
|
"coverage:node": "node scripts/coverage/run-coverage.mjs",
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "historian",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Episodic memory layer. At SessionEnd, chunks and sanitizes the session transcript and stores the chunks locally under ~/.onlooker/historian/<project-key>/sessions/. Future-tense retrieval (vector embeddings + UserPromptSubmit similarity surfacer) lands in a follow-up; this version ships the indexing pipeline only. Builds on the Onlooker ecosystem plugin.",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Onlooker Community",
|
|
7
|
+
"url": "https://onlooker.dev"
|
|
8
|
+
},
|
|
9
|
+
"homepage": "https://onlooker.dev",
|
|
10
|
+
"repository": "https://github.com/onlooker-community/ecosystem",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"skills": [],
|
|
13
|
+
"agents": []
|
|
14
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0](https://github.com/onlooker-community/ecosystem/compare/historian-v0.0.1...historian-v0.1.0) (2026-06-04)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Features
|
|
7
|
+
|
|
8
|
+
* **historian:** introduce SessionEnd indexing :spiral_notepad: ([#59](https://github.com/onlooker-community/ecosystem/issues/59)) ([dd6c7f6](https://github.com/onlooker-community/ecosystem/commit/dd6c7f6ea872437cab6b16de50838dfc72750c7b))
|
|
9
|
+
|
|
10
|
+
## Changelog
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Historian
|
|
2
|
+
|
|
3
|
+
Episodic memory layer for past Claude Code sessions.
|
|
4
|
+
|
|
5
|
+
At every `SessionEnd`, Historian reads the session transcript, splits it into overlapping chunks at turn boundaries, redacts secret-shaped substrings, and persists the chunks under `~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl`. Retrieval is not implemented yet: once the follow-up retrieval pipeline lands, future sessions will be able to retrieve relevant past chunks when the user starts working on a similar problem.
|
|
6
|
+
|
|
7
|
+
Historian is a sibling plugin to [`ecosystem`](../../) and assumes the Onlooker observability substrate (`~/.onlooker/`) is present. It is parallel to [`librarian`](../librarian) (which consolidates session decisions into the typed memory store) — both turn session-scoped material into something queryable across sessions, but at different levels of distillation. Librarian distills; historian preserves verbatim.
|
|
8
|
+
|
|
9
|
+
See [`docs/design.md`](docs/design.md) and [ADR-001](docs/adr/001-local-embeddings-only.md) for the full design, including the local-embeddings-by-default decision.
|
|
10
|
+
|
|
11
|
+
## How it works
|
|
12
|
+
|
|
13
|
+
| Hook | What Historian does |
|
|
14
|
+
|------|---------------------|
|
|
15
|
+
| `SessionEnd` | Reads the transcript at `transcript_path`, drops tool calls and tool results (keeps user + assistant messages), chunks at turn boundaries inside the configured character target with overlap, runs the sanitizer (secret redaction + `[historian:skip]` markers + path-deny list), and writes one JSONL line per surviving chunk to the session's file, replacing any file previously written for the same session id so that a re-fired `SessionEnd` is idempotent. Emits `historian.indexing.*` and `historian.chunk.*` events along the way. |
|
|
16
|
+
| `UserPromptSubmit` | No-op in this release — the rate gate, query embedder, ANN lookup, and surfacer are deferred to a follow-up that ships the retrieval pipeline alongside the first embedder backend. |
|
|
17
|
+
|
|
18
|
+
## Activation
|
|
19
|
+
|
|
20
|
+
Historian is **off by default**. Enable per-project in `.claude/settings.json`:
|
|
21
|
+
|
|
22
|
+
```json
|
|
23
|
+
{
|
|
24
|
+
"historian": {
|
|
25
|
+
"enabled": true
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
See [`config.json`](config.json) for the full set of tunable defaults.
|
|
31
|
+
|
|
32
|
+
## Storage layout
|
|
33
|
+
|
|
34
|
+
```text
|
|
35
|
+
~/.onlooker/historian/<project-key>/
|
|
36
|
+
├── manifest.json # project metadata
|
|
37
|
+
└── sessions/<session-id>.jsonl # one chunk per line, append-only
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Each chunk line:
|
|
41
|
+
|
|
42
|
+
```json
|
|
43
|
+
{
|
|
44
|
+
"chunk_id": "01J...",
|
|
45
|
+
"session_id": "...",
|
|
46
|
+
"chunk_index": 0,
|
|
47
|
+
"start_turn_index": 0,
|
|
48
|
+
"end_turn_index": 3,
|
|
49
|
+
"body_redacted": "...",
|
|
50
|
+
"body_chars": 2103,
|
|
51
|
+
"created_at": "2026-06-04T...",
|
|
52
|
+
"source": "local",
|
|
53
|
+
"redaction_count": 0
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Status
|
|
58
|
+
|
|
59
|
+
This plugin ships **scaffolding + the SessionEnd indexing pipeline (transcript reader → chunker → sanitizer → JSONL store)**. Deferred to follow-up landings:
|
|
60
|
+
|
|
61
|
+
- **Retrieval and surfacer** — `UserPromptSubmit` rate gate, query embedding, ANN lookup, and `additionalContext` injection of the top match.
|
|
62
|
+
- **Embedder backends** — ollama (`nomic-embed-text`), fastembed sidecar, and remote (opt-in via the two-key egress affirmation from [ADR-001](docs/adr/001-local-embeddings-only.md)). Chunks are indexed without vectors today; the JSONL records make adding embeddings a future column-add, not a re-index.
|
|
63
|
+
- **Prune (retention sweep) and purge (manual)** skills.
|
|
64
|
+
- **`/historian recall`, `/historian setup`, `/historian stats`, `/historian purge`** slash commands.
|
|
65
|
+
|
|
66
|
+
## Requirements
|
|
67
|
+
|
|
68
|
+
- The `ecosystem` plugin installed (for `~/.onlooker/` substrate).
|
|
69
|
+
- `jq` for JSON manipulation.
|
|
70
|
+
- `python3` for chunking and sanitization (no extra packages — stdlib only).
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"plugin_name": "historian",
|
|
3
|
+
"storage_path": "~/.onlooker",
|
|
4
|
+
"historian": {
|
|
5
|
+
"enabled": false,
|
|
6
|
+
"indexing": {
|
|
7
|
+
"trigger": "SessionEnd",
|
|
8
|
+
"min_transcript_chars_to_index": 1200,
|
|
9
|
+
"chunk_target_chars": 2400,
|
|
10
|
+
"chunk_overlap_chars": 400,
|
|
11
|
+
"retention_days": 365
|
|
12
|
+
},
|
|
13
|
+
"sanitization": {
|
|
14
|
+
"redact_secret_patterns": true,
|
|
15
|
+
"drop_skip_marker": true,
|
|
16
|
+
"never_index_paths": []
|
|
17
|
+
},
|
|
18
|
+
"session_archive": {
|
|
19
|
+
"enabled": false,
|
|
20
|
+
"_note": "When true, the full transcript at SessionEnd is copied alongside the chunks so retrieval can link to the source. When false, only chunk bodies are retained."
|
|
21
|
+
},
|
|
22
|
+
"embedder": {
|
|
23
|
+
"backend": "none",
|
|
24
|
+
"_note": "Embedder backends (ollama, fastembed, remote) are deferred to a follow-up. The current 'none' value indexes chunks without vectors; retrieval is also deferred."
|
|
25
|
+
},
|
|
26
|
+
"retrieval": {
|
|
27
|
+
"_note": "UserPromptSubmit retrieval and surfacer are deferred to a follow-up commit; the hook currently no-ops."
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"hooks": {
|
|
3
|
+
"SessionEnd": [
|
|
4
|
+
{
|
|
5
|
+
"matcher": "*",
|
|
6
|
+
"hooks": [
|
|
7
|
+
{
|
|
8
|
+
"type": "command",
|
|
9
|
+
"command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/historian-session-end.sh"
|
|
10
|
+
}
|
|
11
|
+
]
|
|
12
|
+
}
|
|
13
|
+
],
|
|
14
|
+
"UserPromptSubmit": [
|
|
15
|
+
{
|
|
16
|
+
"matcher": "*",
|
|
17
|
+
"hooks": [
|
|
18
|
+
{
|
|
19
|
+
"type": "command",
|
|
20
|
+
"command": "\"$CLAUDE_PLUGIN_ROOT\"/scripts/hooks/historian-prompt-submit.sh"
|
|
21
|
+
}
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Historian UserPromptSubmit hook — placeholder.
#
# Retrieval (rate gate → query embedder → ANN lookup → additionalContext
# surfacer) has not landed yet; it ships together with the first embedder
# backend. Until then this hook deliberately does nothing, so the plugin
# can be installed and SessionEnd indexing can run on its own.
#
# Hook contract:
#   - Exit status is always 0.
#   - No additionalContext is ever written while retrieval is unimplemented.

set -u
set -o pipefail

exit 0
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Historian SessionEnd indexing pipeline.
#
# Reads the session transcript, drops tool calls / tool results, chunks
# the remaining user + assistant turns at turn boundaries, redacts
# secret-shaped substrings, and writes one JSONL line per surviving
# chunk to ~/.onlooker/historian/<project-key>/sessions/<session-id>.jsonl.
#
# Hook contract:
#   - Always exits 0. Never blocks session shutdown.
#   - No-ops when historian.enabled is not true.
#   - No-ops when there is no project key, no transcript path, or the
#     transcript is shorter than min_transcript_chars_to_index.
#   - Indexing failures are fail-soft: an emitted historian.indexing.complete
#     with outcome "skipped" + a skip_reason is the worst case. In particular
#     a failed chunker or sanitizer run never truncates a session file that
#     an earlier run already wrote.

set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PLUGIN_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Locate the ecosystem checkout: explicit override first, then the
# directory two levels above this plugin if it carries validate-path.sh.
_ECOSYSTEM_ROOT="${ONLOOKER_ECOSYSTEM_ROOT:-}"
if [[ -z "$_ECOSYSTEM_ROOT" ]]; then
  _candidate="$(cd "${PLUGIN_ROOT}/../.." 2>/dev/null && pwd)"
  if [[ -f "${_candidate}/scripts/lib/validate-path.sh" ]]; then
    _ECOSYSTEM_ROOT="$_candidate"
  fi
fi
if [[ -n "$_ECOSYSTEM_ROOT" && -f "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh" ]]; then
  # shellcheck disable=SC1091
  CLAUDE_PLUGIN_ROOT="$_ECOSYSTEM_ROOT" source "${_ECOSYSTEM_ROOT}/scripts/lib/validate-path.sh"
fi

# shellcheck source=../lib/historian-config.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-config.sh"
# shellcheck source=../lib/historian-project-key.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-project-key.sh"
# shellcheck source=../lib/historian-ulid.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-ulid.sh"
# shellcheck source=../lib/historian-storage.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-storage.sh"
# shellcheck source=../lib/historian-emit.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-emit.sh"
# shellcheck source=../lib/historian-transcript.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-transcript.sh"
# shellcheck source=../lib/historian-chunker.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-chunker.sh"
# shellcheck source=../lib/historian-sanitizer.sh
source "${PLUGIN_ROOT}/scripts/lib/historian-sanitizer.sh"

# Hook payload arrives as JSON on stdin; every field is optional.
INPUT=$(cat 2>/dev/null || true)
CWD=$(printf '%s' "$INPUT" | jq -r '.cwd // ""' 2>/dev/null) || CWD=""
SESSION_ID=$(printf '%s' "$INPUT" | jq -r '.session_id // ""' 2>/dev/null) || SESSION_ID=""
TRANSCRIPT_PATH=$(printf '%s' "$INPUT" | jq -r '.transcript_path // ""' 2>/dev/null) || TRANSCRIPT_PATH=""
[[ -z "$CWD" ]] && CWD="$(pwd)"
[[ -z "$SESSION_ID" ]] && SESSION_ID="unknown"

REPO_ROOT=$(historian_project_repo_root "$CWD")
historian_config_load "$REPO_ROOT"
historian_config_enabled || exit 0

PROJECT_KEY=$(historian_project_key "$CWD")
[[ -z "$PROJECT_KEY" ]] && exit 0

historian_storage_init "$PROJECT_KEY" || exit 0
REMOTE_URL=$(historian_project_remote_url "$CWD")
historian_storage_write_manifest "$PROJECT_KEY" "$REMOTE_URL" "$REPO_ROOT" || true

# ----------------------------------------------------------------------------
# Local helpers.
# ----------------------------------------------------------------------------

# Print the current wall-clock time in milliseconds. Falls back to
# second resolution (seconds * 1000) when python3 is unavailable.
_session_end_now_ms() {
  python3 -c 'import time; print(int(time.time() * 1000))' 2>/dev/null \
    || printf '%s\n' "$(($(date +%s) * 1000))"
}

# Usage: _session_end_config_uint <jq-path> <default>
# Print the config value at <jq-path> when it is a plain non-negative
# integer, otherwise <default>. Keeps hand-edited settings (strings,
# floats, negatives) out of the arithmetic and --argjson uses below.
_session_end_config_uint() {
  local value
  value=$(historian_config_get "$1")
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a value such as "0800" is not read as octal.
    printf '%s' "$((10#$value))"
  else
    printf '%s' "$2"
  fi
}

# Usage: _emit_skip <reason>
# Emit historian.indexing.complete with outcome "skipped", the given
# skip_reason, and the time elapsed since SCAN_START_MS.
_emit_skip() {
  local reason="$1"
  local now_ms duration_ms
  now_ms=$(_session_end_now_ms)
  duration_ms=$((now_ms - SCAN_START_MS))
  historian_emit "historian.indexing.complete" "$SESSION_ID" "$(jq -cn \
    --arg outcome "skipped" \
    --arg skip_reason "$reason" \
    --argjson duration_ms "$duration_ms" \
    '{ outcome: $outcome, skip_reason: $skip_reason, duration_ms: $duration_ms }')"
}

# ----------------------------------------------------------------------------
# Transcript-availability check first — the transcript_unavailable path
# emits no started event, just a complete-with-skip, so the timeline reads
# cleanly. Once we have a real char count, emit started with that count
# (the schema requires transcript_chars on started, and emitting zero
# before the read would produce misleading telemetry).
# ----------------------------------------------------------------------------

SCAN_START_MS=$(_session_end_now_ms)

if [[ -z "$TRANSCRIPT_PATH" || ! -f "$TRANSCRIPT_PATH" ]]; then
  _emit_skip "transcript_unavailable"
  exit 0
fi

MIN_CHARS=$(_session_end_config_uint '.historian.indexing.min_transcript_chars_to_index' 1200)

TURNS=$(historian_transcript_load "$TRANSCRIPT_PATH")
TRANSCRIPT_CHARS=$(historian_transcript_char_count "$TURNS")
[[ "$TRANSCRIPT_CHARS" =~ ^[0-9]+$ ]] || TRANSCRIPT_CHARS=0

historian_emit "historian.indexing.started" "$SESSION_ID" "$(jq -cn \
  --arg session_id "$SESSION_ID" \
  --argjson transcript_chars "$TRANSCRIPT_CHARS" \
  '{ session_id: $session_id, transcript_chars: $transcript_chars }')"

if (( TRANSCRIPT_CHARS < MIN_CHARS )); then
  _emit_skip "too_short"
  exit 0
fi

# ----------------------------------------------------------------------------
# Chunker → sanitizer → JSONL store.
# ----------------------------------------------------------------------------

TARGET_CHARS=$(_session_end_config_uint '.historian.indexing.chunk_target_chars' 2400)
OVERLAP_CHARS=$(_session_end_config_uint '.historian.indexing.chunk_overlap_chars' 400)

CHUNKS=$(historian_chunker_split "$TURNS" "$TARGET_CHARS" "$OVERLAP_CHARS")
if ! printf '%s' "$CHUNKS" | jq -e 'type == "array"' >/dev/null 2>&1; then
  # NOTE(review): "chunker_failed" is a new skip_reason value — confirm it is
  # accepted by the historian.indexing.complete event schema.
  _emit_skip "chunker_failed"
  exit 0
fi

NEVER_INDEX_PATHS=$(historian_config_get '.historian.sanitization.never_index_paths | tojson')
[[ -z "$NEVER_INDEX_PATHS" || "$NEVER_INDEX_PATHS" == "null" ]] && NEVER_INDEX_PATHS='[]'

# Honor the two on/off knobs from the config block.
REDACT_SECRETS=$(historian_config_get '.historian.sanitization.redact_secret_patterns')
[[ -z "$REDACT_SECRETS" || "$REDACT_SECRETS" == "null" ]] && REDACT_SECRETS="true"
DROP_SKIP=$(historian_config_get '.historian.sanitization.drop_skip_marker')
[[ -z "$DROP_SKIP" || "$DROP_SKIP" == "null" ]] && DROP_SKIP="true"

SANITIZED=$(historian_sanitizer_run "$CHUNKS" "$NEVER_INDEX_PATHS" "$REDACT_SECRETS" "$DROP_SKIP")
if ! printf '%s' "$SANITIZED" | jq -e '(.kept | type) == "array"' >/dev/null 2>&1; then
  # Bail out BEFORE the reset below so a broken sanitizer run cannot wipe
  # chunks that a previous SessionEnd already stored for this session.
  # NOTE(review): "sanitizer_failed" is a new skip_reason value — confirm it
  # is accepted by the historian.indexing.complete event schema.
  _emit_skip "sanitizer_failed"
  exit 0
fi
KEPT=$(printf '%s' "$SANITIZED" | jq -c '.kept')
DROPPED=$(printf '%s' "$SANITIZED" | jq -c '.dropped | if type == "array" then . else [] end' 2>/dev/null)
[[ -z "$DROPPED" ]] && DROPPED='[]'

# Re-indexing replaces the existing session file rather than appending,
# so SessionEnd is safely idempotent if re-fired against the same id.
historian_storage_reset_session "$PROJECT_KEY" "$SESSION_ID"

NOW_TS=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
CHUNKS_INDEXED=0

# Stream the kept chunks once (one compact JSON object per line) instead of
# re-parsing the whole array for every index. The stream is read on fd 9 so
# helpers called inside the loop cannot accidentally consume it via stdin.
while IFS= read -r -u 9 CHUNK; do
  [[ -z "$CHUNK" || "$CHUNK" == "null" ]] && continue

  CHUNK_ID=$(historian_ulid)
  REDACTION_COUNT=$(printf '%s' "$CHUNK" | jq -r '.redaction_count // 0' 2>/dev/null)
  [[ "$REDACTION_COUNT" =~ ^[0-9]+$ ]] || REDACTION_COUNT=0

  # Skip (rather than store an empty line for) a chunk that is not an object.
  RECORD=$(jq -cn \
    --arg chunk_id "$CHUNK_ID" \
    --arg session_id "$SESSION_ID" \
    --argjson chunk_input "$CHUNK" \
    --arg created_at "$NOW_TS" \
    --arg source "local" \
    '$chunk_input + {
      chunk_id: $chunk_id,
      session_id: $session_id,
      created_at: $created_at,
      source: $source
    }' 2>/dev/null) || continue
  [[ -z "$RECORD" ]] && continue

  if historian_storage_append_chunk "$PROJECT_KEY" "$SESSION_ID" "$RECORD"; then
    CHUNKS_INDEXED=$((CHUNKS_INDEXED + 1))
    if (( REDACTION_COUNT > 0 )); then
      historian_emit "historian.chunk.sanitized" "$SESSION_ID" "$(jq -cn \
        --arg chunk_id "$CHUNK_ID" \
        --argjson redaction_count "$REDACTION_COUNT" \
        '{ chunk_id: $chunk_id, redaction_count: $redaction_count }')"
    fi
  fi
done 9< <(printf '%s' "$KEPT" | jq -c '.[]' 2>/dev/null)

# Emit one chunk.dropped event per skip reason summary (caps at the
# number of unique reasons; per-chunk emission would spam the log).
DROPPED_COUNT=$(printf '%s' "$DROPPED" | jq 'length' 2>/dev/null) || DROPPED_COUNT=0
[[ "$DROPPED_COUNT" =~ ^[0-9]+$ ]] || DROPPED_COUNT=0
if (( DROPPED_COUNT > 0 )); then
  # Line-wise read keeps a reason containing spaces as a single value.
  while IFS= read -r -u 9 reason; do
    [[ -z "$reason" ]] && continue
    historian_emit "historian.chunk.dropped" "$SESSION_ID" "$(jq -cn \
      --arg reason "$reason" \
      '{ reason: $reason }')"
  done 9< <(printf '%s' "$DROPPED" | jq -r '.[].reason' 2>/dev/null | sort -u)
fi

NOW_MS=$(_session_end_now_ms)
DURATION_MS=$((NOW_MS - SCAN_START_MS))

historian_emit "historian.indexing.complete" "$SESSION_ID" "$(jq -cn \
  --arg outcome "ok" \
  --argjson chunks_indexed "$CHUNKS_INDEXED" \
  --argjson chunks_dropped "$DROPPED_COUNT" \
  --argjson duration_ms "$DURATION_MS" \
  '{
    outcome: $outcome,
    chunks_indexed: $chunks_indexed,
    chunks_dropped: $chunks_dropped,
    duration_ms: $duration_ms
  }')"

exit 0
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Chunker for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Given a JSON array of normalized turns (from historian-transcript.sh),
|
|
5
|
+
# produces a JSON array of chunk records. Each chunk:
|
|
6
|
+
# - Respects turn boundaries (no mid-turn splits)
|
|
7
|
+
# - Targets `target_chars` characters with `overlap_chars` overlap
|
|
8
|
+
# (carrying the last N chars of one chunk's content as the start of
|
|
9
|
+
# the next)
|
|
10
|
+
# - Records start_turn_index, end_turn_index, body_chars
|
|
11
|
+
#
|
|
12
|
+
# Character-based chunking instead of token-based: tokenizers vary by
|
|
13
|
+
# embedder, and the chunker shouldn't have to know which embedder will
|
|
14
|
+
# run downstream. Char counts approximate token counts at ~4 chars / token
|
|
15
|
+
# for English-ish prose; configs are tunable.
|
|
16
|
+
|
|
17
|
+
# Usage: historian_chunker_split <turns_json> <target_chars> <overlap_chars>
|
|
18
|
+
# Output: JSON array of chunks.
|
|
19
|
+
historian_chunker_split() {
  # Split normalized turns into overlapping, turn-aligned chunks.
  #
  # Arguments:
  #   $1  JSON array of turns; each turn is read for "role", "content" and
  #       "turn_index" (default: []).
  #   $2  target chunk size in characters (default: 2400).
  #   $3  overlap in characters carried from the end of one chunk into the
  #       start of the next (default: 400; negative values are treated as 0).
  # Output:
  #   JSON array on stdout; each element has chunk_index, start_turn_index,
  #   end_turn_index, body and body_chars. Nothing is printed and the exit
  #   status is non-zero when the Python step fails (e.g. invalid JSON).
  local turns="${1:-[]}"
  local target_chars="${2:-2400}"
  local overlap_chars="${3:-400}"

  # The program text occupies stdin (heredoc), so the turns travel on fd 3.
  # Passing them as an argv string would hit the kernel's per-argument size
  # limit on long transcripts and fail with "Argument list too long".
  python3 - "$target_chars" "$overlap_chars" 3<<<"$turns" <<'PY'
import json, sys

target = int(sys.argv[1])
overlap = max(0, int(sys.argv[2]))

# Read the turns JSON from fd 3 as bytes and decode explicitly so the
# result does not depend on the locale python3 was started under.
with open(3, "rb") as turns_fh:
    raw_turns = turns_fh.read().decode("utf-8", errors="replace")
turns = json.loads(raw_turns) if raw_turns.strip() else []

chunks = []
chunk_index = 0
buf_parts = []
buf_chars = 0
buf_start = None
buf_end = None

# Pending overlap text carried from the previous chunk. It seeds the next
# chunk's body but doesn't get attributed a turn (the overlap is purely
# textual continuity for the embedder).
pending_overlap = ""


def flush(force_text=None):
    """Emit the current buffer as a chunk. force_text overrides the
    accumulated body and is used when a single turn exceeds the target."""
    global chunk_index, buf_parts, buf_chars, buf_start, buf_end
    if force_text is None:
        if not buf_parts:
            return
        body = "\n\n".join(buf_parts)
    else:
        body = force_text
    if body.strip():
        chunks.append({
            "chunk_index": chunk_index,
            "start_turn_index": buf_start,
            "end_turn_index": buf_end,
            "body": body,
            "body_chars": len(body),
        })
        chunk_index += 1
    # Whitespace-only bodies are dropped; either way the buffer restarts.
    buf_parts = []
    buf_chars = 0
    buf_start = None
    buf_end = None


for turn in turns:
    role = turn.get("role", "")
    content = turn.get("content", "")
    if not content:
        continue
    rendered = f"{role}: {content}"
    rendered_len = len(rendered)

    # If this single turn exceeds the target, flush whatever's pending and
    # emit the oversized turn as its own chunk. The next chunk's overlap
    # carries the last `overlap` chars of this turn's body.
    if rendered_len > target:
        if buf_parts:
            flush()
        body_for_chunk = (pending_overlap + ("\n\n" if pending_overlap else "")) + rendered
        buf_start = turn["turn_index"]
        buf_end = turn["turn_index"]
        flush(force_text=body_for_chunk)
        pending_overlap = body_for_chunk[-overlap:] if overlap > 0 else ""
        continue

    candidate_len = buf_chars + rendered_len + (2 if buf_parts else 0)  # 2 for "\n\n"
    if buf_parts and candidate_len > target:
        # Flush the buffer, then seed the next chunk with the tail of the
        # body that was JUST emitted. The tail must be read after flush():
        # reading chunks[-1] beforehand picks up the chunk before it, and
        # yields no overlap at all at the first boundary.
        emitted_before = len(chunks)
        flush()
        if overlap > 0 and len(chunks) > emitted_before:
            pending_overlap = chunks[-1]["body"][-overlap:]
        else:
            pending_overlap = ""

    if not buf_parts and pending_overlap:
        buf_parts.append(pending_overlap)
        buf_chars += len(pending_overlap)
        pending_overlap = ""

    buf_parts.append(rendered)
    buf_chars += rendered_len + (2 if len(buf_parts) > 1 else 0)
    if buf_start is None:
        buf_start = turn["turn_index"]
    buf_end = turn["turn_index"]

# Final flush.
if buf_parts:
    flush()

print(json.dumps(chunks))
PY
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Config resolution for Historian.
|
|
3
|
+
#
|
|
4
|
+
# Reads three layers, latest wins:
|
|
5
|
+
# 1. plugins/historian/config.json (defaults shipped with the plugin)
|
|
6
|
+
# 2. ~/.claude/settings.json
|
|
7
|
+
# 3. <repo>/.claude/settings.json
|
|
8
|
+
#
|
|
9
|
+
# Exposes:
|
|
10
|
+
# historian_config_load <repo_root> # populates _HISTORIAN_CONFIG (JSON)
|
|
11
|
+
# historian_config_get <jq-path> # echoes string value (empty if unset)
|
|
12
|
+
# historian_config_enabled # 0 if historian.enabled is true
|
|
13
|
+
#
|
|
14
|
+
# Settings overlay only touches the `historian.*` subtree of settings.json.
|
|
15
|
+
|
|
16
|
+
_HISTORIAN_CONFIG="{}"
|
|
17
|
+
|
|
18
|
+
historian_config_load() {
  # Build the effective Historian config and store it in _HISTORIAN_CONFIG.
  #
  # Layers, later ones winning key by key:
  #   1. <plugin root>/config.json (shipped defaults)
  #   2. $HOME/.claude/settings.json      — only its `historian` subtree
  #   3. <repo_root>/.claude/settings.json — only its `historian` subtree
  #
  # Arguments:
  #   $1  repository root (optional; the repo layer is skipped when empty).
  #
  # A layer that is missing, unreadable, or not valid JSON is skipped and the
  # config merged so far is kept; _HISTORIAN_CONFIG always ends up holding a
  # JSON object.
  local repo_root="${1:-}"
  local plugin_root="${CLAUDE_PLUGIN_ROOT:-}"
  local home_dir="${HOME:-}"

  local merged="{}"
  local file base defaults overlay next

  # Without CLAUDE_PLUGIN_ROOT, fall back to the plugin that ships this file
  # (scripts/lib/ → plugin root) instead of probing "/config.json".
  if [[ -z "$plugin_root" ]]; then
    plugin_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 2>/dev/null && pwd)" || plugin_root=""
  fi

  file="${plugin_root}/config.json"
  if [[ -n "$plugin_root" && -f "$file" ]]; then
    # select() yields nothing for an empty or non-object file, so a broken
    # defaults file leaves `merged` as {} rather than blanking it.
    defaults=$(jq 'select(type == "object")' "$file" 2>/dev/null) || defaults=""
    [[ -n "$defaults" ]] && merged="$defaults"
  fi

  for base in "$home_dir" "$repo_root"; do
    # An empty base would otherwise resolve to "/.claude/settings.json".
    [[ -n "$base" ]] || continue
    file="${base}/.claude/settings.json"
    [[ -f "$file" ]] || continue

    overlay=$(jq '{ historian: (.historian // {}) }' "$file" 2>/dev/null) || continue
    [[ -z "$overlay" ]] && continue

    # Merge into a scratch variable first: if jq fails, `merged` must keep
    # its previous value instead of being overwritten with an empty string.
    next=$(jq -n --argjson a "$merged" --argjson b "$overlay" '
      def deepmerge($a; $b):
        if ($a|type) == "object" and ($b|type) == "object" then
          reduce (($a|keys) + ($b|keys) | unique)[] as $k
            ({}; .[$k] = deepmerge($a[$k]; $b[$k]))
        elif $b == null then $a
        else $b end;
      deepmerge($a; $b)
    ' 2>/dev/null) || continue
    [[ -n "$next" ]] && merged="$next"
  done

  _HISTORIAN_CONFIG="$merged"
}
|
|
52
|
+
|
|
53
|
+
# Read a value from the loaded config. The explicit null check (instead of
|
|
54
|
+
# `// empty`) preserves boolean `false` — `// empty` would treat it the same
|
|
55
|
+
# as null and silently drop "explicitly disabled" settings.
|
|
56
|
+
historian_config_get() {
  # Print the value found at jq path $1 in the loaded config.
  # A null / missing value prints nothing; boolean false prints "false"
  # because the filter tests for null explicitly rather than using `//`.
  local jq_path="$1"
  local filter="${jq_path} | if . == null then empty else . end"
  jq -r "$filter" 2>/dev/null <<<"$_HISTORIAN_CONFIG"
}
|
|
61
|
+
|
|
62
|
+
historian_config_enabled() {
  # Succeed (status 0) only when historian.enabled resolves to the literal
  # string "true" in the loaded config; any other value yields status 1.
  local flag
  flag=$(historian_config_get '.historian.enabled')
  if [[ "$flag" == "true" ]]; then
    return 0
  fi
  return 1
}
|