@onlooker-community/ecosystem 0.22.0 → 0.23.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env bash
2
+ # Similarity-search retriever for Historian.
3
+ #
4
+ # Given a query embedding and a project key, walks every JSONL chunk
5
+ # record under ~/.onlooker/historian/<key>/sessions/, computes cosine
6
+ # similarity between the query vector and each chunk's `embedding`
7
+ # field, and returns the top-K candidates above a similarity floor.
8
+ #
9
+ # Chunks indexed before the embedder shipped don't have an `embedding`
10
+ # field; the retriever silently skips them rather than treating them as
11
+ # zero-similarity. They'll join the index after the next SessionEnd
12
+ # indexing pass.
13
+
14
# Aggregate every chunk record for the project into a single JSON array.
#
# Arguments: $1 - project key; resolved to a sessions directory through
#                 historian_sessions_dir.
# Outputs:   a JSON array on stdout holding every parseable JSONL record
#            found in the directory's *.jsonl files (files visited in
#            sorted name order, records in file order). Records are
#            emitted unchanged, `embedding` field included. Prints '[]'
#            when the key is empty, the directory is missing or
#            unreadable, or python3 cannot run.
# Returns:   0.
historian_retriever_load_all_chunks() {
  local key="${1:-}"
  if [[ -z "$key" ]]; then
    echo '[]'
    return 0
  fi

  local dir
  dir=$(historian_sessions_dir "$key")
  if [[ ! -d "$dir" ]]; then
    echo '[]'
    return 0
  fi

  # python3 is used instead of `jq -s` to sidestep jq's quirks around
  # very large inputs. The trailing `|| echo '[]'` keeps the "always a
  # JSON array" contract even when the interpreter itself fails.
  python3 - "$dir" <<'PY' || echo '[]'
import json, os, sys

dir_path = sys.argv[1]
out = []

# Any listing failure (missing, unreadable, not a directory) yields an
# empty result rather than a traceback.
try:
    names = sorted(os.listdir(dir_path))
except OSError:
    names = []

for name in names:
    if not name.endswith(".jsonl"):
        continue
    path = os.path.join(dir_path, name)
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    # Corrupt lines are skipped; the rest of the file
                    # is still usable.
                    continue
                out.append(rec)
    except OSError:
        # An unreadable file must not hide the other sessions.
        continue

print(json.dumps(out))
PY
}
54
+
55
# Compute top-K cosine-similarity matches against the query embedding.
#
# Chunks are streamed from disk one line at a time and only the best
# <top_k> candidates are retained (bounded min-heap), so memory stays
# bounded as the per-project store grows. Only the query vector travels
# through argv; chunk data never does, which keeps the call clear of the
# OS ARG_MAX limit.
#
# Malformed data never aborts the search: JSONL lines that are not JSON
# objects, embeddings containing non-numeric values, and unparseable
# `created_at` values are tolerated (the first two are skipped, the last
# is reported with age_days = -1 and is exempt from the age filter).
# Invalid numeric arguments fall back to the defaults shown below, and an
# invalid or empty query yields '[]'.
#
# Usage: historian_retriever_search <sessions_dir>
#                                   <query_embedding_json>
#                                   <top_k> <min_similarity>
#                                   <max_age_days> <current_session_id>
#
# Defaults: top_k=5, min_similarity=0.55, max_age_days=180 (a value <= 0
#           disables the age filter), current_session_id="" (no session
#           excluded).
#
# Output: JSON array sorted by similarity descending (ties keep scan
#         order), length <= top_k.
#   Each entry: {
#     chunk_id, session_id, similarity, age_days, body_redacted,
#     chunk_index, start_turn_index, end_turn_index, source
#   }
# Returns: 0; prints '[]' if the directory is missing or python3 fails.
historian_retriever_search() {
  local sessions_dir="${1:-}"
  local query="${2:-[]}"
  local top_k="${3:-5}"
  local min_sim="${4:-0.55}"
  local max_age_days="${5:-180}"
  local current_session="${6:-}"

  if [[ -z "$sessions_dir" || ! -d "$sessions_dir" ]]; then
    echo '[]'
    return 0
  fi

  python3 - "$sessions_dir" "$top_k" "$min_sim" "$max_age_days" "$current_session" "$query" <<'PY' || echo '[]'
import datetime, heapq, itertools, json, math, os, sys


def arg_number(index, cast, default):
    # A malformed numeric argument falls back to the documented default
    # instead of killing the whole search with a traceback.
    try:
        value = float(sys.argv[index])
    except (IndexError, ValueError):
        return default
    if not math.isfinite(value):
        return default
    return cast(value)


sessions_dir = sys.argv[1]
top_k = arg_number(2, int, 5)
min_sim = arg_number(3, float, 0.55)
max_age_days = arg_number(4, int, 180)
current_session = sys.argv[5]
try:
    query = json.loads(sys.argv[6] or "null")
except ValueError:
    query = None


def cosine(a, b):
    # Returns None when the vectors are not comparable: empty, different
    # lengths, non-numeric members, zero norm, or a non-finite result.
    if not a or not b or len(a) != len(b):
        return None
    dot = 0.0
    na = 0.0
    nb = 0.0
    try:
        for x, y in zip(a, b):
            dot += x * y
            na += x * x
            nb += y * y
        if na <= 0.0 or nb <= 0.0:
            return None
        sim = dot / (math.sqrt(na) * math.sqrt(nb))
    except (TypeError, OverflowError, ZeroDivisionError):
        return None
    return sim if math.isfinite(sim) else None


def parse_iso(s):
    if not s:
        return None
    try:
        return datetime.datetime.strptime(s, "%Y-%m-%dT%H:%M:%SZ").replace(
            tzinfo=datetime.timezone.utc
        )
    except (TypeError, ValueError):
        # TypeError covers non-string values such as numeric timestamps.
        return None


if not isinstance(query, list) or not query or top_k <= 0:
    print("[]")
    sys.exit(0)

now = datetime.datetime.now(datetime.timezone.utc)
# Min-heap of (rounded_similarity, -scan_order, entry). The unique scan
# order breaks ties in favour of earlier chunks and guarantees the entry
# dicts themselves are never compared.
best = []
scan_order = itertools.count()


def consider(chunk):
    # A JSONL line may hold any JSON value; only objects are chunks.
    if not isinstance(chunk, dict):
        return
    sid = chunk.get("session_id", "")
    # Exclude chunks from the session that is currently asking for
    # context; a session retrieving its own chunks is a degenerate case.
    if current_session and sid == current_session:
        return
    embedding = chunk.get("embedding")
    # Chunks indexed without an embedding are skipped, not scored as 0.
    if not isinstance(embedding, list) or not embedding:
        return
    sim = cosine(query, embedding)
    if sim is None or sim < min_sim:
        return
    created = parse_iso(chunk.get("created_at"))
    if created is None:
        # Unknown age: reported as -1 and never filtered by max age.
        age_days = -1
    else:
        age_days = (now - created).days
        if max_age_days > 0 and age_days > max_age_days:
            return
    rounded = round(sim, 4)
    entry = {
        "chunk_id": chunk.get("chunk_id"),
        "session_id": sid,
        "similarity": rounded,
        "age_days": age_days,
        "body_redacted": chunk.get("body_redacted", ""),
        "chunk_index": chunk.get("chunk_index"),
        "start_turn_index": chunk.get("start_turn_index"),
        "end_turn_index": chunk.get("end_turn_index"),
        "source": chunk.get("source", "local"),
    }
    item = (rounded, -next(scan_order), entry)
    if len(best) < top_k:
        heapq.heappush(best, item)
    else:
        # Keeps the heap at top_k items by evicting the current worst.
        heapq.heappushpop(best, item)


try:
    names = sorted(os.listdir(sessions_dir))
except OSError:
    names = []

for name in names:
    if not name.endswith(".jsonl"):
        continue
    path = os.path.join(sessions_dir, name)
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except json.JSONDecodeError:
                    continue
                consider(chunk)
    except OSError:
        continue

ranked = sorted(best, key=lambda item: item[:2], reverse=True)
print(json.dumps([item[2] for item in ranked]))
PY
}
@@ -108,3 +108,50 @@ historian_storage_reset_session() {
108
108
  [[ -f "$path" ]] || return 0
109
109
  : > "$path"
110
110
  }
111
+
112
+ # ============================================================================
113
+ # Retrieval watermarks (per-session, scoped to the project key)
114
+ # ============================================================================
115
+
116
# Print the path of the per-session retrieval state file (count +
# last_ms) that lets the rate gate persist across UserPromptSubmit
# invocations within one session. The path is keyed on (project, session)
# so retrieval limits never leak between sessions. `last_ms` is the
# epoch-millisecond timestamp of the last retrieval the rate gate let
# through; the cooldown gate compares (now_ms - last_ms) against
# cooldown_seconds.
#
# Arguments: $1 - project key, $2 - session id.
# Outputs:   <project dir>/retrieval-state/<sanitised session id>.json
#            with no trailing newline. The session id is reduced to
#            alphanumerics plus `.`, `_` and `-`; an id with nothing left
#            after that becomes "unknown".
historian_retrieval_state_path() {
  local key="$1" session_id="$2"
  local token
  # Strip everything that is not safe inside a single path component.
  token=$(printf '%s' "$session_id" | tr -cd '[:alnum:]._-')
  printf '%s/retrieval-state/%s.json' \
    "$(historian_project_dir "$key")" "${token:-unknown}"
}
130
+
131
# Read the retrieval state document for (project key, session id).
#
# Arguments: $1 - project key, $2 - session id.
# Outputs:   one compact JSON object on stdout (no trailing newline) that
#            always carries `count` and `last_ms`. Keys missing from the
#            stored document default to 0. Prints
#            {"count":0,"last_ms":0} when the file is absent, empty,
#            unreadable, not valid JSON, or not a single JSON object.
# Returns:   0.
historian_retrieval_state_read() {
  local key="$1"
  local session_id="$2"
  local fallback='{"count":0,"last_ms":0}'
  local path state=""
  path=$(historian_retrieval_state_path "$key" "$session_id")
  if [[ -f "$path" ]]; then
    # Capture instead of streaming: an empty file makes jq print nothing
    # with exit status 0, and a partially valid file would otherwise emit
    # output before the failure is known.
    state=$(jq -c 'select(type == "object") | {count: 0, last_ms: 0} + .' \
      "$path" 2>/dev/null) || state=""
    # More than one document in the file means it is corrupt.
    [[ "$state" == *$'\n'* ]] && state=""
  fi
  printf '%s' "${state:-$fallback}"
}
145
+
146
# Persist the retrieval state (count and last_ms) for a session.
#
# Arguments: $1 - project key
#            $2 - session id
#            $3 - retrieval count (must be a JSON value, e.g. 3)
#            $4 - last_ms, epoch milliseconds of the last retrieval (JSON
#                 value)
# Outputs:   nothing; writes {"count":…,"last_ms":…} to the state path.
# Returns:   0 on success, 1 when the document cannot be built or written.
#            On failure the previous state file is left untouched.
historian_retrieval_state_write() {
  local key="$1"
  local session_id="$2"
  local count="$3"
  local last_ms="$4"
  local path doc tmp
  path=$(historian_retrieval_state_path "$key" "$session_id")

  # Build the document BEFORE touching the file: redirecting jq straight
  # into the state path would truncate the existing state even when jq
  # rejects the arguments.
  doc=$(jq -cn --argjson count "$count" --argjson last_ms "$last_ms" \
    '{ count: $count, last_ms: $last_ms }' 2>/dev/null) || return 1
  [[ -n "$doc" ]] || return 1

  mkdir -p "$(dirname "$path")" 2>/dev/null || return 1

  # Write to a sibling temp file and rename it into place so a reader
  # never observes a half-written document.
  tmp=$(mktemp "${path}.XXXXXX" 2>/dev/null) || return 1
  if ! printf '%s\n' "$doc" > "$tmp" 2>/dev/null \
    || ! mv -f -- "$tmp" "$path" 2>/dev/null; then
    rm -f -- "$tmp" 2>/dev/null
    return 1
  fi
  return 0
}
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "Intent documentation from agent activity. Captures why changes were made — problem context, decisions, tradeoffs — and distills them into readable artifacts at session end.",
5
5
  "author": {
6
6
  "name": "Onlooker Community",
@@ -8,5 +8,7 @@
8
8
  },
9
9
  "homepage": "https://onlooker.dev",
10
10
  "license": "MIT",
11
- "requires": ["ecosystem"]
11
+ "requires": [
12
+ "ecosystem"
13
+ ]
12
14
  }
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.2.1](https://github.com/onlooker-community/ecosystem/compare/scribe-v0.2.0...scribe-v0.2.1) (2026-06-04)
4
+
5
+
6
+ ### Bug Fixes
7
+
8
+ * **scribe:** mark hook scripts executable :relieved: ([#64](https://github.com/onlooker-community/ecosystem/issues/64)) ([05603e5](https://github.com/onlooker-community/ecosystem/commit/05603e56895c009c1435d1712592adbbc4c15e61))
9
+
3
10
  ## [0.2.0](https://github.com/onlooker-community/ecosystem/compare/scribe-v0.1.0...scribe-v0.2.0) (2026-06-01)
4
11
 
5
12
 
File without changes
File without changes
@@ -0,0 +1,206 @@
1
+ #!/usr/bin/env bash
2
+ # Onlooker Memory Recall Tracker
3
+ # Invoked by SessionStart (matcher: *) when a session boots, resumes, or
4
+ # restarts after compaction. Emits one canonical `memory.recalled` event
5
+ # per typed-memory file present at the project's per-checkout memory
6
+ # store path. This approximates the substrate signal "these memories are
7
+ # now in the model's context for the session about to begin".
8
+ #
9
+ # Curator's usage tracker (and any future plugin that reasons about how
10
+ # often a memory is in scope) depends on this. The signal is coarse —
11
+ # per-session-load rather than per-recall — but actionable in aggregate.
12
+ #
13
+ # Hook contract:
14
+ # - Always exits 0. Never blocks SessionStart.
15
+ # - No-ops when there is no project memory store, no git context, or
16
+ # when the source is `compact` (compaction is metadata-only; the
17
+ # same memories remain in scope, so re-emitting would double-count).
18
+
19
+ set -uo pipefail # No -e: never block session startup
20
+
21
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
22
+ # shellcheck source=../lib/validate-path.sh
23
+ source "$SCRIPT_DIR/../lib/validate-path.sh"
24
+ # shellcheck source=../lib/onlooker-schema.sh
25
+ source "$SCRIPT_DIR/../lib/onlooker-schema.sh"
26
+
27
# Hook health instrumentation, as described by the library's own notes:
# hook_register starts the timer, hook_set_context exports
# _HOOK_SESSION_ID + _HOOK_EVENT_NAME so failures are attributed to the
# right session in ~/.onlooker/logs/hook-health.jsonl, and
# hook_success / hook_failure close the health record.
hook_register "memory-recall-tracker" "Memory Recall Tracker" "Emits memory.recalled per typed memory file present at SessionStart"

# The hook payload arrives as JSON on stdin; a missing or unreadable
# stdin is treated as an empty payload.
INPUT=$(cat 2>/dev/null || true)
hook_set_context "$INPUT" "SessionStart"

# Pull the three fields this hook needs. A jq failure (e.g. malformed
# payload) degrades to the same defaults as a missing field.
CWD=$(jq -r '.cwd // ""' 2>/dev/null <<<"$INPUT") || CWD=""
SESSION_ID=$(jq -r '.session_id // ""' 2>/dev/null <<<"$INPUT") || SESSION_ID=""
SOURCE=$(jq -r '.source // "startup"' 2>/dev/null <<<"$INPUT") || SOURCE="startup"
CWD=${CWD:-$(pwd)}
SESSION_ID=${SESSION_ID:-unknown}

# A compaction reload keeps the same memories in scope, so emitting again
# would inflate usage counts. Close the health record and stop.
case "$SOURCE" in
  compact)
    hook_success
    exit 0
    ;;
esac
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Resolve project_key. Mirrors the SHA256-of-remote-URL + common-dir
51
+ # fallback every memory plugin uses (see plugins/librarian/scripts/lib/
52
+ # librarian-project-key.sh and friends): if there's no origin remote,
53
+ # anchor the key on git --git-common-dir rather than --show-toplevel so
54
+ # two worktrees of the same local-only repo share a key.
55
+ # ---------------------------------------------------------------------------
56
+
57
+ _memory_sha256_first12() {
58
+ local input="$1"
59
+ if command -v shasum >/dev/null 2>&1; then
60
+ printf '%s' "$input" | shasum -a 256 2>/dev/null | cut -c1-12
61
+ elif command -v sha256sum >/dev/null 2>&1; then
62
+ printf '%s' "$input" | sha256sum 2>/dev/null | cut -c1-12
63
+ else
64
+ return 1
65
+ fi
66
+ }
67
+
68
# Print the canonical repository root for a working directory, derived
# from git's common dir so every worktree of one repository resolves to
# the same root.
#
# Arguments: $1 - directory inside the repository.
# Outputs:   physical (symlink-resolved) path of the root without a
#            trailing newline; nothing when it cannot be determined.
# Returns:   0.
_memory_repo_root_via_common_dir() {
  local cwd="$1"
  # Both locals are initialised explicitly: a bare `local toplevel`
  # leaves the variable unset, and under `set -u` the emptiness test
  # below would abort with "unbound variable" instead of reaching the
  # --show-toplevel fallback.
  local common_dir="" toplevel=""
  common_dir=$(git -C "$cwd" rev-parse --git-common-dir 2>/dev/null) || return 0
  # git-common-dir may be relative; resolve relative to cwd.
  if [[ -n "$common_dir" && "$common_dir" != /* ]]; then
    common_dir="$(cd -- "$cwd" 2>/dev/null && cd -- "$common_dir" 2>/dev/null && pwd -P)" || common_dir=""
  fi
  if [[ -n "$common_dir" && -d "$common_dir" ]]; then
    # common_dir is typically the .git dir of the main repo; its
    # parent is the canonical repo root (shared across worktrees).
    toplevel="$(cd -- "$common_dir/.." 2>/dev/null && pwd -P)" || toplevel=""
  fi
  if [[ -z "$toplevel" ]]; then
    # Fallback: the worktree's own top level, resolved to a physical path.
    toplevel=$(git -C "$cwd" rev-parse --show-toplevel 2>/dev/null || true)
    if [[ -n "$toplevel" ]]; then
      toplevel="$(cd -- "$toplevel" 2>/dev/null && pwd -P)" || toplevel=""
    fi
  fi
  printf '%s' "$toplevel"
}
87
+
88
# Derive PROJECT_KEY: hash the origin remote URL when one exists,
# otherwise hash the canonical repo root. Outside a git work tree the key
# stays empty.
PROJECT_KEY=""
if git -C "$CWD" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  REMOTE=$(git -C "$CWD" remote get-url origin 2>/dev/null || true)
  case "$REMOTE" in
    "")
      # Local-only repository: anchor the key on the shared repo root.
      ROOT=$(_memory_repo_root_via_common_dir "$CWD")
      [[ -z "$ROOT" ]] || PROJECT_KEY=$(_memory_sha256_first12 "root:${ROOT}")
      ;;
    *)
      PROJECT_KEY=$(_memory_sha256_first12 "remote:${REMOTE}")
      ;;
  esac
fi

# No key means no git context (or no hashing tool): nothing to emit.
[[ -n "$PROJECT_KEY" ]] || {
  hook_success
  exit 0
}
105
+
106
+ # ---------------------------------------------------------------------------
107
+ # Resolve the per-project typed-memory store at
108
+ # ~/.claude/projects/<encoded>/memory/. Claude Code encodes the project
109
+ # path by replacing path separators with `-` and prepending a leading `-`.
110
+ # Prefer $CLAUDE_PROJECT_ENCODED when the harness has populated it; fall
111
+ # back to deriving from CWD.
112
+ # ---------------------------------------------------------------------------
113
+
114
# Encoded project directory name: prefer the harness-provided value,
# otherwise derive it from the physical cwd.
ENCODED="${CLAUDE_PROJECT_ENCODED:-}"
if [[ -z "$ENCODED" ]]; then
  # Every `/` in the absolute cwd becomes `-`; the path's leading slash
  # therefore supplies the leading `-`.
  ABS_CWD=$(cd "$CWD" 2>/dev/null && pwd -P) || ABS_CWD=""
  if [[ -n "$ABS_CWD" ]]; then
    ENCODED=${ABS_CWD//\//-}
  fi
fi

# NOTE(review): CLAUDE_HOME is presumably exported by one of the sourced
# libs — confirm. The ~/.claude default keeps `set -u` from aborting the
# hook (and breaking its always-exit-0 contract) when it is not set.
MEMORY_DIR="${CLAUDE_HOME:-${HOME:-}/.claude}/projects/${ENCODED}/memory"
if [[ -z "$ENCODED" || ! -d "$MEMORY_DIR" ]]; then
  # No memory store for this project: nothing to emit.
  hook_success
  exit 0
fi
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # Walk every *.md file (excluding MEMORY.md itself, which is the index, not
132
+ # a memory). For each, parse the YAML frontmatter's `type` field. Skip
133
+ # files whose type isn't one of the four valid enum values — emitting
134
+ # anything else would fail schema validation and the event would be
135
+ # silently dropped.
136
+ # ---------------------------------------------------------------------------
137
+
138
# Print the `type` value from a memory file's YAML frontmatter.
#
# The frontmatter must open on line 1 with a line starting `---` and ends
# at the next such line. Only the first `type:` line inside it is used.
# Single and double quotes are removed, and CR line endings plus
# surrounding whitespace are trimmed, so `type: "user"  ` in a CRLF file
# still yields `user`.
#
# Arguments: $1 - path to the memory file.
# Outputs:   the type value followed by a newline; nothing when the file
#            is missing, has no frontmatter, or has no `type:` key.
# Returns:   0 when the file does not exist, otherwise awk's status.
_extract_type() {
  local path="$1"
  [[ -f "$path" ]] || return 0
  # Parsed with awk only (no python dep, no yq dep). The single quote is
  # passed in with -v because it cannot appear inside the quoted program.
  awk -v sq="'" '
    { sub(/\r$/, "") }
    NR == 1 {
      if ($0 ~ /^---/) { in_fm = 1; next }
      exit
    }
    in_fm && /^---/ { exit }
    in_fm && /^type:/ {
      value = $0
      sub(/^type:/, "", value)
      gsub(/"/, "", value)
      gsub(sq, "", value)
      sub(/^[[:space:]]+/, "", value)
      sub(/[[:space:]]+$/, "", value)
      print value
      exit
    }
  ' "$path" 2>/dev/null
}
152
+
153
# Use the canonical ecosystem plugin name (matches the
# `${ONLOOKER_PLUGIN_NAME:-onlooker}` default that scripts/lib/
# onlooker-emit.sh and onlooker-event.mjs both fall back to). Other
# substrate-level emissions land under "onlooker" too, so this stays
# consistent with the existing event stream. Resolved once: the value
# does not change between files.
local_plugin="${ONLOOKER_PLUGIN_NAME:-onlooker}"

# recall_position counts emitted events only, so positions stay
# contiguous even when files are skipped.
position=0
for file in "$MEMORY_DIR"/*.md; do
  # Also covers the no-match case, where the glob stays literal.
  [[ -f "$file" ]] || continue
  fname=${file##*/}
  # MEMORY.md is the index, not a memory.
  [[ "$fname" == "MEMORY.md" ]] && continue

  memory_type=$(_extract_type "$file")
  case "$memory_type" in
    user | feedback | project | reference) ;;
    *)
      # Untyped or unknown-typed memories don't fit the schema's
      # enum. Skip silently rather than tank schema validation.
      continue
      ;;
  esac

  # Build the emit parameters, payload included, in a single jq call.
  params=$(jq -cn \
    --arg plugin "$local_plugin" \
    --arg sid "$SESSION_ID" \
    --arg type "memory.recalled" \
    --arg project_key "$PROJECT_KEY" \
    --arg memory_file "$fname" \
    --arg memory_type "$memory_type" \
    --argjson recall_position "$position" \
    '{
      plugin: $plugin,
      session_id: $sid,
      event_type: $type,
      payload: {
        project_key: $project_key,
        memory_file: $memory_file,
        memory_type: $memory_type,
        recall_position: $recall_position
      }
    }' 2>/dev/null) || params=""
  # Without parameters there is nothing for the emitter to do.
  [[ -z "$params" ]] && continue

  event_json=$(printf '%s' "$params" \
    | ONLOOKER_DIR="$ONLOOKER_DIR" ONLOOKER_PLUGIN_NAME="$local_plugin" \
      node "$_ONLOOKER_EVENT_JS" emit 2>/dev/null) || event_json=""
  [[ -z "$event_json" ]] && continue

  # Best effort: a failed append must not stop the remaining files.
  onlooker_append_event "$event_json" || true
  position=$((position + 1))
done

hook_success
exit 0