@jeganwrites/claudash 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scanner.py ADDED
@@ -0,0 +1,385 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import time
5
+ import threading
6
+ from datetime import datetime, timezone
7
+
8
+ from config import UNKNOWN_PROJECT, MODEL_PRICING
9
+ from db import get_conn, insert_session, get_accounts_config, get_project_map_config
10
+
11
# Unix timestamp (seconds) of the last completed scan_all() run; 0 until the first scan.
_last_scan_time = 0
12
+
13
+
14
def get_last_scan_time():
    """Return the Unix timestamp (seconds) of the most recent scan_all() run, or 0 if none yet."""
    return _last_scan_time
16
+
17
+
18
def normalize_model(model_str):
    """Collapse a raw model identifier into one of the three pricing buckets.

    A string containing "opus" or "haiku" (case-insensitive) maps to that
    family; everything else — including an empty or None value — is treated
    as sonnet.
    """
    lowered = (model_str or "").lower()
    for needle, canonical in (("opus", "claude-opus"), ("haiku", "claude-haiku")):
        if needle in lowered:
            return canonical
    return "claude-sonnet"
27
+
28
+
29
def resolve_project(folder_path, project_map=None):
    """Map a folder path to (project_name, account) via keyword matching.

    Falls back to the configured project map when none is supplied, and to
    (UNKNOWN_PROJECT, "personal_max") when no keyword matches. Matching is
    done against the lowercased path; keywords are used as-is.
    """
    mapping = project_map if project_map is not None else get_project_map_config()
    haystack = folder_path.lower()
    for name, info in mapping.items():
        if any(keyword in haystack for keyword in info["keywords"]):
            return name, info["account"]
    return UNKNOWN_PROJECT, "personal_max"
38
+
39
+
40
def compute_cost(model, input_tokens, output_tokens, cache_read, cache_create):
    """Price a single usage record in USD.

    Rates in MODEL_PRICING are per million tokens; unknown models fall back
    to the sonnet table. The result is rounded to 8 decimal places.
    """
    rates = MODEL_PRICING.get(model, MODEL_PRICING["claude-sonnet"])
    total = 0.0
    # Accumulate term-by-term in the same order as the rate table so float
    # rounding matches previous behavior exactly.
    for tokens, rate_key in (
        (input_tokens, "input"),
        (output_tokens, "output"),
        (cache_read, "cache_read"),
        (cache_create, "cache_write"),
    ):
        total += (tokens / 1_000_000) * rates[rate_key]
    return round(total, 8)
48
+
49
+
50
def parse_timestamp(ts_str):
    """Convert an ISO-8601-ish UTC timestamp string to epoch seconds.

    Accepts values like "2024-01-02T03:04:05", optionally suffixed with "Z",
    "+00:00", and/or a fractional-second part. Returns None for empty,
    date-only, non-string, or otherwise unparseable input.
    """
    if not ts_str:
        return None
    try:
        # Strip UTC markers and fractional seconds, then require a full
        # date-time ("T" separator) before parsing.
        cleaned = ts_str.replace("Z", "").replace("+00:00", "")
        cleaned = cleaned.split(".", 1)[0]
        if "T" not in cleaned:
            return None
        parsed = datetime.strptime(cleaned, "%Y-%m-%dT%H:%M:%S")
        return int(parsed.replace(tzinfo=timezone.utc).timestamp())
    except Exception:
        # Same contract as before: any failure (bad format, non-str input)
        # yields None rather than raising.
        return None
63
+
64
+
65
+ def _detect_compaction(session_rows):
66
+ """Detect compaction events: a >30% drop in total inbound context
67
+ (input_tokens + cache_read_tokens) between consecutive turns in a session.
68
+ We use total context size because under Claude Code's prompt caching the
69
+ bulk of the prompt lives in cache_read_tokens while input_tokens stays near
70
+ zero — watching input_tokens alone misses every real compaction."""
71
+ events = []
72
+ for i in range(1, len(session_rows)):
73
+ prev = session_rows[i - 1]
74
+ curr = session_rows[i]
75
+ prev_ctx = prev.get("input_tokens", 0) + prev.get("cache_read_tokens", 0)
76
+ curr_ctx = curr.get("input_tokens", 0) + curr.get("cache_read_tokens", 0)
77
+ if prev_ctx > 1000 and curr_ctx < prev_ctx * 0.7:
78
+ events.append((i, prev_ctx, curr_ctx))
79
+ return events
80
+
81
+
82
def _parse_line(line):
    """Parse one JSONL record into a raw usage-row dict.

    Returns None for blank lines, malformed JSON, records with no usable
    timestamp, and records where both input and output token counts are
    zero (no billable usage).
    """
    text = line.strip()
    if not text:
        return None
    try:
        record = json.loads(text)
    except json.JSONDecodeError:
        return None

    # Prefer Claude Code's per-conversation sessionId. `uuid` is per-MESSAGE
    # (unique every row) and using it as session_id silently breaks every
    # per-session metric (compaction, session_depth, sessions_today).
    sid = record.get("sessionId") or record.get("session_id") or record.get("uuid", "")

    raw_ts = record.get("timestamp") or record.get("ts", "")
    if isinstance(raw_ts, str):
        ts = parse_timestamp(raw_ts)
    else:
        ts = int(raw_ts) if raw_ts else None
    if not ts:
        return None

    model_raw = record.get("model", "")
    if not model_raw and "message" in record:
        model_raw = record["message"].get("model", "")

    # Usage lives under message.usage for assistant turns; fall back to a
    # top-level usage object otherwise.
    message = record.get("message")
    usage = message.get("usage", {}) if isinstance(message, dict) else {}
    if not usage:
        usage = record.get("usage", {})

    # `or 0` guards against explicit nulls in the JSON.
    input_tokens = usage.get("input_tokens", 0) or 0
    output_tokens = usage.get("output_tokens", 0) or 0
    cache_read = usage.get("cache_read_input_tokens", 0) or 0
    cache_create = usage.get("cache_creation_input_tokens", 0) or 0

    if input_tokens == 0 and output_tokens == 0:
        return None

    model = normalize_model(model_raw)
    return {
        "session_id": sid,
        "timestamp": ts,
        "model": model,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cache_read_tokens": cache_read,
        "cache_creation_tokens": cache_create,
        "cost_usd": compute_cost(model, input_tokens, output_tokens, cache_read, cache_create),
        "compaction_detected": 0,
        "tokens_before_compact": None,
        "tokens_after_compact": None,
    }
135
+
136
+
137
+ def _get_scan_state(conn, filepath):
138
+ """Get (last_offset, lines_processed) for a file, or (0, 0) if new."""
139
+ row = conn.execute(
140
+ "SELECT last_offset, lines_processed FROM scan_state WHERE file_path = ?",
141
+ (filepath,),
142
+ ).fetchone()
143
+ if row:
144
+ return row[0] or 0, row[1] or 0
145
+ return 0, 0
146
+
147
+
148
+ def _set_scan_state(conn, filepath, offset, lines_processed):
149
+ now = int(time.time())
150
+ conn.execute(
151
+ """INSERT INTO scan_state (file_path, last_offset, last_scanned, lines_processed)
152
+ VALUES (?, ?, ?, ?)
153
+ ON CONFLICT(file_path) DO UPDATE SET
154
+ last_offset=excluded.last_offset,
155
+ last_scanned=excluded.last_scanned,
156
+ lines_processed=excluded.lines_processed""",
157
+ (filepath, offset, now, lines_processed),
158
+ )
159
+
160
+
161
+ def _parse_subagent_info(filepath):
162
+ """If the file lives under a `/subagents/` directory, return
163
+ (is_subagent=1, parent_session_id), else (0, None).
164
+
165
+ Expected shape: .../<parent_session_uuid>/subagents/agent-*.jsonl
166
+ The parent's session UUID is the folder immediately above `subagents`.
167
+ """
168
+ if "/subagents/" not in filepath:
169
+ return 0, None
170
+ parent_dir = filepath.split("/subagents/")[0]
171
+ parent_uuid = os.path.basename(parent_dir)
172
+ return 1, (parent_uuid or None)
173
+
174
+
175
def scan_jsonl_file(filepath, folder_path, conn, source_path="", project_map=None):
    """Parse new lines from a JSONL file using incremental byte-offset tracking.

    Returns the number of rows actually inserted.

    BUG FIX: the file is now opened in binary mode. The previous text-mode
    version called f.tell() after iterating with `for line in f`, which in
    CPython raises OSError("telling position disabled by next() call"); the
    broad except swallowed it, so every scan logged an error, discarded its
    parsed rows, and never advanced the stored offset. Binary mode permits
    tell() after iteration and its offsets are true byte positions that are
    directly comparable with os.path.getsize(), which the truncation check
    relies on. Each line is decoded individually with errors="replace",
    matching the old errors="replace" text-mode policy.
    """
    # For subagent files, resolve project from the *parent* project folder
    # (the grandparent of `subagents/`) so the subagent inherits the
    # parent's project tag even if the `subagents` directory itself has no
    # matching keyword.
    is_subagent, parent_sid = _parse_subagent_info(filepath)
    resolve_against = folder_path
    if is_subagent:
        parent_project_folder = os.path.dirname(filepath.split("/subagents/")[0])
        resolve_against = parent_project_folder or folder_path
    project, account = resolve_project(resolve_against, project_map)

    try:
        file_size = os.path.getsize(filepath)
    except OSError:
        return 0

    last_offset, prev_lines = _get_scan_state(conn, filepath)

    # File was truncated/rotated — reset and rescan from the start.
    if file_size < last_offset:
        last_offset = 0
        prev_lines = 0

    # Nothing new appended since the last scan.
    if file_size == last_offset:
        return 0

    raw_rows = []
    new_lines = 0
    try:
        with open(filepath, "rb") as f:
            if last_offset > 0:
                f.seek(last_offset)
            for raw in f:
                if len(raw) > 1_000_000:  # 1MB max line
                    print(f"WARNING: skipping oversized line ({len(raw)} bytes) in {filepath}", file=sys.stderr)
                    continue
                line = raw.decode("utf-8", errors="replace")
                if not line.strip():
                    continue
                new_lines += 1
                parsed = _parse_line(line)
                if parsed:
                    parsed["project"] = project
                    parsed["account"] = account
                    # Store the actual JSONL file path, not the data_path
                    # root. Without the full path we can't re-resolve the
                    # project from a session row, and every session from one
                    # data_path looks identical. scan_state tracks the same
                    # key.
                    parsed["source_path"] = filepath
                    parsed["is_subagent"] = is_subagent
                    parsed["parent_session_id"] = parent_sid
                    raw_rows.append(parsed)
            # Legal on a binary file even after line iteration; this is the
            # byte offset persisted for the next incremental scan.
            end_offset = f.tell()
    except Exception as e:
        print(f"[scanner] Error reading {filepath}: {e}", file=sys.stderr)
        return 0

    # Compaction detection within this batch, grouped per session and
    # ordered by timestamp.
    sessions = {}
    for i, row in enumerate(raw_rows):
        sessions.setdefault(row["session_id"], []).append(i)

    for indices in sessions.values():
        indices.sort(key=lambda idx: raw_rows[idx]["timestamp"])
        session_data = [raw_rows[idx] for idx in indices]
        for evt_idx, before, after in _detect_compaction(session_data):
            target = raw_rows[indices[evt_idx]]
            target["compaction_detected"] = 1
            target["tokens_before_compact"] = before
            target["tokens_after_compact"] = after

    # Count only rows the DB actually accepted (insert_session may be a
    # no-op for duplicates), observed via conn.total_changes.
    added = 0
    for row in raw_rows:
        changes_before = conn.total_changes
        insert_session(conn, row)
        if conn.total_changes > changes_before:
            added += 1

    _set_scan_state(conn, filepath, end_offset, prev_lines + new_lines)
    return added
262
+
263
+
264
def scan_all(account_filter=None):
    """Walk all configured data_paths and scan JSONL files incrementally.

    When *account_filter* is given, only that account's paths are scanned.
    Commits once at the end, closes the connection, stamps _last_scan_time,
    and returns the total number of newly inserted rows.
    """
    global _last_scan_time
    conn = get_conn()
    accounts = get_accounts_config(conn)
    project_map = get_project_map_config(conn)
    total_added = 0
    files_scanned = 0

    for account_key, acct in accounts.items():
        if account_filter and account_key != account_filter:
            continue
        for data_path in acct.get("data_paths", []):
            if not os.path.isdir(data_path):
                print(f"[scanner] {data_path} does not exist, skipping", file=sys.stderr)
                continue
            for root, dirs, files in os.walk(data_path):
                for fname in files:
                    if not fname.endswith(".jsonl"):
                        continue
                    full_path = os.path.join(root, fname)
                    added = scan_jsonl_file(full_path, root, conn, source_path=data_path, project_map=project_map)
                    total_added += added
                    if added:
                        files_scanned += 1

    conn.commit()
    conn.close()
    _last_scan_time = int(time.time())
    print(f"[scanner] Scan complete: {total_added} new rows (incremental, {files_scanned} files changed)", file=sys.stderr)
    return total_added
296
+
297
+
298
def preview_paths(data_paths):
    """For each candidate path, report its ~-expansion, whether it is a
    directory, and how many .jsonl files live beneath it (recursively)."""
    report = []
    for raw_path in data_paths:
        expanded = os.path.expanduser(raw_path)
        exists = os.path.isdir(expanded)
        jsonl_count = 0
        if exists:
            for _root, _dirs, names in os.walk(expanded):
                jsonl_count += sum(1 for n in names if n.endswith(".jsonl"))
        report.append({
            "path": raw_path,
            "expanded": expanded,
            "exists": exists,
            "jsonl_files": jsonl_count,
        })
    return report
311
+
312
+
313
def discover_claude_paths():
    """Probe well-known per-OS locations for Claude Code `projects` dirs.

    Combines a fixed per-platform candidate list, a glob for `.claude-*`
    variants, and a scan of dot-directories in $HOME whose name mentions
    "claude". Each existing directory is returned (trailing slash added)
    with a recursive .jsonl file count, sorted by path.
    """
    import glob
    import platform
    home = os.path.expanduser("~")
    system = platform.system()

    # Platform-specific candidate directories
    if system == "Windows":
        candidates = [
            os.path.join(home, "AppData", "Roaming", "Claude", "projects"),
            os.path.join(home, "AppData", "Local", "Claude", "projects"),
            os.path.join(home, "AppData", "Roaming", "anthropic", "claude", "projects"),
            os.path.join(home, ".claude", "projects"),
        ]
    elif system == "Darwin":
        candidates = [
            os.path.join(home, ".claude", "projects"),
            os.path.join(home, "Library", "Application Support", "Claude", "projects"),
        ]
    else:  # Linux / everything else
        candidates = [
            os.path.join(home, ".claude", "projects"),
            os.path.join(home, ".config", "claude", "projects"),
            os.path.join(home, ".local", "share", "claude", "projects"),
        ]

    found = {c + "/" for c in candidates if os.path.isdir(c)}

    # Also glob for .claude-* variants
    glob_patterns = (
        os.path.join(home, ".claude", "projects"),
        os.path.join(home, ".claude-*", "projects"),
    )
    for pattern in glob_patterns:
        found.update(m + "/" for m in glob.glob(pattern) if os.path.isdir(m))

    # Any other dot-directory in $HOME mentioning "claude"
    try:
        for entry in os.listdir(home):
            if entry.startswith(".") and "claude" in entry.lower():
                probe = os.path.join(home, entry, "projects")
                if os.path.isdir(probe):
                    found.add(probe + "/")
    except OSError:
        pass

    discovered = []
    for path in sorted(found):
        total = 0
        for _root, _dirs, names in os.walk(path):
            total += sum(1 for n in names if n.endswith(".jsonl"))
        discovered.append({"path": path, "exists": True, "estimated_records": total})
    return discovered
372
+
373
+
374
def start_periodic_scan(interval_seconds=300):
    """Run scan_all() forever on a background daemon thread.

    Each cycle is wrapped so a single failed scan never kills the loop;
    the error is logged to stderr and the next attempt happens after
    *interval_seconds*. Returns the started Thread (daemon, so it will not
    block interpreter shutdown).
    """
    def _loop():
        while True:
            try:
                scan_all()
            except Exception as e:
                print(f"[scanner] Periodic scan error: {e}", file=sys.stderr)
            time.sleep(interval_seconds)

    worker = threading.Thread(target=_loop, daemon=True)
    worker.start()
    return worker