agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,701 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import base64
6
+ import copy
7
+ import hashlib
8
+ import json
9
+ import os
10
+ import sys
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
14
+
15
+ try:
16
+ from .platform_paths import default_pi_session_root
17
+ except ImportError: # pragma: no cover - direct script execution fallback
18
+ from platform_paths import default_pi_session_root # type: ignore
19
+
20
# Session root used when --session-root is not given; computed once at import
# time by the platform helper.
DEFAULT_SESSION_ROOT = default_pi_session_root()

# Written to exportInfo.version in every exported header.
EXPORT_VERSION = 1

# Character cap applied to the textPreview of an embedded UTF-8 artifact.
TEXT_SAMPLE_LIMIT = 4000

# Export modes accepted by --mode; the first entry is the CLI default.
FULL_MODE = 'full'
RAW_MODE = 'raw'
SUPPORTED_MODES = (FULL_MODE, RAW_MODE)

# Dictionary key whose string value is treated as a path to an external
# artifact file.
ARTIFACT_KEY = 'fullOutputPath'
27
+
28
+
29
class SessionExportError(RuntimeError):
    """Error raised for session export failures that are reported to the user."""
31
+
32
+
33
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse command-line arguments for the export, list, verify and health commands.

    The command is taken from the first argument when it is ``list``,
    ``verify`` or ``health``; otherwise every argument is parsed as the
    default ``export`` command. The selected command name is stored on the
    returned namespace as ``command``.
    """
    remaining = list(argv)
    command = 'export'
    if remaining and remaining[0] in {'list', 'verify', 'health'}:
        command, remaining = remaining[0], remaining[1:]

    parser = argparse.ArgumentParser(
        description='Export Pi sessions into self-contained JSONL files.',
    )
    # Options shared by every command.
    parser.add_argument('--session-root', default=str(DEFAULT_SESSION_ROOT), help='Pi session root directory.')
    parser.add_argument('--cwd', default=os.getcwd(), help='Project directory used for default session lookup.')

    # Command-specific options.
    if command == 'list':
        parser.add_argument('--all', action='store_true', help='List sessions across every project directory.')
    elif command == 'verify':
        parser.add_argument('path', help='Exported full-session JSONL to verify.')
    elif command == 'health':
        parser.add_argument('--all', action='store_true', help='Scan every project directory instead of only the current cwd.')
    else:
        parser.add_argument('--session', help='Session file path, session UUID, or unique UUID prefix.')
        parser.add_argument('--select', action='store_true', help='Interactively choose a session before exporting.')
        parser.add_argument('--mode', choices=SUPPORTED_MODES, default=FULL_MODE, help='Export mode.')
        parser.add_argument('--out', help='Output file or directory. Defaults to ./pi-session-exports/.')

    # Every command accepts --json, registered after its own options.
    parser.add_argument('--json', action='store_true', help='Emit machine-readable JSON.')

    namespace = parser.parse_args(remaining)
    namespace.command = command
    return namespace
76
+
77
+
78
def session_dir_name(cwd: Path) -> str:
    """Return the session directory name derived from a project directory.

    The resolved path has leading and trailing ``/`` removed, every remaining
    ``/`` replaced by ``-``, and the result is wrapped in ``--``.
    """
    absolute = str(cwd.resolve())
    flattened = '-'.join(absolute.strip('/').split('/'))
    return '--' + flattened + '--'
81
+
82
+
83
def iter_session_files(session_root: Path) -> Iterator[Path]:
    """Iterate over every ``*.jsonl`` file found recursively under ``session_root``.

    Yields nothing when ``session_root`` does not exist.
    """
    if session_root.exists():
        return session_root.rglob('*.jsonl')
    return iter(())
87
+
88
+
89
def session_sort_key(path: Path) -> Tuple[str, str]:
    """Return the sort key for a session file: file name first, full path as tie-breaker."""
    file_name = path.name
    full_path = str(path)
    return file_name, full_path
91
+
92
+
93
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSONL file and return its records as a list of dictionaries.

    Blank lines are skipped. Every remaining line must be a JSON object.

    Raises:
        SessionExportError: if a line is not valid JSON, a line is valid JSON
            but not an object, the file is not valid UTF-8, or the file
            contains no records.
    """
    records: List[Dict[str, Any]] = []
    try:
        with path.open('r', encoding='utf-8') as handle:
            for line_number, raw_line in enumerate(handle, start=1):
                text = raw_line.strip()
                if not text:
                    continue
                try:
                    record = json.loads(text)
                except json.JSONDecodeError as exc:
                    raise SessionExportError(f'Invalid JSON at {path}:{line_number}: {exc}') from exc
                # Callers use dict methods on every record, so reject arrays
                # and scalars here instead of failing later with AttributeError.
                if not isinstance(record, dict):
                    raise SessionExportError(
                        f'Expected a JSON object at {path}:{line_number}, got {type(record).__name__}'
                    )
                records.append(record)
    except UnicodeDecodeError as exc:
        # Decoding happens while iterating the handle, so this can surface on any line.
        raise SessionExportError(f'Session file is not valid UTF-8: {path}: {exc}') from exc
    if not records:
        raise SessionExportError(f'Session file is empty: {path}')
    return records
107
+
108
+
109
def load_session_metadata(path: Path) -> Dict[str, Any]:
    """Load a session file and summarise it for listing and selection.

    The first record is the header. The session name is the stripped ``name``
    of the last non-blank ``session_info`` entry. The preview is the text of
    the first user message from which text could be extracted.
    """
    header, *rest = read_jsonl(path)
    name: Optional[str] = None
    preview: Optional[str] = None

    for entry in rest:
        kind = entry.get('type')
        if kind == 'session_info':
            candidate = entry.get('name')
            if isinstance(candidate, str) and candidate.strip():
                name = candidate.strip()
        elif kind == 'message' and preview is None:
            message = entry.get('message')
            if isinstance(message, dict) and message.get('role') == 'user':
                preview = extract_message_text(message.get('content'))

    return {
        'path': path,
        'header': header,
        'session_id': header.get('id'),
        'cwd': header.get('cwd'),
        'timestamp': header.get('timestamp'),
        'name': name,
        'preview': preview,
    }
130
+
131
+
132
def extract_message_text(content: Any) -> Optional[str]:
    """Return a compact single-line text for message content, or ``None``.

    A string is compacted directly. A list is scanned for dictionaries with a
    non-blank string ``text`` value; collection stops once the joined text
    reaches 160 characters. Any other content type yields ``None``.
    """
    if isinstance(content, str):
        return compact_text(content)
    if not isinstance(content, list):
        return None

    fragments: List[str] = []
    for block in content:
        text = block.get('text') if isinstance(block, dict) else None
        if not (isinstance(text, str) and text.strip()):
            continue
        fragments.append(text.strip())
        if len(' '.join(fragments)) >= 160:
            break

    return compact_text(' '.join(fragments)) if fragments else None
149
+
150
+
151
def compact_text(value: str, limit: int = 80) -> str:
    """Collapse whitespace in ``value`` and truncate it to at most ``limit`` characters.

    All runs of whitespace (including newlines) become single spaces. Text
    longer than ``limit`` is cut and suffixed with ``...`` so the result is
    exactly ``limit`` characters. When ``limit`` is below 3 there is no room
    for the ellipsis, so the text is simply cut to ``limit`` characters
    (an empty string for ``limit <= 0``).
    """
    single_line = ' '.join(value.split())
    if len(single_line) <= limit:
        return single_line
    if limit < 3:
        # A negative slice index here would cut from the end instead and
        # return a string longer than the limit.
        return single_line[:max(limit, 0)]
    return single_line[:limit - 3] + '...'
156
+
157
+
158
def list_project_sessions(session_root: Path, cwd: Path) -> List[Dict[str, Any]]:
    """Return metadata for the sessions stored for project directory ``cwd``.

    Only ``*.jsonl`` files directly inside the project's session directory are
    considered. They are ordered by ``session_sort_key`` in descending order.
    An empty list is returned when the directory does not exist.
    """
    project_dir = session_root / session_dir_name(cwd)
    if project_dir.exists():
        ordered = sorted(project_dir.glob('*.jsonl'), key=session_sort_key, reverse=True)
        return [load_session_metadata(session_file) for session_file in ordered]
    return []
164
+
165
+
166
def list_all_sessions(session_root: Path) -> List[Dict[str, Any]]:
    """Return metadata for every session file under ``session_root``.

    Files are ordered by ``session_sort_key`` in descending order.
    """
    ordered = sorted(iter_session_files(session_root), key=session_sort_key, reverse=True)
    summaries: List[Dict[str, Any]] = []
    for session_file in ordered:
        summaries.append(load_session_metadata(session_file))
    return summaries
169
+
170
+
171
def resolve_session(session_root: Path, cwd: Path, session_value: Optional[str], select: bool) -> Dict[str, Any]:
    """Pick the session to export and return its metadata.

    Precedence: interactive selection when ``select`` is set, then an explicit
    ``session_value``, then the first session listed for ``cwd``.

    Raises:
        SessionExportError: if no session is available for ``cwd`` when one is
            needed, or if ``session_value`` cannot be resolved.
    """
    if session_value and not select:
        return load_session_metadata(resolve_session_value(session_root, cwd, session_value))

    candidates = list_project_sessions(session_root, cwd)
    if select:
        if not candidates:
            raise SessionExportError(f'No sessions found for cwd {cwd}')
        return select_session_interactively(candidates)

    if not candidates:
        raise SessionExportError(
            f'No sessions found for cwd {cwd}. Use --session to specify one or --select to choose interactively.'
        )
    return candidates[0]
188
+
189
+
190
def resolve_session_value(session_root: Path, cwd: Path, value: str) -> Path:
    """Resolve a ``--session`` value to a single session file path.

    Resolution order:
      1. ``value`` as a path to an existing file (after ``~`` expansion);
      2. ``value`` relative to ``cwd``;
      3. a search of every session file under ``session_root``, matching
         files whose name contains ``value`` or whose header ``id`` starts
         with ``value``.

    Raises:
        SessionExportError: if nothing matches or more than one file matches.
    """
    # Only regular files qualify: an existing directory would be returned
    # here and then fail to open as a session file.
    candidate_path = Path(value).expanduser()
    if candidate_path.is_file():
        return candidate_path.resolve()

    cwd_relative = (cwd / value).expanduser()
    if cwd_relative.is_file():
        return cwd_relative.resolve()

    matches: List[Path] = []
    for path in iter_session_files(session_root):
        # A substring match also covers names ending in f'_{value}.jsonl'.
        if value in path.name:
            matches.append(path)
            continue
        try:
            header = read_jsonl(path)[0]
        except SessionExportError:
            # Unreadable files cannot match by id; keep scanning the rest.
            continue
        session_id = header.get('id')
        if isinstance(session_id, str) and session_id.startswith(value):
            matches.append(path)

    unique_matches = dedupe_paths(matches)
    if not unique_matches:
        raise SessionExportError(f'Could not resolve session {value!r}')
    if len(unique_matches) > 1:
        raise SessionExportError(
            f'Session selector {value!r} matched multiple files: ' + ', '.join(str(path) for path in unique_matches[:5])
        )
    return unique_matches[0]
220
+
221
+
222
def dedupe_paths(paths: Iterable[Path]) -> List[Path]:
    """Return the resolved form of each distinct path, in first-seen order.

    Two paths are the same when their resolved string forms are equal.
    """
    unique: Dict[str, Path] = {}
    for candidate in paths:
        resolved = candidate.resolve()
        # setdefault keeps the first occurrence; dicts preserve insertion order.
        unique.setdefault(str(resolved), resolved)
    return list(unique.values())
232
+
233
+
234
def select_session_interactively(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Prompt on the terminal until the user picks one of ``candidates``.

    The numbered menu and all validation messages go to stderr. Blank input
    re-prompts silently.

    Raises:
        SessionExportError: if standard input ends before a choice is made.
    """
    print('Choose a session to export:', file=sys.stderr)
    for index, meta in enumerate(candidates, start=1):
        label = session_display_label(meta)
        print(f' {index:>2}. {label}', file=sys.stderr)
    while True:
        try:
            raw = input('Enter number: ').strip()
        except EOFError as exc:
            # Closed or exhausted stdin: fail with a reportable error
            # instead of an unhandled traceback.
            raise SessionExportError('No session selected: input ended before a choice was made.') from exc
        if not raw:
            continue
        # isdecimal() accepts only characters that int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if not raw.isdecimal():
            print('Please enter a numeric choice.', file=sys.stderr)
            continue
        choice = int(raw)
        if 1 <= choice <= len(candidates):
            return candidates[choice - 1]
        print(f'Please choose between 1 and {len(candidates)}.', file=sys.stderr)
250
+
251
+
252
def session_display_label(meta: Dict[str, Any]) -> str:
    """Build the one-line menu label for a session metadata record.

    Missing values fall back to ``unknown-time``, ``unnamed``, ``unknown`` and
    ``unknown-cwd``. The title is the session name, or the preview when there
    is no name, compacted to 48 characters.
    """
    session_id = meta['header'].get('id', 'unknown')
    timestamp = meta.get('timestamp') or 'unknown-time'
    title = compact_text(meta.get('name') or meta.get('preview') or 'unnamed', limit=48)
    cwd = meta.get('cwd') or 'unknown-cwd'
    return f'{timestamp} {title} id:{session_id} cwd:{cwd}'
259
+
260
+
261
def file_sha256_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of ``data``."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
263
+
264
+
265
def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of a file, read in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open('rb') as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
271
+
272
+
273
def make_artifact_payload(path: Path) -> Dict[str, Any]:
    """Read an artifact file and return the dictionary embedded in its place.

    The payload always holds ``sourcePath``, ``size``, ``sha256`` and
    ``encoding``. Content that decodes as UTF-8 is stored under ``text`` with
    a compacted ``textPreview``; any other content is stored under ``base64``.
    """
    raw = path.read_bytes()
    payload: Dict[str, Any] = {
        'sourcePath': str(path),
        'size': len(raw),
        'sha256': file_sha256_bytes(raw),
    }
    try:
        decoded = raw.decode('utf-8')
    except UnicodeDecodeError:
        # Not text: carry the bytes as base64 instead.
        payload.update(encoding='base64', base64=base64.b64encode(raw).decode('ascii'))
        return payload
    payload.update(
        encoding='utf-8',
        text=decoded,
        textPreview=compact_text(decoded, limit=TEXT_SAMPLE_LIMIT),
    )
    return payload
290
+
291
+
292
def build_export_info(session_path: Path, source_sha256: str, mode: str, artifact_count: int, missing_count: int) -> Dict[str, Any]:
    """Build the ``exportInfo`` dictionary stored in an exported session header.

    Args:
        session_path: Source session file that was exported.
        source_sha256: SHA-256 hex digest of the source session file.
        mode: Export mode the file was produced with.
        artifact_count: Number of artifacts embedded into the export.
        missing_count: Number of artifact references that could not be embedded.

    Returns:
        The export description, including a UTC ``exportedAt`` timestamp.
        ``fullyEmbedded`` is true only for a full-mode export in which no
        referenced artifact was missing.
    """
    return {
        'format': 'pi-session-full-export',
        'version': EXPORT_VERSION,
        'mode': mode,
        'exportedAt': datetime.now(timezone.utc).isoformat(),
        'sourceSessionPath': str(session_path),
        'sourceSessionSha256': source_sha256,
        # An export with missing artifacts still depends on external files,
        # so it must not claim to be fully embedded.
        'fullyEmbedded': mode == FULL_MODE and missing_count == 0,
        'artifactKey': ARTIFACT_KEY,
        'embeddedArtifactCount': artifact_count,
        'missingArtifactCount': missing_count,
    }
305
+
306
+
307
def export_session(session_meta: Dict[str, Any], mode: str) -> Dict[str, Any]:
    """Load a session and prepare its export in the requested mode.

    The session records are deep-copied before modification. In full mode
    every entry after the header has its artifact references embedded. The
    copied header receives an ``exportInfo`` block, which lists the missing
    artifacts when there are any.

    Returns:
        A dictionary with the export header, the entries, artifact counts, the
        source file digest, the resolved session path and the session id.
    """
    session_path = Path(session_meta['path']).resolve()
    raw_header, *raw_body = read_jsonl(session_path)
    source_sha = file_sha256(session_path)

    header = copy.deepcopy(raw_header)
    body = [copy.deepcopy(record) for record in raw_body]

    missing_artifacts: List[Dict[str, Any]] = []
    artifact_count = 0
    if mode == FULL_MODE:
        artifact_count = sum(embed_entry_artifacts(record, missing_artifacts) for record in body)

    export_info = build_export_info(session_path, source_sha, mode, artifact_count, len(missing_artifacts))
    if missing_artifacts:
        export_info['missingArtifacts'] = missing_artifacts
    header['exportInfo'] = export_info

    return {
        'header': header,
        'entries': body,
        'artifactCount': artifact_count,
        'missingArtifacts': missing_artifacts,
        'sourceSha256': source_sha,
        'sessionPath': session_path,
        'sessionId': header.get('id'),
    }
335
+
336
+
337
def embed_entry_artifacts(entry: Dict[str, Any], missing_artifacts: List[Dict[str, Any]]) -> int:
    """Embed every external artifact referenced inside ``entry``, in place.

    The entry is walked recursively through dictionaries and lists. For each
    ``ARTIFACT_KEY`` whose value is a string, the referenced file is read and
    its payload stored beside it under ``<key>Embedded``. References that
    cannot be embedded are appended to ``missing_artifacts`` with a
    ``reason``; they never abort the walk.

    Args:
        entry: Session entry to modify.
        missing_artifacts: List that receives one record per failed reference.

    Returns:
        The number of artifacts embedded into ``entry``.
    """
    count = 0

    def record_missing(raw_path: str, reason: str, trail: List[str]) -> None:
        missing_artifacts.append(
            {
                'path': raw_path,
                'reason': reason,
                'entryType': entry.get('type'),
                'entryId': entry.get('id'),
                'field': '.'.join(trail),
            }
        )

    def walk(node: Any, trail: List[str]) -> None:
        nonlocal count
        if isinstance(node, dict):
            # Iterate over a snapshot: embedding adds keys to the same dict.
            for key, value in list(node.items()):
                current_trail = trail + [key]
                if not (key == ARTIFACT_KEY and isinstance(value, str)):
                    walk(value, current_trail)
                    continue
                try:
                    source = Path(value).expanduser()
                except RuntimeError:
                    # expanduser() raises when a '~user' home cannot be resolved.
                    record_missing(value, 'missing_or_not_file', current_trail)
                    continue
                if not source.is_file():
                    record_missing(value, 'missing_or_not_file', current_trail)
                    continue
                try:
                    payload = make_artifact_payload(source)
                except OSError as exc:
                    # The file exists but cannot be read (permissions, I/O
                    # error, removed in the meantime): report it and carry on.
                    record_missing(value, f'unreadable: {exc}', current_trail)
                    continue
                node[f'{key}Embedded'] = payload
                count += 1
        elif isinstance(node, list):
            for index, item in enumerate(node):
                walk(item, trail + [str(index)])

    walk(entry, [])
    return count
368
+
369
+
370
def default_output_path(exported: Dict[str, Any], requested_out: Optional[str]) -> Path:
    """Decide where an export should be written.

    The generated file name is the source session's stem plus ``.full.jsonl``
    for full-mode exports or ``.raw.jsonl`` otherwise.

    ``requested_out`` is treated as a directory when it is an existing
    directory, ends with a path separator, or has no file suffix; the
    generated name is placed inside it. Any other value is used as the output
    file itself. Without ``requested_out`` the file goes into
    ``./pi-session-exports``.
    """
    source_path = Path(exported['sessionPath'])
    is_full = exported['header'].get('exportInfo', {}).get('mode') == FULL_MODE
    file_name = source_path.stem + ('.full.jsonl' if is_full else '.raw.jsonl')

    if not requested_out:
        return (Path.cwd() / 'pi-session-exports' / file_name).resolve()

    out_path = Path(requested_out).expanduser()
    # Accept both separators where the platform defines an alternate one,
    # so that 'out/' is recognised as a directory there as well.
    separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
    # The separator test must precede the suffix test: 'exports.d/' has a
    # suffix but was explicitly written as a directory.
    is_directory = out_path.is_dir() or requested_out.endswith(separators) or not out_path.suffix
    if is_directory:
        return (out_path / file_name).resolve()
    return out_path.resolve()
388
+
389
+
390
def write_export(exported: Dict[str, Any], output_path: Path) -> None:
    """Write an export as UTF-8 JSONL: the header line, then one line per entry.

    Missing parent directories are created. Non-ASCII characters are written
    as-is, not escaped.
    """
    def as_line(record: Any) -> str:
        return json.dumps(record, ensure_ascii=False) + '\n'

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open('w', encoding='utf-8') as sink:
        sink.write(as_line(exported['header']))
        sink.writelines(as_line(record) for record in exported['entries'])
396
+
397
+
398
def verify_export(path: Path) -> Dict[str, Any]:
    """Check an exported JSONL file and report any problems found.

    Checks performed:
      * the header carries an ``exportInfo`` dictionary;
      * ``exportInfo.format`` is ``pi-session-full-export``;
      * a full-mode export is marked ``fullyEmbedded``;
      * a full-mode export has no artifact reference without an embedded
        payload.

    Returns:
        A report holding the path, session id, mode, the number of embedded
        artifacts, the references lacking a payload, the list of issues, and
        ``ok`` (true when there are no issues).
    """
    entries = read_jsonl(path)
    header = entries[0]
    export_info = header.get('exportInfo')
    # Anything that is not a dictionary counts as a missing header, so the
    # checks below never call .get() on a string or list.
    info = export_info if isinstance(export_info, dict) else None
    mode = info.get('mode') if info is not None else None

    issues: List[str] = []
    if info is None:
        issues.append('Missing exportInfo header.')
    else:
        if info.get('format') != 'pi-session-full-export':
            issues.append('Unexpected exportInfo.format.')
        if mode == FULL_MODE and not info.get('fullyEmbedded'):
            issues.append('Full export is not marked fullyEmbedded.')

    embedded_count = 0
    missing_refs: List[Dict[str, Any]] = []
    for entry in entries[1:]:
        count, missing = verify_entry_artifacts(entry)
        embedded_count += count
        missing_refs.extend(missing)

    if mode == FULL_MODE and missing_refs:
        issues.append('Found external artifact references without embedded payloads.')

    return {
        'path': str(path),
        'sessionId': header.get('id'),
        'mode': mode,
        'embeddedArtifactCount': embedded_count,
        'missingEmbeddedArtifacts': missing_refs,
        'issues': issues,
        'ok': not issues,
    }
431
+
432
+
433
def verify_entry_artifacts(entry: Dict[str, Any]) -> Tuple[int, List[Dict[str, Any]]]:
    """Count embedded artifacts in ``entry`` and collect references lacking a payload.

    The entry is walked recursively through dictionaries and lists. A string
    value under ``ARTIFACT_KEY`` counts as embedded when the same dictionary
    has a non-``None`` ``<key>Embedded`` value; otherwise it is reported.

    Returns:
        ``(embedded_count, missing)``, where ``missing`` holds one record per
        reference without a payload.
    """
    missing: List[Dict[str, Any]] = []
    embedded_total = 0

    def inspect(node: Any, trail: List[str]) -> None:
        nonlocal embedded_total
        if isinstance(node, list):
            for position, item in enumerate(node):
                inspect(item, trail + [str(position)])
            return
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            here = trail + [key]
            if key != ARTIFACT_KEY or not isinstance(value, str):
                inspect(value, here)
                continue
            if node.get(f'{key}Embedded') is None:
                missing.append(
                    {
                        'entryId': entry.get('id'),
                        'entryType': entry.get('type'),
                        'field': '.'.join(here),
                        'path': value,
                    }
                )
            else:
                embedded_total += 1

    inspect(entry, [])
    return embedded_total, missing
462
+
463
+
464
def format_export_result(exported: Dict[str, Any], output_path: Path) -> Dict[str, Any]:
    """Summarise a completed export as the payload printed to the user."""
    mode = exported['header']['exportInfo'].get('mode')
    summary: Dict[str, Any] = {}
    summary['source'] = str(exported['sessionPath'])
    summary['output'] = str(output_path)
    summary['sessionId'] = exported.get('sessionId')
    summary['mode'] = mode
    summary['embeddedArtifacts'] = exported.get('artifactCount', 0)
    summary['missingArtifacts'] = exported.get('missingArtifacts', [])
    summary['sourceSha256'] = exported.get('sourceSha256')
    return summary
475
+
476
+
477
def print_list(sessions: List[Dict[str, Any]], as_json: bool) -> int:
    """Print the session listing to stdout and return exit code 0.

    With ``as_json`` the rows are printed as an indented JSON array. Otherwise
    each session is printed as a numbered three-line block, or a single
    notice when there are no sessions.
    """
    rows: List[Dict[str, Any]] = []
    for meta in sessions:
        rows.append(
            {
                'path': str(meta['path']),
                'sessionId': meta['session_id'],
                'timestamp': meta['timestamp'],
                'cwd': meta['cwd'],
                'name': meta['name'],
                'preview': meta['preview'],
            }
        )

    if as_json:
        print(json.dumps(rows, ensure_ascii=False, indent=2))
    elif not rows:
        print('No sessions found.')
    else:
        for number, row in enumerate(rows, start=1):
            title = row['name'] or row['preview'] or 'unnamed'
            print(f'{number:>2}. {row["timestamp"]} id:{row["sessionId"]} {compact_text(title, 60)}')
            print(f' cwd: {row["cwd"]}')
            print(f' path: {row["path"]}')
    return 0
501
+
502
+
503
def print_result(payload: Dict[str, Any], as_json: bool) -> int:
    """Print an export summary to stdout and return exit code 0.

    With ``as_json`` the payload is printed as indented JSON. Otherwise a
    labelled summary is printed, followed by at most ten missing artifacts.
    """
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return 0

    missing = payload['missingArtifacts']
    summary_lines = [
        f"source: {payload['source']}",
        f"output: {payload['output']}",
        f"session id: {payload['sessionId']}",
        f"mode: {payload['mode']}",
        f"embedded artifacts: {payload['embeddedArtifacts']}",
        f"missing artifacts: {len(missing)}",
    ]
    for line in summary_lines:
        print(line)
    for item in missing[:10]:
        print(f" - {item['path']} ({item['field']})")
    return 0
517
+
518
+
519
def print_verify_result(payload: Dict[str, Any], as_json: bool) -> int:
    """Print a verification report to stdout.

    Returns:
        0 when the report's ``ok`` flag is true, 1 otherwise.
    """
    exit_code = 0 if payload['ok'] else 1
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return exit_code

    verdict = 'yes' if payload['ok'] else 'no'
    print(f"path: {payload['path']}")
    print(f"session id: {payload['sessionId']}")
    print(f"mode: {payload['mode']}")
    print(f"embedded artifacts: {payload['embeddedArtifactCount']}")
    print(f"ok: {verdict}")

    issues = payload['issues']
    if issues:
        print('issues:')
        for issue in issues:
            print(f' - {issue}')

    missing = payload['missingEmbeddedArtifacts']
    if missing:
        print('missing embedded artifacts:')
        # Only the first ten references are shown.
        for item in missing[:10]:
            print(f" - {item['path']} ({item['field']})")
    return exit_code
537
+
538
+
539
def analyze_session_health(path: Path) -> Dict[str, Any]:
    """Report how many artifact references in a session file still resolve.

    Every entry after the header is scanned for artifact references. A
    reference counts as existing when its path, after ``~`` expansion, is a
    regular file. This is the same check the exporter applies before
    embedding, so ``ready`` predicts whether a full export would succeed.

    Returns:
        A report with the session's path, id, cwd and timestamp, the total,
        existing and missing reference counts, ``ready`` (true when nothing is
        missing) and the list of references.
    """
    entries = read_jsonl(path)
    header = entries[0]
    refs: List[Dict[str, Any]] = []
    for entry in entries[1:]:
        refs.extend(find_artifact_refs(entry))

    def artifact_exists(raw_path: str) -> bool:
        try:
            return Path(raw_path).expanduser().is_file()
        except RuntimeError:
            # expanduser() raises when a '~user' home cannot be resolved.
            return False

    # Check each reference once; missing is derived from the same pass.
    existing_count = sum(1 for ref in refs if artifact_exists(ref['path']))
    missing_count = len(refs) - existing_count

    return {
        'path': str(path),
        'sessionId': header.get('id'),
        'cwd': header.get('cwd'),
        'timestamp': header.get('timestamp'),
        'totalRefs': len(refs),
        'existingRefs': existing_count,
        'missingRefs': missing_count,
        'ready': missing_count == 0,
        'refs': refs,
    }
560
+
561
+
562
def find_artifact_refs(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect every artifact reference found inside ``entry``.

    The entry is walked depth-first through dictionaries and lists. Each
    string value under ``ARTIFACT_KEY`` yields a record with the entry id and
    type, the dotted field path and the referenced path.
    """
    def locate(node: Any, trail: List[str]) -> Iterator[Dict[str, Any]]:
        if isinstance(node, dict):
            for key, value in node.items():
                here = [*trail, key]
                if key == ARTIFACT_KEY and isinstance(value, str):
                    yield {
                        'entryId': entry.get('id'),
                        'entryType': entry.get('type'),
                        'field': '.'.join(here),
                        'path': value,
                    }
                    continue
                yield from locate(value, here)
        elif isinstance(node, list):
            for position, item in enumerate(node):
                yield from locate(item, [*trail, str(position)])

    return list(locate(entry, []))
586
+
587
+
588
def build_health_report(session_root: Path, cwd: Path, include_all: bool) -> Dict[str, Any]:
    """Aggregate artifact-reference health over a set of sessions.

    The sessions are every one under ``session_root`` when ``include_all`` is
    set, otherwise only those of project directory ``cwd``.

    Returns:
        Overall totals, the ratio of missing references, per-project totals
        (most blocked first) and the blocked sessions (most missing
        references first, ties broken by timestamp).
    """
    if include_all:
        sessions = list_all_sessions(session_root)
    else:
        sessions = list_project_sessions(session_root, cwd)
    reports = [analyze_session_health(Path(meta['path'])) for meta in sessions]

    blocked = [report for report in reports if report['missingRefs'] > 0]
    ready_count = sum(1 for report in reports if report['ready'])
    with_refs_count = sum(1 for report in reports if report['totalRefs'] > 0)
    total_refs = sum(report['totalRefs'] for report in reports)
    existing_refs = sum(report['existingRefs'] for report in reports)
    missing_refs = sum(report['missingRefs'] for report in reports)

    # Per-project roll-up, keyed by the cwd recorded in each session header.
    projects: Dict[str, Dict[str, Any]] = {}
    for report in reports:
        project_cwd = report.get('cwd') or 'unknown-cwd'
        if project_cwd not in projects:
            projects[project_cwd] = {
                'cwd': project_cwd,
                'sessions': 0,
                'sessionsWithRefs': 0,
                'sessionsBlocked': 0,
                'totalRefs': 0,
                'missingRefs': 0,
            }
        bucket = projects[project_cwd]
        bucket['sessions'] += 1
        bucket['totalRefs'] += report['totalRefs']
        bucket['missingRefs'] += report['missingRefs']
        bucket['sessionsWithRefs'] += 1 if report['totalRefs'] > 0 else 0
        bucket['sessionsBlocked'] += 1 if report['missingRefs'] > 0 else 0

    ranked_projects = sorted(
        projects.values(),
        key=lambda item: (item['sessionsBlocked'], item['missingRefs']),
        reverse=True,
    )
    ranked_blocked = sorted(
        blocked,
        key=lambda item: (item['missingRefs'], item['timestamp'] or ''),
        reverse=True,
    )

    return {
        'scope': 'all' if include_all else 'cwd',
        'sessionRoot': str(session_root),
        'cwd': str(cwd),
        'totalSessions': len(reports),
        'sessionsReady': ready_count,
        'sessionsWithRefs': with_refs_count,
        'sessionsBlocked': len(blocked),
        'totalRefs': total_refs,
        'existingRefs': existing_refs,
        'missingRefs': missing_refs,
        'missingRatio': (missing_refs / total_refs) if total_refs else 0.0,
        'projects': ranked_projects,
        'blockedSessions': ranked_blocked,
    }
638
+
639
+
640
def print_health_report(payload: Dict[str, Any], as_json: bool) -> int:
    """Print a health report to stdout and return exit code 0.

    With ``as_json`` the payload is printed as indented JSON. Otherwise the
    totals are printed, followed by at most ten projects and at most ten
    blocked sessions.
    """
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return 0

    totals = (
        ('scope', payload['scope']),
        ('session root', payload['sessionRoot']),
        ('cwd', payload['cwd']),
        ('total sessions', payload['totalSessions']),
        ('sessions ready', payload['sessionsReady']),
        ('sessions with external refs', payload['sessionsWithRefs']),
        ('sessions blocked by missing refs', payload['sessionsBlocked']),
        ('total refs', payload['totalRefs']),
        ('existing refs', payload['existingRefs']),
        ('missing refs', payload['missingRefs']),
    )
    for label, value in totals:
        print(f"{label}: {value}")
    print(f"missing ratio: {payload['missingRatio']:.2%}")

    projects = payload['projects']
    if projects:
        print('projects:')
        for project in projects[:10]:
            print(
                f" - {project['cwd']}: sessions={project['sessions']}, with_refs={project['sessionsWithRefs']}, "
                f"blocked={project['sessionsBlocked']}, missing_refs={project['missingRefs']}"
            )

    blocked_sessions = payload['blockedSessions']
    if blocked_sessions:
        print('blocked sessions:')
        for report in blocked_sessions[:10]:
            print(
                f" - {report['timestamp']} id:{report['sessionId']} missing={report['missingRefs']} path={report['path']}"
            )
    return 0
669
+
670
+
671
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface and return the process exit code.

    Args:
        argv: Arguments to parse, without the program name. ``None`` means
            ``sys.argv[1:]``. An explicitly passed empty sequence is honoured
            and runs the default export command with no options.

    Returns:
        The exit code of the executed command, or 1 when a
        ``SessionExportError`` is raised (its message is printed to stderr).
    """
    # Compare against None: an empty list is falsy, so `argv or sys.argv[1:]`
    # would replace a caller's explicit [] with the process arguments.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    session_root = Path(args.session_root).expanduser().resolve()
    cwd = Path(args.cwd).expanduser().resolve()

    try:
        if args.command == 'list':
            sessions = list_all_sessions(session_root) if args.all else list_project_sessions(session_root, cwd)
            return print_list(sessions, args.json)

        if args.command == 'verify':
            result = verify_export(Path(args.path).expanduser().resolve())
            return print_verify_result(result, args.json)

        if args.command == 'health':
            result = build_health_report(session_root, cwd, args.all)
            return print_health_report(result, args.json)

        # Default command: export a single session.
        session_meta = resolve_session(session_root, cwd, args.session, args.select)
        exported = export_session(session_meta, args.mode)
        output_path = default_output_path(exported, args.out)
        write_export(exported, output_path)
        result = format_export_result(exported, output_path)
        return print_result(result, args.json)
    except SessionExportError as exc:
        print(f'error: {exc}', file=sys.stderr)
        return 1
698
+
699
+
700
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())