agentic-dataset-builder 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +125 -0
- package/agentic_dataset/__init__.py +1 -0
- package/agentic_dataset/build_agentic_dataset.py +368 -0
- package/agentic_dataset/export_codex_session_to_qwen35.py +466 -0
- package/agentic_dataset/export_pi_session.py +701 -0
- package/agentic_dataset/export_pi_session_to_qwen35.py +742 -0
- package/agentic_dataset/export_qwen35_training.py +1559 -0
- package/agentic_dataset/label_qwen35_agentic.py +156 -0
- package/agentic_dataset/platform_paths.py +85 -0
- package/agentic_dataset/qwen35_training_record.py +179 -0
- package/bin/agentic-dataset-builder.js +77 -0
- package/package.json +40 -0
- package/requirements.txt +2 -0
- package/run.py +8 -0
|
@@ -0,0 +1,701 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import base64
|
|
6
|
+
import copy
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from .platform_paths import default_pi_session_root
|
|
17
|
+
except ImportError: # pragma: no cover - direct script execution fallback
|
|
18
|
+
from platform_paths import default_pi_session_root # type: ignore
|
|
19
|
+
|
|
20
|
+
DEFAULT_SESSION_ROOT = default_pi_session_root()
|
|
21
|
+
EXPORT_VERSION = 1
|
|
22
|
+
TEXT_SAMPLE_LIMIT = 4000
|
|
23
|
+
RAW_MODE = 'raw'
|
|
24
|
+
FULL_MODE = 'full'
|
|
25
|
+
SUPPORTED_MODES = (FULL_MODE, RAW_MODE)
|
|
26
|
+
ARTIFACT_KEY = 'fullOutputPath'
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class SessionExportError(RuntimeError):
    """Raised for expected, user-facing failures while listing, exporting,
    or verifying Pi sessions; main() converts it to a stderr line + exit 1."""
    pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse CLI arguments.

    An optional leading token selects the subcommand ('list', 'verify',
    'health'); anything else runs the default 'export' command. The chosen
    command is attached to the returned namespace as ``args.command``.
    """
    argv_list = list(argv)
    command = 'export'
    if argv_list and argv_list[0] in {'list', 'verify', 'health'}:
        command, argv_list = argv_list[0], argv_list[1:]

    parser = argparse.ArgumentParser(
        description='Export Pi sessions into self-contained JSONL files.',
    )
    # Shared options, available to every command.
    parser.add_argument('--session-root', default=str(DEFAULT_SESSION_ROOT), help='Pi session root directory.')
    parser.add_argument('--cwd', default=os.getcwd(), help='Project directory used for default session lookup.')

    # Command-specific options; '--json' is common to all commands and is
    # registered last so help ordering matches the original layout.
    if command == 'list':
        parser.add_argument('--all', action='store_true', help='List sessions across every project directory.')
    elif command == 'verify':
        parser.add_argument('path', help='Exported full-session JSONL to verify.')
    elif command == 'health':
        parser.add_argument('--all', action='store_true', help='Scan every project directory instead of only the current cwd.')
    else:
        parser.add_argument('--session', help='Session file path, session UUID, or unique UUID prefix.')
        parser.add_argument('--select', action='store_true', help='Interactively choose a session before exporting.')
        parser.add_argument('--mode', choices=SUPPORTED_MODES, default=FULL_MODE, help='Export mode.')
        parser.add_argument('--out', help='Output file or directory. Defaults to ./pi-session-exports/.')
    parser.add_argument('--json', action='store_true', help='Emit machine-readable JSON.')

    args = parser.parse_args(argv_list)
    args.command = command
    return args
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def session_dir_name(cwd: Path) -> str:
    """Map a project directory to its session-folder name.

    '/a/b/c' becomes '--a-b-c--': leading/trailing slashes stripped, the
    remaining separators replaced with dashes, wrapped in double dashes.
    """
    resolved = str(cwd.resolve()).strip('/')
    return '--{}--'.format(resolved.replace('/', '-'))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def iter_session_files(session_root: Path) -> Iterator[Path]:
    """Yield every '*.jsonl' session file under the root (none if absent)."""
    if session_root.exists():
        yield from session_root.rglob('*.jsonl')
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def session_sort_key(path: Path) -> Tuple[str, str]:
    """Stable sort key: file name first, full path string as tie-breaker."""
    return path.name, str(path)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Parse a JSONL file into a list of dicts, skipping blank lines.

    Raises:
        SessionExportError: on malformed JSON (with line number) or when the
            file yields no records at all.
    """
    records: List[Dict[str, Any]] = []
    with path.open('r', encoding='utf-8') as fh:
        for number, raw in enumerate(fh, start=1):
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError as exc:
                raise SessionExportError(f'Invalid JSON at {path}:{number}: {exc}') from exc
    if not records:
        raise SessionExportError(f'Session file is empty: {path}')
    return records
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def load_session_metadata(path: Path) -> Dict[str, Any]:
    """Summarize one session file: header fields, latest session name, and a
    preview of the first user message."""
    entries = read_jsonl(path)
    header = entries[0]
    name: Optional[str] = None
    preview: Optional[str] = None
    for entry in entries[1:]:
        entry_type = entry.get('type')
        if entry_type == 'session_info':
            candidate = entry.get('name')
            # Later session_info entries overwrite earlier names.
            if isinstance(candidate, str) and candidate.strip():
                name = candidate.strip()
        elif preview is None and entry_type == 'message':
            message = entry.get('message')
            if isinstance(message, dict) and message.get('role') == 'user':
                preview = extract_message_text(message.get('content'))
    return {
        'path': path,
        'header': header,
        'session_id': header.get('id'),
        'cwd': header.get('cwd'),
        'timestamp': header.get('timestamp'),
        'name': name,
        'preview': preview,
    }
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def extract_message_text(content: Any) -> Optional[str]:
    """Pull a short text preview out of message content.

    Content may be a plain string or a list of block dicts carrying 'text'
    fields; collection stops once about 160 characters have been gathered.
    """
    if isinstance(content, str):
        return compact_text(content)
    if not isinstance(content, list):
        return None
    collected: List[str] = []
    for item in content:
        if not isinstance(item, dict):
            continue
        text = item.get('text')
        if not (isinstance(text, str) and text.strip()):
            continue
        collected.append(text.strip())
        if len(' '.join(collected)) >= 160:
            break
    return compact_text(' '.join(collected)) if collected else None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def compact_text(value: str, limit: int = 80) -> str:
    """Collapse all whitespace runs to single spaces and clip to *limit*
    characters, marking truncation with '...'."""
    flattened = ' '.join(value.split())
    return flattened if len(flattened) <= limit else flattened[: limit - 3] + '...'
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def list_project_sessions(session_root: Path, cwd: Path) -> List[Dict[str, Any]]:
    """Load metadata for every session belonging to *cwd*, newest file first."""
    directory = session_root / session_dir_name(cwd)
    if not directory.exists():
        return []
    ordered = sorted(directory.glob('*.jsonl'), key=session_sort_key, reverse=True)
    return [load_session_metadata(candidate) for candidate in ordered]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def list_all_sessions(session_root: Path) -> List[Dict[str, Any]]:
    """Load metadata for every session under the root, newest file name first."""
    return [
        load_session_metadata(candidate)
        for candidate in sorted(iter_session_files(session_root), key=session_sort_key, reverse=True)
    ]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def resolve_session(session_root: Path, cwd: Path, session_value: Optional[str], select: bool) -> Dict[str, Any]:
    """Pick the session to export: interactive prompt, explicit selector, or
    the most recent session for *cwd* (in that priority order).

    Raises:
        SessionExportError: when no candidate session can be found.
    """
    if select:
        candidates = list_project_sessions(session_root, cwd)
        if not candidates:
            raise SessionExportError(f'No sessions found for cwd {cwd}')
        return select_session_interactively(candidates)

    if session_value:
        return load_session_metadata(resolve_session_value(session_root, cwd, session_value))

    candidates = list_project_sessions(session_root, cwd)
    if candidates:
        return candidates[0]
    raise SessionExportError(
        f'No sessions found for cwd {cwd}. Use --session to specify one or --select to choose interactively.'
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def resolve_session_value(session_root: Path, cwd: Path, value: str) -> Path:
    """Resolve *value* (path, UUID, or UUID prefix) to exactly one session file.

    Resolution order: literal filesystem path, path relative to *cwd*, then a
    scan of the session root matching by file name or by the header's session
    id prefix.

    Raises:
        SessionExportError: if no session matches, or more than one does.
    """
    candidate_path = Path(value).expanduser()
    if candidate_path.exists():
        return candidate_path.resolve()

    cwd_relative = (cwd / value).expanduser()
    if cwd_relative.exists():
        return cwd_relative.resolve()

    matches: List[Path] = []
    for path in iter_session_files(session_root):
        name = path.name
        # File-name match is cheap; only fall through to reading the header
        # when the name itself does not contain the selector.
        if name.endswith(f'_{value}.jsonl') or value in name:
            matches.append(path)
            continue
        try:
            header = read_jsonl(path)[0]
        except SessionExportError:
            # Unreadable or empty session files are skipped, not fatal.
            continue
        session_id = header.get('id')
        if isinstance(session_id, str) and session_id.startswith(value):
            matches.append(path)
    unique_matches = dedupe_paths(matches)
    if not unique_matches:
        raise SessionExportError(f'Could not resolve session {value!r}')
    if len(unique_matches) > 1:
        raise SessionExportError(
            f'Session selector {value!r} matched multiple files: ' + ', '.join(str(path) for path in unique_matches[:5])
        )
    return unique_matches[0]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def dedupe_paths(paths: Iterable[Path]) -> List[Path]:
    """Return resolved paths with duplicates removed, preserving first-seen order.

    Fix: the original called ``path.resolve()`` twice per path (once for the
    seen-set key and again for the output list); each resolve can hit the
    filesystem, so resolve exactly once per input.
    """
    seen: set[str] = set()
    output: List[Path] = []
    for path in paths:
        resolved = path.resolve()
        key = str(resolved)
        if key not in seen:
            seen.add(key)
            output.append(resolved)
    return output
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def select_session_interactively(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Prompt on stderr until the user picks a valid 1-based session number."""
    print('Choose a session to export:', file=sys.stderr)
    for position, meta in enumerate(candidates, start=1):
        print(f' {position:>2}. {session_display_label(meta)}', file=sys.stderr)
    total = len(candidates)
    while True:
        answer = input('Enter number: ').strip()
        if not answer:
            continue
        if not answer.isdigit():
            print('Please enter a numeric choice.', file=sys.stderr)
            continue
        picked = int(answer)
        if 1 <= picked <= total:
            return candidates[picked - 1]
        print(f'Please choose between 1 and {total}.', file=sys.stderr)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def session_display_label(meta: Dict[str, Any]) -> str:
    """One-line human label: timestamp, name/preview, session id, cwd."""
    header = meta['header']
    timestamp = meta.get('timestamp') or 'unknown-time'
    title = meta.get('name') or meta.get('preview') or 'unnamed'
    session_id = header.get('id', 'unknown')
    cwd = meta.get('cwd') or 'unknown-cwd'
    return f'{timestamp} {compact_text(title, limit=48)} id:{session_id} cwd:{cwd}'
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def file_sha256_bytes(data: bytes) -> str:
    """Hex SHA-256 digest of an in-memory byte string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def file_sha256(path: Path) -> str:
    """Hex SHA-256 of a file's contents, streamed in 1 MiB chunks so large
    files never load fully into memory."""
    digest = hashlib.sha256()
    with path.open('rb') as stream:
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def make_artifact_payload(path: Path) -> Dict[str, Any]:
    """Embed a file as a JSON-safe payload: source path, size, sha256, and
    either utf-8 text (with a preview) or base64-encoded binary content."""
    data = path.read_bytes()
    payload: Dict[str, Any] = {
        'sourcePath': str(path),
        'size': len(data),
        'sha256': file_sha256_bytes(data),
    }
    try:
        decoded = data.decode('utf-8')
    except UnicodeDecodeError:
        # Binary content: fall back to base64 so the export stays valid JSON.
        payload['encoding'] = 'base64'
        payload['base64'] = base64.b64encode(data).decode('ascii')
    else:
        payload['encoding'] = 'utf-8'
        payload['text'] = decoded
        payload['textPreview'] = compact_text(decoded, limit=TEXT_SAMPLE_LIMIT)
    return payload
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def build_export_info(session_path: Path, source_sha256: str, mode: str, artifact_count: int, missing_count: int) -> Dict[str, Any]:
    """Build the 'exportInfo' header metadata describing one export run."""
    info: Dict[str, Any] = {
        'format': 'pi-session-full-export',
        'version': EXPORT_VERSION,
        'mode': mode,
        'exportedAt': datetime.now(timezone.utc).isoformat(),
        'sourceSessionPath': str(session_path),
        'sourceSessionSha256': source_sha256,
        'fullyEmbedded': mode == FULL_MODE,
        'artifactKey': ARTIFACT_KEY,
        'embeddedArtifactCount': artifact_count,
        'missingArtifactCount': missing_count,
    }
    return info
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def export_session(session_meta: Dict[str, Any], mode: str) -> Dict[str, Any]:
    """Load a session file and (in full mode) embed its artifact payloads.

    Returns a dict holding the annotated header, body entries, artifact
    counts, and source checksum; nothing is written to disk here.
    """
    session_path = Path(session_meta['path']).resolve()
    entries = read_jsonl(session_path)
    source_sha = file_sha256(session_path)
    # Deep copies so embedding never mutates the freshly parsed entries.
    header = copy.deepcopy(entries[0])
    body = [copy.deepcopy(entry) for entry in entries[1:]]

    artifact_count = 0
    missing_artifacts: List[Dict[str, Any]] = []

    # Raw mode skips embedding entirely; counts stay at zero.
    if mode == FULL_MODE:
        for entry in body:
            artifact_count += embed_entry_artifacts(entry, missing_artifacts)

    export_info = build_export_info(session_path, source_sha, mode, artifact_count, len(missing_artifacts))
    if missing_artifacts:
        export_info['missingArtifacts'] = missing_artifacts
    header['exportInfo'] = export_info

    return {
        'header': header,
        'entries': body,
        'artifactCount': artifact_count,
        'missingArtifacts': missing_artifacts,
        'sourceSha256': source_sha,
        'sessionPath': session_path,
        'sessionId': header.get('id'),
    }
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def embed_entry_artifacts(entry: Dict[str, Any], missing_artifacts: List[Dict[str, Any]]) -> int:
    """Recursively embed every ARTIFACT_KEY file reference found in *entry*.

    For each reference whose file exists, a sibling '<key>Embedded' payload is
    added next to it (mutating *entry* in place); unresolvable references are
    appended to *missing_artifacts*. Returns the number embedded.
    """
    count = 0

    def walk(node: Any, trail: List[str]) -> None:
        nonlocal count
        if isinstance(node, dict):
            # list() snapshot: we insert '<key>Embedded' keys while iterating.
            for key, value in list(node.items()):
                current_trail = trail + [key]
                if key == ARTIFACT_KEY and isinstance(value, str):
                    source = Path(value).expanduser()
                    embedded_key = f'{key}Embedded'
                    if source.is_file():
                        node[embedded_key] = make_artifact_payload(source)
                        count += 1
                    else:
                        missing_artifacts.append(
                            {
                                'path': value,
                                'reason': 'missing_or_not_file',
                                'entryType': entry.get('type'),
                                'entryId': entry.get('id'),
                                'field': '.'.join(current_trail),
                            }
                        )
                walk(value, current_trail)
        elif isinstance(node, list):
            for index, item in enumerate(node):
                walk(item, trail + [str(index)])

    walk(entry, [])
    return count
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def default_output_path(exported: Dict[str, Any], requested_out: Optional[str]) -> Path:
    """Decide where the export file lands.

    With no ``requested_out`` the file goes under ./pi-session-exports/.
    A requested path is treated as a directory when it exists as one, ends
    with the path separator, or has no file suffix; otherwise it is used as
    the output file verbatim.

    Fix: removed the dead local ``session_id`` the original computed but
    never used.
    """
    source_path = Path(exported['sessionPath'])
    base_name = source_path.stem
    mode = exported['header'].get('exportInfo', {}).get('mode')
    suffix = '.full.jsonl' if mode == FULL_MODE else '.raw.jsonl'
    generated = f'{base_name}{suffix}'

    if not requested_out:
        return (Path.cwd() / 'pi-session-exports' / generated).resolve()

    out_path = Path(requested_out).expanduser()
    treat_as_dir = (
        (out_path.exists() and out_path.is_dir())
        or requested_out.endswith(os.sep)
        or not out_path.suffix
    )
    if treat_as_dir:
        return (out_path / generated).resolve()
    return out_path.resolve()
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def write_export(exported: Dict[str, Any], output_path: Path) -> None:
    """Write the header plus every entry as JSONL, creating parent dirs."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    records = [exported['header'], *exported['entries']]
    with output_path.open('w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in records)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def verify_export(path: Path) -> Dict[str, Any]:
    """Check an exported JSONL for a sane header and fully-embedded artifacts.

    Returns a summary dict whose 'ok' is True only when no issues were found.
    """
    entries = read_jsonl(path)
    header = entries[0]
    export_info = header.get('exportInfo')
    issues: List[str] = []

    if not isinstance(export_info, dict):
        issues.append('Missing exportInfo header.')
    else:
        if export_info.get('format') != 'pi-session-full-export':
            issues.append('Unexpected exportInfo.format.')
        if export_info.get('mode') == FULL_MODE and not export_info.get('fullyEmbedded'):
            issues.append('Full export is not marked fullyEmbedded.')

    embedded_count = 0
    missing_refs: List[Dict[str, Any]] = []
    for entry in entries[1:]:
        count, missing = verify_entry_artifacts(entry)
        embedded_count += count
        missing_refs.extend(missing)

    if export_info and export_info.get('mode') == FULL_MODE and missing_refs:
        issues.append('Found external artifact references without embedded payloads.')

    mode = export_info.get('mode') if isinstance(export_info, dict) else None
    return {
        'path': str(path),
        'sessionId': header.get('id'),
        'mode': mode,
        'embeddedArtifactCount': embedded_count,
        'missingEmbeddedArtifacts': missing_refs,
        'issues': issues,
        'ok': not issues,
    }
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def verify_entry_artifacts(entry: Dict[str, Any]) -> Tuple[int, List[Dict[str, Any]]]:
    """Count embedded artifact payloads in *entry* and list references that
    lack a matching '<key>Embedded' sibling."""
    embedded_count = 0
    missing: List[Dict[str, Any]] = []

    def visit(node: Any, trail: List[str]) -> None:
        nonlocal embedded_count
        if isinstance(node, list):
            for position, item in enumerate(node):
                visit(item, trail + [str(position)])
            return
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            branch = trail + [key]
            if key == ARTIFACT_KEY and isinstance(value, str):
                if node.get(f'{key}Embedded') is None:
                    missing.append(
                        {
                            'entryId': entry.get('id'),
                            'entryType': entry.get('type'),
                            'field': '.'.join(branch),
                            'path': value,
                        }
                    )
                else:
                    embedded_count += 1
            visit(value, branch)

    visit(entry, [])
    return embedded_count, missing
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def format_export_result(exported: Dict[str, Any], output_path: Path) -> Dict[str, Any]:
    """Flatten an export_session() result into the dict shown to the user."""
    info = exported['header']['exportInfo']
    return {
        'source': str(exported['sessionPath']),
        'output': str(output_path),
        'sessionId': exported.get('sessionId'),
        'mode': info.get('mode'),
        'embeddedArtifacts': exported.get('artifactCount', 0),
        'missingArtifacts': exported.get('missingArtifacts', []),
        'sourceSha256': exported.get('sourceSha256'),
    }
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def print_list(sessions: List[Dict[str, Any]], as_json: bool) -> int:
    """Print session summaries as JSON or a human-readable list; returns 0."""
    rows = [
        {
            'path': str(meta['path']),
            'sessionId': meta['session_id'],
            'timestamp': meta['timestamp'],
            'cwd': meta['cwd'],
            'name': meta['name'],
            'preview': meta['preview'],
        }
        for meta in sessions
    ]
    if as_json:
        print(json.dumps(rows, ensure_ascii=False, indent=2))
        return 0
    if not rows:
        print('No sessions found.')
        return 0
    for position, row in enumerate(rows, start=1):
        title = row['name'] or row['preview'] or 'unnamed'
        print(f'{position:>2}. {row["timestamp"]} id:{row["sessionId"]} {compact_text(title, 60)}')
        print(f' cwd: {row["cwd"]}')
        print(f' path: {row["path"]}')
    return 0
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def print_result(payload: Dict[str, Any], as_json: bool) -> int:
    """Report an export outcome; JSON when requested, key/value lines otherwise."""
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return 0
    for label, key in (
        ('source', 'source'),
        ('output', 'output'),
        ('session id', 'sessionId'),
        ('mode', 'mode'),
        ('embedded artifacts', 'embeddedArtifacts'),
    ):
        print(f'{label}: {payload[key]}')
    print(f"missing artifacts: {len(payload['missingArtifacts'])}")
    # At most ten missing-artifact details to keep the output bounded.
    for item in payload['missingArtifacts'][:10]:
        print(f" - {item['path']} ({item['field']})")
    return 0
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def print_verify_result(payload: Dict[str, Any], as_json: bool) -> int:
    """Report verification results; exit status is 0 only when payload['ok']."""
    status = 0 if payload['ok'] else 1
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return status
    print(f"path: {payload['path']}")
    print(f"session id: {payload['sessionId']}")
    print(f"mode: {payload['mode']}")
    print(f"embedded artifacts: {payload['embeddedArtifactCount']}")
    print(f"ok: {'yes' if payload['ok'] else 'no'}")
    if payload['issues']:
        print('issues:')
        for issue in payload['issues']:
            print(f' - {issue}')
    if payload['missingEmbeddedArtifacts']:
        print('missing embedded artifacts:')
        for item in payload['missingEmbeddedArtifacts'][:10]:
            print(f" - {item['path']} ({item['field']})")
    return status
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def analyze_session_health(path: Path) -> Dict[str, Any]:
    """Summarize how exportable a session is: which artifact refs still exist.

    Fixes vs. original: each reference's existence is now checked once (the
    original built two list comprehensions, calling ``Path(...).is_file()``
    twice per ref), and paths are expanded with ``expanduser`` so the health
    check agrees with how embed_entry_artifacts resolves them during export.
    """
    entries = read_jsonl(path)
    header = entries[0]
    refs: List[Dict[str, Any]] = []
    for entry in entries[1:]:
        refs.extend(find_artifact_refs(entry))

    # Single pass over refs, one stat per ref.
    existing = sum(1 for ref in refs if Path(ref['path']).expanduser().is_file())
    missing = len(refs) - existing
    return {
        'path': str(path),
        'sessionId': header.get('id'),
        'cwd': header.get('cwd'),
        'timestamp': header.get('timestamp'),
        'totalRefs': len(refs),
        'existingRefs': existing,
        'missingRefs': missing,
        'ready': missing == 0,
        'refs': refs,
    }
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def find_artifact_refs(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect every ARTIFACT_KEY string reference found anywhere in *entry*."""
    refs: List[Dict[str, Any]] = []

    def visit(node: Any, trail: List[str]) -> None:
        if isinstance(node, list):
            for position, item in enumerate(node):
                visit(item, trail + [str(position)])
            return
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            branch = trail + [key]
            if key == ARTIFACT_KEY and isinstance(value, str):
                refs.append(
                    {
                        'entryId': entry.get('id'),
                        'entryType': entry.get('type'),
                        'field': '.'.join(branch),
                        'path': value,
                    }
                )
            else:
                visit(value, branch)

    visit(entry, [])
    return refs
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def build_health_report(session_root: Path, cwd: Path, include_all: bool) -> Dict[str, Any]:
    """Aggregate per-session artifact health into totals and per-project rollups.

    include_all=True scans every project under the root; otherwise only the
    sessions belonging to *cwd* are analyzed.
    """
    sessions = list_all_sessions(session_root) if include_all else list_project_sessions(session_root, cwd)
    session_paths = [Path(meta['path']) for meta in sessions]
    reports = [analyze_session_health(path) for path in session_paths]

    total_sessions = len(reports)
    sessions_with_refs = [report for report in reports if report['totalRefs'] > 0]
    sessions_ready = [report for report in reports if report['ready']]
    sessions_blocked = [report for report in reports if report['missingRefs'] > 0]
    total_refs = sum(report['totalRefs'] for report in reports)
    existing_refs = sum(report['existingRefs'] for report in reports)
    missing_refs = sum(report['missingRefs'] for report in reports)

    # Roll up per-project counters keyed by each session's recorded cwd.
    projects: Dict[str, Dict[str, Any]] = {}
    for report in reports:
        key = report.get('cwd') or 'unknown-cwd'
        project = projects.setdefault(
            key,
            {
                'cwd': key,
                'sessions': 0,
                'sessionsWithRefs': 0,
                'sessionsBlocked': 0,
                'totalRefs': 0,
                'missingRefs': 0,
            },
        )
        project['sessions'] += 1
        project['totalRefs'] += report['totalRefs']
        project['missingRefs'] += report['missingRefs']
        if report['totalRefs'] > 0:
            project['sessionsWithRefs'] += 1
        if report['missingRefs'] > 0:
            project['sessionsBlocked'] += 1

    return {
        'scope': 'all' if include_all else 'cwd',
        'sessionRoot': str(session_root),
        'cwd': str(cwd),
        'totalSessions': total_sessions,
        'sessionsReady': len(sessions_ready),
        'sessionsWithRefs': len(sessions_with_refs),
        'sessionsBlocked': len(sessions_blocked),
        'totalRefs': total_refs,
        'existingRefs': existing_refs,
        'missingRefs': missing_refs,
        'missingRatio': (missing_refs / total_refs) if total_refs else 0.0,
        # Worst offenders first for both listings.
        'projects': sorted(projects.values(), key=lambda item: (item['sessionsBlocked'], item['missingRefs']), reverse=True),
        'blockedSessions': sorted(sessions_blocked, key=lambda item: (item['missingRefs'], item['timestamp'] or ''), reverse=True),
    }
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def print_health_report(payload: Dict[str, Any], as_json: bool) -> int:
    """Print the artifact-health report; JSON or human-readable. Returns 0."""
    if as_json:
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return 0
    for line in (
        f"scope: {payload['scope']}",
        f"session root: {payload['sessionRoot']}",
        f"cwd: {payload['cwd']}",
        f"total sessions: {payload['totalSessions']}",
        f"sessions ready: {payload['sessionsReady']}",
        f"sessions with external refs: {payload['sessionsWithRefs']}",
        f"sessions blocked by missing refs: {payload['sessionsBlocked']}",
        f"total refs: {payload['totalRefs']}",
        f"existing refs: {payload['existingRefs']}",
        f"missing refs: {payload['missingRefs']}",
        f"missing ratio: {payload['missingRatio']:.2%}",
    ):
        print(line)
    if payload['projects']:
        print('projects:')
        for project in payload['projects'][:10]:
            print(
                f" - {project['cwd']}: sessions={project['sessions']}, with_refs={project['sessionsWithRefs']}, "
                f"blocked={project['sessionsBlocked']}, missing_refs={project['missingRefs']}"
            )
    if payload['blockedSessions']:
        print('blocked sessions:')
        for report in payload['blockedSessions'][:10]:
            print(
                f" - {report['timestamp']} id:{report['sessionId']} missing={report['missingRefs']} path={report['path']}"
            )
    return 0
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """CLI entry point: dispatch list/verify/health/export.

    Returns a process exit status (0 on success; 1 on any expected failure,
    which is reported as a single 'error: ...' line on stderr).
    """
    args = parse_args(argv or sys.argv[1:])
    session_root = Path(args.session_root).expanduser().resolve()
    cwd = Path(args.cwd).expanduser().resolve()

    try:
        if args.command == 'list':
            sessions = list_all_sessions(session_root) if args.all else list_project_sessions(session_root, cwd)
            return print_list(sessions, args.json)

        if args.command == 'verify':
            result = verify_export(Path(args.path).expanduser().resolve())
            return print_verify_result(result, args.json)

        if args.command == 'health':
            result = build_health_report(session_root, cwd, args.all)
            return print_health_report(result, args.json)

        # Default command: resolve, export, and write a session to JSONL.
        session_meta = resolve_session(session_root, cwd, args.session, args.select)
        exported = export_session(session_meta, args.mode)
        output_path = default_output_path(exported, args.out)
        write_export(exported, output_path)
        result = format_export_result(exported, output_path)
        return print_result(result, args.json)
    except SessionExportError as exc:
        # All expected failures surface as one stderr line + exit code 1.
        print(f'error: {exc}', file=sys.stderr)
        return 1
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
# Script entry point: exit the process with main()'s status code.
if __name__ == '__main__':
    raise SystemExit(main())
|