agentic-dataset-builder 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,368 +0,0 @@
1
- #!/usr/bin/env python3
2
- from __future__ import annotations
3
-
4
- import argparse
5
- import gzip
6
- import json
7
- import shutil
8
- import subprocess
9
- import sys
10
- from collections import Counter
11
- from datetime import datetime, timezone
12
- from pathlib import Path
13
- from typing import Any, Dict, Iterable, List, Sequence, Tuple
14
-
15
- from .export_qwen35_training import append_parquet_rows, ensure_parquet_runtime, record_to_parquet_row
16
- from .platform_paths import default_codex_session_root, default_pi_session_root
17
-
18
-
19
def python_entry(module_name: str, script_dir: Path) -> List[str]:
    """Build a command prefix that runs *module_name* with the current interpreter.

    When this file runs as part of a package, prefer ``python -m pkg.module``;
    otherwise invoke the sibling ``<module_name>.py`` file inside *script_dir*.
    """
    if not __package__:
        return [sys.executable, str(script_dir / f'{module_name}.py')]
    root = __package__.split('.')[0]
    return [sys.executable, '-m', f'{root}.{module_name}']
24
-
25
-
26
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse command-line options for the merged agentic dataset build."""
    parser = argparse.ArgumentParser(description='Build one merged agentic dataset from local Pi and Codex sessions.')
    # Local alias keeps each option definition on one compact line.
    add = parser.add_argument
    add('--pi-root', help='Pi session root. Defaults to auto-detected OS-specific location.')
    add('--codex-root', help='Codex session root. Defaults to auto-detected OS-specific location.')
    add('--output-root', required=True, help='Output root directory.')
    add('--include-sources', default='pi,codex', help='Comma-separated sources to include: pi,codex')
    add('--include-labels', default='cot_eligible,agent_only', help='Comma-separated labels to keep in final dataset.')
    add('--pi-session-root-override', help='Override session root passed to export_pi_session.py health check.')
    add('--skip-pi-health-check', action='store_true', help='Skip Pi health scan and export only by direct conversion assumptions.')
    add('--codex-limit', type=int, default=0, help='Optional limit for Codex session files.')
    add('--jsonl-only', action='store_true', help='Use JSONL intermediates only.')
    add('--final-format', choices=('jsonl', 'parquet', 'both'), default='parquet', help='Final merged dataset output format.')
    add('--keep-intermediates', action='store_true', help='Keep intermediate export and label directories instead of deleting them after a successful build.')
    return parser.parse_args(argv)
40
-
41
-
42
def print_progress(step: str, message: str) -> None:
    """Emit one flushed, step-tagged progress line to stdout."""
    sys.stdout.write(f'[{step}] {message}\n')
    sys.stdout.flush()
44
-
45
-
46
def run_cmd(command: List[str], cwd: Path, step: str, log_path: Path) -> None:
    """Run *command* in *cwd*, mirroring its combined output to stdout and *log_path*.

    The log file is opened in append mode and receives a ``## step`` header
    followed by the raw command and every output line.

    Raises:
        subprocess.CalledProcessError: when the command exits non-zero.
    """
    shown = ' '.join(command)
    print_progress(step, 'running ' + shown)
    with log_path.open('a', encoding='utf-8') as log_file:
        log_file.write(f'\n## {step}\n$ {shown}\n')
        # Merge stderr into stdout so the log captures a single ordered stream.
        proc = subprocess.Popen(
            command,
            cwd=str(cwd),
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        assert proc.stdout is not None
        for raw_line in proc.stdout:
            log_file.write(raw_line)
            print_progress(step, raw_line.rstrip('\n'))
        status = proc.wait()
        if status != 0:
            raise subprocess.CalledProcessError(status, command)
66
-
67
-
68
def latest_dir(root: Path, pattern: str) -> Path:
    """Return the lexicographically newest path matching *pattern* under *root*.

    Raises:
        FileNotFoundError: when nothing matches.
    """
    # max() over the glob is equivalent to sorted(...)[-1] for Path ordering.
    newest = max(root.glob(pattern), default=None)
    if newest is None:
        raise FileNotFoundError(f'No directories match {pattern} under {root}')
    return newest
73
-
74
-
75
def load_json(path: Path) -> Dict[str, Any]:
    """Parse *path* as UTF-8 JSON and return the decoded object."""
    with path.open('r', encoding='utf-8') as handle:
        return json.load(handle)
77
-
78
-
79
def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Read a JSON-Lines file, skipping blank lines, and return the decoded rows."""
    with path.open('r', encoding='utf-8') as handle:
        return [json.loads(stripped) for raw in handle if (stripped := raw.strip())]
87
-
88
-
89
def find_pi_sessions(session_root: Path) -> List[Path]:
    """Return every ``*.jsonl`` file under *session_root*, recursively, sorted."""
    matches = list(session_root.rglob('*.jsonl'))
    matches.sort()
    return matches
91
-
92
-
93
def export_all_pi_sessions(script_dir: Path, output_root: Path, session_root: Path, log_path: Path) -> Path:
    """Export every Pi session under *session_root* into ``<output_root>/pi-full-sessions``.

    Returns the directory holding the exported sessions.

    Raises:
        RuntimeError: when no ``*.jsonl`` session files are found.
    """
    session_files = find_pi_sessions(session_root)
    if not session_files:
        raise RuntimeError(f'No Pi sessions found under {session_root}')
    destination = output_root / 'pi-full-sessions'
    destination.mkdir(parents=True, exist_ok=True)
    print_progress('pi-export', f'exporting {len(session_files)} session files')
    for session_file in session_files:
        command = python_entry('export_pi_session', script_dir) + [
            '--session-root', str(session_root),
            '--session', str(session_file),
            '--out', str(destination),
        ]
        run_cmd(command, cwd=script_dir.parent, step='pi-export', log_path=log_path)
    return destination
116
-
117
-
118
def convert_pi(script_dir: Path, full_export_dir: Path, output_root: Path, jsonl_only: bool, log_path: Path) -> Path:
    """Convert the full Pi export into Qwen 3.5 training format.

    Returns the newest ``qwen35-pi-session-*`` directory produced under
    *output_root*.
    """
    output_format = 'jsonl' if jsonl_only else 'both'
    command = python_entry('export_pi_session_to_qwen35', script_dir) + [
        '--input', str(full_export_dir),
        '--output-root', str(output_root),
        '--output-format', output_format,
    ]
    run_cmd(command, cwd=script_dir.parent, step='pi-convert', log_path=log_path)
    return latest_dir(output_root, 'qwen35-pi-session-*')
135
-
136
-
137
def convert_codex(script_dir: Path, codex_root: Path, output_root: Path, jsonl_only: bool, limit: int, log_path: Path) -> Path:
    """Convert Codex session files into Qwen 3.5 training format.

    A positive *limit* caps the number of session files processed. Returns the
    newest ``qwen35-codex-session-*`` directory produced under *output_root*.
    """
    output_format = 'jsonl' if jsonl_only else 'both'
    command = python_entry('export_codex_session_to_qwen35', script_dir)
    command += [
        '--input', str(codex_root),
        '--output-root', str(output_root),
        '--output-format', output_format,
    ]
    if limit > 0:
        command += ['--limit', str(limit)]
    run_cmd(command, cwd=script_dir.parent, step='codex-convert', log_path=log_path)
    return latest_dir(output_root, 'qwen35-codex-session-*')
151
-
152
-
153
def label_export(script_dir: Path, export_dir: Path, output_root: Path, log_path: Path, step_name: str) -> Path:
    """Run the agentic labeler over *export_dir*.

    Returns the newest ``qwen35-agentic-labels-*`` directory produced under
    *output_root*.
    """
    output_root.mkdir(parents=True, exist_ok=True)
    command = python_entry('label_qwen35_agentic', script_dir) + [
        '--input', str(export_dir),
        '--output-root', str(output_root),
    ]
    run_cmd(command, cwd=script_dir.parent, step=step_name, log_path=log_path)
    return latest_dir(output_root, 'qwen35-agentic-labels-*')
168
-
169
-
170
def build_record_index(export_dir: Path) -> Dict[str, Dict[str, Any]]:
    """Index training records by id from the lossless and lossy JSONL files.

    Absent files are skipped. When both files contain the same id, the record
    read later (from the lossy file) wins, matching the iteration order.
    """
    index: Dict[str, Dict[str, Any]] = {}
    candidates = (export_dir / 'qwen35-train.jsonl', export_dir / 'qwen35-train-lossy.jsonl')
    for candidate in candidates:
        if candidate.exists():
            index.update((record['id'], record) for record in load_jsonl(candidate))
    return index
179
-
180
-
181
def merge_labeled_datasets(
    label_dirs: List[Tuple[str, Path]],
    export_dirs: Dict[str, Path],
    keep_labels: set[str],
    output_dir: Path,
    final_format: str,
) -> Dict[str, Any]:
    """Merge labeled records from every source into one final dataset.

    Args:
        label_dirs: (source_name, label_dir) pairs; each label_dir must
            contain a ``labels.jsonl`` file.
        export_dirs: source_name -> export directory holding the training
            JSONL files that the labels reference by id.
        keep_labels: only records whose label is in this set are kept.
        output_dir: directory receiving ``dataset.jsonl``,
            ``dataset.jsonl.gz`` and/or ``dataset.parquet``.
        final_format: 'jsonl', 'parquet', or 'both'.

    Returns:
        Summary dict with output paths, overall stats, and per-source stats.
    """
    dataset_path = output_dir / 'dataset.jsonl'
    dataset_gzip_path = output_dir / 'dataset.jsonl.gz'
    parquet_path = output_dir / 'dataset.parquet'
    stats = Counter()
    source_stats: Dict[str, Counter] = {}
    jsonl_enabled = final_format in {'jsonl', 'both'}
    parquet_enabled = final_format in {'parquet', 'both'}
    # Parquet rows are buffered and flushed in batches via append_parquet_rows,
    # which lazily creates the writer on first use.
    parquet_writer = None
    parquet_batch: List[Dict[str, Any]] = []

    # JSONL outputs (plain and gzip) are only opened when requested.
    out = dataset_path.open('w', encoding='utf-8') if jsonl_enabled else None
    out_gzip = gzip.open(dataset_gzip_path, 'wt', encoding='utf-8') if jsonl_enabled else None
    try:
        for source_name, label_dir in label_dirs:
            labels = load_jsonl(label_dir / 'labels.jsonl')
            record_index = build_record_index(export_dirs[source_name])
            source_counter = Counter()
            for label in labels:
                source_counter['records_seen'] += 1
                stats[f"labels_seen:{label['label']}"] += 1
                # Drop records whose label is not requested.
                if label['label'] not in keep_labels:
                    source_counter['records_skipped'] += 1
                    continue
                # A label without a matching exported record is counted, not fatal.
                record = record_index.get(label['id'])
                if record is None:
                    source_counter['missing_records'] += 1
                    continue
                # Copy the record and its meta so the source index stays untouched.
                merged = dict(record)
                merged_meta = dict(merged.get('meta', {}))
                merged['label'] = label['label']
                merged['source_system'] = source_name
                merged['source_bucket'] = label.get('bucket')
                merged['source_file'] = label.get('source_file')
                # Full labeling detail is preserved under a dedicated key.
                merged['agentic_label'] = {
                    'label': label['label'],
                    'tool_call_count': label.get('tool_call_count'),
                    'tool_message_count': label.get('tool_message_count'),
                    'dialogue_rounds_est': label.get('dialogue_rounds_est'),
                    'reasoning_chars': label.get('reasoning_chars'),
                    'has_reasoning': label.get('has_reasoning'),
                    'lossy_source': label.get('lossy_source'),
                    'lossy_reasons': label.get('lossy_reasons', []),
                }
                # Duplicate key label fields into meta for downstream consumers.
                merged_meta['dataset_label'] = label['label']
                merged_meta['dataset_source_system'] = source_name
                merged_meta['dataset_source_bucket'] = label.get('bucket')
                merged_meta['dataset_source_file'] = label.get('source_file')
                merged_meta['dataset_has_reasoning'] = label.get('has_reasoning')
                merged_meta['dataset_reasoning_chars'] = label.get('reasoning_chars')
                merged['meta'] = merged_meta
                if out is not None:
                    line = json.dumps(merged, ensure_ascii=False) + '\n'
                    out.write(line)
                    if out_gzip is not None:
                        out_gzip.write(line)
                if parquet_enabled:
                    parquet_batch.append(record_to_parquet_row(merged))
                    # Flush in batches of 1000 to bound memory use.
                    if len(parquet_batch) >= 1000:
                        parquet_writer = append_parquet_rows(parquet_writer, parquet_batch, parquet_path)
                        parquet_batch = []
                source_counter['records_kept'] += 1
                source_counter[f"kept:{label['label']}"] += 1
                stats['records_kept'] += 1
                stats[f"kept:{label['label']}"] += 1
            source_stats[source_name] = source_counter
        # Flush any remaining partial parquet batch.
        if parquet_enabled:
            parquet_writer = append_parquet_rows(parquet_writer, parquet_batch, parquet_path)
    finally:
        if out is not None:
            out.close()
        if out_gzip is not None:
            out_gzip.close()
        if parquet_writer is not None:
            parquet_writer.close()

    result = {
        'dataset_path': str(dataset_path) if jsonl_enabled else None,
        'stats': dict(stats),
        'source_stats': {name: dict(counter) for name, counter in source_stats.items()},
    }
    if jsonl_enabled:
        result['dataset_gzip_path'] = str(dataset_gzip_path)
    if parquet_enabled:
        result['dataset_parquet_path'] = str(parquet_path)
    return result
273
-
274
-
275
def main(argv: Sequence[str] | None = None) -> int:
    """Build the merged agentic dataset end to end.

    Orchestrates: optional Pi export/convert/label, optional Codex
    convert/label, merge into the final dataset, intermediate cleanup, and
    manifest writing.

    Args:
        argv: CLI arguments; when None, ``sys.argv[1:]`` is used.

    Returns:
        Process exit code (0 on success).
    """
    # Compare against None explicitly: `argv or sys.argv[1:]` would wrongly
    # fall back to the real process arguments when an explicit empty list is
    # passed (e.g. by tests or embedding callers).
    args = parse_args(argv if argv is not None else sys.argv[1:])
    if args.final_format in {'parquet', 'both'}:
        # Fail fast if the parquet runtime is unavailable.
        ensure_parquet_runtime('parquet')
    script_dir = Path(__file__).resolve().parent
    output_root = Path(args.output_root).expanduser().resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    # Timestamped run directory keeps successive builds side by side.
    run_root = output_root / f'agentic-dataset-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    run_root.mkdir(parents=True, exist_ok=True)
    log_path = run_root / 'run.log'

    include_sources = {item.strip() for item in args.include_sources.split(',') if item.strip()}
    keep_labels = {item.strip() for item in args.include_labels.split(',') if item.strip()}
    pi_root = Path(args.pi_root).expanduser().resolve() if args.pi_root else default_pi_session_root()
    codex_root = Path(args.codex_root).expanduser().resolve() if args.codex_root else default_codex_session_root()
    # NOTE(review): --pi-session-root-override and --skip-pi-health-check are
    # parsed but never consumed here — confirm whether they should be wired in.
    export_dirs: Dict[str, Path] = {}
    label_dirs: List[Tuple[str, Path]] = []
    manifest: Dict[str, Any] = {
        'run_dir': str(run_root),
        'run_log': str(log_path),
        'pi_root': str(pi_root),
        'codex_root': str(codex_root),
        'include_sources': sorted(include_sources),
        'keep_labels': sorted(keep_labels),
        'keep_intermediates': args.keep_intermediates,
        'steps': {},
    }
    cleanup_paths: List[str] = []

    if 'pi' in include_sources:
        pi_run_root = run_root / 'pi'
        pi_run_root.mkdir(parents=True, exist_ok=True)
        full_dir = export_all_pi_sessions(script_dir, pi_run_root, pi_root, log_path)
        pi_export_dir = convert_pi(script_dir, full_dir, pi_run_root, args.jsonl_only, log_path)
        pi_label_dir = label_export(script_dir, pi_export_dir, pi_run_root / 'labels', log_path, 'pi-label')
        export_dirs['pi'] = pi_export_dir
        label_dirs.append(('pi', pi_label_dir))
        cleanup_paths.append(str(pi_run_root))
        manifest['steps']['pi'] = {
            'full_export_dir': str(full_dir),
            'qwen35_export_dir': str(pi_export_dir),
            'label_dir': str(pi_label_dir),
            'label_manifest': load_json(pi_label_dir / 'manifest.json'),
            'export_manifest': load_json(pi_export_dir / 'manifest.json'),
        }

    if 'codex' in include_sources:
        codex_run_root = run_root / 'codex'
        codex_run_root.mkdir(parents=True, exist_ok=True)
        codex_export_dir = convert_codex(script_dir, codex_root, codex_run_root, args.jsonl_only, args.codex_limit, log_path)
        codex_label_dir = label_export(script_dir, codex_export_dir, codex_run_root / 'labels', log_path, 'codex-label')
        export_dirs['codex'] = codex_export_dir
        label_dirs.append(('codex', codex_label_dir))
        cleanup_paths.append(str(codex_run_root))
        manifest['steps']['codex'] = {
            'qwen35_export_dir': str(codex_export_dir),
            'label_dir': str(codex_label_dir),
            'label_manifest': load_json(codex_label_dir / 'manifest.json'),
            'export_manifest': load_json(codex_export_dir / 'manifest.json'),
        }

    merge_info = merge_labeled_datasets(label_dirs, export_dirs, keep_labels, run_root, args.final_format)
    manifest['final_dataset'] = merge_info
    if not args.keep_intermediates:
        removed: List[str] = []
        for path_str in cleanup_paths:
            path = Path(path_str)
            if path.exists():
                shutil.rmtree(path)
                removed.append(path_str)
        manifest['cleanup'] = {'enabled': True, 'removed_paths': removed}
        print_progress('cleanup', f'removed {len(removed)} intermediate directories')
    else:
        manifest['cleanup'] = {'enabled': False, 'removed_paths': []}
    manifest_path = run_root / 'manifest.json'
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    # Final machine-readable summary on stdout for calling automation.
    print(
        json.dumps(
            {
                'run_dir': str(run_root),
                'dataset_path': merge_info.get('dataset_path'),
                'dataset_parquet_path': merge_info.get('dataset_parquet_path'),
                'stats': merge_info['stats'],
            },
            ensure_ascii=False,
        ),
        flush=True,
    )
    return 0
365
-
366
-
367
# Script entry point: run the build and propagate main()'s return value as
# the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())