agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mindverse
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,125 @@
1
+ # Agentic Dataset Builder
2
+
3
+ Build one merged training dataset from local Pi and Codex session history.
4
+
5
+ ## Runtime
6
+
7
+ The core implementation is Python. User-facing entrypoints do not require bash.
8
+
9
+ Requirements:
10
+
11
+ - Python 3.10+
12
+ - Node 18+ if you want to run via `npx`
13
+
14
+ ## Recommended usage
15
+
16
+ Published npm-style entrypoint:
17
+
18
+ ```bash
19
+ npx agentic-dataset-builder --output-root ./out
20
+ ```
21
+
22
+ Local repo usage without bash:
23
+
24
+ ```bash
25
+ node cli.mjs --output-root ./out
26
+ ```
27
+
28
+ Direct Python entrypoint:
29
+
30
+ ```bash
31
+ python run.py --output-root ./out
32
+ ```
33
+
34
+ If you want to pre-create the Python environment yourself:
35
+
36
+ ```bash
37
+ pip install -r requirements.txt
38
+ python run.py --output-root ./out
39
+ ```
40
+
41
+ ## What users run
42
+
43
+ From this directory, the simplest no-bash local command is:
44
+
45
+ ```bash
46
+ node cli.mjs --output-root ./out
47
+ ```
48
+
49
+ That one command will:
50
+
51
+ - scan `~/.pi/agent/sessions`
52
+ - scan `~/.codex/sessions`
53
+ - convert session history into the local Qwen3.5 schema
54
+ - label records as `cot_eligible`, `agent_only`, or `discard`
55
+ - keep `cot_eligible` and `agent_only`
56
+ - merge them into one final parquet file
57
+ - remove intermediate directories automatically after success
58
+
59
+ ## Final output
60
+
61
+ Each run creates one directory under `./out/`:
62
+
63
+ ```text
64
+ out/agentic-dataset-<timestamp>/
65
+ dataset.parquet
66
+ manifest.json
67
+ run.log
68
+ ```
69
+
70
+ Default deliverable is just one user-facing dataset file:
71
+
72
+ - `dataset.parquet`
73
+
74
+ Supporting files:
75
+
76
+ - `manifest.json`: what was scanned, what was kept, summary stats
77
+ - `run.log`: full step-by-step execution log
78
+
79
+ ## Common options
80
+
81
+ ```bash
82
+ # only Pi
83
+ node cli.mjs --output-root ./out --include-sources pi
84
+
85
+ # only Codex
86
+ node cli.mjs --output-root ./out --include-sources codex
87
+
88
+ # keep intermediates for debugging
89
+ node cli.mjs --output-root ./out --keep-intermediates
90
+
91
+ # also emit final merged jsonl/jsonl.gz
92
+ node cli.mjs --output-root ./out --final-format both
93
+ ```
94
+
95
+ ## What is kept by default
96
+
97
+ - `cot_eligible`: agentic traces with visible reasoning
98
+ - `agent_only`: agentic traces without visible reasoning
99
+
100
+ `discard` records are excluded from the final dataset by default.
101
+
102
+ ## Package layout
103
+
104
+ ```text
105
+ agentic_dataset/
106
+ build_agentic_dataset.py
107
+ export_pi_session.py
108
+ export_pi_session_to_qwen35.py
109
+ export_codex_session_to_qwen35.py
110
+ label_qwen35_agentic.py
111
+ export_qwen35_training.py
112
+ qwen35_training_record.py
113
+ run.sh
114
+ run.py
115
+ cli.mjs
116
+ README.md
117
+ ```
118
+
119
+ ## Notes
120
+
121
+ - default session roots are auto-detected for Linux, macOS, and Windows
122
+ - override session paths with `--pi-root` and `--codex-root` if needed
123
+ - Pi currently provides much better visible reasoning coverage than Codex.
124
+ - Codex traces are still useful for agent-behavior distillation even when reasoning is encrypted-only.
125
+ - Redaction is not included yet. Add it before distributing the tool broadly if users may have sensitive local data.
@@ -0,0 +1 @@
1
+ """Agentic dataset builder package."""
@@ -0,0 +1,368 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import gzip
6
+ import json
7
+ import shutil
8
+ import subprocess
9
+ import sys
10
+ from collections import Counter
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Any, Dict, Iterable, List, Sequence, Tuple
14
+
15
+ from .export_qwen35_training import append_parquet_rows, ensure_parquet_runtime, record_to_parquet_row
16
+ from .platform_paths import default_codex_session_root, default_pi_session_root
17
+
18
+
19
def python_entry(module_name: str, script_dir: Path) -> List[str]:
    """Build the argv prefix used to invoke a sibling pipeline module.

    When running inside a package, prefer ``python -m <root_pkg>.<module>``;
    otherwise fall back to executing the module's script file directly.
    """
    if not __package__:
        return [sys.executable, str(script_dir / f'{module_name}.py')]
    root_package = __package__.split('.')[0]
    return [sys.executable, '-m', f'{root_package}.{module_name}']
24
+
25
+
26
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse the dataset-builder command line into a Namespace."""
    ap = argparse.ArgumentParser(description='Build one merged agentic dataset from local Pi and Codex sessions.')
    add = ap.add_argument
    add('--pi-root', help='Pi session root. Defaults to auto-detected OS-specific location.')
    add('--codex-root', help='Codex session root. Defaults to auto-detected OS-specific location.')
    add('--output-root', required=True, help='Output root directory.')
    add('--include-sources', default='pi,codex', help='Comma-separated sources to include: pi,codex')
    add('--include-labels', default='cot_eligible,agent_only', help='Comma-separated labels to keep in final dataset.')
    # NOTE(review): the two options below are parsed but not consumed by main()
    # in this module — confirm they are read elsewhere before removing.
    add('--pi-session-root-override', help='Override session root passed to export_pi_session.py health check.')
    add('--skip-pi-health-check', action='store_true', help='Skip Pi health scan and export only by direct conversion assumptions.')
    add('--codex-limit', type=int, default=0, help='Optional limit for Codex session files.')
    add('--jsonl-only', action='store_true', help='Use JSONL intermediates only.')
    add('--final-format', choices=('jsonl', 'parquet', 'both'), default='parquet', help='Final merged dataset output format.')
    add('--keep-intermediates', action='store_true', help='Keep intermediate export and label directories instead of deleting them after a successful build.')
    return ap.parse_args(argv)
40
+
41
+
42
def print_progress(step: str, message: str) -> None:
    """Write one flushed ``[step] message`` progress line to stdout."""
    print('[' + step + '] ' + message, flush=True)
44
+
45
+
46
def run_cmd(command: List[str], cwd: Path, step: str, log_path: Path) -> None:
    """Run *command* in *cwd*, streaming its combined output live.

    Each output line is appended to *log_path* and echoed through
    print_progress. Raises subprocess.CalledProcessError on a non-zero
    exit code.
    """
    shown = ' '.join(command)
    print_progress(step, 'running ' + shown)
    with log_path.open('a', encoding='utf-8') as log_file:
        log_file.write(f'\n## {step}\n$ {shown}\n')
        # stderr is folded into stdout so the log keeps one ordered stream.
        proc = subprocess.Popen(
            command,
            cwd=str(cwd),
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        assert proc.stdout is not None
        for raw_line in proc.stdout:
            log_file.write(raw_line)
            print_progress(step, raw_line.rstrip('\n'))
        returncode = proc.wait()
    if returncode != 0:
        raise subprocess.CalledProcessError(returncode, command)
66
+
67
+
68
def latest_dir(root: Path, pattern: str) -> Path:
    """Return the lexicographically last entry matching *pattern* under *root*.

    The pipeline's output directories embed sortable timestamps, so the
    last match is the most recent run. Raises FileNotFoundError when
    nothing matches.
    """
    candidates = sorted(root.glob(pattern))
    if candidates:
        return candidates[-1]
    raise FileNotFoundError(f'No directories match {pattern} under {root}')
73
+
74
+
75
def load_json(path: Path) -> Dict[str, Any]:
    """Decode *path* (UTF-8 text) as a single JSON document."""
    with path.open('r', encoding='utf-8') as handle:
        return json.load(handle)
77
+
78
+
79
def load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Parse a JSON Lines file into a list of objects, skipping blank lines."""
    with path.open('r', encoding='utf-8') as handle:
        return [json.loads(text) for raw in handle if (text := raw.strip())]
87
+
88
+
89
def find_pi_sessions(session_root: Path) -> List[Path]:
    """Recursively collect all ``*.jsonl`` session files, sorted for determinism."""
    found = list(session_root.rglob('*.jsonl'))
    found.sort()
    return found
91
+
92
+
93
def export_all_pi_sessions(script_dir: Path, output_root: Path, session_root: Path, log_path: Path) -> Path:
    """Export every Pi session under *session_root* into <output_root>/pi-full-sessions.

    Runs the export_pi_session module once per session file. Raises
    RuntimeError when no session files exist.
    """
    session_files = find_pi_sessions(session_root)
    if not session_files:
        raise RuntimeError(f'No Pi sessions found under {session_root}')
    target_dir = output_root / 'pi-full-sessions'
    target_dir.mkdir(parents=True, exist_ok=True)
    print_progress('pi-export', f'exporting {len(session_files)} session files')
    # The interpreter/module prefix is loop-invariant; build it once.
    base_command = python_entry('export_pi_session', script_dir)
    for session_file in session_files:
        extra_args = [
            '--session-root', str(session_root),
            '--session', str(session_file),
            '--out', str(target_dir),
        ]
        run_cmd(base_command + extra_args, cwd=script_dir.parent, step='pi-export', log_path=log_path)
    return target_dir
116
+
117
+
118
def convert_pi(script_dir: Path, full_export_dir: Path, output_root: Path, jsonl_only: bool, log_path: Path) -> Path:
    """Convert exported Pi sessions into the Qwen3.5 schema.

    Returns the freshly created ``qwen35-pi-session-*`` directory.
    """
    output_format = 'jsonl' if jsonl_only else 'both'
    command = python_entry('export_pi_session_to_qwen35', script_dir) + [
        '--input', str(full_export_dir),
        '--output-root', str(output_root),
        '--output-format', output_format,
    ]
    run_cmd(command, cwd=script_dir.parent, step='pi-convert', log_path=log_path)
    return latest_dir(output_root, 'qwen35-pi-session-*')
135
+
136
+
137
def convert_codex(script_dir: Path, codex_root: Path, output_root: Path, jsonl_only: bool, limit: int, log_path: Path) -> Path:
    """Convert Codex sessions under *codex_root* into the Qwen3.5 schema.

    A positive *limit* is forwarded as ``--limit`` to cap the number of
    session files. Returns the new ``qwen35-codex-session-*`` directory.
    """
    output_format = 'jsonl' if jsonl_only else 'both'
    command = python_entry('export_codex_session_to_qwen35', script_dir) + [
        '--input', str(codex_root),
        '--output-root', str(output_root),
        '--output-format', output_format,
    ]
    if limit > 0:
        command += ['--limit', str(limit)]
    run_cmd(command, cwd=script_dir.parent, step='codex-convert', log_path=log_path)
    return latest_dir(output_root, 'qwen35-codex-session-*')
151
+
152
+
153
def label_export(script_dir: Path, export_dir: Path, output_root: Path, log_path: Path, step_name: str) -> Path:
    """Run the agentic labeler over *export_dir*.

    Returns the freshly created ``qwen35-agentic-labels-*`` directory.
    """
    output_root.mkdir(parents=True, exist_ok=True)
    command = python_entry('label_qwen35_agentic', script_dir) + [
        '--input', str(export_dir),
        '--output-root', str(output_root),
    ]
    run_cmd(command, cwd=script_dir.parent, step=step_name, log_path=log_path)
    return latest_dir(output_root, 'qwen35-agentic-labels-*')
168
+
169
+
170
def build_record_index(export_dir: Path) -> Dict[str, Dict[str, Any]]:
    """Map record id -> record across both train files in *export_dir*.

    Missing files are skipped. On duplicate ids the lossy file, read
    second, overwrites the lossless entry.
    """
    index: Dict[str, Dict[str, Any]] = {}
    for file_name in ('qwen35-train.jsonl', 'qwen35-train-lossy.jsonl'):
        candidate = export_dir / file_name
        if candidate.exists():
            index.update((row['id'], row) for row in load_jsonl(candidate))
    return index
179
+
180
+
181
def merge_labeled_datasets(
    label_dirs: List[Tuple[str, Path]],
    export_dirs: Dict[str, Path],
    keep_labels: set[str],
    output_dir: Path,
    final_format: str,
) -> Dict[str, Any]:
    """Join per-source labels with their exported records and write the merged dataset.

    Args:
        label_dirs: ``(source_name, label_dir)`` pairs; each directory must
            contain a ``labels.jsonl`` file.
        export_dirs: source name -> export directory holding the record files.
        keep_labels: label values to retain; records with any other label are
            counted and skipped.
        output_dir: target directory for ``dataset.jsonl``,
            ``dataset.jsonl.gz`` and/or ``dataset.parquet``.
        final_format: ``'jsonl'``, ``'parquet'`` or ``'both'``.

    Returns:
        Summary dict with the output path(s) plus global and per-source counters.
    """
    dataset_path = output_dir / 'dataset.jsonl'
    dataset_gzip_path = output_dir / 'dataset.jsonl.gz'
    parquet_path = output_dir / 'dataset.parquet'
    stats = Counter()
    source_stats: Dict[str, Counter] = {}
    jsonl_enabled = final_format in {'jsonl', 'both'}
    parquet_enabled = final_format in {'parquet', 'both'}
    parquet_writer = None
    parquet_batch: List[Dict[str, Any]] = []

    # JSONL outputs (plain + gzip) are opened only when that format was requested.
    out = dataset_path.open('w', encoding='utf-8') if jsonl_enabled else None
    out_gzip = gzip.open(dataset_gzip_path, 'wt', encoding='utf-8') if jsonl_enabled else None
    try:
        for source_name, label_dir in label_dirs:
            labels = load_jsonl(label_dir / 'labels.jsonl')
            record_index = build_record_index(export_dirs[source_name])
            source_counter = Counter()
            for label in labels:
                source_counter['records_seen'] += 1
                stats[f"labels_seen:{label['label']}"] += 1
                if label['label'] not in keep_labels:
                    source_counter['records_skipped'] += 1
                    continue
                record = record_index.get(label['id'])
                if record is None:
                    # Label without a matching exported record: count it and move on.
                    source_counter['missing_records'] += 1
                    continue
                # Shallow-copy so the annotations below never mutate the shared index entry.
                merged = dict(record)
                merged_meta = dict(merged.get('meta', {}))
                merged['label'] = label['label']
                merged['source_system'] = source_name
                merged['source_bucket'] = label.get('bucket')
                merged['source_file'] = label.get('source_file')
                # Full labeling detail is grouped under one key for downstream filtering.
                merged['agentic_label'] = {
                    'label': label['label'],
                    'tool_call_count': label.get('tool_call_count'),
                    'tool_message_count': label.get('tool_message_count'),
                    'dialogue_rounds_est': label.get('dialogue_rounds_est'),
                    'reasoning_chars': label.get('reasoning_chars'),
                    'has_reasoning': label.get('has_reasoning'),
                    'lossy_source': label.get('lossy_source'),
                    'lossy_reasons': label.get('lossy_reasons', []),
                }
                # Key label facts are duplicated into meta under dataset_* keys.
                merged_meta['dataset_label'] = label['label']
                merged_meta['dataset_source_system'] = source_name
                merged_meta['dataset_source_bucket'] = label.get('bucket')
                merged_meta['dataset_source_file'] = label.get('source_file')
                merged_meta['dataset_has_reasoning'] = label.get('has_reasoning')
                merged_meta['dataset_reasoning_chars'] = label.get('reasoning_chars')
                merged['meta'] = merged_meta
                if out is not None:
                    line = json.dumps(merged, ensure_ascii=False) + '\n'
                    out.write(line)
                    if out_gzip is not None:
                        out_gzip.write(line)
                if parquet_enabled:
                    parquet_batch.append(record_to_parquet_row(merged))
                    # Flush parquet rows in batches of 1000 to bound memory use.
                    if len(parquet_batch) >= 1000:
                        parquet_writer = append_parquet_rows(parquet_writer, parquet_batch, parquet_path)
                        parquet_batch = []
                source_counter['records_kept'] += 1
                source_counter[f"kept:{label['label']}"] += 1
                stats['records_kept'] += 1
                stats[f"kept:{label['label']}"] += 1
            source_stats[source_name] = source_counter
        if parquet_enabled:
            # Flush any remaining partial batch after all sources are processed.
            parquet_writer = append_parquet_rows(parquet_writer, parquet_batch, parquet_path)
    finally:
        if out is not None:
            out.close()
        if out_gzip is not None:
            out_gzip.close()
        if parquet_writer is not None:
            parquet_writer.close()

    result = {
        'dataset_path': str(dataset_path) if jsonl_enabled else None,
        'stats': dict(stats),
        'source_stats': {name: dict(counter) for name, counter in source_stats.items()},
    }
    if jsonl_enabled:
        result['dataset_gzip_path'] = str(dataset_gzip_path)
    if parquet_enabled:
        result['dataset_parquet_path'] = str(parquet_path)
    return result
273
+
274
+
275
def main(argv: Sequence[str] | None = None) -> int:
    """Run the end-to-end dataset build.

    Pipeline: export + convert + label Pi sessions (when enabled), convert +
    label Codex sessions (when enabled), merge the labeled records into one
    dataset under a fresh timestamped run directory, write a manifest, and
    delete intermediates unless ``--keep-intermediates`` was given.

    Args:
        argv: CLI arguments. Defaults to ``sys.argv[1:]`` only when None, so
            an explicit empty list is honoured.

    Returns:
        0 on success. Failing subprocess steps raise CalledProcessError.
    """
    # Fix: the previous `argv or sys.argv[1:]` silently substituted the real
    # process arguments when callers passed an explicit empty list.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    # Fail fast on a missing parquet runtime before doing any expensive work.
    if args.final_format in {'parquet', 'both'}:
        ensure_parquet_runtime('parquet')
    script_dir = Path(__file__).resolve().parent
    output_root = Path(args.output_root).expanduser().resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    # One timestamped directory per run keeps outputs isolated and sortable.
    run_root = output_root / f'agentic-dataset-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    run_root.mkdir(parents=True, exist_ok=True)
    log_path = run_root / 'run.log'

    include_sources = {item.strip() for item in args.include_sources.split(',') if item.strip()}
    keep_labels = {item.strip() for item in args.include_labels.split(',') if item.strip()}
    pi_root = Path(args.pi_root).expanduser().resolve() if args.pi_root else default_pi_session_root()
    codex_root = Path(args.codex_root).expanduser().resolve() if args.codex_root else default_codex_session_root()
    export_dirs: Dict[str, Path] = {}
    label_dirs: List[Tuple[str, Path]] = []
    manifest: Dict[str, Any] = {
        'run_dir': str(run_root),
        'run_log': str(log_path),
        'pi_root': str(pi_root),
        'codex_root': str(codex_root),
        'include_sources': sorted(include_sources),
        'keep_labels': sorted(keep_labels),
        'keep_intermediates': args.keep_intermediates,
        'steps': {},
    }
    cleanup_paths: List[str] = []

    if 'pi' in include_sources:
        pi_run_root = run_root / 'pi'
        pi_run_root.mkdir(parents=True, exist_ok=True)
        full_dir = export_all_pi_sessions(script_dir, pi_run_root, pi_root, log_path)
        pi_export_dir = convert_pi(script_dir, full_dir, pi_run_root, args.jsonl_only, log_path)
        pi_label_dir = label_export(script_dir, pi_export_dir, pi_run_root / 'labels', log_path, 'pi-label')
        export_dirs['pi'] = pi_export_dir
        label_dirs.append(('pi', pi_label_dir))
        cleanup_paths.append(str(pi_run_root))
        manifest['steps']['pi'] = {
            'full_export_dir': str(full_dir),
            'qwen35_export_dir': str(pi_export_dir),
            'label_dir': str(pi_label_dir),
            'label_manifest': load_json(pi_label_dir / 'manifest.json'),
            'export_manifest': load_json(pi_export_dir / 'manifest.json'),
        }

    if 'codex' in include_sources:
        codex_run_root = run_root / 'codex'
        codex_run_root.mkdir(parents=True, exist_ok=True)
        codex_export_dir = convert_codex(script_dir, codex_root, codex_run_root, args.jsonl_only, args.codex_limit, log_path)
        codex_label_dir = label_export(script_dir, codex_export_dir, codex_run_root / 'labels', log_path, 'codex-label')
        export_dirs['codex'] = codex_export_dir
        label_dirs.append(('codex', codex_label_dir))
        cleanup_paths.append(str(codex_run_root))
        manifest['steps']['codex'] = {
            'qwen35_export_dir': str(codex_export_dir),
            'label_dir': str(codex_label_dir),
            'label_manifest': load_json(codex_label_dir / 'manifest.json'),
            'export_manifest': load_json(codex_export_dir / 'manifest.json'),
        }

    merge_info = merge_labeled_datasets(label_dirs, export_dirs, keep_labels, run_root, args.final_format)
    manifest['final_dataset'] = merge_info
    if not args.keep_intermediates:
        removed: List[str] = []
        for path_str in cleanup_paths:
            path = Path(path_str)
            if path.exists():
                shutil.rmtree(path)
                removed.append(path_str)
        manifest['cleanup'] = {'enabled': True, 'removed_paths': removed}
        print_progress('cleanup', f'removed {len(removed)} intermediate directories')
    else:
        manifest['cleanup'] = {'enabled': False, 'removed_paths': []}
    manifest_path = run_root / 'manifest.json'
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    # Final machine-readable summary on stdout for programmatic wrappers.
    print(
        json.dumps(
            {
                'run_dir': str(run_root),
                'dataset_path': merge_info.get('dataset_path'),
                'dataset_parquet_path': merge_info.get('dataset_parquet_path'),
                'stats': merge_info['stats'],
            },
            ensure_ascii=False,
        ),
        flush=True,
    )
    return 0
365
+
366
+
367
if __name__ == '__main__':
    # CLI entrypoint: propagate main()'s return code as the process exit status.
    sys.exit(main())