agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,466 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from collections import Counter
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
10
+
11
+ from .export_qwen35_training import (
12
+ append_parquet_rows,
13
+ ensure_parquet_runtime,
14
+ record_to_parquet_row,
15
+ validate_record_payload,
16
+ )
17
+
18
# Number of converted records buffered in memory before each parquet append.
BATCH_SIZE = 1000
# Default Codex CLI home directory; overridable via --codex-home.
DEFAULT_CODEX_HOME = Path.home() / '.codex'
20
+
21
+
22
class CodexConversionError(RuntimeError):
    """Raised when a Codex session file cannot be converted into training records."""
24
+
25
+
26
class TurnBuilder:
    """Accumulates the log entries of one Codex turn and emits one training record.

    Entries arrive in log order via :meth:`ingest`. Assistant text, reasoning
    summaries and tool calls are buffered and flushed together as a single
    assistant message whenever a non-assistant item (tool output, user/system
    message) appears, preserving the original conversational ordering.
    :meth:`finalize` returns the completed record, or ``None`` when the turn
    never contained a user message.
    """

    def __init__(self, session_meta: Dict[str, Any], turn_id: str, start_ts: str) -> None:
        self.session_meta = session_meta
        self.turn_id = turn_id
        self.start_ts = start_ts
        # Completed chat messages, in conversation order.
        self.messages: List[Dict[str, Any]] = []
        # Buffers for the in-flight assistant message (flushed as one message).
        self.pending_text_parts: List[str] = []
        self.pending_reasoning_parts: List[str] = []
        self.pending_tool_calls: List[Dict[str, Any]] = []
        # call_id -> tool name, so later outputs can be attributed to their call.
        self.call_names: Dict[str, str] = {}
        # Minimal tool specs keyed by tool name (deduplicated).
        self.tool_specs: Dict[str, Dict[str, Any]] = {}
        # Reasons this turn is considered lossy (drives the strict/lossy split).
        self.lossy_reasons: set[str] = set()
        self.error_messages: List[str] = []
        self.last_ts: str = start_ts
        self.completed = False
        self.last_agent_message: Optional[str] = None

    def ingest(self, entry: Dict[str, Any]) -> None:
        """Route one session-log entry into the turn state."""
        self.last_ts = entry.get('timestamp') or self.last_ts
        entry_type = entry.get('type')
        payload = entry.get('payload') if isinstance(entry.get('payload'), dict) else {}

        if entry_type == 'response_item':
            self._ingest_response_item(payload)
            return

        if entry_type == 'event_msg':
            event_type = payload.get('type')
            if event_type == 'exec_command_end':
                self._ingest_exec_command_end(payload)
            elif event_type == 'error':
                message = payload.get('message')
                if isinstance(message, str) and message:
                    self.error_messages.append(message)
                self.lossy_reasons.add('turn_error')
            elif event_type == 'task_complete':
                last_agent_message = payload.get('last_agent_message')
                if isinstance(last_agent_message, str) and last_agent_message.strip():
                    self.last_agent_message = last_agent_message
            return

    def _ingest_response_item(self, payload: Dict[str, Any]) -> None:
        """Dispatch a `response_item` payload by its item type."""
        item_type = payload.get('type')
        if item_type == 'message':
            self._ingest_message(payload)
        elif item_type == 'reasoning':
            self._ingest_reasoning(payload)
        elif item_type == 'function_call':
            self._register_tool_call(
                payload.get('name') or 'unknown_function',
                payload.get('call_id'),
                parse_json_object(payload.get('arguments')),
            )
        elif item_type == 'function_call_output':
            self._append_tool_message(
                payload.get('call_id'), 'tool', self._serialize_output(payload.get('output')),
            )
        elif item_type == 'custom_tool_call':
            self._register_tool_call(
                payload.get('name') or 'custom_tool',
                payload.get('call_id'),
                {'input': payload.get('input'), 'status': payload.get('status')},
            )
        elif item_type == 'custom_tool_call_output':
            self._append_tool_message(
                payload.get('call_id'), 'custom_tool', self._serialize_output(payload.get('output')),
            )

    def _ingest_message(self, payload: Dict[str, Any]) -> None:
        """Convert a chat message item into the target message schema.

        Assistant text is buffered; developer messages become `system`;
        user messages become `user` unless they carry environment context,
        which is re-tagged as `system`. Any other role is kept as assistant
        text but marks the turn lossy.
        """
        role = payload.get('role')
        content = payload.get('content') if isinstance(payload.get('content'), list) else []
        text = extract_codex_text(content)
        if role == 'assistant':
            if text:
                self.pending_text_parts.append(text)
            return

        # A non-assistant message closes any buffered assistant output first.
        self.flush_assistant()
        if role == 'developer':
            if text:
                self.messages.append({'role': 'system', 'content': text})
            return
        if role == 'user':
            if is_environment_context(text):
                self.messages.append({'role': 'system', 'content': text})
            elif text:
                self.messages.append({'role': 'user', 'content': text})
            return
        if text:
            self.lossy_reasons.add(f'unsupported_message_role_{role}')
            self.messages.append({'role': 'assistant', 'content': text})

    def _ingest_reasoning(self, payload: Dict[str, Any]) -> None:
        """Collect reasoning summaries; encrypted-only reasoning marks the turn lossy."""
        summary = payload.get('summary')
        extracted: List[str] = []
        if isinstance(summary, list):
            for item in summary:
                if isinstance(item, dict):
                    if isinstance(item.get('text'), str) and item['text'].strip():
                        extracted.append(item['text'].strip())
                    elif isinstance(item.get('summary_text'), str) and item['summary_text'].strip():
                        extracted.append(item['summary_text'].strip())
        content = payload.get('content')
        if isinstance(content, str) and content.strip():
            extracted.append(content.strip())
        if extracted:
            self.pending_reasoning_parts.extend(extracted)
        elif payload.get('encrypted_content'):
            self.lossy_reasons.add('encrypted_reasoning_without_summary')

    def _register_tool_call(self, name: str, call_id: Any, arguments: Dict[str, Any]) -> None:
        """Buffer an assistant tool call and remember its name for output attribution."""
        self.pending_tool_calls.append({
            'type': 'function',
            'id': call_id,
            'function': {'name': name, 'arguments': arguments},
        })
        if isinstance(call_id, str):
            self.call_names[call_id] = name
        self.tool_specs.setdefault(name, {'name': name})

    @staticmethod
    def _serialize_output(output: Any) -> str:
        """Return tool output as a string, JSON-encoding non-string payloads."""
        if isinstance(output, str):
            return output
        return json.dumps(output, ensure_ascii=False, sort_keys=True)

    def _append_tool_message(self, call_id: Any, default_name: str, content: str) -> None:
        """Flush buffered assistant output and append a tool-role message for call_id."""
        self.flush_assistant()
        tool_name = self.call_names.get(call_id or '', default_name)
        self.messages.append({'role': 'tool', 'name': tool_name, 'tool_call_id': call_id, 'content': content})
        self.tool_specs.setdefault(tool_name, {'name': tool_name})

    def _ingest_exec_command_end(self, payload: Dict[str, Any]) -> None:
        """Record a finished shell command as a tool-role message with a JSON body."""
        content = json.dumps(
            {
                'command': payload.get('command'),
                'cwd': payload.get('cwd'),
                'aggregated_output': payload.get('aggregated_output'),
                'exit_code': payload.get('exit_code'),
                'status': payload.get('status'),
                'duration': payload.get('duration'),
            },
            ensure_ascii=False,
            sort_keys=True,
        )
        self._append_tool_message(payload.get('call_id'), 'exec_command', content)

    def flush_assistant(self) -> None:
        """Emit the buffered assistant text/reasoning/tool calls as one message."""
        if not self.pending_text_parts and not self.pending_reasoning_parts and not self.pending_tool_calls:
            return
        content = '\n\n'.join(part for part in self.pending_text_parts if part.strip())
        message: Dict[str, Any] = {'role': 'assistant', 'content': content}
        if self.pending_reasoning_parts:
            message['reasoning_content'] = '\n\n'.join(self.pending_reasoning_parts)
        if self.pending_tool_calls:
            message['tool_calls'] = list(self.pending_tool_calls)
        self.messages.append(message)
        self.pending_text_parts = []
        self.pending_reasoning_parts = []
        self.pending_tool_calls = []

    def finalize(self) -> Optional[Dict[str, Any]]:
        """Close the turn and return its validated record, or None without a user message."""
        if self.last_agent_message and not self.pending_text_parts:
            # Some turns surface only `last_agent_message` at completion; preserve it as lossy synthetic text.
            self.pending_text_parts.append(self.last_agent_message)
            self.lossy_reasons.add('synthetic_last_agent_message')
        self.flush_assistant()
        if not any(message.get('role') == 'user' for message in self.messages):
            return None
        meta = build_meta(self)
        record = {
            'id': f"{self.session_meta.get('id')}:{self.turn_id}",
            'request_id': self.turn_id,
            'messages': self.messages,
            'tools': list(self.tool_specs.values()),
            'meta': meta,
        }
        validate_record_payload(record)
        return record
223
+
224
+
225
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse the converter's command-line options from *argv*."""
    parser = argparse.ArgumentParser(description='Convert Codex sessions into local Qwen3.5 training schema.')
    add = parser.add_argument
    add('--codex-home', default=str(DEFAULT_CODEX_HOME), help='Codex home directory.')
    add('--input', help='Specific Codex session file or directory. Defaults to ~/.codex/sessions.')
    add('--output-root', required=True, help='Output directory root for Qwen3.5 export.')
    add('--output-format', choices=('jsonl', 'parquet', 'both'), default='jsonl')
    add('--limit', type=int, default=0, help='Limit the number of session files processed.')
    return parser.parse_args(argv)
233
+
234
+
235
def parse_json_object(raw: Any) -> Dict[str, Any]:
    """Best-effort coercion of a tool-call ``arguments`` field into a dict.

    Dicts pass through unchanged; JSON strings are decoded, with non-object
    JSON wrapped under ``'value'``; undecodable strings are preserved under
    ``'raw'``; any other type yields an empty dict.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return {}
    try:
        parsed = json.loads(raw)
    except ValueError:
        # ValueError covers json.JSONDecodeError; keep the original text
        # instead of dropping it (the bare `except Exception` was too broad).
        return {'raw': raw}
    return parsed if isinstance(parsed, dict) else {'value': parsed}
245
+
246
+
247
def extract_codex_text(content: List[Dict[str, Any]]) -> str:
    """Join the textual parts of a Codex message content list.

    ``input_text``/``output_text`` items contribute their text; an
    ``input_image`` item contributes the placeholder ``'[image]'``; anything
    else is skipped. Empty parts are dropped before joining with newlines.
    """
    collected: List[str] = []
    for element in content:
        if not isinstance(element, dict):
            continue
        kind = element.get('type')
        if kind in {'input_text', 'output_text'} and isinstance(element.get('text'), str):
            collected.append(element['text'])
        elif kind == 'input_image':
            collected.append('[image]')
    return '\n'.join(filter(None, collected))
258
+
259
+
260
def is_environment_context(text: str) -> bool:
    """Return True when *text* is an environment-context pseudo-message.

    Such user messages are re-tagged as system messages by the caller.
    Non-string input is tolerated and treated as False.
    """
    if not isinstance(text, str):
        return False
    return text.strip().startswith('<environment_context>')
262
+
263
+
264
def iter_session_files(input_path: Path) -> List[Path]:
    """Resolve *input_path* to a sorted list of `.jsonl` session files.

    A single file is returned as-is; a directory is searched recursively;
    anything else raises CodexConversionError.
    """
    if input_path.is_file():
        return [input_path]
    if input_path.is_dir():
        return sorted(input_path.rglob('*.jsonl'))
    raise CodexConversionError(f'Input path does not exist: {input_path}')
270
+
271
+
272
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load every non-blank line of *path* as a JSON value.

    Raises CodexConversionError (chained from the decode error) identifying
    the offending file and line number.
    """
    parsed_lines: List[Dict[str, Any]] = []
    with path.open('r', encoding='utf-8') as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                value = json.loads(stripped)
            except json.JSONDecodeError as exc:
                raise CodexConversionError(f'Invalid JSON at {path}:{line_number}: {exc}') from exc
            parsed_lines.append(value)
    return parsed_lines
284
+
285
+
286
def build_meta(builder: TurnBuilder) -> Dict[str, Any]:
    """Assemble the per-record meta payload from a finished TurnBuilder.

    Counts assistant tool calls, tool results and reasoning blocks, encodes
    session provenance in ``source``, and flags lossy conversions via
    ``lossy_source``/``lossy_reasons``. (The previous version also collected
    user messages into an unused local; that dead code is removed.)
    """
    assistant_messages = [message for message in builder.messages if message.get('role') == 'assistant']
    tool_messages = [message for message in builder.messages if message.get('role') == 'tool']
    reasoning_count = sum(
        1
        for message in assistant_messages
        if isinstance(message.get('reasoning_content'), str) and message['reasoning_content'].strip()
    )
    session = builder.session_meta
    return {
        'endpoint': 'codex/turn',
        # 500 when any turn-level error event was observed, else 200.
        'status': 200 if not builder.error_messages else 500,
        'ts': builder.last_ts or builder.start_ts or '',
        'key': session.get('id'),
        'source': (
            f"codex:{session.get('source') or 'cli'}:"
            f"session={session.get('id')}:turn={builder.turn_id}:"
            f"cwd={session.get('cwd')}:cli={session.get('cli_version')}"
        ),
        'requested_model': session.get('model'),
        'actual_model': session.get('model'),
        'stream': False,
        'thinking_level': session.get('reasoning_effort'),
        'reasoning_summary_mode': 'codex_reasoning_summary',
        'thinking_type': 'codex_turn',
        'thinking_budget_tokens': None,
        'max_output_tokens': None,
        'tool_spec_count': len(builder.tool_specs),
        'tool_choice': {'mode': 'session_trace'},
        'request_contains_non_text_content': False,
        'request_image_block_count': 0,
        'request_video_block_count': 0,
        'request_tool_call_block_count': 0,
        'request_tool_result_block_count': 0,
        'request_thinking_block_count': 0,
        'response_contains_non_text_content': False,
        'response_image_block_count': 0,
        'response_video_block_count': 0,
        'response_tool_call_block_count': sum(len(message.get('tool_calls') or []) for message in assistant_messages),
        'response_tool_result_block_count': len(tool_messages),
        'response_thinking_block_count': reasoning_count,
        'request_truncated': False,
        'response_truncated': False,
        'lossy_source': bool(builder.lossy_reasons),
        # Sorted for deterministic output.
        'lossy_reasons': sorted(builder.lossy_reasons),
    }
332
+
333
+
334
def convert_session_file(path: Path) -> Tuple[List[Dict[str, Any]], Counter]:
    """Convert one Codex session log into training records plus counters.

    The first entry must be `session_meta`; `turn_context` entries update the
    model/effort applied to subsequent turns; each `task_started` ..
    `task_complete` span is accumulated by a TurnBuilder and, when it yields a
    user-visible turn, becomes one record.
    """
    entries = read_jsonl(path)
    if not entries:
        raise CodexConversionError(f'Empty session file: {path}')
    head = entries[0]
    meta_payload = head.get('payload') if head.get('type') == 'session_meta' else None
    if not isinstance(meta_payload, dict):
        raise CodexConversionError(f'Missing session_meta in {path}')

    session_meta = dict(meta_payload)
    stats: Counter = Counter()
    stats['input_files'] += 1
    records: List[Dict[str, Any]] = []
    builder: Optional[TurnBuilder] = None

    for entry in entries:
        entry_type = entry.get('type')
        raw_payload = entry.get('payload')
        payload = raw_payload if isinstance(raw_payload, dict) else {}

        if entry_type == 'turn_context':
            # A later turn context overrides the session-level model/effort.
            session_meta['model'] = payload.get('model') or session_meta.get('model')
            session_meta['reasoning_effort'] = payload.get('effort') or session_meta.get('reasoning_effort')
            continue
        if entry_type == 'event_msg' and payload.get('type') == 'task_started':
            turn_id = payload.get('turn_id') or entry.get('timestamp')
            builder = TurnBuilder(session_meta, str(turn_id), entry.get('timestamp') or '')
            continue
        if builder is None:
            # Entries outside a turn span are ignored.
            continue

        builder.ingest(entry)
        if entry_type == 'event_msg' and payload.get('type') == 'task_complete':
            record = builder.finalize()
            if record is not None:
                records.append(record)
                bucket = 'lossy_records' if record['meta']['lossy_source'] else 'strict_records'
                stats[bucket] += 1
            stats['turns_total'] += 1
            builder = None

    return records, stats
374
+
375
+
376
def main(argv: Sequence[str] | None = None) -> int:
    """CLI entry point: convert Codex session logs into Qwen3.5 training files.

    Writes strict/lossy JSONL and/or parquet outputs plus a manifest under a
    timestamped directory below --output-root. Per-file conversion failures
    are appended to invalid-records.jsonl instead of aborting the run.
    Returns 0 on success.
    """
    # Bug fix: the previous `parse_args(argv or [])` discarded the real
    # command line when called as plain `main()`; fall back to sys.argv.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    ensure_parquet_runtime(args.output_format)
    codex_home = Path(args.codex_home).expanduser().resolve()
    input_path = Path(args.input).expanduser().resolve() if args.input else (codex_home / 'sessions')
    session_files = iter_session_files(input_path)
    if args.limit > 0:
        session_files = session_files[: args.limit]
    if not session_files:
        raise SystemExit('No Codex session files found.')

    out_dir = Path(args.output_root).expanduser().resolve() / f'qwen35-codex-session-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    out_dir.mkdir(parents=True, exist_ok=True)
    strict_path = out_dir / 'qwen35-train.jsonl'
    lossy_path = out_dir / 'qwen35-train-lossy.jsonl'
    invalid_path = out_dir / 'invalid-records.jsonl'
    strict_parquet_path = out_dir / 'qwen35-train.parquet'
    lossy_parquet_path = out_dir / 'qwen35-train-lossy.parquet'
    manifest_path = out_dir / 'manifest.json'

    jsonl_enabled = args.output_format in {'jsonl', 'both'}
    parquet_enabled = args.output_format in {'parquet', 'both'}
    strict_out = strict_path.open('w', encoding='utf-8') if jsonl_enabled else None
    lossy_out = lossy_path.open('w', encoding='utf-8') if jsonl_enabled else None
    invalid_out = invalid_path.open('w', encoding='utf-8')
    strict_writer = None
    lossy_writer = None
    strict_batch: List[Dict[str, Any]] = []
    lossy_batch: List[Dict[str, Any]] = []
    stats = Counter()

    try:
        for session_file in session_files:
            try:
                records, file_stats = convert_session_file(session_file)
                stats.update(file_stats)
            except Exception as exc:
                # Best effort: record the failure and keep converting the rest.
                stats['invalid_files'] += 1
                invalid_out.write(json.dumps({'path': str(session_file), 'error': str(exc)}, ensure_ascii=False) + '\n')
                continue

            for record in records:
                bucket = 'lossy' if record['meta']['lossy_source'] else 'strict'
                if bucket == 'strict':
                    if strict_out is not None:
                        strict_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if parquet_enabled:
                        strict_batch.append(record_to_parquet_row(record))
                        if len(strict_batch) >= BATCH_SIZE:
                            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
                            strict_batch = []
                else:
                    if lossy_out is not None:
                        lossy_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if parquet_enabled:
                        lossy_batch.append(record_to_parquet_row(record))
                        if len(lossy_batch) >= BATCH_SIZE:
                            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
                            lossy_batch = []
            # Per-file progress line on stdout.
            print(json.dumps({'processed_files': stats['input_files'], **dict(stats)}, ensure_ascii=False), flush=True)

        if parquet_enabled:
            # Flush any partially-filled batches.
            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
    finally:
        if strict_out is not None:
            strict_out.close()
        if lossy_out is not None:
            lossy_out.close()
        invalid_out.close()
        if strict_writer is not None:
            strict_writer.close()
        if lossy_writer is not None:
            lossy_writer.close()

    manifest = {
        'codex_home': str(codex_home),
        'input': str(input_path),
        'output_dir': str(out_dir),
        'input_files': [str(path) for path in session_files],
        'stats': dict(stats),
        'strict_records': stats.get('strict_records', 0),
        'lossy_records': stats.get('lossy_records', 0),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
463
+
464
+
465
if __name__ == '__main__':
    # Use the top-level `import sys` instead of the `__import__('sys')` hack.
    raise SystemExit(main(sys.argv[1:]))