agentic-dataset-builder 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +125 -0
- package/agentic_dataset/__init__.py +1 -0
- package/agentic_dataset/build_agentic_dataset.py +368 -0
- package/agentic_dataset/export_codex_session_to_qwen35.py +466 -0
- package/agentic_dataset/export_pi_session.py +701 -0
- package/agentic_dataset/export_pi_session_to_qwen35.py +742 -0
- package/agentic_dataset/export_qwen35_training.py +1559 -0
- package/agentic_dataset/label_qwen35_agentic.py +156 -0
- package/agentic_dataset/platform_paths.py +85 -0
- package/agentic_dataset/qwen35_training_record.py +179 -0
- package/bin/agentic-dataset-builder.js +77 -0
- package/package.json +40 -0
- package/requirements.txt +2 -0
- package/run.py +8 -0
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
10
|
+
|
|
11
|
+
from .export_qwen35_training import (
|
|
12
|
+
append_parquet_rows,
|
|
13
|
+
ensure_parquet_runtime,
|
|
14
|
+
record_to_parquet_row,
|
|
15
|
+
validate_record_payload,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Number of parquet rows buffered in memory before they are flushed to disk.
BATCH_SIZE = 1000
# Default Codex CLI home directory; session JSONL files live under <home>/sessions.
DEFAULT_CODEX_HOME = Path.home() / '.codex'
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CodexConversionError(RuntimeError):
    """Raised when a Codex session file cannot be converted.

    Used for missing input paths, empty session files, malformed JSONL
    lines, and files lacking the leading ``session_meta`` entry.
    """

    pass
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TurnBuilder:
    """Accumulates one Codex turn (user request plus assistant/tool trace).

    Session-log entries are fed through :meth:`ingest`. Assistant text,
    reasoning summaries, and tool calls are buffered and folded into a single
    assistant message whenever a non-assistant event arrives
    (:meth:`flush_assistant`). :meth:`finalize` emits a validated training
    record, or ``None`` when the turn never contained a user message.
    """

    def __init__(self, session_meta: Dict[str, Any], turn_id: str, start_ts: str) -> None:
        # Shared session-level metadata dict. NOTE(review): the caller keeps
        # mutating this same dict on turn_context entries, so updates made
        # during the turn are visible here — confirm this aliasing is intended.
        self.session_meta = session_meta
        self.turn_id = turn_id
        self.start_ts = start_ts
        # Completed chat messages in conversation order.
        self.messages: List[Dict[str, Any]] = []
        # Buffers for the in-progress assistant message; joined on flush.
        self.pending_text_parts: List[str] = []
        self.pending_reasoning_parts: List[str] = []
        self.pending_tool_calls: List[Dict[str, Any]] = []
        # call_id -> tool name, so tool outputs can be attributed by id.
        self.call_names: Dict[str, str] = {}
        # Minimal tool specs (name only) for every tool observed in the turn.
        self.tool_specs: Dict[str, Dict[str, Any]] = {}
        # Reasons this record counts as lossy; drives the strict/lossy split.
        self.lossy_reasons: set[str] = set()
        self.error_messages: List[str] = []
        self.last_ts: str = start_ts
        # NOTE(review): `completed` is initialized but never updated or read
        # anywhere in this class — possibly vestigial.
        self.completed = False
        self.last_agent_message: Optional[str] = None

    def ingest(self, entry: Dict[str, Any]) -> None:
        """Route one session-log entry into the turn's buffers."""
        # Track the most recent timestamp seen, falling back to the previous one.
        self.last_ts = entry.get('timestamp') or self.last_ts
        entry_type = entry.get('type')
        payload = entry.get('payload') if isinstance(entry.get('payload'), dict) else {}

        if entry_type == 'response_item':
            self._ingest_response_item(payload)
            return

        if entry_type == 'event_msg':
            event_type = payload.get('type')
            if event_type == 'exec_command_end':
                self._ingest_exec_command_end(payload)
            elif event_type == 'error':
                message = payload.get('message')
                if isinstance(message, str) and message:
                    self.error_messages.append(message)
                    self.lossy_reasons.add('turn_error')
            elif event_type == 'task_complete':
                # Remember the final agent message; finalize() may synthesize
                # an assistant message from it when no text was streamed.
                last_agent_message = payload.get('last_agent_message')
                if isinstance(last_agent_message, str) and last_agent_message.strip():
                    self.last_agent_message = last_agent_message
            return

    def _ingest_response_item(self, payload: Dict[str, Any]) -> None:
        """Dispatch a response_item payload by its item type."""
        item_type = payload.get('type')
        if item_type == 'message':
            self._ingest_message(payload)
        elif item_type == 'reasoning':
            self._ingest_reasoning(payload)
        elif item_type == 'function_call':
            self._ingest_function_call(payload)
        elif item_type == 'function_call_output':
            self._ingest_function_call_output(payload)
        elif item_type == 'custom_tool_call':
            self._ingest_custom_tool_call(payload)
        elif item_type == 'custom_tool_call_output':
            self._ingest_custom_tool_call_output(payload)

    def _ingest_message(self, payload: Dict[str, Any]) -> None:
        """Convert a chat message item; assistant text is buffered, others flush first."""
        role = payload.get('role')
        content = payload.get('content') if isinstance(payload.get('content'), list) else []
        text = extract_codex_text(content)
        if role == 'assistant':
            if text:
                self.pending_text_parts.append(text)
            return

        # Any non-assistant message terminates the current assistant segment.
        self.flush_assistant()
        if role == 'developer':
            # Developer instructions are mapped to system messages.
            if text:
                self.messages.append({'role': 'system', 'content': text})
            return
        if role == 'user':
            # <environment_context> blocks are injected context, not real user input.
            if is_environment_context(text):
                self.messages.append({'role': 'system', 'content': text})
            elif text:
                self.messages.append({'role': 'user', 'content': text})
            return
        if text:
            # Unknown role: keep the text as assistant content but mark lossy.
            self.lossy_reasons.add(f'unsupported_message_role_{role}')
            self.messages.append({'role': 'assistant', 'content': text})

    def _ingest_reasoning(self, payload: Dict[str, Any]) -> None:
        """Collect reasoning summaries/content; flag encrypted-only reasoning as lossy."""
        summary = payload.get('summary')
        extracted: List[str] = []
        if isinstance(summary, list):
            for item in summary:
                if isinstance(item, dict):
                    # Accept either `text` or `summary_text`, whichever is non-empty.
                    if isinstance(item.get('text'), str) and item['text'].strip():
                        extracted.append(item['text'].strip())
                    elif isinstance(item.get('summary_text'), str) and item['summary_text'].strip():
                        extracted.append(item['summary_text'].strip())
        content = payload.get('content')
        if isinstance(content, str) and content.strip():
            extracted.append(content.strip())
        if extracted:
            self.pending_reasoning_parts.extend(extracted)
        elif payload.get('encrypted_content'):
            # Encrypted reasoning with no readable summary cannot be exported.
            self.lossy_reasons.add('encrypted_reasoning_without_summary')

    def _ingest_function_call(self, payload: Dict[str, Any]) -> None:
        """Buffer a function-style tool call on the pending assistant message."""
        name = payload.get('name') or 'unknown_function'
        call_id = payload.get('call_id')
        arguments = parse_json_object(payload.get('arguments'))
        tool_call = {
            'type': 'function',
            'id': call_id,
            'function': {'name': name, 'arguments': arguments},
        }
        self.pending_tool_calls.append(tool_call)
        if isinstance(call_id, str):
            self.call_names[call_id] = name
        self.tool_specs.setdefault(name, {'name': name})

    def _ingest_custom_tool_call(self, payload: Dict[str, Any]) -> None:
        """Buffer a custom tool call; its raw input/status become the arguments."""
        name = payload.get('name') or 'custom_tool'
        call_id = payload.get('call_id')
        arguments = {'input': payload.get('input'), 'status': payload.get('status')}
        tool_call = {
            'type': 'function',
            'id': call_id,
            'function': {'name': name, 'arguments': arguments},
        }
        self.pending_tool_calls.append(tool_call)
        if isinstance(call_id, str):
            self.call_names[call_id] = name
        self.tool_specs.setdefault(name, {'name': name})

    def _ingest_function_call_output(self, payload: Dict[str, Any]) -> None:
        """Emit a tool message for a function call's output, flushing assistant state first."""
        self.flush_assistant()
        call_id = payload.get('call_id')
        # Fall back to a generic name when the call_id was never registered.
        tool_name = self.call_names.get(call_id or '', 'tool')
        output = payload.get('output')
        if not isinstance(output, str):
            # Non-string outputs are serialized deterministically.
            output = json.dumps(output, ensure_ascii=False, sort_keys=True)
        self.messages.append({'role': 'tool', 'name': tool_name, 'tool_call_id': call_id, 'content': output})
        self.tool_specs.setdefault(tool_name, {'name': tool_name})

    def _ingest_custom_tool_call_output(self, payload: Dict[str, Any]) -> None:
        """Emit a tool message for a custom tool call's output."""
        self.flush_assistant()
        call_id = payload.get('call_id')
        tool_name = self.call_names.get(call_id or '', 'custom_tool')
        output = payload.get('output')
        if not isinstance(output, str):
            output = json.dumps(output, ensure_ascii=False, sort_keys=True)
        self.messages.append({'role': 'tool', 'name': tool_name, 'tool_call_id': call_id, 'content': output})
        self.tool_specs.setdefault(tool_name, {'name': tool_name})

    def _ingest_exec_command_end(self, payload: Dict[str, Any]) -> None:
        """Record a finished shell command as a tool message with a JSON body."""
        self.flush_assistant()
        call_id = payload.get('call_id')
        tool_name = self.call_names.get(call_id or '', 'exec_command')
        content = json.dumps(
            {
                'command': payload.get('command'),
                'cwd': payload.get('cwd'),
                'aggregated_output': payload.get('aggregated_output'),
                'exit_code': payload.get('exit_code'),
                'status': payload.get('status'),
                'duration': payload.get('duration'),
            },
            ensure_ascii=False,
            sort_keys=True,
        )
        self.messages.append({'role': 'tool', 'name': tool_name, 'tool_call_id': call_id, 'content': content})
        self.tool_specs.setdefault(tool_name, {'name': tool_name})

    def flush_assistant(self) -> None:
        """Fold buffered text/reasoning/tool-calls into one assistant message."""
        if not self.pending_text_parts and not self.pending_reasoning_parts and not self.pending_tool_calls:
            return
        content = '\n\n'.join(part for part in self.pending_text_parts if part.strip())
        message: Dict[str, Any] = {'role': 'assistant', 'content': content}
        if self.pending_reasoning_parts:
            message['reasoning_content'] = '\n\n'.join(self.pending_reasoning_parts)
        if self.pending_tool_calls:
            message['tool_calls'] = list(self.pending_tool_calls)
        self.messages.append(message)
        # Reset buffers for the next assistant segment.
        self.pending_text_parts = []
        self.pending_reasoning_parts = []
        self.pending_tool_calls = []

    def finalize(self) -> Optional[Dict[str, Any]]:
        """Flush buffers and build the validated record; None if no user message.

        NOTE(review): when assistant text was already flushed earlier in the
        turn, `pending_text_parts` is empty here, so a matching
        `last_agent_message` may be appended again as a synthetic message —
        confirm whether that duplication is intended.
        """
        if self.last_agent_message and not self.pending_text_parts:
            # Some turns surface only `last_agent_message` at completion; preserve it as lossy synthetic text.
            self.pending_text_parts.append(self.last_agent_message)
            self.lossy_reasons.add('synthetic_last_agent_message')
        self.flush_assistant()
        if not any(message.get('role') == 'user' for message in self.messages):
            return None
        meta = build_meta(self)
        record = {
            'id': f"{self.session_meta.get('id')}:{self.turn_id}",
            'request_id': self.turn_id,
            'messages': self.messages,
            'tools': list(self.tool_specs.values()),
            'meta': meta,
        }
        # Raises if the record does not satisfy the export schema.
        validate_record_payload(record)
        return record
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Build the CLI parser and parse *argv* into a namespace."""
    parser = argparse.ArgumentParser(
        description='Convert Codex sessions into local Qwen3.5 training schema.'
    )
    add = parser.add_argument
    add('--codex-home', default=str(DEFAULT_CODEX_HOME), help='Codex home directory.')
    add('--input', help='Specific Codex session file or directory. Defaults to ~/.codex/sessions.')
    add('--output-root', required=True, help='Output directory root for Qwen3.5 export.')
    add('--output-format', choices=('jsonl', 'parquet', 'both'), default='jsonl')
    add('--limit', type=int, default=0, help='Limit the number of session files processed.')
    return parser.parse_args(argv)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def parse_json_object(raw: Any) -> Dict[str, Any]:
    """Coerce *raw* into a dict.

    Dicts pass through untouched. Strings are JSON-decoded: a decoded dict is
    returned as-is, any other decoded value is wrapped as ``{'value': ...}``,
    and undecodable strings become ``{'raw': ...}``. Everything else yields
    an empty dict.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        try:
            decoded = json.loads(raw)
        except Exception:
            return {'raw': raw}
        if isinstance(decoded, dict):
            return decoded
        return {'value': decoded}
    return {}
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def extract_codex_text(content: List[Dict[str, Any]]) -> str:
    """Join the textual parts of a Codex content list with newlines.

    ``input_text``/``output_text`` items contribute their ``text``; images
    become the ``[image]`` placeholder; anything else is skipped. Empty
    fragments are dropped from the join.
    """
    fragments: List[str] = []
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = block.get('type')
        if kind == 'input_image':
            fragments.append('[image]')
        elif kind in {'input_text', 'output_text'} and isinstance(block.get('text'), str):
            fragments.append(block['text'])
    return '\n'.join(fragment for fragment in fragments if fragment)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def is_environment_context(text: str) -> bool:
    """Return True when *text* is an injected <environment_context> block."""
    if not isinstance(text, str):
        return False
    return text.strip().startswith('<environment_context>')
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def iter_session_files(input_path: Path) -> List[Path]:
    """Resolve *input_path* to a sorted list of session JSONL files.

    A single file is returned as a one-element list; a directory is searched
    recursively for ``*.jsonl``. A nonexistent path raises
    :class:`CodexConversionError`.
    """
    if input_path.is_file():
        return [input_path]
    if input_path.is_dir():
        return sorted(input_path.rglob('*.jsonl'))
    raise CodexConversionError(f'Input path does not exist: {input_path}')
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load every non-blank line of *path* as a JSON value.

    Raises :class:`CodexConversionError` (with file and line number) on the
    first line that fails to decode.
    """
    loaded: List[Dict[str, Any]] = []
    with path.open('r', encoding='utf-8') as handle:
        for line_number, raw in enumerate(handle, start=1):
            stripped = raw.strip()
            if not stripped:
                continue  # tolerate blank separator lines
            try:
                loaded.append(json.loads(stripped))
            except json.JSONDecodeError as exc:
                raise CodexConversionError(f'Invalid JSON at {path}:{line_number}: {exc}') from exc
    return loaded
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def build_meta(builder: TurnBuilder) -> Dict[str, Any]:
    """Assemble the per-turn export metadata dict from a finished builder.

    Counts assistant tool calls, tool-result messages, and non-empty
    reasoning blocks; status is 500 when any turn error was recorded,
    200 otherwise.
    """
    assistant_msgs = [m for m in builder.messages if m.get('role') == 'assistant']
    tool_msgs = [m for m in builder.messages if m.get('role') == 'tool']
    thinking_blocks = len([
        m
        for m in assistant_msgs
        if isinstance(m.get('reasoning_content'), str) and m['reasoning_content'].strip()
    ])
    tool_call_blocks = sum(len(m.get('tool_calls') or []) for m in assistant_msgs)
    session = builder.session_meta
    # Provenance string: origin, session/turn ids, working dir, CLI version.
    source = (
        f"codex:{session.get('source') or 'cli'}:"
        f"session={session.get('id')}:turn={builder.turn_id}:"
        f"cwd={session.get('cwd')}:cli={session.get('cli_version')}"
    )
    return {
        'endpoint': 'codex/turn',
        'status': 500 if builder.error_messages else 200,
        'ts': builder.last_ts or builder.start_ts or '',
        'key': session.get('id'),
        'source': source,
        'requested_model': session.get('model'),
        'actual_model': session.get('model'),
        'stream': False,
        'thinking_level': session.get('reasoning_effort'),
        'reasoning_summary_mode': 'codex_reasoning_summary',
        'thinking_type': 'codex_turn',
        'thinking_budget_tokens': None,
        'max_output_tokens': None,
        'tool_spec_count': len(builder.tool_specs),
        'tool_choice': {'mode': 'session_trace'},
        'request_contains_non_text_content': False,
        'request_image_block_count': 0,
        'request_video_block_count': 0,
        'request_tool_call_block_count': 0,
        'request_tool_result_block_count': 0,
        'request_thinking_block_count': 0,
        'response_contains_non_text_content': False,
        'response_image_block_count': 0,
        'response_video_block_count': 0,
        'response_tool_call_block_count': tool_call_blocks,
        'response_tool_result_block_count': len(tool_msgs),
        'response_thinking_block_count': thinking_blocks,
        'request_truncated': False,
        'response_truncated': False,
        'lossy_source': bool(builder.lossy_reasons),
        'lossy_reasons': sorted(builder.lossy_reasons),
    }
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def convert_session_file(path: Path) -> Tuple[List[Dict[str, Any]], Counter]:
    """Convert one Codex session JSONL file into training records.

    The first entry must be ``session_meta``. ``turn_context`` entries update
    the (copied) session metadata; each ``task_started`` event opens a
    :class:`TurnBuilder`, intervening entries are ingested into it, and
    ``task_complete`` finalizes it into a record.

    Returns the list of records plus a Counter with ``input_files``,
    ``strict_records``, ``lossy_records``, and ``turns_total``.

    Raises CodexConversionError for empty files or missing session_meta.
    """
    entries = read_jsonl(path)
    if not entries:
        raise CodexConversionError(f'Empty session file: {path}')
    session_meta_payload = entries[0].get('payload') if entries[0].get('type') == 'session_meta' else None
    if not isinstance(session_meta_payload, dict):
        raise CodexConversionError(f'Missing session_meta in {path}')

    # Copy so turn_context mutations never touch the raw parsed entry.
    session_meta = dict(session_meta_payload)
    stats = Counter()
    stats['input_files'] += 1
    records: List[Dict[str, Any]] = []
    current: Optional[TurnBuilder] = None

    for entry in entries:
        entry_type = entry.get('type')
        payload = entry.get('payload') if isinstance(entry.get('payload'), dict) else {}
        if entry_type == 'turn_context':
            # Later context wins; keep the previous value when absent.
            session_meta['model'] = payload.get('model') or session_meta.get('model')
            session_meta['reasoning_effort'] = payload.get('effort') or session_meta.get('reasoning_effort')
            continue
        if entry_type == 'event_msg' and payload.get('type') == 'task_started':
            # Fall back to the entry timestamp when the event has no turn_id.
            turn_id = payload.get('turn_id') or entry.get('timestamp')
            current = TurnBuilder(session_meta, str(turn_id), entry.get('timestamp') or '')
            continue
        if current is None:
            # Entries outside any turn (before the first task_started) are dropped.
            continue
        current.ingest(entry)
        if entry_type == 'event_msg' and payload.get('type') == 'task_complete':
            record = current.finalize()
            if record is not None:
                records.append(record)
                if record['meta']['lossy_source']:
                    stats['lossy_records'] += 1
                else:
                    stats['strict_records'] += 1
                stats['turns_total'] += 1
            current = None

    return records, stats
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Convert Codex session JSONL files into Qwen3.5 training exports.

    Discovers session files under ``--input`` (default: ``<codex-home>/sessions``),
    converts each into strict/lossy training records, and writes JSONL and/or
    parquet outputs plus a ``manifest.json`` into a fresh timestamped directory
    under ``--output-root``. Session files that fail to convert are logged to
    ``invalid-records.jsonl`` and skipped rather than aborting the run.

    Args:
        argv: CLI arguments without the program name; ``None`` means "use the
            real command line".

    Returns:
        Process exit code 0 on success.

    Raises:
        SystemExit: when no session files are found (or argparse fails).
    """
    import sys

    # Bug fix: the previous `parse_args(argv or [])` discarded the real command
    # line whenever main() was called with no arguments (e.g. as a console
    # entry point), forcing an argparse error about --output-root. Fall back to
    # sys.argv instead; an explicitly passed list is still honored as-is.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    ensure_parquet_runtime(args.output_format)
    codex_home = Path(args.codex_home).expanduser().resolve()
    input_path = Path(args.input).expanduser().resolve() if args.input else (codex_home / 'sessions')
    session_files = iter_session_files(input_path)
    if args.limit > 0:
        session_files = session_files[: args.limit]
    if not session_files:
        raise SystemExit('No Codex session files found.')

    # One timestamped directory per run so repeated exports never clobber each other.
    out_dir = Path(args.output_root).expanduser().resolve() / f'qwen35-codex-session-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    out_dir.mkdir(parents=True, exist_ok=True)
    strict_path = out_dir / 'qwen35-train.jsonl'
    lossy_path = out_dir / 'qwen35-train-lossy.jsonl'
    invalid_path = out_dir / 'invalid-records.jsonl'
    strict_parquet_path = out_dir / 'qwen35-train.parquet'
    lossy_parquet_path = out_dir / 'qwen35-train-lossy.parquet'
    manifest_path = out_dir / 'manifest.json'

    jsonl_enabled = args.output_format in {'jsonl', 'both'}
    parquet_enabled = args.output_format in {'parquet', 'both'}
    strict_out = strict_path.open('w', encoding='utf-8') if jsonl_enabled else None
    lossy_out = lossy_path.open('w', encoding='utf-8') if jsonl_enabled else None
    invalid_out = invalid_path.open('w', encoding='utf-8')
    strict_writer = None
    lossy_writer = None
    strict_batch: List[Dict[str, Any]] = []
    lossy_batch: List[Dict[str, Any]] = []
    stats = Counter()

    try:
        for session_file in session_files:
            try:
                records, file_stats = convert_session_file(session_file)
                stats.update(file_stats)
            except Exception as exc:
                # Best-effort: log the failure and keep converting other files.
                stats['invalid_files'] += 1
                invalid_out.write(json.dumps({'path': str(session_file), 'error': str(exc)}, ensure_ascii=False) + '\n')
                continue

            for record in records:
                bucket = 'lossy' if record['meta']['lossy_source'] else 'strict'
                if bucket == 'strict':
                    if strict_out is not None:
                        strict_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if parquet_enabled:
                        strict_batch.append(record_to_parquet_row(record))
                        if len(strict_batch) >= BATCH_SIZE:
                            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
                            strict_batch = []
                else:
                    if lossy_out is not None:
                        lossy_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if parquet_enabled:
                        lossy_batch.append(record_to_parquet_row(record))
                        if len(lossy_batch) >= BATCH_SIZE:
                            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
                            lossy_batch = []
            # Progress line after each session file (cumulative counters).
            print(json.dumps({'processed_files': stats['input_files'], **dict(stats)}, ensure_ascii=False), flush=True)

        if parquet_enabled:
            # Flush any partially filled batches before closing.
            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
    finally:
        if strict_out is not None:
            strict_out.close()
        if lossy_out is not None:
            lossy_out.close()
        invalid_out.close()
        if strict_writer is not None:
            strict_writer.close()
        if lossy_writer is not None:
            lossy_writer.close()

    manifest = {
        'codex_home': str(codex_home),
        'input': str(input_path),
        'output_dir': str(out_dir),
        'input_files': [str(path) for path in session_files],
        'stats': dict(stats),
        'strict_records': stats.get('strict_records', 0),
        'lossy_records': stats.get('lossy_records', 0),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
if __name__ == '__main__':
    # Script entry point: forward the CLI arguments and exit with main()'s code.
    import sys

    raise SystemExit(main(sys.argv[1:]))
|