agentic-dataset-builder 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -91
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +149 -0
- package/dist/labeling.d.ts +12 -0
- package/dist/labeling.js +22 -0
- package/dist/parquet.d.ts +4 -0
- package/dist/parquet.js +115 -0
- package/dist/platform/paths.d.ts +4 -0
- package/dist/platform/paths.js +62 -0
- package/dist/schemas/qwen35.d.ts +338 -0
- package/dist/schemas/qwen35.js +139 -0
- package/dist/sources/claude.d.ts +2 -0
- package/dist/sources/claude.js +64 -0
- package/dist/sources/codex.d.ts +2 -0
- package/dist/sources/codex.js +261 -0
- package/dist/sources/pi.d.ts +2 -0
- package/dist/sources/pi.js +276 -0
- package/dist/utils/common.d.ts +7 -0
- package/dist/utils/common.js +46 -0
- package/dist/utils/jsonl.d.ts +2 -0
- package/dist/utils/jsonl.js +17 -0
- package/package.json +24 -12
- package/agentic_dataset/__init__.py +0 -1
- package/agentic_dataset/build_agentic_dataset.py +0 -368
- package/agentic_dataset/export_codex_session_to_qwen35.py +0 -466
- package/agentic_dataset/export_pi_session.py +0 -701
- package/agentic_dataset/export_pi_session_to_qwen35.py +0 -742
- package/agentic_dataset/export_qwen35_training.py +0 -1559
- package/agentic_dataset/label_qwen35_agentic.py +0 -156
- package/agentic_dataset/platform_paths.py +0 -85
- package/agentic_dataset/qwen35_training_record.py +0 -179
- package/bin/agentic-dataset-builder.js +0 -77
- package/requirements.txt +0 -2
- package/run.py +0 -8
|
@@ -1,742 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import argparse
|
|
5
|
-
import json
|
|
6
|
-
import statistics
|
|
7
|
-
from collections import Counter, defaultdict
|
|
8
|
-
from datetime import datetime, timezone
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
|
|
11
|
-
|
|
12
|
-
from .export_pi_session import ARTIFACT_KEY, compact_text, read_jsonl
|
|
13
|
-
from .export_qwen35_training import (
|
|
14
|
-
append_parquet_rows,
|
|
15
|
-
ensure_parquet_runtime,
|
|
16
|
-
record_to_parquet_row,
|
|
17
|
-
validate_record_payload,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
# Glob matching full (untruncated) exported Pi session files.
FULL_GLOB = '*.full.jsonl'
# Glob matching raw exported session files; only scanned with --include-raw.
RAW_GLOB = '*.raw.jsonl'
# Rows buffered per write batch — presumably consumed by the parquet append
# loop in main (not visible in this chunk; TODO confirm).
BATCH_SIZE = 1000
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class ConversionError(RuntimeError):
    """Raised when exported Pi session data cannot be converted to the Qwen3.5 schema."""
    pass
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse command-line arguments for the converter.

    A leading ``stats`` token selects stats mode (adds ``--json``); anything
    else is convert mode (adds output options). The resulting namespace
    carries the chosen mode in ``args.command``.
    """
    tokens = list(argv)
    is_stats = bool(tokens) and tokens[0] == 'stats'
    if is_stats:
        tokens = tokens[1:]

    parser = argparse.ArgumentParser(description='Convert exported Pi sessions into local Qwen3.5 training schema.')
    parser.add_argument('--input', required=True, help='Input full-session file or directory containing exported Pi sessions, or a generated Qwen35 export dir in stats mode.')

    if is_stats:
        parser.add_argument('--json', action='store_true', help='Emit machine-readable JSON.')
    else:
        parser.add_argument('--output-root', required=True, help='Output directory root for Qwen3.5 schema export.')
        parser.add_argument('--output-format', choices=('jsonl', 'parquet', 'both'), default='jsonl')
        parser.add_argument('--include-raw', action='store_true', help='Also read *.raw.jsonl files when scanning directories.')
        parser.add_argument('--limit', type=int, default=0, help='Limit number of input files processed.')

    args = parser.parse_args(tokens)
    args.command = 'stats' if is_stats else 'convert'
    return args
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def iter_input_files(input_path: Path, include_raw: bool) -> List[Path]:
    """Collect exported session files to process.

    A file path is returned as-is; a directory is scanned recursively for
    ``*.full.jsonl`` (plus ``*.raw.jsonl`` when *include_raw* is set),
    deduplicated by resolved path while preserving first-seen order.

    Raises:
        ConversionError: when *input_path* is neither a file nor a directory.
    """
    if input_path.is_file():
        return [input_path]
    if not input_path.is_dir():
        raise ConversionError(f'Input path does not exist: {input_path}')
    patterns = [FULL_GLOB]
    if include_raw:
        patterns.append(RAW_GLOB)
    deduped: Dict[str, Path] = {}
    for pattern in patterns:
        for path in sorted(input_path.rglob(pattern)):
            # resolve() hits the filesystem — call it once per path, not twice.
            resolved = path.resolve()
            deduped.setdefault(str(resolved), resolved)
    return list(deduped.values())
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def build_tree(entries: List[Dict[str, Any]]) -> Tuple[Dict[str, Dict[str, Any]], Dict[Optional[str], List[str]]]:
    """Index session entries by id and group child ids under each parent id.

    Entries whose ``id`` is not a string are skipped entirely. Root entries
    (no ``parentId``) are grouped under the ``None`` key.
    """
    index: Dict[str, Dict[str, Any]] = {}
    child_map: Dict[Optional[str], List[str]] = defaultdict(list)
    for item in entries:
        node_id = item.get('id')
        if isinstance(node_id, str):
            index[node_id] = item
            child_map[item.get('parentId')].append(node_id)
    return index, child_map
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def leaf_ids(by_id: Dict[str, Dict[str, Any]], children: Dict[Optional[str], List[str]]) -> List[str]:
    """Return the sorted ids of entries that have no children (tree leaves)."""
    return sorted(entry_id for entry_id in by_id if not children.get(entry_id))
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def path_to_leaf(leaf_id: str, by_id: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the entries from the root down to *leaf_id*, inclusive.

    Walks parent links upward from the leaf and returns the chain in
    root-first order; stops early if a parent id is missing from *by_id*.
    """
    chain: List[Dict[str, Any]] = []
    cursor: Optional[str] = leaf_id
    while cursor is not None:
        node = by_id.get(cursor)
        if node is None:
            break
        chain.append(node)
        parent = node.get('parentId')
        cursor = parent if isinstance(parent, str) else None
    return chain[::-1]
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def _media_placeholder(block: Dict[str, Any], media_type: str) -> Dict[str, Any]:
    """Build a placeholder block for an inline image/video, keeping mime/data metadata."""
    metadata: Dict[str, Any] = {}
    if isinstance(block.get('mimeType'), str):
        metadata['mimeType'] = block['mimeType']
    if isinstance(block.get('data'), str):
        metadata['data'] = block['data']
    return {
        'type': media_type,
        'placeholder': True,
        'placeholder_token': f'<{media_type}>',
        'source_kind': f'pi_session_inline_{media_type}',
        'metadata': metadata or None,
    }


def convert_content_blocks(
    content: Any,
    lossy_reasons: set[str],
    unsupported_reason_prefix: str,
) -> Any:
    """Normalize Pi message content into the Qwen3.5 content shape.

    Strings pass through untouched. Lists are converted block by block:
    text blocks are kept, image/video blocks become placeholders, and
    anything else is JSON-dumped into a text block (tagged lossy). Non-list,
    non-string content is JSON-dumped whole and tagged lossy. A result of a
    single text block collapses to a plain string; no blocks yield ``''``.
    *lossy_reasons* is mutated in place with tags prefixed by
    *unsupported_reason_prefix*.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        lossy_reasons.add(f'{unsupported_reason_prefix}_nonstandard_content')
        return json.dumps(content, ensure_ascii=False, sort_keys=True)

    blocks: List[Dict[str, Any]] = []
    for block in content:
        if not isinstance(block, dict):
            lossy_reasons.add(f'{unsupported_reason_prefix}_non_dict_block')
            continue
        block_type = block.get('type')
        if block_type == 'text':
            blocks.append({'type': 'text', 'text': block.get('text', '')})
        elif block_type in ('image', 'video'):
            # Previously two copy-pasted branches; one helper covers both.
            blocks.append(_media_placeholder(block, block_type))
        else:
            lossy_reasons.add(f'{unsupported_reason_prefix}_unsupported_block_{block_type}')
            blocks.append({'type': 'text', 'text': json.dumps(block, ensure_ascii=False, sort_keys=True)})
    if not blocks:
        return ''
    if len(blocks) == 1 and blocks[0].get('type') == 'text':
        return blocks[0]['text']
    return blocks
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def convert_assistant_message(message: Dict[str, Any], lossy_reasons: set[str], tools_seen: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
    """Convert an assistant session message into a Qwen3.5 assistant payload.

    Text blocks are gathered (collapsing to a plain string when exactly one
    text block remains), ``thinking`` blocks are joined into
    ``reasoning_content``, and ``toolCall`` blocks become function-style
    ``tool_calls`` (registering each tool name in *tools_seen*). Inline
    image/video blocks are routed through convert_content_blocks; anything
    unrecognized is JSON-dumped and tagged in *lossy_reasons*.
    """
    content = message.get('content')
    collected: List[Dict[str, Any]] = []
    thoughts: List[str] = []
    calls: List[Dict[str, Any]] = []

    if isinstance(content, str):
        collected = [{'type': 'text', 'text': content}]
    elif isinstance(content, list):
        for block in content:
            if not isinstance(block, dict):
                lossy_reasons.add('assistant_non_dict_block')
                continue
            kind = block.get('type')
            if kind == 'text':
                collected.append({'type': 'text', 'text': block.get('text', '')})
            elif kind == 'thinking':
                thought = block.get('thinking')
                if isinstance(thought, str) and thought:
                    thoughts.append(thought)
            elif kind == 'toolCall':
                tool_name = block.get('name') or 'unknown_tool'
                raw_args = block.get('arguments')
                calls.append(
                    {
                        'type': 'function',
                        'id': block.get('id'),
                        'function': {
                            'name': tool_name,
                            'arguments': raw_args if isinstance(raw_args, dict) else {},
                        },
                    }
                )
                tools_seen.setdefault(tool_name, {'name': tool_name})
            elif kind in {'image', 'video'}:
                converted = convert_content_blocks([block], lossy_reasons, 'assistant')
                if isinstance(converted, list):
                    collected.extend(converted)
                elif isinstance(converted, str):
                    collected.append({'type': 'text', 'text': converted})
            else:
                lossy_reasons.add(f'assistant_unsupported_block_{kind}')
                collected.append({'type': 'text', 'text': json.dumps(block, ensure_ascii=False, sort_keys=True)})
    else:
        lossy_reasons.add('assistant_nonstandard_content')
        collected = [{'type': 'text', 'text': json.dumps(content, ensure_ascii=False, sort_keys=True)}]

    # Collapse the gathered text blocks the same way convert_content_blocks does.
    if not collected:
        final_content: Any = ''
    elif len(collected) == 1 and collected[0].get('type') == 'text':
        final_content = collected[0]['text']
    else:
        final_content = collected

    result: Dict[str, Any] = {'role': 'assistant', 'content': final_content}
    if thoughts:
        result['reasoning_content'] = '\n\n'.join(thoughts)
    if calls:
        result['tool_calls'] = calls
    return result
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
def embedded_artifact_text(node: Dict[str, Any]) -> Optional[str]:
    """Return the embedded artifact text carried by *node*, if any.

    utf-8 artifacts yield their text verbatim; base64 artifacts yield a short
    placeholder marker; anything else yields None.
    """
    payload = node.get(f'{ARTIFACT_KEY}Embedded')
    if not isinstance(payload, dict):
        return None
    encoding = payload.get('encoding')
    if encoding == 'utf-8' and isinstance(payload.get('text'), str):
        return payload['text']
    if encoding == 'base64' and isinstance(payload.get('base64'), str):
        return '[binary artifact embedded as base64]'
    return None
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
def format_bash_execution(message: Dict[str, Any], lossy_reasons: set[str]) -> str:
    """Serialize a bashExecution message into a JSON tool-output string.

    Prefers the embedded full artifact text (from the message itself or its
    ``details``) over the possibly truncated inline ``output``. When the
    output was truncated and no full artifact is embedded despite an artifact
    reference being present, a ``missing_embedded_full_output`` lossy reason
    is recorded.
    """
    inline_output = message.get('output')
    if not isinstance(inline_output, str):
        inline_output = ''
    full_text = embedded_artifact_text(message)
    details = message.get('details')
    if full_text is None and isinstance(details, dict):
        full_text = embedded_artifact_text(details)
    truncated = bool(message.get('truncated'))
    if truncated and full_text is None and isinstance(message.get(ARTIFACT_KEY), str):
        lossy_reasons.add('missing_embedded_full_output')
    return json.dumps(
        {
            'command': message.get('command'),
            'exit_code': message.get('exitCode'),
            'cancelled': message.get('cancelled', False),
            'truncated': truncated,
            'exclude_from_context': message.get('excludeFromContext', False),
            'output': inline_output if full_text is None else full_text,
        },
        ensure_ascii=False,
        sort_keys=True,
    )
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
def synthetic_message_from_entry(entry: Dict[str, Any], label: str, text: Optional[str], lossy_reasons: set[str]) -> Optional[Dict[str, Any]]:
    """Wrap summary *text* in a labelled synthetic assistant message.

    Returns None when *text* is missing or blank; otherwise records a
    ``synthetic_<label>_message`` lossy reason and returns the message.
    *entry* is currently unread — kept for interface stability.
    """
    if isinstance(text, str) and text.strip():
        lossy_reasons.add(f'synthetic_{label}_message')
        return {'role': 'assistant', 'content': f'[{label}]\n{text.strip()}'}
    return None
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
def entry_has_missing_artifact(node: Any) -> bool:
    """Recursively detect an artifact reference without its embedded payload.

    True when any dict nested in *node* holds a string value under
    ARTIFACT_KEY but lacks the sibling ``<ARTIFACT_KEY>Embedded`` key.
    """
    if isinstance(node, list):
        return any(entry_has_missing_artifact(item) for item in node)
    if not isinstance(node, dict):
        return False
    embedded_key = f'{ARTIFACT_KEY}Embedded'
    for key, value in node.items():
        if key == ARTIFACT_KEY and isinstance(value, str) and embedded_key not in node:
            return True
        if entry_has_missing_artifact(value):
            return True
    return False
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
def convert_entry_to_messages(
    entry: Dict[str, Any],
    lossy_reasons: set[str],
    tools_seen: Dict[str, Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Translate one session-tree entry into zero or more chat messages.

    Mutates *lossy_reasons* (tags describing lossy conversions) and
    *tools_seen* (registry of tool names observed on the branch). Note this
    may return ``[None]`` when a summary value is truthy but not a usable
    string (synthetic_message_from_entry returns None then) — callers are
    expected to filter falsy messages.
    """
    entry_type = entry.get('type')
    if entry_type == 'message':
        message = entry.get('message')
        if not isinstance(message, dict):
            lossy_reasons.add('message_entry_missing_payload')
            return []
        role = message.get('role')
        if role == 'user':
            return [{'role': 'user', 'content': convert_content_blocks(message.get('content'), lossy_reasons, 'user')}]
        if role == 'assistant':
            return [convert_assistant_message(message, lossy_reasons, tools_seen)]
        if role == 'toolResult':
            tool_name = message.get('toolName')
            if isinstance(tool_name, str) and tool_name:
                tools_seen.setdefault(tool_name, {'name': tool_name})
            return [
                {
                    'role': 'tool',
                    'content': convert_content_blocks(message.get('content'), lossy_reasons, 'tool_result'),
                    'tool_call_id': message.get('toolCallId'),
                    'name': tool_name,
                }
            ]
        if role == 'bashExecution':
            # Bash runs are modeled as a synthetic 'bash' tool invocation.
            tools_seen.setdefault('bash', {'name': 'bash'})
            return [{'role': 'tool', 'content': format_bash_execution(message, lossy_reasons), 'name': 'bash'}]
        if role == 'custom':
            custom_type = message.get('customType') or 'custom'
            custom_content = message.get('content')
            converted = convert_content_blocks(custom_content, lossy_reasons, 'custom')
            lossy_reasons.add('synthetic_custom_message')
            # String content gets a [custom:<type>] prefix; block lists pass through unlabelled.
            return [{'role': 'assistant', 'content': f'[custom:{custom_type}]\n{converted}' if isinstance(converted, str) else converted}]
        if role == 'branchSummary':
            return [synthetic_message_from_entry(entry, 'branch_summary', message.get('summary'), lossy_reasons)] if message.get('summary') else []
        if role == 'compactionSummary':
            return [synthetic_message_from_entry(entry, 'compaction_summary', message.get('summary'), lossy_reasons)] if message.get('summary') else []
        # Unknown roles are preserved verbatim as a JSON dump so no data is dropped.
        lossy_reasons.add(f'unsupported_message_role_{role}')
        return [{'role': 'assistant', 'content': json.dumps(message, ensure_ascii=False, sort_keys=True)}]

    if entry_type == 'branch_summary':
        return [synthetic_message_from_entry(entry, 'branch_summary', entry.get('summary'), lossy_reasons)] if entry.get('summary') else []
    if entry_type == 'compaction':
        return [synthetic_message_from_entry(entry, 'compaction_summary', entry.get('summary'), lossy_reasons)] if entry.get('summary') else []
    if entry_type == 'custom_message':
        custom_type = entry.get('customType') or 'custom'
        converted = convert_content_blocks(entry.get('content'), lossy_reasons, 'custom_message')
        lossy_reasons.add('synthetic_custom_message')
        return [{'role': 'assistant', 'content': f'[custom:{custom_type}]\n{converted}' if isinstance(converted, str) else converted}]
    # Other entry types (model changes, labels, session info, ...) are handled
    # or deliberately skipped by the caller before reaching this function.
    return []
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
def count_blocks(content: Any) -> Dict[str, int]:
    """Count image/video placeholder blocks inside a message content value.

    Non-list content (e.g. a plain string) yields all-zero counts.
    """
    tally = {
        'contains_non_text_content': False,
        'image_block_count': 0,
        'video_block_count': 0,
    }
    if not isinstance(content, list):
        return tally
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = block.get('type')
        if kind == 'image':
            tally['contains_non_text_content'] = True
            tally['image_block_count'] += 1
        elif kind == 'video':
            tally['contains_non_text_content'] = True
            tally['video_block_count'] += 1
    return tally
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
def compute_meta_counts(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate per-side block statistics over a message list.

    system/user messages count toward the ``request_*`` side; every other
    role toward ``response_*``. The request tool/thinking counters exist for
    schema symmetry but are never incremented here.
    """
    totals: Dict[str, Any] = {}
    for side in ('request', 'response'):
        totals[f'{side}_contains_non_text_content'] = False
        totals[f'{side}_image_block_count'] = 0
        totals[f'{side}_video_block_count'] = 0
        totals[f'{side}_tool_call_block_count'] = 0
        totals[f'{side}_tool_result_block_count'] = 0
        totals[f'{side}_thinking_block_count'] = 0
    for message in messages:
        role = message.get('role')
        side = 'request' if role in {'system', 'user'} else 'response'
        blocks = count_blocks(message.get('content'))
        if blocks['contains_non_text_content']:
            totals[f'{side}_contains_non_text_content'] = True
        totals[f'{side}_image_block_count'] += blocks['image_block_count']
        totals[f'{side}_video_block_count'] += blocks['video_block_count']
        if role == 'assistant':
            totals['response_tool_call_block_count'] += len(message.get('tool_calls') or [])
            reasoning = message.get('reasoning_content')
            if isinstance(reasoning, str) and reasoning.strip():
                totals['response_thinking_block_count'] += 1
        if role == 'tool':
            totals['response_tool_result_block_count'] += 1
    return totals
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
def build_record_for_path(
    header: Dict[str, Any],
    path_entries: List[Dict[str, Any]],
    source_path: Path,
    leaf_id: str,
    branch_index: int,
    branch_count: int,
) -> Dict[str, Any]:
    """Convert one root-to-leaf branch of a session tree into a training record.

    Walks *path_entries* in order, folding model and thinking-level changes
    into branch metadata and converting everything else into chat messages.
    The record is validated via validate_record_payload before return.

    *branch_index* is currently unread (kept for call-site symmetry);
    *branch_count* > 1 marks the record as coming from a branched session.

    Raises:
        ConversionError: when the branch contains no user messages.
    """
    lossy_reasons: set[str] = set()
    tools_seen: Dict[str, Dict[str, Any]] = {}
    messages: List[Dict[str, Any]] = []
    models_seen: List[str] = []
    thinking_levels: List[str] = []

    for entry in path_entries:
        # Any artifact reference lacking its embedded payload makes the record lossy.
        if entry_has_missing_artifact(entry):
            lossy_reasons.add('missing_embedded_artifact')
        if entry.get('type') == 'model_change':
            model_id = entry.get('modelId')
            provider = entry.get('provider')
            if isinstance(model_id, str):
                models_seen.append(f'{provider}/{model_id}' if provider else model_id)
            continue
        if entry.get('type') == 'thinking_level_change':
            level = entry.get('thinkingLevel')
            if isinstance(level, str):
                thinking_levels.append(level)
            continue
        # Bookkeeping entries carry no trainable content.
        if entry.get('type') in {'session_info', 'label', 'custom'}:
            continue
        for message in convert_entry_to_messages(entry, lossy_reasons, tools_seen):
            # convert_entry_to_messages may yield None placeholders; drop them.
            if message:
                messages.append(message)

    if not any(message.get('role') == 'user' for message in messages):
        raise ConversionError(f'No user messages found on branch {leaf_id} from {source_path}')

    if branch_count > 1:
        lossy_reasons.add('session_tree_branch_selected')

    export_info = header.get('exportInfo') if isinstance(header.get('exportInfo'), dict) else None
    if isinstance(export_info, dict) and int(export_info.get('missingArtifactCount') or 0) > 0:
        lossy_reasons.add('source_export_missing_artifacts')

    # More than one distinct model or thinking level on the same branch means
    # the trace mixes generation settings — flag it rather than guessing.
    if len(set(models_seen)) > 1:
        lossy_reasons.add('multiple_models_on_branch')
    if len(set(thinking_levels)) > 1:
        lossy_reasons.add('multiple_thinking_levels_on_branch')

    tools = list(tools_seen.values())
    counts = compute_meta_counts(messages)
    meta = {
        'endpoint': 'pi/session_branch',
        'status': 200,
        # Timestamp of the last entry on the branch, falling back to the header.
        'ts': path_entries[-1].get('timestamp') or header.get('timestamp') or '',
        'key': header.get('id'),
        'source': f'{source_path}#leaf={leaf_id}',
        'requested_model': models_seen[0] if models_seen else None,
        'actual_model': models_seen[-1] if models_seen else None,
        'stream': False,
        'thinking_level': thinking_levels[-1] if thinking_levels else None,
        'reasoning_summary_mode': 'pi_session_branch',
        'thinking_type': 'pi_session',
        'thinking_budget_tokens': None,
        'max_output_tokens': None,
        'tool_spec_count': len(tools),
        'tool_choice': {'mode': 'session_trace'},
        'request_truncated': False,
        'response_truncated': 'missing_embedded_full_output' in lossy_reasons,
        'lossy_source': bool(lossy_reasons),
        'lossy_reasons': sorted(lossy_reasons),
        **counts,
    }

    record = {
        'id': f"{header.get('id')}:{leaf_id}",
        'request_id': header.get('id'),
        'messages': messages,
        'tools': tools,
        'meta': meta,
    }
    validate_record_payload(record)
    return record
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
def convert_file(path: Path) -> Tuple[List[Dict[str, Any]], Counter]:
    """Convert one exported session file into Qwen3.5 records plus counters.

    The first JSONL line is the session header; the remaining lines form the
    entry tree. One record is produced per leaf branch, counted as strict or
    lossy according to its meta flag.

    Raises:
        ConversionError: when the file contains no header line (previously
            this surfaced as an opaque IndexError on ``entries[0]``).
    """
    entries = read_jsonl(path)
    if not entries:
        raise ConversionError(f'Empty session file: {path}')
    header = entries[0]
    body = entries[1:]
    by_id, children = build_tree(body)
    leaves = leaf_ids(by_id, children)
    stats = Counter()
    stats['input_files'] += 1
    stats['branches_total'] += len(leaves)
    records: List[Dict[str, Any]] = []
    for index, leaf_id in enumerate(leaves, start=1):
        branch_path = path_to_leaf(leaf_id, by_id)
        record = build_record_for_path(header, branch_path, path, leaf_id, index, len(leaves))
        records.append(record)
        bucket = 'lossy_records' if record['meta']['lossy_source'] else 'strict_records'
        stats[bucket] += 1
    return records, stats
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
def load_qwen_records(input_path: Path) -> List[Dict[str, Any]]:
    """Load previously exported Qwen3.5 jsonl records for stats mode.

    Accepts either a single jsonl file or an export directory containing
    ``qwen35-train.jsonl`` / ``qwen35-train-lossy.jsonl``. Each record gains a
    ``_bucket`` marker: 'lossy' when the source file name contains 'lossy',
    otherwise 'strict'. Blank lines are skipped.

    Raises:
        ConversionError: when no jsonl files are found.
    """
    if input_path.is_file():
        sources = [input_path]
    elif input_path.is_dir():
        sources = [
            candidate
            for name in ('qwen35-train.jsonl', 'qwen35-train-lossy.jsonl')
            if (candidate := input_path / name).exists()
        ]
    else:
        sources = []
    if not sources:
        raise ConversionError(f'No Qwen35 jsonl files found under {input_path}')

    loaded: List[Dict[str, Any]] = []
    for source in sources:
        marker = 'lossy' if 'lossy' in source.name else 'strict'
        with source.open('r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if stripped:
                    parsed = json.loads(stripped)
                    parsed['_bucket'] = marker
                    loaded.append(parsed)
    return loaded
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
def content_char_count(content: Any) -> int:
    """Approximate the character length of a message content value.

    Strings count directly; lists sum their blocks' text fields (falling
    back to a sorted-key JSON dump for dict blocks without text, and str()
    for anything else); non-string, non-list content is JSON-dumped whole.
    """
    if isinstance(content, str):
        return len(content)
    if not isinstance(content, list):
        return len(json.dumps(content, ensure_ascii=False, sort_keys=True))
    total = 0
    for block in content:
        if not isinstance(block, dict):
            total += len(str(block))
        elif isinstance(block.get('text'), str):
            total += len(block['text'])
        else:
            total += len(json.dumps(block, ensure_ascii=False, sort_keys=True))
    return total
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
def stat_summary(values: List[int]) -> Optional[Dict[str, Any]]:
    """Summarize a list of ints as min/median/mean/max; None for empty input.

    The mean is rounded to two decimals; min/median/max are exact.
    """
    if not values:
        return None
    return {
        'min': min(values),
        'median': statistics.median(values),
        'mean': round(statistics.mean(values), 2),
        'max': max(values),
    }
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
def build_stats_report(records: List[Dict[str, Any]], input_path: Path) -> Dict[str, Any]:
    """Aggregate per-record and corpus-wide statistics over loaded records.

    *records* are dicts as returned by load_qwen_records (each carrying a
    ``_bucket`` marker). The report contains stat_summary() distributions for
    message/role counts, tool calls, and reasoning volume, plus reasoning
    coverage ratios and a detailed ``per_record`` list.
    """
    # Parallel accumulators — one value per record, summarized at the end.
    message_counts: List[int] = []
    user_counts: List[int] = []
    assistant_counts: List[int] = []
    tool_counts: List[int] = []
    round_counts: List[int] = []
    tool_call_counts: List[int] = []
    reasoning_message_counts: List[int] = []
    reasoning_char_counts: List[int] = []
    total_char_counts: List[int] = []
    per_record: List[Dict[str, Any]] = []
    assistant_total = 0
    assistant_with_reasoning_total = 0

    for record in records:
        messages = record.get('messages', [])
        users = [message for message in messages if message.get('role') == 'user']
        assistants = [message for message in messages if message.get('role') == 'assistant']
        tools = [message for message in messages if message.get('role') == 'tool']
        reasoning_messages = [
            message
            for message in assistants
            if isinstance(message.get('reasoning_content'), str) and message['reasoning_content'].strip()
        ]
        reasoning_chars = sum(len(message['reasoning_content']) for message in reasoning_messages)
        tool_calls = sum(len(message.get('tool_calls') or []) for message in assistants)
        total_chars = sum(content_char_count(message.get('content')) for message in messages)

        message_counts.append(len(messages))
        user_counts.append(len(users))
        assistant_counts.append(len(assistants))
        tool_counts.append(len(tools))
        # Dialogue rounds are estimated as one round per user message.
        round_counts.append(len(users))
        tool_call_counts.append(tool_calls)
        reasoning_message_counts.append(len(reasoning_messages))
        reasoning_char_counts.append(reasoning_chars)
        total_char_counts.append(total_chars)
        assistant_total += len(assistants)
        assistant_with_reasoning_total += len(reasoning_messages)

        per_record.append(
            {
                'id': record.get('id'),
                'bucket': record.get('_bucket'),
                'messages': len(messages),
                'users': len(users),
                'assistants': len(assistants),
                'tools': len(tools),
                'dialogue_rounds_est': len(users),
                'tool_calls': tool_calls,
                'reasoning_messages': len(reasoning_messages),
                'reasoning_chars': reasoning_chars,
                'content_chars': total_chars,
                'lossy_reasons': record.get('meta', {}).get('lossy_reasons', []),
            }
        )

    records_with_reasoning = sum(1 for count in reasoning_message_counts if count > 0)
    report = {
        'input': str(input_path),
        'records': len(records),
        'strict_records': sum(1 for record in records if record.get('_bucket') == 'strict'),
        'lossy_records': sum(1 for record in records if record.get('_bucket') == 'lossy'),
        'message_count': stat_summary(message_counts),
        'user_messages': stat_summary(user_counts),
        'assistant_messages': stat_summary(assistant_counts),
        'tool_messages': stat_summary(tool_counts),
        'dialogue_rounds_est': stat_summary(round_counts),
        'assistant_tool_calls': stat_summary(tool_call_counts),
        'assistant_reasoning_messages': stat_summary(reasoning_message_counts),
        'reasoning_chars_total_per_record': stat_summary(reasoning_char_counts),
        'content_chars_total': stat_summary(total_char_counts),
        'records_with_reasoning': records_with_reasoning,
        # Ratios guard against division by zero on empty inputs.
        'records_with_reasoning_ratio': round(records_with_reasoning / len(records), 4) if records else 0.0,
        'assistant_messages_with_reasoning': assistant_with_reasoning_total,
        'assistant_messages_total': assistant_total,
        'assistant_reasoning_coverage': round(assistant_with_reasoning_total / assistant_total, 4) if assistant_total else 0.0,
        'per_record': per_record,
    }
    return report
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
def print_stats_report(report: Dict[str, Any], as_json: bool) -> int:
    """Print the stats *report* (JSON or human-readable); always returns 0."""
    if as_json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
        return 0
    print(f"input: {report['input']}")
    print(f"records: {report['records']} (strict={report['strict_records']}, lossy={report['lossy_records']})")
    print(f"records with reasoning: {report['records_with_reasoning']} ({report['records_with_reasoning_ratio']:.2%})")
    coverage_line = (
        f"assistant reasoning coverage: {report['assistant_messages_with_reasoning']}/"
        f"{report['assistant_messages_total']} ({report['assistant_reasoning_coverage']:.2%})"
    )
    print(coverage_line)
    # Summary distributions, one labelled line each.
    for label, key in (
        ('message count', 'message_count'),
        ('dialogue rounds est', 'dialogue_rounds_est'),
        ('assistant tool calls', 'assistant_tool_calls'),
        ('assistant reasoning messages', 'assistant_reasoning_messages'),
        ('reasoning chars per record', 'reasoning_chars_total_per_record'),
    ):
        print(f"{label}: {report[key]}")
    print('per record:')
    for item in report['per_record']:
        print(
            f"  - {item['id']} [{item['bucket']}] msgs={item['messages']} rounds={item['dialogue_rounds_est']} "
            f"tool_calls={item['tool_calls']} reasoning_msgs={item['reasoning_messages']} reasoning_chars={item['reasoning_chars']}"
        )
    return 0
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point.

    Two modes:
      * ``stats``: load a previously exported Qwen record file and print a
        summary report (JSON or plain text depending on ``--json``).
      * default: convert exported Pi session files into Qwen3.5 training
        records, split into strict/lossy buckets, write JSONL and/or Parquet
        outputs, and emit a manifest describing the run.

    Returns a process exit code (0 on success).
    """
    args = parse_args(argv or [])

    if args.command == 'stats':
        input_path = Path(args.input).expanduser().resolve()
        records = load_qwen_records(input_path)
        report = build_stats_report(records, input_path)
        return print_stats_report(report, args.json)

    # Presumably verifies a parquet runtime (e.g. pyarrow) is importable when
    # parquet output was requested -- confirm in ensure_parquet_runtime.
    ensure_parquet_runtime(args.output_format)

    input_path = Path(args.input).expanduser().resolve()
    input_files = iter_input_files(input_path, args.include_raw)
    if args.limit > 0:
        # --limit caps how many source files are processed.
        input_files = input_files[: args.limit]
    if not input_files:
        raise SystemExit('No exported Pi session files found.')

    # Timestamped (UTC) output directory keeps repeated runs from clobbering
    # each other.
    out_dir = Path(args.output_root).expanduser().resolve() / f'qwen35-pi-session-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    out_dir.mkdir(parents=True, exist_ok=True)
    strict_path = out_dir / 'qwen35-train.jsonl'
    lossy_path = out_dir / 'qwen35-train-lossy.jsonl'
    invalid_path = out_dir / 'invalid-records.jsonl'
    strict_parquet_path = out_dir / 'qwen35-train.parquet'
    lossy_parquet_path = out_dir / 'qwen35-train-lossy.parquet'
    manifest_path = out_dir / 'manifest.json'

    stats = Counter()
    # JSONL handles are only opened when that output format was requested.
    strict_out = strict_path.open('w', encoding='utf-8') if args.output_format in {'jsonl', 'both'} else None
    lossy_out = lossy_path.open('w', encoding='utf-8') if args.output_format in {'jsonl', 'both'} else None
    invalid_out = invalid_path.open('w', encoding='utf-8')
    # Parquet writers are created lazily by append_parquet_rows on the first
    # flushed batch; rows are buffered up to BATCH_SIZE to bound memory.
    strict_writer = None
    lossy_writer = None
    strict_batch: List[Dict[str, Any]] = []
    lossy_batch: List[Dict[str, Any]] = []

    try:
        for path in input_files:
            try:
                records, file_stats = convert_file(path)
                stats.update(file_stats)
            except Exception as exc:
                # Best-effort conversion: a single bad file is recorded in
                # invalid-records.jsonl and skipped rather than aborting the run.
                stats['invalid_files'] += 1
                invalid_out.write(json.dumps({'path': str(path), 'error': str(exc)}, ensure_ascii=False) + '\n')
                continue

            for record in records:
                # Records flagged as coming from a lossy source go to the
                # separate "lossy" outputs.
                bucket = 'lossy' if record['meta']['lossy_source'] else 'strict'
                if bucket == 'strict':
                    if strict_out is not None:
                        strict_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if args.output_format in {'parquet', 'both'}:
                        strict_batch.append(record_to_parquet_row(record))
                        if len(strict_batch) >= BATCH_SIZE:
                            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
                            strict_batch = []
                else:
                    if lossy_out is not None:
                        lossy_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if args.output_format in {'parquet', 'both'}:
                        lossy_batch.append(record_to_parquet_row(record))
                        if len(lossy_batch) >= BATCH_SIZE:
                            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
                            lossy_batch = []
            # Per-file progress line as machine-readable JSON on stdout.
            print(json.dumps({'processed_files': stats['input_files'], **dict(stats)}, ensure_ascii=False), flush=True)

        if args.output_format in {'parquet', 'both'}:
            # Flush any partially filled final batches.
            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
    finally:
        # Close all handles/writers even when conversion raised part-way through.
        if strict_out is not None:
            strict_out.close()
        if lossy_out is not None:
            lossy_out.close()
        invalid_out.close()
        if strict_writer is not None:
            strict_writer.close()
        if lossy_writer is not None:
            lossy_writer.close()

    manifest = {
        'input': str(input_path),
        'output_dir': str(out_dir),
        'input_files': [str(path) for path in input_files],
        'stats': dict(stats),
        'strict_records': stats.get('strict_records', 0),
        'lossy_records': stats.get('lossy_records', 0),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    # Echo the manifest to stdout so callers can capture it without reading the file.
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
if __name__ == '__main__':
    # Script entry point: forward CLI args (without argv[0]) and propagate the
    # exit code.  A plain local import replaces the unidiomatic
    # __import__('sys') expression used previously.
    import sys

    raise SystemExit(main(sys.argv[1:]))
|