agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,742 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import statistics
7
+ from collections import Counter, defaultdict
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
11
+
12
+ from .export_pi_session import ARTIFACT_KEY, compact_text, read_jsonl
13
+ from .export_qwen35_training import (
14
+ append_parquet_rows,
15
+ ensure_parquet_runtime,
16
+ record_to_parquet_row,
17
+ validate_record_payload,
18
+ )
19
+
20
+ FULL_GLOB = '*.full.jsonl'
21
+ RAW_GLOB = '*.raw.jsonl'
22
+ BATCH_SIZE = 1000
23
+
24
+
25
class ConversionError(RuntimeError):
    """Raised when an exported Pi session cannot be converted.

    Used for missing input paths, export dirs with no Qwen35 jsonl files,
    and branches that contain no user messages.
    """
    pass
27
+
28
+
29
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse CLI arguments for the default `convert` mode or the `stats` mode.

    The subcommand is detected positionally: a leading ``stats`` token selects
    stats mode and is stripped before argparse runs; anything else falls
    through to conversion mode.  The parsed namespace always carries a
    ``command`` attribute ('stats' or 'convert').
    """
    tokens = list(argv)
    is_stats = bool(tokens) and tokens[0] == 'stats'
    if is_stats:
        tokens = tokens[1:]

    parser = argparse.ArgumentParser(description='Convert exported Pi sessions into local Qwen3.5 training schema.')
    parser.add_argument('--input', required=True, help='Input full-session file or directory containing exported Pi sessions, or a generated Qwen35 export dir in stats mode.')

    if is_stats:
        parser.add_argument('--json', action='store_true', help='Emit machine-readable JSON.')
    else:
        parser.add_argument('--output-root', required=True, help='Output directory root for Qwen3.5 schema export.')
        parser.add_argument('--output-format', choices=('jsonl', 'parquet', 'both'), default='jsonl')
        parser.add_argument('--include-raw', action='store_true', help='Also read *.raw.jsonl files when scanning directories.')
        parser.add_argument('--limit', type=int, default=0, help='Limit number of input files processed.')

    parsed = parser.parse_args(tokens)
    parsed.command = 'stats' if is_stats else 'convert'
    return parsed
52
+
53
+
54
def iter_input_files(input_path: Path, include_raw: bool) -> List[Path]:
    """Collect exported session files to process.

    A file path is returned as a one-element list.  A directory is scanned
    recursively for ``*.full.jsonl`` (and, when *include_raw* is set,
    ``*.raw.jsonl``) files; results are deduplicated by resolved path while
    preserving first-seen order.

    Raises ConversionError when *input_path* is neither a file nor a directory.
    """
    if input_path.is_file():
        return [input_path]
    if not input_path.is_dir():
        raise ConversionError(f'Input path does not exist: {input_path}')

    globs = [FULL_GLOB] + ([RAW_GLOB] if include_raw else [])
    matches: List[Path] = []
    for pattern in globs:
        matches.extend(sorted(input_path.rglob(pattern)))

    unique: Dict[str, Path] = {}
    for match in matches:
        resolved = match.resolve()
        unique.setdefault(str(resolved), resolved)
    return list(unique.values())
69
+
70
+
71
def build_tree(entries: List[Dict[str, Any]]) -> Tuple[Dict[str, Dict[str, Any]], Dict[Optional[str], List[str]]]:
    """Index session entries by id and group child ids under each parentId.

    Entries without a string ``id`` are skipped.  Root entries group under the
    ``None`` key of the children mapping.
    """
    by_id: Dict[str, Dict[str, Any]] = {}
    children: Dict[Optional[str], List[str]] = defaultdict(list)
    for item in entries:
        item_id = item.get('id')
        if isinstance(item_id, str):
            by_id[item_id] = item
            children[item.get('parentId')].append(item_id)
    return by_id, children
81
+
82
+
83
def leaf_ids(by_id: Dict[str, Dict[str, Any]], children: Dict[Optional[str], List[str]]) -> List[str]:
    """Return the sorted ids of entries with no children (branch tips)."""
    return sorted(entry_id for entry_id in by_id if not children.get(entry_id))
86
+
87
+
88
def path_to_leaf(leaf_id: str, by_id: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the entries from the root down to *leaf_id* via parentId links.

    Walks upward from the leaf and reverses the result.  An unknown parent id
    terminates the walk.  A ``parentId`` cycle in corrupt data is detected via
    a visited set and terminates the walk instead of looping forever (the
    original implementation would hang on such input).
    """
    ordered: List[Dict[str, Any]] = []
    visited: set[str] = set()
    current_id: Optional[str] = leaf_id
    while current_id is not None and current_id not in visited:
        visited.add(current_id)
        entry = by_id.get(current_id)
        if entry is None:
            break
        ordered.append(entry)
        parent_id = entry.get('parentId')
        current_id = parent_id if isinstance(parent_id, str) else None
    ordered.reverse()
    return ordered
100
+
101
+
102
def convert_content_blocks(
    content: Any,
    lossy_reasons: set[str],
    unsupported_reason_prefix: str,
) -> Any:
    """Normalize a Pi message ``content`` value into the training schema.

    Returns a plain string when possible (string input, or a single text
    block), otherwise a list of normalized block dicts.  Unknown shapes are
    serialized to JSON text and recorded in *lossy_reasons* (mutated in
    place) so the record can be routed to the lossy bucket.  The image and
    video branches were copy-paste duplicates; they now share one helper.
    """

    def media_placeholder(block: Dict[str, Any], kind: str) -> Dict[str, Any]:
        # Inline media is not trained on directly: keep a placeholder token
        # plus whatever metadata survives (mime type / base64 payload).
        metadata: Dict[str, Any] = {}
        if isinstance(block.get('mimeType'), str):
            metadata['mimeType'] = block['mimeType']
        if isinstance(block.get('data'), str):
            metadata['data'] = block['data']
        return {
            'type': kind,
            'placeholder': True,
            'placeholder_token': f'<{kind}>',
            'source_kind': f'pi_session_inline_{kind}',
            'metadata': metadata or None,
        }

    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        lossy_reasons.add(f'{unsupported_reason_prefix}_nonstandard_content')
        return json.dumps(content, ensure_ascii=False, sort_keys=True)

    blocks: List[Dict[str, Any]] = []
    for block in content:
        if not isinstance(block, dict):
            lossy_reasons.add(f'{unsupported_reason_prefix}_non_dict_block')
            continue
        block_type = block.get('type')
        if block_type == 'text':
            blocks.append({'type': 'text', 'text': block.get('text', '')})
        elif block_type in {'image', 'video'}:
            blocks.append(media_placeholder(block, block_type))
        else:
            # Unknown block type: keep its JSON form as text so nothing is lost.
            lossy_reasons.add(f'{unsupported_reason_prefix}_unsupported_block_{block_type}')
            blocks.append({'type': 'text', 'text': json.dumps(block, ensure_ascii=False, sort_keys=True)})
    if not blocks:
        return ''
    # Collapse a lone text block to a plain string per schema preference.
    if len(blocks) == 1 and blocks[0].get('type') == 'text':
        return blocks[0]['text']
    return blocks
159
+
160
+
161
def convert_assistant_message(message: Dict[str, Any], lossy_reasons: set[str], tools_seen: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
    """Convert a Pi assistant message into the training-schema shape.

    Splits the source content blocks into visible text, ``reasoning_content``
    (joined 'thinking' blocks), and OpenAI-style ``tool_calls``.  Unknown
    block shapes are JSON-serialized into text and flagged in *lossy_reasons*.
    Both *lossy_reasons* and *tools_seen* are mutated in place.
    """
    content = message.get('content')
    text_blocks: List[Dict[str, Any]] = []
    reasoning_chunks: List[str] = []
    tool_calls: List[Dict[str, Any]] = []

    if isinstance(content, str):
        text_blocks = [{'type': 'text', 'text': content}]
    elif isinstance(content, list):
        for block in content:
            if not isinstance(block, dict):
                lossy_reasons.add('assistant_non_dict_block')
                continue
            block_type = block.get('type')
            if block_type == 'text':
                text_blocks.append({'type': 'text', 'text': block.get('text', '')})
            elif block_type == 'thinking':
                # Only non-empty string thinking is kept; anything else is dropped silently.
                thinking = block.get('thinking')
                if isinstance(thinking, str) and thinking:
                    reasoning_chunks.append(thinking)
            elif block_type == 'toolCall':
                name = block.get('name') or 'unknown_tool'
                # Non-dict arguments are replaced with {} rather than guessed at.
                arguments = block.get('arguments') if isinstance(block.get('arguments'), dict) else {}
                tool_calls.append(
                    {
                        'type': 'function',
                        'id': block.get('id'),
                        'function': {'name': name, 'arguments': arguments},
                    }
                )
                tools_seen.setdefault(name, {'name': name})
            elif block_type in {'image', 'video'}:
                # Reuse the generic converter so media becomes placeholder blocks.
                converted = convert_content_blocks([block], lossy_reasons, 'assistant')
                if isinstance(converted, list):
                    text_blocks.extend(converted)
                elif isinstance(converted, str):
                    text_blocks.append({'type': 'text', 'text': converted})
            else:
                lossy_reasons.add(f'assistant_unsupported_block_{block_type}')
                text_blocks.append({'type': 'text', 'text': json.dumps(block, ensure_ascii=False, sort_keys=True)})
    else:
        lossy_reasons.add('assistant_nonstandard_content')
        text_blocks = [{'type': 'text', 'text': json.dumps(content, ensure_ascii=False, sort_keys=True)}]

    # Collapse a lone text block to a plain string, matching the schema's
    # preference for simple string content.
    assistant_content: Any
    if not text_blocks:
        assistant_content = ''
    elif len(text_blocks) == 1 and text_blocks[0].get('type') == 'text':
        assistant_content = text_blocks[0]['text']
    else:
        assistant_content = text_blocks

    # reasoning_content / tool_calls keys are only present when non-empty.
    payload: Dict[str, Any] = {'role': 'assistant', 'content': assistant_content}
    if reasoning_chunks:
        payload['reasoning_content'] = '\n\n'.join(reasoning_chunks)
    if tool_calls:
        payload['tool_calls'] = tool_calls
    return payload
219
+
220
+
221
def embedded_artifact_text(node: Dict[str, Any]) -> Optional[str]:
    """Return the embedded artifact text on *node*, if any.

    UTF-8 artifacts yield their full text; base64 artifacts yield a fixed
    marker string (the binary payload is not useful as training text).
    Returns None when no well-formed embedded artifact is present.
    """
    embedded = node.get(f'{ARTIFACT_KEY}Embedded')
    if isinstance(embedded, dict):
        encoding = embedded.get('encoding')
        if encoding == 'utf-8' and isinstance(embedded.get('text'), str):
            return embedded['text']
        if encoding == 'base64' and isinstance(embedded.get('base64'), str):
            return '[binary artifact embedded as base64]'
    return None
230
+
231
+
232
def format_bash_execution(message: Dict[str, Any], lossy_reasons: set[str]) -> str:
    """Serialize a Pi bashExecution message into a JSON tool-output string.

    Prefers the embedded full artifact text (checked on the message itself,
    then on message['details']) over the possibly-truncated inline output.
    If the output was truncated and a full artifact is referenced but not
    embedded, the record is flagged lossy via *lossy_reasons* (mutated).
    """
    output = message.get('output')
    if not isinstance(output, str):
        output = ''
    full_text = embedded_artifact_text(message)
    if full_text is None and isinstance(message.get('details'), dict):
        full_text = embedded_artifact_text(message['details'])
    truncated = bool(message.get('truncated'))
    # A string ARTIFACT_KEY value means the full output exists somewhere but
    # was not embedded in this export — that loss is recoverable only upstream.
    if truncated and full_text is None and isinstance(message.get(ARTIFACT_KEY), str):
        lossy_reasons.add('missing_embedded_full_output')
    effective_output = full_text if full_text is not None else output
    payload = {
        'command': message.get('command'),
        'exit_code': message.get('exitCode'),
        'cancelled': message.get('cancelled', False),
        'truncated': truncated,
        'exclude_from_context': message.get('excludeFromContext', False),
        'output': effective_output,
    }
    # sort_keys keeps the serialized form deterministic across runs.
    return json.dumps(payload, ensure_ascii=False, sort_keys=True)
252
+
253
+
254
def synthetic_message_from_entry(entry: Dict[str, Any], label: str, text: Optional[str], lossy_reasons: set[str]) -> Optional[Dict[str, Any]]:
    """Wrap summary-style *text* in a labeled synthetic assistant message.

    Returns None (and records nothing) for missing or blank text; otherwise
    marks the record lossy in *lossy_reasons*, since the message was
    synthesized rather than actually spoken.  *entry* is currently unused and
    kept only for interface stability.
    """
    if isinstance(text, str):
        stripped = text.strip()
        if stripped:
            lossy_reasons.add(f'synthetic_{label}_message')
            return {'role': 'assistant', 'content': f'[{label}]\n{stripped}'}
    return None
259
+
260
+
261
def entry_has_missing_artifact(node: Any) -> bool:
    """Recursively detect an artifact reference whose content was not embedded.

    True when any dict nested inside *node* carries a string ``ARTIFACT_KEY``
    value without the sibling ``{ARTIFACT_KEY}Embedded`` entry.
    """
    if isinstance(node, list):
        return any(entry_has_missing_artifact(item) for item in node)
    if not isinstance(node, dict):
        return False
    embedded_key = f'{ARTIFACT_KEY}Embedded'
    for key, value in node.items():
        if key == ARTIFACT_KEY and isinstance(value, str) and embedded_key not in node:
            return True
        if entry_has_missing_artifact(value):
            return True
    return False
271
+
272
+
273
def convert_entry_to_messages(
    entry: Dict[str, Any],
    lossy_reasons: set[str],
    tools_seen: Dict[str, Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Dispatch one session-tree entry to zero or more schema messages.

    Handles 'message' entries per role (user/assistant/toolResult/
    bashExecution/custom/summaries) plus top-level summary and custom entries.
    Unknown roles are JSON-dumped into an assistant message and flagged lossy.
    NOTE: the returned list may contain None (synthetic_message_from_entry
    returns None for blank summaries); the caller filters falsy messages.
    Both *lossy_reasons* and *tools_seen* are mutated in place.
    """
    entry_type = entry.get('type')
    if entry_type == 'message':
        message = entry.get('message')
        if not isinstance(message, dict):
            lossy_reasons.add('message_entry_missing_payload')
            return []
        role = message.get('role')
        if role == 'user':
            return [{'role': 'user', 'content': convert_content_blocks(message.get('content'), lossy_reasons, 'user')}]
        if role == 'assistant':
            return [convert_assistant_message(message, lossy_reasons, tools_seen)]
        if role == 'toolResult':
            tool_name = message.get('toolName')
            if isinstance(tool_name, str) and tool_name:
                tools_seen.setdefault(tool_name, {'name': tool_name})
            return [
                {
                    'role': 'tool',
                    'content': convert_content_blocks(message.get('content'), lossy_reasons, 'tool_result'),
                    'tool_call_id': message.get('toolCallId'),
                    'name': tool_name,
                }
            ]
        if role == 'bashExecution':
            # Bash runs are modeled as results of an implicit 'bash' tool.
            tools_seen.setdefault('bash', {'name': 'bash'})
            return [{'role': 'tool', 'content': format_bash_execution(message, lossy_reasons), 'name': 'bash'}]
        if role == 'custom':
            custom_type = message.get('customType') or 'custom'
            custom_content = message.get('content')
            converted = convert_content_blocks(custom_content, lossy_reasons, 'custom')
            lossy_reasons.add('synthetic_custom_message')
            # Only string content gets the [custom:...] prefix; block lists pass through.
            return [{'role': 'assistant', 'content': f'[custom:{custom_type}]\n{converted}' if isinstance(converted, str) else converted}]
        if role == 'branchSummary':
            return [synthetic_message_from_entry(entry, 'branch_summary', message.get('summary'), lossy_reasons)] if message.get('summary') else []
        if role == 'compactionSummary':
            return [synthetic_message_from_entry(entry, 'compaction_summary', message.get('summary'), lossy_reasons)] if message.get('summary') else []
        lossy_reasons.add(f'unsupported_message_role_{role}')
        return [{'role': 'assistant', 'content': json.dumps(message, ensure_ascii=False, sort_keys=True)}]

    if entry_type == 'branch_summary':
        return [synthetic_message_from_entry(entry, 'branch_summary', entry.get('summary'), lossy_reasons)] if entry.get('summary') else []
    if entry_type == 'compaction':
        return [synthetic_message_from_entry(entry, 'compaction_summary', entry.get('summary'), lossy_reasons)] if entry.get('summary') else []
    if entry_type == 'custom_message':
        custom_type = entry.get('customType') or 'custom'
        converted = convert_content_blocks(entry.get('content'), lossy_reasons, 'custom_message')
        lossy_reasons.add('synthetic_custom_message')
        return [{'role': 'assistant', 'content': f'[custom:{custom_type}]\n{converted}' if isinstance(converted, str) else converted}]
    # Unrecognized entry types contribute no messages (caller skips them).
    return []
327
+
328
+
329
def count_blocks(content: Any) -> Dict[str, Any]:
    """Count image/video blocks in a message ``content`` value.

    Returns ``contains_non_text_content`` (bool) plus image and video block
    counts.  Non-list content (plain strings) is treated as containing no
    media.  The return annotation was corrected from Dict[str, int]: the
    flag value is a bool, not an int.
    """
    counts: Dict[str, Any] = {
        'contains_non_text_content': False,
        'image_block_count': 0,
        'video_block_count': 0,
    }
    if isinstance(content, list):
        for block in content:
            if not isinstance(block, dict):
                continue
            block_type = block.get('type')
            if block_type == 'image':
                counts['contains_non_text_content'] = True
                counts['image_block_count'] += 1
            elif block_type == 'video':
                counts['contains_non_text_content'] = True
                counts['video_block_count'] += 1
    return counts
347
+
348
+
349
def compute_meta_counts(messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate per-message block statistics into request/response meta fields.

    System and user messages count toward the request side; everything else
    (assistant, tool) toward the response side.  Key insertion order matches
    the original layout (all request_* keys, then all response_* keys) so
    JSON serialization of the meta dict is unchanged.
    """
    counts: Dict[str, Any] = {}
    for side in ('request', 'response'):
        counts[f'{side}_contains_non_text_content'] = False
        counts[f'{side}_image_block_count'] = 0
        counts[f'{side}_video_block_count'] = 0
        counts[f'{side}_tool_call_block_count'] = 0
        counts[f'{side}_tool_result_block_count'] = 0
        counts[f'{side}_thinking_block_count'] = 0

    for message in messages:
        role = message.get('role')
        side = 'request' if role in {'system', 'user'} else 'response'
        blocks = count_blocks(message.get('content'))
        if blocks['contains_non_text_content']:
            counts[f'{side}_contains_non_text_content'] = True
        counts[f'{side}_image_block_count'] += blocks['image_block_count']
        counts[f'{side}_video_block_count'] += blocks['video_block_count']
        if role == 'assistant':
            counts['response_tool_call_block_count'] += len(message.get('tool_calls') or [])
            reasoning = message.get('reasoning_content')
            if isinstance(reasoning, str) and reasoning.strip():
                counts['response_thinking_block_count'] += 1
        elif role == 'tool':
            counts['response_tool_result_block_count'] += 1
    return counts
378
+
379
+
380
def build_record_for_path(
    header: Dict[str, Any],
    path_entries: List[Dict[str, Any]],
    source_path: Path,
    leaf_id: str,
    branch_index: int,
    branch_count: int,
) -> Dict[str, Any]:
    """Build one validated training record from a root-to-leaf entry path.

    Converts each entry to messages while tracking model/thinking-level
    changes and lossy conditions.  Raises ConversionError when the branch
    contains no user messages.  NOTE(review): *branch_index* is accepted but
    never read below — presumably kept for interface symmetry; confirm.
    """
    lossy_reasons: set[str] = set()
    tools_seen: Dict[str, Dict[str, Any]] = {}
    messages: List[Dict[str, Any]] = []
    models_seen: List[str] = []
    thinking_levels: List[str] = []

    for entry in path_entries:
        # Any non-embedded artifact anywhere in the entry taints the record.
        if entry_has_missing_artifact(entry):
            lossy_reasons.add('missing_embedded_artifact')
        if entry.get('type') == 'model_change':
            model_id = entry.get('modelId')
            provider = entry.get('provider')
            if isinstance(model_id, str):
                models_seen.append(f'{provider}/{model_id}' if provider else model_id)
            continue
        if entry.get('type') == 'thinking_level_change':
            level = entry.get('thinkingLevel')
            if isinstance(level, str):
                thinking_levels.append(level)
            continue
        # Bookkeeping-only entry types produce no messages.
        if entry.get('type') in {'session_info', 'label', 'custom'}:
            continue
        for message in convert_entry_to_messages(entry, lossy_reasons, tools_seen):
            # Filters out None placeholders from blank synthetic summaries.
            if message:
                messages.append(message)

    if not any(message.get('role') == 'user' for message in messages):
        raise ConversionError(f'No user messages found on branch {leaf_id} from {source_path}')

    # A multi-branch tree means this record is only one of several histories.
    if branch_count > 1:
        lossy_reasons.add('session_tree_branch_selected')

    export_info = header.get('exportInfo') if isinstance(header.get('exportInfo'), dict) else None
    if isinstance(export_info, dict) and int(export_info.get('missingArtifactCount') or 0) > 0:
        lossy_reasons.add('source_export_missing_artifacts')

    if len(set(models_seen)) > 1:
        lossy_reasons.add('multiple_models_on_branch')
    if len(set(thinking_levels)) > 1:
        lossy_reasons.add('multiple_thinking_levels_on_branch')

    tools = list(tools_seen.values())
    counts = compute_meta_counts(messages)
    meta = {
        'endpoint': 'pi/session_branch',
        'status': 200,
        # Last entry timestamp wins; header timestamp is the fallback.
        'ts': path_entries[-1].get('timestamp') or header.get('timestamp') or '',
        'key': header.get('id'),
        'source': f'{source_path}#leaf={leaf_id}',
        'requested_model': models_seen[0] if models_seen else None,
        'actual_model': models_seen[-1] if models_seen else None,
        'stream': False,
        'thinking_level': thinking_levels[-1] if thinking_levels else None,
        'reasoning_summary_mode': 'pi_session_branch',
        'thinking_type': 'pi_session',
        'thinking_budget_tokens': None,
        'max_output_tokens': None,
        'tool_spec_count': len(tools),
        'tool_choice': {'mode': 'session_trace'},
        'request_truncated': False,
        'response_truncated': 'missing_embedded_full_output' in lossy_reasons,
        'lossy_source': bool(lossy_reasons),
        'lossy_reasons': sorted(lossy_reasons),
        **counts,
    }

    record = {
        'id': f"{header.get('id')}:{leaf_id}",
        'request_id': header.get('id'),
        'messages': messages,
        'tools': tools,
        'meta': meta,
    }
    # Raises if the assembled record violates the Qwen35 schema contract.
    validate_record_payload(record)
    return record
463
+
464
+
465
def convert_file(path: Path) -> Tuple[List[Dict[str, Any]], Counter]:
    """Convert one exported session file into per-branch training records.

    The first JSONL row is the session header; the remaining rows form the
    entry tree.  One record is emitted per leaf (branch tip).  Returns the
    records together with a Counter of file/branch/bucket statistics.
    """
    entries = read_jsonl(path)
    header, body = entries[0], entries[1:]
    by_id, children = build_tree(body)
    leaves = leaf_ids(by_id, children)

    stats = Counter(input_files=1, branches_total=len(leaves))
    records: List[Dict[str, Any]] = []
    for index, leaf in enumerate(leaves, start=1):
        record = build_record_for_path(header, path_to_leaf(leaf, by_id), path, leaf, index, len(leaves))
        records.append(record)
        bucket = 'lossy_records' if record['meta']['lossy_source'] else 'strict_records'
        stats[bucket] += 1
    return records, stats
484
+
485
+
486
def load_qwen_records(input_path: Path) -> List[Dict[str, Any]]:
    """Load previously exported Qwen35 records for stats reporting.

    Accepts a single jsonl file or an export directory containing the strict
    and/or lossy train files.  Every record gains a ``_bucket`` field:
    'lossy' when the source filename contains "lossy", otherwise 'strict'.
    Raises ConversionError when no jsonl sources are found.
    """
    if input_path.is_file():
        sources = [input_path]
    elif input_path.is_dir():
        sources = [
            input_path / name
            for name in ('qwen35-train.jsonl', 'qwen35-train-lossy.jsonl')
            if (input_path / name).exists()
        ]
    else:
        sources = []
    if not sources:
        raise ConversionError(f'No Qwen35 jsonl files found under {input_path}')

    records: List[Dict[str, Any]] = []
    for source in sources:
        bucket = 'lossy' if 'lossy' in source.name else 'strict'
        with source.open('r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if stripped:
                    record = json.loads(stripped)
                    record['_bucket'] = bucket
                    records.append(record)
    return records
510
+
511
+
512
def content_char_count(content: Any) -> int:
    """Approximate the character length of a message ``content`` value.

    Strings count directly; block lists count their text (or JSON form for
    non-text blocks); any other shape counts its JSON serialization.
    """
    if isinstance(content, str):
        return len(content)
    if not isinstance(content, list):
        return len(json.dumps(content, ensure_ascii=False, sort_keys=True))
    total = 0
    for block in content:
        if not isinstance(block, dict):
            total += len(str(block))
        elif isinstance(block.get('text'), str):
            total += len(block['text'])
        else:
            total += len(json.dumps(block, ensure_ascii=False, sort_keys=True))
    return total
527
+
528
+
529
def stat_summary(values: List[int]) -> Optional[Dict[str, Any]]:
    """Summarize *values* as min/median/mean/max; None for an empty list.

    The mean is rounded to two decimals; the median may be a float for
    even-length input (statistics.median semantics).
    """
    if not values:
        return None
    return {
        'min': min(values),
        'median': statistics.median(values),
        'mean': round(statistics.mean(values), 2),
        'max': max(values),
    }
539
+
540
+
541
def build_stats_report(records: List[Dict[str, Any]], input_path: Path) -> Dict[str, Any]:
    """Build an aggregate stats report over loaded Qwen35 records.

    Produces per-record rows plus min/median/mean/max summaries for message,
    round, tool-call, and reasoning counts, and reasoning-coverage ratios.
    Dialogue rounds are estimated as the number of user messages.
    """
    message_counts: List[int] = []
    user_counts: List[int] = []
    assistant_counts: List[int] = []
    tool_counts: List[int] = []
    round_counts: List[int] = []
    tool_call_counts: List[int] = []
    reasoning_message_counts: List[int] = []
    reasoning_char_counts: List[int] = []
    total_char_counts: List[int] = []
    per_record: List[Dict[str, Any]] = []
    assistant_total = 0
    assistant_with_reasoning_total = 0

    for record in records:
        messages = record.get('messages', [])
        users = [message for message in messages if message.get('role') == 'user']
        assistants = [message for message in messages if message.get('role') == 'assistant']
        tools = [message for message in messages if message.get('role') == 'tool']
        # Only assistant messages with non-blank reasoning_content count.
        reasoning_messages = [
            message
            for message in assistants
            if isinstance(message.get('reasoning_content'), str) and message['reasoning_content'].strip()
        ]
        reasoning_chars = sum(len(message['reasoning_content']) for message in reasoning_messages)
        tool_calls = sum(len(message.get('tool_calls') or []) for message in assistants)
        total_chars = sum(content_char_count(message.get('content')) for message in messages)

        message_counts.append(len(messages))
        user_counts.append(len(users))
        assistant_counts.append(len(assistants))
        tool_counts.append(len(tools))
        round_counts.append(len(users))
        tool_call_counts.append(tool_calls)
        reasoning_message_counts.append(len(reasoning_messages))
        reasoning_char_counts.append(reasoning_chars)
        total_char_counts.append(total_chars)
        assistant_total += len(assistants)
        assistant_with_reasoning_total += len(reasoning_messages)

        per_record.append(
            {
                'id': record.get('id'),
                'bucket': record.get('_bucket'),
                'messages': len(messages),
                'users': len(users),
                'assistants': len(assistants),
                'tools': len(tools),
                'dialogue_rounds_est': len(users),
                'tool_calls': tool_calls,
                'reasoning_messages': len(reasoning_messages),
                'reasoning_chars': reasoning_chars,
                'content_chars': total_chars,
                'lossy_reasons': record.get('meta', {}).get('lossy_reasons', []),
            }
        )

    records_with_reasoning = sum(1 for count in reasoning_message_counts if count > 0)
    report = {
        'input': str(input_path),
        'records': len(records),
        'strict_records': sum(1 for record in records if record.get('_bucket') == 'strict'),
        'lossy_records': sum(1 for record in records if record.get('_bucket') == 'lossy'),
        'message_count': stat_summary(message_counts),
        'user_messages': stat_summary(user_counts),
        'assistant_messages': stat_summary(assistant_counts),
        'tool_messages': stat_summary(tool_counts),
        'dialogue_rounds_est': stat_summary(round_counts),
        'assistant_tool_calls': stat_summary(tool_call_counts),
        'assistant_reasoning_messages': stat_summary(reasoning_message_counts),
        'reasoning_chars_total_per_record': stat_summary(reasoning_char_counts),
        'content_chars_total': stat_summary(total_char_counts),
        'records_with_reasoning': records_with_reasoning,
        # Ratios guard against division by zero on empty input.
        'records_with_reasoning_ratio': round(records_with_reasoning / len(records), 4) if records else 0.0,
        'assistant_messages_with_reasoning': assistant_with_reasoning_total,
        'assistant_messages_total': assistant_total,
        'assistant_reasoning_coverage': round(assistant_with_reasoning_total / assistant_total, 4) if assistant_total else 0.0,
        'per_record': per_record,
    }
    return report
621
+
622
+
623
def print_stats_report(report: Dict[str, Any], as_json: bool) -> int:
    """Print *report* as pretty JSON or as a human-readable summary.

    Always returns 0 so the result can be used directly as a process
    exit code.
    """
    if as_json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
        return 0

    print(f"input: {report['input']}")
    print(f"records: {report['records']} (strict={report['strict_records']}, lossy={report['lossy_records']})")
    print(f"records with reasoning: {report['records_with_reasoning']} ({report['records_with_reasoning_ratio']:.2%})")
    coverage_line = (
        f"assistant reasoning coverage: {report['assistant_messages_with_reasoning']}/"
        f"{report['assistant_messages_total']} ({report['assistant_reasoning_coverage']:.2%})"
    )
    print(coverage_line)
    for label, key in (
        ('message count', 'message_count'),
        ('dialogue rounds est', 'dialogue_rounds_est'),
        ('assistant tool calls', 'assistant_tool_calls'),
        ('assistant reasoning messages', 'assistant_reasoning_messages'),
        ('reasoning chars per record', 'reasoning_chars_total_per_record'),
    ):
        print(f"{label}: {report[key]}")
    print('per record:')
    for item in report['per_record']:
        print(
            f" - {item['id']} [{item['bucket']}] msgs={item['messages']} rounds={item['dialogue_rounds_est']} "
            f"tool_calls={item['tool_calls']} reasoning_msgs={item['reasoning_messages']} reasoning_chars={item['reasoning_chars']}"
        )
    return 0
646
+
647
+
648
def main(argv: Sequence[str] | None = None) -> int:
    """CLI entry point: run stats reporting or the full conversion pipeline.

    Conversion streams records into strict/lossy jsonl files and/or batched
    parquet writers, logs unconvertible files to invalid-records.jsonl, and
    writes a manifest.json summary.  Returns 0 on success.
    NOTE(review): `argv or []` means main(None) parses an empty argument list
    rather than falling back to sys.argv — callers must pass argv explicitly;
    confirm this is intended.
    """
    args = parse_args(argv or [])

    if args.command == 'stats':
        input_path = Path(args.input).expanduser().resolve()
        records = load_qwen_records(input_path)
        report = build_stats_report(records, input_path)
        return print_stats_report(report, args.json)

    # Fails fast if parquet output was requested but the runtime is missing.
    ensure_parquet_runtime(args.output_format)

    input_path = Path(args.input).expanduser().resolve()
    input_files = iter_input_files(input_path, args.include_raw)
    if args.limit > 0:
        input_files = input_files[: args.limit]
    if not input_files:
        raise SystemExit('No exported Pi session files found.')

    # Each run gets a fresh timestamped output directory.
    out_dir = Path(args.output_root).expanduser().resolve() / f'qwen35-pi-session-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    out_dir.mkdir(parents=True, exist_ok=True)
    strict_path = out_dir / 'qwen35-train.jsonl'
    lossy_path = out_dir / 'qwen35-train-lossy.jsonl'
    invalid_path = out_dir / 'invalid-records.jsonl'
    strict_parquet_path = out_dir / 'qwen35-train.parquet'
    lossy_parquet_path = out_dir / 'qwen35-train-lossy.parquet'
    manifest_path = out_dir / 'manifest.json'

    stats = Counter()
    # jsonl handles are only opened when the format requires them.
    strict_out = strict_path.open('w', encoding='utf-8') if args.output_format in {'jsonl', 'both'} else None
    lossy_out = lossy_path.open('w', encoding='utf-8') if args.output_format in {'jsonl', 'both'} else None
    invalid_out = invalid_path.open('w', encoding='utf-8')
    # Parquet writers are created lazily by append_parquet_rows.
    strict_writer = None
    lossy_writer = None
    strict_batch: List[Dict[str, Any]] = []
    lossy_batch: List[Dict[str, Any]] = []

    try:
        for path in input_files:
            try:
                records, file_stats = convert_file(path)
                stats.update(file_stats)
            except Exception as exc:
                # One bad file must not abort the run; record it and move on.
                stats['invalid_files'] += 1
                invalid_out.write(json.dumps({'path': str(path), 'error': str(exc)}, ensure_ascii=False) + '\n')
                continue

            for record in records:
                bucket = 'lossy' if record['meta']['lossy_source'] else 'strict'
                if bucket == 'strict':
                    if strict_out is not None:
                        strict_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if args.output_format in {'parquet', 'both'}:
                        strict_batch.append(record_to_parquet_row(record))
                        if len(strict_batch) >= BATCH_SIZE:
                            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
                            strict_batch = []
                else:
                    if lossy_out is not None:
                        lossy_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                    if args.output_format in {'parquet', 'both'}:
                        lossy_batch.append(record_to_parquet_row(record))
                        if len(lossy_batch) >= BATCH_SIZE:
                            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
                            lossy_batch = []
            # Per-file progress line on stdout (machine-readable).
            print(json.dumps({'processed_files': stats['input_files'], **dict(stats)}, ensure_ascii=False), flush=True)

        # Flush any partial parquet batches before closing.
        if args.output_format in {'parquet', 'both'}:
            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
    finally:
        if strict_out is not None:
            strict_out.close()
        if lossy_out is not None:
            lossy_out.close()
        invalid_out.close()
        if strict_writer is not None:
            strict_writer.close()
        if lossy_writer is not None:
            lossy_writer.close()

    manifest = {
        'input': str(input_path),
        'output_dir': str(out_dir),
        'input_files': [str(path) for path in input_files],
        'stats': dict(stats),
        'strict_records': stats.get('strict_records', 0),
        'lossy_records': stats.get('lossy_records', 0),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
739
+
740
+
741
if __name__ == '__main__':
    # Import sys properly instead of the original __import__('sys') hack.
    import sys

    raise SystemExit(main(sys.argv[1:]))