agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1559 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ import gzip
7
+ import hashlib
8
+ import json
9
+ import os
10
+ import re
11
+ import subprocess
12
+ from collections import Counter
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple
16
+
17
+ try:
18
+ import pyarrow as pa
19
+ import pyarrow.parquet as pq
20
+ except Exception: # pragma: no cover - parquet is optional at runtime
21
+ pa = None
22
+ pq = None
23
+
24
# Flat Arrow schema for the exported analytics table.  Kept at module level so
# every writer shares one column contract; stays ``None`` when the optional
# pyarrow import above failed.
PARQUET_SCHEMA = None
if pa is not None:  # pragma: no branch
    PARQUET_SCHEMA = pa.schema(
        [
            # Identity and routing.
            ('id', pa.string()),
            ('request_id', pa.string()),
            ('endpoint', pa.string()),
            ('status', pa.int64()),
            ('ts', pa.string()),
            ('key', pa.string()),
            ('source', pa.string()),
            # Model selection.
            ('requested_model', pa.string()),
            ('actual_model', pa.string()),
            ('stream', pa.bool_()),
            # Reasoning / thinking configuration.
            ('thinking_level', pa.string()),
            ('reasoning_summary_mode_json', pa.string()),
            ('thinking_type', pa.string()),
            ('thinking_budget_tokens', pa.int64()),
            ('max_output_tokens', pa.int64()),
            # Tool configuration.
            ('tool_spec_count', pa.int64()),
            ('tool_choice_json', pa.string()),
            # Request-side content block counts.
            ('request_contains_non_text_content', pa.bool_()),
            ('request_image_block_count', pa.int64()),
            ('request_video_block_count', pa.int64()),
            ('request_tool_call_block_count', pa.int64()),
            ('request_tool_result_block_count', pa.int64()),
            ('request_thinking_block_count', pa.int64()),
            # Response-side content block counts.
            ('response_contains_non_text_content', pa.bool_()),
            ('response_image_block_count', pa.int64()),
            ('response_video_block_count', pa.int64()),
            ('response_tool_call_block_count', pa.int64()),
            ('response_tool_result_block_count', pa.int64()),
            ('response_thinking_block_count', pa.int64()),
            # Fidelity flags.
            ('request_truncated', pa.bool_()),
            ('response_truncated', pa.bool_()),
            ('lossy_source', pa.bool_()),
            ('lossy_reasons_json', pa.string()),
            # Dialogue statistics.
            ('user_message_count', pa.int64()),
            ('assistant_message_count', pa.int64()),
            ('tool_message_count', pa.int64()),
            ('dialogue_rounds_est', pa.int64()),
            ('tool_call_count', pa.int64()),
            ('has_reasoning', pa.bool_()),
            ('reasoning_chars', pa.int64()),
            ('content_chars_total', pa.int64()),
            # Full payloads serialized as JSON strings.
            ('messages_json', pa.string()),
            ('tools_json', pa.string()),
            ('meta_json', pa.string()),
        ]
    )
74
+
75
+ try:
76
+ from .qwen35_training_record import Qwen35TrainingRecord
77
+ except Exception: # pragma: no cover - remote runtime may not ship pydantic
78
+ Qwen35TrainingRecord = None
79
+
80
# Main request line emitted by the relay (status emoji, HTTP status, method,
# path); captures (status, method, path).
MAIN_RE = re.compile(r'.*(?:🟢|⚠️ ?|❌|🟡)\s+(\d+)\s+(GET|POST|PUT|PATCH|DELETE)\s+(\S+)')
# Tree-drawn detail lines ("├─ key: value" / "└─ key: value") under a request.
META_RE = re.compile(r'^\s*[├└]─\s+([^:]+):\s?(.*)$')
# "METHOD path" pairs the exporter cares about.
TARGET_PATHS = {
    'POST /openai/v1/responses',
    'POST /openai/v1/responses/compact',
    'POST /openai/v1/chat/completions',
    'POST /api/v1/messages',
}
# Marker found inside payloads the relay truncated before logging.
TRUNCATED_MARKER = '...[truncated]'
# Content-block "type" values, grouped by how parse_message treats them.
TEXT_BLOCK_TYPES = {'text', 'input_text', 'output_text'}
IMAGE_BLOCK_TYPES = {'image', 'input_image', 'output_image', 'image_url'}
VIDEO_BLOCK_TYPES = {'video', 'input_video', 'output_video', 'video_url'}
TOOL_CALL_BLOCK_TYPES = {'tool_use', 'tool_call', 'function_call', 'custom_tool_call', 'web_search_call'}
TOOL_RESULT_BLOCK_TYPES = {'tool_result', 'tool_output', 'function_call_output'}
THINKING_BLOCK_TYPES = {'thinking', 'reasoning'}
# Vision placeholder tokens that may be embedded inline in plain text.
VISION_IMAGE_TOKEN = '<|vision_start|><|image_pad|><|vision_end|>'
VISION_VIDEO_TOKEN = '<|vision_start|><|video_pad|><|vision_end|>'
# Inline markup fished out of raw message text.
THINK_INLINE_RE = re.compile(r'<think>\s*(.*?)\s*</think>', re.S)
TOOL_RESPONSE_RE = re.compile(r'<tool_response>\s*(.*?)\s*</tool_response>', re.S)
TOOL_CALL_RE = re.compile(r'<tool_call>\s*<function=([^>\n]+)>\s*(.*?)</function>\s*</tool_call>', re.S)
TOOL_PARAM_RE = re.compile(r'<parameter=([^>\n]+)>\s*(.*?)\s*</parameter>', re.S)
# Splitter that keeps the placeholder tokens as separate list entries.
VISION_TOKEN_RE = re.compile(
    f'({re.escape(VISION_IMAGE_TOKEN)}|{re.escape(VISION_VIDEO_TOKEN)})'
)
104
+
105
+
106
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface for the exporter."""
    cli = argparse.ArgumentParser(description='Export relay logs into Qwen3.5-compatible JSONL.')
    cli.add_argument('--output-root', required=True)
    cli.add_argument(
        '--archive-root',
        default='/vePFS-Mindverse/share/yiwen/claude-relay-service/docker-json-logs/di-20260320122547-ws9d2/claude-relay-service-claude-relay-1',
    )
    cli.add_argument('--container', default='claude-relay-service-claude-relay-1')
    # The live container log is included by default; --exclude-current flips
    # the same destination off.
    cli.add_argument('--include-current', action='store_true', default=True)
    cli.add_argument('--exclude-current', dest='include_current', action='store_false')
    cli.add_argument('--limit-sources', type=int, default=0)
    cli.add_argument('--dedupe-mode', choices=('requestid', 'content', 'none'), default='requestid')
    cli.add_argument('--workers', type=int, default=0, help='Thread workers for per-source staging; 0 means auto.')
    cli.add_argument(
        '--output-format',
        choices=('jsonl', 'parquet', 'both'),
        default='parquet',
        help='Emit JSONL, Parquet, or both. Parquet is the default and is optimized for analytics-first workflows.',
    )
    cli.add_argument('--keep-staging', action='store_true', help='Keep intermediate staged chunk files for debugging.')
    return cli.parse_args()
127
+
128
+
129
def resolve_current_log_path(container: str) -> str:
    """Return the host path of *container*'s active docker JSON log file.

    Runs ``docker inspect`` directly with an argument list (no shell), so a
    malformed or hostile container name cannot be used for shell injection;
    DOCKER_API_VERSION=1.43 is pinned through the environment exactly as the
    previous ``bash -lc`` wrapper did.

    Raises:
        subprocess.CalledProcessError: if ``docker inspect`` exits non-zero.
    """
    env = dict(os.environ, DOCKER_API_VERSION='1.43')
    cmd = ['docker', 'inspect', '-f', '{{.LogPath}}', container]
    return subprocess.check_output(cmd, text=True, env=env).strip()
132
+
133
+
134
def sorted_archive_sources(archive_root: str) -> List[str]:
    """List rotated ``*.gz`` log files under *archive_root*.

    Files are ordered by the integer prefix of their basename (the part
    before the first underscore); names without a numeric prefix rank as 0.
    Returns an empty list when *archive_root* is not a directory.
    """
    if not os.path.isdir(archive_root):
        return []

    def numeric_rank(path: str) -> Tuple[int, str]:
        base = os.path.basename(path)
        prefix = base.split('_', 1)[0]
        try:
            rank = int(prefix)
        except ValueError:
            rank = 0
        return (rank, base)

    candidates = (os.path.join(archive_root, name) for name in os.listdir(archive_root))
    gz_files = [path for path in candidates if path.endswith('.gz') and os.path.isfile(path)]
    return sorted(gz_files, key=numeric_rank)
152
+
153
+
154
def iter_sources(archive_root: str, current_log: Optional[str], limit: int) -> List[str]:
    """Assemble the ordered list of log sources: rotated archives first, then
    the live log (if any), truncated to *limit* entries when positive."""
    ordered = sorted_archive_sources(archive_root)
    if current_log:
        ordered = ordered + [current_log]
    return ordered[:limit] if limit > 0 else ordered
161
+
162
+
163
def open_log_file(path: str):
    """Open *path* for text reading, transparently decompressing ``.gz`` files.

    Undecodable bytes are replaced rather than raised, since docker log lines
    can contain arbitrary binary fragments.
    """
    if path.endswith('.gz'):
        handle = gzip.open(path, 'rt', encoding='utf-8', errors='replace')
    else:
        handle = open(path, 'r', encoding='utf-8', errors='replace')
    return handle
167
+
168
+
169
def iter_events(paths: List[str]) -> Iterator[Dict[str, Any]]:
    """Stream parsed relay-log events from docker JSON log files.

    Each yielded event is a dict with ``source``/``ts``/``status``/``method``/
    ``path`` plus a ``meta`` dict accumulated from the tree-drawn
    (``├─``/``└─``) detail lines that follow the main request line.

    NOTE(review): ``current_event`` deliberately survives the transition from
    one path to the next, so an event whose detail lines were split across a
    log rotation is stitched back together — confirm this is intended.
    """
    current_event: Optional[Dict[str, Any]] = None
    current_key: Optional[str] = None
    current_source: Optional[str] = None

    for path in paths:
        current_source = path
        with open_log_file(path) as handle:
            for raw_line in handle:
                try:
                    obj = json.loads(raw_line)
                    log_line = obj.get('log', '').rstrip('\n')
                    ts = obj.get('time')
                except Exception:
                    # Line is not docker JSON framing; skip it entirely.
                    continue

                main_match = MAIN_RE.match(log_line)
                if main_match:
                    # A new request line finalizes the previous event.
                    if current_event is not None:
                        yield current_event
                    current_event = {
                        'source': current_source,
                        'ts': ts,
                        'status': int(main_match.group(1)),
                        'method': main_match.group(2),
                        'path': main_match.group(3),
                        'meta': {},
                    }
                    current_key = None
                    continue

                if current_event is None:
                    # Detail lines before any request line are noise.
                    continue

                meta_match = META_RE.match(log_line)
                if meta_match:
                    current_key = meta_match.group(1)
                    current_event['meta'][current_key] = meta_match.group(2)
                    continue

                # Continuation line: append raw text onto the last meta key.
                if current_key:
                    current_event['meta'][current_key] += log_line

    # Flush the trailing event once all sources are exhausted.
    if current_event is not None:
        yield current_event
214
+
215
+
216
def parse_json_maybe(value: Any) -> Any:
    """Best-effort JSON decode: dicts/lists pass through unchanged, strings
    are parsed, and anything else (or a parse failure) yields ``None``."""
    if isinstance(value, (dict, list)):
        return value
    if isinstance(value, str):
        try:
            return json.loads(value)
        except Exception:
            return None
    return None
227
+
228
+
229
def json_fallback(value: Any) -> str:
    """Serialize *value* as deterministic JSON (sorted keys, raw unicode)."""
    return json.dumps(value, sort_keys=True, ensure_ascii=False)
231
+
232
+
233
def has_truncation(value: Any) -> bool:
    """Recursively detect the relay's truncation marker (or a ``_truncated``
    flag) anywhere inside *value*."""
    if isinstance(value, str):
        return TRUNCATED_MARKER in value
    if isinstance(value, dict):
        if value.get('_truncated'):
            return True
        return any(has_truncation(entry) for entry in value.values())
    if isinstance(value, list):
        return any(has_truncation(entry) for entry in value)
    return False
243
+
244
+
245
def normalize_role(role: Any) -> Optional[str]:
    """Map provider-specific role labels onto the canonical four-role set
    (system/user/assistant/tool); unknown labels yield ``None``."""
    canonical = {
        'system': 'system',
        'developer': 'system',  # developer messages are treated as system prompts
        'user': 'user',
        'assistant': 'assistant',
        'tool': 'tool',
        'model': 'assistant',  # 'model' is treated as an assistant turn
    }
    return canonical.get(role)
255
+
256
+
257
def get_text_from_block(block: Dict[str, Any]) -> str:
    """Pull the textual payload out of a content block, checking the common
    field names in priority order; returns '' when nothing textual exists."""
    for field in ('text', 'content', 'reasoning', 'thinking'):
        candidate = block.get(field)
        if isinstance(candidate, str):
            return candidate
    nested = block.get('content')
    if isinstance(nested, list):
        return flatten_text_only(nested)
    return ''
269
+
270
+
271
def flatten_text_only(value: Any) -> str:
    """Collapse arbitrarily nested content into plain text.

    Only text and thinking blocks contribute; other block types render as ''.
    Non-container scalars fall back to ``str``.
    """
    if value is None:
        return ''
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        rendered = (flatten_text_only(entry) for entry in value)
        return '\n'.join(piece for piece in rendered if piece)
    if isinstance(value, dict):
        if value.get('type') in TEXT_BLOCK_TYPES | THINKING_BLOCK_TYPES:
            return get_text_from_block(value)
        nested = value.get('content')
        if isinstance(nested, list):
            return flatten_text_only(nested)
        return ''
    return str(value)
287
+
288
+
289
def parse_parameter_value(value: str) -> Any:
    """Decode a <parameter> body as JSON when possible; otherwise keep the
    raw string untouched."""
    decoded = parse_json_maybe(value)
    return value if decoded is None else decoded
294
+
295
+
296
def split_inline_reasoning(text: str, lossy_reasons: set[str]) -> Tuple[Optional[str], str]:
    """Strip ``<think>...</think>`` spans from *text*.

    Returns ``(joined reasoning or None, remaining text)``.  Any unbalanced
    marker left behind is recorded in *lossy_reasons*.
    """
    captured: List[str] = []

    def _collect(match: re.Match[str]) -> str:
        body = match.group(1).strip()
        if body:
            captured.append(body)
        return ''

    remainder = THINK_INLINE_RE.sub(_collect, text)
    if '<think>' in remainder or '</think>' in remainder:
        lossy_reasons.add('unbalanced_think_markup')
    joined = '\n\n'.join(piece for piece in captured if piece).strip()
    return (joined or None), remainder.strip()
310
+
311
+
312
def split_vision_placeholder_text(text: str) -> List[Dict[str, Any]]:
    """Split *text* around inline vision placeholder tokens.

    Produces an ordered list of text/image/video blocks; placeholder tokens
    become media blocks flagged ``placeholder=True``.
    """
    placeholder_types = {VISION_IMAGE_TOKEN: 'image', VISION_VIDEO_TOKEN: 'video'}
    blocks: List[Dict[str, Any]] = []
    for piece in VISION_TOKEN_RE.split(text):
        if not piece:
            continue
        media_type = placeholder_types.get(piece)
        if media_type is not None:
            blocks.append(
                {
                    'type': media_type,
                    'placeholder': True,
                    'placeholder_token': piece,
                    'source_kind': 'placeholder',
                }
            )
        else:
            trimmed = piece.strip()
            if trimmed:
                blocks.append({'type': 'text', 'text': trimmed})
    return blocks
341
+
342
+
343
def extract_tool_calls_from_text(text: str, lossy_reasons: set[str]) -> Tuple[List[Dict[str, Any]], str]:
    """Lift ``<tool_call><function=...>`` markup out of *text*.

    Returns ``(calls, remaining text)`` where each call is an OpenAI-style
    ``{'type': 'function', 'function': {...}}`` dict.  Markup that cannot be
    parsed, or has a body without parameters, is noted in *lossy_reasons*.
    """
    calls: List[Dict[str, Any]] = []

    def _capture(match: re.Match[str]) -> str:
        fn_name = match.group(1).strip()
        body = match.group(2)
        args: Dict[str, Any] = {}
        for param in TOOL_PARAM_RE.finditer(body):
            key = param.group(1).strip()
            if key:
                args[key] = parse_parameter_value(param.group(2).strip())
        if not args and body.strip():
            lossy_reasons.add('tool_call_markup_without_parameters')
        calls.append(
            {
                'type': 'function',
                'function': {
                    'name': fn_name,
                    'arguments': args,
                },
            }
        )
        return ''

    remainder = TOOL_CALL_RE.sub(_capture, text)
    if '<tool_call>' in remainder or '<function=' in remainder:
        lossy_reasons.add('unparsed_tool_call_markup')
    return calls, remainder.strip()
372
+
373
+
374
def extract_tool_responses_from_text(text: str) -> Tuple[List[str], str]:
    """Pull ``<tool_response>`` bodies out of *text*; returns the non-empty
    stripped bodies and the leftover text with the markup removed."""
    bodies: List[str] = []
    for match in TOOL_RESPONSE_RE.finditer(text):
        body = match.group(1).strip()
        if body:
            bodies.append(body)
    return bodies, TOOL_RESPONSE_RE.sub('', text).strip()
378
+
379
+
380
def parse_arguments_to_object(arguments: Any, lossy_reasons: set[str]) -> Dict[str, Any]:
    """Coerce tool-call arguments into a dict.

    ``None`` becomes ``{}``; dicts pass through; strings are JSON-decoded when
    they encode an object.  Anything else is dropped to ``{}`` and recorded in
    *lossy_reasons*.
    """
    if arguments is None:
        return {}
    if isinstance(arguments, dict):
        return arguments
    if isinstance(arguments, str):
        decoded = parse_json_maybe(arguments)
        if isinstance(decoded, dict):
            return decoded
    lossy_reasons.add('tool_arguments_not_object')
    return {}
393
+
394
+
395
def normalize_tool_call(call: Dict[str, Any], lossy_reasons: set[str]) -> Optional[Dict[str, Any]]:
    """Normalize a provider tool-call block into the canonical
    ``{'type': 'function', 'id': ..., 'function': {'name', 'arguments'}}``
    shape.

    Returns ``None`` (recording a lossy reason) when the block is not a dict
    or no tool name can be recovered.
    """
    if not isinstance(call, dict):
        lossy_reasons.add('invalid_tool_call')
        return None

    call_type = call.get('type')
    # id precedence: id > tool_call_id > call_id.
    call_id = call.get('id') or call.get('tool_call_id') or call.get('call_id')
    function_block = call.get('function') if isinstance(call.get('function'), dict) else None

    if function_block:
        # Nested "function" object style; name may still fall back to the
        # top-level key.
        name = function_block.get('name') or call.get('name')
        arguments = function_block.get('arguments')
    else:
        # Flat style: name + arguments (or "input") at the top level.
        name = call.get('name')
        arguments = call.get('arguments')
        if arguments is None:
            arguments = call.get('input')

    # web_search_call blocks may carry no name; synthesize one and, when no
    # arguments exist, keep just the call status as the payload.
    if call_type == 'web_search_call' and (not isinstance(name, str) or not name):
        name = 'web_search'
        if arguments is None:
            payload: Dict[str, Any] = {}
            status = call.get('status')
            if isinstance(status, str):
                payload['status'] = status
            arguments = payload

    # custom_tool_call passes a raw string; wrap it so arguments stay an object.
    if call_type == 'custom_tool_call' and isinstance(arguments, str):
        arguments = {'input': arguments}

    if not isinstance(name, str) or not name:
        lossy_reasons.add('tool_call_missing_name')
        return None

    return {
        'type': 'function',
        'id': call_id,
        'function': {
            'name': name,
            'arguments': parse_arguments_to_object(arguments, lossy_reasons),
        },
    }
437
+
438
+
439
def normalize_tool_specs(raw_tools: Any) -> List[Dict[str, Any]]:
    """Normalize a tool-spec list so each entry has a top-level ``name``.

    Accepts a JSON string, flat specs, or specs with a nested ``function``
    object; entries without a recoverable name (or non-dict entries) are
    dropped.
    """
    if isinstance(raw_tools, str):
        raw_tools = parse_json_maybe(raw_tools)
    if not isinstance(raw_tools, list):
        return []

    specs: List[Dict[str, Any]] = []
    for entry in raw_tools:
        if not isinstance(entry, dict):
            continue
        name = entry.get('name')
        if not (isinstance(name, str) and name):
            nested = entry.get('function')
            name = nested.get('name') if isinstance(nested, dict) else None
        if not (isinstance(name, str) and name):
            continue
        normalized_entry = dict(entry)
        normalized_entry['name'] = name
        specs.append(normalized_entry)
    return specs
460
+
461
+
462
def ensure_text_content(value: Any, lossy_reasons: set[str]) -> str:
    """Force *value* into a plain string.

    Strings pass through; structured content is flattened to its text; when
    nothing textual survives, fall back to deterministic JSON and record the
    loss in *lossy_reasons*.
    """
    if isinstance(value, str):
        return value
    if value is None:
        return ''
    flattened = flatten_text_only(value)
    if flattened:
        return flattened
    lossy_reasons.add('non_text_tool_content')
    return json_fallback(value)
472
+
473
+
474
def normalize_image_block(item: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize any image-ish block into the exporter's canonical image shape.

    URL precedence: ``image_url`` string, then ``image_url.url``, then a bare
    ``url`` string; blocks with no URL become placeholders.
    """
    raw = item.get('image_url')
    if isinstance(raw, str):
        url = raw
    elif isinstance(raw, dict):
        url = raw.get('url')
    elif isinstance(item.get('url'), str):
        url = item['url']
    else:
        url = None
    return {
        'type': 'image',
        'image_url': url,
        'placeholder': url is None,
        'placeholder_token': item.get('placeholder_token') or '<|vision_start|><|image_pad|><|vision_end|>',
        'source_kind': item.get('type') or ('image_url' if url else 'placeholder'),
    }
489
+
490
+
491
def normalize_video_block(item: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize any video-ish block into the exporter's canonical video shape.

    URL precedence: ``video_url`` string, then ``video_url.url``, then a bare
    ``url`` string; blocks with no URL become placeholders.
    """
    raw = item.get('video_url')
    if isinstance(raw, str):
        url = raw
    elif isinstance(raw, dict):
        url = raw.get('url')
    elif isinstance(item.get('url'), str):
        url = item['url']
    else:
        url = None
    return {
        'type': 'video',
        'video_url': url,
        'placeholder': url is None,
        'placeholder_token': item.get('placeholder_token') or '<|vision_start|><|video_pad|><|vision_end|>',
        'source_kind': item.get('type') or ('video_url' if url else 'placeholder'),
    }
506
+
507
+
508
def finalize_content(blocks: List[Dict[str, Any]]) -> Any:
    """Collapse a text-only block list into a plain string; a mixed list is
    returned unchanged, and an empty list becomes ''."""
    if not blocks:
        return ''
    if any(block.get('type') != 'text' for block in blocks):
        return blocks
    return '\n'.join(block['text'] for block in blocks if block.get('text'))
514
+
515
+
516
def append_text_block(blocks: List[Dict[str, Any]], text: str) -> None:
    """Append stripped *text* to *blocks* in place, merging it into a trailing
    text block when one exists; whitespace-only text is dropped."""
    stripped = text.strip()
    if not stripped:
        return
    tail = blocks[-1] if blocks else None
    if tail is not None and tail.get('type') == 'text':
        tail['text'] += '\n\n' + stripped
    else:
        blocks.append({'type': 'text', 'text': stripped})
524
+
525
+
526
def merge_initial_system_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collapse a run of leading system messages into one system message.

    Only the leading run is merged; with zero or one leading system message
    the input list is returned unchanged.
    """
    split_at = 0
    for message in messages:
        if message.get('role') != 'system':
            break
        split_at += 1

    if split_at <= 1:
        return messages

    rendered: List[str] = []
    for message in messages[:split_at]:
        text = render_content_for_system_merge(message.get('content'))
        if text.strip():
            rendered.append(text.strip())
    combined = {
        'role': 'system',
        'content': '\n\n'.join(rendered),
    }
    return [combined] + messages[split_at:]
550
+
551
+
552
def render_content_for_system_merge(content: Any) -> str:
    """Render message content to plain text for system-prompt merging; only
    string content and string-valued text blocks survive."""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ''
    texts = [
        block['text']
        for block in content
        if isinstance(block, dict) and block.get('type') == 'text' and isinstance(block.get('text'), str)
    ]
    return '\n'.join(text for text in texts if text)
562
+
563
+
564
def is_effectively_empty_content(content: Any) -> bool:
    """True when *content* carries no usable payload: ``None``, whitespace-only
    strings, or lists containing only empty text blocks."""
    if content is None:
        return True
    if isinstance(content, str):
        return not content.strip()
    if not isinstance(content, list):
        return False
    for block in content:
        if not isinstance(block, dict) or block.get('type') != 'text':
            return False
        text = block.get('text')
        if isinstance(text, str) and text.strip():
            return False
    return True
581
+
582
+
583
def content_features(content: Any) -> Dict[str, int]:
    """Count image/video/other block types in list-shaped content; non-list
    content counts nothing."""
    tallies: Counter = Counter()
    if not isinstance(content, list):
        return tallies
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = block.get('type')
        if kind in ('image', 'video'):
            tallies[kind] += 1
        elif kind not in (None, 'text'):
            tallies['other'] += 1
    return tallies
597
+
598
+
599
def parse_message(
    role: str,
    raw_content: Any,
    explicit_tool_calls: Any = None,
    explicit_reasoning: Optional[str] = None,
    explicit_tool_call_id: Optional[str] = None,
    explicit_tool_name: Optional[str] = None,
) -> Tuple[List[Dict[str, Any]], Counter, set[str]]:
    """Normalize one raw message into canonical messages plus feature counts.

    Returns ``(messages, feature_counts, lossy_reasons)``.  ``messages``
    starts with the normalized message for *role*, followed by any tool
    messages that were embedded in the content (tool_result blocks or
    ``<tool_response>`` markup).

    The previous implementation duplicated the entire text-processing
    pipeline for plain-string items and for text blocks; it is factored into
    ``_absorb_text`` here so the two paths cannot drift apart.
    """
    lossy_reasons: set[str] = set()
    feature_counts = Counter()
    tool_messages: List[Dict[str, Any]] = []
    tool_calls: List[Dict[str, Any]] = []
    reasoning_parts: List[str] = []
    content_blocks: List[Dict[str, Any]] = []

    def _absorb_text(text: str) -> None:
        # Shared pipeline for free-form text: peel <think> and <tool_call>
        # markup (assistant), <tool_response> markup (user), then split on
        # inline vision placeholder tokens.
        if role == 'assistant':
            inline_reasoning, text = split_inline_reasoning(text, lossy_reasons)
            if inline_reasoning:
                reasoning_parts.append(inline_reasoning)
                feature_counts['thinking'] += 1
            inline_tool_calls, text = extract_tool_calls_from_text(text, lossy_reasons)
            if inline_tool_calls:
                tool_calls.extend(inline_tool_calls)
                feature_counts['tool_call'] += len(inline_tool_calls)
        if role == 'user':
            tool_responses, text = extract_tool_responses_from_text(text)
            for payload in tool_responses:
                tool_messages.append(
                    {
                        'role': 'tool',
                        'content': payload,
                        'tool_call_id': None,
                        'name': None,
                    }
                )
                feature_counts['tool_result'] += 1
        blocks = split_vision_placeholder_text(text)
        if role == 'system' and any(block['type'] in {'image', 'video'} for block in blocks):
            # System messages cannot carry media; keep the raw text instead.
            lossy_reasons.add('system_multimodal_not_supported')
            append_text_block(content_blocks, text)
        else:
            for block in blocks:
                if block['type'] == 'text':
                    append_text_block(content_blocks, block['text'])
                else:
                    content_blocks.append(block)
                    feature_counts[block['type']] += 1

    if isinstance(explicit_tool_calls, list):
        for raw_call in explicit_tool_calls:
            normalized = normalize_tool_call(raw_call, lossy_reasons)
            if normalized:
                tool_calls.append(normalized)
                feature_counts['tool_call'] += 1
    if explicit_reasoning:
        reasoning_parts.append(explicit_reasoning.strip())
        feature_counts['thinking'] += 1

    if isinstance(raw_content, list):
        items: List[Any] = raw_content
    elif raw_content is None:
        items = []
    else:
        items = [raw_content]

    for item in items:
        if isinstance(item, str):
            _absorb_text(item)
            continue
        if not isinstance(item, dict):
            append_text_block(content_blocks, str(item))
            lossy_reasons.add('non_dict_content_item')
            continue

        block_type = item.get('type')
        if block_type in TEXT_BLOCK_TYPES or (
            'text' in item and block_type not in IMAGE_BLOCK_TYPES | VIDEO_BLOCK_TYPES | THINKING_BLOCK_TYPES
        ):
            # Textual block: prefer 'text', then string 'content', then a
            # full flatten of the block.
            text = item.get('text') if isinstance(item.get('text'), str) else None
            if text is None and isinstance(item.get('content'), str):
                text = item['content']
            if text is None:
                text = flatten_text_only(item)
            _absorb_text(text)
        elif block_type in THINKING_BLOCK_TYPES:
            text = get_text_from_block(item)
            if text:
                reasoning_parts.append(text.strip())
                feature_counts['thinking'] += 1
        elif block_type in IMAGE_BLOCK_TYPES or 'image_url' in item or 'image' in item:
            if role == 'system':
                append_text_block(content_blocks, '[unsupported system image omitted]')
                lossy_reasons.add('system_multimodal_not_supported')
            else:
                content_blocks.append(normalize_image_block(item))
                feature_counts['image'] += 1
        elif block_type in VIDEO_BLOCK_TYPES or 'video_url' in item or 'video' in item:
            if role == 'system':
                append_text_block(content_blocks, '[unsupported system video omitted]')
                lossy_reasons.add('system_multimodal_not_supported')
            else:
                content_blocks.append(normalize_video_block(item))
                feature_counts['video'] += 1
        elif block_type in TOOL_CALL_BLOCK_TYPES:
            normalized = normalize_tool_call(item, lossy_reasons)
            if normalized:
                tool_calls.append(normalized)
                feature_counts['tool_call'] += 1
        elif block_type in TOOL_RESULT_BLOCK_TYPES:
            tool_messages.append(
                {
                    'role': 'tool',
                    'content': ensure_text_content(item.get('content') or item.get('text'), lossy_reasons),
                    'tool_call_id': item.get('tool_use_id') or item.get('tool_call_id') or item.get('id') or item.get('call_id'),
                    'name': item.get('name') or item.get('tool_name'),
                }
            )
            feature_counts['tool_result'] += 1
        elif isinstance(item.get('content'), list):
            # Unknown wrapper with nested content: recurse and keep only the
            # primary message's content (nested tool messages are dropped —
            # matches prior behavior).
            nested_messages, nested_features, nested_lossy = parse_message(
                role,
                item['content'],
                item.get('tool_calls'),
                item.get('reasoning_content') if isinstance(item.get('reasoning_content'), str) else None,
                item.get('tool_call_id'),
                item.get('name'),
            )
            feature_counts.update(nested_features)
            lossy_reasons.update(nested_lossy)
            if nested_messages:
                primary_content = nested_messages[0].get('content', '')
                if isinstance(primary_content, str):
                    append_text_block(content_blocks, primary_content)
                elif isinstance(primary_content, list):
                    content_blocks.extend(primary_content)
        else:
            append_text_block(content_blocks, json_fallback(item))
            lossy_reasons.add('unknown_content_block')

    message: Dict[str, Any] = {'role': role, 'content': finalize_content(content_blocks)}
    if role == 'assistant':
        reasoning = '\n\n'.join(part for part in reasoning_parts if part).strip()
        if reasoning:
            message['reasoning_content'] = reasoning
        if tool_calls:
            message['tool_calls'] = tool_calls
    if role == 'tool':
        if explicit_tool_call_id:
            message['tool_call_id'] = explicit_tool_call_id
        if explicit_tool_name:
            message['name'] = explicit_tool_name
        # Tool message content must be a plain string.
        if not isinstance(message['content'], str):
            message['content'] = ensure_text_content(message['content'], lossy_reasons)
    return [message] + tool_messages, feature_counts, lossy_reasons
790
+
791
+
792
def normalize_message_sequence(raw_messages: Any, endpoint: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Normalize an arbitrary provider message payload into a flat message list.

    *raw_messages* may be a JSON string, a single message dict, or a list of
    message dicts and/or bare strings.  Returns ``(messages, flags)`` where
    ``flags`` carries block-type counts and sorted lossy reasons.
    ``endpoint`` is currently unused in this function.
    """
    if isinstance(raw_messages, str):
        parsed = parse_json_maybe(raw_messages)
        if parsed is not None:
            raw_messages = parsed
    messages: List[Dict[str, Any]] = []
    feature_counts = Counter()
    lossy_reasons: set[str] = set()

    if isinstance(raw_messages, str):
        # Still a string after the decode attempt: treat as one user turn.
        msg_list, msg_features, msg_lossy = parse_message('user', raw_messages)
        messages.extend(msg_list)
        feature_counts.update(msg_features)
        lossy_reasons.update(msg_lossy)
    elif isinstance(raw_messages, dict):
        # Single message object.
        role = normalize_role(raw_messages.get('role') or raw_messages.get('type'))
        if role:
            msg_list, msg_features, msg_lossy = parse_message(
                role,
                raw_messages.get('content') if 'content' in raw_messages else raw_messages.get('text'),
                raw_messages.get('tool_calls'),
                raw_messages.get('reasoning_content') if isinstance(raw_messages.get('reasoning_content'), str) else None,
                raw_messages.get('tool_call_id'),
                raw_messages.get('name'),
            )
            messages.extend(msg_list)
            feature_counts.update(msg_features)
            lossy_reasons.update(msg_lossy)
    elif isinstance(raw_messages, list):
        for item in raw_messages:
            if isinstance(item, dict) and ('role' in item or item.get('type') == 'message'):
                role = normalize_role(item.get('role'))
                if not role:
                    # Unknown role: drop the message but record the loss.
                    lossy_reasons.add('unsupported_role')
                    continue
                msg_list, msg_features, msg_lossy = parse_message(
                    role,
                    item.get('content') if 'content' in item else item.get('text'),
                    item.get('tool_calls'),
                    item.get('reasoning_content') if isinstance(item.get('reasoning_content'), str) else None,
                    item.get('tool_call_id'),
                    item.get('name'),
                )
                messages.extend(msg_list)
                feature_counts.update(msg_features)
                lossy_reasons.update(msg_lossy)
            else:
                # Bare strings / unrecognized entries become user turns.
                msg_list, msg_features, msg_lossy = parse_message('user', item)
                messages.extend(msg_list)
                feature_counts.update(msg_features)
                lossy_reasons.update(msg_lossy)

    # Merge consecutive messages with same role except tool role.
    merged: List[Dict[str, Any]] = []
    for message in messages:
        if (
            merged
            and merged[-1]['role'] == message['role']
            and message['role'] != 'tool'
            # Messages carrying tool_calls/reasoning must stay separate.
            and 'tool_calls' not in merged[-1]
            and 'tool_calls' not in message
            and 'reasoning_content' not in merged[-1]
            and 'reasoning_content' not in message
        ):
            prev = merged[-1]
            if isinstance(prev['content'], str) and isinstance(message['content'], str):
                prev['content'] = (prev['content'] + '\n\n' + message['content']).strip()
            elif isinstance(prev['content'], list) and isinstance(message['content'], list):
                prev['content'].extend(message['content'])
            else:
                # Mixed string/block contents: degrade both sides to text.
                prev['content'] = ensure_text_content(prev['content'], lossy_reasons) + '\n\n' + ensure_text_content(message['content'], lossy_reasons)
        else:
            merged.append(message)

    # Hoist system messages to the front; note when that reorders dialogue.
    system_messages = [message for message in merged if message['role'] == 'system']
    non_system_messages = [message for message in merged if message['role'] != 'system']
    if system_messages and merged[: len(system_messages)] != system_messages:
        lossy_reasons.add('system_reordered')
    merged = system_messages + non_system_messages

    flags = {
        'contains_non_text_content': feature_counts['image'] > 0 or feature_counts['video'] > 0,
        'image_block_count': feature_counts['image'],
        'video_block_count': feature_counts['video'],
        'tool_call_block_count': feature_counts['tool_call'],
        'tool_result_block_count': feature_counts['tool_result'],
        'thinking_block_count': feature_counts['thinking'],
        'lossy_reasons': sorted(lossy_reasons),
    }
    return merged, flags
882
+
883
+
884
def extract_request_meta(endpoint: str, req_obj: Dict[str, Any]) -> Dict[str, Any]:
    """Pull request-level metadata out of a raw request body.

    Covers model name, streaming flag, reasoning/thinking configuration,
    output-token limits, and tool configuration. Only fields that are present
    and of the expected type are copied into the returned dict; everything
    else is silently skipped.
    """
    meta: Dict[str, Any] = {}

    model = req_obj.get('model')
    if isinstance(model, str) and model:
        meta['requested_model'] = model

    stream = req_obj.get('stream')
    if isinstance(stream, bool):
        meta['stream'] = stream

    reasoning_cfg = req_obj.get('reasoning')
    if isinstance(reasoning_cfg, dict):
        effort = reasoning_cfg.get('effort')
        if isinstance(effort, str):
            meta['thinking_level'] = effort
        # 'summary' is copied verbatim whenever the key exists, whatever its type.
        if 'summary' in reasoning_cfg:
            meta['reasoning_summary_mode'] = reasoning_cfg['summary']

    thinking_cfg = req_obj.get('thinking')
    if isinstance(thinking_cfg, dict):
        kind = thinking_cfg.get('type')
        if isinstance(kind, str):
            meta['thinking_type'] = kind
        budget = thinking_cfg.get('budget_tokens')
        if isinstance(budget, int):
            meta['thinking_budget_tokens'] = budget

    # Prefer the Responses-style field name, then fall back to the legacy one.
    for field in ('max_output_tokens', 'max_tokens'):
        if isinstance(req_obj.get(field), int):
            meta['max_output_tokens'] = req_obj[field]
            break

    specs = normalize_tool_specs(req_obj.get('tools'))
    if specs:
        meta['tool_spec_count'] = len(specs)

    # Falsy tool_choice values fall through to the camelCase variant by design.
    choice = req_obj.get('tool_choice') or req_obj.get('toolChoice')
    if choice is not None:
        meta['tool_choice'] = choice

    return meta
914
+
915
+
916
def extract_response_meta(endpoint: str, res_obj: Dict[str, Any]) -> Dict[str, Any]:
    """Extract response-level metadata: actual model name and total token usage.

    The payload may be the response body itself or wrapped under a 'response' key.
    """
    meta: Dict[str, Any] = {}
    inner = res_obj.get('response')
    body = inner if isinstance(inner, dict) else res_obj
    if not isinstance(body, dict):
        return meta
    model = body.get('model')
    if isinstance(model, str) and model:
        meta['actual_model'] = model
    usage = body.get('usage')
    if isinstance(usage, dict) and isinstance(usage.get('total_tokens'), int):
        meta['total_tokens'] = usage['total_tokens']
    return meta
929
+
930
+
931
def normalize_request_messages(endpoint: str, req_obj: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Convert an endpoint-specific request body into the canonical message list.

    Returns (messages, flags); flags carries per-block-type counters, a
    non-text indicator, and the sorted set of lossy-conversion reasons
    accumulated while normalizing.
    """
    collected: List[Dict[str, Any]] = []
    counters = Counter()
    reasons: set[str] = set()

    def take(pair: Tuple[List[Dict[str, Any]], Dict[str, Any]]) -> None:
        # Absorb one normalized (messages, flags) pair into the accumulators.
        seq, flags = pair
        collected.extend(seq)
        for bucket, key in (
            ('image', 'image_block_count'),
            ('video', 'video_block_count'),
            ('tool_call', 'tool_call_block_count'),
            ('tool_result', 'tool_result_block_count'),
            ('thinking', 'thinking_block_count'),
        ):
            counters[bucket] += flags[key]
        if flags['contains_non_text_content']:
            counters['non_text'] += 1
        reasons.update(flags['lossy_reasons'])

    if endpoint in ('POST /openai/v1/responses', 'POST /openai/v1/responses/compact'):
        instructions = req_obj.get('instructions')
        if instructions:
            collected.append({'role': 'system', 'content': str(instructions)})
        take(normalize_message_sequence(req_obj.get('input'), endpoint))
    elif endpoint == 'POST /openai/v1/chat/completions':
        instructions = req_obj.get('instructions')
        if instructions:
            collected.append({'role': 'system', 'content': str(instructions)})
        take(normalize_message_sequence(req_obj.get('messages'), endpoint))
    elif endpoint == 'POST /api/v1/messages':
        system_content = req_obj.get('system')
        if system_content is not None:
            take(normalize_message_sequence([{'role': 'system', 'content': system_content}], endpoint))
        take(normalize_message_sequence(req_obj.get('messages'), endpoint))
    else:
        # Unknown endpoint: feed the whole request body through normalization.
        take(normalize_message_sequence(req_obj, endpoint))

    if not any(m['role'] == 'user' for m in collected):
        reasons.add('missing_user_query')

    merged = merge_initial_system_messages(collected)
    if len(merged) != len(collected):
        reasons.add('merged_initial_system_messages')
    collected = merged

    return collected, {
        'contains_non_text_content': bool(counters['non_text']),
        'image_block_count': counters['image'],
        'video_block_count': counters['video'],
        'tool_call_block_count': counters['tool_call'],
        'tool_result_block_count': counters['tool_result'],
        'thinking_block_count': counters['thinking'],
        'lossy_reasons': sorted(reasons),
    }
987
+
988
+
989
def normalize_response_messages(endpoint: str, res_obj: Any) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Normalize an endpoint-specific response body into canonical assistant messages.

    Returns (messages, flags) mirroring normalize_message_sequence's flags
    structure. A non-dict response (or an unrecognized shape) yields no
    messages and an all-zero flags payload.
    """

    def _empty_flags() -> Dict[str, Any]:
        # Single source of truth for the "nothing extracted" flags payload
        # (previously duplicated verbatim at both return sites).
        return {
            'contains_non_text_content': False,
            'image_block_count': 0,
            'video_block_count': 0,
            'tool_call_block_count': 0,
            'tool_result_block_count': 0,
            'thinking_block_count': 0,
            'lossy_reasons': [],
        }

    if not isinstance(res_obj, dict):
        return [], _empty_flags()

    # The payload may be the response body itself or wrapped under 'response'.
    body = res_obj.get('response') if isinstance(res_obj.get('response'), dict) else res_obj

    if endpoint in ('POST /openai/v1/responses', 'POST /openai/v1/responses/compact') and isinstance(body, dict):
        output = body.get('output')
        if isinstance(output, list) and output:
            # Bare content blocks (no role, no message wrapper): treat the
            # whole list as a single assistant turn.
            if all(isinstance(item, dict) and 'role' not in item and item.get('type') != 'message' for item in output):
                messages, features, lossy = parse_message('assistant', output)
                return messages, {
                    'contains_non_text_content': features['image'] > 0 or features['video'] > 0,
                    'image_block_count': features['image'],
                    'video_block_count': features['video'],
                    'tool_call_block_count': features['tool_call'],
                    'tool_result_block_count': features['tool_result'],
                    'thinking_block_count': features['thinking'],
                    'lossy_reasons': sorted(lossy),
                }
            messages, flags = normalize_message_sequence(output, endpoint)
            if messages:
                return messages, flags
        # Fall back to the flattened text field when structured output is absent.
        output_text = body.get('output_text')
        if isinstance(output_text, str) and output_text.strip():
            return normalize_message_sequence([{'role': 'assistant', 'content': output_text}], endpoint)
    if endpoint == 'POST /openai/v1/chat/completions':
        choices = body.get('choices') if isinstance(body, dict) else None
        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
            return normalize_message_sequence([choices[0].get('message')], endpoint)
    if endpoint == 'POST /api/v1/messages':
        return normalize_message_sequence([{'role': 'assistant', 'content': body.get('content')}], endpoint)
    return [], _empty_flags()
1040
+
1041
+
1042
def record_hash(messages: List[Dict[str, Any]], tools: List[Dict[str, Any]]) -> str:
    """Return a stable hex SHA-256 over a record's messages and tools.

    Serialization is canonical (sorted keys, compact separators) so that two
    semantically identical records always hash the same.
    """
    canonical = json.dumps(
        {'messages': messages, 'tools': tools},
        ensure_ascii=False,
        sort_keys=True,
        separators=(',', ':'),
    )
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()
1045
+
1046
+
1047
def dedupe_key(mode: str, record_id: str, request_id: Optional[str], messages: List[Dict[str, Any]], tools: List[Dict[str, Any]]) -> Optional[str]:
    """Derive the deduplication key for a record under the given mode.

    'none'      -> None (dedupe disabled)
    'requestid' -> upstream request id, falling back to the record id
    otherwise   -> content hash of messages + tools
    """
    if mode == 'requestid':
        return request_id or record_id
    if mode == 'none':
        return None
    return record_hash(messages, tools)
1053
+
1054
+
1055
def lightweight_validate_record(payload: Dict[str, Any]) -> None:
    """Minimal structural validation for a training record.

    Used when the pydantic model is unavailable. Checks: messages exist;
    system messages only appear before any non-system message and contain no
    image/video blocks; assistant turns carry no inline <think> wrappers;
    at least one user turn exists; lossy records declare their reasons.
    Raises ValueError on the first violated invariant.
    """
    messages = payload.get('messages') or []
    if not messages:
        raise ValueError('messages must not be empty')

    seen_user = False
    seen_non_system = False
    for message in messages:
        role = message.get('role')
        if role == 'system':
            if seen_non_system:
                raise ValueError('system messages must appear only at the beginning')
            content = message.get('content')
            if isinstance(content, list) and any(
                isinstance(block, dict) and block.get('type') in {'image', 'video'} for block in content
            ):
                raise ValueError('system messages cannot contain image/video blocks')
            continue
        seen_non_system = True
        if role == 'user':
            seen_user = True
        elif role == 'assistant':
            reasoning = message.get('reasoning_content')
            if isinstance(reasoning, str) and ('<think>' in reasoning or '</think>' in reasoning):
                raise ValueError('reasoning_content must not contain think wrappers')
            content = message.get('content')
            if isinstance(content, str) and ('<think>' in content or '</think>' in content):
                raise ValueError('assistant content must not contain inline think wrappers')

    if not seen_user:
        raise ValueError('at least one user message is required')

    meta = payload.get('meta') or {}
    if meta.get('lossy_source') and not meta.get('lossy_reasons'):
        raise ValueError('lossy_source requires lossy_reasons')
1088
+
1089
+
1090
def validate_record_payload(payload: Dict[str, Any]) -> Any:
    """Validate a record, preferring the pydantic model when it is importable.

    Falls back to the lightweight structural check and returns the payload
    unchanged when the model is unavailable.
    """
    if Qwen35TrainingRecord is None:
        lightweight_validate_record(payload)
        return payload
    return Qwen35TrainingRecord.model_validate(payload)
1095
+
1096
+
1097
def record_messages_and_tools(record: Any) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Return (messages, tools) as plain dicts from either record representation."""
    if Qwen35TrainingRecord is None or not hasattr(record, 'messages'):
        # Plain-dict record (lightweight validation path).
        return record['messages'], record.get('tools', [])
    dumped_messages = [m.model_dump(exclude_none=True) for m in record.messages]
    dumped_tools = [t.model_dump(exclude_none=True) for t in record.tools]
    return dumped_messages, dumped_tools
1104
+
1105
+
1106
def record_is_lossy(record: Any) -> bool:
    """True when the record was flagged as lossy during normalization."""
    uses_model = Qwen35TrainingRecord is not None and hasattr(record, 'meta')
    flag = record.meta.lossy_source if uses_model else record.get('meta', {}).get('lossy_source')
    return bool(flag)
1110
+
1111
+
1112
def record_id_value(record: Any) -> str:
    """Return the record id from either a model instance or a plain dict."""
    if hasattr(record, 'id'):
        return record.id
    return record['id']
1114
+
1115
+
1116
def record_request_id_value(record: Any) -> Optional[str]:
    """Return the upstream request id (None when absent) from either representation."""
    if hasattr(record, 'request_id'):
        return record.request_id
    return record.get('request_id')
1118
+
1119
+
1120
def record_dump_json(record: Any) -> str:
    """Serialize a record to compact JSON regardless of its representation."""
    if Qwen35TrainingRecord is not None and hasattr(record, 'model_dump_json'):
        # Pydantic model: let it drop None fields itself.
        return record.model_dump_json(exclude_none=True)
    # Plain dict: canonical compact serialization.
    return json.dumps(record, ensure_ascii=False, separators=(',', ':'))
1124
+
1125
+
1126
def record_as_dict(record: Any) -> Dict[str, Any]:
    """Return a plain-dict view of the record (a no-op for dict records)."""
    if Qwen35TrainingRecord is None or not hasattr(record, 'model_dump'):
        return record
    return record.model_dump(exclude_none=True)
1130
+
1131
+
1132
def parquet_content_projection(content: Any) -> Tuple[Optional[str], List[Dict[str, Any]]]:
    """Project message content into (plain_text, structured_blocks) for Parquet.

    Strings project to (text, []); block lists project to (None, rows) where
    each row keeps a fixed set of scalar columns and serializes dict metadata
    to canonical JSON. Non-dict blocks and unrecognized content are dropped.
    """
    if isinstance(content, str):
        return content, []
    rows: List[Dict[str, Any]] = []
    for block in content if isinstance(content, list) else []:
        if not isinstance(block, dict):
            continue
        metadata = block.get('metadata')
        rows.append({
            'type': block.get('type'),
            'text': block.get('text'),
            'image_url': block.get('image_url'),
            'video_url': block.get('video_url'),
            'placeholder': block.get('placeholder'),
            'placeholder_token': block.get('placeholder_token'),
            'source_kind': block.get('source_kind'),
            'metadata_json': json.dumps(metadata, ensure_ascii=False, sort_keys=True)
            if isinstance(metadata, dict)
            else None,
        })
    return None, rows
1155
+
1156
+
1157
def parquet_tool_calls_projection(tool_calls: Any) -> List[Dict[str, Any]]:
    """Flatten assistant tool calls into scalar Parquet rows.

    Function arguments are serialized to canonical JSON; non-list input and
    non-dict entries yield nothing.
    """
    if not isinstance(tool_calls, list):
        return []
    rows: List[Dict[str, Any]] = []
    for call in tool_calls:
        if not isinstance(call, dict):
            continue
        fn = call.get('function')
        if not isinstance(fn, dict):
            fn = {}
        rows.append({
            'id': call.get('id'),
            'type': call.get('type'),
            'function_name': fn.get('name'),
            'function_arguments_json': json.dumps(fn.get('arguments', {}), ensure_ascii=False, sort_keys=True),
        })
    return rows
1174
+
1175
+
1176
def parquet_tools_projection(tools: Any) -> List[Dict[str, Any]]:
    """Flatten tool specs into Parquet rows, keeping the full spec as raw JSON.

    Non-list input and non-dict entries are ignored.
    """
    if not isinstance(tools, list):
        return []
    rows: List[Dict[str, Any]] = []
    for tool in tools:
        if not isinstance(tool, dict):
            continue
        params = tool.get('parameters')
        rows.append({
            'name': tool.get('name'),
            'description': tool.get('description'),
            'parameters_json': None if params is None else json.dumps(params, ensure_ascii=False, sort_keys=True),
            'raw_json': json.dumps(tool, ensure_ascii=False, sort_keys=True),
        })
    return rows
1194
+
1195
+
1196
def record_to_parquet_row(record: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a validated record dict into one flat Parquet row.

    Scalar metadata is lifted out of ``record['meta']``; per-message
    statistics (role counts, tool-call count, reasoning/content sizes) are
    derived here; the full messages, tools, and meta payloads are preserved
    verbatim as canonical-JSON columns so no information is lost.
    """
    meta = dict(record.get('meta', {}))
    # Guard against a non-list 'messages' value; derived stats become 0/False.
    messages = record.get('messages', []) if isinstance(record.get('messages'), list) else []
    user_message_count = sum(1 for message in messages if isinstance(message, dict) and message.get('role') == 'user')
    assistant_message_count = sum(1 for message in messages if isinstance(message, dict) and message.get('role') == 'assistant')
    tool_message_count = sum(1 for message in messages if isinstance(message, dict) and message.get('role') == 'tool')
    # Total tool invocations across all assistant turns.
    tool_call_count = sum(
        len(message.get('tool_calls') or [])
        for message in messages
        if isinstance(message, dict) and message.get('role') == 'assistant'
    )
    # True when at least one assistant turn carries non-blank reasoning text.
    has_reasoning = any(
        isinstance(message, dict)
        and message.get('role') == 'assistant'
        and isinstance(message.get('reasoning_content'), str)
        and bool(message.get('reasoning_content').strip())
        for message in messages
    )
    reasoning_chars = sum(
        len(message.get('reasoning_content', ''))
        for message in messages
        if isinstance(message, dict)
        and message.get('role') == 'assistant'
        and isinstance(message.get('reasoning_content'), str)
    )
    # String content counts its own length; structured content counts the
    # length of its canonical JSON serialization.
    content_chars_total = sum(
        len(message.get('content'))
        if isinstance(message.get('content'), str)
        else len(json.dumps(message.get('content'), ensure_ascii=False, sort_keys=True))
        for message in messages
        if isinstance(message, dict) and message.get('content') is not None
    )
    # Column names/order mirror PARQUET_SCHEMA; complex values become *_json.
    return {
        'id': record.get('id'),
        'request_id': record.get('request_id'),
        'endpoint': meta.get('endpoint'),
        'status': meta.get('status'),
        'ts': meta.get('ts'),
        'key': meta.get('key'),
        'source': meta.get('source'),
        'requested_model': meta.get('requested_model'),
        'actual_model': meta.get('actual_model'),
        'stream': meta.get('stream'),
        'thinking_level': meta.get('thinking_level'),
        'reasoning_summary_mode_json': json.dumps(meta.get('reasoning_summary_mode'), ensure_ascii=False, sort_keys=True),
        'thinking_type': meta.get('thinking_type'),
        'thinking_budget_tokens': meta.get('thinking_budget_tokens'),
        'max_output_tokens': meta.get('max_output_tokens'),
        'tool_spec_count': meta.get('tool_spec_count'),
        'tool_choice_json': json.dumps(meta.get('tool_choice'), ensure_ascii=False, sort_keys=True),
        'request_contains_non_text_content': meta.get('request_contains_non_text_content'),
        'request_image_block_count': meta.get('request_image_block_count'),
        'request_video_block_count': meta.get('request_video_block_count'),
        'request_tool_call_block_count': meta.get('request_tool_call_block_count'),
        'request_tool_result_block_count': meta.get('request_tool_result_block_count'),
        'request_thinking_block_count': meta.get('request_thinking_block_count'),
        'response_contains_non_text_content': meta.get('response_contains_non_text_content'),
        'response_image_block_count': meta.get('response_image_block_count'),
        'response_video_block_count': meta.get('response_video_block_count'),
        'response_tool_call_block_count': meta.get('response_tool_call_block_count'),
        'response_tool_result_block_count': meta.get('response_tool_result_block_count'),
        'response_thinking_block_count': meta.get('response_thinking_block_count'),
        'request_truncated': meta.get('request_truncated'),
        'response_truncated': meta.get('response_truncated'),
        'lossy_source': meta.get('lossy_source'),
        'lossy_reasons_json': json.dumps(meta.get('lossy_reasons', []), ensure_ascii=False, sort_keys=True),
        'user_message_count': user_message_count,
        'assistant_message_count': assistant_message_count,
        'tool_message_count': tool_message_count,
        # Rounds are estimated as one round per user turn.
        'dialogue_rounds_est': user_message_count,
        'tool_call_count': tool_call_count,
        'has_reasoning': has_reasoning,
        'reasoning_chars': reasoning_chars,
        'content_chars_total': content_chars_total,
        'messages_json': json.dumps(record.get('messages', []), ensure_ascii=False, sort_keys=True),
        'tools_json': json.dumps(record.get('tools', []), ensure_ascii=False, sort_keys=True),
        'meta_json': json.dumps(record.get('meta', {}), ensure_ascii=False, sort_keys=True),
    }
1274
+
1275
+
1276
def auto_worker_count(requested_workers: int, source_count: int) -> int:
    """Resolve the worker-pool size.

    A positive explicit request wins; otherwise auto-size to
    min(source_count, cpu_count, 8), never going below 1.
    """
    if requested_workers > 0:
        return max(1, requested_workers)
    cpu_cap = min(os.cpu_count() or 4, 8)
    return max(1, min(source_count, cpu_cap))
1281
+
1282
+
1283
def ensure_parquet_runtime(output_format: str) -> None:
    """Fail fast when Parquet output is requested but pyarrow is unavailable."""
    needs_parquet = output_format in {'parquet', 'both'}
    if needs_parquet and (pa is None or pq is None):
        raise RuntimeError('pyarrow is required for Parquet output')
1286
+
1287
+
1288
def build_staged_entry_from_event(event: Dict[str, Any], endpoint: str, event_index: int, dedupe_mode: str) -> Tuple[Optional[Dict[str, Any]], Counter]:
    """Convert one raw log event into a staged record entry.

    Returns ``(entry, stats)``. ``entry`` is None when the event is unusable
    (unparseable request body or no extractable messages); otherwise it is
    either a ``{'bucket': 'invalid', ...}`` diagnostic (validation failed) or
    a ``strict``/``lossy`` wrapper carrying the record and its dedupe key.
    ``stats`` counts what happened, keyed per endpoint.
    """
    stats = Counter()
    stats[f'events:{endpoint}'] += 1

    req_obj = parse_json_maybe(event['meta'].get('req'))
    if not isinstance(req_obj, dict):
        stats[f'bad_req:{endpoint}'] += 1
        return None, stats

    tools = normalize_tool_specs(req_obj.get('tools'))
    request_messages, request_flags = normalize_request_messages(endpoint, req_obj)
    response_obj = parse_json_maybe(event['meta'].get('res'))
    response_messages, response_flags = normalize_response_messages(endpoint, response_obj)

    # Full conversation = request turns followed by response turns.
    messages = request_messages + response_messages
    if not messages:
        stats[f'empty_messages:{endpoint}'] += 1
        return None, stats

    request_id = event['meta'].get('requestId')
    # Union of lossy reasons from both sides, plus truncation markers.
    lossy_reasons = set(request_flags['lossy_reasons']) | set(response_flags['lossy_reasons'])
    request_truncated = has_truncation(req_obj)
    response_truncated = has_truncation(response_obj)
    if request_truncated:
        lossy_reasons.add('request_truncated')
    if response_truncated:
        lossy_reasons.add('response_truncated')

    # Record ids are unique per (endpoint, timestamp, per-source event index).
    record_id = f"{endpoint}:{event.get('ts')}:{event_index}"
    meta = {
        'endpoint': endpoint,
        'status': event['status'],
        'ts': event.get('ts') or '',
        'key': event['meta'].get('key'),
        'source': event.get('source'),
        'request_contains_non_text_content': request_flags['contains_non_text_content'],
        'request_image_block_count': request_flags['image_block_count'],
        'request_video_block_count': request_flags['video_block_count'],
        'request_tool_call_block_count': request_flags['tool_call_block_count'],
        'request_tool_result_block_count': request_flags['tool_result_block_count'],
        'request_thinking_block_count': request_flags['thinking_block_count'],
        'response_contains_non_text_content': response_flags['contains_non_text_content'],
        'response_image_block_count': response_flags['image_block_count'],
        'response_video_block_count': response_flags['video_block_count'],
        'response_tool_call_block_count': response_flags['tool_call_block_count'],
        'response_tool_result_block_count': response_flags['tool_result_block_count'],
        'response_thinking_block_count': response_flags['thinking_block_count'],
        'request_truncated': request_truncated,
        'response_truncated': response_truncated,
        'lossy_source': bool(lossy_reasons),
        'lossy_reasons': sorted(lossy_reasons),
    }
    meta.update(extract_request_meta(endpoint, req_obj))
    if isinstance(response_obj, dict):
        meta.update(extract_response_meta(endpoint, response_obj))

    try:
        record = validate_record_payload(
            {
                'id': record_id,
                'request_id': request_id,
                'messages': messages,
                'tools': tools,
                'meta': meta,
            }
        )
    except Exception as exc:
        # Validation failures are staged (with the error text) for inspection
        # instead of being dropped silently.
        stats[f'invalid_record:{endpoint}'] += 1
        stats['invalid_records_total'] += 1
        return {
            'bucket': 'invalid',
            'endpoint': endpoint,
            'error': str(exc),
            'event_index': event_index,
        }, stats

    record_dict = record_as_dict(record)
    key = dedupe_key(
        dedupe_mode,
        record_id_value(record),
        record_request_id_value(record),
        record_dict['messages'],
        record_dict.get('tools', []),
    )
    # Records with any lossy reason go to the 'lossy' bucket, others 'strict'.
    bucket = 'lossy' if record_is_lossy(record) else 'strict'
    stats[f'{bucket}_records_staged'] += 1
    return {'bucket': bucket, 'dedupe_key': key, 'record': record_dict}, stats
1375
+
1376
+
1377
def process_source_to_stage(source_path: str, staging_dir: Path, dedupe_mode: str) -> Dict[str, Any]:
    """Stream one log source into per-source staging files.

    Valid staged entries are written to ``<sha1-prefix>.jsonl`` and invalid
    ones to ``<sha1-prefix>.invalid.jsonl`` inside ``staging_dir``. Returns
    the source path, both staging file paths, and per-source statistics.
    """
    stats = Counter()
    # Stable, filesystem-safe stem derived from the source path.
    stem = hashlib.sha1(source_path.encode('utf-8')).hexdigest()[:16]
    chunk_path = staging_dir / f'{stem}.jsonl'
    invalid_path = staging_dir / f'{stem}.invalid.jsonl'

    with chunk_path.open('w', encoding='utf-8') as chunk_out, invalid_path.open('w', encoding='utf-8') as invalid_out:
        for local_index, event in enumerate(iter_events([source_path]), start=1):
            endpoint = f"{event['method']} {event['path']}"
            if endpoint not in TARGET_PATHS:
                continue
            staged, event_stats = build_staged_entry_from_event(event, endpoint, local_index, dedupe_mode)
            stats.update(event_stats)
            if not staged:
                continue
            sink = invalid_out if staged['bucket'] == 'invalid' else chunk_out
            sink.write(json.dumps(staged, ensure_ascii=False) + '\n')

    return {
        'source': source_path,
        'chunk_path': str(chunk_path),
        'invalid_path': str(invalid_path),
        'stats': dict(stats),
    }
1403
+
1404
+
1405
def append_parquet_rows(writer: Any, rows: List[Dict[str, Any]], path: Path) -> Any:
    """Append a batch of rows to a ParquetWriter, creating the writer lazily.

    Returns the (possibly newly created) writer. An empty batch is a no-op
    that returns the writer unchanged.
    """
    if not rows:
        return writer
    batch = pa.Table.from_pylist(rows, schema=PARQUET_SCHEMA)
    if writer is None:
        writer = pq.ParquetWriter(str(path), PARQUET_SCHEMA)
    writer.write_table(batch)
    return writer
1413
+
1414
+
1415
def main() -> int:
    """Run the export pipeline: stage sources in parallel, then merge.

    Phase 1 fans each log source out to a thread worker that writes per-source
    staging JSONL files. Phase 2 merges staging files (in sorted source order,
    for determinism) into the final strict/lossy JSONL and/or Parquet outputs
    with cross-source deduplication, then writes a manifest. Returns 0.
    """
    args = parse_args()
    ensure_parquet_runtime(args.output_format)
    current_log = resolve_current_log_path(args.container) if args.include_current else None
    sources = iter_sources(args.archive_root, current_log, args.limit_sources)
    if not sources:
        raise SystemExit('No log sources found.')

    # Timestamped output directory so repeated runs never collide.
    out_dir = Path(args.output_root) / f'qwen35-export-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    out_dir.mkdir(parents=True, exist_ok=True)
    staging_dir = out_dir / 'staging'
    staging_dir.mkdir(parents=True, exist_ok=True)
    strict_path = out_dir / 'qwen35-train.jsonl'
    lossy_path = out_dir / 'qwen35-train-lossy.jsonl'
    strict_parquet_path = out_dir / 'qwen35-train.parquet'
    lossy_parquet_path = out_dir / 'qwen35-train-lossy.parquet'
    invalid_path = out_dir / 'invalid-records.jsonl'
    manifest_path = out_dir / 'manifest.json'

    stats = Counter()
    worker_count = auto_worker_count(args.workers, len(sources))
    worker_results: List[Dict[str, Any]] = []

    # Phase 1: one staging task per source; progress is printed as JSON lines.
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            executor.submit(process_source_to_stage, source_path, staging_dir, args.dedupe_mode): source_path
            for source_path in sources
        }
        files_done = 0
        for future in as_completed(futures):
            result = future.result()
            worker_results.append(result)
            stats.update(result['stats'])
            files_done += 1
            print(
                json.dumps(
                    {'files_done': files_done, 'sources_total': len(sources), **dict(stats)},
                    ensure_ascii=False,
                ),
                flush=True,
            )

    jsonl_enabled = args.output_format in {'jsonl', 'both'}
    parquet_enabled = args.output_format in {'parquet', 'both'}
    # Dedupe sets are kept per bucket: a strict and a lossy copy of the same
    # content do not suppress each other.
    strict_seen: set[str] = set()
    lossy_seen: set[str] = set()
    strict_writer = None
    lossy_writer = None
    strict_batch: List[Dict[str, Any]] = []
    lossy_batch: List[Dict[str, Any]] = []
    batch_size = 1000  # rows buffered per Parquet write

    strict_out = strict_path.open('w', encoding='utf-8') if jsonl_enabled else None
    lossy_out = lossy_path.open('w', encoding='utf-8') if jsonl_enabled else None
    invalid_out = invalid_path.open('w', encoding='utf-8')

    try:
        # Phase 2: merge staging chunks in sorted source order for determinism.
        for result in sorted(worker_results, key=lambda item: item['source']):
            chunk_path = Path(result['chunk_path'])
            invalid_chunk_path = Path(result['invalid_path'])

            if invalid_chunk_path.exists():
                with invalid_chunk_path.open('r', encoding='utf-8') as invalid_in:
                    for line in invalid_in:
                        invalid_out.write(line)

            if chunk_path.exists():
                with chunk_path.open('r', encoding='utf-8') as chunk_in:
                    for line in chunk_in:
                        if not line.strip():
                            continue
                        staged = json.loads(line)
                        bucket = staged['bucket']
                        dedupe = staged.get('dedupe_key')
                        record = staged['record']
                        seen = strict_seen if bucket == 'strict' else lossy_seen
                        if dedupe is not None and dedupe in seen:
                            stats[f'{bucket}_records_deduped'] += 1
                            continue
                        if dedupe is not None:
                            seen.add(dedupe)

                        if bucket == 'strict':
                            if strict_out is not None:
                                strict_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                            if parquet_enabled:
                                strict_batch.append(record_to_parquet_row(record))
                                if len(strict_batch) >= batch_size:
                                    strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
                                    strict_batch = []
                            stats['strict_records_written'] += 1
                        else:
                            if lossy_out is not None:
                                lossy_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                            if parquet_enabled:
                                lossy_batch.append(record_to_parquet_row(record))
                                if len(lossy_batch) >= batch_size:
                                    lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
                                    lossy_batch = []
                            stats['lossy_records_written'] += 1

            # Staging chunks are removed as soon as they are merged.
            if not args.keep_staging:
                if chunk_path.exists():
                    chunk_path.unlink()
                if invalid_chunk_path.exists():
                    invalid_chunk_path.unlink()

        # Flush any partially filled Parquet batches.
        if parquet_enabled:
            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
    finally:
        if strict_out is not None:
            strict_out.close()
        if lossy_out is not None:
            lossy_out.close()
        invalid_out.close()
        if strict_writer is not None:
            strict_writer.close()
        if lossy_writer is not None:
            lossy_writer.close()

    # rmdir only succeeds when all chunks were unlinked; otherwise keep it.
    if not args.keep_staging:
        try:
            staging_dir.rmdir()
        except OSError:
            pass

    manifest = {
        'output_dir': str(out_dir),
        'source_count': len(sources),
        'sources': sources,
        'workers': worker_count,
        'dedupe_mode': args.dedupe_mode,
        'output_format': args.output_format,
        'strict_records': stats['strict_records_written'],
        'lossy_records': stats['lossy_records_written'],
        'stats': dict(stats),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
1556
+
1557
+
1558
# Script entry point: propagate main()'s integer return code as the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main())