agentic-dataset-builder 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +125 -0
- package/agentic_dataset/__init__.py +1 -0
- package/agentic_dataset/build_agentic_dataset.py +368 -0
- package/agentic_dataset/export_codex_session_to_qwen35.py +466 -0
- package/agentic_dataset/export_pi_session.py +701 -0
- package/agentic_dataset/export_pi_session_to_qwen35.py +742 -0
- package/agentic_dataset/export_qwen35_training.py +1559 -0
- package/agentic_dataset/label_qwen35_agentic.py +156 -0
- package/agentic_dataset/platform_paths.py +85 -0
- package/agentic_dataset/qwen35_training_record.py +179 -0
- package/bin/agentic-dataset-builder.js +77 -0
- package/package.json +40 -0
- package/requirements.txt +2 -0
- package/run.py +8 -0
|
@@ -0,0 +1,1559 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
import gzip
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import subprocess
|
|
12
|
+
from collections import Counter
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
# pyarrow is an optional dependency: JSONL export still works without it,
# and Parquet writing sites check the `pa`/`pq` sentinels before use.
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except Exception:  # pragma: no cover - parquet is optional at runtime
    pa = None
    pq = None

# Flat, analytics-friendly Parquet schema for exported rows. Nested payloads
# (messages, tools, lossy reasons, metadata) are stored as JSON strings so
# the column layout stays stable as upstream payload shapes evolve.
PARQUET_SCHEMA = None
if pa is not None:  # pragma: no branch
    PARQUET_SCHEMA = pa.schema(
        [
            ('id', pa.string()),
            ('request_id', pa.string()),
            ('endpoint', pa.string()),
            ('status', pa.int64()),
            ('ts', pa.string()),
            ('key', pa.string()),
            ('source', pa.string()),
            ('requested_model', pa.string()),
            ('actual_model', pa.string()),
            ('stream', pa.bool_()),
            ('thinking_level', pa.string()),
            ('reasoning_summary_mode_json', pa.string()),
            ('thinking_type', pa.string()),
            ('thinking_budget_tokens', pa.int64()),
            ('max_output_tokens', pa.int64()),
            ('tool_spec_count', pa.int64()),
            ('tool_choice_json', pa.string()),
            ('request_contains_non_text_content', pa.bool_()),
            ('request_image_block_count', pa.int64()),
            ('request_video_block_count', pa.int64()),
            ('request_tool_call_block_count', pa.int64()),
            ('request_tool_result_block_count', pa.int64()),
            ('request_thinking_block_count', pa.int64()),
            ('response_contains_non_text_content', pa.bool_()),
            ('response_image_block_count', pa.int64()),
            ('response_video_block_count', pa.int64()),
            ('response_tool_call_block_count', pa.int64()),
            ('response_tool_result_block_count', pa.int64()),
            ('response_thinking_block_count', pa.int64()),
            ('request_truncated', pa.bool_()),
            ('response_truncated', pa.bool_()),
            ('lossy_source', pa.bool_()),
            ('lossy_reasons_json', pa.string()),
            ('user_message_count', pa.int64()),
            ('assistant_message_count', pa.int64()),
            ('tool_message_count', pa.int64()),
            ('dialogue_rounds_est', pa.int64()),
            ('tool_call_count', pa.int64()),
            ('has_reasoning', pa.bool_()),
            ('reasoning_chars', pa.int64()),
            ('content_chars_total', pa.int64()),
            ('messages_json', pa.string()),
            ('tools_json', pa.string()),
            ('meta_json', pa.string()),
        ]
    )

# Pydantic-backed record model; optional because the remote runtime may not
# ship pydantic (call sites must tolerate `Qwen35TrainingRecord is None`).
try:
    from .qwen35_training_record import Qwen35TrainingRecord
except Exception:  # pragma: no cover - remote runtime may not ship pydantic
    Qwen35TrainingRecord = None

# Matches the relay's per-request summary line: a status emoji followed by
# HTTP status code, method, and path.
MAIN_RE = re.compile(r'.*(?:🟢|⚠️ ?|❌|🟡)\s+(\d+)\s+(GET|POST|PUT|PATCH|DELETE)\s+(\S+)')
# Matches the tree-drawn metadata continuation lines ("├─ key: value").
META_RE = re.compile(r'^\s*[├└]─\s+([^:]+):\s?(.*)$')
# Only these "METHOD /path" combinations are considered for export.
TARGET_PATHS = {
    'POST /openai/v1/responses',
    'POST /openai/v1/responses/compact',
    'POST /openai/v1/chat/completions',
    'POST /api/v1/messages',
}
# Marker the relay inserts into payloads it truncated while logging.
TRUNCATED_MARKER = '...[truncated]'
# Content-block `type` values grouped by modality / function.
TEXT_BLOCK_TYPES = {'text', 'input_text', 'output_text'}
IMAGE_BLOCK_TYPES = {'image', 'input_image', 'output_image', 'image_url'}
VIDEO_BLOCK_TYPES = {'video', 'input_video', 'output_video', 'video_url'}
TOOL_CALL_BLOCK_TYPES = {'tool_use', 'tool_call', 'function_call', 'custom_tool_call', 'web_search_call'}
TOOL_RESULT_BLOCK_TYPES = {'tool_result', 'tool_output', 'function_call_output'}
THINKING_BLOCK_TYPES = {'thinking', 'reasoning'}
# Qwen vision placeholder tokens that may appear inline in flattened text.
VISION_IMAGE_TOKEN = '<|vision_start|><|image_pad|><|vision_end|>'
VISION_VIDEO_TOKEN = '<|vision_start|><|video_pad|><|vision_end|>'
# Inline markup some providers embed in plain-text message content.
THINK_INLINE_RE = re.compile(r'<think>\s*(.*?)\s*</think>', re.S)
TOOL_RESPONSE_RE = re.compile(r'<tool_response>\s*(.*?)\s*</tool_response>', re.S)
TOOL_CALL_RE = re.compile(r'<tool_call>\s*<function=([^>\n]+)>\s*(.*?)</function>\s*</tool_call>', re.S)
TOOL_PARAM_RE = re.compile(r'<parameter=([^>\n]+)>\s*(.*?)\s*</parameter>', re.S)
# Splits text on either vision placeholder token, keeping the token itself
# (capturing group) so callers can turn it into an image/video block.
VISION_TOKEN_RE = re.compile(
    f'({re.escape(VISION_IMAGE_TOKEN)}|{re.escape(VISION_VIDEO_TOKEN)})'
)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the relay-log exporter.

    Returns the parsed :class:`argparse.Namespace`. Notable flags:
    ``--include-current`` is on by default and ``--exclude-current`` turns
    it off via the shared ``include_current`` destination.
    """
    parser = argparse.ArgumentParser(description='Export relay logs into Qwen3.5-compatible JSONL.')
    # Where exported JSONL/Parquet files are written.
    parser.add_argument('--output-root', required=True)
    # Directory of rotated docker json-log archives (*.gz).
    # NOTE(review): site-specific default path — presumably only valid on the
    # original training host; confirm before reuse elsewhere.
    parser.add_argument(
        '--archive-root',
        default='/vePFS-Mindverse/share/yiwen/claude-relay-service/docker-json-logs/di-20260320122547-ws9d2/claude-relay-service-claude-relay-1',
    )
    # Container whose live log is resolved when include_current is set.
    parser.add_argument('--container', default='claude-relay-service-claude-relay-1')
    # Paired flags sharing one destination: include the live (non-rotated) log?
    parser.add_argument('--include-current', action='store_true', default=True)
    parser.add_argument('--exclude-current', dest='include_current', action='store_false')
    # 0 means "no limit" on how many source files are processed.
    parser.add_argument('--limit-sources', type=int, default=0)
    parser.add_argument('--dedupe-mode', choices=('requestid', 'content', 'none'), default='requestid')
    parser.add_argument('--workers', type=int, default=0, help='Thread workers for per-source staging; 0 means auto.')
    parser.add_argument(
        '--output-format',
        choices=('jsonl', 'parquet', 'both'),
        default='parquet',
        help='Emit JSONL, Parquet, or both. Parquet is the default and is optimized for analytics-first workflows.',
    )
    parser.add_argument('--keep-staging', action='store_true', help='Keep intermediate staged chunk files for debugging.')
    return parser.parse_args()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def resolve_current_log_path(container: str) -> str:
    """Return the host path of *container*'s live Docker JSON log file.

    Shells out to ``docker inspect -f '{{.LogPath}}'`` and strips the
    trailing newline. Raises ``subprocess.CalledProcessError`` if docker is
    unavailable or the container does not exist.

    NOTE(review): the container name is interpolated into a ``bash -lc``
    string, so this assumes *container* comes from trusted CLI input —
    confirm before exposing to untrusted callers.
    """
    cmd = ['bash', '-lc', f"export DOCKER_API_VERSION=1.43; docker inspect -f '{{{{.LogPath}}}}' {container}"]
    return subprocess.check_output(cmd, text=True).strip()
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def sorted_archive_sources(archive_root: str) -> List[str]:
    """Return the ``.gz`` archives under *archive_root*, ordered by rotation.

    File names are expected to start with a numeric sequence before the
    first underscore; names without one sort first (rank 0), ties break on
    the file name. A missing directory yields an empty list.
    """
    if not os.path.isdir(archive_root):
        return []

    def rotation_order(path: str) -> Tuple[int, str]:
        # Primary key: the integer prefix before the first '_'; fallback 0.
        filename = os.path.basename(path)
        prefix = filename.split('_', 1)[0]
        try:
            rank = int(prefix)
        except ValueError:
            rank = 0
        return rank, filename

    candidates: List[str] = []
    for entry in os.listdir(archive_root):
        full_path = os.path.join(archive_root, entry)
        if entry.endswith('.gz') and os.path.isfile(full_path):
            candidates.append(full_path)
    candidates.sort(key=rotation_order)
    return candidates
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def iter_sources(archive_root: str, current_log: Optional[str], limit: int) -> List[str]:
    """Collect the ordered list of log sources to process.

    Rotated archives come first (oldest rotation order), followed by the
    live log when given. A positive *limit* truncates the combined list;
    zero or negative keeps everything.
    """
    paths = list(sorted_archive_sources(archive_root))
    if current_log:
        paths += [current_log]
    return paths[:limit] if limit > 0 else paths
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def open_log_file(path: str):
    """Open *path* for text reading, transparently decompressing ``.gz``.

    Files are decoded as UTF-8 with undecodable bytes replaced rather than
    raising, so corrupted log lines never abort iteration.
    """
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, 'rt', encoding='utf-8', errors='replace')
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def iter_events(paths: List[str]) -> Iterator[Dict[str, Any]]:
    """Yield request events reconstructed from docker JSON log files.

    Each yielded event dict has keys ``source`` (file path), ``ts``,
    ``status``, ``method``, ``path`` (from the summary line matched by
    MAIN_RE) and ``meta`` (a dict filled from the "├─ key: value"
    continuation lines matched by META_RE). A new summary line flushes the
    event under construction; the final event is flushed after all files.
    """
    current_event: Optional[Dict[str, Any]] = None
    # Last metadata key seen; lines that match neither regex are treated as
    # continuations of this key's value.
    current_key: Optional[str] = None
    current_source: Optional[str] = None

    for path in paths:
        current_source = path
        with open_log_file(path) as handle:
            for raw_line in handle:
                # Each physical line is a docker json-log record with
                # 'log' (the app output) and 'time'. Malformed lines are
                # skipped silently.
                try:
                    obj = json.loads(raw_line)
                    log_line = obj.get('log', '').rstrip('\n')
                    ts = obj.get('time')
                except Exception:
                    continue

                main_match = MAIN_RE.match(log_line)
                if main_match:
                    # A new summary line ends the previous event.
                    if current_event is not None:
                        yield current_event
                    current_event = {
                        'source': current_source,
                        'ts': ts,
                        'status': int(main_match.group(1)),
                        'method': main_match.group(2),
                        'path': main_match.group(3),
                        'meta': {},
                    }
                    current_key = None
                    continue

                # Metadata/continuation lines before any summary are noise.
                if current_event is None:
                    continue

                meta_match = META_RE.match(log_line)
                if meta_match:
                    current_key = meta_match.group(1)
                    current_event['meta'][current_key] = meta_match.group(2)
                    continue

                # Non-matching line: append to the last meta value.
                # NOTE(review): appended without a separator — assumed the
                # relay wraps long values without meaningful line breaks.
                if current_key:
                    current_event['meta'][current_key] += log_line

    # Flush the trailing event from the last file.
    if current_event is not None:
        yield current_event
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def parse_json_maybe(value: Any) -> Any:
    """Best-effort JSON decode.

    Dicts and lists pass through unchanged; strings are parsed as JSON.
    Undecodable strings and every other input type (including None)
    return None.
    """
    if isinstance(value, (dict, list)):
        return value
    if isinstance(value, str):
        try:
            return json.loads(value)
        except Exception:
            return None
    return None
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def json_fallback(value: Any) -> str:
    """Serialize *value* to deterministic JSON (sorted keys, raw unicode)."""
    return json.dumps(value, sort_keys=True, ensure_ascii=False)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def has_truncation(value: Any) -> bool:
    """Recursively detect truncated payloads.

    True when a string contains the truncation marker, when a dict carries
    a truthy ``_truncated`` flag, or when any nested list element or dict
    value does. Every other type is considered untruncated.
    """
    if isinstance(value, str):
        return TRUNCATED_MARKER in value
    if isinstance(value, dict):
        if value.get('_truncated'):
            return True
        return any(has_truncation(child) for child in value.values())
    if isinstance(value, list):
        return any(has_truncation(child) for child in value)
    return False
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def normalize_role(role: Any) -> Optional[str]:
    """Map provider-specific role labels to the canonical four roles.

    'developer' collapses into 'system' and 'model' into 'assistant';
    unrecognized labels yield None.
    """
    aliases = {'developer': 'system', 'model': 'assistant'}
    if role in aliases:
        return aliases[role]
    return role if role in ('system', 'user', 'assistant', 'tool') else None
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def get_text_from_block(block: Dict[str, Any]) -> str:
    """Pull the primary text payload out of a content block.

    Checks, in priority order, the string-valued keys 'text', 'content',
    'reasoning', then 'thinking'; a list-valued 'content' is flattened
    recursively. Returns '' when nothing textual is found.
    """
    for key in ('text', 'content', 'reasoning', 'thinking'):
        candidate = block.get(key)
        if isinstance(candidate, str):
            return candidate
    nested = block.get('content')
    if isinstance(nested, list):
        return flatten_text_only(nested)
    return ''
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def flatten_text_only(value: Any) -> str:
    """Reduce arbitrary message content to plain text.

    Strings pass through; lists join their non-empty flattened members
    with newlines; dicts contribute text only for text/thinking block
    types or via a nested content list; anything unrecognized is
    stringified (None becomes '').
    """
    if value is None:
        return ''
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        pieces = (flatten_text_only(entry) for entry in value)
        return '\n'.join(piece for piece in pieces if piece)
    if isinstance(value, dict):
        if value.get('type') in TEXT_BLOCK_TYPES | THINKING_BLOCK_TYPES:
            return get_text_from_block(value)
        nested = value.get('content')
        if isinstance(nested, list):
            return flatten_text_only(nested)
        return ''
    return str(value)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def parse_parameter_value(value: str) -> Any:
    """Decode a tool-call parameter: JSON when it parses, raw text otherwise.

    A literal JSON ``null`` (or undecodable text) falls back to returning
    the raw string, so callers never receive None here.
    """
    try:
        decoded = json.loads(value)
    except Exception:
        return value
    return value if decoded is None else decoded
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def split_inline_reasoning(text: str, lossy_reasons: set[str]) -> Tuple[Optional[str], str]:
    """Strip ``<think>...</think>`` spans out of assistant text.

    Returns ``(reasoning, remaining_text)``; *reasoning* is None when no
    non-empty think span was found. Leftover unmatched think tags record
    'unbalanced_think_markup' in *lossy_reasons*.
    """
    collected: List[str] = []

    def capture(match: re.Match[str]) -> str:
        body = match.group(1).strip()
        if body:
            collected.append(body)
        return ''

    remainder = THINK_INLINE_RE.sub(capture, text)
    if '<think>' in remainder or '</think>' in remainder:
        lossy_reasons.add('unbalanced_think_markup')
    joined = '\n\n'.join(segment for segment in collected if segment).strip()
    return (joined or None), remainder.strip()
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def split_vision_placeholder_text(text: str) -> List[Dict[str, Any]]:
    """Split text on Qwen vision placeholder tokens into content blocks.

    Placeholder tokens become image/video blocks flagged as placeholders;
    everything else becomes stripped text blocks. Empty fragments are
    dropped.
    """
    placeholder_kinds = {
        VISION_IMAGE_TOKEN: 'image',
        VISION_VIDEO_TOKEN: 'video',
    }
    result: List[Dict[str, Any]] = []
    for fragment in VISION_TOKEN_RE.split(text):
        if not fragment:
            continue
        kind = placeholder_kinds.get(fragment)
        if kind is not None:
            result.append(
                {
                    'type': kind,
                    'placeholder': True,
                    'placeholder_token': fragment,
                    'source_kind': 'placeholder',
                }
            )
        else:
            trimmed = fragment.strip()
            if trimmed:
                result.append({'type': 'text', 'text': trimmed})
    return result
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def extract_tool_calls_from_text(text: str, lossy_reasons: set[str]) -> Tuple[List[Dict[str, Any]], str]:
    """Parse inline ``<tool_call><function=...>`` markup out of text.

    Returns ``(tool_calls, cleaned_text)`` where each tool call is a
    ``{'type': 'function', 'function': {'name', 'arguments'}}`` dict and
    the markup has been removed from the text. Markup that survives the
    substitution records 'unparsed_tool_call_markup' in *lossy_reasons*.
    """
    tool_calls: List[Dict[str, Any]] = []

    def replacer(match: re.Match[str]) -> str:
        # group(1) = function name, group(2) = <parameter=...> body.
        name = match.group(1).strip()
        body = match.group(2)
        arguments: Dict[str, Any] = {}
        for param_match in TOOL_PARAM_RE.finditer(body):
            param_name = param_match.group(1).strip()
            param_value = param_match.group(2).strip()
            if param_name:
                arguments[param_name] = parse_parameter_value(param_value)
        # Non-empty body that produced no parameters means the markup was
        # malformed; keep the call but flag the loss.
        if not arguments and body.strip():
            lossy_reasons.add('tool_call_markup_without_parameters')
        tool_calls.append(
            {
                'type': 'function',
                'function': {
                    'name': name,
                    'arguments': arguments,
                },
            }
        )
        return ''

    cleaned = TOOL_CALL_RE.sub(replacer, text)
    # Any leftover opening markers indicate markup the regex could not match.
    if '<tool_call>' in cleaned or '<function=' in cleaned:
        lossy_reasons.add('unparsed_tool_call_markup')
    return tool_calls, cleaned.strip()
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def extract_tool_responses_from_text(text: str) -> Tuple[List[str], str]:
    """Pull ``<tool_response>...</tool_response>`` payloads out of user text.

    Returns ``(payloads, cleaned_text)``; blank payloads are dropped and
    the response spans are removed from the returned text.
    """
    payloads: List[str] = []
    for match in TOOL_RESPONSE_RE.finditer(text):
        body = match.group(1).strip()
        if body:
            payloads.append(body)
    return payloads, TOOL_RESPONSE_RE.sub('', text).strip()
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def parse_arguments_to_object(arguments: Any, lossy_reasons: set[str]) -> Dict[str, Any]:
    """Coerce tool-call arguments into a dict.

    Dicts pass through; strings are JSON-decoded when they yield an
    object. None is treated as simply absent ({} with no loss recorded);
    everything else records 'tool_arguments_not_object' and collapses
    to {}.
    """
    if isinstance(arguments, dict):
        return arguments
    if arguments is None:
        return {}
    if isinstance(arguments, str):
        decoded = parse_json_maybe(arguments)
        if isinstance(decoded, dict):
            return decoded
    lossy_reasons.add('tool_arguments_not_object')
    return {}
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def normalize_tool_call(call: Dict[str, Any], lossy_reasons: set[str]) -> Optional[Dict[str, Any]]:
    """Normalize one provider tool-call block to the OpenAI 'function' shape.

    Accepts both flat calls ({'name', 'arguments'/'input'}) and nested ones
    ({'function': {'name', 'arguments'}}). Returns
    ``{'type': 'function', 'id', 'function': {'name', 'arguments'}}`` or
    None when the call is not a dict or has no usable name, recording the
    loss reason either way.
    """
    if not isinstance(call, dict):
        lossy_reasons.add('invalid_tool_call')
        return None

    call_type = call.get('type')
    # Accept the id under any of the common provider keys.
    call_id = call.get('id') or call.get('tool_call_id') or call.get('call_id')
    function_block = call.get('function') if isinstance(call.get('function'), dict) else None

    if function_block:
        # Nested OpenAI-style call; outer 'name' is a fallback only.
        name = function_block.get('name') or call.get('name')
        arguments = function_block.get('arguments')
    else:
        # Flat call (e.g. Anthropic tool_use): 'input' is the argument key.
        name = call.get('name')
        arguments = call.get('arguments')
        if arguments is None:
            arguments = call.get('input')

    # Built-in web search calls often carry no name/arguments; synthesize
    # a 'web_search' call from the status field.
    if call_type == 'web_search_call' and (not isinstance(name, str) or not name):
        name = 'web_search'
        if arguments is None:
            payload: Dict[str, Any] = {}
            status = call.get('status')
            if isinstance(status, str):
                payload['status'] = status
            arguments = payload

    # Custom tool calls pass a raw string; wrap it so arguments stay a dict.
    if call_type == 'custom_tool_call' and isinstance(arguments, str):
        arguments = {'input': arguments}

    if not isinstance(name, str) or not name:
        lossy_reasons.add('tool_call_missing_name')
        return None

    return {
        'type': 'function',
        'id': call_id,
        'function': {
            'name': name,
            'arguments': parse_arguments_to_object(arguments, lossy_reasons),
        },
    }
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def normalize_tool_specs(raw_tools: Any) -> List[Dict[str, Any]]:
    """Normalize a tool-spec list so every entry carries a top-level 'name'.

    Accepts a JSON string or a list. Entries without a resolvable name
    (either 'name' or nested 'function.name') and non-dict entries are
    dropped. Input dicts are shallow-copied, never mutated.
    """
    if isinstance(raw_tools, str):
        raw_tools = parse_json_maybe(raw_tools)
    if not isinstance(raw_tools, list):
        return []

    specs: List[Dict[str, Any]] = []
    for entry in raw_tools:
        if not isinstance(entry, dict):
            continue
        name = entry.get('name')
        if not (isinstance(name, str) and name):
            nested = entry.get('function')
            name = nested.get('name') if isinstance(nested, dict) else None
        if not (isinstance(name, str) and name):
            continue
        specs.append({**entry, 'name': name})
    return specs
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def ensure_text_content(value: Any, lossy_reasons: set[str]) -> str:
    """Force tool content into a plain string.

    Strings pass through and None becomes ''. Structured content is
    flattened to its text; when nothing textual survives, the full value
    is JSON-serialized and 'non_text_tool_content' is recorded.
    """
    if isinstance(value, str):
        return value
    if value is None:
        return ''
    flattened = flatten_text_only(value)
    if flattened:
        return flattened
    lossy_reasons.add('non_text_tool_content')
    return json_fallback(value)
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def normalize_image_block(item: Dict[str, Any]) -> Dict[str, Any]:
    """Canonicalize one image content block.

    Resolves the URL from 'image_url' (plain string or ``{'url': ...}``
    dict) or, failing that, a string 'url'. Blocks without a resolvable
    URL are marked as placeholders carrying the Qwen image pad token.
    """
    raw = item.get('image_url')
    if isinstance(raw, str):
        url = raw
    elif isinstance(raw, dict):
        url = raw.get('url')
    else:
        fallback = item.get('url')
        url = fallback if isinstance(fallback, str) else None
    return {
        'type': 'image',
        'image_url': url,
        'placeholder': url is None,
        'placeholder_token': item.get('placeholder_token') or '<|vision_start|><|image_pad|><|vision_end|>',
        'source_kind': item.get('type') or ('image_url' if url else 'placeholder'),
    }
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def normalize_video_block(item: Dict[str, Any]) -> Dict[str, Any]:
    """Canonicalize one video content block.

    Resolves the URL from 'video_url' (plain string or ``{'url': ...}``
    dict) or, failing that, a string 'url'. Blocks without a resolvable
    URL are marked as placeholders carrying the Qwen video pad token.
    """
    raw = item.get('video_url')
    if isinstance(raw, str):
        url = raw
    elif isinstance(raw, dict):
        url = raw.get('url')
    else:
        fallback = item.get('url')
        url = fallback if isinstance(fallback, str) else None
    return {
        'type': 'video',
        'video_url': url,
        'placeholder': url is None,
        'placeholder_token': item.get('placeholder_token') or '<|vision_start|><|video_pad|><|vision_end|>',
        'source_kind': item.get('type') or ('video_url' if url else 'placeholder'),
    }
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def finalize_content(blocks: List[Dict[str, Any]]) -> Any:
    """Collapse a content-block list to its simplest representation.

    Empty input yields ''; a purely textual list collapses to one
    newline-joined string (skipping empty texts); any multimodal block
    keeps the list as-is.
    """
    if not blocks:
        return ''
    if any(block.get('type') != 'text' for block in blocks):
        return blocks
    return '\n'.join(block['text'] for block in blocks if block.get('text'))
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def append_text_block(blocks: List[Dict[str, Any]], text: str) -> None:
    """Append *text* to *blocks* in place, merging into a trailing text block.

    Whitespace-only text is ignored. When the last block is already text,
    the new text is joined onto it with a blank line so the content list
    never contains adjacent text blocks.
    """
    stripped = text.strip()
    if not stripped:
        return
    last = blocks[-1] if blocks else None
    if last is not None and last.get('type') == 'text':
        last['text'] = f"{last['text']}\n\n{stripped}"
    else:
        blocks.append({'type': 'text', 'text': stripped})
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def merge_initial_system_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold a run of leading system messages into one system message.

    Only the contiguous prefix of system-role messages is merged; later
    system messages stay in place. With zero or one leading system message
    the input list is returned unchanged (same object).
    """
    boundary = 0
    while boundary < len(messages) and messages[boundary].get('role') == 'system':
        boundary += 1
    if boundary <= 1:
        return messages

    fragments: List[str] = []
    for entry in messages[:boundary]:
        rendered = render_content_for_system_merge(entry.get('content')).strip()
        if rendered:
            fragments.append(rendered)
    combined = {
        'role': 'system',
        'content': '\n\n'.join(fragments),
    }
    return [combined] + messages[boundary:]
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def render_content_for_system_merge(content: Any) -> str:
    """Render message content to plain text for system-prompt merging.

    Strings pass through; lists contribute only their string-typed text
    blocks, newline-joined. Any other payload renders as ''.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ''
    texts = [
        block['text']
        for block in content
        if isinstance(block, dict)
        and block.get('type') == 'text'
        and isinstance(block.get('text'), str)
    ]
    return '\n'.join(text for text in texts if text)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def is_effectively_empty_content(content: Any) -> bool:
    """Return True when content carries no renderable payload.

    None, blank strings, empty lists, and lists consisting solely of
    blank text blocks all count as empty. Any non-dict element, non-text
    block, or non-blank text makes the content non-empty, as does any
    other payload type.
    """
    if content is None:
        return True
    if isinstance(content, str):
        return not content.strip()
    if not isinstance(content, list):
        return False
    for block in content:
        if not (isinstance(block, dict) and block.get('type') == 'text'):
            return False
        text = block.get('text')
        if isinstance(text, str) and text.strip():
            return False
    return True
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def content_features(content: Any) -> Dict[str, int]:
    """Count non-text blocks in list-shaped content.

    Returns a Counter with 'image', 'video', and 'other' tallies; text
    blocks, untyped blocks, non-dict entries, and non-list payloads are
    not counted.
    """
    tally: Counter = Counter()
    if not isinstance(content, list):
        return tally
    for block in content:
        if not isinstance(block, dict):
            continue
        kind = block.get('type')
        if kind in ('image', 'video'):
            tally[kind] += 1
        elif kind is not None and kind != 'text':
            tally['other'] += 1
    return tally
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def parse_message(
    role: str,
    raw_content: Any,
    explicit_tool_calls: Any = None,
    explicit_reasoning: Optional[str] = None,
    explicit_tool_call_id: Optional[str] = None,
    explicit_tool_name: Optional[str] = None,
) -> Tuple[List[Dict[str, Any]], Counter, set[str]]:
    """Convert one raw provider message into normalized chat messages.

    Returns ``(messages, feature_counts, lossy_reasons)`` where *messages*
    is the primary message followed by any tool-result messages peeled out
    of the content, *feature_counts* tallies block kinds seen (image,
    video, thinking, tool_call, tool_result, ...), and *lossy_reasons*
    names every normalization that lost information.
    """
    lossy_reasons: set[str] = set()
    feature_counts = Counter()
    # Tool results become separate follow-up messages, not content blocks.
    tool_messages: List[Dict[str, Any]] = []
    tool_calls: List[Dict[str, Any]] = []
    reasoning_parts: List[str] = []
    content_blocks: List[Dict[str, Any]] = []

    # Caller-supplied tool calls / reasoning (e.g. from OpenAI-style
    # message fields) are folded in before the content is scanned.
    if explicit_tool_calls is not None:
        raw_tool_calls = explicit_tool_calls
        if isinstance(raw_tool_calls, list):
            for item in raw_tool_calls:
                normalized = normalize_tool_call(item, lossy_reasons)
                if normalized:
                    tool_calls.append(normalized)
                    feature_counts['tool_call'] += 1
    if explicit_reasoning:
        reasoning_parts.append(explicit_reasoning.strip())
        feature_counts['thinking'] += 1

    # Normalize the content into a uniform list of items.
    items: List[Any]
    if isinstance(raw_content, list):
        items = raw_content
    elif raw_content is None:
        items = []
    else:
        items = [raw_content]

    for item in items:
        if isinstance(item, str):
            text = item
            # Assistant text may embed <think> and <tool_call> markup.
            if role == 'assistant':
                inline_reasoning, text = split_inline_reasoning(text, lossy_reasons)
                if inline_reasoning:
                    reasoning_parts.append(inline_reasoning)
                    feature_counts['thinking'] += 1
                inline_tool_calls, text = extract_tool_calls_from_text(text, lossy_reasons)
                if inline_tool_calls:
                    tool_calls.extend(inline_tool_calls)
                    feature_counts['tool_call'] += len(inline_tool_calls)
            # User text may embed <tool_response> payloads, which become
            # standalone tool messages (no id/name available inline).
            if role == 'user':
                tool_responses, text = extract_tool_responses_from_text(text)
                for payload in tool_responses:
                    tool_messages.append(
                        {
                            'role': 'tool',
                            'content': payload,
                            'tool_call_id': None,
                            'name': None,
                        }
                    )
                    feature_counts['tool_result'] += 1
            # Vision placeholder tokens split the text into blocks; system
            # messages cannot carry multimodal blocks, so keep raw text.
            blocks = split_vision_placeholder_text(text)
            if role == 'system' and any(block['type'] in {'image', 'video'} for block in blocks):
                lossy_reasons.add('system_multimodal_not_supported')
                append_text_block(content_blocks, text)
            else:
                for block in blocks:
                    if block['type'] == 'text':
                        append_text_block(content_blocks, block['text'])
                    else:
                        content_blocks.append(block)
                        feature_counts[block['type']] += 1
            continue
        if not isinstance(item, dict):
            # Unexpected scalar: stringify it but record the loss.
            append_text_block(content_blocks, str(item))
            lossy_reasons.add('non_dict_content_item')
            continue

        block_type = item.get('type')
        # Text-ish block: a declared text type, or any block carrying a
        # 'text' key that is not a known media/thinking type.
        if block_type in TEXT_BLOCK_TYPES or (
            'text' in item and block_type not in IMAGE_BLOCK_TYPES | VIDEO_BLOCK_TYPES | THINKING_BLOCK_TYPES
        ):
            text = item.get('text') if isinstance(item.get('text'), str) else None
            if text is None and isinstance(item.get('content'), str):
                text = item['content']
            if text is None:
                text = flatten_text_only(item)
            # Same inline-markup handling as the plain-string path above.
            if role == 'assistant':
                inline_reasoning, text = split_inline_reasoning(text, lossy_reasons)
                if inline_reasoning:
                    reasoning_parts.append(inline_reasoning)
                    feature_counts['thinking'] += 1
                inline_tool_calls, text = extract_tool_calls_from_text(text, lossy_reasons)
                if inline_tool_calls:
                    tool_calls.extend(inline_tool_calls)
                    feature_counts['tool_call'] += len(inline_tool_calls)
            if role == 'user':
                tool_responses, text = extract_tool_responses_from_text(text)
                for payload in tool_responses:
                    tool_messages.append(
                        {
                            'role': 'tool',
                            'content': payload,
                            'tool_call_id': None,
                            'name': None,
                        }
                    )
                    feature_counts['tool_result'] += 1
            blocks = split_vision_placeholder_text(text)
            if role == 'system' and any(block['type'] in {'image', 'video'} for block in blocks):
                lossy_reasons.add('system_multimodal_not_supported')
                append_text_block(content_blocks, text)
            else:
                for block in blocks:
                    if block['type'] == 'text':
                        append_text_block(content_blocks, block['text'])
                    else:
                        content_blocks.append(block)
                        feature_counts[block['type']] += 1
        elif block_type in THINKING_BLOCK_TYPES:
            # Structured reasoning block -> reasoning_content.
            text = get_text_from_block(item)
            if text:
                reasoning_parts.append(text.strip())
                feature_counts['thinking'] += 1
        elif block_type in IMAGE_BLOCK_TYPES or 'image_url' in item or 'image' in item:
            if role == 'system':
                append_text_block(content_blocks, '[unsupported system image omitted]')
                lossy_reasons.add('system_multimodal_not_supported')
            else:
                content_blocks.append(normalize_image_block(item))
                feature_counts['image'] += 1
        elif block_type in VIDEO_BLOCK_TYPES or 'video_url' in item or 'video' in item:
            if role == 'system':
                append_text_block(content_blocks, '[unsupported system video omitted]')
                lossy_reasons.add('system_multimodal_not_supported')
            else:
                content_blocks.append(normalize_video_block(item))
                feature_counts['video'] += 1
        elif block_type in TOOL_CALL_BLOCK_TYPES:
            normalized = normalize_tool_call(item, lossy_reasons)
            if normalized:
                tool_calls.append(normalized)
                feature_counts['tool_call'] += 1
        elif block_type in TOOL_RESULT_BLOCK_TYPES:
            # Tool results become separate tool-role messages; the id is
            # accepted under any common provider key.
            tool_messages.append(
                {
                    'role': 'tool',
                    'content': ensure_text_content(item.get('content') or item.get('text'), lossy_reasons),
                    'tool_call_id': item.get('tool_use_id') or item.get('tool_call_id') or item.get('id') or item.get('call_id'),
                    'name': item.get('name') or item.get('tool_name'),
                }
            )
            feature_counts['tool_result'] += 1
        else:
            if isinstance(item.get('content'), list):
                # Unknown wrapper with nested content: recurse and splice
                # the primary result's content into this message.
                nested_messages, nested_features, nested_lossy = parse_message(
                    role,
                    item['content'],
                    item.get('tool_calls'),
                    item.get('reasoning_content') if isinstance(item.get('reasoning_content'), str) else None,
                    item.get('tool_call_id'),
                    item.get('name'),
                )
                feature_counts.update(nested_features)
                lossy_reasons.update(nested_lossy)
                if nested_messages:
                    primary = nested_messages[0]
                    primary_content = primary.get('content', '')
                    if isinstance(primary_content, str):
                        append_text_block(content_blocks, primary_content)
                    elif isinstance(primary_content, list):
                        content_blocks.extend(primary_content)
            else:
                # Truly unknown block: keep its JSON form as text.
                append_text_block(content_blocks, json_fallback(item))
                lossy_reasons.add('unknown_content_block')

    # Assemble the primary message, then attach role-specific fields.
    message: Dict[str, Any] = {'role': role, 'content': finalize_content(content_blocks)}
    if role == 'assistant':
        reasoning = '\n\n'.join(part for part in reasoning_parts if part).strip()
        if reasoning:
            message['reasoning_content'] = reasoning
        if tool_calls:
            message['tool_calls'] = tool_calls
    if role == 'tool':
        if explicit_tool_call_id:
            message['tool_call_id'] = explicit_tool_call_id
        if explicit_tool_name:
            message['name'] = explicit_tool_name
        # Tool message content must be a plain string.
        if not isinstance(message['content'], str):
            message['content'] = ensure_text_content(message['content'], lossy_reasons)
    messages = [message]
    messages.extend(tool_messages)
    return messages, feature_counts, lossy_reasons
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
def normalize_message_sequence(raw_messages: Any, endpoint: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Normalize an arbitrary message payload into a flat list of role/content
    dicts plus a summary-flags dict.

    Accepts a JSON-encoded string, a single message dict, or a list of
    messages.  Consecutive same-role messages (except ``tool``) are merged,
    and system messages are hoisted to the front when they appear out of
    order (recorded as a lossy reason).
    """
    # A JSON-encoded payload is decoded first so the structural branches
    # below see the real object.
    if isinstance(raw_messages, str):
        decoded = parse_json_maybe(raw_messages)
        if decoded is not None:
            raw_messages = decoded

    out: List[Dict[str, Any]] = []
    features = Counter()
    lossy: set[str] = set()

    def take(parsed) -> None:
        # Fold one parse_message() result (messages, features, lossy) into
        # the accumulators.
        parsed_messages, parsed_features, parsed_lossy = parsed
        out.extend(parsed_messages)
        features.update(parsed_features)
        lossy.update(parsed_lossy)

    if isinstance(raw_messages, str):
        # Still a plain string after the decode attempt: treat as user text.
        take(parse_message('user', raw_messages))
    elif isinstance(raw_messages, dict):
        role = normalize_role(raw_messages.get('role') or raw_messages.get('type'))
        if role:
            take(
                parse_message(
                    role,
                    raw_messages.get('content') if 'content' in raw_messages else raw_messages.get('text'),
                    raw_messages.get('tool_calls'),
                    raw_messages.get('reasoning_content') if isinstance(raw_messages.get('reasoning_content'), str) else None,
                    raw_messages.get('tool_call_id'),
                    raw_messages.get('name'),
                )
            )
    elif isinstance(raw_messages, list):
        for item in raw_messages:
            if isinstance(item, dict) and ('role' in item or item.get('type') == 'message'):
                role = normalize_role(item.get('role'))
                if not role:
                    lossy.add('unsupported_role')
                    continue
                take(
                    parse_message(
                        role,
                        item.get('content') if 'content' in item else item.get('text'),
                        item.get('tool_calls'),
                        item.get('reasoning_content') if isinstance(item.get('reasoning_content'), str) else None,
                        item.get('tool_call_id'),
                        item.get('name'),
                    )
                )
            else:
                # Anything unrecognized is folded in as user-authored content.
                take(parse_message('user', item))

    # Merge consecutive messages with same role except tool role.  Merging is
    # skipped when either side carries tool_calls or reasoning_content, since
    # those fields cannot be concatenated safely.
    merged: List[Dict[str, Any]] = []
    for msg in out:
        mergeable = (
            merged
            and merged[-1]['role'] == msg['role']
            and msg['role'] != 'tool'
            and 'tool_calls' not in merged[-1]
            and 'tool_calls' not in msg
            and 'reasoning_content' not in merged[-1]
            and 'reasoning_content' not in msg
        )
        if not mergeable:
            merged.append(msg)
            continue
        tail = merged[-1]
        if isinstance(tail['content'], str) and isinstance(msg['content'], str):
            tail['content'] = (tail['content'] + '\n\n' + msg['content']).strip()
        elif isinstance(tail['content'], list) and isinstance(msg['content'], list):
            tail['content'].extend(msg['content'])
        else:
            tail['content'] = (
                ensure_text_content(tail['content'], lossy)
                + '\n\n'
                + ensure_text_content(msg['content'], lossy)
            )

    # System messages must lead the conversation; hoist them when not already
    # a prefix of the merged sequence.
    leading = [m for m in merged if m['role'] == 'system']
    trailing = [m for m in merged if m['role'] != 'system']
    if leading and merged[: len(leading)] != leading:
        lossy.add('system_reordered')
        merged = leading + trailing

    flags = {
        'contains_non_text_content': features['image'] > 0 or features['video'] > 0,
        'image_block_count': features['image'],
        'video_block_count': features['video'],
        'tool_call_block_count': features['tool_call'],
        'tool_result_block_count': features['tool_result'],
        'thinking_block_count': features['thinking'],
        'lossy_reasons': sorted(lossy),
    }
    return merged, flags
|
884
|
+
def extract_request_meta(endpoint: str, req_obj: Dict[str, Any]) -> Dict[str, Any]:
    """Pull request-level metadata out of a raw request body.

    Collected fields: requested model, streaming flag, reasoning/thinking
    configuration, output-token limit, tool-spec count, and tool choice.
    Fields absent from the request are simply omitted from the result.
    """
    meta: Dict[str, Any] = {}

    model = req_obj.get('model')
    if isinstance(model, str) and model:
        meta['requested_model'] = model

    if isinstance(req_obj.get('stream'), bool):
        meta['stream'] = req_obj['stream']

    reasoning_cfg = req_obj.get('reasoning')
    if isinstance(reasoning_cfg, dict):
        if isinstance(reasoning_cfg.get('effort'), str):
            meta['thinking_level'] = reasoning_cfg['effort']
        if 'summary' in reasoning_cfg:
            meta['reasoning_summary_mode'] = reasoning_cfg['summary']

    thinking_cfg = req_obj.get('thinking')
    if isinstance(thinking_cfg, dict):
        if isinstance(thinking_cfg.get('type'), str):
            meta['thinking_type'] = thinking_cfg['type']
        if isinstance(thinking_cfg.get('budget_tokens'), int):
            meta['thinking_budget_tokens'] = thinking_cfg['budget_tokens']

    # Prefer the Responses-style field; fall back to the Chat-style one.
    if isinstance(req_obj.get('max_output_tokens'), int):
        meta['max_output_tokens'] = req_obj['max_output_tokens']
    elif isinstance(req_obj.get('max_tokens'), int):
        meta['max_output_tokens'] = req_obj['max_tokens']

    specs = normalize_tool_specs(req_obj.get('tools'))
    if specs:
        meta['tool_spec_count'] = len(specs)

    choice = req_obj.get('tool_choice') or req_obj.get('toolChoice')
    if choice is not None:
        meta['tool_choice'] = choice
    return meta
|
916
|
+
def extract_response_meta(endpoint: str, res_obj: Dict[str, Any]) -> Dict[str, Any]:
    """Pull response-level metadata (served model and total token usage) out
    of a raw response object, unwrapping a ``response`` envelope if present.
    """
    inner = res_obj.get('response')
    body = inner if isinstance(inner, dict) else res_obj

    meta: Dict[str, Any] = {}
    if isinstance(body, dict):
        served = body.get('model')
        if isinstance(served, str) and served:
            meta['actual_model'] = served
        usage = body.get('usage')
        if isinstance(usage, dict) and isinstance(usage.get('total_tokens'), int):
            meta['total_tokens'] = usage['total_tokens']
    return meta
|
931
|
+
def normalize_request_messages(endpoint: str, req_obj: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Convert a raw request body into normalized chat messages plus
    aggregate content flags, routing on the API endpoint shape.

    Lossy conditions (missing user turn, merged system preamble, anything
    surfaced by the per-sequence normalization) are accumulated into the
    returned flags.
    """
    collected: List[Dict[str, Any]] = []
    totals = Counter()
    lossy: set[str] = set()

    def fold(seq: List[Dict[str, Any]], flags: Dict[str, Any]) -> None:
        # Accumulate one normalize_message_sequence() result.
        collected.extend(seq)
        totals['image'] += flags['image_block_count']
        totals['video'] += flags['video_block_count']
        totals['tool_call'] += flags['tool_call_block_count']
        totals['tool_result'] += flags['tool_result_block_count']
        totals['thinking'] += flags['thinking_block_count']
        if flags['contains_non_text_content']:
            totals['non_text'] += 1
        lossy.update(flags['lossy_reasons'])

    if endpoint in ('POST /openai/v1/responses', 'POST /openai/v1/responses/compact'):
        instructions = req_obj.get('instructions')
        if instructions:
            collected.append({'role': 'system', 'content': str(instructions)})
        fold(*normalize_message_sequence(req_obj.get('input'), endpoint))
    elif endpoint == 'POST /openai/v1/chat/completions':
        instructions = req_obj.get('instructions')
        if instructions:
            collected.append({'role': 'system', 'content': str(instructions)})
        fold(*normalize_message_sequence(req_obj.get('messages'), endpoint))
    elif endpoint == 'POST /api/v1/messages':
        # Anthropic-style requests carry the system prompt as a separate
        # top-level field rather than a message.
        system_content = req_obj.get('system')
        if system_content is not None:
            fold(*normalize_message_sequence([{'role': 'system', 'content': system_content}], endpoint))
        fold(*normalize_message_sequence(req_obj.get('messages'), endpoint))
    else:
        fold(*normalize_message_sequence(req_obj, endpoint))

    if not any(msg['role'] == 'user' for msg in collected):
        lossy.add('missing_user_query')

    squashed = merge_initial_system_messages(collected)
    if len(squashed) != len(collected):
        lossy.add('merged_initial_system_messages')
    collected = squashed

    return collected, {
        'contains_non_text_content': bool(totals['non_text']),
        'image_block_count': totals['image'],
        'video_block_count': totals['video'],
        'tool_call_block_count': totals['tool_call'],
        'tool_result_block_count': totals['tool_result'],
        'thinking_block_count': totals['thinking'],
        'lossy_reasons': sorted(lossy),
    }
|
989
|
+
def _empty_response_flags() -> Dict[str, Any]:
    """Zeroed content-flags dict returned when a response yields no messages."""
    return {
        'contains_non_text_content': False,
        'image_block_count': 0,
        'video_block_count': 0,
        'tool_call_block_count': 0,
        'tool_result_block_count': 0,
        'thinking_block_count': 0,
        'lossy_reasons': [],
    }


def normalize_response_messages(endpoint: str, res_obj: Any) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Convert a raw response object into normalized assistant messages plus
    aggregate content flags, routing on the API endpoint shape.

    Returns ``([], zeroed_flags)`` when the response is missing, malformed,
    or carries nothing extractable.
    """
    if not isinstance(res_obj, dict):
        return [], _empty_response_flags()

    body = res_obj.get('response') if isinstance(res_obj.get('response'), dict) else res_obj

    if endpoint in ('POST /openai/v1/responses', 'POST /openai/v1/responses/compact') and isinstance(body, dict):
        output = body.get('output')
        if isinstance(output, list) and output:
            # A list of raw content blocks (no role / message wrappers) is the
            # Responses-API shape for a single assistant turn.
            if all(isinstance(item, dict) and 'role' not in item and item.get('type') != 'message' for item in output):
                messages, features, lossy = parse_message('assistant', output)
                return messages, {
                    'contains_non_text_content': features['image'] > 0 or features['video'] > 0,
                    'image_block_count': features['image'],
                    'video_block_count': features['video'],
                    'tool_call_block_count': features['tool_call'],
                    'tool_result_block_count': features['tool_result'],
                    'thinking_block_count': features['thinking'],
                    'lossy_reasons': sorted(lossy),
                }
            messages, flags = normalize_message_sequence(output, endpoint)
            if messages:
                return messages, flags
        output_text = body.get('output_text')
        if isinstance(output_text, str) and output_text.strip():
            return normalize_message_sequence([{'role': 'assistant', 'content': output_text}], endpoint)
    if endpoint == 'POST /openai/v1/chat/completions':
        choices = body.get('choices') if isinstance(body, dict) else None
        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
            msg = choices[0].get('message')
            # BUGFIX: only wrap a real message dict.  Previously a missing or
            # null ``message`` was wrapped as [None] and fell into the list
            # branch of normalize_message_sequence, which misparsed it as a
            # *user* message via parse_message('user', None).
            if isinstance(msg, dict):
                return normalize_message_sequence([msg], endpoint)
    if endpoint == 'POST /api/v1/messages':
        return normalize_message_sequence([{'role': 'assistant', 'content': body.get('content')}], endpoint)
    return [], _empty_response_flags()
|
1042
|
+
def record_hash(messages: List[Dict[str, Any]], tools: List[Dict[str, Any]]) -> str:
    """Stable SHA-256 fingerprint of a record's messages+tools, used for
    content-based deduplication.  Keys are sorted and separators compacted
    so semantically equal payloads always hash identically."""
    canonical = json.dumps(
        {'messages': messages, 'tools': tools},
        ensure_ascii=False,
        sort_keys=True,
        separators=(',', ':'),
    )
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()
|
1047
|
+
def dedupe_key(mode: str, record_id: str, request_id: Optional[str], messages: List[Dict[str, Any]], tools: List[Dict[str, Any]]) -> Optional[str]:
    """Dedupe key for one record under the chosen mode.

    ``none`` disables deduplication (returns None), ``requestid`` keys on
    the upstream request id (falling back to the record id when absent),
    and any other mode keys on a content hash of messages+tools.
    """
    if mode == 'none':
        return None
    if mode == 'requestid':
        return request_id if request_id else record_id
    return record_hash(messages, tools)
|
1055
|
+
def lightweight_validate_record(payload: Dict[str, Any]) -> None:
    """Minimal structural validation used when the pydantic model is absent.

    Raises ``ValueError`` on: empty messages, a system message appearing
    after the first non-system turn, image/video blocks inside a system
    message, ``<think>`` wrappers in assistant text or reasoning, a record
    with no user turn, or a lossy record without recorded reasons.
    """
    messages = payload.get('messages') or []
    if not messages:
        raise ValueError('messages must not be empty')

    has_user = False
    past_preamble = False  # becomes True once a non-system message is seen
    for msg in messages:
        role = msg.get('role')
        if role == 'system':
            if past_preamble:
                raise ValueError('system messages must appear only at the beginning')
        else:
            past_preamble = True

        if role == 'user':
            has_user = True
        content = msg.get('content')
        if role == 'system' and isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get('type') in {'image', 'video'}:
                    raise ValueError('system messages cannot contain image/video blocks')
        if role == 'assistant':
            reasoning = msg.get('reasoning_content')
            if isinstance(reasoning, str) and ('<think>' in reasoning or '</think>' in reasoning):
                raise ValueError('reasoning_content must not contain think wrappers')
            if isinstance(content, str) and ('<think>' in content or '</think>' in content):
                raise ValueError('assistant content must not contain inline think wrappers')

    if not has_user:
        raise ValueError('at least one user message is required')

    meta = payload.get('meta') or {}
    if meta.get('lossy_source') and not meta.get('lossy_reasons'):
        raise ValueError('lossy_source requires lossy_reasons')
|
1090
|
+
def validate_record_payload(payload: Dict[str, Any]) -> Any:
    """Validate a record payload.

    Uses the pydantic ``Qwen35TrainingRecord`` model when the import
    succeeded; otherwise falls back to the lightweight structural checks
    and returns the payload unchanged.
    """
    if Qwen35TrainingRecord is None:
        lightweight_validate_record(payload)
        return payload
    return Qwen35TrainingRecord.model_validate(payload)
|
1097
|
+
def record_messages_and_tools(record: Any) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Return ``(messages, tools)`` as plain dicts from either a pydantic
    record object or a raw dict record."""
    if Qwen35TrainingRecord is not None and hasattr(record, 'messages'):
        messages = [m.model_dump(exclude_none=True) for m in record.messages]
        tools = [t.model_dump(exclude_none=True) for t in record.tools]
        return messages, tools
    return record['messages'], record.get('tools', [])
|
1106
|
+
def record_is_lossy(record: Any) -> bool:
    """True when the record was flagged as lossy during normalization,
    reading from either a pydantic record or a raw dict."""
    if Qwen35TrainingRecord is None or not hasattr(record, 'meta'):
        return bool(record.get('meta', {}).get('lossy_source'))
    return bool(record.meta.lossy_source)
|
1112
|
+
def record_id_value(record: Any) -> str:
    """Record id from either a model object (attribute) or a dict (key)."""
    if hasattr(record, 'id'):
        return record.id
    return record['id']
|
1116
|
+
def record_request_id_value(record: Any) -> Optional[str]:
    """Upstream request id, if any, from either a model object or a dict."""
    if hasattr(record, 'request_id'):
        return record.request_id
    return record.get('request_id')
|
1120
|
+
def record_dump_json(record: Any) -> str:
    """Serialize a record to compact JSON, via pydantic when available."""
    use_model = Qwen35TrainingRecord is not None and hasattr(record, 'model_dump_json')
    if use_model:
        return record.model_dump_json(exclude_none=True)
    return json.dumps(record, ensure_ascii=False, separators=(',', ':'))
|
1126
|
+
def record_as_dict(record: Any) -> Dict[str, Any]:
    """Return the record as a plain dict (pydantic dump when available;
    otherwise the record is assumed to already be a dict)."""
    use_model = Qwen35TrainingRecord is not None and hasattr(record, 'model_dump')
    if use_model:
        return record.model_dump(exclude_none=True)
    return record
|
1132
|
+
def parquet_content_projection(content: Any) -> Tuple[Optional[str], List[Dict[str, Any]]]:
    """Project message content into the Parquet column shape.

    String content comes back as ``(text, [])``; structured (list) content
    comes back as ``(None, block_rows)`` where each row flattens one dict
    block.  Non-dict blocks are dropped; any other content type yields
    ``(None, [])``.
    """
    if isinstance(content, str):
        return content, []

    rows: List[Dict[str, Any]] = []
    for block in (content if isinstance(content, list) else []):
        if not isinstance(block, dict):
            continue
        metadata = block.get('metadata')
        rows.append(
            {
                'type': block.get('type'),
                'text': block.get('text'),
                'image_url': block.get('image_url'),
                'video_url': block.get('video_url'),
                'placeholder': block.get('placeholder'),
                'placeholder_token': block.get('placeholder_token'),
                'source_kind': block.get('source_kind'),
                'metadata_json': (
                    json.dumps(metadata, ensure_ascii=False, sort_keys=True)
                    if isinstance(metadata, dict)
                    else None
                ),
            }
        )
    return None, rows
|
1157
|
+
def parquet_tool_calls_projection(tool_calls: Any) -> List[Dict[str, Any]]:
    """Flatten assistant tool calls into Parquet-friendly rows.

    Non-list input yields ``[]``; non-dict entries are skipped.
    """
    if not isinstance(tool_calls, list):
        return []
    rows: List[Dict[str, Any]] = []
    for call in tool_calls:
        if not isinstance(call, dict):
            continue
        fn = call.get('function')
        if not isinstance(fn, dict):
            fn = {}
        # NOTE(review): ``arguments`` is json.dumps'd as-is; if upstream ever
        # stores it as an already-encoded JSON string it would be
        # double-encoded here — confirm against the producers.
        rows.append(
            {
                'id': call.get('id'),
                'type': call.get('type'),
                'function_name': fn.get('name'),
                'function_arguments_json': json.dumps(fn.get('arguments', {}), ensure_ascii=False, sort_keys=True),
            }
        )
    return rows
|
1176
|
+
def parquet_tools_projection(tools: Any) -> List[Dict[str, Any]]:
    """Flatten tool specs into Parquet rows, keeping the full raw JSON
    alongside the common name/description/parameters columns.

    Non-list input yields ``[]``; non-dict entries are skipped.
    """
    if not isinstance(tools, list):
        return []
    rows: List[Dict[str, Any]] = []
    for tool in tools:
        if not isinstance(tool, dict):
            continue
        params = tool.get('parameters')
        rows.append(
            {
                'name': tool.get('name'),
                'description': tool.get('description'),
                'parameters_json': (
                    json.dumps(params, ensure_ascii=False, sort_keys=True)
                    if params is not None
                    else None
                ),
                'raw_json': json.dumps(tool, ensure_ascii=False, sort_keys=True),
            }
        )
    return rows
|
1196
|
+
def record_to_parquet_row(record: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one normalized training record into a single Parquet row.

    Scalar metadata is copied through; nested structures (tool_choice,
    lossy reasons, messages, tools, meta) are serialized to JSON string
    columns; message statistics (role counts, tool-call count, reasoning
    presence/size, content char total) are derived here.
    """
    meta = dict(record.get('meta', {}))
    raw_messages = record.get('messages')
    # Only dict-shaped messages participate in the derived statistics; the
    # messages_json column below still serializes the raw value untouched.
    messages = [m for m in raw_messages if isinstance(m, dict)] if isinstance(raw_messages, list) else []

    role_counts = Counter(m.get('role') for m in messages)
    users = role_counts['user']
    assistants = role_counts['assistant']
    tool_msgs = role_counts['tool']

    tool_call_total = sum(
        len(m.get('tool_calls') or [])
        for m in messages
        if m.get('role') == 'assistant'
    )
    assistant_reasonings = [
        m.get('reasoning_content')
        for m in messages
        if m.get('role') == 'assistant' and isinstance(m.get('reasoning_content'), str)
    ]
    has_reasoning = any(text.strip() for text in assistant_reasonings)
    reasoning_chars = sum(len(text) for text in assistant_reasonings)

    def _content_len(value: Any) -> int:
        # Strings count their own length; structured content counts the
        # length of its canonical JSON serialization.
        if isinstance(value, str):
            return len(value)
        return len(json.dumps(value, ensure_ascii=False, sort_keys=True))

    content_chars_total = sum(
        _content_len(m.get('content'))
        for m in messages
        if m.get('content') is not None
    )

    return {
        'id': record.get('id'),
        'request_id': record.get('request_id'),
        'endpoint': meta.get('endpoint'),
        'status': meta.get('status'),
        'ts': meta.get('ts'),
        'key': meta.get('key'),
        'source': meta.get('source'),
        'requested_model': meta.get('requested_model'),
        'actual_model': meta.get('actual_model'),
        'stream': meta.get('stream'),
        'thinking_level': meta.get('thinking_level'),
        'reasoning_summary_mode_json': json.dumps(meta.get('reasoning_summary_mode'), ensure_ascii=False, sort_keys=True),
        'thinking_type': meta.get('thinking_type'),
        'thinking_budget_tokens': meta.get('thinking_budget_tokens'),
        'max_output_tokens': meta.get('max_output_tokens'),
        'tool_spec_count': meta.get('tool_spec_count'),
        'tool_choice_json': json.dumps(meta.get('tool_choice'), ensure_ascii=False, sort_keys=True),
        'request_contains_non_text_content': meta.get('request_contains_non_text_content'),
        'request_image_block_count': meta.get('request_image_block_count'),
        'request_video_block_count': meta.get('request_video_block_count'),
        'request_tool_call_block_count': meta.get('request_tool_call_block_count'),
        'request_tool_result_block_count': meta.get('request_tool_result_block_count'),
        'request_thinking_block_count': meta.get('request_thinking_block_count'),
        'response_contains_non_text_content': meta.get('response_contains_non_text_content'),
        'response_image_block_count': meta.get('response_image_block_count'),
        'response_video_block_count': meta.get('response_video_block_count'),
        'response_tool_call_block_count': meta.get('response_tool_call_block_count'),
        'response_tool_result_block_count': meta.get('response_tool_result_block_count'),
        'response_thinking_block_count': meta.get('response_thinking_block_count'),
        'request_truncated': meta.get('request_truncated'),
        'response_truncated': meta.get('response_truncated'),
        'lossy_source': meta.get('lossy_source'),
        'lossy_reasons_json': json.dumps(meta.get('lossy_reasons', []), ensure_ascii=False, sort_keys=True),
        'user_message_count': users,
        'assistant_message_count': assistants,
        'tool_message_count': tool_msgs,
        'dialogue_rounds_est': users,
        'tool_call_count': tool_call_total,
        'has_reasoning': has_reasoning,
        'reasoning_chars': reasoning_chars,
        'content_chars_total': content_chars_total,
        'messages_json': json.dumps(record.get('messages', []), ensure_ascii=False, sort_keys=True),
        'tools_json': json.dumps(record.get('tools', []), ensure_ascii=False, sort_keys=True),
        'meta_json': json.dumps(record.get('meta', {}), ensure_ascii=False, sort_keys=True),
    }
|
1276
|
+
def auto_worker_count(requested_workers: int, source_count: int) -> int:
    """Resolve the worker-pool size.

    An explicit positive request wins as-is; otherwise the pool is sized to
    ``min(source_count, cpu_count, 8)``, never below 1.  A missing CPU
    count is treated as 4.
    """
    if requested_workers > 0:
        return max(1, requested_workers)
    cpu = os.cpu_count() or 4
    cap = min(cpu, 8)
    return max(1, min(source_count, cap))
|
1283
|
+
def ensure_parquet_runtime(output_format: str) -> None:
    """Fail fast when Parquet output is requested but pyarrow is missing.

    Raises ``RuntimeError`` only for the 'parquet'/'both' formats; any
    other format needs no Parquet support and returns immediately.
    """
    if output_format not in {'parquet', 'both'}:
        return
    if pa is None or pq is None:
        raise RuntimeError('pyarrow is required for Parquet output')
|
1288
|
+
def build_staged_entry_from_event(event: Dict[str, Any], endpoint: str, event_index: int, dedupe_mode: str) -> Tuple[Optional[Dict[str, Any]], Counter]:
    """Turn one captured log event into a staged dataset entry.

    Returns ``(entry, stats)`` where ``entry`` is ``None`` for unusable
    events (bad request JSON, no extractable messages), a
    ``bucket='invalid'`` marker for events that fail record validation, or
    a ``bucket='strict'|'lossy'`` record ready for final assembly.
    """
    stats = Counter()
    stats[f'events:{endpoint}'] += 1

    req_obj = parse_json_maybe(event['meta'].get('req'))
    if not isinstance(req_obj, dict):
        stats[f'bad_req:{endpoint}'] += 1
        return None, stats

    tools = normalize_tool_specs(req_obj.get('tools'))
    req_messages, req_flags = normalize_request_messages(endpoint, req_obj)
    res_obj = parse_json_maybe(event['meta'].get('res'))
    res_messages, res_flags = normalize_response_messages(endpoint, res_obj)

    messages = req_messages + res_messages
    if not messages:
        stats[f'empty_messages:{endpoint}'] += 1
        return None, stats

    request_id = event['meta'].get('requestId')
    lossy = set(req_flags['lossy_reasons']) | set(res_flags['lossy_reasons'])
    req_truncated = has_truncation(req_obj)
    res_truncated = has_truncation(res_obj)
    if req_truncated:
        lossy.add('request_truncated')
    if res_truncated:
        lossy.add('response_truncated')

    record_id = f"{endpoint}:{event.get('ts')}:{event_index}"
    meta = {
        'endpoint': endpoint,
        'status': event['status'],
        'ts': event.get('ts') or '',
        'key': event['meta'].get('key'),
        'source': event.get('source'),
        'request_contains_non_text_content': req_flags['contains_non_text_content'],
        'request_image_block_count': req_flags['image_block_count'],
        'request_video_block_count': req_flags['video_block_count'],
        'request_tool_call_block_count': req_flags['tool_call_block_count'],
        'request_tool_result_block_count': req_flags['tool_result_block_count'],
        'request_thinking_block_count': req_flags['thinking_block_count'],
        'response_contains_non_text_content': res_flags['contains_non_text_content'],
        'response_image_block_count': res_flags['image_block_count'],
        'response_video_block_count': res_flags['video_block_count'],
        'response_tool_call_block_count': res_flags['tool_call_block_count'],
        'response_tool_result_block_count': res_flags['tool_result_block_count'],
        'response_thinking_block_count': res_flags['thinking_block_count'],
        'request_truncated': req_truncated,
        'response_truncated': res_truncated,
        'lossy_source': bool(lossy),
        'lossy_reasons': sorted(lossy),
    }
    meta.update(extract_request_meta(endpoint, req_obj))
    if isinstance(res_obj, dict):
        meta.update(extract_response_meta(endpoint, res_obj))

    payload = {
        'id': record_id,
        'request_id': request_id,
        'messages': messages,
        'tools': tools,
        'meta': meta,
    }
    try:
        record = validate_record_payload(payload)
    except Exception as exc:
        # Validation failures are staged separately so they can be inspected.
        stats[f'invalid_record:{endpoint}'] += 1
        stats['invalid_records_total'] += 1
        return {
            'bucket': 'invalid',
            'endpoint': endpoint,
            'error': str(exc),
            'event_index': event_index,
        }, stats

    record_dict = record_as_dict(record)
    key = dedupe_key(
        dedupe_mode,
        record_id_value(record),
        record_request_id_value(record),
        record_dict['messages'],
        record_dict.get('tools', []),
    )
    bucket = 'lossy' if record_is_lossy(record) else 'strict'
    stats[f'{bucket}_records_staged'] += 1
    return {'bucket': bucket, 'dedupe_key': key, 'record': record_dict}, stats
|
1377
|
+
def process_source_to_stage(source_path: str, staging_dir: Path, dedupe_mode: str) -> Dict[str, Any]:
    """Stream one log source into its per-source staging files.

    Valid staged entries go to ``<sha1>.jsonl``; validation failures go to
    ``<sha1>.invalid.jsonl``.  Returns the file paths plus per-source stats.
    """
    stats = Counter()
    # A short content hash of the path keeps staging filenames unique and
    # filesystem-safe regardless of the source path's characters.
    stem = hashlib.sha1(source_path.encode('utf-8')).hexdigest()[:16]
    chunk_path = staging_dir / f'{stem}.jsonl'
    invalid_path = staging_dir / f'{stem}.invalid.jsonl'

    with chunk_path.open('w', encoding='utf-8') as chunk_out, invalid_path.open('w', encoding='utf-8') as invalid_out:
        for local_index, event in enumerate(iter_events([source_path]), start=1):
            endpoint = f"{event['method']} {event['path']}"
            if endpoint not in TARGET_PATHS:
                continue
            entry, event_stats = build_staged_entry_from_event(event, endpoint, local_index, dedupe_mode)
            stats.update(event_stats)
            if not entry:
                continue
            sink = invalid_out if entry['bucket'] == 'invalid' else chunk_out
            sink.write(json.dumps(entry, ensure_ascii=False) + '\n')

    return {
        'source': source_path,
        'chunk_path': str(chunk_path),
        'invalid_path': str(invalid_path),
        'stats': dict(stats),
    }
|
1405
|
+
def append_parquet_rows(writer: Any, rows: List[Dict[str, Any]], path: Path) -> Any:
    """Append rows to a Parquet file, opening the writer lazily on first
    non-empty batch.  Returns the (possibly newly created) writer so the
    caller can keep threading it through and close it at the end."""
    if not rows:
        return writer
    # Build the table before opening the writer so a conversion failure does
    # not leave an empty file behind.
    chunk = pa.Table.from_pylist(rows, schema=PARQUET_SCHEMA)
    if writer is None:
        writer = pq.ParquetWriter(str(path), PARQUET_SCHEMA)
    writer.write_table(chunk)
    return writer
|
1415
|
+
def main() -> int:
    """Export staged log events into strict/lossy JSONL and/or Parquet datasets.

    Pipeline: (1) discover log sources, (2) fan out per-source parsing to a
    thread pool that writes per-source staging chunks, (3) merge the chunks
    in deterministic (source-sorted) order with cross-source dedupe, and
    (4) write a manifest summarizing the run.  Returns 0 on success; raises
    SystemExit if no sources are found.
    """
    args = parse_args()
    # Fails fast if parquet output was requested but pyarrow is unavailable.
    ensure_parquet_runtime(args.output_format)
    current_log = resolve_current_log_path(args.container) if args.include_current else None
    sources = iter_sources(args.archive_root, current_log, args.limit_sources)
    if not sources:
        raise SystemExit('No log sources found.')

    # Timestamped output directory so repeated runs never clobber each other.
    out_dir = Path(args.output_root) / f'qwen35-export-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
    out_dir.mkdir(parents=True, exist_ok=True)
    staging_dir = out_dir / 'staging'
    staging_dir.mkdir(parents=True, exist_ok=True)
    strict_path = out_dir / 'qwen35-train.jsonl'
    lossy_path = out_dir / 'qwen35-train-lossy.jsonl'
    strict_parquet_path = out_dir / 'qwen35-train.parquet'
    lossy_parquet_path = out_dir / 'qwen35-train-lossy.parquet'
    invalid_path = out_dir / 'invalid-records.jsonl'
    manifest_path = out_dir / 'manifest.json'

    stats = Counter()
    worker_count = auto_worker_count(args.workers, len(sources))
    worker_results: List[Dict[str, Any]] = []

    # Phase 1: parse every source concurrently into per-source staging chunks.
    # Results are collected as futures complete (arbitrary order); ordering is
    # restored later by sorting on the source path.
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            executor.submit(process_source_to_stage, source_path, staging_dir, args.dedupe_mode): source_path
            for source_path in sources
        }
        files_done = 0
        for future in as_completed(futures):
            result = future.result()
            worker_results.append(result)
            stats.update(result['stats'])
            files_done += 1
            # One JSON progress line per completed source, for log scraping.
            print(
                json.dumps(
                    {'files_done': files_done, 'sources_total': len(sources), **dict(stats)},
                    ensure_ascii=False,
                ),
                flush=True,
            )

    jsonl_enabled = args.output_format in {'jsonl', 'both'}
    parquet_enabled = args.output_format in {'parquet', 'both'}
    # Cross-source dedupe sets, kept separately per bucket.
    strict_seen: set[str] = set()
    lossy_seen: set[str] = set()
    # Parquet writers are created lazily by append_parquet_rows on first flush.
    strict_writer = None
    lossy_writer = None
    strict_batch: List[Dict[str, Any]] = []
    lossy_batch: List[Dict[str, Any]] = []
    batch_size = 1000  # rows buffered per parquet write

    strict_out = strict_path.open('w', encoding='utf-8') if jsonl_enabled else None
    lossy_out = lossy_path.open('w', encoding='utf-8') if jsonl_enabled else None
    invalid_out = invalid_path.open('w', encoding='utf-8')

    try:
        # Phase 2: merge staging chunks in deterministic source order so the
        # winner of any dedupe collision is stable across runs.
        for result in sorted(worker_results, key=lambda item: item['source']):
            chunk_path = Path(result['chunk_path'])
            invalid_chunk_path = Path(result['invalid_path'])

            # Invalid records are concatenated verbatim (no dedupe).
            if invalid_chunk_path.exists():
                with invalid_chunk_path.open('r', encoding='utf-8') as invalid_in:
                    for line in invalid_in:
                        invalid_out.write(line)

            if chunk_path.exists():
                with chunk_path.open('r', encoding='utf-8') as chunk_in:
                    for line in chunk_in:
                        if not line.strip():
                            continue
                        staged = json.loads(line)
                        bucket = staged['bucket']
                        dedupe = staged.get('dedupe_key')
                        record = staged['record']
                        seen = strict_seen if bucket == 'strict' else lossy_seen
                        # A missing dedupe_key means "never dedupe this record".
                        if dedupe is not None and dedupe in seen:
                            stats[f'{bucket}_records_deduped'] += 1
                            continue
                        if dedupe is not None:
                            seen.add(dedupe)

                        if bucket == 'strict':
                            if strict_out is not None:
                                strict_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                            if parquet_enabled:
                                strict_batch.append(record_to_parquet_row(record))
                                if len(strict_batch) >= batch_size:
                                    strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
                                    strict_batch = []
                            stats['strict_records_written'] += 1
                        else:
                            if lossy_out is not None:
                                lossy_out.write(json.dumps(record, ensure_ascii=False, separators=(',', ':')) + '\n')
                            if parquet_enabled:
                                lossy_batch.append(record_to_parquet_row(record))
                                if len(lossy_batch) >= batch_size:
                                    lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
                                    lossy_batch = []
                            stats['lossy_records_written'] += 1

            # Remove each staging chunk as soon as it is merged (unless the
            # user asked to keep them for debugging).
            if not args.keep_staging:
                if chunk_path.exists():
                    chunk_path.unlink()
                if invalid_chunk_path.exists():
                    invalid_chunk_path.unlink()

        # Flush any partially-filled parquet batches before closing writers.
        if parquet_enabled:
            strict_writer = append_parquet_rows(strict_writer, strict_batch, strict_parquet_path)
            lossy_writer = append_parquet_rows(lossy_writer, lossy_batch, lossy_parquet_path)
    finally:
        if strict_out is not None:
            strict_out.close()
        if lossy_out is not None:
            lossy_out.close()
        invalid_out.close()
        if strict_writer is not None:
            strict_writer.close()
        if lossy_writer is not None:
            lossy_writer.close()

    # rmdir only succeeds when the directory is empty; a non-empty staging
    # dir (e.g. after a partial failure) is deliberately left in place.
    if not args.keep_staging:
        try:
            staging_dir.rmdir()
        except OSError:
            pass

    manifest = {
        'output_dir': str(out_dir),
        'source_count': len(sources),
        'sources': sources,
        'workers': worker_count,
        'dedupe_mode': args.dedupe_mode,
        'output_format': args.output_format,
        'strict_records': stats['strict_records_written'],
        'lossy_records': stats['lossy_records_written'],
        'stats': dict(stats),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    # Echo the manifest to stdout as the final machine-readable summary line.
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())