agentic-dataset-builder 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -91
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +149 -0
- package/dist/labeling.d.ts +12 -0
- package/dist/labeling.js +22 -0
- package/dist/parquet.d.ts +4 -0
- package/dist/parquet.js +115 -0
- package/dist/platform/paths.d.ts +4 -0
- package/dist/platform/paths.js +62 -0
- package/dist/schemas/qwen35.d.ts +338 -0
- package/dist/schemas/qwen35.js +139 -0
- package/dist/sources/claude.d.ts +2 -0
- package/dist/sources/claude.js +64 -0
- package/dist/sources/codex.d.ts +2 -0
- package/dist/sources/codex.js +261 -0
- package/dist/sources/pi.d.ts +2 -0
- package/dist/sources/pi.js +276 -0
- package/dist/utils/common.d.ts +7 -0
- package/dist/utils/common.js +46 -0
- package/dist/utils/jsonl.d.ts +2 -0
- package/dist/utils/jsonl.js +17 -0
- package/package.json +24 -12
- package/agentic_dataset/__init__.py +0 -1
- package/agentic_dataset/build_agentic_dataset.py +0 -368
- package/agentic_dataset/export_codex_session_to_qwen35.py +0 -466
- package/agentic_dataset/export_pi_session.py +0 -701
- package/agentic_dataset/export_pi_session_to_qwen35.py +0 -742
- package/agentic_dataset/export_qwen35_training.py +0 -1559
- package/agentic_dataset/label_qwen35_agentic.py +0 -156
- package/agentic_dataset/platform_paths.py +0 -85
- package/agentic_dataset/qwen35_training_record.py +0 -179
- package/bin/agentic-dataset-builder.js +0 -77
- package/requirements.txt +0 -2
- package/run.py +0 -8
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
import fg from 'fast-glob';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
import { Qwen35RecordSchema } from '../schemas/qwen35.js';
|
|
4
|
+
import { readJsonl } from '../utils/jsonl.js';
|
|
5
|
+
/**
 * Shape of one line of a Codex session log.
 *
 * Only `type` is required; `timestamp` and `payload` are optional, and any
 * additional keys on the entry are kept rather than stripped.
 */
const EntrySchema = z.looseObject({
    timestamp: z.string().optional(),
    type: z.string(),
    payload: z.record(z.string(), z.unknown()).optional(),
});
|
|
10
|
+
/**
 * Accumulates the log entries of a single Codex turn and converts them into
 * one record validated by `Qwen35RecordSchema`.
 *
 * Assistant output (text, reasoning, tool calls) is buffered in the
 * `pending*` arrays and flushed into a single assistant message whenever a
 * non-assistant message or a tool result arrives, or when the turn is
 * finalized.
 */
class TurnBuilder {
    sessionMeta;
    turnId;
    startTs;
    messages = [];
    pendingText = [];
    pendingReasoning = [];
    pendingToolCalls = [];
    // call_id -> tool name, so outputs can be attributed to the tool that was called.
    callNames = new Map();
    // tool name -> tool spec, de-duplicated by name.
    tools = new Map();
    lossyReasons = new Set();
    lastTs;
    lastAgentMessage;

    /**
     * @param {object} sessionMeta - Session-level metadata; read again at finalize time.
     * @param {string} turnId - Identifier of the turn being built.
     * @param {string} startTs - Timestamp of the entry that started the turn.
     */
    constructor(sessionMeta, turnId, startTs) {
        this.sessionMeta = sessionMeta;
        this.turnId = turnId;
        this.startTs = startTs;
        this.lastTs = startTs;
    }

    /**
     * Consumes one log entry; only `response_item` and `event_msg` entries have an effect.
     * @param {{type: string, timestamp?: string, payload?: object}} entry
     */
    ingest(entry) {
        this.lastTs = entry.timestamp ?? this.lastTs;
        const payload = entry.payload ?? {};
        if (entry.type === 'response_item') {
            this.ingestResponseItem(payload);
        }
        if (entry.type === 'event_msg') {
            this.ingestEvent(payload);
        }
    }

    /** Dispatches a `response_item` payload on its `type` field. */
    ingestResponseItem(payload) {
        const type = asString(payload.type);
        if (type === 'message') {
            this.ingestMessage(payload);
        }
        if (type === 'reasoning') {
            this.ingestReasoning(payload);
        }
        if (type === 'function_call') {
            this.ingestFunctionCall(payload);
        }
        if (type === 'function_call_output') {
            this.ingestFunctionCallOutput(payload);
        }
        if (type === 'custom_tool_call') {
            this.ingestCustomToolCall(payload);
        }
        if (type === 'custom_tool_call_output') {
            this.ingestCustomToolCallOutput(payload);
        }
    }

    /** Dispatches an `event_msg` payload on its `type` field. */
    ingestEvent(payload) {
        const type = asString(payload.type);
        if (type === 'exec_command_end') {
            this.ingestExecCommandEnd(payload);
        }
        if (type === 'task_complete') {
            const msg = asString(payload.last_agent_message);
            if (msg) {
                this.lastAgentMessage = msg;
            }
        }
        if (type === 'error' && asString(payload.message)) {
            this.lossyReasons.add('turn_error');
        }
    }

    /**
     * Buffers assistant text, or flushes the buffer and appends a user/system message.
     * `developer` messages are stored with role `system`; other roles only trigger the flush.
     */
    ingestMessage(payload) {
        const role = asString(payload.role);
        const content = Array.isArray(payload.content) ? payload.content : [];
        const text = extractCodexText(content);
        if (role === 'assistant') {
            if (text) {
                this.pendingText.push(text);
            }
            return;
        }
        this.flushAssistant();
        if (role === 'user') {
            this.messages.push({ role: 'user', content: text });
        }
        else if (role === 'developer' && text) {
            this.messages.push({ role: 'system', content: text });
        }
    }

    /**
     * Buffers visible reasoning text; records a lossy reason when only
     * encrypted reasoning is present.
     */
    ingestReasoning(payload) {
        const summary = Array.isArray(payload.summary) ? payload.summary : [];
        const visible = summary
            .map((item) => (item && typeof item === 'object' ? asString(item.text) ?? asString(item.summary_text) : undefined))
            .filter(Boolean);
        const content = asString(payload.content);
        if (content) {
            visible.push(content);
        }
        if (visible.length) {
            this.pendingReasoning.push(...visible);
        }
        else if (payload.encrypted_content) {
            this.lossyReasons.add('encrypted_reasoning_without_summary');
        }
    }

    /** Buffers a function call and registers its tool name. */
    ingestFunctionCall(payload) {
        const name = asString(payload.name) ?? 'tool';
        const callId = asString(payload.call_id);
        const args = parseJsonObject(payload.arguments);
        this.pendingToolCalls.push({ type: 'function', id: callId, function: { name, arguments: args } });
        if (callId) {
            this.callNames.set(callId, name);
        }
        this.tools.set(name, { name });
    }

    /** Buffers a custom tool call, wrapping its raw input and status as the arguments. */
    ingestCustomToolCall(payload) {
        const name = asString(payload.name) ?? 'custom_tool';
        const callId = asString(payload.call_id);
        this.pendingToolCalls.push({
            type: 'function',
            id: callId,
            function: { name, arguments: { input: payload.input, status: payload.status } },
        });
        if (callId) {
            this.callNames.set(callId, name);
        }
        this.tools.set(name, { name });
    }

    /** Flushes pending assistant output and appends the tool result message. */
    ingestFunctionCallOutput(payload) {
        this.flushAssistant();
        const callId = asString(payload.call_id);
        const name = callId ? this.callNames.get(callId) ?? 'tool' : 'tool';
        // JSON.stringify(undefined) returns undefined, so a missing output would
        // otherwise leave the message without string content.
        const output = typeof payload.output === 'string' ? payload.output : JSON.stringify(payload.output) ?? '';
        this.messages.push({ role: 'tool', name, tool_call_id: callId, content: output });
        this.tools.set(name, { name });
    }

    /** Custom tool outputs are handled exactly like function call outputs. */
    ingestCustomToolCallOutput(payload) {
        this.ingestFunctionCallOutput(payload);
    }

    /** Appends a tool message holding the JSON-encoded result of a finished command. */
    ingestExecCommandEnd(payload) {
        this.flushAssistant();
        const callId = asString(payload.call_id);
        const name = callId ? this.callNames.get(callId) ?? 'exec_command' : 'exec_command';
        this.tools.set(name, { name });
        this.messages.push({
            role: 'tool',
            name,
            tool_call_id: callId,
            content: JSON.stringify({
                command: payload.command,
                cwd: payload.cwd,
                aggregated_output: payload.aggregated_output,
                exit_code: payload.exit_code,
                status: payload.status,
                duration: payload.duration,
            }),
        });
    }

    /** Emits the buffered text, reasoning and tool calls as one assistant message, if any. */
    flushAssistant() {
        if (!this.pendingText.length && !this.pendingReasoning.length && !this.pendingToolCalls.length) {
            return;
        }
        const message = { role: 'assistant', content: this.pendingText.join('\n\n') };
        if (this.pendingReasoning.length) {
            message.reasoning_content = this.pendingReasoning.join('\n\n');
        }
        if (this.pendingToolCalls.length) {
            message.tool_calls = [...this.pendingToolCalls];
        }
        this.messages.push(message);
        this.pendingText = [];
        this.pendingReasoning = [];
        this.pendingToolCalls = [];
    }

    /**
     * Flushes remaining output and builds the record.
     * @returns {object | null} The parsed record, or `null` when the turn has no user message.
     * @throws Whatever `Qwen35RecordSchema.parse` throws for an invalid record.
     */
    finalize() {
        // Fall back to the task_complete summary when no assistant text is pending.
        if (this.lastAgentMessage && !this.pendingText.length) {
            this.pendingText.push(this.lastAgentMessage);
            this.lossyReasons.add('synthetic_last_agent_message');
        }
        this.flushAssistant();
        if (!this.messages.some((message) => message.role === 'user')) {
            return null;
        }
        const sessionId = asString(this.sessionMeta.id);
        const model = asString(this.sessionMeta.model);
        const assistantMessages = this.messages.filter((m) => m.role === 'assistant');
        return Qwen35RecordSchema.parse({
            id: `${sessionId ?? 'codex'}:${this.turnId}`,
            request_id: this.turnId,
            messages: this.messages,
            tools: [...this.tools.values()],
            meta: {
                endpoint: 'codex/turn',
                status: this.lossyReasons.has('turn_error') ? 500 : 200,
                ts: this.lastTs,
                key: sessionId,
                // Missing values become empty segments instead of the text "undefined".
                source: `codex:session=${sessionId ?? ''}:turn=${this.turnId}:cwd=${asString(this.sessionMeta.cwd) ?? ''}`,
                requested_model: model,
                actual_model: model,
                stream: false,
                thinking_level: asString(this.sessionMeta.reasoning_effort),
                reasoning_summary_mode: 'codex_reasoning_summary',
                thinking_type: 'codex_turn',
                tool_spec_count: this.tools.size,
                tool_choice: { mode: 'session_trace' },
                request_contains_non_text_content: false,
                request_image_block_count: 0,
                request_video_block_count: 0,
                request_tool_call_block_count: 0,
                request_tool_result_block_count: 0,
                request_thinking_block_count: 0,
                response_contains_non_text_content: false,
                response_image_block_count: 0,
                response_video_block_count: 0,
                response_tool_call_block_count: assistantMessages.reduce((sum, m) => sum + (m.tool_calls?.length ?? 0), 0),
                response_tool_result_block_count: this.messages.filter((m) => m.role === 'tool').length,
                response_thinking_block_count: assistantMessages.filter((m) => typeof m.reasoning_content === 'string' && m.reasoning_content.length > 0).length,
                request_truncated: false,
                response_truncated: false,
                lossy_source: this.lossyReasons.size > 0,
                lossy_reasons: [...this.lossyReasons],
            },
        });
    }
}
|
|
203
|
+
/**
 * Reads every `*.jsonl` session log under `root` (in sorted path order) and
 * turns each completed turn into a record.
 *
 * A turn opens on an `event_msg` with payload type `task_started` and is
 * finalized on `task_complete`; entries outside an open turn are ignored,
 * except that `turn_context` entries always update the session's model and
 * reasoning effort.
 *
 * @param {string} root - Directory to search.
 * @returns {Promise<object[]>} Records in the order their turns completed.
 */
export async function collectCodexRecords(root) {
    const sessionFiles = await fg('**/*.jsonl', { cwd: root, absolute: true, onlyFiles: true });
    sessionFiles.sort();
    const collected = [];
    for (const sessionFile of sessionFiles) {
        const rawRows = await readJsonl(sessionFile);
        const parsedEntries = rawRows.map((row) => EntrySchema.parse(row));
        const metaEntry = parsedEntries.find((candidate) => candidate.type === 'session_meta');
        const sessionMeta = metaEntry?.payload ?? {};
        let activeTurn = null;
        for (const entry of parsedEntries) {
            const payload = entry.payload ?? {};
            const eventKind = entry.type === 'event_msg' ? payload.type : undefined;
            if (entry.type === 'turn_context') {
                sessionMeta.model = payload.model;
                sessionMeta.reasoning_effort = payload.effort;
            }
            if (eventKind === 'task_started') {
                const turnId = asString(payload.turn_id) ?? entry.timestamp ?? 'turn';
                activeTurn = new TurnBuilder(sessionMeta, turnId, entry.timestamp ?? '');
                continue;
            }
            if (!activeTurn) {
                continue;
            }
            activeTurn.ingest(entry);
            if (eventKind !== 'task_complete') {
                continue;
            }
            const finished = activeTurn.finalize();
            if (finished) {
                collected.push(finished);
            }
            activeTurn = null;
        }
    }
    return collected;
}
|
|
233
|
+
/**
 * Coerces tool-call arguments into a plain object.
 *
 * @param {unknown} value - An object, a JSON string, or anything else.
 * @returns {object} The value itself when it is already a non-array object;
 *   `{}` for non-strings; the decoded JSON when it is a non-array object;
 *   `{ value }` for any other decoded JSON; `{ raw }` when the string is not JSON.
 */
function parseJsonObject(value) {
    const isPlainObject = (candidate) => Boolean(candidate) && typeof candidate === 'object' && !Array.isArray(candidate);
    if (isPlainObject(value)) {
        return value;
    }
    if (typeof value !== 'string') {
        return {};
    }
    let decoded;
    try {
        decoded = JSON.parse(value);
    }
    catch {
        return { raw: value };
    }
    return isPlainObject(decoded) ? decoded : { value: decoded };
}
|
|
246
|
+
/**
 * Joins the textual parts of a Codex message content array with newlines.
 *
 * `input_text` / `output_text` items contribute their non-empty `text`;
 * `input_image` items contribute the placeholder `[image]`. Everything else,
 * including `null` or primitive items, is skipped instead of throwing.
 *
 * @param {unknown[]} content - Content items of a message.
 * @returns {string} The joined text, or `''` when nothing textual is present.
 */
function extractCodexText(content) {
    if (!Array.isArray(content)) {
        return '';
    }
    const parts = [];
    for (const item of content) {
        // Logs are external data: tolerate null or primitive array members.
        if (!item || typeof item !== 'object') {
            continue;
        }
        const isTextItem = item.type === 'input_text' || item.type === 'output_text';
        if (isTextItem && typeof item.text === 'string') {
            if (item.text) {
                parts.push(item.text);
            }
        }
        else if (item.type === 'input_image') {
            parts.push('[image]');
        }
    }
    return parts.join('\n');
}
|
|
259
|
+
/**
 * Narrows an unknown value to a string.
 * @param {unknown} value
 * @returns {string | undefined} `value` when it is a string, otherwise `undefined`.
 */
function asString(value) {
    if (typeof value !== 'string') {
        return undefined;
    }
    return value;
}
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import fg from 'fast-glob';
|
|
3
|
+
import { z } from 'zod';
|
|
4
|
+
import { Qwen35RecordSchema } from '../schemas/qwen35.js';
|
|
5
|
+
import { isFile } from '../utils/common.js';
|
|
6
|
+
import { readJsonl } from '../utils/jsonl.js';
|
|
7
|
+
/**
 * Shape of one line of a Pi session log.
 *
 * Only `type` is required. `id`, `parentId` (which may be null) and
 * `timestamp` are optional, and any additional keys are kept rather than
 * stripped.
 */
const SessionEntrySchema = z.looseObject({
    type: z.string(),
    id: z.string().optional(),
    parentId: z.string().nullable().optional(),
    timestamp: z.string().optional(),
});
|
|
13
|
+
/**
 * Reads every `*.jsonl` Pi session under `root` (in sorted path order) and
 * emits one record per leaf of the session's entry tree.
 *
 * The first row of a file is used as the session header; the remaining rows
 * with an `id` form a tree through `parentId`. Each leaf is an entry that no
 * other entry names as its parent.
 *
 * @param {string} root - Directory to search.
 * @returns {Promise<object[]>} Records validated by `Qwen35RecordSchema`.
 */
export async function collectPiRecords(root) {
    const sessionFiles = await fg('**/*.jsonl', { cwd: root, absolute: true, onlyFiles: true });
    sessionFiles.sort();
    const output = [];
    for (const sessionFile of sessionFiles) {
        const parsed = (await readJsonl(sessionFile)).map((row) => SessionEntrySchema.parse(row));
        if (parsed.length === 0) {
            continue;
        }
        const [header, ...body] = parsed;
        const byId = new Map();
        const childIds = new Map();
        for (const node of body) {
            if (!node.id) {
                continue;
            }
            byId.set(node.id, node);
            const parentKey = node.parentId ?? null;
            if (!childIds.has(parentKey)) {
                childIds.set(parentKey, []);
            }
            childIds.get(parentKey).push(node.id);
        }
        const leafIds = [];
        for (const id of byId.keys()) {
            const kids = childIds.get(id);
            if (!kids || kids.length === 0) {
                leafIds.push(id);
            }
        }
        leafIds.sort();
        const branched = leafIds.length > 1;
        for (const leafId of leafIds) {
            const branch = branchEntries(leafId, byId);
            const candidate = buildPiRecord(branch, header, sessionFile, branched);
            output.push(Qwen35RecordSchema.parse(candidate));
        }
    }
    return output;
}
|
|
42
|
+
/**
 * Collects the entries on the path from the tree root down to `leaf`.
 *
 * Walks `parentId` links upward from `leaf` and stops at the first id that is
 * falsy, unknown to `byId`, or already visited. The visited check keeps a
 * malformed session with cyclic parent links from looping forever.
 *
 * @param {string} leaf - Id of the entry to start from.
 * @param {Map<string, {parentId?: string | null}>} byId - Entries keyed by id.
 * @returns {object[]} Entries ordered root-first, leaf-last.
 */
function branchEntries(leaf, byId) {
    const ordered = [];
    const seen = new Set();
    let current = leaf;
    while (current && !seen.has(current)) {
        const entry = byId.get(current);
        if (!entry) {
            break;
        }
        seen.add(current);
        ordered.push(entry);
        current = entry.parentId ?? null;
    }
    return ordered.reverse();
}
|
|
54
|
+
/**
 * Converts one root-to-leaf branch of a Pi session into an unvalidated record.
 *
 * @param {object[]} entries - Branch entries ordered root-first.
 * @param {object} header - First row of the session file.
 * @param {string} sourceFile - Path of the session file, embedded in `meta.source`.
 * @param {boolean} branched - Whether the session has more than one leaf.
 * @returns {{id: string, request_id: string | undefined, messages: object[], tools: object[], meta: object}}
 */
function buildPiRecord(entries, header, sourceFile, branched) {
    const messages = [];
    const tools = new Map();
    const lossyReasons = new Set();
    const models = [];
    const thinkingLevels = [];
    for (const entry of entries) {
        switch (entry.type) {
            case 'model_change': {
                const provider = asString(entry.provider);
                const modelId = asString(entry.modelId);
                if (modelId) {
                    models.push(provider ? `${provider}/${modelId}` : modelId);
                }
                break;
            }
            case 'thinking_level_change': {
                const level = asString(entry.thinkingLevel);
                if (level) {
                    thinkingLevels.push(level);
                }
                break;
            }
            case 'message': {
                const msg = entry.message;
                if (!msg) {
                    break;
                }
                switch (asString(msg.role)) {
                    case 'user':
                        messages.push({ role: 'user', content: normalizeContent(msg.content, lossyReasons, 'user') });
                        break;
                    case 'assistant':
                        messages.push(normalizeAssistant(msg, tools, lossyReasons));
                        break;
                    case 'toolResult': {
                        const toolName = asString(msg.toolName) ?? 'tool';
                        tools.set(toolName, { name: toolName });
                        messages.push({
                            role: 'tool',
                            name: toolName,
                            tool_call_id: asString(msg.toolCallId),
                            content: normalizeContent(msg.content, lossyReasons, 'tool_result'),
                        });
                        break;
                    }
                    case 'bashExecution':
                        tools.set('bash', { name: 'bash' });
                        messages.push({ role: 'tool', name: 'bash', content: formatBash(msg, lossyReasons) });
                        break;
                    default:
                        break;
                }
                break;
            }
            case 'branch_summary': {
                const summary = asString(entry.summary);
                if (summary) {
                    lossyReasons.add('synthetic_branch_summary');
                    messages.push({ role: 'assistant', content: `[branch_summary]\n${summary}` });
                }
                break;
            }
            case 'compaction': {
                const summary = asString(entry.summary);
                if (summary) {
                    lossyReasons.add('synthetic_compaction_summary');
                    messages.push({ role: 'assistant', content: `[compaction_summary]\n${summary}` });
                }
                break;
            }
            default:
                break;
        }
    }
    // Order matters: it fixes the order of `lossy_reasons` in the output.
    if (branched) {
        lossyReasons.add('session_tree_branch_selected');
    }
    if (new Set(models).size > 1) {
        lossyReasons.add('multiple_models_on_branch');
    }
    if (new Set(thinkingLevels).size > 1) {
        lossyReasons.add('multiple_thinking_levels_on_branch');
    }
    const leafEntry = entries.at(-1);
    const sessionId = asString(header.id);
    const meta = buildMeta(messages, {
        endpoint: 'pi/session_branch',
        ts: asString(leafEntry?.timestamp) ?? asString(header.timestamp) ?? '',
        key: sessionId,
        source: `${sourceFile}#leaf=${leafEntry?.id ?? ''}`,
        requested_model: models[0],
        actual_model: models.at(-1),
        thinking_level: thinkingLevels.at(-1),
        tool_spec_count: tools.size,
        tool_choice: { mode: 'session_trace' },
        reasoning_summary_mode: 'pi_session_branch',
        thinking_type: 'pi_session',
        lossy_reasons: [...lossyReasons],
    });
    return {
        id: `${sessionId ?? 'pi'}:${leafEntry?.id ?? 'leaf'}`,
        request_id: sessionId,
        messages,
        tools: [...tools.values()],
        meta,
    };
}
|
|
145
|
+
/**
 * Converts a Pi assistant message into an assistant chat message.
 *
 * Content blocks are split into text, thinking and tool calls. A plain
 * string `content` is kept as the message text rather than being dropped.
 *
 * @param {object} msg - Raw assistant message; `content` is a block array or a string.
 * @param {Map<string, {name: string}>} tools - Tool registry; each called tool is added by name.
 * @param {Set<string>} lossyReasons - Receives a reason when a thinking block has a
 *   signature but no visible text.
 * @returns {{role: 'assistant', content: string | object[], reasoning_content?: string, tool_calls?: object[]}}
 *   `content` is a string when there is exactly one text block, otherwise the
 *   array of text blocks (possibly empty).
 */
function normalizeAssistant(msg, tools, lossyReasons) {
    const content = msg.content;
    const textBlocks = [];
    const reasoning = [];
    const toolCalls = [];
    if (typeof content === 'string') {
        // Treat string content as a single text block so the text is not lost.
        textBlocks.push({ type: 'text', text: content });
    }
    else if (Array.isArray(content)) {
        for (const raw of content) {
            if (!raw || typeof raw !== 'object') {
                continue;
            }
            const block = raw;
            const type = asString(block.type);
            if (type === 'text') {
                textBlocks.push({ type: 'text', text: asString(block.text) ?? '' });
            }
            else if (type === 'thinking') {
                const thinking = asString(block.thinking);
                if (thinking) {
                    reasoning.push(thinking);
                }
                if (!thinking && asString(block.thinkingSignature)) {
                    lossyReasons.add('encrypted_reasoning_without_visible_text');
                }
            }
            else if (type === 'toolCall') {
                const name = asString(block.name) ?? 'tool';
                tools.set(name, { name });
                toolCalls.push({
                    type: 'function',
                    id: asString(block.id),
                    function: {
                        name,
                        arguments: isRecord(block.arguments) ? block.arguments : {},
                    },
                });
            }
        }
    }
    const assistant = {
        role: 'assistant',
        content: textBlocks.length === 1 ? textBlocks[0].text : textBlocks,
    };
    if (reasoning.length) {
        assistant.reasoning_content = reasoning.join('\n\n');
    }
    if (toolCalls.length) {
        assistant.tool_calls = toolCalls;
    }
    return assistant;
}
|
|
190
|
+
/**
 * Serializes a Pi `bashExecution` message as a JSON string for a tool message.
 *
 * When the recorded output was truncated and a full-output path is given,
 * the full file is read in its place. If that file is missing or cannot be
 * read, the truncated output is kept and `missing_embedded_full_output` is
 * recorded instead of aborting the build.
 *
 * @param {object} msg - Raw bash execution message.
 * @param {Set<string>} lossyReasons - Receives a reason when the full output is unavailable.
 * @returns {string} JSON with command, exit_code, cancelled, truncated,
 *   exclude_from_context and output.
 */
function formatBash(msg, lossyReasons) {
    const truncated = Boolean(msg.truncated);
    let output = asString(msg.output) ?? '';
    const fullOutputPath = asString(msg.fullOutputPath) ?? asString(msg.details?.fullOutputPath);
    if (truncated && fullOutputPath) {
        let fullOutput;
        if (isFile(fullOutputPath)) {
            try {
                fullOutput = fs.readFileSync(fullOutputPath, 'utf8');
            }
            catch {
                // The file can vanish or be unreadable even after the stat check;
                // fall through and treat it like a missing file.
                fullOutput = undefined;
            }
        }
        if (fullOutput === undefined) {
            lossyReasons.add('missing_embedded_full_output');
        }
        else {
            output = fullOutput;
        }
    }
    return JSON.stringify({
        command: asString(msg.command),
        exit_code: asNumber(msg.exitCode),
        cancelled: Boolean(msg.cancelled),
        truncated,
        exclude_from_context: Boolean(msg.excludeFromContext),
        output,
    });
}
|
|
211
|
+
/**
 * Normalizes message content into a string or an array of text blocks.
 *
 * @param {unknown} content - A string, an array of content blocks, or anything else.
 * @param {Set<string>} lossyReasons - Receives `<prefix>_nonstandard_content` when
 *   `content` is neither a string nor an array.
 * @param {string} prefix - Prefix for the lossy reason.
 * @returns {string | Array<{type: 'text', text: string}>} The string itself; the
 *   JSON encoding of non-array content (`''` when it has no JSON form, e.g.
 *   `undefined`); the single text when exactly one text block exists;
 *   otherwise the array of text blocks. Non-text blocks are omitted.
 */
function normalizeContent(content, lossyReasons, prefix) {
    if (typeof content === 'string') {
        return content;
    }
    if (!Array.isArray(content)) {
        lossyReasons.add(`${prefix}_nonstandard_content`);
        // JSON.stringify returns undefined for undefined/functions/symbols;
        // fall back to an empty string so callers always get text.
        return JSON.stringify(content) ?? '';
    }
    const blocks = [];
    for (const raw of content) {
        if (!raw || typeof raw !== 'object') {
            continue;
        }
        if (asString(raw.type) === 'text') {
            blocks.push({ type: 'text', text: asString(raw.text) ?? '' });
        }
    }
    if (blocks.length === 1) {
        return blocks[0].text;
    }
    return blocks;
}
|
|
231
|
+
/**
 * Builds the `meta` object of a record from its messages and per-source values.
 *
 * @param {object[]} messages - Chat messages of the record; used for the
 *   response tool-call, tool-result and thinking block counts.
 * @param {object} seed - Source-specific values (endpoint, ts, key, source,
 *   models, thinking settings, tool info and `lossy_reasons`).
 * @returns {object} Meta with status 200 and all request-side counters set to zero.
 *   `lossy_reasons` is the seed's array, or `[]` when the seed supplies none.
 */
function buildMeta(messages, seed) {
    const assistantMessages = messages.filter((m) => m.role === 'assistant');
    const toolMessages = messages.filter((m) => m.role === 'tool');
    // Normalize once so every field below agrees on a missing or non-array value.
    const lossyReasons = Array.isArray(seed.lossy_reasons) ? seed.lossy_reasons : [];
    return {
        endpoint: seed.endpoint,
        status: 200,
        ts: seed.ts,
        key: seed.key,
        source: seed.source,
        requested_model: seed.requested_model,
        actual_model: seed.actual_model,
        stream: false,
        thinking_level: seed.thinking_level,
        reasoning_summary_mode: seed.reasoning_summary_mode,
        thinking_type: seed.thinking_type,
        thinking_budget_tokens: undefined,
        max_output_tokens: undefined,
        tool_spec_count: seed.tool_spec_count,
        tool_choice: seed.tool_choice,
        request_contains_non_text_content: false,
        request_image_block_count: 0,
        request_video_block_count: 0,
        request_tool_call_block_count: 0,
        request_tool_result_block_count: 0,
        request_thinking_block_count: 0,
        response_contains_non_text_content: false,
        response_image_block_count: 0,
        response_video_block_count: 0,
        response_tool_call_block_count: assistantMessages.reduce((sum, msg) => sum + (msg.tool_calls?.length ?? 0), 0),
        response_tool_result_block_count: toolMessages.length,
        response_thinking_block_count: assistantMessages.filter((msg) => typeof msg.reasoning_content === 'string' && msg.reasoning_content.length > 0).length,
        request_truncated: false,
        // The only signal for a truncated response is a bash output whose full log was unavailable.
        response_truncated: lossyReasons.includes('missing_embedded_full_output'),
        lossy_source: lossyReasons.length > 0,
        lossy_reasons: lossyReasons,
    };
}
|
|
268
|
+
/**
 * Narrows an unknown value to a string.
 * @param {unknown} value
 * @returns {string | undefined} `value` when it is a string, otherwise `undefined`.
 */
function asString(value) {
    if (typeof value !== 'string') {
        return undefined;
    }
    return value;
}
|
|
271
|
+
/**
 * Narrows an unknown value to a number.
 * @param {unknown} value
 * @returns {number | undefined} `value` when `typeof value` is `'number'`
 *   (which includes NaN), otherwise `undefined`.
 */
function asNumber(value) {
    if (typeof value !== 'number') {
        return undefined;
    }
    return value;
}
|
|
274
|
+
/**
 * Tells whether a value is a non-null, non-array object.
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
    if (value === null || typeof value !== 'object') {
        return false;
    }
    return !Array.isArray(value);
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/** Collapses whitespace runs to single spaces, trims, and shortens to `limit` characters (default 120) with a trailing `...`. */
export declare function compactText(value: string, limit?: number): string;
|
|
2
|
+
/** Creates `dir`, including missing parent directories. */
export declare function ensureDir(dir: string): Promise<void>;
|
|
3
|
+
/** Returns `prefix` followed by `-` and the current UTC time taken from the ISO string with `-`, `:` and the fractional seconds removed. */
export declare function timestampDir(prefix: string): string;
|
|
4
|
+
/** Reads a file as UTF-8 text; returns `null` when reading fails. */
export declare function safeReadFile(filePath: string): string | null;
|
|
5
|
+
/** Returns `true` when `filePath` can be stat'ed and is a regular file, `false` otherwise. */
export declare function isFile(filePath: string): boolean;
|
|
6
|
+
/** Recursively walks arrays and objects, calling `visit` on every non-array object before descending into its values. */
export declare function walkObject(value: unknown, visit: (obj: Record<string, unknown>) => void): void;
|
|
7
|
+
/** Resolves `parts` against `base` into an absolute path via `path.resolve`. */
export declare function resolveChildren(base: string, ...parts: string[]): string;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
/**
 * Collapses a string onto one line and shortens it to at most `limit` characters.
 *
 * Whitespace runs become single spaces and the result is trimmed. Text longer
 * than `limit` is cut and suffixed with `...`. When `limit` is below 3 there
 * is no room for the ellipsis, so the text is simply cut at `limit`.
 *
 * @param {string} value - Text to compact.
 * @param {number} [limit=120] - Maximum length of the result.
 * @returns {string} The compacted text, never longer than `limit` for non-negative limits.
 */
export function compactText(value, limit = 120) {
    const oneLine = value.replace(/\s+/g, ' ').trim();
    if (oneLine.length <= limit) {
        return oneLine;
    }
    if (limit < 3) {
        // slice(0, limit - 3) would take a negative end index and keep almost
        // the whole string, so hard-cut instead.
        return oneLine.slice(0, Math.max(0, limit));
    }
    return `${oneLine.slice(0, limit - 3)}...`;
}
|
|
7
|
+
/**
 * Creates `dir` together with any missing parent directories.
 * @param {string} dir - Directory path to create.
 * @returns {Promise<void>}
 */
export async function ensureDir(dir) {
    const options = { recursive: true };
    await fs.promises.mkdir(dir, options);
}
|
|
10
|
+
/**
 * Builds a directory name from `prefix` and the current UTC time.
 * @param {string} prefix - Leading part of the name.
 * @returns {string} `<prefix>-<stamp>`, where the stamp is the ISO timestamp
 *   with dashes, colons and the fractional seconds removed.
 */
export function timestampDir(prefix) {
    const iso = new Date().toISOString();
    // Drop the ".sssZ" tail, then strip the date and time separators.
    const [withoutFraction] = iso.split('.');
    const stamp = withoutFraction.replace(/[-:]/g, '');
    return [prefix, stamp].join('-');
}
|
|
15
|
+
/**
 * Reads a file as UTF-8 text without throwing.
 * @param {string} filePath - File to read.
 * @returns {string | null} The file contents, or `null` when the read fails for any reason.
 */
export function safeReadFile(filePath) {
    let text = null;
    try {
        text = fs.readFileSync(filePath, 'utf8');
    }
    catch {
        // Best-effort read: any failure is reported as null.
    }
    return text;
}
|
|
23
|
+
/**
 * Tells whether `filePath` refers to a regular file.
 * @param {string} filePath - Path to check.
 * @returns {boolean} `false` when the path cannot be stat'ed or is not a regular file.
 */
export function isFile(filePath) {
    let stats;
    try {
        stats = fs.statSync(filePath);
    }
    catch {
        return false;
    }
    return stats.isFile();
}
|
|
31
|
+
/**
 * Walks a nested structure depth-first, calling `visit` on every non-array
 * object before descending into its property values. Arrays are traversed
 * but not visited themselves; primitives and `null` are ignored.
 *
 * @param {unknown} value - Root of the structure.
 * @param {(obj: object) => void} visit - Callback for each non-array object.
 * @returns {void}
 */
export function walkObject(value, visit) {
    if (value === null || typeof value !== 'object') {
        return;
    }
    let children = value;
    if (!Array.isArray(value)) {
        visit(value);
        // Read the values only after visiting, so changes made by `visit` are seen.
        children = Object.values(value);
    }
    for (const child of children) {
        walkObject(child, visit);
    }
}
|
|
44
|
+
/**
 * Resolves path segments against a base into an absolute path.
 * @param {string} base - Starting path.
 * @param {...string} parts - Segments resolved after `base`, left to right.
 * @returns {string} The result of `path.resolve(base, ...parts)`.
 */
export function resolveChildren(base, ...parts) {
    const segments = [base, ...parts];
    return path.resolve(...segments);
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import readline from 'node:readline';
|
|
3
|
+
/**
 * Reads a JSON Lines file into an array of parsed values.
 *
 * Lines are trimmed and blank lines are skipped. The read stream and the
 * line reader are always released, including when parsing stops early.
 *
 * @param {string} filePath - Path of the `.jsonl` file.
 * @returns {Promise<unknown[]>} One parsed value per non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   file and 1-based line number, and `cause` holds the original error.
 */
export async function readJsonl(filePath) {
    const rows = [];
    const stream = fs.createReadStream(filePath, { encoding: 'utf8' });
    const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
    let lineNumber = 0;
    try {
        for await (const line of rl) {
            lineNumber += 1;
            const trimmed = line.trim();
            if (!trimmed) {
                continue;
            }
            try {
                rows.push(JSON.parse(trimmed));
            }
            catch (err) {
                throw new SyntaxError(`Invalid JSON at ${filePath}:${lineNumber}: ${err.message}`, { cause: err });
            }
        }
    }
    finally {
        // Closing the interface does not close its input, so destroy the
        // stream explicitly to release the file descriptor on early exit.
        rl.close();
        stream.destroy();
    }
    return rows;
}
|
|
15
|
+
/**
 * Writes rows to a file as JSON Lines (UTF-8), one JSON value per line.
 * @param {string} filePath - Destination file; replaced if it exists.
 * @param {unknown[]} rows - Values to serialize.
 * @returns {Promise<void>} The file ends with a newline unless `rows` is empty.
 */
export async function writeJsonl(filePath, rows) {
    const lines = rows.map((row) => JSON.stringify(row));
    const trailer = rows.length ? '\n' : '';
    const body = lines.join('\n') + trailer;
    await fs.promises.writeFile(filePath, body, 'utf8');
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentic-dataset-builder",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Pure TypeScript agentic dataset builder for Pi, Codex, and Claude Code history",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
5
7
|
"homepage": "https://github.com/Dominic789654/agentic-dataset-builder",
|
|
6
8
|
"repository": {
|
|
7
9
|
"type": "git",
|
|
@@ -10,31 +12,41 @@
|
|
|
10
12
|
"bugs": {
|
|
11
13
|
"url": "https://github.com/Dominic789654/agentic-dataset-builder/issues"
|
|
12
14
|
},
|
|
13
|
-
"license": "MIT",
|
|
14
15
|
"keywords": [
|
|
15
16
|
"agentic",
|
|
16
17
|
"dataset",
|
|
17
18
|
"pi",
|
|
18
19
|
"codex",
|
|
20
|
+
"claude",
|
|
19
21
|
"qwen",
|
|
22
|
+
"zod",
|
|
20
23
|
"parquet"
|
|
21
24
|
],
|
|
25
|
+
"bin": {
|
|
26
|
+
"agentic-dataset-builder": "./dist/cli.js"
|
|
27
|
+
},
|
|
22
28
|
"files": [
|
|
23
|
-
"
|
|
24
|
-
"run.py",
|
|
25
|
-
"requirements.txt",
|
|
29
|
+
"dist",
|
|
26
30
|
"README.md",
|
|
27
|
-
"LICENSE"
|
|
28
|
-
"agentic_dataset"
|
|
31
|
+
"LICENSE"
|
|
29
32
|
],
|
|
30
|
-
"bin": {
|
|
31
|
-
"agentic-dataset-builder": "./bin/agentic-dataset-builder.js"
|
|
32
|
-
},
|
|
33
|
-
"type": "module",
|
|
34
33
|
"engines": {
|
|
35
34
|
"node": ">=18"
|
|
36
35
|
},
|
|
37
36
|
"scripts": {
|
|
37
|
+
"build": "tsc -p tsconfig.json",
|
|
38
|
+
"dev": "tsx src/cli.ts",
|
|
39
|
+
"check": "tsc -p tsconfig.json --noEmit",
|
|
38
40
|
"pack:check": "npm pack --dry-run"
|
|
41
|
+
},
|
|
42
|
+
"dependencies": {
|
|
43
|
+
"fast-glob": "^3.3.3",
|
|
44
|
+
"parquetjs-lite": "^0.8.7",
|
|
45
|
+
"zod": "^4.1.11"
|
|
46
|
+
},
|
|
47
|
+
"devDependencies": {
|
|
48
|
+
"@types/node": "^24.7.2",
|
|
49
|
+
"tsx": "^4.20.6",
|
|
50
|
+
"typescript": "^5.9.3"
|
|
39
51
|
}
|
|
40
52
|
}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""Agentic dataset builder package."""
|