@crewhaus/eval-dataset 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +43 -0
- package/src/__fixtures__/bare-array.yaml +5 -0
- package/src/__fixtures__/empty.jsonl +0 -0
- package/src/__fixtures__/ok.csv +5 -0
- package/src/__fixtures__/ok.jsonl +3 -0
- package/src/__fixtures__/ok.yaml +12 -0
- package/src/errors.ts +7 -0
- package/src/index.test.ts +145 -0
- package/src/index.ts +69 -0
- package/src/loaders/csv.ts +126 -0
- package/src/loaders/http.ts +143 -0
- package/src/loaders/jsonl.ts +37 -0
- package/src/loaders/yaml.ts +47 -0
package/package.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@crewhaus/eval-dataset",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Eval dataset loaders (JSONL, CSV, YAML, HTTP) with a lazy iterator API",
|
|
6
|
+
"main": "src/index.ts",
|
|
7
|
+
"types": "src/index.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": "./src/index.ts"
|
|
10
|
+
},
|
|
11
|
+
"scripts": {
|
|
12
|
+
"test": "bun test src"
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"@crewhaus/errors": "0.0.0",
|
|
16
|
+
"yaml": "^2.6.0",
|
|
17
|
+
"zod": "^3.23.8"
|
|
18
|
+
},
|
|
19
|
+
"license": "Apache-2.0",
|
|
20
|
+
"author": {
|
|
21
|
+
"name": "Max Meier",
|
|
22
|
+
"email": "max@studiomax.io",
|
|
23
|
+
"url": "https://studiomax.io"
|
|
24
|
+
},
|
|
25
|
+
"repository": {
|
|
26
|
+
"type": "git",
|
|
27
|
+
"url": "git+https://github.com/crewhaus/factory.git",
|
|
28
|
+
"directory": "packages/eval-dataset"
|
|
29
|
+
},
|
|
30
|
+
"homepage": "https://github.com/crewhaus/factory/tree/main/packages/eval-dataset#readme",
|
|
31
|
+
"bugs": {
|
|
32
|
+
"url": "https://github.com/crewhaus/factory/issues"
|
|
33
|
+
},
|
|
34
|
+
"publishConfig": {
|
|
35
|
+
"access": "restricted"
|
|
36
|
+
},
|
|
37
|
+
"files": [
|
|
38
|
+
"src",
|
|
39
|
+
"README.md",
|
|
40
|
+
"LICENSE",
|
|
41
|
+
"NOTICE"
|
|
42
|
+
]
|
|
43
|
+
}
|
|
File without changes
|
package/src/errors.ts
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { DatasetLoadError, loadDataset, parseCsv } from "./index";
|
|
4
|
+
|
|
5
|
+
// `tsc -b` compiles this file into `dist/`, so `bun test` (with no path filter)
|
|
6
|
+
// picks up both `src/index.test.ts` and `dist/index.test.js`. The compiled copy
|
|
7
|
+
// resolves `import.meta.dir` to `dist/`, but `__fixtures__/` only exists under
|
|
8
|
+
// `src/`. Map back to the source tree so both copies find the fixtures.
|
|
9
|
+
const FIX = join(import.meta.dir.replace(/([/\\])dist$/, "$1src"), "__fixtures__");
|
|
10
|
+
|
|
11
|
+
/** Drain an async iterable into an array, preserving iteration order. */
async function collect<T>(iter: AsyncIterable<T>): Promise<T[]> {
  const items: T[] = [];
  const cursor = iter[Symbol.asyncIterator]();
  for (let step = await cursor.next(); !step.done; step = await cursor.next()) {
    items.push(step.value);
  }
  return items;
}
|
|
16
|
+
|
|
17
|
+
describe("loadDataset — JSONL loader (T1)", () => {
  // Writes `contents` to a scratch file, runs `body` against it, and removes
  // the file afterwards even when `body` throws.
  async function withScratchFile(
    scratch: string,
    contents: string,
    body: (scratch: string) => Promise<void>,
  ): Promise<void> {
    await Bun.write(scratch, contents);
    try {
      await body(scratch);
    } finally {
      await Bun.file(scratch).delete();
    }
  }

  test("parses well-formed JSONL with all field shapes", async () => {
    const dataset = await loadDataset(join(FIX, "ok.jsonl"));
    expect(dataset.name).toBe("ok");

    const rows = await collect(dataset.samples);
    expect(rows).toHaveLength(3);

    const [first, second, third] = rows;
    expect(first).toEqual({ id: "q1", input: "hello" });
    expect(second?.expected_output).toBe("bye");
    expect(third?.expected_tools).toEqual(["bash", "read"]);
    expect(third?.metadata).toEqual({ difficulty: "easy" });
  });

  test("yields zero samples for empty file", async () => {
    const dataset = await loadDataset(join(FIX, "empty.jsonl"));
    const rows = await collect(dataset.samples);
    expect(rows).toEqual([]);
  });

  test("rejects malformed JSON with line number", () =>
    withScratchFile(
      `${FIX}/__tmp_malformed.jsonl`,
      '{"id":"q1","input":"ok"}\nnot json\n',
      async (scratch) => {
        const dataset = await loadDataset(scratch);
        await expect(collect(dataset.samples)).rejects.toThrow(/malformed JSON on line 2/);
      },
    ));

  test("rejects sample missing required fields", () =>
    withScratchFile(`${FIX}/__tmp_invalid.jsonl`, '{"input":"no id"}\n', async (scratch) => {
      const dataset = await loadDataset(scratch);
      await expect(collect(dataset.samples)).rejects.toThrow(DatasetLoadError);
    }));

  test("rejects missing file", async () => {
    const missing = `${FIX}/does-not-exist.jsonl`;
    await expect(loadDataset(missing)).rejects.toThrow(DatasetLoadError);
  });
});
|
|
60
|
+
|
|
61
|
+
describe("loadDataset — CSV loader (T1)", () => {
  test("parses CSV with quoted fields, embedded newlines, and multi-tool column", async () => {
    const { samples: stream } = await loadDataset(join(FIX, "ok.csv"));
    const rows = await collect(stream);
    expect(rows).toHaveLength(3);

    const [plain, quoted, multiline] = rows;
    // Empty cells are omitted entirely rather than surfacing as "".
    expect(plain).toEqual({ id: "q1", input: "hello" });
    // A quoted comma stays inside the field; the tools cell is comma-split.
    const expectedQuoted = {
      id: "q2",
      input: "hello, world",
      expected_output: "greeting",
      expected_tools: ["bash", "read"],
    };
    expect(quoted).toEqual(expectedQuoted);
    // A newline inside quotes does not end the record.
    expect(multiline?.input).toBe("line1\nline2");
  });
});
|
|
76
|
+
|
|
77
|
+
describe("parseCsv — RFC 4180 subset (T1)", () => {
  // Each entry is [test title, raw CSV text, rows parseCsv must return].
  const cases: ReadonlyArray<readonly [title: string, input: string, rows: string[][]]> = [
    ["empty input yields no rows", "", []],
    ["escaped double-quotes inside quoted field", 'a,"b ""quoted"" c"', [["a", 'b "quoted" c']]],
    [
      "CRLF line endings",
      "a,b\r\nc,d\r\n",
      [
        ["a", "b"],
        ["c", "d"],
      ],
    ],
    ["trailing newline is not a synthetic empty row", "a,b\n", [["a", "b"]]],
    [
      "missing final newline still yields the last row",
      "a,b\nc,d",
      [
        ["a", "b"],
        ["c", "d"],
      ],
    ],
  ];

  for (const [title, input, rows] of cases) {
    test(title, () => {
      expect(parseCsv(input)).toEqual(rows);
    });
  }
});
|
|
104
|
+
|
|
105
|
+
describe("loadDataset — YAML loader (T1)", () => {
  test("parses Dataset wrapper shape", async () => {
    const dataset = await loadDataset(join(FIX, "ok.yaml"));
    // The wrapper's own `name` field wins over the filename.
    expect(dataset.name).toBe("yaml-fixture");

    const rows = await collect(dataset.samples);
    expect(rows).toHaveLength(3);
    const [first] = rows;
    expect(first).toEqual({ id: "q1", input: "hello" });
  });

  test("parses bare-array shape and derives name from filename", async () => {
    const dataset = await loadDataset(join(FIX, "bare-array.yaml"));
    expect(dataset.name).toBe("bare-array");

    const rows = await collect(dataset.samples);
    expect(rows).toHaveLength(2);
    const [, second] = rows;
    expect(second?.expected_output).toBe("ok");
  });

  test("supports .yml extension", async () => {
    const scratch = `${FIX}/__tmp.yml`;
    await Bun.write(scratch, "- id: x\n input: y\n");
    try {
      const { samples: stream } = await loadDataset(scratch);
      expect(await collect(stream)).toHaveLength(1);
    } finally {
      // Always remove the scratch file so reruns start clean.
      await Bun.file(scratch).delete();
    }
  });
});
|
|
134
|
+
|
|
135
|
+
describe("loadDataset — dispatch", () => {
  test("rejects unknown extension", async () => {
    const pending = loadDataset("/tmp/something.xyz");
    await expect(pending).rejects.toThrow(/unrecognized dataset source/);
  });

  test("HTTP scheme dispatches to http loader (rejects offline lookups gracefully)", async () => {
    // Dispatch-only check: nothing is expected to listen on port 1, so the
    // loader must reject. Real fetches are covered by __test__/http.test.ts,
    // which runs when CREWHAUS_TEST_HTTP=1 is set.
    const pending = loadDataset("http://127.0.0.1:1/never-listens.jsonl");
    await expect(pending).rejects.toThrow();
  });
});
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Catalog R-eval `eval-dataset` — load samples from JSONL, CSV, YAML, or HTTP.
|
|
3
|
+
*
|
|
4
|
+
* Sample shape is the contract every other eval-* module consumes:
|
|
5
|
+
* { id, input, expected_output?, expected_tools?, metadata? }
|
|
6
|
+
*
|
|
7
|
+
* Loaders return an `AsyncIterable<Sample>` so consumers pull samples one at a
|
|
8
|
+
* time. Note that every loader currently buffers its whole source in memory
|
|
9
|
+
* (`Bun.file().text()` for local files, `response.text()` for HTTP) before
|
|
10
|
+
* yielding, so peak memory scales with the size of the dataset.
|
|
11
|
+
*
|
|
12
|
+
* Reference: build-roadmap.md §16.
|
|
13
|
+
*/
|
|
14
|
+
import { z } from "zod";
|
|
15
|
+
import { DatasetLoadError } from "./errors";
|
|
16
|
+
import { loadCsv } from "./loaders/csv";
|
|
17
|
+
import { loadHttp } from "./loaders/http";
|
|
18
|
+
import { loadJsonl } from "./loaders/jsonl";
|
|
19
|
+
import { loadYaml } from "./loaders/yaml";
|
|
20
|
+
|
|
21
|
+
/**
 * One evaluation case. Only `id` (non-empty) and `input` are required; the
 * remaining fields are optional.
 */
export const SampleSchema = z.object({
  id: z.string().min(1),
  input: z.string(),
  expected_output: z.string().optional(),
  expected_tools: z.string().array().optional(),
  metadata: z.record(z.unknown()).optional(),
});

/** Validated sample, as produced by `SampleSchema`. */
export type Sample = z.output<typeof SampleSchema>;

/** Fully materialised dataset: a name plus an in-memory list of samples. */
export const DatasetSchema = z.object({
  name: z.string(),
  samples: SampleSchema.array(),
});

/** Validated dataset, as produced by `DatasetSchema`. */
export type Dataset = z.output<typeof DatasetSchema>;

/** What every loader returns: a name and an async iterable of validated samples. */
export type LoadedDataset = Readonly<{
  name: string;
  samples: AsyncIterable<Sample>;
}>;

/** Anything `loadDataset` accepts: a local path, a URL string, or a `URL` object. */
export type DatasetSource = URL | string;
|
|
44
|
+
|
|
45
|
+
/**
 * Load a dataset, picking the loader from the URL scheme or file extension.
 *
 * - `http://` / `https://` sources (scheme matched case-insensitively) go to
 *   the HTTP loader.
 * - Everything else is treated as a local path and dispatched by its
 *   case-insensitive extension: `.jsonl` / `.ndjson`, `.csv`, `.yaml` / `.yml`.
 *
 * @param source - Path or URL; `URL` instances are stringified first.
 * @returns The `LoadedDataset` produced by the selected loader.
 * @throws DatasetLoadError when no dispatch rule matches the source.
 */
export async function loadDataset(source: DatasetSource): Promise<LoadedDataset> {
  const sourceStr = typeof source === "string" ? source : source.toString();

  // URL schemes are case-insensitive, so "HTTP://…" must dispatch like "http://…".
  if (/^https?:\/\//i.test(sourceStr)) {
    return loadHttp(sourceStr);
  }

  const lower = sourceStr.toLowerCase();
  if (lower.endsWith(".jsonl") || lower.endsWith(".ndjson")) return loadJsonl(sourceStr);
  if (lower.endsWith(".csv")) return loadCsv(sourceStr);
  if (lower.endsWith(".yaml") || lower.endsWith(".yml")) return loadYaml(sourceStr);

  // The message lists every accepted form, including `.ndjson`.
  throw new DatasetLoadError(
    `unrecognized dataset source "${sourceStr}" — expected .jsonl, .ndjson, .csv, .yaml, .yml, or http(s):// URL`,
  );
}
|
|
66
|
+
|
|
67
|
+
export { DatasetLoadError };
|
|
68
|
+
export { loadCsv, loadHttp, loadJsonl, loadYaml };
|
|
69
|
+
export { parseCsv } from "./loaders/csv";
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { basename } from "node:path";
|
|
2
|
+
import { DatasetLoadError } from "../errors";
|
|
3
|
+
import { type LoadedDataset, type Sample, SampleSchema } from "../index";
|
|
4
|
+
|
|
5
|
+
/**
 * Load a local CSV file whose first row is the header.
 *
 * The dataset name is the file name without its `.csv` suffix. The file is
 * read and split into rows up front; rows are turned into samples (and
 * validated) only as the returned iterable is consumed.
 *
 * @param path - Location of the CSV file on disk.
 * @throws DatasetLoadError if the file does not exist.
 */
export async function loadCsv(path: string): Promise<LoadedDataset> {
  const source = Bun.file(path);
  const present = await source.exists();
  if (!present) {
    throw new DatasetLoadError(`file not found: ${path}`);
  }

  const [header, ...records] = parseCsv(await source.text());
  const name = basename(path).replace(/\.csv$/i, "");

  // No header row at all (empty file) means there is nothing to yield.
  const samples =
    header === undefined ? emptyIterable() : rowsToSamples(records, header, path);
  return { name, samples };
}
|
|
25
|
+
|
|
26
|
+
/** An async iterable that completes immediately without yielding any sample. */
async function* emptyIterable(): AsyncIterable<Sample> {
  return;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Turn parsed CSV data rows into validated samples, one per non-blank row.
 *
 * Cells are matched to header names by position. Empty cells are left out of
 * the sample, and a non-empty `expected_tools` cell is comma-split, trimmed,
 * and stripped of empty entries.
 *
 * @param rows - Data rows (header excluded).
 * @param header - Column names taken from the first row of the file.
 * @param path - Source label used in error messages.
 * @throws DatasetLoadError when a row fails `SampleSchema` validation.
 */
async function* rowsToSamples(
  rows: ReadonlyArray<ReadonlyArray<string>>,
  header: ReadonlyArray<string>,
  path: string,
): AsyncIterable<Sample> {
  for (const [offset, cells] of rows.entries()) {
    // Reported row numbers are 1-based with the header as row 1, so the first
    // data row is row 2.
    const rowNo = offset + 2;

    // A lone empty cell is what a blank line parses to; skip it.
    const isBlankLine = cells.length === 1 && cells[0] === "";
    if (isBlankLine) continue;

    const record: Record<string, string | string[]> = {};
    header.forEach((column, at) => {
      const value = cells[at] ?? "";
      if (value === "") return; // missing or empty cell: leave the field unset
      // expected_tools is the only array-shaped field, so it is comma-split.
      record[column] =
        column === "expected_tools"
          ? value
              .split(",")
              .map((tool) => tool.trim())
              .filter((tool) => tool.length > 0)
          : value;
    });

    const checked = SampleSchema.safeParse(record);
    if (checked.success) {
      yield checked.data;
      continue;
    }
    throw new DatasetLoadError(
      `invalid sample on row ${rowNo} of ${path}: ${checked.error.message}`,
    );
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * RFC 4180-style CSV parser.
 *
 * Handles:
 * - quoted fields with embedded commas, newlines (CRLF/LF), and escaped `""`
 * - CR, LF, or CRLF as the record terminator, in any mix
 * - a trailing newline without producing a synthetic empty row
 * - a leading byte-order mark (U+FEFF), which is discarded so it cannot end
 *   up inside the first header cell
 *
 * A blank line in the middle of the input yields a row with one empty field.
 * An unterminated quoted field is accepted and runs to the end of the input.
 *
 * @param text - Raw CSV text.
 * @returns One `string[]` per record. Empty input yields `[]`.
 */
export function parseCsv(text: string): string[][] {
  const rows: string[][] = [];
  const src = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
  if (src.length === 0) return rows;

  let row: string[] = [];
  let field = "";
  let inQuotes = false;
  // True once any character of the current record has been consumed. Using
  // this instead of "field or row is non-empty" keeps a final record that is
  // just an empty quoted field (`""`) when the input has no trailing newline.
  let recordOpen = false;

  for (let i = 0; i < src.length; i++) {
    const ch = src[i];

    if (inQuotes) {
      if (ch !== '"') {
        field += ch;
      } else if (src[i + 1] === '"') {
        // A doubled quote inside a quoted field is a literal quote.
        field += '"';
        i += 1;
      } else {
        inQuotes = false;
      }
      continue;
    }

    if (ch === "\n" || ch === "\r") {
      row.push(field);
      rows.push(row);
      row = [];
      field = "";
      recordOpen = false;
      // Swallow CRLF as a single record terminator.
      if (ch === "\r" && src[i + 1] === "\n") i += 1;
      continue;
    }

    recordOpen = true;
    if (ch === '"') {
      inQuotes = true;
    } else if (ch === ",") {
      row.push(field);
      field = "";
    } else {
      field += ch;
    }
  }

  // Flush the last record when the input does not end with a newline.
  if (recordOpen) {
    row.push(field);
    rows.push(row);
  }
  return rows;
}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import { parse as parseYaml } from "yaml";
|
|
2
|
+
import { DatasetLoadError } from "../errors";
|
|
3
|
+
import { DatasetSchema, type LoadedDataset, type Sample, SampleSchema } from "../index";
|
|
4
|
+
import { parseCsv } from "./csv";
|
|
5
|
+
|
|
6
|
+
/**
 * HTTP loader — fetches the URL into memory, then picks a format parser.
 *
 * The format is decided by the URL extension first and by the response
 * Content-Type only when no known extension is present. The extension is
 * looked up on the URL path (so `data.csv?download=1` is recognised) and,
 * for backward compatibility, on the whole URL string as well (so
 * `download?file=data.csv` keeps working).
 *
 * Buffers the entire body in memory; HuggingFace-scale datasets should
 * be downloaded ahead of time and loaded via the local-file loaders.
 *
 * @param url - Absolute `http://` or `https://` URL of the dataset.
 * @throws DatasetLoadError when the request fails, the response status is
 *   not OK, or neither the extension nor the Content-Type identifies a format.
 */
export async function loadHttp(url: string): Promise<LoadedDataset> {
  let response: Awaited<ReturnType<typeof fetch>>;
  try {
    response = await fetch(url);
  } catch (err) {
    // Report network-level failures with the same error type as every other
    // load failure, keeping the original error as the cause.
    throw new DatasetLoadError(`failed to fetch ${url}`, err);
  }
  if (!response.ok) {
    throw new DatasetLoadError(`HTTP ${response.status} fetching ${url}`);
  }
  const text = await response.text();
  const ct = (response.headers.get("content-type") ?? "").toLowerCase();
  const lower = url.toLowerCase();

  // Query strings and fragments would otherwise hide the extension.
  let pathname = lower;
  try {
    pathname = new URL(url).pathname.toLowerCase();
  } catch {
    // Not parseable as a URL: fall back to matching on the raw string only.
  }
  const hasExt = (...exts: string[]): boolean =>
    exts.some((ext) => pathname.endsWith(ext) || lower.endsWith(ext));
  const hasType = (...types: string[]): boolean => types.some((type) => ct.includes(type));

  const byExtension = hasExt(".jsonl", ".ndjson")
    ? "jsonl"
    : hasExt(".csv")
      ? "csv"
      : hasExt(".yaml", ".yml")
        ? "yaml"
        : undefined;
  const byContentType = hasType("application/x-jsonlines", "application/x-ndjson", "application/jsonl")
    ? "jsonl"
    : hasType("text/csv")
      ? "csv"
      : hasType("application/yaml", "text/yaml", "application/x-yaml", "text/x-yaml")
        ? "yaml"
        : undefined;

  switch (byExtension ?? byContentType) {
    case "jsonl":
      return { name: deriveName(url), samples: parseJsonlText(text, url) };
    case "csv":
      return { name: deriveName(url), samples: parseCsvText(text, url) };
    case "yaml":
      return parseYamlText(text, url);
    default:
      throw new DatasetLoadError(
        `unrecognized HTTP dataset format for ${url} (content-type: ${ct || "unknown"})`,
      );
  }
}
|
|
46
|
+
|
|
47
|
+
/**
 * Derive a dataset name from a URL: the last non-empty path segment with any
 * known dataset extension removed. Returns "remote-dataset" when the URL
 * cannot be parsed or its path has no segments.
 */
function deriveName(url: string): string {
  const fallback = "remote-dataset";

  let pathname: string;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return fallback;
  }

  const segments = pathname.split("/").filter((part) => part !== "");
  const leaf = segments[segments.length - 1] ?? fallback;
  return leaf.replace(/\.(jsonl|ndjson|csv|yaml|yml)$/i, "");
}
|
|
56
|
+
|
|
57
|
+
/**
 * Parse already-downloaded JSONL text, yielding one validated sample per
 * non-blank line. Line numbers in error messages are 1-based and count blank
 * lines.
 *
 * @param text - Full JSONL body.
 * @param source - Source label used in error messages.
 * @throws DatasetLoadError on malformed JSON or a sample that fails validation.
 */
async function* parseJsonlText(text: string, source: string): AsyncIterable<Sample> {
  const lines = text.split(/\r?\n/);
  for (const [index, line] of lines.entries()) {
    if (line.trim() === "") continue;
    const lineNo = index + 1;

    let value: unknown;
    try {
      value = JSON.parse(line);
    } catch (cause) {
      throw new DatasetLoadError(`malformed JSON on line ${lineNo} of ${source}`, cause);
    }

    const checked = SampleSchema.safeParse(value);
    if (checked.success) {
      yield checked.data;
      continue;
    }
    throw new DatasetLoadError(
      `invalid sample on line ${lineNo} of ${source}: ${checked.error.message}`,
    );
  }
}
|
|
77
|
+
|
|
78
|
+
/**
 * Parse already-downloaded CSV text (first row is the header) and yield one
 * validated sample per non-blank data row.
 *
 * Empty cells are left out of the sample, and a non-empty `expected_tools`
 * cell is comma-split, trimmed, and stripped of empty entries.
 *
 * @param text - Full CSV body.
 * @param source - Source label used in error messages.
 * @throws DatasetLoadError when a row fails `SampleSchema` validation.
 */
async function* parseCsvText(text: string, source: string): AsyncIterable<Sample> {
  const [header, ...records] = parseCsv(text);
  // No header row (empty body) means there is nothing to yield.
  if (header === undefined) return;

  for (const [offset, cells] of records.entries()) {
    // Reported row numbers are 1-based with the header as row 1.
    const rowNo = offset + 2;

    // A lone empty cell is what a blank line parses to; skip it.
    const isBlankLine = cells.length === 1 && cells[0] === "";
    if (isBlankLine) continue;

    const record: Record<string, string | string[]> = {};
    header.forEach((column, at) => {
      const value = cells[at] ?? "";
      if (value === "") return; // missing or empty cell: leave the field unset
      record[column] =
        column === "expected_tools"
          ? value
              .split(",")
              .map((tool) => tool.trim())
              .filter((tool) => tool.length > 0)
          : value;
    });

    const checked = SampleSchema.safeParse(record);
    if (checked.success) {
      yield checked.data;
      continue;
    }
    throw new DatasetLoadError(
      `invalid sample on row ${rowNo} of ${source}: ${checked.error.message}`,
    );
  }
}
|
|
110
|
+
|
|
111
|
+
/**
 * Parse already-downloaded YAML text into a `LoadedDataset`.
 *
 * Two document shapes are accepted: a bare array of samples (named via
 * `deriveName(source)`) or a `{ name, samples }` wrapper matching
 * `DatasetSchema`.
 *
 * @param text - Full YAML body.
 * @param source - Source label used for naming and error messages.
 * @throws DatasetLoadError on malformed YAML or a document of neither shape.
 */
function parseYamlText(text: string, source: string): LoadedDataset {
  let document: unknown;
  try {
    document = parseYaml(text);
  } catch (cause) {
    throw new DatasetLoadError(`malformed YAML in ${source}`, cause);
  }

  // Bare list: items are validated one by one as the iterable is consumed.
  if (Array.isArray(document)) {
    return { name: deriveName(source), samples: yieldSamples(document, source) };
  }

  const wrapped = DatasetSchema.safeParse(document);
  if (wrapped.success) {
    const { name, samples } = wrapped.data;
    return { name, samples: yieldSamples(samples, source) };
  }
  throw new DatasetLoadError(`invalid dataset in ${source}: ${wrapped.error.message}`);
}
|
|
127
|
+
|
|
128
|
+
/**
 * Validate raw items one at a time and yield each as a `Sample`.
 *
 * @param samples - Unvalidated items, e.g. straight from a parsed document.
 * @param source - Source label used in error messages.
 * @throws DatasetLoadError naming the zero-based index of the first invalid item.
 */
async function* yieldSamples(
  samples: ReadonlyArray<unknown>,
  source: string,
): AsyncIterable<Sample> {
  for (const [index, candidate] of samples.entries()) {
    const checked = SampleSchema.safeParse(candidate);
    if (checked.success) {
      yield checked.data;
      continue;
    }
    throw new DatasetLoadError(
      `invalid sample at index ${index} of ${source}: ${checked.error.message}`,
    );
  }
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { basename } from "node:path";
|
|
2
|
+
import { DatasetLoadError } from "../errors";
|
|
3
|
+
import { type LoadedDataset, type Sample, SampleSchema } from "../index";
|
|
4
|
+
|
|
5
|
+
/**
 * Open a local JSONL / NDJSON file as a dataset.
 *
 * Only the existence check happens here; the file contents are read when the
 * returned `samples` iterable is first consumed. The dataset name is the file
 * name without its `.jsonl` / `.ndjson` suffix.
 *
 * @param path - Location of the file on disk.
 * @throws DatasetLoadError if the file does not exist.
 */
export async function loadJsonl(path: string): Promise<LoadedDataset> {
  const found = await Bun.file(path).exists();
  if (!found) {
    throw new DatasetLoadError(`file not found: ${path}`);
  }
  const name = basename(path).replace(/\.(jsonl|ndjson)$/i, "");
  return { name, samples: streamJsonl(path) };
}
|
|
15
|
+
|
|
16
|
+
/**
 * Yield one validated sample per non-blank line of a local JSONL file.
 *
 * Despite the name, the whole file is read into memory with `.text()` on
 * first iteration; lines are then parsed and validated one at a time. Line
 * numbers in error messages are 1-based and count blank lines.
 *
 * @param path - Location of the file on disk; also used in error messages.
 * @throws DatasetLoadError on malformed JSON or a sample that fails validation.
 */
async function* streamJsonl(path: string): AsyncIterable<Sample> {
  const contents = await Bun.file(path).text();
  const lines = contents.split(/\r?\n/);

  for (const [index, line] of lines.entries()) {
    if (line.trim() === "") continue;
    const lineNo = index + 1;

    let value: unknown;
    try {
      value = JSON.parse(line);
    } catch (cause) {
      throw new DatasetLoadError(`malformed JSON on line ${lineNo} of ${path}`, cause);
    }

    const checked = SampleSchema.safeParse(value);
    if (checked.success) {
      yield checked.data;
      continue;
    }
    throw new DatasetLoadError(
      `invalid sample on line ${lineNo} of ${path}: ${checked.error.message}`,
    );
  }
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { basename } from "node:path";
|
|
2
|
+
import { parse as parseYaml } from "yaml";
|
|
3
|
+
import { DatasetLoadError } from "../errors";
|
|
4
|
+
import { DatasetSchema, type LoadedDataset, type Sample, SampleSchema } from "../index";
|
|
5
|
+
|
|
6
|
+
/**
 * Load a local YAML dataset file.
 *
 * Two document shapes are accepted:
 *   1. A `{ name, samples: [...] }` wrapper matching `DatasetSchema`.
 *   2. A bare array of samples, named after the file with its
 *      `.yaml` / `.yml` suffix removed.
 *
 * @param path - Location of the file on disk.
 * @throws DatasetLoadError if the file is missing, the YAML is malformed, or
 *   the document matches neither shape.
 */
export async function loadYaml(path: string): Promise<LoadedDataset> {
  const source = Bun.file(path);
  const present = await source.exists();
  if (!present) {
    throw new DatasetLoadError(`file not found: ${path}`);
  }

  const raw = await source.text();
  let document: unknown;
  try {
    document = parseYaml(raw);
  } catch (cause) {
    throw new DatasetLoadError(`malformed YAML in ${path}`, cause);
  }

  // Bare list: items are validated one by one as the iterable is consumed.
  if (Array.isArray(document)) {
    return {
      name: basename(path).replace(/\.(yaml|yml)$/i, ""),
      samples: yieldSamples(document, path),
    };
  }

  const wrapped = DatasetSchema.safeParse(document);
  if (wrapped.success) {
    const { name, samples } = wrapped.data;
    return { name, samples: yieldSamples(samples, path) };
  }
  throw new DatasetLoadError(`invalid dataset in ${path}: ${wrapped.error.message}`);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Validate raw items one at a time and yield each as a `Sample`.
 *
 * @param samples - Unvalidated items, e.g. straight from a parsed document.
 * @param path - Source label used in error messages.
 * @throws DatasetLoadError naming the zero-based index of the first invalid item.
 */
async function* yieldSamples(samples: ReadonlyArray<unknown>, path: string): AsyncIterable<Sample> {
  for (const [index, candidate] of samples.entries()) {
    const checked = SampleSchema.safeParse(candidate);
    if (checked.success) {
      yield checked.data;
      continue;
    }
    throw new DatasetLoadError(
      `invalid sample at index ${index} of ${path}: ${checked.error.message}`,
    );
  }
}
|