deeplake 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/browser/client.d.ts +33 -0
- package/dist/browser/browser/client.d.ts.map +1 -0
- package/dist/browser/browser/client.js +312 -0
- package/dist/browser/browser/client.js.map +1 -0
- package/dist/browser/browser/index.d.ts +15 -0
- package/dist/browser/browser/index.d.ts.map +1 -0
- package/dist/browser/browser/index.js +11 -0
- package/dist/browser/browser/index.js.map +1 -0
- package/dist/browser/browser/wasm.d.ts +8 -0
- package/dist/browser/browser/wasm.d.ts.map +1 -0
- package/dist/browser/browser/wasm.js +53 -0
- package/dist/browser/browser/wasm.js.map +1 -0
- package/dist/browser/shared/api.d.ts +9 -0
- package/dist/browser/shared/api.d.ts.map +1 -0
- package/dist/browser/shared/api.js +67 -0
- package/dist/browser/shared/api.js.map +1 -0
- package/dist/browser/shared/credentials.d.ts +4 -0
- package/dist/browser/shared/credentials.d.ts.map +1 -0
- package/dist/browser/shared/credentials.js +24 -0
- package/dist/browser/shared/credentials.js.map +1 -0
- package/dist/browser/shared/database.d.ts +26 -0
- package/dist/browser/shared/database.d.ts.map +1 -0
- package/dist/browser/shared/database.js +67 -0
- package/dist/browser/shared/database.js.map +1 -0
- package/dist/browser/shared/dlref.d.ts +22 -0
- package/dist/browser/shared/dlref.d.ts.map +1 -0
- package/dist/browser/shared/dlref.js +48 -0
- package/dist/browser/shared/dlref.js.map +1 -0
- package/dist/browser/shared/errors.d.ts +22 -0
- package/dist/browser/shared/errors.d.ts.map +1 -0
- package/dist/browser/shared/errors.js +43 -0
- package/dist/browser/shared/errors.js.map +1 -0
- package/dist/browser/shared/schema.d.ts +7 -0
- package/dist/browser/shared/schema.d.ts.map +1 -0
- package/dist/browser/shared/schema.js +151 -0
- package/dist/browser/shared/schema.js.map +1 -0
- package/dist/browser/shared/token.d.ts +4 -0
- package/dist/browser/shared/token.d.ts.map +1 -0
- package/dist/browser/shared/token.js +69 -0
- package/dist/browser/shared/token.js.map +1 -0
- package/dist/browser/shared/types.d.ts +33 -0
- package/dist/browser/shared/types.d.ts.map +1 -0
- package/dist/browser/shared/types.js +2 -0
- package/dist/browser/shared/types.js.map +1 -0
- package/dist/browser/shared/wasm-common.d.ts +39 -0
- package/dist/browser/shared/wasm-common.d.ts.map +1 -0
- package/dist/browser/shared/wasm-common.js +287 -0
- package/dist/browser/shared/wasm-common.js.map +1 -0
- package/dist/node/node/client.d.ts +49 -0
- package/dist/node/node/client.d.ts.map +1 -0
- package/dist/node/node/client.js +670 -0
- package/dist/node/node/client.js.map +1 -0
- package/dist/node/node/formats/coco.d.ts +24 -0
- package/dist/node/node/formats/coco.d.ts.map +1 -0
- package/dist/node/node/formats/coco.js +342 -0
- package/dist/node/node/formats/coco.js.map +1 -0
- package/dist/node/node/formats/coco_panoptic.d.ts +19 -0
- package/dist/node/node/formats/coco_panoptic.d.ts.map +1 -0
- package/dist/node/node/formats/coco_panoptic.js +125 -0
- package/dist/node/node/formats/coco_panoptic.js.map +1 -0
- package/dist/node/node/formats/index.d.ts +7 -0
- package/dist/node/node/formats/index.d.ts.map +1 -0
- package/dist/node/node/formats/index.js +12 -0
- package/dist/node/node/formats/index.js.map +1 -0
- package/dist/node/node/formats/lerobot.d.ts +42 -0
- package/dist/node/node/formats/lerobot.d.ts.map +1 -0
- package/dist/node/node/formats/lerobot.js +351 -0
- package/dist/node/node/formats/lerobot.js.map +1 -0
- package/dist/node/node/index.d.ts +19 -0
- package/dist/node/node/index.d.ts.map +1 -0
- package/dist/node/node/index.js +75 -0
- package/dist/node/node/index.js.map +1 -0
- package/dist/node/node/mime.d.ts +2 -0
- package/dist/node/node/mime.d.ts.map +1 -0
- package/dist/node/node/mime.js +76 -0
- package/dist/node/node/mime.js.map +1 -0
- package/dist/node/node/normalizers/binary.d.ts +3 -0
- package/dist/node/node/normalizers/binary.d.ts.map +1 -0
- package/dist/node/node/normalizers/binary.js +51 -0
- package/dist/node/node/normalizers/binary.js.map +1 -0
- package/dist/node/node/normalizers/dict.d.ts +3 -0
- package/dist/node/node/normalizers/dict.d.ts.map +1 -0
- package/dist/node/node/normalizers/dict.js +17 -0
- package/dist/node/node/normalizers/dict.js.map +1 -0
- package/dist/node/node/normalizers/files.d.ts +3 -0
- package/dist/node/node/normalizers/files.d.ts.map +1 -0
- package/dist/node/node/normalizers/files.js +109 -0
- package/dist/node/node/normalizers/files.js.map +1 -0
- package/dist/node/node/normalizers/huggingface.d.ts +3 -0
- package/dist/node/node/normalizers/huggingface.d.ts.map +1 -0
- package/dist/node/node/normalizers/huggingface.js +158 -0
- package/dist/node/node/normalizers/huggingface.js.map +1 -0
- package/dist/node/node/normalizers/image.d.ts +3 -0
- package/dist/node/node/normalizers/image.d.ts.map +1 -0
- package/dist/node/node/normalizers/image.js +52 -0
- package/dist/node/node/normalizers/image.js.map +1 -0
- package/dist/node/node/normalizers/index.d.ts +9 -0
- package/dist/node/node/normalizers/index.d.ts.map +1 -0
- package/dist/node/node/normalizers/index.js +21 -0
- package/dist/node/node/normalizers/index.js.map +1 -0
- package/dist/node/node/normalizers/pdf.d.ts +3 -0
- package/dist/node/node/normalizers/pdf.d.ts.map +1 -0
- package/dist/node/node/normalizers/pdf.js +75 -0
- package/dist/node/node/normalizers/pdf.js.map +1 -0
- package/dist/node/node/normalizers/text.d.ts +4 -0
- package/dist/node/node/normalizers/text.d.ts.map +1 -0
- package/dist/node/node/normalizers/text.js +83 -0
- package/dist/node/node/normalizers/text.js.map +1 -0
- package/dist/node/node/normalizers/video.d.ts +3 -0
- package/dist/node/node/normalizers/video.d.ts.map +1 -0
- package/dist/node/node/normalizers/video.js +119 -0
- package/dist/node/node/normalizers/video.js.map +1 -0
- package/dist/node/node/storage.d.ts +15 -0
- package/dist/node/node/storage.d.ts.map +1 -0
- package/dist/node/node/storage.js +477 -0
- package/dist/node/node/storage.js.map +1 -0
- package/dist/node/node/wasm.d.ts +3 -0
- package/dist/node/node/wasm.d.ts.map +1 -0
- package/dist/node/node/wasm.js +49 -0
- package/dist/node/node/wasm.js.map +1 -0
- package/dist/node/shared/api.d.ts +9 -0
- package/dist/node/shared/api.d.ts.map +1 -0
- package/dist/node/shared/api.js +70 -0
- package/dist/node/shared/api.js.map +1 -0
- package/dist/node/shared/credentials.d.ts +4 -0
- package/dist/node/shared/credentials.d.ts.map +1 -0
- package/dist/node/shared/credentials.js +28 -0
- package/dist/node/shared/credentials.js.map +1 -0
- package/dist/node/shared/database.d.ts +26 -0
- package/dist/node/shared/database.d.ts.map +1 -0
- package/dist/node/shared/database.js +71 -0
- package/dist/node/shared/database.js.map +1 -0
- package/dist/node/shared/dlref.d.ts +22 -0
- package/dist/node/shared/dlref.d.ts.map +1 -0
- package/dist/node/shared/dlref.js +52 -0
- package/dist/node/shared/dlref.js.map +1 -0
- package/dist/node/shared/errors.d.ts +22 -0
- package/dist/node/shared/errors.d.ts.map +1 -0
- package/dist/node/shared/errors.js +53 -0
- package/dist/node/shared/errors.js.map +1 -0
- package/dist/node/shared/schema.d.ts +7 -0
- package/dist/node/shared/schema.d.ts.map +1 -0
- package/dist/node/shared/schema.js +157 -0
- package/dist/node/shared/schema.js.map +1 -0
- package/dist/node/shared/token.d.ts +4 -0
- package/dist/node/shared/token.d.ts.map +1 -0
- package/dist/node/shared/token.js +74 -0
- package/dist/node/shared/token.js.map +1 -0
- package/dist/node/shared/types.d.ts +33 -0
- package/dist/node/shared/types.d.ts.map +1 -0
- package/dist/node/shared/types.js +3 -0
- package/dist/node/shared/types.js.map +1 -0
- package/dist/node/shared/wasm-common.d.ts +39 -0
- package/dist/node/shared/wasm-common.d.ts.map +1 -0
- package/dist/node/shared/wasm-common.js +310 -0
- package/dist/node/shared/wasm-common.js.map +1 -0
- package/package.json +43 -18
- package/README.md +0 -39
- package/index.d.ts +0 -2
- package/index.js +0 -26
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.normalizeBinary = normalizeBinary;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const crypto = __importStar(require("crypto"));
|
|
40
|
+
function normalizeBinary(filePath) {
|
|
41
|
+
const fileId = crypto.randomUUID();
|
|
42
|
+
const data = fs.readFileSync(filePath);
|
|
43
|
+
const filename = path.basename(filePath);
|
|
44
|
+
return {
|
|
45
|
+
id: [`${fileId}_0`],
|
|
46
|
+
file_id: [fileId],
|
|
47
|
+
data: [data],
|
|
48
|
+
filename: [filename],
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
//# sourceMappingURL=binary.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"binary.js","sourceRoot":"","sources":["../../../../src/node/normalizers/binary.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,0CAWC;AAhBD,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAiC;AAGjC,SAAgB,eAAe,CAAC,QAAgB;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACnC,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,OAAO;QACL,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,CAAC;QACnB,OAAO,EAAE,CAAC,MAAM,CAAC;QACjB,IAAI,EAAE,CAAC,IAAI,CAAC;QACZ,QAAQ,EAAE,CAAC,QAAQ,CAAC;KACrB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dict.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/dict.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAiB,aAAa,CAC5B,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC,EAC/B,SAAS,GAAE,MAAa,GACvB,SAAS,CAAC,KAAK,CAAC,CAalB"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.normalizeDict = normalizeDict;
|
|
4
|
+
/**
 * Splits a column-oriented table into consecutive row batches.
 *
 * The row count is taken from the first column; every column is sliced with
 * the same `[start, start + batchSize)` window, so each yielded batch has the
 * same column names as `data`.
 *
 * @param data Object mapping column name to an array of values.
 * @param batchSize Maximum number of rows per yielded batch (default 1000).
 * @returns A generator of batches; nothing is yielded when `data` has no
 *          columns or the first column is empty.
 * @throws RangeError when `batchSize` is zero, negative or NaN (raised on the
 *         first `next()` call, since generators start lazily).
 */
function* normalizeDict(data, batchSize = 1000) {
    // A zero or negative step never advances the cursor below (endless loop),
    // and NaN yields one bogus empty batch, so reject them up front.
    if (!(batchSize > 0)) {
        throw new RangeError(`batchSize must be a positive number, got ${batchSize}`);
    }
    const columns = Object.entries(data);
    // Check the column count rather than the truthiness of the first key, so a
    // column literally named "" is still processed.
    if (columns.length === 0)
        return;
    const rowCount = columns[0][1].length;
    for (let start = 0; start < rowCount; start += batchSize) {
        const end = start + batchSize;
        // Object.fromEntries defines own properties, so a column called
        // "__proto__" is kept as data instead of replacing the prototype.
        yield Object.fromEntries(columns.map(([col, values]) => [col, values.slice(start, end)]));
    }
}
|
|
17
|
+
//# sourceMappingURL=dict.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dict.js","sourceRoot":"","sources":["../../../../src/node/normalizers/dict.ts"],"names":[],"mappings":";;AAEA,sCAgBC;AAhBD,QAAe,CAAC,CAAC,aAAa,CAC5B,IAA+B,EAC/B,YAAoB,IAAI;IAExB,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACtC,IAAI,CAAC,QAAQ;QAAE,OAAO;IAEtB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC;IAEvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QAC7C,MAAM,KAAK,GAAU,EAAE,CAAC;QACxB,KAAK,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACjD,KAAK,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;QAC9C,CAAC;QACD,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"files.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/files.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAiDhD,wBAAuB,cAAc,CACnC,KAAK,EAAE,MAAM,EAAE,EACf,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,EACpB,WAAW,GAAE,MAAU,GACtB,cAAc,CAAC,KAAK,CAAC,CA6CvB"}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.normalizeFiles = normalizeFiles;
|
|
37
|
+
const path = __importStar(require("path"));
|
|
38
|
+
const fs = __importStar(require("fs"));
|
|
39
|
+
const errors_1 = require("../../shared/errors");
|
|
40
|
+
const image_1 = require("./image");
|
|
41
|
+
const text_1 = require("./text");
|
|
42
|
+
const binary_1 = require("./binary");
|
|
43
|
+
const video_1 = require("./video");
|
|
44
|
+
const pdf_1 = require("./pdf");
|
|
45
|
+
// Extensions (lower-case, leading dot) that normalizeOneFile hands to normalizeVideo.
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.mkv', '.webm']);
|
|
46
|
+
// Extensions (lower-case, leading dot) that normalizeOneFile hands to normalizeImage.
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']);
|
|
47
|
+
// Extensions (lower-case, leading dot) that normalizeOneFile hands to normalizeText.
const TEXT_EXTS = new Set(['.txt', '.md', '.json', '.csv', '.xml', '.html']);
|
|
48
|
+
/**
 * Converts one file into a list of batches, choosing the normalizer from the
 * lower-cased file extension.
 *
 * Dispatch order: video extensions, image extensions, `.pdf`, text
 * extensions, and finally the binary normalizer for everything else. The
 * video and PDF normalizers are async iterables and are drained completely
 * into an array before returning.
 *
 * @param filePath Path of the file to normalize.
 * @param chunkSize Forwarded to the text normalizer only.
 * @param chunkOverlap Forwarded to the text normalizer only.
 * @returns Array of batches produced for this file.
 * @throws IngestError when `filePath` does not exist.
 */
async function normalizeOneFile(filePath, chunkSize, chunkOverlap) {
    const present = fs.existsSync(filePath);
    if (present === false) {
        throw new errors_1.IngestError(`File not found: ${filePath}`);
    }
    const extension = path.extname(filePath).toLowerCase();
    // Collects every batch of an async batch stream into an array.
    const drain = async (stream) => {
        const collected = [];
        for await (const item of stream) {
            collected.push(item);
        }
        return collected;
    };
    if (VIDEO_EXTS.has(extension)) {
        return await drain((0, video_1.normalizeVideo)(filePath));
    }
    if (IMAGE_EXTS.has(extension)) {
        const imageBatch = (0, image_1.normalizeImage)(filePath);
        return [imageBatch];
    }
    if (extension === '.pdf') {
        return await drain((0, pdf_1.normalizePdf)(filePath));
    }
    const single = TEXT_EXTS.has(extension)
        ? (0, text_1.normalizeText)(filePath, chunkSize, chunkOverlap)
        : (0, binary_1.normalizeBinary)(filePath);
    return [single];
}
|
|
75
|
+
async function* normalizeFiles(files, chunkSize, chunkOverlap, concurrency = 4) {
|
|
76
|
+
const maxWorkers = Math.min(concurrency, files.length);
|
|
77
|
+
if (maxWorkers <= 1) {
|
|
78
|
+
for (const filePath of files) {
|
|
79
|
+
const batches = await normalizeOneFile(filePath, chunkSize, chunkOverlap);
|
|
80
|
+
for (const batch of batches) {
|
|
81
|
+
yield batch;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
return;
|
|
85
|
+
}
|
|
86
|
+
// Process files with limited concurrency
|
|
87
|
+
let index = 0;
|
|
88
|
+
const pending = [];
|
|
89
|
+
function enqueue() {
|
|
90
|
+
while (pending.length < maxWorkers && index < files.length) {
|
|
91
|
+
const filePath = files[index++];
|
|
92
|
+
pending.push(normalizeOneFile(filePath, chunkSize, chunkOverlap)
|
|
93
|
+
.then((batches) => ({ filePath, batches }))
|
|
94
|
+
.catch((e) => {
|
|
95
|
+
throw new errors_1.IngestError(`Failed to process ${filePath}: ${e instanceof Error ? e.message : e}`);
|
|
96
|
+
}));
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
enqueue();
|
|
100
|
+
while (pending.length > 0) {
|
|
101
|
+
const result = await Promise.race(pending.map((p, i) => p.then((r) => ({ ...r, idx: i }))));
|
|
102
|
+
pending.splice(result.idx, 1);
|
|
103
|
+
enqueue();
|
|
104
|
+
for (const batch of result.batches) {
|
|
105
|
+
yield batch;
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
//# sourceMappingURL=files.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"files.js","sourceRoot":"","sources":["../../../../src/node/normalizers/files.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAoDA,wCAkDC;AAtGD,2CAA6B;AAC7B,uCAAyB;AACzB,gDAAkD;AAElD,mCAAyC;AACzC,iCAAuC;AACvC,qCAA2C;AAC3C,mCAAyC;AACzC,+BAAqC;AAErC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AACtE,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AAC/E,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AAE7E,KAAK,UAAU,gBAAgB,CAC7B,QAAgB,EAChB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,MAAM,IAAI,oBAAW,CAAC,mBAAmB,QAAQ,EAAE,CAAC,CAAC;IACvD,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IAEjD,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,OAAO,GAAY,EAAE,CAAC;QAC5B,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAA,sBAAc,EAAC,QAAQ,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,CAAC,IAAA,sBAAc,EAAC,QAAQ,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACnB,MAAM,OAAO,GAAY,EAAE,CAAC;QAC5B,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAA,kBAAY,EAAC,QAAQ,CAAC,EAAE,CAAC;YACjD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,IAAA,oBAAa,EAAC,QAAQ,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED,OAAO,CAAC,IAAA,wBAAe,EAAC,QAAQ,CAAC,CAAC,CAAC;AACrC,CAAC;AAEM,KAAK,SAAS,CAAC,CAAC,cAAc,CACnC,KAAe,EACf,SAAiB,EACjB,YAAoB,EACpB,cAAsB,CAAC;IAEvB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvD,IAAI,UAAU,IAAI,CAAC,EAAE,CAAC;QACpB,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;YAC7B,MAAM,OAAO,GAAG,MAAM,gBAAgB,CAAC,QAAQ,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YAC1E,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;gBAC5B,MAAM,KAAK,CAAC;YACd,CAAC;QAC
H,CAAC;QACD,OAAO;IACT,CAAC;IAED,yCAAyC;IACzC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,MAAM,OAAO,GAAsD,EAAE,CAAC;IAEtE,SAAS,OAAO;QACd,OAAO,OAAO,CAAC,MAAM,GAAG,UAAU,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC3D,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC;YAChC,OAAO,CAAC,IAAI,CACV,gBAAgB,CAAC,QAAQ,EAAE,SAAS,EAAE,YAAY,CAAC;iBAChD,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;iBAC1C,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;gBACX,MAAM,IAAI,oBAAW,CACnB,qBAAqB,QAAQ,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CACvE,CAAC;YACJ,CAAC,CAAC,CACL,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,EAAE,CAAC;IAEV,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,IAAI,CAC/B,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CACzD,CAAC;QACF,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC9B,OAAO,EAAE,CAAC;QAEV,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACnC,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"huggingface.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/huggingface.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAuB,oBAAoB,CACzC,WAAW,EAAE,MAAM,EACnB,SAAS,GAAE,MAAY,EACvB,KAAK,GAAE,MAAgB,GACtB,cAAc,CAAC,KAAK,CAAC,CA4CvB"}
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.normalizeHuggingface = normalizeHuggingface;
|
|
37
|
+
async function* normalizeHuggingface(datasetName, batchSize = 100, split = 'train') {
|
|
38
|
+
let rows;
|
|
39
|
+
try {
|
|
40
|
+
const hub = require('@huggingface/hub');
|
|
41
|
+
if (typeof hub.listDatasetFiles === 'function') {
|
|
42
|
+
rows = streamViaHub(hub, datasetName, split);
|
|
43
|
+
}
|
|
44
|
+
else {
|
|
45
|
+
throw new Error('Incompatible @huggingface/hub version');
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
catch {
|
|
49
|
+
try {
|
|
50
|
+
rows = streamViaPython(datasetName, split);
|
|
51
|
+
}
|
|
52
|
+
catch {
|
|
53
|
+
throw new Error(`No HuggingFace dataset library available. Install one of:\n` +
|
|
54
|
+
` npm install @huggingface/hub\n` +
|
|
55
|
+
`Or ensure Python is available with: pip install datasets`);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
const batch = [];
|
|
59
|
+
for await (const item of rows) {
|
|
60
|
+
const row = {};
|
|
61
|
+
for (const [col, val] of Object.entries(item)) {
|
|
62
|
+
if (val instanceof Uint8Array && !Buffer.isBuffer(val)) {
|
|
63
|
+
row[col] = Buffer.from(val);
|
|
64
|
+
}
|
|
65
|
+
else {
|
|
66
|
+
row[col] = val;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
batch.push(row);
|
|
70
|
+
if (batch.length >= batchSize) {
|
|
71
|
+
yield rowsToBatch(batch);
|
|
72
|
+
batch.length = 0;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
if (batch.length > 0) {
|
|
76
|
+
yield rowsToBatch(batch);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
async function* streamViaHub(hub, datasetName, split) {
|
|
80
|
+
const baseUrl = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(datasetName)}&config=default&split=${encodeURIComponent(split)}`;
|
|
81
|
+
let offset = 0;
|
|
82
|
+
const length = 100;
|
|
83
|
+
while (true) {
|
|
84
|
+
const url = `${baseUrl}&offset=${offset}&length=${length}`;
|
|
85
|
+
const resp = await fetch(url);
|
|
86
|
+
if (!resp.ok)
|
|
87
|
+
break;
|
|
88
|
+
const data = (await resp.json());
|
|
89
|
+
const rows = data.rows;
|
|
90
|
+
if (!rows || rows.length === 0)
|
|
91
|
+
break;
|
|
92
|
+
for (const rowWrapper of rows) {
|
|
93
|
+
yield rowWrapper.row;
|
|
94
|
+
}
|
|
95
|
+
if (rows.length < length)
|
|
96
|
+
break;
|
|
97
|
+
offset += rows.length;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
/**
 * Streams a dataset split by running `python3` with an inline script that
 * loads the split through the Python `datasets` package in streaming mode and
 * prints one JSON object per row on stdout.
 *
 * Values the script wraps as `{"__bytes__": <base64>}` are decoded back into
 * Buffers before the row is yielded.
 *
 * @param datasetName Dataset identifier, passed to Python as `sys.argv[1]`.
 * @param split Split name, passed to Python as `sys.argv[2]`.
 * @returns Async generator of row objects.
 * @throws Error when the process cannot be spawned or exits with a non-zero
 *         code (the tail of its stderr is appended to the message), and
 *         SyntaxError when a stdout line is not valid JSON.
 */
async function* streamViaPython(datasetName, split) {
    const { spawn } = await Promise.resolve().then(() => __importStar(require('child_process')));
    const readline = await Promise.resolve().then(() => __importStar(require('readline')));
    // The dataset name and split are read from sys.argv, never interpolated
    // into the source, so quotes or backslashes in them cannot alter the script.
    const pyScript = `
import base64, json, sys
from datasets import load_dataset
ds = load_dataset(sys.argv[1], split=sys.argv[2], streaming=True)
for item in ds:
    row = {}
    for k, v in item.items():
        if hasattr(v, 'tobytes'):
            row[k] = {"__bytes__": base64.b64encode(v.tobytes()).decode()}
        elif hasattr(v, 'tolist'):
            row[k] = v.tolist()
        else:
            row[k] = v
    print(json.dumps(row))
`;
    const proc = spawn('python3', ['-c', pyScript, datasetName, split], {
        stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Drain stderr so a chatty child cannot block on a full pipe; keep only the
    // tail for error reporting.
    let stderrTail = '';
    if (proc.stderr) {
        proc.stderr.setEncoding('utf8');
        proc.stderr.on('data', (chunk) => {
            stderrTail = (stderrTail + chunk).slice(-4096);
        });
    }
    // Register the exit listeners before reading stdout: attaching them after
    // the read loop can miss 'close', and a missing 'error' listener turns a
    // failed spawn into an uncaught exception.
    const exited = new Promise((resolve, reject) => {
        proc.once('error', reject);
        proc.once('close', (code) => {
            if (code !== 0) {
                const detail = stderrTail.trim();
                reject(new Error(`Python process exited with code ${code}` + (detail ? `: ${detail}` : '')));
            }
            else {
                resolve();
            }
        });
    });
    // The rejection is observed by `await exited` below; this only prevents an
    // unhandled-rejection report if the consumer stops before that point.
    exited.catch(() => { });
    const rl = readline.createInterface({ input: proc.stdout });
    try {
        for await (const line of rl) {
            const trimmed = line.trim();
            if (!trimmed)
                continue;
            const row = JSON.parse(trimmed);
            for (const [key, val] of Object.entries(row)) {
                if (val &&
                    typeof val === 'object' &&
                    '__bytes__' in val) {
                    row[key] = Buffer.from(val.__bytes__, 'base64');
                }
            }
            yield row;
        }
        await exited;
    }
    finally {
        // Runs on normal completion, on errors and when the consumer stops
        // early; do not leave the child running in the latter cases.
        rl.close();
        if (proc.exitCode === null && proc.signalCode === null) {
            proc.kill();
        }
    }
}
|
|
148
|
+
/**
 * Pivots an array of row objects into a column-oriented batch.
 *
 * Column names are taken from the first row only: keys that appear only in
 * later rows are dropped, and rows lacking a key contribute `undefined`.
 *
 * @param rows Row objects to pivot.
 * @returns Object mapping each column name to an array with one value per
 *          row; an empty object when `rows` is empty.
 */
function rowsToBatch(rows) {
    const columns = {};
    if (rows.length > 0) {
        Object.keys(rows[0]).forEach((name) => {
            columns[name] = rows.map((entry) => entry[name]);
        });
    }
    return columns;
}
|
|
158
|
+
//# sourceMappingURL=huggingface.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"huggingface.js","sourceRoot":"","sources":["../../../../src/node/normalizers/huggingface.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEA,oDAgDC;AAhDM,KAAK,SAAS,CAAC,CAAC,oBAAoB,CACzC,WAAmB,EACnB,YAAoB,GAAG,EACvB,QAAgB,OAAO;IAEvB,IAAI,IAA4C,CAAC;IAEjD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,OAAO,CAAC,kBAAkB,CAAQ,CAAC;QAC/C,IAAI,OAAO,GAAG,CAAC,gBAAgB,KAAK,UAAU,EAAE,CAAC;YAC/C,IAAI,GAAG,YAAY,CAAC,GAAG,EAAE,WAAW,EAAE,KAAK,CAAC,CAAC;QAC/C,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,IAAI,CAAC;YACH,IAAI,GAAG,eAAe,CAAC,WAAW,EAAE,KAAK,CAAC,CAAC;QAC7C,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,6DAA6D;gBAC7D,kCAAkC;gBAClC,0DAA0D,CAC3D,CAAC;QACJ,CAAC;IACH,CAAC;IAED,MAAM,KAAK,GAA8B,EAAE,CAAC;IAE5C,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QAC9B,MAAM,GAAG,GAA4B,EAAE,CAAC;QACxC,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YAC9C,IAAI,GAAG,YAAY,UAAU,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACvD,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;YACjB,CAAC;QACH,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEhB,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC9B,MAAM,WAAW,CAAC,KAAK,CAAC,CAAC;YACzB,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrB,MAAM,WAAW,CAAC,KAAK,CAAC,CAAC;IAC3B,CAAC;AACH,CAAC;AAED,KAAK,SAAS,CAAC,CAAC,YAAY,CAC1B,GAAQ,EACR,WAAmB,EACnB,KAAa;IAEb,MAAM,OAAO,GAAG,uDAAuD,kBAAkB,CAAC,WAAW,CAAC,yBAAyB,kBAAkB,CAAC,KAAK,CAAC,EAAE,CAAC;IAE3J,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,MAAM,GAAG,GAAG,CAAC;IAEnB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,GAAG,GAAG,GAAG,OAAO,WAAW,MAAM,WAAW,MAAM,EAAE,CAAC;QAC3D,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,EAAE;YAAE,MAAM;QAEpB,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAQ,CAAC;QACxC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAa,CAAC;QAChC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,
MAAM;QAEtC,KAAK,MAAM,UAAU,IAAI,IAAI,EAAE,CAAC;YAC9B,MAAM,UAAU,CAAC,GAA8B,CAAC;QAClD,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,MAAM;YAAE,MAAM;QAChC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC;IACxB,CAAC;AACH,CAAC;AAED,KAAK,SAAS,CAAC,CAAC,eAAe,CAC7B,WAAmB,EACnB,KAAa;IAEb,MAAM,EAAE,KAAK,EAAE,GAAG,wDAAa,eAAe,GAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,wDAAa,UAAU,GAAC,CAAC;IAE1C,MAAM,QAAQ,GAAG;;;qBAGE,WAAW,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,aAAa,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC;;;;;;;;;;;;CAY3F,CAAC;IAEA,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,QAAQ,CAAC,EAAE;QAC9C,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;KAChC,CAAC,CAAC;IAEH,MAAM,EAAE,GAAG,QAAQ,CAAC,eAAe,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,MAAO,EAAE,CAAC,CAAC;IAE7D,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAA4B,CAAC;QAE3D,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;YAC7C,IACE,GAAG;gBACH,OAAO,GAAG,KAAK,QAAQ;gBACvB,WAAW,IAAK,GAA+B,EAC/C,CAAC;gBACD,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,IAAI,CACnB,GAA6B,CAAC,SAAS,EACxC,QAAQ,CACT,CAAC;YACJ,CAAC;QACH,CAAC;QAED,MAAM,GAAG,CAAC;IACZ,CAAC;IAED,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC1C,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,IAAI,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,WAAW,CAAC,IAA+B;IAClD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IAClC,MAAM,MAAM,GAAU,EAAE,CAAC;IACzB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACxC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/image.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,KAAK,CAYtD"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.normalizeImage = normalizeImage;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const crypto = __importStar(require("crypto"));
|
|
40
|
+
function normalizeImage(filePath) {
|
|
41
|
+
const fileId = crypto.randomUUID();
|
|
42
|
+
const data = fs.readFileSync(filePath);
|
|
43
|
+
const filename = path.basename(filePath);
|
|
44
|
+
return {
|
|
45
|
+
id: [`${fileId}_0`],
|
|
46
|
+
file_id: [fileId],
|
|
47
|
+
image: [data],
|
|
48
|
+
filename: [filename],
|
|
49
|
+
text: [`Image: ${filename}`],
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
//# sourceMappingURL=image.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image.js","sourceRoot":"","sources":["../../../../src/node/normalizers/image.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,wCAYC;AAjBD,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAiC;AAGjC,SAAgB,cAAc,CAAC,QAAgB;IAC7C,MAAM,MAAM,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACnC,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,OAAO;QACL,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,CAAC;QACnB,OAAO,EAAE,CAAC,MAAM,CAAC;QACjB,KAAK,EAAE,CAAC,IAAI,CAAC;QACb,QAAQ,EAAE,CAAC,QAAQ,CAAC;QACpB,IAAI,EAAE,CAAC,UAAU,QAAQ,EAAE,CAAC;KAC7B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export { normalizeDict } from './dict';
|
|
2
|
+
export { chunkText, normalizeText } from './text';
|
|
3
|
+
export { normalizeBinary } from './binary';
|
|
4
|
+
export { normalizeImage } from './image';
|
|
5
|
+
export { normalizeVideo } from './video';
|
|
6
|
+
export { normalizePdf } from './pdf';
|
|
7
|
+
export { normalizeFiles } from './files';
|
|
8
|
+
export { normalizeHuggingface } from './huggingface';
|
|
9
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAC3C,OAAO,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,OAAO,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,oBAAoB,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.normalizeHuggingface = exports.normalizeFiles = exports.normalizePdf = exports.normalizeVideo = exports.normalizeImage = exports.normalizeBinary = exports.normalizeText = exports.chunkText = exports.normalizeDict = void 0;
|
|
4
|
+
var dict_1 = require("./dict");
|
|
5
|
+
Object.defineProperty(exports, "normalizeDict", { enumerable: true, get: function () { return dict_1.normalizeDict; } });
|
|
6
|
+
var text_1 = require("./text");
|
|
7
|
+
Object.defineProperty(exports, "chunkText", { enumerable: true, get: function () { return text_1.chunkText; } });
|
|
8
|
+
Object.defineProperty(exports, "normalizeText", { enumerable: true, get: function () { return text_1.normalizeText; } });
|
|
9
|
+
var binary_1 = require("./binary");
|
|
10
|
+
Object.defineProperty(exports, "normalizeBinary", { enumerable: true, get: function () { return binary_1.normalizeBinary; } });
|
|
11
|
+
var image_1 = require("./image");
|
|
12
|
+
Object.defineProperty(exports, "normalizeImage", { enumerable: true, get: function () { return image_1.normalizeImage; } });
|
|
13
|
+
var video_1 = require("./video");
|
|
14
|
+
Object.defineProperty(exports, "normalizeVideo", { enumerable: true, get: function () { return video_1.normalizeVideo; } });
|
|
15
|
+
var pdf_1 = require("./pdf");
|
|
16
|
+
Object.defineProperty(exports, "normalizePdf", { enumerable: true, get: function () { return pdf_1.normalizePdf; } });
|
|
17
|
+
var files_1 = require("./files");
|
|
18
|
+
Object.defineProperty(exports, "normalizeFiles", { enumerable: true, get: function () { return files_1.normalizeFiles; } });
|
|
19
|
+
var huggingface_1 = require("./huggingface");
|
|
20
|
+
Object.defineProperty(exports, "normalizeHuggingface", { enumerable: true, get: function () { return huggingface_1.normalizeHuggingface; } });
|
|
21
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/node/normalizers/index.ts"],"names":[],"mappings":";;;AAAA,+BAAuC;AAA9B,qGAAA,aAAa,OAAA;AACtB,+BAAkD;AAAzC,iGAAA,SAAS,OAAA;AAAE,qGAAA,aAAa,OAAA;AACjC,mCAA2C;AAAlC,yGAAA,eAAe,OAAA;AACxB,iCAAyC;AAAhC,uGAAA,cAAc,OAAA;AACvB,iCAAyC;AAAhC,uGAAA,cAAc,OAAA;AACvB,6BAAqC;AAA5B,mGAAA,YAAY,OAAA;AACrB,iCAAyC;AAAhC,uGAAA,cAAc,OAAA;AACvB,6CAAqD;AAA5C,mHAAA,oBAAoB,OAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/pdf.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAuB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,CAsC3E"}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.normalizePdf = normalizePdf;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const crypto = __importStar(require("crypto"));
|
|
40
|
+
async function* normalizePdf(filePath) {
|
|
41
|
+
const fileId = crypto.randomUUID();
|
|
42
|
+
const filename = path.basename(filePath);
|
|
43
|
+
let pdfjs;
|
|
44
|
+
try {
|
|
45
|
+
pdfjs = require('pdfjs-dist');
|
|
46
|
+
}
|
|
47
|
+
catch {
|
|
48
|
+
// Fallback: store whole PDF as binary
|
|
49
|
+
yield {
|
|
50
|
+
id: [`${fileId}_0`],
|
|
51
|
+
file_id: [fileId],
|
|
52
|
+
page_index: [0],
|
|
53
|
+
data: [fs.readFileSync(filePath)],
|
|
54
|
+
text: [`PDF: ${filename}`],
|
|
55
|
+
};
|
|
56
|
+
return;
|
|
57
|
+
}
|
|
58
|
+
const data = new Uint8Array(fs.readFileSync(filePath));
|
|
59
|
+
const doc = await pdfjs.getDocument({ data }).promise;
|
|
60
|
+
for (let pageIdx = 0; pageIdx < doc.numPages; pageIdx++) {
|
|
61
|
+
const page = await doc.getPage(pageIdx + 1);
|
|
62
|
+
const textContent = await page.getTextContent();
|
|
63
|
+
const text = textContent.items
|
|
64
|
+
.map((item) => item.str)
|
|
65
|
+
.join(' ');
|
|
66
|
+
yield {
|
|
67
|
+
id: [`${fileId}_${pageIdx}`],
|
|
68
|
+
file_id: [fileId],
|
|
69
|
+
page_index: [pageIdx],
|
|
70
|
+
text: [text || `PDF page ${pageIdx + 1}`],
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
doc.destroy();
|
|
74
|
+
}
|
|
75
|
+
//# sourceMappingURL=pdf.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../../../src/node/normalizers/pdf.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,oCAsCC;AA3CD,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAiC;AAG1B,KAAK,SAAS,CAAC,CAAC,YAAY,CAAC,QAAgB;IAClD,MAAM,MAAM,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,IAAI,KAAU,CAAC;IACf,IAAI,CAAC;QACH,KAAK,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;IAChC,CAAC;IAAC,MAAM,CAAC;QACP,sCAAsC;QACtC,MAAM;YACJ,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,CAAC;YACnB,OAAO,EAAE,CAAC,MAAM,CAAC;YACjB,UAAU,EAAE,CAAC,CAAC,CAAC;YACf,IAAI,EAAE,CAAC,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;YACjC,IAAI,EAAE,CAAC,QAAQ,QAAQ,EAAE,CAAC;SAC3B,CAAC;QACF,OAAO;IACT,CAAC;IAED,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC,CAAC;IACvD,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC;IAEtD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,GAAG,GAAG,CAAC,QAAQ,EAAE,OAAO,EAAE,EAAE,CAAC;QACxD,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC;QAC5C,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;QAChD,MAAM,IAAI,GAAG,WAAW,CAAC,KAAK;aAC3B,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC;aAC5B,IAAI,CAAC,GAAG,CAAC,CAAC;QAEb,MAAM;YACJ,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,OAAO,EAAE,CAAC;YAC5B,OAAO,EAAE,CAAC,MAAM,CAAC;YACjB,UAAU,EAAE,CAAC,OAAO,CAAC;YACrB,IAAI,EAAE,CAAC,IAAI,IAAI,YAAY,OAAO,GAAG,CAAC,EAAE,CAAC;SAC1C,CAAC;IACJ,CAAC;IAED,GAAG,CAAC,OAAO,EAAE,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { Batch } from '../../shared/types';
|
|
2
|
+
export declare function chunkText(text: string, chunkSize: number, overlap: number): string[];
|
|
3
|
+
export declare function normalizeText(filePath: string, chunkSize: number, chunkOverlap: number): Batch;
|
|
4
|
+
//# sourceMappingURL=text.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/text.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAgB,SAAS,CACvB,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,GACd,MAAM,EAAE,CAoCV;AAED,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,KAAK,CAYP"}
|