deeplake 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/dist/browser/browser/client.d.ts +33 -0
  2. package/dist/browser/browser/client.d.ts.map +1 -0
  3. package/dist/browser/browser/client.js +312 -0
  4. package/dist/browser/browser/client.js.map +1 -0
  5. package/dist/browser/browser/index.d.ts +15 -0
  6. package/dist/browser/browser/index.d.ts.map +1 -0
  7. package/dist/browser/browser/index.js +11 -0
  8. package/dist/browser/browser/index.js.map +1 -0
  9. package/dist/browser/browser/wasm.d.ts +8 -0
  10. package/dist/browser/browser/wasm.d.ts.map +1 -0
  11. package/dist/browser/browser/wasm.js +53 -0
  12. package/dist/browser/browser/wasm.js.map +1 -0
  13. package/dist/browser/shared/api.d.ts +9 -0
  14. package/dist/browser/shared/api.d.ts.map +1 -0
  15. package/dist/browser/shared/api.js +67 -0
  16. package/dist/browser/shared/api.js.map +1 -0
  17. package/dist/browser/shared/credentials.d.ts +4 -0
  18. package/dist/browser/shared/credentials.d.ts.map +1 -0
  19. package/dist/browser/shared/credentials.js +24 -0
  20. package/dist/browser/shared/credentials.js.map +1 -0
  21. package/dist/browser/shared/database.d.ts +26 -0
  22. package/dist/browser/shared/database.d.ts.map +1 -0
  23. package/dist/browser/shared/database.js +67 -0
  24. package/dist/browser/shared/database.js.map +1 -0
  25. package/dist/browser/shared/dlref.d.ts +22 -0
  26. package/dist/browser/shared/dlref.d.ts.map +1 -0
  27. package/dist/browser/shared/dlref.js +48 -0
  28. package/dist/browser/shared/dlref.js.map +1 -0
  29. package/dist/browser/shared/errors.d.ts +22 -0
  30. package/dist/browser/shared/errors.d.ts.map +1 -0
  31. package/dist/browser/shared/errors.js +43 -0
  32. package/dist/browser/shared/errors.js.map +1 -0
  33. package/dist/browser/shared/schema.d.ts +7 -0
  34. package/dist/browser/shared/schema.d.ts.map +1 -0
  35. package/dist/browser/shared/schema.js +151 -0
  36. package/dist/browser/shared/schema.js.map +1 -0
  37. package/dist/browser/shared/token.d.ts +4 -0
  38. package/dist/browser/shared/token.d.ts.map +1 -0
  39. package/dist/browser/shared/token.js +69 -0
  40. package/dist/browser/shared/token.js.map +1 -0
  41. package/dist/browser/shared/types.d.ts +33 -0
  42. package/dist/browser/shared/types.d.ts.map +1 -0
  43. package/dist/browser/shared/types.js +2 -0
  44. package/dist/browser/shared/types.js.map +1 -0
  45. package/dist/browser/shared/wasm-common.d.ts +39 -0
  46. package/dist/browser/shared/wasm-common.d.ts.map +1 -0
  47. package/dist/browser/shared/wasm-common.js +287 -0
  48. package/dist/browser/shared/wasm-common.js.map +1 -0
  49. package/dist/node/node/client.d.ts +49 -0
  50. package/dist/node/node/client.d.ts.map +1 -0
  51. package/dist/node/node/client.js +670 -0
  52. package/dist/node/node/client.js.map +1 -0
  53. package/dist/node/node/formats/coco.d.ts +24 -0
  54. package/dist/node/node/formats/coco.d.ts.map +1 -0
  55. package/dist/node/node/formats/coco.js +342 -0
  56. package/dist/node/node/formats/coco.js.map +1 -0
  57. package/dist/node/node/formats/coco_panoptic.d.ts +19 -0
  58. package/dist/node/node/formats/coco_panoptic.d.ts.map +1 -0
  59. package/dist/node/node/formats/coco_panoptic.js +125 -0
  60. package/dist/node/node/formats/coco_panoptic.js.map +1 -0
  61. package/dist/node/node/formats/index.d.ts +7 -0
  62. package/dist/node/node/formats/index.d.ts.map +1 -0
  63. package/dist/node/node/formats/index.js +12 -0
  64. package/dist/node/node/formats/index.js.map +1 -0
  65. package/dist/node/node/formats/lerobot.d.ts +42 -0
  66. package/dist/node/node/formats/lerobot.d.ts.map +1 -0
  67. package/dist/node/node/formats/lerobot.js +351 -0
  68. package/dist/node/node/formats/lerobot.js.map +1 -0
  69. package/dist/node/node/index.d.ts +19 -0
  70. package/dist/node/node/index.d.ts.map +1 -0
  71. package/dist/node/node/index.js +75 -0
  72. package/dist/node/node/index.js.map +1 -0
  73. package/dist/node/node/mime.d.ts +2 -0
  74. package/dist/node/node/mime.d.ts.map +1 -0
  75. package/dist/node/node/mime.js +76 -0
  76. package/dist/node/node/mime.js.map +1 -0
  77. package/dist/node/node/normalizers/binary.d.ts +3 -0
  78. package/dist/node/node/normalizers/binary.d.ts.map +1 -0
  79. package/dist/node/node/normalizers/binary.js +51 -0
  80. package/dist/node/node/normalizers/binary.js.map +1 -0
  81. package/dist/node/node/normalizers/dict.d.ts +3 -0
  82. package/dist/node/node/normalizers/dict.d.ts.map +1 -0
  83. package/dist/node/node/normalizers/dict.js +17 -0
  84. package/dist/node/node/normalizers/dict.js.map +1 -0
  85. package/dist/node/node/normalizers/files.d.ts +3 -0
  86. package/dist/node/node/normalizers/files.d.ts.map +1 -0
  87. package/dist/node/node/normalizers/files.js +109 -0
  88. package/dist/node/node/normalizers/files.js.map +1 -0
  89. package/dist/node/node/normalizers/huggingface.d.ts +3 -0
  90. package/dist/node/node/normalizers/huggingface.d.ts.map +1 -0
  91. package/dist/node/node/normalizers/huggingface.js +158 -0
  92. package/dist/node/node/normalizers/huggingface.js.map +1 -0
  93. package/dist/node/node/normalizers/image.d.ts +3 -0
  94. package/dist/node/node/normalizers/image.d.ts.map +1 -0
  95. package/dist/node/node/normalizers/image.js +52 -0
  96. package/dist/node/node/normalizers/image.js.map +1 -0
  97. package/dist/node/node/normalizers/index.d.ts +9 -0
  98. package/dist/node/node/normalizers/index.d.ts.map +1 -0
  99. package/dist/node/node/normalizers/index.js +21 -0
  100. package/dist/node/node/normalizers/index.js.map +1 -0
  101. package/dist/node/node/normalizers/pdf.d.ts +3 -0
  102. package/dist/node/node/normalizers/pdf.d.ts.map +1 -0
  103. package/dist/node/node/normalizers/pdf.js +75 -0
  104. package/dist/node/node/normalizers/pdf.js.map +1 -0
  105. package/dist/node/node/normalizers/text.d.ts +4 -0
  106. package/dist/node/node/normalizers/text.d.ts.map +1 -0
  107. package/dist/node/node/normalizers/text.js +83 -0
  108. package/dist/node/node/normalizers/text.js.map +1 -0
  109. package/dist/node/node/normalizers/video.d.ts +3 -0
  110. package/dist/node/node/normalizers/video.d.ts.map +1 -0
  111. package/dist/node/node/normalizers/video.js +119 -0
  112. package/dist/node/node/normalizers/video.js.map +1 -0
  113. package/dist/node/node/storage.d.ts +15 -0
  114. package/dist/node/node/storage.d.ts.map +1 -0
  115. package/dist/node/node/storage.js +477 -0
  116. package/dist/node/node/storage.js.map +1 -0
  117. package/dist/node/node/wasm.d.ts +3 -0
  118. package/dist/node/node/wasm.d.ts.map +1 -0
  119. package/dist/node/node/wasm.js +49 -0
  120. package/dist/node/node/wasm.js.map +1 -0
  121. package/dist/node/shared/api.d.ts +9 -0
  122. package/dist/node/shared/api.d.ts.map +1 -0
  123. package/dist/node/shared/api.js +70 -0
  124. package/dist/node/shared/api.js.map +1 -0
  125. package/dist/node/shared/credentials.d.ts +4 -0
  126. package/dist/node/shared/credentials.d.ts.map +1 -0
  127. package/dist/node/shared/credentials.js +28 -0
  128. package/dist/node/shared/credentials.js.map +1 -0
  129. package/dist/node/shared/database.d.ts +26 -0
  130. package/dist/node/shared/database.d.ts.map +1 -0
  131. package/dist/node/shared/database.js +71 -0
  132. package/dist/node/shared/database.js.map +1 -0
  133. package/dist/node/shared/dlref.d.ts +22 -0
  134. package/dist/node/shared/dlref.d.ts.map +1 -0
  135. package/dist/node/shared/dlref.js +52 -0
  136. package/dist/node/shared/dlref.js.map +1 -0
  137. package/dist/node/shared/errors.d.ts +22 -0
  138. package/dist/node/shared/errors.d.ts.map +1 -0
  139. package/dist/node/shared/errors.js +53 -0
  140. package/dist/node/shared/errors.js.map +1 -0
  141. package/dist/node/shared/schema.d.ts +7 -0
  142. package/dist/node/shared/schema.d.ts.map +1 -0
  143. package/dist/node/shared/schema.js +157 -0
  144. package/dist/node/shared/schema.js.map +1 -0
  145. package/dist/node/shared/token.d.ts +4 -0
  146. package/dist/node/shared/token.d.ts.map +1 -0
  147. package/dist/node/shared/token.js +74 -0
  148. package/dist/node/shared/token.js.map +1 -0
  149. package/dist/node/shared/types.d.ts +33 -0
  150. package/dist/node/shared/types.d.ts.map +1 -0
  151. package/dist/node/shared/types.js +3 -0
  152. package/dist/node/shared/types.js.map +1 -0
  153. package/dist/node/shared/wasm-common.d.ts +39 -0
  154. package/dist/node/shared/wasm-common.d.ts.map +1 -0
  155. package/dist/node/shared/wasm-common.js +310 -0
  156. package/dist/node/shared/wasm-common.js.map +1 -0
  157. package/package.json +43 -18
  158. package/README.md +0 -39
  159. package/index.d.ts +0 -2
  160. package/index.js +0 -26
@@ -0,0 +1,51 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.normalizeBinary = normalizeBinary;
37
+ const fs = __importStar(require("fs"));
38
+ const path = __importStar(require("path"));
39
+ const crypto = __importStar(require("crypto"));
/**
 * Wrap an arbitrary file as a single-row batch of raw bytes.
 *
 * A fresh UUID identifies the file; the single row's id is that UUID with
 * a `_0` suffix. The file is read synchronously and in full.
 *
 * @param filePath Path of the file to read.
 * @returns A batch with one-element `id`, `file_id`, `data` and `filename` columns.
 * @throws Whatever `fs.readFileSync` throws when the file cannot be read.
 */
function normalizeBinary(filePath) {
    const uuid = crypto.randomUUID();
    const contents = fs.readFileSync(filePath);
    return {
        id: [`${uuid}_0`],
        file_id: [uuid],
        data: [contents],
        filename: [path.basename(filePath)],
    };
}
51
+ //# sourceMappingURL=binary.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"binary.js","sourceRoot":"","sources":["../../../../src/node/normalizers/binary.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,0CAWC;AAhBD,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAiC;AAGjC,SAAgB,eAAe,CAAC,QAAgB;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACnC,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,OAAO;QACL,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,CAAC;QACnB,OAAO,EAAE,CAAC,MAAM,CAAC;QACjB,IAAI,EAAE,CAAC,IAAI,CAAC;QACZ,QAAQ,EAAE,CAAC,QAAQ,CAAC;KACrB,CAAC;AACJ,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Batch } from '../../shared/types';
2
+ export declare function normalizeDict(data: Record<string, unknown[]>, batchSize?: number): Generator<Batch>;
3
+ //# sourceMappingURL=dict.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dict.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/dict.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAiB,aAAa,CAC5B,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,CAAC,EAC/B,SAAS,GAAE,MAAa,GACvB,SAAS,CAAC,KAAK,CAAC,CAalB"}
@@ -0,0 +1,17 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.normalizeDict = normalizeDict;
/**
 * Slice a column-oriented table into consecutive row batches.
 *
 * The row count is taken from the first column; every column is sliced with
 * the same offsets, so columns are expected to have equal lengths.
 *
 * @param data Mapping from column name to that column's values.
 * @param batchSize Maximum number of rows per yielded batch (default 1000).
 * @returns A generator of batches with the same columns as `data`.
 * @throws RangeError when there are rows to emit but `batchSize` is not a
 *         positive number (previously 0 or a negative value looped forever).
 */
function* normalizeDict(data, batchSize = 1000) {
    // Decide emptiness by the number of columns, not by the truthiness of the
    // first column name, so a column literally named "" is still processed.
    const columns = Object.entries(data);
    if (columns.length === 0)
        return;
    const rowCount = columns[0][1].length;
    if (!(rowCount > 0))
        return;
    // `!(x > 0)` also rejects NaN, which would otherwise yield one empty batch.
    if (!(batchSize > 0)) {
        throw new RangeError(`batchSize must be a positive number, got ${batchSize}`);
    }
    for (let start = 0; start < rowCount; start += batchSize) {
        const batch = {};
        for (const [col, values] of columns) {
            batch[col] = values.slice(start, start + batchSize);
        }
        yield batch;
    }
}
17
+ //# sourceMappingURL=dict.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dict.js","sourceRoot":"","sources":["../../../../src/node/normalizers/dict.ts"],"names":[],"mappings":";;AAEA,sCAgBC;AAhBD,QAAe,CAAC,CAAC,aAAa,CAC5B,IAA+B,EAC/B,YAAoB,IAAI;IAExB,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACtC,IAAI,CAAC,QAAQ;QAAE,OAAO;IAEtB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC;IAEvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC;QAC7C,MAAM,KAAK,GAAU,EAAE,CAAC;QACxB,KAAK,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YACjD,KAAK,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,CAAC;QAC9C,CAAC;QACD,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Batch } from '../../shared/types';
2
+ export declare function normalizeFiles(files: string[], chunkSize: number, chunkOverlap: number, concurrency?: number): AsyncGenerator<Batch>;
3
+ //# sourceMappingURL=files.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"files.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/files.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAiDhD,wBAAuB,cAAc,CACnC,KAAK,EAAE,MAAM,EAAE,EACf,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,EACpB,WAAW,GAAE,MAAU,GACtB,cAAc,CAAC,KAAK,CAAC,CA6CvB"}
@@ -0,0 +1,109 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.normalizeFiles = normalizeFiles;
37
+ const path = __importStar(require("path"));
38
+ const fs = __importStar(require("fs"));
39
+ const errors_1 = require("../../shared/errors");
40
+ const image_1 = require("./image");
41
+ const text_1 = require("./text");
42
+ const binary_1 = require("./binary");
43
+ const video_1 = require("./video");
44
+ const pdf_1 = require("./pdf");
45
+ const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.mkv', '.webm']);
46
+ const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']);
47
+ const TEXT_EXTS = new Set(['.txt', '.md', '.json', '.csv', '.xml', '.html']);
/**
 * Normalize one file into a list of batches, choosing the normalizer from
 * the file's lower-cased extension.
 *
 * Video and PDF normalizers are consumed as async streams and collected into
 * an array; image, text and binary normalizers each return a single batch.
 * Any extension not in VIDEO_EXTS, IMAGE_EXTS, TEXT_EXTS or `.pdf` falls
 * back to the binary normalizer.
 *
 * @param filePath Path of the file to normalize.
 * @param chunkSize Passed through to the text normalizer.
 * @param chunkOverlap Passed through to the text normalizer.
 * @returns A promise of the batches produced for this file.
 * @throws IngestError when `filePath` does not exist.
 */
async function normalizeOneFile(filePath, chunkSize, chunkOverlap) {
    // Drain an async batch stream into a plain array.
    const drain = async (stream) => {
        const collected = [];
        for await (const item of stream) {
            collected.push(item);
        }
        return collected;
    };
    if (!fs.existsSync(filePath)) {
        throw new errors_1.IngestError(`File not found: ${filePath}`);
    }
    const extension = path.extname(filePath).toLowerCase();
    if (VIDEO_EXTS.has(extension)) {
        return drain((0, video_1.normalizeVideo)(filePath));
    }
    if (IMAGE_EXTS.has(extension)) {
        return [(0, image_1.normalizeImage)(filePath)];
    }
    if (extension === '.pdf') {
        return drain((0, pdf_1.normalizePdf)(filePath));
    }
    return TEXT_EXTS.has(extension)
        ? [(0, text_1.normalizeText)(filePath, chunkSize, chunkOverlap)]
        : [(0, binary_1.normalizeBinary)(filePath)];
}
/**
 * Normalize many files and stream their batches.
 *
 * With an effective concurrency of 1 or less, files are processed strictly
 * in order and errors propagate unchanged. Otherwise up to `concurrency`
 * files are processed at once and batches are yielded in completion order;
 * a failure in that mode is rethrown as an IngestError naming the file.
 *
 * @param files Paths of the files to normalize.
 * @param chunkSize Passed through to the per-file normalizer.
 * @param chunkOverlap Passed through to the per-file normalizer.
 * @param concurrency Maximum number of files in flight (default 4).
 * @returns An async generator of batches.
 * @throws IngestError when a file fails in concurrent mode.
 */
async function* normalizeFiles(files, chunkSize, chunkOverlap, concurrency = 4) {
    // A NaN concurrency used to skip both code paths and silently yield
    // nothing; treat it as "no parallelism" instead.
    const requested = Math.min(concurrency, files.length);
    const maxWorkers = Number.isNaN(requested) ? 1 : requested;
    if (maxWorkers <= 1) {
        for (const filePath of files) {
            const batches = await normalizeOneFile(filePath, chunkSize, chunkOverlap);
            for (const batch of batches) {
                yield batch;
            }
        }
        return;
    }
    // Process files with limited concurrency.
    let nextIndex = 0;
    const inFlight = new Set();
    const launch = () => {
        while (inFlight.size < maxWorkers && nextIndex < files.length) {
            const filePath = files[nextIndex++];
            // Each task resolves to itself plus its batches, so the winner of
            // the race can be removed from the set without index bookkeeping.
            const task = normalizeOneFile(filePath, chunkSize, chunkOverlap).then((batches) => ({ task, batches }), (e) => {
                throw new errors_1.IngestError(`Failed to process ${filePath}: ${e instanceof Error ? e.message : e}`);
            });
            // Mark the rejection as handled. A task launched just before a
            // `yield` is otherwise unobserved if the consumer stops iterating,
            // and its failure would surface as an unhandled rejection. The
            // error still reaches the consumer through Promise.race below.
            task.catch(() => { });
            inFlight.add(task);
        }
    };
    launch();
    while (inFlight.size > 0) {
        const { task, batches } = await Promise.race(inFlight);
        inFlight.delete(task);
        // Refill the pool before yielding so work continues while the
        // consumer handles this file's batches.
        launch();
        for (const batch of batches) {
            yield batch;
        }
    }
}
109
+ //# sourceMappingURL=files.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"files.js","sourceRoot":"","sources":["../../../../src/node/normalizers/files.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAoDA,wCAkDC;AAtGD,2CAA6B;AAC7B,uCAAyB;AACzB,gDAAkD;AAElD,mCAAyC;AACzC,iCAAuC;AACvC,qCAA2C;AAC3C,mCAAyC;AACzC,+BAAqC;AAErC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AACtE,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AAC/E,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;AAE7E,KAAK,UAAU,gBAAgB,CAC7B,QAAgB,EAChB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,MAAM,IAAI,oBAAW,CAAC,mBAAmB,QAAQ,EAAE,CAAC,CAAC;IACvD,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IAEjD,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,OAAO,GAAY,EAAE,CAAC;QAC5B,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAA,sBAAc,EAAC,QAAQ,CAAC,EAAE,CAAC;YACnD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,CAAC,IAAA,sBAAc,EAAC,QAAQ,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACnB,MAAM,OAAO,GAAY,EAAE,CAAC;QAC5B,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,IAAA,kBAAY,EAAC,QAAQ,CAAC,EAAE,CAAC;YACjD,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,IAAA,oBAAa,EAAC,QAAQ,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED,OAAO,CAAC,IAAA,wBAAe,EAAC,QAAQ,CAAC,CAAC,CAAC;AACrC,CAAC;AAEM,KAAK,SAAS,CAAC,CAAC,cAAc,CACnC,KAAe,EACf,SAAiB,EACjB,YAAoB,EACpB,cAAsB,CAAC;IAEvB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAEvD,IAAI,UAAU,IAAI,CAAC,EAAE,CAAC;QACpB,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;YAC7B,MAAM,OAAO,GAAG,MAAM,gBAAgB,CAAC,QAAQ,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YAC1E,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;gBAC5B,MAAM,KAAK,CAAC;YACd,CAAC;Q
ACH,CAAC;QACD,OAAO;IACT,CAAC;IAED,yCAAyC;IACzC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,MAAM,OAAO,GAAsD,EAAE,CAAC;IAEtE,SAAS,OAAO;QACd,OAAO,OAAO,CAAC,MAAM,GAAG,UAAU,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC3D,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC;YAChC,OAAO,CAAC,IAAI,CACV,gBAAgB,CAAC,QAAQ,EAAE,SAAS,EAAE,YAAY,CAAC;iBAChD,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;iBAC1C,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE;gBACX,MAAM,IAAI,oBAAW,CACnB,qBAAqB,QAAQ,KAAK,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CACvE,CAAC;YACJ,CAAC,CAAC,CACL,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,EAAE,CAAC;IAEV,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,IAAI,CAC/B,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CACzD,CAAC;QACF,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC9B,OAAO,EAAE,CAAC;QAEV,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACnC,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;AACH,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Batch } from '../../shared/types';
2
+ export declare function normalizeHuggingface(datasetName: string, batchSize?: number, split?: string): AsyncGenerator<Batch>;
3
+ //# sourceMappingURL=huggingface.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"huggingface.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/huggingface.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAuB,oBAAoB,CACzC,WAAW,EAAE,MAAM,EACnB,SAAS,GAAE,MAAY,EACvB,KAAK,GAAE,MAAgB,GACtB,cAAc,CAAC,KAAK,CAAC,CA4CvB"}
@@ -0,0 +1,158 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.normalizeHuggingface = normalizeHuggingface;
/**
 * Stream a HuggingFace dataset as column-oriented batches.
 *
 * Rows come from `streamViaHub` when `@huggingface/hub` can be required and
 * exposes `listDatasetFiles`; otherwise from `streamViaPython`. Plain
 * `Uint8Array` values are converted to `Buffer`; everything else is passed
 * through. Rows are grouped into batches of `batchSize` via `rowsToBatch`,
 * with a final partial batch if rows remain.
 *
 * @param datasetName Dataset identifier passed to the row source.
 * @param batchSize Number of rows per yielded batch (default 100).
 * @param split Dataset split to read (default 'train').
 * @returns An async generator of batches.
 */
async function* normalizeHuggingface(datasetName, batchSize = 100, split = 'train') {
    let source;
    try {
        const hub = require('@huggingface/hub');
        if (typeof hub.listDatasetFiles !== 'function') {
            throw new Error('Incompatible @huggingface/hub version');
        }
        source = streamViaHub(hub, datasetName, split);
    }
    catch {
        // NOTE(review): calling an async generator function only creates the
        // iterator, so this inner try does not observe failures that happen
        // once iteration starts — confirm whether the message below is
        // meant to be reachable.
        try {
            source = streamViaPython(datasetName, split);
        }
        catch {
            throw new Error(`No HuggingFace dataset library available. Install one of:\n` +
                ` npm install @huggingface/hub\n` +
                `Or ensure Python is available with: pip install datasets`);
        }
    }
    const buffered = [];
    for await (const item of source) {
        const row = {};
        for (const [col, val] of Object.entries(item)) {
            const isPlainBytes = val instanceof Uint8Array && !Buffer.isBuffer(val);
            row[col] = isPlainBytes ? Buffer.from(val) : val;
        }
        buffered.push(row);
        if (buffered.length < batchSize) {
            continue;
        }
        yield rowsToBatch(buffered);
        buffered.length = 0;
    }
    if (buffered.length > 0) {
        yield rowsToBatch(buffered);
    }
}
/**
 * Stream dataset rows from the HuggingFace datasets-server `/rows` endpoint,
 * 100 rows per request, using the `default` config.
 *
 * Paging stops when a page is empty or short, or when the offset reaches the
 * `num_rows_total` reported by the server (when present).
 *
 * @param hub The loaded `@huggingface/hub` module; not used by this function.
 * @param datasetName Dataset identifier (URL-encoded into the query).
 * @param split Split name (URL-encoded into the query).
 * @returns An async generator of row objects (the `row` field of each entry).
 * @throws Error when the server answers with a non-OK status for a request
 *         that should have succeeded. This was previously treated as end of
 *         data, yielding an empty or truncated dataset without any signal.
 */
async function* streamViaHub(hub, datasetName, split) {
    const baseUrl = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(datasetName)}&config=default&split=${encodeURIComponent(split)}`;
    const pageSize = 100;
    let offset = 0;
    // Total row count as reported by the server, once known.
    let total;
    while (total === undefined || offset < total) {
        const resp = await fetch(`${baseUrl}&offset=${offset}&length=${pageSize}`);
        if (!resp.ok) {
            // The first page, or any page known to be in range, must succeed.
            if (offset === 0 || total !== undefined) {
                throw new Error(`HuggingFace datasets-server request failed with HTTP ${resp.status} ` +
                    `for dataset "${datasetName}" (split "${split}", offset ${offset})`);
            }
            // NOTE(review): without a reported total, a failure on a later page
            // may just mean the offset ran past the end, so keep the original
            // stop-quietly behavior here — confirm against the API.
            break;
        }
        const data = (await resp.json());
        const rows = data.rows;
        if (!rows || rows.length === 0)
            break;
        if (typeof data.num_rows_total === 'number') {
            total = data.num_rows_total;
        }
        for (const rowWrapper of rows) {
            yield rowWrapper.row;
        }
        if (rows.length < pageSize)
            break;
        offset += rows.length;
    }
}
/**
 * Stream dataset rows by running the Python `datasets` library in a
 * `python3` child process that prints one JSON object per row.
 *
 * Values the Python side wraps as `{"__bytes__": <base64>}` are decoded back
 * into `Buffer`s before the row is yielded.
 *
 * @param datasetName Dataset identifier handed to `load_dataset`.
 * @param split Split name handed to `load_dataset`.
 * @returns An async generator of row objects.
 * @throws Error when `python3` cannot be started or exits with a non-zero
 *         code (the tail of its stderr is appended to the message), and
 *         SyntaxError when a line of output is not valid JSON.
 */
async function* streamViaPython(datasetName, split) {
    const { spawn } = await import('child_process');
    const readline = await import('readline');
    // The dataset name and split are passed as argv, never spliced into the
    // Python source: escaping only `"` let a backslash or newline in either
    // value break out of the string literal and run arbitrary Python.
    const pyScript = `
import json, sys
from datasets import load_dataset
ds = load_dataset(sys.argv[1], split=sys.argv[2], streaming=True)
for item in ds:
    row = {}
    for k, v in item.items():
        if hasattr(v, 'tobytes'):
            import base64
            row[k] = {"__bytes__": base64.b64encode(v.tobytes()).decode()}
        elif hasattr(v, 'tolist'):
            row[k] = v.tolist()
        else:
            row[k] = v
    print(json.dumps(row))
`;
    const proc = spawn('python3', ['-c', pyScript, datasetName, split], {
        stdio: ['ignore', 'pipe', 'pipe'],
    });
    const rl = readline.createInterface({ input: proc.stdout });
    // Drain stderr so the child cannot block on a full pipe, keeping only the
    // tail for the error message.
    let stderrTail = '';
    proc.stderr.setEncoding('utf8');
    proc.stderr.on('data', (chunk) => {
        stderrTail = (stderrTail + chunk).slice(-4096);
    });
    // Register exit handling before reading output: attaching the 'close'
    // listener after the loop could miss the event and wait forever.
    const exited = new Promise((resolve, reject) => {
        proc.once('error', (err) => {
            // Spawn failure (e.g. python3 not installed): end the line reader
            // so the loop below finishes and the error is rethrown.
            rl.close();
            reject(err);
        });
        proc.once('close', (code) => {
            if (code !== 0) {
                const detail = stderrTail.trim();
                reject(new Error(`Python process exited with code ${code}${detail ? `: ${detail}` : ''}`));
            }
            else {
                resolve();
            }
        });
    });
    // Avoid an unhandled rejection if the process fails before `exited` is awaited.
    exited.catch(() => { });
    try {
        for await (const line of rl) {
            const trimmed = line.trim();
            if (!trimmed)
                continue;
            const row = JSON.parse(trimmed);
            for (const [key, val] of Object.entries(row)) {
                if (val &&
                    typeof val === 'object' &&
                    '__bytes__' in val) {
                    row[key] = Buffer.from(val.__bytes__, 'base64');
                }
            }
            yield row;
        }
        await exited;
    }
    finally {
        rl.close();
        // Stop the child when the consumer quits early or a row fails to parse.
        if (proc.exitCode === null && proc.signalCode === null) {
            proc.kill();
        }
    }
}
/**
 * Pivot a list of row objects into a column-oriented batch.
 *
 * Columns are the union of the keys of all rows, in order of first
 * appearance; a row that lacks a key contributes `undefined` at its position,
 * so every column has `rows.length` entries. (Taking keys from the first row
 * only silently dropped any field that first appeared in a later row.)
 *
 * @param rows Row objects to pivot.
 * @returns An object mapping each key to the array of that key's values;
 *          `{}` when `rows` is empty.
 */
function rowsToBatch(rows) {
    const keys = [];
    const seen = new Set();
    for (const row of rows) {
        for (const key of Object.keys(row)) {
            if (!seen.has(key)) {
                seen.add(key);
                keys.push(key);
            }
        }
    }
    const result = {};
    for (const key of keys) {
        result[key] = rows.map((r) => r[key]);
    }
    return result;
}
158
+ //# sourceMappingURL=huggingface.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"huggingface.js","sourceRoot":"","sources":["../../../../src/node/normalizers/huggingface.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEA,oDAgDC;AAhDM,KAAK,SAAS,CAAC,CAAC,oBAAoB,CACzC,WAAmB,EACnB,YAAoB,GAAG,EACvB,QAAgB,OAAO;IAEvB,IAAI,IAA4C,CAAC;IAEjD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,OAAO,CAAC,kBAAkB,CAAQ,CAAC;QAC/C,IAAI,OAAO,GAAG,CAAC,gBAAgB,KAAK,UAAU,EAAE,CAAC;YAC/C,IAAI,GAAG,YAAY,CAAC,GAAG,EAAE,WAAW,EAAE,KAAK,CAAC,CAAC;QAC/C,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,IAAI,CAAC;YACH,IAAI,GAAG,eAAe,CAAC,WAAW,EAAE,KAAK,CAAC,CAAC;QAC7C,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,6DAA6D;gBAC7D,kCAAkC;gBAClC,0DAA0D,CAC3D,CAAC;QACJ,CAAC;IACH,CAAC;IAED,MAAM,KAAK,GAA8B,EAAE,CAAC;IAE5C,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QAC9B,MAAM,GAAG,GAA4B,EAAE,CAAC;QACxC,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;YAC9C,IAAI,GAAG,YAAY,UAAU,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACvD,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;YACjB,CAAC;QACH,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEhB,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;YAC9B,MAAM,WAAW,CAAC,KAAK,CAAC,CAAC;YACzB,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrB,MAAM,WAAW,CAAC,KAAK,CAAC,CAAC;IAC3B,CAAC;AACH,CAAC;AAED,KAAK,SAAS,CAAC,CAAC,YAAY,CAC1B,GAAQ,EACR,WAAmB,EACnB,KAAa;IAEb,MAAM,OAAO,GAAG,uDAAuD,kBAAkB,CAAC,WAAW,CAAC,yBAAyB,kBAAkB,CAAC,KAAK,CAAC,EAAE,CAAC;IAE3J,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,MAAM,GAAG,GAAG,CAAC;IAEnB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,GAAG,GAAG,GAAG,OAAO,WAAW,MAAM,WAAW,MAAM,EAAE,CAAC;QAC3D,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,CAAC,IAAI,CAAC,EAAE;YAAE,MAAM;QAEpB,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAAQ,CAAC;QACxC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAa,CAAC;QAChC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAA
E,MAAM;QAEtC,KAAK,MAAM,UAAU,IAAI,IAAI,EAAE,CAAC;YAC9B,MAAM,UAAU,CAAC,GAA8B,CAAC;QAClD,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,GAAG,MAAM;YAAE,MAAM;QAChC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAC;IACxB,CAAC;AACH,CAAC;AAED,KAAK,SAAS,CAAC,CAAC,eAAe,CAC7B,WAAmB,EACnB,KAAa;IAEb,MAAM,EAAE,KAAK,EAAE,GAAG,wDAAa,eAAe,GAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,wDAAa,UAAU,GAAC,CAAC;IAE1C,MAAM,QAAQ,GAAG;;;qBAGE,WAAW,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,aAAa,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC;;;;;;;;;;;;CAY3F,CAAC;IAEA,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,QAAQ,CAAC,EAAE;QAC9C,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;KAChC,CAAC,CAAC;IAEH,MAAM,EAAE,GAAG,QAAQ,CAAC,eAAe,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,MAAO,EAAE,CAAC,CAAC;IAE7D,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,OAAO;YAAE,SAAS;QAEvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAA4B,CAAC;QAE3D,KAAK,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;YAC7C,IACE,GAAG;gBACH,OAAO,GAAG,KAAK,QAAQ;gBACvB,WAAW,IAAK,GAA+B,EAC/C,CAAC;gBACD,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,IAAI,CACnB,GAA6B,CAAC,SAAS,EACxC,QAAQ,CACT,CAAC;YACJ,CAAC;QACH,CAAC;QAED,MAAM,GAAG,CAAC;IACZ,CAAC;IAED,MAAM,IAAI,OAAO,CAAO,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAC1C,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,IAAI,EAAE,CAAC,CAAC,CAAC;YAC/D,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,WAAW,CAAC,IAA+B;IAClD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IAClC,MAAM,MAAM,GAAU,EAAE,CAAC;IACzB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACxC,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { Batch } from '../../shared/types';
2
+ export declare function normalizeImage(filePath: string): Batch;
3
+ //# sourceMappingURL=image.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"image.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/image.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,KAAK,CAYtD"}
@@ -0,0 +1,52 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.normalizeImage = normalizeImage;
37
+ const fs = __importStar(require("fs"));
38
+ const path = __importStar(require("path"));
39
+ const crypto = __importStar(require("crypto"));
40
/**
 * Wraps one image file into a single-row, column-oriented batch.
 *
 * The file is read synchronously in full. A fresh random UUID is generated
 * on every call, so normalizing the same path twice yields different ids.
 *
 * @param filePath Path of the image file to read.
 * @returns An object whose columns (`id`, `file_id`, `image`, `filename`,
 *   `text`) each hold exactly one entry.
 */
function normalizeImage(filePath) {
    const uuid = crypto.randomUUID();
    const bytes = fs.readFileSync(filePath);
    const baseName = path.basename(filePath);
    // Row id is the file id with a fixed "_0" suffix.
    const rowId = uuid + '_0';
    // Placeholder text column derived from the file name only.
    const caption = 'Image: ' + baseName;
    return {
        id: [rowId],
        file_id: [uuid],
        image: [bytes],
        filename: [baseName],
        text: [caption],
    };
}
52
+ //# sourceMappingURL=image.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"image.js","sourceRoot":"","sources":["../../../../src/node/normalizers/image.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,wCAYC;AAjBD,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAiC;AAGjC,SAAgB,cAAc,CAAC,QAAgB;IAC7C,MAAM,MAAM,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACnC,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,OAAO;QACL,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,CAAC;QACnB,OAAO,EAAE,CAAC,MAAM,CAAC;QACjB,KAAK,EAAE,CAAC,IAAI,CAAC;QACb,QAAQ,EAAE,CAAC,QAAQ,CAAC;QACpB,IAAI,EAAE,CAAC,UAAU,QAAQ,EAAE,CAAC;KAC7B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,9 @@
1
+ export { normalizeDict } from './dict';
2
+ export { chunkText, normalizeText } from './text';
3
+ export { normalizeBinary } from './binary';
4
+ export { normalizeImage } from './image';
5
+ export { normalizeVideo } from './video';
6
+ export { normalizePdf } from './pdf';
7
+ export { normalizeFiles } from './files';
8
+ export { normalizeHuggingface } from './huggingface';
9
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAC3C,OAAO,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,OAAO,CAAC;AACrC,OAAO,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,oBAAoB,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,21 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.normalizeHuggingface = exports.normalizeFiles = exports.normalizePdf = exports.normalizeVideo = exports.normalizeImage = exports.normalizeBinary = exports.normalizeText = exports.chunkText = exports.normalizeDict = void 0;
4
+ var dict_1 = require("./dict");
5
+ Object.defineProperty(exports, "normalizeDict", { enumerable: true, get: function () { return dict_1.normalizeDict; } });
6
+ var text_1 = require("./text");
7
+ Object.defineProperty(exports, "chunkText", { enumerable: true, get: function () { return text_1.chunkText; } });
8
+ Object.defineProperty(exports, "normalizeText", { enumerable: true, get: function () { return text_1.normalizeText; } });
9
+ var binary_1 = require("./binary");
10
+ Object.defineProperty(exports, "normalizeBinary", { enumerable: true, get: function () { return binary_1.normalizeBinary; } });
11
+ var image_1 = require("./image");
12
+ Object.defineProperty(exports, "normalizeImage", { enumerable: true, get: function () { return image_1.normalizeImage; } });
13
+ var video_1 = require("./video");
14
+ Object.defineProperty(exports, "normalizeVideo", { enumerable: true, get: function () { return video_1.normalizeVideo; } });
15
+ var pdf_1 = require("./pdf");
16
+ Object.defineProperty(exports, "normalizePdf", { enumerable: true, get: function () { return pdf_1.normalizePdf; } });
17
+ var files_1 = require("./files");
18
+ Object.defineProperty(exports, "normalizeFiles", { enumerable: true, get: function () { return files_1.normalizeFiles; } });
19
+ var huggingface_1 = require("./huggingface");
20
+ Object.defineProperty(exports, "normalizeHuggingface", { enumerable: true, get: function () { return huggingface_1.normalizeHuggingface; } });
21
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/node/normalizers/index.ts"],"names":[],"mappings":";;;AAAA,+BAAuC;AAA9B,qGAAA,aAAa,OAAA;AACtB,+BAAkD;AAAzC,iGAAA,SAAS,OAAA;AAAE,qGAAA,aAAa,OAAA;AACjC,mCAA2C;AAAlC,yGAAA,eAAe,OAAA;AACxB,iCAAyC;AAAhC,uGAAA,cAAc,OAAA;AACvB,iCAAyC;AAAhC,uGAAA,cAAc,OAAA;AACvB,6BAAqC;AAA5B,mGAAA,YAAY,OAAA;AACrB,iCAAyC;AAAhC,uGAAA,cAAc,OAAA;AACvB,6CAAqD;AAA5C,mHAAA,oBAAoB,OAAA"}
@@ -0,0 +1,3 @@
1
+ import type { Batch } from '../../shared/types';
2
+ export declare function normalizePdf(filePath: string): AsyncGenerator<Batch>;
3
+ //# sourceMappingURL=pdf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/pdf.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAuB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,CAsC3E"}
@@ -0,0 +1,75 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.normalizePdf = normalizePdf;
37
+ const fs = __importStar(require("fs"));
38
+ const path = __importStar(require("path"));
39
+ const crypto = __importStar(require("crypto"));
40
/**
 * Streams a PDF file as batches, one batch per page.
 *
 * When `pdfjs-dist` can be loaded, every page's text items are joined with a
 * single space and yielded as one row (`id`, `file_id`, `page_index`, `text`).
 * Pages with no extractable text get the placeholder `PDF page <n>` (1-based).
 *
 * When `pdfjs-dist` cannot be loaded, a single fallback row is yielded that
 * carries the raw file bytes in `data` and a `PDF: <filename>` text column.
 *
 * @param filePath Path of the PDF file to read.
 * @returns Async generator of single-row, column-oriented batches.
 */
async function* normalizePdf(filePath) {
    const fileId = crypto.randomUUID();
    const filename = path.basename(filePath);
    let pdfjs;
    try {
        pdfjs = require('pdfjs-dist');
    }
    catch {
        // Fallback: store whole PDF as binary
        yield {
            id: [`${fileId}_0`],
            file_id: [fileId],
            page_index: [0],
            data: [fs.readFileSync(filePath)],
            text: [`PDF: ${filename}`],
        };
        return;
    }
    const data = new Uint8Array(fs.readFileSync(filePath));
    const doc = await pdfjs.getDocument({ data }).promise;
    try {
        for (let pageIdx = 0; pageIdx < doc.numPages; pageIdx++) {
            // The document API numbers pages from 1; emitted page_index stays 0-based.
            const page = await doc.getPage(pageIdx + 1);
            const textContent = await page.getTextContent();
            const text = textContent.items
                .map((item) => item.str)
                .join(' ');
            yield {
                id: [`${fileId}_${pageIdx}`],
                file_id: [fileId],
                page_index: [pageIdx],
                text: [text || `PDF page ${pageIdx + 1}`],
            };
        }
    }
    finally {
        // Release the document on every exit path: normal completion, a page
        // that fails to load or extract, and a consumer that stops iterating
        // early (break / return() on the generator).
        await doc.destroy();
    }
}
75
+ //# sourceMappingURL=pdf.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../../../src/node/normalizers/pdf.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAKA,oCAsCC;AA3CD,uCAAyB;AACzB,2CAA6B;AAC7B,+CAAiC;AAG1B,KAAK,SAAS,CAAC,CAAC,YAAY,CAAC,QAAgB;IAClD,MAAM,MAAM,GAAG,MAAM,CAAC,UAAU,EAAE,CAAC;IACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAEzC,IAAI,KAAU,CAAC;IACf,IAAI,CAAC;QACH,KAAK,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;IAChC,CAAC;IAAC,MAAM,CAAC;QACP,sCAAsC;QACtC,MAAM;YACJ,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,CAAC;YACnB,OAAO,EAAE,CAAC,MAAM,CAAC;YACjB,UAAU,EAAE,CAAC,CAAC,CAAC;YACf,IAAI,EAAE,CAAC,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC;YACjC,IAAI,EAAE,CAAC,QAAQ,QAAQ,EAAE,CAAC;SAC3B,CAAC;QACF,OAAO;IACT,CAAC;IAED,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,EAAE,CAAC,YAAY,CAAC,QAAQ,CAAC,CAAC,CAAC;IACvD,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC;IAEtD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,GAAG,GAAG,CAAC,QAAQ,EAAE,OAAO,EAAE,EAAE,CAAC;QACxD,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,OAAO,GAAG,CAAC,CAAC,CAAC;QAC5C,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;QAChD,MAAM,IAAI,GAAG,WAAW,CAAC,KAAK;aAC3B,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC;aAC5B,IAAI,CAAC,GAAG,CAAC,CAAC;QAEb,MAAM;YACJ,EAAE,EAAE,CAAC,GAAG,MAAM,IAAI,OAAO,EAAE,CAAC;YAC5B,OAAO,EAAE,CAAC,MAAM,CAAC;YACjB,UAAU,EAAE,CAAC,OAAO,CAAC;YACrB,IAAI,EAAE,CAAC,IAAI,IAAI,YAAY,OAAO,GAAG,CAAC,EAAE,CAAC;SAC1C,CAAC;IACJ,CAAC;IAED,GAAG,CAAC,OAAO,EAAE,CAAC;AAChB,CAAC"}
@@ -0,0 +1,4 @@
1
+ import type { Batch } from '../../shared/types';
2
+ export declare function chunkText(text: string, chunkSize: number, overlap: number): string[];
3
+ export declare function normalizeText(filePath: string, chunkSize: number, chunkOverlap: number): Batch;
4
+ //# sourceMappingURL=text.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"text.d.ts","sourceRoot":"","sources":["../../../../src/node/normalizers/text.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAEhD,wBAAgB,SAAS,CACvB,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,GACd,MAAM,EAAE,CAoCV;AAED,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,KAAK,CAYP"}