@vespermcp/mcp-server 1.2.5 → 1.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -131,8 +131,83 @@ function extractRequestedRows(query, requirements) {
|
|
|
131
131
|
return Math.max(...allNums);
|
|
132
132
|
return undefined;
|
|
133
133
|
}
|
|
134
|
+
const verifiedPythonModules = new Set();
|
|
135
|
+
function getPythonCommand() {
|
|
136
|
+
return process.platform === "win32" ? "py" : "python";
|
|
137
|
+
}
|
|
138
|
+
function runPythonProcess(args, timeoutMs = 300000) {
|
|
139
|
+
const pyCmd = getPythonCommand();
|
|
140
|
+
return new Promise((resolve, reject) => {
|
|
141
|
+
const proc = spawn(pyCmd, args, {
|
|
142
|
+
env: {
|
|
143
|
+
...process.env,
|
|
144
|
+
PIP_DISABLE_PIP_VERSION_CHECK: "1",
|
|
145
|
+
PYTHONUTF8: "1",
|
|
146
|
+
},
|
|
147
|
+
});
|
|
148
|
+
let stdout = "";
|
|
149
|
+
let stderr = "";
|
|
150
|
+
proc.stdout.on("data", (d) => (stdout += d.toString()));
|
|
151
|
+
proc.stderr.on("data", (d) => (stderr += d.toString()));
|
|
152
|
+
const timer = setTimeout(() => {
|
|
153
|
+
try {
|
|
154
|
+
proc.kill();
|
|
155
|
+
}
|
|
156
|
+
catch {
|
|
157
|
+
// no-op
|
|
158
|
+
}
|
|
159
|
+
reject(new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
|
|
160
|
+
}, timeoutMs);
|
|
161
|
+
proc.on("close", (code) => {
|
|
162
|
+
clearTimeout(timer);
|
|
163
|
+
resolve({ code: code ?? 1, stdout, stderr });
|
|
164
|
+
});
|
|
165
|
+
proc.on("error", (error) => {
|
|
166
|
+
clearTimeout(timer);
|
|
167
|
+
reject(error);
|
|
168
|
+
});
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
async function ensurePythonModules(modulePackagePairs) {
|
|
172
|
+
const missing = [];
|
|
173
|
+
for (const pair of modulePackagePairs) {
|
|
174
|
+
if (verifiedPythonModules.has(pair.module)) {
|
|
175
|
+
continue;
|
|
176
|
+
}
|
|
177
|
+
const check = await runPythonProcess([
|
|
178
|
+
"-c",
|
|
179
|
+
`import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(pair.module)}) else 1)`
|
|
180
|
+
], 20000);
|
|
181
|
+
if (check.code === 0) {
|
|
182
|
+
verifiedPythonModules.add(pair.module);
|
|
183
|
+
}
|
|
184
|
+
else {
|
|
185
|
+
missing.push(pair);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
if (missing.length === 0) {
|
|
189
|
+
return;
|
|
190
|
+
}
|
|
191
|
+
const packages = [...new Set(missing.map(m => m.packageName))];
|
|
192
|
+
console.error(`[Vesper] Installing missing Python packages: ${packages.join(", ")}`);
|
|
193
|
+
const installArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
|
|
194
|
+
let install = await runPythonProcess(installArgs, 600000);
|
|
195
|
+
if (install.code !== 0) {
|
|
196
|
+
console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
|
|
197
|
+
const userInstallArgs = ["-m", "pip", "install", "--disable-pip-version-check", "--user", ...packages];
|
|
198
|
+
install = await runPythonProcess(userInstallArgs, 600000);
|
|
199
|
+
}
|
|
200
|
+
if (install.code !== 0) {
|
|
201
|
+
const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
|
|
202
|
+
throw new Error(`Failed to install required Python packages (${packages.join(", ")}). ${details}`);
|
|
203
|
+
}
|
|
204
|
+
console.error(`[Vesper] Successfully installed: ${packages.join(", ")}`);
|
|
205
|
+
for (const pair of missing) {
|
|
206
|
+
verifiedPythonModules.add(pair.module);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
134
209
|
function runPythonJson(scriptPath, args) {
|
|
135
|
-
const pyCmd = process.platform === "win32" ? "py" : "python";
|
|
210
|
+
const pyCmd = getPythonCommand();
|
|
136
211
|
return new Promise((resolve, reject) => {
|
|
137
212
|
const proc = spawn(pyCmd, [scriptPath, ...args]);
|
|
138
213
|
let stdout = "";
|
|
@@ -932,6 +1007,27 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
932
1007
|
isError: true,
|
|
933
1008
|
};
|
|
934
1009
|
}
|
|
1010
|
+
const requiredModules = [
|
|
1011
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1012
|
+
];
|
|
1013
|
+
if (source === "url") {
|
|
1014
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1015
|
+
}
|
|
1016
|
+
if (source === "huggingface") {
|
|
1017
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1018
|
+
}
|
|
1019
|
+
if (source === "kaggle") {
|
|
1020
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
1021
|
+
}
|
|
1022
|
+
try {
|
|
1023
|
+
await ensurePythonModules(requiredModules);
|
|
1024
|
+
}
|
|
1025
|
+
catch (error) {
|
|
1026
|
+
return {
|
|
1027
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
1028
|
+
isError: true,
|
|
1029
|
+
};
|
|
1030
|
+
}
|
|
935
1031
|
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
936
1032
|
const payload = {
|
|
937
1033
|
dataset_id: datasetId,
|
|
Binary file
|
|
@@ -101,27 +101,54 @@ class AssetDownloader:
|
|
|
101
101
|
if source not in {"huggingface", "kaggle", "url"}:
|
|
102
102
|
raise ValueError("source must be one of: huggingface, kaggle, url")
|
|
103
103
|
|
|
104
|
-
|
|
105
|
-
images_dir = dataset_dir / "images"
|
|
106
|
-
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
107
|
-
images_dir.mkdir(parents=True, exist_ok=True)
|
|
108
|
-
|
|
109
|
-
errors_file = dataset_dir / "errors.jsonl"
|
|
110
|
-
metadata_file = dataset_dir / "metadata.jsonl"
|
|
111
|
-
|
|
104
|
+
# --- Validate imports and args BEFORE creating any directories ---
|
|
112
105
|
if source == "huggingface":
|
|
113
106
|
if not repo_id:
|
|
114
107
|
raise ValueError("repo_id is required for source=huggingface")
|
|
115
|
-
|
|
108
|
+
try:
|
|
109
|
+
from datasets import load_dataset as _ld # noqa: F401
|
|
110
|
+
except Exception as e:
|
|
111
|
+
raise RuntimeError(
|
|
112
|
+
f"datasets package is required for HuggingFace downloads. "
|
|
113
|
+
f"Install with: pip install datasets. Details: {e}"
|
|
114
|
+
)
|
|
116
115
|
elif source == "kaggle":
|
|
117
116
|
ref = kaggle_ref or repo_id
|
|
118
117
|
if not ref:
|
|
119
118
|
raise ValueError("kaggle_ref is required for source=kaggle")
|
|
120
|
-
|
|
119
|
+
try:
|
|
120
|
+
from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
|
|
121
|
+
except Exception as e:
|
|
122
|
+
raise RuntimeError(
|
|
123
|
+
f"kaggle package is required for Kaggle downloads. "
|
|
124
|
+
f"Install with: pip install kaggle. Details: {e}"
|
|
125
|
+
)
|
|
121
126
|
else:
|
|
122
127
|
if not urls:
|
|
123
128
|
raise ValueError("urls are required for source=url")
|
|
124
|
-
|
|
129
|
+
|
|
130
|
+
# --- Now safe to create directories ---
|
|
131
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
132
|
+
images_dir = dataset_dir / "images"
|
|
133
|
+
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
134
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
135
|
+
|
|
136
|
+
errors_file = dataset_dir / "errors.jsonl"
|
|
137
|
+
metadata_file = dataset_dir / "metadata.jsonl"
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
if source == "huggingface":
|
|
141
|
+
summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
|
|
142
|
+
elif source == "kaggle":
|
|
143
|
+
ref = kaggle_ref or repo_id
|
|
144
|
+
summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
145
|
+
else:
|
|
146
|
+
summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
147
|
+
except Exception:
|
|
148
|
+
# Clean up empty directories on failure so we don't leave ghost artifacts
|
|
149
|
+
if images_dir.exists() and not any(images_dir.iterdir()):
|
|
150
|
+
shutil.rmtree(dataset_dir, ignore_errors=True)
|
|
151
|
+
raise
|
|
125
152
|
|
|
126
153
|
if output_format == "webdataset":
|
|
127
154
|
await self._write_webdataset(dataset_dir, images_dir, metadata_file)
|
|
@@ -150,10 +177,7 @@ class AssetDownloader:
|
|
|
150
177
|
max_items: Optional[int],
|
|
151
178
|
image_column: Optional[str],
|
|
152
179
|
) -> Dict[str, int]:
|
|
153
|
-
|
|
154
|
-
from datasets import load_dataset
|
|
155
|
-
except Exception as e:
|
|
156
|
-
raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
|
|
180
|
+
from datasets import load_dataset # validated in download_assets()
|
|
157
181
|
|
|
158
182
|
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
159
183
|
|
|
@@ -212,10 +236,7 @@ class AssetDownloader:
|
|
|
212
236
|
errors_file: Path,
|
|
213
237
|
max_items: Optional[int],
|
|
214
238
|
) -> Dict[str, int]:
|
|
215
|
-
|
|
216
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
217
|
-
except Exception as e:
|
|
218
|
-
raise RuntimeError(f"kaggle package is required: {e}")
|
|
239
|
+
from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
|
|
219
240
|
|
|
220
241
|
await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
|
|
221
242
|
|
|
@@ -353,24 +374,59 @@ class AssetDownloader:
|
|
|
353
374
|
raise ValueError(f"Unsupported image value type: {type(value)}")
|
|
354
375
|
|
|
355
376
|
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
356
|
-
|
|
357
|
-
|
|
377
|
+
"""Write a webdataset-compatible tar archive.
|
|
378
|
+
|
|
379
|
+
Uses Python's built-in tarfile module instead of wds.ShardWriter to
|
|
380
|
+
avoid the gopen() handler issue on Windows (backslash paths).
|
|
381
|
+
The resulting .tar files are fully compatible with webdataset readers.
|
|
382
|
+
"""
|
|
383
|
+
import io
|
|
384
|
+
import tarfile as _tarfile
|
|
385
|
+
|
|
386
|
+
max_per_shard = 5000
|
|
387
|
+
shard_idx = 0
|
|
388
|
+
count_in_shard = 0
|
|
389
|
+
current_tar: _tarfile.TarFile | None = None
|
|
390
|
+
|
|
391
|
+
def _open_shard() -> _tarfile.TarFile:
|
|
392
|
+
nonlocal shard_idx
|
|
393
|
+
shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
|
|
394
|
+
shard_idx += 1
|
|
395
|
+
return _tarfile.open(str(shard_path), "w")
|
|
358
396
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
ext
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
397
|
+
try:
|
|
398
|
+
current_tar = _open_shard()
|
|
399
|
+
|
|
400
|
+
with metadata_file.open("r", encoding="utf-8") as mf:
|
|
401
|
+
for line in mf:
|
|
402
|
+
row = json.loads(line)
|
|
403
|
+
image_path = Path(row["image_path"])
|
|
404
|
+
if not image_path.exists():
|
|
405
|
+
continue
|
|
406
|
+
|
|
407
|
+
key = image_path.stem
|
|
408
|
+
ext = image_path.suffix.lstrip(".") or "jpg"
|
|
409
|
+
|
|
410
|
+
# Add image file
|
|
411
|
+
img_data = image_path.read_bytes()
|
|
412
|
+
img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
|
|
413
|
+
img_info.size = len(img_data)
|
|
414
|
+
current_tar.addfile(img_info, io.BytesIO(img_data))
|
|
415
|
+
|
|
416
|
+
# Add JSON metadata sidecar
|
|
417
|
+
json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
|
|
418
|
+
json_info = _tarfile.TarInfo(name=f"{key}.json")
|
|
419
|
+
json_info.size = len(json_data)
|
|
420
|
+
current_tar.addfile(json_info, io.BytesIO(json_data))
|
|
421
|
+
|
|
422
|
+
count_in_shard += 1
|
|
423
|
+
if count_in_shard >= max_per_shard:
|
|
424
|
+
current_tar.close()
|
|
425
|
+
current_tar = _open_shard()
|
|
426
|
+
count_in_shard = 0
|
|
427
|
+
finally:
|
|
428
|
+
if current_tar is not None:
|
|
429
|
+
current_tar.close()
|
|
374
430
|
|
|
375
431
|
async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
|
|
376
432
|
try:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.5",
|
|
3
|
+
"version": "1.2.7",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
package/scripts/postinstall.cjs
CHANGED
|
@@ -101,27 +101,54 @@ class AssetDownloader:
|
|
|
101
101
|
if source not in {"huggingface", "kaggle", "url"}:
|
|
102
102
|
raise ValueError("source must be one of: huggingface, kaggle, url")
|
|
103
103
|
|
|
104
|
-
|
|
105
|
-
images_dir = dataset_dir / "images"
|
|
106
|
-
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
107
|
-
images_dir.mkdir(parents=True, exist_ok=True)
|
|
108
|
-
|
|
109
|
-
errors_file = dataset_dir / "errors.jsonl"
|
|
110
|
-
metadata_file = dataset_dir / "metadata.jsonl"
|
|
111
|
-
|
|
104
|
+
# --- Validate imports and args BEFORE creating any directories ---
|
|
112
105
|
if source == "huggingface":
|
|
113
106
|
if not repo_id:
|
|
114
107
|
raise ValueError("repo_id is required for source=huggingface")
|
|
115
|
-
|
|
108
|
+
try:
|
|
109
|
+
from datasets import load_dataset as _ld # noqa: F401
|
|
110
|
+
except Exception as e:
|
|
111
|
+
raise RuntimeError(
|
|
112
|
+
f"datasets package is required for HuggingFace downloads. "
|
|
113
|
+
f"Install with: pip install datasets. Details: {e}"
|
|
114
|
+
)
|
|
116
115
|
elif source == "kaggle":
|
|
117
116
|
ref = kaggle_ref or repo_id
|
|
118
117
|
if not ref:
|
|
119
118
|
raise ValueError("kaggle_ref is required for source=kaggle")
|
|
120
|
-
|
|
119
|
+
try:
|
|
120
|
+
from kaggle.api.kaggle_api_extended import KaggleApi as _Ka # noqa: F401
|
|
121
|
+
except Exception as e:
|
|
122
|
+
raise RuntimeError(
|
|
123
|
+
f"kaggle package is required for Kaggle downloads. "
|
|
124
|
+
f"Install with: pip install kaggle. Details: {e}"
|
|
125
|
+
)
|
|
121
126
|
else:
|
|
122
127
|
if not urls:
|
|
123
128
|
raise ValueError("urls are required for source=url")
|
|
124
|
-
|
|
129
|
+
|
|
130
|
+
# --- Now safe to create directories ---
|
|
131
|
+
dataset_dir = self.output_root / dataset_id.replace("/", "_").replace(":", "_")
|
|
132
|
+
images_dir = dataset_dir / "images"
|
|
133
|
+
dataset_dir.mkdir(parents=True, exist_ok=True)
|
|
134
|
+
images_dir.mkdir(parents=True, exist_ok=True)
|
|
135
|
+
|
|
136
|
+
errors_file = dataset_dir / "errors.jsonl"
|
|
137
|
+
metadata_file = dataset_dir / "metadata.jsonl"
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
if source == "huggingface":
|
|
141
|
+
summary = await self._download_huggingface(repo_id, dataset_id, images_dir, metadata_file, errors_file, max_items, image_column)
|
|
142
|
+
elif source == "kaggle":
|
|
143
|
+
ref = kaggle_ref or repo_id
|
|
144
|
+
summary = await self._download_kaggle(ref, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
145
|
+
else:
|
|
146
|
+
summary = await self._download_urls(urls, dataset_id, images_dir, metadata_file, errors_file, max_items)
|
|
147
|
+
except Exception:
|
|
148
|
+
# Clean up empty directories on failure so we don't leave ghost artifacts
|
|
149
|
+
if images_dir.exists() and not any(images_dir.iterdir()):
|
|
150
|
+
shutil.rmtree(dataset_dir, ignore_errors=True)
|
|
151
|
+
raise
|
|
125
152
|
|
|
126
153
|
if output_format == "webdataset":
|
|
127
154
|
await self._write_webdataset(dataset_dir, images_dir, metadata_file)
|
|
@@ -150,10 +177,7 @@ class AssetDownloader:
|
|
|
150
177
|
max_items: Optional[int],
|
|
151
178
|
image_column: Optional[str],
|
|
152
179
|
) -> Dict[str, int]:
|
|
153
|
-
|
|
154
|
-
from datasets import load_dataset
|
|
155
|
-
except Exception as e:
|
|
156
|
-
raise RuntimeError(f"datasets package is required. Install with: pip install datasets. Details: {e}")
|
|
180
|
+
from datasets import load_dataset # validated in download_assets()
|
|
157
181
|
|
|
158
182
|
await self._emit("start", {"source": "huggingface", "repo_id": repo_id})
|
|
159
183
|
|
|
@@ -212,10 +236,7 @@ class AssetDownloader:
|
|
|
212
236
|
errors_file: Path,
|
|
213
237
|
max_items: Optional[int],
|
|
214
238
|
) -> Dict[str, int]:
|
|
215
|
-
|
|
216
|
-
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
217
|
-
except Exception as e:
|
|
218
|
-
raise RuntimeError(f"kaggle package is required: {e}")
|
|
239
|
+
from kaggle.api.kaggle_api_extended import KaggleApi # validated in download_assets()
|
|
219
240
|
|
|
220
241
|
await self._emit("start", {"source": "kaggle", "dataset": kaggle_ref})
|
|
221
242
|
|
|
@@ -353,24 +374,59 @@ class AssetDownloader:
|
|
|
353
374
|
raise ValueError(f"Unsupported image value type: {type(value)}")
|
|
354
375
|
|
|
355
376
|
async def _write_webdataset(self, dataset_dir: Path, images_dir: Path, metadata_file: Path) -> None:
|
|
356
|
-
|
|
357
|
-
|
|
377
|
+
"""Write a webdataset-compatible tar archive.
|
|
378
|
+
|
|
379
|
+
Uses Python's built-in tarfile module instead of wds.ShardWriter to
|
|
380
|
+
avoid the gopen() handler issue on Windows (backslash paths).
|
|
381
|
+
The resulting .tar files are fully compatible with webdataset readers.
|
|
382
|
+
"""
|
|
383
|
+
import io
|
|
384
|
+
import tarfile as _tarfile
|
|
385
|
+
|
|
386
|
+
max_per_shard = 5000
|
|
387
|
+
shard_idx = 0
|
|
388
|
+
count_in_shard = 0
|
|
389
|
+
current_tar: _tarfile.TarFile | None = None
|
|
390
|
+
|
|
391
|
+
def _open_shard() -> _tarfile.TarFile:
|
|
392
|
+
nonlocal shard_idx
|
|
393
|
+
shard_path = dataset_dir / f"shard-{shard_idx:06d}.tar"
|
|
394
|
+
shard_idx += 1
|
|
395
|
+
return _tarfile.open(str(shard_path), "w")
|
|
358
396
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
ext
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
397
|
+
try:
|
|
398
|
+
current_tar = _open_shard()
|
|
399
|
+
|
|
400
|
+
with metadata_file.open("r", encoding="utf-8") as mf:
|
|
401
|
+
for line in mf:
|
|
402
|
+
row = json.loads(line)
|
|
403
|
+
image_path = Path(row["image_path"])
|
|
404
|
+
if not image_path.exists():
|
|
405
|
+
continue
|
|
406
|
+
|
|
407
|
+
key = image_path.stem
|
|
408
|
+
ext = image_path.suffix.lstrip(".") or "jpg"
|
|
409
|
+
|
|
410
|
+
# Add image file
|
|
411
|
+
img_data = image_path.read_bytes()
|
|
412
|
+
img_info = _tarfile.TarInfo(name=f"{key}.{ext}")
|
|
413
|
+
img_info.size = len(img_data)
|
|
414
|
+
current_tar.addfile(img_info, io.BytesIO(img_data))
|
|
415
|
+
|
|
416
|
+
# Add JSON metadata sidecar
|
|
417
|
+
json_data = json.dumps(row, ensure_ascii=False).encode("utf-8")
|
|
418
|
+
json_info = _tarfile.TarInfo(name=f"{key}.json")
|
|
419
|
+
json_info.size = len(json_data)
|
|
420
|
+
current_tar.addfile(json_info, io.BytesIO(json_data))
|
|
421
|
+
|
|
422
|
+
count_in_shard += 1
|
|
423
|
+
if count_in_shard >= max_per_shard:
|
|
424
|
+
current_tar.close()
|
|
425
|
+
current_tar = _open_shard()
|
|
426
|
+
count_in_shard = 0
|
|
427
|
+
finally:
|
|
428
|
+
if current_tar is not None:
|
|
429
|
+
current_tar.close()
|
|
374
430
|
|
|
375
431
|
async def _write_parquet(self, dataset_dir: Path, metadata_file: Path) -> None:
|
|
376
432
|
try:
|