vesper-wizard 2.0.8 → 2.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +970 -852
- package/build/metadata/scraper.js +49 -6
- package/build/python/export_engine.py +45 -0
- package/build/python/normalize_engine.py +83 -0
- package/build/search/engine.js +28 -0
- package/package.json +1 -1
- package/src/python/export_engine.py +45 -0
- package/src/python/normalize_engine.py +83 -0
|
@@ -150,18 +150,61 @@ export class HuggingFaceScraper {
|
|
|
150
150
|
initialDelay: 2000, // Start with 2 seconds for HF API
|
|
151
151
|
maxDelay: 30000 // Max 30 seconds
|
|
152
152
|
});
|
|
153
|
-
const
|
|
153
|
+
const cardData = fullInfo.cardData || {};
|
|
154
|
+
// Extract splits from cardData.dataset_info (where HF actually stores them)
|
|
155
|
+
// cardData.dataset_info can be an object (single config) or array (multi-config)
|
|
156
|
+
let rawSplits = [];
|
|
157
|
+
const datasetInfoField = cardData.dataset_info;
|
|
158
|
+
if (datasetInfoField) {
|
|
159
|
+
const configs = Array.isArray(datasetInfoField) ? datasetInfoField : [datasetInfoField];
|
|
160
|
+
for (const config of configs) {
|
|
161
|
+
if (config?.splits && Array.isArray(config.splits)) {
|
|
162
|
+
rawSplits = rawSplits.concat(config.splits);
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
// Fallback: try top-level splits from the SDK (rarely populated)
|
|
167
|
+
if (rawSplits.length === 0 && fullInfo.splits) {
|
|
168
|
+
rawSplits = fullInfo.splits;
|
|
169
|
+
}
|
|
170
|
+
const splits = rawSplits.map((s) => ({
|
|
154
171
|
name: s.name,
|
|
155
|
-
num_examples: s.numExamples || 0,
|
|
156
|
-
size_bytes: s.sizeBytes
|
|
157
|
-
}))
|
|
158
|
-
|
|
172
|
+
num_examples: s.num_examples || s.numExamples || 0,
|
|
173
|
+
size_bytes: s.num_bytes || s.sizeBytes || 0
|
|
174
|
+
}));
|
|
175
|
+
let totalExamples = splits.reduce((sum, s) => sum + (s.num_examples || 0), 0);
|
|
159
176
|
const totalSizeBytes = splits.reduce((sum, s) => sum + (s.size_bytes || 0), 0);
|
|
177
|
+
// Fallback: estimate from size_categories when splits give 0
|
|
178
|
+
if (totalExamples === 0) {
|
|
179
|
+
const sizeCategories = cardData.size_categories;
|
|
180
|
+
if (Array.isArray(sizeCategories) && sizeCategories.length > 0) {
|
|
181
|
+
const cat = sizeCategories[0];
|
|
182
|
+
const rangeMatch = cat.match(/([\d.]+[KMB]?)\s*<\s*n\s*<\s*([\d.]+[KMB]?)/i);
|
|
183
|
+
if (rangeMatch) {
|
|
184
|
+
const parseHumanNum = (s) => {
|
|
185
|
+
const m = s.match(/^([\d.]+)([KMB])?$/i);
|
|
186
|
+
if (!m)
|
|
187
|
+
return 0;
|
|
188
|
+
const base = parseFloat(m[1]);
|
|
189
|
+
const suffix = (m[2] || '').toUpperCase();
|
|
190
|
+
if (suffix === 'K')
|
|
191
|
+
return base * 1000;
|
|
192
|
+
if (suffix === 'M')
|
|
193
|
+
return base * 1_000_000;
|
|
194
|
+
if (suffix === 'B')
|
|
195
|
+
return base * 1_000_000_000;
|
|
196
|
+
return base;
|
|
197
|
+
};
|
|
198
|
+
const lo = parseHumanNum(rangeMatch[1]);
|
|
199
|
+
const hi = parseHumanNum(rangeMatch[2]);
|
|
200
|
+
totalExamples = Math.round((lo + hi) / 2);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
}
|
|
160
204
|
const totalSizeMB = totalSizeBytes ? Math.round(totalSizeBytes / (1024 * 1024) * 100) / 100 : undefined;
|
|
161
205
|
const hasValidationSplit = splits.some((s) => s.name === "validation" || s.name === "val");
|
|
162
206
|
const licenseTag = tags.find(t => t.startsWith("license:"));
|
|
163
207
|
const licenseId = licenseTag ? licenseTag.replace("license:", "") : fullInfo.license;
|
|
164
|
-
const cardData = fullInfo.cardData || {};
|
|
165
208
|
const licenseUrl = cardData.license?.[0]?.link || cardData.license_link;
|
|
166
209
|
const license = categorizeLicense(licenseId, licenseUrl);
|
|
167
210
|
if (license.category === "restricted") {
|
|
@@ -50,6 +50,51 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
50
50
|
df = pl.read_ipc(file_path)
|
|
51
51
|
elif ext == ".jsonl":
|
|
52
52
|
df = pl.read_ndjson(file_path)
|
|
53
|
+
elif ext == ".json":
|
|
54
|
+
# Auto-detect: array-of-objects vs NDJSON vs nested structures
|
|
55
|
+
try:
|
|
56
|
+
import json as _json
|
|
57
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
58
|
+
raw_text = fh.read(512) # peek
|
|
59
|
+
stripped = raw_text.lstrip()
|
|
60
|
+
if stripped.startswith("["):
|
|
61
|
+
# Array of objects — standard JSON
|
|
62
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
63
|
+
data = _json.load(fh)
|
|
64
|
+
if isinstance(data, list) and len(data) > 0:
|
|
65
|
+
df = pl.DataFrame(data)
|
|
66
|
+
else:
|
|
67
|
+
raise ValueError("JSON file is empty or not an array of objects")
|
|
68
|
+
elif stripped.startswith("{"):
|
|
69
|
+
# Could be NDJSON or a single object wrapping rows
|
|
70
|
+
try:
|
|
71
|
+
df = pl.read_ndjson(file_path)
|
|
72
|
+
except Exception:
|
|
73
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
74
|
+
data = _json.load(fh)
|
|
75
|
+
# Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
|
|
76
|
+
rows = None
|
|
77
|
+
if isinstance(data, dict):
|
|
78
|
+
for key in ("data", "rows", "records", "items", "results", "entries"):
|
|
79
|
+
if key in data and isinstance(data[key], list):
|
|
80
|
+
rows = data[key]
|
|
81
|
+
break
|
|
82
|
+
if rows is None:
|
|
83
|
+
# Last resort: try to use the dict values
|
|
84
|
+
rows = [data]
|
|
85
|
+
if rows and len(rows) > 0:
|
|
86
|
+
df = pl.DataFrame(rows)
|
|
87
|
+
else:
|
|
88
|
+
raise ValueError("Could not parse JSON structure into tabular data")
|
|
89
|
+
else:
|
|
90
|
+
raise ValueError("JSON file does not start with [ or {")
|
|
91
|
+
except pl.exceptions.ComputeError as ce:
|
|
92
|
+
raise ValueError(f"Failed to parse JSON: {ce}")
|
|
93
|
+
elif ext == ".xlsx":
|
|
94
|
+
try:
|
|
95
|
+
df = pl.read_excel(file_path)
|
|
96
|
+
except Exception as e:
|
|
97
|
+
raise ValueError(f"Failed to read Excel file: {e}")
|
|
53
98
|
else:
|
|
54
99
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
55
100
|
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize any supported dataset file to parquet format.
|
|
3
|
+
Usage: normalize_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Load a dataset file into a polars DataFrame based on its extension.

    Supports csv/tsv, parquet, feather/ipc, jsonl/ndjson, json (with
    auto-detection of array, NDJSON, and wrapper-object layouts), and txt.
    Unknown extensions fall back to a CSV parse attempt.

    Raises whatever the underlying polars/json parser raises on bad input.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Fix: the original leaked the file handle (open().read() with no close).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            # Standard JSON: array of objects.
            return pl.read_json(src)
        # NDJSON heuristic: first line is itself a complete JSON object.
        # Fix: a pretty-printed wrapper object ({\n "data": [...]}) also
        # matches this check; guard with try/except so it falls through to
        # the wrapper handling below (mirrors export_engine.py's approach).
        if "\n" in raw and raw.split("\n", 1)[0].strip().startswith("{"):
            try:
                return pl.read_ndjson(src)
            except Exception:
                pass  # not NDJSON after all
        # Wrapper object such as {"data": [...]}: look for a known list key.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort: first list-of-dicts value found in the object.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv for unknown extensions.
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize(input_path: str, output_path: str):
    """Normalize *input_path* into a parquet file at *output_path*.

    Returns the number of rows written (``df.height``).
    """
    df = _load(input_path)
    # Fix: os.makedirs("") raises FileNotFoundError when output_path has no
    # directory component (e.g. a bare filename); only create when non-empty.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
    """CLI entry point: read <input> <output> from argv, emit a JSON result.

    Prints {"ok": true, ...} on success; on any failure prints
    {"ok": false, "error": ...} and exits with status 1.
    """
    args = sys.argv[1:]
    if len(args) < 2:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = args[0], args[1]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        rows = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": rows}))
    except Exception as exc:
        print(json.dumps({"ok": False, "error": str(exc)}))
        sys.exit(1)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Script entry point: allows importing this module without side effects.
if __name__ == "__main__":
    main()
|
package/build/search/engine.js
CHANGED
|
@@ -96,6 +96,34 @@ export class SearchEngine {
|
|
|
96
96
|
if (lexicalScore === 0 && positiveKeywords.length > 1) {
|
|
97
97
|
penalty += 0.2;
|
|
98
98
|
}
|
|
99
|
+
// Penalty: Modality Mismatch
|
|
100
|
+
// Infer the expected modality from the query and penalize mismatches.
|
|
101
|
+
// e.g., "anime quotes" is text, so image-classification datasets get penalized.
|
|
102
|
+
const textIndicators = ["quotes", "text", "nlp", "sentiment", "review", "comment", "caption", "dialogue", "chat", "translation", "summarization", "classification"];
|
|
103
|
+
const imageIndicators = ["image", "photo", "picture", "vision", "detection", "segmentation", "face", "background"];
|
|
104
|
+
const queryLower = query.toLowerCase();
|
|
105
|
+
const queryLooksText = textIndicators.some(t => queryLower.includes(t));
|
|
106
|
+
const queryLooksImage = imageIndicators.some(t => queryLower.includes(t));
|
|
107
|
+
if (queryLooksText && !queryLooksImage) {
|
|
108
|
+
const resultTask = (metadata.task || "").toLowerCase();
|
|
109
|
+
const isImageResult = resultTask.includes("image") || resultTask.includes("object-detection") ||
|
|
110
|
+
text.includes("image classification") || text.includes("image-classification") ||
|
|
111
|
+
text.includes("object detection") || text.includes("image segmentation");
|
|
112
|
+
if (isImageResult) {
|
|
113
|
+
log(`Modality penalty: text query but image dataset ${match.id}`);
|
|
114
|
+
penalty += 0.35;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
if (queryLooksImage && !queryLooksText) {
|
|
118
|
+
const resultTask = (metadata.task || "").toLowerCase();
|
|
119
|
+
const isTextResult = resultTask.includes("text-classification") || resultTask.includes("text-generation") ||
|
|
120
|
+
resultTask.includes("translation") || resultTask.includes("summarization") ||
|
|
121
|
+
resultTask.includes("question-answering");
|
|
122
|
+
if (isTextResult) {
|
|
123
|
+
log(`Modality penalty: image query but text dataset ${match.id}`);
|
|
124
|
+
penalty += 0.35;
|
|
125
|
+
}
|
|
126
|
+
}
|
|
99
127
|
// D. Accessibility Bonuses (Prioritize low-friction sources)
|
|
100
128
|
let bonus = 0;
|
|
101
129
|
const sourceBonuses = {
|
package/package.json
CHANGED
|
@@ -50,6 +50,51 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
50
50
|
df = pl.read_ipc(file_path)
|
|
51
51
|
elif ext == ".jsonl":
|
|
52
52
|
df = pl.read_ndjson(file_path)
|
|
53
|
+
elif ext == ".json":
|
|
54
|
+
# Auto-detect: array-of-objects vs NDJSON vs nested structures
|
|
55
|
+
try:
|
|
56
|
+
import json as _json
|
|
57
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
58
|
+
raw_text = fh.read(512) # peek
|
|
59
|
+
stripped = raw_text.lstrip()
|
|
60
|
+
if stripped.startswith("["):
|
|
61
|
+
# Array of objects — standard JSON
|
|
62
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
63
|
+
data = _json.load(fh)
|
|
64
|
+
if isinstance(data, list) and len(data) > 0:
|
|
65
|
+
df = pl.DataFrame(data)
|
|
66
|
+
else:
|
|
67
|
+
raise ValueError("JSON file is empty or not an array of objects")
|
|
68
|
+
elif stripped.startswith("{"):
|
|
69
|
+
# Could be NDJSON or a single object wrapping rows
|
|
70
|
+
try:
|
|
71
|
+
df = pl.read_ndjson(file_path)
|
|
72
|
+
except Exception:
|
|
73
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
74
|
+
data = _json.load(fh)
|
|
75
|
+
# Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
|
|
76
|
+
rows = None
|
|
77
|
+
if isinstance(data, dict):
|
|
78
|
+
for key in ("data", "rows", "records", "items", "results", "entries"):
|
|
79
|
+
if key in data and isinstance(data[key], list):
|
|
80
|
+
rows = data[key]
|
|
81
|
+
break
|
|
82
|
+
if rows is None:
|
|
83
|
+
# Last resort: try to use the dict values
|
|
84
|
+
rows = [data]
|
|
85
|
+
if rows and len(rows) > 0:
|
|
86
|
+
df = pl.DataFrame(rows)
|
|
87
|
+
else:
|
|
88
|
+
raise ValueError("Could not parse JSON structure into tabular data")
|
|
89
|
+
else:
|
|
90
|
+
raise ValueError("JSON file does not start with [ or {")
|
|
91
|
+
except pl.exceptions.ComputeError as ce:
|
|
92
|
+
raise ValueError(f"Failed to parse JSON: {ce}")
|
|
93
|
+
elif ext == ".xlsx":
|
|
94
|
+
try:
|
|
95
|
+
df = pl.read_excel(file_path)
|
|
96
|
+
except Exception as e:
|
|
97
|
+
raise ValueError(f"Failed to read Excel file: {e}")
|
|
53
98
|
else:
|
|
54
99
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
55
100
|
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize any supported dataset file to parquet format.
|
|
3
|
+
Usage: normalize_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Load a dataset file into a polars DataFrame based on its extension.

    Supports csv/tsv, parquet, feather/ipc, jsonl/ndjson, json (with
    auto-detection of array, NDJSON, and wrapper-object layouts), and txt.
    Unknown extensions fall back to a CSV parse attempt.

    Raises whatever the underlying polars/json parser raises on bad input.
    """
    ext = os.path.splitext(src)[1].lower()

    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Fix: the original leaked the file handle (open().read() with no close).
        with open(src, "r", encoding="utf-8") as fh:
            raw = fh.read().strip()
        if raw.startswith("["):
            # Standard JSON: array of objects.
            return pl.read_json(src)
        # NDJSON heuristic: first line is itself a complete JSON object.
        # Fix: a pretty-printed wrapper object ({\n "data": [...]}) also
        # matches this check; guard with try/except so it falls through to
        # the wrapper handling below (mirrors export_engine.py's approach).
        if "\n" in raw and raw.split("\n", 1)[0].strip().startswith("{"):
            try:
                return pl.read_ndjson(src)
            except Exception:
                pass  # not NDJSON after all
        # Wrapper object such as {"data": [...]}: look for a known list key.
        obj = json.loads(raw)
        if isinstance(obj, dict):
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Last resort: first list-of-dicts value found in the object.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    if ext == ".txt":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)

    # Fallback: try csv for unknown extensions.
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def normalize(input_path: str, output_path: str):
    """Normalize *input_path* into a parquet file at *output_path*.

    Returns the number of rows written (``df.height``).
    """
    df = _load(input_path)
    # Fix: os.makedirs("") raises FileNotFoundError when output_path has no
    # directory component (e.g. a bare filename); only create when non-empty.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.write_parquet(output_path)
    return df.height
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main():
    """CLI entry point: read <input> <output> from argv, emit a JSON result.

    Prints {"ok": true, ...} on success; on any failure prints
    {"ok": false, "error": ...} and exits with status 1.
    """
    args = sys.argv[1:]
    if len(args) < 2:
        print(json.dumps({"ok": False, "error": "Usage: normalize_engine.py <input> <output>"}))
        sys.exit(1)

    input_path, output_path = args[0], args[1]

    if not os.path.exists(input_path):
        print(json.dumps({"ok": False, "error": f"File not found: {input_path}"}))
        sys.exit(1)

    try:
        rows = normalize(input_path, output_path)
        print(json.dumps({"ok": True, "output_path": output_path, "rows": rows}))
    except Exception as exc:
        print(json.dumps({"ok": False, "error": str(exc)}))
        sys.exit(1)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Script entry point: allows importing this module without side effects.
if __name__ == "__main__":
    main()
|