vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
package/scripts/postinstall.cjs
DELETED
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
const { execSync } = require('child_process');
|
|
4
|
-
const fs = require('fs');
|
|
5
|
-
const os = require('os');
|
|
6
|
-
const path = require('path');
|
|
7
|
-
|
|
8
|
-
// Announce the start of the postinstall run before any checks execute.
console.log('\n🚀 Setting up Vesper MCP Server...\n');
|
|
9
|
-
|
|
10
|
-
/**
 * Find a Python launcher command that can be executed on this machine.
 *
 * Candidates are probed in order by running `<command> --version`; the first
 * one that exits without an error wins.
 *
 * @returns {string|null} The working launcher command, or null when none of the candidates ran.
 */
function getPythonBootstrapCommand() {
  const candidates = process.platform === 'win32'
    ? ['py -3', 'python']
    : ['python3', 'python'];

  // A launcher counts as usable when its version probe does not throw.
  const responds = (launcher) => {
    try {
      execSync(`${launcher} --version`, { stdio: 'pipe' });
      return true;
    } catch {
      return false;
    }
  };

  const firstWorking = candidates.find(responds);
  return firstWorking === undefined ? null : firstWorking;
}
|
|
26
|
-
|
|
27
|
-
// 1. Check for Python
// Resolved once here; the later venv and pip steps reuse this launcher command.
const pythonBootstrap = getPythonBootstrapCommand();
if (pythonBootstrap) {
  console.log('✅ Python found');
} else {
  // A missing interpreter is reported but must not fail `npm install`,
  // so the script stops here with a success exit code.
  console.warn('⚠️ Python not found. Please install Python 3.8+ for full functionality.');
  console.warn(' Image/audio/video analysis features will not work without Python.\n');
  process.exit(0); // Don't fail installation
}
|
|
39
|
-
|
|
40
|
-
// Filesystem locations shared by the remaining steps: the per-user Vesper data
// directory, the managed virtual environment inside it, that environment's
// interpreter, and the requirements file resolved relative to this script.
const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE;
const vesperDataDir = path.join(homeDir, '.vesper');
const managedVenvDir = path.join(vesperDataDir, '.venv');
// The interpreter path inside the venv differs between Windows and other platforms.
const managedPython = path.join(
  managedVenvDir,
  ...(process.platform === 'win32' ? ['Scripts', 'python.exe'] : ['bin', 'python'])
);
const requirementsPath = path.resolve(__dirname, '..', 'src', 'python', 'requirements.txt');
|
|
47
|
-
|
|
48
|
-
// 2. Create data directories
const dirs = [
  vesperDataDir,
  path.join(vesperDataDir, 'data'),
  path.join(vesperDataDir, 'data', 'raw'),
  path.join(vesperDataDir, 'data', 'processed'),
  path.join(vesperDataDir, 'datasets')
];

try {
  // `recursive: true` makes mkdirSync succeed when the directory already
  // exists, so no separate existence check is needed.
  for (const dir of dirs) {
    fs.mkdirSync(dir, { recursive: true });
  }
  console.log(`✅ Data directories created at ${vesperDataDir}`);
} catch (e) {
  // Like every other step in this script, a failure here is reported but
  // does not abort the package installation.
  console.warn(`⚠️ Could not create data directories at ${vesperDataDir}: ${(e && e.message) || e}`);
}
|
|
64
|
-
|
|
65
|
-
// 3. Create a managed Vesper Python environment
console.log('\n🐍 Preparing managed Python environment...');
try {
  // The venv interpreter doubles as the "already created" marker: the venv is
  // only built when that file is missing.
  if (!fs.existsSync(managedPython)) {
    execSync(`${pythonBootstrap} -m venv "${managedVenvDir}"`, {
      stdio: 'inherit',
      timeout: 180000,
    });
  }
  console.log(`✅ Managed Python ready at ${managedVenvDir}`);
} catch (e) {
  console.warn('⚠️ Failed to create the managed Vesper Python environment.');
  // trimEnd (not trim) drops the trailing space left by an empty error message
  // while keeping the leading indent used by the script's continuation lines.
  console.warn(` Vesper will fall back to PATH Python and may need to self-heal at runtime. ${(e && e.message) || ''}`.trimEnd());
}
|
|
79
|
-
|
|
80
|
-
// 4. Install Python dependencies into the managed environment
console.log('\n📦 Installing Python dependencies...');
const pythonPackages = [
  'opencv-python',
  'pillow',
  'librosa',
  'soundfile',
  'pyarrow'
];

// Prefer the managed venv interpreter; fall back to the PATH launcher when the
// venv is absent. The venv path is quoted because it lives under the user's
// home directory, which may contain spaces. The same quoted form is used for
// the manual-recovery hint so the printed command can be pasted as-is.
const targetPython = fs.existsSync(managedPython) ? `"${managedPython}"` : pythonBootstrap;
const installTargets = `-r "${requirementsPath}" ${pythonPackages.join(' ')}`;

try {
  execSync(`${targetPython} -m pip install --disable-pip-version-check --upgrade pip`, {
    stdio: 'inherit',
    timeout: 180000,
  });
  execSync(`${targetPython} -m pip install --disable-pip-version-check ${installTargets}`, {
    stdio: 'inherit',
    timeout: 600000,
  });
  console.log('✅ Python dependencies installed');
} catch (e) {
  // Best effort: report the failure and show the equivalent manual command.
  console.warn('⚠️ Failed to install some Python dependencies.');
  console.warn(' You may need to install them manually into the Vesper runtime:');
  console.warn(` ${targetPython} -m pip install ${installTargets}\n`);
}
|
|
106
|
-
|
|
107
|
-
// 5. Rebuild better-sqlite3 for current Node.js version
console.log('\n🔧 Rebuilding native modules for current Node.js...');
try {
  // Run the rebuild from the package root (one level above this script).
  const packageRoot = path.resolve(__dirname, '..');
  execSync('npm rebuild better-sqlite3', { cwd: packageRoot, stdio: 'pipe', timeout: 60000 });
  console.log('✅ Native modules rebuilt successfully');
} catch (rebuildError) {
  // A failed rebuild is reported with a manual remedy; the install continues.
  const reason = rebuildError.message || rebuildError;
  console.warn('⚠️ Could not rebuild better-sqlite3: ' + reason);
  console.warn(' If you see ERR_DLOPEN_FAILED, run: npm rebuild better-sqlite3');
}
|
|
120
|
-
|
|
121
|
-
// 6. Auto-configure Claude Desktop (Best Effort)
|
|
122
|
-
// Announce the Claude Desktop configuration attempt that follows.
console.log('\n⚙️ Attempting to auto-configure Claude Desktop...');
|
|
123
|
-
|
|
124
|
-
/**
 * Work out where the Claude Desktop configuration file would live.
 *
 * Only Windows (under %APPDATA%) and macOS (under ~/Library/Application
 * Support) are handled. When the environment variable that anchors the path
 * is unset, null is returned instead of letting `path.join` throw on an
 * undefined segment.
 *
 * @returns {string|null} Path to claude_desktop_config.json, or null when the
 *   platform is not handled or the base directory cannot be determined.
 */
function getClaudeConfigPath() {
  const configFile = 'claude_desktop_config.json';
  const platform = process.platform;

  if (platform === 'win32') {
    const appData = process.env.APPDATA;
    return appData ? path.join(appData, 'Claude', configFile) : null;
  }

  if (platform === 'darwin') {
    const home = process.env.HOME || process.env.USERPROFILE;
    return home
      ? path.join(home, 'Library', 'Application Support', 'Claude', configFile)
      : null;
  }

  return null;
}
|
|
135
|
-
|
|
136
|
-
// Register a `vesper` MCP server entry in an existing Claude Desktop config.
// The file is only modified when it already exists and has no `vesper` entry.
const configPath = getClaudeConfigPath();

if (!configPath || !fs.existsSync(configPath)) {
  console.log('ℹ️ Claude Desktop config not found (skipping auto-config)');
} else {
  try {
    const desktopConfig = JSON.parse(fs.readFileSync(configPath, 'utf8'));

    if (!desktopConfig.mcpServers) {
      desktopConfig.mcpServers = {};
    }

    if (desktopConfig.mcpServers.vesper) {
      console.log(`ℹ️ 'vesper' is already configured in ${configPath}`);
    } else {
      desktopConfig.mcpServers.vesper = {
        command: "vesper",
        args: [],
        env: { "HF_TOKEN": "" },
      };

      fs.writeFileSync(configPath, JSON.stringify(desktopConfig, null, 2));
      console.log(`✅ Automatically added 'vesper' to ${configPath}`);
    }
  } catch (e) {
    // Unreadable or malformed config: warn and leave the file untouched.
    console.warn(`⚠️ Could not auto-configure Claude Desktop: ${e.message}`);
  }
}
|
|
165
|
-
|
|
166
|
-
// Final summary and next steps, printed one message per console.log call.
[
  '\n✨ Vesper MCP Server installed successfully!\n',
  '📖 Next steps:',
  ' 1. Restart your AI assistant (Cursor/Claude)',
  ' 2. Try: search_datasets(query="sentiment analysis")',
  '\n💡 For full documentation, visit: https://github.com/vesper/mcp-server\n',
].forEach((line) => console.log(line));
|
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
const fs = require('fs');
|
|
2
|
-
const path = require('path');
|
|
3
|
-
const os = require('os');
|
|
4
|
-
|
|
5
|
-
const { argv, cwd } = process;
|
|
6
|
-
|
|
7
|
-
/**
 * Print the command-line help text for this script to stdout.
 */
function usage() {
  const helpLines = [
    'Usage: node scripts/preindex_registry.cjs [--scan dir1 dir2 ...] [--target N] [--out path] [--no-count]',
    '',
    'Options:',
    '--scan Directories to recursively scan for datasets (default: ./e2e_demo_output ./datasets)',
    '--target Target total registry entries (if larger than scanned, will synthesize entries)',
    '--out Output registry path (default: ~/.vesper/registry.json)',
    '--no-count Skip expensive row counting for CSV/JSONL',
    '', // the help text ends with a newline of its own
  ];
  console.log(helpLines.join('\n'));
}
|
|
17
|
-
|
|
18
|
-
// Command-line state, filled in by the argument loop below.
let scanDirs = [];
let target = 0;
let outPath = path.join(os.homedir(), '.vesper', 'registry.json');
let doCount = true;

for (let i = 2; i < argv.length; i++) {
  const a = argv[i];
  if (a === '--scan') {
    // Consume every following argument up to the next `--flag`.
    while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
      scanDirs.push(argv[++i]);
    }
  } else if (a === '--target' || a === '--out') {
    // Both flags need a value. Without this check `--out` would crash inside
    // path.resolve(undefined) and `--target` would silently swallow the next flag.
    const value = argv[i + 1];
    if (value === undefined || value.startsWith('--')) {
      console.error('Missing value for', a);
      usage();
      process.exit(2);
    }
    i++;
    if (a === '--target') {
      // Non-numeric input falls back to 0, which disables synthesis.
      target = parseInt(value, 10) || 0;
    } else {
      outPath = path.resolve(value);
    }
  } else if (a === '--no-count') {
    doCount = false;
  } else if (a === '--help' || a === '-h') {
    usage();
    process.exit(0);
  } else {
    console.error('Unknown arg', a);
    usage();
    process.exit(2);
  }
}

// With no --scan directories given, fall back to the two default folders
// under the current working directory.
if (scanDirs.length === 0) scanDirs = [path.join(cwd(), 'e2e_demo_output'), path.join(cwd(), 'datasets')];
|
|
49
|
-
|
|
50
|
-
/**
 * Turn an arbitrary string (typically a relative file path) into a registry id.
 *
 * Runs of characters other than ASCII letters and digits act as separators;
 * the remaining segments are joined with single underscores and lower-cased,
 * so the result never starts or ends with an underscore.
 *
 * @param {string} s - Text to normalize.
 * @returns {string} Lower-case id made of [a-z0-9] segments joined by `_`.
 */
function normalizeId(s) {
  const segments = s.split(/[^a-z0-9]+/i).filter((segment) => segment.length > 0);
  return segments.join('_').toLowerCase();
}
|
|
53
|
-
|
|
54
|
-
/**
 * Recursively collect files under `dir` whose extension is listed in `exts`.
 *
 * Extensions are compared in lower case. Directories that cannot be read are
 * skipped (best effort). Matches are appended to a single accumulator one at
 * a time, so very large trees cannot overflow the argument limit the way
 * `push(...bigArray)` can.
 *
 * @param {string} dir - Directory to start from.
 * @param {string[]} [exts] - Lower-case extensions (with leading dot) to keep.
 * @returns {string[]} Paths of matching files, in directory-listing order.
 */
function walk(dir, exts = ['.csv', '.jsonl', '.json', '.arrow', '.parquet', '.feather']) {
  const results = [];

  const visit = (current) => {
    let items;
    try {
      items = fs.readdirSync(current, { withFileTypes: true });
    } catch (e) {
      // Unreadable or missing directory: skip it and keep whatever else was found.
      return;
    }
    for (const it of items) {
      const p = path.join(current, it.name);
      if (it.isDirectory()) {
        visit(p);
      } else if (it.isFile() && exts.includes(path.extname(it.name).toLowerCase())) {
        results.push(p);
      }
    }
  };

  visit(dir);
  return results;
}
|
|
71
|
-
|
|
72
|
-
/**
 * Count the lines of a text file by streaming it.
 *
 * Every newline character ends a line; a final line without a trailing
 * newline is counted as well. No header is subtracted, so for a CSV file the
 * header line is included in the result.
 *
 * @param {string} filePath - File to read as UTF-8.
 * @returns {Promise<number>} Resolves with the number of lines; rejects with the stream error.
 */
function countCsvRows(filePath) {
  return new Promise((resolve, reject) => {
    let count = 0;
    // Starts as '\n' so that an empty file reports zero lines.
    let lastChar = '\n';
    const rs = fs.createReadStream(filePath, { encoding: 'utf8' });
    rs.on('data', chunk => {
      let at = chunk.indexOf('\n');
      while (at !== -1) {
        count++;
        at = chunk.indexOf('\n', at + 1);
      }
      if (chunk.length > 0) lastChar = chunk[chunk.length - 1];
    });
    // An unterminated last line still counts as a line.
    rs.on('end', () => resolve(lastChar === '\n' ? count : count + 1));
    rs.on('error', reject);
  });
}
|
|
83
|
-
|
|
84
|
-
/**
 * Build or refresh the registry file at `outPath`.
 *
 * 1. Load the existing registry (if readable) keyed by normalized id.
 * 2. Scan every directory in `scanDirs` and upsert one entry per dataset file,
 *    optionally recording row and column counts for CSV/JSONL files.
 * 3. When `target` exceeds the number of entries, pad the registry with
 *    synthesized placeholder entries.
 * 4. Write the registry back as pretty-printed JSON and print a summary.
 */
(async function main() {
  // Read only the start of a file to get its first line, so large CSV files
  // are not loaded into memory just to inspect the header. A header longer
  // than `maxBytes` is truncated.
  function readHeaderLine(filePath, maxBytes = 65536) {
    const fd = fs.openSync(filePath, 'r');
    try {
      const buffer = Buffer.alloc(maxBytes);
      const bytesRead = fs.readSync(fd, buffer, 0, maxBytes, 0);
      return buffer.toString('utf8', 0, bytesRead).split(/\r?\n/, 1)[0] || '';
    } finally {
      fs.closeSync(fd);
    }
  }

  const registryDir = path.dirname(outPath);
  if (!fs.existsSync(registryDir)) fs.mkdirSync(registryDir, { recursive: true });

  let existing = [];
  if (fs.existsSync(outPath)) {
    try { existing = JSON.parse(fs.readFileSync(outPath, 'utf8')); } catch (e) { existing = []; }
  }
  // A registry file holding anything but an array is treated as empty.
  if (!Array.isArray(existing)) existing = [];

  const map = new Map();
  for (const e of existing) {
    if (e === null || typeof e !== 'object') continue; // skip malformed entries
    map.set(e.normalized_id || e.id, e);
  }

  let scanned = 0;
  for (const dir of scanDirs) {
    const abs = path.resolve(dir);
    for (const f of walk(abs)) {
      let stats;
      try {
        stats = fs.statSync(f);
      } catch (e) {
        // The file disappeared between listing and stat; skip it.
        continue;
      }
      // Compare the extension in lower case, like walk() does, so that
      // files such as DATA.CSV also get counted.
      const ext = path.extname(f).toLowerCase();
      const base = path.basename(f, path.extname(f));
      const rel = path.relative(process.cwd(), f);
      const id = normalizeId(rel || base);
      let cols = null;
      let rows = null;
      if (doCount && (ext === '.csv' || ext === '.jsonl')) {
        try {
          if (ext === '.csv') {
            const header = readHeaderLine(f);
            cols = header ? header.split(',').length : 0;
          }
          rows = await countCsvRows(f);
        } catch (e) {
          // Counting is best effort: unreadable files keep null counts.
        }
      }
      map.set(id, {
        id: id,
        normalized_id: id,
        source: 'scanned',
        path: f,
        size: stats.size,
        mtime: stats.mtime.toISOString(),
        meta: { rows, cols }
      });
      scanned++;
    }
  }

  // Synthesize if target requested
  let synthesized = 0;
  if (target > map.size) {
    const synthDir = path.join(path.dirname(outPath), 'local_library');
    if (!fs.existsSync(synthDir)) fs.mkdirSync(synthDir, { recursive: true });
    // Number ids consecutively from the current size, skipping any id that is
    // already taken, until the registry reaches the requested size.
    let idx = map.size;
    while (map.size < target) {
      idx++;
      const id = `synth_${String(idx).padStart(6, '0')}`;
      if (map.has(id)) continue;
      map.set(id, {
        id,
        normalized_id: id,
        source: 'synthesized',
        path: path.join(synthDir, `${id}.csv`),
        size: 0,
        mtime: new Date().toISOString(),
        // Placeholder figures; no file is created for synthesized entries.
        meta: { rows: Math.floor(Math.random() * 1000000), cols: Math.floor(Math.random() * 200) + 1 }
      });
      synthesized++;
    }
  }

  const outArr = Array.from(map.values());
  fs.writeFileSync(outPath, JSON.stringify(outArr, null, 2), 'utf8');
  console.log(`Wrote ${outArr.length} registry entries to ${outPath} (${scanned} scanned, ${synthesized} synthesized)`);
})().catch((err) => {
  // Report the failure and exit non-zero instead of leaving an unhandled rejection.
  console.error('preindex_registry failed:', (err && err.message) || err);
  process.exit(1);
});
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
const { spawnSync } = require("child_process");
|
|
4
|
-
const fs = require("fs");
|
|
5
|
-
const path = require("path");
|
|
6
|
-
const os = require("os");
|
|
7
|
-
const Database = require("better-sqlite3");
|
|
8
|
-
|
|
9
|
-
/**
 * Run a command synchronously and throw unless it exits with status 0.
 *
 * Output is inherited from this process by default and a shell is used on
 * Windows; both can be overridden through `options`, which is spread into
 * the spawnSync options last.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} [options] - Extra spawnSync options.
 * @throws {Error} When the process cannot be started, is terminated by a
 *   signal, or exits with a non-zero status.
 */
function runCommand(command, args, options = {}) {
  const result = spawnSync(command, args, {
    stdio: "inherit",
    shell: process.platform === "win32",
    ...options,
  });

  const rendered = `${command} ${args.join(" ")}`;

  // spawnSync reports launch failures (e.g. ENOENT) through `error` with a
  // null status; surface the cause instead of "exit null".
  if (result.error) {
    throw new Error(`Command failed: ${rendered} (${result.error.message})`);
  }

  if (result.status !== 0) {
    const outcome = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
    throw new Error(`Command failed: ${rendered} (${outcome})`);
  }
}
|
|
20
|
-
|
|
21
|
-
/**
 * Count the rows of the `datasets` table in a SQLite database file.
 *
 * @param {string} dbPath - Path to the database file.
 * @returns {number|string} The row count, or "N/A" when the file does not exist.
 */
function countDatasets(dbPath) {
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Close the handle even when the query throws (e.g. the table is missing).
    db.close();
  }
}
|
|
28
|
-
|
|
29
|
-
/**
 * Read the number of vectors recorded in a vector-index JSON file.
 *
 * A numeric `count` field is used when present, otherwise the length of an
 * `ids` array. Malformed JSON still throws from JSON.parse.
 *
 * @param {string} jsonPath - Path to the JSON file.
 * @returns {number|string} The vector count, or "N/A" when the file is
 *   missing or holds neither field.
 */
function countVectors(jsonPath) {
  if (!fs.existsSync(jsonPath)) return "N/A";
  const data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
  // Valid JSON can still be `null` or a primitive; property access on null would throw.
  if (data === null || typeof data !== "object") return "N/A";
  if (typeof data.count === "number") return data.count;
  if (Array.isArray(data.ids)) return data.ids.length;
  return "N/A";
}
|
|
36
|
-
|
|
37
|
-
/**
 * Copy the workspace index files into the runtime data directory
 * (`~/.vesper/data`).
 *
 * All source files are checked before anything is copied, so a missing file
 * cannot leave the runtime directory with a partially updated index.
 *
 * @param {string} workspaceRoot - Directory containing the `data` folder to copy from.
 * @returns {string} The runtime data directory the files were copied into.
 * @throws {Error} When any of the expected source files is missing.
 */
function syncRuntime(workspaceRoot) {
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");
  const files = ["metadata.db", "vectors.json", "vectors.bin"];

  const copies = files.map((file) => ({
    src: path.join(workspaceRoot, "data", file),
    dest: path.join(runtimeDir, file),
  }));

  // Validate first: fail before touching the runtime directory.
  for (const { src } of copies) {
    if (!fs.existsSync(src)) {
      throw new Error(`Missing source file: ${src}`);
    }
  }

  fs.mkdirSync(runtimeDir, { recursive: true });
  for (const { src, dest } of copies) {
    fs.copyFileSync(src, dest);
  }

  return runtimeDir;
}
|
|
53
|
-
|
|
54
|
-
/**
 * Refresh the search index in three steps: run the `massive-scrape` npm
 * script, run the `index` npm script with a raised Node heap limit, then copy
 * the resulting workspace index into the runtime directory. Finishes by
 * printing dataset and vector counts for both locations.
 *
 * Any failing step throws and stops the sequence.
 */
function main() {
  const workspaceRoot = process.cwd();
  const runtimeDataDir = path.join(os.homedir(), ".vesper", "data");
  const workspaceDataDir = path.join(workspaceRoot, "data");

  const log = (line) => console.log(line);

  log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // The indexing script runs with an 8 GB old-space limit.
  runCommand("npm", ["run", "index"], {
    env: { ...process.env, NODE_OPTIONS: "--max-old-space-size=8192" },
  });

  log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Counts are gathered workspace first, then runtime.
  const workspaceCounts = {
    db: countDatasets(path.join(workspaceDataDir, "metadata.db")),
    vectors: countVectors(path.join(workspaceDataDir, "vectors.json")),
  };
  const runtimeCounts = {
    db: countDatasets(path.join(runtimeDataDir, "metadata.db")),
    vectors: countVectors(path.join(runtimeDataDir, "vectors.json")),
  };

  log("\n[refresh-index] Completed successfully.");
  log(`[refresh-index] Workspace: DB=${workspaceCounts.db}, VECTORS=${workspaceCounts.vectors}`);
  log(`[refresh-index] Runtime: DB=${runtimeCounts.db}, VECTORS=${runtimeCounts.vectors}`);
  log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}
|
|
81
|
-
|
|
82
|
-
// Script entry point: run the refresh and turn any thrown error into a
// one-line message plus a non-zero exit code.
try {
  main();
} catch (failure) {
  const { message } = failure;
  console.error("\n[refresh-index] Failed:", message);
  process.exit(1);
}
|