gitnexus 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +181 -0
- package/dist/cli/ai-context.d.ts +21 -0
- package/dist/cli/ai-context.js +219 -0
- package/dist/cli/analyze.d.ts +10 -0
- package/dist/cli/analyze.js +118 -0
- package/dist/cli/clean.d.ts +8 -0
- package/dist/cli/clean.js +29 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +42 -0
- package/dist/cli/list.d.ts +6 -0
- package/dist/cli/list.js +27 -0
- package/dist/cli/mcp.d.ts +7 -0
- package/dist/cli/mcp.js +85 -0
- package/dist/cli/serve.d.ts +3 -0
- package/dist/cli/serve.js +5 -0
- package/dist/cli/status.d.ts +6 -0
- package/dist/cli/status.js +27 -0
- package/dist/config/ignore-service.d.ts +1 -0
- package/dist/config/ignore-service.js +208 -0
- package/dist/config/supported-languages.d.ts +11 -0
- package/dist/config/supported-languages.js +15 -0
- package/dist/core/embeddings/embedder.d.ts +60 -0
- package/dist/core/embeddings/embedder.js +205 -0
- package/dist/core/embeddings/embedding-pipeline.d.ts +50 -0
- package/dist/core/embeddings/embedding-pipeline.js +321 -0
- package/dist/core/embeddings/index.d.ts +9 -0
- package/dist/core/embeddings/index.js +9 -0
- package/dist/core/embeddings/text-generator.d.ts +24 -0
- package/dist/core/embeddings/text-generator.js +182 -0
- package/dist/core/embeddings/types.d.ts +87 -0
- package/dist/core/embeddings/types.js +32 -0
- package/dist/core/graph/graph.d.ts +2 -0
- package/dist/core/graph/graph.js +61 -0
- package/dist/core/graph/types.d.ts +50 -0
- package/dist/core/graph/types.js +1 -0
- package/dist/core/ingestion/ast-cache.d.ts +11 -0
- package/dist/core/ingestion/ast-cache.js +34 -0
- package/dist/core/ingestion/call-processor.d.ts +8 -0
- package/dist/core/ingestion/call-processor.js +269 -0
- package/dist/core/ingestion/cluster-enricher.d.ts +38 -0
- package/dist/core/ingestion/cluster-enricher.js +170 -0
- package/dist/core/ingestion/community-processor.d.ts +39 -0
- package/dist/core/ingestion/community-processor.js +269 -0
- package/dist/core/ingestion/entry-point-scoring.d.ts +39 -0
- package/dist/core/ingestion/entry-point-scoring.js +235 -0
- package/dist/core/ingestion/filesystem-walker.d.ts +5 -0
- package/dist/core/ingestion/filesystem-walker.js +26 -0
- package/dist/core/ingestion/framework-detection.d.ts +38 -0
- package/dist/core/ingestion/framework-detection.js +183 -0
- package/dist/core/ingestion/heritage-processor.d.ts +14 -0
- package/dist/core/ingestion/heritage-processor.js +134 -0
- package/dist/core/ingestion/import-processor.d.ts +8 -0
- package/dist/core/ingestion/import-processor.js +490 -0
- package/dist/core/ingestion/parsing-processor.d.ts +8 -0
- package/dist/core/ingestion/parsing-processor.js +249 -0
- package/dist/core/ingestion/pipeline.d.ts +2 -0
- package/dist/core/ingestion/pipeline.js +228 -0
- package/dist/core/ingestion/process-processor.d.ts +51 -0
- package/dist/core/ingestion/process-processor.js +278 -0
- package/dist/core/ingestion/structure-processor.d.ts +2 -0
- package/dist/core/ingestion/structure-processor.js +36 -0
- package/dist/core/ingestion/symbol-table.d.ts +33 -0
- package/dist/core/ingestion/symbol-table.js +38 -0
- package/dist/core/ingestion/tree-sitter-queries.d.ts +11 -0
- package/dist/core/ingestion/tree-sitter-queries.js +319 -0
- package/dist/core/ingestion/utils.d.ts +10 -0
- package/dist/core/ingestion/utils.js +44 -0
- package/dist/core/kuzu/csv-generator.d.ts +22 -0
- package/dist/core/kuzu/csv-generator.js +272 -0
- package/dist/core/kuzu/kuzu-adapter.d.ts +81 -0
- package/dist/core/kuzu/kuzu-adapter.js +568 -0
- package/dist/core/kuzu/schema.d.ts +53 -0
- package/dist/core/kuzu/schema.js +380 -0
- package/dist/core/search/bm25-index.d.ts +22 -0
- package/dist/core/search/bm25-index.js +52 -0
- package/dist/core/search/hybrid-search.d.ts +49 -0
- package/dist/core/search/hybrid-search.js +118 -0
- package/dist/core/tree-sitter/parser-loader.d.ts +4 -0
- package/dist/core/tree-sitter/parser-loader.js +42 -0
- package/dist/lib/utils.d.ts +1 -0
- package/dist/lib/utils.js +3 -0
- package/dist/mcp/core/embedder.d.ts +27 -0
- package/dist/mcp/core/embedder.js +93 -0
- package/dist/mcp/core/kuzu-adapter.d.ts +23 -0
- package/dist/mcp/core/kuzu-adapter.js +62 -0
- package/dist/mcp/local/local-backend.d.ts +73 -0
- package/dist/mcp/local/local-backend.js +752 -0
- package/dist/mcp/resources.d.ts +31 -0
- package/dist/mcp/resources.js +279 -0
- package/dist/mcp/server.d.ts +12 -0
- package/dist/mcp/server.js +130 -0
- package/dist/mcp/staleness.d.ts +15 -0
- package/dist/mcp/staleness.js +29 -0
- package/dist/mcp/tools.d.ts +24 -0
- package/dist/mcp/tools.js +160 -0
- package/dist/server/api.d.ts +6 -0
- package/dist/server/api.js +156 -0
- package/dist/storage/git.d.ts +7 -0
- package/dist/storage/git.js +39 -0
- package/dist/storage/repo-manager.d.ts +61 -0
- package/dist/storage/repo-manager.js +106 -0
- package/dist/types/pipeline.d.ts +28 -0
- package/dist/types/pipeline.js +16 -0
- package/package.json +80 -0
- package/skills/debugging.md +104 -0
- package/skills/exploring.md +112 -0
- package/skills/impact-analysis.md +114 -0
- package/skills/refactoring.md +119 -0
- package/vendor/leiden/index.cjs +355 -0
- package/vendor/leiden/utils.cjs +392 -0
package/dist/cli/mcp.js
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP Command
|
|
3
|
+
*
|
|
4
|
+
* Starts the MCP server in standalone mode using local .gitnexus/ index.
|
|
5
|
+
* Auto-detects repository by searching for .gitnexus/ folder.
|
|
6
|
+
*/
|
|
7
|
+
import path from 'path';
|
|
8
|
+
import fs from 'fs/promises';
|
|
9
|
+
import { startMCPServer } from '../mcp/server.js';
|
|
10
|
+
import { LocalBackend, findRepo } from '../mcp/local/local-backend.js';
|
|
11
|
+
/**
 * Build the ordered list of directories in which to look for a .gitnexus/ folder.
 *
 * Order of precedence:
 *   1. the GITNEXUS_CWD environment variable (explicit override),
 *   2. the current working directory,
 *   3. the VSCODE_WORKSPACE_FOLDER environment variable.
 *
 * Unset or empty environment variables are skipped. Every entry is resolved to
 * an absolute path and duplicates are dropped, keeping the first occurrence.
 *
 * @returns {string[]} Absolute candidate paths, highest priority first.
 */
function getCandidatePaths() {
    const { GITNEXUS_CWD: overrideDir, VSCODE_WORKSPACE_FOLDER: workspaceDir } = process.env;
    const ordered = [overrideDir, process.cwd(), workspaceDir].filter(Boolean);
    // A Set keeps insertion order, so the priority ordering survives deduplication.
    const unique = new Set();
    for (const dir of ordered) {
        unique.add(path.resolve(dir));
    }
    return Array.from(unique);
}
|
|
29
|
+
/**
 * Find a git repository root by walking up the directory tree.
 *
 * A directory counts as a repository root when it contains a `.git` entry that
 * is either a directory (regular clone) or a regular file (linked worktrees and
 * submodules store a "gitdir: <path>" pointer file instead of a directory).
 * The filesystem root itself is checked as well before giving up.
 *
 * @param {string} startPath - Directory to start searching from.
 * @returns {Promise<string|null>} Absolute path of the nearest enclosing
 *   repository root, or null when none is found.
 */
async function findGitRoot(startPath) {
    let current = path.resolve(startPath);
    for (;;) {
        try {
            const stat = await fs.stat(path.join(current, '.git'));
            if (stat.isDirectory() || stat.isFile()) {
                return current;
            }
        }
        catch {
            // Best-effort probe: a missing or unreadable .git at this level
            // simply means we keep walking towards the root.
        }
        const parent = path.dirname(current);
        // dirname() of the filesystem root is the root itself: nothing left to check.
        if (parent === current) {
            return null;
        }
        current = parent;
    }
}
|
|
47
|
+
/**
 * Entry point of the MCP command.
 *
 * Probes each candidate path (see getCandidatePaths) with findRepo. For the
 * first candidate that yields a repo, a LocalBackend is initialised with that
 * candidate and the MCP server is started on it.
 *
 * When no candidate yields a repo, the process exits with code 1 after writing
 * a message to stderr: a "Repository Not Indexed" banner if any candidate lies
 * inside a git repository, otherwise a "No git repository found" message that
 * lists the searched paths.
 *
 * @returns {Promise<void>}
 */
export const mcpCommand = async () => {
    // Try multiple candidate paths to find .gitnexus/
    const candidates = getCandidatePaths();
    for (const candidate of candidates) {
        const repo = await findRepo(candidate);
        if (repo) {
            const local = new LocalBackend();
            await local.init(candidate);
            // All messages in this command are written with console.error (stderr).
            console.error(`GitNexus: Found index at ${repo.storagePath}`);
            // Start MCP server (no watcher daemon - staleness hints guide LLM to call analyze)
            await startMCPServer(local);
            return;
        }
    }
    // No index found - give helpful error message
    for (const candidate of candidates) {
        const gitRoot = await findGitRoot(candidate);
        if (gitRoot) {
            console.error('');
            console.error('╔════════════════════════════════════════════════════╗');
            console.error('║ GitNexus: Repository Not Indexed ║');
            console.error('╠════════════════════════════════════════════════════╣');
            // The path is cut/padded to exactly 35 characters so the box border stays aligned.
            console.error(`║ Found git repo: ${gitRoot.slice(0, 35).padEnd(35)} ║`);
            console.error('║ ║');
            console.error('║ To enable AI code understanding, run: ║');
            console.error('║ ║');
            console.error('║ npx gitnexus analyze ║');
            console.error('║ ║');
            console.error('║ Then restart your IDE. ║');
            console.error('╚════════════════════════════════════════════════════╝');
            console.error('');
            // Only the first candidate that sits inside a git repo is reported.
            process.exit(1);
        }
    }
    // No git repo found
    console.error('GitNexus: No git repository found.');
    console.error(`Searched: ${candidates.join(', ')}`);
    process.exit(1);
};
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Status Command
|
|
3
|
+
*
|
|
4
|
+
* Shows the indexing status of the current repository.
|
|
5
|
+
*/
|
|
6
|
+
import { findRepo } from '../storage/repo-manager.js';
|
|
7
|
+
import { getCurrentCommit, isGitRepo } from '../storage/git.js';
|
|
8
|
+
/**
 * Print the indexing status of the repository in the current working directory.
 *
 * Prints a short notice and returns early when the directory is not a git
 * repository or has no index. Otherwise prints the repository path, when it
 * was indexed, the indexed and current commits (first 7 characters), and
 * whether the two commits match.
 *
 * @returns {Promise<void>}
 */
export const statusCommand = async () => {
    const workingDir = process.cwd();
    if (!isGitRepo(workingDir)) {
        console.log('Not a git repository.');
        return;
    }
    const indexed = await findRepo(workingDir);
    if (!indexed) {
        console.log('Repository not indexed.');
        console.log('Run: gitnexus analyze');
        return;
    }
    const headCommit = getCurrentCommit(indexed.repoPath);
    // The index is considered fresh only when it was built from the current commit.
    const statusLabel = headCommit === indexed.meta.lastCommit
        ? '✅ up-to-date'
        : '⚠️ stale (re-run gitnexus analyze)';
    const report = [
        `Repository: ${indexed.repoPath}`,
        `Indexed: ${new Date(indexed.meta.indexedAt).toLocaleString()}`,
        `Indexed commit: ${indexed.meta.lastCommit?.slice(0, 7)}`,
        `Current commit: ${headCommit?.slice(0, 7)}`,
        `Status: ${statusLabel}`,
    ];
    for (const line of report) {
        console.log(line);
    }
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Decide whether a file path should be ignored.
 *
 * @param filePath - Path to test.
 * @returns `true` when the path should be ignored, `false` otherwise.
 */
export declare const shouldIgnorePath: (filePath: string) => boolean;
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
// Directory / path-segment names to ignore. An entry matches when it equals
// any single segment of the path (directory names and the file name alike).
// Entries containing '/' are matched as a contiguous run of segments.
const DEFAULT_IGNORE_LIST = new Set([
    // Version Control
    '.git',
    '.svn',
    '.hg',
    '.bzr',
    // IDEs & Editors
    '.idea',
    '.vscode',
    '.vs',
    '.eclipse',
    '.settings',
    '.DS_Store',
    'Thumbs.db',
    // Dependencies
    'node_modules',
    'bower_components',
    'jspm_packages',
    'vendor', // PHP/Go
    // 'packages' removed - commonly used for monorepo source code (lerna, pnpm, yarn workspaces)
    'venv',
    '.venv',
    'env',
    '.env',
    '__pycache__',
    '.pytest_cache',
    '.mypy_cache',
    'site-packages',
    '.tox',
    'eggs',
    '.eggs',
    'lib64',
    'parts',
    'sdist',
    'wheels',
    // Build Outputs
    'dist',
    'build',
    'out',
    'output',
    'bin',
    'obj',
    'target', // Java/Rust
    '.next',
    '.nuxt',
    '.output',
    '.vercel',
    '.netlify',
    '.serverless',
    '_build',
    'public/build',
    '.parcel-cache',
    '.turbo',
    '.svelte-kit',
    // Test & Coverage
    'coverage',
    '.nyc_output',
    'htmlcov',
    '.coverage',
    '__tests__', // Often just test files
    '__mocks__',
    '.jest',
    // Logs & Temp
    'logs',
    'log',
    'tmp',
    'temp',
    'cache',
    '.cache',
    '.tmp',
    '.temp',
    // Generated/Compiled ('.serverless' is already listed under Build Outputs)
    '.generated',
    'generated',
    'auto-generated',
    '.terraform',
    // Documentation (optional - might want to keep)
    // 'docs',
    // 'documentation',
    // Misc
    '.husky',
    '.github', // GitHub config, not code
    '.circleci',
    '.gitlab',
    'fixtures', // Test fixtures
    'snapshots', // Jest snapshots
    '__snapshots__',
]);
// File extensions to ignore. Compared against the lowercased file name, so all
// entries must be lowercase. Both the last extension ('.js') and the last two
// dot-separated parts ('.min.js') are looked up.
const IGNORED_EXTENSIONS = new Set([
    // Images
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.webp', '.bmp', '.tiff', '.tif',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    // Archives
    '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
    // Binary/Compiled
    '.exe', '.dll', '.so', '.dylib', '.a', '.lib', '.o', '.obj',
    '.class', '.jar', '.war', '.ear',
    '.pyc', '.pyo', '.pyd',
    '.beam', // Erlang
    '.wasm', // WebAssembly - important!
    '.node', // Native Node addons
    // Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.odt', '.ods', '.odp',
    // Media
    '.mp4', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.flv', '.wmv',
    '.ogg', '.webm', '.flac', '.aac', '.m4a',
    // Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    // Databases
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    // Minified/Bundled files
    '.min.js', '.min.css', '.bundle.js', '.chunk.js',
    // Source maps (debug files, not source)
    '.map',
    // Lock files (handled separately, but also here)
    '.lock',
    // Certificates & Keys (security - don't index!)
    '.pem', '.key', '.crt', '.cer', '.p12', '.pfx',
    // Data files (often large/binary)
    '.csv', '.tsv', '.parquet', '.avro', '.feather',
    '.npy', '.npz', '.pkl', '.pickle', '.h5', '.hdf5',
    // Misc binary
    '.bin', '.dat', '.data', '.raw',
    '.iso', '.img', '.dmg',
]);
// Files to ignore by exact name
const IGNORED_FILES = new Set([
    'package-lock.json',
    'yarn.lock',
    'pnpm-lock.yaml',
    'composer.lock',
    'Gemfile.lock',
    'poetry.lock',
    'Cargo.lock',
    'go.sum',
    '.gitignore',
    '.gitattributes',
    '.npmrc',
    '.yarnrc',
    '.editorconfig',
    '.prettierrc',
    '.prettierignore',
    '.eslintignore',
    '.dockerignore',
    'Thumbs.db',
    '.DS_Store',
    'LICENSE',
    'LICENSE.md',
    'LICENSE.txt',
    'CHANGELOG.md',
    'CHANGELOG',
    'CONTRIBUTING.md',
    'CODE_OF_CONDUCT.md',
    'SECURITY.md',
    '.env',
    '.env.local',
    '.env.development',
    '.env.production',
    '.env.test',
    '.env.example',
]);
// Lowercased copy of IGNORED_FILES. Looking the lowercased file name up in the
// original set could never hit mixed-case entries such as 'LICENSE' or
// 'CHANGELOG.md'; this set makes the file-name check truly case-insensitive.
const IGNORED_FILES_LOWER = new Set([...IGNORED_FILES].map((name) => name.toLowerCase()));
// Ignore-list entries that span several path segments (e.g. 'public/build').
// They can never equal a single segment, so they need a separate match.
const MULTI_SEGMENT_IGNORES = [...DEFAULT_IGNORE_LIST].filter((entry) => entry.includes('/'));
/**
 * Decide whether a file path should be ignored.
 *
 * A path is ignored when any of the following holds:
 *   - one of its segments is listed in DEFAULT_IGNORE_LIST (or a multi-segment
 *     entry appears as a contiguous run of segments),
 *   - its file name is listed in IGNORED_FILES (compared case-insensitively),
 *   - its extension, or its last two dot-separated parts (e.g. '.min.js'),
 *     is listed in IGNORED_EXTENSIONS (case-insensitive),
 *   - its file name contains '.bundle.', '.chunk.' or '.generated.', or ends
 *     with '.d.ts'.
 *
 * Dot-files are deliberately NOT ignored wholesale; only the explicit lists apply.
 *
 * @param {string} filePath - Path to test; both '/' and '\' separators are accepted.
 * @returns {boolean} true when the path should be ignored.
 */
export const shouldIgnorePath = (filePath) => {
    const normalizedPath = filePath.replace(/\\/g, '/');
    const parts = normalizedPath.split('/');
    const fileName = parts[parts.length - 1];
    const fileNameLower = fileName.toLowerCase();
    // Check if any path segment is in ignore list
    if (parts.some((part) => DEFAULT_IGNORE_LIST.has(part))) {
        return true;
    }
    // Multi-segment entries: wrap in '/' so only whole segments can match.
    if (MULTI_SEGMENT_IGNORES.length > 0) {
        const delimitedPath = `/${normalizedPath}/`;
        if (MULTI_SEGMENT_IGNORES.some((entry) => delimitedPath.includes(`/${entry}/`))) {
            return true;
        }
    }
    // Check exact filename matches (case-insensitive)
    if (IGNORED_FILES.has(fileName) || IGNORED_FILES_LOWER.has(fileNameLower)) {
        return true;
    }
    // Check extension
    const lastDotIndex = fileNameLower.lastIndexOf('.');
    if (lastDotIndex !== -1) {
        if (IGNORED_EXTENSIONS.has(fileNameLower.substring(lastDotIndex))) {
            return true;
        }
        // Handle compound extensions like .min.js, .bundle.js
        const secondLastDot = fileNameLower.lastIndexOf('.', lastDotIndex - 1);
        if (secondLastDot !== -1 && IGNORED_EXTENSIONS.has(fileNameLower.substring(secondLastDot))) {
            return true;
        }
    }
    // Ignore files that look like generated/bundled code
    return (fileNameLower.includes('.bundle.') ||
        fileNameLower.includes('.chunk.') ||
        fileNameLower.includes('.generated.') ||
        fileNameLower.endsWith('.d.ts')); // TypeScript declaration files
};
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Enum-like map from language name to its lowercase identifier string.
 */
export var SupportedLanguages = {};
for (const [languageName, languageId] of [
    ['JavaScript', 'javascript'],
    ['TypeScript', 'typescript'],
    ['Python', 'python'],
    ['Java', 'java'],
    ['C', 'c'],
    ['CPlusPlus', 'cpp'],
    ['CSharp', 'csharp'],
    ['Go', 'go'],
    ['Rust', 'rust'],
    // PHP = 'php',
    // Ruby = 'ruby',
    // Swift = 'swift',
]) {
    SupportedLanguages[languageName] = languageId;
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedder Module
|
|
3
|
+
*
|
|
4
|
+
* Singleton factory for transformers.js embedding pipeline.
|
|
5
|
+
* Handles model loading, caching, and both single and batch embedding operations.
|
|
6
|
+
*
|
|
7
|
+
* Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
|
|
8
|
+
*/
|
|
9
|
+
import { type FeatureExtractionPipeline } from '@huggingface/transformers';
|
|
10
|
+
import { type EmbeddingConfig, type ModelProgress } from './types.js';
|
|
11
|
+
/**
 * Progress callback type for model loading.
 * Invoked with a ModelProgress record each time the loader reports progress.
 */
export type ModelProgressCallback = (progress: ModelProgress) => void;
/**
 * Get the current device being used for inference.
 *
 * @returns The device the embedding pipeline was loaded on, or `null` when no
 *   pipeline has been loaded yet.
 */
export declare const getCurrentDevice: () => "webgpu" | "cuda" | "cpu" | "wasm" | null;
/**
 * Initialize the embedding model.
 * Uses singleton pattern - only loads once, subsequent calls return cached instance.
 * Calls made while a load is in flight receive the same pending promise.
 *
 * @param onProgress - Optional callback for model download progress
 * @param config - Optional configuration override (merged over the defaults)
 * @param forceDevice - Force a specific device; takes precedence over `config.device`
 * @returns Promise resolving to the embedder pipeline; rejects when the model
 *   cannot be loaded on any device that was tried
 */
export declare const initEmbedder: (onProgress?: ModelProgressCallback, config?: Partial<EmbeddingConfig>, forceDevice?: "webgpu" | "cuda" | "cpu" | "wasm") => Promise<FeatureExtractionPipeline>;
/**
 * Check if the embedder is initialized and ready.
 *
 * @returns `true` once a pipeline instance is available
 */
export declare const isEmbedderReady: () => boolean;
/**
 * Get the embedder instance (throws if not initialized).
 *
 * @returns The loaded pipeline
 * @throws Error when initEmbedder() has not completed successfully
 */
export declare const getEmbedder: () => FeatureExtractionPipeline;
/**
 * Embed a single text string (mean pooling, normalized).
 *
 * @param text - Text to embed
 * @returns Float32Array of embedding vector (384 dimensions)
 * @throws Error when the embedder is not initialized
 */
export declare const embedText: (text: string) => Promise<Float32Array>;
/**
 * Embed multiple texts in a single batch.
 * More efficient than calling embedText multiple times.
 *
 * @param texts - Array of texts to embed; an empty array yields an empty result
 * @returns Array of Float32Array embedding vectors, one per input text, in input order
 * @throws Error when the embedder is not initialized (non-empty input only)
 */
export declare const embedBatch: (texts: string[]) => Promise<Float32Array[]>;
/**
 * Convert Float32Array to regular number array (for KuzuDB storage).
 *
 * @param embedding - Embedding vector to convert
 * @returns A new plain array holding the same values
 */
export declare const embeddingToArray: (embedding: Float32Array) => number[];
/**
 * Cleanup the embedder (free memory).
 * Call this when done with embeddings. Does nothing when no embedder is loaded.
 */
export declare const disposeEmbedder: () => Promise<void>;
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedder Module
|
|
3
|
+
*
|
|
4
|
+
* Singleton factory for transformers.js embedding pipeline.
|
|
5
|
+
* Handles model loading, caching, and both single and batch embedding operations.
|
|
6
|
+
*
|
|
7
|
+
* Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
|
|
8
|
+
*/
|
|
9
|
+
import { pipeline, env } from '@huggingface/transformers';
|
|
10
|
+
import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
|
|
11
|
+
// Module-level state for singleton pattern
// The loaded pipeline; null until initEmbedder() succeeds.
let embedderInstance = null;
// True only while a load started by initEmbedder() is in flight.
let isInitializing = false;
// Promise of the in-flight (or completed) load, shared with concurrent callers.
let initPromise = null;
// Device the pipeline was loaded on; assigned by initEmbedder() after a successful load.
let currentDevice = null;
|
|
16
|
+
/**
 * Report the device recorded for the embedding pipeline.
 *
 * @returns {string|null} The value of the module-level `currentDevice`
 *   (null while no device has been recorded).
 */
export const getCurrentDevice = () => {
    return currentDevice;
};
|
|
20
|
+
/**
 * Initialize the embedding model
 * Uses singleton pattern - only loads once, subsequent calls return cached instance
 *
 * Device selection: `forceDevice` wins; otherwise `config.device` is used, where
 * 'auto' means 'webgpu' on Windows and 'cuda' elsewhere. When the selected device
 * is 'webgpu' or 'cuda' and loading fails, 'cpu' is tried as a fallback.
 *
 * Note: once an instance exists, `config` and `forceDevice` of later calls are ignored.
 *
 * @param onProgress - Optional callback for model download progress
 * @param config - Optional configuration override
 * @param forceDevice - Force a specific device
 * @returns Promise resolving to the embedder pipeline
 * @throws Rethrows the load error of the last device tried; module state is
 *   reset first so that a later call can retry
 */
export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
    // Return existing instance if available
    if (embedderInstance) {
        return embedderInstance;
    }
    // If already initializing, wait for that promise
    if (isInitializing && initPromise) {
        return initPromise;
    }
    isInitializing = true;
    // Caller-supplied fields override the defaults.
    const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    // On Windows, use webgpu for GPU acceleration (via DirectX12/DirectML)
    // CUDA is only available on Linux with onnxruntime-node
    const isWindows = process.platform === 'win32';
    const gpuDevice = isWindows ? 'webgpu' : 'cuda';
    let requestedDevice = forceDevice || (finalConfig.device === 'auto' ? gpuDevice : finalConfig.device);
    // The load runs inside an immediately-invoked async function so that its
    // promise can be stored and handed to concurrent callers.
    initPromise = (async () => {
        try {
            // Configure transformers.js environment
            env.allowLocalModels = false;
            // Log output is emitted unless NODE_ENV is exactly 'production'.
            const isDev = process.env.NODE_ENV !== 'production';
            if (isDev) {
                console.log(`🧠 Loading embedding model: ${finalConfig.modelId}`);
            }
            // Adapt the loader's progress events to the shape passed to onProgress.
            const progressCallback = onProgress ? (data) => {
                const progress = {
                    status: data.status || 'progress',
                    file: data.file,
                    progress: data.progress,
                    loaded: data.loaded,
                    total: data.total,
                };
                onProgress(progress);
            } : undefined;
            // Try GPU first if auto, fall back to CPU
            // Windows: webgpu (DirectX12/DirectML), Linux: cuda
            const devicesToTry = (requestedDevice === 'webgpu' || requestedDevice === 'cuda')
                ? [requestedDevice, 'cpu']
                : [requestedDevice];
            for (const device of devicesToTry) {
                try {
                    if (isDev && device === 'webgpu') {
                        console.log('🔧 Trying WebGPU (DirectX12) backend...');
                    }
                    else if (isDev && device === 'cuda') {
                        console.log('🔧 Trying CUDA GPU backend...');
                    }
                    else if (isDev && device === 'cpu') {
                        console.log('🔧 Using CPU backend...');
                    }
                    else if (isDev && device === 'wasm') {
                        console.log('🔧 Using WASM backend (slower)...');
                    }
                    embedderInstance = await pipeline('feature-extraction', finalConfig.modelId, {
                        device: device,
                        dtype: 'fp32',
                        progress_callback: progressCallback,
                    });
                    // Record the device only after the pipeline loaded successfully.
                    currentDevice = device;
                    if (isDev) {
                        const label = device === 'webgpu' ? 'GPU (WebGPU/DirectX12)'
                            : device === 'cuda' ? 'GPU (CUDA)'
                                : device.toUpperCase();
                        console.log(`✅ Using ${label} backend`);
                        console.log('✅ Embedding model loaded successfully');
                    }
                    return embedderInstance;
                }
                catch (deviceError) {
                    if (isDev && (device === 'cuda' || device === 'webgpu')) {
                        const gpuType = device === 'webgpu' ? 'WebGPU' : 'CUDA';
                        console.log(`⚠️ ${gpuType} not available, falling back to CPU...`);
                    }
                    // Continue to next device in list
                    if (device === devicesToTry[devicesToTry.length - 1]) {
                        throw deviceError; // Last device failed, propagate error
                    }
                }
            }
            // Safety net: reached only if the loop ends without returning or throwing.
            throw new Error('No suitable device found for embedding model');
        }
        catch (error) {
            // Reset the singleton state so that a later call starts a fresh load.
            isInitializing = false;
            initPromise = null;
            embedderInstance = null;
            throw error;
        }
        finally {
            // Runs on success and failure alike; initPromise is kept on success.
            isInitializing = false;
        }
    })();
    return initPromise;
};
|
|
122
|
+
/**
 * Tell whether an embedder pipeline instance is currently held.
 *
 * @returns {boolean} true when the module-level `embedderInstance` is not null.
 */
export const isEmbedderReady = () => embedderInstance !== null;
|
|
128
|
+
/**
 * Return the held embedder pipeline instance.
 *
 * @returns The module-level `embedderInstance`.
 * @throws {Error} When no instance is held (initEmbedder() has not succeeded yet).
 */
export const getEmbedder = () => {
    if (embedderInstance) {
        return embedderInstance;
    }
    throw new Error('Embedder not initialized. Call initEmbedder() first.');
};
|
|
137
|
+
/**
 * Compute the embedding of one text.
 *
 * The pipeline is invoked with mean pooling and normalization enabled, and the
 * `data` of its result is copied into a new Float32Array.
 *
 * @param text - Text to embed
 * @returns Float32Array of embedding vector (384 dimensions)
 * @throws {Error} When the embedder has not been initialized (via getEmbedder()).
 */
export const embedText = async (text) => {
    const options = { pooling: 'mean', normalize: true };
    const run = getEmbedder();
    const { data } = await run(text, options);
    // Result is a Tensor, convert to Float32Array
    return new Float32Array(data);
};
|
|
152
|
+
/**
 * Embed multiple texts in a single batch
 * More efficient than calling embedText multiple times
 *
 * The pipeline is invoked once with mean pooling and normalization enabled; its
 * flat result buffer is then split into one vector per input text. The vector
 * width is taken from the result tensor's own shape (last entry of `dims`), so
 * models whose width differs from the default configuration are split
 * correctly. The configured default width is used only when the result carries
 * no shape information.
 *
 * @param texts - Array of texts to embed
 * @returns Array of Float32Array embedding vectors, one per input text, in input order
 * @throws {Error} When the embedder has not been initialized (non-empty input only)
 */
export const embedBatch = async (texts) => {
    if (texts.length === 0) {
        return [];
    }
    const embedder = getEmbedder();
    // Process batch
    const result = await embedder(texts, {
        pooling: 'mean',
        normalize: true,
    });
    // Result shape is [batch_size, dimensions]
    // Need to split into individual vectors
    const data = result.data;
    const shape = result.dims;
    const dimensions = Array.isArray(shape) && shape.length > 0
        ? shape[shape.length - 1]
        : DEFAULT_EMBEDDING_CONFIG.dimensions;
    const embeddings = [];
    for (let i = 0; i < texts.length; i++) {
        const start = i * dimensions;
        // slice() copies the range; the Float32Array wrapper pins the element type.
        embeddings.push(new Float32Array(data.slice(start, start + dimensions)));
    }
    return embeddings;
};
|
|
181
|
+
/**
 * Copy an embedding vector into a plain JavaScript array of numbers
 * (for KuzuDB storage).
 *
 * @param embedding - Embedding vector to convert
 * @returns A new array holding the same values in the same order
 */
export const embeddingToArray = (embedding) => {
    const values = Array.from(embedding);
    return values;
};
|
|
187
|
+
/**
 * Cleanup the embedder (free memory)
 * Call this when done with embeddings
 *
 * Does nothing when no embedder instance is held. Otherwise the instance's
 * `dispose()` method is awaited if it has one (errors from it are ignored), and
 * the singleton state is cleared: the instance, the cached init promise and the
 * recorded device, so getCurrentDevice() no longer reports a device for a
 * pipeline that is gone.
 *
 * @returns {Promise<void>}
 */
export const disposeEmbedder = async () => {
    if (!embedderInstance) {
        return;
    }
    // transformers.js pipelines may have a dispose method
    try {
        if (typeof embedderInstance.dispose === 'function') {
            await embedderInstance.dispose();
        }
    }
    catch {
        // Ignore disposal errors: cleanup is best-effort and the state is reset regardless.
    }
    embedderInstance = null;
    initPromise = null;
    currentDevice = null;
};
|