gitnexus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/README.md +181 -0
  2. package/dist/cli/ai-context.d.ts +21 -0
  3. package/dist/cli/ai-context.js +219 -0
  4. package/dist/cli/analyze.d.ts +10 -0
  5. package/dist/cli/analyze.js +118 -0
  6. package/dist/cli/clean.d.ts +8 -0
  7. package/dist/cli/clean.js +29 -0
  8. package/dist/cli/index.d.ts +2 -0
  9. package/dist/cli/index.js +42 -0
  10. package/dist/cli/list.d.ts +6 -0
  11. package/dist/cli/list.js +27 -0
  12. package/dist/cli/mcp.d.ts +7 -0
  13. package/dist/cli/mcp.js +85 -0
  14. package/dist/cli/serve.d.ts +3 -0
  15. package/dist/cli/serve.js +5 -0
  16. package/dist/cli/status.d.ts +6 -0
  17. package/dist/cli/status.js +27 -0
  18. package/dist/config/ignore-service.d.ts +1 -0
  19. package/dist/config/ignore-service.js +208 -0
  20. package/dist/config/supported-languages.d.ts +11 -0
  21. package/dist/config/supported-languages.js +15 -0
  22. package/dist/core/embeddings/embedder.d.ts +60 -0
  23. package/dist/core/embeddings/embedder.js +205 -0
  24. package/dist/core/embeddings/embedding-pipeline.d.ts +50 -0
  25. package/dist/core/embeddings/embedding-pipeline.js +321 -0
  26. package/dist/core/embeddings/index.d.ts +9 -0
  27. package/dist/core/embeddings/index.js +9 -0
  28. package/dist/core/embeddings/text-generator.d.ts +24 -0
  29. package/dist/core/embeddings/text-generator.js +182 -0
  30. package/dist/core/embeddings/types.d.ts +87 -0
  31. package/dist/core/embeddings/types.js +32 -0
  32. package/dist/core/graph/graph.d.ts +2 -0
  33. package/dist/core/graph/graph.js +61 -0
  34. package/dist/core/graph/types.d.ts +50 -0
  35. package/dist/core/graph/types.js +1 -0
  36. package/dist/core/ingestion/ast-cache.d.ts +11 -0
  37. package/dist/core/ingestion/ast-cache.js +34 -0
  38. package/dist/core/ingestion/call-processor.d.ts +8 -0
  39. package/dist/core/ingestion/call-processor.js +269 -0
  40. package/dist/core/ingestion/cluster-enricher.d.ts +38 -0
  41. package/dist/core/ingestion/cluster-enricher.js +170 -0
  42. package/dist/core/ingestion/community-processor.d.ts +39 -0
  43. package/dist/core/ingestion/community-processor.js +269 -0
  44. package/dist/core/ingestion/entry-point-scoring.d.ts +39 -0
  45. package/dist/core/ingestion/entry-point-scoring.js +235 -0
  46. package/dist/core/ingestion/filesystem-walker.d.ts +5 -0
  47. package/dist/core/ingestion/filesystem-walker.js +26 -0
  48. package/dist/core/ingestion/framework-detection.d.ts +38 -0
  49. package/dist/core/ingestion/framework-detection.js +183 -0
  50. package/dist/core/ingestion/heritage-processor.d.ts +14 -0
  51. package/dist/core/ingestion/heritage-processor.js +134 -0
  52. package/dist/core/ingestion/import-processor.d.ts +8 -0
  53. package/dist/core/ingestion/import-processor.js +490 -0
  54. package/dist/core/ingestion/parsing-processor.d.ts +8 -0
  55. package/dist/core/ingestion/parsing-processor.js +249 -0
  56. package/dist/core/ingestion/pipeline.d.ts +2 -0
  57. package/dist/core/ingestion/pipeline.js +228 -0
  58. package/dist/core/ingestion/process-processor.d.ts +51 -0
  59. package/dist/core/ingestion/process-processor.js +278 -0
  60. package/dist/core/ingestion/structure-processor.d.ts +2 -0
  61. package/dist/core/ingestion/structure-processor.js +36 -0
  62. package/dist/core/ingestion/symbol-table.d.ts +33 -0
  63. package/dist/core/ingestion/symbol-table.js +38 -0
  64. package/dist/core/ingestion/tree-sitter-queries.d.ts +11 -0
  65. package/dist/core/ingestion/tree-sitter-queries.js +319 -0
  66. package/dist/core/ingestion/utils.d.ts +10 -0
  67. package/dist/core/ingestion/utils.js +44 -0
  68. package/dist/core/kuzu/csv-generator.d.ts +22 -0
  69. package/dist/core/kuzu/csv-generator.js +272 -0
  70. package/dist/core/kuzu/kuzu-adapter.d.ts +81 -0
  71. package/dist/core/kuzu/kuzu-adapter.js +568 -0
  72. package/dist/core/kuzu/schema.d.ts +53 -0
  73. package/dist/core/kuzu/schema.js +380 -0
  74. package/dist/core/search/bm25-index.d.ts +22 -0
  75. package/dist/core/search/bm25-index.js +52 -0
  76. package/dist/core/search/hybrid-search.d.ts +49 -0
  77. package/dist/core/search/hybrid-search.js +118 -0
  78. package/dist/core/tree-sitter/parser-loader.d.ts +4 -0
  79. package/dist/core/tree-sitter/parser-loader.js +42 -0
  80. package/dist/lib/utils.d.ts +1 -0
  81. package/dist/lib/utils.js +3 -0
  82. package/dist/mcp/core/embedder.d.ts +27 -0
  83. package/dist/mcp/core/embedder.js +93 -0
  84. package/dist/mcp/core/kuzu-adapter.d.ts +23 -0
  85. package/dist/mcp/core/kuzu-adapter.js +62 -0
  86. package/dist/mcp/local/local-backend.d.ts +73 -0
  87. package/dist/mcp/local/local-backend.js +752 -0
  88. package/dist/mcp/resources.d.ts +31 -0
  89. package/dist/mcp/resources.js +279 -0
  90. package/dist/mcp/server.d.ts +12 -0
  91. package/dist/mcp/server.js +130 -0
  92. package/dist/mcp/staleness.d.ts +15 -0
  93. package/dist/mcp/staleness.js +29 -0
  94. package/dist/mcp/tools.d.ts +24 -0
  95. package/dist/mcp/tools.js +160 -0
  96. package/dist/server/api.d.ts +6 -0
  97. package/dist/server/api.js +156 -0
  98. package/dist/storage/git.d.ts +7 -0
  99. package/dist/storage/git.js +39 -0
  100. package/dist/storage/repo-manager.d.ts +61 -0
  101. package/dist/storage/repo-manager.js +106 -0
  102. package/dist/types/pipeline.d.ts +28 -0
  103. package/dist/types/pipeline.js +16 -0
  104. package/package.json +80 -0
  105. package/skills/debugging.md +104 -0
  106. package/skills/exploring.md +112 -0
  107. package/skills/impact-analysis.md +114 -0
  108. package/skills/refactoring.md +119 -0
  109. package/vendor/leiden/index.cjs +355 -0
  110. package/vendor/leiden/utils.cjs +392 -0
@@ -0,0 +1,85 @@
1
+ /**
2
+ * MCP Command
3
+ *
4
+ * Starts the MCP server in standalone mode using local .gitnexus/ index.
5
+ * Auto-detects repository by searching for .gitnexus/ folder.
6
+ */
7
+ import path from 'path';
8
+ import fs from 'fs/promises';
9
+ import { startMCPServer } from '../mcp/server.js';
10
+ import { LocalBackend, findRepo } from '../mcp/local/local-backend.js';
11
/**
 * Build the ordered list of directories to probe for a .gitnexus/ index.
 *
 * Priority order:
 *   1. GITNEXUS_CWD environment variable (explicit override)
 *   2. the process's current working directory
 *   3. VSCODE_WORKSPACE_FOLDER environment variable
 *
 * Unset or empty environment variables are skipped.
 *
 * @returns {string[]} Absolute paths, de-duplicated with first occurrence kept.
 */
function getCandidatePaths() {
    const { GITNEXUS_CWD: override, VSCODE_WORKSPACE_FOLDER: workspace } = process.env;
    const ordered = [override, process.cwd(), workspace]
        .filter(Boolean)
        .map((entry) => path.resolve(entry));
    // A Set keeps insertion order, so the priority order survives de-duplication.
    return Array.from(new Set(ordered));
}
29
/**
 * Find the root of the git work tree that contains `startPath` by walking up
 * the directory tree until a `.git` entry is found.
 *
 * `.git` may be a directory (regular clone) or a plain file (linked worktrees
 * and submodules store a "gitdir: ..." pointer file there), so both count.
 * The filesystem root itself is checked as well before giving up.
 *
 * @param {string} startPath - Directory to start searching from.
 * @returns {Promise<string|null>} Absolute path of the directory holding
 *   `.git`, or null when no ancestor has one.
 */
async function findGitRoot(startPath) {
    let current = path.resolve(startPath);
    for (;;) {
        try {
            const stat = await fs.stat(path.join(current, '.git'));
            if (stat.isDirectory() || stat.isFile()) {
                return current;
            }
        }
        catch {
            // Best effort: a missing or unreadable `.git` just means "not here".
        }
        const parent = path.dirname(current);
        if (parent === current) {
            // dirname() of the filesystem root is the root itself: nothing left to check.
            return null;
        }
        current = parent;
    }
}
47
/**
 * Entry point of the `mcp` command.
 *
 * Probes each candidate directory for an index; the first hit is used to
 * initialise a LocalBackend and start the MCP server. When no index exists,
 * prints a diagnostic to stderr and exits the process with status 1:
 * a "not indexed" banner if one of the candidates sits inside a git
 * repository, otherwise a "no git repository" message.
 */
export const mcpCommand = async () => {
    const candidates = getCandidatePaths();
    // Pass 1: look for an existing .gitnexus/ index.
    for (const candidate of candidates) {
        const repo = await findRepo(candidate);
        if (!repo) {
            continue;
        }
        const local = new LocalBackend();
        await local.init(candidate);
        console.error(`GitNexus: Found index at ${repo.storagePath}`);
        // No watcher daemon is started - staleness hints guide the LLM to call analyze.
        await startMCPServer(local);
        return;
    }
    // Pass 2: nothing indexed - tell the user how to create an index.
    for (const candidate of candidates) {
        const gitRoot = await findGitRoot(candidate);
        if (!gitRoot) {
            continue;
        }
        const blank = '║ ║';
        const banner = [
            '',
            '╔════════════════════════════════════════════════════╗',
            '║ GitNexus: Repository Not Indexed ║',
            '╠════════════════════════════════════════════════════╣',
            `║ Found git repo: ${gitRoot.slice(0, 35).padEnd(35)} ║`,
            blank,
            '║ To enable AI code understanding, run: ║',
            blank,
            '║ npx gitnexus analyze ║',
            blank,
            '║ Then restart your IDE. ║',
            '╚════════════════════════════════════════════════════╝',
            '',
        ];
        for (const line of banner) {
            console.error(line);
        }
        process.exit(1);
    }
    // Pass 3: not even a git repository was found.
    console.error('GitNexus: No git repository found.');
    console.error(`Searched: ${candidates.join(', ')}`);
    process.exit(1);
};
@@ -0,0 +1,3 @@
1
+ export declare const serveCommand: (options?: {
2
+ port?: string;
3
+ }) => Promise<void>;
@@ -0,0 +1,5 @@
1
+ import { createServer } from '../server/api.js';
2
/**
 * Entry point of the `serve` command: starts the HTTP API server.
 *
 * @param {{ port?: string }} [options] - CLI options; `port` defaults to 4747.
 * @returns {Promise<void>}
 * @throws {RangeError} When `port` is not an integer in the range 0-65535.
 */
export const serveCommand = async (options) => {
    const port = Number(options?.port ?? 4747);
    // Reject garbage such as "abc" (NaN), "80.5" or "70000" up front with a
    // readable message instead of handing it to the server.
    if (!Number.isInteger(port) || port < 0 || port > 65535) {
        throw new RangeError(`Invalid port "${options?.port}": expected an integer between 0 and 65535.`);
    }
    await createServer(port);
};
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Status Command
3
+ *
4
+ * Shows the indexing status of the current repository.
5
+ */
6
+ export declare const statusCommand: () => Promise<void>;
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Status Command
3
+ *
4
+ * Shows the indexing status of the current repository.
5
+ */
6
+ import { findRepo } from '../storage/repo-manager.js';
7
+ import { getCurrentCommit, isGitRepo } from '../storage/git.js';
8
/**
 * Entry point of the `status` command.
 *
 * Reports on stdout whether the current working directory is a git
 * repository, whether it has been indexed, and whether the indexed commit
 * matches the commit currently checked out.
 *
 * @returns {Promise<void>}
 */
export const statusCommand = async () => {
    const workingDir = process.cwd();
    if (!isGitRepo(workingDir)) {
        console.log('Not a git repository.');
        return;
    }
    const repo = await findRepo(workingDir);
    if (!repo) {
        console.log('Repository not indexed.');
        console.log('Run: gitnexus analyze');
        return;
    }
    const currentCommit = getCurrentCommit(repo.repoPath);
    // The index is fresh only when it was built from the commit now checked out.
    let statusText;
    if (currentCommit === repo.meta.lastCommit) {
        statusText = '✅ up-to-date';
    }
    else {
        statusText = '⚠️ stale (re-run gitnexus analyze)';
    }
    console.log(`Repository: ${repo.repoPath}`);
    console.log(`Indexed: ${new Date(repo.meta.indexedAt).toLocaleString()}`);
    console.log(`Indexed commit: ${repo.meta.lastCommit?.slice(0, 7)}`);
    console.log(`Current commit: ${currentCommit?.slice(0, 7)}`);
    console.log(`Status: ${statusText}`);
};
@@ -0,0 +1 @@
1
+ export declare const shouldIgnorePath: (filePath: string) => boolean;
@@ -0,0 +1,208 @@
1
/**
 * Directory or file names that exclude a path when they appear as ANY
 * segment of it. Compared case-sensitively against single path segments,
 * so multi-segment entries (e.g. 'public/build') can never match and are
 * not listed; 'build' already covers that case.
 */
const DEFAULT_IGNORE_LIST = new Set([
    // Version Control
    '.git',
    '.svn',
    '.hg',
    '.bzr',
    // IDEs & Editors
    '.idea',
    '.vscode',
    '.vs',
    '.eclipse',
    '.settings',
    '.DS_Store',
    'Thumbs.db',
    // Dependencies
    'node_modules',
    'bower_components',
    'jspm_packages',
    'vendor', // PHP/Go
    // 'packages' is deliberately absent - commonly used for monorepo source code
    // (lerna, pnpm, yarn workspaces)
    'venv',
    '.venv',
    'env',
    '.env',
    '__pycache__',
    '.pytest_cache',
    '.mypy_cache',
    'site-packages',
    '.tox',
    'eggs',
    '.eggs',
    'lib64',
    'parts',
    'sdist',
    'wheels',
    // Build Outputs
    'dist',
    'build',
    'out',
    'output',
    'bin',
    'obj',
    'target', // Java/Rust
    '.next',
    '.nuxt',
    '.output',
    '.vercel',
    '.netlify',
    '.serverless',
    '_build',
    '.parcel-cache',
    '.turbo',
    '.svelte-kit',
    // Test & Coverage
    'coverage',
    '.nyc_output',
    'htmlcov',
    '.coverage',
    '__tests__', // Often just test files
    '__mocks__',
    '.jest',
    // Logs & Temp
    'logs',
    'log',
    'tmp',
    'temp',
    'cache',
    '.cache',
    '.tmp',
    '.temp',
    // Generated/Compiled
    '.generated',
    'generated',
    'auto-generated',
    '.terraform',
    // Documentation directories ('docs', 'documentation') are intentionally kept.
    // Misc
    '.husky',
    '.github', // GitHub config, not code
    '.circleci',
    '.gitlab',
    'fixtures', // Test fixtures
    'snapshots', // Jest snapshots
    '__snapshots__',
]);
/**
 * Lower-case file extensions (single like '.png' or compound like '.min.js')
 * whose files are never indexed.
 */
const IGNORED_EXTENSIONS = new Set([
    // Images
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.webp', '.bmp', '.tiff', '.tif',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    // Archives
    '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
    // Binary/Compiled
    '.exe', '.dll', '.so', '.dylib', '.a', '.lib', '.o', '.obj',
    '.class', '.jar', '.war', '.ear',
    '.pyc', '.pyo', '.pyd',
    '.beam', // Erlang
    '.wasm', // WebAssembly
    '.node', // Native Node addons
    // Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.odt', '.ods', '.odp',
    // Media
    '.mp4', '.mp3', '.wav', '.mov', '.avi', '.mkv', '.flv', '.wmv',
    '.ogg', '.webm', '.flac', '.aac', '.m4a',
    // Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    // Databases
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    // Minified/Bundled files
    '.min.js', '.min.css', '.bundle.js', '.chunk.js',
    // Source maps (debug files, not source)
    '.map',
    // Lock files (also listed by exact name below)
    '.lock',
    // Certificates & Keys (security - don't index!)
    '.pem', '.key', '.crt', '.cer', '.p12', '.pfx',
    // Data files (often large/binary)
    '.csv', '.tsv', '.parquet', '.avro', '.feather',
    '.npy', '.npz', '.pkl', '.pickle', '.h5', '.hdf5',
    // Misc binary
    '.bin', '.dat', '.data', '.raw',
    '.iso', '.img', '.dmg',
]);
/** Files to ignore by exact name (matched case-insensitively). */
const IGNORED_FILES = new Set([
    'package-lock.json',
    'yarn.lock',
    'pnpm-lock.yaml',
    'composer.lock',
    'Gemfile.lock',
    'poetry.lock',
    'Cargo.lock',
    'go.sum',
    '.gitignore',
    '.gitattributes',
    '.npmrc',
    '.yarnrc',
    '.editorconfig',
    '.prettierrc',
    '.prettierignore',
    '.eslintignore',
    '.dockerignore',
    'Thumbs.db',
    '.DS_Store',
    'LICENSE',
    'LICENSE.md',
    'LICENSE.txt',
    'CHANGELOG.md',
    'CHANGELOG',
    'CONTRIBUTING.md',
    'CODE_OF_CONDUCT.md',
    'SECURITY.md',
    '.env',
    '.env.local',
    '.env.development',
    '.env.production',
    '.env.test',
    '.env.example',
]);
// Lower-cased view of IGNORED_FILES so that 'license', 'License.txt' and
// 'LICENSE.TXT' are all recognised, not only the spelling listed above.
const IGNORED_FILES_LOWER = new Set([...IGNORED_FILES].map((name) => name.toLowerCase()));
// Substrings of a (lower-cased) file name that mark generated/bundled code.
const GENERATED_NAME_MARKERS = ['.bundle.', '.chunk.', '.generated.'];
/**
 * Decide whether a file should be excluded from indexing.
 *
 * A path is ignored when any of the following holds:
 *   - one of its segments is listed in DEFAULT_IGNORE_LIST;
 *   - its file name is listed in IGNORED_FILES (case-insensitive);
 *   - its extension, or its last two extensions combined (e.g. '.min.js'),
 *     is listed in IGNORED_EXTENSIONS (case-insensitive);
 *   - its name contains '.bundle.', '.chunk.' or '.generated.', or ends
 *     with '.d.ts' (TypeScript declaration file).
 *
 * Other dot-files are NOT ignored wholesale: many are meaningful configuration.
 *
 * @param {string} filePath - Path using '/' or '\\' separators.
 * @returns {boolean} true when the path must be skipped.
 */
export const shouldIgnorePath = (filePath) => {
    const segments = filePath.replace(/\\/g, '/').split('/');
    if (segments.some((segment) => DEFAULT_IGNORE_LIST.has(segment))) {
        return true;
    }
    const fileNameLower = segments[segments.length - 1].toLowerCase();
    if (IGNORED_FILES_LOWER.has(fileNameLower)) {
        return true;
    }
    const lastDot = fileNameLower.lastIndexOf('.');
    if (lastDot !== -1) {
        if (IGNORED_EXTENSIONS.has(fileNameLower.substring(lastDot))) {
            return true;
        }
        // Compound extensions such as .min.js or .bundle.js
        const previousDot = fileNameLower.lastIndexOf('.', lastDot - 1);
        if (previousDot !== -1 && IGNORED_EXTENSIONS.has(fileNameLower.substring(previousDot))) {
            return true;
        }
    }
    return (GENERATED_NAME_MARKERS.some((marker) => fileNameLower.includes(marker)) ||
        fileNameLower.endsWith('.d.ts'));
};
@@ -0,0 +1,11 @@
1
+ export declare enum SupportedLanguages {
2
+ JavaScript = "javascript",
3
+ TypeScript = "typescript",
4
+ Python = "python",
5
+ Java = "java",
6
+ C = "c",
7
+ CPlusPlus = "cpp",
8
+ CSharp = "csharp",
9
+ Go = "go",
10
+ Rust = "rust"
11
+ }
@@ -0,0 +1,15 @@
1
/**
 * Languages the tool supports, as a name -> identifier mapping
 * (runtime form of a TypeScript string enum).
 */
export var SupportedLanguages;
(function (languages) {
    // Member name paired with its string identifier; insertion order is preserved.
    // PHP, Ruby and Swift are not enabled.
    const members = [
        ['JavaScript', 'javascript'],
        ['TypeScript', 'typescript'],
        ['Python', 'python'],
        ['Java', 'java'],
        ['C', 'c'],
        ['CPlusPlus', 'cpp'],
        ['CSharp', 'csharp'],
        ['Go', 'go'],
        ['Rust', 'rust'],
    ];
    for (const [memberName, languageId] of members) {
        languages[memberName] = languageId;
    }
})(SupportedLanguages || (SupportedLanguages = {}));
@@ -0,0 +1,60 @@
1
+ /**
2
+ * Embedder Module
3
+ *
4
+ * Singleton factory for transformers.js embedding pipeline.
5
+ * Handles model loading, caching, and both single and batch embedding operations.
6
+ *
7
+ * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
8
+ */
9
+ import { type FeatureExtractionPipeline } from '@huggingface/transformers';
10
+ import { type EmbeddingConfig, type ModelProgress } from './types.js';
11
+ /**
12
+ * Progress callback type for model loading
13
+ */
14
+ export type ModelProgressCallback = (progress: ModelProgress) => void;
15
+ /**
16
+ * Get the current device being used for inference
17
+ */
18
+ export declare const getCurrentDevice: () => "webgpu" | "cuda" | "cpu" | "wasm" | null;
19
+ /**
20
+ * Initialize the embedding model
21
+ * Uses singleton pattern - only loads once, subsequent calls return cached instance
22
+ *
23
+ * @param onProgress - Optional callback for model download progress
24
+ * @param config - Optional configuration override
25
+ * @param forceDevice - Force a specific device
26
+ * @returns Promise resolving to the embedder pipeline
27
+ */
28
+ export declare const initEmbedder: (onProgress?: ModelProgressCallback, config?: Partial<EmbeddingConfig>, forceDevice?: "webgpu" | "cuda" | "cpu" | "wasm") => Promise<FeatureExtractionPipeline>;
29
+ /**
30
+ * Check if the embedder is initialized and ready
31
+ */
32
+ export declare const isEmbedderReady: () => boolean;
33
+ /**
34
+ * Get the embedder instance (throws if not initialized)
35
+ */
36
+ export declare const getEmbedder: () => FeatureExtractionPipeline;
37
+ /**
38
+ * Embed a single text string
39
+ *
40
+ * @param text - Text to embed
41
+ * @returns Float32Array of embedding vector (384 dimensions)
42
+ */
43
+ export declare const embedText: (text: string) => Promise<Float32Array>;
44
+ /**
45
+ * Embed multiple texts in a single batch
46
+ * More efficient than calling embedText multiple times
47
+ *
48
+ * @param texts - Array of texts to embed
49
+ * @returns Array of Float32Array embedding vectors
50
+ */
51
+ export declare const embedBatch: (texts: string[]) => Promise<Float32Array[]>;
52
+ /**
53
+ * Convert Float32Array to regular number array (for KuzuDB storage)
54
+ */
55
+ export declare const embeddingToArray: (embedding: Float32Array) => number[];
56
+ /**
57
+ * Cleanup the embedder (free memory)
58
+ * Call this when done with embeddings
59
+ */
60
+ export declare const disposeEmbedder: () => Promise<void>;
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Embedder Module
3
+ *
4
+ * Singleton factory for transformers.js embedding pipeline.
5
+ * Handles model loading, caching, and both single and batch embedding operations.
6
+ *
7
+ * Uses snowflake-arctic-embed-xs by default (22M params, 384 dims, ~90MB)
8
+ */
9
+ import { pipeline, env } from '@huggingface/transformers';
10
+ import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
11
// Singleton state shared by every function in this module.
// Loaded pipeline; null until initialization succeeds and again after disposal.
let embedderInstance = null;
// Set to true when initialization starts and back to false when it settles.
let isInitializing = false;
// Promise of the initialization started most recently, if any.
let initPromise = null;
// Device the loaded pipeline was created on; null before a successful load.
let currentDevice = null;
/**
 * Report which device the embedding pipeline was loaded on.
 *
 * @returns The device name recorded at load time, or null when no pipeline
 *   has been loaded yet.
 */
export const getCurrentDevice = () => {
    return currentDevice;
};
20
/**
 * Load the embedding model once and hand back the shared pipeline.
 *
 * Repeated calls return the already-loaded instance; calls made while a load
 * is in flight receive the same pending promise. When the requested device is
 * a GPU backend ('webgpu' or 'cuda'), a failed load is retried on 'cpu'.
 *
 * @param onProgress - Optional callback receiving model download progress
 * @param config - Optional overrides merged over DEFAULT_EMBEDDING_CONFIG
 * @param forceDevice - Device to use regardless of the configured one
 * @returns Promise resolving to the embedder pipeline
 * @throws Re-throws the load error of the last device attempted
 */
export const initEmbedder = async (onProgress, config = {}, forceDevice) => {
    if (embedderInstance) {
        return embedderInstance;
    }
    // A load is already running: share its promise instead of starting another.
    if (isInitializing && initPromise) {
        return initPromise;
    }
    isInitializing = true;
    const finalConfig = { ...DEFAULT_EMBEDDING_CONFIG, ...config };
    // 'auto' resolves to webgpu on Windows (DirectX12/DirectML) and cuda elsewhere.
    const autoDevice = process.platform === 'win32' ? 'webgpu' : 'cuda';
    const requestedDevice = forceDevice || (finalConfig.device === 'auto' ? autoDevice : finalConfig.device);
    initPromise = (async () => {
        try {
            // Configure transformers.js environment
            env.allowLocalModels = false;
            const isDev = process.env.NODE_ENV !== 'production';
            if (isDev) {
                console.log(`🧠 Loading embedding model: ${finalConfig.modelId}`);
            }
            // Adapt the library's progress payload to the ModelProgress shape.
            let progressCallback;
            if (onProgress) {
                progressCallback = (data) => {
                    onProgress({
                        status: data.status || 'progress',
                        file: data.file,
                        progress: data.progress,
                        loaded: data.loaded,
                        total: data.total,
                    });
                };
            }
            const startMessages = new Map([
                ['webgpu', '🔧 Trying WebGPU (DirectX12) backend...'],
                ['cuda', '🔧 Trying CUDA GPU backend...'],
                ['cpu', '🔧 Using CPU backend...'],
                ['wasm', '🔧 Using WASM backend (slower)...'],
            ]);
            // GPU requests get a CPU fallback; anything else is tried as-is.
            const wantsGpu = requestedDevice === 'webgpu' || requestedDevice === 'cuda';
            const devicesToTry = wantsGpu ? [requestedDevice, 'cpu'] : [requestedDevice];
            for (let attempt = 0; attempt < devicesToTry.length; attempt += 1) {
                const device = devicesToTry[attempt];
                const isLastAttempt = attempt === devicesToTry.length - 1;
                try {
                    if (isDev && startMessages.has(device)) {
                        console.log(startMessages.get(device));
                    }
                    embedderInstance = await pipeline('feature-extraction', finalConfig.modelId, {
                        device: device,
                        dtype: 'fp32',
                        progress_callback: progressCallback,
                    });
                    currentDevice = device;
                    if (isDev) {
                        let label;
                        if (device === 'webgpu') {
                            label = 'GPU (WebGPU/DirectX12)';
                        }
                        else if (device === 'cuda') {
                            label = 'GPU (CUDA)';
                        }
                        else {
                            label = device.toUpperCase();
                        }
                        console.log(`✅ Using ${label} backend`);
                        console.log('✅ Embedding model loaded successfully');
                    }
                    return embedderInstance;
                }
                catch (deviceError) {
                    if (isDev && (device === 'cuda' || device === 'webgpu')) {
                        const gpuType = device === 'webgpu' ? 'WebGPU' : 'CUDA';
                        console.log(`⚠️ ${gpuType} not available, falling back to CPU...`);
                    }
                    // Only the final attempt's failure is surfaced to the caller.
                    if (isLastAttempt) {
                        throw deviceError;
                    }
                }
            }
            throw new Error('No suitable device found for embedding model');
        }
        catch (error) {
            // Reset the singleton so a later call can retry from scratch.
            isInitializing = false;
            initPromise = null;
            embedderInstance = null;
            throw error;
        }
        finally {
            isInitializing = false;
        }
    })();
    return initPromise;
};
122
/**
 * Tell whether an embedder pipeline is currently loaded.
 *
 * @returns true when the module-level instance is set, false otherwise
 */
export const isEmbedderReady = () => embedderInstance !== null;
128
/**
 * Return the loaded embedder pipeline.
 *
 * @returns The module-level pipeline instance
 * @throws Error when no pipeline has been loaded yet
 */
export const getEmbedder = () => {
    if (embedderInstance) {
        return embedderInstance;
    }
    throw new Error('Embedder not initialized. Call initEmbedder() first.');
};
137
/**
 * Compute the embedding of one text.
 *
 * The pipeline is invoked with mean pooling and normalization enabled, and
 * the `data` of its result is copied into a new Float32Array.
 *
 * @param text - Text to embed
 * @returns Float32Array holding the embedding vector
 * @throws Error when the embedder has not been initialized
 */
export const embedText = async (text) => {
    const { data } = await getEmbedder()(text, {
        pooling: 'mean',
        normalize: true,
    });
    return new Float32Array(data);
};
152
/**
 * Embed multiple texts in a single pipeline call.
 *
 * The pipeline returns one flat buffer for the whole batch; it is split into
 * one vector per input text. The vector width is taken from the result itself
 * (last entry of `dims`, or buffer length divided by batch size) rather than
 * from DEFAULT_EMBEDDING_CONFIG, so a model configured with a different
 * dimensionality is split correctly.
 *
 * @param texts - Array of texts to embed
 * @returns Array of Float32Array embedding vectors, one per input text, in
 *   input order; an empty array for empty input (the embedder is not touched)
 * @throws Error when the embedder has not been initialized
 */
export const embedBatch = async (texts) => {
    if (texts.length === 0) {
        return [];
    }
    const embedder = getEmbedder();
    const result = await embedder(texts, {
        pooling: 'mean',
        normalize: true,
    });
    // Pooled output is laid out as [batch_size, dimensions] in one flat buffer.
    const data = result.data;
    const dimensions = Array.isArray(result.dims) && result.dims.length > 0
        ? result.dims[result.dims.length - 1]
        : data.length / texts.length;
    return texts.map((_, index) => {
        const start = index * dimensions;
        return new Float32Array(Array.prototype.slice.call(data, start, start + dimensions));
    });
};
181
/**
 * Copy an embedding into a plain array of numbers (for KuzuDB storage).
 *
 * @param embedding - Embedding vector as a Float32Array
 * @returns A new regular array with the same values in the same order
 */
export const embeddingToArray = (embedding) => Array.from(embedding);
187
/**
 * Release the embedder and reset the module's singleton state.
 *
 * Calls the pipeline's `dispose()` when it has one (errors from it are
 * ignored), then clears the cached instance, the cached init promise and the
 * recorded device, so getCurrentDevice() no longer reports a device for a
 * pipeline that is gone. Does nothing when no pipeline is loaded.
 *
 * @returns {Promise<void>}
 */
export const disposeEmbedder = async () => {
    if (!embedderInstance) {
        return;
    }
    try {
        // transformers.js pipelines may have a dispose method
        if ('dispose' in embedderInstance && typeof embedderInstance.dispose === 'function') {
            await embedderInstance.dispose();
        }
    }
    catch {
        // Disposal is best effort: the state below is reset regardless.
    }
    embedderInstance = null;
    initPromise = null;
    currentDevice = null;
};