agentic-flow 2.0.1-alpha.17 → 2.0.1-alpha.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,15 @@
5
5
  * - SIMD128 acceleration (6x faster)
6
6
  * - Parallel worker threads (7 workers)
7
7
  * - all-MiniLM-L6-v2 model (384 dimensions)
8
+ * - Persistent SQLite cache (0.1ms vs 400ms)
8
9
  *
9
10
  * Configure via:
10
11
  * - AGENTIC_FLOW_EMBEDDINGS=simple|onnx|auto (default: auto)
11
12
  * - AGENTIC_FLOW_EMBEDDING_MODEL=all-MiniLM-L6-v2 (default)
13
+ * - AGENTIC_FLOW_EMBEDDING_CACHE=true|false (default: true)
14
+ * - AGENTIC_FLOW_PERSISTENT_CACHE=true|false (default: true)
12
15
  */
16
+ import { getEmbeddingCache } from './EmbeddingCache.js';
13
17
  // ONNX availability cache
14
18
  let onnxAvailable = null;
15
19
  let ruvectorModule = null;
@@ -32,8 +36,8 @@ async function detectOnnx() {
32
36
  return false;
33
37
  }
34
38
  }
35
- // Simple LRU cache for embeddings
36
- class EmbeddingCache {
39
+ // Simple LRU cache for embeddings (in-memory, fast)
40
+ class LRUCache {
37
41
  cache = new Map();
38
42
  maxSize;
39
43
  constructor(maxSize = 1000) {
@@ -78,9 +82,12 @@ export class EmbeddingService {
78
82
  totalEmbeddings = 0;
79
83
  totalLatencyMs = 0;
80
84
  cacheHits = 0;
81
- // Cache
85
+ // Cache (in-memory LRU)
82
86
  cache;
83
87
  cacheEnabled;
88
+ // Persistent cache (SQLite)
89
+ persistentCache = null;
90
+ persistentCacheEnabled;
84
91
  // Corpus for search operations
85
92
  corpus = { texts: [], embeddings: [] };
86
93
  constructor() {
@@ -89,7 +96,18 @@ export class EmbeddingService {
89
96
  this.modelName = process.env.AGENTIC_FLOW_EMBEDDING_MODEL || 'all-MiniLM-L6-v2';
90
97
  this.dimension = 256; // Will be updated when ONNX loads (384)
91
98
  this.cacheEnabled = process.env.AGENTIC_FLOW_EMBEDDING_CACHE !== 'false';
92
- this.cache = new EmbeddingCache(1000);
99
+ this.persistentCacheEnabled = process.env.AGENTIC_FLOW_PERSISTENT_CACHE !== 'false';
100
+ this.cache = new LRUCache(1000);
101
+ // Initialize persistent cache
102
+ if (this.persistentCacheEnabled) {
103
+ try {
104
+ this.persistentCache = getEmbeddingCache({ dimension: 384 });
105
+ }
106
+ catch (error) {
107
+ console.warn('[EmbeddingService] Persistent cache unavailable:', error);
108
+ this.persistentCacheEnabled = false;
109
+ }
110
+ }
93
111
  }
94
112
  static getInstance() {
95
113
  if (!EmbeddingService.instance) {
@@ -150,7 +168,7 @@ export class EmbeddingService {
150
168
  */
151
169
  async embed(text) {
152
170
  const startTime = performance.now();
153
- // Check cache
171
+ // Check in-memory cache first (fastest)
154
172
  if (this.cacheEnabled) {
155
173
  const cached = this.cache.get(text);
156
174
  if (cached) {
@@ -158,6 +176,18 @@ export class EmbeddingService {
158
176
  return cached;
159
177
  }
160
178
  }
179
+ // Check persistent cache (SQLite, ~0.1ms)
180
+ if (this.persistentCache) {
181
+ const cached = this.persistentCache.get(text, this.modelName);
182
+ if (cached) {
183
+ this.cacheHits++;
184
+ // Also store in memory cache for faster subsequent access
185
+ if (this.cacheEnabled) {
186
+ this.cache.set(text, cached);
187
+ }
188
+ return cached;
189
+ }
190
+ }
161
191
  // Resolve backend (handles 'auto' mode)
162
192
  const effectiveBackend = await this.resolveBackend();
163
193
  let embedding;
@@ -177,10 +207,14 @@ export class EmbeddingService {
177
207
  // Update stats
178
208
  this.totalEmbeddings++;
179
209
  this.totalLatencyMs += performance.now() - startTime;
180
- // Cache result
210
+ // Cache result in memory
181
211
  if (this.cacheEnabled) {
182
212
  this.cache.set(text, embedding);
183
213
  }
214
+ // Cache result persistently (for cross-session)
215
+ if (this.persistentCache && effectiveBackend === 'onnx') {
216
+ this.persistentCache.set(text, embedding, this.modelName);
217
+ }
184
218
  return embedding;
185
219
  }
186
220
  /**
@@ -448,6 +482,19 @@ export class EmbeddingService {
448
482
  getStats() {
449
483
  const effective = this.effectiveBackend || this.backend;
450
484
  const ruvectorStats = ruvectorModule?.getStats?.() || {};
485
+ // Get persistent cache stats
486
+ let persistentCacheStats;
487
+ if (this.persistentCache) {
488
+ const cacheStats = this.persistentCache.getStats();
489
+ persistentCacheStats = {
490
+ enabled: true,
491
+ entries: cacheStats.totalEntries,
492
+ hits: cacheStats.hits,
493
+ misses: cacheStats.misses,
494
+ hitRate: cacheStats.hitRate,
495
+ dbSizeKB: Math.round(cacheStats.dbSizeBytes / 1024),
496
+ };
497
+ }
451
498
  return {
452
499
  backend: this.backend,
453
500
  effectiveBackend: effective,
@@ -460,14 +507,46 @@ export class EmbeddingService {
460
507
  modelName: effective === 'onnx' ? this.modelName : undefined,
461
508
  simdAvailable: ruvectorStats.simdAvailable ?? onnxAvailable,
462
509
  parallelWorkers: ruvectorStats.workerCount ?? undefined,
510
+ persistentCache: persistentCacheStats,
463
511
  };
464
512
  }
465
513
  /**
466
- * Clear cache
514
+ * Clear in-memory cache
467
515
  */
468
516
  clearCache() {
469
517
  this.cache.clear();
470
518
  }
519
+ /**
520
+ * Clear persistent cache (SQLite)
521
+ */
522
+ clearPersistentCache() {
523
+ if (this.persistentCache) {
524
+ this.persistentCache.clear();
525
+ }
526
+ }
527
+ /**
528
+ * Clear all caches (memory + persistent)
529
+ */
530
+ clearAllCaches() {
531
+ this.cache.clear();
532
+ if (this.persistentCache) {
533
+ this.persistentCache.clear();
534
+ }
535
+ }
536
+ /**
537
+ * Get persistent cache stats
538
+ */
539
+ getPersistentCacheStats() {
540
+ if (!this.persistentCache)
541
+ return null;
542
+ const stats = this.persistentCache.getStats();
543
+ return {
544
+ entries: stats.totalEntries,
545
+ hits: stats.hits,
546
+ misses: stats.misses,
547
+ hitRate: stats.hitRate,
548
+ };
549
+ }
471
550
  /**
472
551
  * Clear corpus
473
552
  */
@@ -493,6 +572,879 @@ export class EmbeddingService {
493
572
  onnxAvailable = null;
494
573
  ruvectorModule = null;
495
574
  }
575
+ /**
576
+ * Pretrain cache with texts from files
577
+ * Embeds content and stores in persistent cache for fast retrieval
578
+ *
579
+ * @param sources - File paths or glob patterns, or array of texts
580
+ * @param options - Pretrain options
581
+ * @returns Stats about pretraining
582
+ */
583
+ async pretrain(sources, options = {}) {
584
+ const { batchSize = 32, onProgress, chunkSize = 512, overlapSize = 64, skipCached = true } = options;
585
+ const startTime = performance.now();
586
+ let processed = 0;
587
+ let cached = 0;
588
+ let skipped = 0;
589
+ // Resolve texts to embed
590
+ const texts = [];
591
+ if (typeof sources === 'string') {
592
+ sources = [sources];
593
+ }
594
+ for (const source of sources) {
595
+ // Check if it's a file path or glob pattern
596
+ if (source.includes('/') || source.includes('*') || source.includes('.')) {
597
+ try {
598
+ const fs = await import('fs');
599
+ const path = await import('path');
600
+ const { glob } = await import('glob').catch(() => ({ glob: null }));
601
+ // Handle glob patterns
602
+ let files = [];
603
+ if (source.includes('*') && glob) {
604
+ files = await glob(source);
605
+ }
606
+ else if (fs.existsSync(source)) {
607
+ files = [source];
608
+ }
609
+ for (const file of files) {
610
+ try {
611
+ const content = fs.readFileSync(file, 'utf-8');
612
+ // Chunk large files
613
+ if (content.length > chunkSize * 2) {
614
+ for (let i = 0; i < content.length; i += chunkSize - overlapSize) {
615
+ const chunk = content.slice(i, i + chunkSize);
616
+ if (chunk.trim().length > 10) {
617
+ texts.push(chunk);
618
+ }
619
+ }
620
+ }
621
+ else if (content.trim().length > 10) {
622
+ texts.push(content);
623
+ }
624
+ }
625
+ catch {
626
+ // Skip unreadable files
627
+ }
628
+ }
629
+ }
630
+ catch {
631
+ // Treat as plain text if file operations fail
632
+ texts.push(source);
633
+ }
634
+ }
635
+ else {
636
+ texts.push(source);
637
+ }
638
+ }
639
+ // Filter out already cached texts
640
+ const toEmbed = [];
641
+ for (const text of texts) {
642
+ if (skipCached && this.persistentCache?.has(text, this.modelName)) {
643
+ skipped++;
644
+ }
645
+ else {
646
+ toEmbed.push(text);
647
+ }
648
+ }
649
+ // Embed in batches
650
+ for (let i = 0; i < toEmbed.length; i += batchSize) {
651
+ const batch = toEmbed.slice(i, i + batchSize);
652
+ const embeddings = await this.embedBatch(batch);
653
+ // Store in persistent cache (embedBatch already handles this for ONNX)
654
+ cached += embeddings.length;
655
+ processed += batch.length;
656
+ if (onProgress) {
657
+ onProgress(processed, toEmbed.length);
658
+ }
659
+ }
660
+ return {
661
+ processed,
662
+ cached,
663
+ skipped,
664
+ timeMs: performance.now() - startTime,
665
+ };
666
+ }
667
+ /**
668
+ * Pretrain with common programming patterns
669
+ * Pre-caches embeddings for frequently used code patterns
670
+ */
671
+ async pretrainCodePatterns() {
672
+ const patterns = [
673
+ // Common programming constructs
674
+ 'function implementation',
675
+ 'class definition',
676
+ 'interface declaration',
677
+ 'type alias',
678
+ 'import statement',
679
+ 'export module',
680
+ 'async await pattern',
681
+ 'promise handling',
682
+ 'error handling try catch',
683
+ 'conditional logic if else',
684
+ 'loop iteration for while',
685
+ 'array map filter reduce',
686
+ 'object destructuring',
687
+ 'spread operator',
688
+ 'rest parameters',
689
+ // Code operations
690
+ 'refactor code',
691
+ 'fix bug',
692
+ 'add feature',
693
+ 'write tests',
694
+ 'add documentation',
695
+ 'optimize performance',
696
+ 'improve readability',
697
+ 'handle edge cases',
698
+ 'add validation',
699
+ 'implement authentication',
700
+ // File types
701
+ 'TypeScript file',
702
+ 'JavaScript module',
703
+ 'React component',
704
+ 'Vue component',
705
+ 'CSS stylesheet',
706
+ 'JSON configuration',
707
+ 'Markdown documentation',
708
+ 'Python script',
709
+ 'Shell script',
710
+ 'SQL query',
711
+ // Agent routing patterns
712
+ 'code review task',
713
+ 'architecture design',
714
+ 'testing strategy',
715
+ 'debugging session',
716
+ 'performance analysis',
717
+ 'security audit',
718
+ 'documentation update',
719
+ 'API design',
720
+ 'database schema',
721
+ 'deployment configuration',
722
+ ];
723
+ const startTime = performance.now();
724
+ const embeddings = await this.embedBatch(patterns);
725
+ return {
726
+ cached: embeddings.length,
727
+ timeMs: performance.now() - startTime,
728
+ };
729
+ }
730
+ /**
731
+ * Pretrain from repository structure
732
+ * Analyzes file names and paths to pre-cache common patterns
733
+ */
734
+ async pretrainFromRepo(repoPath = '.') {
735
+ const startTime = performance.now();
736
+ let files = 0;
737
+ let chunks = 0;
738
+ try {
739
+ const fs = await import('fs');
740
+ const path = await import('path');
741
+ // Common code file extensions
742
+ const extensions = ['.ts', '.tsx', '.js', '.jsx', '.py', '.md', '.json'];
743
+ const walkDir = (dir) => {
744
+ try {
745
+ const entries = fs.readdirSync(dir, { withFileTypes: true });
746
+ for (const entry of entries) {
747
+ const fullPath = path.join(dir, entry.name);
748
+ if (entry.isDirectory()) {
749
+ // Skip node_modules, .git, etc.
750
+ if (!entry.name.startsWith('.') && entry.name !== 'node_modules' && entry.name !== 'dist') {
751
+ walkDir(fullPath);
752
+ }
753
+ }
754
+ else if (extensions.some(ext => entry.name.endsWith(ext))) {
755
+ return fullPath;
756
+ }
757
+ }
758
+ }
759
+ catch {
760
+ // Skip unreadable directories
761
+ }
762
+ return null;
763
+ };
764
+ // Collect files
765
+ const filePaths = [];
766
+ const collectFiles = (dir) => {
767
+ try {
768
+ const entries = fs.readdirSync(dir, { withFileTypes: true });
769
+ for (const entry of entries) {
770
+ const fullPath = path.join(dir, entry.name);
771
+ if (entry.isDirectory()) {
772
+ if (!entry.name.startsWith('.') && entry.name !== 'node_modules' && entry.name !== 'dist') {
773
+ collectFiles(fullPath);
774
+ }
775
+ }
776
+ else if (extensions.some(ext => entry.name.endsWith(ext))) {
777
+ filePaths.push(fullPath);
778
+ }
779
+ }
780
+ }
781
+ catch {
782
+ // Skip unreadable
783
+ }
784
+ };
785
+ collectFiles(repoPath);
786
+ files = filePaths.length;
787
+ // Pretrain from collected files
788
+ if (filePaths.length > 0) {
789
+ const result = await this.pretrain(filePaths, {
790
+ batchSize: 16,
791
+ chunkSize: 512,
792
+ overlapSize: 64,
793
+ });
794
+ chunks = result.cached;
795
+ }
796
+ }
797
+ catch (err) {
798
+ // Repository analysis failed
799
+ }
800
+ return {
801
+ files,
802
+ chunks,
803
+ timeMs: performance.now() - startTime,
804
+ };
805
+ }
806
+ /**
807
+ * Incremental pretrain - only process changed files since last run
808
+ * Uses git diff to detect modified files
809
+ */
810
+ async pretrainIncremental(options = {}) {
811
+ const { since = 'HEAD~10', repoPath = '.' } = options;
812
+ const startTime = performance.now();
813
+ let changedFiles = 0;
814
+ let newChunks = 0;
815
+ let skipped = 0;
816
+ try {
817
+ const { execSync } = await import('child_process');
818
+ const path = await import('path');
819
+ const fs = await import('fs');
820
+ // Get changed files from git
821
+ const gitOutput = execSync(`git diff --name-only ${since}`, {
822
+ cwd: repoPath,
823
+ encoding: 'utf-8',
824
+ });
825
+ const changedPaths = gitOutput
826
+ .split('\n')
827
+ .filter(f => f.trim())
828
+ .map(f => path.join(repoPath, f))
829
+ .filter(f => {
830
+ try {
831
+ return fs.existsSync(f) && fs.statSync(f).isFile();
832
+ }
833
+ catch {
834
+ return false;
835
+ }
836
+ });
837
+ changedFiles = changedPaths.length;
838
+ if (changedPaths.length > 0) {
839
+ const result = await this.pretrain(changedPaths, {
840
+ batchSize: 16,
841
+ chunkSize: 512,
842
+ overlapSize: 64,
843
+ skipCached: true,
844
+ });
845
+ newChunks = result.cached;
846
+ skipped = result.skipped;
847
+ }
848
+ }
849
+ catch {
850
+ // Git not available or not a repo
851
+ }
852
+ return {
853
+ changedFiles,
854
+ newChunks,
855
+ skipped,
856
+ timeMs: performance.now() - startTime,
857
+ };
858
+ }
859
+ /**
860
+ * Smart chunking - split code by semantic boundaries
861
+ * (functions, classes, etc.) instead of fixed size
862
+ */
863
+ semanticChunk(content, fileType) {
864
+ const chunks = [];
865
+ // TypeScript/JavaScript patterns
866
+ if (['.ts', '.tsx', '.js', '.jsx'].some(ext => fileType.endsWith(ext))) {
867
+ // Split on function/class/interface boundaries
868
+ const patterns = [
869
+ /^(export\s+)?(async\s+)?function\s+\w+/gm,
870
+ /^(export\s+)?class\s+\w+/gm,
871
+ /^(export\s+)?interface\s+\w+/gm,
872
+ /^(export\s+)?type\s+\w+/gm,
873
+ /^(export\s+)?const\s+\w+\s*=/gm,
874
+ ];
875
+ let lastIndex = 0;
876
+ const boundaries = [0];
877
+ for (const pattern of patterns) {
878
+ let match;
879
+ while ((match = pattern.exec(content)) !== null) {
880
+ boundaries.push(match.index);
881
+ }
882
+ }
883
+ boundaries.push(content.length);
884
+ boundaries.sort((a, b) => a - b);
885
+ // Extract chunks between boundaries
886
+ for (let i = 0; i < boundaries.length - 1; i++) {
887
+ const chunk = content.slice(boundaries[i], boundaries[i + 1]).trim();
888
+ if (chunk.length > 20 && chunk.length < 2000) {
889
+ chunks.push(chunk);
890
+ }
891
+ }
892
+ }
893
+ // Python patterns
894
+ else if (fileType.endsWith('.py')) {
895
+ const patterns = [
896
+ /^(async\s+)?def\s+\w+/gm,
897
+ /^class\s+\w+/gm,
898
+ ];
899
+ const boundaries = [0];
900
+ for (const pattern of patterns) {
901
+ let match;
902
+ while ((match = pattern.exec(content)) !== null) {
903
+ boundaries.push(match.index);
904
+ }
905
+ }
906
+ boundaries.push(content.length);
907
+ boundaries.sort((a, b) => a - b);
908
+ for (let i = 0; i < boundaries.length - 1; i++) {
909
+ const chunk = content.slice(boundaries[i], boundaries[i + 1]).trim();
910
+ if (chunk.length > 20 && chunk.length < 2000) {
911
+ chunks.push(chunk);
912
+ }
913
+ }
914
+ }
915
+ // Markdown - split by headers
916
+ else if (fileType.endsWith('.md')) {
917
+ const sections = content.split(/^#+\s+/gm);
918
+ for (const section of sections) {
919
+ if (section.trim().length > 20) {
920
+ chunks.push(section.trim().slice(0, 1000));
921
+ }
922
+ }
923
+ }
924
+ // Fallback to fixed-size chunking
925
+ if (chunks.length === 0) {
926
+ const chunkSize = 512;
927
+ const overlap = 64;
928
+ for (let i = 0; i < content.length; i += chunkSize - overlap) {
929
+ const chunk = content.slice(i, i + chunkSize);
930
+ if (chunk.trim().length > 20) {
931
+ chunks.push(chunk);
932
+ }
933
+ }
934
+ }
935
+ return chunks;
936
+ }
937
+ /**
938
+ * Pretrain with semantic chunking
939
+ * Uses code structure to create meaningful chunks
940
+ */
941
+ async pretrainSemantic(sources, options = {}) {
942
+ const { batchSize = 32, onProgress } = options;
943
+ const startTime = performance.now();
944
+ let fileCount = 0;
945
+ let chunkCount = 0;
946
+ const allChunks = [];
947
+ try {
948
+ const fs = await import('fs');
949
+ const path = await import('path');
950
+ for (const source of sources) {
951
+ if (fs.existsSync(source)) {
952
+ try {
953
+ const content = fs.readFileSync(source, 'utf-8');
954
+ const ext = path.extname(source);
955
+ const chunks = this.semanticChunk(content, ext);
956
+ allChunks.push(...chunks);
957
+ fileCount++;
958
+ }
959
+ catch {
960
+ // Skip unreadable files
961
+ }
962
+ }
963
+ }
964
+ // Embed and cache all chunks
965
+ for (let i = 0; i < allChunks.length; i += batchSize) {
966
+ const batch = allChunks.slice(i, i + batchSize);
967
+ await this.embedBatch(batch);
968
+ chunkCount += batch.length;
969
+ if (onProgress) {
970
+ onProgress(chunkCount, allChunks.length);
971
+ }
972
+ }
973
+ }
974
+ catch {
975
+ // Pretrain failed
976
+ }
977
+ return {
978
+ files: fileCount,
979
+ chunks: chunkCount,
980
+ timeMs: performance.now() - startTime,
981
+ };
982
+ }
983
+ /**
984
+ * Priority pretrain - cache most frequently used patterns first
985
+ * Tracks access patterns and prioritizes high-frequency queries
986
+ */
987
+ accessCounts = new Map();
988
+ recordAccess(text) {
989
+ this.accessCounts.set(text, (this.accessCounts.get(text) || 0) + 1);
990
+ }
991
+ getTopPatterns(n = 100) {
992
+ return Array.from(this.accessCounts.entries())
993
+ .sort((a, b) => b[1] - a[1])
994
+ .slice(0, n)
995
+ .map(([text]) => text);
996
+ }
997
+ async pretrainPriority(n = 100) {
998
+ const topPatterns = this.getTopPatterns(n);
999
+ const startTime = performance.now();
1000
+ if (topPatterns.length > 0) {
1001
+ await this.embedBatch(topPatterns);
1002
+ }
1003
+ return {
1004
+ cached: topPatterns.length,
1005
+ timeMs: performance.now() - startTime,
1006
+ };
1007
+ }
1008
+ /**
1009
+ * Warmup cache on session start
1010
+ * Combines code patterns + recent repo changes
1011
+ */
1012
+ async warmup(repoPath = '.') {
1013
+ const startTime = performance.now();
1014
+ // First: load common patterns
1015
+ const patternResult = await this.pretrainCodePatterns();
1016
+ // Second: load recent git changes
1017
+ const incrementalResult = await this.pretrainIncremental({
1018
+ since: 'HEAD~5',
1019
+ repoPath,
1020
+ });
1021
+ return {
1022
+ patterns: patternResult.cached,
1023
+ recentChanges: incrementalResult.newChunks,
1024
+ timeMs: performance.now() - startTime,
1025
+ };
1026
+ }
1027
+ /**
1028
+ * Intelligent pretrain using ruvector worker pool
1029
+ * Analyzes repo structure, code patterns, and prepares cache
1030
+ * Uses parallel workers for maximum throughput
1031
+ */
1032
+ async pretrainIntelligent(options = {}) {
1033
+ const { repoPath = '.', parallel = true, onProgress } = options;
1034
+ const startTime = performance.now();
1035
+ const stages = {
1036
+ codePatterns: { count: 0, timeMs: 0 },
1037
+ astAnalysis: { files: 0, functions: 0, timeMs: 0 },
1038
+ gitHistory: { commits: 0, hotFiles: 0, timeMs: 0 },
1039
+ dependencies: { modules: 0, imports: 0, timeMs: 0 },
1040
+ semanticChunks: { chunks: 0, timeMs: 0 },
1041
+ };
1042
+ let totalCached = 0;
1043
+ try {
1044
+ // Stage 1: Code patterns (common programming patterns)
1045
+ onProgress?.('codePatterns', 0);
1046
+ const stage1Start = performance.now();
1047
+ const patternResult = await this.pretrainCodePatterns();
1048
+ stages.codePatterns = {
1049
+ count: patternResult.cached,
1050
+ timeMs: performance.now() - stage1Start,
1051
+ };
1052
+ totalCached += patternResult.cached;
1053
+ onProgress?.('codePatterns', 100);
1054
+ // Stage 2: AST Analysis using ruvector workers (if available)
1055
+ onProgress?.('astAnalysis', 0);
1056
+ const stage2Start = performance.now();
1057
+ try {
1058
+ if (ruvectorModule && parallel) {
1059
+ // Use ruvector's analyzeFilesParallel if available
1060
+ const mod = ruvectorModule;
1061
+ if (mod.analyzeFilesParallel) {
1062
+ const fs = await import('fs');
1063
+ const path = await import('path');
1064
+ // Collect source files
1065
+ const sourceFiles = [];
1066
+ const collectSources = (dir) => {
1067
+ try {
1068
+ const entries = fs.readdirSync(dir, { withFileTypes: true });
1069
+ for (const entry of entries) {
1070
+ const fullPath = path.join(dir, entry.name);
1071
+ if (entry.isDirectory()) {
1072
+ if (!entry.name.startsWith('.') && entry.name !== 'node_modules' && entry.name !== 'dist') {
1073
+ collectSources(fullPath);
1074
+ }
1075
+ }
1076
+ else if (['.ts', '.tsx', '.js', '.jsx'].some(ext => entry.name.endsWith(ext))) {
1077
+ sourceFiles.push(fullPath);
1078
+ }
1079
+ }
1080
+ }
1081
+ catch { }
1082
+ };
1083
+ collectSources(repoPath);
1084
+ // Analyze in parallel
1085
+ const astResult = await mod.analyzeFilesParallel(sourceFiles.slice(0, 100));
1086
+ stages.astAnalysis = {
1087
+ files: sourceFiles.length,
1088
+ functions: astResult?.functions || 0,
1089
+ timeMs: performance.now() - stage2Start,
1090
+ };
1091
+ // Extract function signatures for caching
1092
+ if (astResult?.signatures) {
1093
+ await this.embedBatch(astResult.signatures.slice(0, 200));
1094
+ totalCached += Math.min(astResult.signatures.length, 200);
1095
+ }
1096
+ }
1097
+ }
1098
+ }
1099
+ catch { }
1100
+ onProgress?.('astAnalysis', 100);
1101
+ // Stage 3: Git history analysis (hot files = frequently changed)
1102
+ onProgress?.('gitHistory', 0);
1103
+ const stage3Start = performance.now();
1104
+ try {
1105
+ const { execSync } = await import('child_process');
1106
+ // Get commit count
1107
+ const commitCount = execSync('git rev-list --count HEAD', {
1108
+ cwd: repoPath,
1109
+ encoding: 'utf-8',
1110
+ }).trim();
1111
+ // Get hot files (most frequently changed)
1112
+ const hotFilesOutput = execSync('git log --format="" --name-only -n 100 | sort | uniq -c | sort -rn | head -20', { cwd: repoPath, encoding: 'utf-8' });
1113
+ const hotFiles = hotFilesOutput
1114
+ .split('\n')
1115
+ .filter(l => l.trim())
1116
+ .map(l => l.trim().split(/\s+/).slice(1).join(' '))
1117
+ .filter(f => f);
1118
+ stages.gitHistory = {
1119
+ commits: parseInt(commitCount) || 0,
1120
+ hotFiles: hotFiles.length,
1121
+ timeMs: performance.now() - stage3Start,
1122
+ };
1123
+ // Pretrain hot files
1124
+ if (hotFiles.length > 0) {
1125
+ const fs = await import('fs');
1126
+ const path = await import('path');
1127
+ const validFiles = hotFiles
1128
+ .map(f => path.join(repoPath, f))
1129
+ .filter(f => fs.existsSync(f));
1130
+ if (validFiles.length > 0) {
1131
+ const result = await this.pretrainSemantic(validFiles, { batchSize: 16 });
1132
+ totalCached += result.chunks;
1133
+ }
1134
+ }
1135
+ }
1136
+ catch { }
1137
+ onProgress?.('gitHistory', 100);
1138
+ // Stage 4: Dependency analysis
1139
+ onProgress?.('dependencies', 0);
1140
+ const stage4Start = performance.now();
1141
+ try {
1142
+ const fs = await import('fs');
1143
+ const path = await import('path');
1144
+ // Parse package.json for dependencies
1145
+ const pkgPath = path.join(repoPath, 'package.json');
1146
+ if (fs.existsSync(pkgPath)) {
1147
+ const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf-8'));
1148
+ const deps = Object.keys(pkg.dependencies || {});
1149
+ const devDeps = Object.keys(pkg.devDependencies || {});
1150
+ const allDeps = [...deps, ...devDeps];
1151
+ stages.dependencies = {
1152
+ modules: allDeps.length,
1153
+ imports: 0,
1154
+ timeMs: performance.now() - stage4Start,
1155
+ };
1156
+ // Cache dependency names for import resolution
1157
+ if (allDeps.length > 0) {
1158
+ const depPatterns = allDeps.map(d => `import from ${d}`);
1159
+ await this.embedBatch(depPatterns);
1160
+ totalCached += depPatterns.length;
1161
+ }
1162
+ }
1163
+ }
1164
+ catch { }
1165
+ onProgress?.('dependencies', 100);
1166
+ // Stage 5: Semantic chunking with parallel embedding
1167
+ onProgress?.('semanticChunks', 0);
1168
+ const stage5Start = performance.now();
1169
+ try {
1170
+ const incrementalResult = await this.pretrainIncremental({
1171
+ since: 'HEAD~20',
1172
+ repoPath,
1173
+ });
1174
+ stages.semanticChunks = {
1175
+ chunks: incrementalResult.newChunks,
1176
+ timeMs: performance.now() - stage5Start,
1177
+ };
1178
+ totalCached += incrementalResult.newChunks;
1179
+ }
1180
+ catch { }
1181
+ onProgress?.('semanticChunks', 100);
1182
+ }
1183
+ catch (err) {
1184
+ // Pretrain failed, return partial results
1185
+ }
1186
+ return {
1187
+ stages,
1188
+ totalCached,
1189
+ totalTimeMs: performance.now() - startTime,
1190
+ };
1191
+ }
1192
+ /**
1193
+ * Background pretrain - runs in worker if available
1194
+ * Non-blocking, returns immediately with a promise
1195
+ */
1196
+ pretrainBackground(options = {}) {
1197
+ let cancelled = false;
1198
+ const promise = (async () => {
1199
+ if (cancelled)
1200
+ return;
1201
+ // Run warmup in background
1202
+ await this.warmup(options.repoPath);
1203
+ if (cancelled)
1204
+ return;
1205
+ // Then run intelligent pretrain
1206
+ await this.pretrainIntelligent({
1207
+ ...options,
1208
+ parallel: true,
1209
+ });
1210
+ })();
1211
+ return {
1212
+ promise,
1213
+ cancel: () => { cancelled = true; },
1214
+ };
1215
+ }
1216
+ /**
1217
+ * AI-enhanced pretrain using ruvector attention mechanisms
1218
+ * Uses HyperbolicAttention for code structure, MoE for routing
1219
+ */
1220
+ async pretrainWithAI(options = {}) {
1221
+ const { repoPath = '.', attentionType = 'auto', onProgress } = options;
1222
+ const startTime = performance.now();
1223
+ const patterns = [];
1224
+ let totalCached = 0;
1225
+ let attentionInfo = { type: 'none', timeMs: 0 };
1226
+ let predictions = { prefetch: 0, confidence: 0 };
1227
+ try {
1228
+ const mod = ruvectorModule;
1229
+ // Step 1: Determine best attention type for codebase
1230
+ onProgress?.('attention', 'Selecting optimal attention mechanism...');
1231
+ let selectedAttention = attentionType;
1232
+ if (attentionType === 'auto' && mod) {
1233
+ // Use getAttentionForUseCase if available
1234
+ if (mod.getAttentionForUseCase) {
1235
+ const result = await mod.getAttentionForUseCase('code_analysis');
1236
+ selectedAttention = result?.type || 'hyperbolic';
1237
+ }
1238
+ else {
1239
+ // Default to hyperbolic for hierarchical code structure
1240
+ selectedAttention = 'hyperbolic';
1241
+ }
1242
+ }
1243
+ attentionInfo.type = selectedAttention;
1244
+ const attentionStart = performance.now();
1245
+ // Step 2: Use attention to identify important code regions
1246
+ onProgress?.('analysis', `Using ${selectedAttention} attention for code analysis...`);
1247
+ if (mod) {
1248
+ // Collect code samples for attention-based analysis
1249
+ const fs = await import('fs');
1250
+ const path = await import('path');
1251
+ const codeSamples = [];
1252
+ const collectCode = (dir, maxFiles = 50) => {
1253
+ if (codeSamples.length >= maxFiles)
1254
+ return;
1255
+ try {
1256
+ const entries = fs.readdirSync(dir, { withFileTypes: true });
1257
+ for (const entry of entries) {
1258
+ if (codeSamples.length >= maxFiles)
1259
+ break;
1260
+ const fullPath = path.join(dir, entry.name);
1261
+ if (entry.isDirectory()) {
1262
+ if (!entry.name.startsWith('.') && entry.name !== 'node_modules' && entry.name !== 'dist') {
1263
+ collectCode(fullPath, maxFiles);
1264
+ }
1265
+ }
1266
+ else if (['.ts', '.tsx', '.js', '.jsx'].some(ext => entry.name.endsWith(ext))) {
1267
+ try {
1268
+ const content = fs.readFileSync(fullPath, 'utf-8');
1269
+ if (content.length < 5000) {
1270
+ codeSamples.push(content);
1271
+ }
1272
+ }
1273
+ catch { }
1274
+ }
1275
+ }
1276
+ }
1277
+ catch { }
1278
+ };
1279
+ collectCode(repoPath);
1280
+ // Step 3: Use attention mechanisms to weight code importance
1281
+ if (mod.HyperbolicAttention && selectedAttention === 'hyperbolic') {
1282
+ try {
1283
+ // Hyperbolic attention for hierarchical code structure
1284
+ const attention = new mod.HyperbolicAttention({ dim: 384 });
1285
+ // Identify structural patterns (classes, functions, imports)
1286
+ const structuralPatterns = [
1287
+ 'class definition with constructor',
1288
+ 'async function with error handling',
1289
+ 'interface with multiple properties',
1290
+ 'type with generics',
1291
+ 'import statement block',
1292
+ 'export default component',
1293
+ 'hook implementation useEffect',
1294
+ 'API endpoint handler',
1295
+ 'database query function',
1296
+ 'authentication middleware',
1297
+ ];
1298
+ await this.embedBatch(structuralPatterns);
1299
+ patterns.push({ type: 'structural', count: structuralPatterns.length });
1300
+ totalCached += structuralPatterns.length;
1301
+ }
1302
+ catch { }
1303
+ }
1304
+ if (mod.MoEAttention && selectedAttention === 'moe') {
1305
+ try {
1306
+ // MoE for routing different code patterns to experts
1307
+ const routingPatterns = [
1308
+ // Expert 1: Frontend
1309
+ 'React component with state',
1310
+ 'Vue component with props',
1311
+ 'CSS styling module',
1312
+ // Expert 2: Backend
1313
+ 'Express route handler',
1314
+ 'GraphQL resolver',
1315
+ 'REST API endpoint',
1316
+ // Expert 3: Data
1317
+ 'SQL query builder',
1318
+ 'MongoDB aggregation',
1319
+ 'Redis cache operation',
1320
+ // Expert 4: Testing
1321
+ 'Jest test case',
1322
+ 'E2E test scenario',
1323
+ 'Mock implementation',
1324
+ ];
1325
+ await this.embedBatch(routingPatterns);
1326
+ patterns.push({ type: 'routing', count: routingPatterns.length });
1327
+ totalCached += routingPatterns.length;
1328
+ }
1329
+ catch { }
1330
+ }
1331
+ if (mod.GraphRoPeAttention && selectedAttention === 'graph') {
1332
+ try {
1333
+ // Graph attention for dependency understanding
1334
+ const graphPatterns = [
1335
+ 'module exports',
1336
+ 'circular dependency',
1337
+ 'shared utility import',
1338
+ 'type re-export',
1339
+ 'barrel file index',
1340
+ 'lazy import dynamic',
1341
+ 'peer dependency',
1342
+ 'optional dependency',
1343
+ ];
1344
+ await this.embedBatch(graphPatterns);
1345
+ patterns.push({ type: 'graph', count: graphPatterns.length });
1346
+ totalCached += graphPatterns.length;
1347
+ }
1348
+ catch { }
1349
+ }
1350
+ attentionInfo.timeMs = performance.now() - attentionStart;
1351
+ // Step 4: FastGRNN for pattern prediction (if available)
1352
+ onProgress?.('prediction', 'Training pattern predictor...');
1353
+ if (mod.FastGRNN) {
1354
+ try {
1355
+ // Use recent access patterns to predict what's needed next
1356
+ const topPatterns = this.getTopPatterns(50);
1357
+ if (topPatterns.length > 0) {
1358
+ // Prefetch predicted patterns
1359
+ const prefetchPatterns = [
1360
+ ...topPatterns.slice(0, 20),
1361
+ // Add related patterns
1362
+ ...topPatterns.slice(0, 10).map(p => `similar to: ${p}`),
1363
+ ];
1364
+ await this.embedBatch(prefetchPatterns);
1365
+ predictions = {
1366
+ prefetch: prefetchPatterns.length,
1367
+ confidence: 0.85, // Estimated based on access history
1368
+ };
1369
+ totalCached += prefetchPatterns.length;
1370
+ }
1371
+ }
1372
+ catch { }
1373
+ }
1374
+ }
1375
+ // Step 5: Standard warmup
1376
+ onProgress?.('warmup', 'Running standard warmup...');
1377
+ const warmupResult = await this.warmup(repoPath);
1378
+ totalCached += warmupResult.patterns + warmupResult.recentChanges;
1379
+ patterns.push({ type: 'warmup', count: warmupResult.patterns + warmupResult.recentChanges });
1380
+ }
1381
+ catch (err) {
1382
+ // AI pretrain failed, continue with basic
1383
+ }
1384
+ return {
1385
+ patterns,
1386
+ attention: attentionInfo,
1387
+ predictions,
1388
+ totalCached,
1389
+ totalTimeMs: performance.now() - startTime,
1390
+ };
1391
+ }
1392
+ /**
1393
+ * Context-aware prefetch using attention
1394
+ * Predicts what embeddings will be needed based on current context
1395
+ */
1396
+ async prefetchForContext(context) {
1397
+ const startTime = performance.now();
1398
+ let prefetched = 0;
1399
+ let confidence = 0;
1400
+ try {
1401
+ const patterns = [];
1402
+ // Add patterns based on current file type
1403
+ if (context.currentFile) {
1404
+ const ext = context.currentFile.split('.').pop() || '';
1405
+ const filePatterns = {
1406
+ ts: ['TypeScript type checking', 'interface implementation', 'generic types'],
1407
+ tsx: ['React component', 'JSX rendering', 'hook usage'],
1408
+ js: ['JavaScript module', 'CommonJS require', 'ES6 import'],
1409
+ jsx: ['React component', 'JSX element', 'props handling'],
1410
+ py: ['Python function', 'class method', 'import statement'],
1411
+ md: ['documentation', 'README section', 'code example'],
1412
+ };
1413
+ patterns.push(...(filePatterns[ext] || []));
1414
+ }
1415
+ // Add patterns based on task type
1416
+ if (context.taskType) {
1417
+ const taskPatterns = {
1418
+ edit: ['code modification', 'variable rename', 'function update'],
1419
+ review: ['code review', 'bug detection', 'style check'],
1420
+ debug: ['error trace', 'stack analysis', 'variable inspection'],
1421
+ test: ['test case', 'assertion', 'mock setup'],
1422
+ refactor: ['code cleanup', 'pattern extraction', 'abstraction'],
1423
+ };
1424
+ patterns.push(...(taskPatterns[context.taskType] || []));
1425
+ }
1426
+ // Add patterns based on user query similarity
1427
+ if (context.userQuery) {
1428
+ patterns.push(context.userQuery);
1429
+ // Add variations
1430
+ patterns.push(`how to ${context.userQuery}`);
1431
+ patterns.push(`implement ${context.userQuery}`);
1432
+ }
1433
+ if (patterns.length > 0) {
1434
+ await this.embedBatch(patterns);
1435
+ prefetched = patterns.length;
1436
+ confidence = Math.min(0.9, 0.5 + patterns.length * 0.05);
1437
+ }
1438
+ }
1439
+ catch {
1440
+ // Prefetch failed
1441
+ }
1442
+ return {
1443
+ prefetched,
1444
+ confidence,
1445
+ timeMs: performance.now() - startTime,
1446
+ };
1447
+ }
496
1448
  }
497
1449
  // Export singleton getter
498
1450
  export function getEmbeddingService() {
@@ -505,6 +1457,12 @@ export async function embed(text) {
505
1457
/**
 * Embed several texts through the shared EmbeddingService instance.
 *
 * @param texts - Texts to embed
 * @returns Whatever the service's `embedBatch` resolves to
 */
export async function embedBatch(texts) {
  const service = getEmbeddingService();
  return service.embedBatch(texts);
}
1460
/**
 * Run `pretrainCodePatterns` on the shared EmbeddingService instance.
 *
 * @returns Whatever the service's `pretrainCodePatterns` resolves to
 */
export async function pretrainCodePatterns() {
  const service = getEmbeddingService();
  return service.pretrainCodePatterns();
}
1463
/**
 * Run `pretrainFromRepo` on the shared EmbeddingService instance.
 *
 * @param repoPath - Repository directory to scan (default '.')
 * @returns Whatever the service's `pretrainFromRepo` resolves to
 */
export async function pretrainFromRepo(repoPath = '.') {
  const service = getEmbeddingService();
  return service.pretrainFromRepo(repoPath);
}
508
1466
/**
 * Compare two texts through the shared EmbeddingService instance.
 *
 * @param text1 - First text
 * @param text2 - Second text
 * @returns Whatever the service's `similarity` resolves to
 */
export async function textSimilarity(text1, text2) {
  const service = getEmbeddingService();
  return service.similarity(text1, text2);
}