@sparkleideas/embeddings 3.0.0-alpha.17 → 3.0.0-alpha.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,26 +4,27 @@
4
4
  [![npm downloads](https://img.shields.io/npm/dm/@claude-flow/embeddings.svg)](https://www.npmjs.com/package/@claude-flow/embeddings)
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
6
  [![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-blue.svg)](https://www.typescriptlang.org/)
7
- [![Performance](https://img.shields.io/badge/Performance-<100ms-brightgreen.svg)](https://github.com/ruvnet/claude-flow)
7
+ [![Performance](https://img.shields.io/badge/Performance-<5ms-brightgreen.svg)](https://github.com/ruvnet/claude-flow)
8
8
 
9
- > High-performance embedding generation module for Claude Flow V3 - multi-provider support, LRU caching, batch processing, and similarity computation.
9
+ > High-performance embedding generation module for Claude Flow V3 - multi-provider support with persistent caching, document chunking, normalization, hyperbolic embeddings, and neural substrate integration.
10
10
 
11
11
  ## Features
12
12
 
13
- - **Multiple Providers** - OpenAI, Transformers.js (local), and Mock for testing
14
- - **LRU Caching** - Intelligent caching with configurable size and hit rate tracking
13
+ ### Core Embedding
14
+ - **Multiple Providers** - Agentic-Flow (ONNX), OpenAI, Transformers.js, and Mock
15
+ - **Auto-Install** - Automatically installs agentic-flow when using `provider: 'auto'`
16
+ - **Smart Fallback** - Graceful fallback chain: agentic-flow → transformers → mock
17
+ - **LRU + Disk Caching** - In-memory LRU + SQLite persistent cache with TTL
15
18
  - **Batch Processing** - Efficient batch embedding with partial cache hits
16
19
  - **Similarity Functions** - Cosine, Euclidean, and dot product metrics
17
- - **Event System** - Observable embedding operations with event listeners
18
- - **Type-Safe** - Full TypeScript support with comprehensive type definitions
20
+ - **75x Faster** - Agentic-flow ONNX is 75x faster than Transformers.js
19
21
 
20
- ## Performance Targets
21
-
22
- | Operation | API Provider | Local Provider |
23
- |-----------|--------------|----------------|
24
- | Single embedding | <100ms | <50ms |
25
- | Batch (10 items) | <500ms | <200ms |
26
- | Cache hit | <1ms | <1ms |
22
+ ### Advanced Features (New in v3.0.0-alpha.11)
23
+ - **Document Chunking** - Character, sentence, paragraph, and token-based chunking with overlap
24
+ - **Multiple Normalization Methods** - L2, L1, min-max, and z-score normalization
25
+ - **Hyperbolic Embeddings** - Poincaré ball model for hierarchical representations
26
+ - **Neural Substrate** - Semantic drift detection, memory physics, swarm coordination
27
+ - **Persistent Cache** - SQLite-backed disk cache with LRU eviction and TTL
27
28
 
28
29
  ## Installation
29
30
 
@@ -66,19 +67,45 @@ const similarity = cosineSimilarity(
66
67
  console.log(`Similarity: ${similarity.toFixed(4)}`);
67
68
  ```
68
69
 
70
+ ## CLI Usage
71
+
72
+ ```bash
73
+ # Generate embedding from CLI
74
+ claude-flow embeddings embed "Your text here"
75
+
76
+ # Batch embed from file
77
+ claude-flow embeddings batch documents.txt -o embeddings.json
78
+
79
+ # Similarity search
80
+ claude-flow embeddings search "query" --index ./vectors
81
+
82
+ # Initialize agentic-flow model
83
+ claude-flow embeddings init --provider agentic-flow
84
+ ```
85
+
69
86
  ## API Reference
70
87
 
71
88
  ### Factory Functions
72
89
 
73
90
  ```typescript
74
- import { createEmbeddingService, getEmbedding } from '@claude-flow/embeddings';
91
+ import {
92
+ createEmbeddingService,
93
+ createEmbeddingServiceAsync,
94
+ getEmbedding
95
+ } from '@claude-flow/embeddings';
75
96
 
76
- // Create a service instance
97
+ // Sync: Create with known provider
77
98
  const service = createEmbeddingService({
78
99
  provider: 'openai',
79
100
  apiKey: 'your-api-key',
80
101
  model: 'text-embedding-3-small',
81
- cacheSize: 1000,
102
+ });
103
+
104
+ // Async: Auto-select best provider with fallback
105
+ const autoService = await createEmbeddingServiceAsync({
106
+ provider: 'auto', // agentic-flow → transformers → mock
107
+ autoInstall: true, // Install agentic-flow if missing
108
+ fallback: 'transformers', // Custom fallback
82
109
  });
83
110
 
84
111
  // Quick one-off embedding
@@ -108,6 +135,22 @@ const result = await service.embed('Your text here');
108
135
  console.log('Tokens used:', result.usage?.totalTokens);
109
136
  ```
110
137
 
138
+ ### Agentic-Flow Provider (Fastest)
139
+
140
+ ```typescript
141
+ import { AgenticFlowEmbeddingService } from '@claude-flow/embeddings';
142
+
143
+ const service = new AgenticFlowEmbeddingService({
144
+ provider: 'agentic-flow',
145
+ modelId: 'default', // Uses optimized ONNX model
146
+ cacheSize: 256,
147
+ });
148
+
149
+ // 75x faster than Transformers.js (3ms vs 233ms)
150
+ const result = await service.embed('Your text here');
151
+ console.log(`ONNX embedding in ${result.latencyMs}ms`);
152
+ ```
153
+
111
154
  ### Transformers.js Provider (Local)
112
155
 
113
156
  ```typescript
@@ -243,10 +286,17 @@ service.removeEventListener(listener);
243
286
 
244
287
  | Provider | Latency | Quality | Cost | Offline |
245
288
  |----------|---------|---------|------|---------|
289
+ | **Agentic-Flow** | ~3ms | Good | Free | Yes |
246
290
  | **OpenAI** | ~50-100ms | Excellent | $0.02-0.13/1M tokens | No |
247
- | **Transformers.js** | ~20-50ms | Good | Free | Yes |
291
+ | **Transformers.js** | ~230ms | Good | Free | Yes |
248
292
  | **Mock** | <1ms | N/A | Free | Yes |
249
293
 
294
+ ### Agentic-Flow (Recommended)
295
+
296
+ | Model | Dimensions | Speed | Best For |
297
+ |-------|------------|-------|----------|
298
+ | `default` | 384 | 3ms | General purpose, fastest |
299
+
250
300
  ### OpenAI Models
251
301
 
252
302
  | Model | Dimensions | Max Tokens | Best For |
@@ -272,7 +322,9 @@ import type {
272
322
  EmbeddingConfig,
273
323
  OpenAIEmbeddingConfig,
274
324
  TransformersEmbeddingConfig,
325
+ AgenticFlowEmbeddingConfig,
275
326
  MockEmbeddingConfig,
327
+ AutoEmbeddingConfig,
276
328
 
277
329
  // Result types
278
330
  EmbeddingResult,
@@ -349,6 +401,245 @@ const queryResult = await embeddings.embed('Search query');
349
401
  const results = await index.search(new Float32Array(queryResult.embedding), 5);
350
402
  ```
351
403
 
404
+ ## Document Chunking
405
+
406
+ Split long documents into overlapping chunks for embedding:
407
+
408
+ ```typescript
409
+ import { chunkText, estimateTokens, reconstructFromChunks } from '@claude-flow/embeddings';
410
+
411
+ // Chunk by sentence (default)
412
+ const result = chunkText(longDocument, {
413
+ maxChunkSize: 512,
414
+ overlap: 50,
415
+ strategy: 'sentence', // 'character' | 'sentence' | 'paragraph' | 'token'
416
+ minChunkSize: 100,
417
+ });
418
+
419
+ console.log('Chunks:', result.totalChunks);
420
+ result.chunks.forEach((chunk, i) => {
421
+ console.log(`Chunk ${i}: ${chunk.length} chars, ~${chunk.tokenCount} tokens`);
422
+ });
423
+
424
+ // Estimate tokens
425
+ const tokens = estimateTokens('Hello world'); // ~3 tokens
426
+
427
+ // Reconstruct (approximate)
428
+ const reconstructed = reconstructFromChunks(result.chunks);
429
+ ```
430
+
431
+ ## Normalization
432
+
433
+ Normalize embeddings for consistent similarity computation:
434
+
435
+ ```typescript
436
+ import {
437
+ l2Normalize, // Unit vector (Euclidean norm = 1)
438
+ l1Normalize, // Manhattan norm = 1
439
+ minMaxNormalize, // Values in [0, 1]
440
+ zScoreNormalize, // Mean 0, std 1
441
+ normalize, // Generic with type option
442
+ l2Norm,
443
+ isNormalized,
444
+ } from '@claude-flow/embeddings';
445
+
446
+ const embedding = new Float32Array([3, 4, 0]);
447
+
448
+ // L2 normalize (most common for cosine similarity)
449
+ const l2 = l2Normalize(embedding); // [0.6, 0.8, 0]
450
+ console.log('L2 norm:', l2Norm(l2)); // 1.0
451
+
452
+ // Check if already normalized
453
+ console.log(isNormalized(l2)); // true
454
+ console.log(isNormalized(embedding)); // false
455
+
456
+ // Generic normalize with type
457
+ const normalized = normalize(embedding, { type: 'l2' });
458
+ ```
459
+
460
+ ## Hyperbolic Embeddings (Poincaré Ball)
461
+
462
+ Transform embeddings to hyperbolic space for better hierarchical representation:
463
+
464
+ ```typescript
465
+ import {
466
+ euclideanToPoincare,
467
+ poincareToEuclidean,
468
+ hyperbolicDistance,
469
+ mobiusAdd,
470
+ isInPoincareBall,
471
+ batchEuclideanToPoincare,
472
+ hyperbolicCentroid,
473
+ } from '@claude-flow/embeddings';
474
+
475
+ // Convert Euclidean embedding to Poincaré ball
476
+ const euclidean = new Float32Array([0.5, 0.3, 0.2]);
477
+ const poincare = euclideanToPoincare(euclidean);
478
+
479
+ // Check if point is in the ball
480
+ console.log(isInPoincareBall(poincare)); // true
481
+
482
+ // Round-trip conversion
483
+ const back = poincareToEuclidean(poincare);
484
+
485
+ // Hyperbolic distance (geodesic in Poincaré ball)
486
+ const a = euclideanToPoincare(new Float32Array([0.1, 0.2, 0.1]));
487
+ const b = euclideanToPoincare(new Float32Array([0.3, 0.1, 0.2]));
488
+ const dist = hyperbolicDistance(a, b);
489
+
490
+ // Möbius addition (hyperbolic "plus")
491
+ const sum = mobiusAdd(a, b);
492
+
493
+ // Batch conversion
494
+ const embeddings = [vec1, vec2, vec3];
495
+ const hyperbolic = batchEuclideanToPoincare(embeddings);
496
+
497
+ // Hyperbolic centroid (Fréchet mean)
498
+ const centroid = hyperbolicCentroid(hyperbolic);
499
+ ```
500
+
501
+ ### Why Hyperbolic?
502
+
503
+ Hyperbolic space has natural properties for representing hierarchical data:
504
+ - **Exponential growth** - Tree-like structures fit naturally
505
+ - **Better hierarchy** - Parent-child relationships preserved
506
+ - **Lower distortion** - Taxonomies represented with less error
507
+
508
+ ## Neural Substrate Integration
509
+
510
+ Access agentic-flow's neural features for advanced embedding operations:
511
+
512
+ ```typescript
513
+ import {
514
+ NeuralEmbeddingService,
515
+ createNeuralService,
516
+ isNeuralAvailable,
517
+ listEmbeddingModels,
518
+ downloadEmbeddingModel,
519
+ } from '@claude-flow/embeddings';
520
+
521
+ // Check if neural features are available
522
+ const available = await isNeuralAvailable();
523
+
524
+ // Create neural service
525
+ const neural = createNeuralService({ dimension: 384 });
526
+ await neural.init();
527
+
528
+ if (neural.isAvailable()) {
529
+ // Semantic drift detection
530
+ await neural.setDriftBaseline('Initial context about the topic');
531
+ const drift = await neural.detectDrift('New input to check for drift');
532
+ console.log('Drift:', drift?.trend); // 'stable' | 'drifting' | 'accelerating'
533
+
534
+ // Memory with interference detection
535
+ const stored = await neural.storeMemory('mem-1', 'Important information');
536
+ console.log('Interference:', stored?.interference);
537
+
538
+ // Recall by similarity
539
+ const memories = await neural.recallMemories('query', 5);
540
+
541
+ // Swarm coordination
542
+ await neural.addSwarmAgent('agent-1', 'researcher');
543
+ const coordination = await neural.coordinateSwarm('Analyze this task');
544
+
545
+ // Coherence checking
546
+ await neural.calibrateCoherence(['good output 1', 'good output 2']);
547
+ const coherence = await neural.checkCoherence('Output to check');
548
+
549
+ // Health status
550
+ const health = neural.health();
551
+ console.log('Memory count:', health?.memoryCount);
552
+ }
553
+
554
+ // List available ONNX models
555
+ const models = await listEmbeddingModels();
556
+ console.log(models);
557
+ // [{ id: 'all-MiniLM-L6-v2', dimension: 384, size: '23MB', ... }]
558
+
559
+ // Download model
560
+ const path = await downloadEmbeddingModel('all-MiniLM-L6-v2', '.models');
561
+ ```
562
+
563
+ ## Persistent Disk Cache
564
+
565
+ SQLite-backed persistent cache for embeddings:
566
+
567
+ ```typescript
568
+ import { PersistentEmbeddingCache, isPersistentCacheAvailable } from '@claude-flow/embeddings';
569
+
570
+ // Check if SQLite is available
571
+ const hasSQLite = await isPersistentCacheAvailable();
572
+
573
+ // Create persistent cache
574
+ const cache = new PersistentEmbeddingCache({
575
+ dbPath: './embeddings.db', // SQLite database path
576
+ maxSize: 10000, // Max entries before LRU eviction
577
+ ttlMs: 7 * 24 * 60 * 60 * 1000, // 7 day TTL
578
+ });
579
+
580
+ // Initialize
581
+ await cache.init();
582
+
583
+ // Store embedding
584
+ await cache.set('my text', new Float32Array([0.1, 0.2, 0.3]));
585
+
586
+ // Retrieve
587
+ const embedding = await cache.get('my text');
588
+
589
+ // Get stats
590
+ const stats = await cache.getStats();
591
+ console.log('Cache stats:', {
592
+ size: stats.totalEntries,
593
+ hitRate: stats.hitRate,
594
+ avgLatency: stats.avgLatencyMs,
595
+ });
596
+
597
+ // Close when done
598
+ await cache.close();
599
+ ```
600
+
601
+ ### Enable in Embedding Service
602
+
603
+ ```typescript
604
+ const service = createEmbeddingService({
605
+ provider: 'openai',
606
+ apiKey: process.env.OPENAI_API_KEY!,
607
+ persistentCache: {
608
+ enabled: true,
609
+ dbPath: './cache/embeddings.db',
610
+ maxSize: 50000,
611
+ ttlMs: 30 * 24 * 60 * 60 * 1000, // 30 days
612
+ },
613
+ normalization: 'l2', // Auto-normalize embeddings
614
+ });
615
+ ```
616
+
617
+ ## CLI Commands (New)
618
+
619
+ ```bash
620
+ # Document chunking
621
+ claude-flow embeddings chunk document.txt --strategy sentence --max-size 512
622
+
623
+ # Normalize embedding file
624
+ claude-flow embeddings normalize embeddings.json --type l2 -o normalized.json
625
+
626
+ # Convert to hyperbolic
627
+ claude-flow embeddings hyperbolic embeddings.json -o poincare.json
628
+
629
+ # Neural operations
630
+ claude-flow embeddings neural drift --baseline "context" --input "check this"
631
+ claude-flow embeddings neural store --id mem-1 --content "data"
632
+ claude-flow embeddings neural recall "query" --top-k 5
633
+
634
+ # List/download models
635
+ claude-flow embeddings models list
636
+ claude-flow embeddings models download all-MiniLM-L6-v2
637
+
638
+ # Cache management
639
+ claude-flow embeddings cache stats
640
+ claude-flow embeddings cache clear --older-than 7d
641
+ ```
642
+
352
643
  ## Related Packages
353
644
 
354
645
  - [@claude-flow/memory](../memory) - HNSW indexing and vector storage
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@sparkleideas/embeddings",
3
- "version": "3.0.0-alpha.17",
4
- "description": "V3 Embedding Service - OpenAI, Transformers.js, and Mock providers",
3
+ "version": "3.0.0-alpha.26",
4
+ "description": "V3 Embedding Service - OpenAI, Transformers.js, Agentic-Flow (ONNX), Mock providers with hyperbolic embeddings, normalization, and chunking",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
7
7
  "types": "./dist/index.d.ts",
@@ -29,20 +29,32 @@
29
29
  "vector",
30
30
  "similarity",
31
31
  "claude-flow",
32
- "v3"
32
+ "v3",
33
+ "hyperbolic",
34
+ "poincare",
35
+ "normalization",
36
+ "chunking",
37
+ "neural-substrate"
33
38
  ],
34
39
  "author": "Claude Flow Team",
35
40
  "license": "MIT",
36
41
  "dependencies": {
37
- "@xenova/transformers": "^2.17.0"
42
+ "@xenova/transformers": "^2.17.0",
43
+ "sql.js": "^1.13.0"
38
44
  },
39
45
  "devDependencies": {
46
+ "@types/node": "^20.10.0",
40
47
  "typescript": "^5.3.0",
41
- "vitest": "^1.0.0",
42
- "@types/node": "^20.10.0"
48
+ "vitest": "^4.0.16"
43
49
  },
44
50
  "peerDependencies": {
45
- "@sparkleideas/shared": "*"
51
+ "@sparkleideas/shared": "*",
52
+ "@sparkleideas/agentic-flow": "*"
53
+ },
54
+ "peerDependenciesMeta": {
55
+ "@sparkleideas/agentic-flow": {
56
+ "optional": true
57
+ }
46
58
  },
47
59
  "engines": {
48
60
  "node": ">=20.0.0"