@soulcraft/brainy 2.11.0 → 2.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/dist/brainyData.d.ts +5 -8
- package/dist/brainyData.js +56 -39
- package/dist/config/index.d.ts +1 -0
- package/dist/config/index.js +2 -0
- package/dist/config/modelAutoConfig.d.ts +1 -0
- package/dist/config/modelAutoConfig.js +27 -22
- package/dist/config/modelPrecisionManager.d.ts +42 -0
- package/dist/config/modelPrecisionManager.js +98 -0
- package/dist/config/zeroConfig.js +1 -1
- package/dist/embeddings/CachedEmbeddings.d.ts +40 -0
- package/dist/embeddings/CachedEmbeddings.js +146 -0
- package/dist/embeddings/EmbeddingManager.d.ts +106 -0
- package/dist/embeddings/EmbeddingManager.js +296 -0
- package/dist/embeddings/SingletonModelManager.d.ts +95 -0
- package/dist/embeddings/SingletonModelManager.js +220 -0
- package/dist/embeddings/index.d.ts +12 -0
- package/dist/embeddings/index.js +16 -0
- package/dist/embeddings/lightweight-embedder.d.ts +0 -1
- package/dist/embeddings/lightweight-embedder.js +4 -12
- package/dist/embeddings/universal-memory-manager.js +13 -50
- package/dist/embeddings/worker-embedding.js +4 -8
- package/dist/neural/improvedNeuralAPI.d.ts +346 -0
- package/dist/neural/improvedNeuralAPI.js +2439 -0
- package/dist/neural/types.d.ts +267 -0
- package/dist/neural/types.js +24 -0
- package/dist/utils/embedding.d.ts +7 -2
- package/dist/utils/embedding.js +51 -33
- package/dist/utils/hybridModelManager.d.ts +19 -28
- package/dist/utils/hybridModelManager.js +36 -200
- package/package.json +1 -1
|
@@ -0,0 +1,2439 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Improved Neural API - Clean, Consistent, Performant
|
|
3
|
+
*
|
|
4
|
+
* Public API Surface:
|
|
5
|
+
* - brain.neural.similar(a, b, options?) // Similarity calculation
|
|
6
|
+
* - brain.neural.clusters(input?) // Semantic clustering (ID array, item ID string, or options object)
|
|
7
|
+
* - brain.neural.neighbors(id, options?) // K-nearest neighbors
|
|
8
|
+
* - brain.neural.hierarchy(id, options?) // Semantic hierarchy
|
|
9
|
+
* - brain.neural.outliers(options?) // Anomaly detection
|
|
10
|
+
* - brain.neural.visualize(options?) // Visualization data
|
|
11
|
+
*
|
|
12
|
+
* Advanced Clustering:
|
|
13
|
+
* - brain.neural.clusterByDomain(field, options?) // Domain-aware clustering
|
|
14
|
+
* - brain.neural.clusterByTime(field, windows, options?) // Temporal clustering
|
|
15
|
+
* - brain.neural.clusterStream(options?) // AsyncIterator for streaming
|
|
16
|
+
* - brain.neural.updateClusters(items, options?) // Incremental clustering
|
|
17
|
+
*
|
|
18
|
+
* Private methods are prefixed with _ and not exposed in public API
|
|
19
|
+
*/
|
|
20
|
+
import { cosineDistance, euclideanDistance } from '../utils/distance.js';
|
|
21
|
+
import { NeuralAPIError, ClusteringError, SimilarityError } from './types.js';
|
|
22
|
+
export class ImprovedNeuralAPI {
|
|
23
|
+
constructor(brain, config = {}) {
|
|
24
|
+
// Caching for performance
|
|
25
|
+
this.similarityCache = new Map();
|
|
26
|
+
this.clusterCache = new Map();
|
|
27
|
+
this.hierarchyCache = new Map();
|
|
28
|
+
this.neighborsCache = new Map();
|
|
29
|
+
// Performance tracking
|
|
30
|
+
this.performanceMetrics = new Map();
|
|
31
|
+
this.brain = brain;
|
|
32
|
+
this.config = {
|
|
33
|
+
cacheSize: 1000,
|
|
34
|
+
defaultAlgorithm: 'auto',
|
|
35
|
+
similarityMetric: 'cosine',
|
|
36
|
+
performanceTracking: true,
|
|
37
|
+
maxMemoryUsage: '1GB',
|
|
38
|
+
parallelProcessing: true,
|
|
39
|
+
streamingBatchSize: 100,
|
|
40
|
+
...config
|
|
41
|
+
};
|
|
42
|
+
this._initializeCleanupTimer();
|
|
43
|
+
}
|
|
44
|
+
// ===== PUBLIC API: SIMILARITY =====
|
|
45
|
+
/**
|
|
46
|
+
* Calculate similarity between any two items (auto-detection)
|
|
47
|
+
* Supports: IDs, text strings, vectors, or mixed types
|
|
48
|
+
*/
|
|
49
|
+
async similar(a, b, options = {}) {
|
|
50
|
+
const startTime = performance.now();
|
|
51
|
+
try {
|
|
52
|
+
// Create cache key
|
|
53
|
+
const cacheKey = this._createSimilarityKey(a, b, options);
|
|
54
|
+
if (this.similarityCache.has(cacheKey)) {
|
|
55
|
+
return this.similarityCache.get(cacheKey);
|
|
56
|
+
}
|
|
57
|
+
let result;
|
|
58
|
+
// Auto-detect input types and route accordingly
|
|
59
|
+
if (this._isId(a) && this._isId(b)) {
|
|
60
|
+
result = await this._similarityById(a, b, options);
|
|
61
|
+
}
|
|
62
|
+
else if (this._isVector(a) && this._isVector(b)) {
|
|
63
|
+
result = await this._similarityByVector(a, b, options);
|
|
64
|
+
}
|
|
65
|
+
else if (typeof a === 'string' && typeof b === 'string') {
|
|
66
|
+
result = await this._similarityByText(a, b, options);
|
|
67
|
+
}
|
|
68
|
+
else {
|
|
69
|
+
// Mixed types - convert to vectors
|
|
70
|
+
const vectorA = await this._convertToVector(a);
|
|
71
|
+
const vectorB = await this._convertToVector(b);
|
|
72
|
+
result = await this._similarityByVector(vectorA, vectorB, options);
|
|
73
|
+
}
|
|
74
|
+
// Cache result
|
|
75
|
+
this._cacheResult(cacheKey, result, this.similarityCache);
|
|
76
|
+
// Track performance
|
|
77
|
+
this._trackPerformance('similarity', startTime, 2, 'mixed');
|
|
78
|
+
return result;
|
|
79
|
+
}
|
|
80
|
+
catch (error) {
|
|
81
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
82
|
+
throw new SimilarityError(`Failed to calculate similarity: ${errorMessage}`, {
|
|
83
|
+
inputA: typeof a === 'object' ? 'vector' : String(a).substring(0, 50),
|
|
84
|
+
inputB: typeof b === 'object' ? 'vector' : String(b).substring(0, 50),
|
|
85
|
+
options
|
|
86
|
+
});
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
// ===== PUBLIC API: CLUSTERING =====
|
|
90
|
+
/**
|
|
91
|
+
* Intelligent semantic clustering with auto-routing
|
|
92
|
+
* - No input: Cluster all data
|
|
93
|
+
* - Array: Cluster specific items
|
|
94
|
+
* - String: Find clusters near this item
|
|
95
|
+
* - Options object: Advanced configuration
|
|
96
|
+
*/
|
|
97
|
+
async clusters(input) {
|
|
98
|
+
const startTime = performance.now();
|
|
99
|
+
try {
|
|
100
|
+
let options = {};
|
|
101
|
+
let items;
|
|
102
|
+
// Parse input
|
|
103
|
+
if (!input) {
|
|
104
|
+
// Cluster all data
|
|
105
|
+
items = undefined;
|
|
106
|
+
options = { algorithm: 'auto' };
|
|
107
|
+
}
|
|
108
|
+
else if (Array.isArray(input)) {
|
|
109
|
+
// Cluster specific items
|
|
110
|
+
items = input;
|
|
111
|
+
options = { algorithm: 'auto' };
|
|
112
|
+
}
|
|
113
|
+
else if (typeof input === 'string') {
|
|
114
|
+
// Find clusters near this item
|
|
115
|
+
const nearbyResult = await this.neighbors(input, { limit: 100 });
|
|
116
|
+
items = nearbyResult.neighbors.map(n => n.id);
|
|
117
|
+
options = { algorithm: 'auto' };
|
|
118
|
+
}
|
|
119
|
+
else if (typeof input === 'object') {
|
|
120
|
+
// Configuration object
|
|
121
|
+
options = input;
|
|
122
|
+
items = undefined;
|
|
123
|
+
}
|
|
124
|
+
else {
|
|
125
|
+
throw new ClusteringError('Invalid input for clustering', { input });
|
|
126
|
+
}
|
|
127
|
+
// Check cache
|
|
128
|
+
const cacheKey = this._createClusteringKey(items, options);
|
|
129
|
+
if (this.clusterCache.has(cacheKey)) {
|
|
130
|
+
const cached = this.clusterCache.get(cacheKey);
|
|
131
|
+
return cached.clusters;
|
|
132
|
+
}
|
|
133
|
+
// Route to optimal algorithm
|
|
134
|
+
const result = await this._routeClusteringAlgorithm(items, options);
|
|
135
|
+
// Cache result
|
|
136
|
+
this._cacheResult(cacheKey, result, this.clusterCache);
|
|
137
|
+
// Track performance
|
|
138
|
+
this._trackPerformance('clustering', startTime, items?.length || 0, options.algorithm || 'auto');
|
|
139
|
+
return result.clusters;
|
|
140
|
+
}
|
|
141
|
+
catch (error) {
|
|
142
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
143
|
+
throw new ClusteringError(`Failed to perform clustering: ${errorMessage}`, {
|
|
144
|
+
input: typeof input === 'object' ? JSON.stringify(input) : input,
|
|
145
|
+
});
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Fast hierarchical clustering using HNSW levels
|
|
150
|
+
*/
|
|
151
|
+
async clusterFast(options = {}) {
|
|
152
|
+
const fullOptions = {
|
|
153
|
+
algorithm: 'hierarchical',
|
|
154
|
+
maxClusters: options.maxClusters,
|
|
155
|
+
...options
|
|
156
|
+
};
|
|
157
|
+
const result = await this._performHierarchicalClustering(undefined, fullOptions);
|
|
158
|
+
return result.clusters;
|
|
159
|
+
}
|
|
160
|
+
/**
|
|
161
|
+
* Large-scale clustering with intelligent sampling
|
|
162
|
+
*/
|
|
163
|
+
async clusterLarge(options = {}) {
|
|
164
|
+
const fullOptions = {
|
|
165
|
+
algorithm: 'auto',
|
|
166
|
+
sampleSize: options.sampleSize || 1000,
|
|
167
|
+
strategy: options.strategy || 'diverse',
|
|
168
|
+
...options
|
|
169
|
+
};
|
|
170
|
+
const result = await this._performSampledClustering(undefined, fullOptions);
|
|
171
|
+
return result.clusters;
|
|
172
|
+
}
|
|
173
|
+
// ===== PUBLIC API: ADVANCED CLUSTERING =====
|
|
174
|
+
/**
|
|
175
|
+
* Domain-aware clustering based on metadata fields
|
|
176
|
+
*/
|
|
177
|
+
async clusterByDomain(field, options = {}) {
|
|
178
|
+
const startTime = performance.now();
|
|
179
|
+
try {
|
|
180
|
+
// Get all items with the specified field
|
|
181
|
+
const items = await this._getItemsByField(field);
|
|
182
|
+
if (items.length === 0) {
|
|
183
|
+
return [];
|
|
184
|
+
}
|
|
185
|
+
// Group by domain values
|
|
186
|
+
const domainGroups = this._groupByDomain(items, field);
|
|
187
|
+
const domainClusters = [];
|
|
188
|
+
// Cluster within each domain
|
|
189
|
+
for (const [domain, domainItems] of domainGroups) {
|
|
190
|
+
const domainOptions = {
|
|
191
|
+
...options,
|
|
192
|
+
algorithm: 'auto',
|
|
193
|
+
maxClusters: Math.min(options.maxClusters || 10, Math.ceil(domainItems.length / 3))
|
|
194
|
+
};
|
|
195
|
+
const clusters = await this._performClustering(domainItems.map(item => item.id), domainOptions);
|
|
196
|
+
// Convert to domain clusters
|
|
197
|
+
for (const cluster of clusters.clusters) {
|
|
198
|
+
domainClusters.push({
|
|
199
|
+
...cluster,
|
|
200
|
+
domain,
|
|
201
|
+
domainConfidence: this._calculateDomainConfidence(cluster, domainItems),
|
|
202
|
+
crossDomainMembers: options.crossDomainThreshold
|
|
203
|
+
? await this._findCrossDomainMembers(cluster, options.crossDomainThreshold)
|
|
204
|
+
: undefined
|
|
205
|
+
});
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
// Handle cross-domain clustering if enabled
|
|
209
|
+
if (!options.preserveDomainBoundaries) {
|
|
210
|
+
const crossDomainClusters = await this._findCrossDomainClusters(domainClusters, options.crossDomainThreshold || 0.8);
|
|
211
|
+
domainClusters.push(...crossDomainClusters);
|
|
212
|
+
}
|
|
213
|
+
this._trackPerformance('domainClustering', startTime, items.length, field);
|
|
214
|
+
return domainClusters;
|
|
215
|
+
}
|
|
216
|
+
catch (error) {
|
|
217
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
218
|
+
throw new ClusteringError(`Failed to cluster by domain: ${errorMessage}`, { field, options });
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
/**
|
|
222
|
+
* Temporal clustering based on time windows
|
|
223
|
+
*/
|
|
224
|
+
async clusterByTime(timeField, windows, options = { timeField, windows }) {
|
|
225
|
+
const startTime = performance.now();
|
|
226
|
+
try {
|
|
227
|
+
const temporalClusters = [];
|
|
228
|
+
for (const window of windows) {
|
|
229
|
+
// Get items in this time window
|
|
230
|
+
const windowItems = await this._getItemsByTimeWindow(timeField, window);
|
|
231
|
+
if (windowItems.length === 0)
|
|
232
|
+
continue;
|
|
233
|
+
// Cluster items in this window
|
|
234
|
+
const clusteringOptions = {
|
|
235
|
+
...options,
|
|
236
|
+
algorithm: 'auto'
|
|
237
|
+
};
|
|
238
|
+
const clusters = await this._performClustering(windowItems.map(item => item.id), clusteringOptions);
|
|
239
|
+
// Convert to temporal clusters
|
|
240
|
+
for (const cluster of clusters.clusters) {
|
|
241
|
+
const temporal = await this._calculateTemporalMetrics(cluster, windowItems, timeField);
|
|
242
|
+
temporalClusters.push({
|
|
243
|
+
...cluster,
|
|
244
|
+
timeWindow: window,
|
|
245
|
+
trend: temporal.trend,
|
|
246
|
+
temporal: temporal.metrics
|
|
247
|
+
});
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
// Handle overlapping windows
|
|
251
|
+
if (options.overlapStrategy === 'merge') {
|
|
252
|
+
return this._mergeOverlappingTemporalClusters(temporalClusters);
|
|
253
|
+
}
|
|
254
|
+
this._trackPerformance('temporalClustering', startTime, temporalClusters.length, 'temporal');
|
|
255
|
+
return temporalClusters;
|
|
256
|
+
}
|
|
257
|
+
catch (error) {
|
|
258
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
259
|
+
throw new ClusteringError(`Failed to cluster by time: ${errorMessage}`, { timeField, windows, options });
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
/**
|
|
263
|
+
* Streaming clustering with real-time updates
|
|
264
|
+
*/
|
|
265
|
+
async *clusterStream(options = {}) {
|
|
266
|
+
const batchSize = options.batchSize || this.config.streamingBatchSize || 100;
|
|
267
|
+
let batchNumber = 0;
|
|
268
|
+
let processedCount = 0;
|
|
269
|
+
try {
|
|
270
|
+
// Get all items for processing
|
|
271
|
+
const allItems = await this._getAllItemIds();
|
|
272
|
+
const totalItems = allItems.length;
|
|
273
|
+
// Process in batches
|
|
274
|
+
for (let i = 0; i < allItems.length; i += batchSize) {
|
|
275
|
+
const startTime = performance.now();
|
|
276
|
+
const batch = allItems.slice(i, i + batchSize);
|
|
277
|
+
// Perform clustering on this batch
|
|
278
|
+
const result = await this._performClustering(batch, {
|
|
279
|
+
...options,
|
|
280
|
+
algorithm: 'auto',
|
|
281
|
+
cacheResults: false // Don't cache streaming results
|
|
282
|
+
});
|
|
283
|
+
processedCount += batch.length;
|
|
284
|
+
const isComplete = processedCount >= totalItems;
|
|
285
|
+
yield {
|
|
286
|
+
clusters: result.clusters,
|
|
287
|
+
batchNumber: ++batchNumber,
|
|
288
|
+
isComplete,
|
|
289
|
+
progress: {
|
|
290
|
+
processed: processedCount,
|
|
291
|
+
total: totalItems,
|
|
292
|
+
percentage: (processedCount / totalItems) * 100
|
|
293
|
+
},
|
|
294
|
+
metrics: {
|
|
295
|
+
...result.metrics,
|
|
296
|
+
executionTime: performance.now() - startTime
|
|
297
|
+
}
|
|
298
|
+
};
|
|
299
|
+
// Adaptive threshold adjustment
|
|
300
|
+
if (options.adaptiveThreshold && batchNumber > 1) {
|
|
301
|
+
options.threshold = this._adjustThresholdAdaptively(result.clusters, options.threshold);
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
catch (error) {
|
|
306
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
307
|
+
throw new ClusteringError(`Failed in streaming clustering: ${errorMessage}`, { options, batchNumber });
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
/**
|
|
311
|
+
* Incremental clustering - add new items to existing clusters
|
|
312
|
+
*/
|
|
313
|
+
async updateClusters(newItems, options = {}) {
|
|
314
|
+
const startTime = performance.now();
|
|
315
|
+
try {
|
|
316
|
+
// Get existing clusters
|
|
317
|
+
const existingClusters = await this.clusters({ ...options, algorithm: 'auto' });
|
|
318
|
+
// For each new item, find best cluster or create new one
|
|
319
|
+
const updatedClusters = [...existingClusters];
|
|
320
|
+
const unassignedItems = [];
|
|
321
|
+
for (const itemId of newItems) {
|
|
322
|
+
let bestCluster = null;
|
|
323
|
+
let bestSimilarity = 0;
|
|
324
|
+
// Find most similar existing cluster
|
|
325
|
+
for (const cluster of updatedClusters) {
|
|
326
|
+
const similarity = await this._calculateItemToClusterSimilarity(itemId, cluster);
|
|
327
|
+
if (similarity > bestSimilarity && similarity > (options.threshold || 0.6)) {
|
|
328
|
+
bestSimilarity = similarity;
|
|
329
|
+
bestCluster = cluster;
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
if (bestCluster) {
|
|
333
|
+
// Add to existing cluster
|
|
334
|
+
bestCluster.members.push(itemId);
|
|
335
|
+
bestCluster.size = bestCluster.members.length;
|
|
336
|
+
// Recalculate centroid
|
|
337
|
+
bestCluster.centroid = await this._recalculateClusterCentroid(bestCluster);
|
|
338
|
+
}
|
|
339
|
+
else {
|
|
340
|
+
// Item doesn't fit existing clusters
|
|
341
|
+
unassignedItems.push(itemId);
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
// Create new clusters for unassigned items
|
|
345
|
+
if (unassignedItems.length > 0) {
|
|
346
|
+
const newClusters = await this._performClustering(unassignedItems, options);
|
|
347
|
+
updatedClusters.push(...newClusters.clusters);
|
|
348
|
+
}
|
|
349
|
+
this._trackPerformance('incrementalClustering', startTime, newItems.length, 'incremental');
|
|
350
|
+
return updatedClusters;
|
|
351
|
+
}
|
|
352
|
+
catch (error) {
|
|
353
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
354
|
+
throw new ClusteringError(`Failed to update clusters: ${errorMessage}`, { newItems, options });
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
// ===== PUBLIC API: NEIGHBORS & HIERARCHY =====
|
|
358
|
+
/**
|
|
359
|
+
* Find K-nearest semantic neighbors
|
|
360
|
+
*/
|
|
361
|
+
async neighbors(id, options = {}) {
|
|
362
|
+
const startTime = performance.now();
|
|
363
|
+
try {
|
|
364
|
+
const cacheKey = `neighbors:${id}:${JSON.stringify(options)}`;
|
|
365
|
+
if (this.neighborsCache.has(cacheKey)) {
|
|
366
|
+
return this.neighborsCache.get(cacheKey);
|
|
367
|
+
}
|
|
368
|
+
const limit = options.limit || 10;
|
|
369
|
+
const minSimilarity = options.minSimilarity || 0.1;
|
|
370
|
+
// Use HNSW index for efficient neighbor search
|
|
371
|
+
const searchResults = await this.brain.search('', {
|
|
372
|
+
...options,
|
|
373
|
+
limit: limit * 2, // Get more than needed for filtering
|
|
374
|
+
metadata: options.includeMetadata ? {} : undefined
|
|
375
|
+
});
|
|
376
|
+
// Filter and sort neighbors
|
|
377
|
+
const neighbors = [];
|
|
378
|
+
for (const result of searchResults) {
|
|
379
|
+
if (result.id === id)
|
|
380
|
+
continue; // Skip self
|
|
381
|
+
const similarity = await this._calculateSimilarity(id, result.id);
|
|
382
|
+
if (similarity >= minSimilarity) {
|
|
383
|
+
neighbors.push({
|
|
384
|
+
id: result.id,
|
|
385
|
+
similarity,
|
|
386
|
+
data: result.content || result.data,
|
|
387
|
+
metadata: options.includeMetadata ? result.metadata : undefined,
|
|
388
|
+
distance: 1 - similarity
|
|
389
|
+
});
|
|
390
|
+
}
|
|
391
|
+
if (neighbors.length >= limit)
|
|
392
|
+
break;
|
|
393
|
+
}
|
|
394
|
+
// Sort by specified criteria
|
|
395
|
+
this._sortNeighbors(neighbors, options.sortBy || 'similarity');
|
|
396
|
+
const result = {
|
|
397
|
+
neighbors: neighbors.slice(0, limit),
|
|
398
|
+
queryId: id,
|
|
399
|
+
totalFound: neighbors.length,
|
|
400
|
+
averageSimilarity: neighbors.reduce((sum, n) => sum + n.similarity, 0) / neighbors.length
|
|
401
|
+
};
|
|
402
|
+
this._cacheResult(cacheKey, result, this.neighborsCache);
|
|
403
|
+
this._trackPerformance('neighbors', startTime, limit, 'knn');
|
|
404
|
+
return result;
|
|
405
|
+
}
|
|
406
|
+
catch (error) {
|
|
407
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
408
|
+
throw new NeuralAPIError(`Failed to find neighbors: ${errorMessage}`, 'NEIGHBORS_ERROR', { id, options });
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
/**
|
|
412
|
+
* Build semantic hierarchy around an item
|
|
413
|
+
*/
|
|
414
|
+
async hierarchy(id, options = {}) {
|
|
415
|
+
const startTime = performance.now();
|
|
416
|
+
try {
|
|
417
|
+
const cacheKey = `hierarchy:${id}:${JSON.stringify(options)}`;
|
|
418
|
+
if (this.hierarchyCache.has(cacheKey)) {
|
|
419
|
+
return this.hierarchyCache.get(cacheKey);
|
|
420
|
+
}
|
|
421
|
+
// Get item data
|
|
422
|
+
const item = await this.brain.getNoun(id);
|
|
423
|
+
if (!item) {
|
|
424
|
+
throw new Error(`Item with ID ${id} not found`);
|
|
425
|
+
}
|
|
426
|
+
// Build hierarchy based on strategy
|
|
427
|
+
const hierarchy = await this._buildSemanticHierarchy(item, options);
|
|
428
|
+
this._cacheResult(cacheKey, hierarchy, this.hierarchyCache);
|
|
429
|
+
this._trackPerformance('hierarchy', startTime, 1, 'hierarchy');
|
|
430
|
+
return hierarchy;
|
|
431
|
+
}
|
|
432
|
+
catch (error) {
|
|
433
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
434
|
+
throw new NeuralAPIError(`Failed to build hierarchy: ${errorMessage}`, 'HIERARCHY_ERROR', { id, options });
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
// ===== PUBLIC API: ANALYSIS =====
|
|
438
|
+
/**
|
|
439
|
+
* Detect outliers and anomalous items
|
|
440
|
+
*/
|
|
441
|
+
async outliers(options = {}) {
|
|
442
|
+
const startTime = performance.now();
|
|
443
|
+
try {
|
|
444
|
+
const threshold = options.threshold || 0.3;
|
|
445
|
+
const method = options.method || 'cluster-based';
|
|
446
|
+
let outliers = [];
|
|
447
|
+
switch (method) {
|
|
448
|
+
case 'isolation':
|
|
449
|
+
outliers = await this._detectOutliersIsolation(threshold, options);
|
|
450
|
+
break;
|
|
451
|
+
case 'statistical':
|
|
452
|
+
outliers = await this._detectOutliersStatistical(threshold, options);
|
|
453
|
+
break;
|
|
454
|
+
case 'cluster-based':
|
|
455
|
+
default:
|
|
456
|
+
outliers = await this._detectOutliersClusterBased(threshold, options);
|
|
457
|
+
break;
|
|
458
|
+
}
|
|
459
|
+
this._trackPerformance('outlierDetection', startTime, outliers.length, method);
|
|
460
|
+
return outliers;
|
|
461
|
+
}
|
|
462
|
+
catch (error) {
|
|
463
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
464
|
+
throw new NeuralAPIError(`Failed to detect outliers: ${errorMessage}`, 'OUTLIER_ERROR', { options });
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
/**
|
|
468
|
+
* Generate visualization data for graph libraries
|
|
469
|
+
*/
|
|
470
|
+
async visualize(options = {}) {
|
|
471
|
+
const startTime = performance.now();
|
|
472
|
+
try {
|
|
473
|
+
const maxNodes = options.maxNodes || 100;
|
|
474
|
+
const dimensions = options.dimensions || 2;
|
|
475
|
+
const algorithm = options.algorithm || 'force';
|
|
476
|
+
// Get data for visualization
|
|
477
|
+
const nodes = await this._generateVisualizationNodes(maxNodes, options);
|
|
478
|
+
const edges = options.includeEdges ? await this._generateVisualizationEdges(nodes, options) : [];
|
|
479
|
+
const clusters = options.clusterColors ? await this._generateVisualizationClusters(nodes) : [];
|
|
480
|
+
// Apply layout algorithm
|
|
481
|
+
const positionedNodes = await this._applyLayoutAlgorithm(nodes, edges, algorithm, dimensions);
|
|
482
|
+
const result = {
|
|
483
|
+
nodes: positionedNodes,
|
|
484
|
+
edges,
|
|
485
|
+
clusters,
|
|
486
|
+
metadata: {
|
|
487
|
+
algorithm,
|
|
488
|
+
dimensions,
|
|
489
|
+
totalNodes: nodes.length,
|
|
490
|
+
totalEdges: edges.length,
|
|
491
|
+
generatedAt: new Date()
|
|
492
|
+
}
|
|
493
|
+
};
|
|
494
|
+
this._trackPerformance('visualization', startTime, nodes.length, algorithm);
|
|
495
|
+
return result;
|
|
496
|
+
}
|
|
497
|
+
catch (error) {
|
|
498
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
499
|
+
throw new NeuralAPIError(`Failed to generate visualization: ${errorMessage}`, 'VISUALIZATION_ERROR', { options });
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
// ===== PRIVATE IMPLEMENTATION METHODS =====
|
|
503
|
+
async _routeClusteringAlgorithm(items, options) {
|
|
504
|
+
const algorithm = options.algorithm || 'auto';
|
|
505
|
+
const itemCount = items?.length || await this._getTotalItemCount();
|
|
506
|
+
// Auto-select optimal algorithm based on data size and characteristics
|
|
507
|
+
if (algorithm === 'auto') {
|
|
508
|
+
// Intelligent algorithm selection based on data characteristics
|
|
509
|
+
const itemIds = items || await this._getAllItemIds();
|
|
510
|
+
const dataCharacteristics = await this._analyzeDataCharacteristics(itemIds);
|
|
511
|
+
const hasRichGraph = dataCharacteristics.graphDensity > 0.05;
|
|
512
|
+
const hasSemanticTypes = Object.keys(dataCharacteristics.typeDistribution).length > 3;
|
|
513
|
+
if (hasRichGraph && hasSemanticTypes) {
|
|
514
|
+
// Best of all worlds for rich semantic graphs
|
|
515
|
+
return this._performMultiModalClustering(items, { ...options, algorithm: 'multimodal' });
|
|
516
|
+
}
|
|
517
|
+
else if (hasRichGraph) {
|
|
518
|
+
// Strong relationship network - use graph clustering
|
|
519
|
+
return this._performGraphClustering(items, { ...options, algorithm: 'graph' });
|
|
520
|
+
}
|
|
521
|
+
else if (hasSemanticTypes) {
|
|
522
|
+
// Rich semantic taxonomy - use semantic clustering
|
|
523
|
+
return this._performSemanticClustering(items, { ...options, algorithm: 'semantic' });
|
|
524
|
+
}
|
|
525
|
+
else if (itemCount > 10000) {
|
|
526
|
+
// Large dataset - use sampling
|
|
527
|
+
return this._performSampledClustering(items, { ...options, algorithm: 'sample' });
|
|
528
|
+
}
|
|
529
|
+
else if (itemCount > 1000) {
|
|
530
|
+
// Medium dataset - use hierarchical HNSW
|
|
531
|
+
return this._performHierarchicalClustering(items, { ...options, algorithm: 'hierarchical' });
|
|
532
|
+
}
|
|
533
|
+
else {
|
|
534
|
+
// Small dataset - use k-means for quality
|
|
535
|
+
return this._performKMeansClustering(items, { ...options, algorithm: 'kmeans' });
|
|
536
|
+
}
|
|
537
|
+
}
|
|
538
|
+
// Use specified algorithm
|
|
539
|
+
switch (algorithm) {
|
|
540
|
+
case 'hierarchical':
|
|
541
|
+
return this._performHierarchicalClustering(items, options);
|
|
542
|
+
case 'semantic':
|
|
543
|
+
return this._performSemanticClustering(items, options);
|
|
544
|
+
case 'graph':
|
|
545
|
+
return this._performGraphClustering(items, options);
|
|
546
|
+
case 'multimodal':
|
|
547
|
+
return this._performMultiModalClustering(items, options);
|
|
548
|
+
case 'kmeans':
|
|
549
|
+
return this._performKMeansClustering(items, options);
|
|
550
|
+
case 'dbscan':
|
|
551
|
+
return this._performDBSCANClustering(items, options);
|
|
552
|
+
case 'sample':
|
|
553
|
+
return this._performSampledClustering(items, options);
|
|
554
|
+
default:
|
|
555
|
+
throw new ClusteringError(`Unsupported algorithm: ${algorithm}`);
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
async _performClustering(items, options) {
|
|
559
|
+
// This is the main clustering dispatcher - routes to specific algorithms
|
|
560
|
+
return this._routeClusteringAlgorithm(items, options);
|
|
561
|
+
}
|
|
562
|
+
// ===== REAL CLUSTERING IMPLEMENTATIONS =====
|
|
563
|
+
/**
|
|
564
|
+
* SEMANTIC-AWARE CLUSTERING: Uses existing NounType/VerbType taxonomy + HNSW
|
|
565
|
+
*/
|
|
566
|
+
async _performSemanticClustering(items, options) {
|
|
567
|
+
const startTime = performance.now();
|
|
568
|
+
// Get all items if not specified
|
|
569
|
+
const itemIds = items || await this._getAllItemIds();
|
|
570
|
+
if (itemIds.length === 0) {
|
|
571
|
+
return this._createEmptyResult(startTime, 'semantic');
|
|
572
|
+
}
|
|
573
|
+
// 1. Group items by semantic type (NounType) - O(n) operation
|
|
574
|
+
const itemsWithMetadata = await this._getItemsWithMetadata(itemIds);
|
|
575
|
+
const typeGroups = this._groupBySemanticType(itemsWithMetadata);
|
|
576
|
+
const allClusters = [];
|
|
577
|
+
// 2. Cluster within each semantic type using HNSW - parallel processing
|
|
578
|
+
const typeClusteringPromises = Array.from(typeGroups.entries()).map(async ([nounType, groupItems]) => {
|
|
579
|
+
if (groupItems.length < (options.minClusterSize || 2)) {
|
|
580
|
+
// Create single cluster for small groups
|
|
581
|
+
return [{
|
|
582
|
+
id: `semantic-${nounType}`,
|
|
583
|
+
centroid: await this._calculateGroupCentroid(groupItems),
|
|
584
|
+
members: groupItems.map(item => item.id),
|
|
585
|
+
size: groupItems.length,
|
|
586
|
+
confidence: 0.9, // High confidence for type-based clustering
|
|
587
|
+
label: `${nounType} cluster`,
|
|
588
|
+
metadata: { semanticType: nounType, clustering: 'semantic' }
|
|
589
|
+
}];
|
|
590
|
+
}
|
|
591
|
+
// Use HNSW hierarchical clustering within type
|
|
592
|
+
return this._clusterWithinSemanticType(groupItems, options);
|
|
593
|
+
});
|
|
594
|
+
const typeClusterResults = await Promise.all(typeClusteringPromises);
|
|
595
|
+
typeClusterResults.forEach(clusters => allClusters.push(...clusters));
|
|
596
|
+
// 3. Find cross-type relationships using existing verb connections
|
|
597
|
+
const crossTypeConnections = await this._findCrossTypeConnections(typeGroups, options);
|
|
598
|
+
// 4. Merge clusters that have strong cross-type relationships
|
|
599
|
+
const finalClusters = await this._mergeSemanticClusters(allClusters, crossTypeConnections);
|
|
600
|
+
return {
|
|
601
|
+
clusters: finalClusters.slice(0, options.maxClusters || finalClusters.length),
|
|
602
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'semantic'),
|
|
603
|
+
metadata: {
|
|
604
|
+
totalItems: itemIds.length,
|
|
605
|
+
clustersFound: finalClusters.length,
|
|
606
|
+
averageClusterSize: finalClusters.reduce((sum, c) => sum + c.size, 0) / finalClusters.length || 0,
|
|
607
|
+
semanticTypes: Array.from(typeGroups.keys()).length,
|
|
608
|
+
timestamp: new Date()
|
|
609
|
+
}
|
|
610
|
+
};
|
|
611
|
+
}
|
|
612
|
+
/**
|
|
613
|
+
* HIERARCHICAL CLUSTERING: Uses existing HNSW levels for O(n) clustering
|
|
614
|
+
*/
|
|
615
|
+
async _performHierarchicalClustering(items, options) {
|
|
616
|
+
const startTime = performance.now();
|
|
617
|
+
const itemIds = items || await this._getAllItemIds();
|
|
618
|
+
if (itemIds.length === 0) {
|
|
619
|
+
return this._createEmptyResult(startTime, 'hierarchical');
|
|
620
|
+
}
|
|
621
|
+
// Use existing HNSW level structure for natural clustering
|
|
622
|
+
const level = options.level || this._getOptimalClusteringLevel(itemIds.length);
|
|
623
|
+
const maxClusters = options.maxClusters || Math.min(50, Math.ceil(itemIds.length / 20));
|
|
624
|
+
// Get HNSW level representatives - these are natural cluster centers
|
|
625
|
+
const levelNodes = await this._getHNSWLevelNodes(level);
|
|
626
|
+
const clusterCenters = levelNodes.slice(0, maxClusters);
|
|
627
|
+
const clusters = [];
|
|
628
|
+
// Create clusters around each level representative
|
|
629
|
+
for (let i = 0; i < clusterCenters.length; i++) {
|
|
630
|
+
const center = clusterCenters[i];
|
|
631
|
+
// Find items that belong to this cluster using HNSW neighbors
|
|
632
|
+
const members = await this._findClusterMembers(center, itemIds, 0.5);
|
|
633
|
+
if (members.length > 0) {
|
|
634
|
+
// Get actual node data for creating cluster
|
|
635
|
+
const memberData = await this._getItemsWithMetadata(members);
|
|
636
|
+
const centroid = await this._calculateCentroidFromItems(memberData);
|
|
637
|
+
clusters.push({
|
|
638
|
+
id: `hierarchical-${i}`,
|
|
639
|
+
centroid,
|
|
640
|
+
members,
|
|
641
|
+
size: members.length,
|
|
642
|
+
confidence: await this._calculateHierarchicalConfidence(members),
|
|
643
|
+
label: await this._generateClusterLabel(memberData, 'hierarchical'),
|
|
644
|
+
metadata: { level, clusterCenter: center, clustering: 'hierarchical' }
|
|
645
|
+
});
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
// Assign remaining items to nearest clusters
|
|
649
|
+
const assignedItems = new Set(clusters.flatMap(c => c.members));
|
|
650
|
+
const unassignedItems = itemIds.filter(id => !assignedItems.has(id));
|
|
651
|
+
if (unassignedItems.length > 0) {
|
|
652
|
+
await this._assignUnassignedItems(unassignedItems, clusters);
|
|
653
|
+
}
|
|
654
|
+
return {
|
|
655
|
+
clusters,
|
|
656
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'hierarchical'),
|
|
657
|
+
metadata: {
|
|
658
|
+
totalItems: itemIds.length,
|
|
659
|
+
clustersFound: clusters.length,
|
|
660
|
+
averageClusterSize: clusters.reduce((sum, c) => sum + c.size, 0) / clusters.length || 0,
|
|
661
|
+
hnswLevel: level,
|
|
662
|
+
timestamp: new Date()
|
|
663
|
+
}
|
|
664
|
+
};
|
|
665
|
+
}
|
|
666
|
+
/**
|
|
667
|
+
* K-MEANS CLUSTERING: Real implementation using existing distance functions
|
|
668
|
+
*/
|
|
669
|
+
async _performKMeansClustering(items, options) {
|
|
670
|
+
const startTime = performance.now();
|
|
671
|
+
const itemIds = items || await this._getAllItemIds();
|
|
672
|
+
if (itemIds.length === 0) {
|
|
673
|
+
return this._createEmptyResult(startTime, 'kmeans');
|
|
674
|
+
}
|
|
675
|
+
// Get vectors for all items using existing infrastructure
|
|
676
|
+
const itemsWithVectors = await this._getItemsWithVectors(itemIds);
|
|
677
|
+
// Determine optimal k
|
|
678
|
+
const k = options.maxClusters || Math.min(Math.floor(Math.sqrt(itemsWithVectors.length / 2)), 50 // Maximum clusters for practical use
|
|
679
|
+
);
|
|
680
|
+
if (k <= 1) {
|
|
681
|
+
// Single cluster case
|
|
682
|
+
return {
|
|
683
|
+
clusters: [{
|
|
684
|
+
id: 'kmeans-single',
|
|
685
|
+
centroid: await this._calculateCentroidFromItems(itemsWithVectors),
|
|
686
|
+
members: itemIds,
|
|
687
|
+
size: itemIds.length,
|
|
688
|
+
confidence: 1.0,
|
|
689
|
+
label: 'Single cluster',
|
|
690
|
+
metadata: { clustering: 'kmeans', k: 1 }
|
|
691
|
+
}],
|
|
692
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'kmeans'),
|
|
693
|
+
metadata: {
|
|
694
|
+
totalItems: itemIds.length,
|
|
695
|
+
clustersFound: 1,
|
|
696
|
+
averageClusterSize: itemIds.length,
|
|
697
|
+
kValue: 1,
|
|
698
|
+
timestamp: new Date()
|
|
699
|
+
}
|
|
700
|
+
};
|
|
701
|
+
}
|
|
702
|
+
// Initialize centroids using k-means++ for better convergence
|
|
703
|
+
const centroids = await this._initializeCentroidsKMeansPlusPlus(itemsWithVectors, k);
|
|
704
|
+
let assignments = new Array(itemsWithVectors.length).fill(0);
|
|
705
|
+
let hasConverged = false;
|
|
706
|
+
const maxIterations = options.maxIterations || 100;
|
|
707
|
+
const tolerance = options.tolerance || 1e-4;
|
|
708
|
+
// K-means iteration loop
|
|
709
|
+
for (let iteration = 0; iteration < maxIterations && !hasConverged; iteration++) {
|
|
710
|
+
// Assignment step: assign each point to nearest centroid
|
|
711
|
+
const newAssignments = await this._assignPointsToCentroids(itemsWithVectors, centroids);
|
|
712
|
+
// Update step: recalculate centroids
|
|
713
|
+
const newCentroids = await this._updateCentroids(itemsWithVectors, newAssignments, k);
|
|
714
|
+
// Check convergence: has assignment changed significantly?
|
|
715
|
+
const changeRate = this._calculateAssignmentChangeRate(assignments, newAssignments);
|
|
716
|
+
hasConverged = changeRate < tolerance;
|
|
717
|
+
assignments = newAssignments;
|
|
718
|
+
// Update centroids for next iteration
|
|
719
|
+
for (let i = 0; i < centroids.length; i++) {
|
|
720
|
+
centroids[i] = newCentroids[i];
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
// Create semantic clusters from k-means results
|
|
724
|
+
const clusters = [];
|
|
725
|
+
for (let clusterIndex = 0; clusterIndex < k; clusterIndex++) {
|
|
726
|
+
const clusterMembers = itemsWithVectors.filter((_, i) => assignments[i] === clusterIndex);
|
|
727
|
+
if (clusterMembers.length > 0) {
|
|
728
|
+
const memberIds = clusterMembers.map(item => item.id);
|
|
729
|
+
clusters.push({
|
|
730
|
+
id: `kmeans-${clusterIndex}`,
|
|
731
|
+
centroid: centroids[clusterIndex],
|
|
732
|
+
members: memberIds,
|
|
733
|
+
size: memberIds.length,
|
|
734
|
+
confidence: await this._calculateKMeansClusterConfidence(clusterMembers, centroids[clusterIndex]),
|
|
735
|
+
label: await this._generateClusterLabel(clusterMembers, 'kmeans'),
|
|
736
|
+
metadata: {
|
|
737
|
+
clustering: 'kmeans',
|
|
738
|
+
k,
|
|
739
|
+
clusterIndex,
|
|
740
|
+
convergenceIterations: maxIterations
|
|
741
|
+
}
|
|
742
|
+
});
|
|
743
|
+
}
|
|
744
|
+
}
|
|
745
|
+
return {
|
|
746
|
+
clusters,
|
|
747
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'kmeans'),
|
|
748
|
+
metadata: {
|
|
749
|
+
totalItems: itemIds.length,
|
|
750
|
+
clustersFound: clusters.length,
|
|
751
|
+
averageClusterSize: clusters.reduce((sum, c) => sum + c.size, 0) / clusters.length || 0,
|
|
752
|
+
kValue: k,
|
|
753
|
+
hasConverged,
|
|
754
|
+
timestamp: new Date()
|
|
755
|
+
}
|
|
756
|
+
};
|
|
757
|
+
}
|
|
758
|
+
/**
|
|
759
|
+
* DBSCAN CLUSTERING: Density-based clustering with adaptive parameters using HNSW
|
|
760
|
+
*/
|
|
761
|
+
async _performDBSCANClustering(items, options) {
|
|
762
|
+
const startTime = performance.now();
|
|
763
|
+
const itemIds = items || await this._getAllItemIds();
|
|
764
|
+
if (itemIds.length === 0) {
|
|
765
|
+
return this._createEmptyResult(startTime, 'dbscan');
|
|
766
|
+
}
|
|
767
|
+
const itemsWithVectors = await this._getItemsWithVectors(itemIds);
|
|
768
|
+
// Adaptive parameter selection using HNSW neighbors
|
|
769
|
+
const minPts = options.minClusterSize || Math.max(4, Math.floor(Math.log2(itemsWithVectors.length)));
|
|
770
|
+
const eps = options.threshold || await this._estimateOptimalEps(itemsWithVectors, minPts);
|
|
771
|
+
// DBSCAN state tracking
|
|
772
|
+
const NOISE = -1;
|
|
773
|
+
const UNVISITED = 0;
|
|
774
|
+
const visited = new Map();
|
|
775
|
+
const clusterAssignments = new Map();
|
|
776
|
+
let currentClusterId = 1;
|
|
777
|
+
// Process each point
|
|
778
|
+
for (const item of itemsWithVectors) {
|
|
779
|
+
if (visited.get(item.id))
|
|
780
|
+
continue;
|
|
781
|
+
visited.set(item.id, true);
|
|
782
|
+
// Find neighbors using existing HNSW infrastructure for efficiency
|
|
783
|
+
const neighbors = await this._findNeighborsWithinEps(item, itemsWithVectors, eps);
|
|
784
|
+
if (neighbors.length < minPts) {
|
|
785
|
+
// Mark as noise (outlier)
|
|
786
|
+
clusterAssignments.set(item.id, NOISE);
|
|
787
|
+
}
|
|
788
|
+
else {
|
|
789
|
+
// Start new cluster
|
|
790
|
+
await this._expandCluster(item, neighbors, currentClusterId, eps, minPts, itemsWithVectors, visited, clusterAssignments);
|
|
791
|
+
currentClusterId++;
|
|
792
|
+
}
|
|
793
|
+
}
|
|
794
|
+
// Convert DBSCAN results to SemanticCluster format
|
|
795
|
+
const clusters = [];
|
|
796
|
+
const clusterGroups = new Map();
|
|
797
|
+
const outliers = [];
|
|
798
|
+
// Group items by cluster assignment
|
|
799
|
+
for (const [itemId, clusterId] of clusterAssignments) {
|
|
800
|
+
if (clusterId === NOISE) {
|
|
801
|
+
outliers.push(itemId);
|
|
802
|
+
}
|
|
803
|
+
else {
|
|
804
|
+
if (!clusterGroups.has(clusterId)) {
|
|
805
|
+
clusterGroups.set(clusterId, []);
|
|
806
|
+
}
|
|
807
|
+
clusterGroups.get(clusterId).push(itemId);
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
// Create SemanticCluster objects
|
|
811
|
+
for (const [clusterId, memberIds] of clusterGroups) {
|
|
812
|
+
if (memberIds.length > 0) {
|
|
813
|
+
const members = itemsWithVectors.filter(item => memberIds.includes(item.id));
|
|
814
|
+
clusters.push({
|
|
815
|
+
id: `dbscan-${clusterId}`,
|
|
816
|
+
centroid: await this._calculateCentroidFromItems(members),
|
|
817
|
+
members: memberIds,
|
|
818
|
+
size: memberIds.length,
|
|
819
|
+
confidence: await this._calculateDBSCANClusterConfidence(members, eps),
|
|
820
|
+
label: await this._generateClusterLabel(members, 'dbscan'),
|
|
821
|
+
metadata: {
|
|
822
|
+
clustering: 'dbscan',
|
|
823
|
+
clusterId,
|
|
824
|
+
eps,
|
|
825
|
+
minPts,
|
|
826
|
+
isDensityBased: true
|
|
827
|
+
}
|
|
828
|
+
});
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
// Handle outliers - optionally create outlier cluster or assign to nearest
|
|
832
|
+
if (outliers.length > 0 && options.includeOutliers) {
|
|
833
|
+
const outlierMembers = itemsWithVectors.filter(item => outliers.includes(item.id));
|
|
834
|
+
clusters.push({
|
|
835
|
+
id: 'dbscan-outliers',
|
|
836
|
+
centroid: await this._calculateCentroidFromItems(outlierMembers),
|
|
837
|
+
members: outliers,
|
|
838
|
+
size: outliers.length,
|
|
839
|
+
confidence: 0.1, // Low confidence for outliers
|
|
840
|
+
label: 'Outliers',
|
|
841
|
+
metadata: {
|
|
842
|
+
clustering: 'dbscan',
|
|
843
|
+
isOutlierCluster: true,
|
|
844
|
+
eps,
|
|
845
|
+
minPts
|
|
846
|
+
}
|
|
847
|
+
});
|
|
848
|
+
}
|
|
849
|
+
return {
|
|
850
|
+
clusters,
|
|
851
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'dbscan'),
|
|
852
|
+
metadata: {
|
|
853
|
+
totalItems: itemIds.length,
|
|
854
|
+
clustersFound: clusters.length,
|
|
855
|
+
averageClusterSize: clusters.reduce((sum, c) => sum + c.size, 0) / clusters.length || 0,
|
|
856
|
+
outlierCount: outliers.length,
|
|
857
|
+
eps,
|
|
858
|
+
minPts,
|
|
859
|
+
timestamp: new Date()
|
|
860
|
+
}
|
|
861
|
+
};
|
|
862
|
+
}
|
|
863
|
+
/**
|
|
864
|
+
* GRAPH COMMUNITY DETECTION: Uses existing verb relationships for clustering
|
|
865
|
+
*/
|
|
866
|
+
async _performGraphClustering(items, options) {
|
|
867
|
+
const startTime = performance.now();
|
|
868
|
+
const itemIds = items || await this._getAllItemIds();
|
|
869
|
+
if (itemIds.length === 0) {
|
|
870
|
+
return this._createEmptyResult(startTime, 'graph');
|
|
871
|
+
}
|
|
872
|
+
// Build graph from existing verb relationships
|
|
873
|
+
const graph = await this._buildGraphFromVerbs(itemIds, options);
|
|
874
|
+
// Detect communities using modularity optimization
|
|
875
|
+
const communities = await this._detectCommunities(graph, options);
|
|
876
|
+
// Enhance communities with vector similarity for boundary refinement
|
|
877
|
+
const refinedCommunities = await this._refineCommunitiesWithVectors(communities, options);
|
|
878
|
+
// Convert to SemanticCluster format with Triple Intelligence labeling
|
|
879
|
+
const clusters = [];
|
|
880
|
+
for (let i = 0; i < refinedCommunities.length; i++) {
|
|
881
|
+
const community = refinedCommunities[i];
|
|
882
|
+
if (community.members.length > 0) {
|
|
883
|
+
const members = await this._getItemsWithMetadata(community.members);
|
|
884
|
+
// Use Triple Intelligence for intelligent cluster labeling
|
|
885
|
+
const clusterLabel = await this._generateIntelligentClusterLabel(members, 'graph');
|
|
886
|
+
const clusterCentroid = await this._calculateCentroidFromItems(members);
|
|
887
|
+
clusters.push({
|
|
888
|
+
id: `graph-${i}`,
|
|
889
|
+
centroid: clusterCentroid,
|
|
890
|
+
members: community.members,
|
|
891
|
+
size: community.members.length,
|
|
892
|
+
confidence: community.modularity || 0.7,
|
|
893
|
+
label: clusterLabel,
|
|
894
|
+
metadata: {
|
|
895
|
+
clustering: 'graph',
|
|
896
|
+
communityId: i,
|
|
897
|
+
modularity: community.modularity,
|
|
898
|
+
graphDensity: community.density,
|
|
899
|
+
strongestConnections: community.strongestConnections
|
|
900
|
+
}
|
|
901
|
+
});
|
|
902
|
+
}
|
|
903
|
+
}
|
|
904
|
+
return {
|
|
905
|
+
clusters,
|
|
906
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'graph'),
|
|
907
|
+
metadata: {
|
|
908
|
+
totalItems: itemIds.length,
|
|
909
|
+
clustersFound: clusters.length,
|
|
910
|
+
averageClusterSize: clusters.reduce((sum, c) => sum + c.size, 0) / clusters.length || 0,
|
|
911
|
+
averageModularity: clusters.reduce((sum, c) => sum + (c.metadata?.modularity || 0), 0) / clusters.length || 0,
|
|
912
|
+
timestamp: new Date()
|
|
913
|
+
}
|
|
914
|
+
};
|
|
915
|
+
}
|
|
916
|
+
/**
|
|
917
|
+
* MULTI-MODAL FUSION: Combines vector + graph + semantic + Triple Intelligence
|
|
918
|
+
*/
|
|
919
|
+
async _performMultiModalClustering(items, options) {
|
|
920
|
+
const startTime = performance.now();
|
|
921
|
+
const itemIds = items || await this._getAllItemIds();
|
|
922
|
+
if (itemIds.length === 0) {
|
|
923
|
+
return this._createEmptyResult(startTime, 'multimodal');
|
|
924
|
+
}
|
|
925
|
+
// Run multiple clustering algorithms in parallel
|
|
926
|
+
const [vectorClusters, graphClusters, semanticClusters] = await Promise.all([
|
|
927
|
+
this._performHierarchicalClustering(itemIds, { ...options, algorithm: 'hierarchical' }),
|
|
928
|
+
this._performGraphClustering(itemIds, { ...options, algorithm: 'graph' }),
|
|
929
|
+
this._performSemanticClustering(itemIds, { ...options, algorithm: 'semantic' })
|
|
930
|
+
]);
|
|
931
|
+
// Fuse results using intelligent consensus with Triple Intelligence
|
|
932
|
+
const fusedClusters = await this._fuseClusteringResultsWithTripleIntelligence([vectorClusters.clusters, graphClusters.clusters, semanticClusters.clusters], options);
|
|
933
|
+
return {
|
|
934
|
+
clusters: fusedClusters,
|
|
935
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'multimodal'),
|
|
936
|
+
metadata: {
|
|
937
|
+
totalItems: itemIds.length,
|
|
938
|
+
clustersFound: fusedClusters.length,
|
|
939
|
+
averageClusterSize: fusedClusters.reduce((sum, c) => sum + c.size, 0) / fusedClusters.length || 0,
|
|
940
|
+
fusionMethod: 'triple_intelligence_consensus',
|
|
941
|
+
componentAlgorithms: ['hierarchical', 'graph', 'semantic'],
|
|
942
|
+
timestamp: new Date()
|
|
943
|
+
}
|
|
944
|
+
};
|
|
945
|
+
}
|
|
946
|
+
/**
|
|
947
|
+
* SAMPLED CLUSTERING: For very large datasets using intelligent sampling
|
|
948
|
+
*/
|
|
949
|
+
async _performSampledClustering(items, options) {
|
|
950
|
+
const startTime = performance.now();
|
|
951
|
+
const itemIds = items || await this._getAllItemIds();
|
|
952
|
+
if (itemIds.length === 0) {
|
|
953
|
+
return this._createEmptyResult(startTime, 'sampled');
|
|
954
|
+
}
|
|
955
|
+
const sampleSize = Math.min(options.sampleSize || 1000, itemIds.length);
|
|
956
|
+
const strategy = options.strategy || 'diverse';
|
|
957
|
+
// Intelligent sampling using existing infrastructure
|
|
958
|
+
const sample = await this._getSampleUsingStrategy(itemIds, sampleSize, strategy);
|
|
959
|
+
// Cluster the sample using the best algorithm for the sample size
|
|
960
|
+
const sampleResult = await this._performHierarchicalClustering(sample, {
|
|
961
|
+
...options,
|
|
962
|
+
maxClusters: Math.min(options.maxClusters || 50, Math.ceil(sample.length / 10))
|
|
963
|
+
});
|
|
964
|
+
// Project clusters back to full dataset using HNSW neighbors
|
|
965
|
+
const projectedClusters = await this._projectClustersToFullDataset(sampleResult.clusters, itemIds, sample);
|
|
966
|
+
return {
|
|
967
|
+
clusters: projectedClusters,
|
|
968
|
+
metrics: this._createPerformanceMetrics(startTime, itemIds.length, 'sampled'),
|
|
969
|
+
metadata: {
|
|
970
|
+
totalItems: itemIds.length,
|
|
971
|
+
sampleSize: sample.length,
|
|
972
|
+
samplingStrategy: strategy,
|
|
973
|
+
clustersFound: projectedClusters.length,
|
|
974
|
+
averageClusterSize: projectedClusters.reduce((sum, c) => sum + c.size, 0) / projectedClusters.length || 0,
|
|
975
|
+
timestamp: new Date()
|
|
976
|
+
}
|
|
977
|
+
};
|
|
978
|
+
}
|
|
979
|
+
// Similarity implementation methods
|
|
980
|
+
async _similarityById(id1, id2, options) {
|
|
981
|
+
// Get vectors for both items
|
|
982
|
+
const item1 = await this.brain.getNoun(id1);
|
|
983
|
+
const item2 = await this.brain.getNoun(id2);
|
|
984
|
+
if (!item1 || !item2) {
|
|
985
|
+
return 0;
|
|
986
|
+
}
|
|
987
|
+
return this._similarityByVector(item1.vector, item2.vector, options);
|
|
988
|
+
}
|
|
989
|
+
async _similarityByVector(v1, v2, options) {
|
|
990
|
+
const metric = options.metric || this.config.similarityMetric || 'cosine';
|
|
991
|
+
let score = 0;
|
|
992
|
+
switch (metric) {
|
|
993
|
+
case 'cosine':
|
|
994
|
+
score = 1 - cosineDistance(v1, v2);
|
|
995
|
+
break;
|
|
996
|
+
case 'euclidean':
|
|
997
|
+
score = 1 / (1 + euclideanDistance(v1, v2));
|
|
998
|
+
break;
|
|
999
|
+
case 'manhattan':
|
|
1000
|
+
score = 1 / (1 + this._manhattanDistance(v1, v2));
|
|
1001
|
+
break;
|
|
1002
|
+
default:
|
|
1003
|
+
score = 1 - cosineDistance(v1, v2);
|
|
1004
|
+
}
|
|
1005
|
+
if (options.detailed) {
|
|
1006
|
+
return {
|
|
1007
|
+
score: options.normalized !== false ? Math.max(0, Math.min(1, score)) : score,
|
|
1008
|
+
confidence: this._calculateConfidence(score, v1, v2),
|
|
1009
|
+
explanation: this._generateSimilarityExplanation(score, metric),
|
|
1010
|
+
metric
|
|
1011
|
+
};
|
|
1012
|
+
}
|
|
1013
|
+
return options.normalized !== false ? Math.max(0, Math.min(1, score)) : score;
|
|
1014
|
+
}
|
|
1015
|
+
async _similarityByText(text1, text2, options) {
|
|
1016
|
+
// Convert text to vectors using brain's embedding function
|
|
1017
|
+
const vector1 = await this.brain.embed(text1);
|
|
1018
|
+
const vector2 = await this.brain.embed(text2);
|
|
1019
|
+
return this._similarityByVector(vector1, vector2, options);
|
|
1020
|
+
}
|
|
1021
|
+
// Utility methods for internal operations
|
|
1022
|
+
_isId(value) {
|
|
1023
|
+
return typeof value === 'string' &&
|
|
1024
|
+
(value.length === 36 && value.includes('-')) || // UUID-like
|
|
1025
|
+
(value.length > 10 && !value.includes(' ')); // ID-like string
|
|
1026
|
+
}
|
|
1027
|
+
_isVector(value) {
|
|
1028
|
+
return Array.isArray(value) &&
|
|
1029
|
+
value.length > 0 &&
|
|
1030
|
+
typeof value[0] === 'number';
|
|
1031
|
+
}
|
|
1032
|
+
async _convertToVector(input) {
|
|
1033
|
+
if (this._isVector(input)) {
|
|
1034
|
+
return input;
|
|
1035
|
+
}
|
|
1036
|
+
else if (this._isId(input)) {
|
|
1037
|
+
const item = await this.brain.getNoun(input);
|
|
1038
|
+
return item?.vector || [];
|
|
1039
|
+
}
|
|
1040
|
+
else if (typeof input === 'string') {
|
|
1041
|
+
return await this.brain.embed(input);
|
|
1042
|
+
}
|
|
1043
|
+
else {
|
|
1044
|
+
throw new Error(`Cannot convert input to vector: ${typeof input}`);
|
|
1045
|
+
}
|
|
1046
|
+
}
|
|
1047
|
+
_createSimilarityKey(a, b, options) {
|
|
1048
|
+
const aKey = typeof a === 'object' ? JSON.stringify(a).substring(0, 50) : String(a);
|
|
1049
|
+
const bKey = typeof b === 'object' ? JSON.stringify(b).substring(0, 50) : String(b);
|
|
1050
|
+
return `${aKey}|${bKey}|${JSON.stringify(options)}`;
|
|
1051
|
+
}
|
|
1052
|
+
_createClusteringKey(items, options) {
|
|
1053
|
+
const itemsKey = items ? [...items].sort().join(',') : 'all';
|
|
1054
|
+
return `clustering:${itemsKey}:${JSON.stringify(options)}`;
|
|
1055
|
+
}
|
|
1056
|
+
_cacheResult(key, result, cache) {
|
|
1057
|
+
if (cache.size >= (this.config.cacheSize || 1000)) {
|
|
1058
|
+
// Remove oldest entries (simple LRU)
|
|
1059
|
+
const firstKey = cache.keys().next().value;
|
|
1060
|
+
if (firstKey)
|
|
1061
|
+
cache.delete(firstKey);
|
|
1062
|
+
}
|
|
1063
|
+
cache.set(key, result);
|
|
1064
|
+
}
|
|
1065
|
+
_trackPerformance(operation, startTime, itemCount, algorithm) {
|
|
1066
|
+
if (!this.config.performanceTracking)
|
|
1067
|
+
return;
|
|
1068
|
+
const metrics = {
|
|
1069
|
+
executionTime: performance.now() - startTime,
|
|
1070
|
+
memoryUsed: 0, // Would implement actual memory tracking
|
|
1071
|
+
itemsProcessed: itemCount,
|
|
1072
|
+
cacheHits: 0, // Would track actual cache hits
|
|
1073
|
+
cacheMisses: 0, // Would track actual cache misses
|
|
1074
|
+
algorithm
|
|
1075
|
+
};
|
|
1076
|
+
if (!this.performanceMetrics.has(operation)) {
|
|
1077
|
+
this.performanceMetrics.set(operation, []);
|
|
1078
|
+
}
|
|
1079
|
+
this.performanceMetrics.get(operation).push(metrics);
|
|
1080
|
+
}
|
|
1081
|
+
_createPerformanceMetrics(startTime, itemCount, algorithm) {
|
|
1082
|
+
return {
|
|
1083
|
+
executionTime: performance.now() - startTime,
|
|
1084
|
+
memoryUsed: 0,
|
|
1085
|
+
itemsProcessed: itemCount,
|
|
1086
|
+
cacheHits: 0,
|
|
1087
|
+
cacheMisses: 0,
|
|
1088
|
+
algorithm
|
|
1089
|
+
};
|
|
1090
|
+
}
|
|
1091
|
+
_initializeCleanupTimer() {
|
|
1092
|
+
// Periodically clean up caches to prevent memory leaks
|
|
1093
|
+
setInterval(() => {
|
|
1094
|
+
if (this.similarityCache.size > (this.config.cacheSize || 1000)) {
|
|
1095
|
+
this.similarityCache.clear();
|
|
1096
|
+
}
|
|
1097
|
+
if (this.clusterCache.size > (this.config.cacheSize || 1000)) {
|
|
1098
|
+
this.clusterCache.clear();
|
|
1099
|
+
}
|
|
1100
|
+
if (this.hierarchyCache.size > (this.config.cacheSize || 1000)) {
|
|
1101
|
+
this.hierarchyCache.clear();
|
|
1102
|
+
}
|
|
1103
|
+
if (this.neighborsCache.size > (this.config.cacheSize || 1000)) {
|
|
1104
|
+
this.neighborsCache.clear();
|
|
1105
|
+
}
|
|
1106
|
+
}, 300000); // Clean every 5 minutes
|
|
1107
|
+
}
|
|
1108
|
+
// ===== GRAPH COMMUNITY DETECTION UTILITIES =====
|
|
1109
|
+
/**
|
|
1110
|
+
* Build graph structure from existing verb relationships
|
|
1111
|
+
*/
|
|
1112
|
+
async _buildGraphFromVerbs(itemIds, options) {
|
|
1113
|
+
const nodes = new Set(itemIds);
|
|
1114
|
+
const edges = new Map();
|
|
1115
|
+
const verbWeights = new Map();
|
|
1116
|
+
// Initialize verb relationship weights
|
|
1117
|
+
const relationshipWeights = {
|
|
1118
|
+
'creates': 1.0,
|
|
1119
|
+
'partOf': 0.9,
|
|
1120
|
+
'contains': 0.9,
|
|
1121
|
+
'relatedTo': 0.7,
|
|
1122
|
+
'references': 0.6,
|
|
1123
|
+
'causes': 0.8,
|
|
1124
|
+
'dependsOn': 0.8,
|
|
1125
|
+
'memberOf': 0.9,
|
|
1126
|
+
'worksWith': 0.7,
|
|
1127
|
+
'communicates': 0.6
|
|
1128
|
+
};
|
|
1129
|
+
// Get all verbs connecting the items
|
|
1130
|
+
for (const sourceId of itemIds) {
|
|
1131
|
+
const sourceVerbs = await this.brain.getVerbsForNoun(sourceId);
|
|
1132
|
+
for (const verb of sourceVerbs) {
|
|
1133
|
+
const targetId = verb.target;
|
|
1134
|
+
if (nodes.has(targetId) && sourceId !== targetId) {
|
|
1135
|
+
// Initialize edge map if needed
|
|
1136
|
+
if (!edges.has(sourceId)) {
|
|
1137
|
+
edges.set(sourceId, new Map());
|
|
1138
|
+
}
|
|
1139
|
+
// Calculate edge weight from verb type and metadata
|
|
1140
|
+
const verbType = verb.verb;
|
|
1141
|
+
const baseWeight = relationshipWeights[verbType] || 0.5;
|
|
1142
|
+
const confidenceWeight = verb.confidence || 1.0;
|
|
1143
|
+
const weight = baseWeight * confidenceWeight;
|
|
1144
|
+
// Add or strengthen edge
|
|
1145
|
+
const currentWeight = edges.get(sourceId)?.get(targetId) || 0;
|
|
1146
|
+
edges.get(sourceId).set(targetId, Math.min(currentWeight + weight, 1.0));
|
|
1147
|
+
// Make graph undirected by adding reverse edge
|
|
1148
|
+
if (!edges.has(targetId)) {
|
|
1149
|
+
edges.set(targetId, new Map());
|
|
1150
|
+
}
|
|
1151
|
+
const reverseWeight = edges.get(targetId)?.get(sourceId) || 0;
|
|
1152
|
+
edges.get(targetId).set(sourceId, Math.min(reverseWeight + weight, 1.0));
|
|
1153
|
+
}
|
|
1154
|
+
}
|
|
1155
|
+
}
|
|
1156
|
+
return {
|
|
1157
|
+
nodes: Array.from(nodes),
|
|
1158
|
+
edges,
|
|
1159
|
+
nodeCount: nodes.size,
|
|
1160
|
+
edgeCount: Array.from(edges.values()).reduce((sum, edgeMap) => sum + edgeMap.size, 0) / 2 // Undirected
|
|
1161
|
+
};
|
|
1162
|
+
}
|
|
1163
|
+
/**
|
|
1164
|
+
* Detect communities using Louvain modularity optimization
|
|
1165
|
+
*/
|
|
1166
|
+
async _detectCommunities(graph, options) {
|
|
1167
|
+
const { nodes, edges } = graph;
|
|
1168
|
+
// Initialize each node as its own community
|
|
1169
|
+
const communities = new Map();
|
|
1170
|
+
nodes.forEach((node, index) => communities.set(node, index));
|
|
1171
|
+
const totalWeight = this._calculateTotalWeight(edges);
|
|
1172
|
+
let improved = true;
|
|
1173
|
+
let iteration = 0;
|
|
1174
|
+
const maxIterations = 50;
|
|
1175
|
+
// Louvain algorithm: iteratively move nodes to communities that maximize modularity
|
|
1176
|
+
while (improved && iteration < maxIterations) {
|
|
1177
|
+
improved = false;
|
|
1178
|
+
iteration++;
|
|
1179
|
+
for (const node of nodes) {
|
|
1180
|
+
const currentCommunity = communities.get(node);
|
|
1181
|
+
let bestCommunity = currentCommunity;
|
|
1182
|
+
let bestGain = 0;
|
|
1183
|
+
// Consider neighboring communities
|
|
1184
|
+
const neighborCommunities = this._getNeighborCommunities(node, edges, communities);
|
|
1185
|
+
for (const neighborCommunity of neighborCommunities) {
|
|
1186
|
+
if (neighborCommunity !== currentCommunity) {
|
|
1187
|
+
const gain = this._calculateModularityGain(node, currentCommunity, neighborCommunity, edges, communities, totalWeight);
|
|
1188
|
+
if (gain > bestGain) {
|
|
1189
|
+
bestGain = gain;
|
|
1190
|
+
bestCommunity = neighborCommunity;
|
|
1191
|
+
}
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
// Move node if beneficial
|
|
1195
|
+
if (bestCommunity !== currentCommunity) {
|
|
1196
|
+
communities.set(node, bestCommunity);
|
|
1197
|
+
improved = true;
|
|
1198
|
+
}
|
|
1199
|
+
}
|
|
1200
|
+
}
|
|
1201
|
+
// Group nodes by final community assignment
|
|
1202
|
+
const communityGroups = new Map();
|
|
1203
|
+
for (const [node, communityId] of communities) {
|
|
1204
|
+
if (!communityGroups.has(communityId)) {
|
|
1205
|
+
communityGroups.set(communityId, []);
|
|
1206
|
+
}
|
|
1207
|
+
communityGroups.get(communityId).push(node);
|
|
1208
|
+
}
|
|
1209
|
+
// Convert to Community objects with metadata
|
|
1210
|
+
const result = [];
|
|
1211
|
+
for (const [communityId, members] of communityGroups) {
|
|
1212
|
+
if (members.length >= (options.minClusterSize || 2)) {
|
|
1213
|
+
const modularity = this._calculateCommunityModularity(members, edges, totalWeight);
|
|
1214
|
+
const density = this._calculateCommunityDensity(members, edges);
|
|
1215
|
+
const strongestConnections = this._findStrongestConnections(members, edges, 3);
|
|
1216
|
+
result.push({
|
|
1217
|
+
id: communityId,
|
|
1218
|
+
members,
|
|
1219
|
+
modularity,
|
|
1220
|
+
density,
|
|
1221
|
+
strongestConnections
|
|
1222
|
+
});
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
1225
|
+
return result;
|
|
1226
|
+
}
|
|
1227
|
+
/**
|
|
1228
|
+
* Refine community boundaries using vector similarity
|
|
1229
|
+
*/
|
|
1230
|
+
async _refineCommunitiesWithVectors(communities, options) {
|
|
1231
|
+
const refined = [];
|
|
1232
|
+
for (const community of communities) {
|
|
1233
|
+
const membersWithVectors = await this._getItemsWithVectors(community.members);
|
|
1234
|
+
// Check if community is coherent in vector space
|
|
1235
|
+
const vectorCoherence = await this._calculateVectorCoherence(membersWithVectors);
|
|
1236
|
+
if (vectorCoherence > 0.3) {
|
|
1237
|
+
// Community is coherent, keep as is
|
|
1238
|
+
refined.push(community);
|
|
1239
|
+
}
|
|
1240
|
+
else {
|
|
1241
|
+
// Split community using vector-based sub-clustering
|
|
1242
|
+
const subClusters = await this._performHierarchicalClustering(community.members, { ...options, maxClusters: Math.ceil(community.members.length / 5) });
|
|
1243
|
+
// Convert sub-clusters to communities
|
|
1244
|
+
for (let i = 0; i < subClusters.clusters.length; i++) {
|
|
1245
|
+
const subCluster = subClusters.clusters[i];
|
|
1246
|
+
refined.push({
|
|
1247
|
+
id: community.id * 1000 + i, // Unique sub-community ID
|
|
1248
|
+
members: subCluster.members,
|
|
1249
|
+
modularity: community.modularity * 0.8, // Slightly lower modularity for sub-communities
|
|
1250
|
+
density: community.density,
|
|
1251
|
+
strongestConnections: []
|
|
1252
|
+
});
|
|
1253
|
+
}
|
|
1254
|
+
}
|
|
1255
|
+
}
|
|
1256
|
+
return refined;
|
|
1257
|
+
}
|
|
1258
|
+
// ===== SEMANTIC CLUSTERING UTILITIES =====
|
|
1259
|
+
/**
|
|
1260
|
+
* Get items with their metadata including noun types
|
|
1261
|
+
*/
|
|
1262
|
+
async _getItemsWithMetadata(itemIds) {
|
|
1263
|
+
const items = await Promise.all(itemIds.map(async (id) => {
|
|
1264
|
+
const noun = await this.brain.getNoun(id);
|
|
1265
|
+
return {
|
|
1266
|
+
id,
|
|
1267
|
+
vector: noun?.vector || [],
|
|
1268
|
+
metadata: noun?.data || {},
|
|
1269
|
+
nounType: noun?.noun || 'concept',
|
|
1270
|
+
label: noun?.label || id,
|
|
1271
|
+
data: noun?.data
|
|
1272
|
+
};
|
|
1273
|
+
}));
|
|
1274
|
+
return items.filter(item => item.vector.length > 0);
|
|
1275
|
+
}
|
|
1276
|
+
/**
|
|
1277
|
+
* Group items by their semantic noun types
|
|
1278
|
+
*/
|
|
1279
|
+
_groupBySemanticType(items) {
|
|
1280
|
+
const groups = new Map();
|
|
1281
|
+
for (const item of items) {
|
|
1282
|
+
const type = item.nounType;
|
|
1283
|
+
if (!groups.has(type)) {
|
|
1284
|
+
groups.set(type, []);
|
|
1285
|
+
}
|
|
1286
|
+
groups.get(type).push(item);
|
|
1287
|
+
}
|
|
1288
|
+
return groups;
|
|
1289
|
+
}
|
|
1290
|
+
// Placeholder implementations for complex operations
|
|
1291
|
+
async _getAllItemIds() {
|
|
1292
|
+
// Get all noun IDs from the brain
|
|
1293
|
+
const stats = await this.brain.getStatistics();
|
|
1294
|
+
if (!stats.totalNodes || stats.totalNodes === 0) {
|
|
1295
|
+
return [];
|
|
1296
|
+
}
|
|
1297
|
+
// Use a simple approach: get recent items or sample
|
|
1298
|
+
// In practice, this could be optimized with pagination
|
|
1299
|
+
const items = await this.brain.getRecent(Math.min(stats.totalNodes, 10000));
|
|
1300
|
+
return items.map((item) => item.id);
|
|
1301
|
+
}
|
|
1302
|
+
async _getTotalItemCount() {
|
|
1303
|
+
const stats = await this.brain.getStatistics();
|
|
1304
|
+
return stats.totalNodes || 0;
|
|
1305
|
+
}
|
|
1306
|
+
// ===== GRAPH ALGORITHM SUPPORTING METHODS =====
|
|
1307
|
+
_calculateTotalWeight(edges) {
|
|
1308
|
+
let total = 0;
|
|
1309
|
+
for (const edgeMap of edges.values()) {
|
|
1310
|
+
for (const weight of edgeMap.values()) {
|
|
1311
|
+
total += weight;
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
return total / 2; // Undirected graph, so divide by 2
|
|
1315
|
+
}
|
|
1316
|
+
_getNeighborCommunities(node, edges, communities) {
|
|
1317
|
+
const neighborCommunities = new Set();
|
|
1318
|
+
const nodeEdges = edges.get(node);
|
|
1319
|
+
if (nodeEdges) {
|
|
1320
|
+
for (const neighbor of nodeEdges.keys()) {
|
|
1321
|
+
const neighborCommunity = communities.get(neighbor);
|
|
1322
|
+
if (neighborCommunity !== undefined) {
|
|
1323
|
+
neighborCommunities.add(neighborCommunity);
|
|
1324
|
+
}
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
return neighborCommunities;
|
|
1328
|
+
}
|
|
1329
|
+
_calculateModularityGain(node, oldCommunity, newCommunity, edges, communities, totalWeight) {
|
|
1330
|
+
// Calculate the degree of the node
|
|
1331
|
+
const nodeDegree = this._getNodeDegree(node, edges);
|
|
1332
|
+
// Calculate edges to old and new communities
|
|
1333
|
+
const edgesToOld = this._getEdgesToCommunity(node, oldCommunity, edges, communities);
|
|
1334
|
+
const edgesToNew = this._getEdgesToCommunity(node, newCommunity, edges, communities);
|
|
1335
|
+
// Calculate community weights
|
|
1336
|
+
const oldCommunityWeight = this._getCommunityWeight(oldCommunity, edges, communities);
|
|
1337
|
+
const newCommunityWeight = this._getCommunityWeight(newCommunity, edges, communities);
|
|
1338
|
+
// Modularity gain calculation (simplified)
|
|
1339
|
+
const oldContrib = edgesToOld - (nodeDegree * oldCommunityWeight) / (2 * totalWeight);
|
|
1340
|
+
const newContrib = edgesToNew - (nodeDegree * newCommunityWeight) / (2 * totalWeight);
|
|
1341
|
+
return newContrib - oldContrib;
|
|
1342
|
+
}
|
|
1343
|
+
_getNodeDegree(node, edges) {
|
|
1344
|
+
const nodeEdges = edges.get(node);
|
|
1345
|
+
if (!nodeEdges)
|
|
1346
|
+
return 0;
|
|
1347
|
+
return Array.from(nodeEdges.values()).reduce((sum, weight) => sum + weight, 0);
|
|
1348
|
+
}
|
|
1349
|
+
_getEdgesToCommunity(node, community, edges, communities) {
|
|
1350
|
+
const nodeEdges = edges.get(node);
|
|
1351
|
+
if (!nodeEdges)
|
|
1352
|
+
return 0;
|
|
1353
|
+
let total = 0;
|
|
1354
|
+
for (const [neighbor, weight] of nodeEdges) {
|
|
1355
|
+
if (communities.get(neighbor) === community) {
|
|
1356
|
+
total += weight;
|
|
1357
|
+
}
|
|
1358
|
+
}
|
|
1359
|
+
return total;
|
|
1360
|
+
}
|
|
1361
|
+
_getCommunityWeight(community, edges, communities) {
|
|
1362
|
+
let total = 0;
|
|
1363
|
+
for (const [node, nodeCommunity] of communities) {
|
|
1364
|
+
if (nodeCommunity === community) {
|
|
1365
|
+
total += this._getNodeDegree(node, edges);
|
|
1366
|
+
}
|
|
1367
|
+
}
|
|
1368
|
+
return total;
|
|
1369
|
+
}
|
|
1370
|
+
_calculateCommunityModularity(members, edges, totalWeight) {
|
|
1371
|
+
if (members.length < 2)
|
|
1372
|
+
return 0;
|
|
1373
|
+
let internalWeight = 0;
|
|
1374
|
+
let totalDegree = 0;
|
|
1375
|
+
for (const member of members) {
|
|
1376
|
+
const memberEdges = edges.get(member);
|
|
1377
|
+
if (memberEdges) {
|
|
1378
|
+
totalDegree += Array.from(memberEdges.values()).reduce((sum, w) => sum + w, 0);
|
|
1379
|
+
// Count internal edges
|
|
1380
|
+
for (const [neighbor, weight] of memberEdges) {
|
|
1381
|
+
if (members.includes(neighbor)) {
|
|
1382
|
+
internalWeight += weight;
|
|
1383
|
+
}
|
|
1384
|
+
}
|
|
1385
|
+
}
|
|
1386
|
+
}
|
|
1387
|
+
internalWeight /= 2; // Undirected graph
|
|
1388
|
+
const expectedInternal = (totalDegree * totalDegree) / (4 * totalWeight);
|
|
1389
|
+
return (internalWeight / totalWeight) - expectedInternal / totalWeight;
|
|
1390
|
+
}
|
|
1391
|
+
_calculateCommunityDensity(members, edges) {
|
|
1392
|
+
if (members.length < 2)
|
|
1393
|
+
return 0;
|
|
1394
|
+
let actualEdges = 0;
|
|
1395
|
+
const maxPossibleEdges = (members.length * (members.length - 1)) / 2;
|
|
1396
|
+
for (const member of members) {
|
|
1397
|
+
const memberEdges = edges.get(member);
|
|
1398
|
+
if (memberEdges) {
|
|
1399
|
+
for (const neighbor of memberEdges.keys()) {
|
|
1400
|
+
if (members.includes(neighbor) && member < neighbor) { // Avoid double counting
|
|
1401
|
+
actualEdges++;
|
|
1402
|
+
}
|
|
1403
|
+
}
|
|
1404
|
+
}
|
|
1405
|
+
}
|
|
1406
|
+
return actualEdges / maxPossibleEdges;
|
|
1407
|
+
}
|
|
1408
|
+
_findStrongestConnections(members, edges, limit) {
|
|
1409
|
+
const connections = [];
|
|
1410
|
+
for (const member of members) {
|
|
1411
|
+
const memberEdges = edges.get(member);
|
|
1412
|
+
if (memberEdges) {
|
|
1413
|
+
for (const [neighbor, weight] of memberEdges) {
|
|
1414
|
+
if (members.includes(neighbor) && member < neighbor) { // Avoid duplicates
|
|
1415
|
+
connections.push({ from: member, to: neighbor, weight });
|
|
1416
|
+
}
|
|
1417
|
+
}
|
|
1418
|
+
}
|
|
1419
|
+
}
|
|
1420
|
+
return connections
|
|
1421
|
+
.sort((a, b) => b.weight - a.weight)
|
|
1422
|
+
.slice(0, limit);
|
|
1423
|
+
}
|
|
1424
|
+
// ===== K-MEANS UTILITIES =====
|
|
1425
|
+
/**
|
|
1426
|
+
* Get items with their vector representations
|
|
1427
|
+
*/
|
|
1428
|
+
async _getItemsWithVectors(itemIds) {
|
|
1429
|
+
const items = await Promise.all(itemIds.map(async (id) => {
|
|
1430
|
+
const noun = await this.brain.getNoun(id);
|
|
1431
|
+
return {
|
|
1432
|
+
id,
|
|
1433
|
+
vector: noun?.vector || []
|
|
1434
|
+
};
|
|
1435
|
+
}));
|
|
1436
|
+
return items.filter(item => item.vector.length > 0);
|
|
1437
|
+
}
|
|
1438
|
+
/**
|
|
1439
|
+
* Calculate centroid from items using existing distance functions
|
|
1440
|
+
*/
|
|
1441
|
+
async _calculateCentroidFromItems(items) {
|
|
1442
|
+
if (items.length === 0)
|
|
1443
|
+
return [];
|
|
1444
|
+
if (items.length === 1)
|
|
1445
|
+
return [...items[0].vector];
|
|
1446
|
+
const dimensions = items[0].vector.length;
|
|
1447
|
+
const centroid = new Array(dimensions).fill(0);
|
|
1448
|
+
for (const item of items) {
|
|
1449
|
+
for (let i = 0; i < dimensions; i++) {
|
|
1450
|
+
centroid[i] += item.vector[i];
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
for (let i = 0; i < dimensions; i++) {
|
|
1454
|
+
centroid[i] /= items.length;
|
|
1455
|
+
}
|
|
1456
|
+
return centroid;
|
|
1457
|
+
}
|
|
1458
|
+
/**
|
|
1459
|
+
* Initialize centroids using k-means++ algorithm for better convergence
|
|
1460
|
+
*/
|
|
1461
|
+
async _initializeCentroidsKMeansPlusPlus(items, k) {
|
|
1462
|
+
const centroids = [];
|
|
1463
|
+
// Choose first centroid randomly
|
|
1464
|
+
const firstIdx = Math.floor(Math.random() * items.length);
|
|
1465
|
+
centroids.push([...items[firstIdx].vector]);
|
|
1466
|
+
// Choose remaining centroids using k-means++ probability
|
|
1467
|
+
for (let i = 1; i < k; i++) {
|
|
1468
|
+
const distances = items.map(item => {
|
|
1469
|
+
// Find distance to closest existing centroid
|
|
1470
|
+
let minDist = Infinity;
|
|
1471
|
+
for (const centroid of centroids) {
|
|
1472
|
+
const dist = this._calculateSquaredDistance(item.vector, centroid);
|
|
1473
|
+
minDist = Math.min(minDist, dist);
|
|
1474
|
+
}
|
|
1475
|
+
return minDist;
|
|
1476
|
+
});
|
|
1477
|
+
// Choose next centroid with probability proportional to squared distance
|
|
1478
|
+
const totalDistance = distances.reduce((sum, d) => sum + d, 0);
|
|
1479
|
+
const target = Math.random() * totalDistance;
|
|
1480
|
+
let cumulative = 0;
|
|
1481
|
+
for (let j = 0; j < distances.length; j++) {
|
|
1482
|
+
cumulative += distances[j];
|
|
1483
|
+
if (cumulative >= target) {
|
|
1484
|
+
centroids.push([...items[j].vector]);
|
|
1485
|
+
break;
|
|
1486
|
+
}
|
|
1487
|
+
}
|
|
1488
|
+
}
|
|
1489
|
+
return centroids;
|
|
1490
|
+
}
|
|
1491
|
+
/**
|
|
1492
|
+
* Assign points to nearest centroids using existing distance functions
|
|
1493
|
+
*/
|
|
1494
|
+
async _assignPointsToCentroids(items, centroids) {
|
|
1495
|
+
const assignments = [];
|
|
1496
|
+
for (const item of items) {
|
|
1497
|
+
let bestCentroid = 0;
|
|
1498
|
+
let minDistance = Infinity;
|
|
1499
|
+
for (let i = 0; i < centroids.length; i++) {
|
|
1500
|
+
const distance = this._calculateSquaredDistance(item.vector, centroids[i]);
|
|
1501
|
+
if (distance < minDistance) {
|
|
1502
|
+
minDistance = distance;
|
|
1503
|
+
bestCentroid = i;
|
|
1504
|
+
}
|
|
1505
|
+
}
|
|
1506
|
+
assignments.push(bestCentroid);
|
|
1507
|
+
}
|
|
1508
|
+
return assignments;
|
|
1509
|
+
}
|
|
1510
|
+
/**
|
|
1511
|
+
* Update centroids based on current assignments
|
|
1512
|
+
*/
|
|
1513
|
+
async _updateCentroids(items, assignments, k) {
|
|
1514
|
+
const newCentroids = [];
|
|
1515
|
+
for (let i = 0; i < k; i++) {
|
|
1516
|
+
const clusterItems = items.filter((_, idx) => assignments[idx] === i);
|
|
1517
|
+
if (clusterItems.length > 0) {
|
|
1518
|
+
newCentroids.push(await this._calculateCentroidFromItems(clusterItems));
|
|
1519
|
+
}
|
|
1520
|
+
else {
|
|
1521
|
+
// Keep old centroid if no items assigned
|
|
1522
|
+
newCentroids.push(new Array(items[0].vector.length).fill(0));
|
|
1523
|
+
}
|
|
1524
|
+
}
|
|
1525
|
+
return newCentroids;
|
|
1526
|
+
}
|
|
1527
|
+
/**
|
|
1528
|
+
* Calculate how much assignments have changed between iterations
|
|
1529
|
+
*/
|
|
1530
|
+
_calculateAssignmentChangeRate(oldAssignments, newAssignments) {
|
|
1531
|
+
if (oldAssignments.length !== newAssignments.length)
|
|
1532
|
+
return 1.0;
|
|
1533
|
+
let changes = 0;
|
|
1534
|
+
for (let i = 0; i < oldAssignments.length; i++) {
|
|
1535
|
+
if (oldAssignments[i] !== newAssignments[i]) {
|
|
1536
|
+
changes++;
|
|
1537
|
+
}
|
|
1538
|
+
}
|
|
1539
|
+
return changes / oldAssignments.length;
|
|
1540
|
+
}
|
|
1541
|
+
/**
|
|
1542
|
+
* Calculate cluster confidence for k-means clusters
|
|
1543
|
+
*/
|
|
1544
|
+
async _calculateKMeansClusterConfidence(clusterItems, centroid) {
|
|
1545
|
+
if (clusterItems.length <= 1)
|
|
1546
|
+
return 1.0;
|
|
1547
|
+
// Calculate average distance to centroid
|
|
1548
|
+
const distances = clusterItems.map(item => this._calculateSquaredDistance(item.vector, centroid));
|
|
1549
|
+
const avgDistance = distances.reduce((sum, d) => sum + d, 0) / distances.length;
|
|
1550
|
+
// Calculate standard deviation
|
|
1551
|
+
const variance = distances.reduce((sum, d) => sum + Math.pow(d - avgDistance, 2), 0) / distances.length;
|
|
1552
|
+
const stdDev = Math.sqrt(variance);
|
|
1553
|
+
// Higher confidence for tighter clusters
|
|
1554
|
+
const tightness = avgDistance > 0 ? Math.max(0, 1 - (stdDev / avgDistance)) : 1.0;
|
|
1555
|
+
return Math.min(1.0, tightness);
|
|
1556
|
+
}
|
|
1557
|
+
// ===== DBSCAN UTILITIES =====
|
|
1558
|
+
/**
|
|
1559
|
+
* Estimate optimal eps parameter using k-nearest neighbor distances
|
|
1560
|
+
*/
|
|
1561
|
+
async _estimateOptimalEps(items, minPts) {
|
|
1562
|
+
if (items.length < minPts)
|
|
1563
|
+
return 0.5;
|
|
1564
|
+
// Calculate k-nearest neighbor distances for each point
|
|
1565
|
+
const kDistances = [];
|
|
1566
|
+
for (const item of items) {
|
|
1567
|
+
const distances = [];
|
|
1568
|
+
for (const otherItem of items) {
|
|
1569
|
+
if (item.id !== otherItem.id) {
|
|
1570
|
+
const distance = Math.sqrt(this._calculateSquaredDistance(item.vector, otherItem.vector));
|
|
1571
|
+
distances.push(distance);
|
|
1572
|
+
}
|
|
1573
|
+
}
|
|
1574
|
+
distances.sort((a, b) => a - b);
|
|
1575
|
+
// Get k-th nearest neighbor distance (minPts-1 because we exclude self)
|
|
1576
|
+
const kthDistance = distances[Math.min(minPts - 1, distances.length - 1)];
|
|
1577
|
+
kDistances.push(kthDistance);
|
|
1578
|
+
}
|
|
1579
|
+
kDistances.sort((a, b) => a - b);
|
|
1580
|
+
// Use knee point detection - find point with maximum curvature
|
|
1581
|
+
// Simplified approach: use 90th percentile of k-distances
|
|
1582
|
+
const percentileIndex = Math.floor(kDistances.length * 0.9);
|
|
1583
|
+
return kDistances[percentileIndex] || 0.5;
|
|
1584
|
+
}
|
|
1585
|
+
/**
|
|
1586
|
+
* Find neighbors within epsilon distance using efficient vector operations
|
|
1587
|
+
*/
|
|
1588
|
+
async _findNeighborsWithinEps(item, allItems, eps) {
|
|
1589
|
+
const neighbors = [];
|
|
1590
|
+
const epsSquared = eps * eps;
|
|
1591
|
+
for (const otherItem of allItems) {
|
|
1592
|
+
if (item.id !== otherItem.id) {
|
|
1593
|
+
const distanceSquared = this._calculateSquaredDistance(item.vector, otherItem.vector);
|
|
1594
|
+
if (distanceSquared <= epsSquared) {
|
|
1595
|
+
neighbors.push(otherItem);
|
|
1596
|
+
}
|
|
1597
|
+
}
|
|
1598
|
+
}
|
|
1599
|
+
return neighbors;
|
|
1600
|
+
}
|
|
1601
|
+
/**
|
|
1602
|
+
* Expand DBSCAN cluster by adding density-reachable points
|
|
1603
|
+
*/
|
|
1604
|
+
async _expandCluster(seedPoint, neighbors, clusterId, eps, minPts, allItems, visited, clusterAssignments) {
|
|
1605
|
+
clusterAssignments.set(seedPoint.id, clusterId);
|
|
1606
|
+
let i = 0;
|
|
1607
|
+
while (i < neighbors.length) {
|
|
1608
|
+
const neighbor = neighbors[i];
|
|
1609
|
+
if (!visited.get(neighbor.id)) {
|
|
1610
|
+
visited.set(neighbor.id, true);
|
|
1611
|
+
const neighborNeighbors = await this._findNeighborsWithinEps(neighbor, allItems, eps);
|
|
1612
|
+
if (neighborNeighbors.length >= minPts) {
|
|
1613
|
+
// Add new neighbors to the list (union operation)
|
|
1614
|
+
for (const newNeighbor of neighborNeighbors) {
|
|
1615
|
+
if (!neighbors.some(n => n.id === newNeighbor.id)) {
|
|
1616
|
+
neighbors.push(newNeighbor);
|
|
1617
|
+
}
|
|
1618
|
+
}
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
// If neighbor is not assigned to any cluster, assign to current cluster
|
|
1622
|
+
if (!clusterAssignments.has(neighbor.id)) {
|
|
1623
|
+
clusterAssignments.set(neighbor.id, clusterId);
|
|
1624
|
+
}
|
|
1625
|
+
i++;
|
|
1626
|
+
}
|
|
1627
|
+
}
|
|
1628
|
+
/**
|
|
1629
|
+
* Calculate DBSCAN cluster confidence based on density
|
|
1630
|
+
*/
|
|
1631
|
+
async _calculateDBSCANClusterConfidence(clusterItems, eps) {
|
|
1632
|
+
if (clusterItems.length <= 1)
|
|
1633
|
+
return 1.0;
|
|
1634
|
+
// Calculate average density within the cluster
|
|
1635
|
+
let totalNeighborCount = 0;
|
|
1636
|
+
const epsSquared = eps * eps;
|
|
1637
|
+
for (const item of clusterItems) {
|
|
1638
|
+
let neighborCount = 0;
|
|
1639
|
+
for (const otherItem of clusterItems) {
|
|
1640
|
+
if (item !== otherItem) {
|
|
1641
|
+
const distanceSquared = this._calculateSquaredDistance(item.vector, otherItem.vector);
|
|
1642
|
+
if (distanceSquared <= epsSquared) {
|
|
1643
|
+
neighborCount++;
|
|
1644
|
+
}
|
|
1645
|
+
}
|
|
1646
|
+
}
|
|
1647
|
+
totalNeighborCount += neighborCount;
|
|
1648
|
+
}
|
|
1649
|
+
const avgDensity = totalNeighborCount / clusterItems.length;
|
|
1650
|
+
const maxPossibleDensity = clusterItems.length - 1;
|
|
1651
|
+
return maxPossibleDensity > 0 ? avgDensity / maxPossibleDensity : 1.0;
|
|
1652
|
+
}
|
|
1653
|
+
// ===== VECTOR UTILITIES =====
|
|
1654
|
+
/**
|
|
1655
|
+
* Calculate squared Euclidean distance (more efficient than sqrt)
|
|
1656
|
+
*/
|
|
1657
|
+
_calculateSquaredDistance(vec1, vec2) {
|
|
1658
|
+
if (vec1.length !== vec2.length)
|
|
1659
|
+
return Infinity;
|
|
1660
|
+
let sum = 0;
|
|
1661
|
+
for (let i = 0; i < vec1.length; i++) {
|
|
1662
|
+
const diff = vec1[i] - vec2[i];
|
|
1663
|
+
sum += diff * diff;
|
|
1664
|
+
}
|
|
1665
|
+
return sum;
|
|
1666
|
+
}
|
|
1667
|
+
/**
|
|
1668
|
+
* Calculate vector coherence for community refinement
|
|
1669
|
+
*/
|
|
1670
|
+
async _calculateVectorCoherence(items) {
|
|
1671
|
+
if (items.length <= 1)
|
|
1672
|
+
return 1.0;
|
|
1673
|
+
const centroid = await this._calculateCentroidFromItems(items);
|
|
1674
|
+
// Calculate average distance to centroid
|
|
1675
|
+
const distances = items.map(item => Math.sqrt(this._calculateSquaredDistance(item.vector, centroid)));
|
|
1676
|
+
const avgDistance = distances.reduce((sum, d) => sum + d, 0) / distances.length;
|
|
1677
|
+
// Calculate cohesion as inverse of average distance (normalized)
|
|
1678
|
+
const maxDistance = Math.sqrt(centroid.length); // Rough normalization
|
|
1679
|
+
return Math.max(0, 1 - (avgDistance / maxDistance));
|
|
1680
|
+
}
|
|
1681
|
+
async _getItemsByField(field) {
|
|
1682
|
+
// Implementation would query items by metadata field
|
|
1683
|
+
return [];
|
|
1684
|
+
}
|
|
1685
|
+
// ===== TRIPLE INTELLIGENCE INTEGRATION =====
|
|
1686
|
+
/**
|
|
1687
|
+
* Generate intelligent cluster labels using Triple Intelligence
|
|
1688
|
+
*/
|
|
1689
|
+
async _generateIntelligentClusterLabel(members, algorithm) {
|
|
1690
|
+
if (members.length === 0)
|
|
1691
|
+
return `${algorithm}-cluster`;
|
|
1692
|
+
try {
|
|
1693
|
+
// Lazy load Triple Intelligence if available
|
|
1694
|
+
const TripleIntelligenceEngine = await import('../triple/TripleIntelligence.js')
|
|
1695
|
+
.then(m => m.TripleIntelligenceEngine)
|
|
1696
|
+
.catch(() => null);
|
|
1697
|
+
if (!TripleIntelligenceEngine) {
|
|
1698
|
+
return this._generateClusterLabel(members, algorithm);
|
|
1699
|
+
}
|
|
1700
|
+
const intelligence = new TripleIntelligenceEngine(this.brain);
|
|
1701
|
+
// Extract key features from cluster members
|
|
1702
|
+
const memberData = members.map(m => ({
|
|
1703
|
+
id: m.id,
|
|
1704
|
+
type: m.nounType,
|
|
1705
|
+
label: m.label,
|
|
1706
|
+
data: m.data
|
|
1707
|
+
}));
|
|
1708
|
+
// Use Triple Intelligence to analyze the cluster and generate label
|
|
1709
|
+
const prompt = `Analyze this cluster of ${memberData.length} related items and provide a concise, descriptive label (2-4 words):
|
|
1710
|
+
|
|
1711
|
+
Items:
|
|
1712
|
+
${memberData.map(item => `- ${item.label || item.id} (${item.type})`).join('\n')}
|
|
1713
|
+
|
|
1714
|
+
The items were grouped using ${algorithm} clustering. What is the most appropriate label that captures their common theme or relationship?`;
|
|
1715
|
+
const response = await intelligence.find({
|
|
1716
|
+
like: prompt,
|
|
1717
|
+
limit: 1
|
|
1718
|
+
});
|
|
1719
|
+
// Extract clean label from response
|
|
1720
|
+
const firstResult = response[0];
|
|
1721
|
+
const label = (firstResult?.metadata?.content || firstResult?.id || `${algorithm}-cluster`)
|
|
1722
|
+
.toString()
|
|
1723
|
+
.replace(/^(Label:|Cluster:|Theme:)/i, '')
|
|
1724
|
+
.trim()
|
|
1725
|
+
.replace(/['"]/g, '')
|
|
1726
|
+
.slice(0, 50);
|
|
1727
|
+
return label || `${algorithm}-cluster`;
|
|
1728
|
+
}
|
|
1729
|
+
catch (error) {
|
|
1730
|
+
// Fallback to simple labeling
|
|
1731
|
+
return this._generateClusterLabel(members, algorithm);
|
|
1732
|
+
}
|
|
1733
|
+
}
|
|
1734
|
+
/**
|
|
1735
|
+
* Generate simple cluster labels based on semantic analysis
|
|
1736
|
+
*/
|
|
1737
|
+
async _generateClusterLabel(members, algorithm) {
|
|
1738
|
+
if (members.length === 0)
|
|
1739
|
+
return `${algorithm}-cluster`;
|
|
1740
|
+
// Analyze member types and create descriptive label
|
|
1741
|
+
const typeCount = new Map();
|
|
1742
|
+
for (const member of members) {
|
|
1743
|
+
const type = member.nounType || 'unknown';
|
|
1744
|
+
typeCount.set(type, (typeCount.get(type) || 0) + 1);
|
|
1745
|
+
}
|
|
1746
|
+
// Find most common type
|
|
1747
|
+
let dominantType = 'mixed';
|
|
1748
|
+
let maxCount = 0;
|
|
1749
|
+
for (const [type, count] of typeCount) {
|
|
1750
|
+
if (count > maxCount) {
|
|
1751
|
+
maxCount = count;
|
|
1752
|
+
dominantType = type;
|
|
1753
|
+
}
|
|
1754
|
+
}
|
|
1755
|
+
// Generate label based on dominant type and size
|
|
1756
|
+
const size = members.length;
|
|
1757
|
+
const typePercent = Math.round((maxCount / size) * 100);
|
|
1758
|
+
if (typePercent >= 80) {
|
|
1759
|
+
return `${dominantType} group (${size})`;
|
|
1760
|
+
}
|
|
1761
|
+
else if (typePercent >= 60) {
|
|
1762
|
+
return `mostly ${dominantType} (${size})`;
|
|
1763
|
+
}
|
|
1764
|
+
else {
|
|
1765
|
+
const topTypes = Array.from(typeCount.entries())
|
|
1766
|
+
.sort((a, b) => b[1] - a[1])
|
|
1767
|
+
.slice(0, 2)
|
|
1768
|
+
.map(([type]) => type)
|
|
1769
|
+
.join(' & ');
|
|
1770
|
+
return `${topTypes} cluster (${size})`;
|
|
1771
|
+
}
|
|
1772
|
+
}
|
|
1773
|
+
/**
|
|
1774
|
+
* Fuse clustering results using Triple Intelligence consensus
|
|
1775
|
+
*/
|
|
1776
|
+
async _fuseClusteringResultsWithTripleIntelligence(clusterSets, options) {
|
|
1777
|
+
if (clusterSets.length === 0)
|
|
1778
|
+
return [];
|
|
1779
|
+
if (clusterSets.length === 1)
|
|
1780
|
+
return clusterSets[0];
|
|
1781
|
+
// Simple weighted fusion if Triple Intelligence is not available
|
|
1782
|
+
const [vectorClusters, graphClusters, semanticClusters] = clusterSets;
|
|
1783
|
+
// Create consensus mapping of items to clusters
|
|
1784
|
+
const itemClusterMapping = new Map();
|
|
1785
|
+
// Collect all cluster assignments
|
|
1786
|
+
const allAlgorithms = ['vector', 'graph', 'semantic'];
|
|
1787
|
+
const algorithmClusters = [vectorClusters, graphClusters, semanticClusters];
|
|
1788
|
+
for (let i = 0; i < algorithmClusters.length; i++) {
|
|
1789
|
+
const algorithm = allAlgorithms[i];
|
|
1790
|
+
const clusters = algorithmClusters[i] || [];
|
|
1791
|
+
for (const cluster of clusters) {
|
|
1792
|
+
for (const memberId of cluster.members) {
|
|
1793
|
+
if (!itemClusterMapping.has(memberId)) {
|
|
1794
|
+
itemClusterMapping.set(memberId, []);
|
|
1795
|
+
}
|
|
1796
|
+
itemClusterMapping.get(memberId).push({
|
|
1797
|
+
algorithm,
|
|
1798
|
+
clusterId: cluster.id,
|
|
1799
|
+
confidence: cluster.confidence
|
|
1800
|
+
});
|
|
1801
|
+
}
|
|
1802
|
+
}
|
|
1803
|
+
}
|
|
1804
|
+
// Find consensus clusters - items that appear together in multiple algorithms
|
|
1805
|
+
const consensusClusters = new Map();
|
|
1806
|
+
const processedItems = new Set();
|
|
1807
|
+
for (const [itemId, assignments] of itemClusterMapping) {
|
|
1808
|
+
if (processedItems.has(itemId))
|
|
1809
|
+
continue;
|
|
1810
|
+
// Find all items that consistently cluster with this item
|
|
1811
|
+
const consensusGroup = new Set([itemId]);
|
|
1812
|
+
// Look for items that share clusters with this item across algorithms
|
|
1813
|
+
for (const assignment of assignments) {
|
|
1814
|
+
const sameClusterItems = this._getItemsInCluster(assignment.clusterId, clusterSets);
|
|
1815
|
+
for (const otherItem of sameClusterItems) {
|
|
1816
|
+
if (!processedItems.has(otherItem) && otherItem !== itemId) {
|
|
1817
|
+
const otherAssignments = itemClusterMapping.get(otherItem) || [];
|
|
1818
|
+
// Check if items co-occur in multiple algorithms
|
|
1819
|
+
const coOccurrences = this._countCoOccurrences(assignments, otherAssignments);
|
|
1820
|
+
if (coOccurrences >= 2) { // Must appear together in at least 2 algorithms
|
|
1821
|
+
consensusGroup.add(otherItem);
|
|
1822
|
+
}
|
|
1823
|
+
}
|
|
1824
|
+
}
|
|
1825
|
+
}
|
|
1826
|
+
// Mark all items in this consensus group as processed
|
|
1827
|
+
for (const groupItem of consensusGroup) {
|
|
1828
|
+
processedItems.add(groupItem);
|
|
1829
|
+
}
|
|
1830
|
+
if (consensusGroup.size >= (options.minClusterSize || 2)) {
|
|
1831
|
+
const consensusId = `fusion-${consensusClusters.size}`;
|
|
1832
|
+
consensusClusters.set(consensusId, consensusGroup);
|
|
1833
|
+
}
|
|
1834
|
+
}
|
|
1835
|
+
// Convert consensus groups to SemanticCluster objects
|
|
1836
|
+
const fusedClusters = [];
|
|
1837
|
+
for (const [clusterId, memberSet] of consensusClusters) {
|
|
1838
|
+
const members = Array.from(memberSet);
|
|
1839
|
+
const membersWithMetadata = await this._getItemsWithMetadata(members);
|
|
1840
|
+
if (membersWithMetadata.length > 0) {
|
|
1841
|
+
const centroid = await this._calculateCentroidFromItems(membersWithMetadata);
|
|
1842
|
+
const label = await this._generateIntelligentClusterLabel(membersWithMetadata, 'multimodal');
|
|
1843
|
+
// Calculate fusion confidence based on algorithm agreement
|
|
1844
|
+
const avgConfidence = this._calculateFusionConfidence(members, itemClusterMapping);
|
|
1845
|
+
fusedClusters.push({
|
|
1846
|
+
id: clusterId,
|
|
1847
|
+
centroid,
|
|
1848
|
+
members,
|
|
1849
|
+
size: members.length,
|
|
1850
|
+
confidence: avgConfidence,
|
|
1851
|
+
label,
|
|
1852
|
+
metadata: {
|
|
1853
|
+
clustering: 'multimodal_fusion',
|
|
1854
|
+
algorithms: allAlgorithms,
|
|
1855
|
+
fusionMethod: 'consensus',
|
|
1856
|
+
agreementLevel: avgConfidence
|
|
1857
|
+
}
|
|
1858
|
+
});
|
|
1859
|
+
}
|
|
1860
|
+
}
|
|
1861
|
+
return fusedClusters;
|
|
1862
|
+
}
|
|
1863
|
+
/**
|
|
1864
|
+
* Get items in a specific cluster from cluster sets
|
|
1865
|
+
*/
|
|
1866
|
+
_getItemsInCluster(clusterId, clusterSets) {
|
|
1867
|
+
for (const clusterSet of clusterSets) {
|
|
1868
|
+
for (const cluster of clusterSet) {
|
|
1869
|
+
if (cluster.id === clusterId) {
|
|
1870
|
+
return cluster.members;
|
|
1871
|
+
}
|
|
1872
|
+
}
|
|
1873
|
+
}
|
|
1874
|
+
return [];
|
|
1875
|
+
}
|
|
1876
|
+
/**
|
|
1877
|
+
* Count co-occurrences between two sets of assignments
|
|
1878
|
+
*/
|
|
1879
|
+
_countCoOccurrences(assignments1, assignments2) {
|
|
1880
|
+
let count = 0;
|
|
1881
|
+
for (const assignment1 of assignments1) {
|
|
1882
|
+
for (const assignment2 of assignments2) {
|
|
1883
|
+
if (assignment1.algorithm === assignment2.algorithm &&
|
|
1884
|
+
assignment1.clusterId === assignment2.clusterId) {
|
|
1885
|
+
count++;
|
|
1886
|
+
}
|
|
1887
|
+
}
|
|
1888
|
+
}
|
|
1889
|
+
return count;
|
|
1890
|
+
}
|
|
1891
|
+
/**
|
|
1892
|
+
* Calculate fusion confidence based on algorithm agreement
|
|
1893
|
+
*/
|
|
1894
|
+
_calculateFusionConfidence(members, itemClusterMapping) {
|
|
1895
|
+
let totalConfidence = 0;
|
|
1896
|
+
let totalAssignments = 0;
|
|
1897
|
+
for (const member of members) {
|
|
1898
|
+
const assignments = itemClusterMapping.get(member) || [];
|
|
1899
|
+
for (const assignment of assignments) {
|
|
1900
|
+
totalConfidence += assignment.confidence;
|
|
1901
|
+
totalAssignments++;
|
|
1902
|
+
}
|
|
1903
|
+
}
|
|
1904
|
+
return totalAssignments > 0 ? totalConfidence / totalAssignments : 0.5;
|
|
1905
|
+
}
|
|
1906
|
+
// ===== ADDITIONAL UTILITIES =====
|
|
1907
|
+
/**
|
|
1908
|
+
* Generate empty clustering result for edge cases
|
|
1909
|
+
*/
|
|
1910
|
+
_createEmptyResult(startTime, algorithm) {
|
|
1911
|
+
return {
|
|
1912
|
+
clusters: [],
|
|
1913
|
+
metrics: this._createPerformanceMetrics(startTime, 0, algorithm),
|
|
1914
|
+
metadata: {
|
|
1915
|
+
totalItems: 0,
|
|
1916
|
+
clustersFound: 0,
|
|
1917
|
+
averageClusterSize: 0,
|
|
1918
|
+
timestamp: new Date()
|
|
1919
|
+
}
|
|
1920
|
+
};
|
|
1921
|
+
}
|
|
1922
|
+
// ===== SAMPLING AND PROJECTION UTILITIES =====
|
|
1923
|
+
/**
|
|
1924
|
+
* Get sample using specified strategy for large dataset clustering
|
|
1925
|
+
*/
|
|
1926
|
+
async _getSampleUsingStrategy(itemIds, sampleSize, strategy) {
|
|
1927
|
+
if (itemIds.length <= sampleSize)
|
|
1928
|
+
return itemIds;
|
|
1929
|
+
switch (strategy) {
|
|
1930
|
+
case 'random':
|
|
1931
|
+
return this._getRandomSample(itemIds, sampleSize);
|
|
1932
|
+
case 'diverse':
|
|
1933
|
+
return await this._getDiverseSample(itemIds, sampleSize);
|
|
1934
|
+
case 'recent':
|
|
1935
|
+
return await this._getRecentSample(itemIds, sampleSize);
|
|
1936
|
+
case 'important':
|
|
1937
|
+
return await this._getImportantSample(itemIds, sampleSize);
|
|
1938
|
+
default:
|
|
1939
|
+
return this._getRandomSample(itemIds, sampleSize);
|
|
1940
|
+
}
|
|
1941
|
+
}
|
|
1942
|
+
/**
|
|
1943
|
+
* Random sampling
|
|
1944
|
+
*/
|
|
1945
|
+
_getRandomSample(itemIds, sampleSize) {
|
|
1946
|
+
const shuffled = [...itemIds].sort(() => Math.random() - 0.5);
|
|
1947
|
+
return shuffled.slice(0, sampleSize);
|
|
1948
|
+
}
|
|
1949
|
+
/**
|
|
1950
|
+
* Diverse sampling using vector space distribution
|
|
1951
|
+
*/
|
|
1952
|
+
async _getDiverseSample(itemIds, sampleSize) {
|
|
1953
|
+
// Get vectors for all items
|
|
1954
|
+
const itemsWithVectors = await this._getItemsWithVectors(itemIds);
|
|
1955
|
+
if (itemsWithVectors.length <= sampleSize) {
|
|
1956
|
+
return itemIds;
|
|
1957
|
+
}
|
|
1958
|
+
// Use k-means++ style selection for diversity
|
|
1959
|
+
const sample = [];
|
|
1960
|
+
// Select first item randomly
|
|
1961
|
+
let remainingItems = [...itemsWithVectors];
|
|
1962
|
+
const firstIdx = Math.floor(Math.random() * remainingItems.length);
|
|
1963
|
+
sample.push(remainingItems[firstIdx].id);
|
|
1964
|
+
remainingItems.splice(firstIdx, 1);
|
|
1965
|
+
// Select remaining items based on maximum distance to already selected items
|
|
1966
|
+
while (sample.length < sampleSize && remainingItems.length > 0) {
|
|
1967
|
+
let maxDistance = -1;
|
|
1968
|
+
let bestIdx = 0;
|
|
1969
|
+
for (let i = 0; i < remainingItems.length; i++) {
|
|
1970
|
+
const item = remainingItems[i];
|
|
1971
|
+
// Find minimum distance to any selected item
|
|
1972
|
+
let minDistanceToSelected = Infinity;
|
|
1973
|
+
for (const selectedId of sample) {
|
|
1974
|
+
const selectedItem = itemsWithVectors.find(it => it.id === selectedId);
|
|
1975
|
+
if (selectedItem) {
|
|
1976
|
+
const distance = Math.sqrt(this._calculateSquaredDistance(item.vector, selectedItem.vector));
|
|
1977
|
+
minDistanceToSelected = Math.min(minDistanceToSelected, distance);
|
|
1978
|
+
}
|
|
1979
|
+
}
|
|
1980
|
+
// Select item with maximum minimum distance (most diverse)
|
|
1981
|
+
if (minDistanceToSelected > maxDistance) {
|
|
1982
|
+
maxDistance = minDistanceToSelected;
|
|
1983
|
+
bestIdx = i;
|
|
1984
|
+
}
|
|
1985
|
+
}
|
|
1986
|
+
sample.push(remainingItems[bestIdx].id);
|
|
1987
|
+
remainingItems.splice(bestIdx, 1);
|
|
1988
|
+
}
|
|
1989
|
+
return sample;
|
|
1990
|
+
}
|
|
1991
|
+
/**
|
|
1992
|
+
* Recent sampling based on creation time
|
|
1993
|
+
*/
|
|
1994
|
+
async _getRecentSample(itemIds, sampleSize) {
|
|
1995
|
+
const items = await Promise.all(itemIds.map(async (id) => {
|
|
1996
|
+
const noun = await this.brain.getNoun(id);
|
|
1997
|
+
return {
|
|
1998
|
+
id,
|
|
1999
|
+
createdAt: noun?.createdAt || new Date(0)
|
|
2000
|
+
};
|
|
2001
|
+
}));
|
|
2002
|
+
// Sort by creation time (most recent first)
|
|
2003
|
+
items.sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime());
|
|
2004
|
+
return items.slice(0, sampleSize).map(item => item.id);
|
|
2005
|
+
}
|
|
2006
|
+
/**
|
|
2007
|
+
* Important sampling based on connection count and metadata
|
|
2008
|
+
*/
|
|
2009
|
+
async _getImportantSample(itemIds, sampleSize) {
|
|
2010
|
+
const items = await Promise.all(itemIds.map(async (id) => {
|
|
2011
|
+
const verbs = await this.brain.getVerbsForNoun(id);
|
|
2012
|
+
const noun = await this.brain.getNoun(id);
|
|
2013
|
+
// Calculate importance score
|
|
2014
|
+
const connectionScore = verbs.length;
|
|
2015
|
+
const dataScore = noun?.data ? Object.keys(noun.data).length : 0;
|
|
2016
|
+
const importanceScore = connectionScore * 2 + dataScore;
|
|
2017
|
+
return {
|
|
2018
|
+
id,
|
|
2019
|
+
importance: importanceScore
|
|
2020
|
+
};
|
|
2021
|
+
}));
|
|
2022
|
+
// Sort by importance (highest first)
|
|
2023
|
+
items.sort((a, b) => b.importance - a.importance);
|
|
2024
|
+
return items.slice(0, sampleSize).map(item => item.id);
|
|
2025
|
+
}
|
|
2026
|
+
/**
|
|
2027
|
+
* Project clusters back to full dataset using HNSW neighbors
|
|
2028
|
+
*/
|
|
2029
|
+
async _projectClustersToFullDataset(sampleClusters, fullItemIds, sampleIds) {
|
|
2030
|
+
const projectedClusters = [];
|
|
2031
|
+
// Create mapping of items not in sample
|
|
2032
|
+
const remainingItems = fullItemIds.filter(id => !sampleIds.includes(id));
|
|
2033
|
+
// For each sample cluster, find which remaining items should belong to it
|
|
2034
|
+
for (const sampleCluster of sampleClusters) {
|
|
2035
|
+
const projectedMembers = [...sampleCluster.members];
|
|
2036
|
+
// For each remaining item, find its nearest neighbors in the sample
|
|
2037
|
+
for (const itemId of remainingItems) {
|
|
2038
|
+
try {
|
|
2039
|
+
const neighbors = await this.brain.neural.neighbors(itemId, {
|
|
2040
|
+
limit: 3,
|
|
2041
|
+
includeMetadata: false
|
|
2042
|
+
});
|
|
2043
|
+
// Check if any of the nearest neighbors belong to this cluster
|
|
2044
|
+
let belongsToCluster = false;
|
|
2045
|
+
for (const neighbor of neighbors.neighbors) {
|
|
2046
|
+
if (sampleCluster.members.includes(neighbor.id) && neighbor.similarity > 0.7) {
|
|
2047
|
+
belongsToCluster = true;
|
|
2048
|
+
break;
|
|
2049
|
+
}
|
|
2050
|
+
}
|
|
2051
|
+
if (belongsToCluster) {
|
|
2052
|
+
projectedMembers.push(itemId);
|
|
2053
|
+
}
|
|
2054
|
+
}
|
|
2055
|
+
catch (error) {
|
|
2056
|
+
// Skip items that can't be processed
|
|
2057
|
+
continue;
|
|
2058
|
+
}
|
|
2059
|
+
}
|
|
2060
|
+
// Create projected cluster
|
|
2061
|
+
if (projectedMembers.length > 0) {
|
|
2062
|
+
const membersWithVectors = await this._getItemsWithVectors(projectedMembers);
|
|
2063
|
+
projectedClusters.push({
|
|
2064
|
+
...sampleCluster,
|
|
2065
|
+
id: `projected-${sampleCluster.id}`,
|
|
2066
|
+
members: projectedMembers,
|
|
2067
|
+
size: projectedMembers.length,
|
|
2068
|
+
centroid: await this._calculateCentroidFromItems(membersWithVectors),
|
|
2069
|
+
confidence: sampleCluster.confidence * 0.9, // Slightly lower confidence for projection
|
|
2070
|
+
metadata: {
|
|
2071
|
+
...sampleCluster.metadata,
|
|
2072
|
+
isProjected: true,
|
|
2073
|
+
originalSampleSize: sampleCluster.size,
|
|
2074
|
+
projectedSize: projectedMembers.length
|
|
2075
|
+
}
|
|
2076
|
+
});
|
|
2077
|
+
}
|
|
2078
|
+
}
|
|
2079
|
+
return projectedClusters;
|
|
2080
|
+
}
|
|
2081
|
+
_groupByDomain(items, field) {
|
|
2082
|
+
const groups = new Map();
|
|
2083
|
+
for (const item of items) {
|
|
2084
|
+
const domain = item.metadata?.[field] || 'unknown';
|
|
2085
|
+
if (!groups.has(domain)) {
|
|
2086
|
+
groups.set(domain, []);
|
|
2087
|
+
}
|
|
2088
|
+
groups.get(domain).push(item);
|
|
2089
|
+
}
|
|
2090
|
+
return groups;
|
|
2091
|
+
}
|
|
2092
|
+
_calculateDomainConfidence(cluster, domainItems) {
|
|
2093
|
+
// Calculate how well this cluster represents the domain
|
|
2094
|
+
return 0.8; // Placeholder
|
|
2095
|
+
}
|
|
2096
|
+
async _findCrossDomainMembers(cluster, threshold) {
|
|
2097
|
+
// Find members that might belong to multiple domains
|
|
2098
|
+
return [];
|
|
2099
|
+
}
|
|
2100
|
+
async _findCrossDomainClusters(clusters, threshold) {
|
|
2101
|
+
// Find clusters that span multiple domains
|
|
2102
|
+
return [];
|
|
2103
|
+
}
|
|
2104
|
+
async _getItemsByTimeWindow(timeField, window) {
|
|
2105
|
+
// Implementation would query items within time window
|
|
2106
|
+
return [];
|
|
2107
|
+
}
|
|
2108
|
+
async _calculateTemporalMetrics(cluster, items, timeField) {
|
|
2109
|
+
// Calculate temporal characteristics of the cluster
|
|
2110
|
+
return {
|
|
2111
|
+
trend: 'stable',
|
|
2112
|
+
metrics: {
|
|
2113
|
+
startTime: new Date(),
|
|
2114
|
+
endTime: new Date(),
|
|
2115
|
+
peakTime: new Date(),
|
|
2116
|
+
frequency: 1
|
|
2117
|
+
}
|
|
2118
|
+
};
|
|
2119
|
+
}
|
|
2120
|
+
_mergeOverlappingTemporalClusters(clusters) {
|
|
2121
|
+
// Merge clusters from overlapping time windows
|
|
2122
|
+
return clusters;
|
|
2123
|
+
}
|
|
2124
|
+
_adjustThresholdAdaptively(clusters, currentThreshold) {
|
|
2125
|
+
// Adjust clustering threshold based on results
|
|
2126
|
+
return currentThreshold || 0.6;
|
|
2127
|
+
}
|
|
2128
|
+
async _calculateItemToClusterSimilarity(itemId, cluster) {
|
|
2129
|
+
// Calculate similarity between an item and a cluster centroid
|
|
2130
|
+
return 0.5; // Placeholder
|
|
2131
|
+
}
|
|
2132
|
+
async _recalculateClusterCentroid(cluster) {
|
|
2133
|
+
// Recalculate centroid after adding new members
|
|
2134
|
+
return cluster.centroid;
|
|
2135
|
+
}
|
|
2136
|
+
async _calculateSimilarity(id1, id2) {
|
|
2137
|
+
return await this.similar(id1, id2);
|
|
2138
|
+
}
|
|
2139
|
+
_sortNeighbors(neighbors, sortBy) {
|
|
2140
|
+
switch (sortBy) {
|
|
2141
|
+
case 'similarity':
|
|
2142
|
+
neighbors.sort((a, b) => b.similarity - a.similarity);
|
|
2143
|
+
break;
|
|
2144
|
+
case 'importance':
|
|
2145
|
+
neighbors.sort((a, b) => (b.metadata?.importance || 0) - (a.metadata?.importance || 0));
|
|
2146
|
+
break;
|
|
2147
|
+
case 'recency':
|
|
2148
|
+
neighbors.sort((a, b) => {
|
|
2149
|
+
const aTime = new Date(a.metadata?.createdAt || 0).getTime();
|
|
2150
|
+
const bTime = new Date(b.metadata?.createdAt || 0).getTime();
|
|
2151
|
+
return bTime - aTime;
|
|
2152
|
+
});
|
|
2153
|
+
break;
|
|
2154
|
+
}
|
|
2155
|
+
}
|
|
2156
|
+
async _buildSemanticHierarchy(item, options) {
|
|
2157
|
+
// Build semantic hierarchy around an item
|
|
2158
|
+
return {
|
|
2159
|
+
self: { id: item.id, vector: item.vector, metadata: item.metadata }
|
|
2160
|
+
};
|
|
2161
|
+
}
|
|
2162
|
+
async _detectOutliersClusterBased(threshold, options) {
|
|
2163
|
+
// Detect outliers using cluster-based method
|
|
2164
|
+
return [];
|
|
2165
|
+
}
|
|
2166
|
+
async _detectOutliersIsolation(threshold, options) {
|
|
2167
|
+
// Detect outliers using isolation forest method
|
|
2168
|
+
return [];
|
|
2169
|
+
}
|
|
2170
|
+
async _detectOutliersStatistical(threshold, options) {
|
|
2171
|
+
// Detect outliers using statistical methods
|
|
2172
|
+
return [];
|
|
2173
|
+
}
|
|
2174
|
+
async _generateVisualizationNodes(maxNodes, options) {
|
|
2175
|
+
// Generate nodes for visualization
|
|
2176
|
+
return [];
|
|
2177
|
+
}
|
|
2178
|
+
async _generateVisualizationEdges(nodes, options) {
|
|
2179
|
+
// Generate edges for visualization
|
|
2180
|
+
return [];
|
|
2181
|
+
}
|
|
2182
|
+
async _generateVisualizationClusters(nodes) {
|
|
2183
|
+
// Generate cluster information for visualization
|
|
2184
|
+
return [];
|
|
2185
|
+
}
|
|
2186
|
+
async _applyLayoutAlgorithm(nodes, edges, algorithm, dimensions) {
|
|
2187
|
+
// Apply layout algorithm to position nodes
|
|
2188
|
+
return nodes.map((node, i) => ({
|
|
2189
|
+
...node,
|
|
2190
|
+
x: Math.random() * 100,
|
|
2191
|
+
y: Math.random() * 100,
|
|
2192
|
+
z: dimensions === 3 ? Math.random() * 100 : undefined
|
|
2193
|
+
}));
|
|
2194
|
+
}
|
|
2195
|
+
_manhattanDistance(v1, v2) {
|
|
2196
|
+
let sum = 0;
|
|
2197
|
+
for (let i = 0; i < v1.length; i++) {
|
|
2198
|
+
sum += Math.abs(v1[i] - v2[i]);
|
|
2199
|
+
}
|
|
2200
|
+
return sum;
|
|
2201
|
+
}
|
|
2202
|
+
_calculateConfidence(score, v1, v2) {
|
|
2203
|
+
// Calculate confidence based on vector magnitudes and score
|
|
2204
|
+
return Math.min(1, score + 0.1);
|
|
2205
|
+
}
|
|
2206
|
+
_generateSimilarityExplanation(score, metric) {
|
|
2207
|
+
if (score > 0.9)
|
|
2208
|
+
return `Very high similarity using ${metric} distance`;
|
|
2209
|
+
if (score > 0.7)
|
|
2210
|
+
return `High similarity using ${metric} distance`;
|
|
2211
|
+
if (score > 0.5)
|
|
2212
|
+
return `Moderate similarity using ${metric} distance`;
|
|
2213
|
+
if (score > 0.3)
|
|
2214
|
+
return `Low similarity using ${metric} distance`;
|
|
2215
|
+
return `Very low similarity using ${metric} distance`;
|
|
2216
|
+
}
|
|
2217
|
+
// ===== PUBLIC API: UTILITY & STATUS =====
|
|
2218
|
+
/**
|
|
2219
|
+
* Get performance metrics for monitoring
|
|
2220
|
+
*/
|
|
2221
|
+
getPerformanceMetrics(operation) {
|
|
2222
|
+
if (operation) {
|
|
2223
|
+
return this.performanceMetrics.get(operation) || [];
|
|
2224
|
+
}
|
|
2225
|
+
return this.performanceMetrics;
|
|
2226
|
+
}
|
|
2227
|
+
/**
|
|
2228
|
+
* Clear all caches
|
|
2229
|
+
*/
|
|
2230
|
+
clearCaches() {
|
|
2231
|
+
this.similarityCache.clear();
|
|
2232
|
+
this.clusterCache.clear();
|
|
2233
|
+
this.hierarchyCache.clear();
|
|
2234
|
+
this.neighborsCache.clear();
|
|
2235
|
+
}
|
|
2236
|
+
/**
|
|
2237
|
+
* Get cache statistics
|
|
2238
|
+
*/
|
|
2239
|
+
getCacheStats() {
|
|
2240
|
+
const maxSize = this.config.cacheSize || 1000;
|
|
2241
|
+
return {
|
|
2242
|
+
similarity: { size: this.similarityCache.size, maxSize },
|
|
2243
|
+
clustering: { size: this.clusterCache.size, maxSize },
|
|
2244
|
+
hierarchy: { size: this.hierarchyCache.size, maxSize },
|
|
2245
|
+
neighbors: { size: this.neighborsCache.size, maxSize }
|
|
2246
|
+
};
|
|
2247
|
+
}
|
|
2248
|
+
// ===== MISSING HELPER METHODS =====
|
|
2249
|
+
/**
|
|
2250
|
+
* Analyze data characteristics for algorithm selection
|
|
2251
|
+
*/
|
|
2252
|
+
async _analyzeDataCharacteristics(itemIds) {
|
|
2253
|
+
const size = itemIds.length;
|
|
2254
|
+
const items = await this._getItemsWithMetadata(itemIds.slice(0, Math.min(100, size)));
|
|
2255
|
+
const dimensionality = items.length > 0 ? items[0].vector.length : 0;
|
|
2256
|
+
// Calculate graph density by sampling verb relationships
|
|
2257
|
+
let connectionCount = 0;
|
|
2258
|
+
const sampleSize = Math.min(50, itemIds.length);
|
|
2259
|
+
for (let i = 0; i < sampleSize; i++) {
|
|
2260
|
+
try {
|
|
2261
|
+
const verbs = await this.brain.getVerbsForNoun(itemIds[i]);
|
|
2262
|
+
connectionCount += verbs.length;
|
|
2263
|
+
}
|
|
2264
|
+
catch (error) {
|
|
2265
|
+
// Skip items that can't be processed
|
|
2266
|
+
continue;
|
|
2267
|
+
}
|
|
2268
|
+
}
|
|
2269
|
+
const graphDensity = sampleSize > 0 ? connectionCount / (sampleSize * sampleSize) : 0;
|
|
2270
|
+
// Calculate type distribution
|
|
2271
|
+
const typeDistribution = {};
|
|
2272
|
+
for (const item of items) {
|
|
2273
|
+
const type = item.nounType;
|
|
2274
|
+
typeDistribution[type] = (typeDistribution[type] || 0) + 1;
|
|
2275
|
+
}
|
|
2276
|
+
return { size, dimensionality, graphDensity, typeDistribution };
|
|
2277
|
+
}
|
|
2278
|
+
/**
|
|
2279
|
+
* Calculate centroid for a group of items
|
|
2280
|
+
*/
|
|
2281
|
+
async _calculateGroupCentroid(items) {
|
|
2282
|
+
return this._calculateCentroidFromItems(items);
|
|
2283
|
+
}
|
|
2284
|
+
/**
|
|
2285
|
+
* Cluster within semantic type using vector similarity
|
|
2286
|
+
*/
|
|
2287
|
+
async _clusterWithinSemanticType(items, options) {
|
|
2288
|
+
if (items.length <= 2) {
|
|
2289
|
+
return [{
|
|
2290
|
+
id: `semantic-single-${items[0]?.nounType || 'unknown'}`,
|
|
2291
|
+
centroid: await this._calculateCentroidFromItems(items),
|
|
2292
|
+
members: items.map(item => item.id),
|
|
2293
|
+
size: items.length,
|
|
2294
|
+
confidence: 1.0,
|
|
2295
|
+
label: `${items[0]?.nounType || 'unknown'} group`,
|
|
2296
|
+
metadata: { clustering: 'semantic', nounType: items[0]?.nounType }
|
|
2297
|
+
}];
|
|
2298
|
+
}
|
|
2299
|
+
// Use hierarchical clustering for within-type clustering
|
|
2300
|
+
const result = await this._performHierarchicalClustering(items.map(item => item.id), { ...options, maxClusters: Math.min(Math.ceil(items.length / 3), 10) });
|
|
2301
|
+
return result.clusters;
|
|
2302
|
+
}
|
|
2303
|
+
/**
|
|
2304
|
+
* Find cross-type connections via verbs
|
|
2305
|
+
*/
|
|
2306
|
+
async _findCrossTypeConnections(typeGroups, _options) {
|
|
2307
|
+
const connections = [];
|
|
2308
|
+
// Convert Map to array for compatibility
|
|
2309
|
+
const typeGroupsArray = Array.from(typeGroups.entries());
|
|
2310
|
+
for (const [fromType, fromItems] of typeGroupsArray) {
|
|
2311
|
+
for (const [toType, toItems] of typeGroupsArray) {
|
|
2312
|
+
if (fromType !== toType) {
|
|
2313
|
+
for (const fromItem of fromItems.slice(0, 10)) { // Sample to avoid N^2
|
|
2314
|
+
try {
|
|
2315
|
+
const verbs = await this.brain.getVerbsForNoun(fromItem.id);
|
|
2316
|
+
for (const verb of verbs) {
|
|
2317
|
+
const toItem = toItems.find(item => item.id === verb.target);
|
|
2318
|
+
if (toItem) {
|
|
2319
|
+
connections.push({
|
|
2320
|
+
from: fromItem.id,
|
|
2321
|
+
to: toItem.id,
|
|
2322
|
+
strength: verb.confidence || 0.7
|
|
2323
|
+
});
|
|
2324
|
+
}
|
|
2325
|
+
}
|
|
2326
|
+
}
|
|
2327
|
+
catch (error) {
|
|
2328
|
+
// Skip items that can't be processed
|
|
2329
|
+
continue;
|
|
2330
|
+
}
|
|
2331
|
+
}
|
|
2332
|
+
}
|
|
2333
|
+
}
|
|
2334
|
+
}
|
|
2335
|
+
return connections.filter(conn => conn.strength > 0.5);
|
|
2336
|
+
}
|
|
2337
|
+
/**
|
|
2338
|
+
* Merge semantic clusters based on connections
|
|
2339
|
+
*/
|
|
2340
|
+
async _mergeSemanticClusters(clusters, connections) {
|
|
2341
|
+
// Simple merging based on strong connections
|
|
2342
|
+
const merged = [...clusters];
|
|
2343
|
+
for (const connection of connections) {
|
|
2344
|
+
if (connection.strength > 0.8) {
|
|
2345
|
+
const fromCluster = merged.find(c => c.members.includes(connection.from));
|
|
2346
|
+
const toCluster = merged.find(c => c.members.includes(connection.to));
|
|
2347
|
+
if (fromCluster && toCluster && fromCluster !== toCluster) {
|
|
2348
|
+
// Merge clusters
|
|
2349
|
+
fromCluster.members = [...fromCluster.members, ...toCluster.members];
|
|
2350
|
+
fromCluster.size = fromCluster.members.length;
|
|
2351
|
+
fromCluster.label = `merged ${fromCluster.label}`;
|
|
2352
|
+
// Remove merged cluster
|
|
2353
|
+
const index = merged.indexOf(toCluster);
|
|
2354
|
+
if (index > -1)
|
|
2355
|
+
merged.splice(index, 1);
|
|
2356
|
+
}
|
|
2357
|
+
}
|
|
2358
|
+
}
|
|
2359
|
+
return merged;
|
|
2360
|
+
}
|
|
2361
|
+
/**
|
|
2362
|
+
* Get optimal clustering level for HNSW
|
|
2363
|
+
*/
|
|
2364
|
+
_getOptimalClusteringLevel(totalItems) {
|
|
2365
|
+
if (totalItems < 100)
|
|
2366
|
+
return 0;
|
|
2367
|
+
if (totalItems < 1000)
|
|
2368
|
+
return 1;
|
|
2369
|
+
if (totalItems < 10000)
|
|
2370
|
+
return 2;
|
|
2371
|
+
return 3;
|
|
2372
|
+
}
|
|
2373
|
+
/**
|
|
2374
|
+
* Get nodes at HNSW level
|
|
2375
|
+
*/
|
|
2376
|
+
async _getHNSWLevelNodes(level) {
|
|
2377
|
+
// This would use the HNSW index to get nodes at specified level
|
|
2378
|
+
// For now, return a sample of all items
|
|
2379
|
+
const allItems = await this._getAllItemIds();
|
|
2380
|
+
const sampleSize = Math.max(10, Math.floor(allItems.length / Math.pow(2, level + 1)));
|
|
2381
|
+
return this._getRandomSample(allItems, sampleSize);
|
|
2382
|
+
}
|
|
2383
|
+
/**
|
|
2384
|
+
* Find cluster members using HNSW neighbors
|
|
2385
|
+
*/
|
|
2386
|
+
async _findClusterMembers(levelNode, _allItems, threshold) {
|
|
2387
|
+
try {
|
|
2388
|
+
const neighbors = await this.brain.neural.neighbors(levelNode, {
|
|
2389
|
+
limit: Math.min(50, Math.floor(_allItems.length / 10)),
|
|
2390
|
+
minSimilarity: threshold
|
|
2391
|
+
});
|
|
2392
|
+
return [levelNode, ...neighbors.neighbors.map((n) => n.id)];
|
|
2393
|
+
}
|
|
2394
|
+
catch (error) {
|
|
2395
|
+
return [levelNode];
|
|
2396
|
+
}
|
|
2397
|
+
}
|
|
2398
|
+
/**
|
|
2399
|
+
* Calculate hierarchical clustering confidence
|
|
2400
|
+
*/
|
|
2401
|
+
async _calculateHierarchicalConfidence(members) {
|
|
2402
|
+
if (members.length <= 1)
|
|
2403
|
+
return 1.0;
|
|
2404
|
+
const items = await this._getItemsWithVectors(members);
|
|
2405
|
+
const coherence = await this._calculateVectorCoherence(items);
|
|
2406
|
+
return coherence;
|
|
2407
|
+
}
|
|
2408
|
+
/**
|
|
2409
|
+
* Assign unassigned items to nearest clusters
|
|
2410
|
+
*/
|
|
2411
|
+
async _assignUnassignedItems(unassigned, clusters) {
|
|
2412
|
+
for (const itemId of unassigned) {
|
|
2413
|
+
if (clusters.length === 0)
|
|
2414
|
+
break;
|
|
2415
|
+
try {
|
|
2416
|
+
const noun = await this.brain.getNoun(itemId);
|
|
2417
|
+
const itemVector = noun?.vector || [];
|
|
2418
|
+
if (itemVector.length === 0)
|
|
2419
|
+
continue;
|
|
2420
|
+
let bestCluster = clusters[0];
|
|
2421
|
+
let minDistance = Infinity;
|
|
2422
|
+
for (const cluster of clusters) {
|
|
2423
|
+
const distance = Math.sqrt(this._calculateSquaredDistance(itemVector, cluster.centroid));
|
|
2424
|
+
if (distance < minDistance) {
|
|
2425
|
+
minDistance = distance;
|
|
2426
|
+
bestCluster = cluster;
|
|
2427
|
+
}
|
|
2428
|
+
}
|
|
2429
|
+
bestCluster.members.push(itemId);
|
|
2430
|
+
bestCluster.size++;
|
|
2431
|
+
}
|
|
2432
|
+
catch (error) {
|
|
2433
|
+
// Skip items that can't be processed
|
|
2434
|
+
continue;
|
|
2435
|
+
}
|
|
2436
|
+
}
|
|
2437
|
+
}
|
|
2438
|
+
}
|
|
2439
|
+
//# sourceMappingURL=improvedNeuralAPI.js.map
|