@soulcraft/brainy 4.1.4 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/import/FormatDetector.d.ts +6 -1
- package/dist/import/FormatDetector.js +40 -1
- package/dist/import/ImportCoordinator.d.ts +102 -4
- package/dist/import/ImportCoordinator.js +248 -6
- package/dist/import/InstancePool.d.ts +136 -0
- package/dist/import/InstancePool.js +231 -0
- package/dist/importers/SmartCSVImporter.d.ts +2 -1
- package/dist/importers/SmartCSVImporter.js +11 -22
- package/dist/importers/SmartDOCXImporter.d.ts +125 -0
- package/dist/importers/SmartDOCXImporter.js +227 -0
- package/dist/importers/SmartExcelImporter.d.ts +12 -1
- package/dist/importers/SmartExcelImporter.js +40 -25
- package/dist/importers/SmartJSONImporter.d.ts +1 -0
- package/dist/importers/SmartJSONImporter.js +25 -6
- package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
- package/dist/importers/SmartMarkdownImporter.js +11 -16
- package/dist/importers/SmartPDFImporter.d.ts +2 -1
- package/dist/importers/SmartPDFImporter.js +11 -22
- package/dist/importers/SmartYAMLImporter.d.ts +121 -0
- package/dist/importers/SmartYAMLImporter.js +275 -0
- package/dist/importers/VFSStructureGenerator.js +12 -0
- package/dist/neural/SmartExtractor.d.ts +279 -0
- package/dist/neural/SmartExtractor.js +592 -0
- package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
- package/dist/neural/SmartRelationshipExtractor.js +396 -0
- package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
- package/dist/neural/embeddedTypeEmbeddings.js +2 -2
- package/dist/neural/entityExtractor.d.ts +3 -0
- package/dist/neural/entityExtractor.js +34 -36
- package/dist/neural/presets.d.ts +189 -0
- package/dist/neural/presets.js +365 -0
- package/dist/neural/signals/ContextSignal.d.ts +166 -0
- package/dist/neural/signals/ContextSignal.js +646 -0
- package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
- package/dist/neural/signals/EmbeddingSignal.js +435 -0
- package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
- package/dist/neural/signals/ExactMatchSignal.js +542 -0
- package/dist/neural/signals/PatternSignal.d.ts +159 -0
- package/dist/neural/signals/PatternSignal.js +478 -0
- package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
- package/dist/neural/signals/VerbContextSignal.js +390 -0
- package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
- package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
- package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
- package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
- package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
- package/dist/neural/signals/VerbPatternSignal.js +457 -0
- package/dist/types/graphTypes.d.ts +2 -0
- package/package.json +4 -1
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
* Neural Entity Extractor using Brainy's NounTypes
|
|
3
3
|
* Uses embeddings and similarity matching for accurate type detection
|
|
4
4
|
*
|
|
5
|
+
* v4.2.0: Now powered by SmartExtractor for ultra-neural classification
|
|
5
6
|
* PRODUCTION-READY with caching support
|
|
6
7
|
*/
|
|
7
8
|
import { NounType } from '../types/graphTypes.js';
|
|
8
9
|
import { EntityExtractionCache, generateFileCacheKey, generateContentCacheKey, computeContentHash } from './entityExtractionCache.js';
|
|
9
10
|
import { getNounTypeEmbeddings } from './embeddedTypeEmbeddings.js';
|
|
11
|
+
import { SmartExtractor } from './SmartExtractor.js';
|
|
10
12
|
export class NeuralEntityExtractor {
|
|
11
13
|
constructor(brain, cacheOptions) {
|
|
12
14
|
// Type embeddings for similarity matching
|
|
@@ -22,6 +24,11 @@ export class NeuralEntityExtractor {
|
|
|
22
24
|
};
|
|
23
25
|
this.brain = brain;
|
|
24
26
|
this.cache = new EntityExtractionCache(cacheOptions);
|
|
27
|
+
this.smartExtractor = new SmartExtractor(brain, {
|
|
28
|
+
enableEnsemble: true,
|
|
29
|
+
enableFormatHints: true,
|
|
30
|
+
minConfidence: 0.60
|
|
31
|
+
});
|
|
25
32
|
}
|
|
26
33
|
/**
|
|
27
34
|
* Initialize type embeddings for neural matching
|
|
@@ -75,46 +82,37 @@ export class NeuralEntityExtractor {
|
|
|
75
82
|
const useNeuralMatching = options?.neuralMatching !== false; // Default true
|
|
76
83
|
// Step 1: Extract potential entities using patterns
|
|
77
84
|
const candidates = await this.extractCandidates(text);
|
|
78
|
-
// Step 2: Classify each candidate using
|
|
85
|
+
// Step 2: Classify each candidate using SmartExtractor (v4.2.0)
|
|
79
86
|
for (const candidate of candidates) {
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
if (!typeVector)
|
|
89
|
-
continue;
|
|
90
|
-
const similarity = this.cosineSimilarity(candidateVector, typeVector);
|
|
91
|
-
// Apply context-based boosting
|
|
92
|
-
const contextBoost = this.getContextBoost(candidate.text, candidate.context, type);
|
|
93
|
-
const adjustedConfidence = similarity * (1 + contextBoost);
|
|
94
|
-
if (adjustedConfidence > bestConfidence) {
|
|
95
|
-
bestConfidence = adjustedConfidence;
|
|
96
|
-
bestType = type;
|
|
97
|
-
}
|
|
98
|
-
}
|
|
87
|
+
// Use SmartExtractor for unified neural + rule-based classification
|
|
88
|
+
const classification = await this.smartExtractor.extract(candidate.text, {
|
|
89
|
+
definition: candidate.context,
|
|
90
|
+
allTerms: [candidate.text, candidate.context]
|
|
91
|
+
});
|
|
92
|
+
// Skip if SmartExtractor returns null (low confidence) or below threshold
|
|
93
|
+
if (!classification || classification.confidence < minConfidence) {
|
|
94
|
+
continue;
|
|
99
95
|
}
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
bestType = classification.type;
|
|
104
|
-
bestConfidence = classification.confidence;
|
|
96
|
+
// Filter by requested types if specified
|
|
97
|
+
if (options?.types && !options.types.includes(classification.type)) {
|
|
98
|
+
continue;
|
|
105
99
|
}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
100
|
+
// Calculate weight from signal results (average of all signals that voted)
|
|
101
|
+
const signalResults = classification.metadata?.signalResults || [];
|
|
102
|
+
const avgWeight = signalResults.length > 0
|
|
103
|
+
? signalResults.reduce((sum, s) => sum + s.weight, 0) / signalResults.length
|
|
104
|
+
: 1.0;
|
|
105
|
+
const entity = {
|
|
106
|
+
text: candidate.text,
|
|
107
|
+
type: classification.type,
|
|
108
|
+
position: candidate.position,
|
|
109
|
+
confidence: classification.confidence,
|
|
110
|
+
weight: avgWeight
|
|
111
|
+
};
|
|
112
|
+
if (options?.includeVectors) {
|
|
113
|
+
entity.vector = await this.getEmbedding(candidate.text);
|
|
117
114
|
}
|
|
115
|
+
entities.push(entity);
|
|
118
116
|
}
|
|
119
117
|
// Remove duplicates and overlaps
|
|
120
118
|
const deduplicatedEntities = this.deduplicateEntities(entities);
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart Import Presets - Zero-Configuration Auto-Detection
|
|
3
|
+
*
|
|
4
|
+
* Automatically selects optimal import strategy based on:
|
|
5
|
+
* - File type (Excel, CSV, PDF, Markdown, JSON)
|
|
6
|
+
* - File size and row count
|
|
7
|
+
* - Column structure (explicit relationships vs narrative)
|
|
8
|
+
* - Available memory and performance requirements
|
|
9
|
+
*
|
|
10
|
+
* Production-ready: Handles billions of entities with optimal performance
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Signal types used for entity classification
|
|
14
|
+
*/
|
|
15
|
+
export type SignalType = 'embedding' | 'exact' | 'pattern' | 'context';
|
|
16
|
+
/**
|
|
17
|
+
* Strategy types used for relationship extraction
|
|
18
|
+
*/
|
|
19
|
+
export type StrategyType = 'explicit' | 'pattern' | 'embedding';
|
|
20
|
+
/**
|
|
21
|
+
* Import context for preset auto-detection
|
|
22
|
+
*/
|
|
23
|
+
export interface ImportContext {
|
|
24
|
+
fileType?: 'excel' | 'csv' | 'json' | 'pdf' | 'markdown' | 'unknown';
|
|
25
|
+
fileSize?: number;
|
|
26
|
+
rowCount?: number;
|
|
27
|
+
hasExplicitColumns?: boolean;
|
|
28
|
+
hasNarrativeContent?: boolean;
|
|
29
|
+
avgDefinitionLength?: number;
|
|
30
|
+
memoryAvailable?: number;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Signal configuration with weights
|
|
34
|
+
*/
|
|
35
|
+
export interface SignalConfig {
|
|
36
|
+
enabled: SignalType[];
|
|
37
|
+
weights: Record<SignalType, number>;
|
|
38
|
+
timeout: number;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Strategy configuration with priorities
|
|
42
|
+
*/
|
|
43
|
+
export interface StrategyConfig {
|
|
44
|
+
enabled: StrategyType[];
|
|
45
|
+
timeout: number;
|
|
46
|
+
earlyTermination: boolean;
|
|
47
|
+
minConfidence: number;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Complete preset configuration
|
|
51
|
+
*/
|
|
52
|
+
export interface PresetConfig {
|
|
53
|
+
name: string;
|
|
54
|
+
description: string;
|
|
55
|
+
signals: SignalConfig;
|
|
56
|
+
strategies: StrategyConfig;
|
|
57
|
+
streaming: boolean;
|
|
58
|
+
batchSize: number;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Fast Preset - For large imports (>10K rows)
|
|
62
|
+
*
|
|
63
|
+
* Optimized for speed over accuracy:
|
|
64
|
+
* - Only exact match and pattern signals
|
|
65
|
+
* - Only explicit strategy (O(1) lookups)
|
|
66
|
+
* - Streaming enabled for memory efficiency
|
|
67
|
+
* - Early termination on first high-confidence match
|
|
68
|
+
*
|
|
69
|
+
* Use case: Bulk imports, data migrations
|
|
70
|
+
* Performance: ~10ms per row
|
|
71
|
+
* Accuracy: ~85%
|
|
72
|
+
*/
|
|
73
|
+
export declare const FAST_PRESET: PresetConfig;
|
|
74
|
+
/**
|
|
75
|
+
* Balanced Preset - Default for most imports
|
|
76
|
+
*
|
|
77
|
+
* Good balance of speed and accuracy:
|
|
78
|
+
* - All signals except context (embedding, exact, pattern)
|
|
79
|
+
* - All strategies with smart ordering
|
|
80
|
+
* - Moderate timeouts
|
|
81
|
+
* - Early termination after high-confidence matches
|
|
82
|
+
*
|
|
83
|
+
* Use case: Standard imports, general glossaries
|
|
84
|
+
* Performance: ~30ms per row
|
|
85
|
+
* Accuracy: ~92%
|
|
86
|
+
*/
|
|
87
|
+
export declare const BALANCED_PRESET: PresetConfig;
|
|
88
|
+
/**
|
|
89
|
+
* Accurate Preset - For small, critical imports
|
|
90
|
+
*
|
|
91
|
+
* Optimized for accuracy over speed:
|
|
92
|
+
* - All signals including context
|
|
93
|
+
* - All strategies, no early termination
|
|
94
|
+
* - Longer timeouts for thorough analysis
|
|
95
|
+
* - Lower confidence threshold (accept more matches)
|
|
96
|
+
*
|
|
97
|
+
* Use case: Knowledge bases, critical taxonomies
|
|
98
|
+
* Performance: ~100ms per row
|
|
99
|
+
* Accuracy: ~97%
|
|
100
|
+
*/
|
|
101
|
+
export declare const ACCURATE_PRESET: PresetConfig;
|
|
102
|
+
/**
|
|
103
|
+
* Explicit Preset - For glossaries with relationship columns
|
|
104
|
+
*
|
|
105
|
+
* Optimized for structured data with explicit relationships:
|
|
106
|
+
* - Exact match and pattern signals only (no embedding or context signals)
|
|
107
|
+
* - Only explicit and pattern strategies
|
|
108
|
+
* - Fast, deterministic results
|
|
109
|
+
* - Perfect for Excel/CSV with "Related Terms" columns
|
|
110
|
+
*
|
|
111
|
+
* Use case: Workshop glossary, structured taxonomies
|
|
112
|
+
* Performance: ~5ms per row
|
|
113
|
+
* Accuracy: ~99% (high confidence)
|
|
114
|
+
*/
|
|
115
|
+
export declare const EXPLICIT_PRESET: PresetConfig;
|
|
116
|
+
/**
|
|
117
|
+
* Pattern Preset - For documents with narrative content
|
|
118
|
+
*
|
|
119
|
+
* Optimized for unstructured text with rich patterns:
|
|
120
|
+
* - Embedding, pattern, and context signals (semantic understanding)
|
|
121
|
+
* - Pattern and embedding strategies
|
|
122
|
+
* - Good for PDFs, articles, documentation
|
|
123
|
+
*
|
|
124
|
+
* Use case: PDF imports, markdown docs, articles
|
|
125
|
+
* Performance: ~50ms per row
|
|
126
|
+
* Accuracy: ~90%
|
|
127
|
+
*/
|
|
128
|
+
export declare const PATTERN_PRESET: PresetConfig;
|
|
129
|
+
/**
|
|
130
|
+
* All available presets
|
|
131
|
+
*/
|
|
132
|
+
export declare const PRESETS: Record<string, PresetConfig>;
|
|
133
|
+
/**
|
|
134
|
+
* Auto-detect optimal preset based on import context
|
|
135
|
+
*
|
|
136
|
+
* Decision tree:
|
|
137
|
+
* 1. Large dataset (>10K rows or >10MB) → fast
|
|
138
|
+
* 2. Small dataset (<100 rows) → accurate
|
|
139
|
+
* 3. Excel/CSV with explicit columns → explicit
|
|
140
|
+
* 4. Narrative content, PDF/Markdown, or long definitions (avg > 500 chars) → pattern
|
|
141
|
+
* 5. Default → balanced
|
|
142
|
+
*
|
|
143
|
+
* @param context Import context (file type, size, structure)
|
|
144
|
+
* @returns Optimal preset configuration
|
|
145
|
+
*/
|
|
146
|
+
export declare function autoDetectPreset(context?: ImportContext): PresetConfig;
|
|
147
|
+
/**
|
|
148
|
+
* Get preset by name
|
|
149
|
+
*
|
|
150
|
+
* @param name Preset name (fast, balanced, accurate, explicit, pattern)
|
|
151
|
+
* @returns Preset configuration
|
|
152
|
+
* @throws Error if preset not found
|
|
153
|
+
*/
|
|
154
|
+
export declare function getPreset(name: string): PresetConfig;
|
|
155
|
+
/**
|
|
156
|
+
* Get all available preset names
|
|
157
|
+
*
|
|
158
|
+
* @returns Array of preset names
|
|
159
|
+
*/
|
|
160
|
+
export declare function getPresetNames(): string[];
|
|
161
|
+
/**
|
|
162
|
+
* Explain why a preset was selected
|
|
163
|
+
*
|
|
164
|
+
* @param context Import context
|
|
165
|
+
* @returns Human-readable explanation
|
|
166
|
+
*/
|
|
167
|
+
export declare function explainPresetChoice(context?: ImportContext): string;
|
|
168
|
+
/**
|
|
169
|
+
* Create custom preset by merging with base preset
|
|
170
|
+
*
|
|
171
|
+
* @param baseName Base preset name
|
|
172
|
+
* @param overrides Custom overrides
|
|
173
|
+
* @returns Custom preset configuration
|
|
174
|
+
*/
|
|
175
|
+
export declare function createCustomPreset(baseName: string, overrides: Partial<PresetConfig>): PresetConfig;
|
|
176
|
+
/**
|
|
177
|
+
* Validate preset configuration
|
|
178
|
+
*
|
|
179
|
+
* @param preset Preset to validate
|
|
180
|
+
* @returns True if valid, throws error otherwise
|
|
181
|
+
*/
|
|
182
|
+
export declare function validatePreset(preset: PresetConfig): boolean;
|
|
183
|
+
/**
|
|
184
|
+
* Format preset for display
|
|
185
|
+
*
|
|
186
|
+
* @param preset Preset configuration
|
|
187
|
+
* @returns Human-readable preset summary
|
|
188
|
+
*/
|
|
189
|
+
export declare function formatPreset(preset: PresetConfig): string;
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart Import Presets - Zero-Configuration Auto-Detection
|
|
3
|
+
*
|
|
4
|
+
* Automatically selects optimal import strategy based on:
|
|
5
|
+
* - File type (Excel, CSV, PDF, Markdown, JSON)
|
|
6
|
+
* - File size and row count
|
|
7
|
+
* - Column structure (explicit relationships vs narrative)
|
|
8
|
+
* - Available memory and performance requirements
|
|
9
|
+
*
|
|
10
|
+
* Production-ready: Handles billions of entities with optimal performance
|
|
11
|
+
*/
|
|
12
|
+
/**
 * FAST_PRESET - speed-first configuration aimed at large imports.
 *
 * What this object configures:
 *  - Signals: only 'exact' and 'pattern' are enabled (weights 0.7 / 0.3);
 *    'embedding' and 'context' carry a weight of zero.
 *  - Strategies: only 'explicit', with early termination switched on and
 *    a minimum confidence of 0.7.
 *  - Streaming is on and the batch size is 1000.
 *
 * Timeouts are in milliseconds (formatPreset renders them with an "ms"
 * suffix).
 *
 * The package authors quote roughly 10ms per row and ~85% accuracy for
 * this preset (bulk imports, data migrations); those figures cannot be
 * confirmed from this file.
 */
export const FAST_PRESET = {
    name: 'fast',
    description: 'Fast bulk import for large datasets',
    signals: {
        enabled: [
            'exact',
            'pattern'
        ],
        // Weights of the enabled signals add up to 1.0 (see validatePreset).
        weights: {
            exact: 0.7,
            pattern: 0.3,
            embedding: 0,
            context: 0
        },
        timeout: 50
    },
    strategies: {
        enabled: [
            'explicit'
        ],
        timeout: 100,
        earlyTermination: true,
        minConfidence: 0.7
    },
    streaming: true,
    batchSize: 1000
};
|
|
47
|
+
/**
 * BALANCED_PRESET - the general-purpose configuration; autoDetectPreset
 * falls back to it when no more specific rule applies.
 *
 * What this object configures:
 *  - Signals: 'exact', 'embedding' and 'pattern' are enabled
 *    (weights 0.4 / 0.35 / 0.25); 'context' is left out with weight zero.
 *  - Strategies: 'explicit', 'pattern' and 'embedding', with early
 *    termination on and a minimum confidence of 0.65.
 *  - Streaming is off and the batch size is 500.
 *
 * Timeouts are in milliseconds (100 for signals, 200 for strategies).
 *
 * The package authors quote roughly 30ms per row and ~92% accuracy
 * (standard imports, general glossaries); those figures cannot be
 * confirmed from this file.
 */
export const BALANCED_PRESET = {
    name: 'balanced',
    description: 'Balanced speed and accuracy for most imports',
    signals: {
        enabled: [
            'exact',
            'embedding',
            'pattern'
        ],
        // Weights of the enabled signals add up to 1.0 (see validatePreset).
        weights: {
            exact: 0.4,
            embedding: 0.35,
            pattern: 0.25,
            context: 0
        },
        timeout: 100
    },
    strategies: {
        enabled: [
            'explicit',
            'pattern',
            'embedding'
        ],
        timeout: 200,
        earlyTermination: true,
        minConfidence: 0.65
    },
    streaming: false,
    batchSize: 500
};
|
|
82
|
+
/**
 * ACCURATE_PRESET - accuracy-first configuration aimed at small imports.
 *
 * What this object configures:
 *  - Signals: all four ('exact', 'embedding', 'pattern', 'context') are
 *    enabled with weights 0.4 / 0.35 / 0.2 / 0.05.
 *  - Strategies: 'explicit', 'pattern' and 'embedding', with early
 *    termination off and the lowest minimum confidence of the built-in
 *    presets (0.5), so more matches are accepted.
 *  - Streaming is off and the batch size is 100.
 *
 * Timeouts are in milliseconds and are the longest of the built-in
 * presets (500 for signals, 1000 for strategies).
 *
 * The package authors quote roughly 100ms per row and ~97% accuracy
 * (knowledge bases, critical taxonomies); those figures cannot be
 * confirmed from this file.
 */
export const ACCURATE_PRESET = {
    name: 'accurate',
    description: 'Maximum accuracy for critical imports',
    signals: {
        enabled: [
            'exact',
            'embedding',
            'pattern',
            'context'
        ],
        // Weights of the enabled signals add up to 1.0 (see validatePreset).
        weights: {
            exact: 0.4,
            embedding: 0.35,
            pattern: 0.2,
            context: 0.05
        },
        timeout: 500
    },
    strategies: {
        enabled: [
            'explicit',
            'pattern',
            'embedding'
        ],
        timeout: 1000,
        earlyTermination: false,
        minConfidence: 0.5
    },
    streaming: false,
    batchSize: 100
};
|
|
117
|
+
/**
 * EXPLICIT_PRESET - configuration for structured data whose relationships
 * are spelled out in dedicated columns; autoDetectPreset picks it for
 * Excel/CSV inputs flagged with hasExplicitColumns.
 *
 * What this object configures:
 *  - Signals: 'exact' and 'pattern' are enabled (weights 0.7 / 0.3);
 *    'embedding' and 'context' carry a weight of zero, so no embedding
 *    signal is involved.
 *  - Strategies: 'explicit' and 'pattern', with early termination on and
 *    the highest minimum confidence of the built-in presets (0.8).
 *  - Streaming is off and the batch size is 500.
 *
 * Timeouts are in milliseconds (50 for signals, 100 for strategies).
 *
 * The package authors quote roughly 5ms per row and ~99% accuracy
 * (workshop glossaries, structured taxonomies with "Related Terms"-style
 * columns); those figures cannot be confirmed from this file.
 */
export const EXPLICIT_PRESET = {
    name: 'explicit',
    description: 'For glossaries with explicit relationship columns',
    signals: {
        enabled: [
            'exact',
            'pattern'
        ],
        // Weights of the enabled signals add up to 1.0 (see validatePreset).
        weights: {
            exact: 0.7,
            pattern: 0.3,
            embedding: 0,
            context: 0
        },
        timeout: 50
    },
    strategies: {
        enabled: [
            'explicit',
            'pattern'
        ],
        timeout: 100,
        earlyTermination: true,
        minConfidence: 0.8
    },
    streaming: false,
    batchSize: 500
};
|
|
152
|
+
/**
 * PATTERN_PRESET - configuration for narrative, unstructured documents;
 * autoDetectPreset picks it for PDF/Markdown inputs, content flagged as
 * narrative, or long average definitions.
 *
 * What this object configures:
 *  - Signals: 'embedding', 'pattern' and 'context' are enabled
 *    (weights 0.5 / 0.4 / 0.1); 'exact' carries a weight of zero.
 *  - Strategies: 'pattern' and 'embedding', with early termination off
 *    and a minimum confidence of 0.6.
 *  - Streaming is off and the batch size is 200.
 *
 * Timeouts are in milliseconds (200 for signals, 300 for strategies).
 *
 * The package authors quote roughly 50ms per row and ~90% accuracy
 * (PDF imports, markdown docs, articles); those figures cannot be
 * confirmed from this file.
 */
export const PATTERN_PRESET = {
    name: 'pattern',
    description: 'For documents with narrative content',
    signals: {
        enabled: [
            'embedding',
            'pattern',
            'context'
        ],
        // Weights of the enabled signals add up to 1.0 (see validatePreset).
        weights: {
            embedding: 0.5,
            pattern: 0.4,
            context: 0.1,
            exact: 0
        },
        timeout: 200
    },
    strategies: {
        enabled: [
            'pattern',
            'embedding'
        ],
        timeout: 300,
        earlyTermination: false,
        minConfidence: 0.6
    },
    streaming: false,
    batchSize: 200
};
|
|
186
|
+
/**
 * Registry of the built-in presets, keyed by lowercase preset name.
 *
 * getPreset looks names up here and getPresetNames lists its keys, so the
 * key order below is the order in which names are reported.
 */
export const PRESETS = {
    fast: FAST_PRESET,
    balanced: BALANCED_PRESET,
    accurate: ACCURATE_PRESET,
    explicit: EXPLICIT_PRESET,
    pattern: PATTERN_PRESET,
};
|
|
196
|
+
/**
 * Pick the built-in preset that best fits an import.
 *
 * Rules are checked in order and the first one that applies wins:
 *   1. More than 10,000 rows or more than 10,000,000 bytes → fast
 *   2. Between 1 and 99 rows → accurate
 *   3. Excel/CSV with explicit relationship columns → explicit
 *   4. Narrative content, PDF/Markdown, or an average definition length
 *      above 500 → pattern
 *   5. Everything else (JSON included) → balanced
 *
 * Missing context fields default to: fileType 'unknown', numeric fields 0,
 * boolean flags false.
 *
 * @param context Import context (file type, size, structure)
 * @returns The selected preset object (the shared built-in instance, not a copy)
 */
export function autoDetectPreset(context = {}) {
    const {
        fileType = 'unknown',
        fileSize = 0,
        rowCount = 0,
        hasExplicitColumns = false,
        hasNarrativeContent = false,
        avgDefinitionLength = 0
    } = context;
    // Ordered [predicate, preset] pairs; predicates run lazily, top to bottom.
    const rules = [
        [() => rowCount > 10000 || fileSize > 10000000, FAST_PRESET],
        [() => rowCount > 0 && rowCount < 100, ACCURATE_PRESET],
        [() => hasExplicitColumns && (fileType === 'excel' || fileType === 'csv'), EXPLICIT_PRESET],
        [
            () => hasNarrativeContent ||
                fileType === 'pdf' ||
                fileType === 'markdown' ||
                avgDefinitionLength > 500,
            PATTERN_PRESET
        ]
    ];
    const match = rules.find(([applies]) => applies());
    // JSON and any unrecognised input both land on the balanced default.
    return match ? match[1] : BALANCED_PRESET;
}
|
|
239
|
+
/**
 * Get preset by name (case-insensitive).
 *
 * Only presets registered directly on PRESETS are returned. Names that
 * merely resolve through the object prototype chain (for example
 * 'constructor' or '__proto__') are rejected like any other unknown name,
 * as are non-string names.
 *
 * @param name Preset name (fast, balanced, accurate, explicit, pattern)
 * @returns Preset configuration
 * @throws Error if preset not found
 */
export function getPreset(name) {
    const key = typeof name === 'string' ? name.toLowerCase() : undefined;
    // Own-property check: a plain PRESETS[key] lookup would also find
    // inherited members such as Object.prototype.constructor.
    const isRegistered = key !== undefined &&
        Object.prototype.hasOwnProperty.call(PRESETS, key);
    if (!isRegistered) {
        throw new Error(`Unknown preset: ${String(name)}. Available: ${Object.keys(PRESETS).join(', ')}`);
    }
    return PRESETS[key];
}
|
|
253
|
+
/**
 * List the names of every registered preset.
 *
 * @returns A new array holding the keys of PRESETS, in registration order
 */
export function getPresetNames() {
    const registeredNames = Object.keys(PRESETS);
    return registeredNames;
}
|
|
261
|
+
/**
 * Explain why a preset was selected.
 *
 * Mirrors the decision order of autoDetectPreset so the explanation always
 * names the preset that function returns for the same context:
 *   1. More than 10,000 rows or more than 10,000,000 bytes → fast
 *   2. Between 1 and 99 rows → accurate
 *   3. Excel/CSV with explicit relationship columns → explicit
 *   4. Narrative content or PDF/Markdown → pattern
 *   4b. Average definition length above 500 → pattern
 *   5. JSON → balanced; anything else → balanced (default)
 *
 * Missing context fields default to: fileType 'unknown', numeric fields 0,
 * boolean flags false.
 *
 * @param context Import context
 * @returns Human-readable explanation
 */
export function explainPresetChoice(context = {}) {
    const { fileType = 'unknown', fileSize = 0, rowCount = 0, hasExplicitColumns = false, hasNarrativeContent = false, avgDefinitionLength = 0 } = context;
    if (rowCount > 10000 || fileSize > 10000000) {
        return `Large dataset (${rowCount} rows, ${(fileSize / 1000000).toFixed(1)}MB) → fast preset for optimal performance`;
    }
    if (rowCount > 0 && rowCount < 100) {
        return `Small critical dataset (${rowCount} rows) → accurate preset for maximum accuracy`;
    }
    if (hasExplicitColumns && (fileType === 'excel' || fileType === 'csv')) {
        return `${fileType.toUpperCase()} with explicit relationship columns → explicit preset for deterministic results`;
    }
    if (hasNarrativeContent || fileType === 'pdf' || fileType === 'markdown') {
        return `Narrative content (${fileType}) → pattern preset for semantic understanding`;
    }
    // autoDetectPreset also routes long definitions to the pattern preset;
    // without this branch the explanation would wrongly claim "balanced".
    if (avgDefinitionLength > 500) {
        return `Long definitions (avg ${avgDefinitionLength} chars) → pattern preset for semantic understanding`;
    }
    if (fileType === 'json') {
        return `JSON data → balanced preset for structured imports`;
    }
    return `Standard import → balanced preset (default)`;
}
|
|
286
|
+
/**
 * Create custom preset by merging overrides onto a base preset.
 *
 * Top-level fields in `overrides` replace the base values. `signals` and
 * `strategies` are merged field by field, and `signals.weights` is merged
 * key by key, so a partial override keeps the remaining base values.
 *
 * The returned preset owns its `enabled` arrays and `weights` object:
 * mutating the result never alters the built-in preset it was derived from.
 *
 * @param baseName Base preset name
 * @param overrides Custom overrides (optional; defaults to no overrides)
 * @returns Custom preset configuration
 * @throws Error if the base preset name is unknown (raised by getPreset)
 */
export function createCustomPreset(baseName, overrides = {}) {
    const base = getPreset(baseName);
    const signalOverrides = overrides.signals || {};
    const strategyOverrides = overrides.strategies || {};
    // Copy arrays so the custom preset does not alias shared preset state;
    // non-array values are passed through untouched.
    const detach = (list) => (Array.isArray(list) ? [...list] : list);
    const signals = { ...base.signals, ...signalOverrides };
    const strategies = { ...base.strategies, ...strategyOverrides };
    return {
        ...base,
        ...overrides,
        signals: {
            ...signals,
            enabled: detach(signals.enabled),
            weights: {
                ...base.signals.weights,
                ...(signalOverrides.weights || {})
            }
        },
        strategies: {
            ...strategies,
            enabled: detach(strategies.enabled)
        }
    };
}
|
|
308
|
+
/**
 * Validate preset configuration.
 *
 * Checks, in order:
 *   1. at least one signal is enabled;
 *   2. at least one strategy is enabled;
 *   3. every enabled signal has a finite numeric weight;
 *   4. the weights of the enabled signals sum to 1.0 (±0.01);
 *   5. both timeouts are positive numbers;
 *   6. the batch size is a positive number.
 *
 * @param preset Preset to validate
 * @returns True if valid
 * @throws Error describing the first failed check
 */
export function validatePreset(preset) {
    // Validate signals
    if (preset.signals.enabled.length === 0) {
        throw new Error('Preset must have at least one enabled signal');
    }
    // Validate strategies
    if (preset.strategies.enabled.length === 0) {
        throw new Error('Preset must have at least one enabled strategy');
    }
    // Validate weights: each must be a real number. A missing weight would
    // otherwise turn the sum into NaN, and NaN slips past the tolerance
    // comparison below because every comparison with NaN is false.
    let totalWeight = 0;
    for (const signal of preset.signals.enabled) {
        const weight = preset.signals.weights[signal];
        if (!Number.isFinite(weight)) {
            throw new Error(`Signal weight for '${String(signal)}' must be a finite number, got ${String(weight)}`);
        }
        totalWeight += weight;
    }
    // Validate weights sum to ~1.0
    if (Math.abs(totalWeight - 1.0) > 0.01) {
        throw new Error(`Signal weights must sum to 1.0, got ${totalWeight.toFixed(2)}`);
    }
    // Validate timeouts; the negated form also rejects NaN and undefined.
    if (!(preset.signals.timeout > 0) || !(preset.strategies.timeout > 0)) {
        throw new Error('Timeouts must be positive');
    }
    // Validate batch size; the negated form also rejects NaN and undefined.
    if (!(preset.batchSize > 0)) {
        throw new Error('Batch size must be positive');
    }
    return true;
}
|
|
339
|
+
/**
 * Render a preset as a multi-line, human-readable summary.
 *
 * The output lists the name and description, each enabled signal with its
 * weight as a whole-number percentage, the signal timeout, each enabled
 * strategy with the strategy settings, and finally the streaming flag and
 * batch size. Sections are separated by blank lines and lines are joined
 * with '\n' (no trailing newline).
 *
 * @param preset Preset configuration
 * @returns Human-readable preset summary
 */
export function formatPreset(preset) {
    const report = [];
    // Header
    report.push(`Preset: ${preset.name}`);
    report.push(`Description: ${preset.description}`);
    report.push('');
    // Signals section: one line per enabled signal, then the timeout.
    report.push('Signals:');
    preset.signals.enabled.forEach((signalName) => {
        const percent = (preset.signals.weights[signalName] * 100).toFixed(0);
        report.push(` - ${signalName}: ${percent}%`);
    });
    report.push(` Timeout: ${preset.signals.timeout}ms`);
    report.push('');
    // Strategies section: one line per enabled strategy, then its settings.
    report.push('Strategies:');
    preset.strategies.enabled.forEach((strategyName) => {
        report.push(` - ${strategyName}`);
    });
    report.push(` Timeout: ${preset.strategies.timeout}ms`);
    report.push(` Early termination: ${preset.strategies.earlyTermination}`);
    report.push(` Min confidence: ${preset.strategies.minConfidence}`);
    report.push('');
    // Footer
    report.push(`Streaming: ${preset.streaming}`);
    report.push(`Batch size: ${preset.batchSize}`);
    return report.join('\n');
}
|
|
365
|
+
//# sourceMappingURL=presets.js.map
|