@crashbytes/semantic-text-toolkit 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,225 @@
1
+ /**
2
+ * Semantic Engine - Core Embedding Generation
3
+ *
4
+ * Architectural Principles:
5
+ * - Lazy initialization minimizes startup overhead
6
+ * - Singleton pattern prevents redundant model loading
7
+ * - Resource management through explicit lifecycle control
8
+ * - Defensive error handling with semantic codes
9
+ */
10
+
11
+ import { pipeline, FeatureExtractionPipeline } from '@xenova/transformers';
12
+ import {
13
+ ModelConfig,
14
+ EmbeddingResult,
15
+ SimilarityResult,
16
+ Embedding,
17
+ BatchOptions,
18
+ SemanticError,
19
+ SemanticErrorCode,
20
+ } from '../types';
21
+ import { cosineSimilarity, euclideanDistance, dotProduct } from '../utils/vector';
22
+
23
/**
 * Fallback values for every `ModelConfig` option the caller leaves out.
 */
const DEFAULT_CONFIG: Required<ModelConfig> = {
  modelName: 'Xenova/all-MiniLM-L6-v2',
  maxLength: 512,
  quantized: true,
  // No-op listener so the engine can report progress without a null check.
  onProgress: (): void => {},
};
29
+
30
/**
 * Turns text into embedding vectors using a lazily loaded
 * `feature-extraction` pipeline, and offers pairwise similarity on top.
 *
 * Lifecycle: construct, `await initialize()`, call `embed` / `embedBatch` /
 * `similarity`, then `dispose()`. Every embedding method throws a
 * `SemanticError` with code `MODEL_NOT_LOADED` until `initialize()` has
 * resolved.
 *
 * Note: `maxLength` is kept in the resolved config, but no method of this
 * class reads it.
 */
export class SemanticEngine {
  private model: FeatureExtractionPipeline | null = null;
  private config: Required<ModelConfig>;
  private initializationPromise: Promise<void> | null = null;

  /**
   * @param config Optional overrides. Options that are omitted or explicitly
   *   `undefined` fall back to `DEFAULT_CONFIG`.
   */
  constructor(config: ModelConfig = {}) {
    // Resolve field by field: an object spread would let an explicit
    // `undefined` (e.g. `{ onProgress: undefined }`) clobber the default and
    // later break `initialize()` with a "not a function" failure.
    this.config = {
      modelName: config.modelName ?? DEFAULT_CONFIG.modelName,
      maxLength: config.maxLength ?? DEFAULT_CONFIG.maxLength,
      quantized: config.quantized ?? DEFAULT_CONFIG.quantized,
      onProgress: config.onProgress ?? DEFAULT_CONFIG.onProgress,
    };
  }

  /**
   * Loads the model once. Calls made while a load is in flight share the same
   * promise; calls made after a successful load return immediately.
   *
   * @throws SemanticError `MODEL_NOT_LOADED` when the pipeline cannot be created.
   */
  async initialize(): Promise<void> {
    if (this.initializationPromise) {
      return this.initializationPromise;
    }

    if (this.model) {
      return;
    }

    const pending = this._performInitialization();
    this.initializationPromise = pending;

    try {
      await pending;
    } finally {
      // Only clear our own marker; `dispose()` followed by a fresh
      // `initialize()` may have installed a newer promise in the meantime.
      if (this.initializationPromise === pending) {
        this.initializationPromise = null;
      }
    }
  }

  /**
   * Creates the pipeline and reports a `downloading` event before and a
   * `ready` event after. Any failure, including one thrown by the progress
   * listener, is rethrown as `MODEL_NOT_LOADED`.
   */
  private async _performInitialization(): Promise<void> {
    try {
      this.config.onProgress({ status: 'downloading', progress: 0 });

      this.model = await pipeline('feature-extraction', this.config.modelName, {
        quantized: this.config.quantized,
      });

      this.config.onProgress({ status: 'ready', progress: 100 });
    } catch (error) {
      throw new SemanticError(
        SemanticErrorCode.MODEL_NOT_LOADED,
        `Failed to initialize model: ${error instanceof Error ? error.message : 'Unknown error'}`,
        { modelName: this.config.modelName, error }
      );
    }
  }

  /**
   * Guards every method that needs the model.
   *
   * @returns The loaded pipeline, so callers get a non-null value without `!`.
   * @throws SemanticError `MODEL_NOT_LOADED` when `initialize()` has not completed.
   */
  private assertInitialized(): FeatureExtractionPipeline {
    if (!this.model) {
      throw new SemanticError(
        SemanticErrorCode.MODEL_NOT_LOADED,
        'Model not initialized. Call initialize() first.'
      );
    }
    return this.model;
  }

  /**
   * Embeds one text with mean pooling and normalization requested from the
   * pipeline.
   *
   * @param text Non-empty string to embed.
   * @returns The vector, the input text, and metadata (vector length, model
   *   name, elapsed time measured with `performance.now()`).
   * @throws SemanticError `MODEL_NOT_LOADED`, `INVALID_INPUT` (empty or
   *   non-string text) or `EMBEDDING_FAILED` (pipeline error).
   */
  async embed(text: string): Promise<EmbeddingResult> {
    const model = this.assertInitialized();

    if (!text || typeof text !== 'string') {
      throw new SemanticError(
        SemanticErrorCode.INVALID_INPUT,
        'Text must be a non-empty string',
        { text }
      );
    }

    const startTime = performance.now();

    try {
      const output = await model(text, {
        pooling: 'mean',
        normalize: true,
      });

      const embedding = Array.from(output.data) as Embedding;
      const processingTime = performance.now() - startTime;

      return {
        embedding,
        text,
        metadata: {
          dimensions: embedding.length,
          modelName: this.config.modelName,
          processingTime,
        },
      };
    } catch (error) {
      throw new SemanticError(
        SemanticErrorCode.EMBEDDING_FAILED,
        `Failed to generate embedding: ${error instanceof Error ? error.message : 'Unknown error'}`,
        // Only a prefix of the text is attached to keep error payloads small.
        { text: text.substring(0, 100), error }
      );
    }
  }

  /**
   * Embeds many texts, `batchSize` at a time. Texts inside one batch are
   * embedded concurrently; batches run one after another.
   *
   * @param texts Non-empty array of texts; results keep the same order.
   * @param options `batchSize` (default 32, fractional values are rounded
   *   down) and an optional `onProgress(completed, total)` listener invoked
   *   after each batch. `options.parallel` is not read here.
   * @throws SemanticError `INVALID_INPUT` for an empty/non-array `texts` or a
   *   `batchSize` that is not a finite number >= 1; plus anything `embed` throws.
   */
  async embedBatch(
    texts: string[],
    options: BatchOptions = {}
  ): Promise<EmbeddingResult[]> {
    this.assertInitialized();

    const { batchSize = 32, onProgress } = options;

    if (!Array.isArray(texts) || texts.length === 0) {
      throw new SemanticError(
        SemanticErrorCode.INVALID_INPUT,
        'Texts must be a non-empty array'
      );
    }

    // A size of 0 used to loop forever (Math.ceil(n / 0) === Infinity);
    // negative, NaN or infinite sizes silently produced an empty result.
    if (!Number.isFinite(batchSize) || batchSize < 1) {
      throw new SemanticError(
        SemanticErrorCode.INVALID_INPUT,
        'batchSize must be a finite number greater than or equal to 1',
        { batchSize }
      );
    }

    const chunkSize = Math.floor(batchSize);
    const results: EmbeddingResult[] = [];

    for (let start = 0; start < texts.length; start += chunkSize) {
      const end = Math.min(start + chunkSize, texts.length);

      const batchResults = await Promise.all(
        texts.slice(start, end).map((text) => this.embed(text))
      );

      // Appended one by one: spreading a large batch into push() can exceed
      // the engine's argument-count limit.
      for (const result of batchResults) {
        results.push(result);
      }

      onProgress?.(end, texts.length);
    }

    return results;
  }

  /**
   * Embeds both texts concurrently and scores the pair.
   *
   * @param method `'cosine'` (default), `'dot'`, or `'euclidean'`. For
   *   `'euclidean'` the score is the negated distance, so a larger score still
   *   means "more similar".
   * @returns Score, the two input texts, the method used and the elapsed time
   *   including both embeddings.
   * @throws SemanticError `INVALID_INPUT` for an unknown method; plus anything
   *   `embed` or the vector helpers throw.
   */
  async similarity(
    textA: string,
    textB: string,
    method: 'cosine' | 'euclidean' | 'dot' = 'cosine'
  ): Promise<SimilarityResult> {
    const startTime = performance.now();

    const [resultA, resultB] = await Promise.all([
      this.embed(textA),
      this.embed(textB),
    ]);

    let score: number;
    switch (method) {
      case 'cosine':
        score = cosineSimilarity(resultA.embedding, resultB.embedding);
        break;
      case 'euclidean':
        score = -euclideanDistance(resultA.embedding, resultB.embedding);
        break;
      case 'dot':
        score = dotProduct(resultA.embedding, resultB.embedding);
        break;
      default:
        // Reachable only from untyped callers.
        throw new SemanticError(
          SemanticErrorCode.INVALID_INPUT,
          `Unknown similarity method: ${method}`
        );
    }

    const processingTime = performance.now() - startTime;

    return {
      score,
      texts: [textA, textB],
      metadata: {
        method,
        processingTime,
      },
    };
  }

  /**
   * Drops the references to the model and to any in-flight initialization, so
   * `isReady()` reports `false` afterwards.
   *
   * NOTE(review): this only clears references; whether the underlying
   * pipeline needs an explicit release call is not visible here — confirm
   * against the library.
   */
  dispose(): void {
    this.model = null;
    this.initializationPromise = null;
  }

  /** @returns `true` once a model has been loaded and not yet disposed. */
  isReady(): boolean {
    return this.model !== null;
  }

  /** @returns A shallow copy of the resolved configuration. */
  getConfig(): Required<ModelConfig> {
    return { ...this.config };
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,31 @@
1
+ /**
2
+ * Semantic Text Toolkit
3
+ * Production-grade semantic text analysis
4
+ *
5
+ * @module @crashbytes/semantic-text-toolkit
6
+ * @author Blackhole Software, LLC
7
+ * @license MIT
8
+ */
9
+
10
+ export { SemanticEngine } from './engine/SemanticEngine';
11
+ export { SemanticSearch, type SearchConfig, type IndexedItem } from './search/SemanticSearch';
12
+ export { cosineSimilarity, euclideanDistance, dotProduct, magnitude, normalize, centroid, topKSimilar } from './utils/vector';
13
+ export { type Embedding, type ModelConfig, type EmbeddingResult, type SimilarityResult, type SearchResult, type BatchOptions, type ModelLoadProgress, SemanticError, SemanticErrorCode } from './types';
14
+
15
/**
 * Convenience factory: loads the engine module on demand, constructs a
 * `SemanticEngine` with the given options and resolves once its
 * `initialize()` call has completed.
 *
 * @param config Optional model options forwarded to the constructor.
 * @returns The initialized engine.
 */
export async function createSemanticEngine(config?: import('./types').ModelConfig) {
  const engineModule = await import('./engine/SemanticEngine');
  const instance = new engineModule.SemanticEngine(config);
  await instance.initialize();
  return instance;
}
21
+
22
/**
 * Convenience factory: creates an engine with default model options, wraps it
 * in a `SemanticSearch` and indexes `items` before resolving.
 *
 * @param items Items to index; `SemanticSearch.index` rejects an empty array.
 * @param config Optional search options forwarded to the constructor.
 * @returns The search instance with `items` already indexed.
 */
export async function createSemanticSearch<T = string>(
  items: T[],
  config?: import('./search/SemanticSearch').SearchConfig<T>
) {
  const readyEngine = await createSemanticEngine();
  const searchModule = await import('./search/SemanticSearch');
  const searcher = new searchModule.SemanticSearch(readyEngine, config);
  await searcher.index(items);
  return searcher;
}
@@ -0,0 +1,154 @@
1
+ /**
2
+ * Semantic Search - Vector-Based Document Retrieval
3
+ *
4
+ * Architectural Principles:
5
+ * - Pre-computed embeddings optimize retrieval latency
6
+ * - Configurable ranking strategies enable domain customization
7
+ * - Metadata filtering supports complex queries
8
+ * - Top-k selection scores every candidate and fully sorts them (O(n log n))
9
+ */
10
+
11
+ import { SemanticEngine } from '../engine/SemanticEngine';
12
+ import {
13
+ Embedding,
14
+ SearchResult,
15
+ SemanticError,
16
+ SemanticErrorCode,
17
+ } from '../types';
18
+ import { topKSimilar } from '../utils/vector';
19
+
20
/**
 * Options accepted by `SemanticSearch`. Every field is optional; the
 * constructor substitutes a default for each one that is missing.
 */
export interface SearchConfig<T = string> {
  /** Maximum number of ranked candidates to request (constructor default: 10). */
  topK?: number;
  /** Results scoring below this value are dropped (constructor default: 0). */
  threshold?: number;
  /** Maps an item to the text that is embedded (constructor default: `String(item)`). */
  textExtractor?: (item: T) => string;
  /** Produces the metadata stored next to the item (constructor default: empty object). */
  metadataExtractor?: (item: T) => Record<string, unknown>;
}
26
+
27
/**
 * One entry of a `SemanticSearch` index: the original item together with its
 * embedding and the optional metadata used by filtered searches.
 */
export interface IndexedItem<T = string> {
  /** The item exactly as it was passed to `index()`. */
  item: T;
  /** Embedding of the text extracted from `item`. */
  embedding: Embedding;
  /** Output of the configured metadata extractor, if any. */
  metadata?: Record<string, unknown>;
}
32
+
33
/**
 * In-memory semantic index: items are embedded once by `index()` and queries
 * are ranked against the stored embeddings with `topKSimilar`.
 */
export class SemanticSearch<T = string> {
  private engine: SemanticEngine;
  private indexedItems: IndexedItem<T>[] = [];
  private config: Required<SearchConfig<T>>;

  /**
   * @param engine Engine used for all embedding calls; it must already be
   *   initialized before `index()` or `search()` is called.
   * @param config Optional defaults for `topK` (10), `threshold` (0),
   *   `textExtractor` (`String(item)`) and `metadataExtractor` (empty object).
   */
  constructor(engine: SemanticEngine, config: SearchConfig<T> = {}) {
    this.engine = engine;
    this.config = {
      topK: config.topK ?? 10,
      threshold: config.threshold ?? 0,
      textExtractor: config.textExtractor ?? ((item) => String(item)),
      metadataExtractor: config.metadataExtractor ?? (() => ({})),
    };
  }

  /**
   * Embeds `items` and adds them to the index.
   *
   * @param items Non-empty array of items.
   * @param replace When `true` the new entries replace the existing index;
   *   otherwise they are appended. The swap happens only after embedding has
   *   succeeded, so a failed call leaves the previous index untouched.
   * @throws SemanticError `INVALID_INPUT` for an empty or non-array `items`;
   *   plus anything the engine's `embedBatch` throws.
   */
  async index(items: T[], replace: boolean = false): Promise<void> {
    if (!Array.isArray(items) || items.length === 0) {
      throw new SemanticError(
        SemanticErrorCode.INVALID_INPUT,
        'Items must be a non-empty array'
      );
    }

    // Wrap the extractor so it receives the item only, not map()'s extra
    // (index, array) arguments.
    const texts = items.map((item) => this.config.textExtractor(item));
    const results = await this.engine.embedBatch(texts, { batchSize: 32 });

    const newIndexItems: IndexedItem<T>[] = items.map((item, idx) => ({
      item,
      embedding: results[idx].embedding,
      metadata: this.config.metadataExtractor(item),
    }));

    // concat instead of push(...spread): spreading a very large array into
    // call arguments can throw a RangeError.
    this.indexedItems = replace
      ? newIndexItems
      : this.indexedItems.concat(newIndexItems);
  }

  /**
   * Ranks every indexed item against `query`.
   *
   * @param query Text to embed and compare.
   * @param overrideConfig Per-call `topK` / `threshold`; fields that are
   *   missing or `undefined` fall back to the instance configuration.
   * @returns Matches at or above the threshold, best first, ranked from 1.
   * @throws SemanticError `INVALID_INPUT` when nothing has been indexed; plus
   *   anything the engine's `embed` or `topKSimilar` throws.
   */
  async search(
    query: string,
    overrideConfig?: Partial<SearchConfig<T>>
  ): Promise<SearchResult<T>[]> {
    this.assertNotEmpty();
    return this.rankAgainst(query, this.indexedItems, overrideConfig);
  }

  /**
   * Like `search`, but only over entries whose metadata passes `filter`
   * (entries without metadata are tested against an empty object).
   *
   * The shared index is never modified while the query is being embedded, so
   * concurrent calls on the same instance see the full index.
   *
   * @returns Ranked matches, or an empty array when no entry passes the filter.
   * @throws SemanticError `INVALID_INPUT` when nothing has been indexed.
   */
  async searchWithFilter(
    query: string,
    filter: (metadata: Record<string, unknown>) => boolean,
    config?: Partial<SearchConfig<T>>
  ): Promise<SearchResult<T>[]> {
    this.assertNotEmpty();

    const candidates = this.indexedItems.filter((entry) =>
      filter(entry.metadata ?? {})
    );

    // Nothing matched the filter: that is an empty result, not an empty index.
    if (candidates.length === 0) {
      return [];
    }

    return this.rankAgainst(query, candidates, config);
  }

  /**
   * Searches with the text extracted from `item` by the instance's
   * `textExtractor`. If `item` itself is indexed it can appear in the results.
   */
  async findSimilar(
    item: T,
    config?: Partial<SearchConfig<T>>
  ): Promise<SearchResult<T>[]> {
    const text = this.config.textExtractor(item);
    return this.search(text, config);
  }

  /**
   * @returns Item count, embedding length of the first entry (0 when empty)
   *   and a size estimate that assumes 8 bytes per stored number.
   */
  getStats(): {
    itemCount: number;
    dimensions: number;
    memoryEstimate: string;
  } {
    const itemCount = this.indexedItems.length;
    const dimensions = this.indexedItems[0]?.embedding.length ?? 0;
    const totalBytes = itemCount * dimensions * 8;

    const memoryEstimate = totalBytes < 1024 * 1024
      ? `${(totalBytes / 1024).toFixed(2)} KB`
      : `${(totalBytes / (1024 * 1024)).toFixed(2)} MB`;

    return { itemCount, dimensions, memoryEstimate };
  }

  /** Removes every entry from the index. */
  clear(): void {
    this.indexedItems = [];
  }

  /** @returns A shallow copy of the index (entries are shared, the array is not). */
  exportIndex(): IndexedItem<T>[] {
    return [...this.indexedItems];
  }

  /** Replaces the index with a shallow copy of `index`; entries are not validated. */
  importIndex(index: IndexedItem<T>[]): void {
    this.indexedItems = [...index];
  }

  /** Throws the shared "empty index" error used by both search entry points. */
  private assertNotEmpty(): void {
    if (this.indexedItems.length === 0) {
      throw new SemanticError(
        SemanticErrorCode.INVALID_INPUT,
        'Index is empty. Call index() before searching.'
      );
    }
  }

  /** Embeds `query` and ranks it against an explicit list of entries. */
  private async rankAgainst(
    query: string,
    entries: IndexedItem<T>[],
    overrideConfig?: Partial<SearchConfig<T>>
  ): Promise<SearchResult<T>[]> {
    // `??` rather than object spread: an explicit `undefined` override must
    // not switch the threshold off.
    const topK = overrideConfig?.topK ?? this.config.topK;
    const threshold = overrideConfig?.threshold ?? this.config.threshold;

    const queryResult = await this.engine.embed(query);
    const ranked = topKSimilar(
      queryResult.embedding,
      entries.map((entry) => entry.embedding),
      topK
    );

    const results: SearchResult<T>[] = [];
    for (const [idx, score] of ranked) {
      if (score < threshold) continue;

      results.push({
        item: entries[idx].item,
        score,
        rank: results.length + 1,
      });
    }

    return results;
  }
}
package/src/types.ts ADDED
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Core Type Definitions
3
+ *
4
+ * Design Philosophy:
5
+ * - Type safety prevents runtime failures
6
+ * - Semantic error codes enable precise debugging
7
+ * - Generic types provide flexibility without sacrificing safety
8
+ */
9
+
10
/** A dense embedding vector, represented as a plain array of numbers. */
export type Embedding = Array<number>;
11
+
12
/**
 * Options for constructing a `SemanticEngine`. All fields are optional.
 */
export interface ModelConfig {
  /** Identifier of the model handed to the feature-extraction pipeline. */
  modelName?: string;
  /**
   * Maximum input length.
   * NOTE(review): not read by the engine code visible in this package — confirm intended use.
   */
  maxLength?: number;
  /** Forwarded to the pipeline as its `quantized` option. */
  quantized?: boolean;
  /** Listener invoked with model-loading progress events. */
  onProgress?: (progress: ModelLoadProgress) => void;
}
18
+
19
/** Payload delivered to `ModelConfig.onProgress` while a model is loading. */
export interface ModelLoadProgress {
  /** Current phase of the load. */
  status: 'downloading' | 'loading' | 'ready';
  /** Progress value; the engine reports 0 when it starts and 100 when ready. */
  progress: number;
  /** File the event refers to, when applicable. */
  file?: string;
}
24
+
25
/** Result of embedding a single text. */
export interface EmbeddingResult {
  /** The embedding vector. */
  embedding: Embedding;
  /** The text that was embedded. */
  text: string;
  /** Details about how the embedding was produced. */
  metadata: {
    /** Length of `embedding`. */
    dimensions: number;
    /** Name of the model that produced the vector. */
    modelName: string;
    /** Elapsed time for the call, measured by the engine with `performance.now()`. */
    processingTime: number;
  };
}
34
+
35
/** Result of comparing two texts. */
export interface SimilarityResult {
  /** Similarity score; how to read it depends on `metadata.method`. */
  score: number;
  /** The two compared texts, in call order. */
  texts: [string, string];
  /** Details about how the score was computed. */
  metadata: {
    /** Scoring method that produced `score`. */
    method: 'cosine' | 'euclidean' | 'dot';
    /** Elapsed time for the whole comparison, embeddings included. */
    processingTime: number;
  };
}
43
+
44
/** One ranked hit returned by a semantic search. */
export interface SearchResult<T = string> {
  /** The matching item, as it was indexed. */
  item: T;
  /** Similarity between the query and the item. */
  score: number;
  /** Position in the result list, starting at 1. */
  rank: number;
}
49
+
50
/** Options for batch embedding. */
export interface BatchOptions {
  /** Number of texts embedded together per batch (the engine defaults to 32). */
  batchSize?: number;
  /**
   * Parallelism flag.
   * NOTE(review): not read by the batch code visible in this package — confirm intended use.
   */
  parallel?: boolean;
  /** Called after each batch with the number of texts finished and the total. */
  onProgress?: (completed: number, total: number) => void;
}
55
+
56
/** Machine-readable codes carried by every `SemanticError`. */
export enum SemanticErrorCode {
  /** The model failed to load, or was used before being initialized. */
  MODEL_NOT_LOADED = 'MODEL_NOT_LOADED',
  /** An argument was empty, of the wrong kind, or otherwise unusable. */
  INVALID_INPUT = 'INVALID_INPUT',
  /** The model raised an error while producing an embedding. */
  EMBEDDING_FAILED = 'EMBEDDING_FAILED',
  /** A vector computation is undefined, e.g. it involves a zero-magnitude vector. */
  COMPUTATION_FAILED = 'COMPUTATION_FAILED',
  /** Two vectors that must have equal length do not. */
  DIMENSION_MISMATCH = 'DIMENSION_MISMATCH',
}
63
+
64
/**
 * Error type thrown throughout the toolkit. Besides the message it carries a
 * `SemanticErrorCode` and an optional bag of diagnostic details.
 */
export class SemanticError extends Error {
  /** Category of the failure. */
  public code: SemanticErrorCode;
  /** Extra context supplied by the throwing site, if any. */
  public details?: Record<string, unknown>;

  /**
   * @param code Category of the failure.
   * @param message Human-readable description.
   * @param details Optional diagnostic context.
   */
  constructor(
    code: SemanticErrorCode,
    message: string,
    details?: Record<string, unknown>
  ) {
    super(message);
    this.code = code;
    this.details = details;
    this.name = 'SemanticError';
  }
}
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Vector Mathematics - Performance-Optimized Operations
3
+ *
4
+ * Architectural Principles:
5
+ * - Pure functions ensure predictability
6
+ * - O(n) time complexity for scalability
7
+ * - Defensive validation at boundaries
8
+ * - Zero external dependencies
9
+ */
10
+
11
+ import { Embedding, SemanticError, SemanticErrorCode } from '../types';
12
+
13
/**
 * Ensures two embeddings have the same length.
 *
 * @throws SemanticError `DIMENSION_MISMATCH`, with both lengths in `details`.
 */
function validateDimensions(a: Embedding, b: Embedding): void {
  const sameLength = a.length === b.length;
  if (sameLength) {
    return;
  }

  throw new SemanticError(
    SemanticErrorCode.DIMENSION_MISMATCH,
    `Embedding dimensions must match. Got ${a.length} and ${b.length}`,
    { dimensions: [a.length, b.length] }
  );
}
22
+
23
/**
 * Ensures an embedding is present and has at least one component.
 *
 * @param embedding Vector to check.
 * @param name Label used at the start of the error message.
 * @throws SemanticError `INVALID_INPUT` when the vector is missing or empty.
 */
function validateEmbedding(embedding: Embedding, name: string = 'embedding'): void {
  const usable = Boolean(embedding) && embedding.length !== 0;
  if (usable) {
    return;
  }

  throw new SemanticError(
    SemanticErrorCode.INVALID_INPUT,
    `${name} must be a non-empty array`,
    { length: embedding?.length }
  );
}
32
+
33
/**
 * Computes the dot product of two equally sized embeddings.
 *
 * @throws SemanticError `INVALID_INPUT` for an empty vector,
 *   `DIMENSION_MISMATCH` when the lengths differ.
 */
export function dotProduct(a: Embedding, b: Embedding): number {
  validateEmbedding(a, 'first embedding');
  validateEmbedding(b, 'second embedding');
  validateDimensions(a, b);

  let total = 0;
  for (const [position, component] of a.entries()) {
    total += component * b[position];
  }
  return total;
}
44
+
45
/**
 * Computes the Euclidean length (L2 norm) of a vector.
 *
 * @throws SemanticError `INVALID_INPUT` for an empty vector.
 */
export function magnitude(vector: Embedding): number {
  validateEmbedding(vector);

  let squaredTotal = 0;
  for (const component of vector) {
    squaredTotal += component * component;
  }
  return Math.sqrt(squaredTotal);
}
54
+
55
/**
 * Computes the cosine of the angle between two embeddings.
 *
 * The dot product and both squared norms are accumulated in a single pass.
 * The quotient is clamped to [-1, 1], because rounding in
 * `sqrt(x) * sqrt(x)` can otherwise push the score of (anti-)parallel vectors
 * marginally outside that range (for example `[1, 1, 1]` against itself).
 *
 * @returns A value in [-1, 1] for finite inputs.
 * @throws SemanticError `INVALID_INPUT` for an empty vector,
 *   `DIMENSION_MISMATCH` when the lengths differ, `COMPUTATION_FAILED` when
 *   either vector has zero magnitude.
 */
export function cosineSimilarity(a: Embedding, b: Embedding): number {
  validateEmbedding(a, 'first embedding');
  validateEmbedding(b, 'second embedding');
  validateDimensions(a, b);

  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    squaredA += a[i] * a[i];
    squaredB += b[i] * b[i];
  }

  const magA = Math.sqrt(squaredA);
  const magB = Math.sqrt(squaredB);

  if (magA === 0 || magB === 0) {
    throw new SemanticError(
      SemanticErrorCode.COMPUTATION_FAILED,
      'Cannot compute cosine similarity with zero-magnitude vector',
      { magnitudes: [magA, magB] }
    );
  }

  // Math.min/Math.max propagate NaN, so non-finite inputs are not masked.
  return Math.max(-1, Math.min(1, dot / (magA * magB)));
}
74
+
75
/**
 * Computes the straight-line (L2) distance between two equally sized embeddings.
 *
 * @throws SemanticError `INVALID_INPUT` for an empty vector,
 *   `DIMENSION_MISMATCH` when the lengths differ.
 */
export function euclideanDistance(a: Embedding, b: Embedding): number {
  validateEmbedding(a, 'first embedding');
  validateEmbedding(b, 'second embedding');
  validateDimensions(a, b);

  let squaredTotal = 0;
  for (const [position, component] of a.entries()) {
    const delta = component - b[position];
    squaredTotal += delta * delta;
  }
  return Math.sqrt(squaredTotal);
}
87
+
88
/**
 * Scales a vector to unit length and returns the result as a new array.
 *
 * @throws SemanticError `INVALID_INPUT` for an empty vector,
 *   `COMPUTATION_FAILED` when the vector has zero magnitude.
 */
export function normalize(vector: Embedding): Embedding {
  validateEmbedding(vector);

  const length = magnitude(vector);
  const isZeroVector = length === 0;
  if (isZeroVector) {
    throw new SemanticError(
      SemanticErrorCode.COMPUTATION_FAILED,
      'Cannot normalize zero-magnitude vector'
    );
  }

  return vector.map((component) => component / length);
}
101
+
102
/**
 * Computes the component-wise mean of a set of embeddings.
 *
 * @param embeddings Non-empty list of vectors; the first one fixes the
 *   expected length.
 * @throws SemanticError `INVALID_INPUT` for a missing or empty list,
 *   `DIMENSION_MISMATCH` when any vector's length differs from the first.
 */
export function centroid(embeddings: Embedding[]): Embedding {
  const hasInput = Boolean(embeddings) && embeddings.length !== 0;
  if (!hasInput) {
    throw new SemanticError(
      SemanticErrorCode.INVALID_INPUT,
      'Cannot compute centroid of empty array'
    );
  }

  const width = embeddings[0].length;
  const totals: number[] = Array.from({ length: width }, () => 0);

  for (const vector of embeddings) {
    if (vector.length !== width) {
      throw new SemanticError(
        SemanticErrorCode.DIMENSION_MISMATCH,
        'All embeddings must have same dimensions'
      );
    }
    for (let column = 0; column < width; column += 1) {
      totals[column] += vector[column];
    }
  }

  const count = embeddings.length;
  return totals.map((columnTotal) => columnTotal / count);
}
127
+
128
/**
 * Ranks candidates by cosine similarity to `query` and returns the best `k`.
 *
 * Every candidate is scored and the complete list is sorted before it is cut
 * to `k` entries. A candidate whose similarity cannot be computed is kept
 * with a score of `-Infinity` instead of aborting the whole ranking.
 *
 * @param query Non-empty query vector.
 * @param candidates Vectors to rank; an empty or missing list yields `[]`.
 * @param k Maximum number of pairs to return (default 10).
 * @returns `[candidateIndex, score]` pairs, highest score first.
 * @throws SemanticError `INVALID_INPUT` for an empty query or `k <= 0`.
 */
export function topKSimilar(
  query: Embedding,
  candidates: Embedding[],
  k: number = 10
): Array<[number, number]> {
  validateEmbedding(query, 'query');

  const nothingToRank = !candidates || candidates.length === 0;
  if (nothingToRank) {
    return [];
  }

  if (k <= 0) {
    throw new SemanticError(
      SemanticErrorCode.INVALID_INPUT,
      'k must be positive',
      { k }
    );
  }

  const scoreOf = (candidate: Embedding): number => {
    try {
      return cosineSimilarity(query, candidate);
    } catch {
      // Unscorable candidates sink to the bottom of the ranking.
      return -Infinity;
    }
  };

  const scored = candidates.map(
    (candidate, position): [number, number] => [position, scoreOf(candidate)]
  );

  scored.sort((left, right) => right[1] - left[1]);

  const limit = Math.min(k, scored.length);
  return scored.slice(0, limit);
}
package/tsconfig.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "ESNext",
5
+ "lib": ["ES2020"],
6
+ "moduleResolution": "node",
7
+ "declaration": true,
8
+ "declarationMap": true,
9
+ "sourceMap": true,
10
+ "outDir": "./dist",
11
+ "rootDir": "./src",
12
+ "strict": true,
13
+ "esModuleInterop": true,
14
+ "skipLibCheck": true,
15
+ "forceConsistentCasingInFileNames": true,
16
+ "resolveJsonModule": true,
17
+ "isolatedModules": true,
18
+ "noUnusedLocals": true,
19
+ "noUnusedParameters": true,
20
+ "noImplicitReturns": true,
21
+ "noFallthroughCasesInSwitch": true
22
+ },
23
+ "include": ["src/**/*"],
24
+ "exclude": ["node_modules", "dist", "**/*.test.ts"]
25
+ }