@yamo/memory-mesh 2.3.2 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/memory_mesh.js +1 -1
- package/lib/llm/client.d.ts +111 -0
- package/lib/llm/client.js +299 -357
- package/lib/llm/client.ts +413 -0
- package/lib/llm/index.d.ts +17 -0
- package/lib/llm/index.js +15 -8
- package/lib/llm/index.ts +19 -0
- package/lib/memory/adapters/client.d.ts +183 -0
- package/lib/memory/adapters/client.js +518 -0
- package/lib/memory/adapters/client.ts +678 -0
- package/lib/memory/adapters/config.d.ts +137 -0
- package/lib/memory/adapters/config.js +189 -0
- package/lib/memory/adapters/config.ts +259 -0
- package/lib/memory/adapters/errors.d.ts +76 -0
- package/lib/memory/adapters/errors.js +128 -0
- package/lib/memory/adapters/errors.ts +166 -0
- package/lib/memory/context-manager.d.ts +44 -0
- package/lib/memory/context-manager.js +344 -0
- package/lib/memory/context-manager.ts +432 -0
- package/lib/memory/embeddings/factory.d.ts +59 -0
- package/lib/memory/embeddings/factory.js +148 -0
- package/lib/{embeddings/factory.js → memory/embeddings/factory.ts} +69 -28
- package/lib/memory/embeddings/index.d.ts +2 -0
- package/lib/memory/embeddings/index.js +2 -0
- package/lib/memory/embeddings/index.ts +2 -0
- package/lib/memory/embeddings/service.d.ts +164 -0
- package/lib/memory/embeddings/service.js +515 -0
- package/lib/{embeddings/service.js → memory/embeddings/service.ts} +223 -156
- package/lib/memory/index.d.ts +9 -0
- package/lib/memory/index.js +9 -1
- package/lib/memory/index.ts +20 -0
- package/lib/memory/memory-mesh.d.ts +274 -0
- package/lib/memory/memory-mesh.js +1469 -678
- package/lib/memory/memory-mesh.ts +1803 -0
- package/lib/memory/memory-translator.d.ts +19 -0
- package/lib/memory/memory-translator.js +125 -0
- package/lib/memory/memory-translator.ts +158 -0
- package/lib/memory/schema.d.ts +111 -0
- package/lib/memory/schema.js +183 -0
- package/lib/memory/schema.ts +267 -0
- package/lib/memory/scorer.d.ts +26 -0
- package/lib/memory/scorer.js +77 -0
- package/lib/memory/scorer.ts +95 -0
- package/lib/memory/search/index.d.ts +1 -0
- package/lib/memory/search/index.js +1 -0
- package/lib/memory/search/index.ts +1 -0
- package/lib/memory/search/keyword-search.d.ts +62 -0
- package/lib/memory/search/keyword-search.js +135 -0
- package/lib/{search/keyword-search.js → memory/search/keyword-search.ts} +66 -36
- package/lib/scrubber/config/defaults.d.ts +53 -0
- package/lib/scrubber/config/defaults.js +49 -57
- package/lib/scrubber/config/defaults.ts +117 -0
- package/lib/scrubber/index.d.ts +6 -0
- package/lib/scrubber/index.js +3 -23
- package/lib/scrubber/index.ts +7 -0
- package/lib/scrubber/scrubber.d.ts +61 -0
- package/lib/scrubber/scrubber.js +99 -121
- package/lib/scrubber/scrubber.ts +168 -0
- package/lib/scrubber/stages/chunker.d.ts +13 -0
- package/lib/scrubber/stages/metadata-annotator.d.ts +18 -0
- package/lib/scrubber/stages/normalizer.d.ts +13 -0
- package/lib/scrubber/stages/semantic-filter.d.ts +13 -0
- package/lib/scrubber/stages/structural-cleaner.d.ts +13 -0
- package/lib/scrubber/stages/validator.d.ts +18 -0
- package/lib/scrubber/telemetry.d.ts +36 -0
- package/lib/scrubber/telemetry.js +53 -58
- package/lib/scrubber/telemetry.ts +99 -0
- package/lib/utils/logger.d.ts +29 -0
- package/lib/utils/logger.js +64 -0
- package/lib/utils/logger.ts +85 -0
- package/lib/utils/skill-metadata.d.ts +32 -0
- package/lib/utils/skill-metadata.js +132 -0
- package/lib/utils/skill-metadata.ts +147 -0
- package/lib/yamo/emitter.d.ts +73 -0
- package/lib/yamo/emitter.js +78 -143
- package/lib/yamo/emitter.ts +249 -0
- package/lib/yamo/schema.d.ts +58 -0
- package/lib/yamo/schema.js +81 -108
- package/lib/yamo/schema.ts +165 -0
- package/package.json +11 -8
- package/index.d.ts +0 -111
- package/lib/embeddings/index.js +0 -2
- package/lib/index.js +0 -6
- package/lib/lancedb/client.js +0 -633
- package/lib/lancedb/config.js +0 -215
- package/lib/lancedb/errors.js +0 -144
- package/lib/lancedb/index.js +0 -4
- package/lib/lancedb/schema.js +0 -217
- package/lib/scrubber/errors/scrubber-error.js +0 -43
- package/lib/scrubber/stages/chunker.js +0 -103
- package/lib/scrubber/stages/metadata-annotator.js +0 -74
- package/lib/scrubber/stages/normalizer.js +0 -59
- package/lib/scrubber/stages/semantic-filter.js +0 -61
- package/lib/scrubber/stages/structural-cleaner.js +0 -82
- package/lib/scrubber/stages/validator.js +0 -66
- package/lib/scrubber/utils/hash.js +0 -39
- package/lib/scrubber/utils/html-parser.js +0 -45
- package/lib/scrubber/utils/pattern-matcher.js +0 -63
- package/lib/scrubber/utils/token-counter.js +0 -31
- package/lib/search/index.js +0 -1
- package/lib/utils/index.js +0 -1
- package/lib/yamo/index.js +0 -15
|
@@ -3,40 +3,64 @@
|
|
|
3
3
|
* Provides basic TF-IDF style retrieval to complement vector search
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
export interface KeywordDoc {
|
|
7
|
+
content: string;
|
|
8
|
+
metadata?: any;
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export interface KeywordSearchResult extends KeywordDoc {
|
|
12
|
+
id: string;
|
|
13
|
+
score: number;
|
|
14
|
+
matches: string[];
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export interface SearchOptions {
|
|
18
|
+
limit?: number;
|
|
19
|
+
}
|
|
20
|
+
|
|
6
21
|
export class KeywordSearch {
|
|
22
|
+
index: Map<string, Map<string, number>>; // token -> Map<docId, tf>
|
|
23
|
+
docLengths: Map<string, number>; // docId -> length
|
|
24
|
+
idf: Map<string, number>; // token -> idf value
|
|
25
|
+
docs: Map<string, KeywordDoc>; // docId -> content (optional, for snippet)
|
|
26
|
+
isDirty: boolean;
|
|
27
|
+
|
|
7
28
|
constructor() {
|
|
8
|
-
this.index = new Map();
|
|
9
|
-
this.docLengths = new Map();
|
|
10
|
-
this.idf = new Map();
|
|
11
|
-
this.docs = new Map();
|
|
29
|
+
this.index = new Map();
|
|
30
|
+
this.docLengths = new Map();
|
|
31
|
+
this.idf = new Map();
|
|
32
|
+
this.docs = new Map();
|
|
12
33
|
this.isDirty = false;
|
|
13
34
|
}
|
|
14
35
|
|
|
15
36
|
/**
|
|
16
37
|
* Tokenize text into normalized terms
|
|
17
|
-
* @param {string} text
|
|
38
|
+
* @param {string} text
|
|
18
39
|
* @returns {string[]} tokens
|
|
19
40
|
*/
|
|
20
|
-
tokenize(text) {
|
|
21
|
-
if (!text)
|
|
22
|
-
|
|
23
|
-
|
|
41
|
+
tokenize(text: string): string[] {
|
|
42
|
+
if (!text) {
|
|
43
|
+
return [];
|
|
44
|
+
}
|
|
45
|
+
return text
|
|
46
|
+
.toLowerCase()
|
|
47
|
+
.replace(/[^\w\s]/g, "") // Remove punctuation
|
|
24
48
|
.split(/\s+/)
|
|
25
|
-
.filter(t => t.length > 2) // Filter stopwords/short
|
|
26
|
-
.map(t => t.substring(0, 20)); // Truncate
|
|
49
|
+
.filter((t) => t.length > 2) // Filter stopwords/short
|
|
50
|
+
.map((t) => t.substring(0, 20)); // Truncate
|
|
27
51
|
}
|
|
28
52
|
|
|
29
53
|
/**
|
|
30
54
|
* Add a document to the index
|
|
31
|
-
* @param {string} id
|
|
32
|
-
* @param {string} content
|
|
55
|
+
* @param {string} id
|
|
56
|
+
* @param {string} content
|
|
33
57
|
* @param {Object} [metadata]
|
|
34
58
|
*/
|
|
35
|
-
add(id, content, metadata = {}) {
|
|
59
|
+
add(id: string, content: string, metadata: any = {}): void {
|
|
36
60
|
const tokens = this.tokenize(content);
|
|
37
|
-
const termFreqs = new Map();
|
|
61
|
+
const termFreqs = new Map<string, number>();
|
|
38
62
|
|
|
39
|
-
tokens.forEach(t => {
|
|
63
|
+
tokens.forEach((t) => {
|
|
40
64
|
termFreqs.set(t, (termFreqs.get(t) || 0) + 1);
|
|
41
65
|
});
|
|
42
66
|
|
|
@@ -48,7 +72,7 @@ export class KeywordSearch {
|
|
|
48
72
|
if (!this.index.has(token)) {
|
|
49
73
|
this.index.set(token, new Map());
|
|
50
74
|
}
|
|
51
|
-
this.index.get(token)
|
|
75
|
+
this.index.get(token)!.set(id, freq);
|
|
52
76
|
}
|
|
53
77
|
|
|
54
78
|
this.isDirty = true;
|
|
@@ -56,12 +80,12 @@ export class KeywordSearch {
|
|
|
56
80
|
|
|
57
81
|
/**
|
|
58
82
|
* Remove a document
|
|
59
|
-
* @param {string} id
|
|
83
|
+
* @param {string} id
|
|
60
84
|
*/
|
|
61
|
-
remove(id) {
|
|
85
|
+
remove(id: string): void {
|
|
62
86
|
this.docLengths.delete(id);
|
|
63
87
|
this.docs.delete(id);
|
|
64
|
-
|
|
88
|
+
|
|
65
89
|
// This is expensive O(Vocab), but okay for small scale
|
|
66
90
|
for (const docMap of this.index.values()) {
|
|
67
91
|
docMap.delete(id);
|
|
@@ -72,9 +96,11 @@ export class KeywordSearch {
|
|
|
72
96
|
/**
|
|
73
97
|
* Recalculate IDF scores
|
|
74
98
|
*/
|
|
75
|
-
_computeStats() {
|
|
76
|
-
if (!this.isDirty)
|
|
77
|
-
|
|
99
|
+
_computeStats(): void {
|
|
100
|
+
if (!this.isDirty) {
|
|
101
|
+
return;
|
|
102
|
+
}
|
|
103
|
+
|
|
78
104
|
const N = this.docLengths.size;
|
|
79
105
|
this.idf.clear();
|
|
80
106
|
|
|
@@ -94,18 +120,20 @@ export class KeywordSearch {
|
|
|
94
120
|
* @param {Object} options
|
|
95
121
|
* @returns {Array<{id: string, score: number, matches: string[], content: string, metadata: Object}>}
|
|
96
122
|
*/
|
|
97
|
-
search(query, options = {}) {
|
|
123
|
+
search(query: string, options: SearchOptions = {}): KeywordSearchResult[] {
|
|
98
124
|
this._computeStats();
|
|
99
|
-
|
|
125
|
+
|
|
100
126
|
const tokens = this.tokenize(query);
|
|
101
|
-
const scores = new Map(); // docId -> score
|
|
102
|
-
const matches = new Map(); // docId -> matched tokens
|
|
127
|
+
const scores = new Map<string, number>(); // docId -> score
|
|
128
|
+
const matches = new Map<string, string[]>(); // docId -> matched tokens
|
|
103
129
|
|
|
104
130
|
const limit = options.limit || 10;
|
|
105
131
|
|
|
106
132
|
for (const token of tokens) {
|
|
107
133
|
const docMap = this.index.get(token);
|
|
108
|
-
if (!docMap)
|
|
134
|
+
if (!docMap) {
|
|
135
|
+
continue;
|
|
136
|
+
}
|
|
109
137
|
|
|
110
138
|
const idf = this.idf.get(token) || 0;
|
|
111
139
|
|
|
@@ -114,11 +142,13 @@ export class KeywordSearch {
|
|
|
114
142
|
// Score = tf * idf * (normalization?)
|
|
115
143
|
// Simple variant:
|
|
116
144
|
const score = tf * idf;
|
|
117
|
-
|
|
145
|
+
|
|
118
146
|
scores.set(docId, (scores.get(docId) || 0) + score);
|
|
119
|
-
|
|
120
|
-
if (!matches.has(docId))
|
|
121
|
-
|
|
147
|
+
|
|
148
|
+
if (!matches.has(docId)) {
|
|
149
|
+
matches.set(docId, []);
|
|
150
|
+
}
|
|
151
|
+
matches.get(docId)!.push(token);
|
|
122
152
|
}
|
|
123
153
|
}
|
|
124
154
|
|
|
@@ -128,7 +158,7 @@ export class KeywordSearch {
|
|
|
128
158
|
id,
|
|
129
159
|
score,
|
|
130
160
|
matches: matches.get(id) || [],
|
|
131
|
-
...this.docs.get(id)
|
|
161
|
+
...this.docs.get(id)!,
|
|
132
162
|
}))
|
|
133
163
|
.sort((a, b) => b.score - a.score)
|
|
134
164
|
.slice(0, limit);
|
|
@@ -136,9 +166,9 @@ export class KeywordSearch {
|
|
|
136
166
|
|
|
137
167
|
/**
|
|
138
168
|
* Bulk load records
|
|
139
|
-
* @param {Array} records
|
|
169
|
+
* @param {Array} records
|
|
140
170
|
*/
|
|
141
|
-
load(records) {
|
|
142
|
-
records.forEach(r => this.add(r.id, r.content, r.metadata));
|
|
171
|
+
load(records: { id: string; content: string; metadata?: any }[]): void {
|
|
172
|
+
records.forEach((r) => this.add(r.id, r.content, r.metadata));
|
|
143
173
|
}
|
|
144
174
|
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* S-MORA Layer 0 Scrubber Default Configuration
|
|
3
|
+
* @module smora/scrubber/config/defaults
|
|
4
|
+
*/
|
|
5
|
+
export interface StructuralConfig {
|
|
6
|
+
stripHTML: boolean;
|
|
7
|
+
normalizeMarkdown: boolean;
|
|
8
|
+
collapseWhitespace: boolean;
|
|
9
|
+
removeScripts: boolean;
|
|
10
|
+
removeStyles: boolean;
|
|
11
|
+
}
|
|
12
|
+
export interface SemanticConfig {
|
|
13
|
+
removeDuplicates: boolean;
|
|
14
|
+
removeBoilerplate: boolean;
|
|
15
|
+
minSignalRatio: number;
|
|
16
|
+
boilerplatePatterns: string;
|
|
17
|
+
}
|
|
18
|
+
export interface NormalizationConfig {
|
|
19
|
+
normalizeHeadings: boolean;
|
|
20
|
+
normalizeLists: boolean;
|
|
21
|
+
normalizePunctuation: boolean;
|
|
22
|
+
}
|
|
23
|
+
export interface ChunkingConfig {
|
|
24
|
+
maxTokens: number;
|
|
25
|
+
minTokens: number;
|
|
26
|
+
hardMaxTokens: number;
|
|
27
|
+
splitOnHeadings: boolean;
|
|
28
|
+
preserveContext: boolean;
|
|
29
|
+
}
|
|
30
|
+
export interface MetadataConfig {
|
|
31
|
+
addSource: boolean;
|
|
32
|
+
addSection: boolean;
|
|
33
|
+
addHeadingPath: boolean;
|
|
34
|
+
addTimestamp: boolean;
|
|
35
|
+
addHash: boolean;
|
|
36
|
+
}
|
|
37
|
+
export interface ValidationConfig {
|
|
38
|
+
enforceMinLength: boolean;
|
|
39
|
+
enforceMaxLength: boolean;
|
|
40
|
+
rejectEmptyChunks: boolean;
|
|
41
|
+
}
|
|
42
|
+
export interface ScrubberConfig {
|
|
43
|
+
enabled: boolean;
|
|
44
|
+
structural: StructuralConfig;
|
|
45
|
+
semantic: SemanticConfig;
|
|
46
|
+
normalization: NormalizationConfig;
|
|
47
|
+
chunking: ChunkingConfig;
|
|
48
|
+
metadata: MetadataConfig;
|
|
49
|
+
validation: ValidationConfig;
|
|
50
|
+
logTransformations: boolean;
|
|
51
|
+
cachePatterns: boolean;
|
|
52
|
+
}
|
|
53
|
+
export declare const defaultScrubberConfig: ScrubberConfig;
|
|
@@ -2,61 +2,53 @@
|
|
|
2
2
|
* S-MORA Layer 0 Scrubber Default Configuration
|
|
3
3
|
* @module smora/scrubber/config/defaults
|
|
4
4
|
*/
|
|
5
|
-
|
|
6
5
|
export const defaultScrubberConfig = {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
rejectEmptyChunks: true
|
|
57
|
-
},
|
|
58
|
-
|
|
59
|
-
// Performance
|
|
60
|
-
logTransformations: false,
|
|
61
|
-
cachePatterns: true
|
|
62
|
-
};
|
|
6
|
+
// Master switch - enabled by default for security (PII/sensitive data protection)
|
|
7
|
+
enabled: true,
|
|
8
|
+
// Stage 1: Structural Cleaning
|
|
9
|
+
structural: {
|
|
10
|
+
stripHTML: true,
|
|
11
|
+
normalizeMarkdown: true,
|
|
12
|
+
collapseWhitespace: true,
|
|
13
|
+
removeScripts: true,
|
|
14
|
+
removeStyles: true,
|
|
15
|
+
},
|
|
16
|
+
// Stage 2: Semantic Filtering
|
|
17
|
+
semantic: {
|
|
18
|
+
removeDuplicates: true,
|
|
19
|
+
removeBoilerplate: true,
|
|
20
|
+
minSignalRatio: 0.3,
|
|
21
|
+
boilerplatePatterns: "default",
|
|
22
|
+
},
|
|
23
|
+
// Stage 3: Normalization
|
|
24
|
+
normalization: {
|
|
25
|
+
normalizeHeadings: true,
|
|
26
|
+
normalizeLists: true,
|
|
27
|
+
normalizePunctuation: true,
|
|
28
|
+
},
|
|
29
|
+
// Stage 4: Chunking
|
|
30
|
+
chunking: {
|
|
31
|
+
maxTokens: 500,
|
|
32
|
+
minTokens: 10,
|
|
33
|
+
hardMaxTokens: 2000,
|
|
34
|
+
splitOnHeadings: true,
|
|
35
|
+
preserveContext: true,
|
|
36
|
+
},
|
|
37
|
+
// Stage 5: Metadata Annotation
|
|
38
|
+
metadata: {
|
|
39
|
+
addSource: true,
|
|
40
|
+
addSection: true,
|
|
41
|
+
addHeadingPath: true,
|
|
42
|
+
addTimestamp: true,
|
|
43
|
+
addHash: true,
|
|
44
|
+
},
|
|
45
|
+
// Stage 6: Validation
|
|
46
|
+
validation: {
|
|
47
|
+
enforceMinLength: true,
|
|
48
|
+
enforceMaxLength: true,
|
|
49
|
+
rejectEmptyChunks: true,
|
|
50
|
+
},
|
|
51
|
+
// Performance
|
|
52
|
+
logTransformations: false,
|
|
53
|
+
cachePatterns: true,
|
|
54
|
+
};
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* S-MORA Layer 0 Scrubber Default Configuration
|
|
3
|
+
* @module smora/scrubber/config/defaults
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export interface StructuralConfig {
|
|
7
|
+
stripHTML: boolean;
|
|
8
|
+
normalizeMarkdown: boolean;
|
|
9
|
+
collapseWhitespace: boolean;
|
|
10
|
+
removeScripts: boolean;
|
|
11
|
+
removeStyles: boolean;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
export interface SemanticConfig {
|
|
15
|
+
removeDuplicates: boolean;
|
|
16
|
+
removeBoilerplate: boolean;
|
|
17
|
+
minSignalRatio: number;
|
|
18
|
+
boilerplatePatterns: string;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
export interface NormalizationConfig {
|
|
22
|
+
normalizeHeadings: boolean;
|
|
23
|
+
normalizeLists: boolean;
|
|
24
|
+
normalizePunctuation: boolean;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface ChunkingConfig {
|
|
28
|
+
maxTokens: number;
|
|
29
|
+
minTokens: number;
|
|
30
|
+
hardMaxTokens: number;
|
|
31
|
+
splitOnHeadings: boolean;
|
|
32
|
+
preserveContext: boolean;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
export interface MetadataConfig {
|
|
36
|
+
addSource: boolean;
|
|
37
|
+
addSection: boolean;
|
|
38
|
+
addHeadingPath: boolean;
|
|
39
|
+
addTimestamp: boolean;
|
|
40
|
+
addHash: boolean;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
export interface ValidationConfig {
|
|
44
|
+
enforceMinLength: boolean;
|
|
45
|
+
enforceMaxLength: boolean;
|
|
46
|
+
rejectEmptyChunks: boolean;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export interface ScrubberConfig {
|
|
50
|
+
enabled: boolean;
|
|
51
|
+
structural: StructuralConfig;
|
|
52
|
+
semantic: SemanticConfig;
|
|
53
|
+
normalization: NormalizationConfig;
|
|
54
|
+
chunking: ChunkingConfig;
|
|
55
|
+
metadata: MetadataConfig;
|
|
56
|
+
validation: ValidationConfig;
|
|
57
|
+
logTransformations: boolean;
|
|
58
|
+
cachePatterns: boolean;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
export const defaultScrubberConfig: ScrubberConfig = {
|
|
62
|
+
// Master switch - enabled by default for security (PII/sensitive data protection)
|
|
63
|
+
enabled: true,
|
|
64
|
+
|
|
65
|
+
// Stage 1: Structural Cleaning
|
|
66
|
+
structural: {
|
|
67
|
+
stripHTML: true,
|
|
68
|
+
normalizeMarkdown: true,
|
|
69
|
+
collapseWhitespace: true,
|
|
70
|
+
removeScripts: true,
|
|
71
|
+
removeStyles: true,
|
|
72
|
+
},
|
|
73
|
+
|
|
74
|
+
// Stage 2: Semantic Filtering
|
|
75
|
+
semantic: {
|
|
76
|
+
removeDuplicates: true,
|
|
77
|
+
removeBoilerplate: true,
|
|
78
|
+
minSignalRatio: 0.3,
|
|
79
|
+
boilerplatePatterns: "default",
|
|
80
|
+
},
|
|
81
|
+
|
|
82
|
+
// Stage 3: Normalization
|
|
83
|
+
normalization: {
|
|
84
|
+
normalizeHeadings: true,
|
|
85
|
+
normalizeLists: true,
|
|
86
|
+
normalizePunctuation: true,
|
|
87
|
+
},
|
|
88
|
+
|
|
89
|
+
// Stage 4: Chunking
|
|
90
|
+
chunking: {
|
|
91
|
+
maxTokens: 500,
|
|
92
|
+
minTokens: 10,
|
|
93
|
+
hardMaxTokens: 2000,
|
|
94
|
+
splitOnHeadings: true,
|
|
95
|
+
preserveContext: true,
|
|
96
|
+
},
|
|
97
|
+
|
|
98
|
+
// Stage 5: Metadata Annotation
|
|
99
|
+
metadata: {
|
|
100
|
+
addSource: true,
|
|
101
|
+
addSection: true,
|
|
102
|
+
addHeadingPath: true,
|
|
103
|
+
addTimestamp: true,
|
|
104
|
+
addHash: true,
|
|
105
|
+
},
|
|
106
|
+
|
|
107
|
+
// Stage 6: Validation
|
|
108
|
+
validation: {
|
|
109
|
+
enforceMinLength: true,
|
|
110
|
+
enforceMaxLength: true,
|
|
111
|
+
rejectEmptyChunks: true,
|
|
112
|
+
},
|
|
113
|
+
|
|
114
|
+
// Performance
|
|
115
|
+
logTransformations: false,
|
|
116
|
+
cachePatterns: true,
|
|
117
|
+
};
|
package/lib/scrubber/index.js
CHANGED
|
@@ -1,25 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
4
|
-
* @module smora/scrubber
|
|
2
|
+
* YAMO Scrubber Module
|
|
3
|
+
* PII and sensitive data sanitization
|
|
5
4
|
*/
|
|
6
|
-
|
|
7
|
-
export { defaultScrubberConfig } from './config/defaults.js';
|
|
8
|
-
export {
|
|
9
|
-
ScrubberError,
|
|
10
|
-
StructuralCleaningError,
|
|
11
|
-
ChunkingError,
|
|
12
|
-
ValidationError
|
|
13
|
-
} from './errors/scrubber-error.js';
|
|
14
|
-
export { ScrubberTelemetry } from './telemetry.js';
|
|
15
|
-
export { Scrubber } from './scrubber.js';
|
|
16
|
-
export { HashUtil } from './utils/hash.js';
|
|
17
|
-
export { TokenCounter } from './utils/token-counter.js';
|
|
18
|
-
export { PatternMatcher } from './utils/pattern-matcher.js';
|
|
19
|
-
export { HTMLParser } from './utils/html-parser.js';
|
|
20
|
-
export { StructuralCleaner } from './stages/structural-cleaner.js';
|
|
21
|
-
export { SemanticFilter } from './stages/semantic-filter.js';
|
|
22
|
-
export { Normalizer } from './stages/normalizer.js';
|
|
23
|
-
export { Chunker } from './stages/chunker.js';
|
|
24
|
-
export { MetadataAnnotator } from './stages/metadata-annotator.js';
|
|
25
|
-
export { Validator } from './stages/validator.js';
|
|
5
|
+
export { Scrubber } from "./scrubber.js";
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Main Orchestrator
|
|
3
|
+
* @module smora/scrubber/scrubber
|
|
4
|
+
*/
|
|
5
|
+
import { StructuralCleaner } from "./stages/structural-cleaner.js";
|
|
6
|
+
import { SemanticFilter } from "./stages/semantic-filter.js";
|
|
7
|
+
import { Normalizer } from "./stages/normalizer.js";
|
|
8
|
+
import { Chunker } from "./stages/chunker.js";
|
|
9
|
+
import { MetadataAnnotator } from "./stages/metadata-annotator.js";
|
|
10
|
+
import { Validator } from "./stages/validator.js";
|
|
11
|
+
import { ScrubberTelemetry, TelemetrySummary, StageSummary } from "./telemetry.js";
|
|
12
|
+
import { ScrubberConfig } from "./config/defaults.js";
|
|
13
|
+
export interface ScrubberDocument {
|
|
14
|
+
content: string;
|
|
15
|
+
source: string;
|
|
16
|
+
type: string;
|
|
17
|
+
}
|
|
18
|
+
export interface Chunk {
|
|
19
|
+
text: string;
|
|
20
|
+
[key: string]: any;
|
|
21
|
+
}
|
|
22
|
+
export interface ScrubberResult {
|
|
23
|
+
chunks: Chunk[];
|
|
24
|
+
metadata: {
|
|
25
|
+
source: string;
|
|
26
|
+
type: string;
|
|
27
|
+
processingTimestamp: string;
|
|
28
|
+
[key: string]: any;
|
|
29
|
+
};
|
|
30
|
+
telemetry: Partial<Record<string, StageSummary>> & {
|
|
31
|
+
totalDuration?: number;
|
|
32
|
+
};
|
|
33
|
+
success?: boolean;
|
|
34
|
+
error?: string;
|
|
35
|
+
}
|
|
36
|
+
export declare class Scrubber {
|
|
37
|
+
config: ScrubberConfig;
|
|
38
|
+
stages: any;
|
|
39
|
+
telemetry: ScrubberTelemetry;
|
|
40
|
+
constructor(config?: Partial<ScrubberConfig>);
|
|
41
|
+
/**
|
|
42
|
+
* Main entry point - process a raw document
|
|
43
|
+
* @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
|
|
44
|
+
* @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
|
|
45
|
+
*/
|
|
46
|
+
process(document: ScrubberDocument): Promise<ScrubberResult>;
|
|
47
|
+
_executeStage<T>(stageName: string, stageFn: () => Promise<T> | T): Promise<T>;
|
|
48
|
+
_initializeStages(): {
|
|
49
|
+
structural: StructuralCleaner;
|
|
50
|
+
semantic: SemanticFilter;
|
|
51
|
+
normalizer: Normalizer;
|
|
52
|
+
chunker: Chunker;
|
|
53
|
+
metadata: MetadataAnnotator;
|
|
54
|
+
validator: Validator;
|
|
55
|
+
};
|
|
56
|
+
getMetrics(): TelemetrySummary;
|
|
57
|
+
healthCheck(): Promise<{
|
|
58
|
+
status: string;
|
|
59
|
+
}>;
|
|
60
|
+
}
|
|
61
|
+
export default Scrubber;
|