@yamo/memory-mesh 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -2
- package/lib/llm/client.d.ts +23 -48
- package/lib/llm/client.js +1 -0
- package/lib/llm/client.ts +298 -377
- package/lib/llm/index.js +1 -0
- package/lib/llm/index.ts +1 -2
- package/lib/memory/adapters/client.d.ts +22 -85
- package/lib/memory/adapters/client.js +1 -0
- package/lib/memory/adapters/client.ts +474 -633
- package/lib/memory/adapters/config.d.ts +82 -89
- package/lib/memory/adapters/config.js +1 -0
- package/lib/memory/adapters/config.ts +156 -225
- package/lib/memory/adapters/errors.d.ts +28 -20
- package/lib/memory/adapters/errors.js +1 -0
- package/lib/memory/adapters/errors.ts +83 -120
- package/lib/memory/context-manager.d.ts +15 -18
- package/lib/memory/context-manager.js +1 -0
- package/lib/memory/context-manager.ts +314 -401
- package/lib/memory/embeddings/factory.d.ts +18 -20
- package/lib/memory/embeddings/factory.js +1 -0
- package/lib/memory/embeddings/factory.ts +130 -173
- package/lib/memory/embeddings/index.js +1 -0
- package/lib/memory/embeddings/index.ts +1 -0
- package/lib/memory/embeddings/service.d.ts +36 -66
- package/lib/memory/embeddings/service.js +1 -0
- package/lib/memory/embeddings/service.ts +479 -616
- package/lib/memory/index.d.ts +2 -2
- package/lib/memory/index.js +1 -0
- package/lib/memory/index.ts +3 -13
- package/lib/memory/memory-mesh.d.ts +151 -93
- package/lib/memory/memory-mesh.js +1 -0
- package/lib/memory/memory-mesh.ts +1406 -1692
- package/lib/memory/memory-translator.d.ts +1 -6
- package/lib/memory/memory-translator.js +1 -0
- package/lib/memory/memory-translator.ts +96 -128
- package/lib/memory/schema.d.ts +29 -10
- package/lib/memory/schema.js +1 -0
- package/lib/memory/schema.ts +102 -185
- package/lib/memory/scorer.d.ts +3 -4
- package/lib/memory/scorer.js +1 -0
- package/lib/memory/scorer.ts +69 -86
- package/lib/memory/search/index.js +1 -0
- package/lib/memory/search/index.ts +1 -0
- package/lib/memory/search/keyword-search.d.ts +10 -26
- package/lib/memory/search/keyword-search.js +1 -0
- package/lib/memory/search/keyword-search.ts +123 -161
- package/lib/scrubber/config/defaults.d.ts +39 -46
- package/lib/scrubber/config/defaults.js +1 -0
- package/lib/scrubber/config/defaults.ts +50 -112
- package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
- package/lib/scrubber/errors/scrubber-error.js +39 -0
- package/lib/scrubber/errors/scrubber-error.ts +44 -0
- package/lib/scrubber/index.d.ts +0 -1
- package/lib/scrubber/index.js +1 -0
- package/lib/scrubber/index.ts +1 -2
- package/lib/scrubber/scrubber.d.ts +14 -31
- package/lib/scrubber/scrubber.js +1 -0
- package/lib/scrubber/scrubber.ts +93 -152
- package/lib/scrubber/stages/chunker.d.ts +22 -10
- package/lib/scrubber/stages/chunker.js +86 -0
- package/lib/scrubber/stages/chunker.ts +104 -0
- package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
- package/lib/scrubber/stages/metadata-annotator.js +64 -0
- package/lib/scrubber/stages/metadata-annotator.ts +75 -0
- package/lib/scrubber/stages/normalizer.d.ts +13 -10
- package/lib/scrubber/stages/normalizer.js +51 -0
- package/lib/scrubber/stages/normalizer.ts +60 -0
- package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
- package/lib/scrubber/stages/semantic-filter.js +51 -0
- package/lib/scrubber/stages/semantic-filter.ts +62 -0
- package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
- package/lib/scrubber/stages/structural-cleaner.js +73 -0
- package/lib/scrubber/stages/structural-cleaner.ts +83 -0
- package/lib/scrubber/stages/validator.d.ts +14 -15
- package/lib/scrubber/stages/validator.js +56 -0
- package/lib/scrubber/stages/validator.ts +67 -0
- package/lib/scrubber/telemetry.d.ts +20 -27
- package/lib/scrubber/telemetry.js +1 -0
- package/lib/scrubber/telemetry.ts +53 -90
- package/lib/scrubber/utils/hash.d.ts +14 -0
- package/lib/scrubber/utils/hash.js +37 -0
- package/lib/scrubber/utils/hash.ts +40 -0
- package/lib/scrubber/utils/html-parser.d.ts +14 -0
- package/lib/scrubber/utils/html-parser.js +38 -0
- package/lib/scrubber/utils/html-parser.ts +46 -0
- package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
- package/lib/scrubber/utils/pattern-matcher.js +54 -0
- package/lib/scrubber/utils/pattern-matcher.ts +64 -0
- package/lib/scrubber/utils/token-counter.d.ts +18 -0
- package/lib/scrubber/utils/token-counter.js +30 -0
- package/lib/scrubber/utils/token-counter.ts +32 -0
- package/lib/utils/logger.d.ts +1 -11
- package/lib/utils/logger.js +1 -0
- package/lib/utils/logger.ts +43 -63
- package/lib/utils/skill-metadata.d.ts +6 -14
- package/lib/utils/skill-metadata.js +1 -0
- package/lib/utils/skill-metadata.ts +89 -103
- package/lib/yamo/emitter.d.ts +8 -35
- package/lib/yamo/emitter.js +1 -0
- package/lib/yamo/emitter.ts +77 -155
- package/lib/yamo/index.d.ts +14 -0
- package/lib/yamo/index.js +14 -0
- package/lib/yamo/index.ts +16 -0
- package/lib/yamo/schema.d.ts +8 -10
- package/lib/yamo/schema.js +1 -0
- package/lib/yamo/schema.ts +82 -114
- package/package.json +4 -2
|
@@ -1,174 +1,136 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
1
2
|
/**
|
|
2
3
|
* Simple Keyword Search Engine (In-Memory)
|
|
3
4
|
* Provides basic TF-IDF style retrieval to complement vector search
|
|
4
5
|
*/
|
|
5
|
-
|
|
6
|
-
export interface KeywordDoc {
|
|
7
|
-
content: string;
|
|
8
|
-
metadata?: any;
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
export interface KeywordSearchResult extends KeywordDoc {
|
|
12
|
-
id: string;
|
|
13
|
-
score: number;
|
|
14
|
-
matches: string[];
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
export interface SearchOptions {
|
|
18
|
-
limit?: number;
|
|
19
|
-
}
|
|
20
|
-
|
|
21
6
|
export class KeywordSearch {
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
this.isDirty = false;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
/**
|
|
37
|
-
* Tokenize text into normalized terms
|
|
38
|
-
* @param {string} text
|
|
39
|
-
* @returns {string[]} tokens
|
|
40
|
-
*/
|
|
41
|
-
tokenize(text: string): string[] {
|
|
42
|
-
if (!text) {
|
|
43
|
-
return [];
|
|
7
|
+
index; // token -> Map<docId, tf>
|
|
8
|
+
docLengths; // docId -> length
|
|
9
|
+
idf; // token -> idf value
|
|
10
|
+
docs; // docId -> content (optional, for snippet)
|
|
11
|
+
isDirty;
|
|
12
|
+
constructor() {
|
|
13
|
+
this.index = new Map();
|
|
14
|
+
this.docLengths = new Map();
|
|
15
|
+
this.idf = new Map();
|
|
16
|
+
this.docs = new Map();
|
|
17
|
+
this.isDirty = false;
|
|
44
18
|
}
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
const tokens = this.tokenize(content);
|
|
61
|
-
const termFreqs = new Map<string, number>();
|
|
62
|
-
|
|
63
|
-
tokens.forEach((t) => {
|
|
64
|
-
termFreqs.set(t, (termFreqs.get(t) || 0) + 1);
|
|
65
|
-
});
|
|
66
|
-
|
|
67
|
-
this.docLengths.set(id, tokens.length);
|
|
68
|
-
this.docs.set(id, { content, metadata });
|
|
69
|
-
|
|
70
|
-
// Update index
|
|
71
|
-
for (const [token, freq] of termFreqs.entries()) {
|
|
72
|
-
if (!this.index.has(token)) {
|
|
73
|
-
this.index.set(token, new Map());
|
|
74
|
-
}
|
|
75
|
-
this.index.get(token)!.set(id, freq);
|
|
19
|
+
/**
|
|
20
|
+
* Tokenize text into normalized terms
|
|
21
|
+
* @param {string} text
|
|
22
|
+
* @returns {string[]} tokens
|
|
23
|
+
*/
|
|
24
|
+
tokenize(text) {
|
|
25
|
+
if (!text) {
|
|
26
|
+
return [];
|
|
27
|
+
}
|
|
28
|
+
return text
|
|
29
|
+
.toLowerCase()
|
|
30
|
+
.replace(/[^\w\s]/g, "") // Remove punctuation
|
|
31
|
+
.split(/\s+/)
|
|
32
|
+
.filter((t) => t.length > 2) // Filter stopwords/short
|
|
33
|
+
.map((t) => t.substring(0, 20)); // Truncate
|
|
76
34
|
}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
35
|
+
/**
|
|
36
|
+
* Add a document to the index
|
|
37
|
+
* @param {string} id
|
|
38
|
+
* @param {string} content
|
|
39
|
+
* @param {Object} [metadata]
|
|
40
|
+
*/
|
|
41
|
+
add(id, content, metadata = {}) {
|
|
42
|
+
const tokens = this.tokenize(content);
|
|
43
|
+
const termFreqs = new Map();
|
|
44
|
+
tokens.forEach((t) => {
|
|
45
|
+
termFreqs.set(t, (termFreqs.get(t) || 0) + 1);
|
|
46
|
+
});
|
|
47
|
+
this.docLengths.set(id, tokens.length);
|
|
48
|
+
this.docs.set(id, { content, metadata });
|
|
49
|
+
// Update index
|
|
50
|
+
for (const [token, freq] of termFreqs.entries()) {
|
|
51
|
+
if (!this.index.has(token)) {
|
|
52
|
+
this.index.set(token, new Map());
|
|
53
|
+
}
|
|
54
|
+
this.index.get(token).set(id, freq);
|
|
55
|
+
}
|
|
56
|
+
this.isDirty = true;
|
|
92
57
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
58
|
+
/**
|
|
59
|
+
* Remove a document
|
|
60
|
+
* @param {string} id
|
|
61
|
+
*/
|
|
62
|
+
remove(id) {
|
|
63
|
+
this.docLengths.delete(id);
|
|
64
|
+
this.docs.delete(id);
|
|
65
|
+
// This is expensive O(Vocab), but okay for small scale
|
|
66
|
+
for (const docMap of this.index.values()) {
|
|
67
|
+
docMap.delete(id);
|
|
68
|
+
}
|
|
69
|
+
this.isDirty = true;
|
|
102
70
|
}
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
71
|
+
/**
|
|
72
|
+
* Recalculate IDF scores
|
|
73
|
+
*/
|
|
74
|
+
_computeStats() {
|
|
75
|
+
if (!this.isDirty) {
|
|
76
|
+
return;
|
|
77
|
+
}
|
|
78
|
+
const N = this.docLengths.size;
|
|
79
|
+
this.idf.clear();
|
|
80
|
+
for (const [token, docMap] of this.index.entries()) {
|
|
81
|
+
const df = docMap.size;
|
|
82
|
+
// Standard IDF: log(N / (df + 1)) + 1
|
|
83
|
+
const idf = Math.log(N / (df + 1)) + 1;
|
|
84
|
+
this.idf.set(token, idf);
|
|
85
|
+
}
|
|
86
|
+
this.isDirty = false;
|
|
112
87
|
}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
// Score = tf * idf * (normalization?)
|
|
143
|
-
// Simple variant:
|
|
144
|
-
const score = tf * idf;
|
|
145
|
-
|
|
146
|
-
scores.set(docId, (scores.get(docId) || 0) + score);
|
|
147
|
-
|
|
148
|
-
if (!matches.has(docId)) {
|
|
149
|
-
matches.set(docId, []);
|
|
88
|
+
/**
|
|
89
|
+
* Search for query terms
|
|
90
|
+
* @param {string} query
|
|
91
|
+
* @param {Object} options
|
|
92
|
+
* @returns {Array<{id: string, score: number, matches: string[], content: string, metadata: Object}>}
|
|
93
|
+
*/
|
|
94
|
+
search(query, options = {}) {
|
|
95
|
+
this._computeStats();
|
|
96
|
+
const tokens = this.tokenize(query);
|
|
97
|
+
const scores = new Map(); // docId -> score
|
|
98
|
+
const matches = new Map(); // docId -> matched tokens
|
|
99
|
+
const limit = options.limit || 10;
|
|
100
|
+
for (const token of tokens) {
|
|
101
|
+
const docMap = this.index.get(token);
|
|
102
|
+
if (!docMap) {
|
|
103
|
+
continue;
|
|
104
|
+
}
|
|
105
|
+
const idf = this.idf.get(token) || 0;
|
|
106
|
+
for (const [docId, tf] of docMap.entries()) {
|
|
107
|
+
// TF-IDF Score
|
|
108
|
+
// Score = tf * idf * (normalization?)
|
|
109
|
+
// Simple variant:
|
|
110
|
+
const score = tf * idf;
|
|
111
|
+
scores.set(docId, (scores.get(docId) || 0) + score);
|
|
112
|
+
if (!matches.has(docId)) {
|
|
113
|
+
matches.set(docId, []);
|
|
114
|
+
}
|
|
115
|
+
matches.get(docId).push(token);
|
|
116
|
+
}
|
|
150
117
|
}
|
|
151
|
-
|
|
152
|
-
|
|
118
|
+
// Convert to array and sort
|
|
119
|
+
return Array.from(scores.entries())
|
|
120
|
+
.map(([id, score]) => ({
|
|
121
|
+
id,
|
|
122
|
+
score,
|
|
123
|
+
matches: matches.get(id) || [],
|
|
124
|
+
...this.docs.get(id),
|
|
125
|
+
}))
|
|
126
|
+
.sort((a, b) => b.score - a.score)
|
|
127
|
+
.slice(0, limit);
|
|
128
|
+
}
|
|
129
|
+
/**
|
|
130
|
+
* Bulk load records
|
|
131
|
+
* @param {Array} records
|
|
132
|
+
*/
|
|
133
|
+
load(records) {
|
|
134
|
+
records.forEach((r) => this.add(r.id, r.content, r.metadata));
|
|
153
135
|
}
|
|
154
|
-
|
|
155
|
-
// Convert to array and sort
|
|
156
|
-
return Array.from(scores.entries())
|
|
157
|
-
.map(([id, score]) => ({
|
|
158
|
-
id,
|
|
159
|
-
score,
|
|
160
|
-
matches: matches.get(id) || [],
|
|
161
|
-
...this.docs.get(id)!,
|
|
162
|
-
}))
|
|
163
|
-
.sort((a, b) => b.score - a.score)
|
|
164
|
-
.slice(0, limit);
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
/**
|
|
168
|
-
* Bulk load records
|
|
169
|
-
* @param {Array} records
|
|
170
|
-
*/
|
|
171
|
-
load(records: { id: string; content: string; metadata?: any }[]): void {
|
|
172
|
-
records.forEach((r) => this.add(r.id, r.content, r.metadata));
|
|
173
|
-
}
|
|
174
136
|
}
|
|
@@ -2,52 +2,45 @@
|
|
|
2
2
|
* S-MORA Layer 0 Scrubber Default Configuration
|
|
3
3
|
* @module smora/scrubber/config/defaults
|
|
4
4
|
*/
|
|
5
|
-
export
|
|
6
|
-
stripHTML: boolean;
|
|
7
|
-
normalizeMarkdown: boolean;
|
|
8
|
-
collapseWhitespace: boolean;
|
|
9
|
-
removeScripts: boolean;
|
|
10
|
-
removeStyles: boolean;
|
|
11
|
-
}
|
|
12
|
-
export interface SemanticConfig {
|
|
13
|
-
removeDuplicates: boolean;
|
|
14
|
-
removeBoilerplate: boolean;
|
|
15
|
-
minSignalRatio: number;
|
|
16
|
-
boilerplatePatterns: string;
|
|
17
|
-
}
|
|
18
|
-
export interface NormalizationConfig {
|
|
19
|
-
normalizeHeadings: boolean;
|
|
20
|
-
normalizeLists: boolean;
|
|
21
|
-
normalizePunctuation: boolean;
|
|
22
|
-
}
|
|
23
|
-
export interface ChunkingConfig {
|
|
24
|
-
maxTokens: number;
|
|
25
|
-
minTokens: number;
|
|
26
|
-
hardMaxTokens: number;
|
|
27
|
-
splitOnHeadings: boolean;
|
|
28
|
-
preserveContext: boolean;
|
|
29
|
-
}
|
|
30
|
-
export interface MetadataConfig {
|
|
31
|
-
addSource: boolean;
|
|
32
|
-
addSection: boolean;
|
|
33
|
-
addHeadingPath: boolean;
|
|
34
|
-
addTimestamp: boolean;
|
|
35
|
-
addHash: boolean;
|
|
36
|
-
}
|
|
37
|
-
export interface ValidationConfig {
|
|
38
|
-
enforceMinLength: boolean;
|
|
39
|
-
enforceMaxLength: boolean;
|
|
40
|
-
rejectEmptyChunks: boolean;
|
|
41
|
-
}
|
|
42
|
-
export interface ScrubberConfig {
|
|
5
|
+
export declare const defaultScrubberConfig: {
|
|
43
6
|
enabled: boolean;
|
|
44
|
-
structural:
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
7
|
+
structural: {
|
|
8
|
+
stripHTML: boolean;
|
|
9
|
+
normalizeMarkdown: boolean;
|
|
10
|
+
collapseWhitespace: boolean;
|
|
11
|
+
removeScripts: boolean;
|
|
12
|
+
removeStyles: boolean;
|
|
13
|
+
};
|
|
14
|
+
semantic: {
|
|
15
|
+
removeDuplicates: boolean;
|
|
16
|
+
removeBoilerplate: boolean;
|
|
17
|
+
minSignalRatio: number;
|
|
18
|
+
boilerplatePatterns: string;
|
|
19
|
+
};
|
|
20
|
+
normalization: {
|
|
21
|
+
normalizeHeadings: boolean;
|
|
22
|
+
normalizeLists: boolean;
|
|
23
|
+
normalizePunctuation: boolean;
|
|
24
|
+
};
|
|
25
|
+
chunking: {
|
|
26
|
+
maxTokens: number;
|
|
27
|
+
minTokens: number;
|
|
28
|
+
hardMaxTokens: number;
|
|
29
|
+
splitOnHeadings: boolean;
|
|
30
|
+
preserveContext: boolean;
|
|
31
|
+
};
|
|
32
|
+
metadata: {
|
|
33
|
+
addSource: boolean;
|
|
34
|
+
addSection: boolean;
|
|
35
|
+
addHeadingPath: boolean;
|
|
36
|
+
addTimestamp: boolean;
|
|
37
|
+
addHash: boolean;
|
|
38
|
+
};
|
|
39
|
+
validation: {
|
|
40
|
+
enforceMinLength: boolean;
|
|
41
|
+
enforceMaxLength: boolean;
|
|
42
|
+
rejectEmptyChunks: boolean;
|
|
43
|
+
};
|
|
50
44
|
logTransformations: boolean;
|
|
51
45
|
cachePatterns: boolean;
|
|
52
|
-
}
|
|
53
|
-
export declare const defaultScrubberConfig: ScrubberConfig;
|
|
46
|
+
};
|
|
@@ -1,117 +1,55 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
1
2
|
/**
|
|
2
3
|
* S-MORA Layer 0 Scrubber Default Configuration
|
|
3
4
|
* @module smora/scrubber/config/defaults
|
|
4
5
|
*/
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
chunking: ChunkingConfig;
|
|
55
|
-
metadata: MetadataConfig;
|
|
56
|
-
validation: ValidationConfig;
|
|
57
|
-
logTransformations: boolean;
|
|
58
|
-
cachePatterns: boolean;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
export const defaultScrubberConfig: ScrubberConfig = {
|
|
62
|
-
// Master switch - enabled by default for security (PII/sensitive data protection)
|
|
63
|
-
enabled: true,
|
|
64
|
-
|
|
65
|
-
// Stage 1: Structural Cleaning
|
|
66
|
-
structural: {
|
|
67
|
-
stripHTML: true,
|
|
68
|
-
normalizeMarkdown: true,
|
|
69
|
-
collapseWhitespace: true,
|
|
70
|
-
removeScripts: true,
|
|
71
|
-
removeStyles: true,
|
|
72
|
-
},
|
|
73
|
-
|
|
74
|
-
// Stage 2: Semantic Filtering
|
|
75
|
-
semantic: {
|
|
76
|
-
removeDuplicates: true,
|
|
77
|
-
removeBoilerplate: true,
|
|
78
|
-
minSignalRatio: 0.3,
|
|
79
|
-
boilerplatePatterns: "default",
|
|
80
|
-
},
|
|
81
|
-
|
|
82
|
-
// Stage 3: Normalization
|
|
83
|
-
normalization: {
|
|
84
|
-
normalizeHeadings: true,
|
|
85
|
-
normalizeLists: true,
|
|
86
|
-
normalizePunctuation: true,
|
|
87
|
-
},
|
|
88
|
-
|
|
89
|
-
// Stage 4: Chunking
|
|
90
|
-
chunking: {
|
|
91
|
-
maxTokens: 500,
|
|
92
|
-
minTokens: 10,
|
|
93
|
-
hardMaxTokens: 2000,
|
|
94
|
-
splitOnHeadings: true,
|
|
95
|
-
preserveContext: true,
|
|
96
|
-
},
|
|
97
|
-
|
|
98
|
-
// Stage 5: Metadata Annotation
|
|
99
|
-
metadata: {
|
|
100
|
-
addSource: true,
|
|
101
|
-
addSection: true,
|
|
102
|
-
addHeadingPath: true,
|
|
103
|
-
addTimestamp: true,
|
|
104
|
-
addHash: true,
|
|
105
|
-
},
|
|
106
|
-
|
|
107
|
-
// Stage 6: Validation
|
|
108
|
-
validation: {
|
|
109
|
-
enforceMinLength: true,
|
|
110
|
-
enforceMaxLength: true,
|
|
111
|
-
rejectEmptyChunks: true,
|
|
112
|
-
},
|
|
113
|
-
|
|
114
|
-
// Performance
|
|
115
|
-
logTransformations: false,
|
|
116
|
-
cachePatterns: true,
|
|
6
|
+
export const defaultScrubberConfig = {
|
|
7
|
+
// Master switch - enabled by default for security (PII/sensitive data protection)
|
|
8
|
+
enabled: true,
|
|
9
|
+
// Stage 1: Structural Cleaning
|
|
10
|
+
structural: {
|
|
11
|
+
stripHTML: true,
|
|
12
|
+
normalizeMarkdown: true,
|
|
13
|
+
collapseWhitespace: true,
|
|
14
|
+
removeScripts: true,
|
|
15
|
+
removeStyles: true,
|
|
16
|
+
},
|
|
17
|
+
// Stage 2: Semantic Filtering
|
|
18
|
+
semantic: {
|
|
19
|
+
removeDuplicates: true,
|
|
20
|
+
removeBoilerplate: true,
|
|
21
|
+
minSignalRatio: 0.3,
|
|
22
|
+
boilerplatePatterns: "default",
|
|
23
|
+
},
|
|
24
|
+
// Stage 3: Normalization
|
|
25
|
+
normalization: {
|
|
26
|
+
normalizeHeadings: true,
|
|
27
|
+
normalizeLists: true,
|
|
28
|
+
normalizePunctuation: true,
|
|
29
|
+
},
|
|
30
|
+
// Stage 4: Chunking
|
|
31
|
+
chunking: {
|
|
32
|
+
maxTokens: 500,
|
|
33
|
+
minTokens: 10,
|
|
34
|
+
hardMaxTokens: 2000,
|
|
35
|
+
splitOnHeadings: true,
|
|
36
|
+
preserveContext: true,
|
|
37
|
+
},
|
|
38
|
+
// Stage 5: Metadata Annotation
|
|
39
|
+
metadata: {
|
|
40
|
+
addSource: true,
|
|
41
|
+
addSection: true,
|
|
42
|
+
addHeadingPath: true,
|
|
43
|
+
addTimestamp: true,
|
|
44
|
+
addHash: true,
|
|
45
|
+
},
|
|
46
|
+
// Stage 6: Validation
|
|
47
|
+
validation: {
|
|
48
|
+
enforceMinLength: true,
|
|
49
|
+
enforceMaxLength: true,
|
|
50
|
+
rejectEmptyChunks: true,
|
|
51
|
+
},
|
|
52
|
+
// Performance
|
|
53
|
+
logTransformations: false,
|
|
54
|
+
cachePatterns: true,
|
|
117
55
|
};
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* S-MORA Layer 0 Scrubber Error Classes
|
|
3
|
+
* @module smora/scrubber/errors/scrubber-error
|
|
4
|
+
*/
|
|
5
|
+
export declare class ScrubberError extends Error {
|
|
6
|
+
constructor(message: any, details?: {});
|
|
7
|
+
toJSON(): {
|
|
8
|
+
name: string;
|
|
9
|
+
message: string;
|
|
10
|
+
details: any;
|
|
11
|
+
timestamp: any;
|
|
12
|
+
};
|
|
13
|
+
}
|
|
14
|
+
export declare class StructuralCleaningError extends ScrubberError {
|
|
15
|
+
constructor(message: any, details?: {});
|
|
16
|
+
}
|
|
17
|
+
export declare class ChunkingError extends ScrubberError {
|
|
18
|
+
constructor(message: any, details?: {});
|
|
19
|
+
}
|
|
20
|
+
export declare class ValidationError extends ScrubberError {
|
|
21
|
+
constructor(message: any, details?: {});
|
|
22
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber Error Classes
|
|
4
|
+
* @module smora/scrubber/errors/scrubber-error
|
|
5
|
+
*/
|
|
6
|
+
export class ScrubberError extends Error {
|
|
7
|
+
constructor(message, details = {}) {
|
|
8
|
+
super(message);
|
|
9
|
+
this.name = 'ScrubberError';
|
|
10
|
+
this.details = details;
|
|
11
|
+
this.timestamp = new Date().toISOString();
|
|
12
|
+
}
|
|
13
|
+
toJSON() {
|
|
14
|
+
return {
|
|
15
|
+
name: this.name,
|
|
16
|
+
message: this.message,
|
|
17
|
+
details: this.details,
|
|
18
|
+
timestamp: this.timestamp
|
|
19
|
+
};
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
export class StructuralCleaningError extends ScrubberError {
|
|
23
|
+
constructor(message, details = {}) {
|
|
24
|
+
super(message, details);
|
|
25
|
+
this.name = 'StructuralCleaningError';
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
export class ChunkingError extends ScrubberError {
|
|
29
|
+
constructor(message, details = {}) {
|
|
30
|
+
super(message, details);
|
|
31
|
+
this.name = 'ChunkingError';
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
export class ValidationError extends ScrubberError {
|
|
35
|
+
constructor(message, details = {}) {
|
|
36
|
+
super(message, details);
|
|
37
|
+
this.name = 'ValidationError';
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber Error Classes
|
|
4
|
+
* @module smora/scrubber/errors/scrubber-error
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
export class ScrubberError extends Error {
|
|
8
|
+
constructor(message, details = {}) {
|
|
9
|
+
super(message);
|
|
10
|
+
this.name = 'ScrubberError';
|
|
11
|
+
this.details = details;
|
|
12
|
+
this.timestamp = new Date().toISOString();
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
toJSON() {
|
|
16
|
+
return {
|
|
17
|
+
name: this.name,
|
|
18
|
+
message: this.message,
|
|
19
|
+
details: this.details,
|
|
20
|
+
timestamp: this.timestamp
|
|
21
|
+
};
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
export class StructuralCleaningError extends ScrubberError {
|
|
26
|
+
constructor(message, details = {}) {
|
|
27
|
+
super(message, details);
|
|
28
|
+
this.name = 'StructuralCleaningError';
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export class ChunkingError extends ScrubberError {
|
|
33
|
+
constructor(message, details = {}) {
|
|
34
|
+
super(message, details);
|
|
35
|
+
this.name = 'ChunkingError';
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
export class ValidationError extends ScrubberError {
|
|
40
|
+
constructor(message, details = {}) {
|
|
41
|
+
super(message, details);
|
|
42
|
+
this.name = 'ValidationError';
|
|
43
|
+
}
|
|
44
|
+
}
|
package/lib/scrubber/index.d.ts
CHANGED
package/lib/scrubber/index.js
CHANGED