@yamo/memory-mesh 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -2
- package/lib/llm/client.d.ts +23 -48
- package/lib/llm/client.js +1 -0
- package/lib/llm/client.ts +298 -377
- package/lib/llm/index.js +1 -0
- package/lib/llm/index.ts +1 -2
- package/lib/memory/adapters/client.d.ts +22 -85
- package/lib/memory/adapters/client.js +1 -0
- package/lib/memory/adapters/client.ts +474 -633
- package/lib/memory/adapters/config.d.ts +82 -89
- package/lib/memory/adapters/config.js +1 -0
- package/lib/memory/adapters/config.ts +156 -225
- package/lib/memory/adapters/errors.d.ts +28 -20
- package/lib/memory/adapters/errors.js +1 -0
- package/lib/memory/adapters/errors.ts +83 -120
- package/lib/memory/context-manager.d.ts +15 -18
- package/lib/memory/context-manager.js +1 -0
- package/lib/memory/context-manager.ts +314 -401
- package/lib/memory/embeddings/factory.d.ts +18 -20
- package/lib/memory/embeddings/factory.js +1 -0
- package/lib/memory/embeddings/factory.ts +130 -173
- package/lib/memory/embeddings/index.js +1 -0
- package/lib/memory/embeddings/index.ts +1 -0
- package/lib/memory/embeddings/service.d.ts +36 -66
- package/lib/memory/embeddings/service.js +1 -0
- package/lib/memory/embeddings/service.ts +479 -616
- package/lib/memory/index.d.ts +2 -2
- package/lib/memory/index.js +1 -0
- package/lib/memory/index.ts +3 -13
- package/lib/memory/memory-mesh.d.ts +151 -93
- package/lib/memory/memory-mesh.js +1 -0
- package/lib/memory/memory-mesh.ts +1406 -1692
- package/lib/memory/memory-translator.d.ts +1 -6
- package/lib/memory/memory-translator.js +1 -0
- package/lib/memory/memory-translator.ts +96 -128
- package/lib/memory/schema.d.ts +29 -10
- package/lib/memory/schema.js +1 -0
- package/lib/memory/schema.ts +102 -185
- package/lib/memory/scorer.d.ts +3 -4
- package/lib/memory/scorer.js +1 -0
- package/lib/memory/scorer.ts +69 -86
- package/lib/memory/search/index.js +1 -0
- package/lib/memory/search/index.ts +1 -0
- package/lib/memory/search/keyword-search.d.ts +10 -26
- package/lib/memory/search/keyword-search.js +1 -0
- package/lib/memory/search/keyword-search.ts +123 -161
- package/lib/scrubber/config/defaults.d.ts +39 -46
- package/lib/scrubber/config/defaults.js +1 -0
- package/lib/scrubber/config/defaults.ts +50 -112
- package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
- package/lib/scrubber/errors/scrubber-error.js +39 -0
- package/lib/scrubber/errors/scrubber-error.ts +44 -0
- package/lib/scrubber/index.d.ts +0 -1
- package/lib/scrubber/index.js +1 -0
- package/lib/scrubber/index.ts +1 -2
- package/lib/scrubber/scrubber.d.ts +14 -31
- package/lib/scrubber/scrubber.js +1 -0
- package/lib/scrubber/scrubber.ts +93 -152
- package/lib/scrubber/stages/chunker.d.ts +22 -10
- package/lib/scrubber/stages/chunker.js +86 -0
- package/lib/scrubber/stages/chunker.ts +104 -0
- package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
- package/lib/scrubber/stages/metadata-annotator.js +64 -0
- package/lib/scrubber/stages/metadata-annotator.ts +75 -0
- package/lib/scrubber/stages/normalizer.d.ts +13 -10
- package/lib/scrubber/stages/normalizer.js +51 -0
- package/lib/scrubber/stages/normalizer.ts +60 -0
- package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
- package/lib/scrubber/stages/semantic-filter.js +51 -0
- package/lib/scrubber/stages/semantic-filter.ts +62 -0
- package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
- package/lib/scrubber/stages/structural-cleaner.js +73 -0
- package/lib/scrubber/stages/structural-cleaner.ts +83 -0
- package/lib/scrubber/stages/validator.d.ts +14 -15
- package/lib/scrubber/stages/validator.js +56 -0
- package/lib/scrubber/stages/validator.ts +67 -0
- package/lib/scrubber/telemetry.d.ts +20 -27
- package/lib/scrubber/telemetry.js +1 -0
- package/lib/scrubber/telemetry.ts +53 -90
- package/lib/scrubber/utils/hash.d.ts +14 -0
- package/lib/scrubber/utils/hash.js +37 -0
- package/lib/scrubber/utils/hash.ts +40 -0
- package/lib/scrubber/utils/html-parser.d.ts +14 -0
- package/lib/scrubber/utils/html-parser.js +38 -0
- package/lib/scrubber/utils/html-parser.ts +46 -0
- package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
- package/lib/scrubber/utils/pattern-matcher.js +54 -0
- package/lib/scrubber/utils/pattern-matcher.ts +64 -0
- package/lib/scrubber/utils/token-counter.d.ts +18 -0
- package/lib/scrubber/utils/token-counter.js +30 -0
- package/lib/scrubber/utils/token-counter.ts +32 -0
- package/lib/utils/logger.d.ts +1 -11
- package/lib/utils/logger.js +1 -0
- package/lib/utils/logger.ts +43 -63
- package/lib/utils/skill-metadata.d.ts +6 -14
- package/lib/utils/skill-metadata.js +1 -0
- package/lib/utils/skill-metadata.ts +89 -103
- package/lib/yamo/emitter.d.ts +8 -35
- package/lib/yamo/emitter.js +1 -0
- package/lib/yamo/emitter.ts +77 -155
- package/lib/yamo/index.d.ts +14 -0
- package/lib/yamo/index.js +14 -0
- package/lib/yamo/index.ts +16 -0
- package/lib/yamo/schema.d.ts +8 -10
- package/lib/yamo/schema.js +1 -0
- package/lib/yamo/schema.ts +82 -114
- package/package.json +4 -2
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
|
|
4
|
+
* @module smora/scrubber/stages/metadata-annotator
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { HashUtil } from '../utils/hash.js';
|
|
8
|
+
|
|
9
|
+
export class MetadataAnnotator {
  /**
   * Stage options. The flags read by this class are `addSource`, `addSection`,
   * `addHeadingPath`, `addTimestamp` and `addHash`.
   */
  config: any;
  /** Helper used to compute the per-chunk `hash` field. */
  hashUtil: HashUtil;

  constructor(config: any) {
    this.config = config;
    this.hashUtil = new HashUtil();
  }

  /**
   * Add metadata to chunks.
   *
   * Each returned chunk is a shallow copy of the input chunk whose `metadata`
   * is the original metadata plus the fields enabled in the config. Fields
   * whose value is `undefined` (disabled or unavailable) are omitted.
   *
   * @param chunks - Array of chunks; each may carry `text` and `metadata`
   * @param document - Original document metadata (`source` and `type` are read
   *   when `addSource` is enabled); may be omitted
   * @returns Annotated chunks, in input order
   */
  async annotate(chunks: any[], document?: any): Promise<any[]> {
    // Shared across the whole call: _buildHeadingPath mutates it chunk by chunk.
    const headingPath: string[] = [];
    // One timestamp per ingestion call so every chunk of a document agrees.
    const ingestedAt = this.config.addTimestamp ? new Date().toISOString() : undefined;

    return chunks.map((chunk) => {
      const metadata = {
        ...chunk.metadata,
        source: this.config.addSource ? document?.source : undefined,
        // NOTE(review): doc_type is gated by addSource, not by a flag of its own.
        doc_type: this.config.addSource ? document?.type : undefined,
        section: this.config.addSection ? this._extractSection(chunk) : undefined,
        heading_path: this.config.addHeadingPath
          ? this._buildHeadingPath(chunk, headingPath)
          : undefined,
        ingestion_timestamp: ingestedAt,
        hash: this.config.addHash ? this.hashUtil.hash(chunk.text) : undefined,
      };

      return {
        ...chunk,
        // Drop every undefined entry so disabled fields do not appear as keys.
        metadata: Object.fromEntries(
          Object.entries(metadata).filter(([, value]) => value !== undefined)
        ),
      };
    });
  }

  /**
   * Name of the section a chunk belongs to.
   *
   * @returns `chunk.metadata.heading` when it is truthy, otherwise the
   *   placeholder `'unnamed-section'` (also used when the chunk has no metadata)
   */
  _extractSection(chunk: any): string {
    return chunk.metadata?.heading || 'unnamed-section';
  }

  /**
   * Update the running heading path with this chunk's heading and return a
   * snapshot of it.
   *
   * `currentPath` is mutated in place: a heading that differs from the last
   * entry is appended when it counts as a sub-heading of that entry, otherwise
   * the path is reset to just the new heading.
   *
   * @param chunk - Chunk whose `metadata.heading` is inspected
   * @param currentPath - Path accumulated from the previous chunks
   * @returns A copy of the updated path
   */
  _buildHeadingPath(chunk: any, currentPath: string[]): string[] {
    const heading = chunk.metadata?.heading;
    const last = currentPath[currentPath.length - 1];

    if (heading && heading !== last) {
      if (currentPath.length > 0 && !this._isSubHeading(heading, last)) {
        currentPath.length = 0;
      }
      currentPath.push(heading);
    }

    return [...currentPath];
  }

  /**
   * Whether `heading1` is treated as nested under `heading2`.
   *
   * NOTE(review): the comparison is on the length of the heading strings,
   * not on a heading level — confirm this heuristic is intended.
   */
  _isSubHeading(heading1: string, heading2: string): boolean {
    return heading1.length > heading2.length;
  }
}
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Stage 3: Normalization
|
|
3
|
+
* @module smora/scrubber/stages/normalizer
|
|
3
4
|
*/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
export declare class Normalizer {
    /**
     * @param config - Stage options; the implementation reads the
     *   `normalizeHeadings`, `normalizeLists` and `normalizePunctuation` flags.
     */
    constructor(config: any);
    /**
     * Normalize content structure
     * @param content - Filtered content
     * @returns Normalized content; each normalization step runs only when its
     *   config flag is truthy
     */
    normalize(content: string): Promise<string>;
    /** Heading-marker normalization step used by {@link Normalizer.normalize}. */
    _normalizeHeadings(content: string): string;
    /** List-marker normalization step used by {@link Normalizer.normalize}. */
    _normalizeLists(content: string): string;
    /** Quote, space and ellipsis normalization step used by {@link Normalizer.normalize}. */
    _normalizePunctuation(content: string): string;
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 3: Normalization
|
|
4
|
+
* @module smora/scrubber/stages/normalizer
|
|
5
|
+
*/
|
|
6
|
+
export class Normalizer {
    /**
     * @param {Object} config - Stage options. The flags read by this class are
     *   `normalizeHeadings`, `normalizeLists` and `normalizePunctuation`.
     */
    constructor(config) {
        this.config = config;
    }
    /**
     * Normalize content structure
     *
     * Runs the heading, list and punctuation steps in that order; each step is
     * skipped unless its config flag is truthy.
     *
     * @param {string} content - Filtered content
     * @returns {Promise<string>} - Normalized content
     */
    async normalize(content) {
        let normalized = content;
        if (this.config.normalizeHeadings) {
            normalized = this._normalizeHeadings(normalized);
        }
        if (this.config.normalizeLists) {
            normalized = this._normalizeLists(normalized);
        }
        if (this.config.normalizePunctuation) {
            normalized = this._normalizePunctuation(normalized);
        }
        return normalized;
    }
    /**
     * Normalize markdown heading markers.
     *
     * Only markers at the start of a line are touched, so a `#` inside a
     * sentence ("issue #42", "C#") is left alone.
     *
     * @param {string} content
     * @returns {string}
     */
    _normalizeHeadings(content) {
        // Clamp runs of 7+ markers to the deepest level (six) first, so the
        // anchored rules below see a marker run of at most six characters.
        let normalized = content.replace(/#{7,}/g, '######');
        // Drop indentation in front of a marker. Only spaces/tabs are matched:
        // `\s` would also swallow the blank lines that precede the heading.
        normalized = normalized.replace(/^[ \t]+(?=#)/gm, '');
        // Insert the missing space between the marker run and the title.
        normalized = normalized.replace(/^(#{1,6})(?=[^\s#])/gm, '$1 ');
        return normalized;
    }
    /**
     * Insert the missing space after list markers.
     *
     * A marker is only recognised at the start of a line (after optional
     * indentation); hyphens, plus signs, asterisks and numbers inside running
     * text are never modified.
     *
     * @param {string} content
     * @returns {string}
     */
    _normalizeLists(content) {
        // Bullets: "-item" -> "- item". A following marker character or digit
        // is excluded so "**bold**", "---" and "-5" stay intact.
        let normalized = content.replace(/^([ \t]*[-*+])(?=[^\s\-*+\d])/gm, '$1 ');
        // Ordered items: "2.item" -> "2. item". A following digit or dot is
        // excluded so "3.14" and "1..." stay intact.
        normalized = normalized.replace(/^([ \t]*\d+\.)(?=[^\s\d.])/gm, '$1 ');
        return normalized;
    }
    /**
     * Remove quote characters, collapse repeated spaces and shorten runs of
     * four or more dots to an ellipsis.
     *
     * @param {string} content
     * @returns {string}
     */
    _normalizePunctuation(content) {
        // Remove quotes (both straight and curly). Curly quotes are written as
        // escapes so they cannot be flattened to straight quotes by tooling.
        let normalized = content.replace(/["'`\u201C\u201D\u2018\u2019]/g, '');
        normalized = normalized.replace(/ {2,}/g, ' ');
        normalized = normalized.replace(/\.{4,}/g, '...');
        return normalized;
    }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 3: Normalization
|
|
4
|
+
* @module smora/scrubber/stages/normalizer
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
export class Normalizer {
  /**
   * Stage options. The flags read by this class are `normalizeHeadings`,
   * `normalizeLists` and `normalizePunctuation`.
   */
  config: any;

  constructor(config: any) {
    this.config = config;
  }

  /**
   * Normalize content structure.
   *
   * Runs the heading, list and punctuation steps in that order; each step is
   * skipped unless its config flag is truthy.
   *
   * @param content - Filtered content
   * @returns Normalized content
   */
  async normalize(content: string): Promise<string> {
    let normalized = content;

    if (this.config.normalizeHeadings) {
      normalized = this._normalizeHeadings(normalized);
    }

    if (this.config.normalizeLists) {
      normalized = this._normalizeLists(normalized);
    }

    if (this.config.normalizePunctuation) {
      normalized = this._normalizePunctuation(normalized);
    }

    return normalized;
  }

  /**
   * Normalize markdown heading markers.
   *
   * Only markers at the start of a line are touched, so a `#` inside a
   * sentence ("issue #42", "C#") is left alone.
   */
  _normalizeHeadings(content: string): string {
    // Clamp runs of 7+ markers to the deepest level (six) first, so the
    // anchored rules below see a marker run of at most six characters.
    let normalized = content.replace(/#{7,}/g, '######');
    // Drop indentation in front of a marker. Only spaces/tabs are matched:
    // `\s` would also swallow the blank lines that precede the heading.
    normalized = normalized.replace(/^[ \t]+(?=#)/gm, '');
    // Insert the missing space between the marker run and the title.
    normalized = normalized.replace(/^(#{1,6})(?=[^\s#])/gm, '$1 ');
    return normalized;
  }

  /**
   * Insert the missing space after list markers.
   *
   * A marker is only recognised at the start of a line (after optional
   * indentation); hyphens, plus signs, asterisks and numbers inside running
   * text are never modified.
   */
  _normalizeLists(content: string): string {
    // Bullets: "-item" -> "- item". A following marker character or digit is
    // excluded so "**bold**", "---" and "-5" stay intact.
    let normalized = content.replace(/^([ \t]*[-*+])(?=[^\s\-*+\d])/gm, '$1 ');
    // Ordered items: "2.item" -> "2. item". A following digit or dot is
    // excluded so "3.14" and "1..." stay intact.
    normalized = normalized.replace(/^([ \t]*\d+\.)(?=[^\s\d.])/gm, '$1 ');
    return normalized;
  }

  /**
   * Remove quote characters, collapse repeated spaces and shorten runs of
   * four or more dots to an ellipsis.
   */
  _normalizePunctuation(content: string): string {
    // Remove quotes (both straight and curly). Curly quotes are written as
    // escapes so they cannot be flattened to straight quotes by tooling.
    let normalized = content.replace(/["'`\u201C\u201D\u2018\u2019]/g, '');
    normalized = normalized.replace(/ {2,}/g, ' ');
    normalized = normalized.replace(/\.{4,}/g, '...');
    return normalized;
  }
}
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Stage 2: Semantic Filtering
|
|
3
|
+
* @module smora/scrubber/stages/semantic-filter
|
|
3
4
|
*/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
export declare class SemanticFilter {
    /**
     * @param config - Stage options; the implementation reads
     *   `removeDuplicates` and `minSignalRatio`.
     */
    constructor(config: any);
    /**
     * Filter semantically empty content
     * @param content - Cleaned content; paragraphs are separated by blank lines
     * @returns Filtered content, with the kept paragraphs joined by one blank line
     */
    filter(content: string): Promise<string>;
    /**
     * Delegates to the pattern matcher's boilerplate check.
     * NOTE(review): return type is whatever `PatternMatcher.isBoilerplate`
     * returns (used as a truthy/falsy value) — tighten once that is typed.
     */
    _isBoilerplate(paragraph: string): any;
    /** Drops repeated paragraphs, keeping the first occurrence, when `removeDuplicates` is enabled. */
    _removeDuplicates(paragraphs: string[]): Promise<string[]>;
    /** Whether a paragraph carries enough alphanumeric content to be kept. */
    _hasSignal(paragraph: string): boolean;
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 2: Semantic Filtering
|
|
4
|
+
* @module smora/scrubber/stages/semantic-filter
|
|
5
|
+
*/
|
|
6
|
+
import { PatternMatcher } from '../utils/pattern-matcher.js';
|
|
7
|
+
import { HashUtil } from '../utils/hash.js';
|
|
8
|
+
export class SemanticFilter {
    /**
     * @param {Object} config - Stage options. The values read by this class
     *   are `removeDuplicates` and `minSignalRatio`.
     */
    constructor(config) {
        this.config = config;
        this.patternMatcher = new PatternMatcher();
        this.hashUtil = new HashUtil();
    }
    /**
     * Filter semantically empty content
     *
     * Splits the content into paragraphs on blank lines, then drops
     * boilerplate paragraphs, repeated paragraphs and low-signal paragraphs,
     * in that order.
     *
     * @param {string} content - Cleaned content
     * @returns {Promise<string>} - Filtered content
     */
    async filter(content) {
        const paragraphs = content.split(/\n\n+/);
        const withoutBoilerplate = paragraphs.filter((p) => !this._isBoilerplate(p));
        const unique = await this._removeDuplicates(withoutBoilerplate);
        return unique.filter((p) => this._hasSignal(p)).join('\n\n');
    }
    /**
     * @param {string} paragraph
     * @returns {*} Result of the pattern matcher's boilerplate check
     */
    _isBoilerplate(paragraph) {
        return this.patternMatcher.isBoilerplate(paragraph);
    }
    /**
     * Keep only the first occurrence of each paragraph, compared by hash.
     * Returns the input untouched when `removeDuplicates` is not enabled.
     *
     * @param {Array<string>} paragraphs
     * @returns {Promise<Array<string>>}
     */
    async _removeDuplicates(paragraphs) {
        if (!this.config.removeDuplicates) {
            return paragraphs;
        }
        const seen = new Set();
        return paragraphs.filter((para) => {
            const hash = this.hashUtil.hash(para);
            if (seen.has(hash)) {
                return false;
            }
            seen.add(hash);
            return true;
        });
    }
    /**
     * Whether a paragraph has enough letters/digits to be worth keeping.
     *
     * Paragraphs shorter than 10 characters (after trimming) never qualify.
     * Otherwise the share of letter, combining-mark and digit characters must
     * reach `config.minSignalRatio` (0.3 when not configured).
     *
     * @param {string} paragraph
     * @returns {boolean}
     */
    _hasSignal(paragraph) {
        const text = paragraph.trim();
        if (text.length < 10) {
            return false;
        }
        // Unicode-aware: an ASCII-only class would count non-Latin text
        // (Cyrillic, CJK, accented letters, ...) as pure noise.
        const signalChars = text.replace(/[^\p{L}\p{M}\p{N}]/gu, '').length;
        // Only fall back when the ratio is missing/NaN, so an explicit 0
        // ("keep everything long enough") is honoured.
        const configured = this.config.minSignalRatio;
        const minRatio = configured == null || Number.isNaN(configured) ? 0.3 : configured;
        return signalChars / text.length >= minRatio;
    }
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 2: Semantic Filtering
|
|
4
|
+
* @module smora/scrubber/stages/semantic-filter
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { PatternMatcher } from '../utils/pattern-matcher.js';
|
|
8
|
+
import { HashUtil } from '../utils/hash.js';
|
|
9
|
+
|
|
10
|
+
export class SemanticFilter {
  /** Stage options. The values read by this class are `removeDuplicates` and `minSignalRatio`. */
  config: any;
  /** Provides the boilerplate check used by `_isBoilerplate`. */
  patternMatcher: PatternMatcher;
  /** Provides the paragraph hash used for duplicate detection. */
  hashUtil: HashUtil;

  constructor(config: any) {
    this.config = config;
    this.patternMatcher = new PatternMatcher();
    this.hashUtil = new HashUtil();
  }

  /**
   * Filter semantically empty content.
   *
   * Splits the content into paragraphs on blank lines, then drops boilerplate
   * paragraphs, repeated paragraphs and low-signal paragraphs, in that order.
   *
   * @param content - Cleaned content
   * @returns Filtered content
   */
  async filter(content: string): Promise<string> {
    const paragraphs = content.split(/\n\n+/);

    const withoutBoilerplate = paragraphs.filter((p) => !this._isBoilerplate(p));
    const unique = await this._removeDuplicates(withoutBoilerplate);

    return unique.filter((p) => this._hasSignal(p)).join('\n\n');
  }

  /** Returns the result of the pattern matcher's boilerplate check. */
  _isBoilerplate(paragraph: string): any {
    return this.patternMatcher.isBoilerplate(paragraph);
  }

  /**
   * Keep only the first occurrence of each paragraph, compared by hash.
   * Returns the input untouched when `removeDuplicates` is not enabled.
   */
  async _removeDuplicates(paragraphs: string[]): Promise<string[]> {
    if (!this.config.removeDuplicates) return paragraphs;

    const seen = new Set();
    return paragraphs.filter((para) => {
      const hash = this.hashUtil.hash(para);
      if (seen.has(hash)) return false;
      seen.add(hash);
      return true;
    });
  }

  /**
   * Whether a paragraph has enough letters/digits to be worth keeping.
   *
   * Paragraphs shorter than 10 characters (after trimming) never qualify.
   * Otherwise the share of letter, combining-mark and digit characters must
   * reach `config.minSignalRatio` (0.3 when not configured).
   */
  _hasSignal(paragraph: string): boolean {
    const text = paragraph.trim();
    if (text.length < 10) return false;

    // Unicode-aware: an ASCII-only class would count non-Latin text
    // (Cyrillic, CJK, accented letters, ...) as pure noise.
    const signalChars = text.replace(/[^\p{L}\p{M}\p{N}]/gu, '').length;

    // Only fall back when the ratio is missing/NaN, so an explicit 0
    // ("keep everything long enough") is honoured.
    const configured = this.config.minSignalRatio;
    const minRatio = configured == null || Number.isNaN(configured) ? 0.3 : configured;

    return signalChars / text.length >= minRatio;
  }
}
|
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Stage 1: Structural Cleaning
|
|
3
|
+
* @module smora/scrubber/stages/structural-cleaner
|
|
3
4
|
*/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
export declare class StructuralCleaner {
    /** @param config - Stage options, stored on the instance. */
    constructor(config: any);
    /**
     * Clean document structure
     * @param content - Raw document content
     * @returns Cleaned content
     * @throws ScrubberError when any cleaning step fails; the original error
     *   is attached as `originalError`
     */
    clean(content: string): Promise<string>;
    /** Classifies the content so that `clean` can pick the matching cleaning path. */
    _detectType(content: string): "html" | "markdown" | "text";
    /** Runs the content through the HTML parser. */
    _cleanHTML(content: string): Promise<string>;
    /** Adds missing spaces after markdown heading and list markers. */
    _cleanMarkdown(content: string): Promise<string>;
    /** Collapses runs of spaces/tabs and runs of three or more newlines. */
    _collapseWhitespace(content: string): string;
    /** Converts CRLF and lone CR line endings to LF. */
    _normalizeLineBreaks(content: string): string;
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 1: Structural Cleaning
|
|
4
|
+
* @module smora/scrubber/stages/structural-cleaner
|
|
5
|
+
*/
|
|
6
|
+
import { HTMLParser } from '../utils/html-parser.js';
|
|
7
|
+
import { ScrubberError } from '../errors/scrubber-error.js';
|
|
8
|
+
export class StructuralCleaner {
    /**
     * @param {Object} config - Stage options, stored on the instance.
     */
    constructor(config) {
        this.config = config;
        this.htmlParser = new HTMLParser();
    }
    /**
     * Clean document structure
     *
     * HTML content goes through the HTML parser and then the markdown
     * clean-up; markdown content goes through the markdown clean-up only;
     * plain text skips both. Line endings and whitespace are normalised for
     * every type.
     *
     * @param {string} content - Raw document content
     * @returns {Promise<string>} - Cleaned content
     * @throws {ScrubberError} When any step fails; the original error is
     *   attached as `originalError`.
     */
    async clean(content) {
        try {
            const type = this._detectType(content);
            let cleaned = content;
            if (type === 'html') {
                cleaned = await this._cleanHTML(cleaned);
                // HTML may have markdown headings, normalize them
                cleaned = await this._cleanMarkdown(cleaned);
            }
            else if (type === 'markdown') {
                cleaned = await this._cleanMarkdown(cleaned);
            }
            // Line endings first: blank-line collapsing looks for consecutive
            // "\n" and would miss runs that are still encoded as "\r\n".
            cleaned = this._normalizeLineBreaks(cleaned);
            cleaned = this._collapseWhitespace(cleaned);
            return cleaned;
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            throw new ScrubberError(`Failed to clean content: ${message}`, { stage: 'structural-cleaner', originalError: error });
        }
    }
    /**
     * Classify content as HTML, markdown or plain text.
     *
     * Content whose first non-blank character is `<` is HTML; content with an
     * ATX-style heading at the start of any line is markdown.
     *
     * @param {string} content
     * @returns {"html" | "markdown" | "text"}
     */
    _detectType(content) {
        if (content.trim().startsWith('<')) {
            return 'html';
        }
        // `m` flag: a heading on any line counts, not only at offset 0.
        if (/^#{1,6}\s/m.test(content) || /^#{1,6}[A-Za-z]/m.test(content)) {
            return 'markdown';
        }
        return 'text';
    }
    /**
     * @param {string} content
     * @returns {Promise<*>} Result of the HTML parser for this content
     */
    async _cleanHTML(content) {
        return this.htmlParser.parse(content);
    }
    /**
     * Add the missing space after markdown heading and list markers.
     *
     * Markers are only recognised at the start of a line (after optional
     * indentation), so `#`, `-`, `*`, `+` and numbers inside running text are
     * never modified.
     *
     * @param {string} content
     * @returns {Promise<string>}
     */
    async _cleanMarkdown(content) {
        let cleaned = content;
        // Add space after heading markers when missing
        cleaned = cleaned.replace(/^([ \t]*#{1,6})(?=[^\s#])/gm, '$1 ');
        // Add space after list markers when missing. A following marker
        // character or digit is excluded so "**bold**", "---" and "-5" survive.
        cleaned = cleaned.replace(/^([ \t]*[-*+])(?=[^\s\-*+\d])/gm, '$1 ');
        // Add space after numbered list markers when missing ("2.item").
        // A following digit or dot is excluded so "3.14" and "1..." survive.
        cleaned = cleaned.replace(/^([ \t]*\d+\.)(?=[^\s\d.])/gm, '$1 ');
        return cleaned;
    }
    /**
     * Collapse runs of spaces/tabs to one space and runs of three or more
     * newlines to a single blank line.
     *
     * @param {string} content
     * @returns {string}
     */
    _collapseWhitespace(content) {
        return content.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n');
    }
    /**
     * Convert CRLF and lone CR line endings to LF.
     *
     * @param {string} content
     * @returns {string}
     */
    _normalizeLineBreaks(content) {
        return content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
    }
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 1: Structural Cleaning
|
|
4
|
+
* @module smora/scrubber/stages/structural-cleaner
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { HTMLParser } from '../utils/html-parser.js';
|
|
8
|
+
import { StructuralCleaningError, ScrubberError } from '../errors/scrubber-error.js';
|
|
9
|
+
|
|
10
|
+
export class StructuralCleaner {
  /** Stage options, stored on the instance. */
  config: any;
  /** Parser used for content detected as HTML. */
  htmlParser: HTMLParser;

  constructor(config: any) {
    this.config = config;
    this.htmlParser = new HTMLParser();
  }

  /**
   * Clean document structure.
   *
   * HTML content goes through the HTML parser and then the markdown clean-up;
   * markdown content goes through the markdown clean-up only; plain text skips
   * both. Line endings and whitespace are normalised for every type.
   *
   * @param content - Raw document content
   * @returns Cleaned content
   * @throws ScrubberError when any step fails; the original error is attached
   *   as `originalError`
   */
  async clean(content: string): Promise<string> {
    try {
      const type = this._detectType(content);
      let cleaned = content;

      if (type === 'html') {
        cleaned = await this._cleanHTML(cleaned);
        // HTML may have markdown headings, normalize them
        cleaned = await this._cleanMarkdown(cleaned);
      } else if (type === 'markdown') {
        cleaned = await this._cleanMarkdown(cleaned);
      }

      // Line endings first: blank-line collapsing looks for consecutive "\n"
      // and would miss runs that are still encoded as "\r\n".
      cleaned = this._normalizeLineBreaks(cleaned);
      cleaned = this._collapseWhitespace(cleaned);

      return cleaned;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new ScrubberError(
        `Failed to clean content: ${message}`,
        { stage: 'structural-cleaner', originalError: error }
      );
    }
  }

  /**
   * Classify content as HTML, markdown or plain text.
   *
   * Content whose first non-blank character is `<` is HTML; content with an
   * ATX-style heading at the start of any line is markdown.
   */
  _detectType(content: string): 'html' | 'markdown' | 'text' {
    if (content.trim().startsWith('<')) return 'html';
    // `m` flag: a heading on any line counts, not only at offset 0.
    if (/^#{1,6}\s/m.test(content) || /^#{1,6}[A-Za-z]/m.test(content)) return 'markdown';
    return 'text';
  }

  /** Returns the HTML parser's result for this content. */
  async _cleanHTML(content: string): Promise<any> {
    return this.htmlParser.parse(content);
  }

  /**
   * Add the missing space after markdown heading and list markers.
   *
   * Markers are only recognised at the start of a line (after optional
   * indentation), so `#`, `-`, `*`, `+` and numbers inside running text are
   * never modified.
   */
  async _cleanMarkdown(content: string): Promise<string> {
    let cleaned = content;
    // Add space after heading markers when missing
    cleaned = cleaned.replace(/^([ \t]*#{1,6})(?=[^\s#])/gm, '$1 ');
    // Add space after list markers when missing. A following marker character
    // or digit is excluded so "**bold**", "---" and "-5" survive.
    cleaned = cleaned.replace(/^([ \t]*[-*+])(?=[^\s\-*+\d])/gm, '$1 ');
    // Add space after numbered list markers when missing ("2.item").
    // A following digit or dot is excluded so "3.14" and "1..." survive.
    cleaned = cleaned.replace(/^([ \t]*\d+\.)(?=[^\s\d.])/gm, '$1 ');
    return cleaned;
  }

  /**
   * Collapse runs of spaces/tabs to one space and runs of three or more
   * newlines to a single blank line.
   */
  _collapseWhitespace(content: string): string {
    return content.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n');
  }

  /** Convert CRLF and lone CR line endings to LF. */
  _normalizeLineBreaks(content: string): string {
    return content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  }
}
|
|
@@ -1,18 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Stage 6: Validation
|
|
3
|
+
* @module smora/scrubber/stages/validator
|
|
3
4
|
*/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
constructor(config?: ValidatorConfig);
|
|
17
|
-
validate(content: string): Promise<ValidationResult>;
|
|
5
|
+
export declare class Validator {
    /**
     * @param config - Stage options; the implementation reads
     *   `rejectEmptyChunks`, `enforceMinLength`, `minTokens`,
     *   `enforceMaxLength` and `hardMaxTokens`.
     */
    constructor(config: any);
    /**
     * Validate chunks
     * @param chunks - Chunks to check; each is expected to carry a `text` string
     * @returns The chunks that passed every enabled check, in input order
     */
    validate(chunks: Iterable<any>): Promise<any[]>;
    /**
     * Runs the enabled checks against one chunk.
     * @returns `valid` is true when `errors` is empty; `errors` holds one
     *   message string per failed check
     */
    _validateChunk(chunk: any): {
        valid: boolean;
        errors: string[];
    };
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 6: Validation
|
|
4
|
+
* @module smora/scrubber/stages/validator
|
|
5
|
+
*/
|
|
6
|
+
import { TokenCounter } from '../utils/token-counter.js';
|
|
7
|
+
export class Validator {
    /**
     * @param {Object} config - Stage options. The values read by this class
     *   are `rejectEmptyChunks`, `enforceMinLength`, `minTokens`,
     *   `enforceMaxLength` and `hardMaxTokens`.
     */
    constructor(config) {
        this.config = config;
        this.tokenCounter = new TokenCounter();
    }
    /**
     * Validate chunks
     *
     * Chunks that fail a check are dropped from the result. Why they were
     * dropped is kept on `this.lastRejections` (one `{ chunkIndex, errors }`
     * entry per rejected chunk, replaced on every call) instead of being
     * thrown away.
     *
     * @param {Array} chunks - Array of chunks
     * @returns {Promise<Array>} - Validated chunks
     */
    async validate(chunks) {
        const valid = [];
        const rejections = [];
        let position = 0;
        for (const chunk of chunks) {
            const validation = this._validateChunk(chunk);
            if (validation.valid) {
                valid.push(chunk);
            }
            else {
                rejections.push({
                    // Fall back to the position in the input when the chunk
                    // carries no index of its own.
                    chunkIndex: chunk != null && chunk.index !== undefined ? chunk.index : position,
                    errors: validation.errors
                });
            }
            position += 1;
        }
        this.lastRejections = rejections;
        return valid;
    }
    /**
     * Run the enabled checks against one chunk.
     *
     * A chunk without a string `text` is treated as having empty text, so it
     * is reported through the regular checks instead of raising a TypeError.
     *
     * @param {Object} chunk
     * @returns {{valid: boolean, errors: Array<string>}} `valid` is true when
     *   no check failed; `errors` holds one message per failed check
     */
    _validateChunk(chunk) {
        const errors = [];
        const text = chunk != null && typeof chunk.text === 'string' ? chunk.text : '';
        if (this.config.rejectEmptyChunks && !text.trim()) {
            errors.push('empty_chunk');
        }
        if (this.config.enforceMinLength || this.config.enforceMaxLength) {
            // Count once and reuse the result for both bounds.
            const tokens = this.tokenCounter.count(text);
            if (this.config.enforceMinLength && tokens < this.config.minTokens) {
                errors.push(`chunk_too_short: ${tokens} < ${this.config.minTokens}`);
            }
            if (this.config.enforceMaxLength && tokens > this.config.hardMaxTokens) {
                errors.push(`chunk_too_long: ${tokens} > ${this.config.hardMaxTokens}`);
            }
        }
        return {
            valid: errors.length === 0,
            errors
        };
    }
}
|