@yamo/memory-mesh 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/README.md +8 -2
  2. package/lib/llm/client.d.ts +23 -48
  3. package/lib/llm/client.js +1 -0
  4. package/lib/llm/client.ts +298 -377
  5. package/lib/llm/index.js +1 -0
  6. package/lib/llm/index.ts +1 -2
  7. package/lib/memory/adapters/client.d.ts +22 -85
  8. package/lib/memory/adapters/client.js +1 -0
  9. package/lib/memory/adapters/client.ts +474 -633
  10. package/lib/memory/adapters/config.d.ts +82 -89
  11. package/lib/memory/adapters/config.js +1 -0
  12. package/lib/memory/adapters/config.ts +156 -225
  13. package/lib/memory/adapters/errors.d.ts +28 -20
  14. package/lib/memory/adapters/errors.js +1 -0
  15. package/lib/memory/adapters/errors.ts +83 -120
  16. package/lib/memory/context-manager.d.ts +15 -18
  17. package/lib/memory/context-manager.js +1 -0
  18. package/lib/memory/context-manager.ts +314 -401
  19. package/lib/memory/embeddings/factory.d.ts +18 -20
  20. package/lib/memory/embeddings/factory.js +1 -0
  21. package/lib/memory/embeddings/factory.ts +130 -173
  22. package/lib/memory/embeddings/index.js +1 -0
  23. package/lib/memory/embeddings/index.ts +1 -0
  24. package/lib/memory/embeddings/service.d.ts +36 -66
  25. package/lib/memory/embeddings/service.js +1 -0
  26. package/lib/memory/embeddings/service.ts +479 -616
  27. package/lib/memory/index.d.ts +2 -2
  28. package/lib/memory/index.js +1 -0
  29. package/lib/memory/index.ts +3 -13
  30. package/lib/memory/memory-mesh.d.ts +151 -93
  31. package/lib/memory/memory-mesh.js +1 -0
  32. package/lib/memory/memory-mesh.ts +1406 -1692
  33. package/lib/memory/memory-translator.d.ts +1 -6
  34. package/lib/memory/memory-translator.js +1 -0
  35. package/lib/memory/memory-translator.ts +96 -128
  36. package/lib/memory/schema.d.ts +29 -10
  37. package/lib/memory/schema.js +1 -0
  38. package/lib/memory/schema.ts +102 -185
  39. package/lib/memory/scorer.d.ts +3 -4
  40. package/lib/memory/scorer.js +1 -0
  41. package/lib/memory/scorer.ts +69 -86
  42. package/lib/memory/search/index.js +1 -0
  43. package/lib/memory/search/index.ts +1 -0
  44. package/lib/memory/search/keyword-search.d.ts +10 -26
  45. package/lib/memory/search/keyword-search.js +1 -0
  46. package/lib/memory/search/keyword-search.ts +123 -161
  47. package/lib/scrubber/config/defaults.d.ts +39 -46
  48. package/lib/scrubber/config/defaults.js +1 -0
  49. package/lib/scrubber/config/defaults.ts +50 -112
  50. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  51. package/lib/scrubber/errors/scrubber-error.js +39 -0
  52. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  53. package/lib/scrubber/index.d.ts +0 -1
  54. package/lib/scrubber/index.js +1 -0
  55. package/lib/scrubber/index.ts +1 -2
  56. package/lib/scrubber/scrubber.d.ts +14 -31
  57. package/lib/scrubber/scrubber.js +1 -0
  58. package/lib/scrubber/scrubber.ts +93 -152
  59. package/lib/scrubber/stages/chunker.d.ts +22 -10
  60. package/lib/scrubber/stages/chunker.js +86 -0
  61. package/lib/scrubber/stages/chunker.ts +104 -0
  62. package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
  63. package/lib/scrubber/stages/metadata-annotator.js +64 -0
  64. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  65. package/lib/scrubber/stages/normalizer.d.ts +13 -10
  66. package/lib/scrubber/stages/normalizer.js +51 -0
  67. package/lib/scrubber/stages/normalizer.ts +60 -0
  68. package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
  69. package/lib/scrubber/stages/semantic-filter.js +51 -0
  70. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  71. package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
  72. package/lib/scrubber/stages/structural-cleaner.js +73 -0
  73. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  74. package/lib/scrubber/stages/validator.d.ts +14 -15
  75. package/lib/scrubber/stages/validator.js +56 -0
  76. package/lib/scrubber/stages/validator.ts +67 -0
  77. package/lib/scrubber/telemetry.d.ts +20 -27
  78. package/lib/scrubber/telemetry.js +1 -0
  79. package/lib/scrubber/telemetry.ts +53 -90
  80. package/lib/scrubber/utils/hash.d.ts +14 -0
  81. package/lib/scrubber/utils/hash.js +37 -0
  82. package/lib/scrubber/utils/hash.ts +40 -0
  83. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  84. package/lib/scrubber/utils/html-parser.js +38 -0
  85. package/lib/scrubber/utils/html-parser.ts +46 -0
  86. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  87. package/lib/scrubber/utils/pattern-matcher.js +54 -0
  88. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  89. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  90. package/lib/scrubber/utils/token-counter.js +30 -0
  91. package/lib/scrubber/utils/token-counter.ts +32 -0
  92. package/lib/utils/logger.d.ts +1 -11
  93. package/lib/utils/logger.js +1 -0
  94. package/lib/utils/logger.ts +43 -63
  95. package/lib/utils/skill-metadata.d.ts +6 -14
  96. package/lib/utils/skill-metadata.js +1 -0
  97. package/lib/utils/skill-metadata.ts +89 -103
  98. package/lib/yamo/emitter.d.ts +8 -35
  99. package/lib/yamo/emitter.js +1 -0
  100. package/lib/yamo/emitter.ts +77 -155
  101. package/lib/yamo/index.d.ts +14 -0
  102. package/lib/yamo/index.js +14 -0
  103. package/lib/yamo/index.ts +16 -0
  104. package/lib/yamo/schema.d.ts +8 -10
  105. package/lib/yamo/schema.js +1 -0
  106. package/lib/yamo/schema.ts +82 -114
  107. package/package.json +4 -2
@@ -0,0 +1,75 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
4
+ * @module smora/scrubber/stages/metadata-annotator
5
+ */
6
+
7
+ import { HashUtil } from '../utils/hash.js';
8
+
9
/** Minimal chunk shape this stage reads; any other properties are passed through untouched. */
type AnnotatableChunk = {
  text: string;
  metadata?: { heading?: string; [key: string]: unknown };
};

/**
 * Stage 5 of the scrubber pipeline: attaches provenance metadata to chunks.
 *
 * Every field is opt-in through a boolean flag on the config object:
 * `addSource` (writes both `source` and `doc_type`), `addSection`,
 * `addHeadingPath`, `addTimestamp` and `addHash`. Fields whose flag is off are
 * left out of the resulting metadata object entirely.
 */
export class MetadataAnnotator {
  config: {
    addSource?: boolean;
    addSection?: boolean;
    addHeadingPath?: boolean;
    addTimestamp?: boolean;
    addHash?: boolean;
    [key: string]: any;
  };
  hashUtil: HashUtil;

  /**
   * @param config - Flags selecting which metadata fields are written. A missing
   *   config behaves like an empty one (no fields added).
   */
  constructor(config: MetadataAnnotator['config'] = {}) {
    this.config = config ?? {};
    this.hashUtil = new HashUtil();
  }

  /**
   * Add metadata to chunks.
   *
   * The input chunks are not mutated; each result is a shallow copy whose
   * `metadata` merges the chunk's existing metadata with the enabled fields.
   *
   * @param chunks - Chunks to annotate, in document order (the heading path is
   *   accumulated across them).
   * @param document - Original document metadata; `source` and `type` are read
   *   when `addSource` is on. May be omitted, in which case those fields are skipped.
   * @returns The annotated copies, in the same order.
   */
  async annotate<T extends AnnotatableChunk>(
    chunks: readonly T[],
    document?: { source?: unknown; type?: unknown },
  ): Promise<Array<T & { metadata: Record<string, unknown> }>> {
    const { addSource, addSection, addHeadingPath, addTimestamp, addHash } = this.config;
    // Shared accumulator: _buildHeadingPath mutates it as chunks are visited in order.
    const headingPath: string[] = [];
    // One timestamp per call so every chunk of the same ingestion carries the same value.
    const ingestedAt = addTimestamp ? new Date().toISOString() : undefined;

    return chunks.map((chunk) => {
      const metadata: Record<string, unknown> = {
        ...chunk.metadata,
        source: addSource ? document?.source : undefined,
        doc_type: addSource ? document?.type : undefined,
        section: addSection ? this._extractSection(chunk) : undefined,
        heading_path: addHeadingPath ? this._buildHeadingPath(chunk, headingPath) : undefined,
        ingestion_timestamp: ingestedAt,
        hash: addHash ? this.hashUtil.hash(chunk.text) : undefined,
      };

      return {
        ...chunk,
        // Drop disabled (undefined) fields so they do not appear as keys at all.
        metadata: Object.fromEntries(
          Object.entries(metadata).filter(([, value]) => value !== undefined),
        ),
      };
    });
  }

  /**
   * Section label for a chunk: its `metadata.heading` when present and truthy,
   * otherwise the literal `'unnamed-section'`. Chunks without metadata are allowed.
   */
  _extractSection(chunk: AnnotatableChunk): string {
    return chunk.metadata?.heading || 'unnamed-section';
  }

  /**
   * Update the running heading path with this chunk's heading and return a
   * snapshot of it.
   *
   * A heading equal to the current last entry leaves the path unchanged. A new
   * heading is appended when the path is empty or `_isSubHeading` says it nests
   * under the last entry; otherwise the path is reset to just the new heading.
   *
   * @param chunk - Chunk whose `metadata.heading` is inspected (may be absent).
   * @param currentPath - Accumulator shared across chunks; mutated in place.
   * @returns A copy of the path, safe to store on the chunk.
   */
  _buildHeadingPath(chunk: AnnotatableChunk, currentPath: string[]): string[] {
    const heading = chunk.metadata?.heading;
    const last = currentPath[currentPath.length - 1];

    if (heading && heading !== last) {
      const nests = currentPath.length === 0 || this._isSubHeading(heading, last);
      if (!nests) {
        currentPath.length = 0;
      }
      currentPath.push(heading);
    }

    return [...currentPath];
  }

  /**
   * Whether `heading1` should be treated as nested under `heading2`.
   *
   * NOTE(review): this compares string lengths only (a longer heading counts as
   * a child); no heading level is available on the chunk here. Confirm the
   * heuristic is intended before relying on `heading_path` for hierarchy.
   */
  _isSubHeading(heading1: string, heading2: string): boolean {
    return heading1.length > heading2.length;
  }
}
@@ -1,13 +1,16 @@
1
1
  /**
2
- * Type definitions for normalizer.js
2
+ * S-MORA Layer 0 Scrubber - Stage 3: Normalization
3
+ * @module smora/scrubber/stages/normalizer
3
4
  */
4
-
5
- export interface NormalizerConfig {
6
- lowercase?: boolean;
7
- [key: string]: any;
8
- }
9
-
10
- export class Normalizer {
11
- constructor(config?: NormalizerConfig);
12
- normalize(content: string): Promise<string>;
5
export declare class Normalizer {
    /**
     * @param config - Flags selecting which normalization passes run; a pass
     *   whose flag is falsy is skipped.
     */
    constructor(config: {
        normalizeHeadings?: boolean;
        normalizeLists?: boolean;
        normalizePunctuation?: boolean;
        [key: string]: any;
    });
    /**
     * Normalize content structure
     * @param content - Filtered content
     * @returns Normalized content
     */
    normalize(content: string): Promise<string>;
    /** Heading pass; applied by `normalize` when `normalizeHeadings` is set. */
    _normalizeHeadings(content: string): string;
    /** List-marker pass; applied by `normalize` when `normalizeLists` is set. */
    _normalizeLists(content: string): string;
    /** Punctuation pass; applied by `normalize` when `normalizePunctuation` is set. */
    _normalizePunctuation(content: string): string;
}
@@ -0,0 +1,51 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 3: Normalization
4
+ * @module smora/scrubber/stages/normalizer
5
+ */
6
/**
 * Stage 3 of the scrubber pipeline: normalizes Markdown-style structure in text.
 *
 * Each pass is enabled by a flag on the config object (`normalizeHeadings`,
 * `normalizeLists`, `normalizePunctuation`); passes whose flag is falsy are skipped.
 */
export class Normalizer {
    /**
     * @param {Object} [config] - Pass flags; a missing config disables every pass.
     */
    constructor(config) {
        this.config = config || {};
    }
    /**
     * Normalize content structure
     * @param {string} content - Filtered content
     * @returns {Promise<string>} - Normalized content
     */
    async normalize(content) {
        let normalized = content;
        if (this.config.normalizeHeadings) {
            normalized = this._normalizeHeadings(normalized);
        }
        if (this.config.normalizeLists) {
            normalized = this._normalizeLists(normalized);
        }
        if (this.config.normalizePunctuation) {
            normalized = this._normalizePunctuation(normalized);
        }
        return normalized;
    }
    /**
     * Tidy heading markers: strip indentation before them, cap runs of 7+ `#`
     * at six, and insert the missing space between marker and title.
     * @param {string} content
     * @returns {string}
     */
    _normalizeHeadings(content) {
        // [ \t] rather than \s: \s also matches newlines, which would swallow the
        // blank line separating a heading from the preceding paragraph.
        let normalized = content.replace(/^[ \t]+(?=#)/gm, '');
        normalized = normalized.replace(/#{7,}/g, '######');
        // Anchored to line start so inline text such as "issue #12" or "C#5" is untouched.
        normalized = normalized.replace(/^(#{1,6})(?=[^\s#])/gm, '$1 ');
        return normalized;
    }
    /**
     * Insert the missing space after list markers that start a line.
     *
     * Bullets (`-`, `*`, `+`) and numbered markers (`1.` / `1)`) are only
     * recognised at the start of a line (after optional indentation), so
     * hyphenated words, arithmetic, years and decimals inside text are left alone.
     * @param {string} content
     * @returns {string}
     */
    _normalizeLists(content) {
        // The lookahead skips markers followed by a digit ("-5"), another marker
        // ("---", "**bold**") or ">" ("->"), none of which are list items.
        let normalized = content.replace(/^([ \t]*)([-*+])(?=[^\s\d*+>-])(.*)$/gm, (line, indent, marker, rest) => {
            // "*emphasis* ..." opens inline emphasis, not a bullet.
            if (marker === '*' && rest.includes('*')) {
                return line;
            }
            return `${indent}${marker} ${rest}`;
        });
        // "1.item" -> "1. item"; a following digit ("1.5") means a decimal, not a marker.
        normalized = normalized.replace(/^([ \t]*\d+[.)])(?=[^\s\d])/gm, '$1 ');
        return normalized;
    }
    /**
     * Remove quote characters, collapse runs of spaces and shorten long ellipses.
     * @param {string} content
     * @returns {string}
     */
    _normalizePunctuation(content) {
        // Remove quotes (both straight and curly); curly forms are written as
        // escapes so they cannot be silently flattened to straight quotes.
        let normalized = content.replace(/["'`\u201C\u201D\u2018\u2019]/g, '');
        normalized = normalized.replace(/ +/g, ' ');
        normalized = normalized.replace(/\.{4,}/g, '...');
        return normalized;
    }
}
@@ -0,0 +1,60 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 3: Normalization
4
+ * @module smora/scrubber/stages/normalizer
5
+ */
6
+
7
/**
 * Stage 3 of the scrubber pipeline: normalizes Markdown-style structure in text.
 *
 * Each pass is enabled by a flag on the config object (`normalizeHeadings`,
 * `normalizeLists`, `normalizePunctuation`); passes whose flag is falsy are skipped.
 */
export class Normalizer {
  config: {
    normalizeHeadings?: boolean;
    normalizeLists?: boolean;
    normalizePunctuation?: boolean;
    [key: string]: any;
  };

  /**
   * @param config - Pass flags; a missing config disables every pass.
   */
  constructor(config: Normalizer['config'] = {}) {
    this.config = config ?? {};
  }

  /**
   * Normalize content structure
   * @param content - Filtered content
   * @returns Normalized content
   */
  async normalize(content: string): Promise<string> {
    let normalized = content;

    if (this.config.normalizeHeadings) {
      normalized = this._normalizeHeadings(normalized);
    }

    if (this.config.normalizeLists) {
      normalized = this._normalizeLists(normalized);
    }

    if (this.config.normalizePunctuation) {
      normalized = this._normalizePunctuation(normalized);
    }

    return normalized;
  }

  /**
   * Tidy heading markers: strip indentation before them, cap runs of 7+ `#`
   * at six, and insert the missing space between marker and title.
   */
  _normalizeHeadings(content: string): string {
    // [ \t] rather than \s: \s also matches newlines, which would swallow the
    // blank line separating a heading from the preceding paragraph.
    let normalized = content.replace(/^[ \t]+(?=#)/gm, '');
    normalized = normalized.replace(/#{7,}/g, '######');
    // Anchored to line start so inline text such as "issue #12" or "C#5" is untouched.
    normalized = normalized.replace(/^(#{1,6})(?=[^\s#])/gm, '$1 ');
    return normalized;
  }

  /**
   * Insert the missing space after list markers that start a line.
   *
   * Bullets (`-`, `*`, `+`) and numbered markers (`1.` / `1)`) are only
   * recognised at the start of a line (after optional indentation), so
   * hyphenated words, arithmetic, years and decimals inside text are left alone.
   */
  _normalizeLists(content: string): string {
    // The lookahead skips markers followed by a digit ("-5"), another marker
    // ("---", "**bold**") or ">" ("->"), none of which are list items.
    let normalized = content.replace(
      /^([ \t]*)([-*+])(?=[^\s\d*+>-])(.*)$/gm,
      (line: string, indent: string, marker: string, rest: string) => {
        // "*emphasis* ..." opens inline emphasis, not a bullet.
        if (marker === '*' && rest.includes('*')) {
          return line;
        }
        return `${indent}${marker} ${rest}`;
      },
    );
    // "1.item" -> "1. item"; a following digit ("1.5") means a decimal, not a marker.
    normalized = normalized.replace(/^([ \t]*\d+[.)])(?=[^\s\d])/gm, '$1 ');
    return normalized;
  }

  /**
   * Remove quote characters, collapse runs of spaces and shorten long ellipses.
   */
  _normalizePunctuation(content: string): string {
    // Remove quotes (both straight and curly); curly forms are written as
    // escapes so they cannot be silently flattened to straight quotes.
    let normalized = content.replace(/["'`\u201C\u201D\u2018\u2019]/g, '');
    normalized = normalized.replace(/ +/g, ' ');
    normalized = normalized.replace(/\.{4,}/g, '...');
    return normalized;
  }
}
@@ -1,13 +1,16 @@
1
1
  /**
2
- * Type definitions for semantic-filter.js
2
+ * S-MORA Layer 0 Scrubber - Stage 2: Semantic Filtering
3
+ * @module smora/scrubber/stages/semantic-filter
3
4
  */
4
-
5
- export interface FilterConfig {
6
- threshold?: number;
7
- [key: string]: any;
8
- }
9
-
10
- export class SemanticFilter {
11
- constructor(config?: FilterConfig);
12
- filter(content: string): Promise<string>;
5
export declare class SemanticFilter {
    /**
     * @param config - `removeDuplicates` enables paragraph de-duplication;
     *   `minSignalRatio` is the minimum share of signal characters a paragraph
     *   must have to be kept.
     */
    constructor(config: {
        removeDuplicates?: boolean;
        minSignalRatio?: number;
        [key: string]: any;
    });
    /**
     * Filter semantically empty content
     * @param content - Cleaned content
     * @returns Filtered content
     */
    filter(content: string): Promise<string>;
    /** Result of the pattern matcher's boilerplate check; used as a truthy/falsy flag. */
    _isBoilerplate(paragraph: string): unknown;
    /** Drops repeated paragraphs (first occurrence wins) when de-duplication is enabled. */
    _removeDuplicates(paragraphs: string[]): Promise<string[]>;
    /** Whether a paragraph carries enough signal characters to be kept. */
    _hasSignal(paragraph: string): boolean;
}
@@ -0,0 +1,51 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 2: Semantic Filtering
4
+ * @module smora/scrubber/stages/semantic-filter
5
+ */
6
+ import { PatternMatcher } from '../utils/pattern-matcher.js';
7
+ import { HashUtil } from '../utils/hash.js';
8
/**
 * Stage 2 of the scrubber pipeline: drops paragraphs that carry no content.
 *
 * Paragraphs (split on blank lines) are removed when the pattern matcher flags
 * them as boilerplate, when they repeat an earlier paragraph (only if
 * `config.removeDuplicates` is set), or when they fail the signal check.
 */
export class SemanticFilter {
    /**
     * @param {Object} [config] - `removeDuplicates` and `minSignalRatio`; a
     *   missing config behaves like an empty one.
     */
    constructor(config) {
        this.config = config || {};
        this.patternMatcher = new PatternMatcher();
        this.hashUtil = new HashUtil();
    }
    /**
     * Filter semantically empty content
     * @param {string} content - Cleaned content
     * @returns {Promise<string>} - Filtered content, paragraphs re-joined with one blank line
     */
    async filter(content) {
        const paragraphs = content.split(/\n\n+/);
        const withoutBoilerplate = paragraphs.filter(p => !this._isBoilerplate(p));
        const unique = await this._removeDuplicates(withoutBoilerplate);
        return unique.filter(p => this._hasSignal(p)).join('\n\n');
    }
    /**
     * Delegate the boilerplate decision to the pattern matcher.
     * @param {string} paragraph
     */
    _isBoilerplate(paragraph) {
        return this.patternMatcher.isBoilerplate(paragraph);
    }
    /**
     * Keep only the first occurrence of each paragraph, compared by hash.
     * Returns the input unchanged when `config.removeDuplicates` is falsy.
     * @param {string[]} paragraphs
     * @returns {Promise<string[]>}
     */
    async _removeDuplicates(paragraphs) {
        if (!this.config.removeDuplicates) {
            return paragraphs;
        }
        const seen = new Set();
        return paragraphs.filter((para) => {
            const hash = this.hashUtil.hash(para);
            if (seen.has(hash)) {
                return false;
            }
            seen.add(hash);
            return true;
        });
    }
    /**
     * Decide whether a paragraph has enough letters/digits to be worth keeping.
     *
     * Paragraphs shorter than 10 characters (after trimming) never pass.
     * Otherwise the share of letter and digit characters must reach
     * `config.minSignalRatio` (default 0.3).
     * @param {string} paragraph
     * @returns {boolean}
     */
    _hasSignal(paragraph) {
        const text = paragraph.trim();
        if (text.length < 10) {
            return false;
        }
        // Unicode letters/digits, not just ASCII: otherwise every paragraph in a
        // non-Latin script scores a ratio of 0 and is silently discarded.
        const signalChars = text.replace(/[^\p{L}\p{N}]/gu, '').length;
        const configured = this.config.minSignalRatio;
        // Only fall back when the ratio is absent; an explicit 0 ("keep everything") is honoured.
        const minRatio = configured == null || Number.isNaN(configured) ? 0.3 : configured;
        return signalChars / text.length >= minRatio;
    }
}
@@ -0,0 +1,62 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 2: Semantic Filtering
4
+ * @module smora/scrubber/stages/semantic-filter
5
+ */
6
+
7
+ import { PatternMatcher } from '../utils/pattern-matcher.js';
8
+ import { HashUtil } from '../utils/hash.js';
9
+
10
/**
 * Stage 2 of the scrubber pipeline: drops paragraphs that carry no content.
 *
 * Paragraphs (split on blank lines) are removed when the pattern matcher flags
 * them as boilerplate, when they repeat an earlier paragraph (only if
 * `config.removeDuplicates` is set), or when they fail the signal check.
 */
export class SemanticFilter {
  config: {
    removeDuplicates?: boolean;
    minSignalRatio?: number;
    [key: string]: any;
  };
  patternMatcher: PatternMatcher;
  hashUtil: HashUtil;

  /**
   * @param config - `removeDuplicates` and `minSignalRatio`; a missing config
   *   behaves like an empty one.
   */
  constructor(config: SemanticFilter['config'] = {}) {
    this.config = config ?? {};
    this.patternMatcher = new PatternMatcher();
    this.hashUtil = new HashUtil();
  }

  /**
   * Filter semantically empty content
   * @param content - Cleaned content
   * @returns Filtered content, paragraphs re-joined with one blank line
   */
  async filter(content: string): Promise<string> {
    const paragraphs = content.split(/\n\n+/);

    const withoutBoilerplate = paragraphs.filter((p) => !this._isBoilerplate(p));
    const unique = await this._removeDuplicates(withoutBoilerplate);

    return unique.filter((p) => this._hasSignal(p)).join('\n\n');
  }

  /** Delegate the boilerplate decision to the pattern matcher. */
  _isBoilerplate(paragraph: string) {
    return this.patternMatcher.isBoilerplate(paragraph);
  }

  /**
   * Keep only the first occurrence of each paragraph, compared by hash.
   * Returns the input unchanged when `config.removeDuplicates` is falsy.
   */
  async _removeDuplicates(paragraphs: string[]): Promise<string[]> {
    if (!this.config.removeDuplicates) return paragraphs;

    const seen = new Set<unknown>();
    return paragraphs.filter((para) => {
      const hash = this.hashUtil.hash(para);
      if (seen.has(hash)) return false;
      seen.add(hash);
      return true;
    });
  }

  /**
   * Decide whether a paragraph has enough letters/digits to be worth keeping.
   *
   * Paragraphs shorter than 10 characters (after trimming) never pass.
   * Otherwise the share of letter and digit characters must reach
   * `config.minSignalRatio` (default 0.3).
   */
  _hasSignal(paragraph: string): boolean {
    const text = paragraph.trim();
    if (text.length < 10) return false;

    // Unicode letters/digits, not just ASCII: otherwise every paragraph in a
    // non-Latin script scores a ratio of 0 and is silently discarded.
    const signalChars = text.replace(/[^\p{L}\p{N}]/gu, '').length;
    const configured = this.config.minSignalRatio;
    // Only fall back when the ratio is absent; an explicit 0 ("keep everything") is honoured.
    const minRatio = configured == null || Number.isNaN(configured) ? 0.3 : configured;

    return signalChars / text.length >= minRatio;
  }
}
@@ -1,13 +1,18 @@
1
1
  /**
2
- * Type definitions for structural-cleaner.js
2
+ * S-MORA Layer 0 Scrubber - Stage 1: Structural Cleaning
3
+ * @module smora/scrubber/stages/structural-cleaner
3
4
  */
4
-
5
- export interface CleanConfig {
6
- preserveStructure?: boolean;
7
- [key: string]: any;
8
- }
9
-
10
- export class StructuralCleaner {
11
- constructor(config?: CleanConfig);
12
- clean(content: string): Promise<string>;
5
export declare class StructuralCleaner {
    /**
     * @param config - Stage configuration; stored on the instance.
     */
    constructor(config: unknown);
    /**
     * Clean document structure
     * @param content - Raw document content
     * @returns Cleaned content
     */
    clean(content: string): Promise<string>;
    /** Classifies the raw content so `clean` can pick the matching cleaning path. */
    _detectType(content: string): "html" | "markdown" | "text";
    /** Runs the content through the HTML parser. */
    _cleanHTML(content: string): Promise<string>;
    /** Inserts missing spaces after heading and list markers. */
    _cleanMarkdown(content: string): Promise<string>;
    /** Collapses runs of spaces/tabs and of three or more newlines. */
    _collapseWhitespace(content: string): string;
    /** Converts CRLF and lone CR line endings to LF. */
    _normalizeLineBreaks(content: string): string;
}
@@ -0,0 +1,73 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 1: Structural Cleaning
4
+ * @module smora/scrubber/stages/structural-cleaner
5
+ */
6
+ import { HTMLParser } from '../utils/html-parser.js';
7
+ import { ScrubberError } from '../errors/scrubber-error.js';
8
/**
 * Stage 1 of the scrubber pipeline: structural cleaning of raw documents.
 *
 * Content is classified as HTML, Markdown or plain text, run through the
 * matching cleaner, and then has its line endings and whitespace normalized.
 */
export class StructuralCleaner {
    /**
     * @param {Object} config - Stage configuration; stored on the instance.
     */
    constructor(config) {
        this.config = config;
        this.htmlParser = new HTMLParser();
    }
    /**
     * Clean document structure
     * @param {string} content - Raw document content
     * @returns {Promise<string>} - Cleaned content
     * @throws {ScrubberError} Wraps any failure raised while cleaning; the
     *   original error is attached as `originalError`.
     */
    async clean(content) {
        try {
            const type = this._detectType(content);
            let cleaned = content;
            if (type === 'html') {
                cleaned = await this._cleanHTML(cleaned);
                // HTML may have markdown headings, normalize them
                cleaned = await this._cleanMarkdown(cleaned);
            }
            else if (type === 'markdown') {
                cleaned = await this._cleanMarkdown(cleaned);
            }
            // Line endings must be unified first: the blank-line collapse below
            // only recognises "\n", so CRLF runs would otherwise slip through.
            cleaned = this._normalizeLineBreaks(cleaned);
            cleaned = this._collapseWhitespace(cleaned);
            return cleaned;
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            throw new ScrubberError(`Failed to clean content: ${message}`, { stage: 'structural-cleaner', originalError: error });
        }
    }
    /**
     * Classify content as HTML (first non-blank character is `<`), Markdown
     * (some line starts with a 1-6 `#` heading marker followed by whitespace or
     * a letter) or plain text.
     * @param {string} content
     * @returns {"html" | "markdown" | "text"}
     */
    _detectType(content) {
        if (content.trim().startsWith('<')) {
            return 'html';
        }
        // Multiline: a document rarely opens with its first heading at offset 0.
        if (/^#{1,6}(?:\s|[A-Za-z])/m.test(content)) {
            return 'markdown';
        }
        return 'text';
    }
    /**
     * Run the content through the HTML parser.
     * @param {string} content
     */
    async _cleanHTML(content) {
        return this.htmlParser.parse(content);
    }
    /**
     * Insert the missing space after heading and list markers.
     *
     * Markers are only recognised at the start of a line (after optional
     * indentation), so inline `#`, hyphenated words, numbers and decimals in
     * running text are not rewritten.
     * @param {string} content
     * @returns {Promise<string>}
     */
    async _cleanMarkdown(content) {
        let cleaned = content;
        // Add space after heading markers when missing
        cleaned = cleaned.replace(/^([ \t]*#{1,6})(?=[^\s#])/gm, '$1 ');
        // Add space after list markers when missing. The lookahead skips "-5",
        // "---", "**bold**" and "->", none of which are list items.
        cleaned = cleaned.replace(/^([ \t]*)([-*+])(?=[^\s\d*+>-])(.*)$/gm, (line, indent, marker, rest) => {
            // "*emphasis* ..." opens inline emphasis, not a bullet.
            if (marker === '*' && rest.includes('*')) {
                return line;
            }
            return `${indent}${marker} ${rest}`;
        });
        // Add space after numbered list markers ("1." / "1)") when missing;
        // a following digit ("1.5") means a decimal, not a marker.
        cleaned = cleaned.replace(/^([ \t]*\d+[.)])(?=[^\s\d])/gm, '$1 ');
        return cleaned;
    }
    /**
     * Collapse runs of spaces/tabs to one space and 3+ newlines to one blank line.
     * @param {string} content
     * @returns {string}
     */
    _collapseWhitespace(content) {
        let cleaned = content.replace(/[ \t]+/g, ' ');
        cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
        return cleaned;
    }
    /**
     * Convert CRLF and lone CR line endings to LF.
     * @param {string} content
     * @returns {string}
     */
    _normalizeLineBreaks(content) {
        return content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
    }
}
@@ -0,0 +1,83 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 1: Structural Cleaning
4
+ * @module smora/scrubber/stages/structural-cleaner
5
+ */
6
+
7
+ import { HTMLParser } from '../utils/html-parser.js';
8
+ import { StructuralCleaningError, ScrubberError } from '../errors/scrubber-error.js';
9
+
10
/**
 * Stage 1 of the scrubber pipeline: structural cleaning of raw documents.
 *
 * Content is classified as HTML, Markdown or plain text, run through the
 * matching cleaner, and then has its line endings and whitespace normalized.
 */
export class StructuralCleaner {
  config: unknown;
  htmlParser: HTMLParser;

  /**
   * @param config - Stage configuration; stored on the instance.
   */
  constructor(config: unknown) {
    this.config = config;
    this.htmlParser = new HTMLParser();
  }

  /**
   * Clean document structure
   * @param content - Raw document content
   * @returns Cleaned content
   * @throws ScrubberError wrapping any failure raised while cleaning; the
   *   original error is attached as `originalError`.
   */
  async clean(content: string): Promise<string> {
    try {
      const type = this._detectType(content);
      let cleaned = content;

      if (type === 'html') {
        cleaned = await this._cleanHTML(cleaned);
        // HTML may have markdown headings, normalize them
        cleaned = await this._cleanMarkdown(cleaned);
      } else if (type === 'markdown') {
        cleaned = await this._cleanMarkdown(cleaned);
      }

      // Line endings must be unified first: the blank-line collapse below only
      // recognises "\n", so CRLF runs would otherwise slip through.
      cleaned = this._normalizeLineBreaks(cleaned);
      cleaned = this._collapseWhitespace(cleaned);

      return cleaned;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      throw new ScrubberError(
        `Failed to clean content: ${message}`,
        { stage: 'structural-cleaner', originalError: error }
      );
    }
  }

  /**
   * Classify content as HTML (first non-blank character is `<`), Markdown
   * (some line starts with a 1-6 `#` heading marker followed by whitespace or
   * a letter) or plain text.
   */
  _detectType(content: string): 'html' | 'markdown' | 'text' {
    if (content.trim().startsWith('<')) return 'html';
    // Multiline: a document rarely opens with its first heading at offset 0.
    if (/^#{1,6}(?:\s|[A-Za-z])/m.test(content)) return 'markdown';
    return 'text';
  }

  /** Run the content through the HTML parser. */
  async _cleanHTML(content: string): Promise<string> {
    return this.htmlParser.parse(content);
  }

  /**
   * Insert the missing space after heading and list markers.
   *
   * Markers are only recognised at the start of a line (after optional
   * indentation), so inline `#`, hyphenated words, numbers and decimals in
   * running text are not rewritten.
   */
  async _cleanMarkdown(content: string): Promise<string> {
    let cleaned = content;
    // Add space after heading markers when missing
    cleaned = cleaned.replace(/^([ \t]*#{1,6})(?=[^\s#])/gm, '$1 ');
    // Add space after list markers when missing. The lookahead skips "-5",
    // "---", "**bold**" and "->", none of which are list items.
    cleaned = cleaned.replace(
      /^([ \t]*)([-*+])(?=[^\s\d*+>-])(.*)$/gm,
      (line: string, indent: string, marker: string, rest: string) => {
        // "*emphasis* ..." opens inline emphasis, not a bullet.
        if (marker === '*' && rest.includes('*')) {
          return line;
        }
        return `${indent}${marker} ${rest}`;
      },
    );
    // Add space after numbered list markers ("1." / "1)") when missing;
    // a following digit ("1.5") means a decimal, not a marker.
    cleaned = cleaned.replace(/^([ \t]*\d+[.)])(?=[^\s\d])/gm, '$1 ');
    return cleaned;
  }

  /** Collapse runs of spaces/tabs to one space and 3+ newlines to one blank line. */
  _collapseWhitespace(content: string): string {
    let cleaned = content.replace(/[ \t]+/g, ' ');
    cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
    return cleaned;
  }

  /** Convert CRLF and lone CR line endings to LF. */
  _normalizeLineBreaks(content: string): string {
    return content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  }
}
@@ -1,18 +1,17 @@
1
1
  /**
2
- * Type definitions for validator.js
2
+ * S-MORA Layer 0 Scrubber - Stage 6: Validation
3
+ * @module smora/scrubber/stages/validator
3
4
  */
4
-
5
- export interface ValidatorConfig {
6
- strict?: boolean;
7
- [key: string]: any;
8
- }
9
-
10
- export interface ValidationResult {
11
- valid: boolean;
12
- errors?: string[];
13
- }
14
-
15
- export class Validator {
16
- constructor(config?: ValidatorConfig);
17
- validate(content: string): Promise<ValidationResult>;
5
export declare class Validator {
    /**
     * @param config - `rejectEmptyChunks` rejects whitespace-only chunks;
     *   `enforceMinLength`/`minTokens` and `enforceMaxLength`/`hardMaxTokens`
     *   bound the token count of each chunk.
     */
    constructor(config: {
        rejectEmptyChunks?: boolean;
        enforceMinLength?: boolean;
        minTokens?: number;
        enforceMaxLength?: boolean;
        hardMaxTokens?: number;
        [key: string]: any;
    });
    /**
     * Validate chunks
     * @param chunks - Chunks to check
     * @returns The chunks that passed every enabled check, in input order
     */
    validate<T extends { text: string; index?: unknown }>(chunks: Iterable<T>): Promise<T[]>;
    /** Runs the enabled checks on one chunk and lists the reasons it failed, if any. */
    _validateChunk(chunk: { text: string }): {
        valid: boolean;
        errors: string[];
    };
}
@@ -0,0 +1,56 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 6: Validation
4
+ * @module smora/scrubber/stages/validator
5
+ */
6
+ import { TokenCounter } from '../utils/token-counter.js';
7
/**
 * Stage 6 of the scrubber pipeline: filters out chunks that fail validation.
 *
 * Checks are enabled through config flags: `rejectEmptyChunks`,
 * `enforceMinLength` (with `minTokens`) and `enforceMaxLength`
 * (with `hardMaxTokens`).
 */
export class Validator {
    /**
     * @param {Object} config - Validation flags and token limits.
     */
    constructor(config) {
        this.config = config;
        this.tokenCounter = new TokenCounter();
    }
    /**
     * Validate chunks
     *
     * Rejected chunks are left out of the result; why each one was rejected is
     * recorded on `this.lastErrors` (replaced on every call) as
     * `{ chunkIndex, errors }` entries.
     * @param {Array} chunks - Array of chunks
     * @returns {Promise<Array>} - Validated chunks
     */
    async validate(chunks) {
        const valid = [];
        const errors = [];
        let position = 0;
        for (const chunk of chunks) {
            const validation = this._validateChunk(chunk);
            if (validation.valid) {
                valid.push(chunk);
            }
            else {
                // Fall back to the chunk's position when it carries no index of its own.
                const hasIndex = chunk != null && chunk.index !== undefined;
                errors.push({
                    chunkIndex: hasIndex ? chunk.index : position,
                    errors: validation.errors
                });
            }
            position += 1;
        }
        // Expose the diagnostics instead of discarding them with the local variable.
        this.lastErrors = errors;
        return valid;
    }
    /**
     * Run every enabled check against one chunk.
     *
     * A chunk whose `text` is missing or not a string is treated as empty text
     * rather than causing an exception.
     * @param {Object} chunk - Chunk with a `text` property
     * @returns {{valid: boolean, errors: string[]}} `valid` is true when no check failed
     */
    _validateChunk(chunk) {
        const errors = [];
        const text = chunk != null && typeof chunk.text === 'string' ? chunk.text : '';
        if (this.config.rejectEmptyChunks && !text.trim()) {
            errors.push('empty_chunk');
        }
        if (this.config.enforceMinLength || this.config.enforceMaxLength) {
            // Count once; both length checks use the same token total.
            const tokens = this.tokenCounter.count(text);
            if (this.config.enforceMinLength && tokens < this.config.minTokens) {
                errors.push(`chunk_too_short: ${tokens} < ${this.config.minTokens}`);
            }
            if (this.config.enforceMaxLength && tokens > this.config.hardMaxTokens) {
                errors.push(`chunk_too_long: ${tokens} > ${this.config.hardMaxTokens}`);
            }
        }
        return {
            valid: errors.length === 0,
            errors
        };
    }
}