@yamo/memory-mesh 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/README.md +9 -3
  2. package/bin/memory_mesh.js +95 -8
  3. package/lib/llm/client.d.ts +23 -48
  4. package/lib/llm/client.js +1 -0
  5. package/lib/llm/client.ts +298 -377
  6. package/lib/llm/index.js +1 -0
  7. package/lib/llm/index.ts +1 -2
  8. package/lib/memory/adapters/client.d.ts +22 -85
  9. package/lib/memory/adapters/client.js +1 -0
  10. package/lib/memory/adapters/client.ts +474 -633
  11. package/lib/memory/adapters/config.d.ts +82 -89
  12. package/lib/memory/adapters/config.js +1 -0
  13. package/lib/memory/adapters/config.ts +156 -225
  14. package/lib/memory/adapters/errors.d.ts +28 -20
  15. package/lib/memory/adapters/errors.js +1 -0
  16. package/lib/memory/adapters/errors.ts +83 -120
  17. package/lib/memory/context-manager.d.ts +15 -18
  18. package/lib/memory/context-manager.js +1 -0
  19. package/lib/memory/context-manager.ts +314 -401
  20. package/lib/memory/embeddings/factory.d.ts +18 -20
  21. package/lib/memory/embeddings/factory.js +1 -0
  22. package/lib/memory/embeddings/factory.ts +130 -173
  23. package/lib/memory/embeddings/index.js +1 -0
  24. package/lib/memory/embeddings/index.ts +1 -0
  25. package/lib/memory/embeddings/service.d.ts +36 -66
  26. package/lib/memory/embeddings/service.js +1 -0
  27. package/lib/memory/embeddings/service.ts +479 -616
  28. package/lib/memory/index.d.ts +2 -2
  29. package/lib/memory/index.js +1 -0
  30. package/lib/memory/index.ts +3 -13
  31. package/lib/memory/memory-mesh.d.ts +151 -93
  32. package/lib/memory/memory-mesh.js +1 -0
  33. package/lib/memory/memory-mesh.ts +1406 -1692
  34. package/lib/memory/memory-translator.d.ts +1 -6
  35. package/lib/memory/memory-translator.js +1 -0
  36. package/lib/memory/memory-translator.ts +96 -128
  37. package/lib/memory/schema.d.ts +29 -10
  38. package/lib/memory/schema.js +1 -0
  39. package/lib/memory/schema.ts +102 -185
  40. package/lib/memory/scorer.d.ts +3 -4
  41. package/lib/memory/scorer.js +1 -0
  42. package/lib/memory/scorer.ts +69 -86
  43. package/lib/memory/search/index.js +1 -0
  44. package/lib/memory/search/index.ts +1 -0
  45. package/lib/memory/search/keyword-search.d.ts +10 -26
  46. package/lib/memory/search/keyword-search.js +1 -0
  47. package/lib/memory/search/keyword-search.ts +123 -161
  48. package/lib/scrubber/config/defaults.d.ts +39 -46
  49. package/lib/scrubber/config/defaults.js +1 -0
  50. package/lib/scrubber/config/defaults.ts +50 -112
  51. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  52. package/lib/scrubber/errors/scrubber-error.js +39 -0
  53. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  54. package/lib/scrubber/index.d.ts +0 -1
  55. package/lib/scrubber/index.js +1 -0
  56. package/lib/scrubber/index.ts +1 -2
  57. package/lib/scrubber/scrubber.d.ts +14 -31
  58. package/lib/scrubber/scrubber.js +1 -0
  59. package/lib/scrubber/scrubber.ts +93 -152
  60. package/lib/scrubber/stages/chunker.d.ts +22 -10
  61. package/lib/scrubber/stages/chunker.js +86 -0
  62. package/lib/scrubber/stages/chunker.ts +104 -0
  63. package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
  64. package/lib/scrubber/stages/metadata-annotator.js +64 -0
  65. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  66. package/lib/scrubber/stages/normalizer.d.ts +13 -10
  67. package/lib/scrubber/stages/normalizer.js +51 -0
  68. package/lib/scrubber/stages/normalizer.ts +60 -0
  69. package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
  70. package/lib/scrubber/stages/semantic-filter.js +51 -0
  71. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  72. package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
  73. package/lib/scrubber/stages/structural-cleaner.js +73 -0
  74. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  75. package/lib/scrubber/stages/validator.d.ts +14 -15
  76. package/lib/scrubber/stages/validator.js +56 -0
  77. package/lib/scrubber/stages/validator.ts +67 -0
  78. package/lib/scrubber/telemetry.d.ts +20 -27
  79. package/lib/scrubber/telemetry.js +1 -0
  80. package/lib/scrubber/telemetry.ts +53 -90
  81. package/lib/scrubber/utils/hash.d.ts +14 -0
  82. package/lib/scrubber/utils/hash.js +37 -0
  83. package/lib/scrubber/utils/hash.ts +40 -0
  84. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  85. package/lib/scrubber/utils/html-parser.js +38 -0
  86. package/lib/scrubber/utils/html-parser.ts +46 -0
  87. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  88. package/lib/scrubber/utils/pattern-matcher.js +54 -0
  89. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  90. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  91. package/lib/scrubber/utils/token-counter.js +30 -0
  92. package/lib/scrubber/utils/token-counter.ts +32 -0
  93. package/lib/utils/logger.d.ts +1 -11
  94. package/lib/utils/logger.js +1 -0
  95. package/lib/utils/logger.ts +43 -63
  96. package/lib/utils/skill-metadata.d.ts +6 -14
  97. package/lib/utils/skill-metadata.js +1 -0
  98. package/lib/utils/skill-metadata.ts +89 -103
  99. package/lib/yamo/emitter.d.ts +8 -35
  100. package/lib/yamo/emitter.js +1 -0
  101. package/lib/yamo/emitter.ts +77 -155
  102. package/lib/yamo/index.d.ts +14 -0
  103. package/lib/yamo/index.js +14 -0
  104. package/lib/yamo/index.ts +16 -0
  105. package/lib/yamo/schema.d.ts +8 -10
  106. package/lib/yamo/schema.js +1 -0
  107. package/lib/yamo/schema.ts +82 -114
  108. package/package.json +5 -2
@@ -0,0 +1,67 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 6: Validation
4
+ * @module smora/scrubber/stages/validator
5
+ */
6
+
7
+ import { TokenCounter } from '../utils/token-counter.js';
8
+ import { ValidationError } from '../errors/scrubber-error.js';
9
+
10
/** Validation switches and limits read by {@link Validator}. */
interface ValidatorConfig {
  rejectEmptyChunks?: boolean;
  enforceMinLength?: boolean;
  minTokens?: number;
  enforceMaxLength?: boolean;
  hardMaxTokens?: number;
}

/** Minimal chunk shape the validator reads. */
interface ValidatableChunk {
  text: string;
  index?: number;
}

/** Why a single chunk was dropped by {@link Validator.validate}. */
interface ChunkRejection {
  chunkIndex: number | undefined;
  errors: string[];
}

/**
 * Stage 6 of the scrubber pipeline: keeps only the chunks that satisfy the
 * emptiness and token-length rules enabled in the configuration.
 */
export class Validator {
  config: ValidatorConfig;
  tokenCounter: TokenCounter;

  /**
   * Rejections recorded by the most recent `validate()` call. Previously this
   * information was collected and then discarded, which made dropped chunks
   * impossible to diagnose.
   */
  lastErrors: ChunkRejection[] = [];

  /**
   * @param config - Validation switches and limits.
   */
  constructor(config: ValidatorConfig) {
    this.config = config;
    this.tokenCounter = new TokenCounter();
  }

  /**
   * Validate chunks.
   *
   * Invalid chunks are filtered out rather than raising; the reasons are
   * available afterwards through `lastErrors`.
   *
   * @param chunks - Chunks to validate.
   * @returns The chunks that passed every enabled rule, in input order.
   */
  async validate<T extends ValidatableChunk>(chunks: T[]): Promise<T[]> {
    const valid: T[] = [];
    const errors: ChunkRejection[] = [];

    for (const chunk of chunks) {
      const validation = this._validateChunk(chunk);

      if (validation.valid) {
        valid.push(chunk);
      } else {
        errors.push({
          chunkIndex: chunk.index,
          errors: validation.errors,
        });
      }
    }

    this.lastErrors = errors;
    return valid;
  }

  /**
   * Apply every enabled rule to one chunk.
   *
   * @param chunk - Chunk to check.
   * @returns `valid` plus the list of rule violations (empty when valid).
   */
  _validateChunk(chunk: ValidatableChunk): { valid: boolean; errors: string[] } {
    const errors: string[] = [];
    const {
      rejectEmptyChunks,
      enforceMinLength,
      minTokens,
      enforceMaxLength,
      hardMaxTokens,
    } = this.config;

    // A missing or non-string text is treated as empty instead of throwing.
    const text = typeof chunk.text === 'string' ? chunk.text : '';

    if (rejectEmptyChunks && !text.trim()) {
      errors.push('empty_chunk');
    }

    if (enforceMinLength || enforceMaxLength) {
      // Count once and reuse the result for both length rules.
      const tokens = this.tokenCounter.count(text);

      if (enforceMinLength && minTokens != null && tokens < minTokens) {
        errors.push(`chunk_too_short: ${tokens} < ${minTokens}`);
      }

      if (enforceMaxLength && hardMaxTokens != null && tokens > hardMaxTokens) {
        errors.push(`chunk_too_long: ${tokens} > ${hardMaxTokens}`);
      }
    }

    return {
      valid: errors.length === 0,
      errors,
    };
  }
}
@@ -2,35 +2,28 @@
2
2
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
3
  * @module smora/scrubber/telemetry
4
4
  */
5
/** Raw counters accumulated for one pipeline stage. */
export interface StageStats {
    count: number;
    totalTime: number;
    errors: number;
}
/** Per-stage counters plus the derived average duration. */
export interface StageSummary {
    count: number;
    avgTime: number;
    totalTime: number;
    errors: number;
}
/** Snapshot returned by `ScrubberTelemetry.getSummary()`. */
export interface TelemetrySummary {
    stages: Record<string, StageStats>;
    performance: {
        structural: number;
        semantic: number;
        normalization: number;
        chunking: number;
        metadata: number;
        validation: number;
        total: number;
    };
}
/** Collects per-stage timing and error counters for the scrubber pipeline. */
export declare class ScrubberTelemetry {
    stats: Record<string, StageStats>;
    constructor();
    /** Record one run of `stage`; `success` defaults to `true`. */
    recordStage(stage: string, duration: number, success?: boolean): void;
    /** Counters for `stage`; all zeros when the stage was never recorded. */
    getStageStats(stage: string): StageSummary;
    /** All stage counters plus total time per built-in stage and overall. */
    getSummary(): TelemetrySummary;
    /** Zero the counters of every known stage. */
    reset(): void;
    /** Throw when the total recorded time exceeds `budget` (default 10). */
    assertPerformanceBudget(budget?: number): void;
}
@@ -1,3 +1,4 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
4
  * @module smora/scrubber/telemetry
@@ -1,99 +1,62 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
4
  * @module smora/scrubber/telemetry
4
5
  */
5
-
6
/** Raw counters accumulated for one pipeline stage. */
export interface StageStats {
  count: number;
  totalTime: number;
  errors: number;
}

/** Per-stage counters plus the derived average duration. */
export interface StageSummary {
  count: number;
  avgTime: number;
  totalTime: number;
  errors: number;
}

/** Snapshot returned by {@link ScrubberTelemetry.getSummary}. */
export interface TelemetrySummary {
  stages: Record<string, StageStats>;
  performance: {
    structural: number;
    semantic: number;
    normalization: number;
    chunking: number;
    metadata: number;
    validation: number;
    total: number;
  };
}

/** Build a fresh, zeroed counter record. */
function emptyStageStats(): StageStats {
  return { count: 0, totalTime: 0, errors: 0 };
}

/**
 * Look up a stage strictly among the object's own properties, so that names
 * such as "toString" or "constructor" never resolve to inherited members.
 */
function ownStageStats(
  stats: Record<string, StageStats>,
  stage: string,
): StageStats | undefined {
  if (!Object.prototype.hasOwnProperty.call(stats, stage)) {
    return undefined;
  }
  return stats[stage] || undefined;
}

/** Collects per-stage timing and error counters for the scrubber pipeline. */
export class ScrubberTelemetry {
  stats: Record<string, StageStats>;

  constructor() {
    this.stats = {
      structural: emptyStageStats(),
      semantic: emptyStageStats(),
      normalization: emptyStageStats(),
      chunking: emptyStageStats(),
      metadata: emptyStageStats(),
      validation: emptyStageStats(),
    };
  }

  /**
   * Record one run of a stage.
   *
   * @param stage - Stage name; unknown names are registered on first use.
   * @param duration - Time spent in the stage, added to the stage total.
   * @param success - Pass `false` to also count the run as an error.
   */
  recordStage(stage: string, duration: number, success: boolean = true): void {
    let entry = ownStageStats(this.stats, stage);
    if (!entry) {
      entry = emptyStageStats();
      // defineProperty always creates an own data property, even for keys
      // like "__proto__" where plain assignment would do something else.
      Object.defineProperty(this.stats, stage, {
        value: entry,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }

    entry.count++;
    entry.totalTime += duration;
    if (!success) {
      entry.errors++;
    }
  }

  /**
   * @param stage - Stage name.
   * @returns The stage counters with the average duration; all zeros when
   *   the stage has never been recorded.
   */
  getStageStats(stage: string): StageSummary {
    const stats = ownStageStats(this.stats, stage) ?? emptyStageStats();
    return {
      count: stats.count,
      avgTime: stats.count > 0 ? stats.totalTime / stats.count : 0,
      totalTime: stats.totalTime,
      errors: stats.errors,
    };
  }

  /**
   * @returns Every stage's counters, the total time of each built-in stage,
   *   and the total time across all stages (custom ones included).
   */
  getSummary(): TelemetrySummary {
    const timeOf = (stage: string): number => this.getStageStats(stage).totalTime;
    return {
      stages: this.stats,
      performance: {
        structural: timeOf('structural'),
        semantic: timeOf('semantic'),
        normalization: timeOf('normalization'),
        chunking: timeOf('chunking'),
        metadata: timeOf('metadata'),
        validation: timeOf('validation'),
        total: Object.values(this.stats).reduce((sum, s) => sum + s.totalTime, 0),
      },
    };
  }

  /** Zero the counters of every known stage, keeping the stage names. */
  reset(): void {
    Object.keys(this.stats).forEach((key) => {
      this.stats[key] = emptyStageStats();
    });
  }

  /**
   * @param budget - Maximum allowed total time across all stages.
   * @throws Error when the recorded total exceeds `budget`.
   */
  assertPerformanceBudget(budget: number = 10): void {
    const summary = this.getSummary();
    if (summary.performance.total > budget) {
      throw new Error(
        `Performance budget exceeded: ${summary.performance.total}ms > ${budget}ms`,
      );
    }
  }
}
@@ -0,0 +1,14 @@
1
/** Content hashing helpers used for deduplication and caching. */
export declare class HashUtil {
    /**
     * Hash content for deduplication.
     * @param content - Content to hash.
     * @returns Hex-encoded SHA-256 digest.
     */
    hash(content: string): string;
    /**
     * Fast hash for caching (non-cryptographic).
     * @param content - Content to hash.
     * @returns Base-36 string of a 32-bit rolling hash.
     */
    fastHash(content: string): string;
}
@@ -0,0 +1,37 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Content Hashing Utilities
4
+ * @module smora/scrubber/utils/hash
5
+ */
6
+ import crypto from 'crypto';
7
export class HashUtil {
    /**
     * Produce a deduplication key: the text is lower-cased, trimmed and has
     * each whitespace run collapsed to one space before being digested.
     * @param {string} content - Content to hash
     * @returns {string} - Hex-encoded SHA-256 digest
     */
    hash(content) {
        const canonical = content.toLowerCase().trim().replace(/\s+/g, ' ');
        const hasher = crypto.createHash('sha256');
        hasher.update(canonical);
        return hasher.digest('hex');
    }
    /**
     * Cheap non-cryptographic hash for cache keys.
     * @param {string} content - Content to hash
     * @returns {string} - Base-36 rendering of a signed 32-bit rolling hash
     */
    fastHash(content) {
        let acc = 0;
        for (let pos = 0; pos < content.length; pos += 1) {
            // acc * 31 + code unit, wrapped to a signed 32-bit integer.
            acc = (Math.imul(acc, 31) + content.charCodeAt(pos)) | 0;
        }
        return acc.toString(36);
    }
}
@@ -0,0 +1,40 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Content Hashing Utilities
4
+ * @module smora/scrubber/utils/hash
5
+ */
6
+ import crypto from 'crypto';
7
+
8
/** Content hashing helpers used for deduplication and caching. */
export class HashUtil {
  /**
   * Hash content for deduplication.
   *
   * The text is lower-cased, trimmed and has each whitespace run collapsed to
   * a single space first, so inputs differing only in case or spacing share
   * a digest.
   *
   * @param content - Content to hash.
   * @returns Hex-encoded SHA-256 digest of the normalized text.
   */
  hash(content: string): string {
    const normalized = content
      .toLowerCase()
      .trim()
      .replace(/\s+/g, ' ');

    return crypto
      .createHash('sha256')
      .update(normalized)
      .digest('hex');
  }

  /**
   * Fast hash for caching (non-cryptographic).
   *
   * @param content - Content to hash.
   * @returns Base-36 rendering of a signed 32-bit rolling hash; the string
   *   starts with "-" when the hash is negative.
   */
  fastHash(content: string): string {
    let hash = 0;
    for (let i = 0; i < content.length; i++) {
      const char = content.charCodeAt(i);
      // hash * 31 + char, truncated to a signed 32-bit integer.
      hash = (((hash << 5) - hash) + char) | 0;
    }
    return hash.toString(36);
  }
}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * HTML Parsing Utilities
3
+ * @module smora/scrubber/utils/html-parser
4
+ */
5
/** Regex-based HTML to text/markdown extraction helpers. */
export declare class HTMLParser {
    /**
     * Extract text content from HTML.
     * @param html - HTML content.
     * @returns Extracted text.
     */
    parse(html: string): string;
    /** Strip scripts, styles and comments, convert headings, paragraphs and list items, then drop remaining tags. */
    _extractText(html: string): string;
    /** Remove every `<...>` tag from the input. */
    _stripTags(html: string): string;
}
@@ -0,0 +1,38 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * HTML Parsing Utilities
4
+ * @module smora/scrubber/utils/html-parser
5
+ */
6
/**
 * Regex-based HTML to text/markdown extraction.
 */
export class HTMLParser {
    /**
     * Extract text content from HTML
     * @param {string} html - HTML content
     * @returns {string} - Extracted text
     */
    parse(html) {
        return this._extractText(html);
    }
    /**
     * Strip scripts, styles and comments, turn headings, paragraphs and list
     * items into markdown-like text, then drop every remaining tag.
     * @param {string} html - HTML content
     * @returns {string} - Extracted text
     */
    _extractText(html) {
        // Remove scripts, styles, and comments
        let text = html;
        text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
        text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
        text = text.replace(/<!--[\s\S]*?-->/g, '');
        // Convert headings to markdown. [\s\S] lets the content span lines
        // ("." would stop at a newline and skip multi-line elements).
        text = text.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_match, level, content) => {
            const hashes = '#'.repeat(Number(level));
            // A markdown heading must stay on one line.
            const title = this._stripTags(content).replace(/\s+/g, ' ').trim();
            return `${hashes} ${title}\n\n`;
        });
        // Convert paragraphs; \b keeps <pre>, <param>, ... from matching as <p>.
        text = text.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n');
        // Convert lists; \b keeps <link> from matching as <li>.
        text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');
        // Remove remaining tags
        text = text.replace(/<[^>]+>/g, '');
        return text;
    }
    /**
     * Remove every `<...>` tag from the input.
     * @param {string} html - HTML fragment
     * @returns {string} - Fragment without tags
     */
    _stripTags(html) {
        return html.replace(/<[^>]+>/g, '');
    }
}
@@ -0,0 +1,46 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * HTML Parsing Utilities
4
+ * @module smora/scrubber/utils/html-parser
5
+ */
6
+
7
/** Regex-based HTML to text/markdown extraction. */
export class HTMLParser {
  /**
   * Extract text content from HTML.
   *
   * @param html - HTML content.
   * @returns Extracted text.
   */
  parse(html: string): string {
    return this._extractText(html);
  }

  /**
   * Strip scripts, styles and comments, turn headings, paragraphs and list
   * items into markdown-like text, then drop every remaining tag.
   *
   * @param html - HTML content.
   * @returns Extracted text.
   */
  _extractText(html: string): string {
    // Remove scripts, styles, and comments
    let text = html;
    text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
    text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
    text = text.replace(/<!--[\s\S]*?-->/g, '');

    // Convert headings to markdown. [\s\S] lets the content span lines
    // ("." would stop at a newline and skip multi-line elements).
    text = text.replace(
      /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi,
      (_match: string, level: string, content: string) => {
        const hashes = '#'.repeat(Number(level));
        // A markdown heading must stay on one line.
        const title = this._stripTags(content).replace(/\s+/g, ' ').trim();
        return `${hashes} ${title}\n\n`;
      },
    );

    // Convert paragraphs; \b keeps <pre>, <param>, ... from matching as <p>.
    text = text.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n');

    // Convert lists; \b keeps <link> from matching as <li>.
    text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');

    // Remove remaining tags
    text = text.replace(/<[^>]+>/g, '');

    return text;
  }

  /**
   * Remove every `<...>` tag from the input.
   *
   * @param html - HTML fragment.
   * @returns The fragment without tags.
   */
  _stripTags(html: string): string {
    return html.replace(/<[^>]+>/g, '');
  }
}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Boilerplate Pattern Matching Utilities
3
+ * @module smora/scrubber/utils/pattern-matcher
4
+ */
5
/** Matches text against a mutable list of boilerplate patterns. */
export declare class PatternMatcher {
    constructor();
    /** Build the default pattern list (regular expressions and plain substrings). */
    _loadDefaultPatterns(): (string | RegExp)[];
    /** The live pattern list; it is not copied. */
    getBoilerplatePatterns(): (string | RegExp)[];
    /** Append a pattern to the list. */
    addPattern(pattern: string | RegExp): void;
    /** Remove the pattern at `index`; out-of-range indexes are ignored. */
    removePattern(index: number): void;
    /** Whether the lower-cased, trimmed text matches any pattern. */
    isBoilerplate(text: string): boolean;
}
@@ -0,0 +1,54 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Boilerplate Pattern Matching Utilities
4
+ * @module smora/scrubber/utils/pattern-matcher
5
+ */
6
/**
 * Matches text against a mutable list of boilerplate patterns.
 */
export class PatternMatcher {
    boilerplatePatterns;
    constructor() {
        this.boilerplatePatterns = this._loadDefaultPatterns();
    }
    /**
     * Build the default pattern list.
     * @returns {Array<string|RegExp>} - Regular expressions and plain substrings
     */
    _loadDefaultPatterns() {
        return [
            // Legal/Footer
            /©\s*\d{4}/i,
            /all rights reserved/i,
            /copyright\s+\d{4}/i,
            // Navigation
            /^home\s*\|/i,
            // NOTE(review): "^" binds only to the first alternative, so
            // "menu:" matches anywhere in the text - confirm this is intended.
            /^navigation\s*:|menu\s*:/i,
            /sidebar/i,
            // Meta
            /^last\s+updated?\s*:/i,
            /cookie\s+policy/i,
            /privacy\s+policy/i,
            // Auto-generated
            /^table\s+of\s+contents?$/i,
            /^contents\s*$/i,
            /jump\s+to\s+(section|navigation)/i,
            // Strings
            'home | docs | contact',
            'skip to main content',
            'this site uses cookies'
        ];
    }
    /**
     * @returns {Array<string|RegExp>} - The live pattern list (not a copy)
     */
    getBoilerplatePatterns() {
        return this.boilerplatePatterns;
    }
    /**
     * Append a pattern to the list.
     * @param {string|RegExp} pattern - Substring or regular expression
     */
    addPattern(pattern) {
        this.boilerplatePatterns.push(pattern);
    }
    /**
     * Remove the pattern at the given position; out-of-range indexes are ignored.
     * @param {number} index - Position in the pattern list
     */
    removePattern(index) {
        if (index >= 0 && index < this.boilerplatePatterns.length) {
            this.boilerplatePatterns.splice(index, 1);
        }
    }
    /**
     * Check whether text matches any boilerplate pattern.
     * @param {string} text - Text to check
     * @returns {boolean} - True when at least one pattern matches
     */
    isBoilerplate(text) {
        const lowerText = text.toLowerCase().trim();
        return this.boilerplatePatterns.some(pattern => {
            if (pattern instanceof RegExp) {
                // Global/sticky expressions remember lastIndex between calls,
                // which made repeated checks alternate between true and false.
                pattern.lastIndex = 0;
                return pattern.test(lowerText);
            }
            // The text is lower-cased, so the substring has to be as well.
            return lowerText.includes(String(pattern).toLowerCase());
        });
    }
}
@@ -0,0 +1,64 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Boilerplate Pattern Matching Utilities
4
+ * @module smora/scrubber/utils/pattern-matcher
5
+ */
6
+
7
/** Matches text against a mutable list of boilerplate patterns. */
export class PatternMatcher {
  boilerplatePatterns: (string | RegExp)[];

  constructor() {
    this.boilerplatePatterns = this._loadDefaultPatterns();
  }

  /**
   * Build the default pattern list.
   *
   * @returns Regular expressions and plain substrings.
   */
  _loadDefaultPatterns(): (string | RegExp)[] {
    return [
      // Legal/Footer
      /©\s*\d{4}/i,
      /all rights reserved/i,
      /copyright\s+\d{4}/i,

      // Navigation
      /^home\s*\|/i,
      // NOTE(review): "^" binds only to the first alternative, so "menu:"
      // matches anywhere in the text - confirm this is intended.
      /^navigation\s*:|menu\s*:/i,
      /sidebar/i,

      // Meta
      /^last\s+updated?\s*:/i,
      /cookie\s+policy/i,
      /privacy\s+policy/i,

      // Auto-generated
      /^table\s+of\s+contents?$/i,
      /^contents\s*$/i,
      /jump\s+to\s+(section|navigation)/i,

      // Strings
      'home | docs | contact',
      'skip to main content',
      'this site uses cookies',
    ];
  }

  /** @returns The live pattern list (not a copy). */
  getBoilerplatePatterns(): (string | RegExp)[] {
    return this.boilerplatePatterns;
  }

  /**
   * Append a pattern to the list.
   *
   * @param pattern - Substring or regular expression.
   */
  addPattern(pattern: string | RegExp): void {
    this.boilerplatePatterns.push(pattern);
  }

  /**
   * Remove the pattern at the given position; out-of-range indexes are ignored.
   *
   * @param index - Position in the pattern list.
   */
  removePattern(index: number): void {
    if (index >= 0 && index < this.boilerplatePatterns.length) {
      this.boilerplatePatterns.splice(index, 1);
    }
  }

  /**
   * Check whether text matches any boilerplate pattern.
   *
   * @param text - Text to check; it is lower-cased and trimmed first.
   * @returns `true` when at least one pattern matches.
   */
  isBoilerplate(text: string): boolean {
    const lowerText = text.toLowerCase().trim();
    return this.boilerplatePatterns.some((pattern) => {
      if (pattern instanceof RegExp) {
        // Global/sticky expressions remember lastIndex between calls, which
        // made repeated checks alternate between true and false.
        pattern.lastIndex = 0;
        return pattern.test(lowerText);
      }
      // The text is lower-cased, so the substring has to be as well.
      return lowerText.includes(String(pattern).toLowerCase());
    });
  }
}
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Token Counting Utilities
3
+ * @module smora/scrubber/utils/token-counter
4
+ */
5
/** Token count estimators. */
export declare class TokenCounter {
    /**
     * Estimate token count (approximation).
     * @param text - Text to count.
     * @returns Estimated token count.
     */
    count(text: string): number;
    /**
     * More accurate token count (slower).
     * @param text - Text to count.
     * @returns Word count plus punctuation-mark count.
     */
    countAccurate(text: string): number;
}
@@ -0,0 +1,30 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Token Counting Utilities
4
+ * @module smora/scrubber/utils/token-counter
5
+ */
6
export class TokenCounter {
    /**
     * Rough token estimate: one token per four characters, rounded up.
     * @param {string} text - Text to count
     * @returns {number} - Estimated token count
     */
    count(text) {
        const charsPerToken = 4;
        return Math.ceil(text.length / charsPerToken);
    }
    /**
     * Slower estimate: whitespace-separated words plus one token for each
     * of the punctuation marks . , ! ? ; :
     * @param {string} text - Text to count
     * @returns {number} - Word count plus punctuation count
     */
    countAccurate(text) {
        const wordCount = text.split(/\s+/).filter(piece => piece.length > 0).length;
        const punctuation = text.match(/[.,!?;:]/g);
        const punctuationCount = punctuation ? punctuation.length : 0;
        return wordCount + punctuationCount;
    }
}