@yamo/memory-mesh 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/README.md +8 -2
  2. package/lib/llm/client.d.ts +23 -48
  3. package/lib/llm/client.js +1 -0
  4. package/lib/llm/client.ts +298 -377
  5. package/lib/llm/index.js +1 -0
  6. package/lib/llm/index.ts +1 -2
  7. package/lib/memory/adapters/client.d.ts +22 -85
  8. package/lib/memory/adapters/client.js +1 -0
  9. package/lib/memory/adapters/client.ts +474 -633
  10. package/lib/memory/adapters/config.d.ts +82 -89
  11. package/lib/memory/adapters/config.js +1 -0
  12. package/lib/memory/adapters/config.ts +156 -225
  13. package/lib/memory/adapters/errors.d.ts +28 -20
  14. package/lib/memory/adapters/errors.js +1 -0
  15. package/lib/memory/adapters/errors.ts +83 -120
  16. package/lib/memory/context-manager.d.ts +15 -18
  17. package/lib/memory/context-manager.js +1 -0
  18. package/lib/memory/context-manager.ts +314 -401
  19. package/lib/memory/embeddings/factory.d.ts +18 -20
  20. package/lib/memory/embeddings/factory.js +1 -0
  21. package/lib/memory/embeddings/factory.ts +130 -173
  22. package/lib/memory/embeddings/index.js +1 -0
  23. package/lib/memory/embeddings/index.ts +1 -0
  24. package/lib/memory/embeddings/service.d.ts +36 -66
  25. package/lib/memory/embeddings/service.js +1 -0
  26. package/lib/memory/embeddings/service.ts +479 -616
  27. package/lib/memory/index.d.ts +2 -2
  28. package/lib/memory/index.js +1 -0
  29. package/lib/memory/index.ts +3 -13
  30. package/lib/memory/memory-mesh.d.ts +151 -93
  31. package/lib/memory/memory-mesh.js +1 -0
  32. package/lib/memory/memory-mesh.ts +1406 -1692
  33. package/lib/memory/memory-translator.d.ts +1 -6
  34. package/lib/memory/memory-translator.js +1 -0
  35. package/lib/memory/memory-translator.ts +96 -128
  36. package/lib/memory/schema.d.ts +29 -10
  37. package/lib/memory/schema.js +1 -0
  38. package/lib/memory/schema.ts +102 -185
  39. package/lib/memory/scorer.d.ts +3 -4
  40. package/lib/memory/scorer.js +1 -0
  41. package/lib/memory/scorer.ts +69 -86
  42. package/lib/memory/search/index.js +1 -0
  43. package/lib/memory/search/index.ts +1 -0
  44. package/lib/memory/search/keyword-search.d.ts +10 -26
  45. package/lib/memory/search/keyword-search.js +1 -0
  46. package/lib/memory/search/keyword-search.ts +123 -161
  47. package/lib/scrubber/config/defaults.d.ts +39 -46
  48. package/lib/scrubber/config/defaults.js +1 -0
  49. package/lib/scrubber/config/defaults.ts +50 -112
  50. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  51. package/lib/scrubber/errors/scrubber-error.js +39 -0
  52. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  53. package/lib/scrubber/index.d.ts +0 -1
  54. package/lib/scrubber/index.js +1 -0
  55. package/lib/scrubber/index.ts +1 -2
  56. package/lib/scrubber/scrubber.d.ts +14 -31
  57. package/lib/scrubber/scrubber.js +1 -0
  58. package/lib/scrubber/scrubber.ts +93 -152
  59. package/lib/scrubber/stages/chunker.d.ts +22 -10
  60. package/lib/scrubber/stages/chunker.js +86 -0
  61. package/lib/scrubber/stages/chunker.ts +104 -0
  62. package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
  63. package/lib/scrubber/stages/metadata-annotator.js +64 -0
  64. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  65. package/lib/scrubber/stages/normalizer.d.ts +13 -10
  66. package/lib/scrubber/stages/normalizer.js +51 -0
  67. package/lib/scrubber/stages/normalizer.ts +60 -0
  68. package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
  69. package/lib/scrubber/stages/semantic-filter.js +51 -0
  70. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  71. package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
  72. package/lib/scrubber/stages/structural-cleaner.js +73 -0
  73. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  74. package/lib/scrubber/stages/validator.d.ts +14 -15
  75. package/lib/scrubber/stages/validator.js +56 -0
  76. package/lib/scrubber/stages/validator.ts +67 -0
  77. package/lib/scrubber/telemetry.d.ts +20 -27
  78. package/lib/scrubber/telemetry.js +1 -0
  79. package/lib/scrubber/telemetry.ts +53 -90
  80. package/lib/scrubber/utils/hash.d.ts +14 -0
  81. package/lib/scrubber/utils/hash.js +37 -0
  82. package/lib/scrubber/utils/hash.ts +40 -0
  83. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  84. package/lib/scrubber/utils/html-parser.js +38 -0
  85. package/lib/scrubber/utils/html-parser.ts +46 -0
  86. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  87. package/lib/scrubber/utils/pattern-matcher.js +54 -0
  88. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  89. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  90. package/lib/scrubber/utils/token-counter.js +30 -0
  91. package/lib/scrubber/utils/token-counter.ts +32 -0
  92. package/lib/utils/logger.d.ts +1 -11
  93. package/lib/utils/logger.js +1 -0
  94. package/lib/utils/logger.ts +43 -63
  95. package/lib/utils/skill-metadata.d.ts +6 -14
  96. package/lib/utils/skill-metadata.js +1 -0
  97. package/lib/utils/skill-metadata.ts +89 -103
  98. package/lib/yamo/emitter.d.ts +8 -35
  99. package/lib/yamo/emitter.js +1 -0
  100. package/lib/yamo/emitter.ts +77 -155
  101. package/lib/yamo/index.d.ts +14 -0
  102. package/lib/yamo/index.js +14 -0
  103. package/lib/yamo/index.ts +16 -0
  104. package/lib/yamo/schema.d.ts +8 -10
  105. package/lib/yamo/schema.js +1 -0
  106. package/lib/yamo/schema.ts +82 -114
  107. package/package.json +4 -2
@@ -0,0 +1,67 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 6: Validation
4
+ * @module smora/scrubber/stages/validator
5
+ */
6
+
7
+ import { TokenCounter } from '../utils/token-counter.js';
8
+ import { ValidationError } from '../errors/scrubber-error.js';
9
+
10
/** Settings read by {@link Validator}; every check is off unless its flag is truthy. */
interface ValidatorConfig {
  rejectEmptyChunks?: boolean;
  enforceMinLength?: boolean;
  minTokens?: number;
  enforceMaxLength?: boolean;
  hardMaxTokens?: number;
}

/** The only chunk fields the validator reads. */
interface ValidatableChunk {
  text?: unknown;
  index?: unknown;
}

/** Anything able to estimate the token count of a string. */
interface ChunkTokenCounter {
  count(text: string): number;
}

/** One rejected chunk together with the reasons it was rejected. */
interface ChunkRejection {
  chunkIndex: unknown;
  errors: string[];
}

/**
 * Stage 6 of the scrubber: drops chunks that are empty, too short or too long.
 */
export class Validator {
  config: ValidatorConfig;
  tokenCounter: ChunkTokenCounter;

  /**
   * Rejections recorded by the most recent `validate()` call.
   * Previously this list was built and then thrown away.
   */
  lastRejections: ChunkRejection[] = [];

  /**
   * @param config - Validation switches and token limits.
   * @param tokenCounter - Optional counter; defaults to a new `TokenCounter`.
   */
  constructor(
    config: ValidatorConfig,
    tokenCounter: ChunkTokenCounter = new TokenCounter(),
  ) {
    this.config = config;
    this.tokenCounter = tokenCounter;
  }

  /**
   * Validate chunks.
   *
   * @param chunks - Chunks to check; each is expected to carry a string `text`.
   * @returns The chunks that passed every enabled check, in input order.
   */
  async validate<T extends ValidatableChunk>(chunks: T[]): Promise<T[]> {
    const accepted: T[] = [];
    const rejections: ChunkRejection[] = [];

    for (const chunk of chunks) {
      const outcome = this._validateChunk(chunk);
      if (outcome.valid) {
        accepted.push(chunk);
      } else {
        rejections.push({ chunkIndex: chunk?.index, errors: outcome.errors });
      }
    }

    // Keep the reasons available to callers instead of silently dropping them.
    this.lastRejections = rejections;
    return accepted;
  }

  /**
   * Run every enabled check against a single chunk.
   *
   * @param chunk - Chunk to inspect.
   * @returns `valid` plus the list of failed checks (empty when valid).
   */
  _validateChunk(chunk: ValidatableChunk): { valid: boolean; errors: string[] } {
    const errors: string[] = [];
    const text = chunk?.text;

    // A chunk without string text used to throw a TypeError on `.trim()`;
    // report it as a rejection so one bad chunk cannot abort the whole batch.
    if (typeof text !== 'string') {
      errors.push('missing_text');
      return { valid: false, errors };
    }

    const {
      rejectEmptyChunks,
      enforceMinLength,
      minTokens,
      enforceMaxLength,
      hardMaxTokens,
    } = this.config;

    if (rejectEmptyChunks && text.trim() === '') {
      errors.push('empty_chunk');
    }

    if (enforceMinLength || enforceMaxLength) {
      // Count once and reuse the result for both limits.
      const tokens = this.tokenCounter.count(text);

      if (enforceMinLength && minTokens !== undefined && tokens < minTokens) {
        errors.push(`chunk_too_short: ${tokens} < ${minTokens}`);
      }
      if (enforceMaxLength && hardMaxTokens !== undefined && tokens > hardMaxTokens) {
        errors.push(`chunk_too_long: ${tokens} > ${hardMaxTokens}`);
      }
    }

    return { valid: errors.length === 0, errors };
  }
}
@@ -2,35 +2,28 @@
2
2
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
3
  * @module smora/scrubber/telemetry
4
4
  */
5
- export interface StageStats {
6
- count: number;
7
- totalTime: number;
8
- errors: number;
9
- }
10
- export interface StageSummary {
11
- count: number;
12
- avgTime: number;
13
- totalTime: number;
14
- errors: number;
15
- }
16
- export interface TelemetrySummary {
17
- stages: Record<string, StageStats>;
18
- performance: {
19
- structural: number;
20
- semantic: number;
21
- normalization: number;
22
- chunking: number;
23
- metadata: number;
24
- validation: number;
25
- total: number;
26
- };
27
- }
28
5
  export declare class ScrubberTelemetry {
29
- stats: Record<string, StageStats>;
6
+ stats: any;
30
7
  constructor();
31
- recordStage(stage: string, duration: number, success?: boolean): void;
32
- getStageStats(stage: string): StageSummary;
33
- getSummary(): TelemetrySummary;
8
+ recordStage(stage: any, duration: any, success?: boolean): void;
9
+ getStageStats(stage: any): {
10
+ count: any;
11
+ avgTime: number;
12
+ totalTime: any;
13
+ errors: any;
14
+ };
15
+ getSummary(): {
16
+ stages: any;
17
+ performance: {
18
+ structural: any;
19
+ semantic: any;
20
+ normalization: any;
21
+ chunking: any;
22
+ metadata: any;
23
+ validation: any;
24
+ total: unknown;
25
+ };
26
+ };
34
27
  reset(): void;
35
28
  assertPerformanceBudget(budget?: number): void;
36
29
  }
@@ -1,3 +1,4 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
4
  * @module smora/scrubber/telemetry
@@ -1,99 +1,62 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
4
  * @module smora/scrubber/telemetry
4
5
  */
5
-
6
- export interface StageStats {
7
- count: number;
8
- totalTime: number;
9
- errors: number;
10
- }
11
-
12
- export interface StageSummary {
13
- count: number;
14
- avgTime: number;
15
- totalTime: number;
16
- errors: number;
17
- }
18
-
19
- export interface TelemetrySummary {
20
- stages: Record<string, StageStats>;
21
- performance: {
22
- structural: number;
23
- semantic: number;
24
- normalization: number;
25
- chunking: number;
26
- metadata: number;
27
- validation: number;
28
- total: number;
29
- };
30
- }
31
-
32
6
  export class ScrubberTelemetry {
33
- stats: Record<string, StageStats>;
34
-
35
- constructor() {
36
- this.stats = {
37
- structural: { count: 0, totalTime: 0, errors: 0 },
38
- semantic: { count: 0, totalTime: 0, errors: 0 },
39
- normalization: { count: 0, totalTime: 0, errors: 0 },
40
- chunking: { count: 0, totalTime: 0, errors: 0 },
41
- metadata: { count: 0, totalTime: 0, errors: 0 },
42
- validation: { count: 0, totalTime: 0, errors: 0 },
43
- };
44
- }
45
-
46
- recordStage(stage: string, duration: number, success: boolean = true): void {
47
- if (!this.stats[stage]) {
48
- this.stats[stage] = { count: 0, totalTime: 0, errors: 0 };
7
+ stats;
8
+ constructor() {
9
+ this.stats = {
10
+ structural: { count: 0, totalTime: 0, errors: 0 },
11
+ semantic: { count: 0, totalTime: 0, errors: 0 },
12
+ normalization: { count: 0, totalTime: 0, errors: 0 },
13
+ chunking: { count: 0, totalTime: 0, errors: 0 },
14
+ metadata: { count: 0, totalTime: 0, errors: 0 },
15
+ validation: { count: 0, totalTime: 0, errors: 0 },
16
+ };
17
+ }
18
+ recordStage(stage, duration, success = true) {
19
+ if (!this.stats[stage]) {
20
+ this.stats[stage] = { count: 0, totalTime: 0, errors: 0 };
21
+ }
22
+ this.stats[stage].count++;
23
+ this.stats[stage].totalTime += duration;
24
+ if (!success) {
25
+ this.stats[stage].errors++;
26
+ }
27
+ }
28
+ getStageStats(stage) {
29
+ const stats = this.stats[stage] || { count: 0, totalTime: 0, errors: 0 };
30
+ return {
31
+ count: stats.count,
32
+ avgTime: stats.count > 0 ? stats.totalTime / stats.count : 0,
33
+ totalTime: stats.totalTime,
34
+ errors: stats.errors,
35
+ };
36
+ }
37
+ getSummary() {
38
+ return {
39
+ stages: this.stats,
40
+ performance: {
41
+ structural: this.stats.structural.totalTime,
42
+ semantic: this.stats.semantic.totalTime,
43
+ normalization: this.stats.normalization.totalTime,
44
+ chunking: this.stats.chunking.totalTime,
45
+ metadata: this.stats.metadata.totalTime,
46
+ validation: this.stats.validation.totalTime,
47
+ total: Object.values(this.stats).reduce((sum, s) => sum + s.totalTime, 0),
48
+ },
49
+ };
49
50
  }
50
- this.stats[stage].count++;
51
- this.stats[stage].totalTime += duration;
52
- if (!success) {
53
- this.stats[stage].errors++;
51
+ reset() {
52
+ Object.keys(this.stats).forEach((key) => {
53
+ this.stats[key] = { count: 0, totalTime: 0, errors: 0 };
54
+ });
54
55
  }
55
- }
56
-
57
- getStageStats(stage: string): StageSummary {
58
- const stats = this.stats[stage] || { count: 0, totalTime: 0, errors: 0 };
59
- return {
60
- count: stats.count,
61
- avgTime: stats.count > 0 ? stats.totalTime / stats.count : 0,
62
- totalTime: stats.totalTime,
63
- errors: stats.errors,
64
- };
65
- }
66
-
67
- getSummary(): TelemetrySummary {
68
- return {
69
- stages: this.stats,
70
- performance: {
71
- structural: this.stats.structural.totalTime,
72
- semantic: this.stats.semantic.totalTime,
73
- normalization: this.stats.normalization.totalTime,
74
- chunking: this.stats.chunking.totalTime,
75
- metadata: this.stats.metadata.totalTime,
76
- validation: this.stats.validation.totalTime,
77
- total: Object.values(this.stats).reduce(
78
- (sum, s) => sum + s.totalTime,
79
- 0,
80
- ),
81
- },
82
- };
83
- }
84
-
85
- reset(): void {
86
- Object.keys(this.stats).forEach((key) => {
87
- this.stats[key] = { count: 0, totalTime: 0, errors: 0 };
88
- });
89
- }
90
-
91
- assertPerformanceBudget(budget: number = 10): void {
92
- const summary = this.getSummary();
93
- if (summary.performance.total > budget) {
94
- throw new Error(
95
- `Performance budget exceeded: ${summary.performance.total}ms > ${budget}ms`,
96
- );
56
+ assertPerformanceBudget(budget = 10) {
57
+ const summary = this.getSummary();
58
+ if (summary.performance.total > budget) {
59
+ throw new Error(`Performance budget exceeded: ${summary.performance.total}ms > ${budget}ms`);
60
+ }
97
61
  }
98
- }
99
62
  }
@@ -0,0 +1,14 @@
1
export declare class HashUtil {
    /**
     * Hash content for deduplication.
     *
     * The content is lowercased, trimmed and has whitespace runs collapsed to
     * a single space before hashing.
     *
     * @param content - Content to hash
     * @returns SHA-256 digest as a hex string
     */
    hash(content: string): string;
    /**
     * Fast hash for caching (non-cryptographic).
     *
     * @param content - Content to hash
     * @returns Base-36 rendering of a 32-bit rolling hash
     */
    fastHash(content: string): string;
}
@@ -0,0 +1,37 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Content Hashing Utilities
4
+ * @module smora/scrubber/utils/hash
5
+ */
6
+ import crypto from 'crypto';
7
export class HashUtil {
    /**
     * Build a deduplication key for a piece of content.
     *
     * The input is lowercased, trimmed and every whitespace run is collapsed
     * to one space before the digest is taken.
     *
     * @param {string} content - Content to hash
     * @returns {string} - SHA256 digest, hex encoded
     */
    hash(content) {
        const collapsed = content.toLowerCase().trim().replace(/\s+/g, ' ');
        const hasher = crypto.createHash('sha256');
        hasher.update(collapsed);
        return hasher.digest('hex');
    }
    /**
     * Cheap non-cryptographic hash intended for cache keys.
     *
     * @param {string} content - Content to hash
     * @returns {string} - Base-36 string of the 32-bit accumulator
     */
    fastHash(content) {
        let acc = 0;
        let pos = 0;
        while (pos < content.length) {
            // acc * 31 + code unit, written with a shift
            acc = (acc << 5) - acc + content.charCodeAt(pos);
            // bitwise AND with itself truncates to a signed 32-bit integer
            acc &= acc;
            pos += 1;
        }
        return acc.toString(36);
    }
}
@@ -0,0 +1,40 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Content Hashing Utilities
4
+ * @module smora/scrubber/utils/hash
5
+ */
6
+ import crypto from 'crypto';
7
+
8
export class HashUtil {
  /**
   * Hash content for deduplication.
   *
   * The content is lowercased, trimmed and has whitespace runs collapsed to a
   * single space first, so inputs differing only in case or spacing share a
   * hash.
   *
   * @param content - Content to hash
   * @returns SHA-256 digest as a hex string
   */
  hash(content: string): string {
    const normalized = content.toLowerCase().trim().replace(/\s+/g, ' ');

    return crypto.createHash('sha256').update(normalized).digest('hex');
  }

  /**
   * Fast hash for caching (non-cryptographic).
   *
   * @param content - Content to hash
   * @returns Base-36 rendering of a signed 32-bit rolling hash
   */
  fastHash(content: string): string {
    let hash = 0;
    for (let i = 0; i < content.length; i++) {
      // hash * 31 + code unit; `| 0` wraps the result to a signed 32-bit integer.
      hash = ((hash << 5) - hash + content.charCodeAt(i)) | 0;
    }
    return hash.toString(36);
  }
}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * HTML Parsing Utilities
3
+ * @module smora/scrubber/utils/html-parser
4
+ */
5
export declare class HTMLParser {
    /**
     * Extract text content from HTML.
     *
     * @param html - HTML content
     * @returns Extracted text
     */
    parse(html: string): string;
    /** Strips scripts, styles and comments, converts headings, paragraphs and list items, then removes remaining tags. */
    _extractText(html: string): string;
    /** Removes every `<...>` tag from the given fragment. */
    _stripTags(html: string): string;
}
@@ -0,0 +1,38 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * HTML Parsing Utilities
4
+ * @module smora/scrubber/utils/html-parser
5
+ */
6
export class HTMLParser {
    /**
     * Extract text content from HTML
     * @param {string} html - HTML content
     * @returns {string} - Extracted text ('' for null/undefined input)
     */
    parse(html) {
        return this._extractText(html);
    }
    /**
     * Strip scripts, styles and comments, turn headings, paragraphs and list
     * items into markdown-like text, then drop every remaining tag.
     * @param {string} html - HTML content
     * @returns {string} - Extracted text
     */
    _extractText(html) {
        if (html == null) {
            return '';
        }
        let text = String(html);
        // Remove scripts, styles, and comments
        text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
        text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
        text = text.replace(/<!--[\s\S]*?-->/g, '');
        // Convert headings to markdown.
        // [\s\S] instead of `.` so content spanning several lines still matches.
        text = text.replace(/<h([1-6])\b([^>]*)>([\s\S]*?)<\/h\1>/gi, (_match, level, _attrs, content) => {
            const hashes = '#'.repeat(Number(level));
            return `${hashes} ${this._stripTags(content)}\n\n`;
        });
        // Convert paragraphs; \b keeps <pre>, <picture>, ... from matching as <p>.
        text = text.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n');
        // Convert lists; \b keeps <link> from matching as <li>.
        text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');
        // Remove remaining tags
        text = text.replace(/<[^>]+>/g, '');
        return text;
    }
    /**
     * Remove every tag from an HTML fragment.
     * @param {string} html - HTML fragment
     * @returns {string} - Fragment without tags
     */
    _stripTags(html) {
        return html == null ? '' : String(html).replace(/<[^>]+>/g, '');
    }
}
@@ -0,0 +1,46 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * HTML Parsing Utilities
4
+ * @module smora/scrubber/utils/html-parser
5
+ */
6
+
7
export class HTMLParser {
  /**
   * Extract text content from HTML.
   *
   * @param html - HTML content
   * @returns Extracted text (`''` for null/undefined input)
   */
  parse(html: string | null | undefined): string {
    return this._extractText(html);
  }

  /**
   * Strip scripts, styles and comments, turn headings, paragraphs and list
   * items into markdown-like text, then drop every remaining tag.
   *
   * @param html - HTML content
   * @returns Extracted text
   */
  _extractText(html: string | null | undefined): string {
    if (html == null) {
      return '';
    }

    // Remove scripts, styles, and comments
    let text = String(html);
    text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
    text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
    text = text.replace(/<!--[\s\S]*?-->/g, '');

    // Convert headings to markdown.
    // [\s\S] instead of `.` so content spanning several lines still matches.
    text = text.replace(
      /<h([1-6])\b([^>]*)>([\s\S]*?)<\/h\1>/gi,
      (_match: string, level: string, _attrs: string, content: string) => {
        const hashes = '#'.repeat(Number(level));
        return `${hashes} ${this._stripTags(content)}\n\n`;
      },
    );

    // Convert paragraphs; \b keeps <pre>, <picture>, ... from matching as <p>.
    text = text.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '$1\n\n');

    // Convert lists; \b keeps <link> from matching as <li>.
    text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n');

    // Remove remaining tags
    text = text.replace(/<[^>]+>/g, '');

    return text;
  }

  /**
   * Remove every tag from an HTML fragment.
   *
   * @param html - HTML fragment
   * @returns Fragment without tags
   */
  _stripTags(html: string | null | undefined): string {
    return html == null ? '' : String(html).replace(/<[^>]+>/g, '');
  }
}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Boilerplate Pattern Matching Utilities
3
+ * @module smora/scrubber/utils/pattern-matcher
4
+ */
5
export declare class PatternMatcher {
    /** Active patterns; seeded from `_loadDefaultPatterns()` by the constructor. */
    boilerplatePatterns: (string | RegExp)[];
    constructor();
    /** Returns a fresh array holding the built-in boilerplate patterns. */
    _loadDefaultPatterns(): (string | RegExp)[];
    /** Returns the matcher's own pattern array (not a copy). */
    getBoilerplatePatterns(): (string | RegExp)[];
    /** Appends a pattern to the active list. */
    addPattern(pattern: string | RegExp): void;
    /** Removes the pattern at `index`; out-of-range indexes are ignored. */
    removePattern(index: number): void;
    /** Reports whether any active pattern matches the lowercased, trimmed text. */
    isBoilerplate(text: string): boolean;
}
@@ -0,0 +1,54 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Boilerplate Pattern Matching Utilities
4
+ * @module smora/scrubber/utils/pattern-matcher
5
+ */
6
export class PatternMatcher {
    constructor() {
        this.boilerplatePatterns = this._loadDefaultPatterns();
    }
    /**
     * Build the default pattern list.
     * @returns {Array<string|RegExp>} - New array of built-in patterns
     */
    _loadDefaultPatterns() {
        return [
            // Legal/Footer
            /©\s*\d{4}/i,
            /all rights reserved/i,
            /copyright\s+\d{4}/i,
            // Navigation
            /^home\s*\|/i,
            // NOTE(review): `^` binds only to the first alternative, so
            // "menu:" matches anywhere in the text — confirm this is intended.
            /^navigation\s*:|menu\s*:/i,
            /sidebar/i,
            // Meta
            /^last\s+updated?\s*:/i,
            /cookie\s+policy/i,
            /privacy\s+policy/i,
            // Auto-generated
            /^table\s+of\s+contents?$/i,
            /^contents\s*$/i,
            /jump\s+to\s+(section|navigation)/i,
            // Strings
            'home | docs | contact',
            'skip to main content',
            'this site uses cookies'
        ];
    }
    /**
     * @returns {Array<string|RegExp>} - The matcher's own pattern array (not a copy)
     */
    getBoilerplatePatterns() {
        return this.boilerplatePatterns;
    }
    /**
     * Append a pattern to the active list.
     * @param {string|RegExp} pattern - Substring or regular expression
     */
    addPattern(pattern) {
        this.boilerplatePatterns.push(pattern);
    }
    /**
     * Remove the pattern at the given position; out-of-range indexes are ignored.
     * @param {number} index - Position in the pattern list
     */
    removePattern(index) {
        if (index >= 0 && index < this.boilerplatePatterns.length) {
            this.boilerplatePatterns.splice(index, 1);
        }
    }
    /**
     * Check text against every active pattern.
     * @param {string} text - Text to classify
     * @returns {boolean} - true when any pattern matches; false for non-string input
     */
    isBoilerplate(text) {
        if (typeof text !== 'string') {
            return false;
        }
        const lowerText = text.toLowerCase().trim();
        return this.boilerplatePatterns.some((pattern) => {
            if (pattern instanceof RegExp) {
                // A `g`/`y` regex remembers lastIndex between calls, which made
                // repeated checks of the same text alternate between true and false.
                pattern.lastIndex = 0;
                return pattern.test(lowerText);
            }
            // The text is lowercased, so the pattern must be too or a
            // user-supplied pattern containing capitals can never match.
            return lowerText.includes(String(pattern).toLowerCase());
        });
    }
}
@@ -0,0 +1,64 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Boilerplate Pattern Matching Utilities
4
+ * @module smora/scrubber/utils/pattern-matcher
5
+ */
6
+
7
/** A boilerplate pattern: a substring or a regular expression. */
type BoilerplatePattern = string | RegExp;

export class PatternMatcher {
  /** Active patterns; seeded with the defaults by the constructor. */
  boilerplatePatterns: BoilerplatePattern[];

  constructor() {
    this.boilerplatePatterns = this._loadDefaultPatterns();
  }

  /**
   * Build the default pattern list.
   *
   * @returns New array of built-in patterns
   */
  _loadDefaultPatterns(): BoilerplatePattern[] {
    return [
      // Legal/Footer
      /©\s*\d{4}/i,
      /all rights reserved/i,
      /copyright\s+\d{4}/i,

      // Navigation
      /^home\s*\|/i,
      // NOTE(review): `^` binds only to the first alternative, so "menu:"
      // matches anywhere in the text — confirm this is intended.
      /^navigation\s*:|menu\s*:/i,
      /sidebar/i,

      // Meta
      /^last\s+updated?\s*:/i,
      /cookie\s+policy/i,
      /privacy\s+policy/i,

      // Auto-generated
      /^table\s+of\s+contents?$/i,
      /^contents\s*$/i,
      /jump\s+to\s+(section|navigation)/i,

      // Strings
      'home | docs | contact',
      'skip to main content',
      'this site uses cookies',
    ];
  }

  /**
   * @returns The matcher's own pattern array (not a copy)
   */
  getBoilerplatePatterns(): BoilerplatePattern[] {
    return this.boilerplatePatterns;
  }

  /**
   * Append a pattern to the active list.
   *
   * @param pattern - Substring or regular expression
   */
  addPattern(pattern: BoilerplatePattern): void {
    this.boilerplatePatterns.push(pattern);
  }

  /**
   * Remove the pattern at the given position; out-of-range indexes are ignored.
   *
   * @param index - Position in the pattern list
   */
  removePattern(index: number): void {
    if (index >= 0 && index < this.boilerplatePatterns.length) {
      this.boilerplatePatterns.splice(index, 1);
    }
  }

  /**
   * Check text against every active pattern.
   *
   * @param text - Text to classify
   * @returns `true` when any pattern matches; `false` for non-string input
   */
  isBoilerplate(text: string): boolean {
    if (typeof text !== 'string') {
      return false;
    }

    const lowerText = text.toLowerCase().trim();
    return this.boilerplatePatterns.some((pattern) => {
      if (pattern instanceof RegExp) {
        // A `g`/`y` regex remembers lastIndex between calls, which made
        // repeated checks of the same text alternate between true and false.
        pattern.lastIndex = 0;
        return pattern.test(lowerText);
      }
      // The text is lowercased, so the pattern must be too or a user-supplied
      // pattern containing capitals can never match.
      return lowerText.includes(String(pattern).toLowerCase());
    });
  }
}
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Token Counting Utilities
3
+ * @module smora/scrubber/utils/token-counter
4
+ */
5
export declare class TokenCounter {
    /**
     * Estimate token count (approximation: about four characters per token).
     *
     * @param text - Text to count
     * @returns Estimated token count
     */
    count(text: string): number;
    /**
     * More accurate token count (slower): whitespace-separated words plus
     * punctuation marks.
     *
     * @param text - Text to count
     * @returns More accurate token count
     */
    countAccurate(text: string): number;
}
@@ -0,0 +1,30 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Token Counting Utilities
4
+ * @module smora/scrubber/utils/token-counter
5
+ */
6
export class TokenCounter {
    /**
     * Estimate token count (approximation)
     * @param {string} text - Text to count
     * @returns {number} - Estimated token count (0 for null/undefined)
     */
    count(text) {
        // Missing text has no tokens; previously this threw a TypeError.
        if (text == null) {
            return 0;
        }
        // Simple approximation: ~4 characters per token
        return Math.ceil(text.length / 4);
    }
    /**
     * More accurate token count (slower)
     * @param {string} text - Text to count
     * @returns {number} - Words plus punctuation marks (0 for null/undefined)
     */
    countAccurate(text) {
        if (text == null) {
            return 0;
        }
        const words = text.split(/\s+/).filter((w) => w.length > 0);
        // Each of . , ! ? ; : is counted as one extra token.
        const punctuationMatches = text.match(/[.,!?;:]/g);
        return words.length + (punctuationMatches ? punctuationMatches.length : 0);
    }
}