@yamo/memory-mesh 2.3.2 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/README.md +8 -2
  2. package/bin/memory_mesh.js +1 -1
  3. package/lib/llm/client.d.ts +86 -0
  4. package/lib/llm/client.js +300 -357
  5. package/lib/llm/client.ts +334 -0
  6. package/lib/llm/index.d.ts +17 -0
  7. package/lib/llm/index.js +16 -8
  8. package/lib/llm/index.ts +18 -0
  9. package/lib/memory/adapters/client.d.ts +120 -0
  10. package/lib/memory/adapters/client.js +519 -0
  11. package/lib/memory/adapters/client.ts +519 -0
  12. package/lib/memory/adapters/config.d.ts +130 -0
  13. package/lib/memory/adapters/config.js +190 -0
  14. package/lib/memory/adapters/config.ts +190 -0
  15. package/lib/memory/adapters/errors.d.ts +84 -0
  16. package/lib/memory/adapters/errors.js +129 -0
  17. package/lib/memory/adapters/errors.ts +129 -0
  18. package/lib/memory/context-manager.d.ts +41 -0
  19. package/lib/memory/context-manager.js +345 -0
  20. package/lib/memory/context-manager.ts +345 -0
  21. package/lib/memory/embeddings/factory.d.ts +57 -0
  22. package/lib/memory/embeddings/factory.js +149 -0
  23. package/lib/memory/embeddings/factory.ts +149 -0
  24. package/lib/memory/embeddings/index.d.ts +2 -0
  25. package/lib/memory/embeddings/index.js +3 -0
  26. package/lib/memory/embeddings/index.ts +3 -0
  27. package/lib/memory/embeddings/service.d.ts +134 -0
  28. package/lib/memory/embeddings/service.js +516 -0
  29. package/lib/memory/embeddings/service.ts +516 -0
  30. package/lib/memory/index.d.ts +9 -0
  31. package/lib/memory/index.js +10 -1
  32. package/lib/memory/index.ts +10 -0
  33. package/lib/memory/memory-mesh.d.ts +332 -0
  34. package/lib/memory/memory-mesh.js +1470 -678
  35. package/lib/memory/memory-mesh.ts +1517 -0
  36. package/lib/memory/memory-translator.d.ts +14 -0
  37. package/lib/memory/memory-translator.js +126 -0
  38. package/lib/memory/memory-translator.ts +126 -0
  39. package/lib/memory/schema.d.ts +130 -0
  40. package/lib/memory/schema.js +184 -0
  41. package/lib/memory/schema.ts +184 -0
  42. package/lib/memory/scorer.d.ts +25 -0
  43. package/lib/memory/scorer.js +78 -0
  44. package/lib/memory/scorer.ts +78 -0
  45. package/lib/memory/search/index.d.ts +1 -0
  46. package/lib/memory/search/index.js +2 -0
  47. package/lib/memory/search/index.ts +2 -0
  48. package/lib/memory/search/keyword-search.d.ts +46 -0
  49. package/lib/memory/search/keyword-search.js +136 -0
  50. package/lib/memory/search/keyword-search.ts +136 -0
  51. package/lib/scrubber/config/defaults.d.ts +46 -0
  52. package/lib/scrubber/config/defaults.js +50 -57
  53. package/lib/scrubber/config/defaults.ts +55 -0
  54. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  55. package/lib/scrubber/errors/scrubber-error.js +28 -32
  56. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  57. package/lib/scrubber/index.d.ts +5 -0
  58. package/lib/scrubber/index.js +4 -23
  59. package/lib/scrubber/index.ts +6 -0
  60. package/lib/scrubber/scrubber.d.ts +44 -0
  61. package/lib/scrubber/scrubber.js +100 -121
  62. package/lib/scrubber/scrubber.ts +109 -0
  63. package/lib/scrubber/stages/chunker.d.ts +25 -0
  64. package/lib/scrubber/stages/chunker.js +74 -91
  65. package/lib/scrubber/stages/chunker.ts +104 -0
  66. package/lib/scrubber/stages/metadata-annotator.d.ts +17 -0
  67. package/lib/scrubber/stages/metadata-annotator.js +55 -65
  68. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  69. package/lib/scrubber/stages/normalizer.d.ts +16 -0
  70. package/lib/scrubber/stages/normalizer.js +42 -50
  71. package/lib/scrubber/stages/normalizer.ts +60 -0
  72. package/lib/scrubber/stages/semantic-filter.d.ts +16 -0
  73. package/lib/scrubber/stages/semantic-filter.js +42 -52
  74. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  75. package/lib/scrubber/stages/structural-cleaner.d.ts +18 -0
  76. package/lib/scrubber/stages/structural-cleaner.js +66 -75
  77. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  78. package/lib/scrubber/stages/validator.d.ts +17 -0
  79. package/lib/scrubber/stages/validator.js +46 -56
  80. package/lib/scrubber/stages/validator.ts +67 -0
  81. package/lib/scrubber/telemetry.d.ts +29 -0
  82. package/lib/scrubber/telemetry.js +54 -58
  83. package/lib/scrubber/telemetry.ts +62 -0
  84. package/lib/scrubber/utils/hash.d.ts +14 -0
  85. package/lib/scrubber/utils/hash.js +30 -32
  86. package/lib/scrubber/utils/hash.ts +40 -0
  87. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  88. package/lib/scrubber/utils/html-parser.js +32 -39
  89. package/lib/scrubber/utils/html-parser.ts +46 -0
  90. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  91. package/lib/scrubber/utils/pattern-matcher.js +48 -57
  92. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  93. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  94. package/lib/scrubber/utils/token-counter.js +24 -25
  95. package/lib/scrubber/utils/token-counter.ts +32 -0
  96. package/lib/utils/logger.d.ts +19 -0
  97. package/lib/utils/logger.js +65 -0
  98. package/lib/utils/logger.ts +65 -0
  99. package/lib/utils/skill-metadata.d.ts +24 -0
  100. package/lib/utils/skill-metadata.js +133 -0
  101. package/lib/utils/skill-metadata.ts +133 -0
  102. package/lib/yamo/emitter.d.ts +46 -0
  103. package/lib/yamo/emitter.js +79 -143
  104. package/lib/yamo/emitter.ts +171 -0
  105. package/lib/yamo/index.d.ts +14 -0
  106. package/lib/yamo/index.js +6 -7
  107. package/lib/yamo/index.ts +16 -0
  108. package/lib/yamo/schema.d.ts +56 -0
  109. package/lib/yamo/schema.js +82 -108
  110. package/lib/yamo/schema.ts +133 -0
  111. package/package.json +13 -8
  112. package/index.d.ts +0 -111
  113. package/lib/embeddings/factory.js +0 -151
  114. package/lib/embeddings/index.js +0 -2
  115. package/lib/embeddings/service.js +0 -586
  116. package/lib/index.js +0 -6
  117. package/lib/lancedb/client.js +0 -633
  118. package/lib/lancedb/config.js +0 -215
  119. package/lib/lancedb/errors.js +0 -144
  120. package/lib/lancedb/index.js +0 -4
  121. package/lib/lancedb/schema.js +0 -217
  122. package/lib/search/index.js +0 -1
  123. package/lib/search/keyword-search.js +0 -144
  124. package/lib/utils/index.js +0 -1
@@ -1,66 +1,56 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber - Stage 6: Validation
3
4
  * @module smora/scrubber/stages/validator
4
5
  */
5
-
6
6
  import { TokenCounter } from '../utils/token-counter.js';
7
- import { ValidationError } from '../errors/scrubber-error.js';
8
-
9
7
  export class Validator {
10
- constructor(config) {
11
- this.config = config;
12
- this.tokenCounter = new TokenCounter();
13
- }
14
-
15
- /**
16
- * Validate chunks
17
- * @param {Array} chunks - Array of chunks
18
- * @returns {Promise<Array>} - Validated chunks
19
- */
20
- async validate(chunks) {
21
- const valid = [];
22
- const errors = [];
23
-
24
- for (const chunk of chunks) {
25
- const validation = this._validateChunk(chunk);
26
-
27
- if (validation.valid) {
28
- valid.push(chunk);
29
- } else {
30
- errors.push({
31
- chunkIndex: chunk.index,
32
- errors: validation.errors
33
- });
34
- }
8
+ constructor(config) {
9
+ this.config = config;
10
+ this.tokenCounter = new TokenCounter();
35
11
  }
36
-
37
- return valid;
38
- }
39
-
40
- _validateChunk(chunk) {
41
- const errors = [];
42
-
43
- if (this.config.rejectEmptyChunks && !chunk.text.trim()) {
44
- errors.push('empty_chunk');
12
+ /**
13
+ * Validate chunks
14
+ * @param {Array} chunks - Array of chunks
15
+ * @returns {Promise<Array>} - Validated chunks
16
+ */
17
+ async validate(chunks) {
18
+ const valid = [];
19
+ const errors = [];
20
+ for (const chunk of chunks) {
21
+ const validation = this._validateChunk(chunk);
22
+ if (validation.valid) {
23
+ valid.push(chunk);
24
+ }
25
+ else {
26
+ errors.push({
27
+ chunkIndex: chunk.index,
28
+ errors: validation.errors
29
+ });
30
+ }
31
+ }
32
+ return valid;
45
33
  }
46
-
47
- if (this.config.enforceMinLength) {
48
- const tokens = this.tokenCounter.count(chunk.text);
49
- if (tokens < this.config.minTokens) {
50
- errors.push(`chunk_too_short: ${tokens} < ${this.config.minTokens}`);
51
- }
34
+ _validateChunk(chunk) {
35
+ const errors = [];
36
+ if (this.config.rejectEmptyChunks && !chunk.text.trim()) {
37
+ errors.push('empty_chunk');
38
+ }
39
+ if (this.config.enforceMinLength) {
40
+ const tokens = this.tokenCounter.count(chunk.text);
41
+ if (tokens < this.config.minTokens) {
42
+ errors.push(`chunk_too_short: ${tokens} < ${this.config.minTokens}`);
43
+ }
44
+ }
45
+ if (this.config.enforceMaxLength) {
46
+ const tokens = this.tokenCounter.count(chunk.text);
47
+ if (tokens > this.config.hardMaxTokens) {
48
+ errors.push(`chunk_too_long: ${tokens} > ${this.config.hardMaxTokens}`);
49
+ }
50
+ }
51
+ return {
52
+ valid: errors.length === 0,
53
+ errors
54
+ };
52
55
  }
53
-
54
- if (this.config.enforceMaxLength) {
55
- const tokens = this.tokenCounter.count(chunk.text);
56
- if (tokens > this.config.hardMaxTokens) {
57
- errors.push(`chunk_too_long: ${tokens} > ${this.config.hardMaxTokens}`);
58
- }
59
- }
60
-
61
- return {
62
- valid: errors.length === 0,
63
- errors
64
- };
65
- }
66
56
  }
@@ -0,0 +1,67 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 6: Validation
4
+ * @module smora/scrubber/stages/validator
5
+ */
6
+
7
+ import { TokenCounter } from '../utils/token-counter.js';
8
+ import { ValidationError } from '../errors/scrubber-error.js';
9
+
10
+ export class Validator {
11
+ constructor(config) {
12
+ this.config = config;
13
+ this.tokenCounter = new TokenCounter();
14
+ }
15
+
16
+ /**
17
+ * Validate chunks
18
+ * @param {Array} chunks - Array of chunks
19
+ * @returns {Promise<Array>} - Validated chunks
20
+ */
21
+ async validate(chunks) {
22
+ const valid = [];
23
+ const errors = [];
24
+
25
+ for (const chunk of chunks) {
26
+ const validation = this._validateChunk(chunk);
27
+
28
+ if (validation.valid) {
29
+ valid.push(chunk);
30
+ } else {
31
+ errors.push({
32
+ chunkIndex: chunk.index,
33
+ errors: validation.errors
34
+ });
35
+ }
36
+ }
37
+
38
+ return valid;
39
+ }
40
+
41
+ _validateChunk(chunk) {
42
+ const errors = [];
43
+
44
+ if (this.config.rejectEmptyChunks && !chunk.text.trim()) {
45
+ errors.push('empty_chunk');
46
+ }
47
+
48
+ if (this.config.enforceMinLength) {
49
+ const tokens = this.tokenCounter.count(chunk.text);
50
+ if (tokens < this.config.minTokens) {
51
+ errors.push(`chunk_too_short: ${tokens} < ${this.config.minTokens}`);
52
+ }
53
+ }
54
+
55
+ if (this.config.enforceMaxLength) {
56
+ const tokens = this.tokenCounter.count(chunk.text);
57
+ if (tokens > this.config.hardMaxTokens) {
58
+ errors.push(`chunk_too_long: ${tokens} > ${this.config.hardMaxTokens}`);
59
+ }
60
+ }
61
+
62
+ return {
63
+ valid: errors.length === 0,
64
+ errors
65
+ };
66
+ }
67
+ }
@@ -0,0 +1,29 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber Telemetry Collection
3
+ * @module smora/scrubber/telemetry
4
+ */
5
+ export declare class ScrubberTelemetry {
6
+ stats: any;
7
+ constructor();
8
+ recordStage(stage: any, duration: any, success?: boolean): void;
9
+ getStageStats(stage: any): {
10
+ count: any;
11
+ avgTime: number;
12
+ totalTime: any;
13
+ errors: any;
14
+ };
15
+ getSummary(): {
16
+ stages: any;
17
+ performance: {
18
+ structural: any;
19
+ semantic: any;
20
+ normalization: any;
21
+ chunking: any;
22
+ metadata: any;
23
+ validation: any;
24
+ total: unknown;
25
+ };
26
+ };
27
+ reset(): void;
28
+ assertPerformanceBudget(budget?: number): void;
29
+ }
@@ -1,66 +1,62 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Telemetry Collection
3
4
  * @module smora/scrubber/telemetry
4
5
  */
5
-
6
6
  export class ScrubberTelemetry {
7
- constructor() {
8
- this.stats = {
9
- structural: { count: 0, totalTime: 0, errors: 0 },
10
- semantic: { count: 0, totalTime: 0, errors: 0 },
11
- normalization: { count: 0, totalTime: 0, errors: 0 },
12
- chunking: { count: 0, totalTime: 0, errors: 0 },
13
- metadata: { count: 0, totalTime: 0, errors: 0 },
14
- validation: { count: 0, totalTime: 0, errors: 0 }
15
- };
16
- }
17
-
18
- recordStage(stage, duration, success = true) {
19
- if (!this.stats[stage]) {
20
- this.stats[stage] = { count: 0, totalTime: 0, errors: 0 };
7
+ stats;
8
+ constructor() {
9
+ this.stats = {
10
+ structural: { count: 0, totalTime: 0, errors: 0 },
11
+ semantic: { count: 0, totalTime: 0, errors: 0 },
12
+ normalization: { count: 0, totalTime: 0, errors: 0 },
13
+ chunking: { count: 0, totalTime: 0, errors: 0 },
14
+ metadata: { count: 0, totalTime: 0, errors: 0 },
15
+ validation: { count: 0, totalTime: 0, errors: 0 },
16
+ };
21
17
  }
22
- this.stats[stage].count++;
23
- this.stats[stage].totalTime += duration;
24
- if (!success) this.stats[stage].errors++;
25
- }
26
-
27
- getStageStats(stage) {
28
- const stats = this.stats[stage];
29
- return {
30
- count: stats.count,
31
- avgTime: stats.count > 0 ? stats.totalTime / stats.count : 0,
32
- totalTime: stats.totalTime,
33
- errors: stats.errors
34
- };
35
- }
36
-
37
- getSummary() {
38
- return {
39
- stages: this.stats,
40
- performance: {
41
- structural: this.stats.structural.totalTime,
42
- semantic: this.stats.semantic.totalTime,
43
- normalization: this.stats.normalization.totalTime,
44
- chunking: this.stats.chunking.totalTime,
45
- metadata: this.stats.metadata.totalTime,
46
- validation: this.stats.validation.totalTime,
47
- total: Object.values(this.stats).reduce((sum, s) => sum + s.totalTime, 0)
48
- }
49
- };
50
- }
51
-
52
- reset() {
53
- Object.keys(this.stats).forEach(key => {
54
- this.stats[key] = { count: 0, totalTime: 0, errors: 0 };
55
- });
56
- }
57
-
58
- assertPerformanceBudget(budget = 10) {
59
- const summary = this.getSummary();
60
- if (summary.performance.total > budget) {
61
- throw new Error(
62
- `Performance budget exceeded: ${summary.performance.total}ms > ${budget}ms`
63
- );
18
+ recordStage(stage, duration, success = true) {
19
+ if (!this.stats[stage]) {
20
+ this.stats[stage] = { count: 0, totalTime: 0, errors: 0 };
21
+ }
22
+ this.stats[stage].count++;
23
+ this.stats[stage].totalTime += duration;
24
+ if (!success) {
25
+ this.stats[stage].errors++;
26
+ }
27
+ }
28
+ getStageStats(stage) {
29
+ const stats = this.stats[stage] || { count: 0, totalTime: 0, errors: 0 };
30
+ return {
31
+ count: stats.count,
32
+ avgTime: stats.count > 0 ? stats.totalTime / stats.count : 0,
33
+ totalTime: stats.totalTime,
34
+ errors: stats.errors,
35
+ };
36
+ }
37
+ getSummary() {
38
+ return {
39
+ stages: this.stats,
40
+ performance: {
41
+ structural: this.stats.structural.totalTime,
42
+ semantic: this.stats.semantic.totalTime,
43
+ normalization: this.stats.normalization.totalTime,
44
+ chunking: this.stats.chunking.totalTime,
45
+ metadata: this.stats.metadata.totalTime,
46
+ validation: this.stats.validation.totalTime,
47
+ total: Object.values(this.stats).reduce((sum, s) => sum + s.totalTime, 0),
48
+ },
49
+ };
50
+ }
51
+ reset() {
52
+ Object.keys(this.stats).forEach((key) => {
53
+ this.stats[key] = { count: 0, totalTime: 0, errors: 0 };
54
+ });
55
+ }
56
+ assertPerformanceBudget(budget = 10) {
57
+ const summary = this.getSummary();
58
+ if (summary.performance.total > budget) {
59
+ throw new Error(`Performance budget exceeded: ${summary.performance.total}ms > ${budget}ms`);
60
+ }
64
61
  }
65
- }
66
62
  }
@@ -0,0 +1,62 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber Telemetry Collection
4
+ * @module smora/scrubber/telemetry
5
+ */
6
+ export class ScrubberTelemetry {
7
+ stats;
8
+ constructor() {
9
+ this.stats = {
10
+ structural: { count: 0, totalTime: 0, errors: 0 },
11
+ semantic: { count: 0, totalTime: 0, errors: 0 },
12
+ normalization: { count: 0, totalTime: 0, errors: 0 },
13
+ chunking: { count: 0, totalTime: 0, errors: 0 },
14
+ metadata: { count: 0, totalTime: 0, errors: 0 },
15
+ validation: { count: 0, totalTime: 0, errors: 0 },
16
+ };
17
+ }
18
+ recordStage(stage, duration, success = true) {
19
+ if (!this.stats[stage]) {
20
+ this.stats[stage] = { count: 0, totalTime: 0, errors: 0 };
21
+ }
22
+ this.stats[stage].count++;
23
+ this.stats[stage].totalTime += duration;
24
+ if (!success) {
25
+ this.stats[stage].errors++;
26
+ }
27
+ }
28
+ getStageStats(stage) {
29
+ const stats = this.stats[stage] || { count: 0, totalTime: 0, errors: 0 };
30
+ return {
31
+ count: stats.count,
32
+ avgTime: stats.count > 0 ? stats.totalTime / stats.count : 0,
33
+ totalTime: stats.totalTime,
34
+ errors: stats.errors,
35
+ };
36
+ }
37
+ getSummary() {
38
+ return {
39
+ stages: this.stats,
40
+ performance: {
41
+ structural: this.stats.structural.totalTime,
42
+ semantic: this.stats.semantic.totalTime,
43
+ normalization: this.stats.normalization.totalTime,
44
+ chunking: this.stats.chunking.totalTime,
45
+ metadata: this.stats.metadata.totalTime,
46
+ validation: this.stats.validation.totalTime,
47
+ total: Object.values(this.stats).reduce((sum, s) => sum + s.totalTime, 0),
48
+ },
49
+ };
50
+ }
51
+ reset() {
52
+ Object.keys(this.stats).forEach((key) => {
53
+ this.stats[key] = { count: 0, totalTime: 0, errors: 0 };
54
+ });
55
+ }
56
+ assertPerformanceBudget(budget = 10) {
57
+ const summary = this.getSummary();
58
+ if (summary.performance.total > budget) {
59
+ throw new Error(`Performance budget exceeded: ${summary.performance.total}ms > ${budget}ms`);
60
+ }
61
+ }
62
+ }
@@ -0,0 +1,14 @@
1
+ export declare class HashUtil {
2
+ /**
3
+ * Hash content for deduplication
4
+ * @param {string} content - Content to hash
5
+ * @returns {string} - SHA256 hash
6
+ */
7
+ hash(content: any): string;
8
+ /**
9
+ * Fast hash for caching (non-cryptographic)
10
+ * @param {string} content - Content to hash
11
+ * @returns {string} - Simple hash
12
+ */
13
+ fastHash(content: any): string;
14
+ }
@@ -1,39 +1,37 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * Content Hashing Utilities
3
4
  * @module smora/scrubber/utils/hash
4
5
  */
5
6
  import crypto from 'crypto';
6
-
7
7
  export class HashUtil {
8
- /**
9
- * Hash content for deduplication
10
- * @param {string} content - Content to hash
11
- * @returns {string} - SHA256 hash
12
- */
13
- hash(content) {
14
- const normalized = content
15
- .toLowerCase()
16
- .trim()
17
- .replace(/\s+/g, ' ');
18
-
19
- return crypto
20
- .createHash('sha256')
21
- .update(normalized)
22
- .digest('hex');
23
- }
24
-
25
- /**
26
- * Fast hash for caching (non-cryptographic)
27
- * @param {string} content - Content to hash
28
- * @returns {string} - Simple hash
29
- */
30
- fastHash(content) {
31
- let hash = 0;
32
- for (let i = 0; i < content.length; i++) {
33
- const char = content.charCodeAt(i);
34
- hash = ((hash << 5) - hash) + char;
35
- hash = hash & hash;
8
+ /**
9
+ * Hash content for deduplication
10
+ * @param {string} content - Content to hash
11
+ * @returns {string} - SHA256 hash
12
+ */
13
+ hash(content) {
14
+ const normalized = content
15
+ .toLowerCase()
16
+ .trim()
17
+ .replace(/\s+/g, ' ');
18
+ return crypto
19
+ .createHash('sha256')
20
+ .update(normalized)
21
+ .digest('hex');
36
22
  }
37
- return hash.toString(36);
38
- }
39
- }
23
+ /**
24
+ * Fast hash for caching (non-cryptographic)
25
+ * @param {string} content - Content to hash
26
+ * @returns {string} - Simple hash
27
+ */
28
+ fastHash(content) {
29
+ let hash = 0;
30
+ for (let i = 0; i < content.length; i++) {
31
+ const char = content.charCodeAt(i);
32
+ hash = ((hash << 5) - hash) + char;
33
+ hash = hash & hash;
34
+ }
35
+ return hash.toString(36);
36
+ }
37
+ }
@@ -0,0 +1,40 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * Content Hashing Utilities
4
+ * @module smora/scrubber/utils/hash
5
+ */
6
+ import crypto from 'crypto';
7
+
8
+ export class HashUtil {
9
+ /**
10
+ * Hash content for deduplication
11
+ * @param {string} content - Content to hash
12
+ * @returns {string} - SHA256 hash
13
+ */
14
+ hash(content) {
15
+ const normalized = content
16
+ .toLowerCase()
17
+ .trim()
18
+ .replace(/\s+/g, ' ');
19
+
20
+ return crypto
21
+ .createHash('sha256')
22
+ .update(normalized)
23
+ .digest('hex');
24
+ }
25
+
26
+ /**
27
+ * Fast hash for caching (non-cryptographic)
28
+ * @param {string} content - Content to hash
29
+ * @returns {string} - Simple hash
30
+ */
31
+ fastHash(content) {
32
+ let hash = 0;
33
+ for (let i = 0; i < content.length; i++) {
34
+ const char = content.charCodeAt(i);
35
+ hash = ((hash << 5) - hash) + char;
36
+ hash = hash & hash;
37
+ }
38
+ return hash.toString(36);
39
+ }
40
+ }
@@ -0,0 +1,14 @@
1
+ /**
2
+ * HTML Parsing Utilities
3
+ * @module smora/scrubber/utils/html-parser
4
+ */
5
+ export declare class HTMLParser {
6
+ /**
7
+ * Extract text content from HTML
8
+ * @param {string} html - HTML content
9
+ * @returns {string} - Extracted text
10
+ */
11
+ parse(html: any): any;
12
+ _extractText(html: any): any;
13
+ _stripTags(html: any): any;
14
+ }
@@ -1,45 +1,38 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * HTML Parsing Utilities
3
4
  * @module smora/scrubber/utils/html-parser
4
5
  */
5
-
6
6
  export class HTMLParser {
7
- /**
8
- * Extract text content from HTML
9
- * @param {string} html - HTML content
10
- * @returns {string} - Extracted text
11
- */
12
- parse(html) {
13
- return this._extractText(html);
14
- }
15
-
16
- _extractText(html) {
17
- // Remove scripts, styles, and comments
18
- let text = html;
19
- text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
20
- text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
21
- text = text.replace(/<!--[\s\S]*?-->/g, '');
22
-
23
- // Convert headings to markdown
24
- text = text.replace(/<h([1-6])([^>]*)>(.*?)<\/h\1>/gi, (match, level, attrs, content) => {
25
- const headingLevel = parseInt(level);
26
- const hashes = '#'.repeat(headingLevel);
27
- return `${hashes} ${this._stripTags(content)}\n\n`;
28
- });
29
-
30
- // Convert paragraphs
31
- text = text.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
32
-
33
- // Convert lists
34
- text = text.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
35
-
36
- // Remove remaining tags
37
- text = text.replace(/<[^>]+>/g, '');
38
-
39
- return text;
40
- }
41
-
42
- _stripTags(html) {
43
- return html.replace(/<[^>]+>/g, '');
44
- }
7
+ /**
8
+ * Extract text content from HTML
9
+ * @param {string} html - HTML content
10
+ * @returns {string} - Extracted text
11
+ */
12
+ parse(html) {
13
+ return this._extractText(html);
14
+ }
15
+ _extractText(html) {
16
+ // Remove scripts, styles, and comments
17
+ let text = html;
18
+ text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
19
+ text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
20
+ text = text.replace(/<!--[\s\S]*?-->/g, '');
21
+ // Convert headings to markdown
22
+ text = text.replace(/<h([1-6])([^>]*)>(.*?)<\/h\1>/gi, (match, level, attrs, content) => {
23
+ const headingLevel = parseInt(level);
24
+ const hashes = '#'.repeat(headingLevel);
25
+ return `${hashes} ${this._stripTags(content)}\n\n`;
26
+ });
27
+ // Convert paragraphs
28
+ text = text.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
29
+ // Convert lists
30
+ text = text.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
31
+ // Remove remaining tags
32
+ text = text.replace(/<[^>]+>/g, '');
33
+ return text;
34
+ }
35
+ _stripTags(html) {
36
+ return html.replace(/<[^>]+>/g, '');
37
+ }
45
38
  }
@@ -0,0 +1,46 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * HTML Parsing Utilities
4
+ * @module smora/scrubber/utils/html-parser
5
+ */
6
+
7
+ export class HTMLParser {
8
+ /**
9
+ * Extract text content from HTML
10
+ * @param {string} html - HTML content
11
+ * @returns {string} - Extracted text
12
+ */
13
+ parse(html) {
14
+ return this._extractText(html);
15
+ }
16
+
17
+ _extractText(html) {
18
+ // Remove scripts, styles, and comments
19
+ let text = html;
20
+ text = text.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
21
+ text = text.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
22
+ text = text.replace(/<!--[\s\S]*?-->/g, '');
23
+
24
+ // Convert headings to markdown
25
+ text = text.replace(/<h([1-6])([^>]*)>(.*?)<\/h\1>/gi, (match, level, attrs, content) => {
26
+ const headingLevel = parseInt(level);
27
+ const hashes = '#'.repeat(headingLevel);
28
+ return `${hashes} ${this._stripTags(content)}\n\n`;
29
+ });
30
+
31
+ // Convert paragraphs
32
+ text = text.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
33
+
34
+ // Convert lists
35
+ text = text.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
36
+
37
+ // Remove remaining tags
38
+ text = text.replace(/<[^>]+>/g, '');
39
+
40
+ return text;
41
+ }
42
+
43
+ _stripTags(html) {
44
+ return html.replace(/<[^>]+>/g, '');
45
+ }
46
+ }
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Boilerplate Pattern Matching Utilities
3
+ * @module smora/scrubber/utils/pattern-matcher
4
+ */
5
+ export declare class PatternMatcher {
6
+ constructor();
7
+ _loadDefaultPatterns(): (string | RegExp)[];
8
+ getBoilerplatePatterns(): any;
9
+ addPattern(pattern: any): void;
10
+ removePattern(index: any): void;
11
+ isBoilerplate(text: any): any;
12
+ }