@yamo/memory-mesh 2.3.2 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/README.md +8 -2
  2. package/bin/memory_mesh.js +1 -1
  3. package/lib/llm/client.d.ts +86 -0
  4. package/lib/llm/client.js +300 -357
  5. package/lib/llm/client.ts +334 -0
  6. package/lib/llm/index.d.ts +17 -0
  7. package/lib/llm/index.js +16 -8
  8. package/lib/llm/index.ts +18 -0
  9. package/lib/memory/adapters/client.d.ts +120 -0
  10. package/lib/memory/adapters/client.js +519 -0
  11. package/lib/memory/adapters/client.ts +519 -0
  12. package/lib/memory/adapters/config.d.ts +130 -0
  13. package/lib/memory/adapters/config.js +190 -0
  14. package/lib/memory/adapters/config.ts +190 -0
  15. package/lib/memory/adapters/errors.d.ts +84 -0
  16. package/lib/memory/adapters/errors.js +129 -0
  17. package/lib/memory/adapters/errors.ts +129 -0
  18. package/lib/memory/context-manager.d.ts +41 -0
  19. package/lib/memory/context-manager.js +345 -0
  20. package/lib/memory/context-manager.ts +345 -0
  21. package/lib/memory/embeddings/factory.d.ts +57 -0
  22. package/lib/memory/embeddings/factory.js +149 -0
  23. package/lib/memory/embeddings/factory.ts +149 -0
  24. package/lib/memory/embeddings/index.d.ts +2 -0
  25. package/lib/memory/embeddings/index.js +3 -0
  26. package/lib/memory/embeddings/index.ts +3 -0
  27. package/lib/memory/embeddings/service.d.ts +134 -0
  28. package/lib/memory/embeddings/service.js +516 -0
  29. package/lib/memory/embeddings/service.ts +516 -0
  30. package/lib/memory/index.d.ts +9 -0
  31. package/lib/memory/index.js +10 -1
  32. package/lib/memory/index.ts +10 -0
  33. package/lib/memory/memory-mesh.d.ts +332 -0
  34. package/lib/memory/memory-mesh.js +1470 -678
  35. package/lib/memory/memory-mesh.ts +1517 -0
  36. package/lib/memory/memory-translator.d.ts +14 -0
  37. package/lib/memory/memory-translator.js +126 -0
  38. package/lib/memory/memory-translator.ts +126 -0
  39. package/lib/memory/schema.d.ts +130 -0
  40. package/lib/memory/schema.js +184 -0
  41. package/lib/memory/schema.ts +184 -0
  42. package/lib/memory/scorer.d.ts +25 -0
  43. package/lib/memory/scorer.js +78 -0
  44. package/lib/memory/scorer.ts +78 -0
  45. package/lib/memory/search/index.d.ts +1 -0
  46. package/lib/memory/search/index.js +2 -0
  47. package/lib/memory/search/index.ts +2 -0
  48. package/lib/memory/search/keyword-search.d.ts +46 -0
  49. package/lib/memory/search/keyword-search.js +136 -0
  50. package/lib/memory/search/keyword-search.ts +136 -0
  51. package/lib/scrubber/config/defaults.d.ts +46 -0
  52. package/lib/scrubber/config/defaults.js +50 -57
  53. package/lib/scrubber/config/defaults.ts +55 -0
  54. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  55. package/lib/scrubber/errors/scrubber-error.js +28 -32
  56. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  57. package/lib/scrubber/index.d.ts +5 -0
  58. package/lib/scrubber/index.js +4 -23
  59. package/lib/scrubber/index.ts +6 -0
  60. package/lib/scrubber/scrubber.d.ts +44 -0
  61. package/lib/scrubber/scrubber.js +100 -121
  62. package/lib/scrubber/scrubber.ts +109 -0
  63. package/lib/scrubber/stages/chunker.d.ts +25 -0
  64. package/lib/scrubber/stages/chunker.js +74 -91
  65. package/lib/scrubber/stages/chunker.ts +104 -0
  66. package/lib/scrubber/stages/metadata-annotator.d.ts +17 -0
  67. package/lib/scrubber/stages/metadata-annotator.js +55 -65
  68. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  69. package/lib/scrubber/stages/normalizer.d.ts +16 -0
  70. package/lib/scrubber/stages/normalizer.js +42 -50
  71. package/lib/scrubber/stages/normalizer.ts +60 -0
  72. package/lib/scrubber/stages/semantic-filter.d.ts +16 -0
  73. package/lib/scrubber/stages/semantic-filter.js +42 -52
  74. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  75. package/lib/scrubber/stages/structural-cleaner.d.ts +18 -0
  76. package/lib/scrubber/stages/structural-cleaner.js +66 -75
  77. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  78. package/lib/scrubber/stages/validator.d.ts +17 -0
  79. package/lib/scrubber/stages/validator.js +46 -56
  80. package/lib/scrubber/stages/validator.ts +67 -0
  81. package/lib/scrubber/telemetry.d.ts +29 -0
  82. package/lib/scrubber/telemetry.js +54 -58
  83. package/lib/scrubber/telemetry.ts +62 -0
  84. package/lib/scrubber/utils/hash.d.ts +14 -0
  85. package/lib/scrubber/utils/hash.js +30 -32
  86. package/lib/scrubber/utils/hash.ts +40 -0
  87. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  88. package/lib/scrubber/utils/html-parser.js +32 -39
  89. package/lib/scrubber/utils/html-parser.ts +46 -0
  90. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  91. package/lib/scrubber/utils/pattern-matcher.js +48 -57
  92. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  93. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  94. package/lib/scrubber/utils/token-counter.js +24 -25
  95. package/lib/scrubber/utils/token-counter.ts +32 -0
  96. package/lib/utils/logger.d.ts +19 -0
  97. package/lib/utils/logger.js +65 -0
  98. package/lib/utils/logger.ts +65 -0
  99. package/lib/utils/skill-metadata.d.ts +24 -0
  100. package/lib/utils/skill-metadata.js +133 -0
  101. package/lib/utils/skill-metadata.ts +133 -0
  102. package/lib/yamo/emitter.d.ts +46 -0
  103. package/lib/yamo/emitter.js +79 -143
  104. package/lib/yamo/emitter.ts +171 -0
  105. package/lib/yamo/index.d.ts +14 -0
  106. package/lib/yamo/index.js +6 -7
  107. package/lib/yamo/index.ts +16 -0
  108. package/lib/yamo/schema.d.ts +56 -0
  109. package/lib/yamo/schema.js +82 -108
  110. package/lib/yamo/schema.ts +133 -0
  111. package/package.json +13 -8
  112. package/index.d.ts +0 -111
  113. package/lib/embeddings/factory.js +0 -151
  114. package/lib/embeddings/index.js +0 -2
  115. package/lib/embeddings/service.js +0 -586
  116. package/lib/index.js +0 -6
  117. package/lib/lancedb/client.js +0 -633
  118. package/lib/lancedb/config.js +0 -215
  119. package/lib/lancedb/errors.js +0 -144
  120. package/lib/lancedb/index.js +0 -4
  121. package/lib/lancedb/schema.js +0 -217
  122. package/lib/search/index.js +0 -1
  123. package/lib/search/keyword-search.js +0 -144
  124. package/lib/utils/index.js +0 -1
@@ -0,0 +1,46 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber Default Configuration
3
+ * @module smora/scrubber/config/defaults
4
+ */
5
+ export declare const defaultScrubberConfig: {
6
+ enabled: boolean;
7
+ structural: {
8
+ stripHTML: boolean;
9
+ normalizeMarkdown: boolean;
10
+ collapseWhitespace: boolean;
11
+ removeScripts: boolean;
12
+ removeStyles: boolean;
13
+ };
14
+ semantic: {
15
+ removeDuplicates: boolean;
16
+ removeBoilerplate: boolean;
17
+ minSignalRatio: number;
18
+ boilerplatePatterns: string;
19
+ };
20
+ normalization: {
21
+ normalizeHeadings: boolean;
22
+ normalizeLists: boolean;
23
+ normalizePunctuation: boolean;
24
+ };
25
+ chunking: {
26
+ maxTokens: number;
27
+ minTokens: number;
28
+ hardMaxTokens: number;
29
+ splitOnHeadings: boolean;
30
+ preserveContext: boolean;
31
+ };
32
+ metadata: {
33
+ addSource: boolean;
34
+ addSection: boolean;
35
+ addHeadingPath: boolean;
36
+ addTimestamp: boolean;
37
+ addHash: boolean;
38
+ };
39
+ validation: {
40
+ enforceMinLength: boolean;
41
+ enforceMaxLength: boolean;
42
+ rejectEmptyChunks: boolean;
43
+ };
44
+ logTransformations: boolean;
45
+ cachePatterns: boolean;
46
+ };
@@ -1,62 +1,55 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Default Configuration
3
4
  * @module smora/scrubber/config/defaults
4
5
  */
5
-
6
6
  export const defaultScrubberConfig = {
7
- // Master switch
8
- enabled: false,
9
-
10
- // Stage 1: Structural Cleaning
11
- structural: {
12
- stripHTML: true,
13
- normalizeMarkdown: true,
14
- collapseWhitespace: true,
15
- removeScripts: true,
16
- removeStyles: true
17
- },
18
-
19
- // Stage 2: Semantic Filtering
20
- semantic: {
21
- removeDuplicates: true,
22
- removeBoilerplate: true,
23
- minSignalRatio: 0.3,
24
- boilerplatePatterns: 'default'
25
- },
26
-
27
- // Stage 3: Normalization
28
- normalization: {
29
- normalizeHeadings: true,
30
- normalizeLists: true,
31
- normalizePunctuation: true
32
- },
33
-
34
- // Stage 4: Chunking
35
- chunking: {
36
- maxTokens: 500,
37
- minTokens: 10,
38
- hardMaxTokens: 2000,
39
- splitOnHeadings: true,
40
- preserveContext: true
41
- },
42
-
43
- // Stage 5: Metadata Annotation
44
- metadata: {
45
- addSource: true,
46
- addSection: true,
47
- addHeadingPath: true,
48
- addTimestamp: true,
49
- addHash: true
50
- },
51
-
52
- // Stage 6: Validation
53
- validation: {
54
- enforceMinLength: true,
55
- enforceMaxLength: true,
56
- rejectEmptyChunks: true
57
- },
58
-
59
- // Performance
60
- logTransformations: false,
61
- cachePatterns: true
62
- };
7
+ // Master switch - enabled by default for security (PII/sensitive data protection)
8
+ enabled: true,
9
+ // Stage 1: Structural Cleaning
10
+ structural: {
11
+ stripHTML: true,
12
+ normalizeMarkdown: true,
13
+ collapseWhitespace: true,
14
+ removeScripts: true,
15
+ removeStyles: true,
16
+ },
17
+ // Stage 2: Semantic Filtering
18
+ semantic: {
19
+ removeDuplicates: true,
20
+ removeBoilerplate: true,
21
+ minSignalRatio: 0.3,
22
+ boilerplatePatterns: "default",
23
+ },
24
+ // Stage 3: Normalization
25
+ normalization: {
26
+ normalizeHeadings: true,
27
+ normalizeLists: true,
28
+ normalizePunctuation: true,
29
+ },
30
+ // Stage 4: Chunking
31
+ chunking: {
32
+ maxTokens: 500,
33
+ minTokens: 10,
34
+ hardMaxTokens: 2000,
35
+ splitOnHeadings: true,
36
+ preserveContext: true,
37
+ },
38
+ // Stage 5: Metadata Annotation
39
+ metadata: {
40
+ addSource: true,
41
+ addSection: true,
42
+ addHeadingPath: true,
43
+ addTimestamp: true,
44
+ addHash: true,
45
+ },
46
+ // Stage 6: Validation
47
+ validation: {
48
+ enforceMinLength: true,
49
+ enforceMaxLength: true,
50
+ rejectEmptyChunks: true,
51
+ },
52
+ // Performance
53
+ logTransformations: false,
54
+ cachePatterns: true,
55
+ };
@@ -0,0 +1,55 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber Default Configuration
4
+ * @module smora/scrubber/config/defaults
5
+ */
6
+ export const defaultScrubberConfig = {
7
+ // Master switch - enabled by default for security (PII/sensitive data protection)
8
+ enabled: true,
9
+ // Stage 1: Structural Cleaning
10
+ structural: {
11
+ stripHTML: true,
12
+ normalizeMarkdown: true,
13
+ collapseWhitespace: true,
14
+ removeScripts: true,
15
+ removeStyles: true,
16
+ },
17
+ // Stage 2: Semantic Filtering
18
+ semantic: {
19
+ removeDuplicates: true,
20
+ removeBoilerplate: true,
21
+ minSignalRatio: 0.3,
22
+ boilerplatePatterns: "default",
23
+ },
24
+ // Stage 3: Normalization
25
+ normalization: {
26
+ normalizeHeadings: true,
27
+ normalizeLists: true,
28
+ normalizePunctuation: true,
29
+ },
30
+ // Stage 4: Chunking
31
+ chunking: {
32
+ maxTokens: 500,
33
+ minTokens: 10,
34
+ hardMaxTokens: 2000,
35
+ splitOnHeadings: true,
36
+ preserveContext: true,
37
+ },
38
+ // Stage 5: Metadata Annotation
39
+ metadata: {
40
+ addSource: true,
41
+ addSection: true,
42
+ addHeadingPath: true,
43
+ addTimestamp: true,
44
+ addHash: true,
45
+ },
46
+ // Stage 6: Validation
47
+ validation: {
48
+ enforceMinLength: true,
49
+ enforceMaxLength: true,
50
+ rejectEmptyChunks: true,
51
+ },
52
+ // Performance
53
+ logTransformations: false,
54
+ cachePatterns: true,
55
+ };
@@ -0,0 +1,22 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber Error Classes
3
+ * @module smora/scrubber/errors/scrubber-error
4
+ */
5
+ export declare class ScrubberError extends Error {
6
+ constructor(message: any, details?: {});
7
+ toJSON(): {
8
+ name: string;
9
+ message: string;
10
+ details: any;
11
+ timestamp: any;
12
+ };
13
+ }
14
+ export declare class StructuralCleaningError extends ScrubberError {
15
+ constructor(message: any, details?: {});
16
+ }
17
+ export declare class ChunkingError extends ScrubberError {
18
+ constructor(message: any, details?: {});
19
+ }
20
+ export declare class ValidationError extends ScrubberError {
21
+ constructor(message: any, details?: {});
22
+ }
@@ -1,43 +1,39 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber Error Classes
3
4
  * @module smora/scrubber/errors/scrubber-error
4
5
  */
5
-
6
6
  export class ScrubberError extends Error {
7
- constructor(message, details = {}) {
8
- super(message);
9
- this.name = 'ScrubberError';
10
- this.details = details;
11
- this.timestamp = new Date().toISOString();
12
- }
13
-
14
- toJSON() {
15
- return {
16
- name: this.name,
17
- message: this.message,
18
- details: this.details,
19
- timestamp: this.timestamp
20
- };
21
- }
7
+ constructor(message, details = {}) {
8
+ super(message);
9
+ this.name = 'ScrubberError';
10
+ this.details = details;
11
+ this.timestamp = new Date().toISOString();
12
+ }
13
+ toJSON() {
14
+ return {
15
+ name: this.name,
16
+ message: this.message,
17
+ details: this.details,
18
+ timestamp: this.timestamp
19
+ };
20
+ }
22
21
  }
23
-
24
22
  export class StructuralCleaningError extends ScrubberError {
25
- constructor(message, details = {}) {
26
- super(message, details);
27
- this.name = 'StructuralCleaningError';
28
- }
23
+ constructor(message, details = {}) {
24
+ super(message, details);
25
+ this.name = 'StructuralCleaningError';
26
+ }
29
27
  }
30
-
31
28
  export class ChunkingError extends ScrubberError {
32
- constructor(message, details = {}) {
33
- super(message, details);
34
- this.name = 'ChunkingError';
35
- }
29
+ constructor(message, details = {}) {
30
+ super(message, details);
31
+ this.name = 'ChunkingError';
32
+ }
36
33
  }
37
-
38
34
  export class ValidationError extends ScrubberError {
39
- constructor(message, details = {}) {
40
- super(message, details);
41
- this.name = 'ValidationError';
42
- }
43
- }
35
+ constructor(message, details = {}) {
36
+ super(message, details);
37
+ this.name = 'ValidationError';
38
+ }
39
+ }
@@ -0,0 +1,44 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber Error Classes
4
+ * @module smora/scrubber/errors/scrubber-error
5
+ */
6
+
7
+ export class ScrubberError extends Error {
8
+ constructor(message, details = {}) {
9
+ super(message);
10
+ this.name = 'ScrubberError';
11
+ this.details = details;
12
+ this.timestamp = new Date().toISOString();
13
+ }
14
+
15
+ toJSON() {
16
+ return {
17
+ name: this.name,
18
+ message: this.message,
19
+ details: this.details,
20
+ timestamp: this.timestamp
21
+ };
22
+ }
23
+ }
24
+
25
+ export class StructuralCleaningError extends ScrubberError {
26
+ constructor(message, details = {}) {
27
+ super(message, details);
28
+ this.name = 'StructuralCleaningError';
29
+ }
30
+ }
31
+
32
+ export class ChunkingError extends ScrubberError {
33
+ constructor(message, details = {}) {
34
+ super(message, details);
35
+ this.name = 'ChunkingError';
36
+ }
37
+ }
38
+
39
+ export class ValidationError extends ScrubberError {
40
+ constructor(message, details = {}) {
41
+ super(message, details);
42
+ this.name = 'ValidationError';
43
+ }
44
+ }
@@ -0,0 +1,5 @@
1
+ /**
2
+ * YAMO Scrubber Module
3
+ * PII and sensitive data sanitization
4
+ */
5
+ export { Scrubber } from "./scrubber.js";
@@ -1,25 +1,6 @@
1
+ // @ts-nocheck
1
2
  /**
2
- * S-MORA Layer 0 Scrubber
3
- * Deterministic ingestion-time preprocessing layer
4
- * @module smora/scrubber
3
+ * YAMO Scrubber Module
4
+ * PII and sensitive data sanitization
5
5
  */
6
-
7
- export { defaultScrubberConfig } from './config/defaults.js';
8
- export {
9
- ScrubberError,
10
- StructuralCleaningError,
11
- ChunkingError,
12
- ValidationError
13
- } from './errors/scrubber-error.js';
14
- export { ScrubberTelemetry } from './telemetry.js';
15
- export { Scrubber } from './scrubber.js';
16
- export { HashUtil } from './utils/hash.js';
17
- export { TokenCounter } from './utils/token-counter.js';
18
- export { PatternMatcher } from './utils/pattern-matcher.js';
19
- export { HTMLParser } from './utils/html-parser.js';
20
- export { StructuralCleaner } from './stages/structural-cleaner.js';
21
- export { SemanticFilter } from './stages/semantic-filter.js';
22
- export { Normalizer } from './stages/normalizer.js';
23
- export { Chunker } from './stages/chunker.js';
24
- export { MetadataAnnotator } from './stages/metadata-annotator.js';
25
- export { Validator } from './stages/validator.js';
6
+ export { Scrubber } from "./scrubber.js";
@@ -0,0 +1,6 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * YAMO Scrubber Module
4
+ * PII and sensitive data sanitization
5
+ */
6
+ export { Scrubber } from "./scrubber.js";
@@ -0,0 +1,44 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber - Main Orchestrator
3
+ * @module smora/scrubber/scrubber
4
+ */
5
+ import { StructuralCleaner } from "./stages/structural-cleaner.js";
6
+ import { SemanticFilter } from "./stages/semantic-filter.js";
7
+ import { Normalizer } from "./stages/normalizer.js";
8
+ import { Chunker } from "./stages/chunker.js";
9
+ import { MetadataAnnotator } from "./stages/metadata-annotator.js";
10
+ import { Validator } from "./stages/validator.js";
11
+ export declare class Scrubber {
12
+ config: any;
13
+ stages: any;
14
+ telemetry: any;
15
+ constructor(config?: {});
16
+ /**
17
+ * Main entry point - process a raw document
18
+ * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
19
+ * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
20
+ */
21
+ process(document: any): Promise<{
22
+ chunks: any[];
23
+ metadata: {
24
+ source: any;
25
+ type: any;
26
+ processingTimestamp: string;
27
+ };
28
+ telemetry: {};
29
+ }>;
30
+ _executeStage(stageName: any, stageFn: any): Promise<any>;
31
+ _initializeStages(): {
32
+ structural: StructuralCleaner;
33
+ semantic: SemanticFilter;
34
+ normalizer: Normalizer;
35
+ chunker: Chunker;
36
+ metadata: MetadataAnnotator;
37
+ validator: Validator;
38
+ };
39
+ getMetrics(): any;
40
+ healthCheck(): Promise<{
41
+ status: string;
42
+ }>;
43
+ }
44
+ export default Scrubber;
@@ -1,130 +1,109 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber - Main Orchestrator
3
4
  * @module smora/scrubber/scrubber
4
5
  */
5
-
6
- import { StructuralCleaner } from './stages/structural-cleaner.js';
7
- import { SemanticFilter } from './stages/semantic-filter.js';
8
- import { Normalizer } from './stages/normalizer.js';
9
- import { Chunker } from './stages/chunker.js';
10
- import { MetadataAnnotator } from './stages/metadata-annotator.js';
11
- import { Validator } from './stages/validator.js';
12
- import { ScrubberTelemetry } from './telemetry.js';
13
- import { ScrubberError } from './errors/scrubber-error.js';
14
- import { defaultScrubberConfig } from './config/defaults.js';
15
-
6
+ import { StructuralCleaner } from "./stages/structural-cleaner.js";
7
+ import { SemanticFilter } from "./stages/semantic-filter.js";
8
+ import { Normalizer } from "./stages/normalizer.js";
9
+ import { Chunker } from "./stages/chunker.js";
10
+ import { MetadataAnnotator } from "./stages/metadata-annotator.js";
11
+ import { Validator } from "./stages/validator.js";
12
+ import { ScrubberTelemetry, } from "./telemetry.js";
13
+ // import { ScrubberError } from './errors/scrubber-error'; // Assuming this exists or I should check
14
+ import { defaultScrubberConfig } from "./config/defaults.js";
16
15
  export class Scrubber {
17
- constructor(config = {}) {
18
- this.config = { ...defaultScrubberConfig, ...config };
19
- this.stages = this._initializeStages();
20
- this.telemetry = new ScrubberTelemetry();
21
- }
22
-
23
- /**
24
- * Main entry point - process a raw document
25
- * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
26
- * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
27
- */
28
- async process(document) {
29
- const startTime = Date.now();
30
- const result = {
31
- chunks: [],
32
- metadata: {
33
- source: document.source,
34
- type: document.type,
35
- processingTimestamp: new Date().toISOString()
36
- },
37
- telemetry: {}
38
- };
39
-
40
- try {
41
- // If disabled, return empty chunks
42
- if (!this.config.enabled) {
43
- result.success = true;
44
- result.telemetry.totalDuration = Date.now() - startTime;
45
- return result;
46
- }
47
-
48
- // Stage 1: Structural Cleaning
49
- const cleaned = await this._executeStage('structural', () =>
50
- this.stages.structural.clean(document.content)
51
- );
52
- result.telemetry.structural = this.telemetry.getStageStats('structural');
53
-
54
- // Stage 2: Semantic Filtering
55
- const filtered = await this._executeStage('semantic', () =>
56
- this.stages.semantic.filter(cleaned)
57
- );
58
- result.telemetry.semantic = this.telemetry.getStageStats('semantic');
59
-
60
- // Stage 3: Normalization
61
- const normalized = await this._executeStage('normalization', () =>
62
- this.stages.normalizer.normalize(filtered)
63
- );
64
- result.telemetry.normalization = this.telemetry.getStageStats('normalization');
65
-
66
- // Stage 4: Chunking
67
- const chunks = await this._executeStage('chunking', () =>
68
- this.stages.chunker.chunk(normalized)
69
- );
70
- result.telemetry.chunking = this.telemetry.getStageStats('chunking');
71
-
72
- // Stage 5: Metadata Annotation
73
- const annotated = await this._executeStage('metadata', () =>
74
- this.stages.metadata.annotate(chunks, document)
75
- );
76
- result.telemetry.metadata = this.telemetry.getStageStats('metadata');
77
-
78
- // Stage 6: Validation
79
- result.chunks = await this._executeStage('validation', () =>
80
- this.stages.validator.validate(annotated)
81
- );
82
- result.telemetry.validation = this.telemetry.getStageStats('validation');
83
-
84
- result.telemetry.totalDuration = Date.now() - startTime;
85
- result.success = true;
86
-
87
- return result;
88
- } catch (error) {
89
- const message = error instanceof Error ? error.message : String(error);
90
- result.success = false;
91
- result.error = message;
92
- result.telemetry.totalDuration = Date.now() - startTime;
16
+ config;
17
+ stages; // Using any for stages as they are not yet converted
18
+ telemetry;
19
+ constructor(config = {}) {
20
+ this.config = { ...defaultScrubberConfig, ...config };
21
+ this.stages = this._initializeStages();
22
+ this.telemetry = new ScrubberTelemetry();
93
23
  }
94
- }
95
-
96
- async _executeStage(stageName, stageFn) {
97
- const startTime = Date.now();
98
- try {
99
- const result = await stageFn();
100
- const duration = Date.now() - startTime;
101
- this.telemetry.recordStage(stageName, duration, true);
102
- return result;
103
- } catch (error) {
104
- const duration = Date.now() - startTime;
105
- this.telemetry.recordStage(stageName, duration, false);
106
- throw error;
24
+ /**
25
+ * Main entry point - process a raw document
26
+ * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
27
+ * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
28
+ */
29
+ async process(document) {
30
+ const startTime = Date.now();
31
+ const result = {
32
+ chunks: [],
33
+ metadata: {
34
+ source: document.source,
35
+ type: document.type,
36
+ processingTimestamp: new Date().toISOString(),
37
+ },
38
+ telemetry: {},
39
+ };
40
+ try {
41
+ // If disabled, return empty chunks
42
+ if (!this.config.enabled) {
43
+ result.success = true;
44
+ result.telemetry.totalDuration = Date.now() - startTime;
45
+ return result;
46
+ }
47
+ // Stage 1: Structural Cleaning
48
+ const cleaned = await this._executeStage("structural", () => this.stages.structural.clean(document.content));
49
+ result.telemetry.structural = this.telemetry.getStageStats("structural");
50
+ // Stage 2: Semantic Filtering
51
+ const filtered = await this._executeStage("semantic", () => this.stages.semantic.filter(cleaned));
52
+ result.telemetry.semantic = this.telemetry.getStageStats("semantic");
53
+ // Stage 3: Normalization
54
+ const normalized = await this._executeStage("normalization", () => this.stages.normalizer.normalize(filtered));
55
+ result.telemetry.normalization =
56
+ this.telemetry.getStageStats("normalization");
57
+ // Stage 4: Chunking
58
+ const chunks = await this._executeStage("chunking", () => this.stages.chunker.chunk(normalized));
59
+ result.telemetry.chunking = this.telemetry.getStageStats("chunking");
60
+ // Stage 5: Metadata Annotation
61
+ const annotated = await this._executeStage("metadata", () => this.stages.metadata.annotate(chunks, document));
62
+ result.telemetry.metadata = this.telemetry.getStageStats("metadata");
63
+ // Stage 6: Validation
64
+ result.chunks = await this._executeStage("validation", () => this.stages.validator.validate(annotated));
65
+ result.telemetry.validation = this.telemetry.getStageStats("validation");
66
+ result.telemetry.totalDuration = Date.now() - startTime;
67
+ result.success = true;
68
+ return result;
69
+ }
70
+ catch (error) {
71
+ const message = error instanceof Error ? error.message : String(error);
72
+ result.success = false;
73
+ result.error = message;
74
+ result.telemetry.totalDuration = Date.now() - startTime;
75
+ return result;
76
+ }
77
+ }
78
+ async _executeStage(stageName, stageFn) {
79
+ const startTime = Date.now();
80
+ try {
81
+ const result = await stageFn();
82
+ const duration = Date.now() - startTime;
83
+ this.telemetry.recordStage(stageName, duration, true);
84
+ return result;
85
+ }
86
+ catch (error) {
87
+ const duration = Date.now() - startTime;
88
+ this.telemetry.recordStage(stageName, duration, false);
89
+ throw error;
90
+ }
91
+ }
92
+ _initializeStages() {
93
+ return {
94
+ structural: new StructuralCleaner(this.config.structural),
95
+ semantic: new SemanticFilter(this.config.semantic),
96
+ normalizer: new Normalizer(this.config.normalization),
97
+ chunker: new Chunker(this.config.chunking),
98
+ metadata: new MetadataAnnotator(this.config.metadata),
99
+ validator: new Validator(this.config.validation),
100
+ };
101
+ }
102
+ getMetrics() {
103
+ return this.telemetry.getSummary();
104
+ }
105
+ healthCheck() {
106
+ return Promise.resolve({ status: "healthy" });
107
107
  }
108
- }
109
-
110
- _initializeStages() {
111
- return {
112
- structural: new StructuralCleaner(this.config.structural),
113
- semantic: new SemanticFilter(this.config.semantic),
114
- normalizer: new Normalizer(this.config.normalization),
115
- chunker: new Chunker(this.config.chunking),
116
- metadata: new MetadataAnnotator(this.config.metadata),
117
- validator: new Validator(this.config.validation)
118
- };
119
- }
120
-
121
- getMetrics() {
122
- return this.telemetry.getSummary();
123
- }
124
-
125
- async healthCheck() {
126
- return { status: 'healthy' };
127
- }
128
108
  }
129
-
130
109
  export default Scrubber;