@yamo/memory-mesh 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/README.md +9 -3
  2. package/bin/memory_mesh.js +95 -8
  3. package/lib/llm/client.d.ts +23 -48
  4. package/lib/llm/client.js +1 -0
  5. package/lib/llm/client.ts +298 -377
  6. package/lib/llm/index.js +1 -0
  7. package/lib/llm/index.ts +1 -2
  8. package/lib/memory/adapters/client.d.ts +22 -85
  9. package/lib/memory/adapters/client.js +1 -0
  10. package/lib/memory/adapters/client.ts +474 -633
  11. package/lib/memory/adapters/config.d.ts +82 -89
  12. package/lib/memory/adapters/config.js +1 -0
  13. package/lib/memory/adapters/config.ts +156 -225
  14. package/lib/memory/adapters/errors.d.ts +28 -20
  15. package/lib/memory/adapters/errors.js +1 -0
  16. package/lib/memory/adapters/errors.ts +83 -120
  17. package/lib/memory/context-manager.d.ts +15 -18
  18. package/lib/memory/context-manager.js +1 -0
  19. package/lib/memory/context-manager.ts +314 -401
  20. package/lib/memory/embeddings/factory.d.ts +18 -20
  21. package/lib/memory/embeddings/factory.js +1 -0
  22. package/lib/memory/embeddings/factory.ts +130 -173
  23. package/lib/memory/embeddings/index.js +1 -0
  24. package/lib/memory/embeddings/index.ts +1 -0
  25. package/lib/memory/embeddings/service.d.ts +36 -66
  26. package/lib/memory/embeddings/service.js +1 -0
  27. package/lib/memory/embeddings/service.ts +479 -616
  28. package/lib/memory/index.d.ts +2 -2
  29. package/lib/memory/index.js +1 -0
  30. package/lib/memory/index.ts +3 -13
  31. package/lib/memory/memory-mesh.d.ts +151 -93
  32. package/lib/memory/memory-mesh.js +1 -0
  33. package/lib/memory/memory-mesh.ts +1406 -1692
  34. package/lib/memory/memory-translator.d.ts +1 -6
  35. package/lib/memory/memory-translator.js +1 -0
  36. package/lib/memory/memory-translator.ts +96 -128
  37. package/lib/memory/schema.d.ts +29 -10
  38. package/lib/memory/schema.js +1 -0
  39. package/lib/memory/schema.ts +102 -185
  40. package/lib/memory/scorer.d.ts +3 -4
  41. package/lib/memory/scorer.js +1 -0
  42. package/lib/memory/scorer.ts +69 -86
  43. package/lib/memory/search/index.js +1 -0
  44. package/lib/memory/search/index.ts +1 -0
  45. package/lib/memory/search/keyword-search.d.ts +10 -26
  46. package/lib/memory/search/keyword-search.js +1 -0
  47. package/lib/memory/search/keyword-search.ts +123 -161
  48. package/lib/scrubber/config/defaults.d.ts +39 -46
  49. package/lib/scrubber/config/defaults.js +1 -0
  50. package/lib/scrubber/config/defaults.ts +50 -112
  51. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  52. package/lib/scrubber/errors/scrubber-error.js +39 -0
  53. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  54. package/lib/scrubber/index.d.ts +0 -1
  55. package/lib/scrubber/index.js +1 -0
  56. package/lib/scrubber/index.ts +1 -2
  57. package/lib/scrubber/scrubber.d.ts +14 -31
  58. package/lib/scrubber/scrubber.js +1 -0
  59. package/lib/scrubber/scrubber.ts +93 -152
  60. package/lib/scrubber/stages/chunker.d.ts +22 -10
  61. package/lib/scrubber/stages/chunker.js +86 -0
  62. package/lib/scrubber/stages/chunker.ts +104 -0
  63. package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
  64. package/lib/scrubber/stages/metadata-annotator.js +64 -0
  65. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  66. package/lib/scrubber/stages/normalizer.d.ts +13 -10
  67. package/lib/scrubber/stages/normalizer.js +51 -0
  68. package/lib/scrubber/stages/normalizer.ts +60 -0
  69. package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
  70. package/lib/scrubber/stages/semantic-filter.js +51 -0
  71. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  72. package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
  73. package/lib/scrubber/stages/structural-cleaner.js +73 -0
  74. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  75. package/lib/scrubber/stages/validator.d.ts +14 -15
  76. package/lib/scrubber/stages/validator.js +56 -0
  77. package/lib/scrubber/stages/validator.ts +67 -0
  78. package/lib/scrubber/telemetry.d.ts +20 -27
  79. package/lib/scrubber/telemetry.js +1 -0
  80. package/lib/scrubber/telemetry.ts +53 -90
  81. package/lib/scrubber/utils/hash.d.ts +14 -0
  82. package/lib/scrubber/utils/hash.js +37 -0
  83. package/lib/scrubber/utils/hash.ts +40 -0
  84. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  85. package/lib/scrubber/utils/html-parser.js +38 -0
  86. package/lib/scrubber/utils/html-parser.ts +46 -0
  87. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  88. package/lib/scrubber/utils/pattern-matcher.js +54 -0
  89. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  90. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  91. package/lib/scrubber/utils/token-counter.js +30 -0
  92. package/lib/scrubber/utils/token-counter.ts +32 -0
  93. package/lib/utils/logger.d.ts +1 -11
  94. package/lib/utils/logger.js +1 -0
  95. package/lib/utils/logger.ts +43 -63
  96. package/lib/utils/skill-metadata.d.ts +6 -14
  97. package/lib/utils/skill-metadata.js +1 -0
  98. package/lib/utils/skill-metadata.ts +89 -103
  99. package/lib/yamo/emitter.d.ts +8 -35
  100. package/lib/yamo/emitter.js +1 -0
  101. package/lib/yamo/emitter.ts +77 -155
  102. package/lib/yamo/index.d.ts +14 -0
  103. package/lib/yamo/index.js +14 -0
  104. package/lib/yamo/index.ts +16 -0
  105. package/lib/yamo/schema.d.ts +8 -10
  106. package/lib/yamo/schema.js +1 -0
  107. package/lib/yamo/schema.ts +82 -114
  108. package/package.json +5 -2
@@ -1,7 +1,6 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * YAMO Scrubber Module
3
4
  * PII and sensitive data sanitization
4
5
  */
5
-
6
6
  export { Scrubber } from "./scrubber.js";
7
- export { ScrubberConfig } from "./config/defaults.js";
@@ -8,43 +8,26 @@ import { Normalizer } from "./stages/normalizer.js";
8
8
  import { Chunker } from "./stages/chunker.js";
9
9
  import { MetadataAnnotator } from "./stages/metadata-annotator.js";
10
10
  import { Validator } from "./stages/validator.js";
11
- import { ScrubberTelemetry, TelemetrySummary, StageSummary } from "./telemetry.js";
12
- import { ScrubberConfig } from "./config/defaults.js";
13
- export interface ScrubberDocument {
14
- content: string;
15
- source: string;
16
- type: string;
17
- }
18
- export interface Chunk {
19
- text: string;
20
- [key: string]: any;
21
- }
22
- export interface ScrubberResult {
23
- chunks: Chunk[];
24
- metadata: {
25
- source: string;
26
- type: string;
27
- processingTimestamp: string;
28
- [key: string]: any;
29
- };
30
- telemetry: Partial<Record<string, StageSummary>> & {
31
- totalDuration?: number;
32
- };
33
- success?: boolean;
34
- error?: string;
35
- }
36
11
  export declare class Scrubber {
37
- config: ScrubberConfig;
12
+ config: any;
38
13
  stages: any;
39
- telemetry: ScrubberTelemetry;
40
- constructor(config?: Partial<ScrubberConfig>);
14
+ telemetry: any;
15
+ constructor(config?: {});
41
16
  /**
42
17
  * Main entry point - process a raw document
43
18
  * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
44
19
  * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
45
20
  */
46
- process(document: ScrubberDocument): Promise<ScrubberResult>;
47
- _executeStage<T>(stageName: string, stageFn: () => Promise<T> | T): Promise<T>;
21
+ process(document: any): Promise<{
22
+ chunks: any[];
23
+ metadata: {
24
+ source: any;
25
+ type: any;
26
+ processingTimestamp: string;
27
+ };
28
+ telemetry: {};
29
+ }>;
30
+ _executeStage(stageName: any, stageFn: any): Promise<any>;
48
31
  _initializeStages(): {
49
32
  structural: StructuralCleaner;
50
33
  semantic: SemanticFilter;
@@ -53,7 +36,7 @@ export declare class Scrubber {
53
36
  metadata: MetadataAnnotator;
54
37
  validator: Validator;
55
38
  };
56
- getMetrics(): TelemetrySummary;
39
+ getMetrics(): any;
57
40
  healthCheck(): Promise<{
58
41
  status: string;
59
42
  }>;
@@ -1,3 +1,4 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber - Main Orchestrator
3
4
  * @module smora/scrubber/scrubber
@@ -1,168 +1,109 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber - Main Orchestrator
3
4
  * @module smora/scrubber/scrubber
4
5
  */
5
-
6
6
  import { StructuralCleaner } from "./stages/structural-cleaner.js";
7
7
  import { SemanticFilter } from "./stages/semantic-filter.js";
8
8
  import { Normalizer } from "./stages/normalizer.js";
9
9
  import { Chunker } from "./stages/chunker.js";
10
10
  import { MetadataAnnotator } from "./stages/metadata-annotator.js";
11
11
  import { Validator } from "./stages/validator.js";
12
- import {
13
- ScrubberTelemetry,
14
- TelemetrySummary,
15
- StageSummary,
16
- } from "./telemetry.js";
12
+ import { ScrubberTelemetry, } from "./telemetry.js";
17
13
  // import { ScrubberError } from './errors/scrubber-error'; // Assuming this exists or I should check
18
- import { defaultScrubberConfig, ScrubberConfig } from "./config/defaults.js";
19
-
20
- // Interfaces for input/output
21
- export interface ScrubberDocument {
22
- content: string;
23
- source: string;
24
- type: string;
25
- }
26
-
27
- export interface Chunk {
28
- text: string;
29
- [key: string]: any;
30
- }
31
-
32
- export interface ScrubberResult {
33
- chunks: Chunk[];
34
- metadata: {
35
- source: string;
36
- type: string;
37
- processingTimestamp: string;
38
- [key: string]: any;
39
- };
40
- telemetry: Partial<Record<string, StageSummary>> & { totalDuration?: number };
41
- success?: boolean;
42
- error?: string;
43
- }
44
-
14
+ import { defaultScrubberConfig } from "./config/defaults.js";
45
15
  export class Scrubber {
46
- config: ScrubberConfig;
47
- stages: any; // Using any for stages as they are not yet converted
48
- telemetry: ScrubberTelemetry;
49
-
50
- constructor(config: Partial<ScrubberConfig> = {}) {
51
- this.config = { ...defaultScrubberConfig, ...config };
52
- this.stages = this._initializeStages();
53
- this.telemetry = new ScrubberTelemetry();
54
- }
55
-
56
- /**
57
- * Main entry point - process a raw document
58
- * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
59
- * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
60
- */
61
- async process(document: ScrubberDocument): Promise<ScrubberResult> {
62
- const startTime = Date.now();
63
- const result: ScrubberResult = {
64
- chunks: [],
65
- metadata: {
66
- source: document.source,
67
- type: document.type,
68
- processingTimestamp: new Date().toISOString(),
69
- },
70
- telemetry: {},
71
- };
72
-
73
- try {
74
- // If disabled, return empty chunks
75
- if (!this.config.enabled) {
76
- result.success = true;
77
- result.telemetry.totalDuration = Date.now() - startTime;
78
- return result;
79
- }
80
-
81
- // Stage 1: Structural Cleaning
82
- const cleaned = await this._executeStage("structural", () =>
83
- this.stages.structural.clean(document.content),
84
- );
85
- result.telemetry.structural = this.telemetry.getStageStats("structural");
86
-
87
- // Stage 2: Semantic Filtering
88
- const filtered = await this._executeStage("semantic", () =>
89
- this.stages.semantic.filter(cleaned),
90
- );
91
- result.telemetry.semantic = this.telemetry.getStageStats("semantic");
92
-
93
- // Stage 3: Normalization
94
- const normalized = await this._executeStage("normalization", () =>
95
- this.stages.normalizer.normalize(filtered),
96
- );
97
- result.telemetry.normalization =
98
- this.telemetry.getStageStats("normalization");
99
-
100
- // Stage 4: Chunking
101
- const chunks = await this._executeStage("chunking", () =>
102
- this.stages.chunker.chunk(normalized),
103
- );
104
- result.telemetry.chunking = this.telemetry.getStageStats("chunking");
105
-
106
- // Stage 5: Metadata Annotation
107
- const annotated = await this._executeStage("metadata", () =>
108
- this.stages.metadata.annotate(chunks, document),
109
- );
110
- result.telemetry.metadata = this.telemetry.getStageStats("metadata");
111
-
112
- // Stage 6: Validation
113
- result.chunks = await this._executeStage("validation", () =>
114
- this.stages.validator.validate(annotated),
115
- );
116
- result.telemetry.validation = this.telemetry.getStageStats("validation");
117
-
118
- result.telemetry.totalDuration = Date.now() - startTime;
119
- result.success = true;
120
-
121
- return result;
122
- } catch (error) {
123
- const message = error instanceof Error ? error.message : String(error);
124
- result.success = false;
125
- result.error = message;
126
- result.telemetry.totalDuration = Date.now() - startTime;
127
- return result;
16
+ config;
17
+ stages; // Using any for stages as they are not yet converted
18
+ telemetry;
19
+ constructor(config = {}) {
20
+ this.config = { ...defaultScrubberConfig, ...config };
21
+ this.stages = this._initializeStages();
22
+ this.telemetry = new ScrubberTelemetry();
23
+ }
24
+ /**
25
+ * Main entry point - process a raw document
26
+ * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
27
+ * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
28
+ */
29
+ async process(document) {
30
+ const startTime = Date.now();
31
+ const result = {
32
+ chunks: [],
33
+ metadata: {
34
+ source: document.source,
35
+ type: document.type,
36
+ processingTimestamp: new Date().toISOString(),
37
+ },
38
+ telemetry: {},
39
+ };
40
+ try {
41
+ // If disabled, return empty chunks
42
+ if (!this.config.enabled) {
43
+ result.success = true;
44
+ result.telemetry.totalDuration = Date.now() - startTime;
45
+ return result;
46
+ }
47
+ // Stage 1: Structural Cleaning
48
+ const cleaned = await this._executeStage("structural", () => this.stages.structural.clean(document.content));
49
+ result.telemetry.structural = this.telemetry.getStageStats("structural");
50
+ // Stage 2: Semantic Filtering
51
+ const filtered = await this._executeStage("semantic", () => this.stages.semantic.filter(cleaned));
52
+ result.telemetry.semantic = this.telemetry.getStageStats("semantic");
53
+ // Stage 3: Normalization
54
+ const normalized = await this._executeStage("normalization", () => this.stages.normalizer.normalize(filtered));
55
+ result.telemetry.normalization =
56
+ this.telemetry.getStageStats("normalization");
57
+ // Stage 4: Chunking
58
+ const chunks = await this._executeStage("chunking", () => this.stages.chunker.chunk(normalized));
59
+ result.telemetry.chunking = this.telemetry.getStageStats("chunking");
60
+ // Stage 5: Metadata Annotation
61
+ const annotated = await this._executeStage("metadata", () => this.stages.metadata.annotate(chunks, document));
62
+ result.telemetry.metadata = this.telemetry.getStageStats("metadata");
63
+ // Stage 6: Validation
64
+ result.chunks = await this._executeStage("validation", () => this.stages.validator.validate(annotated));
65
+ result.telemetry.validation = this.telemetry.getStageStats("validation");
66
+ result.telemetry.totalDuration = Date.now() - startTime;
67
+ result.success = true;
68
+ return result;
69
+ }
70
+ catch (error) {
71
+ const message = error instanceof Error ? error.message : String(error);
72
+ result.success = false;
73
+ result.error = message;
74
+ result.telemetry.totalDuration = Date.now() - startTime;
75
+ return result;
76
+ }
77
+ }
78
+ async _executeStage(stageName, stageFn) {
79
+ const startTime = Date.now();
80
+ try {
81
+ const result = await stageFn();
82
+ const duration = Date.now() - startTime;
83
+ this.telemetry.recordStage(stageName, duration, true);
84
+ return result;
85
+ }
86
+ catch (error) {
87
+ const duration = Date.now() - startTime;
88
+ this.telemetry.recordStage(stageName, duration, false);
89
+ throw error;
90
+ }
91
+ }
92
+ _initializeStages() {
93
+ return {
94
+ structural: new StructuralCleaner(this.config.structural),
95
+ semantic: new SemanticFilter(this.config.semantic),
96
+ normalizer: new Normalizer(this.config.normalization),
97
+ chunker: new Chunker(this.config.chunking),
98
+ metadata: new MetadataAnnotator(this.config.metadata),
99
+ validator: new Validator(this.config.validation),
100
+ };
101
+ }
102
+ getMetrics() {
103
+ return this.telemetry.getSummary();
128
104
  }
129
- }
130
-
131
- async _executeStage<T>(
132
- stageName: string,
133
- stageFn: () => Promise<T> | T,
134
- ): Promise<T> {
135
- const startTime = Date.now();
136
- try {
137
- const result = await stageFn();
138
- const duration = Date.now() - startTime;
139
- this.telemetry.recordStage(stageName, duration, true);
140
- return result;
141
- } catch (error) {
142
- const duration = Date.now() - startTime;
143
- this.telemetry.recordStage(stageName, duration, false);
144
- throw error;
105
+ healthCheck() {
106
+ return Promise.resolve({ status: "healthy" });
145
107
  }
146
- }
147
-
148
- _initializeStages() {
149
- return {
150
- structural: new StructuralCleaner(this.config.structural),
151
- semantic: new SemanticFilter(this.config.semantic),
152
- normalizer: new Normalizer(this.config.normalization),
153
- chunker: new Chunker(this.config.chunking),
154
- metadata: new MetadataAnnotator(this.config.metadata),
155
- validator: new Validator(this.config.validation),
156
- };
157
- }
158
-
159
- getMetrics(): TelemetrySummary {
160
- return this.telemetry.getSummary();
161
- }
162
-
163
- healthCheck(): Promise<{ status: string }> {
164
- return Promise.resolve({ status: "healthy" });
165
- }
166
108
  }
167
-
168
109
  export default Scrubber;
@@ -1,13 +1,25 @@
1
1
  /**
2
- * Type definitions for chunker.js
2
+ * S-MORA Layer 0 Scrubber - Stage 4: Chunking
3
+ * @module smora/scrubber/stages/chunker
3
4
  */
4
-
5
- export interface ChunkerConfig {
6
- maxSize?: number;
7
- [key: string]: any;
8
- }
9
-
10
- export class Chunker {
11
- constructor(config?: ChunkerConfig);
12
- chunk(content: string): Promise<string[]>;
5
+ export declare class Chunker {
6
+ constructor(config: any);
7
+ /**
8
+ * Split content into chunks
9
+ * @param {string} content - Normalized content
10
+ * @returns {Promise<Array>} - Array of chunks with metadata
11
+ */
12
+ chunk(content: any): Promise<{
13
+ index: number;
14
+ text: any;
15
+ metadata: {
16
+ tokens: any;
17
+ heading: any;
18
+ position: number;
19
+ };
20
+ }[]>;
21
+ _isHeading(line: any): boolean;
22
+ _shouldStartNewChunk(currentChunk: any, para: any, paraTokens: any, isHeading: any): boolean;
23
+ _extractInitialHeading(content: any): any;
24
+ _extractHeadingText(headingLine: any): any;
13
25
  }
@@ -0,0 +1,86 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 4: Chunking
4
+ * @module smora/scrubber/stages/chunker
5
+ */
6
+ import { TokenCounter } from '../utils/token-counter.js';
7
+ import { ScrubberError } from '../errors/scrubber-error.js';
8
+ export class Chunker {
9
+ constructor(config) {
10
+ this.config = config;
11
+ this.tokenCounter = new TokenCounter();
12
+ }
13
+ /**
14
+ * Split content into chunks
15
+ * @param {string} content - Normalized content
16
+ * @returns {Promise<Array>} - Array of chunks with metadata
17
+ */
18
+ async chunk(content) {
19
+ try {
20
+ const chunks = [];
21
+ const paragraphs = content.split(/\n\n+/);
22
+ let currentChunk = {
23
+ text: '',
24
+ tokens: 0,
25
+ heading: this._extractInitialHeading(content)
26
+ };
27
+ for (const para of paragraphs) {
28
+ const isHeading = this._isHeading(para);
29
+ const paraTokens = this.tokenCounter.count(para);
30
+ if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
31
+ if (currentChunk.tokens >= this.config.minTokens) {
32
+ chunks.push({ ...currentChunk });
33
+ }
34
+ currentChunk = {
35
+ text: '',
36
+ tokens: 0,
37
+ heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
38
+ };
39
+ }
40
+ currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
41
+ currentChunk.tokens += paraTokens;
42
+ if (currentChunk.tokens > this.config.hardMaxTokens) {
43
+ chunks.push({ ...currentChunk });
44
+ currentChunk = { text: '', tokens: 0, heading: null };
45
+ }
46
+ }
47
+ if (currentChunk.tokens >= this.config.minTokens) {
48
+ chunks.push(currentChunk);
49
+ }
50
+ return chunks.map((chunk, index) => ({
51
+ index,
52
+ text: chunk.text.trim(),
53
+ metadata: {
54
+ tokens: chunk.tokens,
55
+ heading: chunk.heading,
56
+ position: index
57
+ }
58
+ }));
59
+ }
60
+ catch (error) {
61
+ const message = error instanceof Error ? error.message : String(error);
62
+ throw new ScrubberError(`Failed to chunk content: ${message}`, { stage: 'chunker', originalError: error });
63
+ }
64
+ }
65
+ _isHeading(line) {
66
+ return /^#{1,6}\s/.test(line);
67
+ }
68
+ _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
69
+ if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
70
+ return true;
71
+ }
72
+ const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
73
+ if (wouldExceed && currentChunk.tokens > 0) {
74
+ return true;
75
+ }
76
+ return false;
77
+ }
78
+ _extractInitialHeading(content) {
79
+ const match = content.match(/^#{1,6}\s+(.+)$/m);
80
+ return match ? match[1] : null;
81
+ }
82
+ _extractHeadingText(headingLine) {
83
+ const match = headingLine.match(/^#{1,6}\s+(.+)$/);
84
+ return match ? match[1] : null;
85
+ }
86
+ }
@@ -0,0 +1,104 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 4: Chunking
4
+ * @module smora/scrubber/stages/chunker
5
+ */
6
+
7
+ import { TokenCounter } from '../utils/token-counter.js';
8
+ import { ChunkingError, ScrubberError } from '../errors/scrubber-error.js';
9
+
10
+ export class Chunker {
11
+ constructor(config) {
12
+ this.config = config;
13
+ this.tokenCounter = new TokenCounter();
14
+ }
15
+
16
+ /**
17
+ * Split content into chunks
18
+ * @param {string} content - Normalized content
19
+ * @returns {Promise<Array>} - Array of chunks with metadata
20
+ */
21
+ async chunk(content) {
22
+ try {
23
+ const chunks = [];
24
+ const paragraphs = content.split(/\n\n+/);
25
+
26
+ let currentChunk = {
27
+ text: '',
28
+ tokens: 0,
29
+ heading: this._extractInitialHeading(content)
30
+ };
31
+
32
+ for (const para of paragraphs) {
33
+ const isHeading = this._isHeading(para);
34
+ const paraTokens = this.tokenCounter.count(para);
35
+
36
+ if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
37
+ if (currentChunk.tokens >= this.config.minTokens) {
38
+ chunks.push({ ...currentChunk });
39
+ }
40
+ currentChunk = {
41
+ text: '',
42
+ tokens: 0,
43
+ heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
44
+ };
45
+ }
46
+
47
+ currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
48
+ currentChunk.tokens += paraTokens;
49
+
50
+ if (currentChunk.tokens > this.config.hardMaxTokens) {
51
+ chunks.push({ ...currentChunk });
52
+ currentChunk = { text: '', tokens: 0, heading: null };
53
+ }
54
+ }
55
+
56
+ if (currentChunk.tokens >= this.config.minTokens) {
57
+ chunks.push(currentChunk);
58
+ }
59
+
60
+ return chunks.map((chunk, index) => ({
61
+ index,
62
+ text: chunk.text.trim(),
63
+ metadata: {
64
+ tokens: chunk.tokens,
65
+ heading: chunk.heading,
66
+ position: index
67
+ }
68
+ }));
69
+ } catch (error) {
70
+ const message = error instanceof Error ? error.message : String(error);
71
+ throw new ScrubberError(
72
+ `Failed to chunk content: ${message}`,
73
+ { stage: 'chunker', originalError: error }
74
+ );
75
+ }
76
+ }
77
+
78
+ _isHeading(line) {
79
+ return /^#{1,6}\s/.test(line);
80
+ }
81
+
82
+ _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
83
+ if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
84
+ return true;
85
+ }
86
+
87
+ const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
88
+ if (wouldExceed && currentChunk.tokens > 0) {
89
+ return true;
90
+ }
91
+
92
+ return false;
93
+ }
94
+
95
+ _extractInitialHeading(content) {
96
+ const match = content.match(/^#{1,6}\s+(.+)$/m);
97
+ return match ? match[1] : null;
98
+ }
99
+
100
+ _extractHeadingText(headingLine) {
101
+ const match = headingLine.match(/^#{1,6}\s+(.+)$/);
102
+ return match ? match[1] : null;
103
+ }
104
+ }
@@ -1,18 +1,17 @@
1
1
  /**
2
- * Type definitions for metadata-annotator.js
2
+ * S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
3
+ * @module smora/scrubber/stages/metadata-annotator
3
4
  */
4
-
5
- export interface AnnotatorConfig {
6
- includeTimestamp?: boolean;
7
- [key: string]: any;
8
- }
9
-
10
- export interface AnnotatedData {
11
- content: string;
12
- metadata: Record<string, any>;
13
- }
14
-
15
- export class MetadataAnnotator {
16
- constructor(config?: AnnotatorConfig);
17
- annotate(content: string): Promise<AnnotatedData>;
5
+ export declare class MetadataAnnotator {
6
+ constructor(config: any);
7
+ /**
8
+ * Add metadata to chunks
9
+ * @param {Array} chunks - Array of chunks
10
+ * @param {Object} document - Original document metadata
11
+ * @returns {Promise<Array>} - Annotated chunks
12
+ */
13
+ annotate(chunks: any, document: any): Promise<any>;
14
+ _extractSection(chunk: any): any;
15
+ _buildHeadingPath(chunk: any, currentPath: any): any[];
16
+ _isSubHeading(heading1: any, heading2: any): boolean;
18
17
  }
@@ -0,0 +1,64 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
4
+ * @module smora/scrubber/stages/metadata-annotator
5
+ */
6
+ import { HashUtil } from '../utils/hash.js';
7
+ export class MetadataAnnotator {
8
+ constructor(config) {
9
+ this.config = config;
10
+ this.hashUtil = new HashUtil();
11
+ }
12
+ /**
13
+ * Add metadata to chunks
14
+ * @param {Array} chunks - Array of chunks
15
+ * @param {Object} document - Original document metadata
16
+ * @returns {Promise<Array>} - Annotated chunks
17
+ */
18
+ async annotate(chunks, document) {
19
+ const headingPath = [];
20
+ return chunks.map((chunk, index) => {
21
+ const metadata = {
22
+ ...chunk.metadata,
23
+ source: this.config.addSource ? document.source : undefined,
24
+ doc_type: this.config.addSource ? document.type : undefined,
25
+ section: this.config.addSection ? this._extractSection(chunk) : undefined,
26
+ heading_path: this.config.addHeadingPath ?
27
+ this._buildHeadingPath(chunk, headingPath) :
28
+ undefined,
29
+ ingestion_timestamp: this.config.addTimestamp ?
30
+ new Date().toISOString() :
31
+ undefined,
32
+ hash: this.config.addHash ?
33
+ this.hashUtil.hash(chunk.text) :
34
+ undefined
35
+ };
36
+ return {
37
+ ...chunk,
38
+ metadata: Object.fromEntries(Object.entries(metadata).filter(([_, v]) => v !== undefined))
39
+ };
40
+ });
41
+ }
42
+ _extractSection(chunk) {
43
+ if (chunk.metadata.heading) {
44
+ return chunk.metadata.heading;
45
+ }
46
+ return 'unnamed-section';
47
+ }
48
+ _buildHeadingPath(chunk, currentPath) {
49
+ const heading = chunk.metadata.heading;
50
+ if (heading && heading !== currentPath[currentPath.length - 1]) {
51
+ if (currentPath.length === 0 || this._isSubHeading(heading, currentPath[currentPath.length - 1])) {
52
+ currentPath.push(heading);
53
+ }
54
+ else {
55
+ currentPath.length = 0;
56
+ currentPath.push(heading);
57
+ }
58
+ }
59
+ return [...currentPath];
60
+ }
61
+ _isSubHeading(heading1, heading2) {
62
+ return heading1.length > heading2.length;
63
+ }
64
+ }