@yamo/memory-mesh 2.3.2 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/README.md +8 -2
  2. package/bin/memory_mesh.js +1 -1
  3. package/lib/llm/client.d.ts +86 -0
  4. package/lib/llm/client.js +300 -357
  5. package/lib/llm/client.ts +334 -0
  6. package/lib/llm/index.d.ts +17 -0
  7. package/lib/llm/index.js +16 -8
  8. package/lib/llm/index.ts +18 -0
  9. package/lib/memory/adapters/client.d.ts +120 -0
  10. package/lib/memory/adapters/client.js +519 -0
  11. package/lib/memory/adapters/client.ts +519 -0
  12. package/lib/memory/adapters/config.d.ts +130 -0
  13. package/lib/memory/adapters/config.js +190 -0
  14. package/lib/memory/adapters/config.ts +190 -0
  15. package/lib/memory/adapters/errors.d.ts +84 -0
  16. package/lib/memory/adapters/errors.js +129 -0
  17. package/lib/memory/adapters/errors.ts +129 -0
  18. package/lib/memory/context-manager.d.ts +41 -0
  19. package/lib/memory/context-manager.js +345 -0
  20. package/lib/memory/context-manager.ts +345 -0
  21. package/lib/memory/embeddings/factory.d.ts +57 -0
  22. package/lib/memory/embeddings/factory.js +149 -0
  23. package/lib/memory/embeddings/factory.ts +149 -0
  24. package/lib/memory/embeddings/index.d.ts +2 -0
  25. package/lib/memory/embeddings/index.js +3 -0
  26. package/lib/memory/embeddings/index.ts +3 -0
  27. package/lib/memory/embeddings/service.d.ts +134 -0
  28. package/lib/memory/embeddings/service.js +516 -0
  29. package/lib/memory/embeddings/service.ts +516 -0
  30. package/lib/memory/index.d.ts +9 -0
  31. package/lib/memory/index.js +10 -1
  32. package/lib/memory/index.ts +10 -0
  33. package/lib/memory/memory-mesh.d.ts +332 -0
  34. package/lib/memory/memory-mesh.js +1470 -678
  35. package/lib/memory/memory-mesh.ts +1517 -0
  36. package/lib/memory/memory-translator.d.ts +14 -0
  37. package/lib/memory/memory-translator.js +126 -0
  38. package/lib/memory/memory-translator.ts +126 -0
  39. package/lib/memory/schema.d.ts +130 -0
  40. package/lib/memory/schema.js +184 -0
  41. package/lib/memory/schema.ts +184 -0
  42. package/lib/memory/scorer.d.ts +25 -0
  43. package/lib/memory/scorer.js +78 -0
  44. package/lib/memory/scorer.ts +78 -0
  45. package/lib/memory/search/index.d.ts +1 -0
  46. package/lib/memory/search/index.js +2 -0
  47. package/lib/memory/search/index.ts +2 -0
  48. package/lib/memory/search/keyword-search.d.ts +46 -0
  49. package/lib/memory/search/keyword-search.js +136 -0
  50. package/lib/memory/search/keyword-search.ts +136 -0
  51. package/lib/scrubber/config/defaults.d.ts +46 -0
  52. package/lib/scrubber/config/defaults.js +50 -57
  53. package/lib/scrubber/config/defaults.ts +55 -0
  54. package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
  55. package/lib/scrubber/errors/scrubber-error.js +28 -32
  56. package/lib/scrubber/errors/scrubber-error.ts +44 -0
  57. package/lib/scrubber/index.d.ts +5 -0
  58. package/lib/scrubber/index.js +4 -23
  59. package/lib/scrubber/index.ts +6 -0
  60. package/lib/scrubber/scrubber.d.ts +44 -0
  61. package/lib/scrubber/scrubber.js +100 -121
  62. package/lib/scrubber/scrubber.ts +109 -0
  63. package/lib/scrubber/stages/chunker.d.ts +25 -0
  64. package/lib/scrubber/stages/chunker.js +74 -91
  65. package/lib/scrubber/stages/chunker.ts +104 -0
  66. package/lib/scrubber/stages/metadata-annotator.d.ts +17 -0
  67. package/lib/scrubber/stages/metadata-annotator.js +55 -65
  68. package/lib/scrubber/stages/metadata-annotator.ts +75 -0
  69. package/lib/scrubber/stages/normalizer.d.ts +16 -0
  70. package/lib/scrubber/stages/normalizer.js +42 -50
  71. package/lib/scrubber/stages/normalizer.ts +60 -0
  72. package/lib/scrubber/stages/semantic-filter.d.ts +16 -0
  73. package/lib/scrubber/stages/semantic-filter.js +42 -52
  74. package/lib/scrubber/stages/semantic-filter.ts +62 -0
  75. package/lib/scrubber/stages/structural-cleaner.d.ts +18 -0
  76. package/lib/scrubber/stages/structural-cleaner.js +66 -75
  77. package/lib/scrubber/stages/structural-cleaner.ts +83 -0
  78. package/lib/scrubber/stages/validator.d.ts +17 -0
  79. package/lib/scrubber/stages/validator.js +46 -56
  80. package/lib/scrubber/stages/validator.ts +67 -0
  81. package/lib/scrubber/telemetry.d.ts +29 -0
  82. package/lib/scrubber/telemetry.js +54 -58
  83. package/lib/scrubber/telemetry.ts +62 -0
  84. package/lib/scrubber/utils/hash.d.ts +14 -0
  85. package/lib/scrubber/utils/hash.js +30 -32
  86. package/lib/scrubber/utils/hash.ts +40 -0
  87. package/lib/scrubber/utils/html-parser.d.ts +14 -0
  88. package/lib/scrubber/utils/html-parser.js +32 -39
  89. package/lib/scrubber/utils/html-parser.ts +46 -0
  90. package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
  91. package/lib/scrubber/utils/pattern-matcher.js +48 -57
  92. package/lib/scrubber/utils/pattern-matcher.ts +64 -0
  93. package/lib/scrubber/utils/token-counter.d.ts +18 -0
  94. package/lib/scrubber/utils/token-counter.js +24 -25
  95. package/lib/scrubber/utils/token-counter.ts +32 -0
  96. package/lib/utils/logger.d.ts +19 -0
  97. package/lib/utils/logger.js +65 -0
  98. package/lib/utils/logger.ts +65 -0
  99. package/lib/utils/skill-metadata.d.ts +24 -0
  100. package/lib/utils/skill-metadata.js +133 -0
  101. package/lib/utils/skill-metadata.ts +133 -0
  102. package/lib/yamo/emitter.d.ts +46 -0
  103. package/lib/yamo/emitter.js +79 -143
  104. package/lib/yamo/emitter.ts +171 -0
  105. package/lib/yamo/index.d.ts +14 -0
  106. package/lib/yamo/index.js +6 -7
  107. package/lib/yamo/index.ts +16 -0
  108. package/lib/yamo/schema.d.ts +56 -0
  109. package/lib/yamo/schema.js +82 -108
  110. package/lib/yamo/schema.ts +133 -0
  111. package/package.json +13 -8
  112. package/index.d.ts +0 -111
  113. package/lib/embeddings/factory.js +0 -151
  114. package/lib/embeddings/index.js +0 -2
  115. package/lib/embeddings/service.js +0 -586
  116. package/lib/index.js +0 -6
  117. package/lib/lancedb/client.js +0 -633
  118. package/lib/lancedb/config.js +0 -215
  119. package/lib/lancedb/errors.js +0 -144
  120. package/lib/lancedb/index.js +0 -4
  121. package/lib/lancedb/schema.js +0 -217
  122. package/lib/search/index.js +0 -1
  123. package/lib/search/keyword-search.js +0 -144
  124. package/lib/utils/index.js +0 -1
@@ -0,0 +1,109 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Main Orchestrator
4
+ * @module smora/scrubber/scrubber
5
+ */
6
+ import { StructuralCleaner } from "./stages/structural-cleaner.js";
7
+ import { SemanticFilter } from "./stages/semantic-filter.js";
8
+ import { Normalizer } from "./stages/normalizer.js";
9
+ import { Chunker } from "./stages/chunker.js";
10
+ import { MetadataAnnotator } from "./stages/metadata-annotator.js";
11
+ import { Validator } from "./stages/validator.js";
12
+ import { ScrubberTelemetry, } from "./telemetry.js";
13
+ // import { ScrubberError } from './errors/scrubber-error'; // Assuming this exists or I should check
14
+ import { defaultScrubberConfig } from "./config/defaults.js";
15
+ export class Scrubber {
16
+ config;
17
+ stages; // Using any for stages as they are not yet converted
18
+ telemetry;
19
+ constructor(config = {}) {
20
+ this.config = { ...defaultScrubberConfig, ...config };
21
+ this.stages = this._initializeStages();
22
+ this.telemetry = new ScrubberTelemetry();
23
+ }
24
+ /**
25
+ * Main entry point - process a raw document
26
+ * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
27
+ * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
28
+ */
29
+ async process(document) {
30
+ const startTime = Date.now();
31
+ const result = {
32
+ chunks: [],
33
+ metadata: {
34
+ source: document.source,
35
+ type: document.type,
36
+ processingTimestamp: new Date().toISOString(),
37
+ },
38
+ telemetry: {},
39
+ };
40
+ try {
41
+ // If disabled, return empty chunks
42
+ if (!this.config.enabled) {
43
+ result.success = true;
44
+ result.telemetry.totalDuration = Date.now() - startTime;
45
+ return result;
46
+ }
47
+ // Stage 1: Structural Cleaning
48
+ const cleaned = await this._executeStage("structural", () => this.stages.structural.clean(document.content));
49
+ result.telemetry.structural = this.telemetry.getStageStats("structural");
50
+ // Stage 2: Semantic Filtering
51
+ const filtered = await this._executeStage("semantic", () => this.stages.semantic.filter(cleaned));
52
+ result.telemetry.semantic = this.telemetry.getStageStats("semantic");
53
+ // Stage 3: Normalization
54
+ const normalized = await this._executeStage("normalization", () => this.stages.normalizer.normalize(filtered));
55
+ result.telemetry.normalization =
56
+ this.telemetry.getStageStats("normalization");
57
+ // Stage 4: Chunking
58
+ const chunks = await this._executeStage("chunking", () => this.stages.chunker.chunk(normalized));
59
+ result.telemetry.chunking = this.telemetry.getStageStats("chunking");
60
+ // Stage 5: Metadata Annotation
61
+ const annotated = await this._executeStage("metadata", () => this.stages.metadata.annotate(chunks, document));
62
+ result.telemetry.metadata = this.telemetry.getStageStats("metadata");
63
+ // Stage 6: Validation
64
+ result.chunks = await this._executeStage("validation", () => this.stages.validator.validate(annotated));
65
+ result.telemetry.validation = this.telemetry.getStageStats("validation");
66
+ result.telemetry.totalDuration = Date.now() - startTime;
67
+ result.success = true;
68
+ return result;
69
+ }
70
+ catch (error) {
71
+ const message = error instanceof Error ? error.message : String(error);
72
+ result.success = false;
73
+ result.error = message;
74
+ result.telemetry.totalDuration = Date.now() - startTime;
75
+ return result;
76
+ }
77
+ }
78
+ async _executeStage(stageName, stageFn) {
79
+ const startTime = Date.now();
80
+ try {
81
+ const result = await stageFn();
82
+ const duration = Date.now() - startTime;
83
+ this.telemetry.recordStage(stageName, duration, true);
84
+ return result;
85
+ }
86
+ catch (error) {
87
+ const duration = Date.now() - startTime;
88
+ this.telemetry.recordStage(stageName, duration, false);
89
+ throw error;
90
+ }
91
+ }
92
+ _initializeStages() {
93
+ return {
94
+ structural: new StructuralCleaner(this.config.structural),
95
+ semantic: new SemanticFilter(this.config.semantic),
96
+ normalizer: new Normalizer(this.config.normalization),
97
+ chunker: new Chunker(this.config.chunking),
98
+ metadata: new MetadataAnnotator(this.config.metadata),
99
+ validator: new Validator(this.config.validation),
100
+ };
101
+ }
102
+ getMetrics() {
103
+ return this.telemetry.getSummary();
104
+ }
105
+ healthCheck() {
106
+ return Promise.resolve({ status: "healthy" });
107
+ }
108
+ }
109
+ export default Scrubber;
@@ -0,0 +1,25 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber - Stage 4: Chunking
3
+ * @module smora/scrubber/stages/chunker
4
+ */
5
+ export declare class Chunker {
6
+ constructor(config: any);
7
+ /**
8
+ * Split content into chunks
9
+ * @param {string} content - Normalized content
10
+ * @returns {Promise<Array>} - Array of chunks with metadata
11
+ */
12
+ chunk(content: any): Promise<{
13
+ index: number;
14
+ text: any;
15
+ metadata: {
16
+ tokens: any;
17
+ heading: any;
18
+ position: number;
19
+ };
20
+ }[]>;
21
+ _isHeading(line: any): boolean;
22
+ _shouldStartNewChunk(currentChunk: any, para: any, paraTokens: any, isHeading: any): boolean;
23
+ _extractInitialHeading(content: any): any;
24
+ _extractHeadingText(headingLine: any): any;
25
+ }
@@ -1,103 +1,86 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber - Stage 4: Chunking
3
4
  * @module smora/scrubber/stages/chunker
4
5
  */
5
-
6
6
  import { TokenCounter } from '../utils/token-counter.js';
7
- import { ChunkingError, ScrubberError } from '../errors/scrubber-error.js';
8
-
7
+ import { ScrubberError } from '../errors/scrubber-error.js';
9
8
  export class Chunker {
10
- constructor(config) {
11
- this.config = config;
12
- this.tokenCounter = new TokenCounter();
13
- }
14
-
15
- /**
16
- * Split content into chunks
17
- * @param {string} content - Normalized content
18
- * @returns {Promise<Array>} - Array of chunks with metadata
19
- */
20
- async chunk(content) {
21
- try {
22
- const chunks = [];
23
- const paragraphs = content.split(/\n\n+/);
24
-
25
- let currentChunk = {
26
- text: '',
27
- tokens: 0,
28
- heading: this._extractInitialHeading(content)
29
- };
30
-
31
- for (const para of paragraphs) {
32
- const isHeading = this._isHeading(para);
33
- const paraTokens = this.tokenCounter.count(para);
34
-
35
- if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
36
- if (currentChunk.tokens >= this.config.minTokens) {
37
- chunks.push({ ...currentChunk });
38
- }
39
- currentChunk = {
40
- text: '',
41
- tokens: 0,
42
- heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
43
- };
9
+ constructor(config) {
10
+ this.config = config;
11
+ this.tokenCounter = new TokenCounter();
12
+ }
13
+ /**
14
+ * Split content into chunks
15
+ * @param {string} content - Normalized content
16
+ * @returns {Promise<Array>} - Array of chunks with metadata
17
+ */
18
+ async chunk(content) {
19
+ try {
20
+ const chunks = [];
21
+ const paragraphs = content.split(/\n\n+/);
22
+ let currentChunk = {
23
+ text: '',
24
+ tokens: 0,
25
+ heading: this._extractInitialHeading(content)
26
+ };
27
+ for (const para of paragraphs) {
28
+ const isHeading = this._isHeading(para);
29
+ const paraTokens = this.tokenCounter.count(para);
30
+ if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
31
+ if (currentChunk.tokens >= this.config.minTokens) {
32
+ chunks.push({ ...currentChunk });
33
+ }
34
+ currentChunk = {
35
+ text: '',
36
+ tokens: 0,
37
+ heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
38
+ };
39
+ }
40
+ currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
41
+ currentChunk.tokens += paraTokens;
42
+ if (currentChunk.tokens > this.config.hardMaxTokens) {
43
+ chunks.push({ ...currentChunk });
44
+ currentChunk = { text: '', tokens: 0, heading: null };
45
+ }
46
+ }
47
+ if (currentChunk.tokens >= this.config.minTokens) {
48
+ chunks.push(currentChunk);
49
+ }
50
+ return chunks.map((chunk, index) => ({
51
+ index,
52
+ text: chunk.text.trim(),
53
+ metadata: {
54
+ tokens: chunk.tokens,
55
+ heading: chunk.heading,
56
+ position: index
57
+ }
58
+ }));
59
+ }
60
+ catch (error) {
61
+ const message = error instanceof Error ? error.message : String(error);
62
+ throw new ScrubberError(`Failed to chunk content: ${message}`, { stage: 'chunker', originalError: error });
44
63
  }
45
-
46
- currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
47
- currentChunk.tokens += paraTokens;
48
-
49
- if (currentChunk.tokens > this.config.hardMaxTokens) {
50
- chunks.push({ ...currentChunk });
51
- currentChunk = { text: '', tokens: 0, heading: null };
64
+ }
65
+ _isHeading(line) {
66
+ return /^#{1,6}\s/.test(line);
67
+ }
68
+ _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
69
+ if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
70
+ return true;
52
71
  }
53
- }
54
-
55
- if (currentChunk.tokens >= this.config.minTokens) {
56
- chunks.push(currentChunk);
57
- }
58
-
59
- return chunks.map((chunk, index) => ({
60
- index,
61
- text: chunk.text.trim(),
62
- metadata: {
63
- tokens: chunk.tokens,
64
- heading: chunk.heading,
65
- position: index
72
+ const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
73
+ if (wouldExceed && currentChunk.tokens > 0) {
74
+ return true;
66
75
  }
67
- }));
68
- } catch (error) {
69
- const message = error instanceof Error ? error.message : String(error);
70
- throw new ScrubberError(
71
- `Failed to chunk content: ${message}`,
72
- { stage: 'chunker', originalError: error }
73
- );
76
+ return false;
74
77
  }
75
- }
76
-
77
- _isHeading(line) {
78
- return /^#{1,6}\s/.test(line);
79
- }
80
-
81
- _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
82
- if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
83
- return true;
78
+ _extractInitialHeading(content) {
79
+ const match = content.match(/^#{1,6}\s+(.+)$/m);
80
+ return match ? match[1] : null;
84
81
  }
85
-
86
- const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
87
- if (wouldExceed && currentChunk.tokens > 0) {
88
- return true;
82
+ _extractHeadingText(headingLine) {
83
+ const match = headingLine.match(/^#{1,6}\s+(.+)$/);
84
+ return match ? match[1] : null;
89
85
  }
90
-
91
- return false;
92
- }
93
-
94
- _extractInitialHeading(content) {
95
- const match = content.match(/^#{1,6}\s+(.+)$/m);
96
- return match ? match[1] : null;
97
- }
98
-
99
- _extractHeadingText(headingLine) {
100
- const match = headingLine.match(/^#{1,6}\s+(.+)$/);
101
- return match ? match[1] : null;
102
- }
103
- }
86
+ }
@@ -0,0 +1,104 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 4: Chunking
4
+ * @module smora/scrubber/stages/chunker
5
+ */
6
+
7
+ import { TokenCounter } from '../utils/token-counter.js';
8
+ import { ChunkingError, ScrubberError } from '../errors/scrubber-error.js';
9
+
10
+ export class Chunker {
11
+ constructor(config) {
12
+ this.config = config;
13
+ this.tokenCounter = new TokenCounter();
14
+ }
15
+
16
+ /**
17
+ * Split content into chunks
18
+ * @param {string} content - Normalized content
19
+ * @returns {Promise<Array>} - Array of chunks with metadata
20
+ */
21
+ async chunk(content) {
22
+ try {
23
+ const chunks = [];
24
+ const paragraphs = content.split(/\n\n+/);
25
+
26
+ let currentChunk = {
27
+ text: '',
28
+ tokens: 0,
29
+ heading: this._extractInitialHeading(content)
30
+ };
31
+
32
+ for (const para of paragraphs) {
33
+ const isHeading = this._isHeading(para);
34
+ const paraTokens = this.tokenCounter.count(para);
35
+
36
+ if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
37
+ if (currentChunk.tokens >= this.config.minTokens) {
38
+ chunks.push({ ...currentChunk });
39
+ }
40
+ currentChunk = {
41
+ text: '',
42
+ tokens: 0,
43
+ heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
44
+ };
45
+ }
46
+
47
+ currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
48
+ currentChunk.tokens += paraTokens;
49
+
50
+ if (currentChunk.tokens > this.config.hardMaxTokens) {
51
+ chunks.push({ ...currentChunk });
52
+ currentChunk = { text: '', tokens: 0, heading: null };
53
+ }
54
+ }
55
+
56
+ if (currentChunk.tokens >= this.config.minTokens) {
57
+ chunks.push(currentChunk);
58
+ }
59
+
60
+ return chunks.map((chunk, index) => ({
61
+ index,
62
+ text: chunk.text.trim(),
63
+ metadata: {
64
+ tokens: chunk.tokens,
65
+ heading: chunk.heading,
66
+ position: index
67
+ }
68
+ }));
69
+ } catch (error) {
70
+ const message = error instanceof Error ? error.message : String(error);
71
+ throw new ScrubberError(
72
+ `Failed to chunk content: ${message}`,
73
+ { stage: 'chunker', originalError: error }
74
+ );
75
+ }
76
+ }
77
+
78
+ _isHeading(line) {
79
+ return /^#{1,6}\s/.test(line);
80
+ }
81
+
82
+ _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
83
+ if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
84
+ return true;
85
+ }
86
+
87
+ const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
88
+ if (wouldExceed && currentChunk.tokens > 0) {
89
+ return true;
90
+ }
91
+
92
+ return false;
93
+ }
94
+
95
+ _extractInitialHeading(content) {
96
+ const match = content.match(/^#{1,6}\s+(.+)$/m);
97
+ return match ? match[1] : null;
98
+ }
99
+
100
+ _extractHeadingText(headingLine) {
101
+ const match = headingLine.match(/^#{1,6}\s+(.+)$/);
102
+ return match ? match[1] : null;
103
+ }
104
+ }
@@ -0,0 +1,17 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
3
+ * @module smora/scrubber/stages/metadata-annotator
4
+ */
5
+ export declare class MetadataAnnotator {
6
+ constructor(config: any);
7
+ /**
8
+ * Add metadata to chunks
9
+ * @param {Array} chunks - Array of chunks
10
+ * @param {Object} document - Original document metadata
11
+ * @returns {Promise<Array>} - Annotated chunks
12
+ */
13
+ annotate(chunks: any, document: any): Promise<any>;
14
+ _extractSection(chunk: any): any;
15
+ _buildHeadingPath(chunk: any, currentPath: any): any[];
16
+ _isSubHeading(heading1: any, heading2: any): boolean;
17
+ }
@@ -1,74 +1,64 @@
1
+ // @ts-nocheck
1
2
  /**
2
3
  * S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
3
4
  * @module smora/scrubber/stages/metadata-annotator
4
5
  */
5
-
6
6
  import { HashUtil } from '../utils/hash.js';
7
-
8
7
  export class MetadataAnnotator {
9
- constructor(config) {
10
- this.config = config;
11
- this.hashUtil = new HashUtil();
12
- }
13
-
14
- /**
15
- * Add metadata to chunks
16
- * @param {Array} chunks - Array of chunks
17
- * @param {Object} document - Original document metadata
18
- * @returns {Promise<Array>} - Annotated chunks
19
- */
20
- async annotate(chunks, document) {
21
- const headingPath = [];
22
-
23
- return chunks.map((chunk, index) => {
24
- const metadata = {
25
- ...chunk.metadata,
26
- source: this.config.addSource ? document.source : undefined,
27
- doc_type: this.config.addSource ? document.type : undefined,
28
- section: this.config.addSection ? this._extractSection(chunk) : undefined,
29
- heading_path: this.config.addHeadingPath ?
30
- this._buildHeadingPath(chunk, headingPath) :
31
- undefined,
32
- ingestion_timestamp: this.config.addTimestamp ?
33
- new Date().toISOString() :
34
- undefined,
35
- hash: this.config.addHash ?
36
- this.hashUtil.hash(chunk.text) :
37
- undefined
38
- };
39
-
40
- return {
41
- ...chunk,
42
- metadata: Object.fromEntries(
43
- Object.entries(metadata).filter(([_, v]) => v !== undefined)
44
- )
45
- };
46
- });
47
- }
48
-
49
- _extractSection(chunk) {
50
- if (chunk.metadata.heading) {
51
- return chunk.metadata.heading;
8
+ constructor(config) {
9
+ this.config = config;
10
+ this.hashUtil = new HashUtil();
52
11
  }
53
- return 'unnamed-section';
54
- }
55
-
56
- _buildHeadingPath(chunk, currentPath) {
57
- const heading = chunk.metadata.heading;
58
-
59
- if (heading && heading !== currentPath[currentPath.length - 1]) {
60
- if (currentPath.length === 0 || this._isSubHeading(heading, currentPath[currentPath.length - 1])) {
61
- currentPath.push(heading);
62
- } else {
63
- currentPath.length = 0;
64
- currentPath.push(heading);
65
- }
12
+ /**
13
+ * Add metadata to chunks
14
+ * @param {Array} chunks - Array of chunks
15
+ * @param {Object} document - Original document metadata
16
+ * @returns {Promise<Array>} - Annotated chunks
17
+ */
18
+ async annotate(chunks, document) {
19
+ const headingPath = [];
20
+ return chunks.map((chunk, index) => {
21
+ const metadata = {
22
+ ...chunk.metadata,
23
+ source: this.config.addSource ? document.source : undefined,
24
+ doc_type: this.config.addSource ? document.type : undefined,
25
+ section: this.config.addSection ? this._extractSection(chunk) : undefined,
26
+ heading_path: this.config.addHeadingPath ?
27
+ this._buildHeadingPath(chunk, headingPath) :
28
+ undefined,
29
+ ingestion_timestamp: this.config.addTimestamp ?
30
+ new Date().toISOString() :
31
+ undefined,
32
+ hash: this.config.addHash ?
33
+ this.hashUtil.hash(chunk.text) :
34
+ undefined
35
+ };
36
+ return {
37
+ ...chunk,
38
+ metadata: Object.fromEntries(Object.entries(metadata).filter(([_, v]) => v !== undefined))
39
+ };
40
+ });
41
+ }
42
+ _extractSection(chunk) {
43
+ if (chunk.metadata.heading) {
44
+ return chunk.metadata.heading;
45
+ }
46
+ return 'unnamed-section';
47
+ }
48
+ _buildHeadingPath(chunk, currentPath) {
49
+ const heading = chunk.metadata.heading;
50
+ if (heading && heading !== currentPath[currentPath.length - 1]) {
51
+ if (currentPath.length === 0 || this._isSubHeading(heading, currentPath[currentPath.length - 1])) {
52
+ currentPath.push(heading);
53
+ }
54
+ else {
55
+ currentPath.length = 0;
56
+ currentPath.push(heading);
57
+ }
58
+ }
59
+ return [...currentPath];
60
+ }
61
+ _isSubHeading(heading1, heading2) {
62
+ return heading1.length > heading2.length;
66
63
  }
67
-
68
- return [...currentPath];
69
- }
70
-
71
- _isSubHeading(heading1, heading2) {
72
- return heading1.length > heading2.length;
73
- }
74
64
  }
@@ -0,0 +1,75 @@
1
+ // @ts-nocheck
2
+ /**
3
+ * S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
4
+ * @module smora/scrubber/stages/metadata-annotator
5
+ */
6
+
7
+ import { HashUtil } from '../utils/hash.js';
8
+
9
+ export class MetadataAnnotator {
10
+ constructor(config) {
11
+ this.config = config;
12
+ this.hashUtil = new HashUtil();
13
+ }
14
+
15
+ /**
16
+ * Add metadata to chunks
17
+ * @param {Array} chunks - Array of chunks
18
+ * @param {Object} document - Original document metadata
19
+ * @returns {Promise<Array>} - Annotated chunks
20
+ */
21
+ async annotate(chunks, document) {
22
+ const headingPath = [];
23
+
24
+ return chunks.map((chunk, index) => {
25
+ const metadata = {
26
+ ...chunk.metadata,
27
+ source: this.config.addSource ? document.source : undefined,
28
+ doc_type: this.config.addSource ? document.type : undefined,
29
+ section: this.config.addSection ? this._extractSection(chunk) : undefined,
30
+ heading_path: this.config.addHeadingPath ?
31
+ this._buildHeadingPath(chunk, headingPath) :
32
+ undefined,
33
+ ingestion_timestamp: this.config.addTimestamp ?
34
+ new Date().toISOString() :
35
+ undefined,
36
+ hash: this.config.addHash ?
37
+ this.hashUtil.hash(chunk.text) :
38
+ undefined
39
+ };
40
+
41
+ return {
42
+ ...chunk,
43
+ metadata: Object.fromEntries(
44
+ Object.entries(metadata).filter(([_, v]) => v !== undefined)
45
+ )
46
+ };
47
+ });
48
+ }
49
+
50
+ _extractSection(chunk) {
51
+ if (chunk.metadata.heading) {
52
+ return chunk.metadata.heading;
53
+ }
54
+ return 'unnamed-section';
55
+ }
56
+
57
+ _buildHeadingPath(chunk, currentPath) {
58
+ const heading = chunk.metadata.heading;
59
+
60
+ if (heading && heading !== currentPath[currentPath.length - 1]) {
61
+ if (currentPath.length === 0 || this._isSubHeading(heading, currentPath[currentPath.length - 1])) {
62
+ currentPath.push(heading);
63
+ } else {
64
+ currentPath.length = 0;
65
+ currentPath.push(heading);
66
+ }
67
+ }
68
+
69
+ return [...currentPath];
70
+ }
71
+
72
+ _isSubHeading(heading1, heading2) {
73
+ return heading1.length > heading2.length;
74
+ }
75
+ }
@@ -0,0 +1,16 @@
1
+ /**
2
+ * S-MORA Layer 0 Scrubber - Stage 3: Normalization
3
+ * @module smora/scrubber/stages/normalizer
4
+ */
5
+ export declare class Normalizer {
6
+ constructor(config: any);
7
+ /**
8
+ * Normalize content structure
9
+ * @param {string} content - Filtered content
10
+ * @returns {Promise<string>} - Normalized content
11
+ */
12
+ normalize(content: any): Promise<any>;
13
+ _normalizeHeadings(content: any): any;
14
+ _normalizeLists(content: any): any;
15
+ _normalizePunctuation(content: any): any;
16
+ }