@yamo/memory-mesh 3.0.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -2
- package/lib/llm/client.d.ts +23 -48
- package/lib/llm/client.js +1 -0
- package/lib/llm/client.ts +298 -377
- package/lib/llm/index.js +1 -0
- package/lib/llm/index.ts +1 -2
- package/lib/memory/adapters/client.d.ts +22 -85
- package/lib/memory/adapters/client.js +1 -0
- package/lib/memory/adapters/client.ts +474 -633
- package/lib/memory/adapters/config.d.ts +82 -89
- package/lib/memory/adapters/config.js +1 -0
- package/lib/memory/adapters/config.ts +156 -225
- package/lib/memory/adapters/errors.d.ts +28 -20
- package/lib/memory/adapters/errors.js +1 -0
- package/lib/memory/adapters/errors.ts +83 -120
- package/lib/memory/context-manager.d.ts +15 -18
- package/lib/memory/context-manager.js +1 -0
- package/lib/memory/context-manager.ts +314 -401
- package/lib/memory/embeddings/factory.d.ts +18 -20
- package/lib/memory/embeddings/factory.js +1 -0
- package/lib/memory/embeddings/factory.ts +130 -173
- package/lib/memory/embeddings/index.js +1 -0
- package/lib/memory/embeddings/index.ts +1 -0
- package/lib/memory/embeddings/service.d.ts +36 -66
- package/lib/memory/embeddings/service.js +1 -0
- package/lib/memory/embeddings/service.ts +479 -616
- package/lib/memory/index.d.ts +2 -2
- package/lib/memory/index.js +1 -0
- package/lib/memory/index.ts +3 -13
- package/lib/memory/memory-mesh.d.ts +151 -93
- package/lib/memory/memory-mesh.js +1 -0
- package/lib/memory/memory-mesh.ts +1406 -1692
- package/lib/memory/memory-translator.d.ts +1 -6
- package/lib/memory/memory-translator.js +1 -0
- package/lib/memory/memory-translator.ts +96 -128
- package/lib/memory/schema.d.ts +29 -10
- package/lib/memory/schema.js +1 -0
- package/lib/memory/schema.ts +102 -185
- package/lib/memory/scorer.d.ts +3 -4
- package/lib/memory/scorer.js +1 -0
- package/lib/memory/scorer.ts +69 -86
- package/lib/memory/search/index.js +1 -0
- package/lib/memory/search/index.ts +1 -0
- package/lib/memory/search/keyword-search.d.ts +10 -26
- package/lib/memory/search/keyword-search.js +1 -0
- package/lib/memory/search/keyword-search.ts +123 -161
- package/lib/scrubber/config/defaults.d.ts +39 -46
- package/lib/scrubber/config/defaults.js +1 -0
- package/lib/scrubber/config/defaults.ts +50 -112
- package/lib/scrubber/errors/scrubber-error.d.ts +22 -0
- package/lib/scrubber/errors/scrubber-error.js +39 -0
- package/lib/scrubber/errors/scrubber-error.ts +44 -0
- package/lib/scrubber/index.d.ts +0 -1
- package/lib/scrubber/index.js +1 -0
- package/lib/scrubber/index.ts +1 -2
- package/lib/scrubber/scrubber.d.ts +14 -31
- package/lib/scrubber/scrubber.js +1 -0
- package/lib/scrubber/scrubber.ts +93 -152
- package/lib/scrubber/stages/chunker.d.ts +22 -10
- package/lib/scrubber/stages/chunker.js +86 -0
- package/lib/scrubber/stages/chunker.ts +104 -0
- package/lib/scrubber/stages/metadata-annotator.d.ts +14 -15
- package/lib/scrubber/stages/metadata-annotator.js +64 -0
- package/lib/scrubber/stages/metadata-annotator.ts +75 -0
- package/lib/scrubber/stages/normalizer.d.ts +13 -10
- package/lib/scrubber/stages/normalizer.js +51 -0
- package/lib/scrubber/stages/normalizer.ts +60 -0
- package/lib/scrubber/stages/semantic-filter.d.ts +13 -10
- package/lib/scrubber/stages/semantic-filter.js +51 -0
- package/lib/scrubber/stages/semantic-filter.ts +62 -0
- package/lib/scrubber/stages/structural-cleaner.d.ts +15 -10
- package/lib/scrubber/stages/structural-cleaner.js +73 -0
- package/lib/scrubber/stages/structural-cleaner.ts +83 -0
- package/lib/scrubber/stages/validator.d.ts +14 -15
- package/lib/scrubber/stages/validator.js +56 -0
- package/lib/scrubber/stages/validator.ts +67 -0
- package/lib/scrubber/telemetry.d.ts +20 -27
- package/lib/scrubber/telemetry.js +1 -0
- package/lib/scrubber/telemetry.ts +53 -90
- package/lib/scrubber/utils/hash.d.ts +14 -0
- package/lib/scrubber/utils/hash.js +37 -0
- package/lib/scrubber/utils/hash.ts +40 -0
- package/lib/scrubber/utils/html-parser.d.ts +14 -0
- package/lib/scrubber/utils/html-parser.js +38 -0
- package/lib/scrubber/utils/html-parser.ts +46 -0
- package/lib/scrubber/utils/pattern-matcher.d.ts +12 -0
- package/lib/scrubber/utils/pattern-matcher.js +54 -0
- package/lib/scrubber/utils/pattern-matcher.ts +64 -0
- package/lib/scrubber/utils/token-counter.d.ts +18 -0
- package/lib/scrubber/utils/token-counter.js +30 -0
- package/lib/scrubber/utils/token-counter.ts +32 -0
- package/lib/utils/logger.d.ts +1 -11
- package/lib/utils/logger.js +1 -0
- package/lib/utils/logger.ts +43 -63
- package/lib/utils/skill-metadata.d.ts +6 -14
- package/lib/utils/skill-metadata.js +1 -0
- package/lib/utils/skill-metadata.ts +89 -103
- package/lib/yamo/emitter.d.ts +8 -35
- package/lib/yamo/emitter.js +1 -0
- package/lib/yamo/emitter.ts +77 -155
- package/lib/yamo/index.d.ts +14 -0
- package/lib/yamo/index.js +14 -0
- package/lib/yamo/index.ts +16 -0
- package/lib/yamo/schema.d.ts +8 -10
- package/lib/yamo/schema.js +1 -0
- package/lib/yamo/schema.ts +82 -114
- package/package.json +4 -2
package/lib/scrubber/index.ts
CHANGED
|
@@ -8,43 +8,26 @@ import { Normalizer } from "./stages/normalizer.js";
|
|
|
8
8
|
import { Chunker } from "./stages/chunker.js";
|
|
9
9
|
import { MetadataAnnotator } from "./stages/metadata-annotator.js";
|
|
10
10
|
import { Validator } from "./stages/validator.js";
|
|
11
|
-
import { ScrubberTelemetry, TelemetrySummary, StageSummary } from "./telemetry.js";
|
|
12
|
-
import { ScrubberConfig } from "./config/defaults.js";
|
|
13
|
-
export interface ScrubberDocument {
|
|
14
|
-
content: string;
|
|
15
|
-
source: string;
|
|
16
|
-
type: string;
|
|
17
|
-
}
|
|
18
|
-
export interface Chunk {
|
|
19
|
-
text: string;
|
|
20
|
-
[key: string]: any;
|
|
21
|
-
}
|
|
22
|
-
export interface ScrubberResult {
|
|
23
|
-
chunks: Chunk[];
|
|
24
|
-
metadata: {
|
|
25
|
-
source: string;
|
|
26
|
-
type: string;
|
|
27
|
-
processingTimestamp: string;
|
|
28
|
-
[key: string]: any;
|
|
29
|
-
};
|
|
30
|
-
telemetry: Partial<Record<string, StageSummary>> & {
|
|
31
|
-
totalDuration?: number;
|
|
32
|
-
};
|
|
33
|
-
success?: boolean;
|
|
34
|
-
error?: string;
|
|
35
|
-
}
|
|
36
11
|
export declare class Scrubber {
|
|
37
|
-
config:
|
|
12
|
+
config: any;
|
|
38
13
|
stages: any;
|
|
39
|
-
telemetry:
|
|
40
|
-
constructor(config?:
|
|
14
|
+
telemetry: any;
|
|
15
|
+
constructor(config?: {});
|
|
41
16
|
/**
|
|
42
17
|
* Main entry point - process a raw document
|
|
43
18
|
* @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
|
|
44
19
|
* @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
|
|
45
20
|
*/
|
|
46
|
-
process(document:
|
|
47
|
-
|
|
21
|
+
process(document: any): Promise<{
|
|
22
|
+
chunks: any[];
|
|
23
|
+
metadata: {
|
|
24
|
+
source: any;
|
|
25
|
+
type: any;
|
|
26
|
+
processingTimestamp: string;
|
|
27
|
+
};
|
|
28
|
+
telemetry: {};
|
|
29
|
+
}>;
|
|
30
|
+
_executeStage(stageName: any, stageFn: any): Promise<any>;
|
|
48
31
|
_initializeStages(): {
|
|
49
32
|
structural: StructuralCleaner;
|
|
50
33
|
semantic: SemanticFilter;
|
|
@@ -53,7 +36,7 @@ export declare class Scrubber {
|
|
|
53
36
|
metadata: MetadataAnnotator;
|
|
54
37
|
validator: Validator;
|
|
55
38
|
};
|
|
56
|
-
getMetrics():
|
|
39
|
+
getMetrics(): any;
|
|
57
40
|
healthCheck(): Promise<{
|
|
58
41
|
status: string;
|
|
59
42
|
}>;
|
package/lib/scrubber/scrubber.js
CHANGED
package/lib/scrubber/scrubber.ts
CHANGED
|
@@ -1,168 +1,109 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
1
2
|
/**
|
|
2
3
|
* S-MORA Layer 0 Scrubber - Main Orchestrator
|
|
3
4
|
* @module smora/scrubber/scrubber
|
|
4
5
|
*/
|
|
5
|
-
|
|
6
6
|
import { StructuralCleaner } from "./stages/structural-cleaner.js";
|
|
7
7
|
import { SemanticFilter } from "./stages/semantic-filter.js";
|
|
8
8
|
import { Normalizer } from "./stages/normalizer.js";
|
|
9
9
|
import { Chunker } from "./stages/chunker.js";
|
|
10
10
|
import { MetadataAnnotator } from "./stages/metadata-annotator.js";
|
|
11
11
|
import { Validator } from "./stages/validator.js";
|
|
12
|
-
import {
|
|
13
|
-
ScrubberTelemetry,
|
|
14
|
-
TelemetrySummary,
|
|
15
|
-
StageSummary,
|
|
16
|
-
} from "./telemetry.js";
|
|
12
|
+
import { ScrubberTelemetry, } from "./telemetry.js";
|
|
17
13
|
// import { ScrubberError } from './errors/scrubber-error'; // Assuming this exists or I should check
|
|
18
|
-
import { defaultScrubberConfig
|
|
19
|
-
|
|
20
|
-
// Interfaces for input/output
|
|
21
|
-
export interface ScrubberDocument {
|
|
22
|
-
content: string;
|
|
23
|
-
source: string;
|
|
24
|
-
type: string;
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
export interface Chunk {
|
|
28
|
-
text: string;
|
|
29
|
-
[key: string]: any;
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
export interface ScrubberResult {
|
|
33
|
-
chunks: Chunk[];
|
|
34
|
-
metadata: {
|
|
35
|
-
source: string;
|
|
36
|
-
type: string;
|
|
37
|
-
processingTimestamp: string;
|
|
38
|
-
[key: string]: any;
|
|
39
|
-
};
|
|
40
|
-
telemetry: Partial<Record<string, StageSummary>> & { totalDuration?: number };
|
|
41
|
-
success?: boolean;
|
|
42
|
-
error?: string;
|
|
43
|
-
}
|
|
44
|
-
|
|
14
|
+
import { defaultScrubberConfig } from "./config/defaults.js";
|
|
45
15
|
export class Scrubber {
  config;
  stages; // Using any for stages as they are not yet converted
  telemetry;

  /**
   * Build a scrubber pipeline.
   *
   * @param {Object} [config] - Overrides that are shallow-merged on top of
   *   `defaultScrubberConfig` (later keys win; nested objects are replaced,
   *   not merged).
   */
  constructor(config = {}) {
    this.config = { ...defaultScrubberConfig, ...config };
    this.stages = this._initializeStages();
    this.telemetry = new ScrubberTelemetry();
  }

  /**
   * Main entry point - process a raw document
   *
   * Runs the six stages in order (structural, semantic, normalization,
   * chunking, metadata, validation). This method never rejects: any failure,
   * including a missing document, is reported through `success: false` and
   * `error` on the returned object.
   *
   * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
   * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object,
   *   success: boolean, error?: string }
   */
  async process(document) {
    const startTime = Date.now();
    // Read source/type defensively so a null/undefined document is reported
    // through the result envelope instead of throwing before the try block.
    const hasDocument = document != null;
    const result = {
      chunks: [],
      metadata: {
        source: hasDocument ? document.source : undefined,
        type: hasDocument ? document.type : undefined,
        processingTimestamp: new Date().toISOString(),
      },
      telemetry: {},
    };

    // Runs one stage and always copies its stats into the result, so the
    // stage that failed is visible in the returned telemetry as well.
    const runStage = async (stageName, stageFn) => {
      try {
        return await this._executeStage(stageName, stageFn);
      } finally {
        result.telemetry[stageName] = this.telemetry.getStageStats(stageName);
      }
    };

    try {
      if (!hasDocument) {
        throw new TypeError("Scrubber.process requires a document object");
      }
      // If disabled, return empty chunks
      if (this.config.enabled) {
        // Stage 1: Structural Cleaning
        const cleaned = await runStage("structural", () => this.stages.structural.clean(document.content));
        // Stage 2: Semantic Filtering
        const filtered = await runStage("semantic", () => this.stages.semantic.filter(cleaned));
        // Stage 3: Normalization
        const normalized = await runStage("normalization", () => this.stages.normalizer.normalize(filtered));
        // Stage 4: Chunking
        const chunks = await runStage("chunking", () => this.stages.chunker.chunk(normalized));
        // Stage 5: Metadata Annotation
        const annotated = await runStage("metadata", () => this.stages.metadata.annotate(chunks, document));
        // Stage 6: Validation
        result.chunks = await runStage("validation", () => this.stages.validator.validate(annotated));
      }
      result.telemetry.totalDuration = Date.now() - startTime;
      result.success = true;
    } catch (error) {
      result.success = false;
      result.error = error instanceof Error ? error.message : String(error);
      result.telemetry.totalDuration = Date.now() - startTime;
    }
    return result;
  }

  /**
   * Run a single stage function, timing it and recording success or failure
   * in the telemetry collector.
   *
   * @param {string} stageName - Key under which the timing is recorded.
   * @param {Function} stageFn - Zero-argument function (sync or async) doing the work.
   * @returns {Promise<*>} Whatever `stageFn` resolves to.
   * @throws Rethrows the original error from `stageFn` after recording the failure.
   */
  async _executeStage(stageName, stageFn) {
    const startTime = Date.now();
    try {
      const result = await stageFn();
      this.telemetry.recordStage(stageName, Date.now() - startTime, true);
      return result;
    } catch (error) {
      this.telemetry.recordStage(stageName, Date.now() - startTime, false);
      throw error;
    }
  }

  /**
   * Instantiate every stage with its own slice of the merged config.
   *
   * @returns {Object} Stage instances keyed by the names `process` uses.
   */
  _initializeStages() {
    return {
      structural: new StructuralCleaner(this.config.structural),
      semantic: new SemanticFilter(this.config.semantic),
      normalizer: new Normalizer(this.config.normalization),
      chunker: new Chunker(this.config.chunking),
      metadata: new MetadataAnnotator(this.config.metadata),
      validator: new Validator(this.config.validation),
    };
  }

  /**
   * @returns {*} The telemetry collector's summary (`telemetry.getSummary()`).
   */
  getMetrics() {
    return this.telemetry.getSummary();
  }

  /**
   * @returns {Promise<{status: string}>} Always resolves to `{ status: "healthy" }`.
   */
  healthCheck() {
    return Promise.resolve({ status: "healthy" });
  }
}
|
|
167
|
-
|
|
168
109
|
export default Scrubber;
|
|
@@ -1,13 +1,25 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Stage 4: Chunking
|
|
3
|
+
* @module smora/scrubber/stages/chunker
|
|
3
4
|
*/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
export declare class Chunker {
|
|
6
|
+
constructor(config: any);
|
|
7
|
+
/**
|
|
8
|
+
* Split content into chunks
|
|
9
|
+
* @param {string} content - Normalized content
|
|
10
|
+
* @returns {Promise<Array>} - Array of chunks with metadata
|
|
11
|
+
*/
|
|
12
|
+
chunk(content: any): Promise<{
|
|
13
|
+
index: number;
|
|
14
|
+
text: any;
|
|
15
|
+
metadata: {
|
|
16
|
+
tokens: any;
|
|
17
|
+
heading: any;
|
|
18
|
+
position: number;
|
|
19
|
+
};
|
|
20
|
+
}[]>;
|
|
21
|
+
_isHeading(line: any): boolean;
|
|
22
|
+
_shouldStartNewChunk(currentChunk: any, para: any, paraTokens: any, isHeading: any): boolean;
|
|
23
|
+
_extractInitialHeading(content: any): any;
|
|
24
|
+
_extractHeadingText(headingLine: any): any;
|
|
13
25
|
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 4: Chunking
|
|
4
|
+
* @module smora/scrubber/stages/chunker
|
|
5
|
+
*/
|
|
6
|
+
import { TokenCounter } from '../utils/token-counter.js';
|
|
7
|
+
import { ScrubberError } from '../errors/scrubber-error.js';
|
|
8
|
+
export class Chunker {
  /**
   * @param {Object} config - Chunking options. This class reads `minTokens`,
   *   `maxTokens`, `hardMaxTokens` and `splitOnHeadings` from it.
   */
  constructor(config) {
    this.config = config;
    this.tokenCounter = new TokenCounter();
  }

  /**
   * Split content into chunks
   *
   * Content is split on blank lines into paragraphs, which are accumulated
   * until a heading (when `splitOnHeadings` is set) or the `maxTokens` budget
   * starts a new chunk. A chunk that ends up above `hardMaxTokens` is flushed
   * immediately. Chunks below `minTokens` are dropped when they are closed.
   *
   * @param {string} content - Normalized content
   * @returns {Promise<Array>} - Array of chunks with metadata
   * @throws {ScrubberError} Wraps any error raised while chunking.
   */
  async chunk(content) {
    try {
      const chunks = [];
      const paragraphs = content.split(/\n\n+/);
      let currentChunk = {
        text: '',
        tokens: 0,
        heading: this._extractInitialHeading(content)
      };

      for (const para of paragraphs) {
        const isHeading = this._isHeading(para);
        const paraTokens = this.tokenCounter.count(para);

        if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
          if (currentChunk.tokens >= this.config.minTokens) {
            chunks.push(currentChunk);
          }
          currentChunk = {
            text: '',
            tokens: 0,
            heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
          };
        }

        currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
        currentChunk.tokens += paraTokens;

        if (currentChunk.tokens > this.config.hardMaxTokens) {
          chunks.push(currentChunk);
          // Keep the active heading: the text that follows still belongs to
          // the same section, exactly as on the soft split above.
          currentChunk = { text: '', tokens: 0, heading: currentChunk.heading };
        }
      }

      if (currentChunk.tokens >= this.config.minTokens) {
        chunks.push(currentChunk);
      }

      return chunks.map((chunk, index) => ({
        index,
        text: chunk.text.trim(),
        metadata: {
          tokens: chunk.tokens,
          heading: chunk.heading,
          position: index
        }
      }));
    }
    catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new ScrubberError(`Failed to chunk content: ${message}`, { stage: 'chunker', originalError: error });
    }
  }

  /**
   * @param {string} line - Paragraph text.
   * @returns {boolean} True when the text starts with 1-6 `#` followed by whitespace.
   */
  _isHeading(line) {
    return /^#{1,6}\s/.test(line);
  }

  /**
   * Decide whether `para` must open a new chunk. An empty current chunk is
   * never closed, so an oversized first paragraph is still accepted.
   *
   * @param {Object} currentChunk - Chunk being accumulated (`tokens` is read).
   * @param {string} para - Candidate paragraph (unused; kept for the signature).
   * @param {number} paraTokens - Token count of `para`.
   * @param {boolean} isHeading - Whether `para` starts with a heading marker.
   * @returns {boolean}
   */
  _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
    if (currentChunk.tokens === 0) {
      return false;
    }
    if (this.config.splitOnHeadings && isHeading) {
      return true;
    }
    return (currentChunk.tokens + paraTokens) > this.config.maxTokens;
  }

  /**
   * @param {string} content - Whole document.
   * @returns {string|null} Text of the first heading line anywhere in the content, or null.
   */
  _extractInitialHeading(content) {
    const match = content.match(/^#{1,6}\s+(.+)$/m);
    return match ? match[1] : null;
  }

  /**
   * @param {string} headingLine - Paragraph that starts with a heading marker.
   * @returns {string|null} Heading text taken from the paragraph's first line, or null.
   */
  _extractHeadingText(headingLine) {
    // A heading paragraph may carry body lines after a single newline; only
    // the first line is the heading, so match against that line alone.
    const firstLine = headingLine.split(/\r?\n/, 1)[0];
    const match = firstLine.match(/^#{1,6}\s+(.+)$/);
    return match ? match[1] : null;
  }
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 4: Chunking
|
|
4
|
+
* @module smora/scrubber/stages/chunker
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { TokenCounter } from '../utils/token-counter.js';
|
|
8
|
+
import { ChunkingError, ScrubberError } from '../errors/scrubber-error.js';
|
|
9
|
+
|
|
10
|
+
export class Chunker {
  /**
   * @param {Object} config - Chunking options. This class reads `minTokens`,
   *   `maxTokens`, `hardMaxTokens` and `splitOnHeadings` from it.
   */
  constructor(config) {
    this.config = config;
    this.tokenCounter = new TokenCounter();
  }

  /**
   * Split content into chunks
   *
   * Paragraphs (separated by blank lines) are accumulated into the current
   * chunk. A new chunk is opened on a heading (when `splitOnHeadings` is set)
   * or when `maxTokens` would be exceeded; a chunk that grows past
   * `hardMaxTokens` is flushed right away. Closed chunks smaller than
   * `minTokens` are discarded.
   *
   * @param {string} content - Normalized content
   * @returns {Promise<Array>} - Array of chunks with metadata
   * @throws {ScrubberError} Wraps any error raised while chunking.
   */
  async chunk(content) {
    try {
      const chunks = [];
      const paragraphs = content.split(/\n\n+/);

      let currentChunk = {
        text: '',
        tokens: 0,
        heading: this._extractInitialHeading(content)
      };

      for (const para of paragraphs) {
        const isHeading = this._isHeading(para);
        const paraTokens = this.tokenCounter.count(para);

        if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
          if (currentChunk.tokens >= this.config.minTokens) {
            chunks.push(currentChunk);
          }
          currentChunk = {
            text: '',
            tokens: 0,
            heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
          };
        }

        currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
        currentChunk.tokens += paraTokens;

        if (currentChunk.tokens > this.config.hardMaxTokens) {
          chunks.push(currentChunk);
          // The section has not changed, so the follow-up chunk inherits the
          // heading instead of losing it (consistent with the soft split).
          currentChunk = { text: '', tokens: 0, heading: currentChunk.heading };
        }
      }

      if (currentChunk.tokens >= this.config.minTokens) {
        chunks.push(currentChunk);
      }

      return chunks.map((chunk, index) => ({
        index,
        text: chunk.text.trim(),
        metadata: {
          tokens: chunk.tokens,
          heading: chunk.heading,
          position: index
        }
      }));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new ScrubberError(
        `Failed to chunk content: ${message}`,
        { stage: 'chunker', originalError: error }
      );
    }
  }

  /**
   * @param {string} line - Paragraph text.
   * @returns {boolean} True when the text starts with 1-6 `#` followed by whitespace.
   */
  _isHeading(line) {
    return /^#{1,6}\s/.test(line);
  }

  /**
   * Decide whether `para` must open a new chunk. Nothing is closed while the
   * current chunk is still empty.
   *
   * @param {Object} currentChunk - Chunk being accumulated (`tokens` is read).
   * @param {string} para - Candidate paragraph (unused; kept for the signature).
   * @param {number} paraTokens - Token count of `para`.
   * @param {boolean} isHeading - Whether `para` starts with a heading marker.
   * @returns {boolean}
   */
  _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
    if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
      return true;
    }

    const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
    return wouldExceed && currentChunk.tokens > 0;
  }

  /**
   * @param {string} content - Whole document.
   * @returns {string|null} Text of the first heading line anywhere in the content, or null.
   */
  _extractInitialHeading(content) {
    const match = content.match(/^#{1,6}\s+(.+)$/m);
    return match ? match[1] : null;
  }

  /**
   * @param {string} headingLine - Paragraph that starts with a heading marker.
   * @returns {string|null} Heading text from the paragraph's first line, or null.
   */
  _extractHeadingText(headingLine) {
    // Paragraphs are split on blank lines only, so a heading can be followed
    // by body lines in the same paragraph; inspect just the first line.
    const [firstLine] = headingLine.split(/\r?\n/, 1);
    const match = firstLine.match(/^#{1,6}\s+(.+)$/);
    return match ? match[1] : null;
  }
}
|
|
@@ -1,18 +1,17 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
|
|
3
|
+
* @module smora/scrubber/stages/metadata-annotator
|
|
3
4
|
*/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
constructor(config?: AnnotatorConfig);
|
|
17
|
-
annotate(content: string): Promise<AnnotatedData>;
|
|
5
|
+
export declare class MetadataAnnotator {
|
|
6
|
+
constructor(config: any);
|
|
7
|
+
/**
|
|
8
|
+
* Add metadata to chunks
|
|
9
|
+
* @param {Array} chunks - Array of chunks
|
|
10
|
+
* @param {Object} document - Original document metadata
|
|
11
|
+
* @returns {Promise<Array>} - Annotated chunks
|
|
12
|
+
*/
|
|
13
|
+
annotate(chunks: any, document: any): Promise<any>;
|
|
14
|
+
_extractSection(chunk: any): any;
|
|
15
|
+
_buildHeadingPath(chunk: any, currentPath: any): any[];
|
|
16
|
+
_isSubHeading(heading1: any, heading2: any): boolean;
|
|
18
17
|
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
/**
|
|
3
|
+
* S-MORA Layer 0 Scrubber - Stage 5: Metadata Annotation
|
|
4
|
+
* @module smora/scrubber/stages/metadata-annotator
|
|
5
|
+
*/
|
|
6
|
+
import { HashUtil } from '../utils/hash.js';
|
|
7
|
+
export class MetadataAnnotator {
  /**
   * @param {Object} [config] - Feature flags read by `annotate`: `addSource`,
   *   `addSection`, `addHeadingPath`, `addTimestamp`, `addHash`. A missing
   *   config is treated as "all flags off".
   */
  constructor(config) {
    this.config = config || {};
    this.hashUtil = new HashUtil();
  }

  /**
   * Add metadata to chunks
   *
   * Each returned chunk is a shallow copy whose `metadata` combines the
   * chunk's existing metadata with the fields enabled in the config. Fields
   * whose value is `undefined` are omitted from the result.
   *
   * @param {Array} chunks - Array of chunks
   * @param {Object} document - Original document metadata (`source` and `type` are read)
   * @returns {Promise<Array>} - Annotated chunks
   */
  async annotate(chunks, document) {
    const { addSource, addSection, addHeadingPath, addTimestamp, addHash } = this.config;
    // Shared across the batch and mutated by _buildHeadingPath as headings change.
    const headingPath = [];
    // One timestamp per call so every chunk of the same ingestion carries the same value.
    const ingestionTimestamp = addTimestamp ? new Date().toISOString() : undefined;

    return chunks.map((chunk) => {
      const metadata = {
        ...chunk.metadata,
        source: addSource ? document.source : undefined,
        doc_type: addSource ? document.type : undefined,
        section: addSection ? this._extractSection(chunk) : undefined,
        heading_path: addHeadingPath ? this._buildHeadingPath(chunk, headingPath) : undefined,
        ingestion_timestamp: ingestionTimestamp,
        hash: addHash ? this.hashUtil.hash(chunk.text) : undefined
      };
      return {
        ...chunk,
        metadata: Object.fromEntries(Object.entries(metadata).filter(([, value]) => value !== undefined))
      };
    });
  }

  /**
   * @param {Object} chunk - Chunk whose `metadata.heading` names its section.
   * @returns {string} The heading, or 'unnamed-section' when the chunk has none.
   */
  _extractSection(chunk) {
    // Tolerate chunks without a metadata object, as the spread in annotate does.
    const heading = chunk.metadata ? chunk.metadata.heading : undefined;
    return heading || 'unnamed-section';
  }

  /**
   * Update the running heading path with this chunk's heading and return a
   * snapshot of it.
   *
   * @param {Object} chunk - Chunk whose `metadata.heading` is read.
   * @param {Array} currentPath - Running path; mutated in place.
   * @returns {Array} Copy of the path after the update.
   */
  _buildHeadingPath(chunk, currentPath) {
    const heading = chunk.metadata ? chunk.metadata.heading : undefined;
    const last = currentPath[currentPath.length - 1];
    if (heading && heading !== last) {
      // A heading that is not nested under the previous one restarts the path.
      if (currentPath.length > 0 && !this._isSubHeading(heading, last)) {
        currentPath.length = 0;
      }
      currentPath.push(heading);
    }
    return [...currentPath];
  }

  /**
   * NOTE(review): nesting is decided by comparing the lengths of the heading
   * texts, not heading levels - confirm this heuristic is intended.
   *
   * @param {string} heading1 - Candidate child heading text.
   * @param {string} heading2 - Current last heading text on the path.
   * @returns {boolean} True when `heading1` is longer than `heading2`.
   */
  _isSubHeading(heading1, heading2) {
    return heading1.length > heading2.length;
  }
}
|