@yamo/memory-mesh 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +80 -0
- package/bin/memory_mesh.js +69 -0
- package/bin/scrubber.js +81 -0
- package/index.d.ts +111 -0
- package/lib/adapters/index.js +3 -0
- package/lib/embeddings/factory.js +150 -0
- package/lib/embeddings/index.js +2 -0
- package/lib/embeddings/service.js +586 -0
- package/lib/index.js +18 -0
- package/lib/lancedb/client.js +631 -0
- package/lib/lancedb/config.js +215 -0
- package/lib/lancedb/errors.js +144 -0
- package/lib/lancedb/index.js +4 -0
- package/lib/lancedb/schema.js +197 -0
- package/lib/memory/index.js +3 -0
- package/lib/memory/memory-context-manager.js +388 -0
- package/lib/memory/memory-mesh.js +910 -0
- package/lib/memory/memory-translator.js +130 -0
- package/lib/memory/migrate-memory.js +227 -0
- package/lib/memory/migrate-to-v2.js +120 -0
- package/lib/memory/scorer.js +85 -0
- package/lib/memory/vector-memory.js +364 -0
- package/lib/privacy/audit-logger.js +176 -0
- package/lib/privacy/dlp-redactor.js +72 -0
- package/lib/privacy/index.js +10 -0
- package/lib/reporting/skill-report-generator.js +283 -0
- package/lib/scrubber/.gitkeep +1 -0
- package/lib/scrubber/config/defaults.js +62 -0
- package/lib/scrubber/errors/scrubber-error.js +43 -0
- package/lib/scrubber/index.js +25 -0
- package/lib/scrubber/scrubber.js +130 -0
- package/lib/scrubber/stages/chunker.js +103 -0
- package/lib/scrubber/stages/metadata-annotator.js +74 -0
- package/lib/scrubber/stages/normalizer.js +59 -0
- package/lib/scrubber/stages/semantic-filter.js +61 -0
- package/lib/scrubber/stages/structural-cleaner.js +82 -0
- package/lib/scrubber/stages/validator.js +66 -0
- package/lib/scrubber/telemetry.js +66 -0
- package/lib/scrubber/utils/hash.js +39 -0
- package/lib/scrubber/utils/html-parser.js +45 -0
- package/lib/scrubber/utils/pattern-matcher.js +63 -0
- package/lib/scrubber/utils/token-counter.js +31 -0
- package/lib/search/filter.js +275 -0
- package/lib/search/hybrid.js +137 -0
- package/lib/search/index.js +3 -0
- package/lib/search/pattern-miner.js +160 -0
- package/lib/utils/error-sanitizer.js +84 -0
- package/lib/utils/handoff-validator.js +85 -0
- package/lib/utils/index.js +4 -0
- package/lib/utils/spinner.js +190 -0
- package/lib/utils/streaming-client.js +128 -0
- package/package.json +39 -0
- package/skills/SKILL.md +462 -0
- package/skills/skill-scrubber.yamo +41 -0
package/lib/reporting/skill-report-generator.js
@@ -0,0 +1,283 @@
import { promises as fs } from 'fs';
import path from 'path';

/**
 * Skill Execution Report Generator
 *
 * Generates JSON reports for skill executions, capturing:
 * - Skill metadata (name, version, type)
 * - Execution details (duration, status, provider)
 * - Input/output metrics
 * - Quality indicators
 */
export class SkillReportGenerator {
  constructor(options = {}) {
    this.reportsDir = options.reportsDir || this._getReportsDir();
    this.version = '1.0.0';
  }

  _getReportsDir() {
    // @ts-ignore
    const home = process.env.HOME || process.env.USERPROFILE || process.cwd();
    return path.join(home, '.yamo', 'reports');
  }

  /**
   * Generate a unique report ID
   * @param {string} sessionId - Session identifier
   * @returns {string} Report ID
   */
  _generateReportId(sessionId) {
    const timestamp = Date.now();
    const shortSession = sessionId ? sessionId.substring(0, 8) : 'unknown';
    return `skill_execution_${timestamp}_${shortSession}`;
  }

  /**
   * Extract skill type from file path or name
   * @param {string} skillName - Name of the skill
   * @param {string[]} contextFiles - Context files used
   * @returns {string} Skill type
   */
  _getSkillType(skillName, contextFiles = []) {
    if (skillName === 'LLMClient') return 'direct';

    const skillFile = contextFiles.find(f => f.endsWith('.yamo'));
    if (!skillFile) return 'unknown';

    if (skillFile.includes('utility/')) return 'utility';
    if (skillFile.includes('generator/')) return 'generator';
    if (skillFile.includes('protocol/')) return 'protocol';
    if (skillFile.includes('system-skills/')) return 'system';

    return 'custom';
  }

  /**
   * Parse skill metadata from .yamo file path
   * @param {string[]} contextFiles - Context files
   * @returns {Object} Skill metadata
   */
  _parseSkillMetadata(contextFiles = []) {
    const skillFile = contextFiles.find(f => f.endsWith('.yamo'));
    if (!skillFile) {
      return { version: null, description: null };
    }

    // Return basic info - full parsing would require reading the file
    return {
      version: '1.0.0', // Default version
      file: skillFile
    };
  }

  /**
   * Create a report object from execution data
   * @param {Object} executionData - Data from skill execution
   * @returns {Object} Report object
   */
  createReport(executionData) {
    const {
      skill,
      sessionId,
      duration,
      provider,
      model,
      promptLength,
      responseLength,
      contextFiles = [],
      parameters = {},
      status = 'success',
      error = null,
      artifactsCreated = [],
      memoryCaptured = false
    } = executionData;

    const reportId = this._generateReportId(sessionId);
    const skillMeta = this._parseSkillMetadata(contextFiles);
    const skillType = this._getSkillType(skill, contextFiles);

    return {
      report_id: reportId,
      timestamp: new Date().toISOString(),
      skill: {
        name: skill,
        version: skillMeta.version,
        type: skillType,
        file: skillMeta.file || null
      },
      execution: {
        session_id: sessionId,
        duration_ms: Math.round(duration),
        status,
        error: error ? String(error) : null,
        provider,
        model
      },
      input: {
        prompt_length: promptLength,
        context_files: contextFiles,
        parameters
      },
      output: {
        response_length: responseLength,
        artifacts_created: artifactsCreated,
        tokens_used: null // Could be populated if provider returns token count
      },
      quality: {
        memory_captured: memoryCaptured,
        artifacts_saved: artifactsCreated.length > 0
      },
      meta: {
        generator: 'yamo-skills',
        version: this.version
      }
    };
  }

  /**
   * Generate filename for a report
   * @param {Object} report - Report object
   * @returns {string} Filename
   */
  getReportFilename(report) {
    // Format: skill-{name}_{timestamp}_{ms}.json
    // Include milliseconds for uniqueness when multiple reports per second
    const safeName = report.skill.name.toLowerCase().replace(/[^a-z0-9]/g, '-');
    const timestamp = report.timestamp.replace(/[:.]/g, '-').replace('T', '_').slice(0, 19);
    const ms = report.timestamp.slice(20, 23) || '000';
    return `skill-${safeName}_${timestamp}-${ms}.json`;
  }

  /**
   * Ensure reports directory exists
   */
  async _ensureReportsDir() {
    try {
      await fs.mkdir(this.reportsDir, { recursive: true });
    } catch (error) {
      const e = error instanceof Error ? error : new Error(String(error));
      // @ts-ignore
      if (e.code !== 'EEXIST') {
        throw e;
      }
    }
  }

  /**
   * Save a report to disk
   * @param {Object} report - Report object to save
   * @returns {Promise<string>} Path to saved report
   */
  async saveReport(report) {
    await this._ensureReportsDir();

    const filename = this.getReportFilename(report);
    const filepath = path.join(this.reportsDir, filename);

    await fs.writeFile(filepath, JSON.stringify(report, null, 2), 'utf8');

    return filepath;
  }

  /**
   * Generate and save a report in one call
   * @param {Object} executionData - Data from skill execution
   * @returns {Promise<Object>} Report object with filepath
   */
  async generateAndSave(executionData) {
    const report = this.createReport(executionData);
    const filepath = await this.saveReport(report);

    return {
      ...report,
      _filepath: filepath
    };
  }

  /**
   * List recent reports
   * @param {number} limit - Maximum number of reports to return
   * @returns {Promise<string[]>} Array of report filenames
   */
  async listReports(limit = 10) {
    try {
      await this._ensureReportsDir();
      const files = await fs.readdir(this.reportsDir);

      // Filter JSON files and sort by name (descending = newest first)
      const reports = files
        .filter(f => f.endsWith('.json'))
        .sort((a, b) => b.localeCompare(a))
        .slice(0, limit);

      return reports;
    } catch (error) {
      return [];
    }
  }

  /**
   * Read a specific report
   * @param {string} filename - Report filename
   * @returns {Promise<Object|null>} Report object or null
   */
  async readReport(filename) {
    try {
      const filepath = path.join(this.reportsDir, filename);
      const content = await fs.readFile(filepath, 'utf8');
      return JSON.parse(content);
    } catch (error) {
      return null;
    }
  }

  /**
   * Get aggregate statistics from recent reports
   * @param {number} limit - Number of reports to analyze
   * @returns {Promise<Object>} Statistics object
   */
  async getStats(limit = 100) {
    const reportFiles = await this.listReports(limit);
    const stats = {
      total_reports: reportFiles.length,
      skills_used: {},
      providers_used: {},
      success_count: 0,
      error_count: 0,
      total_duration_ms: 0,
      avg_duration_ms: 0
    };

    for (const filename of reportFiles) {
      const report = await this.readReport(filename);
      if (!report) continue;

      // Count skills
      const skillName = report.skill?.name || 'unknown';
      stats.skills_used[skillName] = (stats.skills_used[skillName] || 0) + 1;

      // Count providers
      const provider = report.execution?.provider || 'unknown';
      stats.providers_used[provider] = (stats.providers_used[provider] || 0) + 1;

      // Count success/error
      if (report.execution?.status === 'success') {
        stats.success_count++;
      } else {
        stats.error_count++;
      }

      // Sum duration
      stats.total_duration_ms += report.execution?.duration_ms || 0;
    }

    if (stats.total_reports > 0) {
      stats.avg_duration_ms = Math.round(stats.total_duration_ms / stats.total_reports);
    }

    return stats;
  }
}

export default SkillReportGenerator;
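
For orientation, a minimal usage sketch of the class above. The import path, skill name, provider, and metric values are illustrative only, not taken from the package; the field names mirror what createReport() destructures.

import { SkillReportGenerator } from './lib/reporting/skill-report-generator.js';

const generator = new SkillReportGenerator({ reportsDir: '/tmp/yamo-reports' }); // reportsDir override is optional

// Hypothetical execution data for a single skill run
const saved = await generator.generateAndSave({
  skill: 'summarize-notes',
  sessionId: 'a1b2c3d4e5f6',
  duration: 1234.5,
  provider: 'openai',
  model: 'gpt-4o-mini',
  promptLength: 2048,
  responseLength: 512,
  contextFiles: ['skills/utility/summarize-notes.yamo'],
  artifactsCreated: ['notes-summary.md'],
  memoryCaptured: true
});

console.log(saved.report_id, saved._filepath);

// Aggregate over the most recent reports on disk
const stats = await generator.getStats(50);
console.log(stats.success_count, stats.error_count, stats.avg_duration_ms);
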
package/lib/scrubber/.gitkeep
@@ -0,0 +1 @@
# This directory contains S-MORA Layer 0 Scrubber components
package/lib/scrubber/config/defaults.js
@@ -0,0 +1,62 @@
/**
 * S-MORA Layer 0 Scrubber Default Configuration
 * @module smora/scrubber/config/defaults
 */

export const defaultScrubberConfig = {
  // Master switch
  enabled: false,

  // Stage 1: Structural Cleaning
  structural: {
    stripHTML: true,
    normalizeMarkdown: true,
    collapseWhitespace: true,
    removeScripts: true,
    removeStyles: true
  },

  // Stage 2: Semantic Filtering
  semantic: {
    removeDuplicates: true,
    removeBoilerplate: true,
    minSignalRatio: 0.3,
    boilerplatePatterns: 'default'
  },

  // Stage 3: Normalization
  normalization: {
    normalizeHeadings: true,
    normalizeLists: true,
    normalizePunctuation: true
  },

  // Stage 4: Chunking
  chunking: {
    maxTokens: 500,
    minTokens: 10,
    hardMaxTokens: 2000,
    splitOnHeadings: true,
    preserveContext: true
  },

  // Stage 5: Metadata Annotation
  metadata: {
    addSource: true,
    addSection: true,
    addHeadingPath: true,
    addTimestamp: true,
    addHash: true
  },

  // Stage 6: Validation
  validation: {
    enforceMinLength: true,
    enforceMaxLength: true,
    rejectEmptyChunks: true
  },

  // Performance
  logTransformations: false,
  cachePatterns: true
};
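
A small sketch of overriding these defaults. The Scrubber orchestrator shown later in this diff merges overrides with a shallow object spread, so a nested section such as `chunking` is replaced wholesale unless it is spread explicitly; the values below are illustrative.

import { defaultScrubberConfig } from './lib/scrubber/config/defaults.js';

// Enable the scrubber and raise the soft chunk limit while keeping the
// remaining chunking defaults (shallow merge, so spread the nested section too)
const config = {
  ...defaultScrubberConfig,
  enabled: true,
  chunking: { ...defaultScrubberConfig.chunking, maxTokens: 800 }
};
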
package/lib/scrubber/errors/scrubber-error.js
@@ -0,0 +1,43 @@
/**
 * S-MORA Layer 0 Scrubber Error Classes
 * @module smora/scrubber/errors/scrubber-error
 */

export class ScrubberError extends Error {
  constructor(message, details = {}) {
    super(message);
    this.name = 'ScrubberError';
    this.details = details;
    this.timestamp = new Date().toISOString();
  }

  toJSON() {
    return {
      name: this.name,
      message: this.message,
      details: this.details,
      timestamp: this.timestamp
    };
  }
}

export class StructuralCleaningError extends ScrubberError {
  constructor(message, details = {}) {
    super(message, details);
    this.name = 'StructuralCleaningError';
  }
}

export class ChunkingError extends ScrubberError {
  constructor(message, details = {}) {
    super(message, details);
    this.name = 'ChunkingError';
  }
}

export class ValidationError extends ScrubberError {
  constructor(message, details = {}) {
    super(message, details);
    this.name = 'ValidationError';
  }
}
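
As a sketch, the subclasses keep the structured `details` and `timestamp` from the base class, and `JSON.stringify` picks up `toJSON()` automatically; the message and details below are made up.

import { ChunkingError } from './lib/scrubber/errors/scrubber-error.js';

try {
  throw new ChunkingError('chunk exceeded hard limit', { tokens: 2400, limit: 2000 });
} catch (err) {
  if (err instanceof ChunkingError) {
    // Serializes name, message, details and timestamp via toJSON()
    console.log(JSON.stringify(err));
  }
}
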
package/lib/scrubber/index.js
@@ -0,0 +1,25 @@
/**
 * S-MORA Layer 0 Scrubber
 * Deterministic ingestion-time preprocessing layer
 * @module smora/scrubber
 */

export { defaultScrubberConfig } from './config/defaults.js';
export {
  ScrubberError,
  StructuralCleaningError,
  ChunkingError,
  ValidationError
} from './errors/scrubber-error.js';
export { ScrubberTelemetry } from './telemetry.js';
export { Scrubber } from './scrubber.js';
export { HashUtil } from './utils/hash.js';
export { TokenCounter } from './utils/token-counter.js';
export { PatternMatcher } from './utils/pattern-matcher.js';
export { HTMLParser } from './utils/html-parser.js';
export { StructuralCleaner } from './stages/structural-cleaner.js';
export { SemanticFilter } from './stages/semantic-filter.js';
export { Normalizer } from './stages/normalizer.js';
export { Chunker } from './stages/chunker.js';
export { MetadataAnnotator } from './stages/metadata-annotator.js';
export { Validator } from './stages/validator.js';
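
With this barrel in place, the individual stages and utilities can be imported from a single path; a one-line sketch with an assumed relative path:

import { Scrubber, Chunker, TokenCounter, defaultScrubberConfig } from './lib/scrubber/index.js';
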
package/lib/scrubber/scrubber.js
@@ -0,0 +1,130 @@
/**
 * S-MORA Layer 0 Scrubber - Main Orchestrator
 * @module smora/scrubber/scrubber
 */

import { StructuralCleaner } from './stages/structural-cleaner.js';
import { SemanticFilter } from './stages/semantic-filter.js';
import { Normalizer } from './stages/normalizer.js';
import { Chunker } from './stages/chunker.js';
import { MetadataAnnotator } from './stages/metadata-annotator.js';
import { Validator } from './stages/validator.js';
import { ScrubberTelemetry } from './telemetry.js';
import { ScrubberError } from './errors/scrubber-error.js';
import { defaultScrubberConfig } from './config/defaults.js';

export class Scrubber {
  constructor(config = {}) {
    this.config = { ...defaultScrubberConfig, ...config };
    this.stages = this._initializeStages();
    this.telemetry = new ScrubberTelemetry();
  }

  /**
   * Main entry point - process a raw document
   * @param {Object} document - { content: string, source: string, type: 'html'|'md'|'txt' }
   * @returns {Promise<Object>} - { chunks: Array, metadata: Object, telemetry: Object }
   */
  async process(document) {
    const startTime = Date.now();
    const result = {
      chunks: [],
      metadata: {
        source: document.source,
        type: document.type,
        processingTimestamp: new Date().toISOString()
      },
      telemetry: {}
    };

    try {
      // If disabled, return empty chunks
      if (!this.config.enabled) {
        result.success = true;
        result.telemetry.totalDuration = Date.now() - startTime;
        return result;
      }

      // Stage 1: Structural Cleaning
      const cleaned = await this._executeStage('structural', () =>
        this.stages.structural.clean(document.content)
      );
      result.telemetry.structural = this.telemetry.getStageStats('structural');

      // Stage 2: Semantic Filtering
      const filtered = await this._executeStage('semantic', () =>
        this.stages.semantic.filter(cleaned)
      );
      result.telemetry.semantic = this.telemetry.getStageStats('semantic');

      // Stage 3: Normalization
      const normalized = await this._executeStage('normalization', () =>
        this.stages.normalizer.normalize(filtered)
      );
      result.telemetry.normalization = this.telemetry.getStageStats('normalization');

      // Stage 4: Chunking
      const chunks = await this._executeStage('chunking', () =>
        this.stages.chunker.chunk(normalized)
      );
      result.telemetry.chunking = this.telemetry.getStageStats('chunking');

      // Stage 5: Metadata Annotation
      const annotated = await this._executeStage('metadata', () =>
        this.stages.metadata.annotate(chunks, document)
      );
      result.telemetry.metadata = this.telemetry.getStageStats('metadata');

      // Stage 6: Validation
      result.chunks = await this._executeStage('validation', () =>
        this.stages.validator.validate(annotated)
      );
      result.telemetry.validation = this.telemetry.getStageStats('validation');

      result.telemetry.totalDuration = Date.now() - startTime;
      result.success = true;

      return result;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      result.success = false;
      result.error = message;
      result.telemetry.totalDuration = Date.now() - startTime;
    }
  }

  async _executeStage(stageName, stageFn) {
    const startTime = Date.now();
    try {
      const result = await stageFn();
      const duration = Date.now() - startTime;
      this.telemetry.recordStage(stageName, duration, true);
      return result;
    } catch (error) {
      const duration = Date.now() - startTime;
      this.telemetry.recordStage(stageName, duration, false);
      throw error;
    }
  }

  _initializeStages() {
    return {
      structural: new StructuralCleaner(this.config.structural),
      semantic: new SemanticFilter(this.config.semantic),
      normalizer: new Normalizer(this.config.normalization),
      chunker: new Chunker(this.config.chunking),
      metadata: new MetadataAnnotator(this.config.metadata),
      validator: new Validator(this.config.validation)
    };
  }

  getMetrics() {
    return this.telemetry.getSummary();
  }

  async healthCheck() {
    return { status: 'healthy' };
  }
}

export default Scrubber;
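
An end-to-end sketch of the orchestrator above. Two behaviours worth noting from the code as published: the pipeline is disabled by default (`enabled: false`), and `process()` only returns the result object on the disabled and success paths, so on an internal error the call resolves to `undefined`. The document content and source below are invented.

import { Scrubber } from './lib/scrubber/scrubber.js';

const scrubber = new Scrubber({ enabled: true });

const result = await scrubber.process({
  content: '# Release Notes\n\nFixed a memory leak in the embeddings service.\n\n## Known issues\n\nNone at this time.',
  source: 'docs/release-notes.md',
  type: 'md'
});

if (result?.success) {
  console.log(result.chunks.length, 'chunks in', result.telemetry.totalDuration, 'ms');
  console.log(scrubber.getMetrics()); // per-stage telemetry summary
}
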
package/lib/scrubber/stages/chunker.js
@@ -0,0 +1,103 @@
/**
 * S-MORA Layer 0 Scrubber - Stage 4: Chunking
 * @module smora/scrubber/stages/chunker
 */

import { TokenCounter } from '../utils/token-counter.js';
import { ChunkingError, ScrubberError } from '../errors/scrubber-error.js';

export class Chunker {
  constructor(config) {
    this.config = config;
    this.tokenCounter = new TokenCounter();
  }

  /**
   * Split content into chunks
   * @param {string} content - Normalized content
   * @returns {Promise<Array>} - Array of chunks with metadata
   */
  async chunk(content) {
    try {
      const chunks = [];
      const paragraphs = content.split(/\n\n+/);

      let currentChunk = {
        text: '',
        tokens: 0,
        heading: this._extractInitialHeading(content)
      };

      for (const para of paragraphs) {
        const isHeading = this._isHeading(para);
        const paraTokens = this.tokenCounter.count(para);

        if (this._shouldStartNewChunk(currentChunk, para, paraTokens, isHeading)) {
          if (currentChunk.tokens >= this.config.minTokens) {
            chunks.push({ ...currentChunk });
          }
          currentChunk = {
            text: '',
            tokens: 0,
            heading: isHeading ? this._extractHeadingText(para) : currentChunk.heading
          };
        }

        currentChunk.text += (currentChunk.text ? '\n\n' : '') + para;
        currentChunk.tokens += paraTokens;

        if (currentChunk.tokens > this.config.hardMaxTokens) {
          chunks.push({ ...currentChunk });
          currentChunk = { text: '', tokens: 0, heading: null };
        }
      }

      if (currentChunk.tokens >= this.config.minTokens) {
        chunks.push(currentChunk);
      }

      return chunks.map((chunk, index) => ({
        index,
        text: chunk.text.trim(),
        metadata: {
          tokens: chunk.tokens,
          heading: chunk.heading,
          position: index
        }
      }));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new ScrubberError(
        `Failed to chunk content: ${message}`,
        { stage: 'chunker', originalError: error }
      );
    }
  }

  _isHeading(line) {
    return /^#{1,6}\s/.test(line);
  }

  _shouldStartNewChunk(currentChunk, para, paraTokens, isHeading) {
    if (this.config.splitOnHeadings && isHeading && currentChunk.tokens > 0) {
      return true;
    }

    const wouldExceed = (currentChunk.tokens + paraTokens) > this.config.maxTokens;
    if (wouldExceed && currentChunk.tokens > 0) {
      return true;
    }

    return false;
  }

  _extractInitialHeading(content) {
    const match = content.match(/^#{1,6}\s+(.+)$/m);
    return match ? match[1] : null;
  }

  _extractHeadingText(headingLine) {
    const match = headingLine.match(/^#{1,6}\s+(.+)$/);
    return match ? match[1] : null;
  }
}
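
Finally, a sketch of driving the chunking stage on its own with the default chunking config; `minTokens` is lowered here so the short sample text is not filtered out, and the input string is invented.

import { Chunker } from './lib/scrubber/stages/chunker.js';
import { defaultScrubberConfig } from './lib/scrubber/config/defaults.js';

const chunker = new Chunker({ ...defaultScrubberConfig.chunking, minTokens: 1 });

const chunks = await chunker.chunk(
  '# Intro\n\nFirst paragraph of prose.\n\n## Details\n\nSecond paragraph with a little more detail.'
);

for (const { index, text, metadata } of chunks) {
  console.log(index, metadata.heading, metadata.tokens, text.slice(0, 40));
}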