@soulcraft/brainy 3.20.5 → 3.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +93 -0
- package/README.md +112 -2
- package/dist/augmentations/defaultAugmentations.d.ts +6 -0
- package/dist/augmentations/defaultAugmentations.js +12 -0
- package/dist/augmentations/intelligentImport/IntelligentImportAugmentation.d.ts +51 -0
- package/dist/augmentations/intelligentImport/IntelligentImportAugmentation.js +185 -0
- package/dist/augmentations/intelligentImport/handlers/base.d.ts +49 -0
- package/dist/augmentations/intelligentImport/handlers/base.js +149 -0
- package/dist/augmentations/intelligentImport/handlers/csvHandler.d.ts +34 -0
- package/dist/augmentations/intelligentImport/handlers/csvHandler.js +185 -0
- package/dist/augmentations/intelligentImport/handlers/excelHandler.d.ts +31 -0
- package/dist/augmentations/intelligentImport/handlers/excelHandler.js +148 -0
- package/dist/augmentations/intelligentImport/handlers/pdfHandler.d.ts +35 -0
- package/dist/augmentations/intelligentImport/handlers/pdfHandler.js +247 -0
- package/dist/augmentations/intelligentImport/index.d.ts +9 -0
- package/dist/augmentations/intelligentImport/index.js +9 -0
- package/dist/augmentations/intelligentImport/types.d.ts +111 -0
- package/dist/augmentations/intelligentImport/types.js +6 -0
- package/dist/neural/entityExtractionCache.d.ts +111 -0
- package/dist/neural/entityExtractionCache.js +208 -0
- package/dist/neural/entityExtractor.d.ts +33 -1
- package/dist/neural/entityExtractor.js +66 -2
- package/dist/neural/relationshipConfidence.d.ts +79 -0
- package/dist/neural/relationshipConfidence.js +204 -0
- package/dist/types/brainy.types.d.ts +18 -0
- package/dist/types/progress.types.d.ts +107 -0
- package/dist/types/progress.types.js +221 -0
- package/package.json +7 -2
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF Format Handler
|
|
3
|
+
* Handles PDF files with:
|
|
4
|
+
* - Text extraction with layout preservation
|
|
5
|
+
* - Table detection and extraction
|
|
6
|
+
* - Metadata extraction (author, dates, etc.)
|
|
7
|
+
* - Page-by-page processing
|
|
8
|
+
*/
|
|
9
|
+
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
10
|
+
import { BaseFormatHandler } from './base.js';
|
|
11
|
+
// Use built-in worker for Node.js environments
// In production, this can be customized via options
const initializeWorker = () => {
    const configured = pdfjsLib.GlobalWorkerOptions.workerSrc;
    // Respect a worker source the host application already configured.
    if (typeof configured !== 'undefined' && configured !== '') {
        return;
    }
    try {
        // 'data:,' points pdf.js at a data URL instead of a worker script
        // file, which makes it fall back to its built-in worker and avoids
        // any file-system dependency.
        pdfjsLib.GlobalWorkerOptions.workerSrc = 'data:,';
    }
    catch {
        // Ignore if already set or in incompatible environment
    }
};
initializeWorker();
|
|
27
|
+
export class PDFHandler extends BaseFormatHandler {
    constructor() {
        super(...arguments);
        // Format identifier used by the handler registry.
        this.format = 'pdf';
    }
    /**
     * Check whether this handler can process the given input.
     * Accepts a `.pdf` filename extension, or a Buffer that starts with the
     * `%PDF-` magic bytes.
     * @param data Raw data, filename, or descriptor (see BaseFormatHandler.detectExtension)
     * @returns true if this looks like a PDF
     */
    canHandle(data) {
        const ext = this.detectExtension(data);
        if (ext === 'pdf')
            return true;
        // Check for PDF magic bytes
        if (Buffer.isBuffer(data)) {
            // subarray creates a view without copying (Buffer#slice is deprecated)
            const header = data.subarray(0, 5).toString('ascii');
            return header === '%PDF-';
        }
        return false;
    }
    /**
     * Parse a PDF into structured data: detected table rows plus paragraph
     * records, one entry per row/paragraph, with per-page provenance in the
     * `_page`/`_type`/`_index` metadata fields.
     * @param data Raw PDF bytes (Buffer) or a binary string
     * @param options Handler options; `pdfExtractTables !== false` enables table detection
     * @returns ProcessedData with rows/paragraphs and PDF document metadata
     * @throws Error wrapping any pdf.js parsing failure
     */
    async process(data, options) {
        const startTime = Date.now();
        // Convert to buffer
        const buffer = Buffer.isBuffer(data) ? data : Buffer.from(data, 'binary');
        try {
            // Load PDF document
            const loadingTask = pdfjsLib.getDocument({
                data: new Uint8Array(buffer),
                useSystemFonts: true,
                standardFontDataUrl: undefined
            });
            const pdfDoc = await loadingTask.promise;
            // Extract metadata
            const metadata = await pdfDoc.getMetadata();
            const numPages = pdfDoc.numPages;
            // Extract text and structure from all pages
            const allData = [];
            let totalTextLength = 0;
            let detectedTables = 0;
            for (let pageNum = 1; pageNum <= numPages; pageNum++) {
                const page = await pdfDoc.getPage(pageNum);
                const textContent = await page.getTextContent();
                // Extract text items with positions (transform[4]/[5] are the
                // X/Y translation components of the text matrix)
                const textItems = textContent.items.map((item) => ({
                    text: item.str,
                    x: item.transform[4],
                    y: item.transform[5],
                    width: item.width,
                    height: item.height
                }));
                // Combine text items into lines (group by similar Y position)
                const lines = this.groupIntoLines(textItems);
                // Detect tables if requested
                if (options.pdfExtractTables !== false) {
                    const tables = this.detectTables(lines);
                    if (tables.length > 0) {
                        detectedTables += tables.length;
                        for (const table of tables) {
                            allData.push(...table.rows);
                        }
                    }
                }
                // Extract paragraphs from non-table lines
                const paragraphs = this.extractParagraphs(lines);
                for (let i = 0; i < paragraphs.length; i++) {
                    const text = paragraphs[i].trim();
                    if (text.length > 0) {
                        totalTextLength += text.length;
                        allData.push({
                            _page: pageNum,
                            _type: 'paragraph',
                            _index: i,
                            text
                        });
                    }
                }
            }
            const processingTime = Date.now() - startTime;
            // Get all unique fields (excluding metadata fields)
            const fields = allData.length > 0
                ? Object.keys(allData[0]).filter(k => !k.startsWith('_'))
                : [];
            return {
                format: this.format,
                data: allData,
                metadata: this.createMetadata(allData.length, fields, processingTime, {
                    pageCount: numPages,
                    textLength: totalTextLength,
                    tableCount: detectedTables,
                    pdfMetadata: {
                        title: metadata.info?.Title || null,
                        author: metadata.info?.Author || null,
                        subject: metadata.info?.Subject || null,
                        creator: metadata.info?.Creator || null,
                        producer: metadata.info?.Producer || null,
                        creationDate: metadata.info?.CreationDate || null,
                        modificationDate: metadata.info?.ModDate || null
                    }
                }),
                filename: options.filename
            };
        }
        catch (error) {
            throw new Error(`PDF parsing failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Group text items into lines based on Y position.
     * @param items Array of { text, x, y, width, height } objects
     * @returns Array of lines; each line is an array of { text, x } sorted left-to-right
     */
    groupIntoLines(items) {
        if (items.length === 0)
            return [];
        // Sort by Y position (descending, since PDF coordinates go bottom-up)
        const sorted = [...items].sort((a, b) => b.y - a.y);
        const lines = [];
        let currentLine = [];
        let currentY = sorted[0].y;
        for (const item of sorted) {
            // A Y delta larger than half the glyph height starts a new line.
            // Some PDF text items report height 0; fall back to a small fixed
            // tolerance so those items are not each forced onto their own line.
            const tolerance = item.height > 0 ? item.height / 2 : 1;
            if (Math.abs(item.y - currentY) > tolerance) {
                if (currentLine.length > 0) {
                    // Sort line items by X position
                    currentLine.sort((a, b) => a.x - b.x);
                    lines.push(currentLine);
                }
                currentLine = [];
                currentY = item.y;
            }
            if (item.text.trim()) {
                currentLine.push({ text: item.text, x: item.x });
            }
        }
        // Add last line
        if (currentLine.length > 0) {
            currentLine.sort((a, b) => a.x - b.x);
            lines.push(currentLine);
        }
        return lines;
    }
    /**
     * Detect tables from lines.
     * Tables are detected when multiple consecutive lines have similar structure
     * (at least 3 consecutive lines with 2+ items each: header + 2 data rows).
     * @param lines Output of groupIntoLines
     * @returns Array of { rows } table objects
     */
    detectTables(lines) {
        const tables = [];
        let potentialTable = [];
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            // A line with multiple items could be part of a table
            if (line.length >= 2) {
                potentialTable.push(line);
            }
            else {
                // End of potential table
                if (potentialTable.length >= 3) { // Need at least header + 2 rows
                    const table = this.parseTable(potentialTable);
                    if (table) {
                        tables.push(table);
                    }
                }
                potentialTable = [];
            }
        }
        // Check last potential table
        if (potentialTable.length >= 3) {
            const table = this.parseTable(potentialTable);
            if (table) {
                tables.push(table);
            }
        }
        return tables;
    }
    /**
     * Parse a potential table into structured rows.
     * The first line supplies the headers; each subsequent line becomes a row.
     * @param lines Candidate table lines (each an array of { text, x })
     * @returns { rows } object, or null if no usable rows were produced
     */
    parseTable(lines) {
        if (lines.length < 2)
            return null;
        // First line is headers
        const headerLine = lines[0];
        const headers = headerLine.map(item => this.sanitizeFieldName(item.text));
        // Remaining lines are data
        const rows = [];
        for (let i = 1; i < lines.length; i++) {
            const line = lines[i];
            const row = { _type: 'table_row' };
            // Pair each cell with the header at the same column index
            // (cells and headers are already sorted left-to-right by X).
            // NOTE(review): ragged rows therefore shift values left; matching
            // by nearest header X would be more robust — confirm before changing.
            for (let j = 0; j < line.length && j < headers.length; j++) {
                const header = headers[j];
                const value = line[j].text.trim();
                row[header] = value || null;
            }
            if (Object.keys(row).length > 1) { // More than just _type
                rows.push(row);
            }
        }
        return rows.length > 0 ? { rows } : null;
    }
    /**
     * Extract paragraphs from lines: consecutive non-empty lines are joined
     * with spaces; an empty line ends the current paragraph.
     * @param lines Output of groupIntoLines
     * @returns Array of paragraph strings
     */
    extractParagraphs(lines) {
        const paragraphs = [];
        let currentParagraph = [];
        for (const line of lines) {
            const lineText = line.map(item => item.text).join(' ').trim();
            if (lineText.length === 0) {
                // Empty line - end paragraph
                if (currentParagraph.length > 0) {
                    paragraphs.push(currentParagraph.join(' '));
                    currentParagraph = [];
                }
            }
            else {
                currentParagraph.push(lineText);
            }
        }
        // Add last paragraph
        if (currentParagraph.length > 0) {
            paragraphs.push(currentParagraph.join(' '));
        }
        return paragraphs;
    }
}
|
|
247
|
+
//# sourceMappingURL=pdfHandler.js.map
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Intelligent Import Module
 * Barrel file: re-exports the main augmentation, its public types, and the
 * per-format handlers so consumers can import everything from one path.
 */
export { IntelligentImportAugmentation } from './IntelligentImportAugmentation.js';
export type { FormatHandler, FormatHandlerOptions, ProcessedData, IntelligentImportConfig } from './types.js';
export { CSVHandler } from './handlers/csvHandler.js';
export { ExcelHandler } from './handlers/excelHandler.js';
export { PDFHandler } from './handlers/pdfHandler.js';
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Intelligent Import Module
 * Barrel file: re-exports the main augmentation and the per-format handlers.
 * (Type-only exports live in the accompanying index.d.ts.)
 */
export { IntelligentImportAugmentation } from './IntelligentImportAugmentation.js';
export { CSVHandler } from './handlers/csvHandler.js';
export { ExcelHandler } from './handlers/excelHandler.js';
export { PDFHandler } from './handlers/pdfHandler.js';
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Types for Intelligent Import Augmentation
|
|
3
|
+
* Handles Excel, PDF, and CSV import with intelligent extraction
|
|
4
|
+
*/
|
|
5
|
+
export interface FormatHandler {
    /**
     * Format name (e.g., 'csv', 'xlsx', 'pdf')
     */
    readonly format: string;
    /**
     * Process raw data into structured format.
     * @param data Raw file data (Buffer or string)
     * @param options Format-specific options
     * @returns Structured data ready for entity extraction
     * @throws Implementations reject on unparseable input (e.g. the PDF
     *         handler wraps pdf.js failures in an Error)
     */
    process(data: Buffer | string, options: FormatHandlerOptions): Promise<ProcessedData>;
    /**
     * Detect if this handler can process the given data.
     * @param data Raw data, or a descriptor carrying a filename/extension
     * @returns true if handler supports this format
     */
    canHandle(data: Buffer | string | {
        filename?: string;
        ext?: string;
    }): boolean;
}
|
|
27
|
+
export interface FormatHandlerOptions {
    /** Source filename (for extension detection) */
    filename?: string;
    /** File extension (if known) */
    ext?: string;
    /** Encoding (auto-detected if not specified) */
    encoding?: string;
    /** CSV-specific: delimiter character */
    csvDelimiter?: string;
    /** CSV-specific: whether first row is headers */
    csvHeaders?: boolean;
    /** Excel-specific: sheet names to extract (or 'all') */
    excelSheets?: string[] | 'all';
    /** Excel-specific: whether to evaluate formulas */
    excelEvaluateFormulas?: boolean;
    /** PDF-specific: whether to extract tables (treated as enabled unless explicitly false) */
    pdfExtractTables?: boolean;
    /** PDF-specific: whether to preserve layout */
    pdfPreserveLayout?: boolean;
    /** Maximum rows to process (for large files) */
    maxRows?: number;
    /** Whether to stream large files */
    streaming?: boolean;
}
|
|
51
|
+
export interface ProcessedData {
    /** Format that was processed (matches FormatHandler.format) */
    format: string;
    /** Structured data (array of objects; keys starting with '_' are provenance metadata) */
    data: Array<Record<string, any>>;
    /** Metadata about the processed data */
    metadata: {
        /** Number of rows/entities extracted */
        rowCount: number;
        /** Column/field names */
        fields: string[];
        /** Detected encoding (for text formats) */
        encoding?: string;
        /** Excel: sheet names */
        sheets?: string[];
        /** PDF: page count */
        pageCount?: number;
        /** PDF: extracted text length (characters) */
        textLength?: number;
        /** PDF: number of tables detected */
        tableCount?: number;
        /** Processing time in milliseconds */
        processingTime: number;
        /** Any warnings during processing */
        warnings?: string[];
        /** Format-specific metadata (e.g. pdfMetadata from the PDF handler) */
        [key: string]: any;
    };
    /** Original filename (if available) */
    filename?: string;
}
|
|
82
|
+
export interface HandlerRegistry {
    /** Registered handler loaders, keyed by format extension */
    handlers: Map<string, () => Promise<FormatHandler>>;
    /** Loaded handler instances (lazy-loaded on first use) */
    loaded: Map<string, FormatHandler>;
    /** Register a new handler loader for one or more extensions */
    register(extensions: string[], loader: () => Promise<FormatHandler>): void;
    /** Get handler for a file/format; resolves to null when no handler matches */
    getHandler(filenameOrExt: string): Promise<FormatHandler | null>;
}
|
|
92
|
+
export interface IntelligentImportConfig {
    /** Enable CSV handler */
    enableCSV: boolean;
    /** Enable Excel handler */
    enableExcel: boolean;
    /** Enable PDF handler */
    enablePDF: boolean;
    /** Default options for CSV */
    csvDefaults?: Partial<FormatHandlerOptions>;
    /** Default options for Excel */
    excelDefaults?: Partial<FormatHandlerOptions>;
    /** Default options for PDF */
    pdfDefaults?: Partial<FormatHandlerOptions>;
    /** Maximum file size to process (bytes) */
    maxFileSize?: number;
    /** Enable caching of processed data */
    enableCache?: boolean;
    /** Cache TTL in milliseconds */
    cacheTTL?: number;
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Entity Extraction Cache
|
|
3
|
+
*
|
|
4
|
+
* Caches entity extraction results to avoid re-processing unchanged content.
|
|
5
|
+
* Uses file mtime or content hash for invalidation.
|
|
6
|
+
*
|
|
7
|
+
* PRODUCTION-READY - NO MOCKS, NO STUBS, REAL IMPLEMENTATION
|
|
8
|
+
*/
|
|
9
|
+
import { ExtractedEntity } from './entityExtractor.js';
|
|
10
|
+
/**
 * Cache entry for extracted entities.
 * Stores the extraction result plus the freshness markers (mtime/contentHash)
 * used to invalidate the entry when the source content changes.
 */
export interface EntityCacheEntry {
    /** The cached extraction result */
    entities: ExtractedEntity[];
    /** Timestamp (ms since epoch) when the entities were extracted */
    extractedAt: number;
    /** Timestamp (ms since epoch) after which the entry is considered stale */
    expiresAt: number;
    /** Source file mtime captured at extraction time, if known */
    mtime?: number;
    /** Source content hash captured at extraction time, if known */
    contentHash?: string;
}
|
|
20
|
+
/**
 * Cache options
 */
export interface EntityCacheOptions {
    /** Whether caching is enabled */
    enabled?: boolean;
    /** Entry time-to-live in milliseconds (implementation defaults to 7 days) */
    ttl?: number;
    /** Which freshness marker(s) trigger invalidation on read */
    invalidateOn?: 'mtime' | 'hash' | 'both';
    /** Maximum number of entries before LRU eviction (implementation defaults to 1000) */
    maxEntries?: number;
}
|
|
29
|
+
/**
 * Cache statistics
 */
export interface EntityCacheStats {
    /** Number of successful lookups */
    hits: number;
    /** Number of lookups that found nothing (or a stale entry) */
    misses: number;
    /** Number of entries evicted by the LRU policy */
    evictions: number;
    /** Current number of entries in the cache */
    totalEntries: number;
    /** hits / (hits + misses), rounded to two decimals */
    hitRate: number;
    /** Mean number of entities per cached entry */
    averageEntitiesPerEntry: number;
    /** Rough memory estimate in bytes (~500 bytes assumed per entity) */
    cacheSize: number;
}
|
|
41
|
+
/**
 * Entity Extraction Cache with LRU eviction.
 * In-memory cache keyed by arbitrary strings (see the generate*CacheKey
 * helpers), with TTL expiry and optional mtime/content-hash invalidation.
 */
export declare class EntityExtractionCache {
    private cache;
    private accessOrder;
    private stats;
    private accessCounter;
    private maxEntries;
    private defaultTtl;
    constructor(options?: EntityCacheOptions);
    /**
     * Get cached entities.
     * Returns null (and counts a miss) when the key is absent, the entry has
     * expired, or the supplied mtime/contentHash no longer matches the entry.
     */
    get(key: string, options?: {
        mtime?: number;
        contentHash?: string;
    }): ExtractedEntity[] | null;
    /**
     * Set cached entities, evicting the least recently used entry first when
     * the cache is full.
     */
    set(key: string, entities: ExtractedEntity[], options?: {
        ttl?: number;
        mtime?: number;
        contentHash?: string;
    }): void;
    /**
     * Invalidate cache entry; returns true if the key was present.
     */
    invalidate(key: string): boolean;
    /**
     * Invalidate all entries matching a prefix; returns the number removed.
     */
    invalidatePrefix(prefix: string): number;
    /**
     * Clear entire cache, including statistics.
     */
    clear(): void;
    /**
     * Evict least recently used entry
     */
    private evictLRU;
    /**
     * Cleanup expired entries; returns the number removed.
     */
    cleanup(): number;
    /**
     * Get cache statistics
     */
    getStats(): EntityCacheStats;
    /**
     * Get cache size (number of entries)
     */
    size(): number;
    /**
     * Check if cache has key (does not apply expiry or freshness checks)
     */
    has(key: string): boolean;
}
|
|
100
|
+
/**
 * Helper: Generate cache key from file path (prefixed with "file:")
 */
export declare function generateFileCacheKey(path: string): string;
/**
 * Helper: Generate cache key from content hash
 * (prefixed with "hash:", truncated SHA-256)
 */
export declare function generateContentCacheKey(content: string): string;
/**
 * Helper: Compute content hash (full SHA-256 hex digest)
 */
export declare function computeContentHash(content: string): string;
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Entity Extraction Cache
|
|
3
|
+
*
|
|
4
|
+
* Caches entity extraction results to avoid re-processing unchanged content.
|
|
5
|
+
* Uses file mtime or content hash for invalidation.
|
|
6
|
+
*
|
|
7
|
+
* PRODUCTION-READY - NO MOCKS, NO STUBS, REAL IMPLEMENTATION
|
|
8
|
+
*/
|
|
9
|
+
import { createHash } from 'crypto';
|
|
10
|
+
/**
|
|
11
|
+
* Entity Extraction Cache with LRU eviction
|
|
12
|
+
*/
|
|
13
|
+
export class EntityExtractionCache {
    /**
     * @param {EntityCacheOptions} [options] - maxEntries (default 1000) and
     *   ttl in ms (default 7 days). `??` is used so explicit 0 values are
     *   honored rather than silently replaced by the defaults.
     */
    constructor(options = {}) {
        this.cache = new Map();
        this.accessOrder = new Map(); // Track access time for LRU
        this.stats = {
            hits: 0,
            misses: 0,
            evictions: 0
        };
        this.accessCounter = 0;
        this.maxEntries = options.maxEntries ?? 1000;
        this.defaultTtl = options.ttl ?? 7 * 24 * 60 * 60 * 1000; // 7 days default
    }
    /**
     * Get cached entities.
     * A stale entry (expired TTL, or a supplied mtime/contentHash that differs
     * from the one stored) is removed and counted as a miss.
     * @returns the cached entities, or null on miss/stale
     */
    get(key, options) {
        const entry = this.cache.get(key);
        if (!entry) {
            this.stats.misses++;
            return null;
        }
        const expired = Date.now() > entry.expiresAt;
        const mtimeStale = options?.mtime !== undefined &&
            entry.mtime !== undefined &&
            options.mtime !== entry.mtime;
        const hashStale = options?.contentHash !== undefined &&
            entry.contentHash !== undefined &&
            options.contentHash !== entry.contentHash;
        if (expired || mtimeStale || hashStale) {
            // Drop the stale entry so it is re-extracted on the next set().
            this.cache.delete(key);
            this.accessOrder.delete(key);
            this.stats.misses++;
            return null;
        }
        // Cache hit - update access time
        this.accessOrder.set(key, ++this.accessCounter);
        this.stats.hits++;
        return entry.entities;
    }
    /**
     * Set cached entities, evicting the least recently used entry first when
     * inserting a new key into a full cache.
     * @param {string} key cache key
     * @param {ExtractedEntity[]} entities extraction result to cache
     * @param {{ttl?: number, mtime?: number, contentHash?: string}} [options]
     */
    set(key, entities, options) {
        // Check if we need to evict (only for genuinely new keys)
        if (this.cache.size >= this.maxEntries && !this.cache.has(key)) {
            this.evictLRU();
        }
        // ?? (not ||) so an explicit ttl of 0 means "expire immediately"
        const ttl = options?.ttl ?? this.defaultTtl;
        const entry = {
            entities,
            extractedAt: Date.now(),
            expiresAt: Date.now() + ttl,
            mtime: options?.mtime,
            contentHash: options?.contentHash
        };
        this.cache.set(key, entry);
        this.accessOrder.set(key, ++this.accessCounter);
    }
    /**
     * Invalidate cache entry.
     * @returns true if the key was present
     */
    invalidate(key) {
        const had = this.cache.has(key);
        this.cache.delete(key);
        this.accessOrder.delete(key);
        return had;
    }
    /**
     * Invalidate all entries matching a prefix.
     * @returns number of entries removed
     */
    invalidatePrefix(prefix) {
        let count = 0;
        for (const key of this.cache.keys()) {
            if (key.startsWith(prefix)) {
                this.cache.delete(key);
                this.accessOrder.delete(key);
                count++;
            }
        }
        return count;
    }
    /**
     * Clear entire cache, including hit/miss/eviction statistics.
     */
    clear() {
        this.cache.clear();
        this.accessOrder.clear();
        this.stats.hits = 0;
        this.stats.misses = 0;
        this.stats.evictions = 0;
        this.accessCounter = 0;
    }
    /**
     * Evict the least recently used entry (smallest access counter).
     */
    evictLRU() {
        let lruKey = null;
        let lruAccess = Infinity;
        for (const [key, access] of this.accessOrder.entries()) {
            if (access < lruAccess) {
                lruAccess = access;
                lruKey = key;
            }
        }
        if (lruKey !== null) {
            this.cache.delete(lruKey);
            this.accessOrder.delete(lruKey);
            this.stats.evictions++;
        }
    }
    /**
     * Cleanup expired entries.
     * @returns number of entries removed
     */
    cleanup() {
        const now = Date.now();
        let cleaned = 0;
        for (const [key, entry] of this.cache.entries()) {
            if (now > entry.expiresAt) {
                this.cache.delete(key);
                this.accessOrder.delete(key);
                cleaned++;
            }
        }
        return cleaned;
    }
    /**
     * Get cache statistics.
     * cacheSize is a rough byte estimate (~500 bytes assumed per entity).
     */
    getStats() {
        const total = this.stats.hits + this.stats.misses;
        const hitRate = total > 0 ? this.stats.hits / total : 0;
        let totalEntities = 0;
        let totalSize = 0;
        for (const entry of this.cache.values()) {
            totalEntities += entry.entities.length;
            // Rough estimate: each entity ~500 bytes
            totalSize += entry.entities.length * 500;
        }
        return {
            hits: this.stats.hits,
            misses: this.stats.misses,
            evictions: this.stats.evictions,
            totalEntries: this.cache.size,
            hitRate: Math.round(hitRate * 100) / 100,
            averageEntitiesPerEntry: this.cache.size > 0
                ? Math.round((totalEntities / this.cache.size) * 10) / 10
                : 0,
            cacheSize: totalSize
        };
    }
    /**
     * Get cache size (number of entries).
     */
    size() {
        return this.cache.size;
    }
    /**
     * Check if cache has key (no expiry or freshness check is applied).
     */
    has(key) {
        return this.cache.has(key);
    }
}
|
|
189
|
+
/**
 * Helper: Generate cache key from file path ("file:" prefix).
 */
export function generateFileCacheKey(path) {
    return `file:${path}`;
}
/**
 * Helper: Generate cache key from content hash ("hash:" prefix,
 * truncated to the first 16 hex characters for brevity).
 */
export function generateContentCacheKey(content) {
    const digest = computeContentHash(content);
    return `hash:${digest.slice(0, 16)}`;
}
/**
 * Helper: Compute content hash (full SHA-256 hex digest).
 */
export function computeContentHash(content) {
    return createHash('sha256').update(content).digest('hex');
}
//# sourceMappingURL=entityExtractionCache.js.map
|