@soulcraft/brainy 3.21.0 → 3.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/README.md +40 -0
- package/dist/augmentations/defaultAugmentations.d.ts +6 -0
- package/dist/augmentations/defaultAugmentations.js +12 -0
- package/dist/augmentations/intelligentImport/IntelligentImportAugmentation.d.ts +51 -0
- package/dist/augmentations/intelligentImport/IntelligentImportAugmentation.js +185 -0
- package/dist/augmentations/intelligentImport/handlers/base.d.ts +49 -0
- package/dist/augmentations/intelligentImport/handlers/base.js +149 -0
- package/dist/augmentations/intelligentImport/handlers/csvHandler.d.ts +34 -0
- package/dist/augmentations/intelligentImport/handlers/csvHandler.js +185 -0
- package/dist/augmentations/intelligentImport/handlers/excelHandler.d.ts +31 -0
- package/dist/augmentations/intelligentImport/handlers/excelHandler.js +148 -0
- package/dist/augmentations/intelligentImport/handlers/pdfHandler.d.ts +35 -0
- package/dist/augmentations/intelligentImport/handlers/pdfHandler.js +247 -0
- package/dist/augmentations/intelligentImport/index.d.ts +9 -0
- package/dist/augmentations/intelligentImport/index.js +9 -0
- package/dist/augmentations/intelligentImport/types.d.ts +111 -0
- package/dist/augmentations/intelligentImport/types.js +6 -0
- package/package.json +7 -2
|
@@ -0,0 +1,185 @@
|
|
|
/**
 * CSV Format Handler
 * Handles CSV files with:
 * - Automatic encoding detection
 * - Automatic delimiter detection
 * - Streaming for large files
 * - Type inference
 */
import { parse } from 'csv-parse/sync';
import { detect as detectEncoding } from 'chardet';
import { BaseFormatHandler } from './base.js';
export class CSVHandler extends BaseFormatHandler {
    constructor() {
        super(...arguments);
        // Format identifier reported in ProcessedData.format.
        this.format = 'csv';
    }
    /**
     * Decide whether this handler can process the given input.
     * Accepts csv/tsv/txt extensions; otherwise sniffs the first 1 KiB of
     * content for a consistent delimiter structure.
     * @param {Buffer|string|{filename?: string, ext?: string}} data
     * @returns {boolean}
     */
    canHandle(data) {
        const ext = this.detectExtension(data);
        if (ext === 'csv' || ext === 'tsv' || ext === 'txt')
            return true;
        // Check content if it's a buffer
        if (Buffer.isBuffer(data)) {
            const sample = data.slice(0, 1024).toString('utf-8');
            return this.looksLikeCSV(sample);
        }
        if (typeof data === 'string') {
            return this.looksLikeCSV(data.slice(0, 1024));
        }
        return false;
    }
    /**
     * Parse CSV content into an array of typed row objects.
     * @param {Buffer|string} data - Raw CSV content.
     * @param {FormatHandlerOptions} options - May contain `encoding`,
     *   `csvDelimiter`, `csvHeaders` (default true), `maxRows`, `filename`.
     * @returns {Promise<ProcessedData>}
     * @throws {Error} Wrapping any csv-parse failure.
     */
    async process(data, options) {
        const startTime = Date.now();
        // Convert to buffer if string
        const buffer = Buffer.isBuffer(data) ? data : Buffer.from(data, 'utf-8');
        // Detect encoding unless the caller pinned one
        const detectedEncoding = options.encoding || this.detectEncodingSafe(buffer);
        const text = buffer.toString(detectedEncoding);
        // Detect delimiter if not specified
        const delimiter = options.csvDelimiter || this.detectDelimiter(text);
        // Parse CSV
        const hasHeaders = options.csvHeaders !== false;
        const maxRows = options.maxRows;
        try {
            const records = parse(text, {
                columns: hasHeaders,
                skip_empty_lines: true,
                trim: true,
                delimiter,
                relax_column_count: true,
                to: maxRows,
                cast: false // We'll do type inference ourselves
            });
            // Normalize to an array of row objects.
            // (Named `rows` rather than `data` to avoid shadowing the parameter.)
            const rows = Array.isArray(records) ? records : [records];
            // Infer types and convert values
            const fields = rows.length > 0 ? Object.keys(rows[0]) : [];
            const types = this.inferFieldTypes(rows);
            const convertedData = rows.map(row => {
                const converted = {};
                for (const [key, value] of Object.entries(row)) {
                    converted[key] = this.convertValue(value, types[key] || 'string');
                }
                return converted;
            });
            const processingTime = Date.now() - startTime;
            return {
                format: this.format,
                data: convertedData,
                metadata: this.createMetadata(convertedData.length, fields, processingTime, {
                    encoding: detectedEncoding,
                    delimiter,
                    hasHeaders,
                    types
                }),
                filename: options.filename
            };
        }
        catch (error) {
            throw new Error(`CSV parsing failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Count occurrences of a single-character delimiter in a line.
     * Replaces the previous per-line `new RegExp('\\' + delimiter)` approach,
     * which allocated a regex for every line/candidate pair and relied on
     * identity-escaping of arbitrary characters (fragile for e.g. tab).
     */
    countDelimiter(line, delimiter) {
        return line.split(delimiter).length - 1;
    }
    /**
     * Heuristic: text looks like CSV when the first two non-empty lines
     * contain the same non-zero number of one common delimiter.
     */
    looksLikeCSV(text) {
        const lines = text.split('\n').filter(l => l.trim());
        if (lines.length < 2)
            return false;
        // Check for common delimiters
        const delimiters = [',', ';', '\t', '|'];
        for (const delimiter of delimiters) {
            const firstCount = this.countDelimiter(lines[0], delimiter);
            if (firstCount === 0)
                continue;
            if (firstCount === this.countDelimiter(lines[1], delimiter))
                return true;
        }
        return false;
    }
    /**
     * Detect the CSV delimiter by finding the candidate that appears a
     * consistent, maximal number of times across the first few lines.
     * Falls back to ','.
     */
    detectDelimiter(text) {
        // Hoisted out of the candidate loop: the sample lines do not depend
        // on the delimiter being tested (previously re-split per candidate).
        const lines = text.split('\n').slice(0, 10).filter(l => l.trim());
        const counts = {};
        if (lines.length >= 2) {
            const delimiters = [',', ';', '\t', '|'];
            for (const delimiter of delimiters) {
                // Count delimiter in first line
                const firstCount = this.countDelimiter(lines[0], delimiter);
                if (firstCount === 0)
                    continue;
                // Check if count is consistent across lines
                let consistent = true;
                for (let i = 1; i < Math.min(5, lines.length); i++) {
                    if (this.countDelimiter(lines[i], delimiter) !== firstCount) {
                        consistent = false;
                        break;
                    }
                }
                if (consistent) {
                    counts[delimiter] = firstCount;
                }
            }
        }
        // Return delimiter with highest count
        const best = Object.entries(counts).sort((a, b) => b[1] - a[1])[0];
        return best ? best[0] : ',';
    }
    /**
     * Detect encoding via chardet, falling back to utf-8 when detection
     * fails or returns nothing.
     */
    detectEncodingSafe(buffer) {
        try {
            const detected = detectEncoding(buffer);
            if (!detected)
                return 'utf-8';
            // Normalize encoding to Node.js-supported names
            return this.normalizeEncoding(detected);
        }
        catch {
            return 'utf-8';
        }
    }
    /**
     * Normalize chardet encoding names to encodings Buffer.prototype.toString
     * understands. Multi-byte CJK/Cyrillic encodings are mapped to utf8 on a
     * best-effort basis (Node cannot decode them natively).
     */
    normalizeEncoding(encoding) {
        const normalized = encoding.toLowerCase().replace(/[_-]/g, '');
        // Map common encodings to Node.js names
        const mappings = {
            'iso88591': 'latin1',
            'iso88592': 'latin1',
            'iso88593': 'latin1',
            'iso88594': 'latin1',
            'iso88595': 'latin1',
            'iso88596': 'latin1',
            'iso88597': 'latin1',
            'iso88598': 'latin1',
            'iso88599': 'latin1',
            'iso885910': 'latin1',
            'iso885913': 'latin1',
            'iso885914': 'latin1',
            'iso885915': 'latin1',
            'iso885916': 'latin1',
            'usascii': 'ascii',
            'utf8': 'utf8',
            'utf16le': 'utf16le',
            'utf16be': 'utf16le',
            'windows1252': 'latin1',
            'windows1251': 'utf8', // Cyrillic - best effort
            'big5': 'utf8', // Chinese - best effort
            'gbk': 'utf8', // Chinese - best effort
            'gb2312': 'utf8', // Chinese - best effort
            'shiftjis': 'utf8', // Japanese - best effort
            'eucjp': 'utf8', // Japanese - best effort
            'euckr': 'utf8' // Korean - best effort
        };
        return mappings[normalized] || 'utf8';
    }
}
//# sourceMappingURL=csvHandler.js.map
|
|
@@ -0,0 +1,31 @@
|
|
|
/**
 * Excel Format Handler
 * Handles Excel files (.xlsx, .xls, .xlsb) with:
 * - Multi-sheet extraction
 * - Type inference
 * - Formula evaluation
 * - Metadata extraction
 */
import { BaseFormatHandler } from './base.js';
import { FormatHandlerOptions, ProcessedData } from '../types.js';
export declare class ExcelHandler extends BaseFormatHandler {
    /** Format identifier reported in ProcessedData.format. */
    readonly format = "excel";
    /**
     * Returns true when the input's extension is one of the recognized
     * Excel extensions (xlsx/xls/xlsb/xlsm/xlt/xltx/xltm); no content
     * sniffing is performed.
     */
    canHandle(data: Buffer | string | {
        filename?: string;
        ext?: string;
    }): boolean;
    /**
     * Read a workbook and flatten the requested sheets into one array of
     * row objects (each row tagged with a `_sheet` property naming its
     * source sheet). Per-sheet metadata is included in the result.
     */
    process(data: Buffer | string, options: FormatHandlerOptions): Promise<ProcessedData>;
    /**
     * Determine which sheets to process
     */
    private getSheetsToProcess;
    /**
     * Check if a number is likely an Excel date
     * Excel stores dates as days since 1900-01-01
     */
    private isExcelDate;
    /**
     * Convert Excel date (days since 1900-01-01) to JS Date
     */
    private excelDateToJSDate;
}
|
|
@@ -0,0 +1,148 @@
|
|
|
/**
 * Excel Format Handler
 * Handles Excel files (.xlsx, .xls, .xlsb) with:
 * - Multi-sheet extraction
 * - Type inference
 * - Formula evaluation
 * - Metadata extraction
 */
import * as XLSX from 'xlsx';
import { BaseFormatHandler } from './base.js';
export class ExcelHandler extends BaseFormatHandler {
    constructor() {
        super(...arguments);
        // Format identifier reported in ProcessedData.format.
        this.format = 'excel';
    }
    /**
     * Check whether the input has a recognized Excel extension.
     * Unlike the CSV/PDF handlers, no content sniffing is done here.
     * @param {Buffer|string|{filename?: string, ext?: string}} data
     * @returns {boolean}
     */
    canHandle(data) {
        const ext = this.detectExtension(data);
        return ['xlsx', 'xls', 'xlsb', 'xlsm', 'xlt', 'xltx', 'xltm'].includes(ext || '');
    }
    /**
     * Read a workbook and flatten the requested sheets into one array of
     * row objects. The first row of each sheet is treated as headers;
     * every emitted row carries a `_sheet` property naming its source sheet.
     * @param {Buffer|string} data - Raw workbook bytes.
     * @param {FormatHandlerOptions} options - May contain `excelSheets`
     *   ('all' or an array of sheet names) and `filename`.
     * @returns {Promise<ProcessedData>}
     * @throws {Error} Wrapping any XLSX read/parse failure.
     */
    async process(data, options) {
        const startTime = Date.now();
        // Convert to buffer if string (though Excel should always be binary)
        const buffer = Buffer.isBuffer(data) ? data : Buffer.from(data, 'binary');
        try {
            // Read workbook
            const workbook = XLSX.read(buffer, {
                type: 'buffer',
                cellDates: true,
                cellNF: true,
                cellStyles: true
            });
            // Determine which sheets to process
            const sheetsToProcess = this.getSheetsToProcess(workbook, options);
            // Extract data from sheets
            const allData = [];
            const sheetMetadata = {};
            for (const sheetName of sheetsToProcess) {
                const sheet = workbook.Sheets[sheetName];
                if (!sheet)
                    continue;
                // Convert sheet to JSON with headers
                const sheetData = XLSX.utils.sheet_to_json(sheet, {
                    header: 1, // Get as array of arrays first
                    defval: null,
                    blankrows: false,
                    raw: false // Convert to formatted strings
                });
                if (sheetData.length === 0)
                    continue;
                // First row is headers
                const headers = sheetData[0].map((h) => this.sanitizeFieldName(String(h || '')));
                // Skip if no headers
                if (headers.length === 0)
                    continue;
                // Convert rows to objects
                for (let i = 1; i < sheetData.length; i++) {
                    const row = sheetData[i];
                    const rowObj = {};
                    // Add sheet name to each row
                    rowObj._sheet = sheetName;
                    for (let j = 0; j < headers.length; j++) {
                        const header = headers[j];
                        let value = row[j];
                        // Convert Excel dates
                        // NOTE(review): with `raw: false` above, sheet_to_json
                        // emits formatted strings, so this numeric branch is
                        // unlikely to ever fire — confirm intended.
                        if (value && typeof value === 'number' && this.isExcelDate(value)) {
                            value = this.excelDateToJSDate(value);
                        }
                        rowObj[header] = value === undefined ? null : value;
                    }
                    allData.push(rowObj);
                }
                // Store sheet metadata
                sheetMetadata[sheetName] = {
                    rowCount: sheetData.length - 1, // Exclude header row
                    columnCount: headers.length,
                    headers
                };
            }
            // Infer types (excluding _sheet field); fields are taken from the
            // first flattened row only, so sheets with differing headers
            // contribute rows but not field names — presumably acceptable here.
            const fields = allData.length > 0 ? Object.keys(allData[0]).filter(k => k !== '_sheet') : [];
            const types = this.inferFieldTypes(allData);
            // Convert values to appropriate types
            const convertedData = allData.map(row => {
                const converted = {};
                for (const [key, value] of Object.entries(row)) {
                    if (key === '_sheet') {
                        // Sheet tag passes through untouched.
                        converted[key] = value;
                    }
                    else {
                        converted[key] = this.convertValue(value, types[key] || 'string');
                    }
                }
                return converted;
            });
            const processingTime = Date.now() - startTime;
            return {
                format: this.format,
                data: convertedData,
                metadata: this.createMetadata(convertedData.length, fields, processingTime, {
                    sheets: sheetsToProcess,
                    sheetCount: sheetsToProcess.length,
                    sheetMetadata,
                    types,
                    workbookInfo: {
                        sheetNames: workbook.SheetNames,
                        properties: workbook.Props || {}
                    }
                }),
                filename: options.filename
            };
        }
        catch (error) {
            throw new Error(`Excel parsing failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Determine which sheets to process: the caller's `excelSheets`
     * selection (filtered to sheets that actually exist in the workbook),
     * or every sheet when unset or 'all'.
     */
    getSheetsToProcess(workbook, options) {
        const allSheets = workbook.SheetNames;
        // If specific sheets requested
        if (options.excelSheets && options.excelSheets !== 'all') {
            return options.excelSheets.filter(name => allSheets.includes(name));
        }
        // Otherwise process all sheets
        return allSheets;
    }
    /**
     * Check if a number is likely an Excel date serial.
     * Excel stores dates as days since 1900-01-01.
     * @param {number} value
     * @returns {boolean}
     */
    isExcelDate(value) {
        // Excel dates are typically between 1 and 60000 (1900 to 2064)
        // This is a heuristic - not perfect but catches most cases
        return value > 0 && value < 100000 && Number.isInteger(value);
    }
    /**
     * Convert an Excel date serial (days since the 1900 epoch) to a JS Date.
     * @param {number} excelDate
     * @returns {Date}
     */
    excelDateToJSDate(excelDate) {
        // Excel's epoch is 1900-01-01, but there's a bug where it thinks 1900 is a leap year
        // So dates before March 1, 1900 are off by one day
        const epoch = new Date(1899, 11, 30); // Dec 30, 1899
        const msPerDay = 24 * 60 * 60 * 1000;
        return new Date(epoch.getTime() + excelDate * msPerDay);
    }
}
//# sourceMappingURL=excelHandler.js.map
|
|
@@ -0,0 +1,35 @@
|
|
|
/**
 * PDF Format Handler
 * Handles PDF files with:
 * - Text extraction with layout preservation
 * - Table detection and extraction
 * - Metadata extraction (author, dates, etc.)
 * - Page-by-page processing
 */
import { BaseFormatHandler } from './base.js';
import { FormatHandlerOptions, ProcessedData } from '../types.js';
export declare class PDFHandler extends BaseFormatHandler {
    /** Format identifier reported in ProcessedData.format. */
    readonly format = "pdf";
    /**
     * Returns true for a `.pdf` extension, or for a Buffer that starts
     * with the `%PDF-` magic bytes.
     */
    canHandle(data: Buffer | string | {
        filename?: string;
        ext?: string;
    }): boolean;
    /**
     * Extract table rows and paragraphs from every page, returning them
     * as one flat array of records tagged with `_page`/`_type` fields.
     */
    process(data: Buffer | string, options: FormatHandlerOptions): Promise<ProcessedData>;
    /**
     * Group text items into lines based on Y position
     */
    private groupIntoLines;
    /**
     * Detect tables from lines
     * Tables are detected when multiple consecutive lines have similar structure
     */
    private detectTables;
    /**
     * Parse a potential table into structured rows
     */
    private parseTable;
    /**
     * Extract paragraphs from lines
     */
    private extractParagraphs;
}
|
|
@@ -0,0 +1,247 @@
|
|
|
/**
 * PDF Format Handler
 * Handles PDF files with:
 * - Text extraction with layout preservation
 * - Table detection and extraction
 * - Metadata extraction (author, dates, etc.)
 * - Page-by-page processing
 */
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { BaseFormatHandler } from './base.js';
// Use built-in worker for Node.js environments
// In production, this can be customized via options
const initializeWorker = () => {
    // Only touch workerSrc when it has not already been configured.
    if (typeof pdfjsLib.GlobalWorkerOptions.workerSrc === 'undefined' ||
        pdfjsLib.GlobalWorkerOptions.workerSrc === '') {
        // Use a data URL to avoid file system dependencies
        // This tells pdfjs to use the built-in fallback worker
        try {
            pdfjsLib.GlobalWorkerOptions.workerSrc = 'data:,';
        }
        catch {
            // Ignore if already set or in incompatible environment
        }
    }
};
// Runs once at module load time (side effect of importing this file).
initializeWorker();
export class PDFHandler extends BaseFormatHandler {
    constructor() {
        super(...arguments);
        // Format identifier reported in ProcessedData.format.
        this.format = 'pdf';
    }
    /**
     * Decide whether this handler can process the given input: a `.pdf`
     * extension, or a Buffer beginning with the `%PDF-` magic bytes.
     * @param {Buffer|string|{filename?: string, ext?: string}} data
     * @returns {boolean}
     */
    canHandle(data) {
        const ext = this.detectExtension(data);
        if (ext === 'pdf')
            return true;
        // Check for PDF magic bytes
        if (Buffer.isBuffer(data)) {
            const header = data.slice(0, 5).toString('ascii');
            return header === '%PDF-';
        }
        return false;
    }
    /**
     * Extract table rows and paragraphs from every page of the document.
     * @param {Buffer|string} data - Raw PDF bytes.
     * @param {FormatHandlerOptions} options - May contain
     *   `pdfExtractTables` (default true) and `filename`.
     * @returns {Promise<ProcessedData>} Flat array of table-row and
     *   paragraph records plus page/table counts and PDF metadata.
     * @throws {Error} Wrapping any pdfjs failure.
     */
    async process(data, options) {
        const startTime = Date.now();
        // Convert to buffer
        const buffer = Buffer.isBuffer(data) ? data : Buffer.from(data, 'binary');
        try {
            // Load PDF document
            const loadingTask = pdfjsLib.getDocument({
                data: new Uint8Array(buffer),
                useSystemFonts: true,
                standardFontDataUrl: undefined
            });
            const pdfDoc = await loadingTask.promise;
            // Extract metadata
            const metadata = await pdfDoc.getMetadata();
            const numPages = pdfDoc.numPages;
            // Extract text and structure from all pages
            const allData = [];
            let totalTextLength = 0;
            let detectedTables = 0;
            for (let pageNum = 1; pageNum <= numPages; pageNum++) {
                const page = await pdfDoc.getPage(pageNum);
                const textContent = await page.getTextContent();
                // Extract text items with positions (transform[4]/[5] are the
                // x/y translation components of the item's text matrix)
                const textItems = textContent.items.map((item) => ({
                    text: item.str,
                    x: item.transform[4],
                    y: item.transform[5],
                    width: item.width,
                    height: item.height
                }));
                // Combine text items into lines (group by similar Y position)
                const lines = this.groupIntoLines(textItems);
                // Detect tables if requested
                if (options.pdfExtractTables !== false) {
                    const tables = this.detectTables(lines);
                    if (tables.length > 0) {
                        detectedTables += tables.length;
                        for (const table of tables) {
                            allData.push(...table.rows);
                        }
                    }
                }
                // Extract paragraphs from the page's lines.
                // NOTE(review): lines belonging to detected tables are NOT
                // excluded here, so table text may also appear inside
                // paragraph records — confirm intended.
                const paragraphs = this.extractParagraphs(lines);
                for (let i = 0; i < paragraphs.length; i++) {
                    const text = paragraphs[i].trim();
                    if (text.length > 0) {
                        totalTextLength += text.length;
                        allData.push({
                            _page: pageNum,
                            _type: 'paragraph',
                            _index: i,
                            text
                        });
                    }
                }
            }
            const processingTime = Date.now() - startTime;
            // Get all unique fields (excluding metadata fields)
            const fields = allData.length > 0
                ? Object.keys(allData[0]).filter(k => !k.startsWith('_'))
                : [];
            return {
                format: this.format,
                data: allData,
                metadata: this.createMetadata(allData.length, fields, processingTime, {
                    pageCount: numPages,
                    textLength: totalTextLength,
                    tableCount: detectedTables,
                    pdfMetadata: {
                        title: metadata.info?.Title || null,
                        author: metadata.info?.Author || null,
                        subject: metadata.info?.Subject || null,
                        creator: metadata.info?.Creator || null,
                        producer: metadata.info?.Producer || null,
                        creationDate: metadata.info?.CreationDate || null,
                        modificationDate: metadata.info?.ModDate || null
                    }
                }),
                filename: options.filename
            };
        }
        catch (error) {
            throw new Error(`PDF parsing failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Group text items into lines based on Y position.
     * Items whose Y differs by more than half an item height start a new
     * line; each line's items are sorted left-to-right. Items with only
     * whitespace text are dropped.
     * @param {{text: string, x: number, y: number, height: number}[]} items
     * @returns {{text: string, x: number}[][]} Non-empty lines, top-down.
     */
    groupIntoLines(items) {
        if (items.length === 0)
            return [];
        // Sort by Y position (descending, since PDF coordinates go bottom-up)
        const sorted = [...items].sort((a, b) => b.y - a.y);
        const lines = [];
        let currentLine = [];
        let currentY = sorted[0].y;
        for (const item of sorted) {
            // If Y position differs by more than half the height, it's a new line
            if (Math.abs(item.y - currentY) > (item.height / 2)) {
                if (currentLine.length > 0) {
                    // Sort line items by X position
                    currentLine.sort((a, b) => a.x - b.x);
                    lines.push(currentLine);
                }
                currentLine = [];
                currentY = item.y;
            }
            if (item.text.trim()) {
                currentLine.push({ text: item.text, x: item.x });
            }
        }
        // Add last line
        if (currentLine.length > 0) {
            currentLine.sort((a, b) => a.x - b.x);
            lines.push(currentLine);
        }
        return lines;
    }
    /**
     * Detect tables from lines.
     * A run of 3+ consecutive lines, each holding 2+ items, is treated as
     * a candidate table and handed to parseTable.
     * @param {{text: string, x: number}[][]} lines
     * @returns {{rows: Object[]}[]}
     */
    detectTables(lines) {
        const tables = [];
        let potentialTable = [];
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            // A line with multiple items could be part of a table
            if (line.length >= 2) {
                potentialTable.push(line);
            }
            else {
                // End of potential table
                if (potentialTable.length >= 3) { // Need at least header + 2 rows
                    const table = this.parseTable(potentialTable);
                    if (table) {
                        tables.push(table);
                    }
                }
                potentialTable = [];
            }
        }
        // Check last potential table
        if (potentialTable.length >= 3) {
            const table = this.parseTable(potentialTable);
            if (table) {
                tables.push(table);
            }
        }
        return tables;
    }
    /**
     * Parse a potential table into structured rows.
     * The first line supplies (sanitized) header names; each later line's
     * items become the row's values.
     * @param {{text: string, x: number}[][]} lines
     * @returns {{rows: Object[]}|null} Null when no data rows result.
     */
    parseTable(lines) {
        if (lines.length < 2)
            return null;
        // First line is headers
        const headerLine = lines[0];
        const headers = headerLine.map(item => this.sanitizeFieldName(item.text));
        // Remaining lines are data
        const rows = [];
        for (let i = 1; i < lines.length; i++) {
            const line = lines[i];
            const row = { _type: 'table_row' };
            // Items are matched to headers positionally by index.
            // NOTE(review): the original comment claimed "closest header by
            // X position", but no X comparison happens here — columns can
            // misalign when a cell is empty; confirm intended.
            for (let j = 0; j < line.length && j < headers.length; j++) {
                const header = headers[j];
                const value = line[j].text.trim();
                row[header] = value || null;
            }
            if (Object.keys(row).length > 1) { // More than just _type
                rows.push(row);
            }
        }
        return rows.length > 0 ? { rows } : null;
    }
    /**
     * Extract paragraphs from lines by joining consecutive non-empty
     * lines and breaking on empty ones.
     * NOTE(review): groupIntoLines only emits lines that contain at least
     * one non-empty item, so the empty-line break below may never fire
     * and a whole page can collapse into a single paragraph — confirm.
     * @param {{text: string, x: number}[][]} lines
     * @returns {string[]}
     */
    extractParagraphs(lines) {
        const paragraphs = [];
        let currentParagraph = [];
        for (const line of lines) {
            const lineText = line.map(item => item.text).join(' ').trim();
            if (lineText.length === 0) {
                // Empty line - end paragraph
                if (currentParagraph.length > 0) {
                    paragraphs.push(currentParagraph.join(' '));
                    currentParagraph = [];
                }
            }
            else {
                currentParagraph.push(lineText);
            }
        }
        // Add last paragraph
        if (currentParagraph.length > 0) {
            paragraphs.push(currentParagraph.join(' '));
        }
        return paragraphs;
    }
}
//# sourceMappingURL=pdfHandler.js.map
|
|
@@ -0,0 +1,9 @@
|
|
|
/**
 * Intelligent Import Module
 * Exports main augmentation and types
 */
// Main entry point: the augmentation that orchestrates format handlers.
export { IntelligentImportAugmentation } from './IntelligentImportAugmentation.js';
// Public type surface for consumers implementing or invoking handlers.
export type { FormatHandler, FormatHandlerOptions, ProcessedData, IntelligentImportConfig } from './types.js';
// Individual format handlers, re-exported for direct use.
export { CSVHandler } from './handlers/csvHandler.js';
export { ExcelHandler } from './handlers/excelHandler.js';
export { PDFHandler } from './handlers/pdfHandler.js';
|
|
@@ -0,0 +1,9 @@
|
|
|
/**
 * Intelligent Import Module
 * Exports main augmentation and types
 */
// Main entry point: the augmentation that orchestrates format handlers.
export { IntelligentImportAugmentation } from './IntelligentImportAugmentation.js';
// Individual format handlers, re-exported for direct use.
// (Note: importing pdfHandler.js configures the pdfjs worker as a side effect.)
export { CSVHandler } from './handlers/csvHandler.js';
export { ExcelHandler } from './handlers/excelHandler.js';
export { PDFHandler } from './handlers/pdfHandler.js';
//# sourceMappingURL=index.js.map
|