llm-search-tools 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
@@ -0,0 +1,361 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.parse = parse;
// parser.ts - unified parser for various file types
const fs_1 = require("fs");
const path_1 = require("path");
const os_1 = require("os");
const crypto_1 = require("crypto");
const pdf_parse_1 = __importDefault(require("pdf-parse"));
const mammoth_1 = __importDefault(require("mammoth"));
const sync_1 = require("csv-parse/sync");
const tesseract_js_1 = require("tesseract.js");
const fast_xml_parser_1 = require("fast-xml-parser");
// detect file type from path or buffer
function detectFileType(pathOrBuffer, filename) {
    // if we got a string path, use that
    if (typeof pathOrBuffer === "string") {
        const ext = (0, path_1.extname)(pathOrBuffer).toLowerCase();
        return getTypeFromExtension(ext);
    }
    // if we got a filename hint with the buffer, use that
    if (filename) {
        const ext = (0, path_1.extname)(filename).toLowerCase();
        return getTypeFromExtension(ext);
    }
    // ok fine we'll try to detect from buffer magic numbers
    const header = pathOrBuffer.slice(0, 4).toString("hex");
    // check magic numbers
    if (header.startsWith("89504e47"))
        return "image"; // PNG
    if (header.startsWith("ffd8"))
        return "image"; // JPEG
    if (header.startsWith("424d"))
        return "image"; // BMP
    if (header.startsWith("47494638"))
        return "image"; // GIF
    if (header.startsWith("25504446"))
        return "pdf"; // PDF
    if (header.startsWith("504b"))
        return "docx"; // ZIP/DOCX
    if (pathOrBuffer.slice(0, 5).toString() === "<?xml")
        return "xml";
    // attempt json detection
    try {
        JSON.parse(pathOrBuffer.toString());
        return "json";
    }
    catch {
        // not json, continue
    }
    // check if it looks like csv
    const firstLine = pathOrBuffer.toString().split("\n")[0];
    if (firstLine && firstLine.includes(","))
        return "csv";
    // probably just text if we got here
    if (pathOrBuffer.toString().trim())
        return "text";
    return "unknown";
}
// helper to get type from file extension
function getTypeFromExtension(ext) {
    switch (ext) {
        case ".pdf":
            return "pdf";
        case ".docx":
            return "docx";
        case ".csv":
            return "csv";
        case ".txt":
            return "text";
        case ".xml":
            return "xml";
        case ".json":
            return "json";
        case ".png":
        case ".jpg":
        case ".jpeg":
        case ".bmp":
        case ".gif":
            return "image";
        default:
            return "unknown";
    }
}
// parse JSON files
function parseJSON(buffer) {
    try {
        const text = buffer.toString();
        const data = JSON.parse(text);
        return {
            type: "json",
            text: text,
            data: data,
            metadata: {
                length: text.length,
                lines: text.split("\n").length,
            },
        };
    }
    catch (error) {
        throw {
            message: "Failed to parse JSON file",
            code: "JSON_PARSE_ERROR",
            originalError: error,
        };
    }
}
// parse PDF files
async function parsePDF(buffer) {
    try {
        const data = await (0, pdf_parse_1.default)(buffer);
        return {
            type: "pdf",
            text: data.text,
            metadata: {
                pages: data.numpages,
                info: data.info,
                metadata: data.metadata,
                version: data.version,
            },
        };
    }
    catch (error) {
        throw {
            message: "Failed to parse PDF file",
            code: "PDF_PARSE_ERROR",
            originalError: error,
        };
    }
}
// parse CSV files
function parseCSV(buffer, options) {
    try {
        const text = buffer.toString();
        const lines = text.trim().split("\n");
        if (lines.length === 0) {
            throw new Error("Empty CSV file");
        }
        // Count columns from the header row
        const headerRow = lines[0];
        const columnCount = headerRow.split(options?.csv?.delimiter || ",").length;
        // Parse the CSV
        const records = (0, sync_1.parse)(text, {
            delimiter: options?.csv?.delimiter,
            columns: options?.csv?.columns ?? true, // Default to true for column headers
            skip_empty_lines: true,
        });
        const headers = options?.csv?.columns !== false ? Object.keys(records[0] || {}) : undefined;
        return {
            type: "csv",
            text: text,
            data: records,
            metadata: {
                rowCount: records.length,
                columnCount: columnCount,
                headers: headers,
            },
        };
    }
    catch (error) {
        throw {
            message: "Failed to parse CSV file",
            code: "CSV_PARSE_ERROR",
            originalError: error,
        };
    }
}
// Helper function to extract text from HTML
function htmlToText(html) {
    return html
        .replace(/<br\s*\/?>/gi, "\n")
        .replace(/<\/p>/gi, "\n")
        .replace(/<\/div>/gi, "\n")
        .replace(/<\/h[1-6]>/gi, "\n")
        .replace(/<\/li>/gi, "\n")
        .replace(/<[^>]+>/g, "")
        .replace(/&nbsp;/g, " ")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
}
// Helper function to create temp file
async function withTempFile(buffer, extension, callback) {
    const tempFileName = `temp-${(0, crypto_1.randomBytes)(16).toString("hex")}${extension}`;
    const tempPath = (0, path_1.join)((0, os_1.tmpdir)(), tempFileName);
    try {
        (0, fs_1.writeFileSync)(tempPath, buffer);
        return await callback(tempPath);
    }
    finally {
        try {
            (0, fs_1.unlinkSync)(tempPath);
        }
        catch {
            // ignore cleanup error
        }
    }
}
// parse DOCX files using mammoth
async function parseDOCX(buffer) {
    try {
        return await withTempFile(buffer, ".docx", async (tempPath) => {
            // Try HTML conversion first
            const htmlResult = await mammoth_1.default.convertToHtml({ path: tempPath });
            if (!htmlResult.value) {
                // Fallback to raw text extraction
                const textResult = await mammoth_1.default.extractRawText({ path: tempPath });
                if (!textResult.value) {
                    throw new Error("No content found in DOCX file");
                }
                const cleanText = textResult.value.trim();
                return {
                    type: "docx",
                    text: cleanText,
                    metadata: {
                        type: "docx",
                        paragraphs: cleanText.split("\n").filter(Boolean).length,
                        warnings: textResult.messages,
                        method: "raw",
                    },
                };
            }
            // Convert HTML to plain text
            const text = htmlToText(htmlResult.value);
            const paragraphs = text.split("\n").filter(Boolean);
            return {
                type: "docx",
                text: text,
                metadata: {
                    type: "docx",
                    paragraphs: paragraphs.length,
                    warnings: htmlResult.messages,
                    hasHtml: true,
                    method: "html",
                },
            };
        });
    }
    catch (error) {
        throw {
            message: "Failed to parse DOCX file",
            code: "DOCX_PARSE_ERROR",
            originalError: error,
        };
    }
}
// parse plain text files
function parseText(buffer) {
    try {
        const text = buffer.toString();
        return {
            type: "text",
            text: text,
            metadata: {
                length: text.length,
                lines: text.split("\n").length,
            },
        };
    }
    catch (error) {
        throw {
            message: "Failed to parse text file",
            code: "TEXT_PARSE_ERROR",
            originalError: error,
        };
    }
}
// parse XML files
function parseXML(buffer, options) {
    try {
        const text = buffer.toString();
        const parser = new fast_xml_parser_1.XMLParser({
            ignoreAttributes: options?.xml?.ignoreAttributes ?? false,
            parseAttributeValue: options?.xml?.parseAttributeValue ?? true,
        });
        const data = parser.parse(text);
        return {
            type: "xml",
            text: text,
            data: data,
            metadata: {
                length: text.length,
                lines: text.split("\n").length,
            },
        };
    }
    catch (error) {
        throw {
            message: "Failed to parse XML file",
            code: "XML_PARSE_ERROR",
            originalError: error,
        };
    }
}
// parse images using OCR
async function parseImage(buffer, options) {
    try {
        const worker = await (0, tesseract_js_1.createWorker)();
        const lang = options?.language || "eng";
        // Initialize worker with language
        await worker.reinitialize(lang);
        const result = await worker.recognize(buffer);
        await worker.terminate();
        return {
            type: "image",
            text: result.data.text,
            metadata: {
                language: lang,
                confidence: result.data.confidence,
            },
        };
    }
    catch (error) {
        throw {
            message: "Failed to parse image file",
            code: "IMAGE_PARSE_ERROR",
            originalError: error,
        };
    }
}
// main parse function
async function parse(pathOrBuffer, options = {}, filename) {
    try {
        // Get file buffer
        const buffer = typeof pathOrBuffer === "string" ? (0, fs_1.readFileSync)(pathOrBuffer) : pathOrBuffer;
        // Detect file type (pass filename hint if we have it)
        const fileType = detectFileType(pathOrBuffer, filename);
        // Parse based on file type
        switch (fileType) {
            case "pdf":
                return await parsePDF(buffer);
            case "docx":
                return await parseDOCX(buffer);
            case "csv":
                return parseCSV(buffer, options);
            case "text":
                return parseText(buffer);
            case "xml":
                return parseXML(buffer, options);
            case "json":
                return parseJSON(buffer);
            case "image":
                return await parseImage(buffer, options);
            default:
                throw new Error(`Unsupported file type: ${fileType}`);
        }
    }
    catch (error) {
        const searchError = error;
        if (searchError.code) {
            throw searchError;
        }
        throw {
            message: `Failed to parse file: ${error instanceof Error ? error.message : String(error)}`,
            code: "PARSE_ERROR",
            originalError: error,
        };
    }
}
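The hunk above appears to be dist/modules/parser.js (the only file in the list adding 361 lines). A minimal usage sketch of the exported parse() function follows; the require path assumes you are calling the compiled module directly from the package root, and the file names, delimiter option, and log statements are placeholders, not part of the package:

// Sketch: parsing a local file with parse() from the hunk above.
// Assumptions: module lives at ./dist/modules/parser.js; "report.pdf"
// and "people.csv" are placeholder inputs.
const { parse } = require("./dist/modules/parser");

async function main() {
    // A string path: the ".pdf" extension routes the buffer to parsePDF().
    const pdfResult = await parse("report.pdf");
    console.log(pdfResult.type, pdfResult.metadata.pages);

    // A Buffer works too; pass a filename hint so detection can use the extension.
    const csvBuffer = Buffer.from("name,age\nAlice,30");
    const csvResult = await parse(csvBuffer, { csv: { delimiter: "," } }, "people.csv");
    console.log(csvResult.data);
}

main().catch((err) => console.error(err.code, err.message));

Errors are thrown as plain objects with message, code, and originalError fields, as the catch blocks above show, so callers can branch on err.code.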
@@ -0,0 +1 @@
export {};
@@ -0,0 +1,151 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const vitest_1 = require("vitest");
const parser_1 = require("./parser");
const pdf_parse_1 = __importDefault(require("pdf-parse"));
const mammoth_1 = __importDefault(require("mammoth"));
const tesseract_js_1 = require("tesseract.js");
const fs_1 = require("fs");
// Mock dependencies
vitest_1.vi.mock("pdf-parse", () => ({
    default: vitest_1.vi.fn(),
}));
vitest_1.vi.mock("mammoth");
vitest_1.vi.mock("tesseract.js");
vitest_1.vi.mock("fs", async () => {
    const actual = await vitest_1.vi.importActual("fs");
    return {
        ...actual,
        readFileSync: vitest_1.vi.fn(),
        // We don't mock writeFileSync/unlinkSync as they are used for temp files in docx parsing
        // and we want that to work or we mock the whole flow.
        // For now, let's just mock readFileSync for the input file reading.
    };
});
(0, vitest_1.describe)("Parser Module", () => {
    (0, vitest_1.beforeEach)(() => {
        vitest_1.vi.resetAllMocks();
    });
    (0, vitest_1.describe)("Text Parsing", () => {
        (0, vitest_1.it)("should parse text from buffer", async () => {
            const buffer = Buffer.from("Hello world");
            const result = await (0, parser_1.parse)(buffer);
            (0, vitest_1.expect)(result).toEqual({
                type: "text",
                text: "Hello world",
                metadata: {
                    length: 11,
                    lines: 1,
                },
            });
        });
        (0, vitest_1.it)("should parse text from file path", async () => {
            fs_1.readFileSync.mockReturnValue(Buffer.from("Hello file"));
            const result = await (0, parser_1.parse)("test.txt");
            (0, vitest_1.expect)(result).toEqual({
                type: "text",
                text: "Hello file",
                metadata: {
                    length: 10,
                    lines: 1,
                },
            });
            (0, vitest_1.expect)(fs_1.readFileSync).toHaveBeenCalledWith("test.txt");
        });
    });
    (0, vitest_1.describe)("JSON Parsing", () => {
        (0, vitest_1.it)("should parse JSON from buffer", async () => {
            const data = { key: "value" };
            const buffer = Buffer.from(JSON.stringify(data));
            const result = await (0, parser_1.parse)(buffer, {}, "test.json");
            (0, vitest_1.expect)(result.type).toBe("json");
            (0, vitest_1.expect)(result.data).toEqual(data);
        });
        (0, vitest_1.it)("should detect JSON without extension", async () => {
            const data = { key: "value" };
            const buffer = Buffer.from(JSON.stringify(data));
            const result = await (0, parser_1.parse)(buffer);
            (0, vitest_1.expect)(result.type).toBe("json");
        });
    });
    (0, vitest_1.describe)("CSV Parsing", () => {
        (0, vitest_1.it)("should parse CSV from buffer", async () => {
            const csv = "name,age\nAlice,30\nBob,25";
            const buffer = Buffer.from(csv);
            const result = await (0, parser_1.parse)(buffer, {}, "test.csv");
            (0, vitest_1.expect)(result.type).toBe("csv");
            (0, vitest_1.expect)(Array.isArray(result.data)).toBe(true);
            const data = result.data;
            (0, vitest_1.expect)(data).toHaveLength(2);
            (0, vitest_1.expect)(data[0]).toEqual({ name: "Alice", age: "30" });
        });
    });
    (0, vitest_1.describe)("PDF Parsing", () => {
        (0, vitest_1.it)("should parse PDF using pdf-parse", async () => {
            const mockData = {
                numpages: 1,
                info: {},
                metadata: {},
                version: "1.0",
                text: "PDF content",
            };
            pdf_parse_1.default.mockResolvedValue(mockData);
            const buffer = Buffer.from("%PDF-1.5"); // Magic bytes for PDF
            const result = await (0, parser_1.parse)(buffer);
            (0, vitest_1.expect)(result.type).toBe("pdf");
            (0, vitest_1.expect)(result.text).toBe("PDF content");
            (0, vitest_1.expect)(pdf_parse_1.default).toHaveBeenCalledWith(buffer);
        });
    });
    (0, vitest_1.describe)("DOCX Parsing", () => {
        (0, vitest_1.it)("should parse DOCX using mammoth", async () => {
            const mockResult = { value: "DOCX content", messages: [] };
            // Mock convertToHtml since that's what the parser uses first
            mammoth_1.default.convertToHtml.mockResolvedValue(mockResult);
            // Magic bytes for DOCX (PK zip header)
            const buffer = Buffer.from("504b0304", "hex");
            // We need to hint extension or provide magic bytes that match zip/docx
            // The parser checks magic bytes '504b' -> 'docx'
            const result = await (0, parser_1.parse)(buffer);
            (0, vitest_1.expect)(result.type).toBe("docx");
            (0, vitest_1.expect)(result.text).toBe("DOCX content");
        });
    });
    (0, vitest_1.describe)("Image OCR", () => {
        (0, vitest_1.it)("should parse image using tesseract", async () => {
            const mockWorker = {
                reinitialize: vitest_1.vi.fn(),
                recognize: vitest_1.vi.fn().mockResolvedValue({
                    data: {
                        text: "OCR Text",
                        confidence: 90,
                    },
                }),
                terminate: vitest_1.vi.fn(),
            };
            tesseract_js_1.createWorker.mockResolvedValue(mockWorker);
            // Magic bytes for PNG
            const buffer = Buffer.from("89504e47", "hex");
            const result = await (0, parser_1.parse)(buffer);
            (0, vitest_1.expect)(result.type).toBe("image");
            (0, vitest_1.expect)(result.text).toBe("OCR Text");
            (0, vitest_1.expect)(mockWorker.recognize).toHaveBeenCalledWith(buffer);
            (0, vitest_1.expect)(mockWorker.terminate).toHaveBeenCalled();
        });
    });
    (0, vitest_1.describe)("Error Handling", () => {
        (0, vitest_1.it)("should throw error for unsupported file type", async () => {
            // Empty buffer returns "unknown" type because it fails the text check (trim())
            const buffer = Buffer.from("");
            await (0, vitest_1.expect)((0, parser_1.parse)(buffer)).rejects.toThrow("Unsupported file type: unknown");
        });
        (0, vitest_1.it)("should propagate parsing errors", async () => {
            const buffer = Buffer.from("{ invalid json");
            // Expect the specific error from the JSON parser, not the generic wrapper
            await (0, vitest_1.expect)((0, parser_1.parse)(buffer, {}, "test.json")).rejects.toThrow("Failed to parse JSON file");
        });
    });
});
@@ -0,0 +1,21 @@
interface RedditPost {
    title: string;
    id: string;
    url: string;
    permalink: string;
    author: string;
    score: number;
    num_comments: number;
    created_utc: number;
    selftext?: string;
    subreddit: string;
    is_self: boolean;
}
export declare function getSubredditHot(subreddit: string, limit?: number): Promise<RedditPost[]>;
export declare function searchReddit(query: string, options?: {
    sort?: 'hot' | 'new' | 'top' | 'relevance';
    limit?: number;
    type?: 'posts' | 'comments';
}): Promise<RedditPost[]>;
export declare function getPostFromUrl(url: string): Promise<RedditPost>;
export {};
@@ -0,0 +1,107 @@
"use strict";
// reddit.ts - handles reddit stuff like getting posts n searching
Object.defineProperty(exports, "__esModule", { value: true });
exports.getSubredditHot = getSubredditHot;
exports.searchReddit = searchReddit;
exports.getPostFromUrl = getPostFromUrl;
// Get hot posts from a subreddit
async function getSubredditHot(subreddit, limit = 25) {
    try {
        const url = `https://www.reddit.com/r/${subreddit}/hot.json?limit=${limit}`;
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }
        const data = await response.json();
        return data.data.children.map((post) => ({
            title: post.data.title,
            id: post.data.id,
            url: post.data.url,
            permalink: `https://reddit.com${post.data.permalink}`,
            author: post.data.author,
            score: post.data.score,
            num_comments: post.data.num_comments,
            created_utc: post.data.created_utc,
            selftext: post.data.selftext,
            subreddit: post.data.subreddit,
            is_self: post.data.is_self
        }));
    }
    catch (error) {
        throw {
            message: `failed to get subreddit posts: ${error instanceof Error ? error.message : String(error)}`,
            code: 'REDDIT_ERROR',
            originalError: error
        };
    }
}
// Search Reddit posts
async function searchReddit(query, options = {}) {
    try {
        const params = new URLSearchParams({
            q: query,
            sort: options.sort || 'hot',
            type: options.type || 'posts',
            limit: String(options.limit || 25)
        });
        const url = `https://www.reddit.com/search.json?${params.toString()}`;
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }
        const data = await response.json();
        return data.data.children.map((post) => ({
            title: post.data.title,
            id: post.data.id,
            url: post.data.url,
            permalink: `https://reddit.com${post.data.permalink}`,
            author: post.data.author,
            score: post.data.score,
            num_comments: post.data.num_comments,
            created_utc: post.data.created_utc,
            selftext: post.data.selftext,
            subreddit: post.data.subreddit,
            is_self: post.data.is_self
        }));
    }
    catch (error) {
        throw {
            message: `reddit search failed: ${error instanceof Error ? error.message : String(error)}`,
            code: 'REDDIT_SEARCH_ERROR',
            originalError: error
        };
    }
}
// Get post info by URL
async function getPostFromUrl(url) {
    try {
        // Convert post URL to .json URL
        const jsonUrl = url.replace(/\/?$/, '.json');
        const response = await fetch(jsonUrl);
        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }
        const data = await response.json();
        const post = data[0].data.children[0].data;
        return {
            title: post.title,
            id: post.id,
            url: post.url,
            permalink: `https://reddit.com${post.permalink}`,
            author: post.author,
            score: post.score,
            num_comments: post.num_comments,
            created_utc: post.created_utc,
            selftext: post.selftext,
            subreddit: post.subreddit,
            is_self: post.is_self
        };
    }
    catch (error) {
        throw {
            message: `failed to get reddit post: ${error instanceof Error ? error.message : String(error)}`,
            code: 'REDDIT_POST_ERROR',
            originalError: error
        };
    }
}
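This hunk matches dist/modules/reddit.js (+107 lines), which calls the public reddit.com JSON endpoints through the global fetch API, so Node 18+ (or a fetch polyfill) is assumed. A minimal usage sketch, with placeholder subreddit, query, and post URL values and an assumed module path:

// Sketch: calling the Reddit helpers from the hunk above.
// Assumptions: module lives at ./dist/modules/reddit.js; all argument
// values below are placeholders for illustration only.
const { getSubredditHot, searchReddit, getPostFromUrl } = require("./dist/modules/reddit");

async function main() {
    // Hot posts from a subreddit, limited to 5 results.
    const hot = await getSubredditHot("typescript", 5);
    console.log(hot.map((p) => `${p.score} ${p.title}`));

    // Site-wide search with explicit sort and limit.
    const results = await searchReddit("llm search tools", { sort: "new", limit: 3 });
    console.log(results[0]?.permalink);

    // getPostFromUrl() appends ".json" to a post URL and returns a single post.
    const post = await getPostFromUrl("https://www.reddit.com/r/typescript/comments/abc123/example");
    console.log(post.num_comments);
}

main().catch((err) => console.error(err.code, err.message));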
@@ -0,0 +1,16 @@
import { ScraperOptions, WebpageContent } from "../types";
export interface NormalizeContentParams {
    url: string;
    html: string;
    title?: string;
    siteName?: string;
    fallbackFavicon?: string;
    skipReadability?: boolean;
}
export declare function normalizeContent(params: NormalizeContentParams): WebpageContent;
export declare function getWebpageContent(url: string, options?: ({
    usePuppeteer?: boolean;
} & ScraperOptions) | boolean): Promise<WebpageContent>;
export declare function getWebpageText(url: string, options?: {
    usePuppeteer?: boolean;
} & ScraperOptions): Promise<string>;
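The final hunk matches dist/modules/scrape.d.ts (+16 lines); the implementation in dist/modules/scrape.js is not shown in this diff. A usage sketch based only on the declared signatures; the URL is a placeholder, the module path is assumed, and the exact shapes of WebpageContent and ScraperOptions come from dist/types.d.ts, which is also not shown here:

// Sketch: calling the scrape module whose declarations appear above.
// Assumptions: module lives at ./dist/modules/scrape.js; usePuppeteer is
// optional per the signature, and the target URL is a placeholder.
const { getWebpageContent, getWebpageText } = require("./dist/modules/scrape");

async function main() {
    // Plain text extraction from a page.
    const text = await getWebpageText("https://example.com");
    console.log(text.slice(0, 200));

    // Full content object, opting into the Puppeteer-backed path.
    const content = await getWebpageContent("https://example.com", { usePuppeteer: true });
    console.log(content);
}

main().catch(console.error);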