llm-search-tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/LICENSE +21 -0
  2. package/README.md +244 -0
  3. package/dist/index.d.ts +18 -0
  4. package/dist/index.js +40 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/integration.test.d.ts +1 -0
  7. package/dist/integration.test.js +237 -0
  8. package/dist/modules/answerbox.test.d.ts +1 -0
  9. package/dist/modules/answerbox.test.js +105 -0
  10. package/dist/modules/autocomplete.d.ts +11 -0
  11. package/dist/modules/autocomplete.js +159 -0
  12. package/dist/modules/autocomplete.test.d.ts +1 -0
  13. package/dist/modules/autocomplete.test.js +188 -0
  14. package/dist/modules/common.d.ts +26 -0
  15. package/dist/modules/common.js +263 -0
  16. package/dist/modules/common.test.d.ts +1 -0
  17. package/dist/modules/common.test.js +87 -0
  18. package/dist/modules/crawl.d.ts +9 -0
  19. package/dist/modules/crawl.js +117 -0
  20. package/dist/modules/crawl.test.d.ts +1 -0
  21. package/dist/modules/crawl.test.js +48 -0
  22. package/dist/modules/events.d.ts +8 -0
  23. package/dist/modules/events.js +129 -0
  24. package/dist/modules/events.test.d.ts +1 -0
  25. package/dist/modules/events.test.js +104 -0
  26. package/dist/modules/finance.d.ts +10 -0
  27. package/dist/modules/finance.js +20 -0
  28. package/dist/modules/finance.test.d.ts +1 -0
  29. package/dist/modules/finance.test.js +77 -0
  30. package/dist/modules/flights.d.ts +8 -0
  31. package/dist/modules/flights.js +135 -0
  32. package/dist/modules/flights.test.d.ts +1 -0
  33. package/dist/modules/flights.test.js +128 -0
  34. package/dist/modules/hackernews.d.ts +8 -0
  35. package/dist/modules/hackernews.js +87 -0
  36. package/dist/modules/hackernews.js.map +1 -0
  37. package/dist/modules/images.test.d.ts +1 -0
  38. package/dist/modules/images.test.js +145 -0
  39. package/dist/modules/integrations.test.d.ts +1 -0
  40. package/dist/modules/integrations.test.js +93 -0
  41. package/dist/modules/media.d.ts +11 -0
  42. package/dist/modules/media.js +132 -0
  43. package/dist/modules/media.test.d.ts +1 -0
  44. package/dist/modules/media.test.js +186 -0
  45. package/dist/modules/news.d.ts +3 -0
  46. package/dist/modules/news.js +39 -0
  47. package/dist/modules/news.test.d.ts +1 -0
  48. package/dist/modules/news.test.js +88 -0
  49. package/dist/modules/parser.d.ts +19 -0
  50. package/dist/modules/parser.js +361 -0
  51. package/dist/modules/parser.test.d.ts +1 -0
  52. package/dist/modules/parser.test.js +151 -0
  53. package/dist/modules/reddit.d.ts +21 -0
  54. package/dist/modules/reddit.js +107 -0
  55. package/dist/modules/scrape.d.ts +16 -0
  56. package/dist/modules/scrape.js +272 -0
  57. package/dist/modules/scrape.test.d.ts +1 -0
  58. package/dist/modules/scrape.test.js +232 -0
  59. package/dist/modules/scraper.d.ts +12 -0
  60. package/dist/modules/scraper.js +640 -0
  61. package/dist/modules/scrapers/anidb.d.ts +8 -0
  62. package/dist/modules/scrapers/anidb.js +156 -0
  63. package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
  64. package/dist/modules/scrapers/duckduckgo.js +284 -0
  65. package/dist/modules/scrapers/google-news.d.ts +2 -0
  66. package/dist/modules/scrapers/google-news.js +60 -0
  67. package/dist/modules/scrapers/google.d.ts +6 -0
  68. package/dist/modules/scrapers/google.js +211 -0
  69. package/dist/modules/scrapers/searxng.d.ts +2 -0
  70. package/dist/modules/scrapers/searxng.js +93 -0
  71. package/dist/modules/scrapers/thetvdb.d.ts +3 -0
  72. package/dist/modules/scrapers/thetvdb.js +147 -0
  73. package/dist/modules/scrapers/tmdb.d.ts +3 -0
  74. package/dist/modules/scrapers/tmdb.js +172 -0
  75. package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
  76. package/dist/modules/scrapers/yahoo-finance.js +33 -0
  77. package/dist/modules/search.d.ts +5 -0
  78. package/dist/modules/search.js +45 -0
  79. package/dist/modules/search.js.map +1 -0
  80. package/dist/modules/search.test.d.ts +1 -0
  81. package/dist/modules/search.test.js +219 -0
  82. package/dist/modules/urbandictionary.d.ts +12 -0
  83. package/dist/modules/urbandictionary.js +26 -0
  84. package/dist/modules/webpage.d.ts +4 -0
  85. package/dist/modules/webpage.js +150 -0
  86. package/dist/modules/webpage.js.map +1 -0
  87. package/dist/modules/wikipedia.d.ts +5 -0
  88. package/dist/modules/wikipedia.js +85 -0
  89. package/dist/modules/wikipedia.js.map +1 -0
  90. package/dist/scripts/interactive-search.d.ts +1 -0
  91. package/dist/scripts/interactive-search.js +98 -0
  92. package/dist/test.d.ts +1 -0
  93. package/dist/test.js +179 -0
  94. package/dist/test.js.map +1 -0
  95. package/dist/testBraveSearch.d.ts +1 -0
  96. package/dist/testBraveSearch.js +34 -0
  97. package/dist/testDuckDuckGo.d.ts +1 -0
  98. package/dist/testDuckDuckGo.js +52 -0
  99. package/dist/testEcosia.d.ts +1 -0
  100. package/dist/testEcosia.js +57 -0
  101. package/dist/testSearchModule.d.ts +1 -0
  102. package/dist/testSearchModule.js +95 -0
  103. package/dist/testwebpage.d.ts +1 -0
  104. package/dist/testwebpage.js +81 -0
  105. package/dist/types.d.ts +174 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/createTestDocx.d.ts +1 -0
  109. package/dist/utils/createTestDocx.js +58 -0
  110. package/dist/utils/htmlcleaner.d.ts +20 -0
  111. package/dist/utils/htmlcleaner.js +172 -0
  112. package/docs/README.md +275 -0
  113. package/docs/autocomplete.md +73 -0
  114. package/docs/crawling.md +88 -0
  115. package/docs/events.md +58 -0
  116. package/docs/examples.md +158 -0
  117. package/docs/finance.md +60 -0
  118. package/docs/flights.md +71 -0
  119. package/docs/hackernews.md +121 -0
  120. package/docs/media.md +87 -0
  121. package/docs/news.md +75 -0
  122. package/docs/parser.md +197 -0
  123. package/docs/scraper.md +347 -0
  124. package/docs/search.md +106 -0
  125. package/docs/wikipedia.md +91 -0
  126. package/package.json +97 -0
package/dist/modules/parser.js
@@ -0,0 +1,361 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.parse = parse;
+ // parser.ts - unified parser for various file types
+ const fs_1 = require("fs");
+ const path_1 = require("path");
+ const os_1 = require("os");
+ const crypto_1 = require("crypto");
+ const pdf_parse_1 = __importDefault(require("pdf-parse"));
+ const mammoth_1 = __importDefault(require("mammoth"));
+ const sync_1 = require("csv-parse/sync");
+ const tesseract_js_1 = require("tesseract.js");
+ const fast_xml_parser_1 = require("fast-xml-parser");
+ // detect file type from path or buffer
+ function detectFileType(pathOrBuffer, filename) {
+     // if we got a string path, use that
+     if (typeof pathOrBuffer === "string") {
+         const ext = (0, path_1.extname)(pathOrBuffer).toLowerCase();
+         return getTypeFromExtension(ext);
+     }
+     // if we got a filename hint with the buffer, use that
+     if (filename) {
+         const ext = (0, path_1.extname)(filename).toLowerCase();
+         return getTypeFromExtension(ext);
+     }
+     // ok fine we'll try to detect from buffer magic numbers
+     const header = pathOrBuffer.slice(0, 4).toString("hex");
+     // check magic numbers
+     if (header.startsWith("89504e47"))
+         return "image"; // PNG
+     if (header.startsWith("ffd8"))
+         return "image"; // JPEG
+     if (header.startsWith("424d"))
+         return "image"; // BMP
+     if (header.startsWith("47494638"))
+         return "image"; // GIF
+     if (header.startsWith("25504446"))
+         return "pdf"; // PDF
+     if (header.startsWith("504b"))
+         return "docx"; // ZIP/DOCX
+     if (pathOrBuffer.slice(0, 5).toString() === "<?xml")
+         return "xml";
+     // attempt json detection
+     try {
+         JSON.parse(pathOrBuffer.toString());
+         return "json";
+     }
+     catch {
+         // not json, continue
+     }
+     // check if it looks like csv
+     const firstLine = pathOrBuffer.toString().split("\n")[0];
+     if (firstLine && firstLine.includes(","))
+         return "csv";
+     // probably just text if we got here
+     if (pathOrBuffer.toString().trim())
+         return "text";
+     return "unknown";
+ }
+ // helper to get type from file extension
+ function getTypeFromExtension(ext) {
+     switch (ext) {
+         case ".pdf":
+             return "pdf";
+         case ".docx":
+             return "docx";
+         case ".csv":
+             return "csv";
+         case ".txt":
+             return "text";
+         case ".xml":
+             return "xml";
+         case ".json":
+             return "json";
+         case ".png":
+         case ".jpg":
+         case ".jpeg":
+         case ".bmp":
+         case ".gif":
+             return "image";
+         default:
+             return "unknown";
+     }
+ }
+ // parse JSON files
+ function parseJSON(buffer) {
+     try {
+         const text = buffer.toString();
+         const data = JSON.parse(text);
+         return {
+             type: "json",
+             text: text,
+             data: data,
+             metadata: {
+                 length: text.length,
+                 lines: text.split("\n").length,
+             },
+         };
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse JSON file",
+             code: "JSON_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // parse PDF files
+ async function parsePDF(buffer) {
+     try {
+         const data = await (0, pdf_parse_1.default)(buffer);
+         return {
+             type: "pdf",
+             text: data.text,
+             metadata: {
+                 pages: data.numpages,
+                 info: data.info,
+                 metadata: data.metadata,
+                 version: data.version,
+             },
+         };
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse PDF file",
+             code: "PDF_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // parse CSV files
+ function parseCSV(buffer, options) {
+     try {
+         const text = buffer.toString();
+         const lines = text.trim().split("\n");
+         if (lines.length === 0) {
+             throw new Error("Empty CSV file");
+         }
+         // Count columns from the header row
+         const headerRow = lines[0];
+         const columnCount = headerRow.split(options?.csv?.delimiter || ",").length;
+         // Parse the CSV
+         const records = (0, sync_1.parse)(text, {
+             delimiter: options?.csv?.delimiter,
+             columns: options?.csv?.columns ?? true, // Default to true for column headers
+             skip_empty_lines: true,
+         });
+         const headers = options?.csv?.columns !== false ? Object.keys(records[0] || {}) : undefined;
+         return {
+             type: "csv",
+             text: text,
+             data: records,
+             metadata: {
+                 rowCount: records.length,
+                 columnCount: columnCount,
+                 headers: headers,
+             },
+         };
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse CSV file",
+             code: "CSV_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // Helper function to extract text from HTML
+ function htmlToText(html) {
+     return html
+         .replace(/<br\s*\/?>/gi, "\n")
+         .replace(/<\/p>/gi, "\n")
+         .replace(/<\/div>/gi, "\n")
+         .replace(/<\/h[1-6]>/gi, "\n")
+         .replace(/<\/li>/gi, "\n")
+         .replace(/<[^>]+>/g, "")
+         .replace(/&nbsp;/g, " ")
+         .replace(/\n{3,}/g, "\n\n")
+         .trim();
+ }
+ // Helper function to create temp file
+ async function withTempFile(buffer, extension, callback) {
+     const tempFileName = `temp-${(0, crypto_1.randomBytes)(16).toString("hex")}${extension}`;
+     const tempPath = (0, path_1.join)((0, os_1.tmpdir)(), tempFileName);
+     try {
+         (0, fs_1.writeFileSync)(tempPath, buffer);
+         return await callback(tempPath);
+     }
+     finally {
+         try {
+             (0, fs_1.unlinkSync)(tempPath);
+         }
+         catch {
+             // ignore cleanup error
+         }
+     }
+ }
+ // parse DOCX files using mammoth
+ async function parseDOCX(buffer) {
+     try {
+         return await withTempFile(buffer, ".docx", async (tempPath) => {
+             // Try HTML conversion first
+             const htmlResult = await mammoth_1.default.convertToHtml({ path: tempPath });
+             if (!htmlResult.value) {
+                 // Fallback to raw text extraction
+                 const textResult = await mammoth_1.default.extractRawText({ path: tempPath });
+                 if (!textResult.value) {
+                     throw new Error("No content found in DOCX file");
+                 }
+                 const cleanText = textResult.value.trim();
+                 return {
+                     type: "docx",
+                     text: cleanText,
+                     metadata: {
+                         type: "docx",
+                         paragraphs: cleanText.split("\n").filter(Boolean).length,
+                         warnings: textResult.messages,
+                         method: "raw",
+                     },
+                 };
+             }
+             // Convert HTML to plain text
+             const text = htmlToText(htmlResult.value);
+             const paragraphs = text.split("\n").filter(Boolean);
+             return {
+                 type: "docx",
+                 text: text,
+                 metadata: {
+                     type: "docx",
+                     paragraphs: paragraphs.length,
+                     warnings: htmlResult.messages,
+                     hasHtml: true,
+                     method: "html",
+                 },
+             };
+         });
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse DOCX file",
+             code: "DOCX_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // parse plain text files
+ function parseText(buffer) {
+     try {
+         const text = buffer.toString();
+         return {
+             type: "text",
+             text: text,
+             metadata: {
+                 length: text.length,
+                 lines: text.split("\n").length,
+             },
+         };
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse text file",
+             code: "TEXT_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // parse XML files
+ function parseXML(buffer, options) {
+     try {
+         const text = buffer.toString();
+         const parser = new fast_xml_parser_1.XMLParser({
+             ignoreAttributes: options?.xml?.ignoreAttributes ?? false,
+             parseAttributeValue: options?.xml?.parseAttributeValue ?? true,
+         });
+         const data = parser.parse(text);
+         return {
+             type: "xml",
+             text: text,
+             data: data,
+             metadata: {
+                 length: text.length,
+                 lines: text.split("\n").length,
+             },
+         };
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse XML file",
+             code: "XML_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // parse images using OCR
+ async function parseImage(buffer, options) {
+     try {
+         const worker = await (0, tesseract_js_1.createWorker)();
+         const lang = options?.language || "eng";
+         // Initialize worker with language
+         await worker.reinitialize(lang);
+         const result = await worker.recognize(buffer);
+         await worker.terminate();
+         return {
+             type: "image",
+             text: result.data.text,
+             metadata: {
+                 language: lang,
+                 confidence: result.data.confidence,
+             },
+         };
+     }
+     catch (error) {
+         throw {
+             message: "Failed to parse image file",
+             code: "IMAGE_PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
+ // main parse function
+ async function parse(pathOrBuffer, options = {}, filename) {
+     try {
+         // Get file buffer
+         const buffer = typeof pathOrBuffer === "string" ? (0, fs_1.readFileSync)(pathOrBuffer) : pathOrBuffer;
+         // Detect file type (pass filename hint if we have it)
+         const fileType = detectFileType(pathOrBuffer, filename);
+         // Parse based on file type
+         switch (fileType) {
+             case "pdf":
+                 return await parsePDF(buffer);
+             case "docx":
+                 return await parseDOCX(buffer);
+             case "csv":
+                 return parseCSV(buffer, options);
+             case "text":
+                 return parseText(buffer);
+             case "xml":
+                 return parseXML(buffer, options);
+             case "json":
+                 return parseJSON(buffer);
+             case "image":
+                 return await parseImage(buffer, options);
+             default:
+                 throw new Error(`Unsupported file type: ${fileType}`);
+         }
+     }
+     catch (error) {
+         const searchError = error;
+         if (searchError.code) {
+             throw searchError;
+         }
+         throw {
+             message: `Failed to parse file: ${error instanceof Error ? error.message : String(error)}`,
+             code: "PARSE_ERROR",
+             originalError: error,
+         };
+     }
+ }
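
For orientation, a minimal usage sketch of the `parse` entry point added above. The deep import path and file names are illustrative assumptions (the package may re-export `parse` from its index); the error shape `{ message, code, originalError }` follows the plain objects thrown in the code.

```ts
// Hypothetical usage of dist/modules/parser.js; the import path is an assumption.
import { parse } from "llm-search-tools/dist/modules/parser";

async function demo() {
  // Path input: the ".pdf" extension drives type detection.
  const pdf = await parse("sample.pdf"); // "sample.pdf" is a hypothetical file
  console.log(pdf.type, pdf.metadata.pages);

  // Buffer input with a filename hint and CSV options.
  const csv = await parse(Buffer.from("name,age\nAlice,30"), { csv: { delimiter: "," } }, "data.csv");
  console.log(csv.data); // [{ name: "Alice", age: "30" }]
}

demo().catch((err) => {
  // Parse failures are thrown as plain objects, not Error instances.
  console.error(err.code, err.message);
});
```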
package/dist/modules/parser.test.d.ts
@@ -0,0 +1 @@
+ export {};
package/dist/modules/parser.test.js
@@ -0,0 +1,151 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const vitest_1 = require("vitest");
+ const parser_1 = require("./parser");
+ const pdf_parse_1 = __importDefault(require("pdf-parse"));
+ const mammoth_1 = __importDefault(require("mammoth"));
+ const tesseract_js_1 = require("tesseract.js");
+ const fs_1 = require("fs");
+ // Mock dependencies
+ vitest_1.vi.mock("pdf-parse", () => ({
+     default: vitest_1.vi.fn(),
+ }));
+ vitest_1.vi.mock("mammoth");
+ vitest_1.vi.mock("tesseract.js");
+ vitest_1.vi.mock("fs", async () => {
+     const actual = await vitest_1.vi.importActual("fs");
+     return {
+         ...actual,
+         readFileSync: vitest_1.vi.fn(),
+         // We don't mock writeFileSync/unlinkSync as they are used for temp files in docx parsing
+         // and we want that to work or we mock the whole flow.
+         // For now, let's just mock readFileSync for the input file reading.
+     };
+ });
+ (0, vitest_1.describe)("Parser Module", () => {
+     (0, vitest_1.beforeEach)(() => {
+         vitest_1.vi.resetAllMocks();
+     });
+     (0, vitest_1.describe)("Text Parsing", () => {
+         (0, vitest_1.it)("should parse text from buffer", async () => {
+             const buffer = Buffer.from("Hello world");
+             const result = await (0, parser_1.parse)(buffer);
+             (0, vitest_1.expect)(result).toEqual({
+                 type: "text",
+                 text: "Hello world",
+                 metadata: {
+                     length: 11,
+                     lines: 1,
+                 },
+             });
+         });
+         (0, vitest_1.it)("should parse text from file path", async () => {
+             fs_1.readFileSync.mockReturnValue(Buffer.from("Hello file"));
+             const result = await (0, parser_1.parse)("test.txt");
+             (0, vitest_1.expect)(result).toEqual({
+                 type: "text",
+                 text: "Hello file",
+                 metadata: {
+                     length: 10,
+                     lines: 1,
+                 },
+             });
+             (0, vitest_1.expect)(fs_1.readFileSync).toHaveBeenCalledWith("test.txt");
+         });
+     });
+     (0, vitest_1.describe)("JSON Parsing", () => {
+         (0, vitest_1.it)("should parse JSON from buffer", async () => {
+             const data = { key: "value" };
+             const buffer = Buffer.from(JSON.stringify(data));
+             const result = await (0, parser_1.parse)(buffer, {}, "test.json");
+             (0, vitest_1.expect)(result.type).toBe("json");
+             (0, vitest_1.expect)(result.data).toEqual(data);
+         });
+         (0, vitest_1.it)("should detect JSON without extension", async () => {
+             const data = { key: "value" };
+             const buffer = Buffer.from(JSON.stringify(data));
+             const result = await (0, parser_1.parse)(buffer);
+             (0, vitest_1.expect)(result.type).toBe("json");
+         });
+     });
+     (0, vitest_1.describe)("CSV Parsing", () => {
+         (0, vitest_1.it)("should parse CSV from buffer", async () => {
+             const csv = "name,age\nAlice,30\nBob,25";
+             const buffer = Buffer.from(csv);
+             const result = await (0, parser_1.parse)(buffer, {}, "test.csv");
+             (0, vitest_1.expect)(result.type).toBe("csv");
+             (0, vitest_1.expect)(Array.isArray(result.data)).toBe(true);
+             const data = result.data;
+             (0, vitest_1.expect)(data).toHaveLength(2);
+             (0, vitest_1.expect)(data[0]).toEqual({ name: "Alice", age: "30" });
+         });
+     });
+     (0, vitest_1.describe)("PDF Parsing", () => {
+         (0, vitest_1.it)("should parse PDF using pdf-parse", async () => {
+             const mockData = {
+                 numpages: 1,
+                 info: {},
+                 metadata: {},
+                 version: "1.0",
+                 text: "PDF content",
+             };
+             pdf_parse_1.default.mockResolvedValue(mockData);
+             const buffer = Buffer.from("%PDF-1.5"); // Magic bytes for PDF
+             const result = await (0, parser_1.parse)(buffer);
+             (0, vitest_1.expect)(result.type).toBe("pdf");
+             (0, vitest_1.expect)(result.text).toBe("PDF content");
+             (0, vitest_1.expect)(pdf_parse_1.default).toHaveBeenCalledWith(buffer);
+         });
+     });
+     (0, vitest_1.describe)("DOCX Parsing", () => {
+         (0, vitest_1.it)("should parse DOCX using mammoth", async () => {
+             const mockResult = { value: "DOCX content", messages: [] };
+             // Mock convertToHtml since that's what the parser uses first
+             mammoth_1.default.convertToHtml.mockResolvedValue(mockResult);
+             // Magic bytes for DOCX (PK zip header)
+             const buffer = Buffer.from("504b0304", "hex");
+             // We need to hint extension or provide magic bytes that match zip/docx
+             // The parser checks magic bytes '504b' -> 'docx'
+             const result = await (0, parser_1.parse)(buffer);
+             (0, vitest_1.expect)(result.type).toBe("docx");
+             (0, vitest_1.expect)(result.text).toBe("DOCX content");
+         });
+     });
+     (0, vitest_1.describe)("Image OCR", () => {
+         (0, vitest_1.it)("should parse image using tesseract", async () => {
+             const mockWorker = {
+                 reinitialize: vitest_1.vi.fn(),
+                 recognize: vitest_1.vi.fn().mockResolvedValue({
+                     data: {
+                         text: "OCR Text",
+                         confidence: 90,
+                     },
+                 }),
+                 terminate: vitest_1.vi.fn(),
+             };
+             tesseract_js_1.createWorker.mockResolvedValue(mockWorker);
+             // Magic bytes for PNG
+             const buffer = Buffer.from("89504e47", "hex");
+             const result = await (0, parser_1.parse)(buffer);
+             (0, vitest_1.expect)(result.type).toBe("image");
+             (0, vitest_1.expect)(result.text).toBe("OCR Text");
+             (0, vitest_1.expect)(mockWorker.recognize).toHaveBeenCalledWith(buffer);
+             (0, vitest_1.expect)(mockWorker.terminate).toHaveBeenCalled();
+         });
+     });
+     (0, vitest_1.describe)("Error Handling", () => {
+         (0, vitest_1.it)("should throw error for unsupported file type", async () => {
+             // Empty buffer returns "unknown" type because it fails the text check (trim())
+             const buffer = Buffer.from("");
+             await (0, vitest_1.expect)((0, parser_1.parse)(buffer)).rejects.toThrow("Unsupported file type: unknown");
+         });
+         (0, vitest_1.it)("should propagate parsing errors", async () => {
+             const buffer = Buffer.from("{ invalid json");
+             // Expect the specific error from the JSON parser, not the generic wrapper
+             await (0, vitest_1.expect)((0, parser_1.parse)(buffer, {}, "test.json")).rejects.toThrow("Failed to parse JSON file");
+         });
+     });
+ });
package/dist/modules/reddit.d.ts
@@ -0,0 +1,21 @@
+ interface RedditPost {
+     title: string;
+     id: string;
+     url: string;
+     permalink: string;
+     author: string;
+     score: number;
+     num_comments: number;
+     created_utc: number;
+     selftext?: string;
+     subreddit: string;
+     is_self: boolean;
+ }
+ export declare function getSubredditHot(subreddit: string, limit?: number): Promise<RedditPost[]>;
+ export declare function searchReddit(query: string, options?: {
+     sort?: 'hot' | 'new' | 'top' | 'relevance';
+     limit?: number;
+     type?: 'posts' | 'comments';
+ }): Promise<RedditPost[]>;
+ export declare function getPostFromUrl(url: string): Promise<RedditPost>;
+ export {};
package/dist/modules/reddit.js
@@ -0,0 +1,107 @@
+ "use strict";
+ // reddit.ts - handles reddit stuff like getting posts n searching
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.getSubredditHot = getSubredditHot;
+ exports.searchReddit = searchReddit;
+ exports.getPostFromUrl = getPostFromUrl;
+ // Get hot posts from a subreddit
+ async function getSubredditHot(subreddit, limit = 25) {
+     try {
+         const url = `https://www.reddit.com/r/${subreddit}/hot.json?limit=${limit}`;
+         const response = await fetch(url);
+         if (!response.ok) {
+             throw new Error(`HTTP error! status: ${response.status}`);
+         }
+         const data = await response.json();
+         return data.data.children.map((post) => ({
+             title: post.data.title,
+             id: post.data.id,
+             url: post.data.url,
+             permalink: `https://reddit.com${post.data.permalink}`,
+             author: post.data.author,
+             score: post.data.score,
+             num_comments: post.data.num_comments,
+             created_utc: post.data.created_utc,
+             selftext: post.data.selftext,
+             subreddit: post.data.subreddit,
+             is_self: post.data.is_self
+         }));
+     }
+     catch (error) {
+         throw {
+             message: `failed to get subreddit posts: ${error instanceof Error ? error.message : String(error)}`,
+             code: 'REDDIT_ERROR',
+             originalError: error
+         };
+     }
+ }
+ // Search Reddit posts
+ async function searchReddit(query, options = {}) {
+     try {
+         const params = new URLSearchParams({
+             q: query,
+             sort: options.sort || 'hot',
+             type: options.type || 'posts',
+             limit: String(options.limit || 25)
+         });
+         const url = `https://www.reddit.com/search.json?${params.toString()}`;
+         const response = await fetch(url);
+         if (!response.ok) {
+             throw new Error(`HTTP error! status: ${response.status}`);
+         }
+         const data = await response.json();
+         return data.data.children.map((post) => ({
+             title: post.data.title,
+             id: post.data.id,
+             url: post.data.url,
+             permalink: `https://reddit.com${post.data.permalink}`,
+             author: post.data.author,
+             score: post.data.score,
+             num_comments: post.data.num_comments,
+             created_utc: post.data.created_utc,
+             selftext: post.data.selftext,
+             subreddit: post.data.subreddit,
+             is_self: post.data.is_self
+         }));
+     }
+     catch (error) {
+         throw {
+             message: `reddit search failed: ${error instanceof Error ? error.message : String(error)}`,
+             code: 'REDDIT_SEARCH_ERROR',
+             originalError: error
+         };
+     }
+ }
+ // Get post info by URL
+ async function getPostFromUrl(url) {
+     try {
+         // Convert post URL to .json URL
+         const jsonUrl = url.replace(/\/?$/, '.json');
+         const response = await fetch(jsonUrl);
+         if (!response.ok) {
+             throw new Error(`HTTP error! status: ${response.status}`);
+         }
+         const data = await response.json();
+         const post = data[0].data.children[0].data;
+         return {
+             title: post.title,
+             id: post.id,
+             url: post.url,
+             permalink: `https://reddit.com${post.permalink}`,
+             author: post.author,
+             score: post.score,
+             num_comments: post.num_comments,
+             created_utc: post.created_utc,
+             selftext: post.selftext,
+             subreddit: post.subreddit,
+             is_self: post.is_self
+         };
+     }
+     catch (error) {
+         throw {
+             message: `failed to get reddit post: ${error instanceof Error ? error.message : String(error)}`,
+             code: 'REDDIT_POST_ERROR',
+             originalError: error
+         };
+     }
+ }
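
A short sketch of calling the Reddit helpers above. The subreddit name, query, and deep import path are illustrative assumptions; the endpoints hit are the public reddit.com `.json` URLs built in the code.

```ts
// Hypothetical usage of dist/modules/reddit.js; the import path is an assumption.
import { getSubredditHot, searchReddit, getPostFromUrl } from "llm-search-tools/dist/modules/reddit";

async function demo() {
  // Five hot posts from an example subreddit.
  const hot = await getSubredditHot("typescript", 5);
  hot.forEach((p) => console.log(p.score, p.title));

  // Search posts sorted by relevance (the code defaults to sort "hot", limit 25).
  const found = await searchReddit("web scraping", { sort: "relevance", limit: 10 });

  // Look up a single post by permalink; the code appends ".json" to the URL.
  const post = await getPostFromUrl(found[0].permalink);
  console.log(post.num_comments);
}

demo().catch((err) => console.error(err.code, err.message));
```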
package/dist/modules/scrape.d.ts
@@ -0,0 +1,16 @@
+ import { ScraperOptions, WebpageContent } from "../types";
+ export interface NormalizeContentParams {
+     url: string;
+     html: string;
+     title?: string;
+     siteName?: string;
+     fallbackFavicon?: string;
+     skipReadability?: boolean;
+ }
+ export declare function normalizeContent(params: NormalizeContentParams): WebpageContent;
+ export declare function getWebpageContent(url: string, options?: ({
+     usePuppeteer?: boolean;
+ } & ScraperOptions) | boolean): Promise<WebpageContent>;
+ export declare function getWebpageText(url: string, options?: {
+     usePuppeteer?: boolean;
+ } & ScraperOptions): Promise<string>;
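
Per these declarations, `getWebpageContent` also accepts a bare boolean in place of the options object. A hedged sketch (the URL and deep import path are illustrative; `WebpageContent`'s fields are defined in types.d.ts, which is not shown in this diff):

```ts
// Hypothetical usage of dist/modules/scrape.js; the import path is an assumption.
import { getWebpageContent, getWebpageText } from "llm-search-tools/dist/modules/scrape";

async function demo() {
  // Object options form: usePuppeteer combined with ScraperOptions fields.
  const content = await getWebpageContent("https://example.com", { usePuppeteer: true });
  console.log(content); // WebpageContent; field names live in the unshown types.d.ts

  // Plain-text extraction without Puppeteer.
  const text = await getWebpageText("https://example.com");
  console.log(text.slice(0, 200));
}

demo().catch(console.error);
```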