llm-search-tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +244 -0
  3. package/dist/index.d.ts +18 -0
  4. package/dist/index.js +40 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/integration.test.d.ts +1 -0
  7. package/dist/integration.test.js +237 -0
  8. package/dist/modules/answerbox.test.d.ts +1 -0
  9. package/dist/modules/answerbox.test.js +105 -0
  10. package/dist/modules/autocomplete.d.ts +11 -0
  11. package/dist/modules/autocomplete.js +159 -0
  12. package/dist/modules/autocomplete.test.d.ts +1 -0
  13. package/dist/modules/autocomplete.test.js +188 -0
  14. package/dist/modules/common.d.ts +26 -0
  15. package/dist/modules/common.js +263 -0
  16. package/dist/modules/common.test.d.ts +1 -0
  17. package/dist/modules/common.test.js +87 -0
  18. package/dist/modules/crawl.d.ts +9 -0
  19. package/dist/modules/crawl.js +117 -0
  20. package/dist/modules/crawl.test.d.ts +1 -0
  21. package/dist/modules/crawl.test.js +48 -0
  22. package/dist/modules/events.d.ts +8 -0
  23. package/dist/modules/events.js +129 -0
  24. package/dist/modules/events.test.d.ts +1 -0
  25. package/dist/modules/events.test.js +104 -0
  26. package/dist/modules/finance.d.ts +10 -0
  27. package/dist/modules/finance.js +20 -0
  28. package/dist/modules/finance.test.d.ts +1 -0
  29. package/dist/modules/finance.test.js +77 -0
  30. package/dist/modules/flights.d.ts +8 -0
  31. package/dist/modules/flights.js +135 -0
  32. package/dist/modules/flights.test.d.ts +1 -0
  33. package/dist/modules/flights.test.js +128 -0
  34. package/dist/modules/hackernews.d.ts +8 -0
  35. package/dist/modules/hackernews.js +87 -0
  36. package/dist/modules/hackernews.js.map +1 -0
  37. package/dist/modules/images.test.d.ts +1 -0
  38. package/dist/modules/images.test.js +145 -0
  39. package/dist/modules/integrations.test.d.ts +1 -0
  40. package/dist/modules/integrations.test.js +93 -0
  41. package/dist/modules/media.d.ts +11 -0
  42. package/dist/modules/media.js +132 -0
  43. package/dist/modules/media.test.d.ts +1 -0
  44. package/dist/modules/media.test.js +186 -0
  45. package/dist/modules/news.d.ts +3 -0
  46. package/dist/modules/news.js +39 -0
  47. package/dist/modules/news.test.d.ts +1 -0
  48. package/dist/modules/news.test.js +88 -0
  49. package/dist/modules/parser.d.ts +19 -0
  50. package/dist/modules/parser.js +361 -0
  51. package/dist/modules/parser.test.d.ts +1 -0
  52. package/dist/modules/parser.test.js +151 -0
  53. package/dist/modules/reddit.d.ts +21 -0
  54. package/dist/modules/reddit.js +107 -0
  55. package/dist/modules/scrape.d.ts +16 -0
  56. package/dist/modules/scrape.js +272 -0
  57. package/dist/modules/scrape.test.d.ts +1 -0
  58. package/dist/modules/scrape.test.js +232 -0
  59. package/dist/modules/scraper.d.ts +12 -0
  60. package/dist/modules/scraper.js +640 -0
  61. package/dist/modules/scrapers/anidb.d.ts +8 -0
  62. package/dist/modules/scrapers/anidb.js +156 -0
  63. package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
  64. package/dist/modules/scrapers/duckduckgo.js +284 -0
  65. package/dist/modules/scrapers/google-news.d.ts +2 -0
  66. package/dist/modules/scrapers/google-news.js +60 -0
  67. package/dist/modules/scrapers/google.d.ts +6 -0
  68. package/dist/modules/scrapers/google.js +211 -0
  69. package/dist/modules/scrapers/searxng.d.ts +2 -0
  70. package/dist/modules/scrapers/searxng.js +93 -0
  71. package/dist/modules/scrapers/thetvdb.d.ts +3 -0
  72. package/dist/modules/scrapers/thetvdb.js +147 -0
  73. package/dist/modules/scrapers/tmdb.d.ts +3 -0
  74. package/dist/modules/scrapers/tmdb.js +172 -0
  75. package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
  76. package/dist/modules/scrapers/yahoo-finance.js +33 -0
  77. package/dist/modules/search.d.ts +5 -0
  78. package/dist/modules/search.js +45 -0
  79. package/dist/modules/search.js.map +1 -0
  80. package/dist/modules/search.test.d.ts +1 -0
  81. package/dist/modules/search.test.js +219 -0
  82. package/dist/modules/urbandictionary.d.ts +12 -0
  83. package/dist/modules/urbandictionary.js +26 -0
  84. package/dist/modules/webpage.d.ts +4 -0
  85. package/dist/modules/webpage.js +150 -0
  86. package/dist/modules/webpage.js.map +1 -0
  87. package/dist/modules/wikipedia.d.ts +5 -0
  88. package/dist/modules/wikipedia.js +85 -0
  89. package/dist/modules/wikipedia.js.map +1 -0
  90. package/dist/scripts/interactive-search.d.ts +1 -0
  91. package/dist/scripts/interactive-search.js +98 -0
  92. package/dist/test.d.ts +1 -0
  93. package/dist/test.js +179 -0
  94. package/dist/test.js.map +1 -0
  95. package/dist/testBraveSearch.d.ts +1 -0
  96. package/dist/testBraveSearch.js +34 -0
  97. package/dist/testDuckDuckGo.d.ts +1 -0
  98. package/dist/testDuckDuckGo.js +52 -0
  99. package/dist/testEcosia.d.ts +1 -0
  100. package/dist/testEcosia.js +57 -0
  101. package/dist/testSearchModule.d.ts +1 -0
  102. package/dist/testSearchModule.js +95 -0
  103. package/dist/testwebpage.d.ts +1 -0
  104. package/dist/testwebpage.js +81 -0
  105. package/dist/types.d.ts +174 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/createTestDocx.d.ts +1 -0
  109. package/dist/utils/createTestDocx.js +58 -0
  110. package/dist/utils/htmlcleaner.d.ts +20 -0
  111. package/dist/utils/htmlcleaner.js +172 -0
  112. package/docs/README.md +275 -0
  113. package/docs/autocomplete.md +73 -0
  114. package/docs/crawling.md +88 -0
  115. package/docs/events.md +58 -0
  116. package/docs/examples.md +158 -0
  117. package/docs/finance.md +60 -0
  118. package/docs/flights.md +71 -0
  119. package/docs/hackernews.md +121 -0
  120. package/docs/media.md +87 -0
  121. package/docs/news.md +75 -0
  122. package/docs/parser.md +197 -0
  123. package/docs/scraper.md +347 -0
  124. package/docs/search.md +106 -0
  125. package/docs/wikipedia.md +91 -0
  126. package/package.json +97 -0
@@ -0,0 +1,174 @@
1
+ export interface ProxyConfig { // Structured proxy settings; accepted (as an alternative to a plain string) by the `proxy` option of ScraperOptions, SearchOptions and AutocompleteOptions.
2
+ type: "http" | "https" | "socks4" | "socks5"; // proxy protocol, one of four literals
3
+ host: string;
4
+ port: number;
5
+ auth?: { // optional credentials; both fields are required when present
6
+ username: string;
7
+ password: string;
8
+ };
9
+ url?: string; // NOTE(review): presumably a complete proxy URL; how it interacts with host/port is not visible here — confirm
10
+ }
11
+ export interface ScraperOptions { // Common option bag (all fields optional); extended by CrawlOptions, FlightSearchOptions, EventSearchOptions and MediaSearchOptions.
12
+ limit?: number; // NOTE(review): presumably the maximum number of results — confirm against the scrapers
13
+ safeSearch?: boolean;
14
+ timeout?: number; // NOTE(review): unit is not stated here (presumably milliseconds) — confirm
15
+ forcePuppeteer?: boolean;
16
+ proxy?: ProxyConfig | string; // either a structured ProxyConfig or a plain string
17
+ antiBot?: { // optional block with an enable flag and retry settings; every field is optional
18
+ enabled?: boolean;
19
+ maxRetries?: number;
20
+ retryDelay?: number; // NOTE(review): unit is not stated here — confirm
21
+ };
22
+ searxngInstance?: string; // NOTE(review): presumably the address of the SearXNG instance to query — confirm
23
+ category?: "web" | "news" | "images" | "videos";
24
+ }
25
+ export interface SearchOptions { // Same fields as ScraperOptions except that `category` is absent; declared independently rather than derived from it.
26
+ limit?: number; // NOTE(review): presumably the maximum number of results — confirm
27
+ safeSearch?: boolean;
28
+ timeout?: number; // NOTE(review): unit is not stated here (presumably milliseconds) — confirm
29
+ forcePuppeteer?: boolean;
30
+ proxy?: ProxyConfig | string; // either a structured ProxyConfig or a plain string
31
+ antiBot?: { // optional block with an enable flag and retry settings; every field is optional
32
+ enabled?: boolean;
33
+ maxRetries?: number;
34
+ retryDelay?: number; // NOTE(review): unit is not stated here — confirm
35
+ };
36
+ searxngInstance?: string; // NOTE(review): presumably the address of the SearXNG instance to query — confirm
37
+ }
38
+ export interface SearchResult { // Base shape of a single result; extended by ImageResult, NewsResult, WikipediaResult and HackerNewsResult.
39
+ title: string;
40
+ url: string;
41
+ snippet?: string;
42
+ source: "google" | "duckduckgo" | "wikipedia" | "hackernews" | "searxng" | "google-news" | "duckduckgo-news" | "google-images" | "duckduckgo-images" | "searxng-images"; // literal tag; the closed set of values a result may carry
43
+ }
44
+ export interface ImageResult extends SearchResult { // SearchResult plus image details; `source` is narrowed to the three image literals.
45
+ imageUrl: string; // required, unlike the other image fields
46
+ thumbnailUrl?: string;
47
+ width?: number; // NOTE(review): presumably pixels — confirm
48
+ height?: number; // NOTE(review): presumably pixels — confirm
49
+ source: "google-images" | "duckduckgo-images" | "searxng-images"; // narrows SearchResult.source
50
+ }
51
+ export interface NewsResult extends SearchResult { // SearchResult plus optional news metadata.
52
+ sourceName?: string; // NOTE(review): presumably the publisher name — confirm
53
+ publishedAt?: string | Date; // may be a string or a Date, so consumers have to handle both
54
+ imageUrl?: string;
55
+ }
56
+ export interface WikipediaResult extends SearchResult { // SearchResult plus optional Wikipedia-specific fields.
57
+ extract?: string; // NOTE(review): presumably the article summary text — confirm
58
+ thumbnail?: string; // NOTE(review): presumably a thumbnail image URL — confirm
59
+ }
60
+ export interface HackerNewsResult extends SearchResult { // SearchResult plus optional Hacker News item metadata.
61
+ id?: number;
62
+ points?: number;
63
+ author?: string;
64
+ comments?: number; // a number, not a list — NOTE(review): presumably the comment count; confirm
65
+ time?: Date;
66
+ }
67
+ export interface FinanceResult { // Quote record tagged with source "yahoo-finance"; only `symbol` and `source` are required.
68
+ symbol: string;
69
+ shortName?: string;
70
+ longName?: string;
71
+ regularMarketPrice?: number; // NOTE(review): presumably expressed in `currency` — confirm
72
+ regularMarketChange?: number;
73
+ regularMarketChangePercent?: number; // NOTE(review): scale (0-1 vs 0-100) is not visible here — confirm
74
+ regularMarketTime?: Date;
75
+ currency?: string;
76
+ exchange?: string;
77
+ marketState?: string;
78
+ source: "yahoo-finance"; // single-literal tag
79
+ }
80
+ export interface WebpageContent { // Extracted page content; `content`, `textContent` and `length` are required. Extended by CrawledPage.
81
+ title?: string;
82
+ content: string; // NOTE(review): presumably the extracted HTML, as opposed to `textContent` — confirm
83
+ textContent: string;
84
+ length: number; // NOTE(review): which string this measures is not visible here — confirm
85
+ excerpt?: string;
86
+ siteName?: string;
87
+ favicon?: string;
88
+ markdown?: string;
89
+ imageUrls?: string[];
90
+ rawHtml?: string;
91
+ }
92
+ export interface SearchError { // Plain error record (an interface, not an Error subclass) with a message and a string code.
93
+ message: string;
94
+ code: string; // NOTE(review): the set of codes is not declared here — see the throwing modules
95
+ originalError?: unknown; // typed `unknown`, so consumers must narrow before use
96
+ }
97
+ export interface CrawlOptions extends ScraperOptions { // ScraperOptions plus optional crawl limits and behaviour switches.
98
+ maxPages?: number;
99
+ maxDepth?: number;
100
+ crawlType?: "cheerio" | "puppeteer"; // selects one of two crawl back ends by name
101
+ stayOnDomain?: boolean;
102
+ ignoreRobotsTxt?: boolean;
103
+ }
104
+ export interface CrawledPage extends WebpageContent { // WebpageContent plus the page URL and its crawl depth.
105
+ url: string;
106
+ depth: number; // NOTE(review): presumably link distance from the start URL — confirm where crawling begins counting
107
+ }
108
+ export type CrawlResult = CrawledPage[]; // a crawl result is simply an array of CrawledPage
109
+ export interface AutocompleteResult { // Suggestions returned for one query; all three fields are required.
110
+ query: string;
111
+ suggestions: string[];
112
+ source: string; // free-form string here, unlike the literal unions used by the other result types
113
+ }
114
+ export interface AutocompleteOptions { // Options for autocomplete; a three-field subset of the names used in ScraperOptions.
115
+ limit?: number; // NOTE(review): presumably the maximum number of suggestions — confirm
116
+ proxy?: ProxyConfig | string; // either a structured ProxyConfig or a plain string
117
+ timeout?: number; // NOTE(review): unit is not stated here (presumably milliseconds) — confirm
118
+ }
119
+ export interface Flight { // One flight entry; every detail (times, duration, price, stops) is carried as a string.
120
+ airline: string;
121
+ departureTime: string; // string, not Date — NOTE(review): format is not visible here
122
+ arrivalTime: string; // string, not Date — NOTE(review): format is not visible here
123
+ duration: string;
124
+ price: string; // string, not number — NOTE(review): presumably includes the currency text; confirm
125
+ stops: string;
126
+ origin?: string;
127
+ destination?: string;
128
+ }
129
+ export interface FlightResult { // A list of Flight entries together with a URL, tagged with source "google-flights".
130
+ flights: Flight[];
131
+ url: string; // NOTE(review): presumably the search page the flights were read from — confirm
132
+ source: "google-flights"; // single-literal tag
133
+ }
134
+ export interface FlightSearchOptions extends ScraperOptions { // ScraperOptions plus optional route and date fields, all strings.
135
+ departureDate?: string; // NOTE(review): expected date format is not visible here — confirm
136
+ returnDate?: string; // NOTE(review): expected date format is not visible here — confirm
137
+ from?: string;
138
+ to?: string;
139
+ }
140
+ export interface Event { // One event entry; `title`, `date` and `location` are required strings. Note the name shadows the global DOM `Event` type where both are in scope.
141
+ title: string;
142
+ date: string; // string, not Date — NOTE(review): format is not visible here
143
+ location: string;
144
+ link?: string;
145
+ description?: string;
146
+ image?: string; // NOTE(review): presumably an image URL — confirm
147
+ }
148
+ export interface EventResult { // A list of Event entries together with a URL, tagged with source "google-events".
149
+ events: Event[];
150
+ url: string; // NOTE(review): presumably the search page the events were read from — confirm
151
+ source: "google-events"; // single-literal tag
152
+ }
153
+ export interface EventSearchOptions extends ScraperOptions { // ScraperOptions plus an optional date-range keyword.
154
+ date?: "today" | "tomorrow" | "week" | "weekend" | "month" | "next_month"; // one of six fixed keywords, not a calendar date
155
+ }
156
+ export interface MediaResult { // One movie/TV/anime entry; `title`, `url`, `source` and `mediaType` are required.
157
+ title: string;
158
+ description?: string;
159
+ rating?: string; // string, not number — NOTE(review): scale/format is not visible here
160
+ releaseDate?: string; // string, not Date — NOTE(review): format is not visible here
161
+ cast?: string[];
162
+ genres?: string[];
163
+ posterUrl?: string;
164
+ watchProviders?: { // optional array of provider entries
165
+ name: string;
166
+ type: "stream" | "rent" | "buy";
167
+ }[];
168
+ url: string;
169
+ source: "tmdb" | "thetvdb" | "anidb"; // literal tag naming the originating site
170
+ mediaType: "movie" | "tv" | "anime"; // same three literals as MediaSearchOptions.type
171
+ }
172
+ export interface MediaSearchOptions extends ScraperOptions { // ScraperOptions plus an optional media-type selector.
173
+ type?: "movie" | "tv" | "anime"; // same three literals as MediaResult.mediaType
174
+ }
package/dist/types.js ADDED
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+ // types.ts - shared type definitions; this compiled module has no runtime exports and only flags itself as an ES module
3
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";AAAA,0CAA0C"}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,58 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ const jszip_1 = __importDefault(require("jszip"));
7
+ const fs_1 = require("fs");
/**
 * Builds a minimal DOCX fixture and writes it to `test/files/test.docx`.
 *
 * The archive holds the three parts added below: the main document, the
 * content-types manifest and the package relationships. The output path is
 * relative to the current working directory.
 *
 * @returns Promise that resolves once the file has been written
 * @throws Rejects if archive generation or any filesystem call fails
 */
async function createTestDocx() {
    const outputDir = 'test/files';
    const zip = new jszip_1.default();
    // Add required DOCX files
    zip.file('word/document.xml', `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r>
<w:t>Sample Test Document</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:t>This is a test DOCX document for llm-kit parser testing.</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:t>It contains some sample paragraphs to verify DOCX parsing.</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:t>Each paragraph should be properly extracted and formatted.</w:t>
</w:r>
</w:p>
</w:body>
</w:document>`);
    zip.file('[Content_Types].xml', `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>`);
    zip.file('_rels/.rels', `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>`);
    // Generate the DOCX file
    const content = await zip.generateAsync({
        type: 'nodebuffer',
        compression: 'DEFLATE',
        compressionOptions: {
            level: 9
        }
    });
    // writeFileSync does not create missing parent directories, so make sure
    // the fixture directory exists first (a no-op when it is already there).
    fs_1.mkdirSync(outputDir, { recursive: true });
    // Write to file
    fs_1.writeFileSync(`${outputDir}/test.docx`, content);
}
// Create the test DOCX file; report a failure and exit non-zero so that
// calling scripts can detect it.
createTestDocx().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Convert relative `href` and `src` attribute values in an HTML string to absolute URLs
3
+ * @param html The HTML string
4
+ * @param baseUrl The base URL for resolving relative URLs
5
+ * @returns HTML with absolute URLs; values that already carry a `data:`, `http:` or `https:` scheme are left unchanged
6
+ */
7
+ export declare function makeUrlsAbsolute(html: string, baseUrl: string): string;
8
+ /**
9
+ * Cleans HTML by removing invisible and non-content elements and most attributes, then minifies it while preserving visible content
10
+ * @param html The HTML string to clean
11
+ * @returns Cleaned and minified HTML with visible content preserved
12
+ */
13
+ export declare function cleanAndMinifyHtml(html: string): string;
14
+ /**
15
+ * Process HTML by first cleaning and minifying it, then converting its `href`/`src` URLs to absolute
16
+ * @param html The HTML string to process
17
+ * @param baseUrl The base URL for resolving relative URLs
18
+ * @returns Processed HTML
19
+ */
20
+ export declare function processHtml(html: string, baseUrl: string): string;
@@ -0,0 +1,172 @@
1
+ "use strict";
2
+ // utils/htmlcleaner.ts - utilities for processing and cleaning html
3
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
4
+ if (k2 === undefined) k2 = k;
5
+ var desc = Object.getOwnPropertyDescriptor(m, k);
6
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
7
+ desc = { enumerable: true, get: function() { return m[k]; } };
8
+ }
9
+ Object.defineProperty(o, k2, desc);
10
+ }) : (function(o, m, k, k2) {
11
+ if (k2 === undefined) k2 = k;
12
+ o[k2] = m[k];
13
+ }));
14
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
15
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
16
+ }) : function(o, v) {
17
+ o["default"] = v;
18
+ });
19
+ var __importStar = (this && this.__importStar) || (function () {
20
+ var ownKeys = function(o) {
21
+ ownKeys = Object.getOwnPropertyNames || function (o) {
22
+ var ar = [];
23
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
24
+ return ar;
25
+ };
26
+ return ownKeys(o);
27
+ };
28
+ return function (mod) {
29
+ if (mod && mod.__esModule) return mod;
30
+ var result = {};
31
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
32
+ __setModuleDefault(result, mod);
33
+ return result;
34
+ };
35
+ })();
36
+ Object.defineProperty(exports, "__esModule", { value: true });
37
+ exports.makeUrlsAbsolute = makeUrlsAbsolute;
38
+ exports.cleanAndMinifyHtml = cleanAndMinifyHtml;
39
+ exports.processHtml = processHtml;
40
+ const cheerio = __importStar(require("cheerio"));
/**
 * Process HTML to convert relative URLs to absolute ones
 *
 * Every `href` and `src` attribute is inspected. Values that already carry a
 * `data:`, `http:` or `https:` scheme are left as they are; everything else
 * is resolved against `baseUrl`. The result is the document as serialized by
 * cheerio's `$.html()`.
 *
 * @param html The HTML string
 * @param baseUrl The base URL for resolving relative URLs
 * @returns HTML with absolute URLs
 */
function makeUrlsAbsolute(html, baseUrl) {
    const $ = cheerio.load(html);
    // Match on the scheme including its colon. A bare prefix test on "http"
    // would also skip relative paths such as "httpdocs/page.html" and leave
    // them unresolved.
    const alreadyAbsolute = /^(?:data|https?):/i;
    // convert relative urls to absolute
    for (const attr of ["href", "src"]) {
        $(`[${attr}]`).each((_, element) => {
            const el = $(element);
            const value = el.attr(attr);
            if (!value || alreadyAbsolute.test(value)) {
                return;
            }
            try {
                el.attr(attr, new URL(value, baseUrl).toString());
            }
            catch {
                // keep original if URL parsing fails
            }
        });
    }
    return $.html();
}
/**
 * Cleans HTML by removing invisible elements and minifies it while preserving visible content
 *
 * Runs three passes in order: remove non-content and hidden elements, strip
 * every attribute that is not on the allow-list, then drop comments and
 * collapse whitespace in the serialized markup.
 *
 * @param html The HTML string to clean
 * @returns Cleaned and minified HTML with visible content preserved
 */
function cleanAndMinifyHtml(html) {
    // Load HTML into Cheerio
    const $ = cheerio.load(html);
    // Remove invisible and non-content elements
    const elementsToRemove = [
        "script",
        "style",
        "meta",
        'link[rel="stylesheet"]',
        'link[rel="preload"]',
        'link[rel="prefetch"]',
        "iframe",
        "noscript",
        "svg",
        "video",
        "object",
        "embed",
        "canvas",
        "template",
        "[hidden]",
        '[aria-hidden="true"]',
    ];
    elementsToRemove.forEach((selector) => {
        $(selector).remove();
    });
    // Inline-style hiding is matched case-insensitively and with any spacing
    // around the colon. Exact-substring selectors miss "display : none" or
    // "DISPLAY:NONE", and because the style attribute is stripped below, such
    // nodes would otherwise show up as visible content.
    const hiddenStyle = /(?:display\s*:\s*none|visibility\s*:\s*hidden)/i;
    $("[style]")
        .filter((_, element) => hiddenStyle.test($(element).attr("style") ?? ""))
        .remove();
    // Attributes kept on any element: structural, basic table formatting and
    // semantic ones. Everything else, event handlers (on*) included, is removed.
    const keptAttributes = new Set([
        "href",
        "src",
        "alt",
        "title",
        "colspan",
        "rowspan",
        "role",
        "aria-label",
    ]);
    // `id` survives only on these tags (header identification).
    const idKeepingTags = new Set(["h1", "h2", "h3", "h4", "h5", "h6", "header"]);
    $("*").each((_, element) => {
        const el = $(element);
        const tagName = el.prop("tagName")?.toLowerCase();
        // Object.keys returns a snapshot, so removing while iterating is safe.
        for (const attr of Object.keys(element.attribs ?? {})) {
            const keepAttribute = keptAttributes.has(attr) ||
                (attr === "id" && idKeepingTags.has(tagName));
            if (!keepAttribute) {
                el.removeAttr(attr);
            }
        }
    });
    // Basic HTML minification
    // NOTE(review): whitespace between adjacent tags is removed entirely, so
    // text in neighbouring inline elements (e.g. `<a>A</a> <a>B</a>`) ends up
    // adjacent, and whitespace inside <pre> is collapsed as well.
    return $.html()
        // Remove comments
        .replace(/<!--[\s\S]*?-->/g, "")
        // Remove extra whitespace
        .replace(/\s{2,}/g, " ")
        // Remove whitespace between tags
        .replace(/>\s+</g, "><")
        // Remove whitespace at start/end of each line
        .replace(/^\s+|\s+$/gm, "")
        // Remove unnecessary line breaks while keeping some structure
        .replace(/\n+/g, "\n");
}
/**
 * Runs the full HTML pipeline: clean and minify, then make URLs absolute.
 * @param html The HTML string to process
 * @param baseUrl The base URL for resolving relative URLs
 * @returns Processed HTML
 */
function processHtml(html, baseUrl) {
    // cleaning happens first; URL resolution then works on the cleaned markup
    return makeUrlsAbsolute(cleanAndMinifyHtml(html), baseUrl);
}