scrapex 0.5.2 → 1.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +392 -145
  3. package/dist/enhancer-Q6CSc1gA.mjs +220 -0
  4. package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
  5. package/dist/enhancer-oM4BhYYS.cjs +268 -0
  6. package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
  7. package/dist/index.cjs +852 -0
  8. package/dist/index.cjs.map +1 -0
  9. package/dist/index.d.cts +264 -0
  10. package/dist/index.d.cts.map +1 -0
  11. package/dist/index.d.mts +264 -0
  12. package/dist/index.d.mts.map +1 -0
  13. package/dist/index.mjs +798 -0
  14. package/dist/index.mjs.map +1 -0
  15. package/dist/llm/index.cjs +316 -0
  16. package/dist/llm/index.cjs.map +1 -0
  17. package/dist/llm/index.d.cts +211 -0
  18. package/dist/llm/index.d.cts.map +1 -0
  19. package/dist/llm/index.d.mts +211 -0
  20. package/dist/llm/index.d.mts.map +1 -0
  21. package/dist/llm/index.mjs +310 -0
  22. package/dist/llm/index.mjs.map +1 -0
  23. package/dist/parsers/index.cjs +200 -0
  24. package/dist/parsers/index.cjs.map +1 -0
  25. package/dist/parsers/index.d.cts +133 -0
  26. package/dist/parsers/index.d.cts.map +1 -0
  27. package/dist/parsers/index.d.mts +133 -0
  28. package/dist/parsers/index.d.mts.map +1 -0
  29. package/dist/parsers/index.mjs +192 -0
  30. package/dist/parsers/index.mjs.map +1 -0
  31. package/dist/types-CNQZVW36.d.mts +150 -0
  32. package/dist/types-CNQZVW36.d.mts.map +1 -0
  33. package/dist/types-D0HYR95H.d.cts +150 -0
  34. package/dist/types-D0HYR95H.d.cts.map +1 -0
  35. package/package.json +80 -100
  36. package/dist/index.d.ts +0 -45
  37. package/dist/index.js +0 -8
  38. package/dist/scrapex.cjs.development.js +0 -1128
  39. package/dist/scrapex.cjs.development.js.map +0 -1
  40. package/dist/scrapex.cjs.production.min.js +0 -2
  41. package/dist/scrapex.cjs.production.min.js.map +0 -1
  42. package/dist/scrapex.esm.js +0 -1120
  43. package/dist/scrapex.esm.js.map +0 -1
@@ -0,0 +1,264 @@
1
+ import { a as ExtractedLink, c as ExtractionSchemaType, d as FetchResult, f as Fetcher, h as ScrapedData, i as ExtractedEntities, l as Extractor, m as ScrapeOptions, n as ContentType, o as ExtractionContext, p as LLMProvider, r as EnhancementType, s as ExtractionSchema, t as CompletionOptions, u as FetchOptions } from "./types-CNQZVW36.mjs";
2
+
3
+ //#region src/core/context.d.ts
4
+
5
+ /**
6
+ * Create an extraction context with lazy JSDOM loading.
7
+ *
8
+ * Cheerio is always available for fast DOM queries.
9
+ * JSDOM is only loaded when getDocument() is called (for Readability).
10
+ */
11
+ declare function createExtractionContext(url: string, finalUrl: string, html: string, options: ScrapeOptions): ExtractionContext;
12
+ /**
13
+ * Merge partial results into the context
14
+ */
15
+ declare function mergeResults(context: ExtractionContext, extracted: Partial<ScrapedData>): ExtractionContext;
16
+ //#endregion
17
+ //#region src/core/errors.d.ts
18
+ /**
19
+ * Error codes for scraping failures
20
+ */
21
+ type ScrapeErrorCode = 'FETCH_FAILED' | 'TIMEOUT' | 'INVALID_URL' | 'BLOCKED' | 'NOT_FOUND' | 'ROBOTS_BLOCKED' | 'PARSE_ERROR' | 'LLM_ERROR' | 'VALIDATION_ERROR';
22
+ /**
23
+ * Custom error class for scraping failures with structured error codes
24
+ */
25
+ declare class ScrapeError extends Error {
26
+ readonly code: ScrapeErrorCode;
27
+ readonly statusCode?: number;
28
+ constructor(message: string, code: ScrapeErrorCode, statusCode?: number, cause?: Error);
29
+ /**
30
+ * Create a ScrapeError from an unknown error
31
+ */
32
+ static from(error: unknown, code?: ScrapeErrorCode): ScrapeError;
33
+ /**
34
+ * Check if error is retryable (network issues, timeouts)
35
+ */
36
+ isRetryable(): boolean;
37
+ /**
38
+ * Convert to a plain object for serialization
39
+ */
40
+ toJSON(): Record<string, unknown>;
41
+ }
42
+ //#endregion
43
+ //#region src/core/scrape.d.ts
44
+ /**
45
+ * Scrape a URL and extract metadata and content.
46
+ *
47
+ * @param url - The URL to scrape
48
+ * @param options - Scraping options
49
+ * @returns Scraped data with metadata and content
50
+ *
51
+ * @example
52
+ * ```ts
53
+ * const result = await scrape('https://example.com/article');
54
+ * console.log(result.title, result.content);
55
+ * ```
56
+ */
57
+ declare function scrape(url: string, options?: ScrapeOptions): Promise<ScrapedData>;
58
+ /**
59
+ * Scrape from raw HTML string (no fetch).
60
+ *
61
+ * @param html - The HTML content
62
+ * @param url - The URL (for resolving relative links)
63
+ * @param options - Scraping options
64
+ * @returns Scraped data with metadata and content
65
+ *
66
+ * @example
67
+ * ```ts
68
+ * const html = await fetchSomehow('https://example.com');
69
+ * const result = await scrapeHtml(html, 'https://example.com');
70
+ * ```
71
+ */
72
+ declare function scrapeHtml(html: string, url: string, options?: ScrapeOptions): Promise<ScrapedData>;
73
+ //#endregion
74
+ //#region src/extractors/content.d.ts
75
+ /**
76
+ * Extracts main content using Mozilla Readability.
77
+ * Converts HTML to Markdown for LLM consumption.
78
+ */
79
+ declare class ContentExtractor implements Extractor {
80
+ readonly name = "content";
81
+ readonly priority = 50;
82
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
83
+ private extractFallback;
84
+ private createExcerpt;
85
+ private detectContentType;
86
+ }
87
+ //#endregion
88
+ //#region src/extractors/favicon.d.ts
89
+ /**
90
+ * Extracts favicon URL from the page.
91
+ * Checks multiple sources in order of preference.
92
+ */
93
+ declare class FaviconExtractor implements Extractor {
94
+ readonly name = "favicon";
95
+ readonly priority = 70;
96
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
97
+ }
98
+ //#endregion
99
+ //#region src/extractors/jsonld.d.ts
100
+ /**
101
+ * Extracts JSON-LD structured data from the page.
102
+ * Also extracts additional metadata from structured data.
103
+ */
104
+ declare class JsonLdExtractor implements Extractor {
105
+ readonly name = "jsonld";
106
+ readonly priority = 80;
107
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
108
+ private extractMetadata;
109
+ private getType;
110
+ private getString;
111
+ private getAuthor;
112
+ private getImage;
113
+ private getKeywords;
114
+ }
115
+ //#endregion
116
+ //#region src/extractors/links.d.ts
117
+ /**
118
+ * Extracts links from the page content.
119
+ * Filters out navigation/footer links and focuses on content links.
120
+ */
121
+ declare class LinksExtractor implements Extractor {
122
+ readonly name = "links";
123
+ readonly priority = 30;
124
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
125
+ }
126
+ //#endregion
127
+ //#region src/extractors/meta.d.ts
128
+ /**
129
+ * Extracts metadata from HTML meta tags, Open Graph, and Twitter cards.
130
+ * Runs first to provide basic metadata for other extractors.
131
+ */
132
+ declare class MetaExtractor implements Extractor {
133
+ readonly name = "meta";
134
+ readonly priority = 100;
135
+ extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
136
+ }
137
+ //#endregion
138
+ //#region src/extractors/index.d.ts
139
+ /**
140
+ * Default extractors in priority order.
141
+ * Higher priority runs first.
142
+ */
143
+ declare function createDefaultExtractors(): Extractor[];
144
+ /**
145
+ * Sort extractors by priority (higher first).
146
+ */
147
+ declare function sortExtractors(extractors: Extractor[]): Extractor[];
148
+ //#endregion
149
+ //#region src/fetchers/types.d.ts
150
+ /**
151
+ * Fetcher interface - allows swapping fetch implementation
152
+ * for Puppeteer, Playwright, or custom solutions
153
+ */
154
+ interface Fetcher$1 {
155
+ /**
156
+ * Fetch HTML from a URL
157
+ * @returns HTML content and final URL (after redirects)
158
+ */
159
+ fetch(url: string, options?: FetchOptions$1): Promise<FetchResult$1>;
160
+ /** Fetcher name for logging */
161
+ readonly name: string;
162
+ }
163
+ /**
164
+ * Options for fetching
165
+ */
166
+ interface FetchOptions$1 {
167
+ /** Timeout in milliseconds (default: 10000) */
168
+ timeout?: number;
169
+ /** User agent string */
170
+ userAgent?: string;
171
+ /** Additional headers to send */
172
+ headers?: Record<string, string>;
173
+ }
174
+ /**
175
+ * Result from fetching a URL
176
+ */
177
+ interface FetchResult$1 {
178
+ /** Raw HTML content */
179
+ html: string;
180
+ /** Final URL after redirects */
181
+ finalUrl: string;
182
+ /** HTTP status code */
183
+ statusCode: number;
184
+ /** Content-Type header */
185
+ contentType: string;
186
+ /** Response headers (optional) */
187
+ headers?: Record<string, string>;
188
+ }
189
+ /**
190
+ * Default user agent string
191
+ */
192
+ declare const DEFAULT_USER_AGENT = "Scrapex-Bot/2.0 (+https://github.com/developer-rakeshpaul/scrapex)";
193
+ /**
194
+ * Default timeout in milliseconds
195
+ */
196
+ declare const DEFAULT_TIMEOUT = 10000;
197
+ //#endregion
198
+ //#region src/fetchers/fetch.d.ts
199
+ /**
200
+ * Default fetcher using native fetch API.
201
+ * Works in Node.js 18+ without polyfills.
202
+ */
203
+ declare class NativeFetcher implements Fetcher$1 {
204
+ readonly name = "native-fetch";
205
+ fetch(url: string, options?: FetchOptions$1): Promise<FetchResult$1>;
206
+ }
207
+ /**
208
+ * Default fetcher instance
209
+ */
210
+ declare const defaultFetcher: NativeFetcher;
211
+ //#endregion
212
+ //#region src/fetchers/robots.d.ts
213
+ /**
214
+ * Result of robots.txt check
215
+ */
216
+ interface RobotsCheckResult {
217
+ allowed: boolean;
218
+ reason?: string;
219
+ }
220
+ /**
221
+ * Check if URL is allowed by robots.txt
222
+ *
223
+ * @param url - The URL to check
224
+ * @param userAgent - User agent to check rules for
225
+ * @returns Whether the URL is allowed and optional reason
226
+ */
227
+ declare function checkRobotsTxt(url: string, userAgent?: string): Promise<RobotsCheckResult>;
228
+ //#endregion
229
+ //#region src/utils/url.d.ts
230
+ /**
231
+ * Validate if a string is a valid URL
232
+ */
233
+ declare function isValidUrl(url: string): boolean;
234
+ /**
235
+ * Normalize URL by removing tracking params and trailing slashes
236
+ */
237
+ declare function normalizeUrl(url: string): string;
238
+ /**
239
+ * Extract domain from URL (without www prefix)
240
+ */
241
+ declare function extractDomain(url: string): string;
242
+ /**
243
+ * Resolve a potentially relative URL against a base URL
244
+ */
245
+ declare function resolveUrl(url: string | undefined | null, baseUrl: string): string | undefined;
246
+ /**
247
+ * Check if a URL is external relative to a domain
248
+ */
249
+ declare function isExternalUrl(url: string, baseDomain: string): boolean;
250
+ /**
251
+ * Extract protocol from URL
252
+ */
253
+ declare function getProtocol(url: string): string;
254
+ /**
255
+ * Get the path portion of a URL
256
+ */
257
+ declare function getPath(url: string): string;
258
+ /**
259
+ * Check if URL matches a pattern (supports * wildcard)
260
+ */
261
+ declare function matchesUrlPattern(url: string, pattern: string): boolean;
262
+ //#endregion
263
+ export { type CompletionOptions, ContentExtractor, type ContentType, DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, type EnhancementType, type ExtractedEntities, type ExtractedLink, type ExtractionContext, type ExtractionSchema, type ExtractionSchemaType, type Extractor, FaviconExtractor, type FetchOptions, type FetchResult, type Fetcher, JsonLdExtractor, type LLMProvider, LinksExtractor, MetaExtractor, NativeFetcher, type RobotsCheckResult, ScrapeError, type ScrapeErrorCode, type ScrapeOptions, type ScrapedData, checkRobotsTxt, createDefaultExtractors, createExtractionContext, defaultFetcher, extractDomain, getPath, getProtocol, isExternalUrl, isValidUrl, matchesUrlPattern, mergeResults, normalizeUrl, resolveUrl, scrape, scrapeHtml, sortExtractors };
264
+ //# sourceMappingURL=index.d.mts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.mts","names":[],"sources":["../src/core/context.ts","../src/core/errors.ts","../src/core/scrape.ts","../src/extractors/content.ts","../src/extractors/favicon.ts","../src/extractors/jsonld.ts","../src/extractors/links.ts","../src/extractors/meta.ts","../src/extractors/index.ts","../src/fetchers/types.ts","../src/fetchers/fetch.ts","../src/fetchers/robots.ts","../src/utils/url.ts"],"sourcesContent":[],"mappings":";;;;;;;;;;iBAsBgB,uBAAA,uDAIL,gBACR;;ACxBH;AAcA;AACwB,iBDyCR,YAAA,CCzCQ,OAAA,ED0Cb,iBC1Ca,EAAA,SAAA,ED2CX,OC3CW,CD2CH,WC3CG,CAAA,CAAA,ED4CrB,iBC5CqB;;;;;;ADIR,KCnBJ,eAAA,GDmB2B,cAI5B,GAAA,SAAA,GACR,aAAA,GAAiB,SAAA,GAAA,WAAA,GAAA,gBAAA,GAAA,aAAA,GAAA,WAAA,GAAA,kBAAA;AAgCpB;;;AAEa,cC5CA,WAAA,SAAoB,KAAA,CD4CpB;EACV,SAAA,IAAA,EC5CqB,eD4CrB;EAAiB,SAAA,UAAA,CAAA,EAAA,MAAA;qCCzCiB,8CAA8C;;;AAlBnF;EAca,OAAA,IAAA,CAAA,KAAY,EAAA,OAAA,EAAA,IAAA,CAAA,EAmBW,eAnBX,CAAA,EAmB8C,WAnB9C;EACD;;;EAkBY,WAAA,CAAA,CAAA,EAAA,OAAA;EAAmC;;;EAnBjC,MAAA,CAAA,CAAA,EAyC1B,MAzC0B,CAAA,MAAA,EAAA,OAAA,CAAA;;;;;;ADKtC;AAqCA;;;;;;;;;ACxDA;AAca,iBCIS,MAAA,CDJG,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,ECI0B,aDJ1B,CAAA,ECI+C,ODJ/C,CCIuD,WDJvD,CAAA;;;;;;;;;;;;ACIzB;;;AAAwE,iBAmJlD,UAAA,CAnJkD,IAAA,EAAA,MAAA,EAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EAsJ7D,aAtJ6D,CAAA,EAuJrE,OAvJqE,CAuJ7D,WAvJ6D,CAAA;;;;;AFCxE;AAqCA;AACW,cGvCE,gBAAA,YAA4B,SHuC9B,CAAA;EACU,SAAA,IAAA,GAAA,SAAA;EAAR,SAAA,QAAA,GAAA,EAAA;EACV,OAAA,CAAA,OAAA,EGrCsB,iBHqCtB,CAAA,EGrC0C,OHqC1C,CGrCkD,OHqClD,CGrC0D,WHqC1D,CAAA,CAAA;EAAiB,QAAA,eAAA;;;;;;;;AAxCpB;AAqCA;AACW,cIrDE,gBAAA,YAA4B,SJqD9B,CAAA;EACU,SAAA,IAAA,GAAA,SAAA;EAAR,SAAA,QAAA,GAAA,EAAA;EACV,OAAA,CAAA,OAAA,EInDsB,iBJmDtB,CAAA,EInD0C,OJmD1C,CInDkD,OJmDlD,CInD0D,WJmD1D,CAAA,CAAA;;;;;;AAxCH;AAqCA;AACW,cKtDE,eAAA,YAA2B,SLsD7B,CAAA;EACU,SAAA,IAAA,GAAA,QAAA;EAAR,SAAA,QAAA,GAAA,EAAA;EACV,OAAA,CAAA,OAAA,EKpDsB,iBLoDtB,CAAA,EKpD0C,OLoD1C,CKpDkD,OLoDlD,CKpD0D,WLoD1D,CAAA,CAAA;EAAiB,QAAA,eAAA;;;;EC3DR,QAAA,QAAA;EAcC,QAAA,WAAY;;;;;;ADKzB;AAqCA;AACW,cMrDE,cAAA,YAA0B,SNqD5B,CAAA;EACU,SAAA,IAAA,GAAA,OAAA;EAAR,SAAA,QAAA,GAAA,EAAA;EACV,OAAA,CAAA,OAAA,EMnDsB,iBNmDtB,CAAA,EMnD0C,ONmD1C,CMnDkD,ONmDlD,CMnD0D,WNmD1D,CAAA,CAAA;;;;;;AAxCH;AAqCA;AACW,cOtDE,aAAA,YAAyB,SPsD3B,CAAA;EACU,SAAA,IAAA,GAAA,MAAA;EAAR,SAAA,QAAA,GAAA,GAAA;EACV,OAAA,CAAA,OAAA,EOpDsB,iBPoDtB,CAAA,EOpD0C,OPoD1C,COpDkD,OPoDlD,COpD0D,WPoD1D,CAAA,CAAA;;;;;;;;iBQ5Ca,uBAAA,CAAA,GAA2B;;;APf3C;AAca,iBOcG,cAAA,CPdS,UAAA,EOckB,SPdlB,EAAA,CAAA,EOcgC,SPdhC,EAAA;;;;;;ADKzB;AAqCgB,USvDC,SAAA,CTuDW;EACjB;;;;EAES,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,ESrDW,cTqDX,CAAA,ESrD0B,OTqD1B,CSrDkC,aTqDlC,CAAA;;;;AC3DpB;AAcA;;AAIqC,UQHpB,cAAA,CRGoB;EAA8C;EAe/C,OAAA,CAAA,EAAA,MAAA;EAAmC;EAsB3D,SAAA,CAAA,EAAA,MAAA;EAzCqB;EAAK,OAAA,CAAA,EQS1B,MRT0B,CAAA,MAAA,EAAA,MAAA,CAAA;;;;ACItC;AAAmD,UOWlC,aAAA,CPXkC;EAA6B;EAAR,IAAA,EAAA,MAAA;EAAO;EAmJzD,QAAA,EAAA,MAAU;EAGrB;EACA,UAAA,EAAA,MAAA;EAAR;EAAO,WAAA,EAAA,MAAA;;YO9HE;;ANzBZ;;;AAIqD,cM2BxC,kBAAA,GN3BwC,oEAAA;;;;cMiCxC,eAAA;;;;;ATpCb;AAqCA;AACW,cU/CE,aAAA,YAAyB,SV+C3B,CAAA;EACU,SAAA,IAAA,GAAA,cAAA;EAAR,KAAA,CAAA,GAAA,EAAA,MAAA,EAAA,OAAA,CAAA,EU7CuB,cV6CvB,CAAA,EU7C2C,OV6C3C,CU7CmD,aV6CnD,CAAA;;;;;cUuDA,gBAAc;;;;;;AV9FX,UWjBC,iBAAA,CXiBsB;EAqCvB,OAAA,EAAA,OAAY;EACjB,MAAA,CAAA,EAAA,MAAA;;;;;;;;ACzDX;AAca,iBUQS,cAAA,CVRG,GAAA,EAAA,MAAA,EAAA,SAAA,CAAA,EAAA,MAAA,CAAA,EUWtB,OVXsB,CUWd,iBVXc,CAAA;;;;;;ADKT,iBYKA,UAAA,CZLuB,GAAA,EAAA,MAI5B,CAAA,EAAA,OAAA;AAiCX;;;AAEa,iBYtBG,YAAA,CZsBH,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;iBYEG,aAAA;;AX5DhB;AAcA;AACwB,iBWyDR,UAAA,CXzDQ,GAAA,EAAA,MAAA,GAAA,SAAA,GAAA,IAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,MAAA,GAAA,SAAA;;;;AAkB+C,iBWoDvD,aAAA,CXpDuD,GAAA,EAAA,MAAA,EAAA,UAAA,EAAA,MAAA,CAAA,EAAA,OAAA;;;;iBWiEvD,WAAA;;;AVhFhB;AAAmD,iBU2FnC,OAAA,CV3FmC,GAAA,EAAA,MAAA,CAAA,EAAA,MAAA;;;;AAmJ7B,iBU7CN,iBAAA,CV6CgB,GAAA,EAAA,MAAA,EAAA,OAAA,EAAA,MAAA,CAAA,EAAA,OAAA"}