recker 1.0.85 → 1.0.86-next.a24fa13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/dist/browser/browser/index.d.ts +2 -0
  2. package/dist/browser/browser/index.js +1 -0
  3. package/dist/browser/browser/recker.d.ts +2 -0
  4. package/dist/browser/browser/recker.js +2 -0
  5. package/dist/browser/core/client.d.ts +2 -0
  6. package/dist/browser/core/client.js +8 -0
  7. package/dist/browser/core/request.d.ts +3 -0
  8. package/dist/browser/core/request.js +6 -2
  9. package/dist/browser/index.d.ts +2 -0
  10. package/dist/browser/index.iife.min.js +79 -79
  11. package/dist/browser/index.js +1 -0
  12. package/dist/browser/index.min.js +79 -79
  13. package/dist/browser/index.mini.iife.js +312 -15
  14. package/dist/browser/index.mini.iife.min.js +38 -38
  15. package/dist/browser/index.mini.min.js +42 -42
  16. package/dist/browser/index.mini.umd.js +312 -15
  17. package/dist/browser/index.mini.umd.min.js +38 -38
  18. package/dist/browser/index.umd.min.js +79 -79
  19. package/dist/browser/plugins/queue.d.ts +41 -0
  20. package/dist/browser/plugins/queue.js +184 -0
  21. package/dist/browser/recker.d.ts +2 -0
  22. package/dist/browser/recker.js +2 -0
  23. package/dist/browser/scrape/crawl-queue.d.ts +31 -0
  24. package/dist/browser/scrape/crawl-queue.js +40 -0
  25. package/dist/browser/scrape/crawl-storage.d.ts +33 -0
  26. package/dist/browser/scrape/crawl-storage.js +26 -0
  27. package/dist/browser/scrape/index.d.ts +6 -0
  28. package/dist/browser/scrape/index.js +3 -0
  29. package/dist/browser/scrape/proxy-adapter.d.ts +12 -0
  30. package/dist/browser/scrape/proxy-adapter.js +17 -0
  31. package/dist/browser/scrape/spider.d.ts +14 -4
  32. package/dist/browser/scrape/spider.js +119 -45
  33. package/dist/browser/transport/curl.js +53 -9
  34. package/dist/browser/transport/undici.js +4 -0
  35. package/dist/browser/types/index.d.ts +53 -2
  36. package/dist/core/client.d.ts +2 -0
  37. package/dist/core/client.js +8 -0
  38. package/dist/core/request.d.ts +3 -0
  39. package/dist/core/request.js +6 -2
  40. package/dist/index.d.ts +2 -0
  41. package/dist/index.js +2 -0
  42. package/dist/plugins/queue.d.ts +41 -0
  43. package/dist/plugins/queue.js +184 -0
  44. package/dist/queue/consumer.d.ts +17 -0
  45. package/dist/queue/consumer.js +48 -0
  46. package/dist/scrape/crawl-queue.d.ts +31 -0
  47. package/dist/scrape/crawl-queue.js +40 -0
  48. package/dist/scrape/crawl-storage.d.ts +33 -0
  49. package/dist/scrape/crawl-storage.js +26 -0
  50. package/dist/scrape/index.d.ts +6 -0
  51. package/dist/scrape/index.js +3 -0
  52. package/dist/scrape/proxy-adapter.d.ts +12 -0
  53. package/dist/scrape/proxy-adapter.js +17 -0
  54. package/dist/scrape/spider.d.ts +14 -4
  55. package/dist/scrape/spider.js +119 -45
  56. package/dist/transport/curl.js +53 -9
  57. package/dist/transport/undici.js +4 -0
  58. package/dist/types/index.d.ts +53 -2
  59. package/dist/version.js +1 -1
  60. package/package.json +2 -2
@@ -0,0 +1,41 @@
1
+ import { Plugin, ReckerResponse, QueueAdapter, QueueJob, QueueOptions } from '../types/index.js';
2
/**
 * Response object handed back for a request that was placed on a queue
 * rather than sent. It carries the job identifier and exposes the
 * `ReckerResponse` surface over a locally built `Response`.
 */
declare class QueuedResponse<T = unknown> implements ReckerResponse<T> {
    /** Identifier of the queued job this response represents. */
    readonly jobId: string;
    /** Underlying `Response` that backs the body and header accessors. */
    readonly raw: Response;
    /** Always `undefined` for a queued response. */
    readonly timings: undefined;
    /** Always `undefined` for a queued response. */
    readonly connection: undefined;

    constructor(jobId: string);

    get status(): number;
    get statusText(): string;
    get headers(): Headers;
    get ok(): boolean;
    get url(): string;

    json<R = T>(): Promise<R>;
    text(): Promise<string>;
    cleanText(): Promise<string>;
    blob(): Promise<Blob>;
    read(): ReadableStream<Uint8Array<ArrayBuffer>> | null;
    clone(): ReckerResponse<T>;

    /** Generator that yields nothing. */
    sse(): AsyncGenerator<never, void, unknown>;
    /** Generator that yields nothing. */
    download(): AsyncGenerator<never, void, unknown>;
    /** Yields the chunks of the underlying body stream. */
    [Symbol.asyncIterator](): AsyncGenerator<Uint8Array<ArrayBuffer>, void, unknown>;
}
23
/**
 * Create a plugin that installs queueing middleware on a client.
 *
 * @param options Queue configuration (adapter, optional filter, job id
 *   generator and default metadata).
 * @returns A plugin to be applied to a client.
 */
export declare function queuePlugin(options: QueueOptions): Plugin;
24
/**
 * Queue adapter that keeps every job in a process-local map.
 */
export declare class InMemoryQueueAdapter implements QueueAdapter {
    private jobs;

    /** Store the job as pending and resolve with its id. */
    enqueue(job: QueueJob): Promise<string>;

    /** Resolve with the stored status, or `"unknown"` when the id is not stored. */
    getStatus(jobId: string): Promise<{
        jobId: string;
        status: "unknown" | "pending" | "processing" | "completed" | "failed";
    }>;

    /** Jobs whose status is still `"pending"`. */
    getPendingJobs(): QueueJob[];

    /** Update the status of a stored job; unknown ids are ignored. */
    setJobStatus(jobId: string, status: string): void;

    /** Look up a stored job by id. */
    getJob(jobId: string): QueueJob | undefined;

    /** Remove every stored job. */
    clear(): void;

    /** Number of stored jobs, regardless of status. */
    get size(): number;

    /** Drop all stored jobs. */
    close(): Promise<void>;
}
41
+ export { QueuedResponse };
@@ -0,0 +1,184 @@
1
// Per-module counter mixed into every generated id so two ids created in the
// same millisecond still differ.
let jobSeq = 0;

/**
 * Produce a job identifier shaped like `qj-<time>-<sequence>-<random>`.
 *
 * The time and sequence parts are base-36; the random part is up to six
 * hexadecimal digits taken from `Math.random()`.
 */
function generateJobId() {
    const timePart = Date.now().toString(36);
    jobSeq += 1;
    const sequencePart = jobSeq.toString(36);
    const randomPart = Math.random().toString(16).slice(2, 8);
    return ['qj', timePart, sequencePart, randomPart].join('-');
}
6
/**
 * Flatten a request into a plain job record that can be handed to a queue
 * adapter.
 *
 * @param req Request to capture. Its `headers` must expose `forEach(value, key)`.
 * @param jobId Identifier stored on the job.
 * @param metadata Extra data to attach; omitted from the job when empty.
 * @returns The job record. `body` is a string or `null`; optional fields are
 *   `undefined` when there is nothing to record.
 */
function serializeRequest(req, jobId, metadata) {
    const headers = {};
    req.headers.forEach((value, key) => {
        headers[key] = value;
    });
    let body = null;
    const bodyContentType = headers['content-type'];
    if (req.body !== null && req.body !== undefined) {
        if (typeof req.body === 'string') {
            body = req.body;
        }
        else if (typeof URLSearchParams !== 'undefined' && req.body instanceof URLSearchParams) {
            // JSON.stringify(new URLSearchParams(...)) yields "{}", which would
            // silently drop the form payload. Keep the encoded query string.
            body = req.body.toString();
        }
        else if (typeof req.body === 'object') {
            try {
                body = JSON.stringify(req.body);
            }
            catch {
                // Not JSON-serializable (e.g. circular); fall back to String().
                body = String(req.body);
            }
        }
        else {
            body = String(req.body);
        }
    }
    return {
        jobId,
        url: req.url,
        method: req.method,
        headers,
        body,
        bodyContentType: bodyContentType || undefined,
        correlationId: req.correlationId,
        traceId: req.traceId,
        tenant: req.tenant,
        // Empty tag lists and empty metadata objects are normalised to undefined.
        policyTags: req.policyTags?.length ? req.policyTags : undefined,
        createdAt: Date.now(),
        metadata: metadata && Object.keys(metadata).length > 0 ? metadata : undefined,
    };
}
44
/**
 * Turn a declarative filter config into a predicate over requests.
 *
 * A request passes when every configured criterion holds:
 * - `methods`: the request method is in the list;
 * - `urlPatterns`: at least one pattern matches the URL (strings match by
 *   substring, regular expressions by `test`);
 * - `headerPresent`: the named header exists on the request.
 *
 * @param config Filter configuration; absent criteria are not checked.
 * @returns Predicate returning `true` when the request matches.
 */
function compileFilter(config) {
    return (req) => {
        if (config.methods && !config.methods.includes(req.method)) {
            return false;
        }
        if (config.urlPatterns) {
            const matched = config.urlPatterns.some((pattern) => {
                if (typeof pattern === 'string')
                    return req.url.includes(pattern);
                // Global/sticky regexes remember `lastIndex` between calls, which
                // would make the same URL match only every other time. Start each
                // test from the beginning of the URL.
                pattern.lastIndex = 0;
                return pattern.test(req.url);
            });
            if (!matched)
                return false;
        }
        if (config.headerPresent && !req.headers.has(config.headerPresent)) {
            return false;
        }
        return true;
    };
}
64
/**
 * Synthetic "202 Accepted" response returned for a request that was queued
 * instead of being sent.
 *
 * The body is a small JSON document (`{ queued, jobId, status }`) held in a
 * regular `Response`, so the body readers delegate to it. That body can be
 * consumed only once per instance; use `clone()` for a fresh copy.
 */
class QueuedResponse {
    jobId;
    raw;
    timings = undefined;
    connection = undefined;
    /**
     * @param jobId Identifier of the queued job; also sent in the
     *   `X-Queue-Job-Id` header.
     */
    constructor(jobId) {
        this.jobId = jobId;
        const responseBody = JSON.stringify({
            queued: true,
            jobId,
            status: 'queued',
        });
        this.raw = new Response(responseBody, {
            status: 202,
            statusText: 'Accepted',
            headers: {
                'Content-Type': 'application/json',
                'X-Queue-Job-Id': jobId,
                'X-Queue-Status': 'queued',
            },
        });
    }
    get status() { return 202; }
    get statusText() { return 'Accepted'; }
    get headers() { return this.raw.headers; }
    get ok() { return true; }
    /** Always the empty string. */
    get url() { return ''; }
    json() { return this.raw.json(); }
    text() { return this.raw.text(); }
    /** Same as `text()`; no cleaning is applied to the body. */
    async cleanText() { return this.raw.text(); }
    blob() { return this.raw.blob(); }
    read() { return this.raw.body; }
    /** Build a new instance for the same job, with an unread body. */
    clone() {
        return new QueuedResponse(this.jobId);
    }
    /** Yields nothing. */
    async *sse() { }
    /** Yields nothing. */
    async *download() { }
    /** Iterate over the raw body chunks. */
    async *[Symbol.asyncIterator]() {
        const stream = this.raw.body;
        if (!stream)
            return;
        const reader = stream.getReader();
        try {
            while (true) {
                const { done, value } = await reader.read();
                if (done)
                    break;
                yield value;
            }
        }
        finally {
            // Release the reader on completion, early `break`, or error so the
            // stream does not stay locked.
            reader.releaseLock();
        }
    }
}
113
/**
 * Build a plugin that diverts matching requests to a queue adapter instead
 * of sending them.
 *
 * Per-request control through `req.queue`:
 * - `false`: never queue, pass the request on;
 * - `true` or a (non-null) object: always queue; an object may carry
 *   `metadata` that is merged over `options.defaultMetadata`;
 * - anything else (including `null`/`undefined`): decided by `options.filter`,
 *   which may be a predicate or a declarative config. With no filter every
 *   request is queued.
 *
 * @param options Queue configuration: `adapter` (required), optional
 *   `filter`, `jobIdGenerator` and `defaultMetadata`.
 * @returns A plugin function that registers the middleware via `client.use`.
 */
export function queuePlugin(options) {
    const adapter = options.adapter;
    const filter = typeof options.filter === 'function'
        ? options.filter
        : options.filter
            ? compileFilter(options.filter)
            : () => true;
    const generateId = options.jobIdGenerator || generateJobId;
    const queueMiddleware = async (req, next) => {
        const perReq = req.queue;
        if (perReq === false) {
            return next(req);
        }
        // `typeof null === 'object'`, so null must be excluded explicitly;
        // otherwise `queue: null` would force queueing and bypass the filter.
        const perReqConfig = typeof perReq === 'object' && perReq !== null ? perReq : undefined;
        const shouldQueue = perReq === true || perReqConfig !== undefined || filter(req);
        if (!shouldQueue) {
            return next(req);
        }
        const metadata = {
            ...options.defaultMetadata,
            ...(perReqConfig !== undefined && 'metadata' in perReqConfig
                ? perReqConfig.metadata
                : undefined),
        };
        const jobId = generateId(req);
        const job = serializeRequest(req, jobId, metadata);
        // Adapters may assign their own id; fall back to ours if they return nothing.
        const assignedId = await adapter.enqueue(job);
        return new QueuedResponse(assignedId || jobId);
    };
    return (client) => {
        client.use(queueMiddleware);
    };
}
145
/**
 * Queue adapter backed by a `Map` keyed by job id. Each stored entry is the
 * job plus a `status` field, which is stripped again before a job is
 * returned to callers.
 */
export class InMemoryQueueAdapter {
    jobs = new Map();
    /** Store a copy of the job with status `pending`; resolves with its id. */
    async enqueue(job) {
        const id = job.jobId;
        this.jobs.set(id, { ...job, status: 'pending' });
        return id;
    }
    /** Resolve with the stored status, or `unknown` when the id is not stored. */
    async getStatus(jobId) {
        const entry = this.jobs.get(jobId);
        return entry
            ? { jobId, status: entry.status }
            : { jobId, status: 'unknown' };
    }
    /** Jobs still marked `pending`, in insertion order, without `status`. */
    getPendingJobs() {
        const pending = [];
        for (const { status, ...job } of this.jobs.values()) {
            if (status === 'pending') {
                pending.push(job);
            }
        }
        return pending;
    }
    /** Overwrite the status of a stored job; unknown ids are ignored. */
    setJobStatus(jobId, status) {
        const entry = this.jobs.get(jobId);
        if (!entry) {
            return;
        }
        entry.status = status;
    }
    /** Return the stored job without its `status`, or `undefined`. */
    getJob(jobId) {
        const entry = this.jobs.get(jobId);
        if (entry) {
            const { status, ...job } = entry;
            return job;
        }
        return undefined;
    }
    /** Remove every stored job. */
    clear() {
        this.jobs.clear();
    }
    /** Number of stored jobs, regardless of status. */
    get size() {
        return this.jobs.size;
    }
    /** Drop all stored jobs. */
    async close() {
        this.jobs.clear();
    }
}
184
+ export { QueuedResponse };
@@ -4,6 +4,7 @@ import type { RequestOptions } from '../types/index.js';
4
4
  import { analyzeSeo } from '../seo/analyzer.js';
5
5
  import { createAI } from '../ai/index.js';
6
6
  import { simulateNetwork } from '../plugins/network-simulation.js';
7
+ import type { RaffelClientOptions, RaffelClient } from 'raffel/client/browser';
7
8
  export declare function get<T = unknown>(url: string, options?: RequestOptions): RequestPromise<T>;
8
9
  export declare function post<T = unknown>(url: string, options?: RequestOptions): RequestPromise<T>;
9
10
  export declare function post<T = unknown>(url: string, body?: unknown, options?: RequestOptions): RequestPromise<T>;
@@ -26,6 +27,7 @@ export declare const recker: {
26
27
  options: typeof options;
27
28
  purge: typeof purge;
28
29
  ws: typeof ws;
30
+ raffel: (url: string, options?: Omit<RaffelClientOptions, "url">) => RaffelClient;
29
31
  seo: typeof analyzeSeo;
30
32
  ai: typeof createAI;
31
33
  har: import("../plugins/har-recorder.js").HarRecorder;
@@ -4,6 +4,7 @@ import { analyzeSeo } from '../seo/analyzer.js';
4
4
  import { createAI } from '../ai/index.js';
5
5
  import { harRecorder } from '../plugins/har-recorder.js';
6
6
  import { simulateNetwork } from '../plugins/network-simulation.js';
7
+ import { createRaffelClient } from 'raffel/client/browser';
7
8
  let _defaultClient = null;
8
9
  const REQUEST_OPTIONS_HINTS = new Set([
9
10
  'method',
@@ -98,6 +99,7 @@ export const recker = {
98
99
  options,
99
100
  purge,
100
101
  ws,
102
+ raffel: (url, options) => createRaffelClient({ ...options, url }),
101
103
  seo: analyzeSeo,
102
104
  ai: createAI,
103
105
  har: harRecorder,
@@ -0,0 +1,31 @@
1
/**
 * One entry in a crawl queue.
 */
export interface CrawlQueueItem {
    /** Address of the page to crawl. */
    url: string;
    /** Depth value associated with the URL (presumably link distance from the start URL — confirm against the spider). */
    depth: number;
    /** Optional priority hint; how it is used is up to the adapter. */
    priority?: number;
    /** Optional URL of the page this entry was found on. */
    discoveredFrom?: string;
}
7
/**
 * Contract for the storage behind a crawl frontier: a queue of pending
 * items plus a record of visited URLs. The batch methods and `close` are
 * optional.
 */
export interface CrawlQueueAdapter {
    /** Add a single item to the queue. */
    push(item: CrawlQueueItem): Promise<void>;
    /** Add several items at once. */
    pushBatch?(items: CrawlQueueItem[]): Promise<void>;
    /** Remove and return the next item, or `null` when none is available. */
    pop(): Promise<CrawlQueueItem | null>;

    /** Whether the URL has been marked as visited. */
    hasVisited(url: string): Promise<boolean>;
    /** Subset of `urls` that have been marked as visited. */
    hasVisitedBatch?(urls: string[]): Promise<Set<string>>;
    /** Record the URL as visited. */
    markVisited(url: string): Promise<void>;

    /** Number of items currently queued. */
    size(): Promise<number>;
    /** Remove all stored state. */
    clear(): Promise<void>;
    /** Release any resources held by the adapter. */
    close?(): Promise<void>;
}
18
/**
 * `CrawlQueueAdapter` that keeps the queue and the visited set in memory.
 */
export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
    private queue;
    private visited;

    push(item: CrawlQueueItem): Promise<void>;
    pushBatch(items: CrawlQueueItem[]): Promise<void>;
    pop(): Promise<CrawlQueueItem | null>;

    hasVisited(url: string): Promise<boolean>;
    hasVisitedBatch(urls: string[]): Promise<Set<string>>;
    markVisited(url: string): Promise<void>;

    size(): Promise<number>;
    clear(): Promise<void>;
    close(): Promise<void>;

    /** Synchronous access to the set of visited URLs. */
    getVisited(): Set<string>;
}
@@ -0,0 +1,40 @@
1
/**
 * In-memory crawl frontier: a first-in/first-out array of pending items and
 * a `Set` of visited URLs. Item `priority` and `depth` are stored but not
 * consulted when popping.
 */
export class InMemoryCrawlQueue {
    queue = [];
    visited = new Set();
    /** Append one item to the end of the queue. */
    async push(item) {
        this.queue.push(item);
    }
    /** Append all items, in order, to the end of the queue. */
    async pushBatch(items) {
        // Push one at a time: `push(...items)` passes every element as a call
        // argument and throws a RangeError once the batch is large enough.
        for (const item of items) {
            this.queue.push(item);
        }
    }
    /** Remove and return the oldest item, or `null` when the queue is empty. */
    async pop() {
        return this.queue.shift() ?? null;
    }
    /** Whether `url` has been marked as visited. */
    async hasVisited(url) {
        return this.visited.has(url);
    }
    /** Return the subset of `urls` that have been marked as visited. */
    async hasVisitedBatch(urls) {
        const result = new Set();
        for (const url of urls) {
            if (this.visited.has(url))
                result.add(url);
        }
        return result;
    }
    /** Record `url` as visited. */
    async markVisited(url) {
        this.visited.add(url);
    }
    /** Number of items waiting in the queue. */
    async size() {
        return this.queue.length;
    }
    /** Empty both the queue and the visited set. */
    async clear() {
        this.queue = [];
        this.visited.clear();
    }
    /** Same as `clear()`. */
    async close() {
        await this.clear();
    }
    /** The live visited set (not a copy). */
    getVisited() {
        return this.visited;
    }
}
@@ -0,0 +1,33 @@
1
+ import type { SpiderPageResult } from './spider.js';
2
/**
 * Contract for persisting what a crawl produces: page results and per-URL
 * errors. `close` is optional.
 */
export interface CrawlStorageAdapter {
    /** Persist one page result. */
    saveResult(result: SpiderPageResult): Promise<void>;
    /** Persist one failure, identified by URL and error message. */
    saveError(error: { url: string; error: string }): Promise<void>;

    /** Number of page results stored so far. */
    getResultCount(): Promise<number>;
    /** All stored page results. */
    getResults(): Promise<SpiderPageResult[]>;
    /** All stored errors. */
    getErrors(): Promise<Array<{ url: string; error: string }>>;

    /** Remove everything that has been stored. */
    clear(): Promise<void>;
    /** Release any resources held by the adapter. */
    close?(): Promise<void>;
}
17
/**
 * `CrawlStorageAdapter` that keeps results and errors in memory.
 */
export declare class InMemoryCrawlStorage implements CrawlStorageAdapter {
    private results;
    private errors;

    saveResult(result: SpiderPageResult): Promise<void>;
    saveError(error: { url: string; error: string }): Promise<void>;

    getResultCount(): Promise<number>;
    getResults(): Promise<SpiderPageResult[]>;
    getErrors(): Promise<Array<{ url: string; error: string }>>;

    clear(): Promise<void>;
    close(): Promise<void>;
}
@@ -0,0 +1,26 @@
1
/**
 * Crawl storage that appends results and errors to two in-memory arrays.
 * The getters hand back the internal arrays themselves, not copies.
 */
export class InMemoryCrawlStorage {
    results = [];
    errors = [];
    /** Append a page result. */
    async saveResult(result) {
        this.results.push(result);
    }
    /** Append an error record. */
    async saveError(error) {
        this.errors.push(error);
    }
    /** Number of stored page results. */
    async getResultCount() {
        const { length } = this.results;
        return length;
    }
    /** The stored page results. */
    async getResults() {
        return this.results;
    }
    /** The stored error records. */
    async getErrors() {
        return this.errors;
    }
    /** Replace both arrays with fresh empty ones. */
    async clear() {
        [this.results, this.errors] = [[], []];
    }
    /** Same as `clear()`. */
    async close() {
        await this.clear();
    }
}
@@ -4,5 +4,11 @@ export { ScrapeDocument } from './document.js';
4
4
  export { ScrapeElement } from './element.js';
5
5
  export { Spider, spider } from './spider.js';
6
6
  export type { SpiderOptions, SpiderPageResult, SpiderProgress, SpiderResult, } from './spider.js';
7
+ export { InMemoryCrawlQueue } from './crawl-queue.js';
8
+ export type { CrawlQueueAdapter, CrawlQueueItem } from './crawl-queue.js';
9
+ export { InMemoryCrawlStorage } from './crawl-storage.js';
10
+ export type { CrawlStorageAdapter } from './crawl-storage.js';
11
+ export { ListProxyAdapter } from './proxy-adapter.js';
12
+ export type { ProxyAdapter } from './proxy-adapter.js';
7
13
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
8
14
  export type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedFormField, ExtractedTable, ExtractedScript, ExtractedStyle, ExtractionSchema, ExtractionSchemaField, ScrapeOptions, LinkExtractionOptions, ImageExtractionOptions, } from './types.js';
@@ -2,4 +2,7 @@ export { parse as parseHtmlSync, HTMLElement, TextNode, CommentNode, Node, NodeT
2
2
  export { ScrapeDocument } from './document.js';
3
3
  export { ScrapeElement } from './element.js';
4
4
  export { Spider, spider } from './spider.js';
5
+ export { InMemoryCrawlQueue } from './crawl-queue.js';
6
+ export { InMemoryCrawlStorage } from './crawl-storage.js';
7
+ export { ListProxyAdapter } from './proxy-adapter.js';
5
8
  export { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
@@ -0,0 +1,12 @@
1
/**
 * Source of proxy addresses. Only `getProxy` is required.
 */
export interface ProxyAdapter {
    /** Resolve with the proxy to use next, or `null` for none. */
    getProxy(): Promise<string | null>;
    /** Optional feedback hook: report whether a request through `proxy` succeeded. */
    reportResult?(proxy: string, success: boolean): Promise<void>;
    /** Release any resources held by the adapter. */
    close?(): Promise<void>;
}
6
/**
 * `ProxyAdapter` that cycles through a fixed list of proxies.
 */
export declare class ListProxyAdapter implements ProxyAdapter {
    private readonly proxies;
    private index;

    /** @throws Error when `proxies` is empty. */
    constructor(proxies: string[]);

    /** Resolve with the next proxy in the list, wrapping around at the end. */
    getProxy(): Promise<string>;
    close(): Promise<void>;
}
@@ -0,0 +1,17 @@
1
/**
 * Proxy adapter that hands out proxies from a fixed list in round-robin
 * order.
 */
export class ListProxyAdapter {
    proxies;
    index = 0;
    /**
     * @param proxies Proxy addresses to rotate through; must not be empty.
     * @throws Error when `proxies` is empty.
     */
    constructor(proxies) {
        // Validate before storing anything on the instance.
        if (proxies.length === 0) {
            throw new Error('ListProxyAdapter requires at least one proxy');
        }
        // Keep a private copy: if the caller later mutates (or empties) their
        // array, rotation must not start returning `undefined`.
        this.proxies = [...proxies];
    }
    /** Resolve with the next proxy, wrapping back to the first after the last. */
    async getProxy() {
        const proxy = this.proxies[this.index];
        // Wrap the cursor instead of letting it grow without bound.
        this.index = (this.index + 1) % this.proxies.length;
        return proxy;
    }
    /** Nothing to release. */
    async close() {
    }
}
@@ -2,6 +2,9 @@ import type { ExtractedLink, ExtractionSchema } from './types.js';
2
2
  import type { Options as ParserOptions } from './parser/index.js';
3
3
  import { type SitemapUrl } from '../seo/validators/sitemap.js';
4
4
  import { type CaptchaDetectionResult, type BlockDetectionResult } from '../utils/block-detector.js';
5
+ import { type CrawlQueueAdapter } from './crawl-queue.js';
6
+ import { type CrawlStorageAdapter } from './crawl-storage.js';
7
+ import { type ProxyAdapter } from './proxy-adapter.js';
5
8
  export type SpiderTransport = 'auto' | 'undici' | 'curl';
6
9
  type CaptchaProvider = CaptchaDetectionResult['provider'];
7
10
  export interface SpiderOptions {
@@ -25,6 +28,7 @@ export interface SpiderOptions {
25
28
  respectRobotsTxt?: boolean;
26
29
  useSitemap?: boolean;
27
30
  sitemapUrl?: string;
31
+ proxy?: string | string[] | ProxyAdapter;
28
32
  transport?: SpiderTransport;
29
33
  preferCurlFirst?: boolean;
30
34
  onPage?: (result: SpiderPageResult) => void;
@@ -39,6 +43,8 @@ export interface SpiderOptions {
39
43
  onProgress?: (progress: SpiderProgress) => void;
40
44
  extract?: string[] | ExtractionSchema;
41
45
  parserOptions?: Partial<ParserOptions>;
46
+ crawlQueue?: CrawlQueueAdapter;
47
+ crawlStorage?: CrawlStorageAdapter;
42
48
  }
43
49
  export interface SpiderPageResult {
44
50
  url: string;
@@ -146,10 +152,13 @@ export declare class Spider {
146
152
  private options;
147
153
  private client;
148
154
  private pool;
149
- private visited;
150
- private queue;
151
- private results;
152
- private errors;
155
+ private crawlQueue;
156
+ private crawlStorage;
157
+ private proxyAdapter;
158
+ private proxyClients;
159
+ private _visitedCount;
160
+ private _queueSize;
161
+ private _resultCount;
153
162
  private baseHost;
154
163
  private running;
155
164
  private aborted;
@@ -170,6 +179,7 @@ export declare class Spider {
170
179
  private fetchSitemaps;
171
180
  private buildSitemapAnalysis;
172
181
  private buildRobotsAnalysis;
182
+ private getClientForProxy;
173
183
  private fetchPage;
174
184
  private crawlPage;
175
185
  private getOrCreateDomainState;