crawlee-one 1.1.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/README.md +880 -53
  2. package/dist/cjs/api/composer.d.ts +0 -0
  3. package/dist/cjs/api/composer.js +248 -0
  4. package/dist/cjs/api/composer.js.map +1 -0
  5. package/dist/cjs/api/types.d.ts +1 -0
  6. package/dist/cjs/api/types.js +121 -0
  7. package/dist/cjs/api/types.js.map +1 -0
  8. package/dist/cjs/api.d.ts +107 -0
  9. package/dist/cjs/api.js +57 -0
  10. package/dist/cjs/api.js.map +1 -0
  11. package/dist/cjs/cli/cli.js +27 -0
  12. package/dist/cjs/cli/cli.js.map +1 -1
  13. package/dist/cjs/cli/commands/codegen.d.ts +8 -0
  14. package/dist/cjs/cli/commands/codegen.js +177 -0
  15. package/dist/cjs/cli/commands/codegen.js.map +1 -0
  16. package/dist/cjs/cli/commands/config.d.ts +16 -0
  17. package/dist/cjs/cli/commands/config.js +65 -0
  18. package/dist/cjs/cli/commands/config.js.map +1 -0
  19. package/dist/cjs/composer.js +4 -4
  20. package/dist/cjs/composer.js.map +1 -1
  21. package/dist/cjs/config/config.d.ts +8 -0
  22. package/dist/cjs/config/config.js +161 -0
  23. package/dist/cjs/config/config.js.map +1 -0
  24. package/dist/cjs/config/types.d.ts +35 -0
  25. package/dist/cjs/config/types.js +1 -36
  26. package/dist/cjs/config/types.js.map +1 -1
  27. package/dist/cjs/constants.d.ts +17 -0
  28. package/dist/cjs/constants.js +21 -0
  29. package/dist/cjs/constants.js.map +1 -0
  30. package/dist/cjs/index.d.ts +7 -5
  31. package/dist/cjs/index.js +8 -4
  32. package/dist/cjs/index.js.map +1 -1
  33. package/dist/cjs/lib/actor/actor.d.ts +14 -14
  34. package/dist/cjs/lib/actor/actor.js +143 -89
  35. package/dist/cjs/lib/actor/actor.js.map +1 -1
  36. package/dist/cjs/lib/actor/types.d.ts +133 -49
  37. package/dist/cjs/lib/actor/types.js.map +1 -1
  38. package/dist/cjs/lib/error/errorHandler.d.ts +19 -52
  39. package/dist/cjs/lib/error/errorHandler.js +12 -12
  40. package/dist/cjs/lib/error/errorHandler.js.map +1 -1
  41. package/dist/cjs/lib/input.d.ts +562 -0
  42. package/dist/cjs/lib/input.js +707 -0
  43. package/dist/cjs/lib/input.js.map +1 -0
  44. package/dist/cjs/lib/integrations/apify.js +2 -3
  45. package/dist/cjs/lib/integrations/apify.js.map +1 -1
  46. package/dist/cjs/lib/integrations/types.d.ts +1 -10
  47. package/dist/cjs/lib/integrations/types.js.map +1 -1
  48. package/dist/cjs/lib/io/pushData.d.ts +27 -13
  49. package/dist/cjs/lib/io/pushData.js +24 -21
  50. package/dist/cjs/lib/io/pushData.js.map +1 -1
  51. package/dist/cjs/lib/io/pushRequests.d.ts +3 -2
  52. package/dist/cjs/lib/io/pushRequests.js.map +1 -1
  53. package/dist/cjs/lib/log.d.ts +4 -3
  54. package/dist/cjs/lib/log.js.map +1 -1
  55. package/dist/cjs/lib/router/router.d.ts +17 -12
  56. package/dist/cjs/lib/router/router.js +49 -27
  57. package/dist/cjs/lib/router/router.js.map +1 -1
  58. package/dist/cjs/lib/router/types.d.ts +13 -58
  59. package/dist/cjs/lib/router/types.js +0 -23
  60. package/dist/cjs/lib/router/types.js.map +1 -1
  61. package/dist/cjs/lib/telemetry/sentry.d.ts +2 -1
  62. package/dist/cjs/lib/telemetry/sentry.js +2 -9
  63. package/dist/cjs/lib/telemetry/sentry.js.map +1 -1
  64. package/dist/cjs/lib/telemetry/types.d.ts +5 -17
  65. package/dist/cjs/lib/telemetry/types.js.map +1 -1
  66. package/dist/cjs/lib/test/mockApifyClient.d.ts +2 -3
  67. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -1
  68. package/dist/cjs/types/config.d.ts +35 -0
  69. package/dist/cjs/types/config.js +3 -0
  70. package/dist/cjs/types/config.js.map +1 -0
  71. package/dist/cjs/types/index.d.ts +52 -0
  72. package/dist/cjs/types/index.js +13 -0
  73. package/dist/cjs/types/index.js.map +1 -0
  74. package/dist/cjs/types.d.ts +3 -1
  75. package/dist/cjs/types.js +10 -0
  76. package/dist/cjs/types.js.map +1 -1
  77. package/dist/cjs/utils/types.d.ts +10 -1
  78. package/dist/cjs/utils/types.js +4 -2
  79. package/dist/cjs/utils/types.js.map +1 -1
  80. package/package.json +10 -7
package/README.md CHANGED
@@ -1,89 +1,216 @@
1
- # Crawlee One
1
+ # CrawleeOne
2
2
 
3
- _The crawler framework you can't refuse._
3
+ _The web scraping framework you can't refuse._
4
4
 
5
- Crawlee One is a framework built on top of Crawlee and Apify\* for writing robust and highly configurable web scrapers.
5
+ CrawleeOne is a feature-rich and highly configurable web scraping framework that empowers both scraper developers and their users.
6
6
 
7
- Crawlee One should be your choice if:
7
+ It is built on top of Crawlee and Apify\*. [Read here](./docs/scraping-workflow-summary.md) for the recap of how Crawlee and Apify work.
8
8
 
9
- - You're developing a long-lasting integration.
9
+ The appeal of CrawleeOne is that it works seamlessly with the Apify platform,
10
+ but can also be easily re-purposed to work with other web scraping platforms or your custom services.
11
+
12
+ When [deployed to Apify](#deploying-to-apify), or otherwise made available to be used by others,
13
+ **the users of your scraper will have the freedom to transform, filter, limit, or otherwise
14
+ modify both the scraped data and the requests to scrape.**
15
+
16
+ CrawleeOne should be especially your choice if:
17
+
18
+ - You're developing long-lasting integrations.
10
19
  - Or your scraper will be part of a data pipeline.
11
20
  - Or you wish to make your scrapers available to others in your team / org, whether it's programmatically or via Apify UI.
12
21
 
13
- Conversely, Crawlee One is NOT suitable for:
22
+ > NOTE: `crawleeOne` allows you to easily switch between different implementations - Playwright, Cheerio, Puppeteer, ...
23
+ >
24
+ > However, you still need to write data extraction logic that's specific to the implementation.
25
+ >
26
+ > To make the transition between different implementations seamless, you can use [`portadom`](https://github.com/JuroOravec/portadom),
27
+ > which offers a single interface across all these implementations.
14
28
 
15
- - People not familiar with web scraping or Apify
16
- - One-off data extractions
29
+ ## Pre-requirements
17
30
 
18
- [Read here](./docs/scraping-workflow-summary.md) for the recap of how Crawlee and Apify work.
31
+ To make the most of CrawleeOne, you should be familiar with:
19
32
 
20
- ### What can Crawlee One do?
33
+ - Crawlee (AKA how to scrape data).
34
+ - Apify platform (AKA how to manage a scraped dataset and request queue).
21
35
 
22
- Crawlee One supports many common and advanced web scraping use cases. See the [Table of Content](#table-of-content) for the overview of the use cases.
36
+ ## Table of contents
23
37
 
24
- See the section [How to use](#how-to-use) for how Crawlee One looks from user's perspective.
38
+ - [Minimal example](#minimal-example)
39
+ - [What can CrawleeOne do?](#what-can-crawleeone-do)
40
+ - [Playbook & Use cases](#playbook--use-cases)
41
+ - [Actor input reference](#actor-input-reference)
42
+ - [Usage (for developers)](#usage-for-developers)
43
+ - [Deploying to Apify](#deploying-to-apify)
44
+ - [Usage (for end users)](#usage-for-end-users)
45
+ - [Codegen & Config file](#codegen--config-file)
46
+ - [Custom telemetry integration (CrawleeOneTelemetry)](#custom-telemetry-integration-crawleeonetelemetry)
47
+ - [Custom platform and storage integration (CrawleeOneIO)](#custom-platform-and-storage-integration-crawleeoneio)
48
+ - [Example projects](#example-projects)
49
+ - [Contributing](#contributing)
50
+ - [Supporting CrawleeOne](#supporting-crawleeone)
25
51
 
26
- ## Pre-requirements
52
+ ## Minimal example
27
53
 
28
- - Familiarity with Apify platform
29
- - For advanced use cases:
30
- - Basic familiarity with web scraping
31
- - Basic familiarity with JavaScript
54
+ The following example defines a [CheerioCrawler](https://crawlee.dev/docs/guides/cheerio-crawler-guide)
55
+ scraper with 2 routes (`mainPage` and `otherPage`) that process the incoming URLs either based on the URL,
56
+ or on the page HTML.
32
57
 
33
- ## Use cases
58
+ `pushData` is used to save the scraped data, while `pushRequests` enqueues more URLs to be scraped.
34
59
 
35
- Web crawlers written with Crawlee One can be configured via their input to handle following advanced use cases:
60
+ ```ts
61
+ import { crawleeOne } from 'crawlee-one';
36
62
 
37
- - [1. Import URLs to scrape from your database (or elsewhere)](./docs/playbook-01-import-urls.md)
38
- - [2. Proxy: Avoid rate limiting and geo-blocking](./docs/playbook-02-proxy.md)
39
- - [3. Simple transformations: Select and rename columns, set how many entries to scrape](./docs/playbook-03-results-mapping-simple.md)
40
- - [4. Advanced transformations & aggregations](./docs/playbook-04-results-mapping-advanced.md)
41
- - [5. Filtering results](./docs/playbook-05-results-filtering.md)
42
- - [6. Deciding what URLs to scrape: Filtering and transforming requests](./docs/playbook-06-requests-mapping-filtering.md)
43
- - [7. Caching: Extract only new or only previously-seen entries](./docs/playbook-07-caching.md)
44
- - [8. Configure crawler settings and performance](./docs/playbook-08-settings-performance.md)
45
- - [9. Create data pipelines from scrapers using metamorph](./docs/playbook-09-data-pipelines-metamorph.md)
46
- - [10. Privacy compliance: Include or omit personal data](./docs/playbook-10-privacy-compliance.md)
47
- - [11. Capture errors](./docs/playbook-11-errors.md)
48
- - [12. Source control: Keep scraper configuration in sync](./docs/playbook-12-source-control.md)
63
+ await crawleeOne({
64
+ type: 'cheerio',
65
+ routes: {
66
+ mainPage: {
67
+ match: /example\.com\/home/i,
68
+ handler: async (ctx) => {
69
+ const { $, request, pushData, pushRequests } = ctx;
70
+ // Scrape data from the page
71
+ const data = [
72
+ /* ... */
73
+ ];
74
+
75
+ // Save the scraped data.
76
+ await pushData(data, {
77
+ privacyMask: { author: true },
78
+ });
79
+
80
+ // If there's more URLs to scrape, enqueue them.
81
+ const reqs = ['https://...'].map((url) => ({ url }));
82
+ await pushRequests(reqs);
83
+ },
84
+ },
85
+ otherPage: {
86
+ match: (url, ctx) => url.startsWith('/') && ctx.$('.author').length,
87
+ handler: async (ctx) => {
88
+ // ...
89
+ },
90
+ },
91
+ },
92
+ hooks: {
93
+ onReady: async (inst) => {
94
+ await inst.runCrawler(['https://...']);
95
+ },
96
+ },
97
+ });
98
+ ```
99
+
100
+ If you're familiar with [Crawlee](https://github.com/apify/crawlee),
101
+ the minimal example above is roughly equivalent to:
102
+
103
+ ```ts
104
+ import { Actor } from 'apify';
105
+ import { CheerioCrawler, createCheerioRouter } from 'crawlee';
106
+
107
+ await Actor.main(async () => {
108
+ const rawInput = await Actor.getInput();
109
+ const input = {
110
+ ...rawInput,
111
+ ...(await fetchInput(rawInput.inputFromUrl)),
112
+ ...(await runFunc(rawInput.inputFromFunc)),
113
+ };
114
+
115
+ const router = createCheerioRouter();
49
116
 
50
- ## How to use
117
+ router.addHandler('mainPage', async (ctx) => {
118
+ await onBeforeHandler(ctx);
51
119
 
52
- [See here](./docs/user-guide.md) for how to use a Crawlee One web scraper through Apify platform.
120
+ // Scrape data from the page
121
+ const data = [
122
+ /* ... */
123
+ ];
124
+ const finalData = await transformAndFilterDataWithUserInput(data, ctx, input);
53
125
 
54
- ![](./docs/user-guide-input-ui-open.png)
126
+ // Save the scraped data
127
+ const dataset = await Actor.openDataset(input.datasetId);
128
+ await dataset.pushData(finalData);
55
129
 
56
- ## How to write a Crawlee One crawler
130
+ // If there's more URLs to scrape, enqueue them.
131
+ const reqs = ['https://...'].map((url) => ({ url }));
132
+ const finalReqs = await transformAndFilterReqsWithUserInput(reqs, ctx, input);
133
+ const queue = await Actor.openRequestQueue(input.requestQueueId);
134
+ await queue.addRequests(finalReqs);
57
135
 
58
- // TODO
136
+ await onAfterHandler(ctx);
137
+ });
59
138
 
60
- ## Library contents
139
+ router.addDefaultHandler(async (ctx) => {
140
+ await onBeforeHandler(ctx);
61
141
 
62
- Crawlee One includes a set of utility functions for:
142
+ const url = ctx.request.loadedUrl || ctx.request.url;
143
+
144
+ if (url.match(/example\.com\/home/i)) {
145
+ const req = { url, userData: { label: 'mainPage' } };
146
+ const finalReqs = await transformAndFilterReqsWithUserInput([req], ctx, input);
147
+ const queue = await Actor.openRequestQueue(input.requestQueueId);
148
+ await queue.addRequests(finalReqs);
149
+ }
150
+
151
+ await onAfterHandler(ctx);
152
+ });
153
+
154
+ const crawler = new CheerioCrawler({
155
+ ...input,
156
+ requestHandler: router,
157
+ });
158
+
159
+ if (onReadyFn) await onReadyFn({ crawler, router, input });
160
+ else crawler.run(['https://...']);
161
+ });
162
+ ```
163
+
164
+ As you can see, there's a lot going on behind the scenes, and that's far from everything.
165
+
166
+ > \* Apify can be replaced with your own implementation, so the data can be sent elsewhere, not just to Apify. This is set by the `io` options.
167
+
168
+ ## What can CrawleeOne do?
169
+
170
+ Beside the main `crawleeOne` function for running crawlers,
171
+ CrawleeOne also includes helpers and types for:
63
172
 
64
173
  - Actor boilerplating
65
- - Allows to set crawler settings from Apify input
66
- - Enrich data with metadata
67
- - Configure logging level
68
- - Routing
69
- - Error handling
70
- - Save errors to separate Apify dataset
71
- - Send errors to Sentry
174
+ - Code generation
175
+ - Configuring logging and error handling
176
+ - E.g. Save errors to separate dataset or send to telemetry
177
+ - Data and request filtering and post-processing
178
+ - E.g. Enrich data with metadata
179
+ - Routing
72
180
  - Testing actors
73
- - Manipulating DOM
74
181
  - Actor migration (conceptually similar to database migration)
75
182
  - CLI utility for updating actors via apify-client
76
- - Apify's `actor.json` generation
77
183
  - Privacy compliance
78
184
  - Metamorphing
79
185
 
80
- ## Actor Input Reference
186
+ CrawleeOne supports many common and advanced web scraping use cases. See [Playbook & Use cases](#playbook--use-cases) for the overview of the use cases.
187
+
188
+ See the section [Usage (for end users)](#usage-for-end-users) for how CrawleeOne looks from user's perspective.
81
189
 
82
- [See here](./docs/reference-input.md) the full list of all possible input options that a Crawlee One crawler can have.
190
+ ## Playbook & Use cases
83
191
 
84
- Crawlee One allows you to configure the following via the input:
192
+ Web crawlers written with CrawleeOne can be configured via their `input` field to handle following advanced use cases:
193
+
194
+ - [1. Import URLs to scrape from your database (or elsewhere)](./docs/playbook-01-import-urls.md)
195
+ - [2. Proxy: Avoid rate limiting and geo-blocking](./docs/playbook-02-proxy.md)
196
+ - [3. Simple transformations: Select and rename columns, set how many entries to scrape](./docs/playbook-03-results-mapping-simple.md)
197
+ - [4. Advanced transformations & aggregations](./docs/playbook-04-results-mapping-advanced.md)
198
+ - [5. Filtering results](./docs/playbook-05-results-filtering.md)
199
+ - [6. Deciding what URLs to scrape: Filtering and transforming requests](./docs/playbook-06-requests-mapping-filtering.md)
200
+ - [7. Caching: Extract only new or only previously-seen entries](./docs/playbook-07-caching.md)
201
+ - [8. Configure crawler settings and performance](./docs/playbook-08-settings-performance.md)
202
+ - [9. Create data pipelines from scrapers using metamorph](./docs/playbook-09-data-pipelines-metamorph.md)
203
+ - [10. Privacy compliance: Include or omit personal data](./docs/playbook-10-privacy-compliance.md)
204
+ - [11. Capture errors](./docs/playbook-11-errors.md)
205
+ - [12. Source control: Keep scraper configuration in sync](./docs/playbook-12-source-control.md)
85
206
 
86
- - [Programmatically-defined input](./docs/reference-input.md#programmatic-input-advanced)
207
+ ## Actor input reference
208
+
209
+ [See here the full list of all possible input options](./docs/reference-input.md) that a CrawleeOne crawler can have.
210
+
211
+ CrawleeOne allows you to configure the following via the `input`:
212
+
213
+ - [Input from URL or custom function](./docs/reference-input.md#programmatic-input-advanced)
87
214
  - [Starting URLs](./docs/reference-input.md#starting-urls)
88
215
  - [Proxy](./docs/reference-input.md#proxy)
89
216
  - [Privacy & Data governance (GDPR)](./docs/reference-input.md#privacy--data-governance-gdpr)
@@ -95,11 +222,711 @@ Crawlee One allows you to configure the following via the input:
95
222
  - [Logging & Error handling](./docs/reference-input.md#logging--error-handling-advanced)
96
223
  - [Integrations (Metamorphing)](./docs/reference-input.md#integrations-metamorphing-advanced)
97
224
 
225
+ ## Usage (for developers)
226
+
227
+ Let's revisit the [previous example](#minimal-example) this time with more options and explanations:
228
+
229
+ ```ts
230
+ import { Actor } from 'apify';
231
+ import { crawleeOne, apifyIO, createSentryTelemetry } from 'crawlee-one';
232
+
233
+ // When you call `crawleeOne`, following happens:
234
+ // 1. Crawler initializes.
235
+ // 2. Crawler processes the URLs in the RequestQueue.
236
+ // 3. For each request, it decides which handler to give
237
+ // the request to, based on the `match` fields.
238
+ // 4. The handler receives the request.
239
+ // 5. The handler may scrape data and save them with `pushData`.
240
+ // 6. The handler may find more URLs to scrape, and enqueue
241
+ // them with `pushRequests`.
242
+ // 7. If crawler runs out of Requests and `keepAlive` is not set,
243
+ // the crawler ends.
244
+ await crawleeOne({
245
+ // Specify Crawlee Crawler class. Changing this has 2 consequences:
246
+ // 1. It changes the crawler strategy - E.g. whether it uses Playwright,
247
+ // or makes plain HTTP requests.
248
+ // 2. It decides what data and methods will be available on the `context`
249
+ // object in the route `handler`.
250
+ //
251
+ // Available types are: 'basic', 'http', 'jsdom', 'cheerio', 'playwright', 'puppeteer'
252
+ type: 'cheerio',
253
+
254
+ // (Optional) Input allows you to configure everything:
255
+ // - The crawler class and configuration like max/min concurrency
256
+ // and timeouts.
257
+ // - To load URLs to scrape from remote URL or database.
258
+ // - Data and Request transformations, filtering, and more.
259
+ // - Run requests in batches.
260
+ // - Downstream crawler with Apify's "metamorph".
261
+ //
262
+ // See the Actor input reference for all input fields.
263
+ //
264
+ // Specify input if you plan to use the crawler yourself,
265
+ // otherwise use `inputDefaults` or set `mergeInput`.
266
+ //
267
+ // If you deploy the scraper on platforms like Apify,
268
+ // then your users will populate this field via `Actor.getInput()`.
269
+ input: {
270
+ outputTransform: (item) => { ... },
271
+ },
272
+
273
+ // Input fields that can be overridden by you or the crawler users
274
+ // See the Actor input reference for all input fields.
275
+ inputDefaults: {
276
+ // ...
277
+ },
278
+
279
+ // By default, if you specify the `input`, then user's input
280
+ // (from `Actor.getInput()`) will be ignored. Set this to `true`
281
+ // or to a custom merge function to include both your and user's `input` data.
282
+ mergeInput: true,
283
+ // is same as:
284
+ mergeInput: ({ defaults, overrides, env }) => ({ ...defaults, ...env, ...overrides }),
285
+
286
+ // These settings are passed to Crawler class initialization.
287
+ // E.g. if `type: 'playwright'`, then this config is used as:
288
+ // `new PlaywrightCrawler(crawlerConfig);`
289
+ //
290
+ // Set `crawlerConfig` for config that cannot be configured via `input`,
291
+ // or when you need the crawler to use specific settings and you don't
292
+ // want users to override that.
293
+ crawlerConfig: {
294
+ maxRequestsPerMinute: 120,
295
+ requestHandlerTimeoutSecs: 180,
296
+ headless: true,
297
+ // ...
298
+ },
299
+
300
+ // Same as `crawlerConfig`, but these values can be overridden by the user's
301
+ // config from `input`, and by values in `crawlerConfig`.
302
+ crawlerConfigDefaults: {
303
+ // ...
304
+ },
305
+
306
+ // Each URL passed to the crawler will be compared against the routes,
307
+ // and passed to the handler of the first route that matches.
308
+ routes: {
309
+ mainPage: {
310
+ // Regex that a URL must match, or a predicate function, or array of the two.
311
+ match: /example\.com\/home/i,
312
+ // The handler function is equivalent to Crawlee's `Router.addHandler`,
313
+ // but with extra fields.
314
+ handler: async (ctx) => {
315
+ const { $, request, pushData, pushRequests } = ctx;
316
+ // Scrape data from the page
317
+ const data = [
318
+ /* ... */
319
+ ];
320
+
321
+ // Save the scraped data. When you save data with `ctx.pushData`,
322
+ // then you can filter, transform, limit, redact, and more.
323
+ //
324
+ // NOTE: `privacyMask` defines which fields are considered
325
+ // personal information. To ensure we think about privacy
326
+ // compliance when scraping, this field is required.
327
+ await pushData(data, {
328
+ privacyMask: { author: true },
329
+ });
330
+
331
+ // If there's more URLs to scrape, enqueue them.
332
+ // When you enqueue data with `ctx.pushRequests`,
333
+ // you can filter, transform, limit, and more.
334
+ const reqs = ['https://...'].map((url) => ({ url }));
335
+ await pushRequests(reqs);
336
+ },
337
+ },
338
+ },
339
+
340
+ hooks: {
341
+ // By default, CrawleeOne calls `Crawler.run()` once ready.
342
+ // If you override it, you have to call it yourself.
343
+ onReady: async (inst) => {
344
+ // E.g. in this example, user can select to scrape all entries
345
+ // or a certain kind by setting a custom `datasetType` input field.
346
+ const startUrls: string[] = [];
347
+ if (!inst.startUrls.length && inst.input?.datasetType) {
348
+ startUrls.push(datasetTypeToUrl[inst.input?.datasetType]);
349
+ }
350
+ await inst.runCrawler(startUrls);
351
+ },
352
+
353
+ // You can add hooks before and after each `handler` call:
354
+ // These hooks receive the same "context" as the handler functions,
355
+ // which is equivalent to Crawlee's `Router.addHandler`, but with extra fields.
356
+ onBeforeHandler: (ctx) => { /* ... */ },
357
+ onAfterHandler: (ctx) => { /* ... */ },
358
+
359
+ // If you run the crawler on Apify, or otherwise provide the crawler to others,
360
+ // then it's a good practice to validate their input.
361
+ validateInput: (input) => {
362
+ const schema = Joi.object({ ... });
363
+ Joi.assert(input, schema);
364
+ },
365
+ },
366
+
367
+ // Configure the Crawlee proxy. See Crawlee's `ProxyConfiguration`
368
+ // By default, no proxy is used.
369
+ //
370
+ // NOTE: DO NOT set proxy if you are deploying the crawler in Apify,
371
+ // and you want the user to specify the proxy!
372
+ proxy: Actor.createProxyConfiguration({ ... }),
373
+
374
+ // Provide a telemetry instance that is used for tracking errors.
375
+ // By default, no telemetry is used.
376
+ //
377
+ // See the docs for `CrawleeOneTelemetry`.
378
+ //
379
+ // In this example we track errors to Sentry.
380
+ telemetry: createSentryTelemetry({
381
+ dsn: 'https://xxxxxxxxxxxxxxxxxxxxxxx@yyyyyyy.ingest.sentry.io/zzzzzzzzzzzzzzzzzzzzz',
382
+ tracesSampleRate: 1.0,
383
+ serverName: 'myCrawler',
384
+ }),
385
+
386
+ // Provide an instance that is responsible for state management:
387
+ // - Adding scraped data to datasets
388
+ // - Adding and removing requests to/from queues
389
+ // - Cache storage
390
+ //
391
+ // This is an API based on Apify's `Actor` utility class, which is also
392
+ // the default.
393
+ //
394
+ // You don't need to override this in most of the cases.
395
+ //
396
+ // By default, the data is saved and kept locally in `./storage` directory.
397
+ // And if the cralwer runs in Apify's platform then it uses
398
+ // Apify's cloud storage.
399
+ //
400
+ // See the docs for `CrawleeOneIO`.
401
+ io: apifyIO,
402
+
403
+ // Provide a custom router instance. By default, uses Router from Crawlee.
404
+ // See the docs for Crawlee's Router.
405
+ router: myCustomRouter(),
406
+ });
407
+ ```
408
+
409
+ > You can find the full type definition of `crawleeOne` and its arguments here:
410
+ >
411
+ > - [crawleeOne](./docs/typedoc/modules.md#crawleeone)
412
+ > - [CrawleeOneArgs](./docs/typedoc/interfaces/CrawleeOneArgs.md)
413
+ >
414
+ > To learn more about `pushData` and `pushRequests`, see:
415
+ >
416
+ > - [pushData](./docs/typedoc/modules.md#pushdata)
417
+ > - NOTE: When you use `pushData` from within a handler, you omit the first argument (`ctx`).
418
+ > - [pushRequests](./docs/typedoc/modules.md#pushrequests)
419
+
420
+ ### Route handler context
421
+
422
+ Each route handler receives a context object, as defined by [Crawlee Router](https://crawlee.dev/api/core/class/Router).
423
+
424
+ CrawleeOne extends this context object with extra properties:
425
+
426
+ - `actor`: [`CrawleeOneActorInst`](./docs/typedoc/interfaces/CrawleeOneActorInst.md) - CrawleeOne instance
427
+ - `metamorph`: [`Metamorph`](./docs/typedoc/modules.md#metamorph) - Shortcut to `actor.metamorph`
428
+ - `pushData`: [`pushData`](./docs/typedoc/modules.md#pushData) - Shortcut to `actor.pushData`
429
+ - `pushRequests`: [`pushRequests`](./docs/typedoc/modules.md#pushRequests) - Shortcut to `actor.pushRequests`
430
+
431
+ ```ts
432
+ await crawleeOne({
433
+ // ...
434
+ routes: {
435
+ mainPage: {
436
+ match: /example\.com\/page/i,
437
+ // The `ctx` is the Crawlee Router context + extra properties
438
+ handler: (ctx) => {
439
+ // Crawlee properties
440
+ ctx.log.info('bla bla...')
441
+ const url = ctx.request.loadedUrl || ctx.request.url;
442
+ ctx.response
443
+ const $ = ctx.parseWithCheerio();
444
+ // And more...
445
+
446
+ // Extra props
447
+
448
+ // 1. CrawleeOne instance (type: CrawleeOneActorInst):
449
+ // - Save scraped items
450
+ await ctx.actor.pushData(scrapedItems);
451
+
452
+ // - Enqueue more URLs to scrape
453
+ const id = Math.floor(Math.random() * 100);
454
+ const url = `https://example.com/resource/${id}`;
455
+ await ctx.actor.pushRequests([{ url }]);
456
+
457
+ // - Access datasets and request queues
458
+ const dataset = await ctx.actor.io.openDataset();
459
+ const reqQueue = await ctx.actor.io.openRequestQueue();
460
+ const keyValStore = await ctx.actor.io.openKeyValueStore();
461
+
462
+ // - Access resolved `input`
463
+ if (ctx.actor.input.myCustomInput) {
464
+ // do something
465
+ }
466
+
467
+ // - Access resolved start URLs
468
+ if (ctx.actor.startUrls.length) {
469
+ // do something
470
+ }
471
+
472
+ // - Interact with state that can be accessed in hooks like `outputTransform` or `outputFilter`
473
+ ctx.actor.state.myVar = 1;
474
+
475
+ // 2. `pushData`, `pushRequests`, and `metamorph` can be accessed also
476
+ // directly from the `ctx` object
477
+ await ctx.pushData(scrapedItems) // Same as `ctx.actor.pushData`
478
+ await ctx.pushRequests(urlsToScrape) // Same as `ctx.actor.pushRequests`
479
+ await ctx.metamorph('nextCrawlerId', ...) // Same as `ctx.actor.metamorph`
480
+ },
481
+ }
482
+ },
483
+ });
484
+ ```
485
+
486
+ > The `actor` object is integral to CrawleeOne.
487
+ > [See here the full list of properties](./docs/typedoc/interfaces/CrawleeOneActorInst.md).
488
+
489
+ ## Deploying to Apify
490
+
491
+ See either of the two projects as examples:
492
+
493
+ - [SKCRIS Scraper](https://github.com/JuroOravec/apify-actor-skcris)
494
+ - [Profesia.sk Scraper](https://github.com/JuroOravec/apify-actor-profesia-sk)
495
+
496
+
497
+ #### 1. Write the crawler with CrawleeOne
498
+
499
+ Either use the example projects above or use your own boilerplate project, but remember that Apify requires you to Dockerize the
500
+ project in order for it to be deployed on their platform.
501
+
502
+ Remember to install `crawlee-one` package.
503
+
504
+ #### 2. Define the crawler's input
505
+
506
+ You need to tell Apify what kind of input can be passed to your crawler.
507
+ This is done by defining the
508
+ [`actor.json`](https://docs.apify.com/platform/actors/development/actor-definition/actor-json)
509
+ file.
510
+ You need to set this if you want to support the described [use cases](#playbook--use-cases).
511
+
512
+ For that, you will need to:
513
+
514
+ 1. Install [`apify-actor-config`](https://github.com/JuroOravec/apify-actor-config) as a dev dependency:
515
+
516
+ ```sh
517
+ npm i -D apify-actor-config
518
+ ```
519
+
520
+ [`apify-actor-config`](https://github.com/JuroOravec/apify-actor-config) is a sister package focused solely on working with and generating
521
+ Apify's `actor.json` config files.
522
+
523
+ 2. Write a JS/TS file where you will only define your config and export it as the *default* export.
524
+
525
+ [See here the example config file from Profesia.sk Scraper](https://github.com/JuroOravec/apify-actor-profesia-sk/blob/main/src/config.ts).
526
+
527
+ Note that to make use of the CrawleeOne inputs, we need to import `allActorInputs` and pass it to
528
+ `properties` field of `createActorInputSchema`.
529
+
530
+ ```ts
531
+ import { allActorInputs } from 'crawlee-one';
532
+ import { createActorConfig, createActorInputSchema } from 'apify-actor-config';
533
+
534
+ const inputSchema = createActorInputSchema({
535
+ schemaVersion: 1,
536
+ // ...
537
+ properties: {
538
+ ...customActorInput,
539
+ ...allActorInputs,
540
+ },
541
+ });
542
+
543
+ const config = createActorConfig({
544
+ actorSpecification: 1,
545
+ // ...
546
+ input: inputSchema,
547
+ });
548
+
549
+ export default config;
550
+ ```
551
+
552
+ Also note that we are able to override the defaults set in `allActorInputs` by directly
553
+ modifying the object:
554
+
555
+ ```ts
556
+ allActorInputs.requestHandlerTimeoutSecs.prefill = 60 * 3;
557
+ ```
558
+
559
+ 3. Build / transpile the config to vanilla JS if necessary.
560
+
561
+ In Profesia.sk Scraper, the config is defined as a TypeScript file, but `apify-actor-config` currently supports only JS files.
562
+
563
+ So if you are also using anything other than plain JavaScript, then you will need to build / transpile your project. Do so only once you're happy with the input fields and their defaults.
564
+
565
+ 4. Generate `actor.json` file
566
+
567
+ Run the `npx apify-actor-config gen` command and point it to the config JS file:
568
+
569
+ ```sh
570
+ npx apify-actor-config gen -c ./path/to/dist/config.js
571
+ ```
572
+
573
+ Optionally, set this as a script in `package.json`.
574
+
575
+ The command should generate a config file in `./actor/actor.json`, with all the inputs from `crawlee-one`. 🚀
576
+
577
+ 5. Deploy the project to Apify.
578
+
579
+ Now head over to Apify to deploy the crawler there. [See their docs on deployment](https://docs.apify.com/academy/deploying-your-code/deploying).
580
+
581
+ 6. Verify that the crawler offers all the inputs.
582
+
583
+ When you now go to see your crawler on Apify, you should see
584
+ that you can configure all kinds of various inputs. Congrats, you've got it working! 🚀
585
+
586
+ See the screenshot in the next section ([Usage (for end users)](#usage-for-end-users)) to see what the input looks like in the Apify UI.
587
+
588
+ ## Usage (for end users)
589
+
590
+ As a user of a crawler that was written with CrawleeOne, you have the option to
591
+ configure the crawler, and transform, filter & limit the scraped data and the "requests" (URLs to scrape).
592
+
593
+ CrawleeOne crawlers allow you to do literally anything with the scraped data.
594
+ [See the common use cases here](#playbook--use-cases).
595
+
596
+ [See here](./docs/user-guide.md) for how to use CrawleeOne web scrapers through the Apify platform.
597
+
598
+ ![Apify actor input page](./docs/user-guide-input-ui-open.png)
599
+
600
+ ## Codegen & Config file
601
+
602
+ With CrawleeOne, you can generate TypeScript types and helper functions to create new instances of CrawleeOne with full type support.
603
+
604
+ With these types:
605
+
606
+ - You get fully-typed scraper definition.
607
+ - You can easily split the project across multiple files, as the corresponding types can be imported.
608
+
609
+ The final result can look like this:
610
+
611
+ ```ts
612
+ // ./routes.ts
613
+ import { profesiaRoute } from './__generated__/crawler';
614
+
615
+ const otherPageRoute: profesiaRoute = {
616
+ match: (url) => url.match(/example\.com\/home/i),
617
+ handler: async (ctx) => {
618
+ // ...
619
+ await ctx.pushData(...);
620
+ },
621
+ };
622
+ ```
623
+
624
+ ```ts
625
+ // ./main.ts
626
+ import { profesiaCrawler, profesiaRoute } from './__generated__/crawler';
627
+ import { otherPageRoute } from './routes';
628
+
629
+ await profesiaCrawler({
630
+ hooks: {
631
+ validateInput,
632
+ },
633
+ routes: {
634
+ mainPage: {
635
+ match: /example\.com\/home/i,
636
+ handler: (ctx) => {
637
+ // NOTE: The type of `ctx` is inferred
638
+ ctx.parseWithCheerio();
639
+ // ...
640
+ },
641
+ },
642
+ otherPage: otherPageRoute,
643
+ },
644
+ });
645
+ ```
646
+
647
+ ### 1. Define the crawler schema in a config
648
+
649
+ To get started, you need to define the scraper schema.
650
+
651
+ Config may look like this:
652
+
653
+ ```js
654
+ module.exports = {
655
+ version: 1, // Currently always 1
656
+ schema: {
657
+ crawlers: {
658
+ // Crawler name
659
+ mainCrawler: {
660
+ // `type` is one of 'basic', 'http', 'jsdom', 'cheerio', 'playwright', 'puppeteer'
661
+ type: 'playwright',
662
+ routes: ['listingPage', 'detailPage'],
663
+ },
664
+ },
665
+ },
666
+ };
667
+ ```
668
+
669
+ Here is an example if we wrote the config in YAML and defined multiple crawlers:
670
+
671
+ ```yaml
672
+ version: 1
673
+ schema:
674
+ crawlers:
675
+ main:
676
+ type: 'playwright'
677
+ routes: ['listingPage', 'detailPage']
678
+ other:
679
+ type: 'cheerio'
680
+ routes: ['someNoJSPage']
681
+ ```
682
+
683
+ CrawleeOne uses
684
+ [cosmiconfig](https://github.com/cosmiconfig/cosmiconfig#usage-for-tooling-developers) to import the config. This means that you can define the config as any of the following:
685
+
686
+ - `crawlee-one` property in `package.json`
687
+ - `.crawlee-onerc` file in JSON or YAML format
688
+ - `.crawlee-onerc.json`, `.crawlee-onerc.yaml`, `.crawlee-onerc.yml`, `.crawlee-onerc.js`, `.crawlee-onerc.ts`, `.crawlee-onerc.mjs`, or `.crawlee-onerc.cjs` file
689
+ - `crawlee-onerc`, `crawlee-onerc.json`, `crawlee-onerc.yaml`, `crawlee-onerc.yml`, `crawlee-onerc.js`, `crawlee-onerc.ts` or `crawlee-onerc.cjs` file inside a `.config` subdirectory
690
+ - `crawlee-one.config.js`, `crawlee-one.config.ts`, `crawlee-one.config.mjs`, or `crawlee-one.config.cjs` file
691
+
692
+ ### 2. Generate types
693
+
694
+ To generate the types from the config, run the `generate` command:
695
+
696
+ ```sh
697
+ npx crawlee-one generate -o ./path/to/__generated__/file.ts
698
+ ```
699
+
700
+ ### 3. Use generated types
701
+
702
+ Once generated, we can use the types right away:
703
+
704
+ ```ts
705
+ import { mainCrawler } from './__generated__/file.ts';
706
+
707
+ await mainCrawler({
708
+ routes: {
709
+ listingPage: {
710
+ match: /example\.com\/home/i,
711
+ handler: (ctx) => {
712
+ // NOTE: The type of `ctx` is inferred
713
+ ctx.parseWithCheerio();
714
+ // ...
715
+ },
716
+ },
717
+ detailPage: {
718
+ // ...
719
+ },
720
+ },
721
+ });
722
+ ```
723
+
724
+ Or we can even run multiple crawlers simultaneously. This can be useful in cases where for some pages you need browser automation like Playwright, whereas for others you don't.
725
+
726
+ ```ts
727
+ import { mainCrawler, otherCrawler } from './__generated__/file.ts';
728
+
729
+ // Run the crawlers simultaneously
730
+ const mainPromise = mainCrawler({
731
+ routes: {
732
+ listingPage: {
733
+ match: /example\.com\/home/i,
734
+ handler: async (ctx) => {
735
+ ctx.page.locator('...');
736
+ // ...
737
+
738
+ // Send URLs to scrape to the Cheerio crawler
739
+ await ctx.pushRequests([{ url: ... }], { requestQueueId: 'crawleeQueue' });
740
+ },
741
+ },
742
+ detailPage: {
743
+ // ...
744
+ },
745
+ },
746
+ });
747
+
748
+ const otherPromise = otherCrawler({
749
+ input: {
750
+ requestQueueId: 'crawleeQueue',
751
+ },
752
+ routes: {
753
+ someNoJSPage: {
754
+ match: /example\.com\/home/i,
755
+ handler: async (ctx) => {
756
+ // NOTE: The type of `ctx` is inferred
757
+ ctx.parseWithCheerio();
758
+ // ...
759
+ await ctx.pushData(...)
760
+ },
761
+ },
762
+ },
763
+ });
764
+
765
+ await Promise.all([mainPromise, otherPromise]);
766
+ ```
767
+
768
+ ## Custom telemetry integration (CrawleeOneTelemetry)
769
+
770
+ You may want to track errors to a custom service. In that case, you can define and pass
771
+ a custom telemetry instance to the `telemetry` argument of
772
+ [`crawleeOne`](./docs/typedoc/interfaces/CrawleeOneArgs.md).
773
+
774
+ The instance needs to implement the
775
+ [`CrawleeOneTelemetry`](./docs/typedoc/interfaces/CrawleeOneTelemetry.md)
776
+ interface:
777
+
778
+ ```ts
779
+ interface CrawleeOneTelemetry {
780
+ setup: (actor: CrawleeOneActorInst) => Promise<void> | void;
781
+ onSendErrorToTelemetry: (
782
+ error: Error,
783
+ report: object, // Object with data on the error
784
+ options: {
785
+ io?: CrawleeOneIO;
786
+ allowScreenshot?: boolean;
787
+ reportingDatasetId?: string;
788
+ },
789
+ ctx: CrawleeOneCtx
790
+ ) => Promise<void> | void;
791
+ }
792
+ ```
793
+
794
+ See existing integrations for inspiration:
795
+ - [Sentry](./src/lib/telemetry/sentry.ts)
796
+
797
+ Based on the above, here's an example of a custom telemetry implementation
798
+ that saves the errors to the local file system:
799
+
800
+ ```ts
801
+ import fs from 'fs';
802
+ import type { CrawleeOneCtx, CrawleeOneTelemetry } from 'crawlee-one';
803
+
804
+ export const createFsTelemetry = <T extends CrawleeOneTelemetry<CrawleeOneCtx>>() => {
805
+ const timestamp = new Date().getTime();
806
+ let errors = 0;
807
+
808
+ return {
809
+ setup: async (actor) => {
810
+ await fs.promises.mkdir('./temp/error', { recursive: true });
811
+ },
812
+ onSendErrorToTelemetry: async (error, report, options, ctx) => {
813
+ // E.g. '1694344665557_00001.json'
814
+ const filename = './temp/error/' + timestamp + '_' + (errors++).toString().padStart(5, '0') + '.json';
815
+ const data = JSON.stringify({ error, report });
816
+ await fs.promises.writeFile(filename, data, 'utf-8');
817
+ },
818
+ } as T;
819
+ };
820
+
821
+ await crawleeOne({
822
+ telemetry: createFsTelemetry(),
823
+ // ...
824
+ });
825
+ ```
826
+
827
+ ## Custom platform and storage integration (CrawleeOneIO)
828
+
829
+ By default, CrawleeOne uses
830
+ [Apify](https://github.com/apify/apify-sdk-js/blob/master/packages/apify/README.md)
831
+ to manage datasets, request queue, and other platform-specific features.
832
+
833
+ In most of the cases, this should be fine, because Apify uses local file system
834
+ when the crawler is not running inside Apify's cloud platform.
835
+
836
+ Sometimes, you may want to send the data to a custom dataset, or use a shared service
837
+ for accessing requests or cache storage, or otherwise override the default behaviour.
838
+ In those cases, you can define and pass a custom
839
+ [`CrawleeOneIO`](./docs/typedoc/interfaces/CrawleeOneIO.md)
840
+ instance to the `io` argument of
841
+ [`crawleeOne`](./docs/typedoc/interfaces/CrawleeOneArgs.md).
842
+
843
+ The instance needs to implement the
844
+ [`CrawleeOneIO`](./docs/typedoc/interfaces/CrawleeOneIO.md)
845
+ interface:
846
+
847
+ ```ts
848
+ interface CrawleeOneIO {
849
+ openDataset: (id?: string | null) => MaybePromise<CrawleeOneDataset>;
850
+ openRequestQueue: (id?: string | null) => MaybePromise<CrawleeOneRequestQueue>;
851
+ openKeyValueStore: (id?: string | null) => MaybePromise<CrawleeOneKeyValueStore>;
852
+ getInput: () => Promise<Input | null>;
853
+ triggerDownstreamCrawler: (
854
+ targetActorId: string,
855
+ input?: TInput,
856
+ options?: {
857
+ build?: string;
858
+ }
859
+ ) => Promise<void>;
860
+ runInContext: (userFunc: () => MaybePromise<unknown>, options?: ExitOptions) => Promise<void>;
861
+ createDefaultProxyConfiguration: (
862
+ input?: T | Readonly<T>
863
+ ) => MaybePromise<ProxyConfiguration | undefined>;
864
+ isTelemetryEnabled: () => MaybePromise<boolean>;
865
+ generateErrorReport: (
866
+ input: CrawleeOneErrorHandlerInput,
867
+ options: PickRequired<CrawleeOneErrorHandlerOptions, 'io'>
868
+ ) => MaybePromise<object>;
869
+ generateEntryMetadata: (ctx: Ctx) => MaybePromise<TMetadata>;
870
+ }
871
+ ```
872
+
873
+ See existing integrations for inspiration:
874
+ - [Apify](./src/lib/integrations/apify.ts)
875
+
876
+ Based on the above, here's an example of a custom CrawleeOneIO implementation
877
+ that overrides the datasets to send them to a custom HTTP endpoint.
878
+
879
+ ```ts
880
+ import { type CrawleeOneIO, apifyIO } from 'crawlee-one';
881
+
882
+ export const createCustomIO = (baseUrl: string) => {
883
+ const createDatasetIO = (id?: string) => {
884
+ const fetchAllItems = () => {
885
+ const endpoint = `${baseUrl}/dataset/${id ?? 'default'}/all`;
886
+ return fetch(endpoint).then((d) => d.json());
887
+ };
888
+
889
+ const postItems = (items: any[]) => {
890
+ const endpoint = `${baseUrl}/dataset/${id ?? 'default'}`;
891
+ return fetch(endpoint, {
892
+ method: 'POST',
893
+ body: JSON.stringify(items),
894
+ }).then((d) => d.json());
895
+ };
896
+
897
+ return {
898
+ pushData: postItems,
899
+ getItems: fetchAllItems,
900
+ getItemsCount: () => fetchAllItems().then((d) => d.length),
901
+ };
902
+ };
903
+
904
+ return {
905
+ ...apifyIO,
906
+ openDataset: createDatasetIO,
907
+ } as CrawleeOneIO;
908
+ };
909
+
910
+ await crawleeOne({
911
+ io: createCustomIO('https://example.com'),
912
+ // ...
913
+ });
914
+ ```
915
+
98
916
  ## Example projects
99
917
 
100
918
  - [SKCRIS Scraper](https://github.com/JuroOravec/apify-actor-skcris)
101
919
  - [Profesia.sk Scraper](https://github.com/JuroOravec/apify-actor-profesia-sk)
102
920
 
103
- ---
921
+ ## Contributing
922
+
923
+ Found a bug or have a feature request? Please open a new issue.
924
+
925
+ When contributing with your code, please follow the standard best practices:
926
+
927
+ - Make a fork with your changes, then make a Merge Request to merge it
928
+ - Be polite
929
+
930
+ ## Supporting CrawleeOne
104
931
 
105
- \* Apify can be replaced with your own implementation, so the data can be sent elsewhere, not just to Apify. This is set by the `io` options.
932
+ CrawleeOne is a labour of love. If you like what I do, you can support me on [BuyMeACoffee](https://www.buymeacoffee.com/jurooravec).