crawlee-one 1.0.8 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/cjs/composer.d.ts +0 -0
- package/dist/cjs/composer.js +93 -0
- package/dist/cjs/composer.js.map +1 -0
- package/dist/cjs/index.d.ts +3 -2
- package/dist/cjs/index.js +3 -5
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/lib/actor/actor.d.ts +15 -20
- package/dist/cjs/lib/actor/actor.js +47 -29
- package/dist/cjs/lib/actor/actor.js.map +1 -1
- package/dist/cjs/lib/actor/types.d.ts +37 -22
- package/dist/cjs/lib/actor/types.js.map +1 -1
- package/dist/cjs/lib/config.d.ts +6 -6
- package/dist/cjs/lib/config.js +3 -3
- package/dist/cjs/lib/config.js.map +1 -1
- package/dist/cjs/lib/error/errorHandler.d.ts +22 -15
- package/dist/cjs/lib/error/errorHandler.js +10 -32
- package/dist/cjs/lib/error/errorHandler.js.map +1 -1
- package/dist/cjs/lib/integrations/types.d.ts +6 -4
- package/dist/cjs/lib/integrations/types.js.map +1 -1
- package/dist/cjs/lib/log.d.ts +3 -3
- package/dist/cjs/lib/log.js +1 -1
- package/dist/cjs/lib/log.js.map +1 -1
- package/dist/cjs/lib/router/router.d.ts +35 -12
- package/dist/cjs/lib/router/router.js +52 -10
- package/dist/cjs/lib/router/router.js.map +1 -1
- package/dist/cjs/lib/router/types.d.ts +21 -14
- package/dist/cjs/lib/router/types.js +7 -0
- package/dist/cjs/lib/router/types.js.map +1 -1
- package/dist/cjs/lib/telemetry/sentry.d.ts +3 -0
- package/dist/cjs/lib/telemetry/sentry.js +71 -0
- package/dist/cjs/lib/telemetry/sentry.js.map +1 -0
- package/dist/cjs/lib/telemetry/types.d.ts +28 -0
- package/dist/cjs/lib/telemetry/types.js +3 -0
- package/dist/cjs/lib/telemetry/types.js.map +1 -0
- package/dist/cjs/types.d.ts +25 -6
- package/dist/cjs/types.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -53,6 +53,10 @@ Web crawlers written with Crawlee One can be configured via their input to handl
|
|
|
53
53
|
|
|
54
54
|

|
|
55
55
|
|
|
56
|
+
## How to write a Crawlee One crawler
|
|
57
|
+
|
|
58
|
+
// TODO
|
|
59
|
+
|
|
56
60
|
## Library contents
|
|
57
61
|
|
|
58
62
|
Crawlee One includes a set of utility functions for:
|
|
@@ -94,6 +98,7 @@ Crawlee One allows you to configure the following via the input:
|
|
|
94
98
|
## Example projects
|
|
95
99
|
|
|
96
100
|
- [SKCRIS Scraper](https://github.com/JuroOravec/apify-actor-skcris)
|
|
101
|
+
- [Profesia.sk Scraper](https://github.com/JuroOravec/apify-actor-profesia-sk)
|
|
97
102
|
|
|
98
103
|
---
|
|
99
104
|
|
|
File without changes
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// @ts-nocheck
|
|
3
|
+
// interface ComposerCrawlerDef {
|
|
4
|
+
// }
|
|
5
|
+
// ```
|
|
6
|
+
// crawler mainCrawler {
|
|
7
|
+
// type: playwright
|
|
8
|
+
// datasetId: '45678'
|
|
9
|
+
// errorDatasetId: '098765'
|
|
10
|
+
// options: {
|
|
11
|
+
// ...
|
|
12
|
+
// }
|
|
13
|
+
// }
|
|
14
|
+
// route detailPage {
|
|
15
|
+
// // NOTE: If `match` is a regex, the regex is compared against the URL
|
|
16
|
+
// match: /[\W]profesia\.sk\/praca\/zoznam-[a-z0-9-]+\/?(?:[?#~]|$)/i
|
|
17
|
+
// handler: detailPageHandler
|
|
18
|
+
// }
|
|
19
|
+
// ```
|
|
20
|
+
// As JS:
|
|
21
|
+
// ```js
|
|
22
|
+
// import { detailPageHandler } from './handlers';
|
|
23
|
+
// const scraperDef = {
|
|
24
|
+
// crawlers: {
|
|
25
|
+
// mainCrawler {
|
|
26
|
+
// type: playwright
|
|
27
|
+
// datasetId: '45678'
|
|
28
|
+
// errorDatasetId: '098765'
|
|
29
|
+
// options: {
|
|
30
|
+
// ...
|
|
31
|
+
// },
|
|
32
|
+
// }
|
|
33
|
+
// },
|
|
34
|
+
// routes: {
|
|
35
|
+
// detailPage: {
|
|
36
|
+
// match: /[\W]profesia\.sk\/praca\/zoznam-[a-z0-9-]+\/?(?:[?#~]|$)/i,
|
|
37
|
+
// handler: detailPageHandler,
|
|
38
|
+
// }
|
|
39
|
+
// }
|
|
40
|
+
// };
|
|
41
|
+
// ```
|
|
42
|
+
// NOTES:
|
|
43
|
+
// - Enum with available route labels would be extracted from this definition.
|
|
44
|
+
// - If there is only 1 crawler defined, all routes use that. If there is more crawlers,
|
|
45
|
+
// they should define which crawler it relates to.
|
|
46
|
+
// ```
|
|
47
|
+
// crawler mainCrawler {
|
|
48
|
+
// type: playwright
|
|
49
|
+
// datasetId: '45678'
|
|
50
|
+
// errorDatasetId: '098765'
|
|
51
|
+
// options: {
|
|
52
|
+
// ...
|
|
53
|
+
// }
|
|
54
|
+
// ...
|
|
55
|
+
// }
|
|
56
|
+
// crawler productDetailsCrawler {
|
|
57
|
+
// type: cheerio
|
|
58
|
+
// datasetId: '45678'
|
|
59
|
+
// requestQueueId: 'abcdef'
|
|
60
|
+
// options: {
|
|
61
|
+
// ...
|
|
62
|
+
// }
|
|
63
|
+
// }
|
|
64
|
+
// requestQueue extraQueue {
|
|
65
|
+
// ...
|
|
66
|
+
// }
|
|
67
|
+
// dataset mainDataset {
|
|
68
|
+
// datasetId: '45678'
|
|
69
|
+
// ...
|
|
70
|
+
// }
|
|
71
|
+
// keyValueStore mainStore {
|
|
72
|
+
// datasetId: 'xyz'
|
|
73
|
+
// ...
|
|
74
|
+
// }
|
|
75
|
+
// route detailPage {
|
|
76
|
+
// crawler: 'productDetailsCrawler',
|
|
77
|
+
// // NOTE: If `match` is a regex, the regex is compared against the URL
|
|
78
|
+
// match: /[\W]profesia\.sk\/praca\/zoznam-[a-z0-9-]+\/?(?:[?#~]|$)/i
|
|
79
|
+
// handler: detailPageHandler,
|
|
80
|
+
// }
|
|
81
|
+
// route listing {
|
|
82
|
+
// crawler: 'mainCrawler',
|
|
83
|
+
// // Note: route object name is the 'label' by default, but label can be overriden
|
|
84
|
+
// label: 'DETAIL_PAGE'
|
|
85
|
+
// // NOTE: Otherwise `match` is a function that returns true/false
|
|
86
|
+
// match: async (url, ctx, route, handlers) => {
|
|
87
|
+
// const dom = cheerioPortadom(ctx.$.root(), url);
|
|
88
|
+
// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;
|
|
89
|
+
// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;
|
|
90
|
+
// },
|
|
91
|
+
// }
|
|
92
|
+
// ```
|
|
93
|
+
//# sourceMappingURL=composer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"composer.js","sourceRoot":"","sources":["../../src/composer.ts"],"names":[],"mappings":";AAAA,cAAc;AACd,iCAAiC;AAEjC,IAAI;AAEJ,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,qBAAqB;AACrB,0EAA0E;AAC1E,uEAAuE;AACvE,+BAA+B;AAC/B,IAAI;AACJ,MAAM;AAEN,SAAS;AACT,QAAQ;AACR,kDAAkD;AAElD,uBAAuB;AACvB,gBAAgB;AAChB,oBAAoB;AACpB,yBAAyB;AACzB,2BAA2B;AAC3B,iCAAiC;AACjC,mBAAmB;AACnB,cAAc;AACd,WAAW;AACX,QAAQ;AACR,OAAO;AACP,cAAc;AACd,oBAAoB;AACpB,4EAA4E;AAC5E,oCAAoC;AACpC,QAAQ;AACR,MAAM;AACN,KAAK;AACL,MAAM;AAEN,SAAS;AACT,8EAA8E;AAC9E,wFAAwF;AACxF,oDAAoD;AAEpD,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,QAAQ;AACR,IAAI;AAEJ,kCAAkC;AAClC,kBAAkB;AAClB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,4BAA4B;AAC5B,QAAQ;AACR,IAAI;AAEJ,wBAAwB;AACxB,uBAAuB;AACvB,QAAQ;AACR,IAAI;AAEJ,4BAA4B;AAC5B,qBAAqB;AACrB,QAAQ;AACR,IAAI;AAEJ,qBAAqB;AACrB,sCAAsC;AACtC,0EAA0E;AAC1E,uEAAuE;AACvE,gCAAgC;AAChC,IAAI;AAEJ,kBAAkB;AAClB,4BAA4B;AAC5B,qFAAqF;AACrF,yBAAyB;AACzB,qEAAqE;AACrE,kDAAkD;AAClD,sDAAsD;AACtD,+FAA+F;AAC/F,gEAAgE;AAChE,OAAO;AACP,IAAI;AACJ,MAAM","sourcesContent":["// @ts-nocheck\n// interface ComposerCrawlerDef {\n\n// }\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// }\n\n// route detailPage {\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i\n// handler: detailPageHandler\n// }\n// ```\n\n// As JS:\n// ```js\n// import { detailPageHandler } from './handlers';\n\n// const scraperDef = {\n// crawlers: {\n// mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// },\n// }\n// },\n// routes: {\n// detailPage: {\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i,\n// handler: detailPageHandler,\n// }\n// }\n// };\n// ```\n\n// NOTES:\n// - Enum with available route labels would be extracted from this definition.\n// - If there is only 1 crawler defined, all routes use that. If there is more crawlers,\n// they should define which crawler it relates to.\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// ...\n// }\n\n// crawler productDetailsCrawler {\n// type: cheerio\n// datasetId: '45678'\n// requestQueueId: 'abcdef'\n// options: {\n// ...\n// }\n// }\n\n// requestQueue extraQueue {\n// ...\n// }\n\n// dataset mainDataset {\n// datasetId: '45678'\n// ...\n// }\n\n// keyValueStore mainStore {\n// datasetId: 'xyz'\n// ...\n// }\n\n// route detailPage {\n// crawler: 'productDetailsCrawler',\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i\n// handler: detailPageHandler,\n// }\n\n// route listing {\n// crawler: 'mainCrawler',\n// // Note: route object name is the 'label' by default, but label can be overriden\n// label: 'DETAIL_PAGE'\n// // NOTE: Otherwise `match` is a function that returns true/false\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// }\n// ```\n"]}
|
package/dist/cjs/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export
|
|
1
|
+
export * from './lib/actor/actor';
|
|
2
2
|
export * from './lib/actor/types';
|
|
3
3
|
export * from './lib/actorSpec';
|
|
4
4
|
export * from './lib/config';
|
|
@@ -11,7 +11,6 @@ export * from './lib/actions/domUtils';
|
|
|
11
11
|
export * from './lib/actions/page';
|
|
12
12
|
export * from './lib/actions/scrapeListing';
|
|
13
13
|
export * from './lib/error/errorHandler';
|
|
14
|
-
export * from './lib/error/sentry';
|
|
15
14
|
export * from './lib/migrate/localMigrator';
|
|
16
15
|
export * from './lib/migrate/localState';
|
|
17
16
|
export * from './lib/migrate/types';
|
|
@@ -25,3 +24,5 @@ export * from './lib/test/mockApifyClient';
|
|
|
25
24
|
export type { CrawlerUrl, CrawlerType } from './types';
|
|
26
25
|
export * from './lib/integrations/apify';
|
|
27
26
|
export * from './lib/integrations/types';
|
|
27
|
+
export * from './lib/telemetry/types';
|
|
28
|
+
export * from './lib/telemetry/sentry';
|
package/dist/cjs/index.js
CHANGED
|
@@ -14,10 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
15
|
};
|
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
-
|
|
18
|
-
var actor_1 = require("./lib/actor/actor");
|
|
19
|
-
Object.defineProperty(exports, "createAndRunCrawleeOne", { enumerable: true, get: function () { return actor_1.createAndRunCrawleeOne; } });
|
|
20
|
-
Object.defineProperty(exports, "createHttpCrawlerOptions", { enumerable: true, get: function () { return actor_1.createHttpCrawlerOptions; } });
|
|
17
|
+
__exportStar(require("./lib/actor/actor"), exports);
|
|
21
18
|
__exportStar(require("./lib/actor/types"), exports);
|
|
22
19
|
__exportStar(require("./lib/actorSpec"), exports);
|
|
23
20
|
__exportStar(require("./lib/config"), exports);
|
|
@@ -30,7 +27,6 @@ __exportStar(require("./lib/actions/domUtils"), exports);
|
|
|
30
27
|
__exportStar(require("./lib/actions/page"), exports);
|
|
31
28
|
__exportStar(require("./lib/actions/scrapeListing"), exports);
|
|
32
29
|
__exportStar(require("./lib/error/errorHandler"), exports);
|
|
33
|
-
__exportStar(require("./lib/error/sentry"), exports);
|
|
34
30
|
__exportStar(require("./lib/migrate/localMigrator"), exports);
|
|
35
31
|
__exportStar(require("./lib/migrate/localState"), exports);
|
|
36
32
|
__exportStar(require("./lib/migrate/types"), exports);
|
|
@@ -43,4 +39,6 @@ __exportStar(require("./lib/test/actor"), exports);
|
|
|
43
39
|
__exportStar(require("./lib/test/mockApifyClient"), exports);
|
|
44
40
|
__exportStar(require("./lib/integrations/apify"), exports);
|
|
45
41
|
__exportStar(require("./lib/integrations/types"), exports);
|
|
42
|
+
__exportStar(require("./lib/telemetry/types"), exports);
|
|
43
|
+
__exportStar(require("./lib/telemetry/sentry"), exports);
|
|
46
44
|
//# sourceMappingURL=index.js.map
|
package/dist/cjs/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,oDAAkC;AAClC,oDAAkC;AAClC,kDAAgC;AAChC,+CAA6B;AAC7B,mDAAiC;AACjC,wDAAsC;AACtC,oDAAkC;AAClC,wDAAsC;AACtC,oDAAkC;AAClC,yDAAuC;AACvC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,8DAA4C;AAC5C,2DAAyC;AACzC,sDAAoC;AACpC,4DAA0C;AAC1C,2DAAyC;AACzC,sDAAoC;AACpC,qDAAmC;AACnC,4CAA0B;AAC1B,mDAAiC;AACjC,6DAA2C;AAE3C,2DAAyC;AACzC,2DAAyC;AACzC,wDAAsC;AACtC,yDAAuC","sourcesContent":["export * from './lib/actor/actor';\nexport * from './lib/actor/types';\nexport * from './lib/actorSpec';\nexport * from './lib/config';\nexport * from './lib/io/dataset';\nexport * from './lib/io/requestQueue';\nexport * from './lib/io/pushData';\nexport * from './lib/io/pushRequests';\nexport * from './lib/actions/dom';\nexport * from './lib/actions/domUtils';\nexport * from './lib/actions/page';\nexport * from './lib/actions/scrapeListing';\nexport * from './lib/error/errorHandler';\nexport * from './lib/migrate/localMigrator';\nexport * from './lib/migrate/localState';\nexport * from './lib/migrate/types';\nexport * from './lib/readme/apify/readme';\nexport * from './lib/readme/apify/types';\nexport * from './lib/router/router';\nexport * from './lib/router/types';\nexport * from './lib/log';\nexport * from './lib/test/actor';\nexport * from './lib/test/mockApifyClient';\nexport type { CrawlerUrl, CrawlerType } from './types';\nexport * from './lib/integrations/apify';\nexport * from './lib/integrations/types';\nexport * from './lib/telemetry/types';\nexport * from './lib/telemetry/sentry';\n"]}
|
|
@@ -1,53 +1,48 @@
|
|
|
1
1
|
/// <reference types="lodash" />
|
|
2
2
|
import { BasicCrawler, CrawlingContext, BasicCrawlerOptions } from 'crawlee';
|
|
3
|
-
import * as Sentry from '@sentry/node';
|
|
4
3
|
import type { CrawlerMeta, CrawlerType } from '../../types';
|
|
5
4
|
import type { MaybePromise, PickPartial } from '../../utils/types';
|
|
6
5
|
import type { CrawleeOneIO } from '../integrations/types';
|
|
7
|
-
import type {
|
|
6
|
+
import type { CrawleeOneTelemetry } from '../telemetry/types';
|
|
7
|
+
import type { CrawleeOneActorCtx, CrawleeOneActorDef } from './types';
|
|
8
8
|
/**
|
|
9
|
-
*
|
|
10
|
-
*
|
|
9
|
+
* Options available when creating default configuration for an opinionated Crawlee actor,
|
|
10
|
+
* which is then run within Apify's `Actor.main()` context.
|
|
11
11
|
*
|
|
12
12
|
* Apify context can be replaced with custom implementation using the `actorConfig.io` option.
|
|
13
13
|
*
|
|
14
14
|
* Read more about what this actor does at {@link createCrawleeOne}.
|
|
15
15
|
*/
|
|
16
|
-
export
|
|
16
|
+
export interface RunCrawleeOneOptions<TCrawlerType extends CrawlerType, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO = CrawleeOneIO, Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>, Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>> {
|
|
17
17
|
/** String idetifying the actor class, e.g. `'cheerio'` */
|
|
18
18
|
actorType: TCrawlerType;
|
|
19
19
|
actorName: string;
|
|
20
20
|
/** Config passed to the {@link createCrawleeOne} */
|
|
21
|
-
actorConfig: PickPartial<
|
|
21
|
+
actorConfig: PickPartial<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'router' | 'createCrawler' | 'io' | 'telemetry'>;
|
|
22
22
|
/**
|
|
23
23
|
* If using default `createCrawler` implementation, these are crawler options
|
|
24
24
|
* that may be overriden by user input.
|
|
25
25
|
*/
|
|
26
|
-
crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>[
|
|
26
|
+
crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];
|
|
27
27
|
/**
|
|
28
28
|
* If using default `createCrawler` implementation, these are crawler options
|
|
29
29
|
* that will override user input.
|
|
30
30
|
*
|
|
31
31
|
* This is useful for testing env.
|
|
32
32
|
*/
|
|
33
|
-
crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>[
|
|
34
|
-
/**
|
|
35
|
-
* Sentry configuration. If using default `createCrawler` implementation,
|
|
36
|
-
* failed requests are optionally reported to Sentry.
|
|
37
|
-
*
|
|
38
|
-
* To disable Sentry, set `"enabled": false`.
|
|
39
|
-
*/
|
|
40
|
-
sentryOptions?: Sentry.NodeOptions | undefined;
|
|
33
|
+
crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];
|
|
41
34
|
/**
|
|
42
35
|
* Callback with the created actor. The callback is called within
|
|
43
36
|
* the `Actor.main()` context.
|
|
44
37
|
*/
|
|
45
|
-
onActorReady?: (
|
|
46
|
-
}
|
|
38
|
+
onActorReady?: (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;
|
|
39
|
+
}
|
|
47
40
|
/**
|
|
48
|
-
* Create opinionated Crawlee crawler that uses
|
|
41
|
+
* Create opinionated Crawlee crawler that uses, and run it within Apify's `Actor.main()` context.
|
|
42
|
+
*
|
|
43
|
+
* Apify context can be replaced with custom implementation using the `actorConfig.io` option.
|
|
49
44
|
*
|
|
50
|
-
* This
|
|
45
|
+
* This function does the following for you:
|
|
51
46
|
*
|
|
52
47
|
* 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.
|
|
53
48
|
*
|
|
@@ -72,7 +67,7 @@ export declare const createAndRunCrawleeOne: <TCrawlerType extends CrawlerType,
|
|
|
72
67
|
* 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom
|
|
73
68
|
* implementation using the `io` option.
|
|
74
69
|
*/
|
|
75
|
-
export declare const
|
|
70
|
+
export declare const runCrawleeOne: <TType extends CrawlerType, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO<object, object, object> = CrawleeOneIO<object, object, object>, Telem extends CrawleeOneTelemetry<any, any, CrawlingContext<BasicCrawler<import("crawlee").BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>> = CrawleeOneTelemetry<any, any, CrawlingContext<BasicCrawler<import("crawlee").BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>>, Ctx extends CrawlerMeta<TType, any>["context"] = CrawlerMeta<TType, any>["context"]>(args: RunCrawleeOneOptions<TType, Labels, Input, TIO, Telem, Ctx>) => Promise<void>;
|
|
76
71
|
/** Given the actor input, create common crawler options. */
|
|
77
72
|
export declare const createHttpCrawlerOptions: <TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions<import("crawlee").BasicCrawlingContext<import("crawlee").Dictionary>>, Input extends Record<string, any> = Record<string, any>>({ input, defaults, overrides, }: {
|
|
78
73
|
/** Actor input */
|
|
@@ -9,12 +9,11 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
9
9
|
});
|
|
10
10
|
};
|
|
11
11
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
-
exports.createHttpCrawlerOptions = exports.
|
|
12
|
+
exports.createHttpCrawlerOptions = exports.runCrawleeOne = void 0;
|
|
13
13
|
const crawlee_1 = require("crawlee");
|
|
14
14
|
const lodash_1 = require("lodash");
|
|
15
15
|
const got_scraping_1 = require("got-scraping");
|
|
16
16
|
const errorHandler_1 = require("../error/errorHandler");
|
|
17
|
-
const sentry_1 = require("../error/sentry");
|
|
18
17
|
const pushData_1 = require("../io/pushData");
|
|
19
18
|
const dataset_1 = require("../io/dataset");
|
|
20
19
|
const pushRequests_1 = require("../io/pushRequests");
|
|
@@ -31,7 +30,7 @@ const actorClassByType = {
|
|
|
31
30
|
puppeteer: crawlee_1.PuppeteerCrawler,
|
|
32
31
|
};
|
|
33
32
|
const isRouter = (r) => {
|
|
34
|
-
return !!(r.addHandler && r.addDefaultHandler);
|
|
33
|
+
return !!((r === null || r === void 0 ? void 0 : r.addHandler) && (r === null || r === void 0 ? void 0 : r.addDefaultHandler));
|
|
35
34
|
};
|
|
36
35
|
const isFunc = (f) => {
|
|
37
36
|
return typeof f === 'function';
|
|
@@ -53,27 +52,48 @@ const genHookFn = (actor, fnStr) => {
|
|
|
53
52
|
return (...args) => __awaiter(void 0, void 0, void 0, function* () { return hookFn(...args, hookCtx); });
|
|
54
53
|
};
|
|
55
54
|
/**
|
|
56
|
-
* Create
|
|
57
|
-
* and run the actor within Apify's `Actor.main()` context.
|
|
55
|
+
* Create opinionated Crawlee crawler that uses, and run it within Apify's `Actor.main()` context.
|
|
58
56
|
*
|
|
59
57
|
* Apify context can be replaced with custom implementation using the `actorConfig.io` option.
|
|
60
58
|
*
|
|
61
|
-
*
|
|
59
|
+
* This function does the following for you:
|
|
60
|
+
*
|
|
61
|
+
* 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.
|
|
62
|
+
*
|
|
63
|
+
* 2) Get Actor input from `Actor.getInput` if not given.
|
|
64
|
+
*
|
|
65
|
+
* 3) (Optional) Validate Actor input
|
|
66
|
+
*
|
|
67
|
+
* 4) Set up router such that requests that reach default route are
|
|
68
|
+
* redirected to labelled routes based on which item from "routes" they match.
|
|
69
|
+
*
|
|
70
|
+
* 5) Register all route handlers for you.
|
|
71
|
+
*
|
|
72
|
+
* 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.
|
|
73
|
+
* if you want to add a field to the context object, or handle errors
|
|
74
|
+
* from a single place.
|
|
75
|
+
*
|
|
76
|
+
* 7) (Optional) Support transformation and filtering of (scraped) entries,
|
|
77
|
+
* configured via Actor input.
|
|
78
|
+
*
|
|
79
|
+
* 8) (Optional) Support Actor metamorphing, configured via Actor input.
|
|
80
|
+
*
|
|
81
|
+
* 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom
|
|
82
|
+
* implementation using the `io` option.
|
|
62
83
|
*/
|
|
63
|
-
const
|
|
64
|
-
const { actorType, actorName, actorConfig, crawlerConfigDefaults, crawlerConfigOverrides,
|
|
65
|
-
const { io = apify_1.apifyIO } = actorConfig;
|
|
66
|
-
yield (0, sentry_1.setupSentry)(Object.assign(Object.assign({}, sentryOptions), { serverName: actorName }), { io });
|
|
84
|
+
const runCrawleeOne = (args) => __awaiter(void 0, void 0, void 0, function* () {
|
|
85
|
+
const { actorType, actorName, actorConfig, crawlerConfigDefaults, crawlerConfigOverrides, onActorReady, } = args;
|
|
86
|
+
const { io = apify_1.apifyIO, telemetry } = actorConfig;
|
|
67
87
|
// See docs:
|
|
68
88
|
// - https://docs.apify.com/sdk/js/
|
|
69
89
|
// - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk
|
|
70
90
|
// - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk
|
|
71
91
|
yield io.runInContext(() => __awaiter(void 0, void 0, void 0, function* () {
|
|
72
92
|
var _a, _b, _c;
|
|
93
|
+
yield (telemetry === null || telemetry === void 0 ? void 0 : telemetry.setup({ actorType, actorName, actorConfig: Object.assign(Object.assign({}, actorConfig), { io }) }));
|
|
73
94
|
const actorDefaults = {
|
|
74
|
-
io,
|
|
75
95
|
router: crawlee_1.Router.create(),
|
|
76
|
-
|
|
96
|
+
routeHandlerWrappers: ({ input }) => {
|
|
77
97
|
var _a;
|
|
78
98
|
return [
|
|
79
99
|
(0, log_1.logLevelHandlerWrapper)((_a = input === null || input === void 0 ? void 0 : input.logLevel) !== null && _a !== void 0 ? _a : 'info'),
|
|
@@ -85,25 +105,27 @@ const createAndRunCrawleeOne = (args) => __awaiter(void 0, void 0, void 0, funct
|
|
|
85
105
|
input,
|
|
86
106
|
defaults: crawlerConfigDefaults,
|
|
87
107
|
overrides: Object.assign({ requestHandler: router, proxyConfiguration: proxy,
|
|
88
|
-
// Capture errors in a separate (Apify) Dataset and pass errors to
|
|
108
|
+
// Capture errors in a separate (Apify) Dataset and pass errors to telemetry
|
|
89
109
|
failedRequestHandler: (0, errorHandler_1.createErrorHandler)({
|
|
90
110
|
io,
|
|
91
111
|
reportingDatasetId: (_a = input === null || input === void 0 ? void 0 : input.errorReportingDatasetId) !== null && _a !== void 0 ? _a : 'REPORTING',
|
|
92
|
-
|
|
112
|
+
sendToTelemetry: (_b = input === null || input === void 0 ? void 0 : input.errorSendToTelemetry) !== null && _b !== void 0 ? _b : true,
|
|
113
|
+
onSendErrorToTelemetry: telemetry === null || telemetry === void 0 ? void 0 : telemetry.onSendErrorToTelemetry,
|
|
93
114
|
}) }, crawlerConfigOverrides),
|
|
94
115
|
});
|
|
95
116
|
const CrawlerClass = actorClassByType[actorType];
|
|
96
117
|
return new CrawlerClass(options);
|
|
97
118
|
},
|
|
98
|
-
routes: [],
|
|
99
|
-
routeHandlers: {},
|
|
100
119
|
};
|
|
101
|
-
const actor = yield
|
|
120
|
+
const actor = yield createCrawleeOne(Object.assign(Object.assign({}, actorConfig), { io, router: (_a = actorConfig.router) !== null && _a !== void 0 ? _a : actorDefaults.router, routeHandlerWrappers: (_b = actorConfig.routeHandlerWrappers) !== null && _b !== void 0 ? _b : actorDefaults.routeHandlerWrappers, createCrawler: (_c = actorConfig.createCrawler) !== null && _c !== void 0 ? _c : actorDefaults.createCrawler }));
|
|
102
121
|
yield (onActorReady === null || onActorReady === void 0 ? void 0 : onActorReady(actor));
|
|
103
122
|
}), { statusMessage: 'Crawling finished!' });
|
|
104
123
|
});
|
|
105
|
-
exports.
|
|
124
|
+
exports.runCrawleeOne = runCrawleeOne;
|
|
106
125
|
/**
|
|
126
|
+
* NOTE: If you want to run a scraper, see {@link runCrawleeOne}. This is lower-level
|
|
127
|
+
* function that should be used only if you want to override the default behaviour of runCrawleeOne.
|
|
128
|
+
*
|
|
107
129
|
* Create opinionated Crawlee crawler that uses router for handling requests.
|
|
108
130
|
*
|
|
109
131
|
* This is a quality-of-life function that does the following for you:
|
|
@@ -132,7 +154,7 @@ exports.createAndRunCrawleeOne = createAndRunCrawleeOne;
|
|
|
132
154
|
* implementation using the `io` option.
|
|
133
155
|
*/
|
|
134
156
|
const createCrawleeOne = (config) => __awaiter(void 0, void 0, void 0, function* () {
|
|
135
|
-
const { io = apify_1.apifyIO } = config;
|
|
157
|
+
const { io = apify_1.apifyIO, telemetry } = config;
|
|
136
158
|
// Mutable state that is available to the actor hooks
|
|
137
159
|
const state = {};
|
|
138
160
|
// Initialize actor inputs
|
|
@@ -161,10 +183,11 @@ const createCrawleeOne = (config) => __awaiter(void 0, void 0, void 0, function*
|
|
|
161
183
|
: yield config.router(getConfig());
|
|
162
184
|
const routes = isFunc(config.routes) ? yield config.routes(getConfig()) : config.routes; // prettier-ignore
|
|
163
185
|
const routeHandlers = isFunc(config.routeHandlers) ? yield config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore
|
|
164
|
-
const
|
|
186
|
+
const routeHandlerWrappers = isFunc(config.routeHandlerWrappers) ? yield config.routeHandlerWrappers(getConfig()) : config.routeHandlerWrappers; // prettier-ignore
|
|
165
187
|
// Create Crawlee crawler
|
|
166
188
|
const getActorCtx = () => ({
|
|
167
189
|
io,
|
|
190
|
+
telemetry,
|
|
168
191
|
router,
|
|
169
192
|
routes,
|
|
170
193
|
routeHandlers,
|
|
@@ -188,26 +211,21 @@ const createCrawleeOne = (config) => __awaiter(void 0, void 0, void 0, function*
|
|
|
188
211
|
// Extra data that we make available to the route handlers
|
|
189
212
|
const routerContext = { actor, pushData: scopedPushData };
|
|
190
213
|
// Set up router
|
|
191
|
-
yield (0, router_1.
|
|
214
|
+
yield (0, router_1.setupDefaultHandlers)({
|
|
192
215
|
io,
|
|
193
216
|
router,
|
|
194
|
-
|
|
217
|
+
routeHandlerWrappers,
|
|
195
218
|
routerContext,
|
|
196
219
|
routes,
|
|
197
220
|
routeHandlers,
|
|
198
221
|
input,
|
|
199
222
|
});
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
routerWrappers,
|
|
203
|
-
routerContext,
|
|
204
|
-
routeHandlers,
|
|
205
|
-
});
|
|
223
|
+
// Register labelled handlers
|
|
224
|
+
yield (0, router_1.registerHandlers)(router, routeHandlers, { routerContext, handlerWrappers: routeHandlerWrappers });
|
|
206
225
|
// Now that the actor is ready, enqueue the URLs right away
|
|
207
226
|
yield scopedPushRequest(startUrls);
|
|
208
227
|
return actor;
|
|
209
228
|
});
|
|
210
|
-
exports.createCrawleeOne = createCrawleeOne;
|
|
211
229
|
const resolveInput = (input, state, options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
212
230
|
var _d;
|
|
213
231
|
const { io = apify_1.apifyIO } = options !== null && options !== void 0 ? options : {};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAaiB;AACjB,mCAAgD;AAEhD,+CAA2C;AAI3C,wDAA2D;AAC3D,4CAA8C;AAC9C,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,6CAAuE;AACvE,sCAWmB;AACnB,gCAAmE;AAUnE,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAE,CAAmB,CAAC,UAAU,IAAK,CAAmB,CAAC,iBAAiB,CAAC,CAAC;AACvF,CAAC,CAAC;AACF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAMhB,KAA4E,EAC5E,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACO,CAAC;IAElC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AAEF;;;;;;;GAOG;AACI,MAAM,sBAAsB,GAAG,CAMpC,IAiCD,EAAiB,EAAE;IAClB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,aAAa,EACb,YAAY,GACb,GAAG,IAAI,CAAC;IAET,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,WAAW,CAAC;IAEnD,MAAM,IAAA,oBAAW,kCAAM,aAAa,KAAE,UAAU,EAAE,SAAS,KAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAEvE,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,aAAa,GAA8D;YAC/E,EAAE;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,cAAc,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBAC7B,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAGtC;oBACA,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,yEAAyE;wBACzE,oBAAoB,EAAE,IAAA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,YAAY,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,iBAAiB,mCAAI,IAAI;yBAC/C,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;YACD,MAAM,EAAE,EAAE;YACV,aAAa,EAAE,EAAS;SACzB,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,IAAA,wBAAgB,kCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,cAAc,EAAE,MAAA,WAAW,CAAC,cAAc,mCAAK,aAAa,CAAC,cAAsB,EACnF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxGW,QAAA,sBAAsB,0BAwGjC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACI,MAAM,gBAAgB,GAAG,CAM9B,MAAmE,EACnB,EAAE;IAClD,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,MAAM,CAAC;IAE9C,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAsB,CAAC;IACxD,MAAM,GAAG,GAAG,IAAI,aAAG,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,uBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnF,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,GAAG,IAAG,CAAC;IAE/D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,kBAAkB;IAE3I,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;QACL,GAAG;KACJ,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACsC,CAAC;IAElD,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,0BAAiB,EAAkE;QACvF,EAAE;QACF,MAAM;QACN,cAAc;QACd,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IACH,MAAM,IAAA,yBAAgB,EAA2D;QAC/E,MAAM;QACN,cAAc;QACd,aAAa;QACb,aAAa;KACd,CAAC,CAAC;IAEH,2DAA2D;IAC3D,MAAM,iBAAiB,CAAC,SAA6B,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAvGW,QAAA,gBAAgB,oBAuG3B;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAAI,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAM7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAAyC,EAAE,EAAE;IAC1E,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CAAC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAAC,KAA2D,EAAE,EAAE;;IAC3F,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAA6B,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QAC/E,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAAC,KAA2D,EAAE,EAAE;;IAC/F,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAiC,CAAO,OAAO,EAAE,OAAO,EAAE,EAAE;QACjF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAC9C,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGtC,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAC5B,KAA2D,EAC3D,EAAE;;IACF,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,mCAAmC,oBAAoB,EAAE,CAAC,CAAC;QAC3E,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,IAAI,UAAU;YAAE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7C;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n Log,\n Request as CrawleeRequest,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport * as Sentry from '@sentry/node';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { setupSentry } from '../error/sentry';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultRoute } from '../router/router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n LoggingActorInput,\n} from '../config';\nimport { logLevelHandlerWrapper, logLevelToCrawlee } from '../log';\nimport type {\n ActorContext,\n ActorDefinition,\n ActorHookContext,\n ActorRouterContext,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler).addHandler && (r as RouterHandler).addDefaultHandler);\n};\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<ActorContext<Ctx, Labels, Input, TIO>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies ActorHookContext<TIO>;\n\n const hookFn = eval(fnStr);\n if (!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Create default configuration for an opinionated Crawlee actor,\n * and run the actor within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport const createAndRunCrawleeOne = async <\n TCrawlerType extends CrawlerType,\n Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(args: {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n ActorDefinition<Ctx, Labels, Input, TIO>,\n 'router' | 'createCrawler' | 'io'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Sentry configuration. If using default `createCrawler` implementation,\n * failed requests are optionally reported to Sentry.\n *\n * To disable Sentry, set `\"enabled\": false`.\n */\n sentryOptions?: Sentry.NodeOptions;\n /**\n * Callback with the created actor. The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: ActorContext<Ctx, Labels, Input, TIO>) => MaybePromise<void>;\n}): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n sentryOptions,\n onActorReady,\n } = args;\n\n const { io = apifyIO as any as TIO } = actorConfig;\n\n await setupSentry({ ...sentryOptions, serverName: actorName }, { io });\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n const actorDefaults: ActorDefinition<Ctx, Labels, Input & AllActorInputs, TIO> = {\n io,\n router: Router.create<Ctx>(),\n routerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<\n CrawlerMeta<TCrawlerType, any>['options'],\n Input\n >({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to Sentry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToSentry: input?.errorSendToSentry ?? true,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n routes: [],\n routeHandlers: {} as any,\n };\n\n const actor = await createCrawleeOne<Ctx, Labels, Input, TIO>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routerWrappers: actorConfig.routerWrappers ?? (actorDefaults.routerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? (actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const createCrawleeOne = async <\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>\n): Promise<ActorContext<Ctx, Labels, Input, TIO>> => {\n const { io = apifyIO as any as TIO } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n const { logLevel } = (input ?? {}) as LoggingActorInput;\n const log = new Log({ level: logLevel ? logLevelToCrawlee[logLevel] : undefined });\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io, log });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routerWrappers = isFunc(config.routerWrappers) ? await config.routerWrappers(getConfig()) : config.routerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n log,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies ActorContext<Ctx, Labels, Input, TIO>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultRoute<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels, Input>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n await registerHandlers<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n });\n\n // Now that the actor is ready, enqueue the URLs right away\n await scopedPushRequest(startUrls as CrawleeRequest[]);\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<ActorContext, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? {}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: ActorContext['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: ActorContext['pushRequests'] = async (entries, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (\n actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n actor.log.debug(`Loading start URLs from Dataset ${startUrlsFromDataset}`);\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n actor.log.debug(`Loading start URLs from function`);\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n if (urlsFromFn) urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
|
|
1
|
+
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAaiB;AACjB,mCAAgD;AAChD,+CAA2C;AAI3C,wDAA2D;AAC3D,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,6CAA0E;AAC1E,sCAWmB;AACnB,gCAAmE;AAWnE,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAC,CAAC,CAAmB,aAAnB,CAAC,uBAAD,CAAC,CAAoB,UAAU,MAAK,CAAmB,aAAnB,CAAC,uBAAD,CAAC,CAAoB,iBAAiB,CAAA,CAAC,CAAC;AACzF,CAAC,CAAC;AAEF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAIhB,KAAoF,EACpF,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACe,CAAC;IAE1C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AA6CF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACI,MAAM,aAAa,GAAG,CAQ3B,IAAiE,EAClD,EAAE;IACjB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,YAAY,GACb,GAAG,IAAI,CAAC;IAET,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,SAAS,EAAE,GAAG,WAAW,CAAC;IAE9D,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,CAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,KAAK,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,WAAW,kCAAO,WAAW,KAAE,EAAE,GAAE,EAAE,CAAC,CAAA,CAAC;QAEtF,MAAM,aAAa,GAGf;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,oBAAoB,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBACnC,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAA4C;oBAClF,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,4EAA4E;wBAC5E,oBAAoB,EAAE,IAAA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,eAAe,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,oBAAoB,mCAAI,IAAI;4BACpD,sBAAsB,EAAE,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,sBAAsB;yBAC1D,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;SACF,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,gBAAgB,iCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,oBAAoB,EAClB,MAAA,WAAW,CAAC,oBAAoB,mCAAK,aAAa,CAAC,oBAA4B,EACjF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxEW,QAAA,aAAa,iBAwExB;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,MAAM,gBAAgB,GAAG,CAOvB,MAA6E,EAChB,EAAE;IAC/D,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,SAAS,EAAE,GAAG,MAAM,CAAC;IAEzD,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAsB,CAAC;IACxD,MAAM,GAAG,GAAG,IAAI,aAAG,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,uBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnF,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,GAAG,IAAG,CAAC;IAE/D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,oBAAoB,GAAG,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,oBAAoB,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,kBAAkB;IAEnK,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,SAAS;QACT,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;QACL,GAAG;KACJ,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACmD,CAAC;IAE/D,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,6BAAoB,EAKxB;QACA,EAAE;QACF,MAAM;QACN,oBAAoB;QACpB,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IAEH,6BAA6B;IAC7B,MAAM,IAAA,yBAAgB,EACpB,MAAM,EACN,aAAa,EACb,EAAE,aAAa,EAAE,eAAe,EAAE,oBAAoB,EAAE,CACzD,CAAC;IAEF,2DAA2D;IAC3D,MAAM,iBAAiB,CAAC,SAA6B,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAAI,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAO7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAA+C,EAAE,EAAE;IAChF,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CAAC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAC3B,KAAiE,EACjE,EAAE;;IACF,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAAmC,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QACrF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAC/B,KAAiE,EACjE,EAAE;;IACF,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAuC,CAAO,OAAO,EAAE,OAAO,EAAE,EAAE;QACvF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAC9C,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGtC,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAC5B,KAAiE,EACjE,EAAE;;IACF,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,mCAAmC,oBAAoB,EAAE,CAAC,CAAC;QAC3E,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,IAAI,UAAU;YAAE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7C;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n Log,\n Request as CrawleeRequest,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultHandlers } from '../router/router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n LoggingActorInput,\n} from '../config';\nimport { logLevelHandlerWrapper, logLevelToCrawlee } from '../log';\nimport type { CrawleeOneTelemetry } from '../telemetry/types';\nimport type {\n CrawleeOneActorCtx,\n CrawleeOneActorDef,\n CrawleeOneHookCtx,\n CrawleeOneActorRouterCtx,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler)?.addHandler && (r as RouterHandler)?.addDefaultHandler);\n};\n\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<CrawleeOneActorCtx<any, Input, TIO, any, any>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies CrawleeOneHookCtx<Input, TIO>;\n\n const hookFn = eval(fnStr);\n if (!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Options available when creating default configuration for an opinionated Crawlee actor,\n * which is then run within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport interface RunCrawleeOneOptions<\n TCrawlerType extends CrawlerType,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>\n> {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>,\n 'router' | 'createCrawler' | 'io' | 'telemetry'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Callback with the created actor. The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;\n}\n\n/**\n * Create opinionated Crawlee crawler that uses, and run it within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * This function does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const runCrawleeOne = async <\n TType extends CrawlerType,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlerMeta<TType, any>['context'] = CrawlerMeta<TType, any>['context']\n>(\n args: RunCrawleeOneOptions<TType, Labels, Input, TIO, Telem, Ctx>\n): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n onActorReady,\n } = args;\n\n const { io = apifyIO as any as TIO, telemetry } = actorConfig;\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n await telemetry?.setup({ actorType, actorName, actorConfig: { ...actorConfig, io } });\n\n const actorDefaults: Pick<\n CrawleeOneActorDef<Labels, Input & AllActorInputs, TIO, Telem, Ctx>,\n 'router' | 'routeHandlerWrappers' | 'createCrawler'\n > = {\n router: Router.create<Ctx>(),\n routeHandlerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<CrawlerMeta<TType, any>['options'], Input>({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to telemetry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToTelemetry: input?.errorSendToTelemetry ?? true,\n onSendErrorToTelemetry: telemetry?.onSendErrorToTelemetry,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n };\n\n const actor = await createCrawleeOne<Labels, Input, TIO, Telem, Ctx>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routeHandlerWrappers:\n actorConfig.routeHandlerWrappers ?? (actorDefaults.routeHandlerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? (actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * NOTE: If you want to run a scraper, see {@link runCrawleeOne}. This is lower-level\n * function that should be used only if you want to override the default behaviour of runCrawleeOne.\n *\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nconst createCrawleeOne = async <\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>\n>(\n config: PickPartial<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'io'>\n): Promise<CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>> => {\n const { io = apifyIO as any as TIO, telemetry } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n const { logLevel } = (input ?? {}) as LoggingActorInput;\n const log = new Log({ level: logLevel ? logLevelToCrawlee[logLevel] : undefined });\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io, log });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routeHandlerWrappers = isFunc(config.routeHandlerWrappers) ? await config.routeHandlerWrappers(getConfig()) : config.routeHandlerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n telemetry,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n log,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultHandlers<\n Ctx,\n CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>,\n Labels,\n Input\n >({\n io,\n router,\n routeHandlerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n\n // Register labelled handlers\n await registerHandlers<Ctx, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Labels>(\n router,\n routeHandlers,\n { routerContext, handlerWrappers: routeHandlerWrappers }\n );\n\n // Now that the actor is ready, enqueue the URLs right away\n await scopedPushRequest(startUrls as CrawleeRequest[]);\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>\n>(\n actor: Omit<\n CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<CrawleeOneActorCtx, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? {}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: CrawleeOneActorCtx['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: CrawleeOneActorCtx['pushRequests'] = async (entries, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n actor.log.debug(`Loading start URLs from Dataset ${startUrlsFromDataset}`);\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n actor.log.debug(`Loading start URLs from function`);\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n if (urlsFromFn) urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
|