crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
@@ -0,0 +1,306 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.createHttpCrawlerOptions = exports.createCrawleeOne = exports.createAndRunCrawleeOne = void 0;
13
+ const crawlee_1 = require("crawlee");
14
+ const lodash_1 = require("lodash");
15
+ const got_scraping_1 = require("got-scraping");
16
+ const errorHandler_1 = require("../error/errorHandler");
17
+ const sentry_1 = require("../error/sentry");
18
+ const pushData_1 = require("../io/pushData");
19
+ const dataset_1 = require("../io/dataset");
20
+ const pushRequests_1 = require("../io/pushRequests");
21
+ const apify_1 = require("../integrations/apify");
22
+ const router_1 = require("../router");
23
+ const config_1 = require("../config");
24
+ const log_1 = require("../log");
25
/**
 * Lookup table from the `actorType` option to the Crawlee crawler class.
 *
 * Consulted by the default `createCrawler` implementation inside
 * `createAndRunCrawleeOne` to decide which crawler to instantiate.
 */
const actorClassByType = {
    'basic': crawlee_1.BasicCrawler,
    'http': crawlee_1.HttpCrawler,
    'cheerio': crawlee_1.CheerioCrawler,
    'jsdom': crawlee_1.JSDOMCrawler,
    'playwright': crawlee_1.PlaywrightCrawler,
    'puppeteer': crawlee_1.PuppeteerCrawler,
};
33
/**
 * Duck-typed check for a router instance.
 *
 * A value counts as a router when it exposes both `addHandler` and
 * `addDefaultHandler`. Anything else (including `null` / `undefined`)
 * yields `false` instead of throwing, so the guard is safe to call on
 * options that were never set.
 *
 * @param r Value to inspect.
 * @returns `true` if `r` looks like a router, `false` otherwise.
 */
const isRouter = (r) => {
    // Nullish values have no properties to inspect; a type guard must not throw.
    if (r === null || r === undefined)
        return false;
    return Boolean(r.addHandler && r.addDefaultHandler);
};
36
/** Tell whether the given value is callable (`typeof` reports `'function'`). */
const isFunc = (f) => typeof f === 'function';
39
/**
 * Turn a function that was supplied as source text (via Actor input) into a
 * callable async hook.
 *
 * The returned hook forwards its own arguments to the evaluated function and
 * appends a context object as the LAST argument. That context carries
 * `io`, `input` and `state` taken from `actor`, plus `itemCacheKey` and
 * `sendRequest`.
 *
 * @param actor Object providing `io`, `input` and `state` for the hook context.
 * @param fnStr Source text that must evaluate to a function.
 * @returns An async wrapper around the evaluated function, or `null` when
 *   `fnStr` is empty or evaluates to a falsy value.
 * @throws {TypeError} When `fnStr` evaluates to a truthy value that is not a
 *   function. Failing here gives a clear message instead of a later, opaque
 *   "hookFn is not a function".
 */
const genHookFn = (actor, fnStr) => {
    if (!fnStr)
        return null;
    const hookCtx = {
        io: actor.io,
        input: actor.input,
        state: actor.state,
        itemCacheKey: pushData_1.itemCacheKey,
        sendRequest: got_scraping_1.gotScraping,
    };
    // SECURITY: `fnStr` originates from Actor input and is executed with a
    // direct `eval`, i.e. with full access to this process and to the local
    // scope. Only feed this from input that is trusted to run arbitrary code.
    // NOTE(review): kept as a direct eval on purpose so existing hook strings
    // behave exactly as before - confirm before swapping for a sandbox.
    const hookFn = eval(fnStr);
    if (!hookFn)
        return null;
    if (typeof hookFn !== 'function') {
        throw new TypeError(`Hook source must evaluate to a function, but it evaluated to a value of type "${typeof hookFn}"`);
    }
    return (...args) => __awaiter(void 0, void 0, void 0, function* () { return hookFn(...args, hookCtx); });
};
55
/**
 * Build an opinionated Crawlee actor with default configuration and run it
 * inside `io.runInContext()`.
 *
 * Steps performed:
 * 1. Pick the IO implementation (`actorConfig.io`, falling back to the Apify
 *    integration) and set up Sentry with `actorName` as the server name.
 * 2. Inside `io.runInContext()`, create the actor via {@link createCrawleeOne},
 *    filling in defaults for `router`, `routerWrappers`, `createCrawler`,
 *    `routes` and `routeHandlers` when `actorConfig` leaves them unset.
 * 3. Call `onActorReady(actor)` if it was provided.
 *
 * The default `createCrawler` instantiates the crawler class that matches
 * `actorType`, and routes failed requests to an error handler that reports
 * to a dataset (`'REPORTING'` unless overridden by input) and to Sentry.
 *
 * @param input Options: `actorType`, `actorName`, `actorConfig`,
 *   `crawlerConfigDefaults`, `crawlerConfigOverrides`, `sentryOptions`,
 *   `onActorReady`.
 * @returns Promise that settles once `io.runInContext()` has finished.
 * @throws {TypeError} From the default `createCrawler` when `actorType` does
 *   not name a known crawler class.
 */
const createAndRunCrawleeOne = (input) => __awaiter(void 0, void 0, void 0, function* () {
    const { actorType, actorName, actorConfig, crawlerConfigDefaults, crawlerConfigOverrides, sentryOptions, onActorReady, } = input;
    const { io = apify_1.apifyIO } = actorConfig;
    yield (0, sentry_1.setupSentry)(Object.assign(Object.assign({}, sentryOptions), { serverName: actorName }), { io });
    // Use `fallback` only when `value` is null or undefined.
    const withDefault = (value, fallback) => (value !== null && value !== undefined ? value : fallback);
    // See docs:
    // - https://docs.apify.com/sdk/js/
    // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk
    // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk
    yield io.runInContext(() => __awaiter(void 0, void 0, void 0, function* () {
        const actorDefaults = {
            io,
            router: crawlee_1.Router.create(),
            routerWrappers: ({ input: actorInput }) => {
                const safeInput = withDefault(actorInput, {});
                return [
                    (0, log_1.logLevelHandlerWrapper)(withDefault(safeInput.logLevel, 'info')),
                ];
            },
            createCrawler: ({ router, proxy, input: actorInput }) => {
                // Own-property lookup so names like "constructor" are not
                // mistaken for a crawler type.
                const isKnownType = Object.prototype.hasOwnProperty.call(actorClassByType, actorType);
                if (!isKnownType) {
                    throw new TypeError(`Unknown actorType "${String(actorType)}". Expected one of: ${Object.keys(actorClassByType).join(', ')}`);
                }
                const CrawlerClass = actorClassByType[actorType];
                const safeInput = withDefault(actorInput, {});
                const options = (0, exports.createHttpCrawlerOptions)({
                    input: actorInput,
                    defaults: crawlerConfigDefaults,
                    overrides: Object.assign({
                        requestHandler: router,
                        proxyConfiguration: proxy,
                        // Capture errors in a separate (Apify) Dataset and pass errors to Sentry
                        failedRequestHandler: (0, errorHandler_1.createErrorHandler)({
                            io,
                            reportingDatasetId: withDefault(safeInput.errorReportingDatasetId, 'REPORTING'),
                            sendToSentry: withDefault(safeInput.errorSendToSentry, true),
                        }),
                    }, crawlerConfigOverrides),
                });
                return new CrawlerClass(options);
            },
            routes: [],
            routeHandlers: {},
        };
        const actor = yield (0, exports.createCrawleeOne)(Object.assign({}, actorConfig, {
            io,
            router: withDefault(actorConfig.router, actorDefaults.router),
            routerWrappers: withDefault(actorConfig.routerWrappers, actorDefaults.routerWrappers),
            createCrawler: withDefault(actorConfig.createCrawler, actorDefaults.createCrawler),
            // The defaults below were declared but never applied before.
            routes: withDefault(actorConfig.routes, actorDefaults.routes),
            routeHandlers: withDefault(actorConfig.routeHandlers, actorDefaults.routeHandlers),
        }));
        yield (onActorReady !== null && onActorReady !== undefined ? onActorReady(actor) : undefined);
    }), { statusMessage: 'Crawling finished!' });
});
exports.createAndRunCrawleeOne = createAndRunCrawleeOne;
106
/**
 * Create an opinionated, router-driven Crawlee actor.
 *
 * What happens, in order:
 *
 * 1) The actor input is obtained: from `config.input` (value or factory),
 *    or from `io.getInput()` when `config.input` is not set. It is then
 *    extended via `resolveInput` and frozen.
 *
 * 2) (Optional) `config.validateInput` is called with the input.
 *
 * 3) The proxy is resolved: `config.proxy` (value or factory), or the IO's
 *    default proxy configuration when `config.proxy` is not set.
 *
 * 4) `router`, `routes`, `routeHandlers` and `routerWrappers` are resolved;
 *    each may be given directly or as an initialization function.
 *
 * 5) The crawler is created with `config.createCrawler`.
 *
 * 6) Actor-scoped helpers are created (`runCrawler`, `metamorph`, `pushData`,
 *    `pushRequests`) and start URLs are collected from the input.
 *
 * 7) The default route is set up and all route handlers are registered.
 *
 * The IO layer (Apify by default) can be swapped through the `io` option.
 *
 * @param config Actor configuration.
 * @returns Promise resolving to the actor object.
 */
const createCrawleeOne = (config) => __awaiter(void 0, void 0, void 0, function* () {
    const { io = apify_1.apifyIO } = config;
    // Mutable bag shared with the actor hooks
    const state = {};
    // --- Input ---
    let rawInput;
    if (!config.input) {
        rawInput = yield io.getInput();
    }
    else if (isFunc(config.input)) {
        rawInput = yield config.input(Object.assign({}, config, { io }));
    }
    else {
        rawInput = config.input;
    }
    const resolvedInput = yield resolveInput(rawInput, state, { io });
    const input = Object.freeze(resolvedInput);
    if (config.validateInput) {
        yield config.validateInput(input);
    }
    // Context handed to every option that is given as an initialization function
    const getConfig = () => Object.assign({}, config, { input, state, io });
    // --- Proxy ---
    let proxy;
    if (config.proxy == null) {
        proxy = yield io.createDefaultProxyConfiguration(input !== null && input !== undefined ? input : undefined);
    }
    else if (isFunc(config.proxy)) {
        proxy = yield config.proxy(getConfig());
    }
    else {
        proxy = config.proxy;
    }
    // --- Router and routes ---
    let router;
    if (isRouter(config.router)) {
        router = config.router;
    }
    else {
        router = yield config.router(getConfig());
    }
    let routes = config.routes;
    if (isFunc(config.routes)) {
        routes = yield config.routes(getConfig());
    }
    let routeHandlers = config.routeHandlers;
    if (isFunc(config.routeHandlers)) {
        routeHandlers = yield config.routeHandlers(getConfig());
    }
    let routerWrappers = config.routerWrappers;
    if (isFunc(config.routerWrappers)) {
        routerWrappers = yield config.routerWrappers(getConfig());
    }
    // --- Crawler ---
    const getActorCtx = () => ({ io, router, routes, routeHandlers, proxy, config, input, state });
    const crawler = yield config.createCrawler(getActorCtx());
    // --- Actor (our custom entity) ---
    const preActor = Object.assign({ crawler }, getActorCtx());
    const runCrawler = createScopedCrawlerRun(preActor);
    const metamorph = createScopedMetamorph(preActor);
    const scopedPushData = createScopedPushData(preActor);
    const scopedPushRequest = createScopedPushRequests(preActor);
    const startUrls = yield getStartUrlsFromInput(preActor);
    const actor = Object.assign({}, preActor, {
        crawler,
        runCrawler,
        metamorph,
        pushData: scopedPushData,
        pushRequests: scopedPushRequest,
        startUrls,
    });
    // Extra data exposed to the route handlers
    const routerContext = { actor, pushData: scopedPushData };
    // --- Router wiring ---
    yield (0, router_1.setupDefaultRoute)({ io, router, routerWrappers, routerContext, routes, routeHandlers, input });
    yield (0, router_1.registerHandlers)({ router, routerWrappers, routerContext, routeHandlers });
    return actor;
});
exports.createCrawleeOne = createCrawleeOne;
206
/**
 * Extend the raw actor input with values from two optional sources named in
 * the input itself:
 * - `inputExtendUrl`: JSON fetched from that URL.
 * - `inputExtendFromFunction`: result of a hook function given as a string.
 *
 * Precedence (lowest to highest): URL values, function values, raw input.
 *
 * @param input Raw actor input (may be null/undefined).
 * @param state Mutable actor state, exposed to the hook function.
 * @param options Optional `{ io }`; defaults to the Apify integration.
 * @returns Promise resolving to the merged input object.
 */
const resolveInput = (input, state, options) => __awaiter(void 0, void 0, void 0, function* () {
    const safeOptions = options !== null && options !== undefined ? options : {};
    const { io = apify_1.apifyIO } = safeOptions;
    const safeInput = input !== null && input !== undefined ? input : {};
    const { inputExtendUrl, inputExtendFromFunction } = safeInput;
    let inputFromUrl = null;
    if (inputExtendUrl) {
        inputFromUrl = yield got_scraping_1.gotScraping.get(inputExtendUrl).json();
    }
    const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);
    const produced = yield (inputFn !== null && inputFn !== undefined ? inputFn() : undefined);
    const inputFromFunc = produced !== null && produced !== undefined ? produced : null;
    return Object.assign({}, inputFromUrl, inputFromFunc, input);
});
216
/**
 * Create a function that wraps `crawler.run(requests, runOptions)` with
 * additional behaviour driven by the actor input:
 * - Drops the output cache store first when `outputCacheStoreId` is set and
 *   `outputCacheActionOnResult` is `'overwrite'`.
 * - Runs the "before" hooks, then the crawler, then the "after" hooks.
 * - Triggers metamorph after the run (a no-op unless configured).
 *
 * @param actor Actor (or pre-actor) providing `input`, `io` and `crawler`.
 * @returns Async `(requests, options) => runResult` function.
 */
const createScopedCrawlerRun = (actor) => {
    const actorInput = actor.input !== null && actor.input !== undefined ? actor.input : {};
    const { requestTransformBefore, requestTransformAfter, requestFilterBefore, requestFilterAfter, outputTransformBefore, outputTransformAfter, outputFilterBefore, outputFilterAfter, outputCacheStoreId, outputCacheActionOnResult, } = actorInput;
    // Hook sources, listed in the exact order in which they must run.
    const hooksBeforeRun = [outputTransformBefore, outputFilterBefore, requestTransformBefore, requestFilterBefore];
    const hooksAfterRun = [outputTransformAfter, outputFilterAfter, requestTransformAfter, requestFilterAfter];
    const metamorph = createScopedMetamorph(actor);
    return (requests, options) => __awaiter(void 0, void 0, void 0, function* () {
        // Clear cache if it was set from the input
        const shouldResetCache = Boolean(outputCacheStoreId) && outputCacheActionOnResult === 'overwrite';
        if (shouldResetCache) {
            const store = yield actor.io.openKeyValueStore(outputCacheStoreId);
            yield store.drop();
        }
        for (const hookSource of hooksBeforeRun) {
            const hook = genHookFn(actor, hookSource);
            yield (hook !== null && hook !== undefined ? hook() : undefined);
        }
        const runRes = yield actor.crawler.run(requests, options);
        for (const hookSource of hooksAfterRun) {
            const hook = genHookFn(actor, hookSource);
            yield (hook !== null && hook !== undefined ? hook() : undefined);
        }
        // Trigger metamorph if it was set from the input
        yield metamorph();
        return runRes;
    });
};
247
/**
 * Create a function that triggers metamorph (a downstream crawler).
 *
 * Settings come from the `overrides` argument first; properties that are
 * left undefined there fall back to the actor input. Nothing happens when
 * no `metamorphActorId` is resolved.
 *
 * @param actor Actor (or pre-actor) providing `input` and `io`.
 * @returns Async `(overrides?) => void` function.
 */
const createScopedMetamorph = (actor) => {
    return (overrides) => __awaiter(void 0, void 0, void 0, function* () {
        const inputFallback = actor.input !== null && actor.input !== undefined ? actor.input : {};
        const settings = (0, lodash_1.defaults)({}, overrides, inputFallback);
        const targetActorId = settings.metamorphActorId;
        const targetBuild = settings.metamorphActorBuild;
        const targetInput = settings.metamorphActorInput;
        // Metamorph is opt-in: without a target actor there is nothing to do
        if (!targetActorId) {
            return;
        }
        yield actor.io.triggerDownstreamCrawler(targetActorId, targetInput, { build: targetBuild });
    });
};
261
/**
 * Create a `pushData` wrapper whose options are pre-populated from the actor
 * input. Options passed at call time take precedence over those from input.
 *
 * The `outputTransform` / `outputFilter` hooks are generated anew on every call.
 *
 * @param actor Actor (or pre-actor) providing `input` and `io`.
 * @returns Async `(entries, ctx, options?) => ...` function that delegates to `pushData`.
 */
const createScopedPushData = (actor) => {
    const actorInput = actor.input !== null && actor.input !== undefined ? actor.input : {};
    const { includePersonalData, requestQueueId, outputMaxEntries, outputTransform, outputFilter, outputDatasetId, outputPickFields, outputRenameFields, outputCacheStoreId, outputCachePrimaryKeys, outputCacheActionOnResult, } = actorInput;
    // Expose a hook as a single-argument callback, or leave the option unset
    const asItemCallback = (hook) => (hook ? (item) => hook(item) : undefined);
    return (entries, ctx, options) => __awaiter(void 0, void 0, void 0, function* () {
        const transformFn = genHookFn(actor, outputTransform);
        const filterFn = genHookFn(actor, outputFilter);
        const optionsFromInput = {
            io: actor.io,
            showPrivate: includePersonalData,
            maxCount: outputMaxEntries,
            pickKeys: outputPickFields,
            remapKeys: outputRenameFields,
            transform: asItemCallback(transformFn),
            filter: asItemCallback(filterFn),
            datasetId: outputDatasetId,
            requestQueueId,
            cacheStoreId: outputCacheStoreId,
            cachePrimaryKeys: outputCachePrimaryKeys,
            cacheActionOnResult: outputCacheActionOnResult,
        };
        const mergedOptions = Object.assign(optionsFromInput, options);
        return (0, pushData_1.pushData)(entries, ctx, mergedOptions);
    });
};
273
/**
 * Create a `pushRequests` wrapper whose options are pre-populated from the
 * actor input. Options passed at call time take precedence over those from input.
 *
 * The `requestTransform` / `requestFilter` hooks are generated anew on every call.
 *
 * @param actor Actor (or pre-actor) providing `input` and `io`.
 * @returns Async `(entries, ctx, options?) => ...` function that delegates to `pushRequests`.
 */
const createScopedPushRequests = (actor) => {
    const actorInput = actor.input !== null && actor.input !== undefined ? actor.input : {};
    const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = actorInput;
    // Expose a hook as a single-argument callback, or leave the option unset
    const asItemCallback = (hook) => (hook ? (item) => hook(item) : undefined);
    return (entries, ctx, options) => __awaiter(void 0, void 0, void 0, function* () {
        const transformFn = genHookFn(actor, requestTransform);
        const filterFn = genHookFn(actor, requestFilter);
        const optionsFromInput = {
            io: actor.io,
            maxCount: requestMaxEntries,
            transform: asItemCallback(transformFn),
            filter: asItemCallback(filterFn),
            requestQueueId,
        };
        const mergedOptions = Object.assign(optionsFromInput, options);
        return (0, pushRequests_1.pushRequests)(entries, ctx, mergedOptions);
    });
};
285
/**
 * Assemble crawler options from three layers. Later layers win:
 * 1. `defaults`
 * 2. crawler-related fields picked from the actor `input`
 *    (the keys of `crawlerInput` from the config module)
 * 3. `overrides`
 *
 * Fields whose value is `undefined` are dropped from each layer first, so an
 * unset field never masks a value from a lower layer.
 *
 * @returns Plain object with the merged crawler options.
 */
const createHttpCrawlerOptions = ({ input, defaults, overrides, }) => {
    const isUnset = (field) => field === undefined;
    const orEmpty = (layer) => (layer !== null && layer !== undefined ? layer : {});
    const fromDefaults = (0, lodash_1.omitBy)(orEmpty(defaults), isUnset);
    const crawlerFieldsOfInput = (0, lodash_1.pick)(orEmpty(input), Object.keys(config_1.crawlerInput));
    const fromInput = (0, lodash_1.omitBy)(crawlerFieldsOfInput, isUnset);
    const fromOverrides = (0, lodash_1.omitBy)(orEmpty(overrides), isUnset);
    return Object.assign({}, fromDefaults, fromInput, fromOverrides);
};
exports.createHttpCrawlerOptions = createHttpCrawlerOptions;
291
/**
 * Collect start URLs from the three input sources, in this order:
 * 1. `startUrls` - list given directly in the input.
 * 2. `startUrlsFromDataset` - string of the form `datasetId#field`; the
 *    values of that field are loaded from the dataset.
 * 3. `startUrlsFromFunction` - hook function (given as a string) that
 *    returns a list of URLs.
 *
 * A source that yields `null` / `undefined` contributes nothing instead of
 * throwing.
 *
 * @param actor Actor (or pre-actor) providing `input` and `io`.
 * @returns Promise resolving to a new array with all collected URLs.
 */
const getStartUrlsFromInput = (actor) => __awaiter(void 0, void 0, void 0, function* () {
    const actorInput = actor.input !== null && actor.input !== undefined ? actor.input : {};
    const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = actorInput;
    const urlsAgg = [...(startUrls !== null && startUrls !== undefined ? startUrls : [])];
    // Append one by one rather than `push(...items)`: spreading a very large
    // list into a call can exceed the engine's argument limit, and spreading
    // `undefined` throws.
    const appendAll = (items) => {
        if (items === null || items === undefined)
            return;
        for (const item of items) {
            urlsAgg.push(item);
        }
    };
    if (startUrlsFromDataset) {
        const [datasetId, field] = startUrlsFromDataset.split('#');
        const urlsFromDataset = yield (0, dataset_1.getColumnFromDataset)(datasetId, field, { io: actor.io });
        appendAll(urlsFromDataset);
    }
    if (startUrlsFromFunction) {
        const urlsHook = genHookFn(actor, startUrlsFromFunction);
        const urlsFromFn = yield (urlsHook !== null && urlsHook !== undefined ? urlsHook() : undefined);
        appendAll(urlsFromFn);
    }
    return urlsAgg;
});
306
+ //# sourceMappingURL=actor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAWiB;AACjB,mCAAgD;AAEhD,+CAA2C;AAI3C,wDAA2D;AAC3D,4CAA8C;AAC9C,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,sCAAgE;AAChE,sCAUmB;AACnB,gCAAgD;AAUhD,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAE,CAAmB,CAAC,UAAU,IAAK,CAAmB,CAAC,iBAAiB,CAAC,CAAC;AACvF,CAAC,CAAC;AACF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAMhB,KAA4E,EAC5E,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACO,CAAC;IAElC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AAEF;;;;;;;GAOG;AACI,MAAM,sBAAsB,GAAG,CAMpC,KAiCD,EAAiB,EAAE;IAClB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,aAAa,EACb,YAAY,GACb,GAAG,KAAK,CAAC;IAEV,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,WAAW,CAAC;IAEnD,MAAM,IAAA,oBAAW,kCAAM,aAAa,KAAE,UAAU,EAAE,SAAS,KAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAEvE,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,aAAa,GAA8D;YAC/E,EAAE;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,cAAc,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBAC7B,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAGtC;oBACA,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,yEAAyE;wBACzE,oBAAoB,EAAE,
IAAA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,YAAY,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,iBAAiB,mCAAI,IAAI;yBAC/C,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;YACD,MAAM,EAAE,EAAE;YACV,aAAa,EAAE,EAAS;SACzB,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,IAAA,wBAAgB,kCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,cAAc,EAAE,MAAA,WAAW,CAAC,cAAc,mCAAK,aAAa,CAAC,cAAsB,EACnF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxGW,QAAA,sBAAsB,0BAwGjC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACI,MAAM,gBAAgB,GAAG,CAM9B,MAAmE,EACnB,EAAE;IAClD,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,MAAM,CAAC;IAE9C,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,IAAG,CAAC;IAE1D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9
C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,kBAAkB;IAE3I,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;KACN,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACsC,CAAC;IAElD,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,0BAAiB,EAAkE;QACvF,EAAE;QACF,MAAM;QACN,cAAc;QACd,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IACH,MAAM,IAAA,yBAAgB,EAA2D;QAC/E,MAAM;QACN,cAAc;QACd,aAAa;QACb,aAAa;KACd,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAhGW,QAAA,gBAAgB,oBAgG3B;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAA
I,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAM7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAAyC,EAAE,EAAE;IAC1E,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CAAC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAAC,KAAmD,EAAE,EAAE;;IACnF,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB
,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAA6B,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QAC/E,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAAC,KAAmD,EAAE,EAAE;;IACvF,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAiC,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QACtF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IACnD,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGtC,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAA
L,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAAO,KAAmD,EAAE,EAAE;;IAC1F,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7B;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport * as Sentry from '@sentry/node';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { setupSentry } from '../error/sentry';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultRoute } from '../router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n 
MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n} from '../config';\nimport { logLevelHandlerWrapper } from '../log';\nimport type {\n ActorContext,\n ActorDefinition,\n ActorHookContext,\n ActorRouterContext,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler).addHandler && (r as RouterHandler).addDefaultHandler);\n};\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<ActorContext<Ctx, Labels, Input, TIO>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies ActorHookContext<TIO>;\n\n const hookFn = eval(fnStr);\n if (!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Create default configuration for an opinionated Crawlee actor,\n * and run the actor within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport const createAndRunCrawleeOne = async <\n TCrawlerType extends CrawlerType,\n Ctx extends 
CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(input: {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n ActorDefinition<Ctx, Labels, Input, TIO>,\n 'router' | 'createCrawler' | 'io'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Sentry configuration. If using default `createCrawler` implementation,\n * failed requests are optionally reported to Sentry.\n *\n * To disable Sentry, set `\"enabled\": false`.\n */\n sentryOptions?: Sentry.NodeOptions;\n /**\n * Callback with the created actor. 
The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: ActorContext<Ctx, Labels, Input, TIO>) => MaybePromise<void>;\n}): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n sentryOptions,\n onActorReady,\n } = input;\n\n const { io = apifyIO as any as TIO } = actorConfig;\n\n await setupSentry({ ...sentryOptions, serverName: actorName }, { io });\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n const actorDefaults: ActorDefinition<Ctx, Labels, Input & AllActorInputs, TIO> = {\n io,\n router: Router.create<Ctx>(),\n routerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<\n CrawlerMeta<TCrawlerType, any>['options'],\n Input\n >({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to Sentry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToSentry: input?.errorSendToSentry ?? true,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n routes: [],\n routeHandlers: {} as any,\n };\n\n const actor = await createCrawleeOne<Ctx, Labels, Input, TIO>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routerWrappers: actorConfig.routerWrappers ?? (actorDefaults.routerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? 
(actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const createCrawleeOne = async <\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>\n): Promise<ActorContext<Ctx, Labels, Input, TIO>> => {\n const { io = apifyIO as any as TIO } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? 
await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routerWrappers = isFunc(config.routerWrappers) ? 
await config.routerWrappers(getConfig()) : config.routerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies ActorContext<Ctx, Labels, Input, TIO>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultRoute<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels, Input>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n await registerHandlers<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n });\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? 
null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using 
Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<ActorContext, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? {}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (actor: Pick<ActorContext, 'input' | 'state' | 'io'>) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: ActorContext['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? 
(item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (actor: Pick<ActorContext, 'input' | 'state' | 'io'>) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: ActorContext['pushRequests'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, ctx, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. 
in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (actor: Pick<ActorContext, 'input' | 'state' | 'io'>) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
@@ -0,0 +1,162 @@
1
+ import type { BasicCrawler, CrawlingContext, ProxyConfiguration, RouterHandler } from 'crawlee';
2
+ import type { gotScraping } from 'got-scraping';
3
+ import type { MaybePromise, PickPartial } from '../../utils/types';
4
+ import type { CrawlerUrl } from '../../types';
5
+ import type { itemCacheKey, pushData } from '../io/pushData';
6
+ import type { pushRequests } from '../io/pushRequests';
7
+ import type { RouteHandler, RouteMatcher, CrawlerRouterWrapper } from '../router';
8
+ import type { MetamorphActorInput } from '../config';
9
+ import type { CrawleeOneIO } from '../integrations/types';
10
+ /** Either a plain value `R`, or a function that takes `Args` and returns `MaybePromise<R>`. */ type MaybeAsyncFn<R, Args extends any[]> = R | ((...args: Args) => MaybePromise<R>);
11
+ /** Type of the `run` method of a `BasicCrawler` parameterized by context `T`. */ type OrigRunCrawler<T extends CrawlingContext<any, any>> = BasicCrawler<T>['run'];
12
+ /** Extended type of the `crawler.run()` function: `requests` is an optional list of `CrawlerUrl`, while the options (second parameter) and the return type are taken from `crawler.run()` itself. */
13
+ export type RunCrawler<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>> = (requests?: CrawlerUrl[], options?: Parameters<OrigRunCrawler<Ctx>>[1]) => ReturnType<OrigRunCrawler<Ctx>>;
14
+ /** Trigger actor metamorph. Values given in `overrides` are used first; anything left unset falls back to the actor's input. */
15
+ export type Metamorph = (overrides?: MetamorphActorInput) => Promise<void>;
16
+ /** Context passed to route handlers; exposes the actor instance under `actor`. */
17
+ export type ActorRouterContext<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO = CrawleeOneIO> = {
18
+ actor: ActorContext<Ctx, Labels, Input, TIO>;
19
+ };
20
+ /** Context passed to user-defined hook functions that are supplied via actor input. Carries the actor's `input` and `state` plus the helpers below. */
21
+ export type ActorHookContext<TIO extends CrawleeOneIO> = Pick<ActorContext, 'input' | 'state'> & {
22
+ io: TIO; // IO client (type parameter `TIO`)
23
+ itemCacheKey: typeof itemCacheKey; // the `itemCacheKey` helper exported from `../io/pushData`
24
+ sendRequest: typeof gotScraping; // the `gotScraping` HTTP client from `got-scraping`
25
+ };
26
+ export interface ActorDefinition<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO = CrawleeOneIO> {
27
+ /** Client for communicating with cloud/local storage. */
28
+ io: TIO;
29
+ /**
30
+ * Actor input which you can get e.g. via `Actor.getInput()`, or a function that returns it.
31
+ *
32
+ * If not provided, the input is retrieved automatically via `io.getInput()`.
33
+ */
34
+ input?: MaybeAsyncFn<Input, [ActorDefinition<Ctx, Labels, Input, TIO>]>;
35
+ /** Validation for the actor input. Should throw an error if validation fails. */
36
+ validateInput?: (input: Input | null) => MaybePromise<void>;
37
+ /**
38
+ * Router instance that redirects the request to handlers.
39
+ * @example
40
+ * import { createCheerioRouter } from 'crawlee';
41
+ *
42
+ * ({
43
+ * ...
44
+ * router: createCheerioRouter(),
45
+ * })
46
+ */
47
+ router: MaybeAsyncFn<RouterHandler<Ctx>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;
48
+ /**
49
+ * Criteria that un-labelled requests are matched against.
50
+ *
51
+ * E.g. If `match` function returns truthy value,
52
+ * the request is passed to the `action` function for processing.
53
+ *
54
+ * @example
55
+ * ({
56
+ * ...
57
+ * routes: [{
58
+ * // If match returns true, the request is forwarded to handler
59
+ * // with label JOB_DETAIL.
60
+ * name: 'Job detail',
61
+ * handlerLabel: routeLabels.JOB_DETAIL,
62
+ * match: (url) => isUrlOfJobOffer(url),
63
+ * }, {
64
+ * // Define custom action function:
65
+ * // If match returns true, we replace this request with new one
66
+ * // pointing to new domain.
67
+ * name: 'Main page',
68
+ * handlerLabel: null,
69
+ * match: (url) => url.match(/example\.com\/?(?:[?#~]|$)/i),
70
+ * action: async (url, ctx, _, handlers) => {
71
+ * ctx.log.info(`Redirecting to https://www.new-domain.com`);
72
+ * await ctx.crawler.addRequests(['https://www.new-domain.com'], { forefront: true });
73
+ * },
74
+ * }],
75
+ * })
76
+ */
77
+ routes: MaybeAsyncFn<RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[], [
78
+ ActorDefinitionWithInput<Ctx, Labels, Input, TIO>
79
+ ]>;
80
+ /** Handlers for the labelled requests. The object keys are the labels. */
81
+ routeHandlers: MaybeAsyncFn<Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;
82
+ /**
83
+ * Provides the option to modify or extend all router handlers by wrapping
84
+ * them in these functions.
85
+ *
86
+ * Wrappers are applied from right to left. That means that wrappers `[A, B, C]`
87
+ * will be applied like so `A( B( C( handler ) ) )`.
88
+ *
89
+ * Default `routerWrappers`:
90
+ * ```js
91
+ * {
92
+ * ...
93
+ * routerWrappers: ({ input }) => [
94
+ * logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),
95
+ * ],
96
+ * }
97
+ * ```
98
+ */
99
+ routerWrappers?: MaybeAsyncFn<CrawlerRouterWrapper<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>[], [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;
100
+ /** Proxy configuration, or a function that creates one. When null or undefined, `io.createDefaultProxyConfiguration(input)` is used instead. */ proxy?: MaybeAsyncFn<ProxyConfiguration, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;
101
+ /** Factory that creates the Crawlee crawler. It receives the actor context as it exists before the crawler and the scoped helpers (`runCrawler`, `metamorph`, `pushData`, `pushRequests`, `startUrls`) are attached. */ createCrawler: (actorCtx: Omit<ActorContext<Ctx, Labels, Input, TIO>, 'crawler' | 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'>) => MaybePromise<Ctx['crawler']>;
102
+ }
103
+ /** ActorDefinition object where the input is already resolved */
104
+ export type ActorDefinitionWithInput<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO = CrawleeOneIO> = Omit<ActorDefinition<Ctx, Labels, Input, TIO>, 'input'> & {
105
+ input: Input | null;
106
+ state: Record<string, unknown>;
107
+ };
108
+ /** Context available while creating a Crawlee crawler/actor */
109
+ export interface ActorContext<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO = CrawleeOneIO> {
110
+ crawler: Ctx['crawler'];
111
+ /**
112
+ * This function wraps `crawler.run(requests, runOptions)` with additional
113
+ * features:
114
+ * - Automatically metamorph into another actor after the run finishes
115
+ */
116
+ runCrawler: RunCrawler<Ctx>;
117
+ /** Trigger actor metamorph, using actor's inputs as defaults. */
118
+ metamorph: Metamorph;
119
+ /**
120
+ * `Actor.pushData` with extra optional features:
121
+ *
122
+ * - Limit the number of entries pushed to the Dataset based on the Actor input
123
+ * - Transform and filter entries via Actor input.
124
+ * - Add metadata to entries before they are pushed to Dataset.
125
+ * - Set which (nested) properties are personal data, and optionally redact them for privacy compliance.
126
+ */
127
+ pushData: typeof pushData;
128
+ /**
129
+ * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:
130
+ *
131
+ * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.
132
+ * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.
133
+ */
134
+ pushRequests: typeof pushRequests;
135
+ /**
136
+ * A list of resolved Requests to be scraped.
137
+ *
138
+ * This list is a combination of 3 Actor inputs:
139
+ * - `startUrls` - Static list of URLs to scrape.
140
+ * - `startUrlsFromDataset` - From a specific field from a Dataset (e.g. "dataset123#fieldName" - Dataset: "dataset123", field: "fieldName").
141
+ * - `startUrlsFromFunction` - A function that is evaluated to generate the Requests.
142
+ */
143
+ startUrls: CrawlerUrl[];
144
+ proxy?: ProxyConfiguration;
145
+ router: RouterHandler<Ctx>;
146
+ routes: RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[];
147
+ routeHandlers: Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>;
148
+ /** Original config from which this actor context was created */
149
+ config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>;
150
+ /** Read-only inputs passed to the actor */
151
+ input: Input | null;
152
+ /** Mutable state that is shared across setup and teardown hooks */
153
+ state: Record<string, unknown>;
154
+ /**
155
+ * Instance managing communication with databases - storage & retrieval
156
+ * (Dataset, RequestQueue, KeyValueStore).
157
+ *
158
+ * This is modelled after, and similar to, Apify's `Actor` static class.
159
+ */
160
+ io: TIO;
161
+ }
162
+ export {};
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/actor/types.ts"],"names":[],"mappings":"","sourcesContent":["import type { BasicCrawler, CrawlingContext, ProxyConfiguration, RouterHandler } from 'crawlee';\nimport type { gotScraping } from 'got-scraping';\n\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport type { CrawlerUrl } from '../../types';\nimport type { itemCacheKey, pushData } from '../io/pushData';\nimport type { pushRequests } from '../io/pushRequests';\nimport type { RouteHandler, RouteMatcher, CrawlerRouterWrapper } from '../router';\nimport type { MetamorphActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\n\ntype MaybeAsyncFn<R, Args extends any[]> = R | ((...args: Args) => MaybePromise<R>);\n\ntype OrigRunCrawler<T extends CrawlingContext<any, any>> = BasicCrawler<T>['run'];\n\n/** Extended type of `crawler.run()` function */\nexport type RunCrawler<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>> = (\n requests?: CrawlerUrl[],\n options?: Parameters<OrigRunCrawler<Ctx>>[1]\n) => ReturnType<OrigRunCrawler<Ctx>>;\n\n/** Trigger actor metamorph, using actor's inputs as defaults. 
*/\nexport type Metamorph = (overrides?: MetamorphActorInput) => Promise<void>;\n\n/** Context passed to route handlers */\nexport type ActorRouterContext<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = {\n actor: ActorContext<Ctx, Labels, Input, TIO>;\n};\n\n/** Context passed to user-defined functions passed from input */\nexport type ActorHookContext<TIO extends CrawleeOneIO> = Pick<ActorContext, 'input' | 'state'> & {\n io: TIO;\n itemCacheKey: typeof itemCacheKey;\n sendRequest: typeof gotScraping;\n};\n\nexport interface ActorDefinition<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> {\n /** Client for communicating with cloud/local storage. */\n io: TIO;\n\n // Actor input\n /**\n * Actor input which you can get e.g. via `Actor.getInput()`\n *\n * Input is automatically retrieved if undefined.\n */\n input?: MaybeAsyncFn<Input, [ActorDefinition<Ctx, Labels, Input, TIO>]>;\n /** Validation for the actor input. Should throw error if validation fails. */\n validateInput?: (input: Input | null) => MaybePromise<void>;\n\n // Router setup\n /**\n * Router instance that redirects the request to handlers.\n * @example\n * import { createCheerioRouter } from 'crawlee';\n *\n * ({\n * ...\n * router: createCheerioRouter(),\n * })\n */\n router: MaybeAsyncFn<RouterHandler<Ctx>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;\n /**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. 
If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n *\n * @example\n * ({\n * ...\n * routes: [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * handlerLabel: routeLabels.JOB_DETAIL,\n * match: (url) => isUrlOfJobOffer(url),\n * }, {\n * // Define custom action function:\n * // If match returns true, we replace this request with new one\n * // pointing to new domain.\n * name: 'Main page',\n * handlerLabel: null,\n * match: (url) => url.match(/example\\.com\\/?(?:[?#~]|$)/i),\n * action: async (url, ctx, _, handlers) => {\n * ctx.log.info(`Redirecting to https://www.new-domain.com`);\n * await ctx.crawler.addRequests(['https://www.new-domain.com'], { forefront: true });\n * },\n * }],\n * })\n */\n routes: MaybeAsyncFn<\n RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[],\n [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]\n >;\n /** Handlers for the labelled requests. The object keys are the labels. */\n routeHandlers: MaybeAsyncFn<Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n /**\n * Provides the option to modify or extend all router handlers by wrapping\n * them in these functions.\n *\n * Wrappers are applied from right to left. That means that wrappers `[A, B, C]`\n * will be applied like so `A( B( C( handler ) ) )`.\n *\n * Default `routerWrappers`:\n * ```js\n * {\n * ...\n * routerWrappers: ({ input }) => [\n * logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 
'info'),\n * ],\n * }\n * ```\n */\n routerWrappers?: MaybeAsyncFn<CrawlerRouterWrapper<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>[], [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n\n // Proxy setup\n proxy?: MaybeAsyncFn<ProxyConfiguration, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n\n // Crawler setup\n createCrawler: (\n actorCtx: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'crawler' | 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n ) => MaybePromise<Ctx['crawler']>;\n}\n\n/** ActorDefinition object where the input is already resolved */\nexport type ActorDefinitionWithInput<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = Omit<ActorDefinition<Ctx, Labels, Input, TIO>, 'input'> & {\n input: Input | null;\n state: Record<string, unknown>;\n};\n\n/** Context available while creating a Crawlee crawler/actor */\nexport interface ActorContext<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> {\n crawler: Ctx['crawler'];\n /**\n * This function wraps `crawler.run(requests, runOtions)` with additional\n * features:\n * - Automatically metamorph into another actor after the run finishes\n */\n runCrawler: RunCrawler<Ctx>;\n /** Trigger actor metamorph, using actor's inputs as defaults. 
*/\n metamorph: Metamorph;\n /**\n * `Actor.pushData` with extra optional features:\n *\n * - Limit the number of entries pushed to the Dataset based on the Actor input\n * - Transform and filter entries via Actor input.\n * - Add metadata to entries before they are pushed to Dataset.\n * - Set which (nested) properties are personal data optionally redact them for privacy compliance.\n */\n pushData: typeof pushData;\n /**\n * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:\n *\n * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.\n * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.\n */\n pushRequests: typeof pushRequests;\n /**\n * A list of resolved Requests to be scraped.\n *\n * This list is a combination of 3 Actor inputs:\n * - `startUrls` - Static list of URLs to scrape.\n * - `startUrlsFromDataset` - From a specific field from a Dataset (e.g. \"dataset123#fieldName\" - Dataset: \"dataset123\", field: \"fieldName\").\n * - `startUrlsFromFunction` - A function that is evaulated to generate the Requests.\n */\n startUrls: CrawlerUrl[];\n proxy?: ProxyConfiguration;\n router: RouterHandler<Ctx>;\n routes: RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>;\n /** Original config from which this actor context was created */\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>;\n /** Read-only inputs passed to the actor */\n input: Input | null;\n /** Mutable state that is shared across setup and teardown hooks */\n state: Record<string, unknown>;\n /**\n * Instance managing communication with databases - storage & retrieval\n * (Dataset, RequestQueue, KeyValueStore).\n *\n * This is modelled and similar to Apify's `Actor` static class.\n */\n io: TIO;\n}\n"]}