crawlee-one 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -0
- package/dist/cjs/cli/cli.d.ts +1 -0
- package/dist/cjs/cli/cli.js +61 -0
- package/dist/cjs/cli/cli.js.map +1 -0
- package/dist/cjs/cli/index.d.ts +2 -0
- package/dist/cjs/cli/index.js +6 -0
- package/dist/cjs/cli/index.js.map +1 -0
- package/dist/cjs/index.d.ts +24 -0
- package/dist/cjs/index.js +43 -0
- package/dist/cjs/index.js.map +1 -0
- package/dist/cjs/lib/actions/dom.d.ts +102 -0
- package/dist/cjs/lib/actions/dom.js +743 -0
- package/dist/cjs/lib/actions/dom.js.map +1 -0
- package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
- package/dist/cjs/lib/actions/domUtils.js +126 -0
- package/dist/cjs/lib/actions/domUtils.js.map +1 -0
- package/dist/cjs/lib/actions/page.d.ts +69 -0
- package/dist/cjs/lib/actions/page.js +205 -0
- package/dist/cjs/lib/actions/page.js.map +1 -0
- package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
- package/dist/cjs/lib/actions/scrapeListing.js +242 -0
- package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
- package/dist/cjs/lib/actor/actor.d.ts +90 -0
- package/dist/cjs/lib/actor/actor.js +306 -0
- package/dist/cjs/lib/actor/actor.js.map +1 -0
- package/dist/cjs/lib/actor/types.d.ts +162 -0
- package/dist/cjs/lib/actor/types.js +3 -0
- package/dist/cjs/lib/actor/types.js.map +1 -0
- package/dist/cjs/lib/actor.d.ts +189 -0
- package/dist/cjs/lib/actor.js +225 -0
- package/dist/cjs/lib/actor.js.map +1 -0
- package/dist/cjs/lib/actorSpec.d.ts +20 -0
- package/dist/cjs/lib/actorSpec.js +3 -0
- package/dist/cjs/lib/actorSpec.js.map +1 -0
- package/dist/cjs/lib/config.d.ts +561 -0
- package/dist/cjs/lib/config.js +707 -0
- package/dist/cjs/lib/config.js.map +1 -0
- package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
- package/dist/cjs/lib/dataset/maxCount.js +55 -0
- package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
- package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
- package/dist/cjs/lib/dataset/pushData.js +182 -0
- package/dist/cjs/lib/dataset/pushData.js.map +1 -0
- package/dist/cjs/lib/dataset.d.ts +98 -0
- package/dist/cjs/lib/dataset.js +122 -0
- package/dist/cjs/lib/dataset.js.map +1 -0
- package/dist/cjs/lib/dom.d.ts +78 -0
- package/dist/cjs/lib/dom.js +243 -0
- package/dist/cjs/lib/dom.js.map +1 -0
- package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
- package/dist/cjs/lib/error/errorHandler.js +164 -0
- package/dist/cjs/lib/error/errorHandler.js.map +1 -0
- package/dist/cjs/lib/error/sentry.d.ts +11 -0
- package/dist/cjs/lib/error/sentry.js +60 -0
- package/dist/cjs/lib/error/sentry.js.map +1 -0
- package/dist/cjs/lib/integrations/apify.d.ts +67 -0
- package/dist/cjs/lib/integrations/apify.js +106 -0
- package/dist/cjs/lib/integrations/apify.js.map +1 -0
- package/dist/cjs/lib/integrations/types.d.ts +274 -0
- package/dist/cjs/lib/integrations/types.js +3 -0
- package/dist/cjs/lib/integrations/types.js.map +1 -0
- package/dist/cjs/lib/io/dataset.d.ts +67 -0
- package/dist/cjs/lib/io/dataset.js +86 -0
- package/dist/cjs/lib/io/dataset.js.map +1 -0
- package/dist/cjs/lib/io/maxCount.d.ts +30 -0
- package/dist/cjs/lib/io/maxCount.js +55 -0
- package/dist/cjs/lib/io/maxCount.js.map +1 -0
- package/dist/cjs/lib/io/pushData.d.ts +124 -0
- package/dist/cjs/lib/io/pushData.js +193 -0
- package/dist/cjs/lib/io/pushData.js.map +1 -0
- package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
- package/dist/cjs/lib/io/pushRequests.js +63 -0
- package/dist/cjs/lib/io/pushRequests.js.map +1 -0
- package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
- package/dist/cjs/lib/io/requestQueue.js +40 -0
- package/dist/cjs/lib/io/requestQueue.js.map +1 -0
- package/dist/cjs/lib/log.d.ts +38 -0
- package/dist/cjs/lib/log.js +54 -0
- package/dist/cjs/lib/log.js.map +1 -0
- package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
- package/dist/cjs/lib/migrate/localMigrator.js +57 -0
- package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
- package/dist/cjs/lib/migrate/localState.d.ts +7 -0
- package/dist/cjs/lib/migrate/localState.js +43 -0
- package/dist/cjs/lib/migrate/localState.js.map +1 -0
- package/dist/cjs/lib/migrate/types.d.ts +6 -0
- package/dist/cjs/lib/migrate/types.js +3 -0
- package/dist/cjs/lib/migrate/types.js.map +1 -0
- package/dist/cjs/lib/readme/readme.d.ts +65 -0
- package/dist/cjs/lib/readme/readme.js +534 -0
- package/dist/cjs/lib/readme/readme.js.map +1 -0
- package/dist/cjs/lib/readme/types.d.ts +260 -0
- package/dist/cjs/lib/readme/types.js +54 -0
- package/dist/cjs/lib/readme/types.js.map +1 -0
- package/dist/cjs/lib/router.d.ts +132 -0
- package/dist/cjs/lib/router.js +165 -0
- package/dist/cjs/lib/router.js.map +1 -0
- package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
- package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
- package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
- package/dist/cjs/lib/test/actor.d.ts +21 -0
- package/dist/cjs/lib/test/actor.js +56 -0
- package/dist/cjs/lib/test/actor.js.map +1 -0
- package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
- package/dist/cjs/lib/test/mockApifyClient.js +176 -0
- package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
- package/dist/cjs/types.d.ts +31 -0
- package/dist/cjs/types.js +3 -0
- package/dist/cjs/types.js.map +1 -0
- package/dist/cjs/utils/async.d.ts +19 -0
- package/dist/cjs/utils/async.js +74 -0
- package/dist/cjs/utils/async.js.map +1 -0
- package/dist/cjs/utils/error.d.ts +1 -0
- package/dist/cjs/utils/error.js +10 -0
- package/dist/cjs/utils/error.js.map +1 -0
- package/dist/cjs/utils/format.d.ts +9 -0
- package/dist/cjs/utils/format.js +19 -0
- package/dist/cjs/utils/format.js.map +1 -0
- package/dist/cjs/utils/package.d.ts +15 -0
- package/dist/cjs/utils/package.js +25 -0
- package/dist/cjs/utils/package.js.map +1 -0
- package/dist/cjs/utils/types.d.ts +6 -0
- package/dist/cjs/utils/types.js +9 -0
- package/dist/cjs/utils/types.js.map +1 -0
- package/dist/cjs/utils/url.d.ts +9 -0
- package/dist/cjs/utils/url.js +32 -0
- package/dist/cjs/utils/url.js.map +1 -0
- package/dist/cjs/utils/valueMonitor.d.ts +31 -0
- package/dist/cjs/utils/valueMonitor.js +91 -0
- package/dist/cjs/utils/valueMonitor.js.map +1 -0
- package/package.json +85 -0
|
@@ -0,0 +1,707 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.allActorInputValidationFields = exports.metamorphInputValidationFields = exports.outputInputValidationFields = exports.requestInputValidationFields = exports.privacyInputValidationFields = exports.proxyInputValidationFields = exports.loggingInputValidationFields = exports.startUrlsInputValidationFields = exports.inputInputValidationFields = exports.perfInputValidationFields = exports.crawlerInputValidationFields = exports.allActorInputs = exports.metamorphInput = exports.outputInput = exports.requestInput = exports.privacyInput = exports.proxyInput = exports.loggingInput = exports.startUrlsInput = exports.perfInput = exports.crawlerInput = exports.inputInput = void 0;
|
|
7
|
+
const apify_actor_config_1 = require("apify-actor-config");
|
|
8
|
+
const joi_1 = __importDefault(require("joi"));
|
|
9
|
+
const log_1 = require("./log");
|
|
10
|
+
/** Regex source for a storage ID/name: starts with an alphanumeric, then alphanumerics or hyphens. */
const datasetIdPattern = '^[a-zA-Z0-9][a-zA-Z0-9-]*$';
/**
 * Regex source for `{datasetId}#{field}`: the ID pattern with its trailing `$`
 * anchor removed, followed by `#` and a non-empty field name.
 */
const datasetIdWithFieldPattern = datasetIdPattern.substring(0, datasetIdPattern.length - 1) + '#.+$';
/**
 * Produces `n` consecutive HTML line breaks.
 *
 * @param {number} n - Number of `<br/>` tags to emit.
 * @returns {string} The concatenated `<br/>` tags.
 */
const newLine = (n) => {
    const lineBreak = '<br/>';
    return lineBreak.repeat(n);
};
|
|
13
|
+
/**
 * Builds the commented-out hook function text used as `example` / `prefill`
 * for the JavaScript-editor input fields.
 *
 * The result is a JSDoc header describing the hook inputs, followed by an
 * `async (...) => { ... }` arrow function in which every line is commented out.
 *
 * @param {Record<string, string>} args - Positional hook arguments, as
 *   `name -> description`. Names are listed in the signature before the
 *   destructured context object; descriptions go into the JSDoc header.
 * @param {string} mainCode - Example body; each line is emitted prefixed with `// `.
 * @param {boolean} includeGuides - When truthy, the extra commented guides
 *   (dataset, `sendRequest`, cache) are appended; otherwise a single `//` line.
 * @returns {string} The generated snippet (starts with a newline).
 */
const createHookFnExample = (args, mainCode, includeGuides) => {
    const argEntries = Object.entries(args);
    // Positional args precede the destructured context object in the signature.
    const formattedArgs = argEntries.map(([arg]) => `${arg}, `).join('');
    // One JSDoc line per argument. The lines are joined explicitly: interpolating
    // the raw array into the template below would comma-join them onto one line.
    const formattedArgDesc = argEntries.length
        ? argEntries.map(([arg, desc]) => ` * \`${arg}\` - ${desc}.`).join('\n')
        : ` *`;
    const formattedMainCode = mainCode
        .split('\n')
        .map((line) => '// ' + line)
        .join('\n');
    const guides = `//
// /* ========== SEE BELOW FOR MORE EXAMPLES ========= */
//
// /**
// * ======= ACCESSING DATASET ========
// * To save/load/access entries in Dataset.
// * Docs:
// * - https://docs.apify.com/platform/storage/dataset
// * - https://docs.apify.com/sdk/js/docs/guides/result-storage#dataset
// * - https://docs.apify.com/sdk/js/docs/examples/map-and-reduce
// */
// // const dataset = await io.openDataset('MyDatasetId');
// // const info = await dataset.getInfo();
// // console.log(info.itemCount);
// // // => 0
//
// /**
// * ======= ACCESSING REMOTE DATA ========
// * Use \`sendRequest\` to get data from the internet:
// * Docs:
// * - https://github.com/apify/got-scraping
// */
// // const catFact = await sendRequest.get('https://cat-fact.herokuapp.com/facts/5887e1d85c873e0011036889').json();
// // console.log(catFact.text);
// // // => "Cats make about 100 different sounds. Dogs make only about 10.",
//
// /**
// * ======= USING CACHE ========
// * To save the entry to the KeyValue cache (or retrieve it), you can use
// * \`itemCacheKey\` to create the entry's ID for you:
// */
// // const cacheId = itemCacheKey(item, input.cachePrimaryKeys);
// // const cache = await io.openKeyValueStore('MyStoreId');
// // cache.setValue(cacheId, entry);`;
    // JSDoc continuation lines carry a leading space so they line up with the
    // per-argument lines produced above.
    return `
/**
 * Inputs:
${formattedArgDesc}
 * \`ctx.io\` - Apify Actor class, see https://docs.apify.com/sdk/js/reference/class/Actor.
 * \`ctx.input\` - The input object that was passed to this Actor.
 * \`ctx.state\` - An object you can use to persist state across all your custom functions.
 * \`ctx.sendRequest\` - Fetch remote data. Uses 'got-scraping', same as Apify's \`sendRequest\`.
 *    See https://crawlee.dev/docs/guides/got-scraping
 * \`ctx.itemCacheKey\` - A function you can use to get cacheID for current \`entry\`.
 *    It takes the entry itself, and a list of properties to be used for hashing.
 *    By default, you should pass \`input.cachePrimaryKeys\` to it.
 *
 */
// async (${formattedArgs}{ io, input, state, sendRequest, itemCacheKey }) => {
${formattedMainCode}
${includeGuides ? guides : '//'}
// };`;
};
|
|
76
|
+
/**
 * Example hook bodies shown in the JavaScript-editor input fields.
 *
 * Each value is the body of a hook function. The names a body may use are the
 * hook's positional argument, where it has one (`request` for the request
 * hooks; the output examples refer to `entry`), plus the destructured context
 * (`io`, `input`, `state`, `sendRequest`, `itemCacheKey`).
 */
const CODE_EXAMPLES = {
    // Input
    inputExtendFromFunction: `// Example: Load Actor config from GitHub URL (public)
const config = await sendRequest.get('https://raw.githubusercontent.com/username/project/main/config.json').json();

// Increase concurrency during off-peak hours
// NOTE: Imagine we're targetting a small server, that can be slower during the day
const hours = new Date().getUTCHours();
const isOffPeak = hours < 6 || hours > 20;
config.maxConcurrency = isOffPeak ? 8 : 3;

return config;`,
    startUrlsFromFunction: `// Example: Create and load URLs from a Dataset by combining multiple fields
const dataset = await io.openDataset(datasetNameOrId);
const data = await dataset.getData();
const urls = data.items.map((item) => \`https://example.com/u/\${item.userId}/list/\${item.listId}\`);
return urls;`,
    // Output
    outputTransform: `// Example: Add extra custom fields like aggregates
return {
...entry,
imagesCount: entry.images.length,
};`,
    outputTransformBefore: `// Example: Fetch data or run code BEFORE entries are scraped.
state.categories = await sendRequest.get('https://example.com/my-categories').json();`,
    outputTransformAfter: `// Example: Fetch data or run code AFTER entries are scraped.
delete state.categories;`,
    // "At least 5" means 5 is included, hence `>=`.
    outputFilter: `// Example: Filter entries based on number of images they have (at least 5)
return entry.images.length >= 5;`,
    outputFilterBefore: `// Example: Fetch data or run code BEFORE entries are scraped.
state.categories = await sendRequest.get('https://example.com/my-categories').json();`,
    outputFilterAfter: `// Example: Fetch data or run code AFTER entries are scraped.
delete state.categories;`,
    // Requests
    // The transform hook receives `request`, so that is what the example returns.
    requestTransform: `// Example: Tag requests
// (maybe because we use RequestQueue that pools multiple scrapers)
request.userData.tag = "VARIANT_A";
return request;`,
    requestTransformBefore: `// Example: Fetch data or run code BEFORE requests are processed.
state.categories = await sendRequest.get('https://example.com/my-categories').json();`,
    requestTransformAfter: `// Example: Fetch data or run code AFTER requests are processed.
delete state.categories;`,
    requestFilter: `// Example: Filter requests based on their tag
// (maybe because we use RequestQueue that pools multiple scrapers)
return request.userData.tag === "VARIANT_A";`,
    requestFilterBefore: `// Example: Fetch data or run code BEFORE requests are processed.
state.categories = await sendRequest.get('https://example.com/my-categories').json();`,
    requestFilterAfter: `// Example: Fetch data or run code AFTER requests are processed.
delete state.categories;`,
};
|
|
126
|
+
/** Common input fields related to actor input */
|
|
127
|
+
exports.inputInput = {
|
|
128
|
+
inputExtendUrl: (0, apify_actor_config_1.createStringField)({
|
|
129
|
+
title: 'Extend Actor input from URL',
|
|
130
|
+
type: 'string',
|
|
131
|
+
editor: 'textfield',
|
|
132
|
+
description: `Extend Actor input with a config from a URL.${newLine(1)}
|
|
133
|
+
For example, you can store your actor input in a source control, and import it here.${newLine(1)}
|
|
134
|
+
In case of a conflict (if a field is defined both in Actor input and in imported input) the Actor input overwrites the imported fields.${newLine(1)}
|
|
135
|
+
The URL is requested with GET method, and must point to a JSON file containing a single object (the config).${newLine(1)}
|
|
136
|
+
If you need to send a POST request or to modify the response further, use \`inputExtendFromFunction\` instead.`,
|
|
137
|
+
example: 'https://raw.githubusercontent.com/jfairbank/programming-elm.com/master/cat-breeds.json',
|
|
138
|
+
nullable: true,
|
|
139
|
+
sectionCaption: 'Programmatic Input (Advanced)',
|
|
140
|
+
sectionDescription: "With these options you can configure other Actor options programmatically or from remote source.",
|
|
141
|
+
}),
|
|
142
|
+
inputExtendFromFunction: (0, apify_actor_config_1.createStringField)({
|
|
143
|
+
title: 'Extend Actor input from custom function',
|
|
144
|
+
type: 'string',
|
|
145
|
+
editor: 'javascript',
|
|
146
|
+
description: `Extend Actor input with a config defined by a custom function.${newLine(1)}
|
|
147
|
+
For example, you can store your actor input in a source control, and import it here.${newLine(1)}
|
|
148
|
+
In case of a conflict (if a field is defined both in Actor input and in imported input) the Actor input overwrites the imported fields.${newLine(1)}
|
|
149
|
+
The function must return an object (the config).`,
|
|
150
|
+
example: createHookFnExample({}, CODE_EXAMPLES.inputExtendFromFunction, false),
|
|
151
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.inputExtendFromFunction, true),
|
|
152
|
+
nullable: true,
|
|
153
|
+
}), // prettier-ignore
|
|
154
|
+
};
|
|
155
|
+
/** Common input fields related to crawler setup */
|
|
156
|
+
exports.crawlerInput = {
|
|
157
|
+
maxRequestRetries: (0, apify_actor_config_1.createIntegerField)({
|
|
158
|
+
title: 'maxRequestRetries',
|
|
159
|
+
type: 'integer',
|
|
160
|
+
description: 'Indicates how many times the request is retried if <a href="https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions#requestHandler">BasicCrawlerOptions.requestHandler</a> fails.',
|
|
161
|
+
example: 3,
|
|
162
|
+
prefill: 3,
|
|
163
|
+
minimum: 0,
|
|
164
|
+
nullable: true,
|
|
165
|
+
sectionCaption: 'Crawler configuration (Advanced)',
|
|
166
|
+
sectionDescription: "These options are applied directly to the Crawler. In majority of cases you don't need to change these. See https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions",
|
|
167
|
+
}),
|
|
168
|
+
maxRequestsPerMinute: (0, apify_actor_config_1.createIntegerField)({
|
|
169
|
+
title: 'maxRequestsPerMinute',
|
|
170
|
+
type: 'integer',
|
|
171
|
+
description: 'The maximum number of requests per minute the crawler should run. We can pass any positive, non-zero integer.',
|
|
172
|
+
example: 120,
|
|
173
|
+
prefill: 120,
|
|
174
|
+
minimum: 1,
|
|
175
|
+
nullable: true,
|
|
176
|
+
}),
|
|
177
|
+
maxRequestsPerCrawl: (0, apify_actor_config_1.createIntegerField)({
|
|
178
|
+
title: 'maxRequestsPerCrawl',
|
|
179
|
+
type: 'integer',
|
|
180
|
+
description: `Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
|
|
181
|
+
${newLine(1)} <strong>NOTE:</strong> In cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.`,
|
|
182
|
+
minimum: 1,
|
|
183
|
+
nullable: true,
|
|
184
|
+
}),
|
|
185
|
+
minConcurrency: (0, apify_actor_config_1.createIntegerField)({
|
|
186
|
+
title: 'minConcurrency',
|
|
187
|
+
type: 'integer',
|
|
188
|
+
description: `Sets the minimum concurrency (parallelism) for the crawl.${newLine(1)}
|
|
189
|
+
<strong>WARNING:</strong> If we set this value too high with respect to the available system memory and CPU, our crawler will run extremely slow or crash. If not sure, it's better to keep the default value and the concurrency will scale up automatically.`,
|
|
190
|
+
example: 1,
|
|
191
|
+
prefill: 1,
|
|
192
|
+
minimum: 1,
|
|
193
|
+
nullable: true,
|
|
194
|
+
}),
|
|
195
|
+
maxConcurrency: (0, apify_actor_config_1.createIntegerField)({
|
|
196
|
+
title: 'maxConcurrency',
|
|
197
|
+
type: 'integer',
|
|
198
|
+
description: 'Sets the maximum concurrency (parallelism) for the crawl.',
|
|
199
|
+
minimum: 1,
|
|
200
|
+
nullable: true,
|
|
201
|
+
}),
|
|
202
|
+
navigationTimeoutSecs: (0, apify_actor_config_1.createIntegerField)({
|
|
203
|
+
title: 'navigationTimeoutSecs',
|
|
204
|
+
type: 'integer',
|
|
205
|
+
description: 'Timeout in which the HTTP request to the resource needs to finish, given in seconds.',
|
|
206
|
+
minimum: 0,
|
|
207
|
+
nullable: true,
|
|
208
|
+
}),
|
|
209
|
+
requestHandlerTimeoutSecs: (0, apify_actor_config_1.createIntegerField)({
|
|
210
|
+
title: 'requestHandlerTimeoutSecs',
|
|
211
|
+
type: 'integer',
|
|
212
|
+
description: 'Timeout in which the function passed as <a href="https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions#requestHandler">BasicCrawlerOptions.requestHandler</a> needs to finish, in seconds.',
|
|
213
|
+
example: 180,
|
|
214
|
+
prefill: 180,
|
|
215
|
+
minimum: 0,
|
|
216
|
+
nullable: true,
|
|
217
|
+
}),
|
|
218
|
+
keepAlive: (0, apify_actor_config_1.createBooleanField)({
|
|
219
|
+
title: 'keepAlive',
|
|
220
|
+
type: 'boolean',
|
|
221
|
+
description: 'Allows to keep the crawler alive even if the RequestQueue gets empty. With keepAlive: true the crawler will keep running, waiting for more requests to come.',
|
|
222
|
+
nullable: true,
|
|
223
|
+
}),
|
|
224
|
+
ignoreSslErrors: (0, apify_actor_config_1.createBooleanField)({
|
|
225
|
+
title: 'ignoreSslErrors',
|
|
226
|
+
type: 'boolean',
|
|
227
|
+
description: 'If set to true, SSL certificate errors will be ignored.',
|
|
228
|
+
nullable: true,
|
|
229
|
+
}),
|
|
230
|
+
additionalMimeTypes: (0, apify_actor_config_1.createArrayField)({
|
|
231
|
+
title: 'additionalMimeTypes',
|
|
232
|
+
type: 'array',
|
|
233
|
+
description: 'An array of MIME types you want the crawler to load and process. By default, only text/html and application/xhtml+xml MIME types are supported.',
|
|
234
|
+
editor: 'stringList',
|
|
235
|
+
uniqueItems: true,
|
|
236
|
+
nullable: true,
|
|
237
|
+
}),
|
|
238
|
+
suggestResponseEncoding: (0, apify_actor_config_1.createStringField)({
|
|
239
|
+
title: 'suggestResponseEncoding',
|
|
240
|
+
type: 'string',
|
|
241
|
+
description: 'By default this crawler will extract correct encoding from the HTTP response headers. There are some websites which use invalid headers. Those are encoded using the UTF-8 encoding. If those sites actually use a different encoding, the response will be corrupted. You can use suggestResponseEncoding to fall back to a certain encoding, if you know that your target website uses it. To force a certain encoding, disregarding the response headers, use forceResponseEncoding.',
|
|
242
|
+
editor: 'textfield',
|
|
243
|
+
nullable: true,
|
|
244
|
+
}),
|
|
245
|
+
forceResponseEncoding: (0, apify_actor_config_1.createStringField)({
|
|
246
|
+
title: 'forceResponseEncoding',
|
|
247
|
+
type: 'string',
|
|
248
|
+
description: 'By default this crawler will extract correct encoding from the HTTP response headers. Use forceResponseEncoding to force a certain encoding, disregarding the response headers. To only provide a default for missing encodings, use suggestResponseEncoding.',
|
|
249
|
+
editor: 'textfield',
|
|
250
|
+
nullable: true,
|
|
251
|
+
}),
|
|
252
|
+
};
|
|
253
|
+
/** Common input fields related to performance which are not part of the CrawlerConfig */
|
|
254
|
+
exports.perfInput = {
|
|
255
|
+
perfBatchSize: (0, apify_actor_config_1.createIntegerField)({
|
|
256
|
+
title: 'Batch requests',
|
|
257
|
+
type: 'integer',
|
|
258
|
+
description: `If set, multiple Requests will be handled by a single Actor instance.${newLine(1)}
|
|
259
|
+
Example: If set to 20, then up to 20 requests will be handled in a single "go", after which the actor instance will reset.${newLine(1)}
|
|
260
|
+
<a href="https://docs.apify.com/platform/actors/development/performance#batch-jobs-win-over-the-single-jobs">See Apify documentation</a>.`,
|
|
261
|
+
example: 20,
|
|
262
|
+
minimum: 0,
|
|
263
|
+
nullable: true,
|
|
264
|
+
sectionCaption: 'Performance configuration (Advanced)',
|
|
265
|
+
sectionDescription: 'Standalone performance options. These are not passed to the Crawler.',
|
|
266
|
+
}),
|
|
267
|
+
perfBatchWaitSecs: (0, apify_actor_config_1.createIntegerField)({
|
|
268
|
+
title: 'Wait (in seconds) between processing requests in a single batch',
|
|
269
|
+
type: 'integer',
|
|
270
|
+
description: `How long to wait between entries within a single batch.${newLine(1)}
|
|
271
|
+
Increase this value if you're using batching and you're sending requests to the scraped website too fast.${newLine(1)}
|
|
272
|
+
Example: If set to 1, then after each entry in a batch, wait 1 second before continuing.`,
|
|
273
|
+
example: 1,
|
|
274
|
+
minimum: 0,
|
|
275
|
+
nullable: true,
|
|
276
|
+
}), // prettier-ignore
|
|
277
|
+
};
|
|
278
|
+
/** Common input fields for defining URLs to scrape */
|
|
279
|
+
exports.startUrlsInput = {
|
|
280
|
+
startUrls: (0, apify_actor_config_1.createArrayField)({
|
|
281
|
+
title: 'Start URLs',
|
|
282
|
+
type: 'array',
|
|
283
|
+
description: `List of URLs to scrape.`,
|
|
284
|
+
editor: 'requestListSources',
|
|
285
|
+
sectionCaption: 'Starting URLs',
|
|
286
|
+
}),
|
|
287
|
+
startUrlsFromDataset: (0, apify_actor_config_1.createStringField)({
|
|
288
|
+
title: 'Start URLs from Dataset',
|
|
289
|
+
type: 'string',
|
|
290
|
+
editor: 'textfield',
|
|
291
|
+
description: `Import URLs to scrape from an existing Dataset.${newLine(1)}
|
|
292
|
+
The dataset and the field to import from should be written as \`{datasetID}#{field}\`.${newLine(1)}
|
|
293
|
+
Example: \`datasetid123#url\` will take URLs from dataset \`datasetid123\` from field \`url\`.`,
|
|
294
|
+
pattern: datasetIdWithFieldPattern,
|
|
295
|
+
example: 'datasetid123#url',
|
|
296
|
+
nullable: true,
|
|
297
|
+
}),
|
|
298
|
+
startUrlsFromFunction: (0, apify_actor_config_1.createStringField)({
|
|
299
|
+
title: 'Start URLs from custom function',
|
|
300
|
+
type: 'string',
|
|
301
|
+
description: `Import or generate URLs to scrape using a custom function.${newLine(1)}`,
|
|
302
|
+
editor: 'javascript',
|
|
303
|
+
example: createHookFnExample({}, CODE_EXAMPLES.startUrlsFromFunction, false),
|
|
304
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.startUrlsFromFunction, true),
|
|
305
|
+
nullable: true,
|
|
306
|
+
}), // prettier-ignore
|
|
307
|
+
};
|
|
308
|
+
/** Common input fields related to logging setup */
|
|
309
|
+
exports.loggingInput = {
|
|
310
|
+
logLevel: (0, apify_actor_config_1.createStringField)({
|
|
311
|
+
title: 'Log Level',
|
|
312
|
+
type: 'string',
|
|
313
|
+
editor: 'select',
|
|
314
|
+
description: 'Select how detailed should be the logging.',
|
|
315
|
+
enum: ['off', 'debug', 'info', 'warn', 'error'],
|
|
316
|
+
enumTitles: [
|
|
317
|
+
'No logging (off)',
|
|
318
|
+
'Debug and higher priority',
|
|
319
|
+
'Info and higher priority',
|
|
320
|
+
'Warning and higher priority',
|
|
321
|
+
'Error and higher priority',
|
|
322
|
+
],
|
|
323
|
+
example: 'info',
|
|
324
|
+
prefill: 'info',
|
|
325
|
+
default: 'info',
|
|
326
|
+
nullable: true,
|
|
327
|
+
sectionCaption: 'Logging & Error handling (Advanced)',
|
|
328
|
+
sectionDescription: 'Configure how to handle errors or what should be displayed in the log console.',
|
|
329
|
+
}),
|
|
330
|
+
errorReportingDatasetId: (0, apify_actor_config_1.createStringField)({
|
|
331
|
+
title: 'Error reporting dataset ID',
|
|
332
|
+
type: 'string',
|
|
333
|
+
editor: 'textfield',
|
|
334
|
+
description: `Dataset ID to which errors should be captured.${newLine(1)}
|
|
335
|
+
Default: \`'REPORTING'\`.${newLine(1)}
|
|
336
|
+
<strong>NOTE:</strong> Dataset name can only contain letters 'a' through 'z', the digits '0' through '9', and the hyphen ('-') but only in the middle of the string (e.g. 'my-value-1')`,
|
|
337
|
+
example: 'REPORTING',
|
|
338
|
+
prefill: 'REPORTING',
|
|
339
|
+
default: 'REPORTING',
|
|
340
|
+
pattern: datasetIdPattern,
|
|
341
|
+
nullable: true,
|
|
342
|
+
}),
|
|
343
|
+
errorSendToSentry: (0, apify_actor_config_1.createBooleanField)({
|
|
344
|
+
title: 'Send errors to Sentry',
|
|
345
|
+
type: 'boolean',
|
|
346
|
+
editor: 'checkbox',
|
|
347
|
+
description: `Whether to send actor error reports to <a href="https://sentry.io/">Sentry</a>.${newLine(1)}
|
|
348
|
+
This info is used by the author of this actor to identify broken integrations,
|
|
349
|
+
and track down and fix issues.`,
|
|
350
|
+
example: true,
|
|
351
|
+
default: true,
|
|
352
|
+
nullable: true,
|
|
353
|
+
}), // prettier-ignore
|
|
354
|
+
};
|
|
355
|
+
/** Common input fields related to proxy setup */
|
|
356
|
+
exports.proxyInput = {
|
|
357
|
+
proxy: (0, apify_actor_config_1.createObjectField)({
|
|
358
|
+
title: 'Proxy configuration',
|
|
359
|
+
type: 'object',
|
|
360
|
+
description: 'Select proxies to be used by your crawler.',
|
|
361
|
+
editor: 'proxy',
|
|
362
|
+
sectionCaption: 'Proxy',
|
|
363
|
+
sectionDescription: 'Configure the proxy',
|
|
364
|
+
}),
|
|
365
|
+
};
|
|
366
|
+
/** Common input fields related to proxy setup */
|
|
367
|
+
exports.privacyInput = {
|
|
368
|
+
includePersonalData: (0, apify_actor_config_1.createBooleanField)({
|
|
369
|
+
title: 'Include personal data',
|
|
370
|
+
type: 'boolean',
|
|
371
|
+
description: `By default, fields that are potential personal data are censored. Toggle this option on to get the un-uncensored values.${newLine(1)}
|
|
372
|
+
<strong>WARNING:</strong> Turn this on ONLY if you have consent, legal basis for using the data, or at your own risk. <a href="https://gdpr.eu/eu-gdpr-personal-data/">Learn more</a>`,
|
|
373
|
+
default: false,
|
|
374
|
+
example: false,
|
|
375
|
+
nullable: true,
|
|
376
|
+
sectionCaption: 'Privacy & Data governance (GDPR)',
|
|
377
|
+
}), // prettier-ignore
|
|
378
|
+
};
|
|
379
|
+
/** Common input fields related to actor request */
|
|
380
|
+
exports.requestInput = {
|
|
381
|
+
requestMaxEntries: (0, apify_actor_config_1.createIntegerField)({
|
|
382
|
+
title: 'Limit the number of requests',
|
|
383
|
+
type: 'integer',
|
|
384
|
+
description: `If set, only at most this many requests will be processed.${newLine(1)}
|
|
385
|
+
The count is determined from the RequestQueue that's used for the Actor run.${newLine(1)}
|
|
386
|
+
This means that if \`requestMaxEntries\` is set to 50, but the associated queue already handled 40 requests, then only 10 new requests will be handled.`,
|
|
387
|
+
example: 50,
|
|
388
|
+
prefill: 50,
|
|
389
|
+
minimum: 0,
|
|
390
|
+
nullable: true,
|
|
391
|
+
sectionCaption: 'Requests limit, transformation & filtering (Advanced)',
|
|
392
|
+
}),
|
|
393
|
+
requestTransform: (0, apify_actor_config_1.createStringField)({
|
|
394
|
+
title: 'Transform requests',
|
|
395
|
+
type: 'string',
|
|
396
|
+
description: `Freely transform the request object using a custom function.${newLine(1)}
|
|
397
|
+
If not set, the request will remain as is.`,
|
|
398
|
+
editor: 'javascript',
|
|
399
|
+
example: createHookFnExample({ request: 'Request holding URL to be scraped' }, CODE_EXAMPLES.requestTransform, false),
|
|
400
|
+
prefill: createHookFnExample({ request: 'Request holding URL to be scraped' }, CODE_EXAMPLES.requestTransform, true),
|
|
401
|
+
nullable: true,
|
|
402
|
+
}),
|
|
403
|
+
requestTransformBefore: (0, apify_actor_config_1.createStringField)({
|
|
404
|
+
title: 'Transform requests - Setup',
|
|
405
|
+
type: 'string',
|
|
406
|
+
description: `Use this if you need to run one-time initialization code before \`requestTransform\`.`,
|
|
407
|
+
editor: 'javascript',
|
|
408
|
+
example: createHookFnExample({}, CODE_EXAMPLES.requestTransformBefore, false),
|
|
409
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.requestTransformBefore, true),
|
|
410
|
+
nullable: true,
|
|
411
|
+
}),
|
|
412
|
+
requestTransformAfter: (0, apify_actor_config_1.createStringField)({
|
|
413
|
+
title: 'Transform requests - Teardown',
|
|
414
|
+
type: 'string',
|
|
415
|
+
description: `Use this if you need to run one-time teardown code after \`requestTransform\`.`,
|
|
416
|
+
editor: 'javascript',
|
|
417
|
+
example: createHookFnExample({}, CODE_EXAMPLES.requestTransformAfter, false),
|
|
418
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.requestTransformAfter, true),
|
|
419
|
+
nullable: true,
|
|
420
|
+
}),
|
|
421
|
+
requestFilter: (0, apify_actor_config_1.createStringField)({
|
|
422
|
+
title: 'Filter requests',
|
|
423
|
+
type: 'string',
|
|
424
|
+
description: `Decide which requests should be processed by using a custom function.${newLine(1)}
|
|
425
|
+
If not set, all requests will be included.${newLine(1)}
|
|
426
|
+
This is done after \`requestTransform\`.${newLine(1)}`,
|
|
427
|
+
editor: 'javascript',
|
|
428
|
+
example: createHookFnExample({ request: 'Request holding URL to be scraped' }, CODE_EXAMPLES.requestFilter, false),
|
|
429
|
+
prefill: createHookFnExample({ request: 'Request holding URL to be scraped' }, CODE_EXAMPLES.requestFilter, true),
|
|
430
|
+
nullable: true,
|
|
431
|
+
}),
|
|
432
|
+
requestFilterBefore: (0, apify_actor_config_1.createStringField)({
|
|
433
|
+
title: 'Filter requests - Setup',
|
|
434
|
+
type: 'string',
|
|
435
|
+
description: `Use this if you need to run one-time initialization code before \`requestFilter\`.${newLine(1)}`,
|
|
436
|
+
editor: 'javascript',
|
|
437
|
+
example: createHookFnExample({}, CODE_EXAMPLES.requestFilterBefore, false),
|
|
438
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.requestFilterBefore, true),
|
|
439
|
+
nullable: true,
|
|
440
|
+
}),
|
|
441
|
+
requestFilterAfter: (0, apify_actor_config_1.createStringField)({
|
|
442
|
+
title: 'Filter requests - Teardown',
|
|
443
|
+
type: 'string',
|
|
444
|
+
description: `Use this if you need to run one-time teardown code after \`requestFilter\`.${newLine(1)}`,
|
|
445
|
+
editor: 'javascript',
|
|
446
|
+
example: createHookFnExample({}, CODE_EXAMPLES.requestFilterAfter, false),
|
|
447
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.requestFilterAfter, true),
|
|
448
|
+
nullable: true,
|
|
449
|
+
}),
|
|
450
|
+
requestQueueId: (0, apify_actor_config_1.createStringField)({
|
|
451
|
+
title: 'RequestQueue ID',
|
|
452
|
+
type: 'string',
|
|
453
|
+
description: `By default, requests are stored in the default request queue.
|
|
454
|
+
Set this option if you want to use a non-default queue.
|
|
455
|
+
<a href="https://docs.apify.com/sdk/python/docs/concepts/storages#opening-named-and-unnamed-storages">Learn more</a>${newLine(1)}
|
|
456
|
+
<strong>NOTE:</strong> RequestQueue name can only contain letters 'a' through 'z', the digits '0' through '9', and the hyphen ('-') but only in the middle of the string (e.g. 'my-value-1')`,
|
|
457
|
+
editor: 'textfield',
|
|
458
|
+
example: 'mIJVZsRQrDQf4rUAf',
|
|
459
|
+
pattern: datasetIdPattern,
|
|
460
|
+
nullable: true,
|
|
461
|
+
}), // prettier-ignore
|
|
462
|
+
};
|
|
463
|
+
/** Common input fields related to actor output */
|
|
464
|
+
exports.outputInput = {
|
|
465
|
+
outputMaxEntries: (0, apify_actor_config_1.createIntegerField)({
|
|
466
|
+
title: 'Limit the number of scraped entries',
|
|
467
|
+
type: 'integer',
|
|
468
|
+
description: `If set, only at most this many entries will be scraped.${newLine(1)}
|
|
469
|
+
The count is determined from the Dataset that's used for the Actor run.${newLine(1)}
|
|
470
|
+
This means that if \`outputMaxEntries\` is set to 50, but the associated Dataset already has 40 items in it, then only 10 new entries will be saved.`,
|
|
471
|
+
example: 50,
|
|
472
|
+
prefill: 50,
|
|
473
|
+
minimum: 0,
|
|
474
|
+
nullable: true,
|
|
475
|
+
sectionCaption: 'Output size, transformation & filtering (T in ETL) (Advanced)',
|
|
476
|
+
}),
|
|
477
|
+
outputRenameFields: (0, apify_actor_config_1.createObjectField)({
|
|
478
|
+
title: 'Rename dataset fields',
|
|
479
|
+
type: 'object',
|
|
480
|
+
description: `Rename fields (columns) of the output data.${newLine(1)}
|
|
481
|
+
If not set, all fields will have their original names.${newLine(1)}
|
|
482
|
+
This is done before \`outputPickFields\`.${newLine(1)}
|
|
483
|
+
Keys can be nested, e.g. \`"someProp.value[0]"\`.
|
|
484
|
+
Nested path is resolved using <a href="https://lodash.com/docs/4.17.15#get">Lodash.get()</a>.`,
|
|
485
|
+
editor: 'json',
|
|
486
|
+
example: { oldFieldName: 'newFieldName' },
|
|
487
|
+
nullable: true,
|
|
488
|
+
}),
|
|
489
|
+
outputPickFields: (0, apify_actor_config_1.createArrayField)({
|
|
490
|
+
title: 'Pick dataset fields',
|
|
491
|
+
type: 'array',
|
|
492
|
+
description: `Select a subset of fields of an entry that will be pushed to the dataset.${newLine(1)}
|
|
493
|
+
If not set, all fields on an entry will be pushed to the dataset.${newLine(1)}
|
|
494
|
+
This is done after \`outputRenameFields\`.${newLine(1)}
|
|
495
|
+
Keys can be nested, e.g. \`"someProp.value[0]"\`.
|
|
496
|
+
Nested path is resolved using <a href="https://lodash.com/docs/4.17.15#get">Lodash.get()</a>.`,
|
|
497
|
+
editor: 'stringList',
|
|
498
|
+
example: ['fieldName', 'another.nested[0].field'],
|
|
499
|
+
nullable: true,
|
|
500
|
+
}),
|
|
501
|
+
outputTransform: (0, apify_actor_config_1.createStringField)({
|
|
502
|
+
title: 'Transform entries',
|
|
503
|
+
type: 'string',
|
|
504
|
+
description: `Freely transform the output data object using a custom function.${newLine(1)}
|
|
505
|
+
If not set, the data will remain as is.${newLine(1)}
|
|
506
|
+
This is done after \`outputPickFields\` and \`outputRenameFields\`.${newLine(1)}`,
|
|
507
|
+
editor: 'javascript',
|
|
508
|
+
example: createHookFnExample({ entry: 'Scraped entry' }, CODE_EXAMPLES.outputTransform, false),
|
|
509
|
+
prefill: createHookFnExample({ entry: 'Scraped entry' }, CODE_EXAMPLES.outputTransform, true),
|
|
510
|
+
nullable: true,
|
|
511
|
+
}),
|
|
512
|
+
outputTransformBefore: (0, apify_actor_config_1.createStringField)({
|
|
513
|
+
title: 'Transform entries - Setup',
|
|
514
|
+
type: 'string',
|
|
515
|
+
description: `Use this if you need to run one-time initialization code before \`outputTransform\`.${newLine(1)}`,
|
|
516
|
+
editor: 'javascript',
|
|
517
|
+
example: createHookFnExample({}, CODE_EXAMPLES.outputTransformBefore, false),
|
|
518
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.outputTransformBefore, true),
|
|
519
|
+
nullable: true,
|
|
520
|
+
}),
|
|
521
|
+
outputTransformAfter: (0, apify_actor_config_1.createStringField)({
|
|
522
|
+
title: 'Transform entries - Teardown',
|
|
523
|
+
type: 'string',
|
|
524
|
+
description: `Use this if you need to run one-time teardown code after \`outputTransform\`.${newLine(1)}`,
|
|
525
|
+
editor: 'javascript',
|
|
526
|
+
example: createHookFnExample({}, CODE_EXAMPLES.outputTransformAfter, false),
|
|
527
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.outputTransformAfter, true),
|
|
528
|
+
nullable: true,
|
|
529
|
+
}),
|
|
530
|
+
outputFilter: (0, apify_actor_config_1.createStringField)({
|
|
531
|
+
title: 'Filter entries',
|
|
532
|
+
type: 'string',
|
|
533
|
+
description: `Decide which scraped entries should be included in the output by using a custom function.${newLine(1)}
|
|
534
|
+
If not set, all scraped entries will be included.${newLine(1)}
|
|
535
|
+
This is done after \`outputPickFields\`, \`outputRenameFields\`, and \`outputTransform\`.${newLine(1)}`,
|
|
536
|
+
editor: 'javascript',
|
|
537
|
+
example: createHookFnExample({ entry: 'Scraped entry' }, CODE_EXAMPLES.outputFilter, false),
|
|
538
|
+
prefill: createHookFnExample({ entry: 'Scraped entry' }, CODE_EXAMPLES.outputFilter, true),
|
|
539
|
+
nullable: true,
|
|
540
|
+
}),
|
|
541
|
+
outputFilterBefore: (0, apify_actor_config_1.createStringField)({
|
|
542
|
+
title: 'Filter entries - Setup',
|
|
543
|
+
type: 'string',
|
|
544
|
+
description: `Use this if you need to run one-time initialization code before \`outputFilter\`.${newLine(1)}`,
|
|
545
|
+
editor: 'javascript',
|
|
546
|
+
example: createHookFnExample({}, CODE_EXAMPLES.outputFilterBefore, false),
|
|
547
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.outputFilterBefore, true),
|
|
548
|
+
nullable: true,
|
|
549
|
+
}),
|
|
550
|
+
outputFilterAfter: (0, apify_actor_config_1.createStringField)({
|
|
551
|
+
title: 'Filter entries - Teardown',
|
|
552
|
+
type: 'string',
|
|
553
|
+
description: `Use this if you need to run one-time teardown code after \`outputFilter\`.${newLine(1)}`,
|
|
554
|
+
editor: 'javascript',
|
|
555
|
+
example: createHookFnExample({}, CODE_EXAMPLES.outputFilterAfter, false),
|
|
556
|
+
prefill: createHookFnExample({}, CODE_EXAMPLES.outputFilterAfter, true),
|
|
557
|
+
nullable: true,
|
|
558
|
+
}),
|
|
559
|
+
outputDatasetId: (0, apify_actor_config_1.createStringField)({
|
|
560
|
+
title: 'Dataset ID',
|
|
561
|
+
type: 'string',
|
|
562
|
+
description: `By default, data is written to Default dataset.
|
|
563
|
+
Set this option if you want to write data to non-default dataset.
|
|
564
|
+
<a href="https://docs.apify.com/sdk/python/docs/concepts/storages#opening-named-and-unnamed-storages">Learn more</a>${newLine(1)}
|
|
565
|
+
<strong>NOTE:</strong> Dataset name can only contain letters 'a' through 'z', the digits '0' through '9', and the hyphen ('-') but only in the middle of the string (e.g. 'my-value-1')`,
|
|
566
|
+
editor: 'textfield',
|
|
567
|
+
example: 'mIJVZsRQrDQf4rUAf',
|
|
568
|
+
pattern: datasetIdPattern,
|
|
569
|
+
nullable: true,
|
|
570
|
+
sectionCaption: 'Output Dataset & Caching (L in ETL) (Advanced)',
|
|
571
|
+
}),
|
|
572
|
+
outputCacheStoreId: (0, apify_actor_config_1.createStringField)({
|
|
573
|
+
title: 'Cache ID',
|
|
574
|
+
type: 'string',
|
|
575
|
+
description: `Set this option if you want to cache scraped entries in <a href="https://docs.apify.com/sdk/js/docs/guides/result-storage#key-value-store">Apify's Key-value store</a>.${newLine(1)}
|
|
576
|
+
This is useful for example when you want to scrape only NEW entries. In such case, you can use the \`outputFilter\` option to define a custom function to filter out entries already found in the cache.
|
|
577
|
+
<a href="https://docs.apify.com/sdk/python/docs/concepts/storages#working-with-key-value-stores">Learn more</a>${newLine(1)}
|
|
578
|
+
<strong>NOTE:</strong> Cache name can only contain letters 'a' through 'z', the digits '0' through '9', and the hyphen ('-') but only in the middle of the string (e.g. 'my-value-1')`,
|
|
579
|
+
editor: 'textfield',
|
|
580
|
+
example: 'mIJVZsRQrDQf4rUAf',
|
|
581
|
+
pattern: datasetIdPattern,
|
|
582
|
+
nullable: true,
|
|
583
|
+
}),
|
|
584
|
+
outputCachePrimaryKeys: (0, apify_actor_config_1.createArrayField)({
|
|
585
|
+
title: 'Cache primary keys',
|
|
586
|
+
type: 'array',
|
|
587
|
+
description: `Specify fields that uniquely identify entries (primary keys), so entries can be compared against the cache.${newLine(1)}
|
|
588
|
+
<strong>NOTE:</strong> If not set, the entries are hashed based on all fields`,
|
|
589
|
+
editor: 'stringList',
|
|
590
|
+
example: ['name', 'city'],
|
|
591
|
+
nullable: true,
|
|
592
|
+
}),
|
|
593
|
+
outputCacheActionOnResult: (0, apify_actor_config_1.createStringField)({
|
|
594
|
+
title: 'Cache action on result',
|
|
595
|
+
type: 'string',
|
|
596
|
+
description: `Specify whether scraped results should be added to, removed from, or overwrite the cache.${newLine(1)}
|
|
597
|
+
- <strong>add<strong> - Adds scraped results to the cache${newLine(1)}
|
|
598
|
+
- <strong>remove<strong> - Removes scraped results from the cache${newLine(1)}
|
|
599
|
+
- <strong>set<strong> - First clears all entries from the cache, then adds scraped results to the cache${newLine(1)}
|
|
600
|
+
<strong>NOTE:</strong> No action happens when this field is empty.`,
|
|
601
|
+
editor: 'select',
|
|
602
|
+
enum: ['add', 'remove', 'overwrite'],
|
|
603
|
+
example: 'add',
|
|
604
|
+
nullable: true,
|
|
605
|
+
}), // prettier-ignore
|
|
606
|
+
};
|
|
607
|
+
/** Common input fields related to actor metamorphing */
|
|
608
|
+
exports.metamorphInput = {
|
|
609
|
+
metamorphActorId: (0, apify_actor_config_1.createStringField)({
|
|
610
|
+
title: 'Metamorph actor ID - metamorph to another actor at the end',
|
|
611
|
+
type: 'string',
|
|
612
|
+
description: `Use this option if you want to run another actor with the same dataset after this actor has finished (AKA metamorph into another actor). <a href="https://docs.apify.com/sdk/python/docs/concepts/interacting-with-other-actors#actormetamorph">Learn more</a> ${newLine(1)}
|
|
613
|
+
New actor is identified by its ID, e.g. "apify/web-scraper".`,
|
|
614
|
+
editor: 'textfield',
|
|
615
|
+
example: 'apify/web-scraper',
|
|
616
|
+
nullable: true,
|
|
617
|
+
sectionCaption: 'Integrations (Metamorphing) (Advanced)',
|
|
618
|
+
}),
|
|
619
|
+
metamorphActorBuild: (0, apify_actor_config_1.createStringField)({
|
|
620
|
+
title: 'Metamorph actor build',
|
|
621
|
+
type: 'string',
|
|
622
|
+
description: `Tag or number of the target actor build to metamorph into (e.g. 'beta' or '1.2.345')`,
|
|
623
|
+
editor: 'textfield',
|
|
624
|
+
example: '1.2.345',
|
|
625
|
+
nullable: true,
|
|
626
|
+
}),
|
|
627
|
+
metamorphActorInput: (0, apify_actor_config_1.createObjectField)({
|
|
628
|
+
title: 'Metamorph actor input',
|
|
629
|
+
type: 'object',
|
|
630
|
+
description: `Input object passed to the follow-up (metamorph) actor. <a href="https://docs.apify.com/sdk/python/docs/concepts/interacting-with-other-actors#actormetamorph">Learn more</a>`,
|
|
631
|
+
editor: 'json',
|
|
632
|
+
example: { uploadDatasetToGDrive: true },
|
|
633
|
+
nullable: true,
|
|
634
|
+
}),
|
|
635
|
+
};
|
|
636
|
+
exports.allActorInputs = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({}, exports.inputInput), exports.startUrlsInput), exports.proxyInput), exports.privacyInput), exports.requestInput), exports.outputInput), exports.crawlerInput), exports.perfInput), exports.loggingInput), exports.metamorphInput);
|
|
637
|
+
exports.crawlerInputValidationFields = {
|
|
638
|
+
navigationTimeoutSecs: joi_1.default.number().integer().min(0).optional(),
|
|
639
|
+
ignoreSslErrors: joi_1.default.boolean().optional(),
|
|
640
|
+
additionalMimeTypes: joi_1.default.array().items(joi_1.default.string().min(1)).optional(),
|
|
641
|
+
suggestResponseEncoding: joi_1.default.string().min(1).optional(),
|
|
642
|
+
forceResponseEncoding: joi_1.default.string().min(1).optional(),
|
|
643
|
+
requestHandlerTimeoutSecs: joi_1.default.number().integer().min(0).optional(),
|
|
644
|
+
maxRequestRetries: joi_1.default.number().integer().min(0).optional(),
|
|
645
|
+
maxRequestsPerCrawl: joi_1.default.number().integer().min(0).optional(),
|
|
646
|
+
maxRequestsPerMinute: joi_1.default.number().integer().min(0).optional(),
|
|
647
|
+
minConcurrency: joi_1.default.number().integer().min(0).optional(),
|
|
648
|
+
maxConcurrency: joi_1.default.number().integer().min(0).optional(),
|
|
649
|
+
keepAlive: joi_1.default.boolean().optional(),
|
|
650
|
+
};
|
|
651
|
+
exports.perfInputValidationFields = {
|
|
652
|
+
perfBatchSize: joi_1.default.number().integer().min(0).optional(),
|
|
653
|
+
perfBatchWaitSecs: joi_1.default.number().integer().min(0).optional(),
|
|
654
|
+
};
|
|
655
|
+
exports.inputInputValidationFields = {
|
|
656
|
+
inputExtendUrl: joi_1.default.string().min(1).uri().optional(),
|
|
657
|
+
inputExtendFromFunction: joi_1.default.string().min(1).optional(),
|
|
658
|
+
};
|
|
659
|
+
exports.startUrlsInputValidationFields = {
|
|
660
|
+
startUrls: joi_1.default.array().items(joi_1.default.string().min(1), joi_1.default.object()).optional(),
|
|
661
|
+
startUrlsFromDataset: joi_1.default.string().min(1).pattern(new RegExp(datasetIdWithFieldPattern)).optional(),
|
|
662
|
+
startUrlsFromFunction: joi_1.default.string().min(1).optional(),
|
|
663
|
+
};
|
|
664
|
+
exports.loggingInputValidationFields = {
|
|
665
|
+
logLevel: joi_1.default.string().valid(...log_1.LOG_LEVEL).optional(),
|
|
666
|
+
errorReportingDatasetId: joi_1.default.string().min(1).pattern(new RegExp(datasetIdPattern)).optional(),
|
|
667
|
+
errorSendToSentry: joi_1.default.boolean().optional(),
|
|
668
|
+
};
|
|
669
|
+
exports.proxyInputValidationFields = {
|
|
670
|
+
proxy: joi_1.default.object().optional(), // NOTE: Expand this type?
|
|
671
|
+
};
|
|
672
|
+
exports.privacyInputValidationFields = {
|
|
673
|
+
includePersonalData: joi_1.default.boolean().optional(),
|
|
674
|
+
};
|
|
675
|
+
exports.requestInputValidationFields = {
|
|
676
|
+
requestMaxEntries: joi_1.default.number().integer().min(0).optional(),
|
|
677
|
+
requestTransform: joi_1.default.string().min(1).optional(),
|
|
678
|
+
requestTransformBefore: joi_1.default.string().min(1).optional(),
|
|
679
|
+
requestTransformAfter: joi_1.default.string().min(1).optional(),
|
|
680
|
+
requestFilter: joi_1.default.string().min(1).optional(),
|
|
681
|
+
requestFilterBefore: joi_1.default.string().min(1).optional(),
|
|
682
|
+
requestFilterAfter: joi_1.default.string().min(1).optional(),
|
|
683
|
+
requestQueueId: joi_1.default.string().min(1).pattern(new RegExp(datasetIdPattern)).optional(), // prettier-ignore
|
|
684
|
+
};
|
|
685
|
+
exports.outputInputValidationFields = {
|
|
686
|
+
outputMaxEntries: joi_1.default.number().integer().min(0).optional(),
|
|
687
|
+
outputPickFields: joi_1.default.array().items(joi_1.default.string().min(1)).optional(),
|
|
688
|
+
// https://stackoverflow.com/a/49898360/9788634
|
|
689
|
+
outputRenameFields: joi_1.default.object().pattern(/./, joi_1.default.string().min(1)).optional(),
|
|
690
|
+
outputTransform: joi_1.default.string().min(1).optional(),
|
|
691
|
+
outputTransformBefore: joi_1.default.string().min(1).optional(),
|
|
692
|
+
outputTransformAfter: joi_1.default.string().min(1).optional(),
|
|
693
|
+
outputFilter: joi_1.default.string().min(1).optional(),
|
|
694
|
+
outputFilterBefore: joi_1.default.string().min(1).optional(),
|
|
695
|
+
outputFilterAfter: joi_1.default.string().min(1).optional(),
|
|
696
|
+
outputCacheStoreId: joi_1.default.string().min(1).pattern(new RegExp(datasetIdPattern)).optional(),
|
|
697
|
+
outputCachePrimaryKeys: joi_1.default.array().items(joi_1.default.string().min(1)).optional(),
|
|
698
|
+
outputCacheActionOnResult: joi_1.default.string().min(1).allow('add', 'remove', 'overwrite').optional(),
|
|
699
|
+
outputDatasetId: joi_1.default.string().min(1).pattern(new RegExp(datasetIdPattern)).optional(), // prettier-ignore
|
|
700
|
+
};
|
|
701
|
+
exports.metamorphInputValidationFields = {
|
|
702
|
+
metamorphActorId: joi_1.default.string().min(1).optional(),
|
|
703
|
+
metamorphActorBuild: joi_1.default.string().min(1).optional(),
|
|
704
|
+
metamorphActorInput: joi_1.default.object().unknown(true).optional(),
|
|
705
|
+
};
|
|
706
|
+
exports.allActorInputValidationFields = Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign(Object.assign({}, exports.inputInputValidationFields), exports.startUrlsInputValidationFields), exports.proxyInputValidationFields), exports.privacyInputValidationFields), exports.requestInputValidationFields), exports.outputInputValidationFields), exports.crawlerInputValidationFields), exports.perfInputValidationFields), exports.loggingInputValidationFields), exports.metamorphInputValidationFields);
|
|
707
|
+
//# sourceMappingURL=config.js.map
|