crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
@@ -0,0 +1,260 @@
1
+ import { ArrVal } from '../../utils/types';
2
+ import type { CrawleeOneScraperActorSpec } from '../actorSpec';
3
+ export declare const README_FEATURE: readonly ["datasets", "modes", "filters", "noBrowser", "proxy", "crawlerConfig", "tests", "privacy", "integratedETL", "integratedCache", "errorMonitoring", "metamorph"];
4
+ export declare const README_FEATURE_ENUM: {
5
+ proxy: "proxy";
6
+ datasets: "datasets";
7
+ metamorph: "metamorph";
8
+ filters: "filters";
9
+ modes: "modes";
10
+ noBrowser: "noBrowser";
11
+ crawlerConfig: "crawlerConfig";
12
+ tests: "tests";
13
+ privacy: "privacy";
14
+ integratedETL: "integratedETL";
15
+ integratedCache: "integratedCache";
16
+ errorMonitoring: "errorMonitoring";
17
+ };
18
+ export type ReadmeFeatureType = ArrVal<typeof README_FEATURE>;
19
+ export declare const README_HOOK: readonly ["introAfterBegin", "introAfterDatasets", "introBeforeEnd", "featuresAfterBegin", "featuresBeforeEnd", "useCases", "usageAfterBegin", "usageBeforeEnd", "costAfterBegin", "costAfterPerfTables", "costBeforeEnd", "inputAfterBegin", "inputBeforeEnd", "filterAfterBegin", "filterBeforeEnd", "limitAfterBegin", "limitBeforeEnd", "inputExampleAfterBegin", "inputExampleBeforeEnd", "outputAfterBegin", "outputBeforeEnd", "outputExampleAfterBegin", "outputExampleBeforeEnd", "integrationAfterBegin", "integrationBeforeEnd", "apifyAfterBegin", "apifyBeforeEnd", "legalityAfterBegin", "legalityBeforeEnd", "contactAfterBegin", "contactBeforeEnd"];
20
+ export declare const README_HOOK_ENUM: {
21
+ introAfterBegin: "introAfterBegin";
22
+ introAfterDatasets: "introAfterDatasets";
23
+ introBeforeEnd: "introBeforeEnd";
24
+ featuresAfterBegin: "featuresAfterBegin";
25
+ featuresBeforeEnd: "featuresBeforeEnd";
26
+ useCases: "useCases";
27
+ usageAfterBegin: "usageAfterBegin";
28
+ usageBeforeEnd: "usageBeforeEnd";
29
+ costAfterBegin: "costAfterBegin";
30
+ costAfterPerfTables: "costAfterPerfTables";
31
+ costBeforeEnd: "costBeforeEnd";
32
+ inputAfterBegin: "inputAfterBegin";
33
+ inputBeforeEnd: "inputBeforeEnd";
34
+ filterAfterBegin: "filterAfterBegin";
35
+ filterBeforeEnd: "filterBeforeEnd";
36
+ limitAfterBegin: "limitAfterBegin";
37
+ limitBeforeEnd: "limitBeforeEnd";
38
+ inputExampleAfterBegin: "inputExampleAfterBegin";
39
+ inputExampleBeforeEnd: "inputExampleBeforeEnd";
40
+ outputAfterBegin: "outputAfterBegin";
41
+ outputBeforeEnd: "outputBeforeEnd";
42
+ outputExampleAfterBegin: "outputExampleAfterBegin";
43
+ outputExampleBeforeEnd: "outputExampleBeforeEnd";
44
+ integrationAfterBegin: "integrationAfterBegin";
45
+ integrationBeforeEnd: "integrationBeforeEnd";
46
+ apifyAfterBegin: "apifyAfterBegin";
47
+ apifyBeforeEnd: "apifyBeforeEnd";
48
+ legalityAfterBegin: "legalityAfterBegin";
49
+ legalityBeforeEnd: "legalityBeforeEnd";
50
+ contactAfterBegin: "contactAfterBegin";
51
+ contactBeforeEnd: "contactBeforeEnd";
52
+ };
53
+ export type ReadmeHook = ArrVal<typeof README_HOOK>;
54
+ /**
55
+ * Defines how to render a feature block for crawler README.
56
+ *
57
+ * Example rendered output:
58
+ * ```markdown
59
+ * - **3 kinds of datasets**
60
+ * - test1
61
+ * - Scrape details of organisations, researchers or projects.
62
+ * - test2
63
+ * ```
64
+ */
65
+ export interface ReadmeFeature<TData extends any = any> {
66
+ /** Feature is considered supported by this actor if truthy value is returned. */
67
+ supported: (it: RenderContext) => any;
68
+ /**
69
+ * Title template string for the feature. E.g.
70
+ * ```js
71
+ * '<%~ it.a.datasets.length %> kinds of datasets'
72
+ * ```
73
+ * */
74
+ title: string;
75
+ /**
76
+ * Template string that goes BEFORE the mainText. E.g.
77
+ * ```js
78
+ * '- Scrape details of <%~ it.fn.enumerate(it.a.datasets.map((d) => d.name)) %>.'
79
+ * ```
80
+ */
81
+ afterBegin?: string;
82
+ /**
83
+ * Main body template string for the feature. E.g.
84
+ * ```js
85
+ * '- Scrape details of <%~ it.fn.enumerate(it.a.datasets.map((d) => d.name)) %>.'
86
+ * ```
87
+ */
88
+ mainText: string;
89
+ /**
90
+ * Template string that goes AFTER the mainText. E.g.
91
+ * ```js
92
+ * '- Scrape details of <%~ it.fn.enumerate(it.a.datasets.map((d) => d.name)) %>.'
93
+ * ```
94
+ */
95
+ beforeEnd?: string;
96
+ /**
97
+ * If you need to store/access some variables during render, use this object.
98
+ *
99
+ * E.g.
100
+ * ```js
101
+ * filters: {
102
+ * supported: (it) => it.a.datasets.some((d) => d.filters.length),
103
+ * title: 'Filter support',
104
+ * mainText:
105
+ * `- Filter the results by <%~ it.fn.enumerate(it.fn.collectFilters(it)) %>.\n` +
106
+ * `<% if (it.t.features.filters.data.maxEntriesSupported) { %>\n` +
107
+ * ` - Limit the number of results.\n` +
108
+ * `<% } %>`,
109
+ * data: {
110
+ * maxEntriesSupported: true,
111
+ * },
112
+ * },
113
+ * ```
114
+ */
115
+ data?: TData;
116
+ }
117
+ /**
118
+ * Defines how to render an example input block for crawler README.
119
+ *
120
+ * Example rendered output:
121
+ * ```markdown
122
+ * #### Example 2: Same as above, but specified by providing a start URL
123
+ *
124
+ * ```json
125
+ * {
126
+ * "startUrls": [
127
+ * "https://www.skcris.sk/portal/web/guest/register-organizations"
128
+ * ],
129
+ * // Omit relationships to other entries
130
+ * "entryIncludeLinkedResources": false,
131
+ * "listingFilterMaxCount": 200,
132
+ * "listingItemsPerPage": 200,
133
+ * }
134
+ * ```
135
+ */
136
+ export interface ReadmeExampleInput<TData extends object = object> {
137
+ /**
138
+ * Title of the example, e.g.
139
+ *
140
+ * ```js
141
+ * `Get first 200 organisations (fast mode)`
142
+ * ```
143
+ */
144
+ title: string;
145
+ /** Example input data */
146
+ inputData: TData;
147
+ /**
148
+ * Comments related to individual fields of `inputData`
149
+ *
150
+ * These comments may be rendered as such:
151
+ *
152
+ * ```json
153
+ * {
154
+ * "inputDataField1": 22,
155
+ * // This is a comment from inputDataComments.inputDataField2
156
+ * "inputDataField2": "Value from inputData.inputDataField2"
157
+ * }
158
+ * ```
159
+ */
160
+ inputDataComments?: Partial<Record<keyof TData, string>>;
161
+ }
162
+ /**
163
+ * Defines how to render a table block with data on performance
164
+ * for crawler README
165
+ *
166
+ * Example rendered output:
167
+ * ```markdown
168
+ * ### Organisations
169
+ *
170
+ * <table>
171
+ * <thead>
172
+ * <tr>
173
+ * <td></td>
174
+ * <td>
175
+ * <strong>100 results</strong>
176
+ * </td>
177
+ * <td>
178
+ * <strong>Full run (~ 2.6K results)</strong>
179
+ * </td>
180
+ * </tr>
181
+ * </thead>
182
+ * <tbody>
183
+ * <tr>
184
+ * <td>Fast run</td>
185
+ * <td>$0.014 in 2m 0s</td>
186
+ * <td>$0.289 in 42m 0s</td>
187
+ * </tr>
188
+ * <tr>
189
+ * <td>Detailed run</td>
190
+ * <td>$0.08 in 11m 37s</td>
191
+ * <td>$2.008 in 4h 52m </td>
192
+ * </tr>
193
+ * </tbody>
194
+ * </table>
195
+ * ```
196
+ */
197
+ export interface PerfTable {
198
+ /**
199
+ * Map rowIds to header text. E.g.
200
+ * ```ts
201
+ * [
202
+ * { rowId: 'fast', template: 'Fast run' },
203
+ * { rowId: 'detailed', template: 'Detailed run' },
204
+ * ]
205
+ * ```
206
+ */
207
+ rows: {
208
+ rowId: string;
209
+ template: string;
210
+ }[];
211
+ /**
212
+ * Map colIds to header text. E.g.
213
+ * ```ts
214
+ * [
215
+ * { colId: '100items', template: '100 results' },
216
+ * { colId: 'fullRun', template: 'Full run (~ <%~ it.fn.millify(it.dataset.size) %> results)' },
217
+ * ]
218
+ * ```
219
+ */
220
+ cols: {
221
+ colId: string;
222
+ template: string;
223
+ }[];
224
+ }
225
+ /** Defines how to render the crawler README. */
226
+ export interface CrawleeOneReadmeTemplates {
227
+ input: {
228
+ /**
229
+ * ID (key on input JSON) of the actor input that sets the max number of entries.
230
+ *
231
+ * E.g. `'listingFilterMaxCount'`
232
+ */
233
+ maxCount: string;
234
+ /**
235
+ * Name (as mentioned in UI) of the actor input that controls whether personal data is included.
236
+ *
237
+ * E.g. `'Include personal data'`
238
+ */
239
+ privacyName: string;
240
+ };
241
+ /** Configure how to render the performance/cost tables */
242
+ perfTables: Record<string, PerfTable>;
243
+ /** Configure how to render readme actor features */
244
+ features: Record<ReadmeFeatureType, ReadmeFeature>;
245
+ /** Configure how to render readme example inputs */
246
+ exampleInputs: ReadmeExampleInput[];
247
+ /**
248
+ * Hooks are various places throughout the README template
249
+ * where you can insert your own text
250
+ *
251
+ * Just as other templates, the hooks receive the whole context (`it`)
252
+ */
253
+ hooks?: Partial<Record<ReadmeHook, string>>;
254
+ }
255
+ /** Context available inside the rendering via `it` */
256
+ export interface RenderContext {
257
+ fn: Record<string, any>;
258
+ t: CrawleeOneReadmeTemplates;
259
+ a: CrawleeOneScraperActorSpec;
260
+ }
@@ -0,0 +1,54 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.README_HOOK_ENUM = exports.README_HOOK = exports.README_FEATURE_ENUM = exports.README_FEATURE = void 0;
4
+ const types_1 = require("../../utils/types");
5
+ exports.README_FEATURE = [
6
+ 'datasets',
7
+ 'modes',
8
+ 'filters',
9
+ 'noBrowser',
10
+ 'proxy',
11
+ 'crawlerConfig',
12
+ 'tests',
13
+ 'privacy',
14
+ 'integratedETL',
15
+ 'integratedCache',
16
+ 'errorMonitoring',
17
+ 'metamorph',
18
+ ];
19
+ exports.README_FEATURE_ENUM = (0, types_1.enumFromArray)(exports.README_FEATURE);
20
+ exports.README_HOOK = [
21
+ 'introAfterBegin',
22
+ 'introAfterDatasets',
23
+ 'introBeforeEnd',
24
+ 'featuresAfterBegin',
25
+ 'featuresBeforeEnd',
26
+ 'useCases',
27
+ 'usageAfterBegin',
28
+ 'usageBeforeEnd',
29
+ 'costAfterBegin',
30
+ 'costAfterPerfTables',
31
+ 'costBeforeEnd',
32
+ 'inputAfterBegin',
33
+ 'inputBeforeEnd',
34
+ 'filterAfterBegin',
35
+ 'filterBeforeEnd',
36
+ 'limitAfterBegin',
37
+ 'limitBeforeEnd',
38
+ 'inputExampleAfterBegin',
39
+ 'inputExampleBeforeEnd',
40
+ 'outputAfterBegin',
41
+ 'outputBeforeEnd',
42
+ 'outputExampleAfterBegin',
43
+ 'outputExampleBeforeEnd',
44
+ 'integrationAfterBegin',
45
+ 'integrationBeforeEnd',
46
+ 'apifyAfterBegin',
47
+ 'apifyBeforeEnd',
48
+ 'legalityAfterBegin',
49
+ 'legalityBeforeEnd',
50
+ 'contactAfterBegin',
51
+ 'contactBeforeEnd',
52
+ ];
53
+ exports.README_HOOK_ENUM = (0, types_1.enumFromArray)(exports.README_HOOK);
54
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/readme/types.ts"],"names":[],"mappings":";;;AAAA,6CAA0D;AAG7C,QAAA,cAAc,GAAG;IAC5B,UAAU;IACV,OAAO;IACP,SAAS;IACT,WAAW;IACX,OAAO;IACP,eAAe;IACf,OAAO;IACP,SAAS;IACT,eAAe;IACf,iBAAiB;IACjB,iBAAiB;IACjB,WAAW;CACH,CAAC;AACE,QAAA,mBAAmB,GAAG,IAAA,qBAAa,EAAC,sBAAc,CAAC,CAAC;AAGpD,QAAA,WAAW,GAAG;IACzB,iBAAiB;IACjB,oBAAoB;IACpB,gBAAgB;IAChB,oBAAoB;IACpB,mBAAmB;IACnB,UAAU;IACV,iBAAiB;IACjB,gBAAgB;IAChB,gBAAgB;IAChB,qBAAqB;IACrB,eAAe;IACf,iBAAiB;IACjB,gBAAgB;IAChB,kBAAkB;IAClB,iBAAiB;IACjB,iBAAiB;IACjB,gBAAgB;IAChB,wBAAwB;IACxB,uBAAuB;IACvB,kBAAkB;IAClB,iBAAiB;IACjB,yBAAyB;IACzB,wBAAwB;IACxB,uBAAuB;IACvB,sBAAsB;IACtB,iBAAiB;IACjB,gBAAgB;IAChB,oBAAoB;IACpB,mBAAmB;IACnB,mBAAmB;IACnB,kBAAkB;CACV,CAAC;AACE,QAAA,gBAAgB,GAAG,IAAA,qBAAa,EAAC,mBAAW,CAAC,CAAC","sourcesContent":["import { ArrVal, enumFromArray } from '../../utils/types';\nimport type { CrawleeOneScraperActorSpec } from '../actorSpec';\n\nexport const README_FEATURE = [\n 'datasets',\n 'modes',\n 'filters',\n 'noBrowser',\n 'proxy',\n 'crawlerConfig',\n 'tests',\n 'privacy',\n 'integratedETL',\n 'integratedCache',\n 'errorMonitoring',\n 'metamorph',\n] as const;\nexport const README_FEATURE_ENUM = enumFromArray(README_FEATURE);\nexport type ReadmeFeatureType = ArrVal<typeof README_FEATURE>;\n\nexport const README_HOOK = [\n 'introAfterBegin',\n 'introAfterDatasets',\n 'introBeforeEnd',\n 'featuresAfterBegin',\n 'featuresBeforeEnd',\n 'useCases',\n 'usageAfterBegin',\n 'usageBeforeEnd',\n 'costAfterBegin',\n 'costAfterPerfTables',\n 'costBeforeEnd',\n 'inputAfterBegin',\n 'inputBeforeEnd',\n 'filterAfterBegin',\n 'filterBeforeEnd',\n 'limitAfterBegin',\n 'limitBeforeEnd',\n 'inputExampleAfterBegin',\n 'inputExampleBeforeEnd',\n 'outputAfterBegin',\n 'outputBeforeEnd',\n 'outputExampleAfterBegin',\n 'outputExampleBeforeEnd',\n 'integrationAfterBegin',\n 'integrationBeforeEnd',\n 'apifyAfterBegin',\n 'apifyBeforeEnd',\n 'legalityAfterBegin',\n 
'legalityBeforeEnd',\n 'contactAfterBegin',\n 'contactBeforeEnd',\n] as const;\nexport const README_HOOK_ENUM = enumFromArray(README_HOOK);\nexport type ReadmeHook = ArrVal<typeof README_HOOK>;\n\n/**\n * Defines how to render a feature block for crawler README.\n *\n * Example rendered output:\n * ```markdown\n * - **3 kinds of datasets**\n * - test1\n * - Scrape details of organisations, researchers or projects.\n * - test2\n * ```\n */\nexport interface ReadmeFeature<TData extends any = any> {\n /** Feature is considered supported by this actor if truthy value is returned. */\n supported: (it: RenderContext) => any;\n /**\n * Title template string for the feature. E.g.\n * ```js\n * '<%~ it.a.datasets.length %> kinds of datasets'\n * ```\n * */\n title: string;\n /**\n * Template string that goes BEFORE the mainText. E.g.\n * ```js\n * '- Scrape details of <%~ it.fn.enumerate(it.a.datasets.map((d) => d.name)) %>.'\n * ```\n */\n afterBegin?: string;\n /**\n * Main body template string for the feature. E.g.\n * ```js\n * '- Scrape details of <%~ it.fn.enumerate(it.a.datasets.map((d) => d.name)) %>.'\n * ```\n */\n mainText: string;\n /**\n * Template string that goes AFTER the mainText. 
E.g.\n * ```js\n * '- Scrape details of <%~ it.fn.enumerate(it.a.datasets.map((d) => d.name)) %>.'\n * ```\n */\n beforeEnd?: string;\n /**\n * If you need to store/access some variables during render, use this object.\n *\n * E.g.\n * ```js\n * filters: {\n * supported: (it) => it.a.datasets.some((d) => d.filters.length),\n * title: 'Filter support',\n * mainText:\n * `- Filter the results by <%~ it.fn.enumerate(it.fn.collectFilters(it)) %>.\\n` +\n * `<% if (it.t.features.filters.data.maxEntriesSupported) { %>\\n` +\n * ` - Limit the number of results.\\n` +\n * `<% } %>`,\n * data: {\n * maxEntriesSupported: true,\n * },\n * },\n * ```\n */\n data?: TData;\n}\n\n/**\n * Defines how to render an example input block for crawler README.\n *\n * Example rendered output:\n * ```markdown\n * #### Example 2: Same as above, but specified by providing a start URL\n *\n * ```json\n * {\n * \"startUrls\": [\n * \"https://www.skcris.sk/portal/web/guest/register-organizations\"\n * ],\n * // Omit relationships to other entries\n * \"entryIncludeLinkedResources\": false,\n * \"listingFilterMaxCount\": 200,\n * \"listingItemsPerPage\": 200,\n * }\n * ```\n */\nexport interface ReadmeExampleInput<TData extends object = object> {\n /**\n * Title of the example, e.g.\n *\n * ```js\n * `Get first 200 organisations (fast mode)`\n * ```\n */\n title: string;\n /** Example input data */\n inputData: TData;\n /**\n * Comments related to individual fields of `inputData`\n *\n * These comments may be rendered as such:\n *\n * ```json\n * {\n * \"inputDataField1\": 22,\n * // This is a comment from inputDataComments.inputDataField2\n * \"inputDataField2\": \"Value from inputData.inputDataField2\"\n * }\n * ```\n */\n inputDataComments?: Partial<Record<keyof TData, string>>;\n}\n\n/**\n * Defines how to render a table block with data on performance\n * for crawler README\n *\n * Example rendered output:\n * ```markdown\n * ### Organisations\n *\n * <table>\n * <thead>\n * <tr>\n * 
<td></td>\n * <td>\n * <strong>100 results</strong>\n * </td>\n * <td>\n * <strong>Full run (~ 2.6K results)</strong>\n * </td>\n * </tr>\n * </thead>\n * <tbody>\n * <tr>\n * <td>Fast run</td>\n * <td>$0.014 in 2m 0s</td>\n * <td>$0.289 in 42m 0s</td>\n * </tr>\n * <tr>\n * <td>Detailed run</td>\n * <td>$0.08 in 11m 37s</td>\n * <td>$2.008 in 4h 52m </td>\n * </tr>\n * </tbody>\n * </table>\n * ```\n */\nexport interface PerfTable {\n /**\n * Map rowIds to header text. E.g.\n * ```ts\n * [\n * { rowId: 'fast', template: 'Fast run' },\n * { rowId: 'detailed', template: 'Detailed run' },\n * ]\n * ```\n */\n rows: {\n rowId: string;\n template: string;\n }[];\n /**\n * Map colIds to header text. E.g.\n * ```ts\n * [\n * { colId: '100items', template: '100 results' },\n * { colId: 'fullRun', template: 'Full run (~ <%~ it.fn.millify(it.dataset.size) %> results)' },\n * ]\n * ```\n */\n cols: {\n colId: string;\n template: string;\n }[];\n}\n\n/** Defines how to render the crawler README. */\nexport interface CrawleeOneReadmeTemplates {\n input: {\n /**\n * ID (key on input JSON) of the actor input that sets the max number of entries.\n *\n * E.g. `'listingFilterMaxCount'`\n */\n maxCount: string;\n /**\n * Name (as mentioned in UI) of the actor input that sets to include personal data.\n *\n * E.g. 
`'Include personal data'`\n */\n privacyName: string;\n };\n /** Configure how to render the performance/cost tables */\n perfTables: Record<string, PerfTable>;\n /** Configure how to render readme actor features */\n features: Record<ReadmeFeatureType, ReadmeFeature>;\n /** Configure how to render readme example inputs */\n exampleInputs: ReadmeExampleInput[];\n /**\n * Hooks are various places throughout the README template\n * where you can insert own text\n *\n * Just as other templates, the hooks receive the whole context (`it`)\n */\n hooks?: Partial<Record<ReadmeHook, string>>;\n}\n\n/** Context available inside the rendering via `it` */\nexport interface RenderContext {\n fn: Record<string, any>;\n t: CrawleeOneReadmeTemplates;\n a: CrawleeOneScraperActorSpec;\n}\n"]}
@@ -0,0 +1,132 @@
1
+ import type { BasicCrawler, BasicCrawlingContext, CheerioCrawlingContext, CrawlingContext, HttpCrawlingContext, JSDOMCrawlingContext, PlaywrightCrawlingContext, PuppeteerCrawlingContext, RouterHandler as CrawlerRouter, Request as CrawlerRequest } from 'crawlee';
2
+ import type { MaybePromise } from '../utils/types';
3
+ import type { CrawleeOneIO } from './integrations/types';
4
+ /** Context object provided in CrawlerRouter */
5
+ export type RouterHandlerCtx<CrawlerCtx extends CrawlingContext> = Parameters<Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]>[0];
6
+ /** Function that's passed to `router.addHandler(label, handler)` */
7
+ export type RouteHandler<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>> = Parameters<CrawlerRouter<RouterHandlerCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1];
8
+ /** Wrapper that modifies behavior of RouteHandler */
9
+ export type CrawlerRouterWrapper<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>> = (handler: (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>) => (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>;
10
+ /**
11
+ * Criteria that un-labelled requests are matched against.
12
+ *
13
+ * E.g. If `match` function returns truthy value,
14
+ * the request is passed to the `action` function for processing.
15
+ */
16
+ export interface RouteMatcher<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string> {
17
+ /** Human readable name */
18
+ name: string;
19
+ /**
20
+ * Label of the handler registered with `router.addHandler(label, handler)`
21
+ * that will process this request.
22
+ *
23
+ * NOTE: This value is used by the default `action` function. If you override
24
+ * the `action` function, `handlerLabel` is ignored and you have to process it yourself.
25
+ */
26
+ handlerLabel: Labels | null;
27
+ /**
28
+ * Function that decides whether the request will be processed by this `action` function.
29
+ *
30
+ * @example
31
+ * [{
32
+ * // If match returns true, the request is forwarded to handler
33
+ * // with label JOB_DETAIL.
34
+ * name: 'Job detail',
35
+ * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),
36
+ * handlerLabel: routeLabels.JOB_DETAIL,
37
+ * }]
38
+ */
39
+ match: (url: string, ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>, route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>, handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>) => unknown;
40
+ /**
41
+ * Request is passed to this function if `match` returned truthy value.
42
+ *
43
+ * @example
44
+ * [{
45
+ * // If match returns true, the request is forwarded to handler
46
+ * // with label JOB_DETAIL.
47
+ * name: 'Job detail',
48
+ * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),
49
+ * handlerLabel: routeLabels.JOB_DETAIL,
50
+ * }]
51
+ */
52
+ action?: (url: string, ctx: RouterHandlerCtx<CrawlerCtx>, route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>, handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>) => MaybePromise<void>;
53
+ }
54
+ export declare const createRouteMatchers: <CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary> = CrawlingContext<BasicCrawler<BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
55
+ export declare const createBasicRouteMatchers: <CrawlerCtx extends BasicCrawlingContext<import("crawlee").Dictionary> = BasicCrawlingContext<import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
56
+ export declare const createHttpRouteMatchers: <CrawlerCtx extends HttpCrawlingContext<any, any> = HttpCrawlingContext<any, any>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
57
+ export declare const createJsdomRouteMatchers: <CrawlerCtx extends JSDOMCrawlingContext<any, any> = JSDOMCrawlingContext<any, any>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
58
+ export declare const createCheerioRouteMatchers: <CrawlerCtx extends CheerioCrawlingContext<any, any> = CheerioCrawlingContext<any, any>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
59
+ export declare const createPlaywrightRouteMatchers: <CrawlerCtx extends PlaywrightCrawlingContext<import("crawlee").Dictionary> = PlaywrightCrawlingContext<import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
60
+ export declare const createPuppeteerRouteMatchers: <CrawlerCtx extends PuppeteerCrawlingContext<import("crawlee").Dictionary> = PuppeteerCrawlingContext<import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
61
+ export declare const registerHandlers: <CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>({ router, routerWrappers, routerContext, routeHandlers, }: {
62
+ router: CrawlerRouter<CrawlerCtx>;
63
+ routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[] | undefined;
64
+ routerContext?: RouterCtx | undefined;
65
+ routeHandlers: Record<Labels, (ctx: Omit<Omit<CrawlerCtx & RouterCtx, "request"> & {
66
+ request: CrawlerRequest<import("crawlee").Dictionary>;
67
+ }, "request"> & {
68
+ request: CrawlerRequest<import("crawlee").Dictionary>;
69
+ }) => import("@crawlee/core/typedefs").Awaitable<void>>;
70
+ }) => Promise<void>;
71
+ /**
72
+ * Configures the default router handler to redirect URLs to labelled route handlers
73
+ * based on which route the URL matches first.
74
+ *
75
+ * NOTE: This does mean that the URLs passed to this default handler will be fetched
76
+ * twice (as the URL will be requeued to the correct handler). We recommend using this
77
+ * function only in the scenarios where there is a small number of startUrls, yet these
78
+ * may need different ways of processing based on, e.g., their paths.
79
+ *
80
+ * @example
81
+ *
82
+ * const routeLabels = {
83
+ * MAIN_PAGE: 'MAIN_PAGE',
84
+ * JOB_LISTING: 'JOB_LISTING',
85
+ * JOB_DETAIL: 'JOB_DETAIL',
86
+ * JOB_RELATED_LIST: 'JOB_RELATED_LIST',
87
+ * PARTNERS: 'PARTNERS',
88
+ * } as const;
89
+ *
90
+ * const router = createPlaywrightRouter();
91
+ *
92
+ * const routes = createPlaywrightRouteMatchers<typeof routeLabels>([
93
+ * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)
94
+ * {
95
+ * handlerLabel: routeLabels.MAIN_PAGE,
96
+ * // Check for main page like https://www.profesia.sk/?#
97
+ * match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
98
+ * },
99
+ *
100
+ * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop
101
+ * {
102
+ * handlerLabel: routeLabels.MAIN_PAGE,
103
+ * // Check for main page like https://www.profesia.sk/?#
104
+ * match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
105
+ * action: async (url, ctx) => {
106
+ * await ctx.crawler.addRequests([{
107
+ * url: 'https://profesia.sk/praca',
108
+ * label: routeLabels.JOB_LISTING,
109
+ * }]);
110
+ * },
111
+ * },
112
+ * ]);
113
+ *
114
+ * // Set up default route to redirect to labelled routes
115
+ * setupDefaultRoute({ router, routes });
116
+ *
117
+ * // Now set up the labelled routes
118
+ * await router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... });
119
+ */
120
+ export declare const setupDefaultRoute: <CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>>({ io, router, routerWrappers, routerContext, routes, routeHandlers, input, }: {
121
+ io: CrawleeOneIO;
122
+ router: CrawlerRouter<CrawlerCtx>;
123
+ routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[] | undefined;
124
+ routerContext?: RouterCtx | undefined;
125
+ routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
126
+ routeHandlers: Record<Labels, (ctx: Omit<Omit<CrawlerCtx & RouterCtx, "request"> & {
127
+ request: CrawlerRequest<import("crawlee").Dictionary>;
128
+ }, "request"> & {
129
+ request: CrawlerRequest<import("crawlee").Dictionary>;
130
+ }) => import("@crawlee/core/typedefs").Awaitable<void>>;
131
+ input?: Input | null | undefined;
132
+ }) => Promise<void>;
@@ -0,0 +1,165 @@
1
+ "use strict";
2
/**
 * Drives a generator function as if it were an async function.
 *
 * The generator is started with `thisArg` / `_arguments`. Every yielded value is
 * converted to a promise of type `P` (defaulting to the global `Promise`) and its
 * settlement is fed back into the generator via `next` / `throw`. The returned
 * promise resolves with the generator's return value, or rejects with the first
 * error the generator does not catch.
 *
 * An already-installed `this.__awaiter` takes precedence over this definition.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values so that `.then` is always available on what the generator yielded.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => { resolve(value); });
    };
    return new (P || (P = Promise))((resolve, reject) => {
        // Settle the outer promise when the generator finishes, otherwise wait for the yielded value.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        };
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        };
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.setupDefaultRoute = exports.registerHandlers = exports.createPuppeteerRouteMatchers = exports.createPlaywrightRouteMatchers = exports.createCheerioRouteMatchers = exports.createJsdomRouteMatchers = exports.createHttpRouteMatchers = exports.createBasicRouteMatchers = exports.createRouteMatchers = void 0;
13
+ const async_1 = require("../utils/async");
14
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably exists only to attach types in the TypeScript source - TODO confirm.
 */
const createRouteMatchers = (matchers) => {
    return matchers;
};
15
+ exports.createRouteMatchers = createRouteMatchers;
16
+ // Context-specific variants
17
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably the typed variant for the basic crawler context - TODO confirm.
 */
const createBasicRouteMatchers = (matchers) => {
    return matchers;
};
18
+ exports.createBasicRouteMatchers = createBasicRouteMatchers;
19
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably the typed variant for the HTTP crawler context - TODO confirm.
 */
const createHttpRouteMatchers = (matchers) => {
    return matchers;
};
20
+ exports.createHttpRouteMatchers = createHttpRouteMatchers;
21
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably the typed variant for the JSDOM crawler context - TODO confirm.
 */
const createJsdomRouteMatchers = (matchers) => {
    return matchers;
};
22
+ exports.createJsdomRouteMatchers = createJsdomRouteMatchers;
23
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably the typed variant for the Cheerio crawler context - TODO confirm.
 */
const createCheerioRouteMatchers = (matchers) => {
    return matchers;
};
24
+ exports.createCheerioRouteMatchers = createCheerioRouteMatchers;
25
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably the typed variant for the Playwright crawler context - TODO confirm.
 */
const createPlaywrightRouteMatchers = (matchers) => {
    return matchers;
};
26
+ exports.createPlaywrightRouteMatchers = createPlaywrightRouteMatchers;
27
/**
 * Identity helper: hands back the very same `matchers` value it was given.
 * Presumably the typed variant for the Puppeteer crawler context - TODO confirm.
 */
const createPuppeteerRouteMatchers = (matchers) => {
    return matchers;
};
28
+ exports.createPuppeteerRouteMatchers = createPuppeteerRouteMatchers;
29
/**
 * Registers every entry of `routeHandlers` on the router under its key (label).
 *
 * Each handler is first wrapped by all `routerWrappers` (folded from the right, so
 * the first wrapper in the array becomes the outermost one). When the router later
 * invokes the handler, `routerContext` is shallow-merged into the crawling context,
 * with the context's own properties taking precedence.
 *
 * Entries are processed through `serialAsyncMap`, awaiting `router.addHandler` for each.
 *
 * @param options.router Router that receives the handlers via `addHandler`.
 * @param options.routerWrappers Optional wrappers applied around every handler.
 * @param options.routerContext Optional extra properties exposed on the handler context.
 * @param options.routeHandlers Map of label -> handler.
 * @returns Promise that resolves once all handlers are registered.
 */
const registerHandlers = ({ router, routerWrappers, routerContext, routeHandlers, }) => __awaiter(void 0, void 0, void 0, function* () {
    const registerOne = ([label, handler]) => __awaiter(void 0, void 0, void 0, function* () {
        const wrappers = routerWrappers !== null && routerWrappers !== void 0 ? routerWrappers : [];
        // Fold from the right so that wrappers[0] ends up as the outermost layer.
        const composed = wrappers.reduceRight((inner, wrap) => wrap((ctx) => inner(ctx)), handler);
        const entryPoint = (ctx) => __awaiter(void 0, void 0, void 0, function* () {
            const mergedCtx = Object.assign({}, routerContext, ctx);
            return composed(mergedCtx);
        });
        yield router.addHandler(label, entryPoint);
    });
    yield (0, async_1.serialAsyncMap)(Object.entries(routeHandlers), registerOne);
});
35
+ exports.registerHandlers = registerHandlers;
36
/**
 * Configures the default router handler to dispatch each URL to the handler of the
 * first route (in `routes` order) whose `match` accepts the URL.
 *
 * By default the matched route's handler (`routeHandlers[route.handlerLabel]`) is
 * called directly with the current context - the URL is not re-enqueued. A route may
 * override this by providing its own `action(url, ctx, route, routeHandlers)`.
 *
 * Batching: when `input.perfBatchSize` is set, up to that many requests are processed
 * within a single invocation of the default handler. Follow-up requests are taken from
 * the request queue (`input.requestQueueId`), and `input.perfBatchWaitSecs` (if truthy)
 * is passed to the `wait` helper before each fetch. A follow-up request is fetched ONLY
 * while the batch still has capacity, so that no request is taken from the queue
 * without being processed.
 *
 * If processing throws, the error is logged (not rethrown) and the request that was
 * being processed is returned to the front of the queue.
 *
 * @param options.io Used to open the request queue (`io.openRequestQueue`).
 * @param options.router Router that receives the default handler.
 * @param options.routerWrappers Optional wrappers applied around the default handler
 *   (the first wrapper ends up outermost).
 * @param options.routerContext Optional properties shallow-merged into the handler context.
 * @param options.routes Route matchers, tested in order.
 * @param options.routeHandlers Map of label -> handler used by the default action.
 * @param options.input Optional input; `perfBatchSize`, `perfBatchWaitSecs` and
 *   `requestQueueId` are read from it.
 * @returns Promise that resolves once the default handler is registered.
 *
 * @example
 *
 * const routeLabels = {
 *   MAIN_PAGE: 'MAIN_PAGE',
 *   JOB_LISTING: 'JOB_LISTING',
 *   JOB_DETAIL: 'JOB_DETAIL',
 *   JOB_RELATED_LIST: 'JOB_RELATED_LIST',
 *   PARTNERS: 'PARTNERS',
 * } as const;
 *
 * const router = createPlaywrightRouter();
 *
 * const routes = createPlaywrightRouteMatchers<typeof routeLabels>([
 *   // URLs that match this route are passed to routeHandlers[routeLabels.MAIN_PAGE]
 *   {
 *     name: 'Main page',
 *     handlerLabel: routeLabels.MAIN_PAGE,
 *     // Check for main page like https://www.profesia.sk/?#
 *     match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
 *   },
 *
 *   // Optionally override the logic that assigns the URL to the route by specifying the `action` prop
 *   {
 *     name: 'Main page (redirect)',
 *     handlerLabel: routeLabels.MAIN_PAGE,
 *     // Check for main page like https://www.profesia.sk/?#
 *     match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
 *     action: async (url, ctx) => {
 *       await ctx.crawler.addRequests([{
 *         url: 'https://profesia.sk/praca',
 *         label: routeLabels.JOB_LISTING,
 *       }]);
 *     },
 *   },
 * ]);
 *
 * // Set up default route to dispatch to labelled routes
 * await setupDefaultRoute({ io, router, routes, routeHandlers });
 */
const setupDefaultRoute = ({ io, router, routerWrappers, routerContext, routes, routeHandlers, input, }) => __awaiter(void 0, void 0, void 0, function* () {
    const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {});
    /** Pass the URL to the handler registered under the route's `handlerLabel`. */
    const defaultAction = (url, ctx, route) => __awaiter(void 0, void 0, void 0, function* () {
        const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];
        if (!handler) {
            ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`);
            return;
        }
        ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);
        yield handler(ctx);
    });
    const defaultHandler = (ctx) => __awaiter(void 0, void 0, void 0, function* () {
        const { page, log: parentLog } = ctx;
        const log = parentLog.child({ prefix: '[Router] ' });
        const reqQueue = yield io.openRequestQueue(requestQueueId);
        let handledRequestsCount = 0;
        // The request currently being processed; null once it has been closed.
        let req = ctx.request;
        /** Mark the current request as handled and count it towards the batch. */
        const closeRequest = () => __awaiter(void 0, void 0, void 0, function* () {
            if (!req)
                return;
            yield reqQueue.markRequestHandled(req);
            handledRequestsCount++;
            // From here on the request must not be reclaimed if a later step fails.
            req = null;
        });
        /** Take the next request from the queue (if any) and make it the current one. */
        const loadNextRequest = (suffix) => __awaiter(void 0, void 0, void 0, function* () {
            log.debug(`Checking for new Request in the queue. ${suffix}`);
            if (perfBatchWaitSecs)
                yield (0, async_1.wait)(perfBatchWaitSecs);
            const newReq = yield reqQueue.fetchNextRequest();
            req = newReq !== null && newReq !== void 0 ? newReq : null;
            if (req) {
                log.debug(`Found new Request in the queue. ${suffix}`);
                // WARNING - For each subsequent Request, it must be loaded manually
                // Hence, batching is suitable only for browser-based Crawlers
                // like Playwright or Puppeteer.
                if (page && page.goto)
                    yield page.goto(req.url);
            }
            else {
                log.debug(`No more Requests in the queue. ${suffix}`);
            }
        });
        // True while batching is enabled and the batch is not full yet.
        const hasBatchCapacity = () => perfBatchSize != null && handledRequestsCount < perfBatchSize;
        try {
            do {
                const url = page ? yield page.url() : (req === null || req === void 0 ? void 0 : req.loadedUrl) || (req === null || req === void 0 ? void 0 : req.url);
                const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize !== null && perfBatchSize !== void 0 ? perfBatchSize : 1}. URL: ${url}`;
                // Find route handler for given URL
                log.debug(`Searching for a handler for given Request. ${logSuffix}`);
                const route = yield (0, async_1.serialAsyncFind)(routes, (currRoute) => __awaiter(void 0, void 0, void 0, function* () {
                    const isMatch = yield currRoute.match(url, ctx, currRoute, routeHandlers);
                    return isMatch;
                }));
                // Run the handler
                if (route) {
                    log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`);
                    const action = route.action != null ? route.action : defaultAction;
                    yield action(url, ctx, route, routeHandlers);
                }
                else {
                    log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);
                }
                // Clean up and move onto another request
                yield closeRequest();
                // Fetch a follow-up request only if we are going to process it. Fetching
                // unconditionally would take a request out of the queue (and navigate the
                // page to it) and then drop it when the batch is full or batching is off.
                if (hasBatchCapacity())
                    yield loadNextRequest(logSuffix);
            } while (req != null && hasBatchCapacity());
        }
        catch (err) {
            log.error(`Failed to process a request, returning it to the queue. URL: ${(req === null || req === void 0 ? void 0 : req.loadedUrl) || (req === null || req === void 0 ? void 0 : req.url)}.`);
            log.error(err);
            // Reinsert the request into the queue if we failed to process it due to an error
            if (req)
                yield reqQueue.reclaimRequest(req, { forefront: true });
        }
    });
    const wrappedHandler = (routerWrappers !== null && routerWrappers !== void 0 ? routerWrappers : []).reduceRight((fn, wrapper) => wrapper(fn), defaultHandler);
    yield router.addDefaultHandler((ctx) => wrappedHandler(Object.assign(Object.assign({}, routerContext), ctx)));
});
164
+ exports.setupDefaultRoute = setupDefaultRoute;
165
+ //# sourceMappingURL=router.js.map