crawlee-one 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/composer.js +1 -1
- package/dist/cjs/composer.js.map +1 -1
- package/dist/cjs/lib/actor/actor.js.map +1 -1
- package/dist/cjs/lib/actor/types.d.ts +3 -3
- package/dist/cjs/lib/actor/types.js.map +1 -1
- package/dist/cjs/lib/router/router.d.ts +3 -3
- package/dist/cjs/lib/router/router.js +22 -2
- package/dist/cjs/lib/router/router.js.map +1 -1
- package/dist/cjs/lib/router/types.d.ts +38 -15
- package/dist/cjs/lib/router/types.js +15 -15
- package/dist/cjs/lib/router/types.js.map +1 -1
- package/package.json +1 -1
package/dist/cjs/composer.js
CHANGED
|
@@ -113,7 +113,7 @@
|
|
|
113
113
|
//
|
|
114
114
|
// HOW TO GENERATE TYPE FOR MATCHERS?
|
|
115
115
|
// ```ts
|
|
116
|
-
// type
|
|
116
|
+
// type `Label`Matcher = CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>
|
|
117
117
|
// ```
|
|
118
118
|
//
|
|
119
119
|
// 1. For each route:
|
package/dist/cjs/composer.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"composer.js","sourceRoot":"","sources":["../../src/composer.ts"],"names":[],"mappings":";AAAA,cAAc;AACd,iCAAiC;AAEjC,IAAI;AAEJ,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,qBAAqB;AACrB,0EAA0E;AAC1E,uEAAuE;AACvE,+BAA+B;AAC/B,IAAI;AACJ,MAAM;AAEN,SAAS;AACT,QAAQ;AACR,kDAAkD;AAElD,uBAAuB;AACvB,gBAAgB;AAChB,oBAAoB;AACpB,2BAA2B;AAC3B,EAAE;AACF,2CAA2C;AAC3C,kFAAkF;AAClF,uBAAuB;AACvB,+BAA+B;AAC/B,kCAAkC;AAClC,2GAA2G;AAC3G,kEAAkE;AAClE,0FAA0F;AAC1F,gDAAgD;AAChD,6BAA6B;AAC7B,gDAAgD;AAChD,wCAAwC;AACxC,yBAAyB;AACzB,kBAAkB;AAClB,wDAAwD;AACxD,uDAAuD;AACvD,wDAAwD;AACxD,uDAAuD;AACvD,UAAU;AACV,qCAAqC;AACrC,gBAAgB;AAChB,oBAAoB;AACpB,aAAa;AACb,iBAAiB;AACjB,QAAQ;AACR,OAAO;AACP,EAAE;AACF,cAAc;AACd,oBAAoB;AACpB,gCAAgC;AAChC,EAAE;AACF,4EAA4E;AAC5E,WAAW;AACX,sDAAsD;AACtD,0DAA0D;AAC1D,mGAAmG;AACnG,oEAAoE;AACpE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,sEAAsE;AACtE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,uEAAuE;AACvE,wDAAwD;AACxD,WAAW;AACX,EAAE;AACF,oCAAoC;AACpC,kCAAkC;AAClC,6BAA6B;AAC7B,iCAAiC;AACjC,WAAW;AACX,QAAQ;AACR,MAAM;AACN,KAAK;AACL,MAAM;AAEN,qCAAqC;AACrC,QAAQ;AACR,iGAAiG;AACjG,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,8BAA8B;AAC9B,kGAAkG;AAClG,+FAA+F;AAC/F,oCAAoC;AACpC,yGAAyG;AACzG,wCAAwC;AACxC,aAAa;AACb,kIAAkI;AAClI,qGAAqG;AACrG,WAAW;AACX,qFAAqF;AACrF,iDAAiD;AACjD,eAAe;AACf,2EAA2E;AAC3E,aAAa;AACb,gGAAgG;AAChG,8CAA8C;AAC9C,4FAA4F;AAC5F,yBAAyB;AACzB,aAAa;AACb,uGAAuG;AACvG,WAAW;AACX,EAAE;AAEF,qCAAqC;AACrC,QAAQ;AACR,
|
|
1
|
+
{"version":3,"file":"composer.js","sourceRoot":"","sources":["../../src/composer.ts"],"names":[],"mappings":";AAAA,cAAc;AACd,iCAAiC;AAEjC,IAAI;AAEJ,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,qBAAqB;AACrB,0EAA0E;AAC1E,uEAAuE;AACvE,+BAA+B;AAC/B,IAAI;AACJ,MAAM;AAEN,SAAS;AACT,QAAQ;AACR,kDAAkD;AAElD,uBAAuB;AACvB,gBAAgB;AAChB,oBAAoB;AACpB,2BAA2B;AAC3B,EAAE;AACF,2CAA2C;AAC3C,kFAAkF;AAClF,uBAAuB;AACvB,+BAA+B;AAC/B,kCAAkC;AAClC,2GAA2G;AAC3G,kEAAkE;AAClE,0FAA0F;AAC1F,gDAAgD;AAChD,6BAA6B;AAC7B,gDAAgD;AAChD,wCAAwC;AACxC,yBAAyB;AACzB,kBAAkB;AAClB,wDAAwD;AACxD,uDAAuD;AACvD,wDAAwD;AACxD,uDAAuD;AACvD,UAAU;AACV,qCAAqC;AACrC,gBAAgB;AAChB,oBAAoB;AACpB,aAAa;AACb,iBAAiB;AACjB,QAAQ;AACR,OAAO;AACP,EAAE;AACF,cAAc;AACd,oBAAoB;AACpB,gCAAgC;AAChC,EAAE;AACF,4EAA4E;AAC5E,WAAW;AACX,sDAAsD;AACtD,0DAA0D;AAC1D,mGAAmG;AACnG,oEAAoE;AACpE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,sEAAsE;AACtE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,uEAAuE;AACvE,wDAAwD;AACxD,WAAW;AACX,EAAE;AACF,oCAAoC;AACpC,kCAAkC;AAClC,6BAA6B;AAC7B,iCAAiC;AACjC,WAAW;AACX,QAAQ;AACR,MAAM;AACN,KAAK;AACL,MAAM;AAEN,qCAAqC;AACrC,QAAQ;AACR,iGAAiG;AACjG,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,8BAA8B;AAC9B,kGAAkG;AAClG,+FAA+F;AAC/F,oCAAoC;AACpC,yGAAyG;AACzG,wCAAwC;AACxC,aAAa;AACb,kIAAkI;AAClI,qGAAqG;AACrG,WAAW;AACX,qFAAqF;AACrF,iDAAiD;AACjD,eAAe;AACf,2EAA2E;AAC3E,aAAa;AACb,gGAAgG;AAChG,8CAA8C;AAC9C,4FAA4F;AAC5F,yBAAyB;AACzB,aAAa;AACb,uGAAuG;AACvG,WAAW;AACX,EAAE;AAEF,qCAAqC;AACrC,QAAQ;AACR,8EAA8E;AAC9E,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,8BAA8B;AAC9B,kGAAkG;AAClG,+FAA+F;AAC/F,oCAAoC;AACpC,yGAAyG;AACzG,wCAAwC;AACxC,aAAa;AACb,kIAAkI;AAClI,qGAAqG;AACrG,WAAW;AACX,qFAAqF;AACrF,iDAAiD;AACjD,eAAe;AACf,2EAA2E;AAC3E,aAAa;AACb,gGAAgG;AAChG,8CAA8C;AAC9C,4FAA4F;AAC5F,yBAAyB;AACzB,aAAa;AACb,uGAAuG;AACvG,WAAW;AACX,EAAE;AAEF,SAAS;AACT,8EAA8E;AAC9E,wFAAwF;AACxF,oDAAoD;AAEpD,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,QAAQ;AACR,IAAI;AAEJ,kCAAkC;AAClC,kBAAkB;AAClB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,4BAA4B;AAC5B,QAAQ;AACR,IAAI;AAEJ,wBAAwB;AACxB,uBAAuB;AACvB,QAAQ;AACR,IAAI;AAEJ,4BAA4B;AAC5B,qBAAqB;AACrB,QAAQ;AACR,IAAI;AAEJ,qBAAqB;AACrB,sCAAsC;AACtC,0EAA0E;AAC1E,uEAAuE;AACvE,gCAAgC;AAChC,IAAI;AAEJ,kBAAkB;AAClB,4BAA4B;AAC5B,qFAAqF;AACrF,yBAAyB;AACzB,qEAAqE;AACrE,kDAAkD;AAClD,sDAAsD;AACtD,+FAA+F;AAC/F,gEAAgE;AAChE,OAAO;AACP,IAAI;AACJ,MAAM","sourcesContent":["// @ts-nocheck\n// interface ComposerCrawlerDef {\n\n// }\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// }\n\n// route detailPage {\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i\n// handler: detailPageHandler\n// }\n// ```\n\n// As JS:\n// ```js\n// import { detailPageHandler } from './handlers';\n\n// const scraperDef = {\n// crawlers: {\n// mainCrawler {\n// type: 'playwright'\n//\n///////// Override crawler config /////////\n// //////// { ...crawlerConfigDefaults, ...io.getInput(), ...crawlerConfig }\n// crawlerConfig?\n// crawlerConfigDefaults?\n///////// Override input /////////\n// ///// If mergeInput = true, will merge inputDefaults, input, and io.getInput() similarly to config\n// //////// { ...inputDefaults, ...io.getInput(), ...input }\n// ///// If mergeInput = false, io.getInput() will be ignored if `input` is provided\n// //////// { ...inputDefaults, ...input }\n// mergeInput?: boolean\n// inputDefaults?: Partial<AllActorInputs>\n// input?: Partial<AllActorInputs>\n///////// Hooks /////////\n// hooks?: {\n// validateInput?: (input) => MaybePromise<void>\n// onActorReady?: (actor) => MaybePromise<void>\n// onBeforeHandler?: (ctx) => MaybePromise<void>\n// onAfterHandler?: (ctx) => MaybePromise<void>\n// }\n///////// Override services /////////\n// proxy?,\n// telemetry?,\n// io?,\n// router?,\n// }\n// },\n//\n// routes: {\n// detailPage: {\n// crawler?: 'mainCrawler'\n//\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i,\n// OR\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// OR\n// match: [\n// /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i\n// ],\n// OR\n// match: [\n// /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i,\n// async (url, ctx, route, handlers) => { ... },\n// ],\n//\n// handler: detailPageHandler,\n// handler: async (ctx) => {\n// ctx.actor.pushData\n// ctx.actor.pushRequests\n// },\n// }\n// }\n// };\n// ```\n\n// HOW TO GENERATE TYPE FOR HANDLERS?\n// ```ts\n// type detailPageHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>\n// ```\n//\n// 1. For each route:\n// 1.1 Get `CrawlingContext`\n// 1.1.1 Take `mainCrawler`, and find corresponding crawler. If no `mainCrawler`, there should\n// be only 1 crawler, and take that (if more crawler, there should've been an error).\n// 1.1.2 Find the `crawler.type`\n// 1.1.3 Take corresponding type based on `crawler.type`, e.g. 'cheerio' => `CheerioCrawlingContext`;\n// 1.2 Get actor router context, e.g.:\n// ```ts\n// type `CrawlerName`RouterContext = CrawleeOneActorRouterCtx<`type`CrawlingContext, `CrawlerName`RouteLabel, AllActorInput>;\n// // NOTE: We use `AllActorInput` because since it's in the code, then we can handle ALL inputs\n// ```\n// 1.2.1 Use same CrawlingContext as in step 1.1 (e.g. `CheerioCrawlingContext`).\n// 1.2.2 Create `CrawlerName`Label type, e.g.\n// ```ts\n// type `CrawlerName`RouteLabel = \"detailPage\" | \"otherLabel\" | ...;\n// ```\n// 1.2.2.1 Take key (crawler name), and filter for all routes where `route.crawler == key`\n// 1.2.2.2 Take the keys of these routes\n// 1.2.2.3 Generate `type ${key}Label = ${keys.map((s) => '\"' + s + '\"').join(' | ')}`\n// 1.3 Put it together:\n// ```ts\n// type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>\n// ```\n//\n\n// HOW TO GENERATE TYPE FOR MATCHERS?\n// ```ts\n// type `Label`Matcher = CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>\n// ```\n//\n// 1. For each route:\n// 1.1 Get `CrawlingContext`\n// 1.1.1 Take `mainCrawler`, and find corresponding crawler. If no `mainCrawler`, there should\n// be only 1 crawler, and take that (if more crawler, there should've been an error).\n// 1.1.2 Find the `crawler.type`\n// 1.1.3 Take corresponding type based on `crawler.type`, e.g. 'cheerio' => `CheerioCrawlingContext`;\n// 1.2 Get actor router context, e.g.:\n// ```ts\n// type `CrawlerName`RouterContext = CrawleeOneActorRouterCtx<`type`CrawlingContext, `CrawlerName`RouteLabel, AllActorInput>;\n// // NOTE: We use `AllActorInput` because since it's in the code, then we can handle ALL inputs\n// ```\n// 1.2.1 Use same CrawlingContext as in step 1.1 (e.g. `CheerioCrawlingContext`).\n// 1.2.2 Create `CrawlerName`Label type, e.g.\n// ```ts\n// type `CrawlerName`RouteLabel = \"detailPage\" | \"otherLabel\" | ...;\n// ```\n// 1.2.2.1 Take key (crawler name), and filter for all routes where `route.crawler == key`\n// 1.2.2.2 Take the keys of these routes\n// 1.2.2.3 Generate `type ${key}Label = ${keys.map((s) => '\"' + s + '\"').join(' | ')}`\n// 1.3 Put it together:\n// ```ts\n// type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>\n// ```\n//\n\n// NOTES:\n// - Enum with available route labels would be extracted from this definition.\n// - If there is only 1 crawler defined, all routes use that. If there is more crawlers,\n// they should define which crawler it relates to.\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// ...\n// }\n\n// crawler productDetailsCrawler {\n// type: cheerio\n// datasetId: '45678'\n// requestQueueId: 'abcdef'\n// options: {\n// ...\n// }\n// }\n\n// requestQueue extraQueue {\n// ...\n// }\n\n// dataset mainDataset {\n// datasetId: '45678'\n// ...\n// }\n\n// keyValueStore mainStore {\n// datasetId: 'xyz'\n// ...\n// }\n\n// route detailPage {\n// crawler: 'productDetailsCrawler',\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]|$)/i\n// handler: detailPageHandler,\n// }\n\n// route listing {\n// crawler: 'mainCrawler',\n// // Note: route object name is the 'label' by default, but label can be overriden\n// label: 'DETAIL_PAGE'\n// // NOTE: Otherwise `match` is a function that returns true/false\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// }\n// ```\n"]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAaiB;AACjB,mCAAgD;AAChD,+CAA2C;AAI3C,wDAA2D;AAC3D,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,6CAA0E;AAC1E,sCAWmB;AACnB,gCAAmE;AAWnE,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAC,CAAC,CAAmB,aAAnB,CAAC,uBAAD,CAAC,CAAoB,UAAU,MAAK,CAAmB,aAAnB,CAAC,uBAAD,CAAC,CAAoB,iBAAiB,CAAA,CAAC,CAAC;AACzF,CAAC,CAAC;AAEF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAIhB,KAAoF,EACpF,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACe,CAAC;IAE1C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AA6CF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACI,MAAM,aAAa,GAAG,CAQ3B,IAAiE,EAClD,EAAE;IACjB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,YAAY,GACb,GAAG,IAAI,CAAC;IAET,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,SAAS,EAAE,GAAG,WAAW,CAAC;IAE9D,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,CAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,KAAK,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,WAAW,kCAAO,WAAW,KAAE,EAAE,GAAE,EAAE,CAAC,CAAA,CAAC;QAEtF,MAAM,aAAa,GAGf;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,oBAAoB,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBACnC,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAA4C;oBAClF,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,4EAA4E;wBAC5E,oBAAoB,EAAE,IAAA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,eAAe,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,oBAAoB,mCAAI,IAAI;4BACpD,sBAAsB,EAAE,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,sBAAsB;yBAC1D,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;SACF,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,gBAAgB,iCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,oBAAoB,EAClB,MAAA,WAAW,CAAC,oBAAoB,mCAAK,aAAa,CAAC,oBAA4B,EACjF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxEW,QAAA,aAAa,iBAwExB;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,MAAM,gBAAgB,GAAG,CAOvB,MAA6E,EAChB,EAAE;IAC/D,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,SAAS,EAAE,GAAG,MAAM,CAAC;IAEzD,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAsB,CAAC;IACxD,MAAM,GAAG,GAAG,IAAI,aAAG,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,uBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnF,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,GAAG,IAAG,CAAC;IAE/D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,oBAAoB,GAAG,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,oBAAoB,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,kBAAkB;IAEnK,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,SAAS;QACT,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;QACL,GAAG;KACJ,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACmD,CAAC;IAE/D,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,6BAAoB,EAKxB;QACA,EAAE;QACF,MAAM;QACN,oBAAoB;QACpB,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IAEH,6BAA6B;IAC7B,MAAM,IAAA,yBAAgB,EACpB,MAAM,EACN,aAAa,EACb,EAAE,aAAa,EAAE,eAAe,EAAE,oBAAoB,EAAE,CACzD,CAAC;IAEF,2DAA2D;IAC3D,MAAM,iBAAiB,CAAC,SAA6B,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAAI,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAO7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAA+C,EAAE,EAAE;IAChF,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CAAC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAC3B,KAAiE,EACjE,EAAE;;IACF,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAAmC,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QACrF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAC/B,KAAiE,EACjE,EAAE;;IACF,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAuC,CAAO,OAAO,EAAE,OAAO,EAAE,EAAE;QACvF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAC9C,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGtC,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAC5B,KAAiE,EACjE,EAAE;;IACF,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,mCAAmC,oBAAoB,EAAE,CAAC,CAAC;QAC3E,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,IAAI,UAAU;YAAE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7C;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n Log,\n Request as CrawleeRequest,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultHandlers } from '../router/router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n LoggingActorInput,\n} from '../config';\nimport { logLevelHandlerWrapper, logLevelToCrawlee } from '../log';\nimport type { CrawleeOneTelemetry } from '../telemetry/types';\nimport type {\n CrawleeOneActorCtx,\n CrawleeOneActorDef,\n CrawleeOneHookCtx,\n CrawleeOneActorRouterCtx,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler)?.addHandler && (r as RouterHandler)?.addDefaultHandler);\n};\n\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<CrawleeOneActorCtx<any, Input, TIO, any, any>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies CrawleeOneHookCtx<Input, TIO>;\n\n const hookFn = eval(fnStr);\n if (!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Options available when creating default configuration for an opinionated Crawlee actor,\n * which is then run within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport interface RunCrawleeOneOptions<\n TCrawlerType extends CrawlerType,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>\n> {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>,\n 'router' | 'createCrawler' | 'io' | 'telemetry'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Callback with the created actor. The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;\n}\n\n/**\n * Create opinionated Crawlee crawler that uses, and run it within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * This function does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const runCrawleeOne = async <\n TType extends CrawlerType,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlerMeta<TType, any>['context'] = CrawlerMeta<TType, any>['context']\n>(\n args: RunCrawleeOneOptions<TType, Labels, Input, TIO, Telem, Ctx>\n): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n onActorReady,\n } = args;\n\n const { io = apifyIO as any as TIO, telemetry } = actorConfig;\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n await telemetry?.setup({ actorType, actorName, actorConfig: { ...actorConfig, io } });\n\n const actorDefaults: Pick<\n CrawleeOneActorDef<Labels, Input & AllActorInputs, TIO, Telem, Ctx>,\n 'router' | 'routeHandlerWrappers' | 'createCrawler'\n > = {\n router: Router.create<Ctx>(),\n routeHandlerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<CrawlerMeta<TType, any>['options'], Input>({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to telemetry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToTelemetry: input?.errorSendToTelemetry ?? true,\n onSendErrorToTelemetry: telemetry?.onSendErrorToTelemetry,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n };\n\n const actor = await createCrawleeOne<Labels, Input, TIO, Telem, Ctx>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routeHandlerWrappers:\n actorConfig.routeHandlerWrappers ?? (actorDefaults.routeHandlerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? (actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * NOTE: If you want to run a scraper, see {@link runCrawleeOne}. This is lower-level\n * function that should be used only if you want to override the default behaviour of runCrawleeOne.\n *\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nconst createCrawleeOne = async <\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>\n>(\n config: PickPartial<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'io'>\n): Promise<CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>> => {\n const { io = apifyIO as any as TIO, telemetry } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n const { logLevel } = (input ?? {}) as LoggingActorInput;\n const log = new Log({ level: logLevel ? logLevelToCrawlee[logLevel] : undefined });\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io, log });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routeHandlerWrappers = isFunc(config.routeHandlerWrappers) ? await config.routeHandlerWrappers(getConfig()) : config.routeHandlerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n telemetry,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n log,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultHandlers<\n Ctx,\n CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>,\n Labels,\n Input\n >({\n io,\n router,\n routeHandlerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n\n // Register labelled handlers\n await registerHandlers<Ctx, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Labels>(\n router,\n routeHandlers,\n { routerContext, handlerWrappers: routeHandlerWrappers }\n );\n\n // Now that the actor is ready, enqueue the URLs right away\n await scopedPushRequest(startUrls as CrawleeRequest[]);\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>\n>(\n actor: Omit<\n CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<CrawleeOneActorCtx, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? {}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: CrawleeOneActorCtx['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: CrawleeOneActorCtx['pushRequests'] = async (entries, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n actor.log.debug(`Loading start URLs from Dataset ${startUrlsFromDataset}`);\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n actor.log.debug(`Loading start URLs from function`);\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n if (urlsFromFn) urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
|
|
1
|
+
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAaiB;AACjB,mCAAgD;AAChD,+CAA2C;AAI3C,wDAA2D;AAC3D,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,6CAA0E;AAC1E,sCAWmB;AACnB,gCAAmE;AAWnE,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAC,CAAC,CAAmB,aAAnB,CAAC,uBAAD,CAAC,CAAoB,UAAU,MAAK,CAAmB,aAAnB,CAAC,uBAAD,CAAC,CAAoB,iBAAiB,CAAA,CAAC,CAAC;AACzF,CAAC,CAAC;AAEF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAIhB,KAAoF,EACpF,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACe,CAAC;IAE1C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AA6CF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACI,MAAM,aAAa,GAAG,CAQ3B,IAAiE,EAClD,EAAE;IACjB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,YAAY,GACb,GAAG,IAAI,CAAC;IAET,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,SAAS,EAAE,GAAG,WAAW,CAAC;IAE9D,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,CAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,KAAK,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,WAAW,kCAAO,WAAW,KAAE,EAAE,GAAE,EAAE,CAAC,CAAA,CAAC;QAEtF,MAAM,aAAa,GAGf;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,oBAAoB,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBACnC,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAA4C;oBAClF,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,4EAA4E;wBAC5E,oBAAoB,EAAE,IAAA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,eAAe,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,oBAAoB,mCAAI,IAAI;4BACpD,sBAAsB,EAAE,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,sBAAsB;yBAC1D,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;SACF,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,gBAAgB,iCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,oBAAoB,EAClB,MAAA,WAAW,CAAC,oBAAoB,mCAAK,aAAa,CAAC,oBAA4B,EACjF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxEW,QAAA,aAAa,iBAwExB;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,MAAM,gBAAgB,GAAG,CAOvB,MAA6E,EAChB,EAAE;IAC/D,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,SAAS,EAAE,GAAG,MAAM,CAAC;IAEzD,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAsB,CAAC;IACxD,MAAM,GAAG,GAAG,IAAI,aAAG,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,uBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnF,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,GAAG,IAAG,CAAC;IAE/D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,oBAAoB,GAAG,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,oBAAoB,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC,kBAAkB;IAEnK,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,SAAS;QACT,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;QACL,GAAG;KACJ,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACmD,CAAC;IAE/D,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,6BAAoB,EAKxB;QACA,EAAE;QACF,MAAM;QACN,oBAAoB;QACpB,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IAEH,6BAA6B;IAC7B,MAAM,IAAA,yBAAgB,EACpB,MAAM,EACN,aAAa,EACb,EAAE,aAAa,EAAE,eAAe,EAAE,oBAAoB,EAAE,CACzD,CAAC;IAEF,2DAA2D;IAC3D,MAAM,iBAAiB,CAAC,SAA6B,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAAI,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAO7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAA+C,EAAE,EAAE;IAChF,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CAAC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAC3B,KAAiE,EACjE,EAAE;;IACF,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAAmC,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QACrF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAC/B,KAAiE,EACjE,EAAE;;IACF,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAuC,CAAO,OAAO,EAAE,OAAO,EAAE,EAAE;QACvF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAC9C,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGtC,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAC5B,KAAiE,EACjE,EAAE;;IACF,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,mCAAmC,oBAAoB,EAAE,CAAC,CAAC;QAC3E,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,IAAI,UAAU;YAAE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7C;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n Log,\n Request as CrawleeRequest,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultHandlers } from '../router/router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n LoggingActorInput,\n} from '../config';\nimport { logLevelHandlerWrapper, logLevelToCrawlee } from '../log';\nimport type { CrawleeOneTelemetry } from '../telemetry/types';\nimport type {\n CrawleeOneActorCtx,\n CrawleeOneActorDef,\n CrawleeOneHookCtx,\n CrawleeOneActorRouterCtx,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler)?.addHandler && (r as RouterHandler)?.addDefaultHandler);\n};\n\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<CrawleeOneActorCtx<any, Input, TIO, any, any>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies CrawleeOneHookCtx<Input, TIO>;\n\n const hookFn = eval(fnStr);\n if (!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Options available when creating default configuration for an opinionated Crawlee actor,\n * which is then run within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport interface RunCrawleeOneOptions<\n TCrawlerType extends CrawlerType,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>\n> {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>,\n 'router' | 'createCrawler' | 'io' | 'telemetry'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Callback with the created actor. The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;\n}\n\n/**\n * Create opinionated Crawlee crawler that uses, and run it within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * This function does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const runCrawleeOne = async <\n TType extends CrawlerType,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlerMeta<TType, any>['context'] = CrawlerMeta<TType, any>['context']\n>(\n args: RunCrawleeOneOptions<TType, Labels, Input, TIO, Telem, Ctx>\n): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n onActorReady,\n } = args;\n\n const { io = apifyIO as any as TIO, telemetry } = actorConfig;\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n await telemetry?.setup({ actorType, actorName, actorConfig: { ...actorConfig, io } });\n\n const actorDefaults: Pick<\n CrawleeOneActorDef<Labels, Input & AllActorInputs, TIO, Telem, Ctx>,\n 'router' | 'routeHandlerWrappers' | 'createCrawler'\n > = {\n router: Router.create<Ctx>(),\n routeHandlerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<CrawlerMeta<TType, any>['options'], Input>({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to telemetry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToTelemetry: input?.errorSendToTelemetry ?? true,\n onSendErrorToTelemetry: telemetry?.onSendErrorToTelemetry,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n };\n\n const actor = await createCrawleeOne<Labels, Input, TIO, Telem, Ctx>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routeHandlerWrappers:\n actorConfig.routeHandlerWrappers ?? (actorDefaults.routeHandlerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? (actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * NOTE: If you want to run a scraper, see {@link runCrawleeOne}. This is lower-level\n * function that should be used only if you want to override the default behaviour of runCrawleeOne.\n *\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nconst createCrawleeOne = async <\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>\n>(\n config: PickPartial<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'io'>\n): Promise<CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>> => {\n const { io = apifyIO as any as TIO, telemetry } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n const { logLevel } = (input ?? {}) as LoggingActorInput;\n const log = new Log({ level: logLevel ? logLevelToCrawlee[logLevel] : undefined });\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io, log });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routeHandlerWrappers = isFunc(config.routeHandlerWrappers) ? await config.routeHandlerWrappers(getConfig()) : config.routeHandlerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n telemetry,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n log,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultHandlers<\n Ctx,\n CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>,\n Labels,\n Input\n >({\n io,\n router,\n routeHandlerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n\n // Register labelled handlers\n await registerHandlers<Labels, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Ctx>(\n router,\n routeHandlers,\n { routerContext, handlerWrappers: routeHandlerWrappers }\n );\n\n // Now that the actor is ready, enqueue the URLs right away\n await scopedPushRequest(startUrls as CrawleeRequest[]);\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>\n>(\n actor: Omit<\n CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<CrawleeOneActorCtx, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? {}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: CrawleeOneActorCtx['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: CrawleeOneActorCtx['pushRequests'] = async (entries, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (\n actor: Pick<CrawleeOneActorCtx, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n actor.log.debug(`Loading start URLs from Dataset ${startUrlsFromDataset}`);\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n actor.log.debug(`Loading start URLs from function`);\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n if (urlsFromFn) urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
|
|
@@ -4,7 +4,7 @@ import type { MaybePromise, PickPartial } from '../../utils/types';
|
|
|
4
4
|
import type { CrawlerUrl } from '../../types';
|
|
5
5
|
import type { itemCacheKey, pushData } from '../io/pushData';
|
|
6
6
|
import type { pushRequests } from '../io/pushRequests';
|
|
7
|
-
import type { CrawleeOneRouteHandler,
|
|
7
|
+
import type { CrawleeOneRouteHandler, CrawleeOneRoute, CrawleeOneRouteWrapper } from '../router/types';
|
|
8
8
|
import type { MetamorphActorInput } from '../config';
|
|
9
9
|
import type { CrawleeOneIO } from '../integrations/types';
|
|
10
10
|
import type { CrawleeOneTelemetry } from '../telemetry/types';
|
|
@@ -80,7 +80,7 @@ export interface CrawleeOneActorDef<Labels extends string = string, Input extend
|
|
|
80
80
|
* }],
|
|
81
81
|
* })
|
|
82
82
|
*/
|
|
83
|
-
routes: MaybeAsyncFn<
|
|
83
|
+
routes: MaybeAsyncFn<CrawleeOneRoute<Labels, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Ctx>[], [
|
|
84
84
|
CrawleeOneActorDefWithInput<Labels, Input, TIO, Telem, Ctx>
|
|
85
85
|
]>;
|
|
86
86
|
/** Handlers for the labelled requests. The object keys are the labels. */
|
|
@@ -155,7 +155,7 @@ export interface CrawleeOneActorCtx<Labels extends string = string, Input extend
|
|
|
155
155
|
startUrls: CrawlerUrl[];
|
|
156
156
|
proxy?: ProxyConfiguration;
|
|
157
157
|
router: RouterHandler<Ctx>;
|
|
158
|
-
routes:
|
|
158
|
+
routes: CrawleeOneRoute<Labels, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Ctx>[];
|
|
159
159
|
routeHandlers: Record<Labels, CrawleeOneRouteHandler<Ctx, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>>>;
|
|
160
160
|
/** Original config from which this actor context was created */
|
|
161
161
|
config: PickPartial<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'io'>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/actor/types.ts"],"names":[],"mappings":"","sourcesContent":["import type {\n BasicCrawler,\n CrawlingContext,\n Log,\n ProxyConfiguration,\n RouterHandler,\n} from 'crawlee';\nimport type { gotScraping } from 'got-scraping';\n\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport type { CrawlerUrl } from '../../types';\nimport type { itemCacheKey, pushData } from '../io/pushData';\nimport type { pushRequests } from '../io/pushRequests';\nimport type {\n CrawleeOneRouteHandler,\n
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/actor/types.ts"],"names":[],"mappings":"","sourcesContent":["import type {\n BasicCrawler,\n CrawlingContext,\n Log,\n ProxyConfiguration,\n RouterHandler,\n} from 'crawlee';\nimport type { gotScraping } from 'got-scraping';\n\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport type { CrawlerUrl } from '../../types';\nimport type { itemCacheKey, pushData } from '../io/pushData';\nimport type { pushRequests } from '../io/pushRequests';\nimport type {\n CrawleeOneRouteHandler,\n CrawleeOneRoute,\n CrawleeOneRouteWrapper,\n} from '../router/types';\nimport type { MetamorphActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport type { CrawleeOneTelemetry } from '../telemetry/types';\n\ntype MaybeAsyncFn<R, Args extends any[]> = R | ((...args: Args) => MaybePromise<R>);\n\ntype OrigRunCrawler<T extends CrawlingContext<any, any>> = BasicCrawler<T>['run'];\n\n/** Extended type of `crawler.run()` function */\nexport type RunCrawler<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>> = (\n requests?: CrawlerUrl[],\n options?: Parameters<OrigRunCrawler<Ctx>>[1]\n) => ReturnType<OrigRunCrawler<Ctx>>;\n\n/** Trigger actor metamorph, using actor's inputs as defaults. */\nexport type Metamorph = (overrides?: MetamorphActorInput) => Promise<void>;\n\n/** Context passed from actor to route handlers */\nexport type CrawleeOneActorRouterCtx<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>\n> = {\n actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>;\n};\n\n/** Context passed to user-defined functions passed from input */\nexport type CrawleeOneHookCtx<\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = Pick<CrawleeOneActorCtx<any, Input>, 'input' | 'state'> & {\n io: TIO;\n itemCacheKey: typeof itemCacheKey;\n sendRequest: typeof gotScraping;\n};\n\n/** All that's necessary to define a single CrawleeOne actor/crawler. */\nexport interface CrawleeOneActorDef<\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>\n> {\n /** Client for communicating with cloud/local storage. */\n io: TIO;\n /** Client for telemetry like tracking errors. */\n telemetry?: Telem;\n\n // Actor input\n /**\n * Actor input which you can get e.g. via `Actor.getInput()`\n *\n * Input is automatically retrieved if undefined.\n */\n input?: MaybeAsyncFn<Input, [CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>]>;\n /** Validation for the actor input. Should throw error if validation fails. */\n validateInput?: (input: Input | null) => MaybePromise<void>;\n\n // Router setup\n /**\n * Router instance that redirects the request to handlers.\n * @example\n * import { createCheerioRouter } from 'crawlee';\n *\n * ({\n * ...\n * router: createCheerioRouter(),\n * })\n */\n router: MaybeAsyncFn<\n RouterHandler<Ctx>,\n [CrawleeOneActorDefWithInput<Labels, Input, TIO, Telem, Ctx>]\n >;\n /**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n *\n * @example\n * ({\n * ...\n * routes: [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * handlerLabel: routeLabels.JOB_DETAIL,\n * match: (url) => isUrlOfJobOffer(url),\n * }, {\n * // Define custom action function:\n * // If match returns true, we replace this request with new one\n * // pointing to new domain.\n * name: 'Main page',\n * handlerLabel: null,\n * match: (url) => url.match(/example\\.com\\/?(?:[?#~]|$)/i),\n * action: async (url, ctx, _, handlers) => {\n * ctx.log.info(`Redirecting to https://www.new-domain.com`);\n * await ctx.crawler.addRequests(['https://www.new-domain.com'], { forefront: true });\n * },\n * }],\n * })\n */\n routes: MaybeAsyncFn<\n CrawleeOneRoute<Labels, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Ctx>[],\n [CrawleeOneActorDefWithInput<Labels, Input, TIO, Telem, Ctx>]\n >;\n /** Handlers for the labelled requests. The object keys are the labels. */\n routeHandlers: MaybeAsyncFn<\n Record<Labels, CrawleeOneRouteHandler<Ctx, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>>>,\n [CrawleeOneActorDefWithInput<Labels, Input, TIO, Telem, Ctx>]\n >; // prettier-ignore\n /**\n * Provides the option to modify or extend all router handlers by wrapping\n * them in these functions.\n *\n * Wrappers are applied from right to left. That means that wrappers `[A, B, C]`\n * will be applied like so `A( B( C( handler ) ) )`.\n *\n * Default `routeHandlerWrappers`:\n * ```js\n * {\n * ...\n * routeHandlerWrappers: ({ input }) => [\n * logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n * ],\n * }\n * ```\n */\n routeHandlerWrappers?: MaybeAsyncFn<\n CrawleeOneRouteWrapper<Ctx, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>>[],\n [CrawleeOneActorDefWithInput<Labels, Input, TIO, Telem, Ctx>]\n >; // prettier-ignore\n\n // Proxy setup\n proxy?: MaybeAsyncFn<\n ProxyConfiguration,\n [CrawleeOneActorDefWithInput<Labels, Input, TIO, Telem, Ctx>]\n >; // prettier-ignore\n\n // Crawler setup\n createCrawler: (\n actorCtx: Omit<\n CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>,\n 'crawler' | 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n ) => MaybePromise<Ctx['crawler']>;\n}\n\n/** CrawleeOneActorDef object where the input is already resolved */\nexport type CrawleeOneActorDefWithInput<\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>\n> = Omit<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'input'> & {\n input: Input | null;\n state: Record<string, unknown>;\n};\n\n/** Context available while creating a Crawlee crawler/actor */\nexport interface CrawleeOneActorCtx<\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO,\n Telem extends CrawleeOneTelemetry<any, any> = CrawleeOneTelemetry<any, any>,\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>\n> {\n crawler: Ctx['crawler'];\n /**\n * This function wraps `crawler.run(requests, runOtions)` with additional\n * features:\n * - Automatically metamorph into another actor after the run finishes\n */\n runCrawler: RunCrawler<Ctx>;\n /** Trigger actor metamorph, using actor's inputs as defaults. */\n metamorph: Metamorph;\n /**\n * `Actor.pushData` with extra optional features:\n *\n * - Limit the number of entries pushed to the Dataset based on the Actor input\n * - Transform and filter entries via Actor input.\n * - Add metadata to entries before they are pushed to Dataset.\n * - Set which (nested) properties are personal data optionally redact them for privacy compliance.\n */\n pushData: typeof pushData;\n /**\n * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:\n *\n * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.\n * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.\n */\n pushRequests: typeof pushRequests;\n /**\n * A list of resolved Requests to be scraped.\n *\n * This list is a combination of 3 Actor inputs:\n * - `startUrls` - Static list of URLs to scrape.\n * - `startUrlsFromDataset` - From a specific field from a Dataset (e.g. \"dataset123#fieldName\" - Dataset: \"dataset123\", field: \"fieldName\").\n * - `startUrlsFromFunction` - A function that is evaulated to generate the Requests.\n */\n startUrls: CrawlerUrl[];\n proxy?: ProxyConfiguration;\n router: RouterHandler<Ctx>;\n routes: CrawleeOneRoute<Labels, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>, Ctx>[];\n routeHandlers: Record<\n Labels,\n CrawleeOneRouteHandler<Ctx, CrawleeOneActorRouterCtx<Ctx, Labels, Input, TIO, Telem>>\n >;\n /** Original config from which this actor context was created */\n config: PickPartial<CrawleeOneActorDef<Labels, Input, TIO, Telem, Ctx>, 'io'>;\n /** Read-only inputs passed to the actor */\n input: Input | null;\n /** Mutable state that is shared across setup and teardown hooks */\n state: Record<string, unknown>;\n /**\n * Instance managing communication with databases - storage & retrieval\n * (Dataset, RequestQueue, KeyValueStore).\n *\n * This is modelled and similar to Apify's `Actor` static class.\n */\n io: TIO;\n /** Instance managing telemetry like tracking errors. */\n telemetry?: Telem;\n /** Crawlee Log instance. */\n log: Log;\n}\n"]}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import type { CrawlingContext, RouterHandler as CrawlerRouter } from 'crawlee';
|
|
2
2
|
import type { CrawleeOneIO } from '../integrations/types';
|
|
3
|
-
import type { CrawleeOneRouteWrapper, CrawleeOneRouteHandler,
|
|
3
|
+
import type { CrawleeOneRouteWrapper, CrawleeOneRouteHandler, CrawleeOneRoute } from './types';
|
|
4
4
|
/**
|
|
5
5
|
* Register many handlers at once onto the Crawlee's RouterHandler.
|
|
6
6
|
*
|
|
@@ -26,7 +26,7 @@ import type { CrawleeOneRouteWrapper, CrawleeOneRouteHandler, CrawleeOneRouteMat
|
|
|
26
26
|
*
|
|
27
27
|
* The entries on the `routerContext` object will be made available to all handlers.
|
|
28
28
|
*/
|
|
29
|
-
export declare const registerHandlers: <
|
|
29
|
+
export declare const registerHandlers: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary> = CrawlingContext<unknown, import("crawlee").Dictionary>>(router: CrawlerRouter<CrawlerCtx>, routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>, options?: {
|
|
30
30
|
routerContext?: RouterCtx | undefined;
|
|
31
31
|
handlerWrappers?: CrawleeOneRouteWrapper<CrawlerCtx, RouterCtx>[] | undefined;
|
|
32
32
|
} | undefined) => Promise<void>;
|
|
@@ -84,7 +84,7 @@ export declare const setupDefaultHandlers: <CrawlerCtx extends CrawlingContext<u
|
|
|
84
84
|
router: CrawlerRouter<CrawlerCtx>;
|
|
85
85
|
routeHandlerWrappers?: CrawleeOneRouteWrapper<CrawlerCtx, RouterCtx>[] | undefined;
|
|
86
86
|
routerContext?: RouterCtx | undefined;
|
|
87
|
-
routes:
|
|
87
|
+
routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
88
88
|
routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>;
|
|
89
89
|
input?: Input | null | undefined;
|
|
90
90
|
}) => Promise<void>;
|
|
@@ -25,6 +25,23 @@ const applyWrappersRight = (fn, wrappers = []) => {
|
|
|
25
25
|
return wrapper(interimFn);
|
|
26
26
|
}), Promise.resolve(fn));
|
|
27
27
|
};
|
|
28
|
+
const resolveRoutes = (routes) => routes.map((route) => {
|
|
29
|
+
const matchers = Array.isArray(route.match) ? route.match : [route.match];
|
|
30
|
+
if (!matchers.length) {
|
|
31
|
+
throw Error(`Route ${route.name} (label: ${route.handlerLabel}) has NO "match" item. It can be a RegExp, function, or array of the two.`); // prettier-ignore
|
|
32
|
+
}
|
|
33
|
+
const resolvedMatchers = matchers.map((matcher) => {
|
|
34
|
+
if (typeof matcher === 'function')
|
|
35
|
+
return matcher;
|
|
36
|
+
if (typeof matcher === 'string' || matcher instanceof RegExp) {
|
|
37
|
+
const newMatcherFn = (url) => !!url.match(matcher);
|
|
38
|
+
return newMatcherFn;
|
|
39
|
+
}
|
|
40
|
+
// We shouldn't get here!
|
|
41
|
+
throw Error(`Route ${route.name} (label: ${route.handlerLabel}) has INVALID "match" item. It can be only RegExp, function, or array of the two. Got ${matcher}`); // prettier-ignore
|
|
42
|
+
});
|
|
43
|
+
return Object.assign(Object.assign({}, route), { match: resolvedMatchers });
|
|
44
|
+
});
|
|
28
45
|
/**
|
|
29
46
|
* Register many handlers at once onto the Crawlee's RouterHandler.
|
|
30
47
|
*
|
|
@@ -63,6 +80,7 @@ const registerHandlers = (router, routeHandlers, options) => __awaiter(void 0, v
|
|
|
63
80
|
exports.registerHandlers = registerHandlers;
|
|
64
81
|
const createDefaultHandler = (input) => {
|
|
65
82
|
const { io, routes, routeHandlers, requestQueueId, perfBatchSize, perfBatchWaitSecs } = input;
|
|
83
|
+
const resolvedRoutes = resolveRoutes(routes);
|
|
66
84
|
// NOTE: Because we "clear" the queue by replacing it,
|
|
67
85
|
// we need to always call `openRequestQueue` to ensure we use the latest instance
|
|
68
86
|
const openQueue = () => io.openRequestQueue(requestQueueId);
|
|
@@ -127,8 +145,10 @@ const createDefaultHandler = (input) => {
|
|
|
127
145
|
const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize !== null && perfBatchSize !== void 0 ? perfBatchSize : 1}. URL: ${url}`;
|
|
128
146
|
// Find route handler for given URL
|
|
129
147
|
log.debug(`Searching for a handler for given Request. ${logSuffix}`);
|
|
130
|
-
const route = yield (0, async_1.serialAsyncFind)(
|
|
131
|
-
const isMatch = yield currRoute.match(
|
|
148
|
+
const route = yield (0, async_1.serialAsyncFind)(resolvedRoutes, (currRoute) => __awaiter(void 0, void 0, void 0, function* () {
|
|
149
|
+
const isMatch = yield (0, async_1.serialAsyncFind)(currRoute.match, (matchFn) => __awaiter(void 0, void 0, void 0, function* () {
|
|
150
|
+
return matchFn(url, ctx, currRoute, routeHandlers);
|
|
151
|
+
}));
|
|
132
152
|
return isMatch;
|
|
133
153
|
}));
|
|
134
154
|
// Run the handler
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../../../src/lib/router/router.ts"],"names":[],"mappings":";;;;;;;;;;;;AASA,6CAA0D;AAW1D,wHAAwH;AAExH;;;;;;GAMG;AACH,MAAM,kBAAkB,GAAG,CAIzB,EAAO,EACP,WAAoB,EAAE,EACtB,EAAE;IACF,OAAO,QAAQ,CAAC,WAAW,CAAe,CAAO,gBAAgB,EAAE,OAAO,EAAE,EAAE;QAC5E,MAAM,SAAS,GAAG,MAAM,gBAAgB,CAAC;QACzC,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC;IAC5B,CAAC,CAAA,EAAE,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC;AAC1B,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACI,MAAM,gBAAgB,GAAG,CAK9B,MAAiC,EACjC,aAA4E,EAC5E,OAGC,EACD,EAAE;IACF,MAAM,EAAE,aAAa,EAAE,eAAe,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEzD,mBAAmB;IACnB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE;QAC1D,4CAA4C;QAC5C,MAAM,cAAc,GAAG,MAAM,kBAAkB,CAC7C,OAA6E,EAC7E,eAAe,aAAf,eAAe,cAAf,eAAe,GAAI,EAAE,CACtB,CAAC;QAEF,sCAAsC;QACtC,MAAM,MAAM,CAAC,UAAU,CAAa,GAAG,EAAE,CAAO,GAAG,EAAE,EAAE,kDACrD,OAAA,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CAAA,GAAA,CACpD,CAAC;KACH;AACH,CAAC,CAAA,CAAC;AA3BW,QAAA,gBAAgB,oBA2B3B;AAEF,MAAM,oBAAoB,GAAG,CAK3B,KAK2C,EAC3C,EAAE;IACF,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,aAAa,EAAE,cAAc,EAAE,aAAa,EAAE,iBAAiB,EAAE,GAAG,KAAK,CAAC;IAE9F,sDAAsD;IACtD,iFAAiF;IACjF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;IAE5D,MAAM,YAAY,GAAG,CAAO,GAA0B,EAAE,EAAE;QACxD,IAAI,CAAC,GAAG;YAAE,OAAO;QACjB,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;QACnC,MAAM,QAAQ,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;IACzC,CAAC,CAAA,CAAC;IAEF,MAAM,eAAe,GAAG,CAAO,MAAc,EAAE,OAAoC,EAAE,EAAE;;QACrF,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;QAEpC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,0CAA0C,MAAM,EAAE,CAAC,CAAC;QAE/D,IAAI,iBAAiB;YAAE,MAAM,IAAA,YAAI,EAAC,iBAAiB,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAA,CAAC,MAAM,QAAQ,CAAC,gBAAgB,EAAE,CAAC,mCAAI,IAAI,CAAC;QAE3D,IAAI,MAAM,EAAE;YACV,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,mCAAmC,MAAM,EAAE,CAAC,CAAC;YAExD,oEAAoE;YACpE,wEAAwE;YACxE,0CAA0C;YAC1C,IAAI,IAAI,aAAJ,IAAI,uBAAJ,IAAI,CAAE,IAAI;gBAAE,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;SAC7C;aAAM;YACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;SACxD;QACD,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,MAAM,OAAO,GAAG,CAAO,GAAQ,EAAE,GAA0B,EAAE,GAAQ,EAAE,EAAE;QACvE,GAAG,CAAC,KAAK,CAAC,gEAAgE,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,GAAG,CAAC,CAAC,CAAC,kBAAkB;QAC5H,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACf,iFAAiF;QACjF,IAAI,GAAG,EAAE;YACP,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;YACnC,MAAM,QAAQ,CAAC,cAAc,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;SACzD;IACH,CAAC,CAAA,CAAC;IAEF,uEAAuE;IACvE,kBAAkB;IAClB,MAAM,aAAa,GAAoE,CAAO,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE;QAC/G,MAAM,OAAO,GAAG,KAAK,CAAC,YAAY,IAAI,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAChF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,8BAA8B,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,YAAY,sCAAsC,GAAG,EAAE,CAAC,CAAC,CAAC,kBAAkB;YAC7I,OAAO;SACR;QACD,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,0BAA0B,KAAK,CAAC,YAAY,UAAU,GAAG,EAAE,CAAC,CAAC;QAC1E,MAAM,OAAO,CAAC,GAAU,CAAC,CAAC;IAC5B,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CACrB,GAAM,EACS,EAAE;;QACjB,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,GAAG,CAAC;QACrC,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAErD,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,IAAI,GAAG,GAA0B,MAAA,GAAG,CAAC,OAAO,mCAAI,IAAI,CAAC;QAErD,MAAM,YAAY,GAAG,GAAG,EAAE,CACxB,aAAa,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,oBAAoB,GAAG,aAAa,CAAC;QAE/E,MAAM,MAAM,GAAG,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,IAA0B,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,GAAI,CAAC,SAAS,IAAI,GAAI,CAAC,GAAG,CAAC,CAAC;QAE7F,MAAM,SAAS,GAAG,GAAS,EAAE;;YAC3B,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,SAAS,oBAAoB,GAAG,CAAC,OAAO,aAAa,aAAb,aAAa,cAAb,aAAa,GAAI,CAAC,UAAU,GAAG,EAAE,CAAC;YAE5F,mCAAmC;YACnC,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,MAAM,IAAA,uBAAe,EAAC,MAAM,EAAE,CAAO,SAAS,EAAE,EAAE;gBAC9D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;gBAC1E,OAAO,OAAO,CAAC;YACjB,CAAC,CAAA,CAAC,CAAC;YAEH,kBAAkB;YAClB,IAAI,KAAK,EAAE;gBACT,GAAG,CAAC,IAAI,CAAC,qBAAqB,KAAK,CAAC,IAAI,mBAAmB,KAAK,CAAC,YAAY,MAAM,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBACnH,MAAM,MAAM,GAAG,MAAA,KAAK,CAAC,MAAM,mCAAI,aAAa,CAAC;gBAC7C,MAAM,MAAM,CAAC,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;aAC9C;iBAAM;gBACL,GAAG,CAAC,KAAK,CAAC,oDAAoD,SAAS,EAAE,CAAC,CAAC;aAC5E;YAED,yCAAyC;YACzC,MAAM,YAAY,CAAC,GAAG,CAAC,CAAC;YACxB,oBAAoB,EAAE,CAAC;YAEvB,GAAG,GAAG,MAAM,eAAe,CAAC,SAAS,EAAE,EAAE,IAAI,EAAE,IAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QACtE,CAAC,CAAA,CAAC;QAEF,IAAI;YACF,GAAG;gBACD,MAAM,SAAS,EAAE,CAAC;aACnB,QAAQ,YAAY,EAAE,EAAE;SAC1B;QAAC,OAAO,GAAG,EAAE;YACZ,MAAM,OAAO,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;SAC9B;IACH,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACI,MAAM,oBAAoB,GAAG,CAKlC,EACA,EAAE,EACF,MAAM,EACN,oBAAoB,EACpB,aAAa,EACb,MAAM,EACN,aAAa,EACb,KAAK,GASN,EAAE,EAAE;IACH,MAAM,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CACtD,CAAC;IAEpB,MAAM,cAAc,GAAG,oBAAoB,CAAC;QAC1C,EAAE;QACF,MAAM;QACN,aAAa;QACb,cAAc;QACd,aAAa;QACb,iBAAiB;KAClB,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,MAAM,kBAAkB,CAAC,cAAc,EAAE,oBAAoB,aAApB,oBAAoB,cAApB,oBAAoB,GAAI,EAAE,CAAC,CAAC;IAC5F,MAAM,MAAM,CAAC,iBAAiB,CAAa,CAAC,GAAG,EAAE,EAAE,CACjD,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CACpD,CAAC;AACJ,CAAC,CAAA,CAAC;AAtCW,QAAA,oBAAoB,wBAsC/B","sourcesContent":["import type {\n CrawlingContext,\n RouterHandler as CrawlerRouter,\n Request as CrawlerRequest,\n Log,\n} from 'crawlee';\nimport type { CommonPage } from '@crawlee/browser-pool';\nimport type { Page } from 'playwright';\n\nimport { serialAsyncFind, wait } from '../../utils/async';\nimport type { MaybePromise } from '../../utils/types';\nimport type { PerfActorInput, RequestActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport type {\n CrawleeOneRouteWrapper,\n CrawleeOneRouteHandler,\n CrawleeOneRouteMatcher,\n CrawleeOneRouteCtx,\n} from './types';\n\n// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks\n\n/**\n * Given a function `fn` and a list of wrappers [a, b, c, ...],\n * wrap the function to generate composite `a( b( c( fn ) ) )`.\n *\n * That is, the wrappers on the left side of the array (start)\n * wrap those on the right side of the array (end).\n */\nconst applyWrappersRight = <\n TFn extends (...args: any[]) => any,\n TWrap extends (fn: TFn) => MaybePromise<TFn>\n>(\n fn: TFn,\n wrappers: TWrap[] = []\n) => {\n return wrappers.reduceRight<Promise<TFn>>(async (interimFnPromise, wrapper) => {\n const interimFn = await interimFnPromise;\n return wrapper(interimFn);\n }, Promise.resolve(fn));\n};\n\n/**\n * Register many handlers at once onto the Crawlee's RouterHandler.\n *\n * The labels under which the handlers are registered are the respective object keys.\n *\n * Example:\n *\n * ```js\n * registerHandlers(router, { labelA: fn1, labelB: fn2 });\n * ```\n *\n * Is similar to:\n * ```js\n * router.addHandler(labelA, fn1)\n * router.addHandler(labelB, fn2)\n * ```\n *\n * You can also specify a list of wrappers to override the behaviour of all handlers\n * all at once.\n *\n * A list of wrappers `[a, b, c]` will be applied to the handlers right-to-left as so\n * `a( b( c( handler ) ) )`.\n *\n * The entries on the `routerContext` object will be made available to all handlers.\n */\nexport const registerHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(\n router: CrawlerRouter<CrawlerCtx>,\n routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>,\n options?: {\n routerContext?: RouterCtx;\n handlerWrappers?: CrawleeOneRouteWrapper<CrawlerCtx, RouterCtx>[];\n }\n) => {\n const { routerContext, handlerWrappers } = options ?? {};\n\n // For each handler\n for (const [key, handler] of Object.entries(routeHandlers)) {\n // First apply all wrappers onto the handler\n const wrappedHandler = await applyWrappersRight(\n handler as (ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>) => Promise<void>,\n handlerWrappers ?? []\n );\n\n // Then register the composite handler\n await router.addHandler<CrawlerCtx>(key, async (ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n }\n};\n\nconst createDefaultHandler = <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(\n input: {\n io: CrawleeOneIO;\n routes: CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>[];\n routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>;\n } & PerfActorInput &\n Pick<RequestActorInput, 'requestQueueId'>\n) => {\n const { io, routes, routeHandlers, requestQueueId, perfBatchSize, perfBatchWaitSecs } = input;\n\n // NOTE: Because we \"clear\" the queue by replacing it,\n // we need to always call `openRequestQueue` to ensure we use the latest instance\n const openQueue = () => io.openRequestQueue(requestQueueId);\n\n const closeRequest = async (req: CrawlerRequest | null) => {\n if (!req) return;\n const reqQueue = await openQueue();\n await reqQueue.markRequestHandled(req);\n };\n\n const loadNextRequest = async (suffix: string, options?: { page?: Page; log?: Log }) => {\n const { page, log } = options ?? {};\n\n log?.debug(`Checking for new Request in the queue. ${suffix}`);\n\n if (perfBatchWaitSecs) await wait(perfBatchWaitSecs);\n const reqQueue = await openQueue();\n const newReq = (await reqQueue.fetchNextRequest()) ?? null;\n\n if (newReq) {\n log?.debug(`Found new Request in the queue. ${suffix}`);\n\n // WARNING - For each subsequent Request, it must be loaded manually\n // Hence, batching is suitable only for browser-based Crawlers\n // like Playwright or Puppeteer.\n if (page?.goto) await page.goto(newReq.url);\n } else {\n log?.debug(`No more Requests in the queue. ${suffix}`);\n }\n return newReq;\n };\n\n const onError = async (err: any, req: CrawlerRequest | null, log: Log) => {\n log.error(`Failed to process a request, returning it to the queue. URL: ${req?.loadedUrl || req?.url}.`); // prettier-ignore\n log.error(err);\n // Reinsert the request into the queue if we failed to process it due to an error\n if (req) {\n const reqQueue = await openQueue();\n await reqQueue.reclaimRequest(req, { forefront: true });\n }\n };\n\n /** Redirect the URL to the labelled route identical to route's name */\n // prettier-ignore\n const defaultAction: CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>['action'] = async (url, ctx, route) => {\n const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];\n if (!handler) {\n ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore\n return;\n }\n ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);\n await handler(ctx as any);\n };\n\n const defaultHandler = async <T extends CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>>(\n ctx: T\n ): Promise<void> => {\n const { page, log: parentLog } = ctx;\n const log = parentLog.child({ prefix: '[Router] ' });\n\n let handledRequestsCount = 0;\n let req: CrawlerRequest | null = ctx.request ?? null;\n\n const hasBatchReqs = () =>\n perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;\n\n const getUrl = () => (page ? (page as any as CommonPage).url() : req!.loadedUrl || req!.url);\n\n const onRequest = async () => {\n const url = await getUrl();\n const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize ?? 1}. URL: ${url}`;\n\n // Find route handler for given URL\n log.debug(`Searching for a handler for given Request. ${logSuffix}`);\n const route = await serialAsyncFind(routes, async (currRoute) => {\n const isMatch = await currRoute.match(url, ctx, currRoute, routeHandlers);\n return isMatch;\n });\n\n // Run the handler\n if (route) {\n log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore\n const action = route.action ?? defaultAction;\n await action(url, ctx, route, routeHandlers);\n } else {\n log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);\n }\n\n // Clean up and move onto another request\n await closeRequest(req);\n handledRequestsCount++;\n\n req = await loadNextRequest(logSuffix, { page: page as Page, log });\n };\n\n try {\n do {\n await onRequest();\n } while (hasBatchReqs());\n } catch (err) {\n await onError(err, req, log);\n }\n };\n\n return defaultHandler;\n};\n\n/**\n * Configures the default router handler to redirect URLs to labelled route handlers\n * based on which route the URL matches first.\n *\n * NOTE: This does mean that the URLs passed to this default handler will be fetched\n * twice (as the URL will be requeued to the correct handler). We recommend to use this\n * function only in the scenarios where there is a small number of startUrls, yet these\n * may need various ways of processing based on different paths or etc.\n *\n * @example\n *\n * const routeLabels = {\n * MAIN_PAGE: 'MAIN_PAGE',\n * JOB_LISTING: 'JOB_LISTING',\n * JOB_DETAIL: 'JOB_DETAIL',\n * JOB_RELATED_LIST: 'JOB_RELATED_LIST',\n * PARTNERS: 'PARTNERS',\n * } as const;\n *\n * const router = createPlaywrightRouter();\n *\n * const routes = createPlaywrightCrawleeOneRouteMatchers<typeof routeLabels>([\n * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * },\n *\n * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * action: async (ctx) => {\n * await ctx.crawler.addRequests([{\n * url: 'https://profesia.sk/praca',\n * label: routeLabels.JOB_LISTING,\n * }]);\n * },\n * },\n * ]);\n *\n * // Set up default route to redirect to labelled routes\n * setupDefaultHandlers({ router, routes });\n *\n * // Now set up the labelled routes\n * await router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }\n */\nexport const setupDefaultHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>\n>({\n io,\n router,\n routeHandlerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n}: {\n io: CrawleeOneIO;\n router: CrawlerRouter<CrawlerCtx>;\n routeHandlerWrappers?: CrawleeOneRouteWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routes: CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>[];\n routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>;\n input?: Input | null;\n}) => {\n const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {}) as PerfActorInput &\n RequestActorInput;\n\n const defaultHandler = createDefaultHandler({\n io,\n routes,\n routeHandlers,\n requestQueueId,\n perfBatchSize,\n perfBatchWaitSecs,\n });\n\n const wrappedHandler = await applyWrappersRight(defaultHandler, routeHandlerWrappers ?? []);\n await router.addDefaultHandler<CrawlerCtx>((ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n};\n"]}
|
|
1
|
+
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../../../src/lib/router/router.ts"],"names":[],"mappings":";;;;;;;;;;;;AASA,6CAA0D;AAY1D,wHAAwH;AAExH;;;;;;GAMG;AACH,MAAM,kBAAkB,GAAG,CAIzB,EAAO,EACP,WAAoB,EAAE,EACtB,EAAE;IACF,OAAO,QAAQ,CAAC,WAAW,CAAe,CAAO,gBAAgB,EAAE,OAAO,EAAE,EAAE;QAC5E,MAAM,SAAS,GAAG,MAAM,gBAAgB,CAAC;QACzC,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC;IAC5B,CAAC,CAAA,EAAE,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC;AAC1B,CAAC,CAAC;AAEF,MAAM,aAAa,GAAG,CAKpB,MAAwD,EACxD,EAAE,CACF,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;IACnB,MAAM,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAC1E,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE;QACpB,MAAM,KAAK,CAAC,SAAS,KAAK,CAAC,IAAI,YAAY,KAAK,CAAC,YAAY,2EAA2E,CAAC,CAAC,CAAC,kBAAkB;KAC9J;IAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,GAAG,CACnC,CAAC,OAAO,EAAE,EAAE;QACV,IAAI,OAAO,OAAO,KAAK,UAAU;YAAE,OAAO,OAAO,CAAC;QAClD,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,OAAO,YAAY,MAAM,EAAE;YAC5D,MAAM,YAAY,GAA4D,CAAC,GAAG,EAAE,EAAE,CACpF,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACvB,OAAO,YAAY,CAAC;SACrB;QACD,yBAAyB;QACzB,MAAM,KAAK,CAAC,SAAS,KAAK,CAAC,IAAI,YAAY,KAAK,CAAC,YAAY,yFAAyF,OAAO,EAAE,CAAC,CAAC,CAAC,kBAAkB;IACtL,CAAC,CACF,CAAC;IAEF,uCAAY,KAAK,KAAE,KAAK,EAAE,gBAAgB,IAAG;AAC/C,CAAC,CAAC,CAAC;AAEL;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACI,MAAM,gBAAgB,GAAG,CAK9B,MAAiC,EACjC,aAA4E,EAC5E,OAGC,EACD,EAAE;IACF,MAAM,EAAE,aAAa,EAAE,eAAe,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEzD,mBAAmB;IACnB,KAAK,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE;QAC1D,4CAA4C;QAC5C,MAAM,cAAc,GAAG,MAAM,kBAAkB,CAC7C,OAA6E,EAC7E,eAAe,aAAf,eAAe,cAAf,eAAe,GAAI,EAAE,CACtB,CAAC;QAEF,sCAAsC;QACtC,MAAM,MAAM,CAAC,UAAU,CAAa,GAAG,EAAE,CAAO,GAAG,EAAE,EAAE,kDACrD,OAAA,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CAAA,GAAA,CACpD,CAAC;KACH;AACH,CAAC,CAAA,CAAC;AA3BW,QAAA,gBAAgB,oBA2B3B;AAEF,MAAM,oBAAoB,GAAG,CAK3B,KAK2C,EAC3C,EAAE;IACF,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,aAAa,EAAE,cAAc,EAAE,aAAa,EAAE,iBAAiB,EAAE,GAAG,KAAK,CAAC;IAE9F,MAAM,cAAc,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAE7C,sDAAsD;IACtD,iFAAiF;IACjF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;IAE5D,MAAM,YAAY,GAAG,CAAO,GAA0B,EAAE,EAAE;QACxD,IAAI,CAAC,GAAG;YAAE,OAAO;QACjB,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;QACnC,MAAM,QAAQ,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;IACzC,CAAC,CAAA,CAAC;IAEF,MAAM,eAAe,GAAG,CAAO,MAAc,EAAE,OAAoC,EAAE,EAAE;;QACrF,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;QAEpC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,0CAA0C,MAAM,EAAE,CAAC,CAAC;QAE/D,IAAI,iBAAiB;YAAE,MAAM,IAAA,YAAI,EAAC,iBAAiB,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAA,CAAC,MAAM,QAAQ,CAAC,gBAAgB,EAAE,CAAC,mCAAI,IAAI,CAAC;QAE3D,IAAI,MAAM,EAAE;YACV,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,mCAAmC,MAAM,EAAE,CAAC,CAAC;YAExD,oEAAoE;YACpE,wEAAwE;YACxE,0CAA0C;YAC1C,IAAI,IAAI,aAAJ,IAAI,uBAAJ,IAAI,CAAE,IAAI;gBAAE,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;SAC7C;aAAM;YACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;SACxD;QACD,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,MAAM,OAAO,GAAG,CAAO,GAAQ,EAAE,GAA0B,EAAE,GAAQ,EAAE,EAAE;QACvE,GAAG,CAAC,KAAK,CAAC,gEAAgE,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,GAAG,CAAC,CAAC,CAAC,kBAAkB;QAC5H,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACf,iFAAiF;QACjF,IAAI,GAAG,EAAE;YACP,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;YACnC,MAAM,QAAQ,CAAC,cAAc,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;SACzD;IACH,CAAC,CAAA,CAAC;IAEF,uEAAuE;IACvE,kBAAkB;IAClB,MAAM,aAAa,GAA6D,CAAO,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE;QACxG,MAAM,OAAO,GAAG,KAAK,CAAC,YAAY,IAAI,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAChF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,8BAA8B,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,YAAY,sCAAsC,GAAG,EAAE,CAAC,CAAC,CAAC,kBAAkB;YAC7I,OAAO;SACR;QACD,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,0BAA0B,KAAK,CAAC,YAAY,UAAU,GAAG,EAAE,CAAC,CAAC;QAC1E,MAAM,OAAO,CAAC,GAAU,CAAC,CAAC;IAC5B,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CACrB,GAAM,EACS,EAAE;;QACjB,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,GAAG,CAAC;QACrC,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAErD,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,IAAI,GAAG,GAA0B,MAAA,GAAG,CAAC,OAAO,mCAAI,IAAI,CAAC;QAErD,MAAM,YAAY,GAAG,GAAG,EAAE,CACxB,aAAa,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,oBAAoB,GAAG,aAAa,CAAC;QAE/E,MAAM,MAAM,GAAG,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,IAA0B,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,GAAI,CAAC,SAAS,IAAI,GAAI,CAAC,GAAG,CAAC,CAAC;QAE7F,MAAM,SAAS,GAAG,GAAS,EAAE;;YAC3B,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,SAAS,oBAAoB,GAAG,CAAC,OAAO,aAAa,aAAb,aAAa,cAAb,aAAa,GAAI,CAAC,UAAU,GAAG,EAAE,CAAC;YAE5F,mCAAmC;YACnC,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,MAAM,IAAA,uBAAe,EAAC,cAAc,EAAE,CAAO,SAAS,EAAE,EAAE;gBACtE,MAAM,OAAO,GAAG,MAAM,IAAA,uBAAe,EAAC,SAAS,CAAC,KAAK,EAAE,CAAO,OAAO,EAAE,EAAE;oBACvE,OAAO,OAAO,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;gBACrD,CAAC,CAAA,CAAC,CAAC;gBACH,OAAO,OAAO,CAAC;YACjB,CAAC,CAAA,CAAC,CAAC;YAEH,kBAAkB;YAClB,IAAI,KAAK,EAAE;gBACT,GAAG,CAAC,IAAI,CAAC,qBAAqB,KAAK,CAAC,IAAI,mBAAmB,KAAK,CAAC,YAAY,MAAM,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBACnH,MAAM,MAAM,GAAG,MAAA,KAAK,CAAC,MAAM,mCAAI,aAAa,CAAC;gBAC7C,MAAM,MAAM,CAAC,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;aAC9C;iBAAM;gBACL,GAAG,CAAC,KAAK,CAAC,oDAAoD,SAAS,EAAE,CAAC,CAAC;aAC5E;YAED,yCAAyC;YACzC,MAAM,YAAY,CAAC,GAAG,CAAC,CAAC;YACxB,oBAAoB,EAAE,CAAC;YAEvB,GAAG,GAAG,MAAM,eAAe,CAAC,SAAS,EAAE,EAAE,IAAI,EAAE,IAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QACtE,CAAC,CAAA,CAAC;QAEF,IAAI;YACF,GAAG;gBACD,MAAM,SAAS,EAAE,CAAC;aACnB,QAAQ,YAAY,EAAE,EAAE;SAC1B;QAAC,OAAO,GAAG,EAAE;YACZ,MAAM,OAAO,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;SAC9B;IACH,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACI,MAAM,oBAAoB,GAAG,CAKlC,EACA,EAAE,EACF,MAAM,EACN,oBAAoB,EACpB,aAAa,EACb,MAAM,EACN,aAAa,EACb,KAAK,GASN,EAAE,EAAE;IACH,MAAM,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CACtD,CAAC;IAEpB,MAAM,cAAc,GAAG,oBAAoB,CAAC;QAC1C,EAAE;QACF,MAAM;QACN,aAAa;QACb,cAAc;QACd,aAAa;QACb,iBAAiB;KAClB,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,MAAM,kBAAkB,CAAC,cAAc,EAAE,oBAAoB,aAApB,oBAAoB,cAApB,oBAAoB,GAAI,EAAE,CAAC,CAAC;IAC5F,MAAM,MAAM,CAAC,iBAAiB,CAAa,CAAC,GAAG,EAAE,EAAE,CACjD,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CACpD,CAAC;AACJ,CAAC,CAAA,CAAC;AAtCW,QAAA,oBAAoB,wBAsC/B","sourcesContent":["import type {\n CrawlingContext,\n RouterHandler as CrawlerRouter,\n Request as CrawlerRequest,\n Log,\n} from 'crawlee';\nimport type { CommonPage } from '@crawlee/browser-pool';\nimport type { Page } from 'playwright';\n\nimport { serialAsyncFind, wait } from '../../utils/async';\nimport type { MaybePromise } from '../../utils/types';\nimport type { PerfActorInput, RequestActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport type {\n CrawleeOneRouteWrapper,\n CrawleeOneRouteHandler,\n CrawleeOneRoute,\n CrawleeOneRouteCtx,\n CrawleeOneRouteMatcherFn,\n} from './types';\n\n// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks\n\n/**\n * Given a function `fn` and a list of wrappers [a, b, c, ...],\n * wrap the function to generate composite `a( b( c( fn ) ) )`.\n *\n * That is, the wrappers on the left side of the array (start)\n * wrap those on the right side of the array (end).\n */\nconst applyWrappersRight = <\n TFn extends (...args: any[]) => any,\n TWrap extends (fn: TFn) => MaybePromise<TFn>\n>(\n fn: TFn,\n wrappers: TWrap[] = []\n) => {\n return wrappers.reduceRight<Promise<TFn>>(async (interimFnPromise, wrapper) => {\n const interimFn = await interimFnPromise;\n return wrapper(interimFn);\n }, Promise.resolve(fn));\n};\n\nconst resolveRoutes = <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext\n>(\n routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]\n) =>\n routes.map((route) => {\n const matchers = Array.isArray(route.match) ? route.match : [route.match];\n if (!matchers.length) {\n throw Error(`Route ${route.name} (label: ${route.handlerLabel}) has NO \"match\" item. It can be a RegExp, function, or array of the two.`); // prettier-ignore\n }\n\n const resolvedMatchers = matchers.map<CrawleeOneRouteMatcherFn<Labels, RouterCtx, CrawlerCtx>>(\n (matcher) => {\n if (typeof matcher === 'function') return matcher;\n if (typeof matcher === 'string' || matcher instanceof RegExp) {\n const newMatcherFn: CrawleeOneRouteMatcherFn<Labels, RouterCtx, CrawlerCtx> = (url) =>\n !!url.match(matcher);\n return newMatcherFn;\n }\n // We shouldn't get here!\n throw Error(`Route ${route.name} (label: ${route.handlerLabel}) has INVALID \"match\" item. It can be only RegExp, function, or array of the two. Got ${matcher}`); // prettier-ignore\n }\n );\n\n return { ...route, match: resolvedMatchers };\n });\n\n/**\n * Register many handlers at once onto the Crawlee's RouterHandler.\n *\n * The labels under which the handlers are registered are the respective object keys.\n *\n * Example:\n *\n * ```js\n * registerHandlers(router, { labelA: fn1, labelB: fn2 });\n * ```\n *\n * Is similar to:\n * ```js\n * router.addHandler(labelA, fn1)\n * router.addHandler(labelB, fn2)\n * ```\n *\n * You can also specify a list of wrappers to override the behaviour of all handlers\n * all at once.\n *\n * A list of wrappers `[a, b, c]` will be applied to the handlers right-to-left as so\n * `a( b( c( handler ) ) )`.\n *\n * The entries on the `routerContext` object will be made available to all handlers.\n */\nexport const registerHandlers = async <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext\n>(\n router: CrawlerRouter<CrawlerCtx>,\n routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>,\n options?: {\n routerContext?: RouterCtx;\n handlerWrappers?: CrawleeOneRouteWrapper<CrawlerCtx, RouterCtx>[];\n }\n) => {\n const { routerContext, handlerWrappers } = options ?? {};\n\n // For each handler\n for (const [key, handler] of Object.entries(routeHandlers)) {\n // First apply all wrappers onto the handler\n const wrappedHandler = await applyWrappersRight(\n handler as (ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>) => Promise<void>,\n handlerWrappers ?? []\n );\n\n // Then register the composite handler\n await router.addHandler<CrawlerCtx>(key, async (ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n }\n};\n\nconst createDefaultHandler = <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext\n>(\n input: {\n io: CrawleeOneIO;\n routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];\n routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>;\n } & PerfActorInput &\n Pick<RequestActorInput, 'requestQueueId'>\n) => {\n const { io, routes, routeHandlers, requestQueueId, perfBatchSize, perfBatchWaitSecs } = input;\n\n const resolvedRoutes = resolveRoutes(routes);\n\n // NOTE: Because we \"clear\" the queue by replacing it,\n // we need to always call `openRequestQueue` to ensure we use the latest instance\n const openQueue = () => io.openRequestQueue(requestQueueId);\n\n const closeRequest = async (req: CrawlerRequest | null) => {\n if (!req) return;\n const reqQueue = await openQueue();\n await reqQueue.markRequestHandled(req);\n };\n\n const loadNextRequest = async (suffix: string, options?: { page?: Page; log?: Log }) => {\n const { page, log } = options ?? {};\n\n log?.debug(`Checking for new Request in the queue. ${suffix}`);\n\n if (perfBatchWaitSecs) await wait(perfBatchWaitSecs);\n const reqQueue = await openQueue();\n const newReq = (await reqQueue.fetchNextRequest()) ?? null;\n\n if (newReq) {\n log?.debug(`Found new Request in the queue. ${suffix}`);\n\n // WARNING - For each subsequent Request, it must be loaded manually\n // Hence, batching is suitable only for browser-based Crawlers\n // like Playwright or Puppeteer.\n if (page?.goto) await page.goto(newReq.url);\n } else {\n log?.debug(`No more Requests in the queue. ${suffix}`);\n }\n return newReq;\n };\n\n const onError = async (err: any, req: CrawlerRequest | null, log: Log) => {\n log.error(`Failed to process a request, returning it to the queue. URL: ${req?.loadedUrl || req?.url}.`); // prettier-ignore\n log.error(err);\n // Reinsert the request into the queue if we failed to process it due to an error\n if (req) {\n const reqQueue = await openQueue();\n await reqQueue.reclaimRequest(req, { forefront: true });\n }\n };\n\n /** Redirect the URL to the labelled route identical to route's name */\n // prettier-ignore\n const defaultAction: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>['action'] = async (url, ctx, route) => {\n const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];\n if (!handler) {\n ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore\n return;\n }\n ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);\n await handler(ctx as any);\n };\n\n const defaultHandler = async <T extends CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>>(\n ctx: T\n ): Promise<void> => {\n const { page, log: parentLog } = ctx;\n const log = parentLog.child({ prefix: '[Router] ' });\n\n let handledRequestsCount = 0;\n let req: CrawlerRequest | null = ctx.request ?? null;\n\n const hasBatchReqs = () =>\n perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;\n\n const getUrl = () => (page ? (page as any as CommonPage).url() : req!.loadedUrl || req!.url);\n\n const onRequest = async () => {\n const url = await getUrl();\n const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize ?? 1}. URL: ${url}`;\n\n // Find route handler for given URL\n log.debug(`Searching for a handler for given Request. ${logSuffix}`);\n const route = await serialAsyncFind(resolvedRoutes, async (currRoute) => {\n const isMatch = await serialAsyncFind(currRoute.match, async (matchFn) => {\n return matchFn(url, ctx, currRoute, routeHandlers);\n });\n return isMatch;\n });\n\n // Run the handler\n if (route) {\n log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore\n const action = route.action ?? defaultAction;\n await action(url, ctx, route, routeHandlers);\n } else {\n log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);\n }\n\n // Clean up and move onto another request\n await closeRequest(req);\n handledRequestsCount++;\n\n req = await loadNextRequest(logSuffix, { page: page as Page, log });\n };\n\n try {\n do {\n await onRequest();\n } while (hasBatchReqs());\n } catch (err) {\n await onError(err, req, log);\n }\n };\n\n return defaultHandler;\n};\n\n/**\n * Configures the default router handler to redirect URLs to labelled route handlers\n * based on which route the URL matches first.\n *\n * NOTE: This does mean that the URLs passed to this default handler will be fetched\n * twice (as the URL will be requeued to the correct handler). We recommend to use this\n * function only in the scenarios where there is a small number of startUrls, yet these\n * may need various ways of processing based on different paths or etc.\n *\n * @example\n *\n * const routeLabels = {\n * MAIN_PAGE: 'MAIN_PAGE',\n * JOB_LISTING: 'JOB_LISTING',\n * JOB_DETAIL: 'JOB_DETAIL',\n * JOB_RELATED_LIST: 'JOB_RELATED_LIST',\n * PARTNERS: 'PARTNERS',\n * } as const;\n *\n * const router = createPlaywrightRouter();\n *\n * const routes = createPlaywrightCrawleeOneRouteMatchers<typeof routeLabels>([\n * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * },\n *\n * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * action: async (ctx) => {\n * await ctx.crawler.addRequests([{\n * url: 'https://profesia.sk/praca',\n * label: routeLabels.JOB_LISTING,\n * }]);\n * },\n * },\n * ]);\n *\n * // Set up default route to redirect to labelled routes\n * setupDefaultHandlers({ router, routes });\n *\n * // Now set up the labelled routes\n * await router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }\n */\nexport const setupDefaultHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>\n>({\n io,\n router,\n routeHandlerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n}: {\n io: CrawleeOneIO;\n router: CrawlerRouter<CrawlerCtx>;\n routeHandlerWrappers?: CrawleeOneRouteWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];\n routeHandlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>;\n input?: Input | null;\n}) => {\n const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {}) as PerfActorInput &\n RequestActorInput;\n\n const defaultHandler = createDefaultHandler({\n io,\n routes,\n routeHandlers,\n requestQueueId,\n perfBatchSize,\n perfBatchWaitSecs,\n });\n\n const wrappedHandler = await applyWrappersRight(defaultHandler, routeHandlerWrappers ?? []);\n await router.addDefaultHandler<CrawlerCtx>((ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n};\n"]}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { BasicCrawler, BasicCrawlingContext, CheerioCrawlingContext, CrawlingContext, HttpCrawlingContext, JSDOMCrawlingContext, PlaywrightCrawlingContext, PuppeteerCrawlingContext, RouterHandler as CrawlerRouter } from 'crawlee';
|
|
2
|
-
import type { MaybePromise } from '../../utils/types';
|
|
2
|
+
import type { MaybeArray, MaybePromise } from '../../utils/types';
|
|
3
3
|
/** Context object provided in CrawlerRouter */
|
|
4
4
|
export type CrawleeOneRouteCtx<CrawlerCtx extends CrawlingContext> = Parameters<Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]>[0];
|
|
5
5
|
/** Function that's passed to `router.addHandler(label, handler)` */
|
|
@@ -7,12 +7,13 @@ export type CrawleeOneRouteHandler<CrawlerCtx extends CrawlingContext = Crawling
|
|
|
7
7
|
/** Wrapper that modifies behavior of CrawleeOneRouteHandler */
|
|
8
8
|
export type CrawleeOneRouteWrapper<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>> = (handler: (ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>) => Promise<void>) => MaybePromise<(ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>) => Promise<void>>;
|
|
9
9
|
/**
|
|
10
|
-
*
|
|
10
|
+
* Route that a request will be sent to if the request doesn't have a label yet,
|
|
11
|
+
* and if the `match` function returns truthy value.
|
|
11
12
|
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
13
|
+
* If `match` function returns truthy value, the request is passed to the `action`
|
|
14
|
+
* function for processing.
|
|
14
15
|
*/
|
|
15
|
-
export interface
|
|
16
|
+
export interface CrawleeOneRoute<Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>> {
|
|
16
17
|
/** Human readable name */
|
|
17
18
|
name: string;
|
|
18
19
|
/**
|
|
@@ -24,7 +25,8 @@ export interface CrawleeOneRouteMatcher<Labels extends string = string, RouterCt
|
|
|
24
25
|
*/
|
|
25
26
|
handlerLabel: Labels | null;
|
|
26
27
|
/**
|
|
27
|
-
* Function that decides whether the request will processed
|
|
28
|
+
* Function, RegExp, or a list of the two, that decides whether the request will processed
|
|
29
|
+
* by this `action` function.
|
|
28
30
|
*
|
|
29
31
|
* @example
|
|
30
32
|
* [{
|
|
@@ -35,7 +37,7 @@ export interface CrawleeOneRouteMatcher<Labels extends string = string, RouterCt
|
|
|
35
37
|
* handlerLabel: routeLabels.JOB_DETAIL,
|
|
36
38
|
* }]
|
|
37
39
|
*/
|
|
38
|
-
match:
|
|
40
|
+
match: CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>;
|
|
39
41
|
/**
|
|
40
42
|
* Request is passed to this function if `match` returned truthy value.
|
|
41
43
|
*
|
|
@@ -48,19 +50,40 @@ export interface CrawleeOneRouteMatcher<Labels extends string = string, RouterCt
|
|
|
48
50
|
* handlerLabel: routeLabels.JOB_DETAIL,
|
|
49
51
|
* }]
|
|
50
52
|
*/
|
|
51
|
-
action?: (url: string, ctx: CrawleeOneRouteCtx<CrawlerCtx>, route:
|
|
53
|
+
action?: (url: string, ctx: CrawleeOneRouteCtx<CrawlerCtx>, route: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>, handlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>) => MaybePromise<void>;
|
|
52
54
|
}
|
|
55
|
+
/**
|
|
56
|
+
* Function or RegExp that checks if the {@link CrawleeOneRoute} this Matcher belongs to
|
|
57
|
+
* should handle the given request.
|
|
58
|
+
*
|
|
59
|
+
* If the Matcher returns truthy value, the request is passed to the `action`
|
|
60
|
+
* function of the same CrawleeOneRoute.
|
|
61
|
+
*
|
|
62
|
+
* The Matcher can be:
|
|
63
|
+
* - Regular expression
|
|
64
|
+
* - Function
|
|
65
|
+
* - Array of <RegExp | Function>
|
|
66
|
+
*/
|
|
67
|
+
export type CrawleeOneRouteMatcher<Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>> = MaybeArray<RegExp | CrawleeOneRouteMatcherFn<Labels, RouterCtx, CrawlerCtx>>;
|
|
68
|
+
/**
|
|
69
|
+
* Function variant of Matcher. Matcher that checks if the {@link CrawleeOneRoute}
|
|
70
|
+
* this Matcher belongs to should handle the given request.
|
|
71
|
+
*
|
|
72
|
+
* If the Matcher returns truthy value, the request is passed to the `action`
|
|
73
|
+
* function of the same CrawleeOneRoute.
|
|
74
|
+
*/
|
|
75
|
+
export type CrawleeOneRouteMatcherFn<Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>> = (url: string, ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>, route: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>, handlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>) => unknown;
|
|
53
76
|
/** Utility function that helps with typing the route definitions. */
|
|
54
|
-
export declare const
|
|
77
|
+
export declare const createRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary> = CrawlingContext<BasicCrawler<BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
55
78
|
/** Utility function that helps with typing the route definitions. */
|
|
56
|
-
export declare const
|
|
79
|
+
export declare const createBasicRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends BasicCrawlingContext<import("crawlee").Dictionary> = BasicCrawlingContext<import("crawlee").Dictionary>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
57
80
|
/** Utility function that helps with typing the route definitions. */
|
|
58
|
-
export declare const
|
|
81
|
+
export declare const createHttpRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends HttpCrawlingContext<any, any> = HttpCrawlingContext<any, any>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
59
82
|
/** Utility function that helps with typing the route definitions. */
|
|
60
|
-
export declare const
|
|
83
|
+
export declare const createJsdomRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends JSDOMCrawlingContext<any, any> = JSDOMCrawlingContext<any, any>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
61
84
|
/** Utility function that helps with typing the route definitions. */
|
|
62
|
-
export declare const
|
|
85
|
+
export declare const createCheerioRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends CheerioCrawlingContext<any, any> = CheerioCrawlingContext<any, any>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
63
86
|
/** Utility function that helps with typing the route definitions. */
|
|
64
|
-
export declare const
|
|
87
|
+
export declare const createPlaywrightRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends PlaywrightCrawlingContext<import("crawlee").Dictionary> = PlaywrightCrawlingContext<import("crawlee").Dictionary>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
65
88
|
/** Utility function that helps with typing the route definitions. */
|
|
66
|
-
export declare const
|
|
89
|
+
export declare const createPuppeteerRoutes: <Labels extends string = string, RouterCtx extends Record<string, any> = Record<string, any>, CrawlerCtx extends PuppeteerCrawlingContext<import("crawlee").Dictionary> = PuppeteerCrawlingContext<import("crawlee").Dictionary>>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[];
|
|
@@ -1,26 +1,26 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.
|
|
3
|
+
exports.createPuppeteerRoutes = exports.createPlaywrightRoutes = exports.createCheerioRoutes = exports.createJsdomRoutes = exports.createHttpRoutes = exports.createBasicRoutes = exports.createRoutes = void 0;
|
|
4
4
|
/** Utility function that helps with typing the route definitions. */
|
|
5
|
-
const
|
|
6
|
-
exports.
|
|
5
|
+
const createRoutes = (routes) => routes; // prettier-ignore
|
|
6
|
+
exports.createRoutes = createRoutes;
|
|
7
7
|
// Context-specific variants
|
|
8
8
|
/** Utility function that helps with typing the route definitions. */
|
|
9
|
-
const
|
|
10
|
-
exports.
|
|
9
|
+
const createBasicRoutes = (routes) => routes; // prettier-ignore
|
|
10
|
+
exports.createBasicRoutes = createBasicRoutes;
|
|
11
11
|
/** Utility function that helps with typing the route definitions. */
|
|
12
|
-
const
|
|
13
|
-
exports.
|
|
12
|
+
const createHttpRoutes = (routes) => routes; // prettier-ignore
|
|
13
|
+
exports.createHttpRoutes = createHttpRoutes;
|
|
14
14
|
/** Utility function that helps with typing the route definitions. */
|
|
15
|
-
const
|
|
16
|
-
exports.
|
|
15
|
+
const createJsdomRoutes = (routes) => routes; // prettier-ignore
|
|
16
|
+
exports.createJsdomRoutes = createJsdomRoutes;
|
|
17
17
|
/** Utility function that helps with typing the route definitions. */
|
|
18
|
-
const
|
|
19
|
-
exports.
|
|
18
|
+
const createCheerioRoutes = (routes) => routes; // prettier-ignore
|
|
19
|
+
exports.createCheerioRoutes = createCheerioRoutes;
|
|
20
20
|
/** Utility function that helps with typing the route definitions. */
|
|
21
|
-
const
|
|
22
|
-
exports.
|
|
21
|
+
const createPlaywrightRoutes = (routes) => routes; // prettier-ignore
|
|
22
|
+
exports.createPlaywrightRoutes = createPlaywrightRoutes;
|
|
23
23
|
/** Utility function that helps with typing the route definitions. */
|
|
24
|
-
const
|
|
25
|
-
exports.
|
|
24
|
+
const createPuppeteerRoutes = (routes) => routes; // prettier-ignore
|
|
25
|
+
exports.createPuppeteerRoutes = createPuppeteerRoutes;
|
|
26
26
|
//# sourceMappingURL=types.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/router/types.ts"],"names":[],"mappings":";;;
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/router/types.ts"],"names":[],"mappings":";;;AA6HA,qEAAqE;AAC9D,MAAM,YAAY,GAAG,CAI1B,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,YAAY,gBAI6C;AAEtE,4BAA4B;AAC5B,qEAAqE;AAC9D,MAAM,iBAAiB,GAAG,CAI/B,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,iBAAiB,qBAIwC;AACtE,qEAAqE;AAC9D,MAAM,gBAAgB,GAAG,CAI9B,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,gBAAgB,oBAIyC;AACtE,qEAAqE;AAC9D,MAAM,iBAAiB,GAAG,CAI/B,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,iBAAiB,qBAIwC;AACtE,qEAAqE;AAC9D,MAAM,mBAAmB,GAAG,CAIjC,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,mBAAmB,uBAIsC;AACtE,qEAAqE;AAC9D,MAAM,sBAAsB,GAAG,CAIpC,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,sBAAsB,0BAImC;AACtE,qEAAqE;AAC9D,MAAM,qBAAqB,GAAG,CAInC,MAAwD,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC,kBAAkB;AAJ5E,QAAA,qBAAqB,yBAIoC","sourcesContent":["import type {\n BasicCrawler,\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n RouterHandler as CrawlerRouter,\n} from 'crawlee';\n\nimport type { MaybeArray, MaybePromise } from '../../utils/types';\n\n/** Context object provided in CrawlerRouter */\nexport type CrawleeOneRouteCtx<CrawlerCtx extends CrawlingContext> = Parameters<\n Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]\n>[0];\n\n/** Function that's passed to `router.addHandler(label, handler)` */\nexport type CrawleeOneRouteHandler<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n> = Parameters<CrawlerRouter<CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1]; // prettier-ignore\n\n/** Wrapper that modifies behavior of CrawleeOneRouteHandler */\nexport type CrawleeOneRouteWrapper<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>\n> = (\n handler: (ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n) => MaybePromise<(ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>) => Promise<void>>;\n\n/**\n * Route that a request will be sent to if the request doesn't have a label yet,\n * and if the `match` function returns truthy value.\n *\n * If `match` function returns truthy value, the request is passed to the `action`\n * function for processing.\n */\nexport interface CrawleeOneRoute<\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>\n> {\n /** Human readable name */\n name: string;\n /**\n * Label of the handler registered with `router.addHandler(label, handler)`\n * that will process this request.\n *\n * NOTE: This value is used by the default `action` function. If you override\n * the `action` function, `handlerLabel` is ignored and you have to process it yourself.\n */\n handlerLabel: Labels | null;\n /**\n * Function, RegExp, or a list of the two, that decides whether the request will processed\n * by this `action` function.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n match: CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>;\n /**\n * Request is passed to this function if `match` returned truthy value.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n action?: (\n url: string,\n ctx: CrawleeOneRouteCtx<CrawlerCtx>,\n route: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>,\n handlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>\n ) => MaybePromise<void>;\n}\n\n/**\n * Function or RegExp that checks if the {@link CrawleeOneRoute} this Matcher belongs to\n * should handle the given request.\n *\n * If the Matcher returns truthy value, the request is passed to the `action`\n * function of the same CrawleeOneRoute.\n *\n * The Matcher can be:\n * - Regular expression\n * - Function\n * - Array of <RegExp | Function>\n */\nexport type CrawleeOneRouteMatcher<\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>\n> = MaybeArray<RegExp | CrawleeOneRouteMatcherFn<Labels, RouterCtx, CrawlerCtx>>;\n\n/**\n * Function variant of Matcher. Matcher that checks if the {@link CrawleeOneRoute}\n * this Matcher belongs to should handle the given request.\n *\n * If the Matcher returns truthy value, the request is passed to the `action`\n * function of the same CrawleeOneRoute.\n */\nexport type CrawleeOneRouteMatcherFn<\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>\n> = (\n url: string,\n ctx: CrawleeOneRouteCtx<CrawlerCtx & RouterCtx>,\n route: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>,\n handlers: Record<Labels, CrawleeOneRouteHandler<CrawlerCtx, RouterCtx>>\n) => unknown;\n\n/** Utility function that helps with typing the route definitions. */\nexport const createRoutes = <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n\n// Context-specific variants\n/** Utility function that helps with typing the route definitions. */\nexport const createBasicRoutes = <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends BasicCrawlingContext = BasicCrawlingContext,\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n/** Utility function that helps with typing the route definitions. */\nexport const createHttpRoutes = <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends HttpCrawlingContext = HttpCrawlingContext,\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n/** Utility function that helps with typing the route definitions. */\nexport const createJsdomRoutes = <\n Labels extends string = string,\n RouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends JSDOMCrawlingContext = JSDOMCrawlingContext,\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n/** Utility function that helps with typing the route definitions. */\nexport const createCheerioRoutes = <\nLabels extends string = string,\nRouterCtx extends Record<string, any> = Record<string, any>,\nCrawlerCtx extends CheerioCrawlingContext = CheerioCrawlingContext,\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n/** Utility function that helps with typing the route definitions. */\nexport const createPlaywrightRoutes = <\nLabels extends string = string,\nRouterCtx extends Record<string, any> = Record<string, any>,\n CrawlerCtx extends PlaywrightCrawlingContext = PlaywrightCrawlingContext,\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n/** Utility function that helps with typing the route definitions. */\nexport const createPuppeteerRoutes = <\nLabels extends string = string,\nRouterCtx extends Record<string, any> = Record<string, any>,\nCrawlerCtx extends PuppeteerCrawlingContext = PuppeteerCrawlingContext,\n>(routes: CrawleeOneRoute<Labels, RouterCtx, CrawlerCtx>[]) => routes; // prettier-ignore\n"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlee-one",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Crawlee One is a framework built on top of Crawlee and Apify for writing robust and highly configurable web scrapers",
|
|
6
6
|
"author": "Juro Oravec <juraj.oravec.josefson@gmail.com>",
|