crawlee-one 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -0
- package/dist/cjs/cli/cli.d.ts +1 -0
- package/dist/cjs/cli/cli.js +61 -0
- package/dist/cjs/cli/cli.js.map +1 -0
- package/dist/cjs/cli/index.d.ts +2 -0
- package/dist/cjs/cli/index.js +6 -0
- package/dist/cjs/cli/index.js.map +1 -0
- package/dist/cjs/index.d.ts +24 -0
- package/dist/cjs/index.js +43 -0
- package/dist/cjs/index.js.map +1 -0
- package/dist/cjs/lib/actions/dom.d.ts +102 -0
- package/dist/cjs/lib/actions/dom.js +743 -0
- package/dist/cjs/lib/actions/dom.js.map +1 -0
- package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
- package/dist/cjs/lib/actions/domUtils.js +126 -0
- package/dist/cjs/lib/actions/domUtils.js.map +1 -0
- package/dist/cjs/lib/actions/page.d.ts +69 -0
- package/dist/cjs/lib/actions/page.js +205 -0
- package/dist/cjs/lib/actions/page.js.map +1 -0
- package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
- package/dist/cjs/lib/actions/scrapeListing.js +242 -0
- package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
- package/dist/cjs/lib/actor/actor.d.ts +90 -0
- package/dist/cjs/lib/actor/actor.js +306 -0
- package/dist/cjs/lib/actor/actor.js.map +1 -0
- package/dist/cjs/lib/actor/types.d.ts +162 -0
- package/dist/cjs/lib/actor/types.js +3 -0
- package/dist/cjs/lib/actor/types.js.map +1 -0
- package/dist/cjs/lib/actor.d.ts +189 -0
- package/dist/cjs/lib/actor.js +225 -0
- package/dist/cjs/lib/actor.js.map +1 -0
- package/dist/cjs/lib/actorSpec.d.ts +20 -0
- package/dist/cjs/lib/actorSpec.js +3 -0
- package/dist/cjs/lib/actorSpec.js.map +1 -0
- package/dist/cjs/lib/config.d.ts +561 -0
- package/dist/cjs/lib/config.js +707 -0
- package/dist/cjs/lib/config.js.map +1 -0
- package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
- package/dist/cjs/lib/dataset/maxCount.js +55 -0
- package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
- package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
- package/dist/cjs/lib/dataset/pushData.js +182 -0
- package/dist/cjs/lib/dataset/pushData.js.map +1 -0
- package/dist/cjs/lib/dataset.d.ts +98 -0
- package/dist/cjs/lib/dataset.js +122 -0
- package/dist/cjs/lib/dataset.js.map +1 -0
- package/dist/cjs/lib/dom.d.ts +78 -0
- package/dist/cjs/lib/dom.js +243 -0
- package/dist/cjs/lib/dom.js.map +1 -0
- package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
- package/dist/cjs/lib/error/errorHandler.js +164 -0
- package/dist/cjs/lib/error/errorHandler.js.map +1 -0
- package/dist/cjs/lib/error/sentry.d.ts +11 -0
- package/dist/cjs/lib/error/sentry.js +60 -0
- package/dist/cjs/lib/error/sentry.js.map +1 -0
- package/dist/cjs/lib/integrations/apify.d.ts +67 -0
- package/dist/cjs/lib/integrations/apify.js +106 -0
- package/dist/cjs/lib/integrations/apify.js.map +1 -0
- package/dist/cjs/lib/integrations/types.d.ts +274 -0
- package/dist/cjs/lib/integrations/types.js +3 -0
- package/dist/cjs/lib/integrations/types.js.map +1 -0
- package/dist/cjs/lib/io/dataset.d.ts +67 -0
- package/dist/cjs/lib/io/dataset.js +86 -0
- package/dist/cjs/lib/io/dataset.js.map +1 -0
- package/dist/cjs/lib/io/maxCount.d.ts +30 -0
- package/dist/cjs/lib/io/maxCount.js +55 -0
- package/dist/cjs/lib/io/maxCount.js.map +1 -0
- package/dist/cjs/lib/io/pushData.d.ts +124 -0
- package/dist/cjs/lib/io/pushData.js +193 -0
- package/dist/cjs/lib/io/pushData.js.map +1 -0
- package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
- package/dist/cjs/lib/io/pushRequests.js +63 -0
- package/dist/cjs/lib/io/pushRequests.js.map +1 -0
- package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
- package/dist/cjs/lib/io/requestQueue.js +40 -0
- package/dist/cjs/lib/io/requestQueue.js.map +1 -0
- package/dist/cjs/lib/log.d.ts +38 -0
- package/dist/cjs/lib/log.js +54 -0
- package/dist/cjs/lib/log.js.map +1 -0
- package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
- package/dist/cjs/lib/migrate/localMigrator.js +57 -0
- package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
- package/dist/cjs/lib/migrate/localState.d.ts +7 -0
- package/dist/cjs/lib/migrate/localState.js +43 -0
- package/dist/cjs/lib/migrate/localState.js.map +1 -0
- package/dist/cjs/lib/migrate/types.d.ts +6 -0
- package/dist/cjs/lib/migrate/types.js +3 -0
- package/dist/cjs/lib/migrate/types.js.map +1 -0
- package/dist/cjs/lib/readme/readme.d.ts +65 -0
- package/dist/cjs/lib/readme/readme.js +534 -0
- package/dist/cjs/lib/readme/readme.js.map +1 -0
- package/dist/cjs/lib/readme/types.d.ts +260 -0
- package/dist/cjs/lib/readme/types.js +54 -0
- package/dist/cjs/lib/readme/types.js.map +1 -0
- package/dist/cjs/lib/router.d.ts +132 -0
- package/dist/cjs/lib/router.js +165 -0
- package/dist/cjs/lib/router.js.map +1 -0
- package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
- package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
- package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
- package/dist/cjs/lib/test/actor.d.ts +21 -0
- package/dist/cjs/lib/test/actor.js +56 -0
- package/dist/cjs/lib/test/actor.js.map +1 -0
- package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
- package/dist/cjs/lib/test/mockApifyClient.js +176 -0
- package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
- package/dist/cjs/types.d.ts +31 -0
- package/dist/cjs/types.js +3 -0
- package/dist/cjs/types.js.map +1 -0
- package/dist/cjs/utils/async.d.ts +19 -0
- package/dist/cjs/utils/async.js +74 -0
- package/dist/cjs/utils/async.js.map +1 -0
- package/dist/cjs/utils/error.d.ts +1 -0
- package/dist/cjs/utils/error.js +10 -0
- package/dist/cjs/utils/error.js.map +1 -0
- package/dist/cjs/utils/format.d.ts +9 -0
- package/dist/cjs/utils/format.js +19 -0
- package/dist/cjs/utils/format.js.map +1 -0
- package/dist/cjs/utils/package.d.ts +15 -0
- package/dist/cjs/utils/package.js +25 -0
- package/dist/cjs/utils/package.js.map +1 -0
- package/dist/cjs/utils/types.d.ts +6 -0
- package/dist/cjs/utils/types.js +9 -0
- package/dist/cjs/utils/types.js.map +1 -0
- package/dist/cjs/utils/url.d.ts +9 -0
- package/dist/cjs/utils/url.js +32 -0
- package/dist/cjs/utils/url.js.map +1 -0
- package/dist/cjs/utils/valueMonitor.d.ts +31 -0
- package/dist/cjs/utils/valueMonitor.js +91 -0
- package/dist/cjs/utils/valueMonitor.js.map +1 -0
- package/package.json +85 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../../src/lib/router.ts"],"names":[],"mappings":";;;;;;;;;;;;AAgBA,0CAAuE;AAoFhE,MAAM,mBAAmB,GAAG,CAIjC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,mBAAmB,uBAIuC;AAEvE,4BAA4B;AACrB,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,uBAAuB,GAAG,CAIrC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,uBAAuB,2BAImC;AAChE,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,0BAA0B,GAAG,CAIxC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,0BAA0B,8BAIgC;AAChE,MAAM,6BAA6B,GAAG,CAI3C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,6BAA6B,iCAI6B;AAChE,MAAM,4BAA4B,GAAG,CAI1C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,4BAA4B,gCAI8B;AAEhE,MAAM,gBAAgB,GAAG,CAI9B,EACA,MAAM,EACN,cAAc,EACd,aAAa,EACb,aAAa,GAMd,EAAE,EAAE;IACH,MAAM,IAAA,sBAAc,EAAC,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE,CAAO,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,EAAE;QAC3E,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,EAC1C,OAA2E,CAC5E,CAAC;QACF,MAAM,MAAM,CAAC,UAAU,CAAa,GAAG,EAAE,CAAO,GAAG,EAAE,EAAE,kDACrD,OAAA,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CAAA,GAAA,CACpD,CAAC;IACJ,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAA,CAAC;AAxBW,QAAA,gBAAgB,oBAwB3B;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACI,MAAM,iBAAiB,GAAG,CAK/B,EACA,EAAE,EACF,MAAM,EACN,cAAc,EACd,aAAa,EACb,MAAM,EACN,aAAa,EACb,KAAK,GASN,EAAE,EAAE;IACH,MAAM,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CACtD,CAAC;IAEpB,uEAAuE;IACvE,kBAAkB;IAClB,MAAM,aAAa,GAA0D,CAAO,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE;QACrG,MAAM,OAAO,GAAG,KAAK,CAAC,YAAY,IAAI,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAChF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,8BAA8B,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,YAAY,sCAAsC,GAAG,EAAE,CAAC,CAAC,CAAC,kBAAkB;YAC7I,OAAO
;SACR;QACD,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,0BAA0B,KAAK,CAAC,YAAY,UAAU,GAAG,EAAE,CAAC,CAAC;QAC1E,MAAM,OAAO,CAAC,GAAU,CAAC,CAAC;IAC5B,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CAAO,GAA6C,EAAiB,EAAE;;QAC5F,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,GAAG,CAAC;QACrC,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAErD,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QAE3D,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,IAAI,GAAG,GAA0B,GAAG,CAAC,OAAO,CAAC;QAE7C,MAAM,YAAY,GAAG,GAAS,EAAE;YAC9B,IAAI,CAAC,GAAG;gBAAE,OAAO;YACjB,MAAM,QAAQ,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;YACvC,oBAAoB,EAAE,CAAC;QACzB,CAAC,CAAA,CAAC;QAEF,MAAM,eAAe,GAAG,CAAO,MAAc,EAAE,EAAE;YAC/C,GAAG,CAAC,KAAK,CAAC,0CAA0C,MAAM,EAAE,CAAC,CAAC;YAE9D,IAAI,iBAAiB;gBAAE,MAAM,IAAA,YAAI,EAAC,iBAAiB,CAAC,CAAC;YACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACjD,GAAG,GAAG,MAAM,aAAN,MAAM,cAAN,MAAM,GAAI,IAAI,CAAC;YAErB,IAAI,GAAG,EAAE;gBACP,GAAG,CAAC,KAAK,CAAC,mCAAmC,MAAM,EAAE,CAAC,CAAC;gBAEvD,oEAAoE;gBACpE,wEAAwE;gBACxE,0CAA0C;gBAC1C,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI;oBAAE,MAAO,IAAa,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;aAC3D;iBAAM;gBACL,GAAG,CAAC,KAAK,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;aACvD;QACH,CAAC,CAAA,CAAC;QAEF,MAAM,YAAY,GAAG,GAAG,EAAE,CACxB,aAAa,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,oBAAoB,GAAG,aAAa,CAAC;QAE/E,IAAI;YACF,GAAG;gBACD,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,MAAO,IAA0B,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,CAAC;gBACxF,MAAM,SAAS,GAAG,SAAS,oBAAoB,GAAG,CAAC,OAAO,aAAa,aAAb,aAAa,cAAb,aAAa,GAAI,CAAC,UAAU,GAAG,EAAE,CAAC;gBAE5F,mCAAmC;gBACnC,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC;gBACrE,MAAM,KAAK,GAAG,MAAM,IAAA,uBAAe,EAAC,MAAM,EAAE,CAAO,SAAS,EAAE,EAAE;oBAC9D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;oBAC1E,OAAO,OAAO,CAAC;gBACjB,CAAC,CAAA,CAAC,CAAC;gBAEH,kBAAkB;gBAClB,IAAI,KAAK,EAAE;oBACT,GAAG,CAAC,IAAI,CAAC,qBAAqB,KAAK,CAAC,IAAI,mBAAmB,KAAK,CAAC,YAAY,MAAM,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAk
B;oBACnH,MAAM,CAAC,MAAA,KAAK,CAAC,MAAM,mCAAI,aAAa,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;iBACvE;qBAAM;oBACL,GAAG,CAAC,KAAK,CAAC,oDAAoD,SAAS,EAAE,CAAC,CAAC;iBAC5E;gBAED,yCAAyC;gBACzC,MAAM,YAAY,EAAE,CAAC;gBACrB,MAAM,eAAe,CAAC,SAAS,CAAC,CAAC;aAClC,QAAQ,YAAY,EAAE,EAAE;SAC1B;QAAC,OAAO,GAAG,EAAE;YACZ,GAAG,CAAC,KAAK,CAAC,gEAAgE,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,GAAG,CAAC,CAAC,CAAC,kBAAkB;YAC5H,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YACf,iFAAiF;YACjF,IAAI,GAAG;gBAAE,MAAM,QAAQ,CAAC,cAAc,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;SAClE;IACH,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC,EAC5B,cAAc,CACf,CAAC;IACF,MAAM,MAAM,CAAC,iBAAiB,CAAa,CAAC,GAAG,EAAE,EAAE,CACjD,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CACpD,CAAC;AACJ,CAAC,CAAA,CAAC;AAjHW,QAAA,iBAAiB,qBAiH5B","sourcesContent":["import type {\n BasicCrawler,\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n RouterHandler as CrawlerRouter,\n Request as CrawlerRequest,\n} from 'crawlee';\nimport type { CommonPage } from '@crawlee/browser-pool';\nimport type { Page } from 'playwright';\n\nimport type { MaybePromise } from '../utils/types';\nimport { serialAsyncFind, serialAsyncMap, wait } from '../utils/async';\nimport type { PerfActorInput, RequestActorInput } from './config';\nimport type { CrawleeOneIO } from './integrations/types';\n\n// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks\n\n/** Context object provided in CrawlerRouter */\nexport type RouterHandlerCtx<CrawlerCtx extends CrawlingContext> = Parameters<\n Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]\n>[0];\n\n/** Function that's passed to `router.addHandler(label, handler)` */\nexport type 
RouteHandler<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n> = Parameters<CrawlerRouter<RouterHandlerCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1]; // prettier-ignore\n\n/** Wrapper that modifies behavior of RouteHandler */\nexport type CrawlerRouterWrapper<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>\n> = (\n handler: (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n) => (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>;\n\n/**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n */\nexport interface RouteMatcher<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n> {\n /** Human readable name */\n name: string;\n /**\n * Label of the handler registered with `router.addHandler(label, handler)`\n * that will process this request.\n *\n * NOTE: This value is used by the default `action` function. 
If you override\n * the `action` function, `handlerLabel` is ignored and you have to process it yourself.\n */\n handlerLabel: Labels | null;\n /**\n * Function that decides whether the request will processed by this `action` function.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n match: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => unknown;\n /**\n * Request is passed to this function if `match` returned truthy value.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n action?: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => MaybePromise<void>;\n}\n\nexport const createRouteMatchers = <\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\n// Context-specific variants\nexport const createBasicRouteMatchers = <\n CrawlerCtx extends BasicCrawlingContext = BasicCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createHttpRouteMatchers = <\n CrawlerCtx extends HttpCrawlingContext = HttpCrawlingContext,\n 
RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createJsdomRouteMatchers = <\n CrawlerCtx extends JSDOMCrawlingContext = JSDOMCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createCheerioRouteMatchers = <\nCrawlerCtx extends CheerioCrawlingContext = CheerioCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPlaywrightRouteMatchers = <\n CrawlerCtx extends PlaywrightCrawlingContext = PlaywrightCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPuppeteerRouteMatchers = <\n CrawlerCtx extends PuppeteerCrawlingContext = PuppeteerCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\nexport const registerHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n}: {\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n}) => {\n await serialAsyncMap(Object.entries(routeHandlers), async ([key, handler]) => {\n const wrappedHandler = (routerWrappers ?? 
[]).reduceRight(\n (fn, wrapper) => wrapper((ctx) => fn(ctx)),\n handler as (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n );\n await router.addHandler<CrawlerCtx>(key, async (ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n });\n};\n\n/**\n * Configures the default router handler to redirect URLs to labelled route handlers\n * based on which route the URL matches first.\n *\n * NOTE: This does mean that the URLs passed to this default handler will be fetched\n * twice (as the URL will be requeued to the correct handler). We recommend to use this\n * function only in the scenarios where there is a small number of startUrls, yet these\n * may need various ways of processing based on different paths or etc.\n *\n * @example\n *\n * const routeLabels = {\n * MAIN_PAGE: 'MAIN_PAGE',\n * JOB_LISTING: 'JOB_LISTING',\n * JOB_DETAIL: 'JOB_DETAIL',\n * JOB_RELATED_LIST: 'JOB_RELATED_LIST',\n * PARTNERS: 'PARTNERS',\n * } as const;\n *\n * const router = createPlaywrightRouter();\n *\n * const routes = createPlaywrightRouteMatchers<typeof routeLabels>([\n * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * },\n *\n * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * action: async (ctx) => {\n * await ctx.crawler.addRequests([{\n * url: 'https://profesia.sk/praca',\n * label: routeLabels.JOB_LISTING,\n * }]);\n * },\n * },\n * ]);\n *\n * // Set up default route to redirect to labelled routes\n * setupDefaultRoute({ router, routes });\n *\n * // Now set up the labelled routes\n * await 
router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }\n */\nexport const setupDefaultRoute = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>\n>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n}: {\n io: CrawleeOneIO;\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n input?: Input | null;\n}) => {\n const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {}) as PerfActorInput &\n RequestActorInput;\n\n /** Redirect the URL to the labelled route identical to route's name */\n // prettier-ignore\n const defaultAction: RouteMatcher<CrawlerCtx, RouterCtx, Labels>['action'] = async (url, ctx, route) => {\n const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];\n if (!handler) {\n ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore\n return;\n }\n ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);\n await handler(ctx as any);\n };\n\n const defaultHandler = async (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>): Promise<void> => {\n const { page, log: parentLog } = ctx;\n const log = parentLog.child({ prefix: '[Router] ' });\n\n const reqQueue = await io.openRequestQueue(requestQueueId);\n\n let handledRequestsCount = 0;\n let req: CrawlerRequest | null = ctx.request;\n\n const closeRequest = async () => {\n if (!req) return;\n await reqQueue.markRequestHandled(req);\n handledRequestsCount++;\n };\n\n const loadNextRequest = async (suffix: string) => {\n log.debug(`Checking for new Request in the queue. 
${suffix}`);\n\n if (perfBatchWaitSecs) await wait(perfBatchWaitSecs);\n const newReq = await reqQueue.fetchNextRequest();\n req = newReq ?? null;\n\n if (req) {\n log.debug(`Found new Request in the queue. ${suffix}`);\n\n // WARNING - For each subsequent Request, it must be loaded manually\n // Hence, batching is suitable only for browser-based Crawlers\n // like Playwright or Puppeteer.\n if (page && page.goto) await (page as Page).goto(req.url);\n } else {\n log.debug(`No more Requests in the queue. ${suffix}`);\n }\n };\n\n const hasBatchReqs = () =>\n perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;\n\n try {\n do {\n const url = page ? await (page as any as CommonPage).url() : req?.loadedUrl || req?.url;\n const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize ?? 1}. URL: ${url}`;\n\n // Find route handler for given URL\n log.debug(`Searching for a handler for given Request. ${logSuffix}`);\n const route = await serialAsyncFind(routes, async (currRoute) => {\n const isMatch = await currRoute.match(url, ctx, currRoute, routeHandlers);\n return isMatch;\n });\n\n // Run the handler\n if (route) {\n log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore\n await (route.action ?? defaultAction)(url, ctx, route, routeHandlers);\n } else {\n log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);\n }\n\n // Clean up and move onto another request\n await closeRequest();\n await loadNextRequest(logSuffix);\n } while (hasBatchReqs());\n } catch (err) {\n log.error(`Failed to process a request, returning it to the queue. URL: ${req?.loadedUrl || req?.url}.`); // prettier-ignore\n log.error(err);\n // Reinsert the request into the queue if we failed to process it due to an error\n if (req) await reqQueue.reclaimRequest(req, { forefront: true });\n }\n };\n\n const wrappedHandler = (routerWrappers ?? 
[]).reduceRight(\n (fn, wrapper) => wrapper(fn),\n defaultHandler\n );\n await router.addDefaultHandler<CrawlerCtx>((ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n};\n"]}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import type { MaybePromise } from '../../utils/types';
|
|
2
|
+
export interface ListingLogger {
|
|
3
|
+
debug: (msg: string, data?: any) => void;
|
|
4
|
+
info: (msg: string, data?: any) => void;
|
|
5
|
+
warning: (msg: string, data?: any) => void;
|
|
6
|
+
error: (msg: string, data?: any) => void;
|
|
7
|
+
}
|
|
8
|
+
export interface ListingPageFilter {
|
|
9
|
+
name: string;
|
|
10
|
+
disabled?: boolean;
|
|
11
|
+
initState: () => MaybePromise<boolean>;
|
|
12
|
+
resetState: () => MaybePromise<void>;
|
|
13
|
+
nextState: () => MaybePromise<void>;
|
|
14
|
+
hasNextState: () => MaybePromise<boolean>;
|
|
15
|
+
hasState: () => MaybePromise<boolean>;
|
|
16
|
+
loadState: () => MaybePromise<void>;
|
|
17
|
+
}
|
|
18
|
+
export interface ListingFiltersSetupOptions<Ctx extends object, UrlType> {
|
|
19
|
+
context: ListingPageScraperContext<Ctx, UrlType>;
|
|
20
|
+
filters?: ListingPageFilter[];
|
|
21
|
+
shouldApplyFilter?: (context: ListingPageScraperContext<Ctx, UrlType>, filter: ListingPageFilter, filters: ListingPageFilter[]) => MaybePromise<boolean>;
|
|
22
|
+
onResetFilters?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;
|
|
23
|
+
onFiltersLoaded?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;
|
|
24
|
+
log: ListingLogger;
|
|
25
|
+
}
|
|
26
|
+
export interface ListingPageScraperContext<Ctx extends object, UrlType> {
|
|
27
|
+
context: Ctx;
|
|
28
|
+
log: ListingLogger;
|
|
29
|
+
startUrl: UrlType;
|
|
30
|
+
filters: ListingPageFilter[];
|
|
31
|
+
/** Use this if you need to load filters again (eg after reloading page manually) */
|
|
32
|
+
loadFilterState: () => MaybePromise<void>;
|
|
33
|
+
/** Call this function from any callback to stop scraping */
|
|
34
|
+
abort: () => void;
|
|
35
|
+
}
|
|
36
|
+
export interface ListingPageScraperOptions<Ctx extends object, UrlType> extends Omit<ListingFiltersSetupOptions<Ctx, UrlType>, 'context'> {
|
|
37
|
+
context: Ctx;
|
|
38
|
+
startUrls: UrlType[];
|
|
39
|
+
listingCountOnly?: boolean;
|
|
40
|
+
/** Get ID of the current page in the pagination, so it can be logged */
|
|
41
|
+
pageId?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<string>;
|
|
42
|
+
log: ListingLogger;
|
|
43
|
+
onNavigate?: (context: ListingPageScraperContext<Ctx, UrlType>, url: UrlType) => MaybePromise<void>;
|
|
44
|
+
/**
|
|
45
|
+
* Hook triggered after navigating to the url using Page.goto().
|
|
46
|
+
*
|
|
47
|
+
* One use of this hook is to conditionally disable/enable filters based on the page content.
|
|
48
|
+
**/
|
|
49
|
+
onAfterNavigation?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;
|
|
50
|
+
/** How many attempts are retried after filters failed to load. Defaults to 3 */
|
|
51
|
+
loadFiltersRetries?: number;
|
|
52
|
+
/**
|
|
53
|
+
* Hook triggered after a failed attempt at loading listings page filters.
|
|
54
|
+
*
|
|
55
|
+
* One use of this hook is to reload the page on a failed attempt in case something didn't load correctly.
|
|
56
|
+
**/
|
|
57
|
+
onLoadFiltersError?: (context: ListingPageScraperContext<Ctx, UrlType>, error: any, retryIndex: number) => MaybePromise<void>;
|
|
58
|
+
/** Main logic to extract entries from a page */
|
|
59
|
+
extractEntries: (context: ListingPageScraperContext<Ctx, UrlType>, retryIndex: number) => MaybePromise<UrlType[]>;
|
|
60
|
+
/** How many attempts are retried after failed to scrape entries from a listing. Defaults to 3 */
|
|
61
|
+
extractEntriesRetries?: number;
|
|
62
|
+
/**
|
|
63
|
+
* Hook triggered after a failed attempt at scraping entries from a listing.
|
|
64
|
+
*
|
|
65
|
+
* One use of this hook is to reload the page on a failed attempt in case something didn't load correctly.
|
|
66
|
+
**/
|
|
67
|
+
onExtractEntriesError?: (context: ListingPageScraperContext<Ctx, UrlType>, error: any, retryIndex: number) => MaybePromise<void>;
|
|
68
|
+
onExtractEntriesDone?: (context: ListingPageScraperContext<Ctx, UrlType>, entries: UrlType[] | null) => MaybePromise<void>;
|
|
69
|
+
/**
|
|
70
|
+
* If the onGoToNextPage hook is defined, it will be called after each page. To indicate that there are no more
|
|
71
|
+
* pages left, throw an error.
|
|
72
|
+
**/
|
|
73
|
+
onGoToNextPage?: (context: ListingPageScraperContext<Ctx, UrlType>, entries: UrlType[] | null) => MaybePromise<void>;
|
|
74
|
+
/** How long to wait after we've navigated to the next page and before we start extracting? */
|
|
75
|
+
nextPageWait?: number;
|
|
76
|
+
}
|
|
77
|
+
/** Get entries from a listing page (eg URLs to profiles that should be scraped later) */
|
|
78
|
+
export declare const scrapeListingEntries: <Ctx extends object, UrlType>(options: ListingPageScraperOptions<Ctx, UrlType>) => Promise<UrlType[]>;
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.scrapeListingEntries = void 0;
|
|
13
|
+
const lodash_1 = require("lodash");
|
|
14
|
+
const async_1 = require("../../utils/async");
|
|
15
|
+
const url_1 = require("../../utils/url");
|
|
16
|
+
/**
|
|
17
|
+
* Given configuration for listing page filters, set up functions to
|
|
18
|
+
* navigate through the different states of filters, to allow to paginate
|
|
19
|
+
* through all states.
|
|
20
|
+
*/
|
|
21
|
+
const setupListingFilters = ({ context, filters = [], shouldApplyFilter, onResetFilters, onFiltersLoaded, log, }) => {
|
|
22
|
+
let filtersStack = filters;
|
|
23
|
+
const getNextFilterStateChangeIndex = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
24
|
+
const hasNextStates = yield (0, async_1.serialAsyncMap)(filtersStack, (filter) => filter.hasNextState());
|
|
25
|
+
return (0, lodash_1.findLastIndex)(hasNextStates, (x) => x);
|
|
26
|
+
});
|
|
27
|
+
const hasState = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
28
|
+
const hasStates = yield (0, async_1.serialAsyncMap)(filtersStack, (filter) => filter.hasState());
|
|
29
|
+
return hasStates.some(Boolean);
|
|
30
|
+
});
|
|
31
|
+
const hasNextState = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
32
|
+
const nextFilterStateChangeIndex = yield getNextFilterStateChangeIndex();
|
|
33
|
+
return nextFilterStateChangeIndex > -1;
|
|
34
|
+
});
|
|
35
|
+
const nextState = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
36
|
+
// Imagine we have 4 filters, each has 3 states (eg 3 options to select from)
|
|
37
|
+
// We start with all filters in the first state:
|
|
38
|
+
// State 1: F1(1), F2(1), F3(1), F4(1)
|
|
39
|
+
// As we progress, we increment it akin to numbers:
|
|
40
|
+
// State 2: F1(1), F2(1), F3(1), F4(2)
|
|
41
|
+
// State 3: F1(1), F2(1), F3(1), F4(3)
|
|
42
|
+
// State 4: F1(1), F2(1), F3(2), F4(1)
|
|
43
|
+
// All the way to the last state:
|
|
44
|
+
// State n: F1(3), F2(3), F3(3), F4(3)
|
|
45
|
+
//
|
|
46
|
+
// When we want to move to the next state, we identify the RIGHT-most filter
|
|
47
|
+
// whose state can be incremented (in this case we select F2):
|
|
48
|
+
// YES YES NO NO
|
|
49
|
+
// State x: F1(1), F2(2), F3(3), F4(3)
|
|
50
|
+
//
|
|
51
|
+
// When we increment a filter state, all the other filters to the RIGHT
|
|
52
|
+
// will be reset:
|
|
53
|
+
// State x: F1(1), F2(2), F3(3), F4(3)
|
|
54
|
+
// State x+1: F1(1), F2(3), F3(1), F4(1)
|
|
55
|
+
const initStates = yield (0, async_1.serialAsyncMap)(filtersStack, (filter) => filter.initState());
|
|
56
|
+
if (initStates.some(Boolean))
|
|
57
|
+
return log.info('Initialised filters');
|
|
58
|
+
const nextFilterStateChangeIndex = yield getNextFilterStateChangeIndex();
|
|
59
|
+
if (nextFilterStateChangeIndex === -1)
|
|
60
|
+
throw Error('Cannot select next filter state - reached end of list');
|
|
61
|
+
const filterToNextState = filtersStack[nextFilterStateChangeIndex];
|
|
62
|
+
const filtersToReset = filtersStack.slice(nextFilterStateChangeIndex + 1);
|
|
63
|
+
log.info('Setting filters to next state');
|
|
64
|
+
yield filterToNextState.nextState();
|
|
65
|
+
for (const filter of filtersToReset) {
|
|
66
|
+
yield filter.resetState();
|
|
67
|
+
yield filter.nextState();
|
|
68
|
+
}
|
|
69
|
+
});
|
|
70
|
+
/** Load current filter state in the webpage */
|
|
71
|
+
const loadState = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
72
|
+
yield resetState();
|
|
73
|
+
// Load filters one by one, and only if needed
|
|
74
|
+
filtersStack = [];
|
|
75
|
+
for (const filter of filters) {
|
|
76
|
+
const shouldUseFilter = shouldApplyFilter
|
|
77
|
+
? yield shouldApplyFilter(context, filter, filters)
|
|
78
|
+
: true;
|
|
79
|
+
if (!shouldUseFilter) {
|
|
80
|
+
log.info(`Not applying filter "${filter.name}" or further filters`);
|
|
81
|
+
break;
|
|
82
|
+
}
|
|
83
|
+
if (!filter.disabled) {
|
|
84
|
+
log.info(`Applying filter "${filter.name}"`);
|
|
85
|
+
yield filter.loadState();
|
|
86
|
+
}
|
|
87
|
+
else {
|
|
88
|
+
log.info(`Filter "${filter.name}" recognised but not applied because it is disabled`);
|
|
89
|
+
}
|
|
90
|
+
filtersStack.push(filter);
|
|
91
|
+
}
|
|
92
|
+
log.info(`Done loading filters`);
|
|
93
|
+
yield (onFiltersLoaded === null || onFiltersLoaded === void 0 ? void 0 : onFiltersLoaded(context));
|
|
94
|
+
});
|
|
95
|
+
/** Reset filter state */
|
|
96
|
+
const resetState = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
97
|
+
log.info(`Resetting filter state`);
|
|
98
|
+
yield (onResetFilters === null || onResetFilters === void 0 ? void 0 : onResetFilters(context));
|
|
99
|
+
filtersStack = filters;
|
|
100
|
+
log.info(`Resetting filter state done`);
|
|
101
|
+
});
|
|
102
|
+
return {
|
|
103
|
+
loadState,
|
|
104
|
+
nextState,
|
|
105
|
+
hasNextState,
|
|
106
|
+
hasState,
|
|
107
|
+
};
|
|
108
|
+
};
|
|
109
|
+
/**
 * Get entries from a listing page (eg URLs to profiles that should be scraped later).
 *
 * Start URLs are processed one after another. For each start URL:
 *  1. the URL is passed to `validateUrl` and `onNavigate` is called,
 *  2. a filter controller is created and `onAfterNavigation` is called,
 *  3. a filter loop runs; each iteration loads one filter state and then runs
 *     a pagination loop that calls `extractEntries` and `onGoToNextPage`.
 *
 * @param options Scraper configuration. Defaults applied here:
 *   `listingCountOnly = false`, `filters = []`, `loadFiltersRetries = 3`,
 *   `extractEntriesRetries = 3`, `nextPageWait = 500` (passed to `setTimeout`),
 *   and `onLoadFiltersError` / `onExtractEntriesError` default to `console.error`.
 *   With `listingCountOnly`, only the first start URL is processed and
 *   pagination stops after the first extracted page.
 * @returns Promise of an array with the entries collected across all pages,
 *   filter states and start URLs.
 */
const scrapeListingEntries = (options) => __awaiter(void 0, void 0, void 0, function* () {
    const { context, startUrls, listingCountOnly = false, log, pageId, onNavigate, onAfterNavigation, filters = [], shouldApplyFilter, loadFiltersRetries = 3, onLoadFiltersError = (_, err) => console.error(err), onFiltersLoaded, onResetFilters, extractEntries, extractEntriesRetries = 3, onExtractEntriesError = (_, err) => console.error(err), onExtractEntriesDone, onGoToNextPage, nextPageWait = 500, } = options;
    /** Collection of ALL urls across all pages and startUrls */
    const links = [];
    yield (0, async_1.serialAsyncMap)(startUrls, (startUrl, index) => __awaiter(void 0, void 0, void 0, function* () {
        if (listingCountOnly && index > 0)
            return;
        const logId = `${startUrl} (${index + 1}/${startUrls.length})`;
        // Hooks call `abort()` (exposed on the context argument) to stop scraping
        let userAskedToStop = false;
        const abort = () => { userAskedToStop = true; };
        // Prepare context shared across all hooks. A fresh object is built on every
        // call; `loadFilterState` is a no-op until the filter controller exists.
        let filterObj = null;
        const genCtxArg = () => {
            const loadFilterState = filterObj !== null && filterObj !== void 0 ? filterObj.loadState : undefined;
            return {
                context,
                log,
                startUrl,
                filters,
                loadFilterState: loadFilterState !== null && loadFilterState !== void 0 ? loadFilterState : (() => { }),
                abort,
            };
        };
        log.debug(`Validating URL ${logId}`);
        (0, url_1.validateUrl)(startUrl);
        log.info(`Navigating URL ${logId}`);
        yield (onNavigate === null || onNavigate === void 0 ? void 0 : onNavigate(genCtxArg(), startUrl));
        log.debug(`Done navigating to URL ${logId}`);
        filterObj = setupListingFilters({
            context: genCtxArg(),
            filters,
            shouldApplyFilter,
            onFiltersLoaded,
            onResetFilters,
            log,
        });
        log.debug(`Calling onAfterNavigation callback. URL ${logId}`);
        yield (onAfterNavigation === null || onAfterNavigation === void 0 ? void 0 : onAfterNavigation(genCtxArg()));
        log.debug(`Done calling onAfterNavigation callback. URL ${logId}`);
        // Evaluated after onAfterNavigation, which may have toggled `disabled` on filters
        const isUsingFilters = filters.some((filter) => !filter.disabled);
        let hasFilterStatesToProcess = true;
        while (hasFilterStatesToProcess && !userAskedToStop) {
            // Filter loop
            // Load filters before we start paginating
            log.info(`Setting up filters for URL ${logId}`);
            // The filter state must advance at most once per filter-loop iteration.
            // Without this guard, a retry after a failed `loadState()` would call
            // `nextState()` again and silently skip a filter state (or throw
            // "reached end of list" on the last one).
            let filterStateAdvanced = false;
            yield (0, async_1.retryAsync)(() => __awaiter(void 0, void 0, void 0, function* () {
                if (!filterObj)
                    throw Error(`Filter controller is missing. This should never happen. URL ${logId}`);
                const filterHasState = yield filterObj.hasState();
                if (!isUsingFilters || !filterHasState) {
                    log.info(`Not loading filters for URL ${logId}`);
                    return;
                }
                log.debug(`Loading filters for URL ${logId}`);
                if (!filterStateAdvanced) {
                    yield filterObj.nextState();
                    filterStateAdvanced = true;
                }
                yield filterObj.loadState();
                log.debug(`Done loading filters for URL ${logId}`);
            }), {
                maxRetries: loadFiltersRetries,
                onError: (err, retryIndex) => onLoadFiltersError(genCtxArg(), err, retryIndex),
            });
            let nextPageAvailable = true;
            while (nextPageAvailable && !userAskedToStop) {
                // Pagination loop
                let currPageId = 'next page';
                if (pageId) {
                    log.debug(`Loading pageId for URL ${logId}`);
                    currPageId = yield pageId(genCtxArg());
                    log.debug(`Done loading pageId for URL ${logId}`);
                }
                const pageLogId = `${logId} (${currPageId})`;
                // Extract page links
                log.info(`Extracting links from page ${pageLogId}`);
                const { result } = yield (0, async_1.retryAsync)((retryIndex) => __awaiter(void 0, void 0, void 0, function* () { return extractEntries(genCtxArg(), retryIndex); }), {
                    maxRetries: extractEntriesRetries,
                    onError: (err, retryIndex) => onExtractEntriesError(genCtxArg(), err, retryIndex),
                });
                log.debug(`Done extracting links from page ${pageLogId}`);
                // A nullish result is treated as "no entries on this page"
                const pageLinks = result !== null && result !== void 0 ? result : [];
                // Appended one by one rather than `push(...pageLinks)`, so a very large
                // page cannot exceed the engine's limit on call arguments.
                for (const pageLink of pageLinks)
                    links.push(pageLink);
                log.info(`Found ${pageLinks.length} links on page ${pageLogId}`);
                // Leave after printing the count or on abort
                if (listingCountOnly || userAskedToStop) {
                    nextPageAvailable = false;
                    if (listingCountOnly)
                        log.info(`Debugging mode. Entries are not scraped. Leaving now. URL ${pageLogId}`);
                    else if (userAskedToStop)
                        log.info(`Aborting. URL ${pageLogId}`);
                    continue;
                }
                log.debug(`Calling onExtractEntriesDone callback. URL ${pageLogId}`);
                yield (onExtractEntriesDone === null || onExtractEntriesDone === void 0 ? void 0 : onExtractEntriesDone(genCtxArg(), pageLinks));
                log.debug(`Done calling onExtractEntriesDone callback. URL ${pageLogId}`);
                if (onGoToNextPage && !userAskedToStop) {
                    // If goToNextPage hook is defined, this will be called after each page, until it errors
                    try {
                        log.info(`Navigating to next page from URL ${pageLogId}`);
                        yield onGoToNextPage(genCtxArg(), pageLinks);
                        log.debug(`Done navigating to next page from URL ${pageLogId}`);
                    }
                    catch (e) {
                        // A rejection from the hook is the signal that there are no more pages
                        log.info(`Failed navigating to next page from URL ${pageLogId}`);
                        // String() instead of e.toString(): the hook may reject with null/undefined
                        log.error(String(e));
                        nextPageAvailable = false;
                    }
                }
                else {
                    if (userAskedToStop)
                        log.info(`Aborting. URL ${pageLogId}`);
                    nextPageAvailable = false;
                }
                // Wait before we start scraping the next page
                yield new Promise((res) => setTimeout(res, nextPageWait));
            }
            // Break out if we're not using filters or we've gone through them all
            log.debug(`Checking if there are more filter states available for URL ${logId}`);
            hasFilterStatesToProcess = isUsingFilters && (yield filterObj.hasNextState());
            log.debug(`Done checking if there are more filter states available for URL ${logId}`);
            if (hasFilterStatesToProcess) {
                if (!userAskedToStop)
                    log.info(`Will repeat scraping this URL with different filter setting. URL ${logId}`);
                else
                    log.info(`There are unprocessed filter setting remaining for this URL, but stopping due to abort. URL ${logId}`);
            }
            else
                log.info(`No filter setting remain for scraping this URL. URL ${logId}`);
        }
        log.info(`Finished URL ${logId}`);
    }));
    return links;
});
exports.scrapeListingEntries = scrapeListingEntries;
|
|
242
|
+
//# sourceMappingURL=scrapeListing.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapeListing.js","sourceRoot":"","sources":["../../../../src/lib/scraper/scrapeListing.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,mCAAuC;AAEvC,6CAA+D;AAC/D,yCAA8C;AA4G9C;;;;GAIG;AACH,MAAM,mBAAmB,GAAG,CAA8B,EACxD,OAAO,EACP,OAAO,GAAG,EAAE,EACZ,iBAAiB,EACjB,cAAc,EACd,eAAe,EACf,GAAG,GACsC,EAA2B,EAAE;IACtE,IAAI,YAAY,GAAwB,OAAO,CAAC;IAEhD,MAAM,6BAA6B,GAAG,GAAS,EAAE;QAC/C,MAAM,aAAa,GAAG,MAAM,IAAA,sBAAc,EAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,CAAC;QAC5F,OAAO,IAAA,sBAAa,EAAC,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;IAChD,CAAC,CAAA,CAAC;IAEF,MAAM,QAAQ,GAAG,GAAS,EAAE;QAC1B,MAAM,SAAS,GAAG,MAAM,IAAA,sBAAc,EAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpF,OAAO,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC,CAAA,CAAC;IAEF,MAAM,YAAY,GAAG,GAAS,EAAE;QAC9B,MAAM,0BAA0B,GAAG,MAAM,6BAA6B,EAAE,CAAC;QACzE,OAAO,0BAA0B,GAAG,CAAC,CAAC,CAAC;IACzC,CAAC,CAAA,CAAC;IAEF,MAAM,SAAS,GAAG,GAAS,EAAE;QAC3B,6EAA6E;QAC7E,gDAAgD;QAChD,wCAAwC;QACxC,mDAAmD;QACnD,wCAAwC;QACxC,wCAAwC;QACxC,wCAAwC;QACxC,iCAAiC;QACjC,wCAAwC;QACxC,EAAE;QACF,uEAAuE;QACvE,8DAA8D;QAC9D,sCAAsC;QACtC,wCAAwC;QACxC,EAAE;QACF,sEAAsE;QACtE,iBAAiB;QACjB,0CAA0C;QAC1C,0CAA0C;QAE1C,MAAM,UAAU,GAAG,MAAM,IAAA,sBAAc,EAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;QACtF,IAAI,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC;YAAE,OAAO,GAAG,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAErE,MAAM,0BAA0B,GAAG,MAAM,6BAA6B,EAAE,CAAC;QACzE,IAAI,0BAA0B,KAAK,CAAC,CAAC;YACnC,MAAM,KAAK,CAAC,uDAAuD,CAAC,CAAC;QAEvE,MAAM,iBAAiB,GAAG,YAAY,CAAC,0BAA0B,CAAC,CAAC;QACnE,MAAM,cAAc,GAAG,YAAY,CAAC,KAAK,CAAC,0BAA0B,GAAG,CAAC,CAAC,CAAC;QAE1E,GAAG,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;QAC1C,MAAM,iBAAiB,CAAC,SAAS,EAAE,CAAC;QACpC,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE;YACnC,MAAM,MAAM,CAAC,UAAU,EAAE,CAAC;YAC1B,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC;SAC1B;IACH,CAAC,CAAA,CAAC;IAEF,+CAA+C;IAC/C,MAAM,SAAS,GAAG,GAAS,EAAE;QAC3B,MAAM,UAAU,EAAE,CAAC;QAEnB,8CAA8C;QAC9C,YAAY,GAAG,EAAE,CAAC;QAClB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE;YAC5B,MAAM,e
AAe,GAAG,iBAAiB;gBACvC,CAAC,CAAC,MAAM,iBAAiB,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;gBACnD,CAAC,CAAC,IAAI,CAAC;YACT,IAAI,CAAC,eAAe,EAAE;gBACpB,GAAG,CAAC,IAAI,CAAC,wBAAwB,MAAM,CAAC,IAAI,sBAAsB,CAAC,CAAC;gBACpE,MAAM;aACP;YAED,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE;gBACpB,GAAG,CAAC,IAAI,CAAC,oBAAoB,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;gBAC7C,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC;aAC1B;iBAAM;gBACL,GAAG,CAAC,IAAI,CAAC,WAAW,MAAM,CAAC,IAAI,qDAAqD,CAAC,CAAC;aACvF;YAED,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;SAC3B;QAED,GAAG,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QACjC,MAAM,CAAA,eAAe,aAAf,eAAe,uBAAf,eAAe,CAAG,OAAO,CAAC,CAAA,CAAC;IACnC,CAAC,CAAA,CAAC;IAEF,yBAAyB;IACzB,MAAM,UAAU,GAAG,GAAS,EAAE;QAC5B,GAAG,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACnC,MAAM,CAAA,cAAc,aAAd,cAAc,uBAAd,cAAc,CAAG,OAAO,CAAC,CAAA,CAAC;QAChC,YAAY,GAAG,OAAO,CAAC;QACvB,GAAG,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1C,CAAC,CAAA,CAAC;IAEF,OAAO;QACL,SAAS;QACT,SAAS;QACT,YAAY;QACZ,QAAQ;KACT,CAAC;AACJ,CAAC,CAAC;AAEF,yFAAyF;AAClF,MAAM,oBAAoB,GAAG,CAClC,OAAgD,EAChD,EAAE;IACF,MAAM,EACJ,OAAO,EACP,SAAS,EACT,gBAAgB,GAAG,KAAK,EACxB,GAAG,EACH,MAAM,EACN,UAAU,EACV,iBAAiB,EAEjB,OAAO,GAAG,EAAE,EACZ,iBAAiB,EACjB,kBAAkB,GAAG,CAAC,EACtB,kBAAkB,GAAG,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,EACnD,eAAe,EACf,cAAc,EAEd,cAAc,EACd,qBAAqB,GAAG,CAAC,EACzB,qBAAqB,GAAG,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,EACtD,oBAAoB,EAEpB,cAAc,EACd,YAAY,GAAG,GAAG,GACnB,GAAG,OAAO,CAAC;IAEZ,4DAA4D;IAC5D,MAAM,KAAK,GAAc,EAAE,CAAC;IAE5B,MAAM,IAAA,sBAAc,EAAC,SAAS,EAAE,CAAO,QAAQ,EAAE,KAAK,EAAE,EAAE;QACxD,IAAI,gBAAgB,IAAI,KAAK,GAAG,CAAC;YAAE,OAAO;QAE1C,MAAM,KAAK,GAAG,GAAG,QAAQ,KAAK,KAAK,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC;QAE/D,IAAI,eAAe,GAAG,KAAK,CAAC;QAC5B,MAAM,KAAK,GAAG,GAAG,EAAE,GAAG,eAAe,GAAG,IAAI,CAAA,CAAC,CAAC,CAAC,CAAC,kBAAkB;QAElE,0CAA0C;QAC1C,IAAI,SAAS,GAAmC,IAAI,CAAC;QACrD,MAAM,SAAS,GAAG,GAA4C,EAAE;;YAAC,OAAA,CAAC;gBAChE,OAAO;gBACP,GAAG;gBACH,QAAQ;gBACR,OAAO;gBACP,eAAe,EAAE,MAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,SAAS,mCAAI,CAAC,GAAG,EAAE,GAAE,CAAC,CA
AC;gBACnD,KAAK;aACN,CAAC,CAAA;SAAA,CAAC;QAEH,GAAG,CAAC,KAAK,CAAC,kBAAkB,KAAK,EAAE,CAAC,CAAC;QACrC,IAAA,iBAAW,EAAC,QAAkB,CAAC,CAAC;QAChC,GAAG,CAAC,IAAI,CAAC,kBAAkB,KAAK,EAAE,CAAC,CAAC;QACpC,MAAM,CAAA,UAAU,aAAV,UAAU,uBAAV,UAAU,CAAG,SAAS,EAAE,EAAE,QAAQ,CAAC,CAAA,CAAC;QAC1C,GAAG,CAAC,KAAK,CAAC,0BAA0B,KAAK,EAAE,CAAC,CAAC;QAE7C,SAAS,GAAG,mBAAmB,CAAC;YAC9B,OAAO,EAAE,SAAS,EAAE;YACpB,OAAO;YACP,iBAAiB;YACjB,eAAe;YACf,cAAc;YACd,GAAG;SACJ,CAAC,CAAC;QAEH,GAAG,CAAC,KAAK,CAAC,2CAA2C,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;QACjF,MAAM,CAAA,iBAAiB,aAAjB,iBAAiB,uBAAjB,iBAAiB,CAAG,SAAS,EAAE,CAAC,CAAA,CAAC;QACvC,GAAG,CAAC,KAAK,CAAC,gDAAgD,KAAK,GAAG,CAAC,CAAC,CAAC,kBAAkB;QAEvF,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAElE,IAAI,wBAAwB,GAAG,IAAI,CAAC;QACpC,OAAO,wBAAwB,IAAI,CAAC,eAAe,EAAE;YACnD,cAAc;YACd,0CAA0C;YAC1C,GAAG,CAAC,IAAI,CAAC,8BAA8B,KAAK,EAAE,CAAC,CAAC;YAChD,MAAM,IAAA,kBAAU,EACd,GAAS,EAAE;gBACT,IAAI,CAAC,SAAS;oBAAE,MAAM,KAAK,CAAC,+DAA+D,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBAEvH,MAAM,cAAc,GAAG,MAAM,SAAS,CAAC,QAAQ,EAAE,CAAC;gBAClD,IAAI,CAAC,cAAc,IAAI,CAAC,cAAc,EAAE;oBACtC,GAAG,CAAC,IAAI,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;oBACjD,OAAO;iBACR;gBAED,GAAG,CAAC,KAAK,CAAC,2BAA2B,KAAK,EAAE,CAAC,CAAC;gBAC9C,MAAM,SAAS,CAAC,SAAS,EAAE,CAAC;gBAC5B,MAAM,SAAS,CAAC,SAAS,EAAE,CAAC;gBAC5B,GAAG,CAAC,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC;YACrD,CAAC,CAAA,EACD;gBACE,UAAU,EAAE,kBAAkB;gBAC9B,OAAO,EAAE,CAAC,GAAG,EAAE,UAAU,EAAE,EAAE,CAAC,kBAAkB,CAAC,SAAS,EAAE,EAAE,GAAG,EAAE,UAAU,CAAC;aAC/E,CACF,CAAC;YAEF,IAAI,iBAAiB,GAAG,IAAI,CAAC;YAC7B,OAAO,iBAAiB,IAAI,CAAC,eAAe,EAAE;gBAC5C,kBAAkB;gBAClB,IAAI,UAAU,GAAG,WAAW,CAAC;gBAC7B,IAAI,MAAM,EAAE;oBACV,GAAG,CAAC,KAAK,CAAC,0BAA0B,KAAK,EAAE,CAAC,CAAC;oBAC7C,UAAU,GAAG,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;oBACvC,GAAG,CAAC,KAAK,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;iBACnD;gBACD,MAAM,SAAS,GAAG,GAAG,KAAK,KAAK,UAAU,GAAG,CAAC;gBAE7C,qBAAqB;gBACrB,GAAG,CAAC,IAAI,CAAC,8BAA8B,SAAS,EAAE,CAAC,CAAC;gBACpD,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,kBAAU,EACjC,CAAO,U
AAU,EAAE,EAAE,kDAAC,OAAA,cAAc,CAAC,SAAS,EAAE,EAAE,UAAU,CAAC,CAAA,GAAA,EAC7D;oBACE,UAAU,EAAE,qBAAqB;oBACjC,OAAO,EAAE,CAAC,GAAG,EAAE,UAAU,EAAE,EAAE,CAAC,qBAAqB,CAAC,SAAS,EAAE,EAAE,GAAG,EAAE,UAAU,CAAC;iBAClF,CACF,CAAC;gBACF,GAAG,CAAC,KAAK,CAAC,mCAAmC,SAAS,EAAE,CAAC,CAAC;gBAE1D,MAAM,SAAS,GAAG,MAAM,aAAN,MAAM,cAAN,MAAM,GAAI,EAAE,CAAC;gBAC/B,KAAK,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;gBACzB,GAAG,CAAC,IAAI,CAAC,SAAS,SAAS,CAAC,MAAM,kBAAkB,SAAS,EAAE,CAAC,CAAC;gBAEjE,6CAA6C;gBAC7C,IAAI,gBAAgB,IAAI,eAAe,EAAE;oBACvC,iBAAiB,GAAG,KAAK,CAAC;oBAC1B,IAAI,gBAAgB;wBAAE,GAAG,CAAC,IAAI,CAAC,6DAA6D,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;yBACvH,IAAI,eAAe;wBAAE,GAAG,CAAC,IAAI,CAAC,iBAAiB,SAAS,EAAE,CAAC,CAAC;oBACjE,SAAS;iBACV;gBAED,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBACxF,MAAM,CAAA,oBAAoB,aAApB,oBAAoB,uBAApB,oBAAoB,CAAG,SAAS,EAAE,EAAE,SAAS,CAAC,CAAA,CAAC;gBACrD,GAAG,CAAC,KAAK,CAAC,mDAAmD,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBAE7F,IAAI,cAAc,IAAI,CAAC,eAAe,EAAE;oBACtC,wFAAwF;oBACxF,IAAI;wBACF,GAAG,CAAC,IAAI,CAAC,oCAAoC,SAAS,EAAE,CAAC,CAAC;wBAC1D,MAAM,cAAc,CAAC,SAAS,EAAE,EAAE,SAAS,CAAC,CAAC;wBAC7C,GAAG,CAAC,KAAK,CAAC,yCAAyC,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;qBACpF;oBAAC,OAAO,CAAC,EAAE;wBACV,GAAG,CAAC,IAAI,CAAC,2CAA2C,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;wBACpF,GAAG,CAAC,KAAK,CAAE,CAAW,CAAC,QAAQ,EAAE,CAAC,CAAC;wBACnC,iBAAiB,GAAG,KAAK,CAAC;qBAC3B;iBACF;qBAAM;oBACL,IAAI,eAAe;wBAAE,GAAG,CAAC,IAAI,CAAC,iBAAiB,SAAS,EAAE,CAAC,CAAC;oBAC5D,iBAAiB,GAAG,KAAK,CAAC;iBAC3B;gBAED,8CAA8C;gBAC9C,MAAM,IAAI,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,YAAY,CAAC,CAAC,CAAC;aAC3D;YAED,sEAAsE;YACtE,GAAG,CAAC,KAAK,CAAC,8DAA8D,KAAK,EAAE,CAAC,CAAC;YACjF,wBAAwB,GAAG,cAAc,IAAI,CAAC,MAAM,SAAS,CAAC,YAAY,EAAE,CAAC,CAAC;YAC9E,GAAG,CAAC,KAAK,CAAC,mEAAmE,KAAK,EAAE,CAAC,CAAC;YAEtF,IAAI,wBAAwB,EAAE;gBAC5B,IAAI,CAAC,eAAe;oBAAE,GAAG,CAAC,IAAI,CAAC,oEAAoE,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;;oBAC1H,GAAG,CAAC,IAAI,CAAC,+FAA+F,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;aAC1I;;gBAAM,GAAG,CAAC,IAAI,CAAC,uDAAuD,KAAK,EAAE,CAAC,CAAC,CAAC,kB
AAkB;SACpG;QACD,GAAG,CAAC,IAAI,CAAC,gBAAgB,KAAK,EAAE,CAAC,CAAC;IACpC,CAAC,CAAA,CAAC,CAAC;IACH,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAxKW,QAAA,oBAAoB,wBAwK/B","sourcesContent":["import { findLastIndex } from 'lodash';\n\nimport { serialAsyncMap, retryAsync } from '../../utils/async';\nimport { validateUrl } from '../../utils/url';\nimport type { MaybePromise } from '../../utils/types';\n\nexport interface ListingLogger {\n debug: (msg: string, data?: any) => void;\n info: (msg: string, data?: any) => void;\n warning: (msg: string, data?: any) => void;\n error: (msg: string, data?: any) => void;\n}\n\nexport interface ListingPageFilter {\n name: string;\n disabled?: boolean;\n initState: () => MaybePromise<boolean>;\n resetState: () => MaybePromise<void>;\n nextState: () => MaybePromise<void>;\n hasNextState: () => MaybePromise<boolean>;\n hasState: () => MaybePromise<boolean>;\n loadState: () => MaybePromise<void>;\n}\n\nexport interface ListingFiltersSetupOptions<Ctx extends object, UrlType> {\n context: ListingPageScraperContext<Ctx, UrlType>;\n filters?: ListingPageFilter[];\n shouldApplyFilter?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n filter: ListingPageFilter,\n filters: ListingPageFilter[]\n ) => MaybePromise<boolean>;\n onResetFilters?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;\n onFiltersLoaded?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;\n log: ListingLogger;\n}\n\ntype ListingFilterController = Pick<ListingPageFilter, 'loadState' | 'nextState' | 'hasNextState' | 'hasState'>; // prettier-ignore\n\nexport interface ListingPageScraperContext<Ctx extends object, UrlType> {\n context: Ctx;\n log: ListingLogger;\n startUrl: UrlType;\n filters: ListingPageFilter[];\n /** Use this if you need to load filters again (eg after reloading page manually) */\n loadFilterState: () => MaybePromise<void>;\n /** Call this function from any callback to stop scraping */\n abort: () => void;\n}\n\n// 
prettier-ignore\nexport interface ListingPageScraperOptions<Ctx extends object, UrlType> extends Omit<ListingFiltersSetupOptions<Ctx, UrlType>, 'context'> {\n context: Ctx;\n startUrls: UrlType[];\n listingCountOnly?: boolean;\n /** Get ID of the current page in the pagination, so it can be logged */\n pageId?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<string>;\n log: ListingLogger;\n\n onNavigate?: (context: ListingPageScraperContext<Ctx, UrlType>, url: UrlType) => MaybePromise<void>;\n /**\n * Hook triggered after navigating to the url using Page.goto().\n *\n * One use of this hook is to conditionally disable/enable filters based on the page content.\n **/\n onAfterNavigation?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;\n\n /** How many attempts are retried after filters failed to load. Defaults to 3 */\n loadFiltersRetries?: number;\n /**\n * Hook triggered after a failed attempt at loading listings page filters.\n *\n * One use of this hook is to reload the page on failed attemp in case something didn't load correctly.\n **/\n onLoadFiltersError?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n error: any,\n retryIndex: number\n ) => MaybePromise<void>;\n\n /** Main logic to extract entries from a page */\n extractEntries: (context: ListingPageScraperContext<Ctx, UrlType>, retryIndex: number) => MaybePromise<UrlType[]>;\n /** How many attempts are retried after failed to scrape entries from a listing. 
Defaults to 3 */\n extractEntriesRetries?: number;\n /**\n * Hook triggered after a failed attempt at scraping entries from a listing.\n *\n * One use of this hook is to reload the page on failed attemp in case something didn't load correctly.\n **/\n onExtractEntriesError?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n error: any,\n retryIndex: number\n ) => MaybePromise<void>;\n onExtractEntriesDone?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n entries: UrlType[] | null\n ) => MaybePromise<void>;\n\n /**\n * If goToNextPage hook is defined, it will be called after each page. To indicate that there's no more\n * pages left, throw an error.\n **/\n onGoToNextPage?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n entries: UrlType[] | null\n ) => MaybePromise<void>;\n /** How long to wait after we've navigated to the next page and before we start extracting? */\n nextPageWait?: number;\n}\n\n/**\n * Given configuration for listing page filters, set up functions to\n * navigate through the different states of filters, to allow to paginate\n * through all states.\n */\nconst setupListingFilters = <Ctx extends object, UrlType>({\n context,\n filters = [],\n shouldApplyFilter,\n onResetFilters,\n onFiltersLoaded,\n log,\n}: ListingFiltersSetupOptions<Ctx, UrlType>): ListingFilterController => {\n let filtersStack: ListingPageFilter[] = filters;\n\n const getNextFilterStateChangeIndex = async () => {\n const hasNextStates = await serialAsyncMap(filtersStack, (filter) => filter.hasNextState());\n return findLastIndex(hasNextStates, (x) => x);\n };\n\n const hasState = async () => {\n const hasStates = await serialAsyncMap(filtersStack, (filter) => filter.hasState());\n return hasStates.some(Boolean);\n };\n\n const hasNextState = async () => {\n const nextFilterStateChangeIndex = await getNextFilterStateChangeIndex();\n return nextFilterStateChangeIndex > -1;\n };\n\n const nextState = async () => {\n // Imagine we have 4 filters, each has 3 
states (eg 3 options to select from)\n // We start with all filters in the first state:\n // State 1: F1(1), F2(1), F3(1), F4(1)\n // As we progress, we increment it akin to numbers:\n // State 2: F1(1), F2(1), F3(1), F4(2)\n // State 3: F1(1), F2(1), F3(1), F4(3)\n // State 4: F1(1), F2(1), F3(2), F4(1)\n // All the way to the last state:\n // State n: F1(3), F2(3), F3(3), F4(3)\n //\n // When we want move to a next state, we identify the RIGHT-most filter\n // whose state can be incremented (in this case we select F2):\n // YES YES NO NO\n // State x: F1(1), F2(2), F3(3), F4(3)\n //\n // When we increment a filter state, all the other filter to the RIGHT\n // will be reset:\n // State x: F1(1), F2(2), F3(3), F4(3)\n // State x+1: F1(1), F2(3), F3(1), F4(1)\n\n const initStates = await serialAsyncMap(filtersStack, (filter) => filter.initState());\n if (initStates.some(Boolean)) return log.info('Initialised filters');\n\n const nextFilterStateChangeIndex = await getNextFilterStateChangeIndex();\n if (nextFilterStateChangeIndex === -1)\n throw Error('Cannot select next filter state - reached end of list');\n\n const filterToNextState = filtersStack[nextFilterStateChangeIndex];\n const filtersToReset = filtersStack.slice(nextFilterStateChangeIndex + 1);\n\n log.info('Setting filters to next state');\n await filterToNextState.nextState();\n for (const filter of filtersToReset) {\n await filter.resetState();\n await filter.nextState();\n }\n };\n\n /** Load current filter state in the webpage */\n const loadState = async () => {\n await resetState();\n\n // Load filters one by one, and only if needed\n filtersStack = [];\n for (const filter of filters) {\n const shouldUseFilter = shouldApplyFilter\n ? 
await shouldApplyFilter(context, filter, filters)\n : true;\n if (!shouldUseFilter) {\n log.info(`Not applying filter \"${filter.name}\" or further filters`);\n break;\n }\n\n if (!filter.disabled) {\n log.info(`Applying filter \"${filter.name}\"`);\n await filter.loadState();\n } else {\n log.info(`Filter \"${filter.name}\" recognised but not applied because it is disabled`);\n }\n\n filtersStack.push(filter);\n }\n\n log.info(`Done loading filters`);\n await onFiltersLoaded?.(context);\n };\n\n /** Reset filter state */\n const resetState = async () => {\n log.info(`Resetting filter state`);\n await onResetFilters?.(context);\n filtersStack = filters;\n log.info(`Resetting filter state done`);\n };\n\n return {\n loadState,\n nextState,\n hasNextState,\n hasState,\n };\n};\n\n/** Get entries from a listing page (eg URLs to profiles that should be scraped later) */\nexport const scrapeListingEntries = async <Ctx extends object, UrlType>(\n options: ListingPageScraperOptions<Ctx, UrlType>\n) => {\n const {\n context,\n startUrls,\n listingCountOnly = false,\n log,\n pageId,\n onNavigate,\n onAfterNavigation,\n\n filters = [],\n shouldApplyFilter,\n loadFiltersRetries = 3,\n onLoadFiltersError = (_, err) => console.error(err),\n onFiltersLoaded,\n onResetFilters,\n\n extractEntries,\n extractEntriesRetries = 3,\n onExtractEntriesError = (_, err) => console.error(err),\n onExtractEntriesDone,\n\n onGoToNextPage,\n nextPageWait = 500,\n } = options;\n\n /** Collection of ALL urls across all pages and startUrls */\n const links: UrlType[] = [];\n\n await serialAsyncMap(startUrls, async (startUrl, index) => {\n if (listingCountOnly && index > 0) return;\n\n const logId = `${startUrl} (${index + 1}/${startUrls.length})`;\n\n let userAskedToStop = false;\n const abort = () => { userAskedToStop = true }; // prettier-ignore\n\n // Prepare context shared across all hooks\n let filterObj: ListingFilterController | null = null;\n const genCtxArg = (): 
ListingPageScraperContext<Ctx, UrlType> => ({\n context,\n log,\n startUrl,\n filters,\n loadFilterState: filterObj?.loadState ?? (() => {}),\n abort,\n });\n\n log.debug(`Validating URL ${logId}`);\n validateUrl(startUrl as string);\n log.info(`Navigating URL ${logId}`);\n await onNavigate?.(genCtxArg(), startUrl);\n log.debug(`Done navigating to URL ${logId}`);\n\n filterObj = setupListingFilters({\n context: genCtxArg(),\n filters,\n shouldApplyFilter,\n onFiltersLoaded,\n onResetFilters,\n log,\n });\n\n log.debug(`Calling onAfterNavigation callback. URL ${logId}`); // prettier-ignore\n await onAfterNavigation?.(genCtxArg());\n log.debug(`Done calling onAfterNavigation callback. URL ${logId})`); // prettier-ignore\n\n const isUsingFilters = filters.some((filter) => !filter.disabled);\n\n let hasFilterStatesToProcess = true;\n while (hasFilterStatesToProcess && !userAskedToStop) {\n // Filter loop\n // Load filters before we start paginating\n log.info(`Setting up filters for URL ${logId}`);\n await retryAsync(\n async () => {\n if (!filterObj) throw Error(`Filter controller is missing. This should never happen. 
URL ${logId}`); // prettier-ignore\n\n const filterHasState = await filterObj.hasState();\n if (!isUsingFilters || !filterHasState) {\n log.info(`Not loading filters for URL ${logId}`);\n return;\n }\n\n log.debug(`Loading filters for URL ${logId}`);\n await filterObj.nextState();\n await filterObj.loadState();\n log.debug(`Done loading filters for URL ${logId}`);\n },\n {\n maxRetries: loadFiltersRetries,\n onError: (err, retryIndex) => onLoadFiltersError(genCtxArg(), err, retryIndex),\n }\n );\n\n let nextPageAvailable = true;\n while (nextPageAvailable && !userAskedToStop) {\n // Pagination loop\n let currPageId = 'next page';\n if (pageId) {\n log.debug(`Loading pageId for URL ${logId}`);\n currPageId = await pageId(genCtxArg());\n log.debug(`Done loading pageId for URL ${logId}`);\n }\n const pageLogId = `${logId} (${currPageId})`;\n\n // Extract page links\n log.info(`Extracting links from page ${pageLogId}`);\n const { result } = await retryAsync(\n async (retryIndex) => extractEntries(genCtxArg(), retryIndex),\n {\n maxRetries: extractEntriesRetries,\n onError: (err, retryIndex) => onExtractEntriesError(genCtxArg(), err, retryIndex),\n }\n );\n log.debug(`Done extracting links from page ${pageLogId}`);\n\n const pageLinks = result ?? [];\n links.push(...pageLinks);\n log.info(`Found ${pageLinks.length} links on page ${pageLogId}`);\n\n // Leave after printing the count or on abort\n if (listingCountOnly || userAskedToStop) {\n nextPageAvailable = false;\n if (listingCountOnly) log.info(`Debugging mode. Entries are not scraped. Leaving now. URL ${pageLogId}`); // prettier-ignore\n else if (userAskedToStop) log.info(`Aborting. URL ${pageLogId}`);\n continue;\n }\n\n log.debug(`Calling onExtractEntriesDone callback. URL ${pageLogId}`); // prettier-ignore\n await onExtractEntriesDone?.(genCtxArg(), pageLinks);\n log.debug(`Done calling onExtractEntriesDone callback. 
URL ${pageLogId}`); // prettier-ignore\n\n if (onGoToNextPage && !userAskedToStop) {\n // If goToNextPage hook is defined, this will be called after each page, until it errors\n try {\n log.info(`Navigating to next page from URL ${pageLogId}`);\n await onGoToNextPage(genCtxArg(), pageLinks);\n log.debug(`Done navigating to next page from URL ${pageLogId}`); // prettier-ignore\n } catch (e) {\n log.info(`Failed navigating to next page from URL ${pageLogId}`); // prettier-ignore\n log.error((e as Error).toString());\n nextPageAvailable = false;\n }\n } else {\n if (userAskedToStop) log.info(`Aborting. URL ${pageLogId}`);\n nextPageAvailable = false;\n }\n\n // Wait before we start scraping the next page\n await new Promise((res) => setTimeout(res, nextPageWait));\n }\n\n // Break out if we're not using filters or we've gone through them all\n log.debug(`Checking if there are more filter states available for URL ${logId}`);\n hasFilterStatesToProcess = isUsingFilters && (await filterObj.hasNextState());\n log.debug(`Done checking if there are more filter states available for URL ${logId}`);\n\n if (hasFilterStatesToProcess) {\n if (!userAskedToStop) log.info(`Will repeat scraping this URL with different filter setting. URL ${logId}`); // prettier-ignore\n else log.info(`There are unprocessed filter setting remaining for this URL, but stopping due to abort. URL ${logId}`); // prettier-ignore\n } else log.info(`No filter setting remain for scraping this URL. URL ${logId}`); // prettier-ignore\n }\n log.info(`Finished URL ${logId}`);\n });\n return links;\n};\n"]}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { vi } from 'vitest';
|
|
2
|
+
import { Dictionary } from 'crawlee';
|
|
3
|
+
import type { MaybeArray, MaybePromise } from '../../utils/types';
|
|
4
|
+
import { OnBatchAddRequests } from './mockApifyClient';
|
|
5
|
+
/**
 * Installs test doubles for the Apify/Crawlee entry points and then initialises the Actor.
 *
 * The compiled implementation in this package replaces `Actor.main`, `Actor.getInput`,
 * `Actor.openDataset`, `Actor.pushData`, `RequestQueue.open` and `KeyValueStore.open`
 * through `vi.spyOn(...).mockImplementation(...)`, and finishes by awaiting `Actor.init()`.
 *
 * @typeParam TInput - Type of the value handed back by the mocked `Actor.getInput`.
 * @typeParam TData - Type of the payload passed to `onPushData`.
 * @returns A promise that resolves once the mocks are installed and `Actor.init()` has completed.
 */
export declare const setupMockApifyActor: <TInput, TData extends MaybeArray<Dictionary> = MaybeArray<Dictionary>>({ vi: viInstance, actorInput, log, onPushData, onBatchAddRequests, onGetInfo, }: {
|
|
6
|
+
/** Vitest `vi` object; its `spyOn` is what installs the mocks. */
vi: typeof vi;
|
|
7
|
+
/** Value the mocked `Actor.getInput` resolves with. */
actorInput?: TInput | undefined;
|
|
8
|
+
/** Optional logger, forwarded to the mock storage client and the mock dataset. */
log?: ((...args: any[]) => void) | undefined;
|
|
9
|
+
/** Awaited by the mocked `Actor.pushData`; also forwarded to the mock dataset. */
onPushData?: ((data: TData) => MaybePromise<void>) | undefined;
|
|
10
|
+
/** Forwarded to the mock storage client. */
onBatchAddRequests?: OnBatchAddRequests | undefined;
|
|
11
|
+
/** Forwarded to the mock dataset created by the mocked `Actor.openDataset`. */
onGetInfo?: ((...args: any[]) => MaybePromise<void>) | undefined;
|
|
12
|
+
}) => Promise<void>;
|
|
13
|
+
/**
 * Test harness that sets up the Apify mocks, runs a crawler, and completes when `done` is called.
 *
 * The compiled implementation calls `setupMockApifyActor` with a shallow copy of `input`
 * as the actor input, awaits `runCrawler()`, and then awaits `onDone(done)`.
 *
 * @typeParam TData - Data type forwarded to `setupMockApifyActor`.
 * @typeParam TInput - Type of the actor input.
 * @returns A promise that resolves after the `done` callback has been invoked.
 */
export declare const runCrawlerTest: <TData extends MaybeArray<Dictionary>, TInput>({ vi: viInstance, input, runCrawler, log, onPushData, onBatchAddRequests, onDone, }: {
|
|
14
|
+
/** Vitest `vi` object, passed through to `setupMockApifyActor`. */
vi: typeof vi;
|
|
15
|
+
/** Actor input; a shallow copy of it is what the mocked `Actor.getInput` returns. */
input: TInput;
|
|
16
|
+
/** Starts the crawler under test; awaited after the mocks are installed. */
runCrawler: () => MaybePromise<void>;
|
|
17
|
+
/** Optional logger, passed through to `setupMockApifyActor`. */
log?: ((...args: any[]) => void) | undefined;
|
|
18
|
+
/** Called with the pushed data and the `done` callback, so a test can finish from inside a push. */
onPushData?: ((data: any, done: () => void) => MaybePromise<void>) | undefined;
|
|
19
|
+
/** Passed through to `setupMockApifyActor`. */
onBatchAddRequests?: OnBatchAddRequests | undefined;
|
|
20
|
+
/** Called after `runCrawler()` finishes; the implementation's default calls `done()` immediately. */
onDone?: ((done: () => void) => MaybePromise<void>) | undefined;
|
|
21
|
+
}) => Promise<void>;
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * Down-levelled async/await helper: drives a generator function as if it were an async function.
 *
 * Reuses `this.__awaiter` when one is already defined; otherwise falls back to the local
 * implementation below.
 *
 * @param thisArg - `this` value the generator is invoked with.
 * @param _arguments - Arguments applied to the generator (an empty list when falsy).
 * @param P - Promise constructor to use; defaults to the global `Promise` when falsy.
 * @param generator - Generator function whose `yield`s stand in for `await`s.
 * @returns A `P` instance resolved with the generator's final value, or rejected with
 *   whatever the generator throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
// Wrap a yielded value in `P` unless it already is an instance of `P`.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
// Resume the generator with the settled value; a throw out of the generator rejects the result.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
// Re-throw a rejection inside the generator so its own try/catch can observe it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
// Finish when the generator is done, otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
// Instantiate the generator and run it up to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.runCrawlerTest = exports.setupMockApifyActor = void 0;
|
|
13
|
+
const apify_1 = require("apify");
|
|
14
|
+
const crawlee_1 = require("crawlee");
|
|
15
|
+
const mockApifyClient_1 = require("./mockApifyClient");
|
|
16
|
+
/**
 * Replaces the Apify/Crawlee entry points with test doubles and then initialises the Actor.
 *
 * @param options.vi - Object providing `spyOn`; used to install every mock below.
 * @param options.actorInput - Value the mocked `Actor.getInput` resolves with.
 * @param options.log - Optional logger forwarded to the mock storage client and mock dataset.
 * @param options.onPushData - Optional callback awaited by the mocked `Actor.pushData`;
 *   also forwarded to the mock dataset.
 * @param options.onBatchAddRequests - Optional callback forwarded to the mock storage client.
 * @param options.onGetInfo - Optional callback forwarded to the mock dataset.
 * @returns A promise that resolves after `Actor.init()` has completed.
 */
const setupMockApifyActor = ({ vi: viInstance, actorInput, log, onPushData, onBatchAddRequests, onGetInfo, }) => __awaiter(void 0, void 0, void 0, function* () {
|
|
17
|
+
// One storage client is shared by the mocked request queue and key-value store below.
const mockStorageClient = (0, mockApifyClient_1.createMockStorageClient)({ log, onBatchAddRequests });
|
|
18
|
+
// Actor.main simply runs the supplied function.
viInstance.spyOn(apify_1.Actor, 'main').mockImplementation((fn) => __awaiter(void 0, void 0, void 0, function* () { return fn(); }));
|
|
19
|
+
// Actor.getInput resolves with the caller-supplied input.
viInstance.spyOn(apify_1.Actor, 'getInput').mockImplementation(() => Promise.resolve(actorInput));
|
|
20
|
+
// Actor.openDataset returns a mock dataset wired to the caller's callbacks.
viInstance.spyOn(apify_1.Actor, 'openDataset').mockImplementation((datasetId, options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
21
|
+
// Note: written straight to console.log, not to the optional `log` callback.
console.log('Mock Actor.openDataset: ', datasetId);
|
|
22
|
+
return (0, mockApifyClient_1.createMockStorageDataset)(datasetId, options, { log, onPushData, onGetInfo });
|
|
23
|
+
}));
|
|
24
|
+
// Actor.pushData only notifies the caller; nothing is stored here.
viInstance.spyOn(apify_1.Actor, 'pushData').mockImplementation((data) => __awaiter(void 0, void 0, void 0, function* () {
|
|
25
|
+
console.log('Mock Actor.pushData');
|
|
26
|
+
if (onPushData)
|
|
27
|
+
yield onPushData(data);
|
|
28
|
+
}));
|
|
29
|
+
// RequestQueue.open builds a new queue with the fixed id 'test' on every call.
viInstance.spyOn(apify_1.RequestQueue, 'open').mockImplementation(() => __awaiter(void 0, void 0, void 0, function* () {
|
|
30
|
+
const reqQueue = new apify_1.RequestQueue({
|
|
31
|
+
id: 'test',
|
|
32
|
+
client: mockStorageClient,
|
|
33
|
+
});
|
|
34
|
+
return reqQueue;
|
|
35
|
+
}));
|
|
36
|
+
// KeyValueStore.open builds a new store with the fixed id 'keyvalstore' on every call.
viInstance
|
|
37
|
+
.spyOn(crawlee_1.KeyValueStore, 'open')
|
|
38
|
+
.mockImplementation(() => __awaiter(void 0, void 0, void 0, function* () { return new crawlee_1.KeyValueStore({ id: 'keyvalstore', client: mockStorageClient }); }));
|
|
39
|
+
// Initialise the Actor only after all mocks above are in place.
yield apify_1.Actor.init();
|
|
40
|
+
});
|
|
41
|
+
exports.setupMockApifyActor = setupMockApifyActor;
|
|
42
|
+
const runCrawlerTest = ({ vi: viInstance, input, runCrawler, log, onPushData, onBatchAddRequests, onDone = (done) => done(), }) => __awaiter(void 0, void 0, void 0, function* () {
|
|
43
|
+
yield new Promise((done) => __awaiter(void 0, void 0, void 0, function* () {
|
|
44
|
+
yield (0, exports.setupMockApifyActor)({
|
|
45
|
+
vi: viInstance,
|
|
46
|
+
actorInput: Object.assign({}, input),
|
|
47
|
+
log,
|
|
48
|
+
onPushData: (data) => onPushData === null || onPushData === void 0 ? void 0 : onPushData(data, done),
|
|
49
|
+
onBatchAddRequests,
|
|
50
|
+
});
|
|
51
|
+
yield runCrawler();
|
|
52
|
+
yield (onDone === null || onDone === void 0 ? void 0 : onDone(done));
|
|
53
|
+
}));
|
|
54
|
+
});
|
|
55
|
+
exports.runCrawlerTest = runCrawlerTest;
|
|
56
|
+
//# sourceMappingURL=actor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/test/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AACA,iCAA4C;AAC5C,qCAAoD;AAGpD,uDAI2B;AAEpB,MAAM,mBAAmB,GAAG,CAGjC,EACA,EAAE,EAAE,UAAU,EACd,UAAU,EACV,GAAG,EACH,UAAU,EACV,kBAAkB,EAClB,SAAS,GAQV,EAAE,EAAE;IACH,MAAM,iBAAiB,GAAG,IAAA,yCAAuB,EAAC,EAAE,GAAG,EAAE,kBAAkB,EAAE,CAAC,CAAC;IAE/E,UAAU,CAAC,KAAK,CAAC,aAAK,EAAE,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAO,EAAE,EAAE,EAAE,kDAAC,OAAA,EAAE,EAAE,CAAA,GAAA,CAAC,CAAC;IACvE,UAAU,CAAC,KAAK,CAAC,aAAK,EAAE,UAAU,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;IAE1F,UAAU,CAAC,KAAK,CAAC,aAAK,EAAE,aAAa,CAAC,CAAC,kBAAkB,CAAC,CAAO,SAAS,EAAE,OAAO,EAAE,EAAE;QACrF,OAAO,CAAC,GAAG,CAAC,0BAA0B,EAAE,SAAS,CAAC,CAAC;QACnD,OAAO,IAAA,0CAAwB,EAAC,SAAS,EAAE,OAAO,EAAE,EAAE,GAAG,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC;IACtF,CAAC,CAAA,CAAC,CAAC;IACH,UAAU,CAAC,KAAK,CAAC,aAAK,EAAE,UAAU,CAAC,CAAC,kBAAkB,CAAC,CAAO,IAAI,EAAE,EAAE;QACpE,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,IAAI,UAAU;YAAE,MAAM,UAAU,CAAC,IAAW,CAAC,CAAC;IAChD,CAAC,CAAA,CAAC,CAAC;IAEH,UAAU,CAAC,KAAK,CAAC,oBAAY,EAAE,MAAM,CAAC,CAAC,kBAAkB,CAAC,GAAS,EAAE;QACnE,MAAM,QAAQ,GAAG,IAAI,oBAAY,CAAC;YAChC,EAAE,EAAE,MAAM;YACV,MAAM,EAAE,iBAAiB;SAC1B,CAAC,CAAC;QACH,OAAO,QAAQ,CAAC;IAClB,CAAC,CAAA,CAAC,CAAC;IAEH,UAAU;SACP,KAAK,CAAC,uBAAa,EAAE,MAAM,CAAC;SAC5B,kBAAkB,CACjB,GAAS,EAAE,kDAAC,OAAA,IAAI,uBAAa,CAAC,EAAE,EAAE,EAAE,aAAa,EAAE,MAAM,EAAE,iBAAiB,EAAE,CAAC,CAAA,GAAA,CAChF,CAAC;IAEJ,MAAM,aAAK,CAAC,IAAI,EAAE,CAAC;AACrB,CAAC,CAAA,CAAC;AA/CW,QAAA,mBAAmB,uBA+C9B;AAEK,MAAM,cAAc,GAAG,CAAqD,EACjF,EAAE,EAAE,UAAU,EACd,KAAK,EACL,UAAU,EACV,GAAG,EACH,UAAU,EACV,kBAAkB,EAClB,MAAM,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,EAAE,GAS1B,EAAE,EAAE;IACH,MAAM,IAAI,OAAO,CAAO,CAAO,IAAI,EAAE,EAAE;QACrC,MAAM,IAAA,2BAAmB,EAAgB;YACvC,EAAE,EAAE,UAAU;YACd,UAAU,oBAAO,KAAK,CAAE;YACxB,GAAG;YACH,UAAU,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,UAAU,aAAV,UAAU,uBAAV,UAAU,CAAG,IAAI,EAAE,IAAI,CAAC;YAC9C,kBAAkB;SACnB,CAAC,CAAC;QAEH,MAAM,UAAU,EAAE,CAAC;QACnB,MAAM,CAAA
,MAAM,aAAN,MAAM,uBAAN,MAAM,CAAG,IAAI,CAAC,CAAA,CAAC;IACvB,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAA,CAAC;AA7BW,QAAA,cAAc,kBA6BzB","sourcesContent":["import type { vi } from 'vitest';\nimport { Actor, RequestQueue } from 'apify';\nimport { Dictionary, KeyValueStore } from 'crawlee';\n\nimport type { MaybeArray, MaybePromise } from '../../utils/types';\nimport {\n OnBatchAddRequests,\n createMockStorageClient,\n createMockStorageDataset,\n} from './mockApifyClient';\n\nexport const setupMockApifyActor = async <\n TInput,\n TData extends MaybeArray<Dictionary> = MaybeArray<Dictionary>\n>({\n vi: viInstance,\n actorInput,\n log,\n onPushData,\n onBatchAddRequests,\n onGetInfo,\n}: {\n vi: typeof vi;\n actorInput?: TInput;\n log?: (...args: any[]) => void;\n onPushData?: (data: TData) => MaybePromise<void>;\n onBatchAddRequests?: OnBatchAddRequests;\n onGetInfo?: (...args: any[]) => MaybePromise<void>;\n}) => {\n const mockStorageClient = createMockStorageClient({ log, onBatchAddRequests });\n\n viInstance.spyOn(Actor, 'main').mockImplementation(async (fn) => fn());\n viInstance.spyOn(Actor, 'getInput').mockImplementation(() => Promise.resolve(actorInput));\n\n viInstance.spyOn(Actor, 'openDataset').mockImplementation(async (datasetId, options) => {\n console.log('Mock Actor.openDataset: ', datasetId);\n return createMockStorageDataset(datasetId, options, { log, onPushData, onGetInfo });\n });\n viInstance.spyOn(Actor, 'pushData').mockImplementation(async (data) => {\n console.log('Mock Actor.pushData');\n if (onPushData) await onPushData(data as any);\n });\n\n viInstance.spyOn(RequestQueue, 'open').mockImplementation(async () => {\n const reqQueue = new RequestQueue({\n id: 'test',\n client: mockStorageClient,\n });\n return reqQueue;\n });\n\n viInstance\n .spyOn(KeyValueStore, 'open')\n .mockImplementation(\n async () => new KeyValueStore({ id: 'keyvalstore', client: mockStorageClient })\n );\n\n await Actor.init();\n};\n\nexport const runCrawlerTest = async <TData extends 
MaybeArray<Dictionary>, TInput>({\n vi: viInstance,\n input,\n runCrawler,\n log,\n onPushData,\n onBatchAddRequests,\n onDone = (done) => done(),\n}: {\n vi: typeof vi;\n input: TInput;\n runCrawler: () => MaybePromise<void>;\n log?: (...args: any[]) => void;\n onPushData?: (data: any, done: () => void) => MaybePromise<void>;\n onBatchAddRequests?: OnBatchAddRequests;\n onDone?: (done: () => void) => MaybePromise<void>;\n}) => {\n await new Promise<void>(async (done) => {\n await setupMockApifyActor<TInput, TData>({\n vi: viInstance,\n actorInput: { ...input },\n log,\n onPushData: (data) => onPushData?.(data, done),\n onBatchAddRequests,\n });\n\n await runCrawler();\n await onDone?.(done);\n });\n};\n"]}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { Dataset, OpenStorageOptions } from 'apify';
|
|
2
|
+
import type { RequestQueue as ClientRequestQueue, RequestQueueClientBatchAddRequestWithRetriesOptions, RequestQueueClientRequestSchema } from 'apify-client';
|
|
3
|
+
import type { Dataset as ClientDataset, DatasetCollectionClient, KeyValueStoreClient, RequestQueueClient } from 'apify-client';
|
|
4
|
+
import type { StorageClient } from 'crawlee';
|
|
5
|
+
import type { MaybePromise } from '../../utils/types';
|
|
6
|
+
/**
 * Argument tuple for a batch-add-requests call: the requests (schema without `id`)
 * followed by optional retry options.
 */
type BatchAddRequestsArgs = [
|
|
7
|
+
requests: Omit<RequestQueueClientRequestSchema, 'id'>[],
|
|
8
|
+
options?: RequestQueueClientBatchAddRequestWithRetriesOptions
|
|
9
|
+
];
|
|
10
|
+
/**
 * Callback that receives the `BatchAddRequestsArgs` tuple; may be sync or async.
 * NOTE(review): presumably invoked by the mock request queue client when requests are
 * batch-added; confirm against the implementation.
 */
export type OnBatchAddRequests = (...args: BatchAddRequestsArgs) => MaybePromise<void>;
|
|
11
|
+
/**
 * Returns a `ClientDataset` object for tests.
 * NOTE(review): `overrides` presumably replaces default field values; the implementation
 * is not part of this declaration file, so confirm there.
 */
export declare const createMockClientDataset: (overrides?: ClientDataset) => ClientDataset;
|
|
12
|
+
/**
 * Returns a `ClientRequestQueue` object for tests.
 * NOTE(review): `overrides` presumably replaces default field values; the implementation
 * is not part of this declaration file, so confirm there.
 */
export declare const createMockClientRequestQueue: (overrides?: ClientRequestQueue) => ClientRequestQueue;
|
|
13
|
+
/**
 * Factory typed to return a `KeyValueStoreClient`; the whole options object may be omitted.
 */
export declare const createMockKeyValueStoreClient: ({ log, }?: {
|
|
14
|
+
/** Optional single-argument logger. */
log?: ((args: any) => void) | undefined;
|
|
15
|
+
}) => KeyValueStoreClient;
|
|
16
|
+
/**
 * Factory typed to return a `RequestQueueClient`; the whole options object may be omitted.
 */
export declare const createMockRequestQueueClient: ({ log, onBatchAddRequests, }?: {
|
|
17
|
+
/** Optional single-argument logger. */
log?: ((args: any) => void) | undefined;
|
|
18
|
+
/** Optional callback receiving the batch-add-requests arguments. */
onBatchAddRequests?: OnBatchAddRequests | undefined;
|
|
19
|
+
}) => RequestQueueClient;
|
|
20
|
+
/**
 * Factory typed to return a `DatasetCollectionClient`; the whole options object may be omitted.
 */
export declare const createMockDatasetCollectionClient: ({ log, }?: {
|
|
21
|
+
/** Optional single-argument logger. */
log?: ((args: any) => void) | undefined;
|
|
22
|
+
}) => DatasetCollectionClient;
|
|
23
|
+
/**
 * Factory typed to return a Crawlee `StorageClient`; the whole options object may be omitted.
 * In this package's test helper, the result is passed as the `client` of the mocked
 * `RequestQueue` and `KeyValueStore`.
 */
export declare const createMockStorageClient: ({ log, onBatchAddRequests, }?: {
|
|
24
|
+
/** Optional single-argument logger. */
log?: ((args: any) => void) | undefined;
|
|
25
|
+
/** Optional callback receiving the batch-add-requests arguments. */
onBatchAddRequests?: OnBatchAddRequests | undefined;
|
|
26
|
+
}) => StorageClient;
|
|
27
|
+
/**
 * Async factory typed to resolve with an Apify `Dataset`; every parameter is optional.
 * In this package's test helper, it backs the mocked `Actor.openDataset`, which forwards
 * its `datasetId` and `options` arguments unchanged.
 */
export declare const createMockStorageDataset: (datasetId?: string | null | undefined, options?: OpenStorageOptions | undefined, custom?: {
|
|
28
|
+
/** Optional variadic logger. */
log?: ((...args: any[]) => void) | undefined;
|
|
29
|
+
/** Optional callback; NOTE(review): presumably fired when data is pushed to the dataset; confirm. */
onPushData?: ((...args: any[]) => MaybePromise<void>) | undefined;
|
|
30
|
+
/** Optional callback; NOTE(review): presumably fired when dataset info is requested; confirm. */
onGetInfo?: ((...args: any[]) => MaybePromise<void>) | undefined;
|
|
31
|
+
} | undefined) => Promise<Dataset<any>>;
|
|
32
|
+
export {};
|