npm - crawlee-one - Versions diffs - 1.1.2 → 1.1.3 - Mend

crawlee-one 1.1.2 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/cjs/composer.js +76 -22
package/dist/cjs/composer.js.map +1 -1
package/dist/cjs/config/types.d.ts +0 -0
package/dist/cjs/config/types.js +38 -0
package/dist/cjs/config/types.js.map +1 -0
package/dist/cjs/index.d.ts +0 -3
package/dist/cjs/index.js +0 -3
package/dist/cjs/index.js.map +1 -1
package/package.json +1 -1

package/dist/cjs/composer.js CHANGED Viewed

@@ -17,6 +17,45 @@
 //   handler: detailPageHandler
 // }
 // ```
+// !!!!!!!!!!
+// UPDATES - START
+// !!!!!!!!!!
+// THE CONFIG SHOULD BE FIRST DEFINED AS PLAIN OBJECT, SO NO JS/TS,
+// SO WE CAN IMPORT IT FOR TYPE GENERATION.
+// SO IT SHOULD AT FIRST LOOK LIKE THIS (COULD BE JS/TS/JSON/YAML/...):
+// ```
+// {
+//   version: 1,
+//   schema: {
+//     crawlers: {
+//       mainCrawler {
+//         type: 'playwright'
+//         routes: ['listingPage', 'detailPage', 'home']
+//       }
+//     },
+//   }
+// };
+// ```
+// WE IMPORT THAT USING THE COSMICCONFIG (with https://github.com/codex-/cosmiconfig-typescript-loader)
+// See https://github.com/cosmiconfig/cosmiconfig#usage-for-tooling-developers
+//
+// AT THIS POINT DON'T FORGET TO HAVE A VALIDATION SCHEMA TO COMPARE THE LOADED
+// CONFIG AGAINST. THROW ERROR IF INVALID.
+//
+// I SHOULD GENERATE TYPES FOR:
+// - The types I mentioned below
+// - Each Crawler based on their type (e.g. CheerioCrawleeOneCrawler<Labels, Inputs, ...>) (But also named
+//   variants like `mainCrawlerCrawler`)
+// - CRAWLER_NAME_ENUM
+// - All Crawlers obj = { `CrawlerName`: CheerioCrawleeOneCrawler, ... }
+// - Each Route based on their type (e.g. CheerioCrawleeOneRoute<Labels, Inputs, ...>) (But also named
+//   variants like `detailPageRoute`)
+// - CRAWLER_ROUTE_ENUM - e.g. `CrawlerName`RouteLabel = ...
+// - All Crawler Routes objs = { `detailPage`: detailPageRoute, ... }
+// - The whole object of { crawlers: { ...}, routes: { ... } }
+// !!!!!!!!!!
+// UPDATES - END
+// !!!!!!!!!!
 // As JS:
 // ```js
 // import { detailPageHandler } from './handlers';
@@ -39,8 +78,9 @@
 //       input?: Partial<AllActorInputs>
 ///////// Hooks /////////
 //       hooks?: {
-//         validateInput?: (input) => MaybePromise<void>
-//         onActorReady?: (actor) => MaybePromise<void>
+//         validateInput?: (input | null) => MaybePromise<void>
+//         onActorReady?: (actor) => MaybePromise<void> // NOTE: Move onActorReady FN type to own public type
+//                        (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;
 //         onBeforeHandler?: (ctx) => MaybePromise<void>
 //         onAfterHandler?: (ctx) => MaybePromise<void>
 //       }
@@ -84,7 +124,7 @@
 // ```
 // HOW TO GENERATE TYPE FOR HANDLERS?
 // ```ts
-// type detailPageHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>
+// type detailPageRouteHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>
 // ```
 //
 // 1. For each route:
@@ -110,36 +150,50 @@
 //      ```ts
 //      type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>
 //      ```
-//
 // HOW TO GENERATE TYPE FOR MATCHERS?
 // ```ts
-// type `Label`Matcher = CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>
+// type `Label`RouteMatcher = CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>
 // ```
 //
 // 1. For each route:
-//   1.1 Get `CrawlingContext`
-//     1.1.1 Take `mainCrawler`, and find corresponding crawler. If no `mainCrawler`, there should
-//           be only 1 crawler, and take that (if more crawler, there should've been an error).
-//     1.1.2 Find the `crawler.type`
-//     1.1.3 Take corresponding type based on `crawler.type`, e.g. 'cheerio' => `CheerioCrawlingContext`;
-//   1.2 Get actor router context, e.g.:
+//   1.1 Get `CrawlingContext` (See HANDLERS 1.1)
+//   1.2 Get actor router context (See HANDLERS 1.2)
+//   1.3 Create `CrawlerName`Label type (See HANDLERS 1.2.2)
+//   1.4 Put it together:
 //      ```ts
-//      type `CrawlerName`RouterContext = CrawleeOneActorRouterCtx<`type`CrawlingContext, `CrawlerName`RouteLabel, AllActorInput>;
-//      // NOTE: We use `AllActorInput` because since it's in the code, then we can handle ALL inputs
+//      type `Label`Matcher = CrawleeOneRouteMatcher<`CrawlerName`Label, `CrawlerName`RouterContext, `type`CrawlingContext>
 //      ```
-//     1.2.1 Use same CrawlingContext as in step 1.1 (e.g. `CheerioCrawlingContext`).
-//     1.2.2 Create `CrawlerName`Label type, e.g.
-//        ```ts
-//        type `CrawlerName`RouteLabel = "detailPage" | "otherLabel" | ...;
-//        ```
-//       1.2.2.1 Take key (crawler name), and filter for all routes where `route.crawler == key`
-//       1.2.2.2 Take the keys of these routes
-//       1.2.2.3 Generate `type ${key}Label = ${keys.map((s) => '"' + s + '"').join(' | ')}`
+// HOW TO GENERATE TYPE FOR ON_BEFORE AND ON_AFTER?
+// ```ts
+// type `CrawlerName`OnBeforeHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>
+// type `CrawlerName`OnAfterHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>
+// ```
+//
+// It's the same as for HANDLERS!
+// HOW TO GENERATE TYPE FOR ON_ACTOR_READY?
+// ```ts
+// type `CrawlerName`OnActorReady = (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;
+// ```
+//
+// 1. For each crawler:
+//   1.1 Get `CrawlingContext` (See HANDLERS 1.1)
+//   1.2 Create `CrawlerName`Label type (See HANDLERS 1.2.2)
 //   1.3 Put it together:
 //      ```ts
-//      type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>
+//      type `CrawlerName`OnActorReady = <TIO, Telem>(actor: CrawleeOneActorCtx<`CrawlerName`Label, AllActorInputs, TIO, Telem, `type`CrawlingContext>) => MaybePromise<void>;
 //      ```
+// HOW TO FIND THE FILE BASED ON WHICH TO GENERATE?
+//
+// 1. User has to specify:
+//  - Path to file that exports
+//  - Whether it's TS or JS (or can be inferred based on extension)
 //
+// ```
+// presenter@Juros-MacBook-Pro apify-actor-utils % npx ts-node --project tsconfig.base.json -e 'import config from "./src/lib/router/router"; console.log(config); // NOTE: I HAD TO TRIM OFF THE EXTENSION'
+// { hello: 'world' }
+// presenter@Juros-MacBook-Pro apify-actor-utils % pwd
+// /Users/presenter/repos/apify-actor-utils
+// ```
 // NOTES:
 // - Enum with available route labels would be extracted from this definition.
 // - If there is only 1 crawler defined, all routes use that. If there is more crawlers,

package/dist/cjs/composer.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"composer.js","sourceRoot":"","sources":["../../src/composer.ts"],"names":[],"mappings":";AAAA,cAAc;AACd,iCAAiC;AAEjC,IAAI;AAEJ,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,qBAAqB;AACrB,0EAA0E;AAC1E,uEAAuE;AACvE,+BAA+B;AAC/B,IAAI;AACJ,MAAM;AAEN,SAAS;AACT,QAAQ;AACR,kDAAkD;AAElD,uBAAuB;AACvB,gBAAgB;AAChB,oBAAoB;AACpB,2BAA2B;AAC3B,EAAE;AACF,2CAA2C;AAC3C,kFAAkF;AAClF,uBAAuB;AACvB,+BAA+B;AAC/B,kCAAkC;AAClC,2GAA2G;AAC3G,kEAAkE;AAClE,0FAA0F;AAC1F,gDAAgD;AAChD,6BAA6B;AAC7B,gDAAgD;AAChD,wCAAwC;AACxC,yBAAyB;AACzB,kBAAkB;AAClB,wDAAwD;AACxD,uDAAuD;AACvD,wDAAwD;AACxD,uDAAuD;AACvD,UAAU;AACV,qCAAqC;AACrC,gBAAgB;AAChB,oBAAoB;AACpB,aAAa;AACb,iBAAiB;AACjB,QAAQ;AACR,OAAO;AACP,EAAE;AACF,cAAc;AACd,oBAAoB;AACpB,gCAAgC;AAChC,EAAE;AACF,4EAA4E;AAC5E,WAAW;AACX,sDAAsD;AACtD,0DAA0D;AAC1D,mGAAmG;AACnG,oEAAoE;AACpE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,sEAAsE;AACtE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,uEAAuE;AACvE,wDAAwD;AACxD,WAAW;AACX,EAAE;AACF,oCAAoC;AACpC,kCAAkC;AAClC,6BAA6B;AAC7B,iCAAiC;AACjC,WAAW;AACX,QAAQ;AACR,MAAM;AACN,KAAK;AACL,MAAM;AAEN,qCAAqC;AACrC,QAAQ;AACR,iGAAiG;AACjG,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,8BAA8B;AAC9B,kGAAkG;AAClG,+FAA+F;AAC/F,oCAAoC;AACpC,yGAAyG;AACzG,wCAAwC;AACxC,aAAa;AACb,kIAAkI;AAClI,qGAAqG;AACrG,WAAW;AACX,qFAAqF;AACrF,iDAAiD;AACjD,eAAe;AACf,2EAA2E;AAC3E,aAAa;AACb,gGAAgG;AAChG,8CAA8C;AAC9C,4FAA4F;AAC5F,yBAAyB;AACzB,aAAa;AACb,uGAAuG;AACvG,WAAW;AACX,EAAE;AAEF,qCAAqC;AACrC,QAAQ;AACR,8EAA8E;AAC9E,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,8BAA8B;AAC9B,kGAAkG;AAClG,+FAA+F;AAC/F,oCAAoC;AACpC,yGAAyG;AACzG,wCAAwC;AACxC,aAAa;AACb,kIAAkI;AAClI,qGAAqG;AACrG,WAAW;AACX,qFAAqF;AACrF,iDAAiD;AACjD,eAAe;AACf,2EAA2E;AAC3E,aAAa;AACb,gGAAgG;AAChG,8CAA8C;AAC9C,4FAA4F;AAC5F,yBAAyB;AACzB,aAAa;AACb,uGAAuG;AACvG,WAAW;AACX,EAAE;AAEF,SAAS;AACT,8EAA8E;AAC9E,wFAAwF;AACxF,oDAAoD;AAEpD,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,QAAQ;AACR,IAAI;AAEJ,kCAAkC;AAClC,kBAAkB;AAClB,uBAAuB;AACvB,6
BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,4BAA4B;AAC5B,QAAQ;AACR,IAAI;AAEJ,wBAAwB;AACxB,uBAAuB;AACvB,QAAQ;AACR,IAAI;AAEJ,4BAA4B;AAC5B,qBAAqB;AACrB,QAAQ;AACR,IAAI;AAEJ,qBAAqB;AACrB,sCAAsC;AACtC,0EAA0E;AAC1E,uEAAuE;AACvE,gCAAgC;AAChC,IAAI;AAEJ,kBAAkB;AAClB,4BAA4B;AAC5B,qFAAqF;AACrF,yBAAyB;AACzB,qEAAqE;AACrE,kDAAkD;AAClD,sDAAsD;AACtD,+FAA+F;AAC/F,gEAAgE;AAChE,OAAO;AACP,IAAI;AACJ,MAAM","sourcesContent":["// @ts-nocheck\n// interface ComposerCrawlerDef {\n\n// }\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// }\n\n// route detailPage {\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i\n// handler: detailPageHandler\n// }\n// ```\n\n// As JS:\n// ```js\n// import { detailPageHandler } from './handlers';\n\n// const scraperDef = {\n// crawlers: {\n// mainCrawler {\n// type: 'playwright'\n//\n///////// Override crawler config /////////\n// //////// { ...crawlerConfigDefaults, ...io.getInput(), ...crawlerConfig }\n// crawlerConfig?\n// crawlerConfigDefaults?\n///////// Override input /////////\n// ///// If mergeInput = true, will merge inputDefaults, input, and io.getInput() similarly to config\n// //////// { ...inputDefaults, ...io.getInput(), ...input }\n// ///// If mergeInput = false, io.getInput() will be ignored if `input` is provided\n// //////// { ...inputDefaults, ...input }\n// mergeInput?: boolean\n// inputDefaults?: Partial<AllActorInputs>\n// input?: Partial<AllActorInputs>\n///////// Hooks /////////\n// hooks?: {\n// validateInput?: (input) => MaybePromise<void>\n// onActorReady?: (actor) => MaybePromise<void>\n// onBeforeHandler?: (ctx) => MaybePromise<void>\n// onAfterHandler?: (ctx) => MaybePromise<void>\n// }\n///////// Override services /////////\n// proxy?,\n// telemetry?,\n// io?,\n// router?,\n// }\n// },\n//\n// routes: {\n// detailPage: {\n// 
crawler?: 'mainCrawler'\n//\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i,\n// OR\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// OR\n// match: [\n// /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i\n// ],\n// OR\n// match: [\n// /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i,\n// async (url, ctx, route, handlers) => { ... },\n// ],\n//\n// handler: detailPageHandler,\n// handler: async (ctx) => {\n// ctx.actor.pushData\n// ctx.actor.pushRequests\n// },\n// }\n// }\n// };\n// ```\n\n// HOW TO GENERATE TYPE FOR HANDLERS?\n// ```ts\n// type detailPageHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>\n// ```\n//\n// 1. For each route:\n// 1.1 Get `CrawlingContext`\n// 1.1.1 Take `mainCrawler`, and find corresponding crawler. If no `mainCrawler`, there should\n// be only 1 crawler, and take that (if more crawler, there should've been an error).\n// 1.1.2 Find the `crawler.type`\n// 1.1.3 Take corresponding type based on `crawler.type`, e.g. 'cheerio' => `CheerioCrawlingContext`;\n// 1.2 Get actor router context, e.g.:\n// ```ts\n// type `CrawlerName`RouterContext = CrawleeOneActorRouterCtx<`type`CrawlingContext, `CrawlerName`RouteLabel, AllActorInput>;\n// // NOTE: We use `AllActorInput` because since it's in the code, then we can handle ALL inputs\n// ```\n// 1.2.1 Use same CrawlingContext as in step 1.1 (e.g. 
`CheerioCrawlingContext`).\n// 1.2.2 Create `CrawlerName`Label type, e.g.\n// ```ts\n// type `CrawlerName`RouteLabel = \"detailPage\" \| \"otherLabel\" \| ...;\n// ```\n// 1.2.2.1 Take key (crawler name), and filter for all routes where `route.crawler == key`\n// 1.2.2.2 Take the keys of these routes\n// 1.2.2.3 Generate `type ${key}Label = ${keys.map((s) => '\"' + s + '\"').join(' \| ')}`\n// 1.3 Put it together:\n// ```ts\n// type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>\n// ```\n//\n\n// HOW TO GENERATE TYPE FOR MATCHERS?\n// ```ts\n// type `Label`Matcher = CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>\n// ```\n//\n// 1. For each route:\n// 1.1 Get `CrawlingContext`\n// 1.1.1 Take `mainCrawler`, and find corresponding crawler. If no `mainCrawler`, there should\n// be only 1 crawler, and take that (if more crawler, there should've been an error).\n// 1.1.2 Find the `crawler.type`\n// 1.1.3 Take corresponding type based on `crawler.type`, e.g. 'cheerio' => `CheerioCrawlingContext`;\n// 1.2 Get actor router context, e.g.:\n// ```ts\n// type `CrawlerName`RouterContext = CrawleeOneActorRouterCtx<`type`CrawlingContext, `CrawlerName`RouteLabel, AllActorInput>;\n// // NOTE: We use `AllActorInput` because since it's in the code, then we can handle ALL inputs\n// ```\n// 1.2.1 Use same CrawlingContext as in step 1.1 (e.g. 
`CheerioCrawlingContext`).\n// 1.2.2 Create `CrawlerName`Label type, e.g.\n// ```ts\n// type `CrawlerName`RouteLabel = \"detailPage\" \| \"otherLabel\" \| ...;\n// ```\n// 1.2.2.1 Take key (crawler name), and filter for all routes where `route.crawler == key`\n// 1.2.2.2 Take the keys of these routes\n// 1.2.2.3 Generate `type ${key}Label = ${keys.map((s) => '\"' + s + '\"').join(' \| ')}`\n// 1.3 Put it together:\n// ```ts\n// type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>\n// ```\n//\n\n// NOTES:\n// - Enum with available route labels would be extracted from this definition.\n// - If there is only 1 crawler defined, all routes use that. If there is more crawlers,\n// they should define which crawler it relates to.\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// ...\n// }\n\n// crawler productDetailsCrawler {\n// type: cheerio\n// datasetId: '45678'\n// requestQueueId: 'abcdef'\n// options: {\n// ...\n// }\n// }\n\n// requestQueue extraQueue {\n// ...\n// }\n\n// dataset mainDataset {\n// datasetId: '45678'\n// ...\n// }\n\n// keyValueStore mainStore {\n// datasetId: 'xyz'\n// ...\n// }\n\n// route detailPage {\n// crawler: 'productDetailsCrawler',\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i\n// handler: detailPageHandler,\n// }\n\n// route listing {\n// crawler: 'mainCrawler',\n// // Note: route object name is the 'label' by default, but label can be overriden\n// label: 'DETAIL_PAGE'\n// // NOTE: Otherwise `match` is a function that returns true/false\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// 
}\n// ```\n"]}
1	+ {"version":3,"file":"composer.js","sourceRoot":"","sources":["../../src/composer.ts"],"names":[],"mappings":";AAAA,cAAc;AACd,iCAAiC;AAEjC,IAAI;AAEJ,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,qBAAqB;AACrB,0EAA0E;AAC1E,uEAAuE;AACvE,+BAA+B;AAC/B,IAAI;AACJ,MAAM;AAEN,aAAa;AACb,kBAAkB;AAClB,aAAa;AAEb,mEAAmE;AACnE,2CAA2C;AAC3C,uEAAuE;AACvE,MAAM;AACN,IAAI;AACJ,gBAAgB;AAChB,cAAc;AACd,kBAAkB;AAClB,sBAAsB;AACtB,6BAA6B;AAC7B,uDAAuD;AACvD,UAAU;AACV,SAAS;AACT,MAAM;AACN,KAAK;AACL,MAAM;AACN,uGAAuG;AACvG,8EAA8E;AAC9E,EAAE;AACF,8EAA8E;AAC9E,0CAA0C;AAC1C,EAAE;AACF,+BAA+B;AAC/B,gCAAgC;AAChC,0GAA0G;AAC1G,wCAAwC;AACxC,sBAAsB;AACtB,wEAAwE;AACxE,sGAAsG;AACtG,qCAAqC;AACrC,4DAA4D;AAC5D,qEAAqE;AACrE,8DAA8D;AAE9D,aAAa;AACb,gBAAgB;AAChB,aAAa;AAEb,SAAS;AACT,QAAQ;AACR,kDAAkD;AAElD,uBAAuB;AACvB,gBAAgB;AAChB,oBAAoB;AACpB,2BAA2B;AAC3B,EAAE;AACF,2CAA2C;AAC3C,kFAAkF;AAClF,uBAAuB;AACvB,+BAA+B;AAC/B,kCAAkC;AAClC,2GAA2G;AAC3G,kEAAkE;AAClE,0FAA0F;AAC1F,gDAAgD;AAChD,6BAA6B;AAC7B,gDAAgD;AAChD,wCAAwC;AACxC,yBAAyB;AACzB,kBAAkB;AAClB,+DAA+D;AAC/D,6GAA6G;AAC7G,4GAA4G;AAC5G,wDAAwD;AACxD,uDAAuD;AACvD,UAAU;AACV,qCAAqC;AACrC,gBAAgB;AAChB,oBAAoB;AACpB,aAAa;AACb,iBAAiB;AACjB,QAAQ;AACR,OAAO;AACP,EAAE;AACF,cAAc;AACd,oBAAoB;AACpB,gCAAgC;AAChC,EAAE;AACF,4EAA4E;AAC5E,WAAW;AACX,sDAAsD;AACtD,0DAA0D;AAC1D,mGAAmG;AACnG,oEAAoE;AACpE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,sEAAsE;AACtE,WAAW;AACX,WAAW;AACX,iBAAiB;AACjB,uEAAuE;AACvE,wDAAwD;AACxD,WAAW;AACX,EAAE;AACF,oCAAoC;AACpC,kCAAkC;AAClC,6BAA6B;AAC7B,iCAAiC;AACjC,WAAW;AACX,QAAQ;AACR,MAAM;AACN,KAAK;AACL,MAAM;AAEN,qCAAqC;AACrC,QAAQ;AACR,sGAAsG;AACtG,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,8BAA8B;AAC9B,kGAAkG;AAClG,+FAA+F;AAC/F,oCAAoC;AACpC,yGAAyG;AACzG,wCAAwC;AACxC,aAAa;AACb,kIAAkI;AAClI,qGAAqG;AACrG,WAAW;AACX,qFAAqF;AACrF,iDAAiD;AACjD,eAAe;AACf,2EAA2E;AAC3E,aAAa;AACb,gGAAgG;AAChG,8CAA8C;AAC9C,4FAA4F;AAC5F,yBAAyB;AACzB,aAAa;AACb,uGAAuG;AACvG,WAAW;AAEX,qCAAqC;AACrC,QAAQ;AACR,mFAAmF;AACnF,MAAM;AACN,EAAE;AACF,qBAAqB;AACrB,iDAAiD;AACj
D,oDAAoD;AACpD,4DAA4D;AAC5D,yBAAyB;AACzB,aAAa;AACb,2HAA2H;AAC3H,WAAW;AAEX,mDAAmD;AACnD,QAAQ;AACR,4GAA4G;AAC5G,2GAA2G;AAC3G,MAAM;AACN,EAAE;AACF,iCAAiC;AAEjC,2CAA2C;AAC3C,QAAQ;AACR,sHAAsH;AACtH,MAAM;AACN,EAAE;AACF,uBAAuB;AACvB,iDAAiD;AACjD,4DAA4D;AAC5D,yBAAyB;AACzB,aAAa;AACb,8KAA8K;AAC9K,WAAW;AAEX,mDAAmD;AACnD,EAAE;AACF,0BAA0B;AAC1B,+BAA+B;AAC/B,mEAAmE;AACnE,EAAE;AACF,MAAM;AACN,4MAA4M;AAC5M,qBAAqB;AACrB,sDAAsD;AACtD,2CAA2C;AAC3C,MAAM;AAEN,SAAS;AACT,8EAA8E;AAC9E,wFAAwF;AACxF,oDAAoD;AAEpD,MAAM;AACN,wBAAwB;AACxB,qBAAqB;AACrB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,QAAQ;AACR,IAAI;AAEJ,kCAAkC;AAClC,kBAAkB;AAClB,uBAAuB;AACvB,6BAA6B;AAC7B,eAAe;AACf,UAAU;AACV,MAAM;AACN,IAAI;AAEJ,4BAA4B;AAC5B,QAAQ;AACR,IAAI;AAEJ,wBAAwB;AACxB,uBAAuB;AACvB,QAAQ;AACR,IAAI;AAEJ,4BAA4B;AAC5B,qBAAqB;AACrB,QAAQ;AACR,IAAI;AAEJ,qBAAqB;AACrB,sCAAsC;AACtC,0EAA0E;AAC1E,uEAAuE;AACvE,gCAAgC;AAChC,IAAI;AAEJ,kBAAkB;AAClB,4BAA4B;AAC5B,qFAAqF;AACrF,yBAAyB;AACzB,qEAAqE;AACrE,kDAAkD;AAClD,sDAAsD;AACtD,+FAA+F;AAC/F,gEAAgE;AAChE,OAAO;AACP,IAAI;AACJ,MAAM","sourcesContent":["// @ts-nocheck\n// interface ComposerCrawlerDef {\n\n// }\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// }\n\n// route detailPage {\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i\n// handler: detailPageHandler\n// }\n// ```\n\n// !!!!!!!!!!\n// UPDATES - START\n// !!!!!!!!!!\n\n// THE CONFIG SHOULD BE FIRST DEFINED AS PLAIN OBJECT, SO NO JS/TS,\n// SO WE CAN IMPORT IT FOR TYPE GENERATION.\n// SO IT SHOULD AT FIRST LOOK LIKE THIS (COULD BE JS/TS/JSON/YAML/...):\n// ```\n// {\n// version: 1,\n// schema: {\n// crawlers: {\n// mainCrawler {\n// type: 'playwright'\n// routes: ['listingPage', detailPage', 'home']\n// }\n// },\n// }\n// };\n// ```\n// WE IMPORT THAT USING THE COSMICCONFIG (with 
https://github.com/codex-/cosmiconfig-typescript-loader)\n// See https://github.com/cosmiconfig/cosmiconfig#usage-for-tooling-developers\n//\n// AT THIS POINT DONT FORGET TO HAVE A VALIDATION SCHEMA TO COMPARE THE LAODED\n// CONFIG AGAINST. THROW ERROR IF INVALID.\n//\n// I SHOULD GENERATE TYPES FOR:\n// - The types I mentioned below\n// - Each Crawler based on their type (e.g. CheerioCrawleeOneCrawler<Labels, Inputs, ...>) (But also named\n// variants like `mainCrawlerCrawler`)\n// - CRAWLER_NAME_ENUM\n// - All Crawlers obj = { `CrawlerName`: CheerioCrawleeOneCrawler, ... }\n// - Each Route based on their type (e.g. CheerioCrawleeOneRoute<Labels, Inputs, ...>) (But also named\n// variants like `detailPageRoute`)\n// - CRAWLER_ROUTE_ENUM - e.g. `CrawlerName`RouteLabel = ...\n// - All Crawler Routes objs = { `detailPage`: detailPageRoute, ... }\n// - The whole object of { crawlers: { ...}, routes: { ... } }\n\n// !!!!!!!!!!\n// UPDATES - END\n// !!!!!!!!!!\n\n// As JS:\n// ```js\n// import { detailPageHandler } from './handlers';\n\n// const scraperDef = {\n// crawlers: {\n// mainCrawler {\n// type: 'playwright'\n//\n///////// Override crawler config /////////\n// //////// { ...crawlerConfigDefaults, ...io.getInput(), ...crawlerConfig }\n// crawlerConfig?\n// crawlerConfigDefaults?\n///////// Override input /////////\n// ///// If mergeInput = true, will merge inputDefaults, input, and io.getInput() similarly to config\n// //////// { ...inputDefaults, ...io.getInput(), ...input }\n// ///// If mergeInput = false, io.getInput() will be ignored if `input` is provided\n// //////// { ...inputDefaults, ...input }\n// mergeInput?: boolean\n// inputDefaults?: Partial<AllActorInputs>\n// input?: Partial<AllActorInputs>\n///////// Hooks /////////\n// hooks?: {\n// validateInput?: (input \| null) => MaybePromise<void>\n// onActorReady?: (actor) => MaybePromise<void> // NOTE: Move onACtorReady FN type to own public type\n// (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, 
Ctx>) => MaybePromise<void>;\n// onBeforeHandler?: (ctx) => MaybePromise<void>\n// onAfterHandler?: (ctx) => MaybePromise<void>\n// }\n///////// Override services /////////\n// proxy?,\n// telemetry?,\n// io?,\n// router?,\n// }\n// },\n//\n// routes: {\n// detailPage: {\n// crawler?: 'mainCrawler'\n//\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i,\n// OR\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// OR\n// match: [\n// /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i\n// ],\n// OR\n// match: [\n// /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i,\n// async (url, ctx, route, handlers) => { ... },\n// ],\n//\n// handler: detailPageHandler,\n// handler: async (ctx) => {\n// ctx.actor.pushData\n// ctx.actor.pushRequests\n// },\n// }\n// }\n// };\n// ```\n\n// HOW TO GENERATE TYPE FOR HANDLERS?\n// ```ts\n// type detailPageRouteHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>\n// ```\n//\n// 1. For each route:\n// 1.1 Get `CrawlingContext`\n// 1.1.1 Take `mainCrawler`, and find corresponding crawler. If no `mainCrawler`, there should\n// be only 1 crawler, and take that (if more crawler, there should've been an error).\n// 1.1.2 Find the `crawler.type`\n// 1.1.3 Take corresponding type based on `crawler.type`, e.g. 'cheerio' => `CheerioCrawlingContext`;\n// 1.2 Get actor router context, e.g.:\n// ```ts\n// type `CrawlerName`RouterContext = CrawleeOneActorRouterCtx<`type`CrawlingContext, `CrawlerName`RouteLabel, AllActorInput>;\n// // NOTE: We use `AllActorInput` because since it's in the code, then we can handle ALL inputs\n// ```\n// 1.2.1 Use same CrawlingContext as in step 1.1 (e.g. 
`CheerioCrawlingContext`).\n// 1.2.2 Create `CrawlerName`Label type, e.g.\n// ```ts\n// type `CrawlerName`RouteLabel = \"detailPage\" \| \"otherLabel\" \| ...;\n// ```\n// 1.2.2.1 Take key (crawler name), and filter for all routes where `route.crawler == key`\n// 1.2.2.2 Take the keys of these routes\n// 1.2.2.3 Generate `type ${key}Label = ${keys.map((s) => '\"' + s + '\"').join(' \| ')}`\n// 1.3 Put it together:\n// ```ts\n// type `Label`Handler = CrawleeOneRouteHandler<`type`CrawlingContext, `CrawlerName`RouterContext>\n// ```\n\n// HOW TO GENERATE TYPE FOR MATCHERS?\n// ```ts\n// type `Label`RouteMatcher = CrawleeOneRouteMatcher<Labels, RouterCtx, CrawlerCtx>\n// ```\n//\n// 1. For each route:\n// 1.1 Get `CrawlingContext` (See HANDLERS 1.1)\n// 1.2 Get actor router context (See HANDLERS 1.2)\n// 1.3 Create `CrawlerName`Label type (See HANDLERS 1.2.2)\n// 1.4 Put it together:\n// ```ts\n// type `Label`Matcher = CrawleeOneRouteMatcher<`CrawlerName`Label, `CrawlerName`RouterContext, `type`CrawlingContext>\n// ```\n\n// HOW TO GENERATE TYPE FOR ON_BEFORE AND ON_AFTER?\n// ```ts\n// type `CrawlerName`OnBeforeHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>\n// type `CrawlerName`OnAfterHandler = CrawleeOneRouteHandler<CheerioCrawlingContext, ProfesiaRouterContext>\n// ```\n//\n// It's the same as for HANDLERS!\n\n// HOW TO GENERATE TYPE FOR ON_ACTOR_READY?\n// ```ts\n// type `CrawlerName`OnActorReady = (actor: CrawleeOneActorCtx<Labels, Input, TIO, Telem, Ctx>) => MaybePromise<void>;\n// ```\n//\n// 1. For each crawler:\n// 1.1 Get `CrawlingContext` (See HANDLERS 1.1)\n// 1.2 Create `CrawlerName`Label type (See HANDLERS 1.2.2)\n// 1.3 Put it together:\n// ```ts\n// type `CrawlerName`OnActorReady = <TIO, Telem>(actor: CrawleeOneActorCtx<`CrawlerName`Label, AllActorInputs, TIO, Telem, `type`CrawlingContext>) => MaybePromise<void>;\n// ```\n\n// HOW TO FIND THE FILE BASED ON WHICH TO GENERATE?\n//\n// 1. 
User has to specify:\n// - Path to file that exports\n// - Whether it's TS or JS (or can be inferred based on extension)\n//\n// ```\n// presenter@Juros-MacBook-Pro apify-actor-utils % npx ts-node --project tsconfig.base.json -e 'import config from \"./src/lib/router/router\"; console.log(config); // NOTE: I HAD TO TRIM OFF THE EXTENSION'\n// { hello: 'world' }\n// presenter@Juros-MacBook-Pro apify-actor-utils % pwd\n// /Users/presenter/repos/apify-actor-utils\n// ```\n\n// NOTES:\n// - Enum with available route labels would be extracted from this definition.\n// - If there is only 1 crawler defined, all routes use that. If there is more crawlers,\n// they should define which crawler it relates to.\n\n// ```\n// crawler mainCrawler {\n// type: playwright\n// datasetId: '45678'\n// errorDatasetId: '098765'\n// options: {\n// ...\n// }\n// ...\n// }\n\n// crawler productDetailsCrawler {\n// type: cheerio\n// datasetId: '45678'\n// requestQueueId: 'abcdef'\n// options: {\n// ...\n// }\n// }\n\n// requestQueue extraQueue {\n// ...\n// }\n\n// dataset mainDataset {\n// datasetId: '45678'\n// ...\n// }\n\n// keyValueStore mainStore {\n// datasetId: 'xyz'\n// ...\n// }\n\n// route detailPage {\n// crawler: 'productDetailsCrawler',\n// // NOTE: If `match` is a regex, the regex is compared against the URL\n// match: /[\\W]profesia\\.sk\\/praca\\/zoznam-[a-z0-9-]+\\/?(?:[?#~]\|$)/i\n// handler: detailPageHandler,\n// }\n\n// route listing {\n// crawler: 'mainCrawler',\n// // Note: route object name is the 'label' by default, but label can be overriden\n// label: 'DETAIL_PAGE'\n// // NOTE: Otherwise `match` is a function that returns true/false\n// match: async (url, ctx, route, handlers) => {\n// const dom = cheerioPortadom(ctx.$.root(), url);\n// const isNotCustomDesign = await dom.findMany('body.listing:not(.custom-design)').length;\n// return isUrlOfCompanyProfile(url) && !!isNotCustomDesign;\n// },\n// }\n// ```\n"]}

package/dist/cjs/config/types.d.ts ADDED Viewed

File without changes

package/dist/cjs/config/types.js ADDED Viewed

@@ -0,0 +1,38 @@
+"use strict";
+// import type { CrawlerType } from "../types";
+// export interface CrawleeOneConfig {
+//   /** Version of the CrawleeOne config. */
+//   version: 1;
+//   /** Schema defining the crawlers in this project. This schema is used for code generation. */
+//   schema: {
+//     /** Object holding crawler configurations. Each crawler is identified by its key.
+//      *
+//      * E.g.
+//      *
+//      * ```js
+//      * {
+//      *   myCrawler: {
+//      *     type: 'cheerio',
+//      *     routes: [...],
+//      *   }
+//      * }
+//      * ```
+//      */
+//     crawlers: {
+//       mainCrawler {
+//         type: 'playwright'
+//         routes: ['listingPage', 'detailPage', 'home']
+//       }
+//     },
+//   }
+// };
+// export interface CrawleeOneConfigCrawlerDef {
+//   /** Crawler type - Each type is linked to a different Crawlee crawler class.
+//    * Different classes may use different technologies / stack for scraping.
+//    *
+//    * E.g. type `cheerio` will use `CheerioCrawler` class.
+//    */
+//   type: CrawlerType;
+//   routes: ['listingPage', 'detailPage', 'home']
+// }
+//# sourceMappingURL=types.js.map

package/dist/cjs/config/types.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/config/types.ts"],"names":[],"mappings":";AAAA,+CAA+C;AAE/C,sCAAsC;AACtC,6CAA6C;AAC7C,gBAAgB;AAChB,kGAAkG;AAClG,cAAc;AACd,yFAAyF;AACzF,UAAU;AACV,cAAc;AACd,UAAU;AACV,eAAe;AACf,WAAW;AACX,wBAAwB;AACxB,8BAA8B;AAC9B,4BAA4B;AAC5B,aAAa;AACb,WAAW;AACX,aAAa;AACb,UAAU;AACV,kBAAkB;AAClB,sBAAsB;AACtB,6BAA6B;AAC7B,uDAAuD;AACvD,UAAU;AACV,SAAS;AACT,MAAM;AACN,KAAK;AAEL,gDAAgD;AAChD,iFAAiF;AACjF,8EAA8E;AAC9E,QAAQ;AACR,4DAA4D;AAC5D,QAAQ;AACR,uBAAuB;AACvB,iDAAiD;AACjD,IAAI","sourcesContent":["// import type { CrawlerType } from \"../types\";\n\n// export interface CrawleeOneConfig {\n// /** Version of the CrawleeOne config. */\n// version: 1;\n// /** Schema defining the crawlers in this project. This schema is used for code generation. */\n// schema: {\n// /** Object holding crawler configurations. Each crawler is idefntified by its key.\n// * \n// * E.g.\n// * \n// * ```js\n// * {\n// * myCrawler: {\n// * type: 'cheerio',\n// * routes: [...],\n// * }\n// * }\n// * ```\n// */\n// crawlers: {\n// mainCrawler {\n// type: 'playwright'\n// routes: ['listingPage', detailPage', 'home']\n// }\n// },\n// }\n// };\n\n// export interface CrawleeOneConfigCrawlerDef {\n// /** Crawler type - Each type is linked to a different Crawlee crawler class.\n// * Different classes may use different technologies / stack for scraping.\n// * \n// * E.g. type `cheerio` will use `CheerioCrawler` class.\n// */\n// type: CrawlerType;\n// routes: ['listingPage', detailPage', 'home']\n// }\n"]}

package/dist/cjs/index.d.ts CHANGED Viewed

@@ -6,9 +6,6 @@ export * from './lib/io/dataset';
 export * from './lib/io/requestQueue';
 export * from './lib/io/pushData';
 export * from './lib/io/pushRequests';
-export * from './lib/actions/dom';
-export * from './lib/actions/domUtils';
-export * from './lib/actions/page';
 export * from './lib/actions/scrapeListing';
 export * from './lib/error/errorHandler';
 export * from './lib/migrate/localMigrator';

package/dist/cjs/index.js CHANGED Viewed

@@ -22,9 +22,6 @@ __exportStar(require("./lib/io/dataset"), exports);
 __exportStar(require("./lib/io/requestQueue"), exports);
 __exportStar(require("./lib/io/pushData"), exports);
 __exportStar(require("./lib/io/pushRequests"), exports);
-__exportStar(require("./lib/actions/dom"), exports);
-__exportStar(require("./lib/actions/domUtils"), exports);
-__exportStar(require("./lib/actions/page"), exports);
 __exportStar(require("./lib/actions/scrapeListing"), exports);
 __exportStar(require("./lib/error/errorHandler"), exports);
 __exportStar(require("./lib/migrate/localMigrator"), exports);

package/dist/cjs/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,oDAAkC;AAClC,oDAAkC;AAClC,kDAAgC;AAChC,+CAA6B;AAC7B,mDAAiC;AACjC,wDAAsC;AACtC,oDAAkC;AAClC,wDAAsC;AACtC,~~oDAAkC;AAClC,yDAAuC;AACvC,qDAAmC;AACnC,~~8DAA4C;AAC5C,2DAAyC;AACzC,8DAA4C;AAC5C,2DAAyC;AACzC,sDAAoC;AACpC,4DAA0C;AAC1C,2DAAyC;AACzC,sDAAoC;AACpC,qDAAmC;AACnC,4CAA0B;AAC1B,mDAAiC;AACjC,6DAA2C;AAE3C,2DAAyC;AACzC,2DAAyC;AACzC,wDAAsC;AACtC,yDAAuC","sourcesContent":["export * from './lib/actor/actor';\nexport * from './lib/actor/types';\nexport * from './lib/actorSpec';\nexport * from './lib/config';\nexport * from './lib/io/dataset';\nexport * from './lib/io/requestQueue';\nexport * from './lib/io/pushData';\nexport * from './lib/io/pushRequests';\nexport * from './lib/actions/~~dom';\nexport * from './lib/actions/domUtils';\nexport * from './lib/actions/page';\nexport * from './lib/actions/~~scrapeListing';\nexport * from './lib/error/errorHandler';\nexport * from './lib/migrate/localMigrator';\nexport * from './lib/migrate/localState';\nexport * from './lib/migrate/types';\nexport * from './lib/readme/apify/readme';\nexport * from './lib/readme/apify/types';\nexport * from './lib/router/router';\nexport * from './lib/router/types';\nexport * from './lib/log';\nexport * from './lib/test/actor';\nexport * from './lib/test/mockApifyClient';\nexport type { CrawlerUrl, CrawlerType } from './types';\nexport * from './lib/integrations/apify';\nexport * from './lib/integrations/types';\nexport * from './lib/telemetry/types';\nexport * from './lib/telemetry/sentry';\n"]}
1	+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;AAAA,oDAAkC;AAClC,oDAAkC;AAClC,kDAAgC;AAChC,+CAA6B;AAC7B,mDAAiC;AACjC,wDAAsC;AACtC,oDAAkC;AAClC,wDAAsC;AACtC,8DAA4C;AAC5C,2DAAyC;AACzC,8DAA4C;AAC5C,2DAAyC;AACzC,sDAAoC;AACpC,4DAA0C;AAC1C,2DAAyC;AACzC,sDAAoC;AACpC,qDAAmC;AACnC,4CAA0B;AAC1B,mDAAiC;AACjC,6DAA2C;AAE3C,2DAAyC;AACzC,2DAAyC;AACzC,wDAAsC;AACtC,yDAAuC","sourcesContent":["export * from './lib/actor/actor';\nexport * from './lib/actor/types';\nexport * from './lib/actorSpec';\nexport * from './lib/config';\nexport * from './lib/io/dataset';\nexport * from './lib/io/requestQueue';\nexport * from './lib/io/pushData';\nexport * from './lib/io/pushRequests';\nexport * from './lib/actions/scrapeListing';\nexport * from './lib/error/errorHandler';\nexport * from './lib/migrate/localMigrator';\nexport * from './lib/migrate/localState';\nexport * from './lib/migrate/types';\nexport * from './lib/readme/apify/readme';\nexport * from './lib/readme/apify/types';\nexport * from './lib/router/router';\nexport * from './lib/router/types';\nexport * from './lib/log';\nexport * from './lib/test/actor';\nexport * from './lib/test/mockApifyClient';\nexport type { CrawlerUrl, CrawlerType } from './types';\nexport * from './lib/integrations/apify';\nexport * from './lib/integrations/types';\nexport * from './lib/telemetry/types';\nexport * from './lib/telemetry/sentry';\n"]}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawlee-one",
-  "version": "1.1.2",
+  "version": "1.1.3",
   "private": false,
   "description": "Crawlee One is a framework built on top of Crawlee and Apify for writing robust and highly configurable web scrapers",
   "author": "Juro Oravec <juraj.oravec.josefson@gmail.com>",