crawlee-one 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/index.d.ts +2 -1
- package/dist/cjs/index.js +2 -1
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/lib/actor/actor.js +1 -1
- package/dist/cjs/lib/actor/actor.js.map +1 -1
- package/dist/cjs/lib/actor/types.d.ts +1 -1
- package/dist/cjs/lib/actor/types.js.map +1 -1
- package/dist/cjs/lib/error/errorHandler.d.ts +1 -1
- package/dist/cjs/lib/error/errorHandler.js.map +1 -1
- package/dist/cjs/lib/integrations/apify.js +7 -3
- package/dist/cjs/lib/integrations/apify.js.map +1 -1
- package/dist/cjs/lib/io/pushRequests.d.ts +4 -3
- package/dist/cjs/lib/io/pushRequests.js +1 -1
- package/dist/cjs/lib/io/pushRequests.js.map +1 -1
- package/dist/cjs/lib/log.d.ts +1 -1
- package/dist/cjs/lib/log.js.map +1 -1
- package/dist/cjs/lib/router/router.d.ts +67 -0
- package/dist/cjs/lib/router/router.js +180 -0
- package/dist/cjs/lib/router/router.js.map +1 -0
- package/dist/cjs/lib/router/types.d.ts +59 -0
- package/dist/cjs/lib/router/types.js +19 -0
- package/dist/cjs/lib/router/types.js.map +1 -0
- package/dist/cjs/lib/router.js +8 -2
- package/dist/cjs/lib/router.js.map +1 -1
- package/package.json +1 -1
package/dist/cjs/index.d.ts
CHANGED
|
@@ -17,7 +17,8 @@ export * from './lib/migrate/localState';
|
|
|
17
17
|
export * from './lib/migrate/types';
|
|
18
18
|
export * from './lib/readme/readme';
|
|
19
19
|
export * from './lib/readme/types';
|
|
20
|
-
export * from './lib/router';
|
|
20
|
+
export * from './lib/router/router';
|
|
21
|
+
export * from './lib/router/types';
|
|
21
22
|
export * from './lib/log';
|
|
22
23
|
export * from './lib/test/actor';
|
|
23
24
|
export * from './lib/test/mockApifyClient';
|
package/dist/cjs/index.js
CHANGED
|
@@ -36,7 +36,8 @@ __exportStar(require("./lib/migrate/localState"), exports);
|
|
|
36
36
|
__exportStar(require("./lib/migrate/types"), exports);
|
|
37
37
|
__exportStar(require("./lib/readme/readme"), exports);
|
|
38
38
|
__exportStar(require("./lib/readme/types"), exports);
|
|
39
|
-
__exportStar(require("./lib/router"), exports);
|
|
39
|
+
__exportStar(require("./lib/router/router"), exports);
|
|
40
|
+
__exportStar(require("./lib/router/types"), exports);
|
|
40
41
|
__exportStar(require("./lib/log"), exports);
|
|
41
42
|
__exportStar(require("./lib/test/actor"), exports);
|
|
42
43
|
__exportStar(require("./lib/test/mockApifyClient"), exports);
|
package/dist/cjs/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,2CAAqF;AAA5E,+GAAA,sBAAsB,OAAA;AAAE,iHAAA,wBAAwB,OAAA;AACzD,oDAAkC;AAClC,kDAAgC;AAChC,+CAA6B;AAC7B,mDAAiC;AACjC,wDAAsC;AACtC,oDAAkC;AAClC,wDAAsC;AACtC,oDAAkC;AAClC,yDAAuC;AACvC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,sDAAoC;AACpC,sDAAoC;AACpC,qDAAmC;AACnC
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,2CAAqF;AAA5E,+GAAA,sBAAsB,OAAA;AAAE,iHAAA,wBAAwB,OAAA;AACzD,oDAAkC;AAClC,kDAAgC;AAChC,+CAA6B;AAC7B,mDAAiC;AACjC,wDAAsC;AACtC,oDAAkC;AAClC,wDAAsC;AACtC,oDAAkC;AAClC,yDAAuC;AACvC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,sDAAoC;AACpC,sDAAoC;AACpC,qDAAmC;AACnC,sDAAoC;AACpC,qDAAmC;AACnC,4CAA0B;AAC1B,mDAAiC;AACjC,6DAA2C;AAE3C,2DAAyC;AACzC,2DAAyC","sourcesContent":["export { createAndRunCrawleeOne, createHttpCrawlerOptions } from './lib/actor/actor';\nexport * from './lib/actor/types';\nexport * from './lib/actorSpec';\nexport * from './lib/config';\nexport * from './lib/io/dataset';\nexport * from './lib/io/requestQueue';\nexport * from './lib/io/pushData';\nexport * from './lib/io/pushRequests';\nexport * from './lib/actions/dom';\nexport * from './lib/actions/domUtils';\nexport * from './lib/actions/page';\nexport * from './lib/actions/scrapeListing';\nexport * from './lib/error/errorHandler';\nexport * from './lib/error/sentry';\nexport * from './lib/migrate/localMigrator';\nexport * from './lib/migrate/localState';\nexport * from './lib/migrate/types';\nexport * from './lib/readme/readme';\nexport * from './lib/readme/types';\nexport * from './lib/router/router';\nexport * from './lib/router/types';\nexport * from './lib/log';\nexport * from './lib/test/actor';\nexport * from './lib/test/mockApifyClient';\nexport type { CrawlerUrl, CrawlerType } from './types';\nexport * from './lib/integrations/apify';\nexport * from './lib/integrations/types';\n"]}
|
package/dist/cjs/lib/actor/actor.js
CHANGED
|
@@ -19,7 +19,7 @@ const pushData_1 = require("../io/pushData");
|
|
|
19
19
|
const dataset_1 = require("../io/dataset");
|
|
20
20
|
const pushRequests_1 = require("../io/pushRequests");
|
|
21
21
|
const apify_1 = require("../integrations/apify");
|
|
22
|
-
const router_1 = require("../router");
|
|
22
|
+
const router_1 = require("../router/router");
|
|
23
23
|
const config_1 = require("../config");
|
|
24
24
|
const log_1 = require("../log");
|
|
25
25
|
const actorClassByType = {
|
package/dist/cjs/lib/actor/actor.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAaiB;AACjB,mCAAgD;AAEhD,+CAA2C;AAI3C,wDAA2D;AAC3D,4CAA8C;AAC9C,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,sCAAgE;AAChE,sCAWmB;AACnB,gCAAmE;AAUnE,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAE,CAAmB,CAAC,UAAU,IAAK,CAAmB,CAAC,iBAAiB,CAAC,CAAC;AACvF,CAAC,CAAC;AACF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAMhB,KAA4E,EAC5E,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACO,CAAC;IAElC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AAEF;;;;;;;GAOG;AACI,MAAM,sBAAsB,GAAG,CAMpC,IAiCD,EAAiB,EAAE;IAClB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,aAAa,EACb,YAAY,GACb,GAAG,IAAI,CAAC;IAET,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,WAAW,CAAC;IAEnD,MAAM,IAAA,oBAAW,kCAAM,aAAa,KAAE,UAAU,EAAE,SAAS,KAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAEvE,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,aAAa,GAA8D;YAC/E,EAAE;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,cAAc,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBAC7B,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAGtC;oBACA,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,yEAAyE;wBACzE,oBAAoB,EAAE,IA
AA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,YAAY,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,iBAAiB,mCAAI,IAAI;yBAC/C,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;YACD,MAAM,EAAE,EAAE;YACV,aAAa,EAAE,EAAS;SACzB,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,IAAA,wBAAgB,kCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,cAAc,EAAE,MAAA,WAAW,CAAC,cAAc,mCAAK,aAAa,CAAC,cAAsB,EACnF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxGW,QAAA,sBAAsB,0BAwGjC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACI,MAAM,gBAAgB,GAAG,CAM9B,MAAmE,EACnB,EAAE;IAClD,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,MAAM,CAAC;IAE9C,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAsB,CAAC;IACxD,MAAM,GAAG,GAAG,IAAI,aAAG,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,uBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnF,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,GAAG,IAAG,CAAC;IAE/D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAA
K,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,kBAAkB;IAE3I,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;QACL,GAAG;KACJ,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACsC,CAAC;IAElD,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,0BAAiB,EAAkE;QACvF,EAAE;QACF,MAAM;QACN,cAAc;QACd,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IACH,MAAM,IAAA,yBAAgB,EAA2D;QAC/E,MAAM;QACN,cAAc;QACd,aAAa;QACb,aAAa;KACd,CAAC,CAAC;IAEH,2DAA2D;IAC3D,MAAM,iBAAiB,CAAC,SAA6B,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAvGW,QAAA,gBAAgB,oBAuG3B;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,
GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAAI,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAM7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAAyC,EAAE,EAAE;IAC1E,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CA
AC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAAC,KAA2D,EAAE,EAAE;;IAC3F,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAA6B,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QAC/E,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAAC,KAA2D,EAAE,EAAE;;IAC/F,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAiC,CAAO,OAAO,EAAE,OAAO,EAAE,EAAE;QACjF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAC9C,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGt
C,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAC5B,KAA2D,EAC3D,EAAE;;IACF,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,mCAAmC,oBAAoB,EAAE,CAAC,CAAC;QAC3E,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7B;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n Log,\n Request as CrawleeRequest,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport * as Sentry from '@sentry/node';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { setupSentry } from 
'../error/sentry';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultRoute } from '../router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n LoggingActorInput,\n} from '../config';\nimport { logLevelHandlerWrapper, logLevelToCrawlee } from '../log';\nimport type {\n ActorContext,\n ActorDefinition,\n ActorHookContext,\n ActorRouterContext,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler).addHandler && (r as RouterHandler).addDefaultHandler);\n};\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<ActorContext<Ctx, Labels, Input, TIO>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies ActorHookContext<TIO>;\n\n const hookFn = eval(fnStr);\n if 
(!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Create default configuration for an opinionated Crawlee actor,\n * and run the actor within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport const createAndRunCrawleeOne = async <\n TCrawlerType extends CrawlerType,\n Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(args: {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n ActorDefinition<Ctx, Labels, Input, TIO>,\n 'router' | 'createCrawler' | 'io'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Sentry configuration. If using default `createCrawler` implementation,\n * failed requests are optionally reported to Sentry.\n *\n * To disable Sentry, set `\"enabled\": false`.\n */\n sentryOptions?: Sentry.NodeOptions;\n /**\n * Callback with the created actor. 
The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: ActorContext<Ctx, Labels, Input, TIO>) => MaybePromise<void>;\n}): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n sentryOptions,\n onActorReady,\n } = args;\n\n const { io = apifyIO as any as TIO } = actorConfig;\n\n await setupSentry({ ...sentryOptions, serverName: actorName }, { io });\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n const actorDefaults: ActorDefinition<Ctx, Labels, Input & AllActorInputs, TIO> = {\n io,\n router: Router.create<Ctx>(),\n routerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<\n CrawlerMeta<TCrawlerType, any>['options'],\n Input\n >({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to Sentry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToSentry: input?.errorSendToSentry ?? true,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n routes: [],\n routeHandlers: {} as any,\n };\n\n const actor = await createCrawleeOne<Ctx, Labels, Input, TIO>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routerWrappers: actorConfig.routerWrappers ?? (actorDefaults.routerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? 
(actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const createCrawleeOne = async <\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>\n): Promise<ActorContext<Ctx, Labels, Input, TIO>> => {\n const { io = apifyIO as any as TIO } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? 
await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n const { logLevel } = (input ?? {}) as LoggingActorInput;\n const log = new Log({ level: logLevel ? logLevelToCrawlee[logLevel] : undefined });\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io, log });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routerWrappers = isFunc(config.routerWrappers) ? 
await config.routerWrappers(getConfig()) : config.routerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n log,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies ActorContext<Ctx, Labels, Input, TIO>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultRoute<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels, Input>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n await registerHandlers<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n });\n\n // Now that the actor is ready, enqueue the URLs right away\n await scopedPushRequest(startUrls as CrawleeRequest[]);\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? 
await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? 
{}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<ActorContext, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? 
{}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: ActorContext['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? 
(item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: ActorContext['pushRequests'] = async (entries, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. 
in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (\n actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n actor.log.debug(`Loading start URLs from Dataset ${startUrlsFromDataset}`);\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n actor.log.debug(`Loading start URLs from function`);\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
|
|
1
|
+
{"version":3,"file":"actor.js","sourceRoot":"","sources":["../../../../src/lib/actor/actor.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAaiB;AACjB,mCAAgD;AAEhD,+CAA2C;AAI3C,wDAA2D;AAC3D,4CAA8C;AAC9C,6CAA8E;AAC9E,2CAAqD;AACrD,qDAAuE;AAEvE,iDAAgD;AAChD,6CAAuE;AACvE,sCAWmB;AACnB,gCAAmE;AAUnE,MAAM,gBAAgB,GAAG;IACvB,KAAK,EAAE,sBAAY;IACnB,IAAI,EAAE,qBAAW;IACjB,OAAO,EAAE,wBAAc;IACvB,KAAK,EAAE,sBAAY;IACnB,UAAU,EAAE,2BAAiB;IAC7B,SAAS,EAAE,0BAAgB;CAC+C,CAAC;AAE7E,MAAM,QAAQ,GAAG,CAAC,CAAM,EAA2B,EAAE;IACnD,OAAO,CAAC,CAAC,CAAE,CAAmB,CAAC,UAAU,IAAK,CAAmB,CAAC,iBAAiB,CAAC,CAAC;AACvF,CAAC,CAAC;AACF,MAAM,MAAM,GAAG,CAAC,CAAM,EAAgC,EAAE;IACtD,OAAO,OAAO,CAAC,KAAK,UAAU,CAAC;AACjC,CAAC,CAAC;AAEF,kEAAkE;AAClE,MAAM,SAAS,GAAG,CAMhB,KAA4E,EAC5E,KAAc,EACd,EAAE;IACF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAG;QACd,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,YAAY,EAAZ,uBAAY;QACZ,WAAW,EAAE,0BAAW;KACO,CAAC;IAElC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3B,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAEzB,OAAO,CAAO,GAAG,IAAI,EAAE,EAAE,kDAAC,OAAA,MAAM,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,CAAA,GAAA,CAAC;AACrD,CAAC,CAAC;AAEF;;;;;;;GAOG;AACI,MAAM,sBAAsB,GAAG,CAMpC,IAiCD,EAAiB,EAAE;IAClB,MAAM,EACJ,SAAS,EACT,SAAS,EACT,WAAW,EACX,qBAAqB,EACrB,sBAAsB,EACtB,aAAa,EACb,YAAY,GACb,GAAG,IAAI,CAAC;IAET,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,WAAW,CAAC;IAEnD,MAAM,IAAA,oBAAW,kCAAM,aAAa,KAAE,UAAU,EAAE,SAAS,KAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAEvE,YAAY;IACZ,mCAAmC;IACnC,yGAAyG;IACzG,2EAA2E;IAC3E,MAAM,EAAE,CAAC,YAAY,CACnB,GAAS,EAAE;;QACT,MAAM,aAAa,GAA8D;YAC/E,EAAE;YACF,MAAM,EAAE,gBAAM,CAAC,MAAM,EAAO;YAC5B,cAAc,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAAC,OAAA;oBAC7B,IAAA,4BAAsB,EAAW,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,QAAQ,mCAAI,MAAM,CAAC;iBAC5D,CAAA;aAAA;YACD,aAAa,EAAE,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE;;gBAC1C,MAAM,OAAO,GAAG,IAAA,gCAAwB,EAGtC;oBACA,KAAK;oBACL,QAAQ,EAAE,qBAAqB;oBAC/B,SAAS,kBACP,cAAc,EAAE,MAAM,EACtB,kBAAkB,EAAE,KAAK;wBACzB,yEAAyE;wBACzE,oBAAoB,EAAE,IA
AA,iCAAkB,EAAC;4BACvC,EAAE;4BACF,kBAAkB,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,uBAAuB,mCAAI,WAAW;4BACjE,YAAY,EAAE,MAAA,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,iBAAiB,mCAAI,IAAI;yBAC/C,CAAC,IACC,sBAAsB,CAC1B;iBACF,CAAC,CAAC;gBACH,MAAM,YAAY,GAAG,gBAAgB,CAAC,SAAS,CAAQ,CAAC;gBACxD,OAAO,IAAI,YAAY,CAAC,OAAO,CAAC,CAAC;YACnC,CAAC;YACD,MAAM,EAAE,EAAE;YACV,aAAa,EAAE,EAAS;SACzB,CAAC;QAEF,MAAM,KAAK,GAAG,MAAM,IAAA,wBAAgB,kCAC/B,WAAW,KACd,EAAE,EACF,MAAM,EAAE,MAAA,WAAW,CAAC,MAAM,mCAAK,aAAa,CAAC,MAAc,EAC3D,cAAc,EAAE,MAAA,WAAW,CAAC,cAAc,mCAAK,aAAa,CAAC,cAAsB,EACnF,aAAa,EAAE,MAAA,WAAW,CAAC,aAAa,mCAAK,aAAa,CAAC,aAAqB,IAChF,CAAC;QAEH,MAAM,CAAA,YAAY,aAAZ,YAAY,uBAAZ,YAAY,CAAG,KAAK,CAAC,CAAA,CAAC;IAC9B,CAAC,CAAA,EACD,EAAE,aAAa,EAAE,oBAAoB,EAAE,CACxC,CAAC;AACJ,CAAC,CAAA,CAAC;AAxGW,QAAA,sBAAsB,0BAwGjC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACI,MAAM,gBAAgB,GAAG,CAM9B,MAAmE,EACnB,EAAE;IAClD,MAAM,EAAE,EAAE,GAAG,eAAqB,EAAE,GAAG,MAAM,CAAC;IAE9C,qDAAqD;IACrD,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK;QAC3B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACpB,CAAC,CAAC,MAAM,MAAM,CAAC,KAAK,iCAAM,MAAM,KAAE,EAAE,IAAG;YACvC,CAAC,CAAC,MAAM,CAAC,KAAK;QAChB,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAS,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,YAAY,CAAe,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;IAEvF,IAAI,MAAM,CAAC,aAAa;QAAE,MAAM,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,CAAC;IAE5D,MAAM,EAAE,QAAQ,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAsB,CAAC;IACxD,MAAM,GAAG,GAAG,IAAI,aAAG,CAAC,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,uBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;IAEnF,gFAAgF;IAChF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,iCAAM,MAAM,KAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,GAAG,IAAG,CAAC;IAE/D,eAAe;IACf,MAAM,YAAY,GAChB,MAAM,CAAC,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,+BAA+B,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAClG,MAAM,KAAK,GACT,MAAM,CAAC,KAAK,IAAI,IAAI;QAClB,CAAC,CAAC,YAAY;QACd,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;YACtB,CAAC,CAAC,MAAM,MAAM,CAAC,KAA
K,CAAC,SAAS,EAAE,CAAC;YACjC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;IAEnB,+BAA+B;IAC/B,MAAM,MAAM,GAAuB,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC;QACxD,CAAC,CAAC,MAAM,CAAC,MAAM;QACf,CAAC,CAAC,MAAO,MAAM,CAAC,MAAc,CAAC,SAAS,EAAE,CAAC,CAAC;IAC9C,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,kBAAkB;IAC3G,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,aAAa,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,kBAAkB;IACvI,MAAM,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,kBAAkB;IAE3I,yBAAyB;IACzB,MAAM,WAAW,GAAG,GAAG,EAAE,CAAC,CAAC;QACzB,EAAE;QACF,MAAM;QACN,MAAM;QACN,aAAa;QACb,KAAK;QACL,MAAM;QACN,KAAK;QACL,KAAK;QACL,GAAG;KACJ,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,WAAW,EAAE,CAAC,CAAC;IAE1D,mCAAmC;IACnC,MAAM,QAAQ,mBAAK,OAAO,IAAK,WAAW,EAAE,CAAE,CAAC;IAC/C,MAAM,UAAU,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAClD,MAAM,cAAc,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACtD,MAAM,iBAAiB,GAAG,wBAAwB,CAAC,QAAQ,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAExD,MAAM,KAAK,GAAG,gCACT,QAAQ,KACX,OAAO;QACP,UAAU;QACV,SAAS,EACT,QAAQ,EAAE,cAAc,EACxB,YAAY,EAAE,iBAAiB,EAC/B,SAAS,GACsC,CAAC;IAElD,0DAA0D;IAC1D,MAAM,aAAa,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC;IAE1D,gBAAgB;IAChB,MAAM,IAAA,0BAAiB,EAAkE;QACvF,EAAE;QACF,MAAM;QACN,cAAc;QACd,aAAa;QACb,MAAM;QACN,aAAa;QACb,KAAK;KACN,CAAC,CAAC;IACH,MAAM,IAAA,yBAAgB,EAA2D;QAC/E,MAAM;QACN,cAAc;QACd,aAAa;QACb,aAAa;KACd,CAAC,CAAC;IAEH,2DAA2D;IAC3D,MAAM,iBAAiB,CAAC,SAA6B,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAvGW,QAAA,gBAAgB,oBAuG3B;AAEF,MAAM,YAAY,GAAG,CACnB,KAAoB,EACpB,KAA8B,EAC9B,OAA+B,EAC/B,EAAE;;IACF,MAAM,EAAE,EAAE,GAAG,eAAuB,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IACvD,MAAM,EAAE,cAAc,EAAE,uBAAuB,EAAE,GAAG,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAoB,CAAC;IAErF,MAAM,YAAY,
GAAG,cAAc,CAAC,CAAC,CAAC,MAAM,0BAAW,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,IAAI,EAAU,CAAC,CAAC,CAAC,IAAI,CAAC;IAClG,MAAM,OAAO,GAAG,SAAS,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,uBAAuB,CAAC,CAAC;IACzE,MAAM,aAAa,GAAG,MAAA,CAAC,MAAM,CAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,EAAI,CAAA,CAAC,mCAAI,IAAI,CAAC;IAClD,MAAM,aAAa,iDAAQ,YAAY,GAAK,aAAa,GAAK,KAAK,CAAE,CAAC;IAEtE,OAAO,aAAkB,CAAC;AAC5B,CAAC,CAAA,CAAC;AAEF;;;;GAIG;AACH,MAAM,sBAAsB,GAAG,CAM7B,KAGC,EACD,EAAE;;IACF,MAAM,EACJ,sBAAsB,EACtB,qBAAqB,EACrB,mBAAmB,EACnB,kBAAkB,EAClB,qBAAqB,EACrB,oBAAoB,EACpB,kBAAkB,EAClB,iBAAiB,EACjB,kBAAkB,EAClB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAyC,CAAC;IAEhE,MAAM,SAAS,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAoB,CAAO,QAAQ,EAAE,OAAO,EAAE,EAAE;;QAC9D,2CAA2C;QAC3C,IAAI,kBAAkB,IAAI,yBAAyB,KAAK,WAAW,EAAE;YACnE,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,EAAE,CAAC,iBAAiB,CAAC,kBAAkB,CAAC,CAAC;YACnE,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAC/C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,sBAAsB,CAAC,2CAAI,CAAA,CAAC;QACnD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,mBAAmB,CAAC,2CAAI,CAAA,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAE1D,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,oBAAoB,CAAC,2CAAI,CAAA,CAAC;QACjD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,iBAAiB,CAAC,2CAAI,CAAA,CAAC;QAC9C,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QAClD,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,kBAAkB,CAAC,2CAAI,CAAA,CAAC;QAE/C,iDAAiD;QACjD,MAAM,SAAS,EAAE,CAAC;QAElB,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO,UAAU,CAAC;AACpB,CAAC,CAAC;AAEF,mFAAmF;AACnF,MAAM,qBAAqB,GAAG,CAAC,KAAyC,EAAE,EAAE;IAC1E,iDAAiD;IACjD,MAAM,SAAS,GAAc,CAAO,SAA+B,EAAE,EAAE;;QACrE,MAAM,EACJ,gBAAgB,EAChB,mBAAmB,EACnB,mBAAmB,GACpB,GAAG,IAAA,iBAAQ,EAAC,EAAE,EAAE,SAAS,EAAE,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAElE,IAAI,CAAC,gBAAgB;YAAE,OAAO;QAE9B,MAAM,KAAK,CAAC,EAAE,CAAC,wBAAwB,CA
AC,gBAAgB,EAAE,mBAAmB,EAAE;YAC7E,KAAK,EAAE,mBAAmB;SAC3B,CAAC,CAAC;IACL,CAAC,CAAA,CAAC;IAEF,OAAO,SAAS,CAAC;AACnB,CAAC,CAAC;AAEF,uEAAuE;AACvE,MAAM,oBAAoB,GAAG,CAAC,KAA2D,EAAE,EAAE;;IAC3F,MAAM,EACJ,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EACf,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,GAC1B,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAAI,EAAE,CAA6D,CAAC;IAEpF,MAAM,cAAc,GAA6B,CAAO,OAAO,EAAE,GAAG,EAAE,OAAO,EAAE,EAAE;QAC/E,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,eAAe,CAAC,CAAC;QACtD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAEhD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,WAAW,EAAE,mBAAmB,EAChC,QAAQ,EAAE,gBAAgB,EAC1B,QAAQ,EAAE,gBAAgB,EAC1B,SAAS,EAAE,kBAAkB,EAC7B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,SAAS,EAAE,eAAe,EAC1B,cAAc,EACd,YAAY,EAAE,kBAAkB,EAChC,gBAAgB,EAAE,sBAAsB,EACxC,mBAAmB,EAAE,yBAAyB,IAC3C,OAAO,CACuB,CAAC;QAEpC,OAAO,IAAA,mBAAQ,EAAC,OAAO,EAAE,GAAG,EAAE,aAAa,CAAC,CAAC;IAC/C,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF,2EAA2E;AAC3E,MAAM,wBAAwB,GAAG,CAAC,KAA2D,EAAE,EAAE;;IAC/F,MAAM,EAAE,cAAc,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,aAAa,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCACzF,EAAE,CAAsB,CAAC;IAE3B,MAAM,iBAAiB,GAAiC,CAAO,OAAO,EAAE,OAAO,EAAE,EAAE;QACjF,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QACvD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,EAAE,aAAa,CAAC,CAAC;QAEjD,MAAM,aAAa,GAAG,gBACpB,EAAE,EAAE,KAAK,CAAC,EAAE,EACZ,GAAG,EAAE,KAAK,CAAC,GAAG,EACd,QAAQ,EAAE,iBAAiB,EAC3B,SAAS,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EAChE,MAAM,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,EACvD,cAAc,IACX,OAAO,CACwB,CAAC;QAErC,OAAO,IAAA,2BAAY,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;IAC9C,CAAC,CAAA,CAAC;IAEF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,4DAA4D;AACrD,MAAM,wBAAwB,GAAG,CAGt
C,EACA,KAAK,EACL,QAAQ,EACR,SAAS,GAcV,EAAE,EAAE;IACH,MAAM,sBAAsB,GAAG,CAAoC,MAAS,EAAE,EAAE,CAC9E,IAAA,aAAI,EAAC,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,qBAAY,CAAC,CAAC,CAAC;IAE1C,OAAO,8CAEF,IAAA,eAAM,EAAC,QAAQ,aAAR,QAAQ,cAAR,QAAQ,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAEjE,IAAA,eAAM,EAAC,sBAAsB,CAAC,KAAK,aAAL,KAAK,cAAL,KAAK,GAAI,EAAE,CAAC,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,GAE3E,IAAA,eAAM,EAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAK,EAAY,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,KAAK,SAAS,CAAC,CAC7C,CAAC;AAC7B,CAAC,CAAC;AAhCW,QAAA,wBAAwB,4BAgCnC;AAEF,MAAM,qBAAqB,GAAG,CAC5B,KAA2D,EAC3D,EAAE;;IACF,MAAM,EAAE,SAAS,EAAE,oBAAoB,EAAE,qBAAqB,EAAE,GAAG,CAAC,MAAA,KAAK,CAAC,KAAK,mCAC7E,EAAE,CAAwB,CAAC;IAE7B,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,SAAS,aAAT,SAAS,cAAT,SAAS,GAAI,EAAE,CAAC,CAAC,CAAC;IAEvC,IAAI,oBAAoB,EAAE;QACxB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,mCAAmC,oBAAoB,EAAE,CAAC,CAAC;QAC3E,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,oBAAoB,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC3D,MAAM,eAAe,GAAG,MAAM,IAAA,8BAAoB,EAAM,SAAS,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC5F,OAAO,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,CAAC;KAClC;IAED,IAAI,qBAAqB,EAAE;QACzB,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,MAAM,CAAA,MAAA,SAAS,CAAC,KAAK,EAAE,qBAAqB,CAAC,2CAAI,CAAA,CAAC;QACrE,OAAO,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;KAC7B;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAA,CAAC","sourcesContent":["import {\n BasicCrawler,\n CrawlingContext,\n RouterHandler,\n BasicCrawlerOptions,\n CheerioCrawler,\n Router,\n HttpCrawler,\n JSDOMCrawler,\n PlaywrightCrawler,\n PuppeteerCrawler,\n Log,\n Request as CrawleeRequest,\n} from 'crawlee';\nimport { omitBy, pick, defaults } from 'lodash';\nimport * as Sentry from '@sentry/node';\nimport { gotScraping } from 'got-scraping';\n\nimport type { CrawlerMeta, CrawlerType } from '../../types';\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport { createErrorHandler } from '../error/errorHandler';\nimport { setupSentry } from 
'../error/sentry';\nimport { type PushDataOptions, itemCacheKey, pushData } from '../io/pushData';\nimport { getColumnFromDataset } from '../io/dataset';\nimport { PushRequestsOptions, pushRequests } from '../io/pushRequests';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { registerHandlers, setupDefaultRoute } from '../router/router';\nimport {\n CrawlerConfigActorInput,\n OutputActorInput,\n MetamorphActorInput,\n PrivacyActorInput,\n crawlerInput,\n StartUrlsActorInput,\n InputActorInput,\n RequestActorInput,\n AllActorInputs,\n LoggingActorInput,\n} from '../config';\nimport { logLevelHandlerWrapper, logLevelToCrawlee } from '../log';\nimport type {\n ActorContext,\n ActorDefinition,\n ActorHookContext,\n ActorRouterContext,\n Metamorph,\n RunCrawler,\n} from './types';\n\nconst actorClassByType = {\n basic: BasicCrawler,\n http: HttpCrawler,\n cheerio: CheerioCrawler,\n jsdom: JSDOMCrawler,\n playwright: PlaywrightCrawler,\n puppeteer: PuppeteerCrawler,\n} satisfies Record<CrawlerType, { new (options: Record<string, any>): any }>;\n\nconst isRouter = (r: any): r is RouterHandler<any> => {\n return !!((r as RouterHandler).addHandler && (r as RouterHandler).addDefaultHandler);\n};\nconst isFunc = (f: any): f is (...args: any[]) => any => {\n return typeof f === 'function';\n};\n\n/** Run a function that was defined as a string via Actor input */\nconst genHookFn = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Pick<ActorContext<Ctx, Labels, Input, TIO>, 'input' | 'state' | 'io'>,\n fnStr?: string\n) => {\n if (!fnStr) return null;\n\n const hookCtx = {\n io: actor.io,\n input: actor.input,\n state: actor.state,\n itemCacheKey,\n sendRequest: gotScraping,\n } satisfies ActorHookContext<TIO>;\n\n const hookFn = eval(fnStr);\n 
if (!hookFn) return null;\n\n return async (...args) => hookFn(...args, hookCtx);\n};\n\n/**\n * Create default configuration for an opinionated Crawlee actor,\n * and run the actor within Apify's `Actor.main()` context.\n *\n * Apify context can be replaced with custom implementation using the `actorConfig.io` option.\n *\n * Read more about what this actor does at {@link createCrawleeOne}.\n */\nexport const createAndRunCrawleeOne = async <\n TCrawlerType extends CrawlerType,\n Ctx extends CrawlerMeta<TCrawlerType, any>['context'] = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(args: {\n /** String idetifying the actor class, e.g. `'cheerio'` */\n actorType: TCrawlerType;\n actorName: string;\n /** Config passed to the {@link createCrawleeOne} */\n actorConfig: PickPartial<\n ActorDefinition<Ctx, Labels, Input, TIO>,\n 'router' | 'createCrawler' | 'io'\n >;\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that may be overriden by user input.\n */\n crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * If using default `createCrawler` implementation, these are crawler options\n * that will override user input.\n *\n * This is useful for testing env.\n */\n crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>['options'];\n /**\n * Sentry configuration. If using default `createCrawler` implementation,\n * failed requests are optionally reported to Sentry.\n *\n * To disable Sentry, set `\"enabled\": false`.\n */\n sentryOptions?: Sentry.NodeOptions;\n /**\n * Callback with the created actor. 
The callback is called within\n * the `Actor.main()` context.\n */\n onActorReady?: (actor: ActorContext<Ctx, Labels, Input, TIO>) => MaybePromise<void>;\n}): Promise<void> => {\n const {\n actorType,\n actorName,\n actorConfig,\n crawlerConfigDefaults,\n crawlerConfigOverrides,\n sentryOptions,\n onActorReady,\n } = args;\n\n const { io = apifyIO as any as TIO } = actorConfig;\n\n await setupSentry({ ...sentryOptions, serverName: actorName }, { io });\n\n // See docs:\n // - https://docs.apify.com/sdk/js/\n // - https://docs.apify.com/academy/deploying-your-code/inputs-outputs#accepting-input-with-the-apify-sdk\n // - https://docs.apify.com/sdk/js/docs/upgrading/upgrading-to-v3#apify-sdk\n await io.runInContext(\n async () => {\n const actorDefaults: ActorDefinition<Ctx, Labels, Input & AllActorInputs, TIO> = {\n io,\n router: Router.create<Ctx>(),\n routerWrappers: ({ input }) => [\n logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 'info'),\n ],\n createCrawler: ({ router, proxy, input }) => {\n const options = createHttpCrawlerOptions<\n CrawlerMeta<TCrawlerType, any>['options'],\n Input\n >({\n input,\n defaults: crawlerConfigDefaults,\n overrides: {\n requestHandler: router,\n proxyConfiguration: proxy,\n // Capture errors in a separate (Apify) Dataset and pass errors to Sentry\n failedRequestHandler: createErrorHandler({\n io,\n reportingDatasetId: input?.errorReportingDatasetId ?? 'REPORTING',\n sendToSentry: input?.errorSendToSentry ?? true,\n }),\n ...crawlerConfigOverrides,\n },\n });\n const CrawlerClass = actorClassByType[actorType] as any;\n return new CrawlerClass(options);\n },\n routes: [],\n routeHandlers: {} as any,\n };\n\n const actor = await createCrawleeOne<Ctx, Labels, Input, TIO>({\n ...actorConfig,\n io,\n router: actorConfig.router ?? (actorDefaults.router as any),\n routerWrappers: actorConfig.routerWrappers ?? (actorDefaults.routerWrappers as any),\n createCrawler: actorConfig.createCrawler ?? 
(actorDefaults.createCrawler as any),\n });\n\n await onActorReady?.(actor);\n },\n { statusMessage: 'Crawling finished!' }\n );\n};\n\n/**\n * Create opinionated Crawlee crawler that uses router for handling requests.\n *\n * This is a quality-of-life function that does the following for you:\n *\n * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.\n *\n * 2) Get Actor input from `Actor.getInput` if not given.\n *\n * 3) (Optional) Validate Actor input\n *\n * 4) Set up router such that requests that reach default route are\n * redirected to labelled routes based on which item from \"routes\" they match.\n *\n * 5) Register all route handlers for you.\n *\n * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.\n * if you want to add a field to the context object, or handle errors\n * from a single place.\n *\n * 7) (Optional) Support transformation and filtering of (scraped) entries,\n * configured via Actor input.\n *\n * 8) (Optional) Support Actor metamorphing, configured via Actor input.\n *\n * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom\n * implementation using the `io` option.\n */\nexport const createCrawleeOne = async <\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>\n): Promise<ActorContext<Ctx, Labels, Input, TIO>> => {\n const { io = apifyIO as any as TIO } = config;\n\n // Mutable state that is available to the actor hooks\n const state = {};\n\n // Initialize actor inputs\n const rawInput = config.input\n ? isFunc(config.input)\n ? 
await config.input({ ...config, io })\n : config.input\n : await io.getInput<Input>();\n const input = Object.freeze(await resolveInput<Input | null>(rawInput, state, { io }));\n\n if (config.validateInput) await config.validateInput(input);\n\n const { logLevel } = (input ?? {}) as LoggingActorInput;\n const log = new Log({ level: logLevel ? logLevelToCrawlee[logLevel] : undefined });\n\n // This is context that is available to options that use initialization function\n const getConfig = () => ({ ...config, input, state, io, log });\n\n // Set up proxy\n const defaultProxy =\n config.proxy == null ? await io.createDefaultProxyConfiguration(input ?? undefined) : undefined;\n const proxy =\n config.proxy == null\n ? defaultProxy\n : isFunc(config.proxy)\n ? await config.proxy(getConfig())\n : config.proxy;\n\n // Run initialization functions\n const router: RouterHandler<Ctx> = isRouter(config.router)\n ? config.router\n : await (config.router as any)(getConfig());\n const routes = isFunc(config.routes) ? await config.routes(getConfig()) : config.routes; // prettier-ignore\n const routeHandlers = isFunc(config.routeHandlers) ? await config.routeHandlers(getConfig()) : config.routeHandlers; // prettier-ignore\n const routerWrappers = isFunc(config.routerWrappers) ? 
await config.routerWrappers(getConfig()) : config.routerWrappers; // prettier-ignore\n\n // Create Crawlee crawler\n const getActorCtx = () => ({\n io,\n router,\n routes,\n routeHandlers,\n proxy,\n config,\n input,\n state,\n log,\n });\n const crawler = await config.createCrawler(getActorCtx());\n\n // Create actor (our custom entity)\n const preActor = { crawler, ...getActorCtx() };\n const runCrawler = createScopedCrawlerRun(preActor);\n const metamorph = createScopedMetamorph(preActor);\n const scopedPushData = createScopedPushData(preActor);\n const scopedPushRequest = createScopedPushRequests(preActor);\n const startUrls = await getStartUrlsFromInput(preActor);\n\n const actor = {\n ...preActor,\n crawler,\n runCrawler,\n metamorph,\n pushData: scopedPushData,\n pushRequests: scopedPushRequest,\n startUrls,\n } satisfies ActorContext<Ctx, Labels, Input, TIO>;\n\n // Extra data that we make available to the route handlers\n const routerContext = { actor, pushData: scopedPushData };\n\n // Set up router\n await setupDefaultRoute<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels, Input>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n });\n await registerHandlers<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n });\n\n // Now that the actor is ready, enqueue the URLs right away\n await scopedPushRequest(startUrls as CrawleeRequest[]);\n\n return actor;\n};\n\nconst resolveInput = async <T extends Record<string, any> | null>(\n input: object | null,\n state: Record<string, unknown>,\n options?: { io?: CrawleeOneIO }\n) => {\n const { io = apifyIO as CrawleeOneIO } = options ?? {};\n const { inputExtendUrl, inputExtendFromFunction } = (input ?? {}) as InputActorInput;\n\n const inputFromUrl = inputExtendUrl ? 
await gotScraping.get(inputExtendUrl).json<object>() : null;\n const inputFn = genHookFn({ state, input, io }, inputExtendFromFunction);\n const inputFromFunc = (await inputFn?.()) ?? null;\n const extendedInput = { ...inputFromUrl, ...inputFromFunc, ...input };\n\n return extendedInput as T;\n};\n\n/**\n * Create a function that wraps `crawler.run(requests, runOtions)` with additional\n * features like:\n * - Automatically metamorph into another actor after the run finishes\n */\nconst createScopedCrawlerRun = <\n Ctx extends CrawlingContext<any> = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n>(\n actor: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n) => {\n const {\n requestTransformBefore,\n requestTransformAfter,\n requestFilterBefore,\n requestFilterAfter,\n outputTransformBefore,\n outputTransformAfter,\n outputFilterBefore,\n outputFilterAfter,\n outputCacheStoreId,\n outputCacheActionOnResult,\n } = (actor.input ?? 
{}) as OutputActorInput & RequestActorInput;\n\n const metamorph = createScopedMetamorph(actor);\n\n const runCrawler: RunCrawler<Ctx> = async (requests, options) => {\n // Clear cache if it was set from the input\n if (outputCacheStoreId && outputCacheActionOnResult === 'overwrite') {\n const store = await actor.io.openKeyValueStore(outputCacheStoreId);\n await store.drop();\n }\n\n await genHookFn(actor, outputTransformBefore)?.();\n await genHookFn(actor, outputFilterBefore)?.();\n await genHookFn(actor, requestTransformBefore)?.();\n await genHookFn(actor, requestFilterBefore)?.();\n\n const runRes = await actor.crawler.run(requests, options);\n\n await genHookFn(actor, outputTransformAfter)?.();\n await genHookFn(actor, outputFilterAfter)?.();\n await genHookFn(actor, requestTransformAfter)?.();\n await genHookFn(actor, requestFilterAfter)?.();\n\n // Trigger metamorph if it was set from the input\n await metamorph();\n\n return runRes;\n };\n\n return runCrawler;\n};\n\n/** Create a function that triggers metamorph, using Actor's inputs as defaults. */\nconst createScopedMetamorph = (actor: Pick<ActorContext, 'input' | 'io'>) => {\n // Trigger metamorph if it was set from the input\n const metamorph: Metamorph = async (overrides?: MetamorphActorInput) => {\n const {\n metamorphActorId,\n metamorphActorBuild,\n metamorphActorInput,\n } = defaults({}, overrides, actor.input ?? 
{}); // prettier-ignore\n\n if (!metamorphActorId) return;\n\n await actor.io.triggerDownstreamCrawler(metamorphActorId, metamorphActorInput, {\n build: metamorphActorBuild,\n });\n };\n\n return metamorph;\n};\n\n/** pushData wrapper that pre-populates options based on actor input */\nconst createScopedPushData = (actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>) => {\n const {\n includePersonalData,\n requestQueueId,\n outputMaxEntries,\n outputTransform,\n outputFilter,\n outputDatasetId,\n outputPickFields,\n outputRenameFields,\n outputCacheStoreId,\n outputCachePrimaryKeys,\n outputCacheActionOnResult,\n } = (actor.input ?? {}) as OutputActorInput & PrivacyActorInput & RequestActorInput;\n\n const scopedPushData: ActorContext['pushData'] = async (entries, ctx, options) => {\n const transformFn = genHookFn(actor, outputTransform);\n const filterFn = genHookFn(actor, outputFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n showPrivate: includePersonalData,\n maxCount: outputMaxEntries,\n pickKeys: outputPickFields,\n remapKeys: outputRenameFields,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? 
(item) => filterFn(item) : undefined,\n datasetId: outputDatasetId,\n requestQueueId,\n cacheStoreId: outputCacheStoreId,\n cachePrimaryKeys: outputCachePrimaryKeys,\n cacheActionOnResult: outputCacheActionOnResult,\n ...options,\n } satisfies PushDataOptions<object>;\n\n return pushData(entries, ctx, mergedOptions);\n };\n\n return scopedPushData;\n};\n\n/** pushRequests wrapper that pre-populates options based on actor input */\nconst createScopedPushRequests = (actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>) => {\n const { requestQueueId, requestMaxEntries, requestTransform, requestFilter } = (actor.input ??\n {}) as RequestActorInput;\n\n const scopedPushRequest: ActorContext['pushRequests'] = async (entries, options) => {\n const transformFn = genHookFn(actor, requestTransform);\n const filterFn = genHookFn(actor, requestFilter);\n\n const mergedOptions = {\n io: actor.io,\n log: actor.log,\n maxCount: requestMaxEntries,\n transform: transformFn ? (item) => transformFn(item) : undefined,\n filter: filterFn ? (item) => filterFn(item) : undefined,\n requestQueueId,\n ...options,\n } satisfies PushRequestsOptions<any>;\n\n return pushRequests(entries, mergedOptions);\n };\n\n return scopedPushRequest;\n};\n\n/** Given the actor input, create common crawler options. */\nexport const createHttpCrawlerOptions = <\n TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions,\n Input extends Record<string, any> = Record<string, any>\n>({\n input,\n defaults,\n overrides,\n}: {\n /** Actor input */\n input: Input | null;\n /**\n * Default config options set by us. These may be overriden\n * by values from actor input (set by user).\n */\n defaults?: TOpts;\n /**\n * These config options will overwrite both the default and user\n * options. This is useful for hard-setting values e.g. 
in tests.\n */\n overrides?: TOpts;\n}) => {\n const pickCrawlerInputFields = <T extends CrawlerConfigActorInput>(config: T) =>\n pick(config, Object.keys(crawlerInput));\n\n return {\n // ----- 1. DEFAULTS -----\n ...omitBy(defaults ?? ({} as TOpts), (field) => field === undefined),\n // ----- 2. CONFIG FROM INPUT -----\n ...omitBy(pickCrawlerInputFields(input ?? {}), (field) => field === undefined),\n // ----- 3. OVERRIDES - E.G. TEST CONFIG -----\n ...omitBy(overrides ?? ({} as TOpts), (field) => field === undefined),\n } satisfies Partial<TOpts>;\n};\n\nconst getStartUrlsFromInput = async (\n actor: Pick<ActorContext, 'input' | 'state' | 'io' | 'log'>\n) => {\n const { startUrls, startUrlsFromDataset, startUrlsFromFunction } = (actor.input ??\n {}) as StartUrlsActorInput;\n\n const urlsAgg = [...(startUrls ?? [])];\n\n if (startUrlsFromDataset) {\n actor.log.debug(`Loading start URLs from Dataset ${startUrlsFromDataset}`);\n const [datasetId, field] = startUrlsFromDataset.split('#');\n const urlsFromDataset = await getColumnFromDataset<any>(datasetId, field, { io: actor.io });\n urlsAgg.push(...urlsFromDataset);\n }\n\n if (startUrlsFromFunction) {\n actor.log.debug(`Loading start URLs from function`);\n const urlsFromFn = await genHookFn(actor, startUrlsFromFunction)?.();\n urlsAgg.push(...urlsFromFn);\n }\n\n return urlsAgg;\n};\n"]}
|
|
@@ -4,7 +4,7 @@ import type { MaybePromise, PickPartial } from '../../utils/types';
|
|
|
4
4
|
import type { CrawlerUrl } from '../../types';
|
|
5
5
|
import type { itemCacheKey, pushData } from '../io/pushData';
|
|
6
6
|
import type { pushRequests } from '../io/pushRequests';
|
|
7
|
-
import type { RouteHandler, RouteMatcher, CrawlerRouterWrapper } from '../router';
|
|
7
|
+
import type { RouteHandler, RouteMatcher, CrawlerRouterWrapper } from '../router/types';
|
|
8
8
|
import type { MetamorphActorInput } from '../config';
|
|
9
9
|
import type { CrawleeOneIO } from '../integrations/types';
|
|
10
10
|
type MaybeAsyncFn<R, Args extends any[]> = R | ((...args: Args) => MaybePromise<R>);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/actor/types.ts"],"names":[],"mappings":"","sourcesContent":["import type {\n BasicCrawler,\n CrawlingContext,\n Log,\n ProxyConfiguration,\n RouterHandler,\n} from 'crawlee';\nimport type { gotScraping } from 'got-scraping';\n\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport type { CrawlerUrl } from '../../types';\nimport type { itemCacheKey, pushData } from '../io/pushData';\nimport type { pushRequests } from '../io/pushRequests';\nimport type { RouteHandler, RouteMatcher, CrawlerRouterWrapper } from '../router';\nimport type { MetamorphActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\n\ntype MaybeAsyncFn<R, Args extends any[]> = R | ((...args: Args) => MaybePromise<R>);\n\ntype OrigRunCrawler<T extends CrawlingContext<any, any>> = BasicCrawler<T>['run'];\n\n/** Extended type of `crawler.run()` function */\nexport type RunCrawler<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>> = (\n requests?: CrawlerUrl[],\n options?: Parameters<OrigRunCrawler<Ctx>>[1]\n) => ReturnType<OrigRunCrawler<Ctx>>;\n\n/** Trigger actor metamorph, using actor's inputs as defaults. 
*/\nexport type Metamorph = (overrides?: MetamorphActorInput) => Promise<void>;\n\n/** Context passed to route handlers */\nexport type ActorRouterContext<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = {\n actor: ActorContext<Ctx, Labels, Input, TIO>;\n};\n\n/** Context passed to user-defined functions passed from input */\nexport type ActorHookContext<TIO extends CrawleeOneIO> = Pick<ActorContext, 'input' | 'state'> & {\n io: TIO;\n itemCacheKey: typeof itemCacheKey;\n sendRequest: typeof gotScraping;\n};\n\nexport interface ActorDefinition<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> {\n /** Client for communicating with cloud/local storage. */\n io: TIO;\n\n // Actor input\n /**\n * Actor input which you can get e.g. via `Actor.getInput()`\n *\n * Input is automatically retrieved if undefined.\n */\n input?: MaybeAsyncFn<Input, [ActorDefinition<Ctx, Labels, Input, TIO>]>;\n /** Validation for the actor input. Should throw error if validation fails. */\n validateInput?: (input: Input | null) => MaybePromise<void>;\n\n // Router setup\n /**\n * Router instance that redirects the request to handlers.\n * @example\n * import { createCheerioRouter } from 'crawlee';\n *\n * ({\n * ...\n * router: createCheerioRouter(),\n * })\n */\n router: MaybeAsyncFn<RouterHandler<Ctx>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;\n /**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. 
If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n *\n * @example\n * ({\n * ...\n * routes: [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * handlerLabel: routeLabels.JOB_DETAIL,\n * match: (url) => isUrlOfJobOffer(url),\n * }, {\n * // Define custom action function:\n * // If match returns true, we replace this request with new one\n * // pointing to new domain.\n * name: 'Main page',\n * handlerLabel: null,\n * match: (url) => url.match(/example\\.com\\/?(?:[?#~]|$)/i),\n * action: async (url, ctx, _, handlers) => {\n * ctx.log.info(`Redirecting to https://www.new-domain.com`);\n * await ctx.crawler.addRequests(['https://www.new-domain.com'], { forefront: true });\n * },\n * }],\n * })\n */\n routes: MaybeAsyncFn<\n RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[],\n [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]\n >;\n /** Handlers for the labelled requests. The object keys are the labels. */\n routeHandlers: MaybeAsyncFn<Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n /**\n * Provides the option to modify or extend all router handlers by wrapping\n * them in these functions.\n *\n * Wrappers are applied from right to left. That means that wrappers `[A, B, C]`\n * will be applied like so `A( B( C( handler ) ) )`.\n *\n * Default `routerWrappers`:\n * ```js\n * {\n * ...\n * routerWrappers: ({ input }) => [\n * logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 
'info'),\n * ],\n * }\n * ```\n */\n routerWrappers?: MaybeAsyncFn<CrawlerRouterWrapper<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>[], [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n\n // Proxy setup\n proxy?: MaybeAsyncFn<ProxyConfiguration, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n\n // Crawler setup\n createCrawler: (\n actorCtx: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'crawler' | 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n ) => MaybePromise<Ctx['crawler']>;\n}\n\n/** ActorDefinition object where the input is already resolved */\nexport type ActorDefinitionWithInput<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = Omit<ActorDefinition<Ctx, Labels, Input, TIO>, 'input'> & {\n input: Input | null;\n state: Record<string, unknown>;\n};\n\n/** Context available while creating a Crawlee crawler/actor */\nexport interface ActorContext<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> {\n crawler: Ctx['crawler'];\n /**\n * This function wraps `crawler.run(requests, runOtions)` with additional\n * features:\n * - Automatically metamorph into another actor after the run finishes\n */\n runCrawler: RunCrawler<Ctx>;\n /** Trigger actor metamorph, using actor's inputs as defaults. 
*/\n metamorph: Metamorph;\n /**\n * `Actor.pushData` with extra optional features:\n *\n * - Limit the number of entries pushed to the Dataset based on the Actor input\n * - Transform and filter entries via Actor input.\n * - Add metadata to entries before they are pushed to Dataset.\n * - Set which (nested) properties are personal data optionally redact them for privacy compliance.\n */\n pushData: typeof pushData;\n /**\n * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:\n *\n * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.\n * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.\n */\n pushRequests: typeof pushRequests;\n /**\n * A list of resolved Requests to be scraped.\n *\n * This list is a combination of 3 Actor inputs:\n * - `startUrls` - Static list of URLs to scrape.\n * - `startUrlsFromDataset` - From a specific field from a Dataset (e.g. \"dataset123#fieldName\" - Dataset: \"dataset123\", field: \"fieldName\").\n * - `startUrlsFromFunction` - A function that is evaulated to generate the Requests.\n */\n startUrls: CrawlerUrl[];\n proxy?: ProxyConfiguration;\n router: RouterHandler<Ctx>;\n routes: RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>;\n /** Original config from which this actor context was created */\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>;\n /** Read-only inputs passed to the actor */\n input: Input | null;\n /** Mutable state that is shared across setup and teardown hooks */\n state: Record<string, unknown>;\n /**\n * Instance managing communication with databases - storage & retrieval\n * (Dataset, RequestQueue, KeyValueStore).\n *\n * This is modelled and similar to Apify's `Actor` static class.\n */\n io: TIO;\n log: Log;\n}\n"]}
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/actor/types.ts"],"names":[],"mappings":"","sourcesContent":["import type {\n BasicCrawler,\n CrawlingContext,\n Log,\n ProxyConfiguration,\n RouterHandler,\n} from 'crawlee';\nimport type { gotScraping } from 'got-scraping';\n\nimport type { MaybePromise, PickPartial } from '../../utils/types';\nimport type { CrawlerUrl } from '../../types';\nimport type { itemCacheKey, pushData } from '../io/pushData';\nimport type { pushRequests } from '../io/pushRequests';\nimport type { RouteHandler, RouteMatcher, CrawlerRouterWrapper } from '../router/types';\nimport type { MetamorphActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\n\ntype MaybeAsyncFn<R, Args extends any[]> = R | ((...args: Args) => MaybePromise<R>);\n\ntype OrigRunCrawler<T extends CrawlingContext<any, any>> = BasicCrawler<T>['run'];\n\n/** Extended type of `crawler.run()` function */\nexport type RunCrawler<Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>> = (\n requests?: CrawlerUrl[],\n options?: Parameters<OrigRunCrawler<Ctx>>[1]\n) => ReturnType<OrigRunCrawler<Ctx>>;\n\n/** Trigger actor metamorph, using actor's inputs as defaults. 
*/\nexport type Metamorph = (overrides?: MetamorphActorInput) => Promise<void>;\n\n/** Context passed to route handlers */\nexport type ActorRouterContext<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = {\n actor: ActorContext<Ctx, Labels, Input, TIO>;\n};\n\n/** Context passed to user-defined functions passed from input */\nexport type ActorHookContext<TIO extends CrawleeOneIO> = Pick<ActorContext, 'input' | 'state'> & {\n io: TIO;\n itemCacheKey: typeof itemCacheKey;\n sendRequest: typeof gotScraping;\n};\n\nexport interface ActorDefinition<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> {\n /** Client for communicating with cloud/local storage. */\n io: TIO;\n\n // Actor input\n /**\n * Actor input which you can get e.g. via `Actor.getInput()`\n *\n * Input is automatically retrieved if undefined.\n */\n input?: MaybeAsyncFn<Input, [ActorDefinition<Ctx, Labels, Input, TIO>]>;\n /** Validation for the actor input. Should throw error if validation fails. */\n validateInput?: (input: Input | null) => MaybePromise<void>;\n\n // Router setup\n /**\n * Router instance that redirects the request to handlers.\n * @example\n * import { createCheerioRouter } from 'crawlee';\n *\n * ({\n * ...\n * router: createCheerioRouter(),\n * })\n */\n router: MaybeAsyncFn<RouterHandler<Ctx>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>;\n /**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. 
If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n *\n * @example\n * ({\n * ...\n * routes: [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * handlerLabel: routeLabels.JOB_DETAIL,\n * match: (url) => isUrlOfJobOffer(url),\n * }, {\n * // Define custom action function:\n * // If match returns true, we replace this request with new one\n * // pointing to new domain.\n * name: 'Main page',\n * handlerLabel: null,\n * match: (url) => url.match(/example\\.com\\/?(?:[?#~]|$)/i),\n * action: async (url, ctx, _, handlers) => {\n * ctx.log.info(`Redirecting to https://www.new-domain.com`);\n * await ctx.crawler.addRequests(['https://www.new-domain.com'], { forefront: true });\n * },\n * }],\n * })\n */\n routes: MaybeAsyncFn<\n RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[],\n [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]\n >;\n /** Handlers for the labelled requests. The object keys are the labels. */\n routeHandlers: MaybeAsyncFn<Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n /**\n * Provides the option to modify or extend all router handlers by wrapping\n * them in these functions.\n *\n * Wrappers are applied from right to left. That means that wrappers `[A, B, C]`\n * will be applied like so `A( B( C( handler ) ) )`.\n *\n * Default `routerWrappers`:\n * ```js\n * {\n * ...\n * routerWrappers: ({ input }) => [\n * logLevelHandlerWrapper<Ctx, any>(input?.logLevel ?? 
'info'),\n * ],\n * }\n * ```\n */\n routerWrappers?: MaybeAsyncFn<CrawlerRouterWrapper<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>[], [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n\n // Proxy setup\n proxy?: MaybeAsyncFn<ProxyConfiguration, [ActorDefinitionWithInput<Ctx, Labels, Input, TIO>]>; // prettier-ignore\n\n // Crawler setup\n createCrawler: (\n actorCtx: Omit<\n ActorContext<Ctx, Labels, Input, TIO>,\n 'crawler' | 'runCrawler' | 'metamorph' | 'pushData' | 'pushRequests' | 'startUrls'\n >\n ) => MaybePromise<Ctx['crawler']>;\n}\n\n/** ActorDefinition object where the input is already resolved */\nexport type ActorDefinitionWithInput<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> = Omit<ActorDefinition<Ctx, Labels, Input, TIO>, 'input'> & {\n input: Input | null;\n state: Record<string, unknown>;\n};\n\n/** Context available while creating a Crawlee crawler/actor */\nexport interface ActorContext<\n Ctx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>,\n TIO extends CrawleeOneIO = CrawleeOneIO\n> {\n crawler: Ctx['crawler'];\n /**\n * This function wraps `crawler.run(requests, runOtions)` with additional\n * features:\n * - Automatically metamorph into another actor after the run finishes\n */\n runCrawler: RunCrawler<Ctx>;\n /** Trigger actor metamorph, using actor's inputs as defaults. 
*/\n metamorph: Metamorph;\n /**\n * `Actor.pushData` with extra optional features:\n *\n * - Limit the number of entries pushed to the Dataset based on the Actor input\n * - Transform and filter entries via Actor input.\n * - Add metadata to entries before they are pushed to Dataset.\n * - Set which (nested) properties are personal data optionally redact them for privacy compliance.\n */\n pushData: typeof pushData;\n /**\n * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:\n *\n * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.\n * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.\n */\n pushRequests: typeof pushRequests;\n /**\n * A list of resolved Requests to be scraped.\n *\n * This list is a combination of 3 Actor inputs:\n * - `startUrls` - Static list of URLs to scrape.\n * - `startUrlsFromDataset` - From a specific field from a Dataset (e.g. \"dataset123#fieldName\" - Dataset: \"dataset123\", field: \"fieldName\").\n * - `startUrlsFromFunction` - A function that is evaulated to generate the Requests.\n */\n startUrls: CrawlerUrl[];\n proxy?: ProxyConfiguration;\n router: RouterHandler<Ctx>;\n routes: RouteMatcher<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<Ctx, ActorRouterContext<Ctx, Labels, Input, TIO>>>;\n /** Original config from which this actor context was created */\n config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, 'io'>;\n /** Read-only inputs passed to the actor */\n input: Input | null;\n /** Mutable state that is shared across setup and teardown hooks */\n state: Record<string, unknown>;\n /**\n * Instance managing communication with databases - storage & retrieval\n * (Dataset, RequestQueue, KeyValueStore).\n *\n * This is modelled and similar to Apify's `Actor` static class.\n */\n io: TIO;\n log: Log;\n}\n"]}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import type { BasicCrawlingContext, CheerioCrawlingContext, CrawlingContext, ErrorHandler, HttpCrawlingContext, JSDOMCrawlingContext, PlaywrightCrawlingContext, PuppeteerCrawlingContext } from 'crawlee';
|
|
2
2
|
import type { MaybePromise, PickRequired } from '../../utils/types';
|
|
3
|
-
import type { RouteHandler, RouterHandlerCtx } from '../router';
|
|
3
|
+
import type { RouteHandler, RouterHandlerCtx } from '../router/types';
|
|
4
4
|
import type { CrawleeOneErrorHandlerInput, CrawleeOneErrorHandlerOptions } from '../integrations/types';
|
|
5
5
|
export type CaptureErrorInput = PickRequired<Partial<CrawleeOneErrorHandlerInput>, 'error'>;
|
|
6
6
|
export type CaptureError = (input: CaptureErrorInput) => MaybePromise<void>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"errorHandler.js","sourceRoot":"","sources":["../../../../src/lib/error/errorHandler.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAUA,qDAAuC;AAUvC,iDAAgD;AAKhD;;;;;;GAMG;AACI,MAAM,YAAY,GAAG,CAC1B,KAAwB,EACxB,OAAqD,EACrD,EAAE;;IACF,MAAM,EAAE,KAAK,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,KAAK,CAAC;IACxC,MAAM,EACJ,EAAE,GAAG,eAA6C,EAClD,kBAAkB,EAClB,cAAc,GACf,GAAG,OAAO,CAAC;IAEZ,MAAM,GAAG,GAAG,MAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,KAAK,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,CAAC,mCAAI,IAAI,CAAC;IAErE,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,SAAS,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,OAAO,EAAE,EAAE,KAAK,CAAC,CAAC;IAC3D,OAAO,CAAC,KAAK,CAAC,SAAS,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,OAAO,EAAE,EAAE,KAAK,CAAC,CAAC;IAE9D,iCAAiC;IACjC,2DAA2D;IAC3D,MAAM,gBAAgB,GAAG,kBAAkB,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,WAAW,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAC9F,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,mBAAmB,CACzC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAA,KAAK,CAAC,IAAI,mCAAI,IAAI,EAAE,GAAG,EAAE,MAAA,KAAK,CAAC,GAAG,mCAAI,IAAI,EAAE,GAAG,EAAE,kCAC3D,OAAO,KAAE,EAAE,IACjB,CAAC;IAEF,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,gCAAgC,EAAE,MAAM,CAAC,CAAC;IAErD,yBAAyB;IACzB,IAAI,kBAAkB,EAAE;QACtB,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,4CAA4C,kBAAkB,EAAE,CAAC,CAAC;QAC5E,MAAM,CAAA,gBAAgB,aAAhB,gBAAgB,uBAAhB,gBAAgB,CAAE,QAAQ,CAAC,MAAM,CAAC,CAAA,CAAC;QACzC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,iDAAiD,kBAAkB,EAAE,CAAC,CAAC;KAClF;IAED,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,wCAAwC,CAAC,CAAC;IACrD,MAAM,CAAA,cAAc,aAAd,cAAc,uBAAd,cAAc,CAAG,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC,CAAA,CAAC;IAC1C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,6CAA6C,CAAC,CAAC;IAE1D,gEAAgE;IAChE,KAAK,CAAC,wBAAwB,GAAG,IAAI,CAAC;IACtC,sBAAsB;IACtB,MAAM,KAAK,CAAC;AACd,CAAC,CAAA,CAAC;AAzCW,QAAA,YAAY,gBAyCvB;AAEF;;;;GAIG;AACI,MAAM,mBAAmB,GAAG,CAIjC,EAAiE,EACjE,OAAqD,EACrD,EAAE;IACF,MAAM,oBAAoB,GAAiB,CAAC,KAAK,EAAE,EAAE,CAAC,IAAA,oBAAY,EAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAEnF,IAAI;QACF,4FAA4F;QAC5F,MAAM,EAAE,CAAC,EAAE,YAAY,EAAE,oBAAoB,EAAE,CAAC,CAAC;KAClD;IAAC,OAAO
,KAAU,EAAE;QACnB,IAAI,CAAC,KAAK,CAAC,wBAAwB,EAAE;YACnC,+DAA+D;YAC/D,MAAM,oBAAoB,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;SACzE;KACF;AACH,CAAC,CAAA,CAAC;AAlBW,QAAA,mBAAmB,uBAkB9B;AAEF;;;;;;;;;;;;;;;GAeG;AACI,MAAM,wBAAwB,GAAG,CAKtC,OAA4F,EAC5F,OAAqD,EACrD,EAAE;IACF,sFAAsF;IACtF,MAAM,cAAc,GAAG,CAAC,GAAqC,EAAE,EAAE;QAC/D,OAAO,IAAA,2BAAmB,EAAC,CAAC,EAAE,YAAY,EAAE,EAAE,EAAE;YAC9C,OAAO,OAAO,iCACR,GAAW;gBACf,0EAA0E;gBAC1E,YAAY,EAAE,CAAC,KAAK,EAAE,EAAE;;oBACtB,OAAA,YAAY,CAAC;wBACX,KAAK,EAAE,KAAK,CAAC,KAAK;wBAClB,IAAI,EAAE,MAAA,KAAK,CAAC,IAAI,mCAAI,GAAG,CAAC,IAAI;wBAC5B,GAAG,EAAE,KAAK,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG;wBACjC,GAAG,EAAE,MAAA,KAAK,CAAC,GAAG,mCAAI,GAAG,CAAC,GAAG;qBAC1B,CAAC,CAAA;iBAAA,IACJ,CAAC;QACL,CAAC,EAAE,OAAO,CAAC,CAAC;IACd,CAAC,CAAC;IACF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAzBW,QAAA,wBAAwB,4BAyBnC;AAEK,MAAM,6BAA6B,GAAG,CAAmC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAA3L,QAAA,6BAA6B,iCAA2I;AAC9K,MAAM,4BAA4B,GAAG,CAAkC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAAzL,QAAA,4BAA4B,gCAA0I;AAC5K,MAAM,6BAA6B,GAAG,CAAmC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAA3L,QAAA,6BAA6B,iCAA2I;AAC9K,MAAM,kCAAkC,GAAG,CAAwC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAArM,QAAA,kCAAkC,sCAAgJ;AACxL,MAAM,+BAA+B,GAAG,CAAqC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAA/L,QAAA,+BAA+B,mCAA6I;AAClL,MAAM,iCAAiC,GAAG,CAAuC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAAnM,QAAA,iCAAiC,qCAA+I;AAE7L;;;;;;;GAOG;AACI,MAAM,kBAAkB,GAAG,CAChC,OAAmE,EAChD,EAAE;IACrB,OAAO,CAAO,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,KAAK,EAAE,EAAE;;QAC7C,MAAM,GAAG,GAAG,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,GAAG,CAAC;QAC7C,IAAA,oBAAY,EACV,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,IAAY,EAAE,EACvC;YACE,EAAE,EAAE,OAAO,CAAC,EAAE;YACd,kBAAkB,EAAE,OAAO,CAAC,kBAAkB;YAC9C,eAAe,EAAE,
MAAA,OAAO,CAAC,eAAe,mCAAI,IAAI;YAChD,cAAc,EAAE,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE;gBACpC,IAAI,CAAC,OAAO,CAAC,YAAY;oBAAE,OAAO;gBAElC,MAAM,CAAC,gBAAgB,CAAC,KAAK,EAAE,EAAE,KAAK,EAAE,MAAa,EAAE,CAAC,CAAC;YAC3D,CAAC;SACF,CACF,CAAC;IACJ,CAAC,CAAA,CAAC;AACJ,CAAC,CAAC;AAnBW,QAAA,kBAAkB,sBAmB7B","sourcesContent":["import type {\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n ErrorHandler,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n} from 'crawlee';\nimport * as Sentry from '@sentry/node';\nimport type { Page } from 'playwright';\n\nimport type { MaybePromise, PickRequired } from '../../utils/types';\nimport type { RouteHandler, RouterHandlerCtx } from '../router';\nimport type {\n CrawleeOneErrorHandlerInput,\n CrawleeOneErrorHandlerOptions,\n CrawleeOneIO,\n} from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\n\nexport type CaptureErrorInput = PickRequired<Partial<CrawleeOneErrorHandlerInput>, 'error'>;\nexport type CaptureError = (input: CaptureErrorInput) => MaybePromise<void>;\n\n/**\n * Error handling for CrawleeOne crawlers.\n *\n * By default, error reports are saved to Apify Dataset.\n *\n * See https://docs.apify.com/academy/node-js/analyzing-pages-and-fixing-errors#error-reporting\n */\nexport const captureError = async <TEnv extends object = object, TReport extends object = object>(\n input: CaptureErrorInput,\n options: CrawleeOneErrorHandlerOptions<TEnv, TReport>\n) => {\n const { error, log: parentLog } = input;\n const {\n io = apifyIO as any as CrawleeOneIO<TEnv, TReport>,\n reportingDatasetId,\n onErrorCapture,\n } = options;\n\n const log = parentLog?.child({ prefix: '[Error capture] ' }) ?? 
null;\n\n log?.error(`ERROR ${error.name}: ${error.message}`, error);\n console.error(`ERROR ${error.name}: ${error.message}`, error);\n\n // Let's create reporting dataset\n // If you already have one, this will continue adding to it\n const reportingDataset = reportingDatasetId ? await io.openDataset(reportingDatasetId) : null;\n const report = await io.generateErrorReport(\n { error, page: input.page ?? null, url: input.url ?? null, log },\n { ...options, io }\n );\n\n log?.error('[Error capture] Error captured', report);\n\n // And we push the report\n if (reportingDatasetId) {\n log?.info(`[Error capture] Pushing error to dataset ${reportingDatasetId}`);\n await reportingDataset?.pushData(report);\n log?.info(`[Error capture] DONE pushing error to dataset ${reportingDatasetId}`);\n }\n\n log?.error('[Error capture] Calling onErrorCapture');\n await onErrorCapture?.({ error, report });\n log?.error('[Error capture] Done calling onErrorCapture');\n\n // @ts-expect-error Tag the error, so we don't capture it twice.\n error._crawleeOneErrorCaptured = true;\n // Propagate the error\n throw error;\n};\n\n/**\n * Error handling for Crawlers as a function wrapper\n *\n * By default, error reports are saved to Apify Dataset.\n */\nexport const captureErrorWrapper = async <\n TEnv extends object = object,\n TReport extends object = object\n>(\n fn: (input: { captureError: CaptureError }) => MaybePromise<void>,\n options: CrawleeOneErrorHandlerOptions<TEnv, TReport>\n) => {\n const captureErrorWithArgs: CaptureError = (input) => captureError(input, options);\n\n try {\n // Pass the error capturing function to the wrapped function, so it can trigger it by itself\n await fn({ captureError: captureErrorWithArgs });\n } catch (error: any) {\n if (!error._crawleeOneErrorCaptured) {\n // And if the wrapped function fails, we capture error for them\n await captureErrorWithArgs({ error, url: null, page: null, log: null });\n }\n }\n};\n\n/**\n * Drop-in replacement for regular 
request handler callback for Crawlee route\n * that automatically tracks errors.\n *\n * By default, error reports are saved to Apify Dataset.\n *\n * @example\n *\n * router.addDefaultHandler(\n * captureErrorRouteHandler(async (ctx) => {\n * const { page, crawler } = ctx;\n * const url = page.url();\n * ...\n * })\n * );\n */\nexport const captureErrorRouteHandler = <\n Ctx extends CrawlingContext,\n TEnv extends object = object,\n TReport extends object = object\n>(\n handler: (ctx: RouterHandlerCtx<Ctx> & { captureError: CaptureError }) => MaybePromise<void>,\n options: CrawleeOneErrorHandlerOptions<TEnv, TReport>\n) => {\n // Wrap the original handler, so we can additionally pass it the captureError function\n const wrapperHandler = (ctx: Parameters<RouteHandler<Ctx>>[0]) => {\n return captureErrorWrapper(({ captureError }) => {\n return handler({\n ...(ctx as any),\n // And automatically feed contextual args (page, url, log) to captureError\n captureError: (input) =>\n captureError({\n error: input.error,\n page: input.page ?? ctx.page,\n url: input.url || ctx.request.url,\n log: input.log ?? 
ctx.log,\n }),\n });\n }, options);\n };\n return wrapperHandler;\n};\n\nexport const basicCaptureErrorRouteHandler = <Ctx extends BasicCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const httpCaptureErrorRouteHandler = <Ctx extends HttpCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const jsdomCaptureErrorRouteHandler = <Ctx extends JSDOMCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const playwrightCaptureErrorRouteHandler = <Ctx extends PlaywrightCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const cheerioCaptureErrorRouteHandler = <Ctx extends CheerioCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const puppeteerCaptureErrorRouteHandler = <Ctx extends PuppeteerCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\n\n/**\n * Create an `ErrorHandler` function that can be assigned to\n * `failedRequestHandler` option of `BasicCrawlerOptions`.\n *\n * The function saves error to a Dataset, and optionally forwards it to Sentry.\n *\n * By default, error reports are saved to Apify Dataset.\n */\nexport const createErrorHandler = <Ctx extends CrawlingContext>(\n options: CrawleeOneErrorHandlerOptions & { sendToSentry?: boolean }\n): ErrorHandler<Ctx> => {\n return async ({ request, log, page }, error) => {\n const url = request.loadedUrl || request.url;\n captureError(\n { error, url, log, page: page as Page },\n {\n io: options.io,\n reportingDatasetId: options.reportingDatasetId,\n allowScreenshot: 
options.allowScreenshot ?? true,\n onErrorCapture: ({ error, report }) => {\n if (!options.sendToSentry) return;\n\n Sentry.captureException(error, { extra: report as any });\n },\n }\n );\n };\n};\n"]}
|
|
1
|
+
{"version":3,"file":"errorHandler.js","sourceRoot":"","sources":["../../../../src/lib/error/errorHandler.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAUA,qDAAuC;AAUvC,iDAAgD;AAKhD;;;;;;GAMG;AACI,MAAM,YAAY,GAAG,CAC1B,KAAwB,EACxB,OAAqD,EACrD,EAAE;;IACF,MAAM,EAAE,KAAK,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,KAAK,CAAC;IACxC,MAAM,EACJ,EAAE,GAAG,eAA6C,EAClD,kBAAkB,EAClB,cAAc,GACf,GAAG,OAAO,CAAC;IAEZ,MAAM,GAAG,GAAG,MAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,KAAK,CAAC,EAAE,MAAM,EAAE,kBAAkB,EAAE,CAAC,mCAAI,IAAI,CAAC;IAErE,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,SAAS,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,OAAO,EAAE,EAAE,KAAK,CAAC,CAAC;IAC3D,OAAO,CAAC,KAAK,CAAC,SAAS,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,OAAO,EAAE,EAAE,KAAK,CAAC,CAAC;IAE9D,iCAAiC;IACjC,2DAA2D;IAC3D,MAAM,gBAAgB,GAAG,kBAAkB,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,WAAW,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAC9F,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,mBAAmB,CACzC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAA,KAAK,CAAC,IAAI,mCAAI,IAAI,EAAE,GAAG,EAAE,MAAA,KAAK,CAAC,GAAG,mCAAI,IAAI,EAAE,GAAG,EAAE,kCAC3D,OAAO,KAAE,EAAE,IACjB,CAAC;IAEF,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,gCAAgC,EAAE,MAAM,CAAC,CAAC;IAErD,yBAAyB;IACzB,IAAI,kBAAkB,EAAE;QACtB,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,4CAA4C,kBAAkB,EAAE,CAAC,CAAC;QAC5E,MAAM,CAAA,gBAAgB,aAAhB,gBAAgB,uBAAhB,gBAAgB,CAAE,QAAQ,CAAC,MAAM,CAAC,CAAA,CAAC;QACzC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,iDAAiD,kBAAkB,EAAE,CAAC,CAAC;KAClF;IAED,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,wCAAwC,CAAC,CAAC;IACrD,MAAM,CAAA,cAAc,aAAd,cAAc,uBAAd,cAAc,CAAG,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC,CAAA,CAAC;IAC1C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,6CAA6C,CAAC,CAAC;IAE1D,gEAAgE;IAChE,KAAK,CAAC,wBAAwB,GAAG,IAAI,CAAC;IACtC,sBAAsB;IACtB,MAAM,KAAK,CAAC;AACd,CAAC,CAAA,CAAC;AAzCW,QAAA,YAAY,gBAyCvB;AAEF;;;;GAIG;AACI,MAAM,mBAAmB,GAAG,CAIjC,EAAiE,EACjE,OAAqD,EACrD,EAAE;IACF,MAAM,oBAAoB,GAAiB,CAAC,KAAK,EAAE,EAAE,CAAC,IAAA,oBAAY,EAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAEnF,IAAI;QACF,4FAA4F;QAC5F,MAAM,EAAE,CAAC,EAAE,YAAY,EAAE,oBAAoB,EAAE,CAAC,CAAC;KAClD;IAAC,OAAO
,KAAU,EAAE;QACnB,IAAI,CAAC,KAAK,CAAC,wBAAwB,EAAE;YACnC,+DAA+D;YAC/D,MAAM,oBAAoB,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;SACzE;KACF;AACH,CAAC,CAAA,CAAC;AAlBW,QAAA,mBAAmB,uBAkB9B;AAEF;;;;;;;;;;;;;;;GAeG;AACI,MAAM,wBAAwB,GAAG,CAKtC,OAA4F,EAC5F,OAAqD,EACrD,EAAE;IACF,sFAAsF;IACtF,MAAM,cAAc,GAAG,CAAC,GAAqC,EAAE,EAAE;QAC/D,OAAO,IAAA,2BAAmB,EAAC,CAAC,EAAE,YAAY,EAAE,EAAE,EAAE;YAC9C,OAAO,OAAO,iCACR,GAAW;gBACf,0EAA0E;gBAC1E,YAAY,EAAE,CAAC,KAAK,EAAE,EAAE;;oBACtB,OAAA,YAAY,CAAC;wBACX,KAAK,EAAE,KAAK,CAAC,KAAK;wBAClB,IAAI,EAAE,MAAA,KAAK,CAAC,IAAI,mCAAI,GAAG,CAAC,IAAI;wBAC5B,GAAG,EAAE,KAAK,CAAC,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,GAAG;wBACjC,GAAG,EAAE,MAAA,KAAK,CAAC,GAAG,mCAAI,GAAG,CAAC,GAAG;qBAC1B,CAAC,CAAA;iBAAA,IACJ,CAAC;QACL,CAAC,EAAE,OAAO,CAAC,CAAC;IACd,CAAC,CAAC;IACF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAzBW,QAAA,wBAAwB,4BAyBnC;AAEK,MAAM,6BAA6B,GAAG,CAAmC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAA3L,QAAA,6BAA6B,iCAA2I;AAC9K,MAAM,4BAA4B,GAAG,CAAkC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAAzL,QAAA,4BAA4B,gCAA0I;AAC5K,MAAM,6BAA6B,GAAG,CAAmC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAA3L,QAAA,6BAA6B,iCAA2I;AAC9K,MAAM,kCAAkC,GAAG,CAAwC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAArM,QAAA,kCAAkC,sCAAgJ;AACxL,MAAM,+BAA+B,GAAG,CAAqC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAA/L,QAAA,+BAA+B,mCAA6I;AAClL,MAAM,iCAAiC,GAAG,CAAuC,GAAG,IAAsD,EAAE,EAAE,CAAC,IAAA,gCAAwB,EAAM,GAAG,IAAI,CAAC,CAAC,CAAC,kBAAkB;AAAnM,QAAA,iCAAiC,qCAA+I;AAE7L;;;;;;;GAOG;AACI,MAAM,kBAAkB,GAAG,CAChC,OAAmE,EAChD,EAAE;IACrB,OAAO,CAAO,EAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,KAAK,EAAE,EAAE;;QAC7C,MAAM,GAAG,GAAG,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,GAAG,CAAC;QAC7C,IAAA,oBAAY,EACV,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,IAAY,EAAE,EACvC;YACE,EAAE,EAAE,OAAO,CAAC,EAAE;YACd,kBAAkB,EAAE,OAAO,CAAC,kBAAkB;YAC9C,eAAe,EAAE,
MAAA,OAAO,CAAC,eAAe,mCAAI,IAAI;YAChD,cAAc,EAAE,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,EAAE;gBACpC,IAAI,CAAC,OAAO,CAAC,YAAY;oBAAE,OAAO;gBAElC,MAAM,CAAC,gBAAgB,CAAC,KAAK,EAAE,EAAE,KAAK,EAAE,MAAa,EAAE,CAAC,CAAC;YAC3D,CAAC;SACF,CACF,CAAC;IACJ,CAAC,CAAA,CAAC;AACJ,CAAC,CAAC;AAnBW,QAAA,kBAAkB,sBAmB7B","sourcesContent":["import type {\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n ErrorHandler,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n} from 'crawlee';\nimport * as Sentry from '@sentry/node';\nimport type { Page } from 'playwright';\n\nimport type { MaybePromise, PickRequired } from '../../utils/types';\nimport type { RouteHandler, RouterHandlerCtx } from '../router/types';\nimport type {\n CrawleeOneErrorHandlerInput,\n CrawleeOneErrorHandlerOptions,\n CrawleeOneIO,\n} from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\n\nexport type CaptureErrorInput = PickRequired<Partial<CrawleeOneErrorHandlerInput>, 'error'>;\nexport type CaptureError = (input: CaptureErrorInput) => MaybePromise<void>;\n\n/**\n * Error handling for CrawleeOne crawlers.\n *\n * By default, error reports are saved to Apify Dataset.\n *\n * See https://docs.apify.com/academy/node-js/analyzing-pages-and-fixing-errors#error-reporting\n */\nexport const captureError = async <TEnv extends object = object, TReport extends object = object>(\n input: CaptureErrorInput,\n options: CrawleeOneErrorHandlerOptions<TEnv, TReport>\n) => {\n const { error, log: parentLog } = input;\n const {\n io = apifyIO as any as CrawleeOneIO<TEnv, TReport>,\n reportingDatasetId,\n onErrorCapture,\n } = options;\n\n const log = parentLog?.child({ prefix: '[Error capture] ' }) ?? 
null;\n\n log?.error(`ERROR ${error.name}: ${error.message}`, error);\n console.error(`ERROR ${error.name}: ${error.message}`, error);\n\n // Let's create reporting dataset\n // If you already have one, this will continue adding to it\n const reportingDataset = reportingDatasetId ? await io.openDataset(reportingDatasetId) : null;\n const report = await io.generateErrorReport(\n { error, page: input.page ?? null, url: input.url ?? null, log },\n { ...options, io }\n );\n\n log?.error('[Error capture] Error captured', report);\n\n // And we push the report\n if (reportingDatasetId) {\n log?.info(`[Error capture] Pushing error to dataset ${reportingDatasetId}`);\n await reportingDataset?.pushData(report);\n log?.info(`[Error capture] DONE pushing error to dataset ${reportingDatasetId}`);\n }\n\n log?.error('[Error capture] Calling onErrorCapture');\n await onErrorCapture?.({ error, report });\n log?.error('[Error capture] Done calling onErrorCapture');\n\n // @ts-expect-error Tag the error, so we don't capture it twice.\n error._crawleeOneErrorCaptured = true;\n // Propagate the error\n throw error;\n};\n\n/**\n * Error handling for Crawlers as a function wrapper\n *\n * By default, error reports are saved to Apify Dataset.\n */\nexport const captureErrorWrapper = async <\n TEnv extends object = object,\n TReport extends object = object\n>(\n fn: (input: { captureError: CaptureError }) => MaybePromise<void>,\n options: CrawleeOneErrorHandlerOptions<TEnv, TReport>\n) => {\n const captureErrorWithArgs: CaptureError = (input) => captureError(input, options);\n\n try {\n // Pass the error capturing function to the wrapped function, so it can trigger it by itself\n await fn({ captureError: captureErrorWithArgs });\n } catch (error: any) {\n if (!error._crawleeOneErrorCaptured) {\n // And if the wrapped function fails, we capture error for them\n await captureErrorWithArgs({ error, url: null, page: null, log: null });\n }\n }\n};\n\n/**\n * Drop-in replacement for regular 
request handler callback for Crawlee route\n * that automatically tracks errors.\n *\n * By default, error reports are saved to Apify Dataset.\n *\n * @example\n *\n * router.addDefaultHandler(\n * captureErrorRouteHandler(async (ctx) => {\n * const { page, crawler } = ctx;\n * const url = page.url();\n * ...\n * })\n * );\n */\nexport const captureErrorRouteHandler = <\n Ctx extends CrawlingContext,\n TEnv extends object = object,\n TReport extends object = object\n>(\n handler: (ctx: RouterHandlerCtx<Ctx> & { captureError: CaptureError }) => MaybePromise<void>,\n options: CrawleeOneErrorHandlerOptions<TEnv, TReport>\n) => {\n // Wrap the original handler, so we can additionally pass it the captureError function\n const wrapperHandler = (ctx: Parameters<RouteHandler<Ctx>>[0]) => {\n return captureErrorWrapper(({ captureError }) => {\n return handler({\n ...(ctx as any),\n // And automatically feed contextual args (page, url, log) to captureError\n captureError: (input) =>\n captureError({\n error: input.error,\n page: input.page ?? ctx.page,\n url: input.url || ctx.request.url,\n log: input.log ?? 
ctx.log,\n }),\n });\n }, options);\n };\n return wrapperHandler;\n};\n\nexport const basicCaptureErrorRouteHandler = <Ctx extends BasicCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const httpCaptureErrorRouteHandler = <Ctx extends HttpCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const jsdomCaptureErrorRouteHandler = <Ctx extends JSDOMCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const playwrightCaptureErrorRouteHandler = <Ctx extends PlaywrightCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const cheerioCaptureErrorRouteHandler = <Ctx extends CheerioCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\nexport const puppeteerCaptureErrorRouteHandler = <Ctx extends PuppeteerCrawlingContext>(...args: Parameters<typeof captureErrorRouteHandler<Ctx>>) => captureErrorRouteHandler<Ctx>(...args); // prettier-ignore\n\n/**\n * Create an `ErrorHandler` function that can be assigned to\n * `failedRequestHandler` option of `BasicCrawlerOptions`.\n *\n * The function saves error to a Dataset, and optionally forwards it to Sentry.\n *\n * By default, error reports are saved to Apify Dataset.\n */\nexport const createErrorHandler = <Ctx extends CrawlingContext>(\n options: CrawleeOneErrorHandlerOptions & { sendToSentry?: boolean }\n): ErrorHandler<Ctx> => {\n return async ({ request, log, page }, error) => {\n const url = request.loadedUrl || request.url;\n captureError(\n { error, url, log, page: page as Page },\n {\n io: options.io,\n reportingDatasetId: options.reportingDatasetId,\n allowScreenshot: 
options.allowScreenshot ?? true,\n onErrorCapture: ({ error, report }) => {\n if (!options.sendToSentry) return;\n\n Sentry.captureException(error, { extra: report as any });\n },\n }\n );\n };\n};\n"]}
|
|
@@ -87,10 +87,14 @@ exports.apifyIO = {
|
|
|
87
87
|
};
|
|
88
88
|
}),
|
|
89
89
|
openRequestQueue: (...args) => __awaiter(void 0, void 0, void 0, function* () {
|
|
90
|
-
|
|
90
|
+
const queue = yield apify_1.Actor.openRequestQueue(...args);
|
|
91
91
|
const clear = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
92
|
-
|
|
93
|
-
|
|
92
|
+
let req;
|
|
93
|
+
do {
|
|
94
|
+
req = yield queue.fetchNextRequest();
|
|
95
|
+
if (req)
|
|
96
|
+
yield queue.markRequestHandled(req);
|
|
97
|
+
} while (req);
|
|
94
98
|
});
|
|
95
99
|
return {
|
|
96
100
|
addRequests: (...args) => queue.addRequests(...args),
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"apify.js","sourceRoot":"","sources":["../../../../src/lib/integrations/apify.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,iCAAwC;AACxC,qCAAsF;AAuCtF,MAAM,wBAAwB,GAA6C,CACzE,KAAK,EACL,OAAO,EACP,EAAE;IACF,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,KAAK,CAAC;IACxC,MAAM,EAAE,EAAE,EAAE,eAAe,EAAE,GAAG,OAAO,CAAC;IAExC,oEAAoE;IACpE,wCAAwC;IACxC,+CAA+C;IAC/C,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,sBAAsB,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC,MAAM,EAAE,CAAC;IAEnF,MAAM,WAAW,GAAG,oCAAoC,OAAO,SAAS,UAAU,EAAE,CAAC;IAErF,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;IACnC,MAAM,GAAG,GAAG,SAAS,YAAY,EAAE,CAAC;IAEpC,IAAI,cAAc,GAAkB,IAAI,CAAC;IACzC,IAAI,gBAAgB,GAAkB,IAAI,CAAC;IAC3C,IAAI,OAAO,GAAkB,GAAG,aAAH,GAAG,cAAH,GAAG,GAAI,IAAI,CAAC;IACzC,IAAI,IAAI,IAAI,eAAe,EAAE;QAC3B,OAAO,GAAG,OAAO,IAAI,IAAI,CAAC,GAAG,EAAE,CAAC;QAChC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,yBAAyB,CAAC,CAAC;QACrC,MAAM,yBAAe,CAAC,YAAY,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QAClD,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,8BAA8B,CAAC,CAAC;QAC1C,0EAA0E;QAC1E,cAAc,GAAG,6CAA6C,OAAO,YAAY,GAAG,2BAA2B,CAAC;QAChH,gBAAgB,GAAG,6CAA6C,OAAO,YAAY,GAAG,4BAA4B,CAAC;KACpH;IAED,4BAA4B;IAC5B,MAAM,MAAM,GAAG;QACb,OAAO;QACP,UAAU;QACV,WAAW;QACX,SAAS,EAAE,KAAK,CAAC,IAAI;QACrB,YAAY,EAAE,KAAK,CAAC,QAAQ,EAAE;QAE9B,OAAO;QACP,gBAAgB;QAChB,cAAc;KACY,CAAC;IAE7B,OAAO,MAAM,CAAC;AAChB,CAAC,CAAA,CAAC;AAEF,MAAM,0BAA0B,GAAG,CAA8B,GAAQ,EAAE,EAAE;;IAC3E,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,aAAK,CAAC,MAAM,EAAE,CAAC;IAC/C,MAAM,WAAW,GACf,OAAO,IAAI,IAAI,IAAI,UAAU,IAAI,IAAI;QACnC,CAAC,CAAC,oCAAoC,OAAO,SAAS,UAAU,EAAE;QAClE,CAAC,CAAC,IAAI,CAAC;IACX,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAE3C,MAAM,QAAQ,GAAG;QACf,OAAO;QACP,UAAU;QACV,WAAW;QACX,SAAS,EAAE,GAAG,CAAC,EAAE;QACjB,SAAS,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,EAAE,mCAAI,IAAI;QAEjC,WAAW,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,GAAG,mCAAI,IAAI;QACpC,SAAS,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,SAAS,mCAAI,IAAI;QAExC,WAAW,EAAE,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,SAAS;QAC/C,eAAe,EAAE,GAAG,CAAC,OAAO,CAAC,UAAU;KACX,CAAC;IAE
/B,OAAO,QAAQ,CAAC;AAClB,CAAC,CAAC;AAEF;;;;GAIG;AACU,QAAA,OAAO,GAAsB;IACxC,WAAW,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QAC7B,MAAM,OAAO,GAAG,MAAM,aAAK,CAAC,WAAW,CAAC,GAAG,IAAI,CAAC,CAAC;QACjD,MAAM,YAAY,GAAG,GAAS,EAAE,8DAAC,OAAA,MAAA,MAAA,CAAC,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC,0CAAE,SAAS,mCAAI,IAAI,CAAA,GAAA,CAAC;QAC9E,MAAM,QAAQ,GAAkC,CAAO,OAAO,EAAE,EAAE;YAChE,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,OAAO,iCAC/B,OAAO,KACV,SAAS,EAAE,IAAI,IACf,CAAC;YACH,OAAO,MAAM,CAAC,KAAK,CAAC;QACtB,CAAC,CAAA,CAAC;QAEF,OAAO;YACL,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC;YACxC,QAAQ;YACR,YAAY;SACb,CAAC;IACJ,CAAC,CAAA;IACD,gBAAgB,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QAClC,IAAI,KAAK,GAAG,MAAM,aAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC,CAAC;QAClD,MAAM,KAAK,GAAG,GAAS,EAAE;YACvB,MAAM,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,GAAG,MAAM,aAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC,CAAC;QAChD,CAAC,CAAA,CAAC;QAEF,OAAO;YACL,WAAW,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,IAAI,CAAC;YACpD,kBAAkB,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,kBAAkB,CAAC,GAAG,IAAI,CAAC;YAClE,gBAAgB,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC;YAC9D,cAAc,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,cAAc,CAAC,GAAG,IAAI,CAAC;YAC1D,UAAU,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC;YAClD,YAAY,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,YAAY,CAAC,GAAG,IAAI,CAAC;YACtD,IAAI,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YACtC,KAAK;SACN,CAAC;IACJ,CAAC,CAAA;IACD,iBAAiB,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QACnC,MAAM,KAAK,GAAG,MAAM,aAAK,CAAC,iBAAiB,CAAC,GAAG,IAAI,CAAC,CAAC;QACrD,MAAM,KAAK,GAAG,GAAS,EAAE;YACvB,MAAM,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;QAC7D,CAAC,CAAA,CAAC;QAEF,OAAO;YACL,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;YAC9C,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;YAC9C,IAAI,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YACtC,KAAK;SA
CN,CAAC;IACJ,CAAC,CAAA;IACD,MAAM,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,aAAK,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC;IAC1C,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,aAAK,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;IAC9C,YAAY,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QAC9B,MAAM,aAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;IAC5B,CAAC,CAAA;IACD,wBAAwB,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,aAAK,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC;IAC/D,+BAA+B,EAAE,CAAO,KAAU,EAAE,EAAE;QACpD,OAAO,OAAO,CAAC,GAAG,CAAC,gBAAgB;YACjC,CAAC,CAAC,MAAM,aAAK,CAAC,wBAAwB,CAAC,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,KAAK,CAAC;YACpD,CAAC,CAAC,SAAS,CAAC;IAChB,CAAC,CAAA;IACD,kBAAkB,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB;IACxD,mBAAmB,EAAE,wBAAwB;IAC7C,qBAAqB,EAAE,0BAA0B;CACtB,CAAC","sourcesContent":["import { Actor, ApifyEnv } from 'apify';\nimport { CrawlingContext, Request as CrawleeRequest, playwrightUtils } from 'crawlee';\n\nimport type { CrawleeOneDataset, CrawleeOneIO } from './types';\n\nexport interface ApifyErrorReport {\n actorId: string | null;\n actorRunId: string | null;\n actorRunUrl: string;\n errorName: string;\n errorMessage: string;\n pageUrl: string | null;\n pageHtmlSnapshot: string | null;\n pageScreenshot: string | null;\n}\n\nexport interface ApifyEntryMetadata {\n actorId: string | null;\n actorRunId: string | null;\n actorRunUrl: string | null;\n contextId: string;\n requestId: string | null;\n\n /** The URL given to the crawler */\n originalUrl: string | null;\n /** The URL given to the crawler after possible redirects */\n loadedUrl: string | null;\n\n /** ISO datetime string that indicates the time when the request has been processed. 
*/\n dateHandled: string;\n numberOfRetries: number;\n}\n\n/**\n * Integration between CrawleeOne and Apify.\n *\n * This is the default integration.\n */\nexport type ApifyCrawleeOneIO = CrawleeOneIO<ApifyEnv, ApifyErrorReport, ApifyEntryMetadata>;\n\nconst generateApifyErrorReport: ApifyCrawleeOneIO['generateErrorReport'] = async (\n input,\n options\n) => {\n const { error, page, url, log } = input;\n const { io, allowScreenshot } = options;\n\n // storeId is ID of current key-value store, where we save snapshots\n // We can also capture actor and run IDs\n // to have easy access in the reporting dataset\n const { actorId, actorRunId, defaultKeyValueStoreId: storeId } = await io.getEnv();\n\n const actorRunUrl = `https://console.apify.com/actors/${actorId}/runs/${actorRunId}`;\n\n const randomNumber = Math.random();\n const key = `ERROR-${randomNumber}`;\n\n let pageScreenshot: string | null = null;\n let pageHtmlSnapshot: string | null = null;\n let pageUrl: string | null = url ?? null;\n if (page && allowScreenshot) {\n pageUrl = pageUrl || page.url();\n log?.info('Capturing page snapshot');\n await playwrightUtils.saveSnapshot(page, { key });\n log?.info('DONE capturing page snapshot');\n // You will have to adjust the keys if you save them in a non-standard way\n pageScreenshot = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.jpg?disableRedirect=true`;\n pageHtmlSnapshot = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.html?disableRedirect=true`;\n }\n\n // We create a report object\n const report = {\n actorId,\n actorRunId,\n actorRunUrl,\n errorName: error.name,\n errorMessage: error.toString(),\n\n pageUrl,\n pageHtmlSnapshot,\n pageScreenshot,\n } satisfies ApifyErrorReport;\n\n return report;\n};\n\nconst generateApifyEntryMetadata = <Ctx extends CrawlingContext>(ctx: Ctx) => {\n const { actorId, actorRunId } = Actor.getEnv();\n const actorRunUrl =\n actorId != null && actorRunId != null\n ? 
`https://console.apify.com/actors/${actorId}/runs/${actorRunId}`\n : null;\n const handledAt = new Date().toISOString();\n\n const metadata = {\n actorId,\n actorRunId,\n actorRunUrl,\n contextId: ctx.id,\n requestId: ctx.request.id ?? null,\n\n originalUrl: ctx.request.url ?? null,\n loadedUrl: ctx.request.loadedUrl ?? null,\n\n dateHandled: ctx.request.handledAt || handledAt,\n numberOfRetries: ctx.request.retryCount,\n } satisfies ApifyEntryMetadata;\n\n return metadata;\n};\n\n/**\n * Integration between CrawleeOne and Apify.\n *\n * This is the default integration.\n */\nexport const apifyIO: ApifyCrawleeOneIO = {\n openDataset: async (...args) => {\n const dataset = await Actor.openDataset(...args);\n const getItemCount = async () => (await dataset.getInfo())?.itemCount ?? null;\n const getItems: CrawleeOneDataset['getItems'] = async (options) => {\n const result = await dataset.getData({\n ...options,\n skipEmpty: true,\n });\n return result.items;\n };\n\n return {\n pushData: dataset.pushData.bind(dataset),\n getItems,\n getItemCount,\n };\n },\n openRequestQueue: async (...args) => {\n let queue = await Actor.openRequestQueue(...args);\n const clear = async () => {\n await queue.drop();\n queue = await Actor.openRequestQueue(...args);\n };\n\n return {\n addRequests: (...args) => queue.addRequests(...args),\n markRequestHandled: (...args) => queue.markRequestHandled(...args),\n fetchNextRequest: (...args) => queue.fetchNextRequest(...args),\n reclaimRequest: (...args) => queue.reclaimRequest(...args),\n isFinished: (...args) => queue.isFinished(...args),\n handledCount: (...args) => queue.handledCount(...args),\n drop: (...args) => queue.drop(...args),\n clear,\n };\n },\n openKeyValueStore: async (...args) => {\n const store = await Actor.openKeyValueStore(...args);\n const clear = async () => {\n await store.forEachKey((key) => store.setValue(key, null));\n };\n\n return {\n getValue: (...args) => store.getValue(...args),\n setValue: (...args) => 
store.setValue(...args),\n drop: (...args) => store.drop(...args),\n clear,\n };\n },\n getEnv: (...args) => Actor.getEnv(...args),\n getInput: (...args) => Actor.getInput(...args),\n runInContext: async (...args) => {\n await Actor.main(...args);\n },\n triggerDownstreamCrawler: (...args) => Actor.metamorph(...args),\n createDefaultProxyConfiguration: async (input: any) => {\n return process.env.APIFY_IS_AT_HOME\n ? await Actor.createProxyConfiguration(input?.proxy)\n : undefined;\n },\n isTelemetryEnabled: () => !!process.env.APIFY_IS_AT_HOME,\n generateErrorReport: generateApifyErrorReport,\n generateEntryMetadata: generateApifyEntryMetadata,\n} satisfies ApifyCrawleeOneIO;\n"]}
|
|
1
|
+
{"version":3,"file":"apify.js","sourceRoot":"","sources":["../../../../src/lib/integrations/apify.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,iCAAwC;AACxC,qCAAsF;AAuCtF,MAAM,wBAAwB,GAA6C,CACzE,KAAK,EACL,OAAO,EACP,EAAE;IACF,MAAM,EAAE,KAAK,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,KAAK,CAAC;IACxC,MAAM,EAAE,EAAE,EAAE,eAAe,EAAE,GAAG,OAAO,CAAC;IAExC,oEAAoE;IACpE,wCAAwC;IACxC,+CAA+C;IAC/C,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,sBAAsB,EAAE,OAAO,EAAE,GAAG,MAAM,EAAE,CAAC,MAAM,EAAE,CAAC;IAEnF,MAAM,WAAW,GAAG,oCAAoC,OAAO,SAAS,UAAU,EAAE,CAAC;IAErF,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;IACnC,MAAM,GAAG,GAAG,SAAS,YAAY,EAAE,CAAC;IAEpC,IAAI,cAAc,GAAkB,IAAI,CAAC;IACzC,IAAI,gBAAgB,GAAkB,IAAI,CAAC;IAC3C,IAAI,OAAO,GAAkB,GAAG,aAAH,GAAG,cAAH,GAAG,GAAI,IAAI,CAAC;IACzC,IAAI,IAAI,IAAI,eAAe,EAAE;QAC3B,OAAO,GAAG,OAAO,IAAI,IAAI,CAAC,GAAG,EAAE,CAAC;QAChC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,yBAAyB,CAAC,CAAC;QACrC,MAAM,yBAAe,CAAC,YAAY,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QAClD,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,IAAI,CAAC,8BAA8B,CAAC,CAAC;QAC1C,0EAA0E;QAC1E,cAAc,GAAG,6CAA6C,OAAO,YAAY,GAAG,2BAA2B,CAAC;QAChH,gBAAgB,GAAG,6CAA6C,OAAO,YAAY,GAAG,4BAA4B,CAAC;KACpH;IAED,4BAA4B;IAC5B,MAAM,MAAM,GAAG;QACb,OAAO;QACP,UAAU;QACV,WAAW;QACX,SAAS,EAAE,KAAK,CAAC,IAAI;QACrB,YAAY,EAAE,KAAK,CAAC,QAAQ,EAAE;QAE9B,OAAO;QACP,gBAAgB;QAChB,cAAc;KACY,CAAC;IAE7B,OAAO,MAAM,CAAC;AAChB,CAAC,CAAA,CAAC;AAEF,MAAM,0BAA0B,GAAG,CAA8B,GAAQ,EAAE,EAAE;;IAC3E,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,aAAK,CAAC,MAAM,EAAE,CAAC;IAC/C,MAAM,WAAW,GACf,OAAO,IAAI,IAAI,IAAI,UAAU,IAAI,IAAI;QACnC,CAAC,CAAC,oCAAoC,OAAO,SAAS,UAAU,EAAE;QAClE,CAAC,CAAC,IAAI,CAAC;IACX,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAE3C,MAAM,QAAQ,GAAG;QACf,OAAO;QACP,UAAU;QACV,WAAW;QACX,SAAS,EAAE,GAAG,CAAC,EAAE;QACjB,SAAS,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,EAAE,mCAAI,IAAI;QAEjC,WAAW,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,GAAG,mCAAI,IAAI;QACpC,SAAS,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,SAAS,mCAAI,IAAI;QAExC,WAAW,EAAE,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,SAAS;QAC/C,eAAe,EAAE,GAAG,CAAC,OAAO,CAAC,UAAU;KACX,CAAC;IAE
/B,OAAO,QAAQ,CAAC;AAClB,CAAC,CAAC;AAEF;;;;GAIG;AACU,QAAA,OAAO,GAAsB;IACxC,WAAW,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QAC7B,MAAM,OAAO,GAAG,MAAM,aAAK,CAAC,WAAW,CAAC,GAAG,IAAI,CAAC,CAAC;QACjD,MAAM,YAAY,GAAG,GAAS,EAAE,8DAAC,OAAA,MAAA,MAAA,CAAC,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC,0CAAE,SAAS,mCAAI,IAAI,CAAA,GAAA,CAAC;QAC9E,MAAM,QAAQ,GAAkC,CAAO,OAAO,EAAE,EAAE;YAChE,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,OAAO,iCAC/B,OAAO,KACV,SAAS,EAAE,IAAI,IACf,CAAC;YACH,OAAO,MAAM,CAAC,KAAK,CAAC;QACtB,CAAC,CAAA,CAAC;QAEF,OAAO;YACL,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC;YACxC,QAAQ;YACR,YAAY;SACb,CAAC;IACJ,CAAC,CAAA;IACD,gBAAgB,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QAClC,MAAM,KAAK,GAAG,MAAM,aAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC,CAAC;QACpD,MAAM,KAAK,GAAG,GAAS,EAAE;YACvB,IAAI,GAA0B,CAAC;YAC/B,GAAG;gBACD,GAAG,GAAG,MAAM,KAAK,CAAC,gBAAgB,EAAE,CAAC;gBACrC,IAAI,GAAG;oBAAE,MAAM,KAAK,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;aAC9C,QAAQ,GAAG,EAAE;QAChB,CAAC,CAAA,CAAC;QAEF,OAAO;YACL,WAAW,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,IAAI,CAAC;YACpD,kBAAkB,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,kBAAkB,CAAC,GAAG,IAAI,CAAC;YAClE,gBAAgB,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,CAAC;YAC9D,cAAc,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,cAAc,CAAC,GAAG,IAAI,CAAC;YAC1D,UAAU,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC;YAClD,YAAY,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,YAAY,CAAC,GAAG,IAAI,CAAC;YACtD,IAAI,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YACtC,KAAK;SACN,CAAC;IACJ,CAAC,CAAA;IACD,iBAAiB,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QACnC,MAAM,KAAK,GAAG,MAAM,aAAK,CAAC,iBAAiB,CAAC,GAAG,IAAI,CAAC,CAAC;QACrD,MAAM,KAAK,GAAG,GAAS,EAAE;YACvB,MAAM,KAAK,CAAC,UAAU,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;QAC7D,CAAC,CAAA,CAAC;QAEF,OAAO;YACL,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;YAC9C,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;YAC9C,IAAI,EAAE,CAAC,GAAG,
IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YACtC,KAAK;SACN,CAAC;IACJ,CAAC,CAAA;IACD,MAAM,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,aAAK,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC;IAC1C,QAAQ,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,aAAK,CAAC,QAAQ,CAAC,GAAG,IAAI,CAAC;IAC9C,YAAY,EAAE,CAAO,GAAG,IAAI,EAAE,EAAE;QAC9B,MAAM,aAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;IAC5B,CAAC,CAAA;IACD,wBAAwB,EAAE,CAAC,GAAG,IAAI,EAAE,EAAE,CAAC,aAAK,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC;IAC/D,+BAA+B,EAAE,CAAO,KAAU,EAAE,EAAE;QACpD,OAAO,OAAO,CAAC,GAAG,CAAC,gBAAgB;YACjC,CAAC,CAAC,MAAM,aAAK,CAAC,wBAAwB,CAAC,KAAK,aAAL,KAAK,uBAAL,KAAK,CAAE,KAAK,CAAC;YACpD,CAAC,CAAC,SAAS,CAAC;IAChB,CAAC,CAAA;IACD,kBAAkB,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB;IACxD,mBAAmB,EAAE,wBAAwB;IAC7C,qBAAqB,EAAE,0BAA0B;CACtB,CAAC","sourcesContent":["import { Actor, ApifyEnv } from 'apify';\nimport { CrawlingContext, Request as CrawleeRequest, playwrightUtils } from 'crawlee';\n\nimport type { CrawleeOneDataset, CrawleeOneIO } from './types';\n\nexport interface ApifyErrorReport {\n actorId: string | null;\n actorRunId: string | null;\n actorRunUrl: string;\n errorName: string;\n errorMessage: string;\n pageUrl: string | null;\n pageHtmlSnapshot: string | null;\n pageScreenshot: string | null;\n}\n\nexport interface ApifyEntryMetadata {\n actorId: string | null;\n actorRunId: string | null;\n actorRunUrl: string | null;\n contextId: string;\n requestId: string | null;\n\n /** The URL given to the crawler */\n originalUrl: string | null;\n /** The URL given to the crawler after possible redirects */\n loadedUrl: string | null;\n\n /** ISO datetime string that indicates the time when the request has been processed. 
*/\n dateHandled: string;\n numberOfRetries: number;\n}\n\n/**\n * Integration between CrawleeOne and Apify.\n *\n * This is the default integration.\n */\nexport type ApifyCrawleeOneIO = CrawleeOneIO<ApifyEnv, ApifyErrorReport, ApifyEntryMetadata>;\n\nconst generateApifyErrorReport: ApifyCrawleeOneIO['generateErrorReport'] = async (\n input,\n options\n) => {\n const { error, page, url, log } = input;\n const { io, allowScreenshot } = options;\n\n // storeId is ID of current key-value store, where we save snapshots\n // We can also capture actor and run IDs\n // to have easy access in the reporting dataset\n const { actorId, actorRunId, defaultKeyValueStoreId: storeId } = await io.getEnv();\n\n const actorRunUrl = `https://console.apify.com/actors/${actorId}/runs/${actorRunId}`;\n\n const randomNumber = Math.random();\n const key = `ERROR-${randomNumber}`;\n\n let pageScreenshot: string | null = null;\n let pageHtmlSnapshot: string | null = null;\n let pageUrl: string | null = url ?? null;\n if (page && allowScreenshot) {\n pageUrl = pageUrl || page.url();\n log?.info('Capturing page snapshot');\n await playwrightUtils.saveSnapshot(page, { key });\n log?.info('DONE capturing page snapshot');\n // You will have to adjust the keys if you save them in a non-standard way\n pageScreenshot = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.jpg?disableRedirect=true`;\n pageHtmlSnapshot = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}.html?disableRedirect=true`;\n }\n\n // We create a report object\n const report = {\n actorId,\n actorRunId,\n actorRunUrl,\n errorName: error.name,\n errorMessage: error.toString(),\n\n pageUrl,\n pageHtmlSnapshot,\n pageScreenshot,\n } satisfies ApifyErrorReport;\n\n return report;\n};\n\nconst generateApifyEntryMetadata = <Ctx extends CrawlingContext>(ctx: Ctx) => {\n const { actorId, actorRunId } = Actor.getEnv();\n const actorRunUrl =\n actorId != null && actorRunId != null\n ? 
`https://console.apify.com/actors/${actorId}/runs/${actorRunId}`\n : null;\n const handledAt = new Date().toISOString();\n\n const metadata = {\n actorId,\n actorRunId,\n actorRunUrl,\n contextId: ctx.id,\n requestId: ctx.request.id ?? null,\n\n originalUrl: ctx.request.url ?? null,\n loadedUrl: ctx.request.loadedUrl ?? null,\n\n dateHandled: ctx.request.handledAt || handledAt,\n numberOfRetries: ctx.request.retryCount,\n } satisfies ApifyEntryMetadata;\n\n return metadata;\n};\n\n/**\n * Integration between CrawleeOne and Apify.\n *\n * This is the default integration.\n */\nexport const apifyIO: ApifyCrawleeOneIO = {\n openDataset: async (...args) => {\n const dataset = await Actor.openDataset(...args);\n const getItemCount = async () => (await dataset.getInfo())?.itemCount ?? null;\n const getItems: CrawleeOneDataset['getItems'] = async (options) => {\n const result = await dataset.getData({\n ...options,\n skipEmpty: true,\n });\n return result.items;\n };\n\n return {\n pushData: dataset.pushData.bind(dataset),\n getItems,\n getItemCount,\n };\n },\n openRequestQueue: async (...args) => {\n const queue = await Actor.openRequestQueue(...args);\n const clear = async () => {\n let req: CrawleeRequest | null;\n do {\n req = await queue.fetchNextRequest();\n if (req) await queue.markRequestHandled(req);\n } while (req);\n };\n\n return {\n addRequests: (...args) => queue.addRequests(...args),\n markRequestHandled: (...args) => queue.markRequestHandled(...args),\n fetchNextRequest: (...args) => queue.fetchNextRequest(...args),\n reclaimRequest: (...args) => queue.reclaimRequest(...args),\n isFinished: (...args) => queue.isFinished(...args),\n handledCount: (...args) => queue.handledCount(...args),\n drop: (...args) => queue.drop(...args),\n clear,\n };\n },\n openKeyValueStore: async (...args) => {\n const store = await Actor.openKeyValueStore(...args);\n const clear = async () => {\n await store.forEachKey((key) => store.setValue(key, null));\n };\n\n return {\n 
getValue: (...args) => store.getValue(...args),\n setValue: (...args) => store.setValue(...args),\n drop: (...args) => store.drop(...args),\n clear,\n };\n },\n getEnv: (...args) => Actor.getEnv(...args),\n getInput: (...args) => Actor.getInput(...args),\n runInContext: async (...args) => {\n await Actor.main(...args);\n },\n triggerDownstreamCrawler: (...args) => Actor.metamorph(...args),\n createDefaultProxyConfiguration: async (input: any) => {\n return process.env.APIFY_IS_AT_HOME\n ? await Actor.createProxyConfiguration(input?.proxy)\n : undefined;\n },\n isTelemetryEnabled: () => !!process.env.APIFY_IS_AT_HOME,\n generateErrorReport: generateApifyErrorReport,\n generateEntryMetadata: generateApifyEntryMetadata,\n} satisfies ApifyCrawleeOneIO;\n"]}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { Log,
|
|
1
|
+
import { Log, RequestQueueOperationOptions } from 'crawlee';
|
|
2
|
+
import type { CrawlerUrl } from '../../types';
|
|
2
3
|
import type { CrawleeOneIO } from '../integrations/types';
|
|
3
|
-
export interface PushRequestsOptions<T extends
|
|
4
|
+
export interface PushRequestsOptions<T extends Exclude<CrawlerUrl, string> = Exclude<CrawlerUrl, string>> {
|
|
4
5
|
io?: CrawleeOneIO<any, any>;
|
|
5
6
|
log?: Log;
|
|
6
7
|
/**
|
|
@@ -36,4 +37,4 @@ export interface PushRequestsOptions<T extends CrawleeRequest = CrawleeRequest>
|
|
|
36
37
|
* - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.
|
|
37
38
|
* - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.
|
|
38
39
|
*/
|
|
39
|
-
export declare const pushRequests: <T extends
|
|
40
|
+
export declare const pushRequests: <T extends import("crawlee").RequestOptions<import("crawlee").Dictionary> | import("crawlee").Request<import("crawlee").Dictionary>>(oneOrManyItems: T | T[], options?: PushRequestsOptions<T> | undefined) => Promise<unknown[]>;
|
|
@@ -11,8 +11,8 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
11
11
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
12
|
exports.pushRequests = void 0;
|
|
13
13
|
const crawlee_1 = require("crawlee");
|
|
14
|
-
const requestQueue_1 = require("./requestQueue");
|
|
15
14
|
const apify_1 = require("../integrations/apify");
|
|
15
|
+
const requestQueue_1 = require("./requestQueue");
|
|
16
16
|
const shortenToSize = (entries, maxCount, options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
17
17
|
const { requestQueueId, log } = options !== null && options !== void 0 ? options : {};
|
|
18
18
|
const queueName = requestQueueId ? `"${requestQueueId}"` : 'DEFAULT';
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pushRequests.js","sourceRoot":"","sources":["../../../../src/lib/io/pushRequests.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,
|
|
1
|
+
{"version":3,"file":"pushRequests.js","sourceRoot":"","sources":["../../../../src/lib/io/pushRequests.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,qCAA4D;AAI5D,iDAAgD;AAChD,iDAAyD;AAoCzD,MAAM,aAAa,GAAG,CACpB,OAAY,EACZ,QAAgB,EAChB,OAAmE,EACnE,EAAE;IACF,MAAM,EAAE,cAAc,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAE9C,MAAM,SAAS,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,cAAc,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IAErE,MAAM,WAAW,GAAG,IAAA,sCAAuB,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAE/D,uDAAuD;IACvD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC;IAC1C,IAAI,MAAM,EAAE;QACV,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,iBAAiB,SAAS,sBAAsB,QAAQ,cAAc,OAAO,CAAC,MAAM,6BAA6B,CAAC,CAAC;QAChI,OAAO,EAAE,CAAC;KACX,CAAC,kBAAkB;IAEpB,8EAA8E;IAC9E,MAAM,aAAa,GAAG,MAAM,WAAW,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAC/D,IAAI,aAAa,CAAC,MAAM,KAAK,OAAO,CAAC,MAAM,EAAE;QAC3C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,iBAAiB,SAAS,sBAAsB,QAAQ,cAAc,OAAO,CAAC,MAAM,6BAA6B,CAAC,CAAC;QAChI,OAAO,EAAE,CAAC;KACX,CAAC,kBAAkB;IAEpB,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AAEF;;;;;;GAMG;AACI,MAAM,YAAY,GAAG,CAC1B,cAAuB,EACvB,OAAgC,EAChC,EAAE;IACF,MAAM,EACJ,EAAE,GAAG,eAAuB,EAC5B,GAAG,GAAG,IAAI,aAAG,EAAE,EACf,QAAQ,EACR,SAAS,EACT,MAAM,EACN,cAAc,EACd,YAAY,GACb,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAElB,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC;IACpF,MAAM,KAAK,GACT,QAAQ,IAAI,IAAI;QACd,CAAC,CAAC,MAAM,aAAa,CAAC,SAAS,EAAE,QAAQ,EAAE,EAAE,EAAE,EAAE,cAAc,EAAE,GAAG,EAAE,CAAC;QACvE,CAAC,CAAC,SAAS,CAAC;IAEhB,GAAG,CAAC,KAAK,CAAC,qBAAqB,KAAK,CAAC,MAAM,oBAAoB,CAAC,CAAC,CAAC,kBAAkB;IAEpF,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,CAAO,UAAU,EAAE,IAAI,EAAE,EAAE;QAClE,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC;QAE7B,MAAM,eAAe,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACjE,MAAM,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAEnE,IAAI,YAAY;YAAE,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAE5C,OAAO,GAAG,CAAC;IACb,CAAC,CAAA,EAAE,OAAO,CAAC,OAAO,CAAC,EAAe,CAA
C,CAAC,CAAC;IAErC,wCAAwC;IACxC,GAAG,CAAC,IAAI,CAAC,WAAW,aAAa,CAAC,MAAM,oBAAoB,CAAC,CAAC;IAC9D,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;IAC3D,MAAM,QAAQ,CAAC,WAAW,CAAC,aAAsB,EAAE,YAAY,CAAC,CAAC;IACjE,GAAG,CAAC,IAAI,CAAC,gBAAgB,aAAa,CAAC,MAAM,oBAAoB,CAAC,CAAC;IAEnE,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AAxCW,QAAA,YAAY,gBAwCvB","sourcesContent":["import { Log, RequestQueueOperationOptions } from 'crawlee';\n\nimport type { CrawlerUrl } from '../../types';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\nimport { requestQueueSizeMonitor } from './requestQueue';\n\nexport interface PushRequestsOptions<\n T extends Exclude<CrawlerUrl, string> = Exclude<CrawlerUrl, string>\n> {\n io?: CrawleeOneIO<any, any>;\n log?: Log;\n /**\n * If set, only at most this many requests will be added to the RequestQueue.\n *\n * The count is determined from the RequestQueue that's used for the crawler run.\n *\n * This means that if `maxCount` is set to 50, but the\n * associated RequestQueue already handled 40 requests, then only 10 new requests\n * will be processed.\n */\n maxCount?: number;\n /**\n * Option to freely transform a request before pushing it to the RequestQueue.\n *\n * This serves mainly to allow users to transform the requests from actor input UI.\n */\n transform?: (req: T) => any;\n /**\n * Option to filter a request before pushing it to the RequestQueue.\n *\n * This serves mainly to allow users to filter the requests from actor input UI.\n */\n filter?: (req: T) => any;\n /** ID of the RequestQueue to which the data should be pushed */\n requestQueueId?: string;\n\n // Pass-through options\n queueOptions?: RequestQueueOperationOptions;\n}\n\nconst shortenToSize = async <T>(\n entries: T[],\n maxCount: number,\n options?: { io?: CrawleeOneIO; requestQueueId?: string; log?: Log }\n) => {\n const { requestQueueId, log } = options ?? {};\n\n const queueName = requestQueueId ? 
`\"${requestQueueId}\"` : 'DEFAULT';\n\n const sizeMonitor = requestQueueSizeMonitor(maxCount, options);\n\n // Ignore incoming entries if the queue is already full\n const isFull = await sizeMonitor.isFull();\n if (isFull) {\n log?.warning(`RequestQueue (${queueName}) is already full (${maxCount} entries), ${entries.length} entries will be discarded.`);\n return [];\n } // prettier-ignore\n\n // Show warning when only part of the incoming requests made it into the queue\n const slicedEntries = await sizeMonitor.shortenToSize(entries);\n if (slicedEntries.length !== entries.length) {\n log?.warning(`RequestQueue (${queueName}) has become full (${maxCount} entries), ${entries.length} entries will be discarded.`);\n return [];\n } // prettier-ignore\n\n return slicedEntries;\n};\n\n/**\n * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:\n *\n * - Data can be sent elsewhere, not just to Apify. This is set by the `io` options. By default data is sent using Apify (cloud/local).\n * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.\n * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.\n */\nexport const pushRequests = async <T extends Exclude<CrawlerUrl, string>>(\n oneOrManyItems: T | T[],\n options?: PushRequestsOptions<T>\n) => {\n const {\n io = apifyIO as CrawleeOneIO,\n log = new Log(),\n maxCount,\n transform,\n filter,\n requestQueueId,\n queueOptions,\n } = options ?? {};\n\n const manyItems = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems];\n const items =\n maxCount != null\n ? await shortenToSize(manyItems, maxCount, { io, requestQueueId, log })\n : manyItems;\n\n log.debug(`Preparing to push ${items.length} requests to queue`); // prettier-ignore\n\n const adjustedItems = await items.reduce(async (aggPromise, item) => {\n const agg = await aggPromise;\n\n const transformedItem = transform ? 
await transform(item) : item;\n const passedFilter = filter ? await filter(transformedItem) : true;\n\n if (passedFilter) agg.push(transformedItem);\n\n return agg;\n }, Promise.resolve([] as unknown[]));\n\n // Push requests to primary RequestQueue\n log.info(`Pushing ${adjustedItems.length} requests to queue`);\n const reqQueue = await io.openRequestQueue(requestQueueId);\n await reqQueue.addRequests(adjustedItems as any[], queueOptions);\n log.info(`Done pushing ${adjustedItems.length} requests to queue`);\n\n return adjustedItems;\n};\n"]}
|
package/dist/cjs/lib/log.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { type CrawlingContext, LogLevel as CrawleeLogLevel } from 'crawlee';
|
|
2
2
|
import type { ArrVal } from '../utils/types';
|
|
3
|
-
import type { CrawlerRouterWrapper } from './router';
|
|
3
|
+
import type { CrawlerRouterWrapper } from './router/types';
|
|
4
4
|
export declare const LOG_LEVEL: readonly ["debug", "info", "warn", "error", "off"];
|
|
5
5
|
export type LogLevel = ArrVal<typeof LOG_LEVEL>;
|
|
6
6
|
/** Map log levels of `crawlee-one` to log levels of `crawlee` */
|
package/dist/cjs/lib/log.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"log.js","sourceRoot":"","sources":["../../../src/lib/log.ts"],"names":[],"mappings":";;;AAAA,qCAA4E;AAK/D,QAAA,SAAS,GAAG,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,CAAU,CAAC,CAAC,kBAAkB;AAG/F,iEAAiE;AACpD,QAAA,iBAAiB,GAAsC;IAClE,GAAG,EAAE,kBAAe,CAAC,GAAG;IACxB,KAAK,EAAE,kBAAe,CAAC,KAAK;IAC5B,IAAI,EAAE,kBAAe,CAAC,IAAI;IAC1B,IAAI,EAAE,kBAAe,CAAC,OAAO;IAC7B,KAAK,EAAE,kBAAe,CAAC,KAAK;CAC7B,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACI,MAAM,sBAAsB,GAAG,CAIpC,QAAkB,EACkB,EAAE;IACtC,OAAO,CAAC,OAAO,EAAE,EAAE;QACjB,OAAO,CAAC,GAAG,EAAE,GAAG,IAAI,EAAE,EAAE;YACtB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,wBAAwB,QAAQ,EAAE,CAAC,CAAC;YACjD,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,yBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC9C,OAAO,OAAO,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC;QAC/B,CAAC,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC,CAAC;AAbW,QAAA,sBAAsB,0BAajC","sourcesContent":["import { type CrawlingContext, LogLevel as CrawleeLogLevel } from 'crawlee';\n\nimport type { ArrVal } from '../utils/types';\nimport type { CrawlerRouterWrapper } from './router';\n\nexport const LOG_LEVEL = ['debug', 'info', 'warn', 'error', 'off'] as const; // prettier-ignore\nexport type LogLevel = ArrVal<typeof LOG_LEVEL>;\n\n/** Map log levels of `crawlee-one` to log levels of `crawlee` */\nexport const logLevelToCrawlee: Record<LogLevel, CrawleeLogLevel> = {\n off: CrawleeLogLevel.OFF,\n debug: CrawleeLogLevel.DEBUG,\n info: CrawleeLogLevel.INFO,\n warn: CrawleeLogLevel.WARNING,\n error: CrawleeLogLevel.ERROR,\n};\n\n/**\n * Wrapper for Crawlee route handler that configures log level.\n *\n *\n * Usage with Crawlee's `RouterHandler.addDefaultHandler`\n * ```ts\n * const wrappedHandler = logLevelHandlerWrapper('debug')(handler)\n * await router.addDefaultHandler<Ctx>(wrappedHandler);\n * ```\n *\n * Usage with Crawlee's `RouterHandler.addHandler`\n * ```ts\n * const wrappedHandler = logLevelHandlerWrapper('error')(handler)\n * await router.addHandler<Ctx>(wrappedHandler);\n * ```\n *\n * Usage with `createCrawleeOne`\n * 
```ts\n * const actor = await createCrawleeOne<CheerioCrawlingContext>({\n * validateInput,\n * router: createCheerioRouter(),\n * routes,\n * routeHandlers: ({ input }) => createHandlers(input!),\n * routerWrappers: ({ input }) => [\n * logLevelHandlerWrapper<CheerioCrawlingContext<any, any>>(input?.logLevel ?? 'info'),\n * ],\n * createCrawler: ({ router, input }) => createCrawler({ router, input, crawlerConfig }),\n * });\n * ```\n */\nexport const logLevelHandlerWrapper = <\n T extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>\n>(\n logLevel: LogLevel\n): CrawlerRouterWrapper<T, RouterCtx> => {\n return (handler) => {\n return (ctx, ...args) => {\n ctx.log.info(`Setting log level to ${logLevel}`);\n ctx.log.setLevel(logLevelToCrawlee[logLevel]);\n return handler(ctx, ...args);\n };\n };\n};\n"]}
|
|
1
|
+
{"version":3,"file":"log.js","sourceRoot":"","sources":["../../../src/lib/log.ts"],"names":[],"mappings":";;;AAAA,qCAA4E;AAK/D,QAAA,SAAS,GAAG,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,CAAU,CAAC,CAAC,kBAAkB;AAG/F,iEAAiE;AACpD,QAAA,iBAAiB,GAAsC;IAClE,GAAG,EAAE,kBAAe,CAAC,GAAG;IACxB,KAAK,EAAE,kBAAe,CAAC,KAAK;IAC5B,IAAI,EAAE,kBAAe,CAAC,IAAI;IAC1B,IAAI,EAAE,kBAAe,CAAC,OAAO;IAC7B,KAAK,EAAE,kBAAe,CAAC,KAAK;CAC7B,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACI,MAAM,sBAAsB,GAAG,CAIpC,QAAkB,EACkB,EAAE;IACtC,OAAO,CAAC,OAAO,EAAE,EAAE;QACjB,OAAO,CAAC,GAAG,EAAE,GAAG,IAAI,EAAE,EAAE;YACtB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,wBAAwB,QAAQ,EAAE,CAAC,CAAC;YACjD,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,yBAAiB,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC9C,OAAO,OAAO,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC;QAC/B,CAAC,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC,CAAC;AAbW,QAAA,sBAAsB,0BAajC","sourcesContent":["import { type CrawlingContext, LogLevel as CrawleeLogLevel } from 'crawlee';\n\nimport type { ArrVal } from '../utils/types';\nimport type { CrawlerRouterWrapper } from './router/types';\n\nexport const LOG_LEVEL = ['debug', 'info', 'warn', 'error', 'off'] as const; // prettier-ignore\nexport type LogLevel = ArrVal<typeof LOG_LEVEL>;\n\n/** Map log levels of `crawlee-one` to log levels of `crawlee` */\nexport const logLevelToCrawlee: Record<LogLevel, CrawleeLogLevel> = {\n off: CrawleeLogLevel.OFF,\n debug: CrawleeLogLevel.DEBUG,\n info: CrawleeLogLevel.INFO,\n warn: CrawleeLogLevel.WARNING,\n error: CrawleeLogLevel.ERROR,\n};\n\n/**\n * Wrapper for Crawlee route handler that configures log level.\n *\n *\n * Usage with Crawlee's `RouterHandler.addDefaultHandler`\n * ```ts\n * const wrappedHandler = logLevelHandlerWrapper('debug')(handler)\n * await router.addDefaultHandler<Ctx>(wrappedHandler);\n * ```\n *\n * Usage with Crawlee's `RouterHandler.addHandler`\n * ```ts\n * const wrappedHandler = logLevelHandlerWrapper('error')(handler)\n * await router.addHandler<Ctx>(wrappedHandler);\n * ```\n *\n * Usage with 
`createCrawleeOne`\n * ```ts\n * const actor = await createCrawleeOne<CheerioCrawlingContext>({\n * validateInput,\n * router: createCheerioRouter(),\n * routes,\n * routeHandlers: ({ input }) => createHandlers(input!),\n * routerWrappers: ({ input }) => [\n * logLevelHandlerWrapper<CheerioCrawlingContext<any, any>>(input?.logLevel ?? 'info'),\n * ],\n * createCrawler: ({ router, input }) => createCrawler({ router, input, crawlerConfig }),\n * });\n * ```\n */\nexport const logLevelHandlerWrapper = <\n T extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>\n>(\n logLevel: LogLevel\n): CrawlerRouterWrapper<T, RouterCtx> => {\n return (handler) => {\n return (ctx, ...args) => {\n ctx.log.info(`Setting log level to ${logLevel}`);\n ctx.log.setLevel(logLevelToCrawlee[logLevel]);\n return handler(ctx, ...args);\n };\n };\n};\n"]}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import type { CrawlingContext, RouterHandler as CrawlerRouter } from 'crawlee';
|
|
2
|
+
import type { CrawleeOneIO } from '../integrations/types';
|
|
3
|
+
import type { CrawlerRouterWrapper, RouteHandler, RouteMatcher } from './types';
|
|
4
|
+
export declare const registerHandlers: <CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>({ router, routerWrappers, routerContext, routeHandlers, }: {
|
|
5
|
+
router: CrawlerRouter<CrawlerCtx>;
|
|
6
|
+
routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[] | undefined;
|
|
7
|
+
routerContext?: RouterCtx | undefined;
|
|
8
|
+
routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;
|
|
9
|
+
}) => Promise<void>;
|
|
10
|
+
/**
|
|
11
|
+
* Configures the default router handler to redirect URLs to labelled route handlers
|
|
12
|
+
* based on which route the URL matches first.
|
|
13
|
+
*
|
|
14
|
+
* NOTE: This does mean that the URLs passed to this default handler will be fetched
|
|
15
|
+
* twice (as the URL will be requeued to the correct handler). We recommend to use this
|
|
16
|
+
* function only in the scenarios where there is a small number of startUrls, yet these
|
|
17
|
+
* may need various ways of processing based on different paths or etc.
|
|
18
|
+
*
|
|
19
|
+
* @example
|
|
20
|
+
*
|
|
21
|
+
* const routeLabels = {
|
|
22
|
+
* MAIN_PAGE: 'MAIN_PAGE',
|
|
23
|
+
* JOB_LISTING: 'JOB_LISTING',
|
|
24
|
+
* JOB_DETAIL: 'JOB_DETAIL',
|
|
25
|
+
* JOB_RELATED_LIST: 'JOB_RELATED_LIST',
|
|
26
|
+
* PARTNERS: 'PARTNERS',
|
|
27
|
+
* } as const;
|
|
28
|
+
*
|
|
29
|
+
* const router = createPlaywrightRouter();
|
|
30
|
+
*
|
|
31
|
+
* const routes = createPlaywrightRouteMatchers<typeof routeLabels>([
|
|
32
|
+
* // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)
|
|
33
|
+
* {
|
|
34
|
+
* route: routeLabels.MAIN_PAGE,
|
|
35
|
+
* // Check for main page like https://www.profesia.sk/?#
|
|
36
|
+
* match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
|
|
37
|
+
* },
|
|
38
|
+
*
|
|
39
|
+
* // Optionally override the logic that assigns the URL to the route by specifying the `action` prop
|
|
40
|
+
* {
|
|
41
|
+
* route: routeLabels.MAIN_PAGE,
|
|
42
|
+
* // Check for main page like https://www.profesia.sk/?#
|
|
43
|
+
* match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
|
|
44
|
+
* action: async (ctx) => {
|
|
45
|
+
* await ctx.crawler.addRequests([{
|
|
46
|
+
* url: 'https://profesia.sk/praca',
|
|
47
|
+
* label: routeLabels.JOB_LISTING,
|
|
48
|
+
* }]);
|
|
49
|
+
* },
|
|
50
|
+
* },
|
|
51
|
+
* ]);
|
|
52
|
+
*
|
|
53
|
+
* // Set up default route to redirect to labelled routes
|
|
54
|
+
* setupDefaultRoute({ router, routes });
|
|
55
|
+
*
|
|
56
|
+
* // Now set up the labelled routes
|
|
57
|
+
* await router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }
|
|
58
|
+
*/
|
|
59
|
+
export declare const setupDefaultRoute: <CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>>({ io, router, routerWrappers, routerContext, routes, routeHandlers, input, }: {
|
|
60
|
+
io: CrawleeOneIO;
|
|
61
|
+
router: CrawlerRouter<CrawlerCtx>;
|
|
62
|
+
routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[] | undefined;
|
|
63
|
+
routerContext?: RouterCtx | undefined;
|
|
64
|
+
routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
65
|
+
routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;
|
|
66
|
+
input?: Input | null | undefined;
|
|
67
|
+
}) => Promise<void>;
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.setupDefaultRoute = exports.registerHandlers = void 0;
|
|
13
|
+
const async_1 = require("../../utils/async");
|
|
14
|
+
// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks
|
|
15
|
+
const registerHandlers = ({ router, routerWrappers, routerContext, routeHandlers, }) => __awaiter(void 0, void 0, void 0, function* () {
|
|
16
|
+
yield (0, async_1.serialAsyncMap)(Object.entries(routeHandlers), ([key, handler]) => __awaiter(void 0, void 0, void 0, function* () {
|
|
17
|
+
const wrappedHandler = (routerWrappers !== null && routerWrappers !== void 0 ? routerWrappers : []).reduceRight((fn, wrapper) => wrapper((ctx) => fn(ctx)), handler);
|
|
18
|
+
yield router.addHandler(key, (ctx) => __awaiter(void 0, void 0, void 0, function* () { return wrappedHandler(Object.assign(Object.assign({}, routerContext), ctx)); }));
|
|
19
|
+
}));
|
|
20
|
+
});
|
|
21
|
+
exports.registerHandlers = registerHandlers;
|
|
22
|
+
const createDefaultHandler = (input) => {
|
|
23
|
+
const { io, routes, routeHandlers, requestQueueId, perfBatchSize, perfBatchWaitSecs } = input;
|
|
24
|
+
// NOTE: Because we "clear" the queue by replacing it,
|
|
25
|
+
// we need to always call `openRequestQueue` to ensure we use the latest instance
|
|
26
|
+
const openQueue = () => io.openRequestQueue(requestQueueId);
|
|
27
|
+
const closeRequest = (req) => __awaiter(void 0, void 0, void 0, function* () {
|
|
28
|
+
if (!req)
|
|
29
|
+
return;
|
|
30
|
+
const reqQueue = yield openQueue();
|
|
31
|
+
yield reqQueue.markRequestHandled(req);
|
|
32
|
+
});
|
|
33
|
+
const loadNextRequest = (suffix, options) => __awaiter(void 0, void 0, void 0, function* () {
|
|
34
|
+
var _a;
|
|
35
|
+
const { page, log } = options !== null && options !== void 0 ? options : {};
|
|
36
|
+
log === null || log === void 0 ? void 0 : log.debug(`Checking for new Request in the queue. ${suffix}`);
|
|
37
|
+
if (perfBatchWaitSecs)
|
|
38
|
+
yield (0, async_1.wait)(perfBatchWaitSecs);
|
|
39
|
+
const reqQueue = yield openQueue();
|
|
40
|
+
const newReq = (_a = (yield reqQueue.fetchNextRequest())) !== null && _a !== void 0 ? _a : null;
|
|
41
|
+
if (newReq) {
|
|
42
|
+
log === null || log === void 0 ? void 0 : log.debug(`Found new Request in the queue. ${suffix}`);
|
|
43
|
+
// WARNING - For each subsequent Request, it must be loaded manually
|
|
44
|
+
// Hence, batching is suitable only for browser-based Crawlers
|
|
45
|
+
// like Playwright or Puppeteer.
|
|
46
|
+
if (page === null || page === void 0 ? void 0 : page.goto)
|
|
47
|
+
yield page.goto(newReq.url);
|
|
48
|
+
}
|
|
49
|
+
else {
|
|
50
|
+
log === null || log === void 0 ? void 0 : log.debug(`No more Requests in the queue. ${suffix}`);
|
|
51
|
+
}
|
|
52
|
+
return newReq;
|
|
53
|
+
});
|
|
54
|
+
const onError = (err, req, log) => __awaiter(void 0, void 0, void 0, function* () {
|
|
55
|
+
log.error(`Failed to process a request, returning it to the queue. URL: ${(req === null || req === void 0 ? void 0 : req.loadedUrl) || (req === null || req === void 0 ? void 0 : req.url)}.`); // prettier-ignore
|
|
56
|
+
log.error(err);
|
|
57
|
+
// Reinsert the request into the queue if we failed to process it due to an error
|
|
58
|
+
if (req) {
|
|
59
|
+
const reqQueue = yield openQueue();
|
|
60
|
+
yield reqQueue.reclaimRequest(req, { forefront: true });
|
|
61
|
+
}
|
|
62
|
+
});
|
|
63
|
+
/** Redirect the URL to the labelled route identical to route's name */
|
|
64
|
+
// prettier-ignore
|
|
65
|
+
const defaultAction = (url, ctx, route) => __awaiter(void 0, void 0, void 0, function* () {
|
|
66
|
+
const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];
|
|
67
|
+
if (!handler) {
|
|
68
|
+
ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore
|
|
69
|
+
return;
|
|
70
|
+
}
|
|
71
|
+
ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);
|
|
72
|
+
yield handler(ctx);
|
|
73
|
+
});
|
|
74
|
+
const defaultHandler = (ctx) => __awaiter(void 0, void 0, void 0, function* () {
|
|
75
|
+
var _b;
|
|
76
|
+
const { page, log: parentLog } = ctx;
|
|
77
|
+
const log = parentLog.child({ prefix: '[Router] ' });
|
|
78
|
+
let handledRequestsCount = 0;
|
|
79
|
+
let req = (_b = ctx.request) !== null && _b !== void 0 ? _b : null;
|
|
80
|
+
const hasBatchReqs = () => perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;
|
|
81
|
+
const getUrl = () => (page ? page.url() : req.loadedUrl || req.url);
|
|
82
|
+
const onRequest = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
83
|
+
var _c;
|
|
84
|
+
const url = yield getUrl();
|
|
85
|
+
const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize !== null && perfBatchSize !== void 0 ? perfBatchSize : 1}. URL: ${url}`;
|
|
86
|
+
// Find route handler for given URL
|
|
87
|
+
log.debug(`Searching for a handler for given Request. ${logSuffix}`);
|
|
88
|
+
const route = yield (0, async_1.serialAsyncFind)(routes, (currRoute) => __awaiter(void 0, void 0, void 0, function* () {
|
|
89
|
+
const isMatch = yield currRoute.match(url, ctx, currRoute, routeHandlers);
|
|
90
|
+
return isMatch;
|
|
91
|
+
}));
|
|
92
|
+
// Run the handler
|
|
93
|
+
if (route) {
|
|
94
|
+
log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore
|
|
95
|
+
const action = (_c = route.action) !== null && _c !== void 0 ? _c : defaultAction;
|
|
96
|
+
yield action(url, ctx, route, routeHandlers);
|
|
97
|
+
}
|
|
98
|
+
else {
|
|
99
|
+
log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);
|
|
100
|
+
}
|
|
101
|
+
// Clean up and move onto another request
|
|
102
|
+
yield closeRequest(req);
|
|
103
|
+
handledRequestsCount++;
|
|
104
|
+
req = yield loadNextRequest(logSuffix, { page: page, log });
|
|
105
|
+
});
|
|
106
|
+
try {
|
|
107
|
+
do {
|
|
108
|
+
yield onRequest();
|
|
109
|
+
} while (hasBatchReqs());
|
|
110
|
+
}
|
|
111
|
+
catch (err) {
|
|
112
|
+
yield onError(err, req, log);
|
|
113
|
+
}
|
|
114
|
+
});
|
|
115
|
+
return defaultHandler;
|
|
116
|
+
};
|
|
117
|
+
/**
|
|
118
|
+
* Configures the default router handler to redirect URLs to labelled route handlers
|
|
119
|
+
* based on which route the URL matches first.
|
|
120
|
+
*
|
|
121
|
+
* NOTE: This does mean that the URLs passed to this default handler will be fetched
|
|
122
|
+
* twice (as the URL will be requeued to the correct handler). We recommend to use this
|
|
123
|
+
* function only in the scenarios where there is a small number of startUrls, yet these
|
|
124
|
+
* may need various ways of processing based on different paths or etc.
|
|
125
|
+
*
|
|
126
|
+
* @example
|
|
127
|
+
*
|
|
128
|
+
* const routeLabels = {
|
|
129
|
+
* MAIN_PAGE: 'MAIN_PAGE',
|
|
130
|
+
* JOB_LISTING: 'JOB_LISTING',
|
|
131
|
+
* JOB_DETAIL: 'JOB_DETAIL',
|
|
132
|
+
* JOB_RELATED_LIST: 'JOB_RELATED_LIST',
|
|
133
|
+
* PARTNERS: 'PARTNERS',
|
|
134
|
+
* } as const;
|
|
135
|
+
*
|
|
136
|
+
* const router = createPlaywrightRouter();
|
|
137
|
+
*
|
|
138
|
+
* const routes = createPlaywrightRouteMatchers<typeof routeLabels>([
|
|
139
|
+
* // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)
|
|
140
|
+
* {
|
|
141
|
+
* route: routeLabels.MAIN_PAGE,
|
|
142
|
+
* // Check for main page like https://www.profesia.sk/?#
|
|
143
|
+
* match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
|
|
144
|
+
* },
|
|
145
|
+
*
|
|
146
|
+
* // Optionally override the logic that assigns the URL to the route by specifying the `action` prop
|
|
147
|
+
* {
|
|
148
|
+
* route: routeLabels.MAIN_PAGE,
|
|
149
|
+
* // Check for main page like https://www.profesia.sk/?#
|
|
150
|
+
* match: (url) => url.match(/[\W]profesia\.sk\/?(?:[?#~]|$)/i),
|
|
151
|
+
* action: async (ctx) => {
|
|
152
|
+
* await ctx.crawler.addRequests([{
|
|
153
|
+
* url: 'https://profesia.sk/praca',
|
|
154
|
+
* label: routeLabels.JOB_LISTING,
|
|
155
|
+
* }]);
|
|
156
|
+
* },
|
|
157
|
+
* },
|
|
158
|
+
* ]);
|
|
159
|
+
*
|
|
160
|
+
* // Set up default route to redirect to labelled routes
|
|
161
|
+
* setupDefaultRoute({ router, routes });
|
|
162
|
+
*
|
|
163
|
+
* // Now set up the labelled routes
|
|
164
|
+
* await router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }
|
|
165
|
+
*/
|
|
166
|
+
const setupDefaultRoute = ({ io, router, routerWrappers, routerContext, routes, routeHandlers, input, }) => __awaiter(void 0, void 0, void 0, function* () {
|
|
167
|
+
const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {});
|
|
168
|
+
const defaultHandler = createDefaultHandler({
|
|
169
|
+
io,
|
|
170
|
+
routes,
|
|
171
|
+
routeHandlers,
|
|
172
|
+
requestQueueId,
|
|
173
|
+
perfBatchSize,
|
|
174
|
+
perfBatchWaitSecs,
|
|
175
|
+
});
|
|
176
|
+
const wrappedHandler = (routerWrappers !== null && routerWrappers !== void 0 ? routerWrappers : []).reduceRight((fn, wrapper) => wrapper(fn), defaultHandler);
|
|
177
|
+
yield router.addDefaultHandler((ctx) => wrappedHandler(Object.assign(Object.assign({}, routerContext), ctx)));
|
|
178
|
+
});
|
|
179
|
+
exports.setupDefaultRoute = setupDefaultRoute;
|
|
180
|
+
//# sourceMappingURL=router.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../../../src/lib/router/router.ts"],"names":[],"mappings":";;;;;;;;;;;;AASA,6CAA0E;AAK1E,wHAAwH;AAEjH,MAAM,gBAAgB,GAAG,CAI9B,EACA,MAAM,EACN,cAAc,EACd,aAAa,EACb,aAAa,GAMd,EAAE,EAAE;IACH,MAAM,IAAA,sBAAc,EAAC,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE,CAAO,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,EAAE;QAC3E,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,EAC1C,OAA2E,CAC5E,CAAC;QACF,MAAM,MAAM,CAAC,UAAU,CAAa,GAAG,EAAE,CAAO,GAAG,EAAE,EAAE,kDACrD,OAAA,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CAAA,GAAA,CACpD,CAAC;IACJ,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAA,CAAC;AAxBW,QAAA,gBAAgB,oBAwB3B;AAEF,MAAM,oBAAoB,GAAG,CAK3B,KAK2C,EAC3C,EAAE;IACF,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,aAAa,EAAE,cAAc,EAAE,aAAa,EAAE,iBAAiB,EAAE,GAAG,KAAK,CAAC;IAE9F,sDAAsD;IACtD,iFAAiF;IACjF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;IAE5D,MAAM,YAAY,GAAG,CAAO,GAA0B,EAAE,EAAE;QACxD,IAAI,CAAC,GAAG;YAAE,OAAO;QACjB,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;QACnC,MAAM,QAAQ,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;IACzC,CAAC,CAAA,CAAC;IAEF,MAAM,eAAe,GAAG,CAAO,MAAc,EAAE,OAAoC,EAAE,EAAE;;QACrF,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;QAEpC,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,0CAA0C,MAAM,EAAE,CAAC,CAAC;QAE/D,IAAI,iBAAiB;YAAE,MAAM,IAAA,YAAI,EAAC,iBAAiB,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAA,CAAC,MAAM,QAAQ,CAAC,gBAAgB,EAAE,CAAC,mCAAI,IAAI,CAAC;QAE3D,IAAI,MAAM,EAAE;YACV,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,mCAAmC,MAAM,EAAE,CAAC,CAAC;YAExD,oEAAoE;YACpE,wEAAwE;YACxE,0CAA0C;YAC1C,IAAI,IAAI,aAAJ,IAAI,uBAAJ,IAAI,CAAE,IAAI;gBAAE,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;SAC7C;aAAM;YACL,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,KAAK,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;SACxD;QACD,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,MAAM,OAAO,GAAG,CAAO,GAAQ,EAAE,GAA0B,EAAE,GAAQ,EAAE,EAAE;QACvE,GAAG,CAAC,K
AAK,CAAC,gEAAgE,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,GAAG,CAAC,CAAC,CAAC,kBAAkB;QAC5H,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QACf,iFAAiF;QACjF,IAAI,GAAG,EAAE;YACP,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;YACnC,MAAM,QAAQ,CAAC,cAAc,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;SACzD;IACH,CAAC,CAAA,CAAC;IAEF,uEAAuE;IACvE,kBAAkB;IAClB,MAAM,aAAa,GAA0D,CAAO,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE;QACrG,MAAM,OAAO,GAAG,KAAK,CAAC,YAAY,IAAI,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAChF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,8BAA8B,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,YAAY,sCAAsC,GAAG,EAAE,CAAC,CAAC,CAAC,kBAAkB;YAC7I,OAAO;SACR;QACD,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,0BAA0B,KAAK,CAAC,YAAY,UAAU,GAAG,EAAE,CAAC,CAAC;QAC1E,MAAM,OAAO,CAAC,GAAU,CAAC,CAAC;IAC5B,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CACrB,GAAM,EACS,EAAE;;QACjB,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,GAAG,CAAC;QACrC,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAErD,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,IAAI,GAAG,GAA0B,MAAA,GAAG,CAAC,OAAO,mCAAI,IAAI,CAAC;QAErD,MAAM,YAAY,GAAG,GAAG,EAAE,CACxB,aAAa,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,oBAAoB,GAAG,aAAa,CAAC;QAE/E,MAAM,MAAM,GAAG,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAE,IAA0B,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,GAAI,CAAC,SAAS,IAAI,GAAI,CAAC,GAAG,CAAC,CAAC;QAE7F,MAAM,SAAS,GAAG,GAAS,EAAE;;YAC3B,MAAM,GAAG,GAAG,MAAM,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,SAAS,oBAAoB,GAAG,CAAC,OAAO,aAAa,aAAb,aAAa,cAAb,aAAa,GAAI,CAAC,UAAU,GAAG,EAAE,CAAC;YAE5F,mCAAmC;YACnC,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,MAAM,IAAA,uBAAe,EAAC,MAAM,EAAE,CAAO,SAAS,EAAE,EAAE;gBAC9D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;gBAC1E,OAAO,OAAO,CAAC;YACjB,CAAC,CAAA,CAAC,CAAC;YAEH,kBAAkB;YAClB,IAAI,KAAK,EAAE;gBACT,GAAG,CAAC,IAAI,CAAC,qBAAqB,KAAK,CAAC,IAAI,mBAAmB,KAAK,CAAC,YAAY,MAAM,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBACnH,MAAM,MAAM,GAAG,MAAA,KAAK,CAAC,MAAM,mCAAI,aAAa,CAAC;gBAC7C,MAAM,MAAM,CAAC
,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;aAC9C;iBAAM;gBACL,GAAG,CAAC,KAAK,CAAC,oDAAoD,SAAS,EAAE,CAAC,CAAC;aAC5E;YAED,yCAAyC;YACzC,MAAM,YAAY,CAAC,GAAG,CAAC,CAAC;YACxB,oBAAoB,EAAE,CAAC;YAEvB,GAAG,GAAG,MAAM,eAAe,CAAC,SAAS,EAAE,EAAE,IAAI,EAAE,IAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QACtE,CAAC,CAAA,CAAC;QAEF,IAAI;YACF,GAAG;gBACD,MAAM,SAAS,EAAE,CAAC;aACnB,QAAQ,YAAY,EAAE,EAAE;SAC1B;QAAC,OAAO,GAAG,EAAE;YACZ,MAAM,OAAO,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;SAC9B;IACH,CAAC,CAAA,CAAC;IAEF,OAAO,cAAc,CAAC;AACxB,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACI,MAAM,iBAAiB,GAAG,CAK/B,EACA,EAAE,EACF,MAAM,EACN,cAAc,EACd,aAAa,EACb,MAAM,EACN,aAAa,EACb,KAAK,GASN,EAAE,EAAE;IACH,MAAM,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CACtD,CAAC;IAEpB,MAAM,cAAc,GAAG,oBAAoB,CAAC;QAC1C,EAAE;QACF,MAAM;QACN,aAAa;QACb,cAAc;QACd,aAAa;QACb,iBAAiB;KAClB,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC,EAC5B,cAAc,CACf,CAAC;IACF,MAAM,MAAM,CAAC,iBAAiB,CAAa,CAAC,GAAG,EAAE,EAAE,CACjD,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CACpD,CAAC;AACJ,CAAC,CAAA,CAAC;AAzCW,QAAA,iBAAiB,qBAyC5B","sourcesContent":["import type {\n CrawlingContext,\n RouterHandler as CrawlerRouter,\n Request as CrawlerRequest,\n Log,\n} from 'crawlee';\nimport type { CommonPage } from '@crawlee/browser-pool';\nimport type { Page } from 'playwright';\n\nimport { serialAsyncFind, serialAsyncMap, wait } from '../../utils/async';\nimport type { PerfActorInput, RequestActorInput } from '../config';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport type { CrawlerRouterWrapper, RouteHandler, RouteMatcher, RouterHandlerCtx } from './types';\n\n// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks\n\nexport const registerHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, 
any>,\n Labels extends string = string\n>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n}: {\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n}) => {\n await serialAsyncMap(Object.entries(routeHandlers), async ([key, handler]) => {\n const wrappedHandler = (routerWrappers ?? []).reduceRight(\n (fn, wrapper) => wrapper((ctx) => fn(ctx)),\n handler as (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n );\n await router.addHandler<CrawlerCtx>(key, async (ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n });\n};\n\nconst createDefaultHandler = <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(\n input: {\n io: CrawleeOneIO;\n routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n } & PerfActorInput &\n Pick<RequestActorInput, 'requestQueueId'>\n) => {\n const { io, routes, routeHandlers, requestQueueId, perfBatchSize, perfBatchWaitSecs } = input;\n\n // NOTE: Because we \"clear\" the queue by replacing it,\n // we need to always call `openRequestQueue` to ensure we use the latest instance\n const openQueue = () => io.openRequestQueue(requestQueueId);\n\n const closeRequest = async (req: CrawlerRequest | null) => {\n if (!req) return;\n const reqQueue = await openQueue();\n await reqQueue.markRequestHandled(req);\n };\n\n const loadNextRequest = async (suffix: string, options?: { page?: Page; log?: Log }) => {\n const { page, log } = options ?? {};\n\n log?.debug(`Checking for new Request in the queue. ${suffix}`);\n\n if (perfBatchWaitSecs) await wait(perfBatchWaitSecs);\n const reqQueue = await openQueue();\n const newReq = (await reqQueue.fetchNextRequest()) ?? 
null;\n\n if (newReq) {\n log?.debug(`Found new Request in the queue. ${suffix}`);\n\n // WARNING - For each subsequent Request, it must be loaded manually\n // Hence, batching is suitable only for browser-based Crawlers\n // like Playwright or Puppeteer.\n if (page?.goto) await page.goto(newReq.url);\n } else {\n log?.debug(`No more Requests in the queue. ${suffix}`);\n }\n return newReq;\n };\n\n const onError = async (err: any, req: CrawlerRequest | null, log: Log) => {\n log.error(`Failed to process a request, returning it to the queue. URL: ${req?.loadedUrl || req?.url}.`); // prettier-ignore\n log.error(err);\n // Reinsert the request into the queue if we failed to process it due to an error\n if (req) {\n const reqQueue = await openQueue();\n await reqQueue.reclaimRequest(req, { forefront: true });\n }\n };\n\n /** Redirect the URL to the labelled route identical to route's name */\n // prettier-ignore\n const defaultAction: RouteMatcher<CrawlerCtx, RouterCtx, Labels>['action'] = async (url, ctx, route) => {\n const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];\n if (!handler) {\n ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore\n return;\n }\n ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);\n await handler(ctx as any);\n };\n\n const defaultHandler = async <T extends RouterHandlerCtx<CrawlerCtx & RouterCtx>>(\n ctx: T\n ): Promise<void> => {\n const { page, log: parentLog } = ctx;\n const log = parentLog.child({ prefix: '[Router] ' });\n\n let handledRequestsCount = 0;\n let req: CrawlerRequest | null = ctx.request ?? null;\n\n const hasBatchReqs = () =>\n perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;\n\n const getUrl = () => (page ? 
(page as any as CommonPage).url() : req!.loadedUrl || req!.url);\n\n const onRequest = async () => {\n const url = await getUrl();\n const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize ?? 1}. URL: ${url}`;\n\n // Find route handler for given URL\n log.debug(`Searching for a handler for given Request. ${logSuffix}`);\n const route = await serialAsyncFind(routes, async (currRoute) => {\n const isMatch = await currRoute.match(url, ctx, currRoute, routeHandlers);\n return isMatch;\n });\n\n // Run the handler\n if (route) {\n log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore\n const action = route.action ?? defaultAction;\n await action(url, ctx, route, routeHandlers);\n } else {\n log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);\n }\n\n // Clean up and move onto another request\n await closeRequest(req);\n handledRequestsCount++;\n\n req = await loadNextRequest(logSuffix, { page: page as Page, log });\n };\n\n try {\n do {\n await onRequest();\n } while (hasBatchReqs());\n } catch (err) {\n await onError(err, req, log);\n }\n };\n\n return defaultHandler;\n};\n\n/**\n * Configures the default router handler to redirect URLs to labelled route handlers\n * based on which route the URL matches first.\n *\n * NOTE: This does mean that the URLs passed to this default handler will be fetched\n * twice (as the URL will be requeued to the correct handler). 
We recommend to use this\n * function only in the scenarios where there is a small number of startUrls, yet these\n * may need various ways of processing based on different paths or etc.\n *\n * @example\n *\n * const routeLabels = {\n * MAIN_PAGE: 'MAIN_PAGE',\n * JOB_LISTING: 'JOB_LISTING',\n * JOB_DETAIL: 'JOB_DETAIL',\n * JOB_RELATED_LIST: 'JOB_RELATED_LIST',\n * PARTNERS: 'PARTNERS',\n * } as const;\n *\n * const router = createPlaywrightRouter();\n *\n * const routes = createPlaywrightRouteMatchers<typeof routeLabels>([\n * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * },\n *\n * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * action: async (ctx) => {\n * await ctx.crawler.addRequests([{\n * url: 'https://profesia.sk/praca',\n * label: routeLabels.JOB_LISTING,\n * }]);\n * },\n * },\n * ]);\n *\n * // Set up default route to redirect to labelled routes\n * setupDefaultRoute({ router, routes });\n *\n * // Now set up the labelled routes\n * await router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... 
}\n */\nexport const setupDefaultRoute = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>\n>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n}: {\n io: CrawleeOneIO;\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n input?: Input | null;\n}) => {\n const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {}) as PerfActorInput &\n RequestActorInput;\n\n const defaultHandler = createDefaultHandler({\n io,\n routes,\n routeHandlers,\n requestQueueId,\n perfBatchSize,\n perfBatchWaitSecs,\n });\n\n const wrappedHandler = (routerWrappers ?? []).reduceRight(\n (fn, wrapper) => wrapper(fn),\n defaultHandler\n );\n await router.addDefaultHandler<CrawlerCtx>((ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n};\n"]}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import type { BasicCrawler, BasicCrawlingContext, CheerioCrawlingContext, CrawlingContext, HttpCrawlingContext, JSDOMCrawlingContext, PlaywrightCrawlingContext, PuppeteerCrawlingContext, RouterHandler as CrawlerRouter } from 'crawlee';
|
|
2
|
+
import type { MaybePromise } from '../../utils/types';
|
|
3
|
+
/** Context object provided in CrawlerRouter */
|
|
4
|
+
// Resolved structurally: the first argument of the callback that is the 2nd parameter of `addHandler` on `CrawlerRouter<CrawlerCtx>`.
export type RouterHandlerCtx<CrawlerCtx extends CrawlingContext> = Parameters<Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]>[0];
|
|
5
|
+
/** Function that's passed to `router.addHandler(label, handler)` */
|
|
6
|
+
// Resolved structurally: the 2nd parameter of `addHandler` on a router typed with `RouterHandlerCtx<CrawlerCtx & RouterCtx>`.
export type RouteHandler<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>> = Parameters<CrawlerRouter<RouterHandlerCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1];
|
|
7
|
+
/** Wrapper that modifies behavior of RouteHandler */
|
|
8
|
+
// Shape: takes an async handler `(ctx) => Promise<void>` and returns another async handler with the same `ctx` type.
export type CrawlerRouterWrapper<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>> = (handler: (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>) => (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>;
|
|
9
|
+
/**
|
|
10
|
+
* Criteria that un-labelled requests are matched against.
|
|
11
|
+
*
|
|
12
|
+
* E.g. If the `match` function returns a truthy value,
|
|
13
|
+
* the request is passed to the `action` function for processing.
|
|
14
|
+
*/
|
|
15
|
+
export interface RouteMatcher<CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string> {
|
|
16
|
+
/** Human readable name */
|
|
17
|
+
name: string;
|
|
18
|
+
/**
|
|
19
|
+
* Label of the handler registered with `router.addHandler(label, handler)`
|
|
20
|
+
* that will process this request.
|
|
21
|
+
*
|
|
22
|
+
* NOTE: This value is used by the default `action` function. If you override
|
|
23
|
+
* the `action` function, `handlerLabel` is ignored and you have to process it yourself.
|
|
24
|
+
*/
|
|
25
|
+
handlerLabel: Labels | null;
|
|
26
|
+
/**
|
|
27
|
+
* Function that decides whether the request will be processed by this `action` function.
|
|
28
|
+
*
|
|
29
|
+
* @example
|
|
30
|
+
* [{
|
|
31
|
+
* // If match returns true, the request is forwarded to handler
|
|
32
|
+
* // with label JOB_DETAIL.
|
|
33
|
+
* name: 'Job detail',
|
|
34
|
+
* match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),
|
|
35
|
+
* handlerLabel: routeLabels.JOB_DETAIL,
|
|
36
|
+
* }]
|
|
37
|
+
*/
|
|
38
|
+
match: (url: string, ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>, route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>, handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>) => unknown;
|
|
39
|
+
/**
|
|
40
|
+
* Request is passed to this function if `match` returned a truthy value.
|
|
41
|
+
*
|
|
42
|
+
* @example
|
|
43
|
+
* [{
|
|
44
|
+
* // If match returns true, the request is forwarded to handler
|
|
45
|
+
* // with label JOB_DETAIL.
|
|
46
|
+
* name: 'Job detail',
|
|
47
|
+
* match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),
|
|
48
|
+
* handlerLabel: routeLabels.JOB_DETAIL,
|
|
49
|
+
* }]
|
|
50
|
+
*/
|
|
51
|
+
action?: (url: string, ctx: RouterHandlerCtx<CrawlerCtx>, route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>, handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>) => MaybePromise<void>;
|
|
52
|
+
}
|
|
53
|
+
/** Accepts an array of `RouteMatcher<CrawlerCtx, RouterCtx, Labels>` and returns an array of that same type; `CrawlerCtx` may be any `CrawlingContext`. */
export declare const createRouteMatchers: <CrawlerCtx extends CrawlingContext<unknown, import("crawlee").Dictionary> = CrawlingContext<BasicCrawler<BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
54
|
+
/** Same signature shape as `createRouteMatchers`, with `CrawlerCtx` constrained to (and defaulting to) `BasicCrawlingContext`. */
export declare const createBasicRouteMatchers: <CrawlerCtx extends BasicCrawlingContext<import("crawlee").Dictionary> = BasicCrawlingContext<import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
55
|
+
/** Same signature shape as `createRouteMatchers`, with `CrawlerCtx` constrained to (and defaulting to) `HttpCrawlingContext`. */
export declare const createHttpRouteMatchers: <CrawlerCtx extends HttpCrawlingContext<any, any> = HttpCrawlingContext<any, any>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
56
|
+
/** Same signature shape as `createRouteMatchers`, with `CrawlerCtx` constrained to (and defaulting to) `JSDOMCrawlingContext`. */
export declare const createJsdomRouteMatchers: <CrawlerCtx extends JSDOMCrawlingContext<any, any> = JSDOMCrawlingContext<any, any>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
57
|
+
/** Same signature shape as `createRouteMatchers`, with `CrawlerCtx` constrained to (and defaulting to) `CheerioCrawlingContext`. */
export declare const createCheerioRouteMatchers: <CrawlerCtx extends CheerioCrawlingContext<any, any> = CheerioCrawlingContext<any, any>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
58
|
+
/** Same signature shape as `createRouteMatchers`, with `CrawlerCtx` constrained to (and defaulting to) `PlaywrightCrawlingContext`. */
export declare const createPlaywrightRouteMatchers: <CrawlerCtx extends PlaywrightCrawlingContext<import("crawlee").Dictionary> = PlaywrightCrawlingContext<import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
59
|
+
/** Same signature shape as `createRouteMatchers`, with `CrawlerCtx` constrained to (and defaulting to) `PuppeteerCrawlingContext`. */
export declare const createPuppeteerRouteMatchers: <CrawlerCtx extends PuppeteerCrawlingContext<import("crawlee").Dictionary> = PuppeteerCrawlingContext<import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>, Labels extends string = string>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createPuppeteerRouteMatchers = exports.createPlaywrightRouteMatchers = exports.createCheerioRouteMatchers = exports.createJsdomRouteMatchers = exports.createHttpRouteMatchers = exports.createBasicRouteMatchers = exports.createRouteMatchers = void 0;
|
|
4
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 * It performs no work at runtime beyond returning its argument.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createRouteMatchers = (matchers) => {
    return matchers;
};
|
|
5
|
+
exports.createRouteMatchers = createRouteMatchers;
|
|
6
|
+
// Context-specific variants
|
|
7
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createBasicRouteMatchers = (matchers) => {
    return matchers;
};
|
|
8
|
+
exports.createBasicRouteMatchers = createBasicRouteMatchers;
|
|
9
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createHttpRouteMatchers = (matchers) => {
    return matchers;
};
|
|
10
|
+
exports.createHttpRouteMatchers = createHttpRouteMatchers;
|
|
11
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createJsdomRouteMatchers = (matchers) => {
    return matchers;
};
|
|
12
|
+
exports.createJsdomRouteMatchers = createJsdomRouteMatchers;
|
|
13
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createCheerioRouteMatchers = (matchers) => {
    return matchers;
};
|
|
14
|
+
exports.createCheerioRouteMatchers = createCheerioRouteMatchers;
|
|
15
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createPlaywrightRouteMatchers = (matchers) => {
    return matchers;
};
|
|
16
|
+
exports.createPlaywrightRouteMatchers = createPlaywrightRouteMatchers;
|
|
17
|
+
/**
 * Identity helper: hands back the exact `matchers` value it receives.
 *
 * @param {Array<object>} matchers - Route matcher definitions.
 * @returns {Array<object>} The same `matchers` reference, untouched.
 */
const createPuppeteerRouteMatchers = (matchers) => {
    return matchers;
};
|
|
18
|
+
exports.createPuppeteerRouteMatchers = createPuppeteerRouteMatchers;
|
|
19
|
+
//# sourceMappingURL=types.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../src/lib/router/types.ts"],"names":[],"mappings":";;;AA4FO,MAAM,mBAAmB,GAAG,CAIjC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,mBAAmB,uBAIuC;AAEvE,4BAA4B;AACrB,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,uBAAuB,GAAG,CAIrC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,uBAAuB,2BAImC;AAChE,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,0BAA0B,GAAG,CAIxC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,0BAA0B,8BAIgC;AAChE,MAAM,6BAA6B,GAAG,CAI3C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,6BAA6B,iCAI6B;AAChE,MAAM,4BAA4B,GAAG,CAI1C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,4BAA4B,gCAI8B","sourcesContent":["import type {\n BasicCrawler,\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n RouterHandler as CrawlerRouter,\n} from 'crawlee';\n\nimport type { MaybePromise } from '../../utils/types';\n\n/** Context object provided in CrawlerRouter */\nexport type RouterHandlerCtx<CrawlerCtx extends CrawlingContext> = Parameters<\n Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]\n>[0];\n\n/** Function that's passed to `router.addHandler(label, handler)` */\nexport type RouteHandler<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n> = Parameters<CrawlerRouter<RouterHandlerCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1]; // prettier-ignore\n\n/** Wrapper that modifies behavior of RouteHandler */\nexport type CrawlerRouterWrapper<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>\n> = (\n handler: (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n) => 
(ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>;\n\n/**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n */\nexport interface RouteMatcher<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n> {\n /** Human readable name */\n name: string;\n /**\n * Label of the handler registered with `router.addHandler(label, handler)`\n * that will process this request.\n *\n * NOTE: This value is used by the default `action` function. If you override\n * the `action` function, `handlerLabel` is ignored and you have to process it yourself.\n */\n handlerLabel: Labels | null;\n /**\n * Function that decides whether the request will processed by this `action` function.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n match: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => unknown;\n /**\n * Request is passed to this function if `match` returned truthy value.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n action?: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => MaybePromise<void>;\n}\n\nexport const createRouteMatchers = <\n 
CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\n// Context-specific variants\nexport const createBasicRouteMatchers = <\n CrawlerCtx extends BasicCrawlingContext = BasicCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createHttpRouteMatchers = <\n CrawlerCtx extends HttpCrawlingContext = HttpCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createJsdomRouteMatchers = <\n CrawlerCtx extends JSDOMCrawlingContext = JSDOMCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createCheerioRouteMatchers = <\nCrawlerCtx extends CheerioCrawlingContext = CheerioCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPlaywrightRouteMatchers = <\n CrawlerCtx extends PlaywrightCrawlingContext = PlaywrightCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPuppeteerRouteMatchers = <\n CrawlerCtx extends PuppeteerCrawlingContext = PuppeteerCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = 
string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n"]}
|
package/dist/cjs/lib/router.js
CHANGED
|
@@ -99,12 +99,15 @@ const setupDefaultRoute = ({ io, router, routerWrappers, routerContext, routes,
|
|
|
99
99
|
var _a;
|
|
100
100
|
const { page, log: parentLog } = ctx;
|
|
101
101
|
const log = parentLog.child({ prefix: '[Router] ' });
|
|
102
|
-
|
|
102
|
+
// NOTE: Because we "clear" the queue by replacing it,
|
|
103
|
+
// we need to always call `openRequestQueue` to ensure we use the latest instance
|
|
104
|
+
const openQueue = () => io.openRequestQueue(requestQueueId);
|
|
103
105
|
let handledRequestsCount = 0;
|
|
104
106
|
let req = ctx.request;
|
|
105
107
|
const closeRequest = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
106
108
|
if (!req)
|
|
107
109
|
return;
|
|
110
|
+
const reqQueue = yield openQueue();
|
|
108
111
|
yield reqQueue.markRequestHandled(req);
|
|
109
112
|
handledRequestsCount++;
|
|
110
113
|
});
|
|
@@ -112,6 +115,7 @@ const setupDefaultRoute = ({ io, router, routerWrappers, routerContext, routes,
|
|
|
112
115
|
log.debug(`Checking for new Request in the queue. ${suffix}`);
|
|
113
116
|
if (perfBatchWaitSecs)
|
|
114
117
|
yield (0, async_1.wait)(perfBatchWaitSecs);
|
|
118
|
+
const reqQueue = yield openQueue();
|
|
115
119
|
const newReq = yield reqQueue.fetchNextRequest();
|
|
116
120
|
req = newReq !== null && newReq !== void 0 ? newReq : null;
|
|
117
121
|
if (req) {
|
|
@@ -154,8 +158,10 @@ const setupDefaultRoute = ({ io, router, routerWrappers, routerContext, routes,
|
|
|
154
158
|
log.error(`Failed to process a request, returning it to the queue. URL: ${(req === null || req === void 0 ? void 0 : req.loadedUrl) || (req === null || req === void 0 ? void 0 : req.url)}.`); // prettier-ignore
|
|
155
159
|
log.error(err);
|
|
156
160
|
// Reinsert the request into the queue if we failed to process it due to an error
|
|
157
|
-
if (req)
|
|
161
|
+
if (req) {
|
|
162
|
+
const reqQueue = yield openQueue();
|
|
158
163
|
yield reqQueue.reclaimRequest(req, { forefront: true });
|
|
164
|
+
}
|
|
159
165
|
}
|
|
160
166
|
});
|
|
161
167
|
const wrappedHandler = (routerWrappers !== null && routerWrappers !== void 0 ? routerWrappers : []).reduceRight((fn, wrapper) => wrapper(fn), defaultHandler);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../../src/lib/router.ts"],"names":[],"mappings":";;;;;;;;;;;;AAgBA,0CAAuE;AAoFhE,MAAM,mBAAmB,GAAG,CAIjC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,mBAAmB,uBAIuC;AAEvE,4BAA4B;AACrB,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,uBAAuB,GAAG,CAIrC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,uBAAuB,2BAImC;AAChE,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,0BAA0B,GAAG,CAIxC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,0BAA0B,8BAIgC;AAChE,MAAM,6BAA6B,GAAG,CAI3C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,6BAA6B,iCAI6B;AAChE,MAAM,4BAA4B,GAAG,CAI1C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,4BAA4B,gCAI8B;AAEhE,MAAM,gBAAgB,GAAG,CAI9B,EACA,MAAM,EACN,cAAc,EACd,aAAa,EACb,aAAa,GAMd,EAAE,EAAE;IACH,MAAM,IAAA,sBAAc,EAAC,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE,CAAO,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,EAAE;QAC3E,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,EAC1C,OAA2E,CAC5E,CAAC;QACF,MAAM,MAAM,CAAC,UAAU,CAAa,GAAG,EAAE,CAAO,GAAG,EAAE,EAAE,kDACrD,OAAA,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CAAA,GAAA,CACpD,CAAC;IACJ,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAA,CAAC;AAxBW,QAAA,gBAAgB,oBAwB3B;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACI,MAAM,iBAAiB,GAAG,CAK/B,EACA,EAAE,EACF,MAAM,EACN,cAAc,EACd,aAAa,EACb,MAAM,EACN,aAAa,EACb,KAAK,GASN,EAAE,EAAE;IACH,MAAM,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CACtD,CAAC;IAEpB,uEAAuE;IACvE,kBAAkB;IAClB,MAAM,aAAa,GAA0D,CAAO,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE;QACrG,MAAM,OAAO,GAAG,KAAK,CAAC,YAAY,IAAI,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAChF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,8BAA8B,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,YAAY,sCAAsC,GAAG,EAAE,CAAC,CAAC,CAAC,kBAAkB;YAC7I,OAAO
;SACR;QACD,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,0BAA0B,KAAK,CAAC,YAAY,UAAU,GAAG,EAAE,CAAC,CAAC;QAC1E,MAAM,OAAO,CAAC,GAAU,CAAC,CAAC;IAC5B,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CAAO,GAA6C,EAAiB,EAAE;;QAC5F,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,GAAG,CAAC;QACrC,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAErD,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QAE3D,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,IAAI,GAAG,GAA0B,GAAG,CAAC,OAAO,CAAC;QAE7C,MAAM,YAAY,GAAG,GAAS,EAAE;YAC9B,IAAI,CAAC,GAAG;gBAAE,OAAO;YACjB,MAAM,QAAQ,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;YACvC,oBAAoB,EAAE,CAAC;QACzB,CAAC,CAAA,CAAC;QAEF,MAAM,eAAe,GAAG,CAAO,MAAc,EAAE,EAAE;YAC/C,GAAG,CAAC,KAAK,CAAC,0CAA0C,MAAM,EAAE,CAAC,CAAC;YAE9D,IAAI,iBAAiB;gBAAE,MAAM,IAAA,YAAI,EAAC,iBAAiB,CAAC,CAAC;YACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACjD,GAAG,GAAG,MAAM,aAAN,MAAM,cAAN,MAAM,GAAI,IAAI,CAAC;YAErB,IAAI,GAAG,EAAE;gBACP,GAAG,CAAC,KAAK,CAAC,mCAAmC,MAAM,EAAE,CAAC,CAAC;gBAEvD,oEAAoE;gBACpE,wEAAwE;gBACxE,0CAA0C;gBAC1C,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI;oBAAE,MAAO,IAAa,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;aAC3D;iBAAM;gBACL,GAAG,CAAC,KAAK,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;aACvD;QACH,CAAC,CAAA,CAAC;QAEF,MAAM,YAAY,GAAG,GAAG,EAAE,CACxB,aAAa,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,oBAAoB,GAAG,aAAa,CAAC;QAE/E,IAAI;YACF,GAAG;gBACD,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,MAAO,IAA0B,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,CAAC;gBACxF,MAAM,SAAS,GAAG,SAAS,oBAAoB,GAAG,CAAC,OAAO,aAAa,aAAb,aAAa,cAAb,aAAa,GAAI,CAAC,UAAU,GAAG,EAAE,CAAC;gBAE5F,mCAAmC;gBACnC,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC;gBACrE,MAAM,KAAK,GAAG,MAAM,IAAA,uBAAe,EAAC,MAAM,EAAE,CAAO,SAAS,EAAE,EAAE;oBAC9D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;oBAC1E,OAAO,OAAO,CAAC;gBACjB,CAAC,CAAA,CAAC,CAAC;gBAEH,kBAAkB;gBAClB,IAAI,KAAK,EAAE;oBACT,GAAG,CAAC,IAAI,CAAC,qBAAqB,KAAK,CAAC,IAAI,mBAAmB,KAAK,CAAC,YAAY,MAAM,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAk
B;oBACnH,MAAM,CAAC,MAAA,KAAK,CAAC,MAAM,mCAAI,aAAa,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;iBACvE;qBAAM;oBACL,GAAG,CAAC,KAAK,CAAC,oDAAoD,SAAS,EAAE,CAAC,CAAC;iBAC5E;gBAED,yCAAyC;gBACzC,MAAM,YAAY,EAAE,CAAC;gBACrB,MAAM,eAAe,CAAC,SAAS,CAAC,CAAC;aAClC,QAAQ,YAAY,EAAE,EAAE;SAC1B;QAAC,OAAO,GAAG,EAAE;YACZ,GAAG,CAAC,KAAK,CAAC,gEAAgE,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,GAAG,CAAC,CAAC,CAAC,kBAAkB;YAC5H,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YACf,iFAAiF;YACjF,IAAI,GAAG;gBAAE,MAAM,QAAQ,CAAC,cAAc,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;SAClE;IACH,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC,EAC5B,cAAc,CACf,CAAC;IACF,MAAM,MAAM,CAAC,iBAAiB,CAAa,CAAC,GAAG,EAAE,EAAE,CACjD,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CACpD,CAAC;AACJ,CAAC,CAAA,CAAC;AAjHW,QAAA,iBAAiB,qBAiH5B","sourcesContent":["import type {\n BasicCrawler,\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n RouterHandler as CrawlerRouter,\n Request as CrawlerRequest,\n} from 'crawlee';\nimport type { CommonPage } from '@crawlee/browser-pool';\nimport type { Page } from 'playwright';\n\nimport type { MaybePromise } from '../utils/types';\nimport { serialAsyncFind, serialAsyncMap, wait } from '../utils/async';\nimport type { PerfActorInput, RequestActorInput } from './config';\nimport type { CrawleeOneIO } from './integrations/types';\n\n// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks\n\n/** Context object provided in CrawlerRouter */\nexport type RouterHandlerCtx<CrawlerCtx extends CrawlingContext> = Parameters<\n Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]\n>[0];\n\n/** Function that's passed to `router.addHandler(label, handler)` */\nexport type 
RouteHandler<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n> = Parameters<CrawlerRouter<RouterHandlerCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1]; // prettier-ignore\n\n/** Wrapper that modifies behavior of RouteHandler */\nexport type CrawlerRouterWrapper<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>\n> = (\n handler: (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n) => (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>;\n\n/**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n */\nexport interface RouteMatcher<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n> {\n /** Human readable name */\n name: string;\n /**\n * Label of the handler registered with `router.addHandler(label, handler)`\n * that will process this request.\n *\n * NOTE: This value is used by the default `action` function. 
If you override\n * the `action` function, `handlerLabel` is ignored and you have to process it yourself.\n */\n handlerLabel: Labels | null;\n /**\n * Function that decides whether the request will processed by this `action` function.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n match: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => unknown;\n /**\n * Request is passed to this function if `match` returned truthy value.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n action?: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => MaybePromise<void>;\n}\n\nexport const createRouteMatchers = <\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\n// Context-specific variants\nexport const createBasicRouteMatchers = <\n CrawlerCtx extends BasicCrawlingContext = BasicCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createHttpRouteMatchers = <\n CrawlerCtx extends HttpCrawlingContext = HttpCrawlingContext,\n 
RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createJsdomRouteMatchers = <\n CrawlerCtx extends JSDOMCrawlingContext = JSDOMCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createCheerioRouteMatchers = <\nCrawlerCtx extends CheerioCrawlingContext = CheerioCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPlaywrightRouteMatchers = <\n CrawlerCtx extends PlaywrightCrawlingContext = PlaywrightCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPuppeteerRouteMatchers = <\n CrawlerCtx extends PuppeteerCrawlingContext = PuppeteerCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\nexport const registerHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n}: {\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n}) => {\n await serialAsyncMap(Object.entries(routeHandlers), async ([key, handler]) => {\n const wrappedHandler = (routerWrappers ?? 
[]).reduceRight(\n (fn, wrapper) => wrapper((ctx) => fn(ctx)),\n handler as (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n );\n await router.addHandler<CrawlerCtx>(key, async (ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n });\n};\n\n/**\n * Configures the default router handler to redirect URLs to labelled route handlers\n * based on which route the URL matches first.\n *\n * NOTE: This does mean that the URLs passed to this default handler will be fetched\n * twice (as the URL will be requeued to the correct handler). We recommend to use this\n * function only in the scenarios where there is a small number of startUrls, yet these\n * may need various ways of processing based on different paths or etc.\n *\n * @example\n *\n * const routeLabels = {\n * MAIN_PAGE: 'MAIN_PAGE',\n * JOB_LISTING: 'JOB_LISTING',\n * JOB_DETAIL: 'JOB_DETAIL',\n * JOB_RELATED_LIST: 'JOB_RELATED_LIST',\n * PARTNERS: 'PARTNERS',\n * } as const;\n *\n * const router = createPlaywrightRouter();\n *\n * const routes = createPlaywrightRouteMatchers<typeof routeLabels>([\n * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * },\n *\n * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * action: async (ctx) => {\n * await ctx.crawler.addRequests([{\n * url: 'https://profesia.sk/praca',\n * label: routeLabels.JOB_LISTING,\n * }]);\n * },\n * },\n * ]);\n *\n * // Set up default route to redirect to labelled routes\n * setupDefaultRoute({ router, routes });\n *\n * // Now set up the labelled routes\n * await 
router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }\n */\nexport const setupDefaultRoute = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>\n>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n}: {\n io: CrawleeOneIO;\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n input?: Input | null;\n}) => {\n const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {}) as PerfActorInput &\n RequestActorInput;\n\n /** Redirect the URL to the labelled route identical to route's name */\n // prettier-ignore\n const defaultAction: RouteMatcher<CrawlerCtx, RouterCtx, Labels>['action'] = async (url, ctx, route) => {\n const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];\n if (!handler) {\n ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore\n return;\n }\n ctx.log.info(`Passing URL to handler ${route.handlerLabel}. URL: ${url}`);\n await handler(ctx as any);\n };\n\n const defaultHandler = async (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>): Promise<void> => {\n const { page, log: parentLog } = ctx;\n const log = parentLog.child({ prefix: '[Router] ' });\n\n const reqQueue = await io.openRequestQueue(requestQueueId);\n\n let handledRequestsCount = 0;\n let req: CrawlerRequest | null = ctx.request;\n\n const closeRequest = async () => {\n if (!req) return;\n await reqQueue.markRequestHandled(req);\n handledRequestsCount++;\n };\n\n const loadNextRequest = async (suffix: string) => {\n log.debug(`Checking for new Request in the queue. 
${suffix}`);\n\n if (perfBatchWaitSecs) await wait(perfBatchWaitSecs);\n const newReq = await reqQueue.fetchNextRequest();\n req = newReq ?? null;\n\n if (req) {\n log.debug(`Found new Request in the queue. ${suffix}`);\n\n // WARNING - For each subsequent Request, it must be loaded manually\n // Hence, batching is suitable only for browser-based Crawlers\n // like Playwright or Puppeteer.\n if (page && page.goto) await (page as Page).goto(req.url);\n } else {\n log.debug(`No more Requests in the queue. ${suffix}`);\n }\n };\n\n const hasBatchReqs = () =>\n perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;\n\n try {\n do {\n const url = page ? await (page as any as CommonPage).url() : req?.loadedUrl || req?.url;\n const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize ?? 1}. URL: ${url}`;\n\n // Find route handler for given URL\n log.debug(`Searching for a handler for given Request. ${logSuffix}`);\n const route = await serialAsyncFind(routes, async (currRoute) => {\n const isMatch = await currRoute.match(url, ctx, currRoute, routeHandlers);\n return isMatch;\n });\n\n // Run the handler\n if (route) {\n log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore\n await (route.action ?? defaultAction)(url, ctx, route, routeHandlers);\n } else {\n log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);\n }\n\n // Clean up and move onto another request\n await closeRequest();\n await loadNextRequest(logSuffix);\n } while (hasBatchReqs());\n } catch (err) {\n log.error(`Failed to process a request, returning it to the queue. URL: ${req?.loadedUrl || req?.url}.`); // prettier-ignore\n log.error(err);\n // Reinsert the request into the queue if we failed to process it due to an error\n if (req) await reqQueue.reclaimRequest(req, { forefront: true });\n }\n };\n\n const wrappedHandler = (routerWrappers ?? 
[]).reduceRight(\n (fn, wrapper) => wrapper(fn),\n defaultHandler\n );\n await router.addDefaultHandler<CrawlerCtx>((ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n};\n"]}
|
|
1
|
+
{"version":3,"file":"router.js","sourceRoot":"","sources":["../../../src/lib/router.ts"],"names":[],"mappings":";;;;;;;;;;;;AAgBA,0CAAuE;AAoFhE,MAAM,mBAAmB,GAAG,CAIjC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,mBAAmB,uBAIuC;AAEvE,4BAA4B;AACrB,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,uBAAuB,GAAG,CAIrC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,uBAAuB,2BAImC;AAChE,MAAM,wBAAwB,GAAG,CAItC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,wBAAwB,4BAIkC;AAChE,MAAM,0BAA0B,GAAG,CAIxC,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,0BAA0B,8BAIgC;AAChE,MAAM,6BAA6B,GAAG,CAI3C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,6BAA6B,iCAI6B;AAChE,MAAM,4BAA4B,GAAG,CAI1C,QAAuD,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,kBAAkB;AAJ7E,QAAA,4BAA4B,gCAI8B;AAEhE,MAAM,gBAAgB,GAAG,CAI9B,EACA,MAAM,EACN,cAAc,EACd,aAAa,EACb,aAAa,GAMd,EAAE,EAAE;IACH,MAAM,IAAA,sBAAc,EAAC,MAAM,CAAC,OAAO,CAAC,aAAa,CAAC,EAAE,CAAO,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,EAAE;QAC3E,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,EAC1C,OAA2E,CAC5E,CAAC;QACF,MAAM,MAAM,CAAC,UAAU,CAAa,GAAG,EAAE,CAAO,GAAG,EAAE,EAAE,kDACrD,OAAA,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CAAA,GAAA,CACpD,CAAC;IACJ,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAA,CAAC;AAxBW,QAAA,gBAAgB,oBAwB3B;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACI,MAAM,iBAAiB,GAAG,CAK/B,EACA,EAAE,EACF,MAAM,EACN,cAAc,EACd,aAAa,EACb,MAAM,EACN,aAAa,EACb,KAAK,GASN,EAAE,EAAE;IACH,MAAM,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,GAAG,CAAC,KAAK,IAAI,EAAE,CACtD,CAAC;IAEpB,uEAAuE;IACvE,kBAAkB;IAClB,MAAM,aAAa,GAA0D,CAAO,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE;QACrG,MAAM,OAAO,GAAG,KAAK,CAAC,YAAY,IAAI,IAAI,IAAI,aAAa,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAChF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,8BAA8B,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC,YAAY,sCAAsC,GAAG,EAAE,CAAC,CAAC,CAAC,kBAAkB;YAC7I,OAAO
;SACR;QACD,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,0BAA0B,KAAK,CAAC,YAAY,UAAU,GAAG,EAAE,CAAC,CAAC;QAC1E,MAAM,OAAO,CAAC,GAAU,CAAC,CAAC;IAC5B,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CAAO,GAA6C,EAAiB,EAAE;;QAC5F,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,SAAS,EAAE,GAAG,GAAG,CAAC;QACrC,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC;QAErD,sDAAsD;QACtD,iFAAiF;QACjF,MAAM,SAAS,GAAG,GAAG,EAAE,CAAC,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QAE5D,IAAI,oBAAoB,GAAG,CAAC,CAAC;QAC7B,IAAI,GAAG,GAA0B,GAAG,CAAC,OAAO,CAAC;QAE7C,MAAM,YAAY,GAAG,GAAS,EAAE;YAC9B,IAAI,CAAC,GAAG;gBAAE,OAAO;YACjB,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;YACnC,MAAM,QAAQ,CAAC,kBAAkB,CAAC,GAAG,CAAC,CAAC;YACvC,oBAAoB,EAAE,CAAC;QACzB,CAAC,CAAA,CAAC;QAEF,MAAM,eAAe,GAAG,CAAO,MAAc,EAAE,EAAE;YAC/C,GAAG,CAAC,KAAK,CAAC,0CAA0C,MAAM,EAAE,CAAC,CAAC;YAE9D,IAAI,iBAAiB;gBAAE,MAAM,IAAA,YAAI,EAAC,iBAAiB,CAAC,CAAC;YACrD,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;YACnC,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACjD,GAAG,GAAG,MAAM,aAAN,MAAM,cAAN,MAAM,GAAI,IAAI,CAAC;YAErB,IAAI,GAAG,EAAE;gBACP,GAAG,CAAC,KAAK,CAAC,mCAAmC,MAAM,EAAE,CAAC,CAAC;gBAEvD,oEAAoE;gBACpE,wEAAwE;gBACxE,0CAA0C;gBAC1C,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI;oBAAE,MAAO,IAAa,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;aAC3D;iBAAM;gBACL,GAAG,CAAC,KAAK,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;aACvD;QACH,CAAC,CAAA,CAAC;QAEF,MAAM,YAAY,GAAG,GAAG,EAAE,CACxB,aAAa,IAAI,IAAI,IAAI,GAAG,IAAI,IAAI,IAAI,oBAAoB,GAAG,aAAa,CAAC;QAE/E,IAAI;YACF,GAAG;gBACD,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,MAAO,IAA0B,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,CAAC;gBACxF,MAAM,SAAS,GAAG,SAAS,oBAAoB,GAAG,CAAC,OAAO,aAAa,aAAb,aAAa,cAAb,aAAa,GAAI,CAAC,UAAU,GAAG,EAAE,CAAC;gBAE5F,mCAAmC;gBACnC,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC;gBACrE,MAAM,KAAK,GAAG,MAAM,IAAA,uBAAe,EAAC,MAAM,EAAE,CAAO,SAAS,EAAE,EAAE;oBAC9D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;oBAC1E,OAAO,OAAO,CAAC;gBACjB,CAAC,CAAA,CAAC,CAAC;gBAEH,kBAAkB;gBAClB,IA
AI,KAAK,EAAE;oBACT,GAAG,CAAC,IAAI,CAAC,qBAAqB,KAAK,CAAC,IAAI,mBAAmB,KAAK,CAAC,YAAY,MAAM,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;oBACnH,MAAM,CAAC,MAAA,KAAK,CAAC,MAAM,mCAAI,aAAa,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,aAAa,CAAC,CAAC;iBACvE;qBAAM;oBACL,GAAG,CAAC,KAAK,CAAC,oDAAoD,SAAS,EAAE,CAAC,CAAC;iBAC5E;gBAED,yCAAyC;gBACzC,MAAM,YAAY,EAAE,CAAC;gBACrB,MAAM,eAAe,CAAC,SAAS,CAAC,CAAC;aAClC,QAAQ,YAAY,EAAE,EAAE;SAC1B;QAAC,OAAO,GAAG,EAAE;YACZ,GAAG,CAAC,KAAK,CAAC,gEAAgE,CAAA,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,SAAS,MAAI,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,GAAG,CAAA,GAAG,CAAC,CAAC,CAAC,kBAAkB;YAC5H,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YACf,iFAAiF;YACjF,IAAI,GAAG,EAAE;gBACP,MAAM,QAAQ,GAAG,MAAM,SAAS,EAAE,CAAC;gBACnC,MAAM,QAAQ,CAAC,cAAc,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;aACzD;SACF;IACH,CAAC,CAAA,CAAC;IAEF,MAAM,cAAc,GAAG,CAAC,cAAc,aAAd,cAAc,cAAd,cAAc,GAAI,EAAE,CAAC,CAAC,WAAW,CACvD,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,EAAE,CAAC,EAC5B,cAAc,CACf,CAAC;IACF,MAAM,MAAM,CAAC,iBAAiB,CAAa,CAAC,GAAG,EAAE,EAAE,CACjD,cAAc,CAAC,gCAAK,aAAa,GAAK,GAAG,CAAS,CAAC,CACpD,CAAC;AACJ,CAAC,CAAA,CAAC;AAxHW,QAAA,iBAAiB,qBAwH5B","sourcesContent":["import type {\n BasicCrawler,\n BasicCrawlingContext,\n CheerioCrawlingContext,\n CrawlingContext,\n HttpCrawlingContext,\n JSDOMCrawlingContext,\n PlaywrightCrawlingContext,\n PuppeteerCrawlingContext,\n RouterHandler as CrawlerRouter,\n Request as CrawlerRequest,\n} from 'crawlee';\nimport type { CommonPage } from '@crawlee/browser-pool';\nimport type { Page } from 'playwright';\n\nimport type { MaybePromise } from '../utils/types';\nimport { serialAsyncFind, serialAsyncMap, wait } from '../utils/async';\nimport type { PerfActorInput, RequestActorInput } from './config';\nimport type { CrawleeOneIO } from './integrations/types';\n\n// Read about router on https://docs.apify.com/academy/expert-scraping-with-apify/solutions/using-storage-creating-tasks\n\n/** Context object provided in CrawlerRouter */\nexport type RouterHandlerCtx<CrawlerCtx extends 
CrawlingContext> = Parameters<\n Parameters<CrawlerRouter<CrawlerCtx>['addHandler']>[1]\n>[0];\n\n/** Function that's passed to `router.addHandler(label, handler)` */\nexport type RouteHandler<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n> = Parameters<CrawlerRouter<RouterHandlerCtx<CrawlerCtx & RouterCtx>>['addHandler']>[1]; // prettier-ignore\n\n/** Wrapper that modifies behavior of RouteHandler */\nexport type CrawlerRouterWrapper<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>\n> = (\n handler: (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n) => (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>;\n\n/**\n * Criteria that un-labelled requests are matched against.\n *\n * E.g. If `match` function returns truthy value,\n * the request is passed to the `action` function for processing.\n */\nexport interface RouteMatcher<\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n> {\n /** Human readable name */\n name: string;\n /**\n * Label of the handler registered with `router.addHandler(label, handler)`\n * that will process this request.\n *\n * NOTE: This value is used by the default `action` function. 
If you override\n * the `action` function, `handlerLabel` is ignored and you have to process it yourself.\n */\n handlerLabel: Labels | null;\n /**\n * Function that decides whether the request will processed by this `action` function.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n match: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => unknown;\n /**\n * Request is passed to this function if `match` returned truthy value.\n *\n * @example\n * [{\n * // If match returns true, the request is forwarded to handler\n * // with label JOB_DETAIL.\n * name: 'Job detail',\n * match: (url, ctx, route, handlers) => isUrlOfJobOffer(url),\n * handlerLabel: routeLabels.JOB_DETAIL,\n * }]\n */\n action?: (\n url: string,\n ctx: RouterHandlerCtx<CrawlerCtx>,\n route: RouteMatcher<CrawlerCtx, RouterCtx, Labels>,\n handlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>\n ) => MaybePromise<void>;\n}\n\nexport const createRouteMatchers = <\n CrawlerCtx extends CrawlingContext = CrawlingContext<BasicCrawler>,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\n// Context-specific variants\nexport const createBasicRouteMatchers = <\n CrawlerCtx extends BasicCrawlingContext = BasicCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createHttpRouteMatchers = <\n CrawlerCtx extends HttpCrawlingContext = HttpCrawlingContext,\n 
RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createJsdomRouteMatchers = <\n CrawlerCtx extends JSDOMCrawlingContext = JSDOMCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createCheerioRouteMatchers = <\nCrawlerCtx extends CheerioCrawlingContext = CheerioCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPlaywrightRouteMatchers = <\n CrawlerCtx extends PlaywrightCrawlingContext = PlaywrightCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\nexport const createPuppeteerRouteMatchers = <\n CrawlerCtx extends PuppeteerCrawlingContext = PuppeteerCrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>(matchers: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[]) => matchers; // prettier-ignore\n\nexport const registerHandlers = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string\n>({\n router,\n routerWrappers,\n routerContext,\n routeHandlers,\n}: {\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n}) => {\n await serialAsyncMap(Object.entries(routeHandlers), async ([key, handler]) => {\n const wrappedHandler = (routerWrappers ?? 
[]).reduceRight(\n (fn, wrapper) => wrapper((ctx) => fn(ctx)),\n handler as (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>) => Promise<void>\n );\n await router.addHandler<CrawlerCtx>(key, async (ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n });\n};\n\n/**\n * Configures the default router handler to redirect URLs to labelled route handlers\n * based on which route the URL matches first.\n *\n * NOTE: This does mean that the URLs passed to this default handler will be fetched\n * twice (as the URL will be requeued to the correct handler). We recommend to use this\n * function only in the scenarios where there is a small number of startUrls, yet these\n * may need various ways of processing based on different paths or etc.\n *\n * @example\n *\n * const routeLabels = {\n * MAIN_PAGE: 'MAIN_PAGE',\n * JOB_LISTING: 'JOB_LISTING',\n * JOB_DETAIL: 'JOB_DETAIL',\n * JOB_RELATED_LIST: 'JOB_RELATED_LIST',\n * PARTNERS: 'PARTNERS',\n * } as const;\n *\n * const router = createPlaywrightRouter();\n *\n * const routes = createPlaywrightRouteMatchers<typeof routeLabels>([\n * // URLs that match this route are redirected to router.addHandler(routeLabels.MAIN_PAGE)\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * },\n *\n * // Optionally override the logic that assigns the URL to the route by specifying the `action` prop\n * {\n * route: routeLabels.MAIN_PAGE,\n * // Check for main page like https://www.profesia.sk/?#\n * match: (url) => url.match(/[\\W]profesia\\.sk\\/?(?:[?#~]|$)/i),\n * action: async (ctx) => {\n * await ctx.crawler.addRequests([{\n * url: 'https://profesia.sk/praca',\n * label: routeLabels.JOB_LISTING,\n * }]);\n * },\n * },\n * ]);\n *\n * // Set up default route to redirect to labelled routes\n * setupDefaultRoute({ router, routes });\n *\n * // Now set up the labelled routes\n * await 
router.addHandler(routeLabels.JOB_LISTING, async (ctx) => { ... }\n */\nexport const setupDefaultRoute = async <\n CrawlerCtx extends CrawlingContext,\n RouterCtx extends Record<string, any> = Record<string, any>,\n Labels extends string = string,\n Input extends Record<string, any> = Record<string, any>\n>({\n io,\n router,\n routerWrappers,\n routerContext,\n routes,\n routeHandlers,\n input,\n}: {\n io: CrawleeOneIO;\n router: CrawlerRouter<CrawlerCtx>;\n routerWrappers?: CrawlerRouterWrapper<CrawlerCtx, RouterCtx>[];\n routerContext?: RouterCtx;\n routes: RouteMatcher<CrawlerCtx, RouterCtx, Labels>[];\n routeHandlers: Record<Labels, RouteHandler<CrawlerCtx, RouterCtx>>;\n input?: Input | null;\n}) => {\n const { perfBatchSize, perfBatchWaitSecs, requestQueueId } = (input || {}) as PerfActorInput &\n RequestActorInput;\n\n /** Redirect the URL to the labelled route identical to route's name */\n // prettier-ignore\n const defaultAction: RouteMatcher<CrawlerCtx, RouterCtx, Labels>['action'] = async (url, ctx, route) => {\n const handler = route.handlerLabel != null && routeHandlers[route.handlerLabel];\n if (!handler) {\n ctx.log.error(`No handler found for route ${route.name} (${route.handlerLabel}). URL will not be processed. URL: ${url}`); // prettier-ignore\n return;\n }\n ctx.log.info(`Passing URL to handler ${route.handlerLabel}. 
URL: ${url}`);\n await handler(ctx as any);\n };\n\n const defaultHandler = async (ctx: RouterHandlerCtx<CrawlerCtx & RouterCtx>): Promise<void> => {\n const { page, log: parentLog } = ctx;\n const log = parentLog.child({ prefix: '[Router] ' });\n\n // NOTE: Because we \"clear\" the queue by replacing it,\n // we need to always call `openRequestQueue` to ensure we use the latest instance\n const openQueue = () => io.openRequestQueue(requestQueueId);\n\n let handledRequestsCount = 0;\n let req: CrawlerRequest | null = ctx.request;\n\n const closeRequest = async () => {\n if (!req) return;\n const reqQueue = await openQueue();\n await reqQueue.markRequestHandled(req);\n handledRequestsCount++;\n };\n\n const loadNextRequest = async (suffix: string) => {\n log.debug(`Checking for new Request in the queue. ${suffix}`);\n\n if (perfBatchWaitSecs) await wait(perfBatchWaitSecs);\n const reqQueue = await openQueue();\n const newReq = await reqQueue.fetchNextRequest();\n req = newReq ?? null;\n\n if (req) {\n log.debug(`Found new Request in the queue. ${suffix}`);\n\n // WARNING - For each subsequent Request, it must be loaded manually\n // Hence, batching is suitable only for browser-based Crawlers\n // like Playwright or Puppeteer.\n if (page && page.goto) await (page as Page).goto(req.url);\n } else {\n log.debug(`No more Requests in the queue. ${suffix}`);\n }\n };\n\n const hasBatchReqs = () =>\n perfBatchSize != null && req != null && handledRequestsCount < perfBatchSize;\n\n try {\n do {\n const url = page ? await (page as any as CommonPage).url() : req?.loadedUrl || req?.url;\n const logSuffix = `Batch ${handledRequestsCount + 1} of ${perfBatchSize ?? 1}. URL: ${url}`;\n\n // Find route handler for given URL\n log.debug(`Searching for a handler for given Request. 
${logSuffix}`);\n const route = await serialAsyncFind(routes, async (currRoute) => {\n const isMatch = await currRoute.match(url, ctx, currRoute, routeHandlers);\n return isMatch;\n });\n\n // Run the handler\n if (route) {\n log.info(`URL matched route ${route.name} (handlerLabel: ${route.handlerLabel}). ${logSuffix}`); // prettier-ignore\n await (route.action ?? defaultAction)(url, ctx, route, routeHandlers);\n } else {\n log.error(`No route matched URL. URL will not be processed. ${logSuffix}`);\n }\n\n // Clean up and move onto another request\n await closeRequest();\n await loadNextRequest(logSuffix);\n } while (hasBatchReqs());\n } catch (err) {\n log.error(`Failed to process a request, returning it to the queue. URL: ${req?.loadedUrl || req?.url}.`); // prettier-ignore\n log.error(err);\n // Reinsert the request into the queue if we failed to process it due to an error\n if (req) {\n const reqQueue = await openQueue();\n await reqQueue.reclaimRequest(req, { forefront: true });\n }\n }\n };\n\n const wrappedHandler = (routerWrappers ?? []).reduceRight(\n (fn, wrapper) => wrapper(fn),\n defaultHandler\n );\n await router.addDefaultHandler<CrawlerCtx>((ctx) =>\n wrappedHandler({ ...routerContext, ...ctx } as any)\n );\n};\n"]}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "crawlee-one",
|
|
3
|
-
"version": "1.0.4",
|
|
3
|
+
"version": "1.0.6",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Crawlee One is a framework built on top of Crawlee and Apify for writing robust and highly configurable web scrapers",
|
|
6
6
|
"author": "Juro Oravec <juraj.oravec.josefson@gmail.com>",
|