crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"page.js","sourceRoot":"","sources":["../../../../src/lib/actions/page.ts"],"names":[],"mappings":";;;;;;;;;;;;AAGA,6CAAkD;AAClD,yCAA6C;AAiF7C,8CAA8C;AACvC,MAAM,iBAAiB,GAAG,CAAuB,IAAO,EAAiC,EAAE;IAChG,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,UAAU,EAAE,GAAG,MAAM,kCAAkC,CAAC,IAAI,CAAC,CAAC;IAE/F,MAAM,cAAc,GAA2C,CAC7D,SAAS,EACT,aAAa,EACb,OAAO,EACP,EAAE;;QACF,MAAM,eAAe,GAAG,MAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,eAAe,mCAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAE,CAAmB,CAAC,QAAQ,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAE,EAAc,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,qBAAa,CAAC,CAAC,CAAC,CAAC,kBAAkB;QACvL,MAAM,cAAc,GAAG,MAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,cAAc,mCAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAE,EAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,qBAAa,CAAC,CAAC,CAAC,CAAC,kBAAkB;QAChK,MAAM,cAAc,GAAG,MAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,cAAc,mCAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,cAAc,EAAE,CAAA,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,qBAAa,CAAC,CAAC,CAAC,CAAC,kBAAkB;QACrJ,MAAM,eAAe,GAAG,MAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,eAAe,mCAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,gBAAgB,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,kBAAkB;QAEpH,MAAM,eAAe,GAAG,OAAO,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC5F,IAAI,IAAA,0BAAe,EAAC,eAAe,CAAC,IAAI,eAAe,CAAC,IAAI,EAAE,KAAK,IAAI,EAAE;YACvE,MAAM,KAAK,CAAC,wCAAwC,CAAC,CAAC;SACvD;QAED,MAAM,qBAAqB,CACzB,eAAe,EACf,CAAO,QAAQ,EAAE,GAAG,EAAE,MAAM,EAAE,EAAE;YAC9B,4DAA4D;YAC5D,MAAM,SAAS,GAAG,MAAM,UAAU,CAAC,QAAQ,CAAC,CAAC;YAC7C,yBAAyB;YACzB,MAAM,CAAA,aAAa,aAAb,aAAa,uBAAb,aAAa,CAAG,SAAS,kCAAO,GAAG,KAAE,IAAI,KAAI,MAAM,CAAC,CAAA,CAAC;QAC7D,CAAC,CAAA,EACD;YACE,OAAO,EAAE,MAAA,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,OAAO,mCAAI,CAAC;YAC9B,eAAe,EAAE,CAAC,EAAE,EAAE,GAAG,EAAE,EAAE,CAAC,eAAe,CAAC,EAAE,kCAAO,GAAG,KAAE,IAAI,IAAG;YACnE,cAAc,EAAE,CAAO,MAAM,EAAE,GAAG,EAAE,EAAE;gBACpC,2DAA2D;gBAC3D,MAAM,cAAc,GAAG,MAAM,cA
Ac,CAAC,MAAM,kCAAO,GAAG,KAAE,IAAI,IAAG,CAAC;gBACtE,mEAAmE;gBACnE,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,cAAc,CAAC,CAAC;gBACpD,OAAO,QAAQ,CAAC;YAClB,CAAC,CAAA;YACD,cAAc,EAAE,CAAO,OAAO,EAAE,GAAG,EAAE,EAAE;gBACrC,0DAA0D;gBAC1D,MAAM,aAAa,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC,CAAC;gBAC/C,gDAAgD;gBAChD,MAAM,cAAc,CAAC,aAAa,kCAAO,GAAG,KAAE,IAAI,IAAG,CAAC;YACxD,CAAC,CAAA;YACD,eAAe,EAAE,CAAO,OAAO,EAAE,GAAG,EAAE,EAAE;gBACtC,0DAA0D;gBAC1D,MAAM,aAAa,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC,CAAC;gBAC/C,iDAAiD;gBACjD,MAAM,eAAe,CAAC,aAAa,kCAAO,GAAG,KAAE,IAAI,IAAG,CAAC;YACzD,CAAC,CAAA;SACF,CACF,CAAC;IACJ,CAAC,CAAA,CAAC;IAEF,OAAO;QACL,IAAI;QAEJ,cAAc;KACgB,CAAC;AACnC,CAAC,CAAA,CAAC;AAzDW,QAAA,iBAAiB,qBAyD5B;AAEF,mEAAmE;AACnE,MAAM,qBAAqB,GAAG,CAC5B,SAAgE,EAChE,aAIuB,EACvB,OAAiD,EACjD,EAAE;IACF,MAAM,iBAAiB,GAAG,CACxB,OAAO,SAAS,KAAK,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,SAAS,CACxB,CAAC;IAExC,MAAM,iBAAiB,GAAG,IAAI,GAAG,EAAE,CAAC;IACpC,IAAI,eAAe,GAAG,KAAK,CAAC;IAE5B,MAAM,MAAM,GAAG,GAAG,EAAE;QAClB,eAAe,GAAG,IAAI,CAAC;IACzB,CAAC,CAAC;IAEF,MAAM,eAAe,GAAG,CAAO,UAAwB,EAAE,EAAE;QACzD,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,iBAAiB,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAChF,MAAM,SAAS,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC5C,MAAM,CAAA,aAAa,aAAb,aAAa,uBAAb,aAAa,CAAG,WAAW,EAAE,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAA,CAAC;QAC1D,WAAW,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACzD,CAAC,CAAA,CAAC;IAEF,MAAM,aAAa,GAAG,MAAM,iBAAiB,EAAE,CAAC;IAChD,IAAI,iBAAiB,GAAG,MAAM,OAAO,CAAC,eAAe,CAAC,aAAa,EAAE,EAAE,SAAS,EAAE,aAAa,EAAE,CAAC,CAAC,CAAC,kBAAkB;IACtH,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,OAAO,CAAC,eAAe,EAAE;QACvB,oCAAoC;QACpC,MAAM,WAAW,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC9C,MAAM,YAAY,GAAG,CAAC,GAAG,CAAC,MAAM,OAAO,CAAC,cAAc,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,kBAAkB;QACrH,MAAM,eAAe,CAAC,YAAY,CAAC,CAAC;QAEpC,IAAI,eAAe;YAAE,MAAM;QAE3B,kBAAkB;QAClB,MAAM,WAAW,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC9C,MAAM,OAAO,CAAC,cAAc,CAAC,WAAW,
EAAE,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,CAAC;QACtE,MAAM,OAAO,CAAC,eAAe,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,CAAC;QACvE,MAAM,gBAAgB,GAAG,MAAM,OAAO,CAAC,eAAe,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,kBAAkB;QAEnH,IAAI,gBAAgB,IAAI,iBAAiB,EAAE;YACzC,IAAI,WAAW,IAAI,OAAO,CAAC,OAAO;gBAAE,MAAM;;gBACrC,WAAW,EAAE,CAAC;SACpB;aAAM;YACL,WAAW,GAAG,CAAC,CAAC;SACjB;QAED,iBAAiB,GAAG,gBAAgB,CAAC;KACtC;AACH,CAAC,CAAA,CAAC;AAOF;;;;;;;;;GASG;AACH,MAAM,kCAAkC,GAAG,CAAuB,IAAO,EAAE,EAAE;IAC3E,MAAM,MAAM,GAAG,iCAAiC,CAAC;IACjD,MAAM,SAAS,GAAG,GAAG,MAAM,iBAAiB,CAAC;IAC7C,+EAA+E;IAC/E,yBAAyB;IACzB,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,SAAA,EAAE,EAAI,CAAC,CAAA,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,kBAAkB;IACvG,MAAM,MAAM,GAAG,GAAG,MAAM,GAAG,WAAW,EAAE,CAAC;IAEzC,wFAAwF;IACxF,MAAM,IAAI;SACP,QAAQ,CACP,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,EAAE,EAAE;QACxB,wFAAwF;QACxF,wCAAwC;QACxC,UAAU,CAAC,SAAS,CAAC,GAAG,GAAG,EAAE;YAC3B,MAAM,KAAK,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,IAAI,GAAG,EAAE,CAAC,CAAC;YACrE,MAAM,QAAQ,GAAG,CAAC,UAAU,CAAC,GAAG,MAAM,MAAM,CAAC,GAAG,UAAU,CAAC,GAAG,MAAM,MAAM,CAAC,IAAI,IAAI,GAAG,EAAE,CAAC,CAAC;YAC1F,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAoD,CAAC;QAC/E,CAAC,CAAC;IACJ,CAAC,EACD,EAAE,MAAM,EAAE,SAAS,EAAE,CACtB;SACA,KAAK,CAAC,qBAAa,CAAC,CAAC;IAExB;;;;;;;;;;;;;OAaG;IACH,MAAM,YAAY,GAAG,CAAO,SAA4B,EAAE,EAAE;QAC1D,MAAM,GAAG,GAAG,MAAM,IAAI;aACnB,QAAQ,CACP,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,EAAE,EAAE;YACrB,IAAI,CAAC,GAAG;gBAAE,OAAO,EAAc,CAAC;YAEhC,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,GAAG,UAAU,CAAC,SAAS,CAAC,EAA6C,CAAC,CAAC,kBAAkB;YAElH,MAAM,QAAQ,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;gBACnC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;oBAClB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,SAAA,EAAE,EAAI,CAAC,CAAA,CAAC;yBAC7C,QAAQ,EAAE;yBACV,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;oBACpB,KAAK,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,CAAC,CAAC;oBACpB,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;oBACvB,OAAO,IAAI,CAAC;iBACb;
gBACD,OAAO,KAAK,CAAC,GAAG,CAAC,EAAE,CAAE,CAAC;YACxB,CAAC,CAAC,CAAC;YACH,OAAO,QAAQ,CAAC;QAClB,CAAC,EACD,EAAE,GAAG,EAAE,SAAS,EAAE,SAAS,EAAE,CAC9B;aACA,KAAK,CAAC,qBAAa,CAAC,CAAC;QAExB,OAAO,GAAG,CAAC;IACb,CAAC,CAAA,CAAC;IAEF;;;;;;;;;;;;OAYG;IACH,MAAM,UAAU,GAAG,CAAO,GAAa,EAAE,EAAE;QACzC,qDAAqD;QACrD,MAAM,SAAS,GAAG,MAAM,IAAI;aACzB,cAAc,CACb,CAAC,EAAE,GAAG,EAAE,SAAS,EAAE,EAAE,EAAE;YACrB,MAAM,EAAE,QAAQ,EAAE,GAAG,UAAU,CAAC,SAAS,CAAC,EAA6C,CAAC,CAAC,kBAAkB;YAE3G,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,CAAC;YACtD,OAAO,GAAG,CAAC;QACb,CAAC,EACD,EAAE,GAAG,EAAE,SAAS,EAAE,CACnB;aACA,KAAK,CAAC,qBAAa,CAAC,CAAC;QACxB,OAAO,SAAS,CAAC;IACnB,CAAC,CAAA,CAAC;IAEF,8BAA8B;IAC9B,MAAM,SAAS,GAAG,CAAO,EAAU,EAAE,EAAE;QACrC,MAAM,SAAS,GAAG,MAAM,UAAU,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,cAAc,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,qBAAa,CAAC,CAAC;QACpF,OAAO,MAAM,CAAC;IAChB,CAAC,CAAA,CAAC;IAEF,OAAO;QACL,YAAY;QACZ,UAAU;QACV,SAAS;KACV,CAAC;AACJ,CAAC,CAAA,CAAC","sourcesContent":["import type { Page, Locator, ElementHandle, JSHandle } from 'playwright';\n\nimport type { MaybePromise } from '../../utils/types';\nimport { logAndRethrow } from '../../utils/error';\nimport { handleIsLocator } from './domUtils';\n\ntype InfiScrollTypes<TContainer, TChild, TChildren, TCB> = {\n container: TContainer;\n child: TChild;\n children: TChildren;\n callbackArg: TCB;\n};\n\ntype AnyInfiScrollTypes = InfiScrollTypes<any, any, any, any>;\n\n/**\n * Common interface for working with browser page despite different environments\n * (e.g. Browser API, Playwright, Puppeteer, Selenium).\n *\n * This common interfaces makes the scraping code more portable between them.\n */\nexport interface PageLib<\n TPage,\n TScroll extends AnyInfiScrollTypes,\n TCtx extends { container: TScroll['container'] }\n> {\n page: TPage;\n\n /** Load entries via infinite scroll and process them as you go. 
*/\n infiniteScroll: (\n /** A container, or selector for it, that includes the dynamically loaded items. */\n container: string | TScroll['container'],\n /**\n * Callback that receives a handle to the new child elements in the DOM\n *\n * Example:\n * ```js\n * // Get text from all new child elements of the infinite-scroller container\n * async (elementsHandle) => {\n * const result = await page.evaluate((els) => els.map((el) => el.textContent), elementsHandle);\n * return result;\n * };\n * ```\n */\n onNewChildren?: (\n /** New elements that were added */\n elsHandle: TScroll['callbackArg'],\n ctx: { page: TPage; container: TScroll['container'] },\n /** Function that, if called, stops the infinite scrolling */\n stop: () => void\n ) => MaybePromise<void>,\n options?: InfiniteScrollLoaderOptions<TScroll, TCtx>\n ) => MaybePromise<void>;\n}\n\ntype PlaywrightInfiScrollTypes = InfiScrollTypes<\n Locator | ElementHandle<Element>,\n JSHandle<Element | null>,\n JSHandle<Element[]>,\n JSHandle<(Element | null)[]>\n>;\ntype PWIST = PlaywrightInfiScrollTypes; // For brevity\n\nexport interface InfiniteScrollLoaderOptions<\n T extends AnyInfiScrollTypes,\n TCtx extends { container: T['container'] } = { container: T['container'] }\n> {\n /** How many times to retry the infinite scroll if new items aren't loading */\n retries?: number;\n /** Override how container children are counted. Default uses `el.childElementCount` */\n childrenCounter?: (containerEl: T['container'], ctx: TCtx) => MaybePromise<number>;\n /** Override how container children are extraced. Default uses `el.children` */\n childrenGetter?: (containerEl: T['container'], ctx: TCtx) => MaybePromise<T['children']>;\n /** Override how container children are scrolled into view. 
Default uses `el.scrollIntoView` */\n scrollIntoView?: (childEl: T['child'], ctx: TCtx) => MaybePromise<void>;\n /** Override whether and how to wait after scrolling into view */\n waitAfterScroll?: (childEl: T['child'], ctx: TCtx) => MaybePromise<void>;\n}\n\nexport type PlaywrightPageLib<T extends Page = Page> = PageLib<\n T,\n PWIST,\n { container: PWIST['container']; page: T }\n>;\n\n/** Implementation of PageLib in Playwright */\nexport const playwrightPageLib = async <T extends Page>(page: T): Promise<PlaywrightPageLib<T>> => {\n const { serializeEls, resolveId, resolveIds } = await _createPlaywrightElementSerializer(page);\n\n const infiniteScroll: PlaywrightPageLib<T>['infiniteScroll'] = async (\n container,\n onNewChildren,\n options\n ) => {\n const childrenCounter = options?.childrenCounter ?? ((h) => (h as ElementHandle).evaluate((el) => el ? (el as Element).childElementCount : 0).catch(logAndRethrow)); // prettier-ignore\n const childrenGetter = options?.childrenGetter ?? ((h) => h.evaluateHandle((el) => el ? (el as Element).children : []).catch(logAndRethrow)); // prettier-ignore\n const scrollIntoView = options?.scrollIntoView ?? ((h) => h.evaluate((el) => { el && el.scrollIntoView() }).catch(logAndRethrow)); // prettier-ignore\n const waitAfterScroll = options?.waitAfterScroll ?? (() => page.waitForLoadState('networkidle')); // prettier-ignore\n\n const handleOrLocator = typeof container === 'string' ? 
page.locator(container) : container;\n if (handleIsLocator(handleOrLocator) && handleOrLocator.page() !== page) {\n throw Error('Locator does not belong to given Page.');\n }\n\n await _infiniteScrollLoader<InfiScrollTypes<PWIST['container'], string, string[], string[]>>(\n handleOrLocator,\n async (childIds, ctx, stopFn) => {\n // Resolve child IDs to handle of child elements on the page\n const elsHandle = await resolveIds(childIds);\n // Then pass them to user\n await onNewChildren?.(elsHandle, { ...ctx, page }, stopFn);\n },\n {\n retries: options?.retries ?? 3,\n childrenCounter: (el, ctx) => childrenCounter(el, { ...ctx, page }),\n childrenGetter: async (handle, ctx) => {\n // First let user tell us how to collect the child elements\n const childElsHandle = await childrenGetter(handle, { ...ctx, page });\n // Then convert them to serializable IDs that we can return to user\n const childIds = await serializeEls(childElsHandle);\n return childIds;\n },\n scrollIntoView: async (childId, ctx) => {\n // First resolve serializable ID to an element on the page\n const childElHandle = await resolveId(childId);\n // Then let user tell us how to scroll into view\n await scrollIntoView(childElHandle, { ...ctx, page });\n },\n waitAfterScroll: async (childId, ctx) => {\n // First resolve serializable ID to an element on the page\n const childElHandle = await resolveId(childId);\n // Then let user tell us how to wait after scroll\n await waitAfterScroll(childElHandle, { ...ctx, page });\n },\n }\n );\n };\n\n return {\n page,\n\n infiniteScroll,\n } satisfies PlaywrightPageLib<T>;\n};\n\n/** Load entries via infinite scroll and process them as you go. 
*/\nconst _infiniteScrollLoader = async <T extends AnyInfiScrollTypes>(\n container: T['container'] | (() => MaybePromise<T['container']>),\n onNewChildren: (\n childEls: T['callbackArg'],\n ctx: { container: T['container'] },\n stop: () => void\n ) => MaybePromise<void>,\n options: Required<InfiniteScrollLoaderOptions<T>>\n) => {\n const containerElGetter = (\n typeof container === 'function' ? container : () => container\n ) as () => MaybePromise<T['container']>;\n\n const processedChildren = new Set();\n let userAskedToStop = false;\n\n const stopFn = () => {\n userAskedToStop = true;\n };\n\n const processChildren = async (childrenEl: T['child'][]) => {\n const newChildren = await childrenEl.filter((el) => !processedChildren.has(el));\n const container = await containerElGetter();\n await onNewChildren?.(newChildren, { container }, stopFn);\n newChildren.forEach((el) => processedChildren.add(el));\n };\n\n const initContainer = await containerElGetter();\n let currChildrenCount = await options.childrenCounter(initContainer, { container: initContainer }); // prettier-ignore\n let currRetries = 0;\n\n while (!userAskedToStop) {\n // Process currently-loaded children\n const containerEl = await containerElGetter();\n const currChildren = [...(await options.childrenGetter(containerEl, { container: containerEl }))]; // prettier-ignore\n await processChildren(currChildren);\n\n if (userAskedToStop) break;\n\n // Load next batch\n const lastChildEl = currChildren.slice(-1)[0];\n await options.scrollIntoView(lastChildEl, { container: containerEl });\n await options.waitAfterScroll(lastChildEl, { container: containerEl });\n const newChildrenCount = await options.childrenCounter(containerEl, { container: containerEl }); // prettier-ignore\n\n if (newChildrenCount <= currChildrenCount) {\n if (currRetries >= options.retries) break;\n else currRetries++;\n } else {\n currRetries = 0;\n }\n\n currChildrenCount = newChildrenCount;\n }\n};\n\ninterface 
PlaywrightElementSerializerHelperResult {\n elMap: Map<Element, string>;\n elMapRev: Map<string, Element>;\n}\n\n/**\n * Helper methods that allow to represent HTML Elements on the Page as string IDs\n *\n * We use this so we can identify which elements have already been processed, and which have not.\n * Normally, the elements are represented via Playwright JSHandle/ElementHandle. However, if two\n * Handles are pointing to the same Element, we're unable to count them as one, because it's two\n * instances that don't have any IDs of the Elemenets. On the other hand, using the string IDs,\n * two different JSHandles will return the same string if they point to the same Element, so we\n * cache the IDs outside of Playwright in Sets or Maps.\n */\nconst _createPlaywrightElementSerializer = async <T extends Page>(page: T) => {\n const prefix = '__domLib_infiniteScrollLoader__';\n const helperKey = `${prefix}helpers_elIdMap`;\n // There may be multiple instances of this cache on the Page, so we distinguish\n // them with operationId.\n const operationId = Math.floor(Math.random() * 10 ** 9).toString().padStart(9, '0'); // prettier-ignore\n const mapKey = `${prefix}${operationId}`;\n\n // Prepare a function in-page that creates the cache to store and retrieve the elements.\n await page\n .evaluate(\n ({ mapKey, helperKey }) => {\n // Create mapping between IDs and HTMLElements, so we can pass the IDs as a serializable\n // reference to the in-page DOM elements\n globalThis[helperKey] = () => {\n const elMap = (globalThis[mapKey] = globalThis[mapKey] || new Map());\n const elMapRev = (globalThis[`${mapKey}_rev`] = globalThis[`${mapKey}_rev`] || new Map());\n return { elMap, elMapRev } satisfies PlaywrightElementSerializerHelperResult;\n };\n },\n { mapKey, helperKey }\n )\n .catch(logAndRethrow);\n\n /**\n * Given a Playwright JSHandle holding an array of Elements, cache the Elements\n * in the Page and generate serializable IDs that can be used to refer to these\n * 
elements outside of Playwright.\n *\n * This is the opposite of `_resolveIds`.\n *\n * We use this so we can identify which elements have already been processed, and which have not.\n * Normally, the elements are represented via Playwright JSHandle/ElementHandle. However, if two\n * Handles are pointing to the same Element, we're unable to count them as one, because it's two\n * instances that don't have any IDs of the Elemenets. On the other hand, using the string IDs,\n * two different JSHandles will return the same string if they point to the same Element, so we\n * cache the IDs outside of Playwright in Sets or Maps.\n */\n const serializeEls = async (elsHandle: PWIST['children']) => {\n const ids = await page\n .evaluate(\n ({ els, helperKey }) => {\n if (!els) return [] as string[];\n\n const { elMap, elMapRev } = globalThis[helperKey]() as PlaywrightElementSerializerHelperResult; // prettier-ignore\n\n const innerIds = [...els].map((el) => {\n if (!elMap.has(el)) {\n const elId = Math.floor(Math.random() * 10 ** 9)\n .toString()\n .padStart(9, '0');\n elMap.set(el, elId);\n elMapRev.set(elId, el);\n return elId;\n }\n return elMap.get(el)!;\n });\n return innerIds;\n },\n { els: elsHandle, helperKey }\n )\n .catch(logAndRethrow);\n\n return ids;\n };\n\n /**\n * Given an array of IDs, resolve them to a Playwright JSHandle holding an array of corresponding\n * Elements cached in Page's global context.\n *\n * This is the opposite of `_serializeEls`.\n *\n * We use this so we can identify which elements have already been processed, and which have not.\n * Normally, the elements are represented via Playwright JSHandle/ElementHandle. However, if two\n * Handles are pointing to the same Element, we're unable to count them as one, because it's two\n * instances that don't have any IDs of the Elemenets. 
On the other hand, using the string IDs,\n * two different JSHandles will return the same string if they point to the same Element, so we\n * cache the IDs outside of Playwright in Sets or Maps.\n */\n const resolveIds = async (ids: string[]) => {\n // Resolve serializable IDs to an element on the page\n const elsHandle = await page\n .evaluateHandle(\n ({ ids, helperKey }) => {\n const { elMapRev } = globalThis[helperKey]() as PlaywrightElementSerializerHelperResult; // prettier-ignore\n\n const els = ids.map((id) => elMapRev.get(id) || null);\n return els;\n },\n { ids, helperKey }\n )\n .catch(logAndRethrow);\n return elsHandle;\n };\n\n /** See {@link resolveIds}. */\n const resolveId = async (id: string) => {\n const elsHandle = await resolveIds([id]);\n const handle = await elsHandle.evaluateHandle((ids) => ids[0]).catch(logAndRethrow);\n return handle;\n };\n\n return {\n serializeEls,\n resolveIds,\n resolveId,\n };\n};\n"]}
@@ -0,0 +1,78 @@
1
+ import type { MaybePromise } from '../../utils/types';
2
+ /** Logger with `debug`, `info`, `warning` and `error` methods; each takes a message and optional extra data and returns nothing. */ export interface ListingLogger {
3
+ debug: (msg: string, data?: any) => void;
4
+ info: (msg: string, data?: any) => void;
5
+ warning: (msg: string, data?: any) => void;
6
+ error: (msg: string, data?: any) => void;
7
+ }
8
+ /** A single filter on a listing page: a name, an optional `disabled` flag, and state callbacks that may each be sync or async. NOTE(review): the exact contract of each callback is not visible in this declaration file - confirm against the implementation. */ export interface ListingPageFilter {
9
+ name: string;
10
+ /** Optional flag; presumably a disabled filter is skipped - TODO confirm. */ disabled?: boolean;
11
+ /** Resolves to a boolean; meaning of the result is not stated here - TODO confirm. */ initState: () => MaybePromise<boolean>;
12
+ resetState: () => MaybePromise<void>;
13
+ nextState: () => MaybePromise<void>;
14
+ /** Resolves to a boolean; presumably whether `nextState` can still advance this filter - TODO confirm. */ hasNextState: () => MaybePromise<boolean>;
15
+ /** Resolves to a boolean; presumably whether this filter currently holds a state - TODO confirm. */ hasState: () => MaybePromise<boolean>;
16
+ loadState: () => MaybePromise<void>;
17
+ }
18
+ /** Options for setting up listing page filters: the scraper context, the optional filter list, optional hooks, and a logger. */ export interface ListingFiltersSetupOptions<Ctx extends object, UrlType> {
19
+ context: ListingPageScraperContext<Ctx, UrlType>;
20
+ /** Filters to set up. Optional. */ filters?: ListingPageFilter[];
21
+ /** Optional predicate that receives the context, one filter and the full filter list, and resolves to a boolean. Presumably decides whether that filter is applied - TODO confirm. */ shouldApplyFilter?: (context: ListingPageScraperContext<Ctx, UrlType>, filter: ListingPageFilter, filters: ListingPageFilter[]) => MaybePromise<boolean>;
22
+ /** Optional hook receiving the context; presumably invoked when filters are reset - TODO confirm. */ onResetFilters?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;
23
+ /** Optional hook receiving the context; presumably invoked once filters have been loaded - TODO confirm. */ onFiltersLoaded?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;
24
+ log: ListingLogger;
25
+ }
26
+ /** Context object that the listing scraper's hooks and callbacks receive as their first argument. */ export interface ListingPageScraperContext<Ctx extends object, UrlType> {
27
+ /** Value of the caller-defined `Ctx` type. */ context: Ctx;
28
+ log: ListingLogger;
29
+ /** A single start URL; presumably the one currently being scraped - TODO confirm. */ startUrl: UrlType;
30
+ filters: ListingPageFilter[];
31
+ /** Use this if you need to load filters again (e.g. after reloading the page manually) */
32
+ loadFilterState: () => MaybePromise<void>;
33
+ /** Call this function from any callback to stop scraping */
34
+ abort: () => void;
35
+ }
36
+ /** Options accepted by `scrapeListingEntries`: everything from `ListingFiltersSetupOptions` except `context`, plus the scraper's own settings and hooks. */ export interface ListingPageScraperOptions<Ctx extends object, UrlType> extends Omit<ListingFiltersSetupOptions<Ctx, UrlType>, 'context'> {
37
+ /** Value of the caller-defined `Ctx` type. */ context: Ctx;
38
+ startUrls: UrlType[];
39
+ /** Optional flag; presumably only counts listings instead of extracting entries - TODO confirm. */ listingCountOnly?: boolean;
40
+ /** Get ID of the current page in the pagination, so it can be logged */
41
+ pageId?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<string>;
42
+ log: ListingLogger;
43
+ /** Optional hook that receives the context and a URL; presumably performs the navigation to that URL - TODO confirm. */ onNavigate?: (context: ListingPageScraperContext<Ctx, UrlType>, url: UrlType) => MaybePromise<void>;
44
+ /**
45
+ * Hook triggered after navigating to the url using Page.goto().
46
+ *
47
+ * One use of this hook is to conditionally disable/enable filters based on the page content.
48
+ **/
49
+ onAfterNavigation?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;
50
+ /** How many times loading is retried after the filters fail to load. Defaults to 3 */
51
+ loadFiltersRetries?: number;
52
+ /**
53
+ * Hook triggered after a failed attempt at loading listings page filters.
54
+ *
55
+ * One use of this hook is to reload the page on a failed attempt in case something didn't load correctly.
56
+ **/
57
+ onLoadFiltersError?: (context: ListingPageScraperContext<Ctx, UrlType>, error: any, retryIndex: number) => MaybePromise<void>;
58
+ /** Main logic to extract entries from a page */
59
+ extractEntries: (context: ListingPageScraperContext<Ctx, UrlType>, retryIndex: number) => MaybePromise<UrlType[]>;
60
+ /** How many times extraction is retried after failing to scrape entries from a listing. Defaults to 3 */
61
+ extractEntriesRetries?: number;
62
+ /**
63
+ * Hook triggered after a failed attempt at scraping entries from a listing.
64
+ *
65
+ * One use of this hook is to reload the page on a failed attempt in case something didn't load correctly.
66
+ **/
67
+ onExtractEntriesError?: (context: ListingPageScraperContext<Ctx, UrlType>, error: any, retryIndex: number) => MaybePromise<void>;
68
+ /** Optional hook that receives the context and the extracted entries (or `null`). */ onExtractEntriesDone?: (context: ListingPageScraperContext<Ctx, UrlType>, entries: UrlType[] | null) => MaybePromise<void>;
69
+ /**
70
+ * If the onGoToNextPage hook is defined, it will be called after each page. To indicate that there are no more
71
+ * pages left, throw an error.
72
+ **/
73
+ onGoToNextPage?: (context: ListingPageScraperContext<Ctx, UrlType>, entries: UrlType[] | null) => MaybePromise<void>;
74
+ /** How long to wait after we've navigated to the next page and before we start extracting? (Unit is not stated here; presumably milliseconds - TODO confirm.) */
75
+ nextPageWait?: number;
76
+ }
77
+ /** Get entries from a listing page (e.g. URLs to profiles that should be scraped later). Takes the scraper options and returns a promise of an array of `UrlType` entries. */
78
+ export declare const scrapeListingEntries: <Ctx extends object, UrlType>(options: ListingPageScraperOptions<Ctx, UrlType>) => Promise<UrlType[]>;
@@ -0,0 +1,242 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.scrapeListingEntries = void 0;
13
+ const lodash_1 = require("lodash");
14
+ const async_1 = require("../../utils/async");
15
+ const url_1 = require("../../utils/url");
16
+ /**
17
+ * Given configuration for listing page filters, set up functions to
18
+ * navigate through the different states of filters, to allow to paginate
19
+ * through all states.
20
+ */
21
+ const setupListingFilters = ({ context, filters = [], shouldApplyFilter, onResetFilters, onFiltersLoaded, log, }) => {
22
+ let filtersStack = filters;
23
+ const getNextFilterStateChangeIndex = () => __awaiter(void 0, void 0, void 0, function* () {
24
+ const hasNextStates = yield (0, async_1.serialAsyncMap)(filtersStack, (filter) => filter.hasNextState());
25
+ return (0, lodash_1.findLastIndex)(hasNextStates, (x) => x);
26
+ });
27
+ const hasState = () => __awaiter(void 0, void 0, void 0, function* () {
28
+ const hasStates = yield (0, async_1.serialAsyncMap)(filtersStack, (filter) => filter.hasState());
29
+ return hasStates.some(Boolean);
30
+ });
31
+ const hasNextState = () => __awaiter(void 0, void 0, void 0, function* () {
32
+ const nextFilterStateChangeIndex = yield getNextFilterStateChangeIndex();
33
+ return nextFilterStateChangeIndex > -1;
34
+ });
35
+ const nextState = () => __awaiter(void 0, void 0, void 0, function* () {
36
+ // Imagine we have 4 filters, each has 3 states (eg 3 options to select from)
37
+ // We start with all filters in the first state:
38
+ // State 1: F1(1), F2(1), F3(1), F4(1)
39
+ // As we progress, we increment it akin to numbers:
40
+ // State 2: F1(1), F2(1), F3(1), F4(2)
41
+ // State 3: F1(1), F2(1), F3(1), F4(3)
42
+ // State 4: F1(1), F2(1), F3(2), F4(1)
43
+ // All the way to the last state:
44
+ // State n: F1(3), F2(3), F3(3), F4(3)
45
+ //
46
+ // When we want to move to the next state, we identify the RIGHT-most filter
47
+ // whose state can be incremented (in this case we select F2):
48
+ // YES YES NO NO
49
+ // State x: F1(1), F2(2), F3(3), F4(3)
50
+ //
51
+ // When we increment a filter state, all the other filters to the RIGHT
52
+ // will be reset:
53
+ // State x: F1(1), F2(2), F3(3), F4(3)
54
+ // State x+1: F1(1), F2(3), F3(1), F4(1)
55
+ const initStates = yield (0, async_1.serialAsyncMap)(filtersStack, (filter) => filter.initState());
56
+ if (initStates.some(Boolean))
57
+ return log.info('Initialised filters');
58
+ const nextFilterStateChangeIndex = yield getNextFilterStateChangeIndex();
59
+ if (nextFilterStateChangeIndex === -1)
60
+ throw Error('Cannot select next filter state - reached end of list');
61
+ const filterToNextState = filtersStack[nextFilterStateChangeIndex];
62
+ const filtersToReset = filtersStack.slice(nextFilterStateChangeIndex + 1);
63
+ log.info('Setting filters to next state');
64
+ yield filterToNextState.nextState();
65
+ for (const filter of filtersToReset) {
66
+ yield filter.resetState();
67
+ yield filter.nextState();
68
+ }
69
+ });
70
+ /** Load current filter state in the webpage */
71
+ const loadState = () => __awaiter(void 0, void 0, void 0, function* () {
72
+ yield resetState();
73
+ // Load filters one by one, and only if needed
74
+ filtersStack = [];
75
+ for (const filter of filters) {
76
+ const shouldUseFilter = shouldApplyFilter
77
+ ? yield shouldApplyFilter(context, filter, filters)
78
+ : true;
79
+ if (!shouldUseFilter) {
80
+ log.info(`Not applying filter "${filter.name}" or further filters`);
81
+ break;
82
+ }
83
+ if (!filter.disabled) {
84
+ log.info(`Applying filter "${filter.name}"`);
85
+ yield filter.loadState();
86
+ }
87
+ else {
88
+ log.info(`Filter "${filter.name}" recognised but not applied because it is disabled`);
89
+ }
90
+ filtersStack.push(filter);
91
+ }
92
+ log.info(`Done loading filters`);
93
+ yield (onFiltersLoaded === null || onFiltersLoaded === void 0 ? void 0 : onFiltersLoaded(context));
94
+ });
95
+ /** Reset filter state */
96
+ const resetState = () => __awaiter(void 0, void 0, void 0, function* () {
97
+ log.info(`Resetting filter state`);
98
+ yield (onResetFilters === null || onResetFilters === void 0 ? void 0 : onResetFilters(context));
99
+ filtersStack = filters;
100
+ log.info(`Resetting filter state done`);
101
+ });
102
+ return {
103
+ loadState,
104
+ nextState,
105
+ hasNextState,
106
+ hasState,
107
+ };
108
+ };
109
+ /** Get entries from a listing page (eg URLs to profiles that should be scraped later) */
110
+ const scrapeListingEntries = (options) => __awaiter(void 0, void 0, void 0, function* () {
111
+ const { context, startUrls, listingCountOnly = false, log, pageId, onNavigate, onAfterNavigation, filters = [], shouldApplyFilter, loadFiltersRetries = 3, onLoadFiltersError = (_, err) => console.error(err), onFiltersLoaded, onResetFilters, extractEntries, extractEntriesRetries = 3, onExtractEntriesError = (_, err) => console.error(err), onExtractEntriesDone, onGoToNextPage, nextPageWait = 500, } = options;
112
+ /** Collection of ALL urls across all pages and startUrls */
113
+ const links = [];
114
+ yield (0, async_1.serialAsyncMap)(startUrls, (startUrl, index) => __awaiter(void 0, void 0, void 0, function* () {
115
+ if (listingCountOnly && index > 0)
116
+ return;
117
+ const logId = `${startUrl} (${index + 1}/${startUrls.length})`;
118
+ let userAskedToStop = false;
119
+ const abort = () => { userAskedToStop = true; }; // prettier-ignore
120
+ // Prepare context shared across all hooks
121
+ let filterObj = null;
122
+ const genCtxArg = () => {
123
+ var _a;
124
+ return ({
125
+ context,
126
+ log,
127
+ startUrl,
128
+ filters,
129
+ loadFilterState: (_a = filterObj === null || filterObj === void 0 ? void 0 : filterObj.loadState) !== null && _a !== void 0 ? _a : (() => { }),
130
+ abort,
131
+ });
132
+ };
133
+ log.debug(`Validating URL ${logId}`);
134
+ (0, url_1.validateUrl)(startUrl);
135
+ log.info(`Navigating URL ${logId}`);
136
+ yield (onNavigate === null || onNavigate === void 0 ? void 0 : onNavigate(genCtxArg(), startUrl));
137
+ log.debug(`Done navigating to URL ${logId}`);
138
+ filterObj = setupListingFilters({
139
+ context: genCtxArg(),
140
+ filters,
141
+ shouldApplyFilter,
142
+ onFiltersLoaded,
143
+ onResetFilters,
144
+ log,
145
+ });
146
+ log.debug(`Calling onAfterNavigation callback. URL ${logId}`); // prettier-ignore
147
+ yield (onAfterNavigation === null || onAfterNavigation === void 0 ? void 0 : onAfterNavigation(genCtxArg()));
148
+ log.debug(`Done calling onAfterNavigation callback. URL ${logId})`); // prettier-ignore
149
+ const isUsingFilters = filters.some((filter) => !filter.disabled);
150
+ let hasFilterStatesToProcess = true;
151
+ while (hasFilterStatesToProcess && !userAskedToStop) {
152
+ // Filter loop
153
+ // Load filters before we start paginating
154
+ log.info(`Setting up filters for URL ${logId}`);
155
+ yield (0, async_1.retryAsync)(() => __awaiter(void 0, void 0, void 0, function* () {
156
+ if (!filterObj)
157
+ throw Error(`Filter controller is missing. This should never happen. URL ${logId}`); // prettier-ignore
158
+ const filterHasState = yield filterObj.hasState();
159
+ if (!isUsingFilters || !filterHasState) {
160
+ log.info(`Not loading filters for URL ${logId}`);
161
+ return;
162
+ }
163
+ log.debug(`Loading filters for URL ${logId}`);
164
+ yield filterObj.nextState();
165
+ yield filterObj.loadState();
166
+ log.debug(`Done loading filters for URL ${logId}`);
167
+ }), {
168
+ maxRetries: loadFiltersRetries,
169
+ onError: (err, retryIndex) => onLoadFiltersError(genCtxArg(), err, retryIndex),
170
+ });
171
+ let nextPageAvailable = true;
172
+ while (nextPageAvailable && !userAskedToStop) {
173
+ // Pagination loop
174
+ let currPageId = 'next page';
175
+ if (pageId) {
176
+ log.debug(`Loading pageId for URL ${logId}`);
177
+ currPageId = yield pageId(genCtxArg());
178
+ log.debug(`Done loading pageId for URL ${logId}`);
179
+ }
180
+ const pageLogId = `${logId} (${currPageId})`;
181
+ // Extract page links
182
+ log.info(`Extracting links from page ${pageLogId}`);
183
+ const { result } = yield (0, async_1.retryAsync)((retryIndex) => __awaiter(void 0, void 0, void 0, function* () { return extractEntries(genCtxArg(), retryIndex); }), {
184
+ maxRetries: extractEntriesRetries,
185
+ onError: (err, retryIndex) => onExtractEntriesError(genCtxArg(), err, retryIndex),
186
+ });
187
+ log.debug(`Done extracting links from page ${pageLogId}`);
188
+ const pageLinks = result !== null && result !== void 0 ? result : [];
189
+ links.push(...pageLinks);
190
+ log.info(`Found ${pageLinks.length} links on page ${pageLogId}`);
191
+ // Leave after printing the count or on abort
192
+ if (listingCountOnly || userAskedToStop) {
193
+ nextPageAvailable = false;
194
+ if (listingCountOnly)
195
+ log.info(`Debugging mode. Entries are not scraped. Leaving now. URL ${pageLogId}`); // prettier-ignore
196
+ else if (userAskedToStop)
197
+ log.info(`Aborting. URL ${pageLogId}`);
198
+ continue;
199
+ }
200
+ log.debug(`Calling onExtractEntriesDone callback. URL ${pageLogId}`); // prettier-ignore
201
+ yield (onExtractEntriesDone === null || onExtractEntriesDone === void 0 ? void 0 : onExtractEntriesDone(genCtxArg(), pageLinks));
202
+ log.debug(`Done calling onExtractEntriesDone callback. URL ${pageLogId}`); // prettier-ignore
203
+ if (onGoToNextPage && !userAskedToStop) {
204
+ // If goToNextPage hook is defined, this will be called after each page, until it errors
205
+ try {
206
+ log.info(`Navigating to next page from URL ${pageLogId}`);
207
+ yield onGoToNextPage(genCtxArg(), pageLinks);
208
+ log.debug(`Done navigating to next page from URL ${pageLogId}`); // prettier-ignore
209
+ }
210
+ catch (e) {
211
+ log.info(`Failed navigating to next page from URL ${pageLogId}`); // prettier-ignore
212
+ log.error(e.toString());
213
+ nextPageAvailable = false;
214
+ }
215
+ }
216
+ else {
217
+ if (userAskedToStop)
218
+ log.info(`Aborting. URL ${pageLogId}`);
219
+ nextPageAvailable = false;
220
+ }
221
+ // Wait before we start scraping the next page
222
+ yield new Promise((res) => setTimeout(res, nextPageWait));
223
+ }
224
+ // Break out if we're not using filters or we've gone through them all
225
+ log.debug(`Checking if there are more filter states available for URL ${logId}`);
226
+ hasFilterStatesToProcess = isUsingFilters && (yield filterObj.hasNextState());
227
+ log.debug(`Done checking if there are more filter states available for URL ${logId}`);
228
+ if (hasFilterStatesToProcess) {
229
+ if (!userAskedToStop)
230
+ log.info(`Will repeat scraping this URL with different filter setting. URL ${logId}`); // prettier-ignore
231
+ else
232
+ log.info(`There are unprocessed filter setting remaining for this URL, but stopping due to abort. URL ${logId}`); // prettier-ignore
233
+ }
234
+ else
235
+ log.info(`No filter setting remain for scraping this URL. URL ${logId}`); // prettier-ignore
236
+ }
237
+ log.info(`Finished URL ${logId}`);
238
+ }));
239
+ return links;
240
+ });
241
+ exports.scrapeListingEntries = scrapeListingEntries;
242
+ //# sourceMappingURL=scrapeListing.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scrapeListing.js","sourceRoot":"","sources":["../../../../src/lib/actions/scrapeListing.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,mCAAuC;AAEvC,6CAA+D;AAC/D,yCAA8C;AA8G9C;;;;GAIG;AACH,MAAM,mBAAmB,GAAG,CAA8B,EACxD,OAAO,EACP,OAAO,GAAG,EAAE,EACZ,iBAAiB,EACjB,cAAc,EACd,eAAe,EACf,GAAG,GACsC,EAA2B,EAAE;IACtE,IAAI,YAAY,GAAwB,OAAO,CAAC;IAEhD,MAAM,6BAA6B,GAAG,GAAS,EAAE;QAC/C,MAAM,aAAa,GAAG,MAAM,IAAA,sBAAc,EAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,YAAY,EAAE,CAAC,CAAC;QAC5F,OAAO,IAAA,sBAAa,EAAC,aAAa,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;IAChD,CAAC,CAAA,CAAC;IAEF,MAAM,QAAQ,GAAG,GAAS,EAAE;QAC1B,MAAM,SAAS,GAAG,MAAM,IAAA,sBAAc,EAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpF,OAAO,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC,CAAA,CAAC;IAEF,MAAM,YAAY,GAAG,GAAS,EAAE;QAC9B,MAAM,0BAA0B,GAAG,MAAM,6BAA6B,EAAE,CAAC;QACzE,OAAO,0BAA0B,GAAG,CAAC,CAAC,CAAC;IACzC,CAAC,CAAA,CAAC;IAEF,MAAM,SAAS,GAAG,GAAS,EAAE;QAC3B,6EAA6E;QAC7E,gDAAgD;QAChD,wCAAwC;QACxC,mDAAmD;QACnD,wCAAwC;QACxC,wCAAwC;QACxC,wCAAwC;QACxC,iCAAiC;QACjC,wCAAwC;QACxC,EAAE;QACF,uEAAuE;QACvE,8DAA8D;QAC9D,sCAAsC;QACtC,wCAAwC;QACxC,EAAE;QACF,sEAAsE;QACtE,iBAAiB;QACjB,0CAA0C;QAC1C,0CAA0C;QAE1C,MAAM,UAAU,GAAG,MAAM,IAAA,sBAAc,EAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;QACtF,IAAI,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC;YAAE,OAAO,GAAG,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAErE,MAAM,0BAA0B,GAAG,MAAM,6BAA6B,EAAE,CAAC;QACzE,IAAI,0BAA0B,KAAK,CAAC,CAAC;YACnC,MAAM,KAAK,CAAC,uDAAuD,CAAC,CAAC;QAEvE,MAAM,iBAAiB,GAAG,YAAY,CAAC,0BAA0B,CAAC,CAAC;QACnE,MAAM,cAAc,GAAG,YAAY,CAAC,KAAK,CAAC,0BAA0B,GAAG,CAAC,CAAC,CAAC;QAE1E,GAAG,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;QAC1C,MAAM,iBAAiB,CAAC,SAAS,EAAE,CAAC;QACpC,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE;YACnC,MAAM,MAAM,CAAC,UAAU,EAAE,CAAC;YAC1B,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC;SAC1B;IACH,CAAC,CAAA,CAAC;IAEF,+CAA+C;IAC/C,MAAM,SAAS,GAAG,GAAS,EAAE;QAC3B,MAAM,UAAU,EAAE,CAAC;QAEnB,8CAA8C;QAC9C,YAAY,GAAG,EAAE,CAAC;QAClB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE;YAC5B,MAAM
,eAAe,GAAG,iBAAiB;gBACvC,CAAC,CAAC,MAAM,iBAAiB,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC;gBACnD,CAAC,CAAC,IAAI,CAAC;YACT,IAAI,CAAC,eAAe,EAAE;gBACpB,GAAG,CAAC,IAAI,CAAC,wBAAwB,MAAM,CAAC,IAAI,sBAAsB,CAAC,CAAC;gBACpE,MAAM;aACP;YAED,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE;gBACpB,GAAG,CAAC,IAAI,CAAC,oBAAoB,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;gBAC7C,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC;aAC1B;iBAAM;gBACL,GAAG,CAAC,IAAI,CAAC,WAAW,MAAM,CAAC,IAAI,qDAAqD,CAAC,CAAC;aACvF;YAED,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;SAC3B;QAED,GAAG,CAAC,IAAI,CAAC,sBAAsB,CAAC,CAAC;QACjC,MAAM,CAAA,eAAe,aAAf,eAAe,uBAAf,eAAe,CAAG,OAAO,CAAC,CAAA,CAAC;IACnC,CAAC,CAAA,CAAC;IAEF,yBAAyB;IACzB,MAAM,UAAU,GAAG,GAAS,EAAE;QAC5B,GAAG,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACnC,MAAM,CAAA,cAAc,aAAd,cAAc,uBAAd,cAAc,CAAG,OAAO,CAAC,CAAA,CAAC;QAChC,YAAY,GAAG,OAAO,CAAC;QACvB,GAAG,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;IAC1C,CAAC,CAAA,CAAC;IAEF,OAAO;QACL,SAAS;QACT,SAAS;QACT,YAAY;QACZ,QAAQ;KACT,CAAC;AACJ,CAAC,CAAC;AAEF,yFAAyF;AAClF,MAAM,oBAAoB,GAAG,CAClC,OAAgD,EAChD,EAAE;IACF,MAAM,EACJ,OAAO,EACP,SAAS,EACT,gBAAgB,GAAG,KAAK,EACxB,GAAG,EACH,MAAM,EACN,UAAU,EACV,iBAAiB,EAEjB,OAAO,GAAG,EAAE,EACZ,iBAAiB,EACjB,kBAAkB,GAAG,CAAC,EACtB,kBAAkB,GAAG,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,EACnD,eAAe,EACf,cAAc,EAEd,cAAc,EACd,qBAAqB,GAAG,CAAC,EACzB,qBAAqB,GAAG,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,EACtD,oBAAoB,EAEpB,cAAc,EACd,YAAY,GAAG,GAAG,GACnB,GAAG,OAAO,CAAC;IAEZ,4DAA4D;IAC5D,MAAM,KAAK,GAAc,EAAE,CAAC;IAE5B,MAAM,IAAA,sBAAc,EAAC,SAAS,EAAE,CAAO,QAAQ,EAAE,KAAK,EAAE,EAAE;QACxD,IAAI,gBAAgB,IAAI,KAAK,GAAG,CAAC;YAAE,OAAO;QAE1C,MAAM,KAAK,GAAG,GAAG,QAAQ,KAAK,KAAK,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC;QAE/D,IAAI,eAAe,GAAG,KAAK,CAAC;QAC5B,MAAM,KAAK,GAAG,GAAG,EAAE,GAAG,eAAe,GAAG,IAAI,CAAA,CAAC,CAAC,CAAC,CAAC,kBAAkB;QAElE,0CAA0C;QAC1C,IAAI,SAAS,GAAmC,IAAI,CAAC;QACrD,MAAM,SAAS,GAAG,GAA4C,EAAE;;YAAC,OAAA,CAAC;gBAChE,OAAO;gBACP,GAAG;gBACH,QAAQ;gBACR,OAAO;gBACP,eAAe,EAAE,MAAA,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,SAAS,mCAAI,CAAC,GAAG,EAAE,GAAE,CAAC,
CAAC;gBACnD,KAAK;aACN,CAAC,CAAA;SAAA,CAAC;QAEH,GAAG,CAAC,KAAK,CAAC,kBAAkB,KAAK,EAAE,CAAC,CAAC;QACrC,IAAA,iBAAW,EAAC,QAAkB,CAAC,CAAC;QAChC,GAAG,CAAC,IAAI,CAAC,kBAAkB,KAAK,EAAE,CAAC,CAAC;QACpC,MAAM,CAAA,UAAU,aAAV,UAAU,uBAAV,UAAU,CAAG,SAAS,EAAE,EAAE,QAAQ,CAAC,CAAA,CAAC;QAC1C,GAAG,CAAC,KAAK,CAAC,0BAA0B,KAAK,EAAE,CAAC,CAAC;QAE7C,SAAS,GAAG,mBAAmB,CAAC;YAC9B,OAAO,EAAE,SAAS,EAAE;YACpB,OAAO;YACP,iBAAiB;YACjB,eAAe;YACf,cAAc;YACd,GAAG;SACJ,CAAC,CAAC;QAEH,GAAG,CAAC,KAAK,CAAC,2CAA2C,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;QACjF,MAAM,CAAA,iBAAiB,aAAjB,iBAAiB,uBAAjB,iBAAiB,CAAG,SAAS,EAAE,CAAC,CAAA,CAAC;QACvC,GAAG,CAAC,KAAK,CAAC,gDAAgD,KAAK,GAAG,CAAC,CAAC,CAAC,kBAAkB;QAEvF,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAElE,IAAI,wBAAwB,GAAG,IAAI,CAAC;QACpC,OAAO,wBAAwB,IAAI,CAAC,eAAe,EAAE;YACnD,cAAc;YACd,0CAA0C;YAC1C,GAAG,CAAC,IAAI,CAAC,8BAA8B,KAAK,EAAE,CAAC,CAAC;YAChD,MAAM,IAAA,kBAAU,EACd,GAAS,EAAE;gBACT,IAAI,CAAC,SAAS;oBAAE,MAAM,KAAK,CAAC,+DAA+D,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBAEvH,MAAM,cAAc,GAAG,MAAM,SAAS,CAAC,QAAQ,EAAE,CAAC;gBAClD,IAAI,CAAC,cAAc,IAAI,CAAC,cAAc,EAAE;oBACtC,GAAG,CAAC,IAAI,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;oBACjD,OAAO;iBACR;gBAED,GAAG,CAAC,KAAK,CAAC,2BAA2B,KAAK,EAAE,CAAC,CAAC;gBAC9C,MAAM,SAAS,CAAC,SAAS,EAAE,CAAC;gBAC5B,MAAM,SAAS,CAAC,SAAS,EAAE,CAAC;gBAC5B,GAAG,CAAC,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC;YACrD,CAAC,CAAA,EACD;gBACE,UAAU,EAAE,kBAAkB;gBAC9B,OAAO,EAAE,CAAC,GAAG,EAAE,UAAU,EAAE,EAAE,CAAC,kBAAkB,CAAC,SAAS,EAAE,EAAE,GAAG,EAAE,UAAU,CAAC;aAC/E,CACF,CAAC;YAEF,IAAI,iBAAiB,GAAG,IAAI,CAAC;YAC7B,OAAO,iBAAiB,IAAI,CAAC,eAAe,EAAE;gBAC5C,kBAAkB;gBAClB,IAAI,UAAU,GAAG,WAAW,CAAC;gBAC7B,IAAI,MAAM,EAAE;oBACV,GAAG,CAAC,KAAK,CAAC,0BAA0B,KAAK,EAAE,CAAC,CAAC;oBAC7C,UAAU,GAAG,MAAM,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;oBACvC,GAAG,CAAC,KAAK,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;iBACnD;gBACD,MAAM,SAAS,GAAG,GAAG,KAAK,KAAK,UAAU,GAAG,CAAC;gBAE7C,qBAAqB;gBACrB,GAAG,CAAC,IAAI,CAAC,8BAA8B,SAAS,EAAE,CAAC,CAAC;gBACpD,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAA,kBAAU,EACjC,CAAO
,UAAU,EAAE,EAAE,kDAAC,OAAA,cAAc,CAAC,SAAS,EAAE,EAAE,UAAU,CAAC,CAAA,GAAA,EAC7D;oBACE,UAAU,EAAE,qBAAqB;oBACjC,OAAO,EAAE,CAAC,GAAG,EAAE,UAAU,EAAE,EAAE,CAAC,qBAAqB,CAAC,SAAS,EAAE,EAAE,GAAG,EAAE,UAAU,CAAC;iBAClF,CACF,CAAC;gBACF,GAAG,CAAC,KAAK,CAAC,mCAAmC,SAAS,EAAE,CAAC,CAAC;gBAE1D,MAAM,SAAS,GAAG,MAAM,aAAN,MAAM,cAAN,MAAM,GAAI,EAAE,CAAC;gBAC/B,KAAK,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;gBACzB,GAAG,CAAC,IAAI,CAAC,SAAS,SAAS,CAAC,MAAM,kBAAkB,SAAS,EAAE,CAAC,CAAC;gBAEjE,6CAA6C;gBAC7C,IAAI,gBAAgB,IAAI,eAAe,EAAE;oBACvC,iBAAiB,GAAG,KAAK,CAAC;oBAC1B,IAAI,gBAAgB;wBAAE,GAAG,CAAC,IAAI,CAAC,6DAA6D,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;yBACvH,IAAI,eAAe;wBAAE,GAAG,CAAC,IAAI,CAAC,iBAAiB,SAAS,EAAE,CAAC,CAAC;oBACjE,SAAS;iBACV;gBAED,GAAG,CAAC,KAAK,CAAC,8CAA8C,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBACxF,MAAM,CAAA,oBAAoB,aAApB,oBAAoB,uBAApB,oBAAoB,CAAG,SAAS,EAAE,EAAE,SAAS,CAAC,CAAA,CAAC;gBACrD,GAAG,CAAC,KAAK,CAAC,mDAAmD,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;gBAE7F,IAAI,cAAc,IAAI,CAAC,eAAe,EAAE;oBACtC,wFAAwF;oBACxF,IAAI;wBACF,GAAG,CAAC,IAAI,CAAC,oCAAoC,SAAS,EAAE,CAAC,CAAC;wBAC1D,MAAM,cAAc,CAAC,SAAS,EAAE,EAAE,SAAS,CAAC,CAAC;wBAC7C,GAAG,CAAC,KAAK,CAAC,yCAAyC,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;qBACpF;oBAAC,OAAO,CAAC,EAAE;wBACV,GAAG,CAAC,IAAI,CAAC,2CAA2C,SAAS,EAAE,CAAC,CAAC,CAAC,kBAAkB;wBACpF,GAAG,CAAC,KAAK,CAAE,CAAW,CAAC,QAAQ,EAAE,CAAC,CAAC;wBACnC,iBAAiB,GAAG,KAAK,CAAC;qBAC3B;iBACF;qBAAM;oBACL,IAAI,eAAe;wBAAE,GAAG,CAAC,IAAI,CAAC,iBAAiB,SAAS,EAAE,CAAC,CAAC;oBAC5D,iBAAiB,GAAG,KAAK,CAAC;iBAC3B;gBAED,8CAA8C;gBAC9C,MAAM,IAAI,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,UAAU,CAAC,GAAG,EAAE,YAAY,CAAC,CAAC,CAAC;aAC3D;YAED,sEAAsE;YACtE,GAAG,CAAC,KAAK,CAAC,8DAA8D,KAAK,EAAE,CAAC,CAAC;YACjF,wBAAwB,GAAG,cAAc,IAAI,CAAC,MAAM,SAAS,CAAC,YAAY,EAAE,CAAC,CAAC;YAC9E,GAAG,CAAC,KAAK,CAAC,mEAAmE,KAAK,EAAE,CAAC,CAAC;YAEtF,IAAI,wBAAwB,EAAE;gBAC5B,IAAI,CAAC,eAAe;oBAAE,GAAG,CAAC,IAAI,CAAC,oEAAoE,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;;oBAC1H,GAAG,CAAC,IAAI,CAAC,+FAA+F,KAAK,EAAE,CAAC,CAAC,CAAC,kBAAkB;aAC1I;;gBAAM,GAAG,CAAC,IAAI,CAAC,uDAAuD,KAAK,EAAE,CAAC,CAAC,CAAC,
kBAAkB;SACpG;QACD,GAAG,CAAC,IAAI,CAAC,gBAAgB,KAAK,EAAE,CAAC,CAAC;IACpC,CAAC,CAAA,CAAC,CAAC;IACH,OAAO,KAAK,CAAC;AACf,CAAC,CAAA,CAAC;AAxKW,QAAA,oBAAoB,wBAwK/B","sourcesContent":["import { findLastIndex } from 'lodash';\n\nimport { serialAsyncMap, retryAsync } from '../../utils/async';\nimport { validateUrl } from '../../utils/url';\nimport type { MaybePromise } from '../../utils/types';\n\n// TODO - Clean this up and merge it into PageLib\n\nexport interface ListingLogger {\n debug: (msg: string, data?: any) => void;\n info: (msg: string, data?: any) => void;\n warning: (msg: string, data?: any) => void;\n error: (msg: string, data?: any) => void;\n}\n\nexport interface ListingPageFilter {\n name: string;\n disabled?: boolean;\n initState: () => MaybePromise<boolean>;\n resetState: () => MaybePromise<void>;\n nextState: () => MaybePromise<void>;\n hasNextState: () => MaybePromise<boolean>;\n hasState: () => MaybePromise<boolean>;\n loadState: () => MaybePromise<void>;\n}\n\nexport interface ListingFiltersSetupOptions<Ctx extends object, UrlType> {\n context: ListingPageScraperContext<Ctx, UrlType>;\n filters?: ListingPageFilter[];\n shouldApplyFilter?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n filter: ListingPageFilter,\n filters: ListingPageFilter[]\n ) => MaybePromise<boolean>;\n onResetFilters?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;\n onFiltersLoaded?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;\n log: ListingLogger;\n}\n\ntype ListingFilterController = Pick<ListingPageFilter, 'loadState' | 'nextState' | 'hasNextState' | 'hasState'>; // prettier-ignore\n\nexport interface ListingPageScraperContext<Ctx extends object, UrlType> {\n context: Ctx;\n log: ListingLogger;\n startUrl: UrlType;\n filters: ListingPageFilter[];\n /** Use this if you need to load filters again (eg after reloading page manually) */\n loadFilterState: () => MaybePromise<void>;\n /** Call this function from any 
callback to stop scraping */\n abort: () => void;\n}\n\n// prettier-ignore\nexport interface ListingPageScraperOptions<Ctx extends object, UrlType> extends Omit<ListingFiltersSetupOptions<Ctx, UrlType>, 'context'> {\n context: Ctx;\n startUrls: UrlType[];\n listingCountOnly?: boolean;\n /** Get ID of the current page in the pagination, so it can be logged */\n pageId?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<string>;\n log: ListingLogger;\n\n onNavigate?: (context: ListingPageScraperContext<Ctx, UrlType>, url: UrlType) => MaybePromise<void>;\n /**\n * Hook triggered after navigating to the url using Page.goto().\n *\n * One use of this hook is to conditionally disable/enable filters based on the page content.\n **/\n onAfterNavigation?: (context: ListingPageScraperContext<Ctx, UrlType>) => MaybePromise<void>;\n\n /** How many attempts are retried after filters failed to load. Defaults to 3 */\n loadFiltersRetries?: number;\n /**\n * Hook triggered after a failed attempt at loading listings page filters.\n *\n * One use of this hook is to reload the page on failed attemp in case something didn't load correctly.\n **/\n onLoadFiltersError?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n error: any,\n retryIndex: number\n ) => MaybePromise<void>;\n\n /** Main logic to extract entries from a page */\n extractEntries: (context: ListingPageScraperContext<Ctx, UrlType>, retryIndex: number) => MaybePromise<UrlType[]>;\n /** How many attempts are retried after failed to scrape entries from a listing. 
Defaults to 3 */\n extractEntriesRetries?: number;\n /**\n * Hook triggered after a failed attempt at scraping entries from a listing.\n *\n * One use of this hook is to reload the page on failed attemp in case something didn't load correctly.\n **/\n onExtractEntriesError?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n error: any,\n retryIndex: number\n ) => MaybePromise<void>;\n onExtractEntriesDone?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n entries: UrlType[] | null\n ) => MaybePromise<void>;\n\n /**\n * If goToNextPage hook is defined, it will be called after each page. To indicate that there's no more\n * pages left, throw an error.\n **/\n onGoToNextPage?: (\n context: ListingPageScraperContext<Ctx, UrlType>,\n entries: UrlType[] | null\n ) => MaybePromise<void>;\n /** How long to wait after we've navigated to the next page and before we start extracting? */\n nextPageWait?: number;\n}\n\n/**\n * Given configuration for listing page filters, set up functions to\n * navigate through the different states of filters, to allow to paginate\n * through all states.\n */\nconst setupListingFilters = <Ctx extends object, UrlType>({\n context,\n filters = [],\n shouldApplyFilter,\n onResetFilters,\n onFiltersLoaded,\n log,\n}: ListingFiltersSetupOptions<Ctx, UrlType>): ListingFilterController => {\n let filtersStack: ListingPageFilter[] = filters;\n\n const getNextFilterStateChangeIndex = async () => {\n const hasNextStates = await serialAsyncMap(filtersStack, (filter) => filter.hasNextState());\n return findLastIndex(hasNextStates, (x) => x);\n };\n\n const hasState = async () => {\n const hasStates = await serialAsyncMap(filtersStack, (filter) => filter.hasState());\n return hasStates.some(Boolean);\n };\n\n const hasNextState = async () => {\n const nextFilterStateChangeIndex = await getNextFilterStateChangeIndex();\n return nextFilterStateChangeIndex > -1;\n };\n\n const nextState = async () => {\n // Imagine we have 4 filters, each has 3 
states (eg 3 options to select from)\n // We start with all filters in the first state:\n // State 1: F1(1), F2(1), F3(1), F4(1)\n // As we progress, we increment it akin to numbers:\n // State 2: F1(1), F2(1), F3(1), F4(2)\n // State 3: F1(1), F2(1), F3(1), F4(3)\n // State 4: F1(1), F2(1), F3(2), F4(1)\n // All the way to the last state:\n // State n: F1(3), F2(3), F3(3), F4(3)\n //\n // When we want move to a next state, we identify the RIGHT-most filter\n // whose state can be incremented (in this case we select F2):\n // YES YES NO NO\n // State x: F1(1), F2(2), F3(3), F4(3)\n //\n // When we increment a filter state, all the other filter to the RIGHT\n // will be reset:\n // State x: F1(1), F2(2), F3(3), F4(3)\n // State x+1: F1(1), F2(3), F3(1), F4(1)\n\n const initStates = await serialAsyncMap(filtersStack, (filter) => filter.initState());\n if (initStates.some(Boolean)) return log.info('Initialised filters');\n\n const nextFilterStateChangeIndex = await getNextFilterStateChangeIndex();\n if (nextFilterStateChangeIndex === -1)\n throw Error('Cannot select next filter state - reached end of list');\n\n const filterToNextState = filtersStack[nextFilterStateChangeIndex];\n const filtersToReset = filtersStack.slice(nextFilterStateChangeIndex + 1);\n\n log.info('Setting filters to next state');\n await filterToNextState.nextState();\n for (const filter of filtersToReset) {\n await filter.resetState();\n await filter.nextState();\n }\n };\n\n /** Load current filter state in the webpage */\n const loadState = async () => {\n await resetState();\n\n // Load filters one by one, and only if needed\n filtersStack = [];\n for (const filter of filters) {\n const shouldUseFilter = shouldApplyFilter\n ? 
await shouldApplyFilter(context, filter, filters)\n : true;\n if (!shouldUseFilter) {\n log.info(`Not applying filter \"${filter.name}\" or further filters`);\n break;\n }\n\n if (!filter.disabled) {\n log.info(`Applying filter \"${filter.name}\"`);\n await filter.loadState();\n } else {\n log.info(`Filter \"${filter.name}\" recognised but not applied because it is disabled`);\n }\n\n filtersStack.push(filter);\n }\n\n log.info(`Done loading filters`);\n await onFiltersLoaded?.(context);\n };\n\n /** Reset filter state */\n const resetState = async () => {\n log.info(`Resetting filter state`);\n await onResetFilters?.(context);\n filtersStack = filters;\n log.info(`Resetting filter state done`);\n };\n\n return {\n loadState,\n nextState,\n hasNextState,\n hasState,\n };\n};\n\n/** Get entries from a listing page (eg URLs to profiles that should be scraped later) */\nexport const scrapeListingEntries = async <Ctx extends object, UrlType>(\n options: ListingPageScraperOptions<Ctx, UrlType>\n) => {\n const {\n context,\n startUrls,\n listingCountOnly = false,\n log,\n pageId,\n onNavigate,\n onAfterNavigation,\n\n filters = [],\n shouldApplyFilter,\n loadFiltersRetries = 3,\n onLoadFiltersError = (_, err) => console.error(err),\n onFiltersLoaded,\n onResetFilters,\n\n extractEntries,\n extractEntriesRetries = 3,\n onExtractEntriesError = (_, err) => console.error(err),\n onExtractEntriesDone,\n\n onGoToNextPage,\n nextPageWait = 500,\n } = options;\n\n /** Collection of ALL urls across all pages and startUrls */\n const links: UrlType[] = [];\n\n await serialAsyncMap(startUrls, async (startUrl, index) => {\n if (listingCountOnly && index > 0) return;\n\n const logId = `${startUrl} (${index + 1}/${startUrls.length})`;\n\n let userAskedToStop = false;\n const abort = () => { userAskedToStop = true }; // prettier-ignore\n\n // Prepare context shared across all hooks\n let filterObj: ListingFilterController | null = null;\n const genCtxArg = (): 
ListingPageScraperContext<Ctx, UrlType> => ({\n context,\n log,\n startUrl,\n filters,\n loadFilterState: filterObj?.loadState ?? (() => {}),\n abort,\n });\n\n log.debug(`Validating URL ${logId}`);\n validateUrl(startUrl as string);\n log.info(`Navigating URL ${logId}`);\n await onNavigate?.(genCtxArg(), startUrl);\n log.debug(`Done navigating to URL ${logId}`);\n\n filterObj = setupListingFilters({\n context: genCtxArg(),\n filters,\n shouldApplyFilter,\n onFiltersLoaded,\n onResetFilters,\n log,\n });\n\n log.debug(`Calling onAfterNavigation callback. URL ${logId}`); // prettier-ignore\n await onAfterNavigation?.(genCtxArg());\n log.debug(`Done calling onAfterNavigation callback. URL ${logId})`); // prettier-ignore\n\n const isUsingFilters = filters.some((filter) => !filter.disabled);\n\n let hasFilterStatesToProcess = true;\n while (hasFilterStatesToProcess && !userAskedToStop) {\n // Filter loop\n // Load filters before we start paginating\n log.info(`Setting up filters for URL ${logId}`);\n await retryAsync(\n async () => {\n if (!filterObj) throw Error(`Filter controller is missing. This should never happen. 
URL ${logId}`); // prettier-ignore\n\n const filterHasState = await filterObj.hasState();\n if (!isUsingFilters || !filterHasState) {\n log.info(`Not loading filters for URL ${logId}`);\n return;\n }\n\n log.debug(`Loading filters for URL ${logId}`);\n await filterObj.nextState();\n await filterObj.loadState();\n log.debug(`Done loading filters for URL ${logId}`);\n },\n {\n maxRetries: loadFiltersRetries,\n onError: (err, retryIndex) => onLoadFiltersError(genCtxArg(), err, retryIndex),\n }\n );\n\n let nextPageAvailable = true;\n while (nextPageAvailable && !userAskedToStop) {\n // Pagination loop\n let currPageId = 'next page';\n if (pageId) {\n log.debug(`Loading pageId for URL ${logId}`);\n currPageId = await pageId(genCtxArg());\n log.debug(`Done loading pageId for URL ${logId}`);\n }\n const pageLogId = `${logId} (${currPageId})`;\n\n // Extract page links\n log.info(`Extracting links from page ${pageLogId}`);\n const { result } = await retryAsync(\n async (retryIndex) => extractEntries(genCtxArg(), retryIndex),\n {\n maxRetries: extractEntriesRetries,\n onError: (err, retryIndex) => onExtractEntriesError(genCtxArg(), err, retryIndex),\n }\n );\n log.debug(`Done extracting links from page ${pageLogId}`);\n\n const pageLinks = result ?? [];\n links.push(...pageLinks);\n log.info(`Found ${pageLinks.length} links on page ${pageLogId}`);\n\n // Leave after printing the count or on abort\n if (listingCountOnly || userAskedToStop) {\n nextPageAvailable = false;\n if (listingCountOnly) log.info(`Debugging mode. Entries are not scraped. Leaving now. URL ${pageLogId}`); // prettier-ignore\n else if (userAskedToStop) log.info(`Aborting. URL ${pageLogId}`);\n continue;\n }\n\n log.debug(`Calling onExtractEntriesDone callback. URL ${pageLogId}`); // prettier-ignore\n await onExtractEntriesDone?.(genCtxArg(), pageLinks);\n log.debug(`Done calling onExtractEntriesDone callback. 
URL ${pageLogId}`); // prettier-ignore\n\n if (onGoToNextPage && !userAskedToStop) {\n // If goToNextPage hook is defined, this will be called after each page, until it errors\n try {\n log.info(`Navigating to next page from URL ${pageLogId}`);\n await onGoToNextPage(genCtxArg(), pageLinks);\n log.debug(`Done navigating to next page from URL ${pageLogId}`); // prettier-ignore\n } catch (e) {\n log.info(`Failed navigating to next page from URL ${pageLogId}`); // prettier-ignore\n log.error((e as Error).toString());\n nextPageAvailable = false;\n }\n } else {\n if (userAskedToStop) log.info(`Aborting. URL ${pageLogId}`);\n nextPageAvailable = false;\n }\n\n // Wait before we start scraping the next page\n await new Promise((res) => setTimeout(res, nextPageWait));\n }\n\n // Break out if we're not using filters or we've gone through them all\n log.debug(`Checking if there are more filter states available for URL ${logId}`);\n hasFilterStatesToProcess = isUsingFilters && (await filterObj.hasNextState());\n log.debug(`Done checking if there are more filter states available for URL ${logId}`);\n\n if (hasFilterStatesToProcess) {\n if (!userAskedToStop) log.info(`Will repeat scraping this URL with different filter setting. URL ${logId}`); // prettier-ignore\n else log.info(`There are unprocessed filter setting remaining for this URL, but stopping due to abort. URL ${logId}`); // prettier-ignore\n } else log.info(`No filter setting remain for scraping this URL. URL ${logId}`); // prettier-ignore\n }\n log.info(`Finished URL ${logId}`);\n });\n return links;\n};\n"]}
@@ -0,0 +1,90 @@
1
+ /// <reference types="lodash" />
2
+ import { BasicCrawler, CrawlingContext, BasicCrawlerOptions } from 'crawlee';
3
+ import * as Sentry from '@sentry/node';
4
+ import type { CrawlerMeta, CrawlerType } from '../../types';
5
+ import type { MaybePromise, PickPartial } from '../../utils/types';
6
+ import type { CrawleeOneIO } from '../integrations/types';
7
+ import type { ActorContext, ActorDefinition } from './types';
8
+ /**
9
+ * Create default configuration for an opinionated Crawlee actor,
10
+ * and run the actor within Apify's `Actor.main()` context.
11
+ *
12
+ * Apify context can be replaced with custom implementation using the `actorConfig.io` option.
13
+ *
14
+ * Read more about what this actor does at {@link createCrawleeOne}.
15
+ */
16
+ export declare const createAndRunCrawleeOne: <TCrawlerType extends CrawlerType, Ctx extends CrawlerMeta<TCrawlerType, any>["context"] = CrawlingContext<BasicCrawler<import("crawlee").BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO<object, object, object> = CrawleeOneIO<object, object, object>>(input: {
17
+ /** String identifying the actor class, e.g. `'cheerio'` */
18
+ actorType: TCrawlerType;
19
+ actorName: string;
20
+ /** Config passed to the {@link createCrawleeOne} */
21
+ actorConfig: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, "io" | "router" | "createCrawler">;
22
+ /**
23
+ * If using default `createCrawler` implementation, these are crawler options
24
+ * that may be overridden by user input.
25
+ */
26
+ crawlerConfigDefaults?: CrawlerMeta<TCrawlerType, any>["options"] | undefined;
27
+ /**
28
+ * If using default `createCrawler` implementation, these are crawler options
29
+ * that will override user input.
30
+ *
31
+ * This is useful for test environments.
32
+ */
33
+ crawlerConfigOverrides?: CrawlerMeta<TCrawlerType, any>["options"] | undefined;
34
+ /**
35
+ * Sentry configuration. If using default `createCrawler` implementation,
36
+ * failed requests are optionally reported to Sentry.
37
+ *
38
+ * To disable Sentry, set `"enabled": false`.
39
+ */
40
+ sentryOptions?: Sentry.NodeOptions | undefined;
41
+ /**
42
+ * Callback with the created actor. The callback is called within
43
+ * the `Actor.main()` context.
44
+ */
45
+ onActorReady?: ((actor: ActorContext<Ctx, Labels, Input, TIO>) => MaybePromise<void>) | undefined;
46
+ }) => Promise<void>;
47
+ /**
48
+ * Create an opinionated Crawlee crawler that uses a router for handling requests.
49
+ *
50
+ * This is a quality-of-life function that does the following for you:
51
+ *
52
+ * 1) Full TypeScript coverage - Ensure all components use the same Crawler / CrawlerContext.
53
+ *
54
+ * 2) Get Actor input from `Actor.getInput` if not given.
55
+ *
56
+ * 3) (Optional) Validate Actor input
57
+ *
58
+ * 4) Set up router such that requests that reach default route are
59
+ * redirected to labelled routes based on which item from "routes" they match.
60
+ *
61
+ * 5) Register all route handlers for you.
62
+ *
63
+ * 6) (Optional) Wrap all route handlers in a wrapper. Use this e.g.
64
+ * if you want to add a field to the context object, or handle errors
65
+ * from a single place.
66
+ *
67
+ * 7) (Optional) Support transformation and filtering of (scraped) entries,
68
+ * configured via Actor input.
69
+ *
70
+ * 8) (Optional) Support Actor metamorphing, configured via Actor input.
71
+ *
72
+ * 9) Apify context (e.g. calling `Actor.getInput`) can be replaced with custom
73
+ * implementation using the `io` option.
74
+ */
75
+ export declare const createCrawleeOne: <Ctx extends CrawlingContext<unknown, import("crawlee").Dictionary> = CrawlingContext<BasicCrawler<import("crawlee").BasicCrawlingContext<import("crawlee").Dictionary>>, import("crawlee").Dictionary>, Labels extends string = string, Input extends Record<string, any> = Record<string, any>, TIO extends CrawleeOneIO<object, object, object> = CrawleeOneIO<object, object, object>>(config: PickPartial<ActorDefinition<Ctx, Labels, Input, TIO>, "io">) => Promise<ActorContext<Ctx, Labels, Input, TIO>>;
76
+ /** Given the actor input, create common crawler options. */
77
+ export declare const createHttpCrawlerOptions: <TOpts extends BasicCrawlerOptions<any> = BasicCrawlerOptions<import("crawlee").BasicCrawlingContext<import("crawlee").Dictionary>>, Input extends Record<string, any> = Record<string, any>>({ input, defaults, overrides, }: {
78
+ /** Actor input */
79
+ input: Input | null;
80
+ /**
81
+ * Default config options set by us. These may be overridden
82
+ * by values from actor input (set by user).
83
+ */
84
+ defaults?: TOpts | undefined;
85
+ /**
86
+ * These config options will overwrite both the default and user
87
+ * options. This is useful for hard-setting values e.g. in tests.
88
+ */
89
+ overrides?: TOpts | undefined;
90
+ }) => Partial<TOpts> & import("lodash").Dictionary<TOpts["requestHandler"] | TOpts["handleRequestFunction"] | TOpts["requestList"] | TOpts["requestQueue"] | TOpts["requestHandlerTimeoutSecs"] | TOpts["handleRequestTimeoutSecs"] | TOpts["errorHandler"] | TOpts["failedRequestHandler"] | TOpts["handleFailedRequestFunction"] | TOpts["maxRequestRetries"] | TOpts["maxRequestsPerCrawl"] | TOpts["autoscaledPoolOptions"] | TOpts["minConcurrency"] | TOpts["maxConcurrency"] | TOpts["maxRequestsPerMinute"] | TOpts["keepAlive"] | TOpts["useSessionPool"] | TOpts["sessionPoolOptions"] | TOpts["loggingInterval"] | TOpts["log"]>;