apify 2.4.0b5.tar.gz → 2.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of apify might be problematic.

Files changed (209)
  1. {apify-2.4.0b5 → apify-2.5.0}/.github/workflows/run_code_checks.yaml +1 -1
  2. {apify-2.4.0b5 → apify-2.5.0}/CHANGELOG.md +11 -0
  3. {apify-2.4.0b5 → apify-2.5.0}/PKG-INFO +1 -1
  4. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/05_scrapy.mdx +7 -0
  5. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/settings.py +2 -0
  6. {apify-2.4.0b5 → apify-2.5.0}/pyproject.toml +9 -11
  7. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_actor.py +5 -5
  8. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_configuration.py +15 -2
  9. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_platform_event_manager.py +3 -3
  10. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_utils.py +16 -6
  11. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/_async_thread.py +1 -1
  12. apify-2.5.0/src/apify/scrapy/extensions/__init__.py +3 -0
  13. apify-2.5.0/src/apify/scrapy/extensions/_httpcache.py +212 -0
  14. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/requests.py +1 -1
  15. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/utils.py +3 -0
  16. {apify-2.4.0b5 → apify-2.5.0}/src/apify/storages/_request_list.py +9 -4
  17. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_scrapy.py +2 -1
  18. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_lifecycle.py +7 -5
  19. apify-2.5.0/tests/unit/actor/test_configuration.py +21 -0
  20. apify-2.5.0/tests/unit/scrapy/extensions/test_httpcache.py +71 -0
  21. {apify-2.4.0b5 → apify-2.5.0}/uv.lock +352 -352
  22. {apify-2.4.0b5 → apify-2.5.0}/website/package-lock.json +833 -584
  23. {apify-2.4.0b5 → apify-2.5.0}/website/package.json +1 -1
  24. apify-2.5.0/website/static/.nojekyll +0 -0
  25. {apify-2.4.0b5 → apify-2.5.0}/.editorconfig +0 -0
  26. {apify-2.4.0b5 → apify-2.5.0}/.github/CODEOWNERS +0 -0
  27. {apify-2.4.0b5 → apify-2.5.0}/.github/workflows/build_and_deploy_docs.yaml +0 -0
  28. {apify-2.4.0b5 → apify-2.5.0}/.github/workflows/check_pr_title.yaml +0 -0
  29. {apify-2.4.0b5 → apify-2.5.0}/.github/workflows/pre_release.yaml +0 -0
  30. {apify-2.4.0b5 → apify-2.5.0}/.github/workflows/release.yaml +0 -0
  31. {apify-2.4.0b5 → apify-2.5.0}/.github/workflows/update_new_issue.yaml +0 -0
  32. {apify-2.4.0b5 → apify-2.5.0}/.gitignore +0 -0
  33. {apify-2.4.0b5 → apify-2.5.0}/.markdownlint.yaml +0 -0
  34. {apify-2.4.0b5 → apify-2.5.0}/.pre-commit-config.yaml +0 -0
  35. {apify-2.4.0b5 → apify-2.5.0}/CONTRIBUTING.md +0 -0
  36. {apify-2.4.0b5 → apify-2.5.0}/LICENSE +0 -0
  37. {apify-2.4.0b5 → apify-2.5.0}/Makefile +0 -0
  38. {apify-2.4.0b5 → apify-2.5.0}/README.md +0 -0
  39. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/01_introduction.mdx +0 -0
  40. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/02_running_actors_locally.mdx +0 -0
  41. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/03_actor_structure.mdx +0 -0
  42. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/code/01_introduction.py +0 -0
  43. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/code/actor_structure/__init__.py +0 -0
  44. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/code/actor_structure/__main__.py +0 -0
  45. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/code/actor_structure/main.py +0 -0
  46. {apify-2.4.0b5 → apify-2.5.0}/docs/01_overview/code/actor_structure/py.typed +0 -0
  47. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/01_beautifulsoup_httpx.mdx +0 -0
  48. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/02_crawlee.mdx +0 -0
  49. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/03_playwright.mdx +0 -0
  50. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/04_selenium.mdx +0 -0
  51. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/01_beautifulsoup_httpx.py +0 -0
  52. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/02_crawlee_beautifulsoup.py +0 -0
  53. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/02_crawlee_playwright.py +0 -0
  54. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/03_playwright.py +0 -0
  55. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/04_selenium.py +0 -0
  56. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/__init__.py +0 -0
  57. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/__main__.py +0 -0
  58. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/items.py +0 -0
  59. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/main.py +0 -0
  60. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/py.typed +0 -0
  61. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/spiders/__init__.py +0 -0
  62. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/spiders/py.typed +0 -0
  63. {apify-2.4.0b5 → apify-2.5.0}/docs/02_guides/code/scrapy_project/src/spiders/title.py +0 -0
  64. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/01_actor_lifecycle.mdx +0 -0
  65. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/02_actor_input.mdx +0 -0
  66. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/03_storages.mdx +0 -0
  67. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/04_actor_events.mdx +0 -0
  68. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/05_proxy_management.mdx +0 -0
  69. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/06_interacting_with_other_actors.mdx +0 -0
  70. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/07_webhooks.mdx +0 -0
  71. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/08_access_apify_api.mdx +0 -0
  72. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/09_running_webserver.mdx +0 -0
  73. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/10_logging.mdx +0 -0
  74. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/11_configuration.mdx +0 -0
  75. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/12_pay_per_event.mdx +0 -0
  76. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/01_context_manager.py +0 -0
  77. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/01_init_exit.py +0 -0
  78. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/01_reboot.py +0 -0
  79. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/01_status_message.py +0 -0
  80. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/02_input.py +0 -0
  81. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_dataset_exports.py +0 -0
  82. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_dataset_read_write.py +0 -0
  83. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_deleting_storages.py +0 -0
  84. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_kvs_iterating.py +0 -0
  85. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_kvs_public_url.py +0 -0
  86. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_kvs_read_write.py +0 -0
  87. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_opening_storages.py +0 -0
  88. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/03_rq.py +0 -0
  89. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/04_actor_events.py +0 -0
  90. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_apify_proxy.py +0 -0
  91. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_apify_proxy_config.py +0 -0
  92. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_custom_proxy.py +0 -0
  93. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_custom_proxy_function.py +0 -0
  94. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_proxy_actor_input.py +0 -0
  95. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_proxy_httpx.py +0 -0
  96. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/05_proxy_rotation.py +0 -0
  97. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/06_interacting_call.py +0 -0
  98. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/06_interacting_call_task.py +0 -0
  99. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/06_interacting_metamorph.py +0 -0
  100. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/06_interacting_start.py +0 -0
  101. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/07_webhook.py +0 -0
  102. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/07_webhook_preventing.py +0 -0
  103. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/08_actor_client.py +0 -0
  104. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/08_actor_new_client.py +0 -0
  105. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/09_webserver.py +0 -0
  106. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/10_log_config.py +0 -0
  107. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/10_logger_usage.py +0 -0
  108. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/11_config.py +0 -0
  109. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/actor_charge.py +0 -0
  110. {apify-2.4.0b5 → apify-2.5.0}/docs/03_concepts/code/conditional_actor_charge.py +0 -0
  111. {apify-2.4.0b5 → apify-2.5.0}/docs/04_upgrading/upgrading_to_v2.md +0 -0
  112. {apify-2.4.0b5 → apify-2.5.0}/docs/pyproject.toml +0 -0
  113. {apify-2.4.0b5 → apify-2.5.0}/renovate.json +0 -0
  114. {apify-2.4.0b5 → apify-2.5.0}/src/apify/__init__.py +0 -0
  115. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_charging.py +0 -0
  116. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_consts.py +0 -0
  117. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_crypto.py +0 -0
  118. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_models.py +0 -0
  119. {apify-2.4.0b5 → apify-2.5.0}/src/apify/_proxy_configuration.py +0 -0
  120. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/__init__.py +0 -0
  121. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_apify_storage_client.py +0 -0
  122. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_dataset_client.py +0 -0
  123. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_dataset_collection_client.py +0 -0
  124. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_key_value_store_client.py +0 -0
  125. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_key_value_store_collection_client.py +0 -0
  126. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_request_queue_client.py +0 -0
  127. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/_request_queue_collection_client.py +0 -0
  128. {apify-2.4.0b5 → apify-2.5.0}/src/apify/apify_storage_client/py.typed +0 -0
  129. {apify-2.4.0b5 → apify-2.5.0}/src/apify/log.py +0 -0
  130. {apify-2.4.0b5 → apify-2.5.0}/src/apify/py.typed +0 -0
  131. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/__init__.py +0 -0
  132. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/_actor_runner.py +0 -0
  133. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/_logging_config.py +0 -0
  134. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/middlewares/__init__.py +0 -0
  135. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
  136. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/middlewares/py.typed +0 -0
  137. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/pipelines/__init__.py +0 -0
  138. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
  139. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/pipelines/py.typed +0 -0
  140. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/py.typed +0 -0
  141. {apify-2.4.0b5 → apify-2.5.0}/src/apify/scrapy/scheduler.py +0 -0
  142. {apify-2.4.0b5 → apify-2.5.0}/src/apify/storages/__init__.py +0 -0
  143. {apify-2.4.0b5 → apify-2.5.0}/src/apify/storages/py.typed +0 -0
  144. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/README.md +0 -0
  145. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/__init__.py +0 -0
  146. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/_utils.py +0 -0
  147. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/actor_source_base/Dockerfile +0 -0
  148. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/actor_source_base/requirements.txt +0 -0
  149. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/actor_source_base/src/__init__.py +0 -0
  150. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/actor_source_base/src/__main__.py +0 -0
  151. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/actor_source_base/src/main.py +0 -0
  152. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/conftest.py +0 -0
  153. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_api_helpers.py +0 -0
  154. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_charge.py +0 -0
  155. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_create_proxy_configuration.py +0 -0
  156. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_dataset.py +0 -0
  157. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_events.py +0 -0
  158. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_key_value_store.py +0 -0
  159. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_lifecycle.py +0 -0
  160. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_log.py +0 -0
  161. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_actor_request_queue.py +0 -0
  162. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_fixtures.py +0 -0
  163. {apify-2.4.0b5 → apify-2.5.0}/tests/integration/test_request_queue.py +0 -0
  164. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/__init__.py +0 -0
  165. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/__init__.py +0 -0
  166. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_create_proxy_configuration.py +0 -0
  167. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_dataset.py +0 -0
  168. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_env_helpers.py +0 -0
  169. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_helpers.py +0 -0
  170. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_key_value_store.py +0 -0
  171. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_log.py +0 -0
  172. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_non_default_instance.py +0 -0
  173. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_actor_request_queue.py +0 -0
  174. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/actor/test_request_list.py +0 -0
  175. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/conftest.py +0 -0
  176. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/__init__.py +0 -0
  177. {apify-2.4.0b5/tests/unit/scrapy/middlewares → apify-2.5.0/tests/unit/scrapy/extensions}/__init__.py +0 -0
  178. {apify-2.4.0b5/tests/unit/scrapy/pipelines → apify-2.5.0/tests/unit/scrapy/middlewares}/__init__.py +0 -0
  179. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/middlewares/test_apify_proxy.py +0 -0
  180. {apify-2.4.0b5/tests/unit/scrapy/requests → apify-2.5.0/tests/unit/scrapy/pipelines}/__init__.py +0 -0
  181. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/pipelines/test_actor_dataset_push.py +0 -0
  182. {apify-2.4.0b5/tests/unit/scrapy/utils → apify-2.5.0/tests/unit/scrapy/requests}/__init__.py +0 -0
  183. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/requests/test_to_apify_request.py +0 -0
  184. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/requests/test_to_scrapy_request.py +0 -0
  185. /apify-2.4.0b5/website/static/.nojekyll → /apify-2.5.0/tests/unit/scrapy/utils/__init__.py +0 -0
  186. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/utils/test_apply_apify_settings.py +0 -0
  187. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/scrapy/utils/test_get_basic_auth_header.py +0 -0
  188. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/test_crypto.py +0 -0
  189. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/test_platform_event_manager.py +0 -0
  190. {apify-2.4.0b5 → apify-2.5.0}/tests/unit/test_proxy_configuration.py +0 -0
  191. {apify-2.4.0b5 → apify-2.5.0}/website/.eslintrc.json +0 -0
  192. {apify-2.4.0b5 → apify-2.5.0}/website/babel.config.js +0 -0
  193. {apify-2.4.0b5 → apify-2.5.0}/website/build_api_reference.sh +0 -0
  194. {apify-2.4.0b5 → apify-2.5.0}/website/docusaurus.config.js +0 -0
  195. {apify-2.4.0b5 → apify-2.5.0}/website/generate_module_shortcuts.py +0 -0
  196. {apify-2.4.0b5 → apify-2.5.0}/website/sidebars.js +0 -0
  197. {apify-2.4.0b5 → apify-2.5.0}/website/src/components/ApiLink.jsx +0 -0
  198. {apify-2.4.0b5 → apify-2.5.0}/website/src/components/Gradients.jsx +0 -0
  199. {apify-2.4.0b5 → apify-2.5.0}/website/src/components/Highlights.jsx +0 -0
  200. {apify-2.4.0b5 → apify-2.5.0}/website/src/components/Highlights.module.css +0 -0
  201. {apify-2.4.0b5 → apify-2.5.0}/website/src/components/RunnableCodeBlock.jsx +0 -0
  202. {apify-2.4.0b5 → apify-2.5.0}/website/src/components/RunnableCodeBlock.module.css +0 -0
  203. {apify-2.4.0b5 → apify-2.5.0}/website/src/css/custom.css +0 -0
  204. {apify-2.4.0b5 → apify-2.5.0}/website/src/pages/home_page_example.py +0 -0
  205. {apify-2.4.0b5 → apify-2.5.0}/website/src/pages/index.js +0 -0
  206. {apify-2.4.0b5 → apify-2.5.0}/website/src/pages/index.module.css +0 -0
  207. {apify-2.4.0b5 → apify-2.5.0}/website/static/img/docs-og.png +0 -0
  208. {apify-2.4.0b5 → apify-2.5.0}/website/tools/docs-prettier.config.js +0 -0
  209. {apify-2.4.0b5 → apify-2.5.0}/website/tools/utils/externalLink.js +0 -0

.github/workflows/run_code_checks.yaml
@@ -26,5 +26,5 @@ jobs:
   integration_tests:
     name: Integration tests
     needs: [lint_check, type_check, unit_tests]
-    uses: apify/workflows/.github/workflows/python_integration_tests.yaml@main
+    uses: apify/workflows/.github/workflows/python_integration_tests.yaml@fix-integration-tests-from-forks
     secrets: inherit

CHANGELOG.md
@@ -2,6 +2,17 @@

 All notable changes to this project will be documented in this file.

+## [2.5.0](https://github.com/apify/apify-sdk-python/releases/tag/v2.5.0) (2025-03-27)
+
+### 🚀 Features
+
+- Implement Scrapy HTTP cache backend ([#403](https://github.com/apify/apify-sdk-python/pull/403)) ([137e3c8](https://github.com/apify/apify-sdk-python/commit/137e3c8d5c6b28cf6935cfb742b5f072cd2e0a02)) by [@honzajavorek](https://github.com/honzajavorek)
+
+### 🐛 Bug Fixes
+
+- Fix calculation of CPU utilization from SystemInfo events ([#447](https://github.com/apify/apify-sdk-python/pull/447)) ([eb4c8e4](https://github.com/apify/apify-sdk-python/commit/eb4c8e4e498e23f573b9e2d4c7dbd8e2ecc277d9)) by [@janbuchar](https://github.com/janbuchar)
+
+
 ## [2.4.0](https://github.com/apify/apify-sdk-python/releases/tag/v2.4.0) (2025-03-07)

 ### 🚀 Features

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: apify
-Version: 2.4.0b5
+Version: 2.5.0
 Summary: Apify SDK for Python
 Project-URL: Homepage, https://docs.apify.com/sdk/python/
 Project-URL: Apify homepage, https://apify.com

docs/02_guides/05_scrapy.mdx
@@ -40,6 +40,7 @@ The Apify SDK provides several custom components to support integration with the
 - [`apify.scrapy.ApifyScheduler`](https://docs.apify.com/sdk/python/reference/class/ApifyScheduler) - Replaces Scrapy's default [scheduler](https://docs.scrapy.org/en/latest/topics/scheduler.html) with one that uses Apify's [request queue](https://docs.apify.com/platform/storage/request-queue) for storing requests. It manages enqueuing, dequeuing, and maintaining the state and priority of requests.
 - [`apify.scrapy.ActorDatasetPushPipeline`](https://docs.apify.com/sdk/python/reference/class/ActorDatasetPushPipeline) - A Scrapy [item pipeline](https://docs.scrapy.org/en/latest/topics/item-pipeline.html) that pushes scraped items to Apify's [dataset](https://docs.apify.com/platform/storage/dataset). When enabled, every item produced by the spider is sent to the dataset.
 - [`apify.scrapy.ApifyHttpProxyMiddleware`](https://docs.apify.com/sdk/python/reference/class/ApifyHttpProxyMiddleware) - A Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) that manages proxy configurations. This middleware replaces Scrapy's default `HttpProxyMiddleware` to facilitate the use of Apify's proxy service.
+- [`apify.scrapy.extensions.ApifyCacheStorage`](https://docs.apify.com/sdk/python/reference/class/ApifyCacheStorage) - A storage backend for Scrapy's built-in [HTTP cache middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpcache). This backend uses Apify's [key-value store](https://docs.apify.com/platform/storage/key-value-store). Make sure to set `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` in your settings, or caching won't work.

 Additional helper functions in the [`apify.scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) subpackage include:

@@ -94,6 +95,12 @@ The following example demonstrates a Scrapy Actor that scrapes page titles and e
 </TabItem>
 </Tabs>

+## Dealing with ‘imminent migration to another host’
+
+Under some circumstances, the platform may decide to [migrate your Actor](https://docs.apify.com/academy/expert-scraping-with-apify/migrations-maintaining-state) from one piece of infrastructure to another while it's in progress. While [Crawlee](https://crawlee.dev/python)-based projects can pause and resume their work after a restart, achieving the same with a Scrapy-based project can be challenging.
+
+As a workaround for this issue (tracked as [apify/actor-templates#303](https://github.com/apify/actor-templates/issues/303)), turn on caching with `HTTPCACHE_ENABLED` and set `HTTPCACHE_EXPIRATION_SECS` to at least a few minutes—the exact value depends on your use case. If your Actor gets migrated and restarted, the subsequent run will hit the cache, making it fast and avoiding unnecessary resource consumption.
+
 ## Conclusion

 In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
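
For reference, a minimal sketch of what the new section asks for in a Scrapy project's `settings.py` (the expiration value is illustrative; setting `HTTPCACHE_STORAGE` explicitly is only needed if the settings are not built with `apify.scrapy.utils.apply_apify_settings`, which the change further below wires up automatically):

```python
# settings.py - enable Scrapy's built-in HTTP cache with the Apify backend.
HTTPCACHE_ENABLED = True            # turn on HttpCacheMiddleware
HTTPCACHE_EXPIRATION_SECS = 7200    # e.g. two hours; tune to the use case
# Only needed when not using apify.scrapy.utils.apply_apify_settings:
HTTPCACHE_STORAGE = 'apify.scrapy.extensions.ApifyCacheStorage'
```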

docs/02_guides/code/scrapy_project/src/settings.py
@@ -7,3 +7,5 @@ SPIDER_MODULES = ['src.spiders']
 TELNETCONSOLE_ENABLED = False
 # Do not change the Twisted reactor unless you really know what you are doing.
 TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 7200

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "apify"
-version = "2.4.0b5"
+version = "2.5.0"
 description = "Apify SDK for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
 license = { file = "LICENSE" }
@@ -59,20 +59,18 @@ scrapy = ["scrapy>=2.11.0"]
 [dependency-groups]
 dev = [
     "build~=1.2.0",
-    "filelock~=3.17.0",
     "griffe~=1.6.0",
     "mypy~=1.15.0",
-    "pre-commit~=4.1.0",
+    "pre-commit~=4.2.0",
     "pydoc-markdown~=4.8.0",
-    "pytest~=8.3.0",
-    "pytest-asyncio~=0.25.0",
+    "pytest-asyncio~=0.26.0",
     "pytest-cov~=6.0.0",
     "pytest-only~=2.1.0",
-    "pytest-timeout~=2.3.0",
     "pytest-xdist~=3.6.0",
+    "pytest~=8.3.0",
     "respx~=0.22.0",
-    "ruff~=0.9.0",
-    "setuptools~=76.0.0", # setuptools are used by pytest but not explicitly required
+    "ruff~=0.11.0",
+    "setuptools~=78.1.0", # setuptools are used by pytest but not explicitly required
 ]

 [tool.hatch.build.targets.wheel]
@@ -93,6 +91,9 @@ ignore = [
     "D100", # Missing docstring in public module
     "D104", # Missing docstring in public package
     "D107", # Missing docstring in `__init__`
+    "D203", # One blank line required before class docstring
+    "D213", # Multi-line docstring summary should start at the second line
+    "D413", # Missing blank line after last section
     "EM", # flake8-errmsg
     "G004", # Logging statement uses f-string
     "ISC001", # This rule may cause conflicts when used with the formatter
@@ -164,9 +165,6 @@ runtime-evaluated-base-classes = [
 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["id"]

-[tool.ruff.lint.pydocstyle]
-convention = "google"
-
 [tool.ruff.lint.isort]
 known-local-folder = ["apify"]
 known-first-party = ["apify_client", "apify_shared", "crawlee"]

src/apify/_actor.py
@@ -141,7 +141,7 @@ class _ActorType:
         await self.exit()

     def __repr__(self) -> str:
-        if self is cast(Proxy, Actor).__wrapped__:
+        if self is cast('Proxy', Actor).__wrapped__:
             return '<apify.Actor>'

         return super().__repr__()
@@ -222,7 +222,7 @@ class _ActorType:
             self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care')

         # Make sure that the currently initialized instance is also available through the global `Actor` proxy
-        cast(Proxy, Actor).__wrapped__ = self
+        cast('Proxy', Actor).__wrapped__ = self

         self._is_exiting = False
         self._was_final_persist_state_emitted = False
@@ -674,7 +674,7 @@ class _ActorType:
             elif isinstance(field.validation_alias, str):
                 aliases = [field.validation_alias]
             elif isinstance(field.validation_alias, AliasChoices):
-                aliases = cast(list[str], field.validation_alias.choices)
+                aliases = cast('list[str]', field.validation_alias.choices)
             else:
                 aliases = [field_name]

@@ -1138,7 +1138,7 @@ class _ActorType:
         return proxy_configuration

     def _get_default_exit_process(self) -> bool:
-        """Returns False for IPython, Pytest, and Scrapy environments, True otherwise."""
+        """Return False for IPython, Pytest, and Scrapy environments, True otherwise."""
         if is_running_in_ipython():
             self.log.debug('Running in IPython, setting default `exit_process` to False.')
             return False
@@ -1158,5 +1158,5 @@ class _ActorType:
         return True


-Actor = cast(_ActorType, Proxy(_ActorType))
+Actor = cast('_ActorType', Proxy(_ActorType))
 """The entry point of the SDK, through which all the Actor operations should be done."""

src/apify/_configuration.py
@@ -5,8 +5,8 @@ from decimal import Decimal
 from logging import getLogger
 from typing import Annotated, Any

-from pydantic import AliasChoices, BeforeValidator, Field
-from typing_extensions import deprecated
+from pydantic import AliasChoices, BeforeValidator, Field, model_validator
+from typing_extensions import Self, deprecated

 from crawlee._utils.models import timedelta_ms
 from crawlee._utils.urls import validate_http_url
@@ -365,6 +365,19 @@ class Configuration(CrawleeConfiguration):
         ),
     ] = None

+    @model_validator(mode='after')
+    def disable_browser_sandbox_on_platform(self) -> Self:
+        """Disable the browser sandbox mode when running on the Apify platform.
+
+        Running in environment where `is_at_home` is True does not benefit from browser sandbox as it is already running
+        in a container. It can be on the contrary undesired as the process in the container might be running as root and
+        this will crash chromium that was started with browser sandbox mode.
+        """
+        if self.is_at_home and not self.disable_browser_sandbox:
+            self.disable_browser_sandbox = True
+            logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.')
+        return self
+
     @classmethod
     def get_global_configuration(cls) -> Configuration:
         """Retrieve the global instance of the configuration.

src/apify/_platform_event_manager.py
@@ -48,11 +48,11 @@ class SystemInfoEventData(BaseModel):
     is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
     created_at: Annotated[datetime, Field(alias='createdAt')]

-    def to_crawlee_format(self) -> EventSystemInfoData:
+    def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
         return EventSystemInfoData.model_validate(
             {
                 'cpu_info': {
-                    'used_ratio': self.cpu_current_usage / 100,
+                    'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
                     'created_at': self.created_at,
                 },
                 'memory_info': {
@@ -218,7 +218,7 @@ class PlatformEventManager(EventManager):
                     event=parsed_message.name,
                     event_data=parsed_message.data
                     if not isinstance(parsed_message.data, SystemInfoEventData)
-                    else parsed_message.data.to_crawlee_format(),
+                    else parsed_message.data.to_crawlee_format(self._config.dedicated_cpus or 1),
                 )

                 if parsed_message.name == Event.MIGRATING:
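
To see what the bug fix changes, with illustrative numbers (not taken from the SDK): if the reported CPU percentage spans all dedicated CPUs, dividing by 100 alone can yield a ratio above 1.0, while also dividing by `dedicated_cpus` normalizes it to a true 0-1 utilization:

```python
# Illustrative only: a SystemInfo event reporting 250% usage on an Actor
# with 4 dedicated CPUs.
cpu_current_usage = 250.0
dedicated_cpus = 4.0

old_ratio = cpu_current_usage / 100                     # 2.5 - exceeds 1.0
new_ratio = (cpu_current_usage / 100) / dedicated_cpus  # 0.625

assert old_ratio == 2.5
assert new_ratio == 0.625
```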

src/apify/_utils.py
@@ -31,10 +31,16 @@ GroupName = Literal['Classes', 'Abstract classes', 'Interfaces', 'Data structure

 def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001
-    """Decorator to mark symbols for rendering and grouping in documentation.
+    """Mark a symbol for rendering and grouping in documentation.

-    This decorator is used purely for documentation purposes and does not alter the behavior
+    This decorator is used solely for documentation purposes and does not modify the behavior
     of the decorated callable.
+
+    Args:
+        group_name: The documentation group to which the symbol belongs.
+
+    Returns:
+        The original callable without modification.
     """

     def wrapper(func: Callable) -> Callable:
@@ -44,12 +50,16 @@ def docs_group(group_name: GroupName) -> Callable:  # noqa: ARG001


 def docs_name(symbol_name: str) -> Callable:  # noqa: ARG001
-    """Decorator for renaming symbols in documentation.
+    """Rename a symbol for documentation rendering.

-    This changes the rendered name of the symbol only in the rendered web documentation.
+    This decorator modifies only the displayed name of the symbol in the generated documentation
+    and does not affect its runtime behavior.

-    This decorator is used purely for documentation purposes and does not alter the behavior
-    of the decorated callable.
+    Args:
+        symbol_name: The name to be used in the documentation.
+
+    Returns:
+        The original callable without modification.
     """

     def wrapper(func: Callable) -> Callable:

src/apify/scrapy/_async_thread.py
@@ -113,7 +113,7 @@ class AsyncThread:
         await asyncio.gather(*tasks, return_exceptions=True)

     def _force_exit_event_loop(self) -> None:
-        """Forcefully shut down the event loop and its thread."""
+        """Shut down the event loop and its thread forcefully."""
         try:
             logger.info('Forced shutdown of the event loop and its thread...')
             self._eventloop.call_soon_threadsafe(self._eventloop.stop)

src/apify/scrapy/extensions/__init__.py (new file)
@@ -0,0 +1,3 @@
from apify.scrapy.extensions._httpcache import ApifyCacheStorage

__all__ = ['ApifyCacheStorage']

src/apify/scrapy/extensions/_httpcache.py (new file)
@@ -0,0 +1,212 @@
from __future__ import annotations

import gzip
import io
import pickle
import re
import struct
from logging import getLogger
from time import time
from typing import TYPE_CHECKING

from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes

from apify import Configuration
from apify.apify_storage_client import ApifyStorageClient
from apify.scrapy._async_thread import AsyncThread
from apify.storages import KeyValueStore

if TYPE_CHECKING:
    from scrapy import Request, Spider
    from scrapy.http.response import Response
    from scrapy.settings import BaseSettings
    from scrapy.utils.request import RequestFingerprinterProtocol

logger = getLogger(__name__)


class ApifyCacheStorage:
    """A Scrapy cache storage that uses the Apify `KeyValueStore` to store responses.

    It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches
    responses to requests. See HTTPCache middleware settings (prefixed with `HTTPCACHE_`)
    in the Scrapy documentation for more information. Requires the asyncio Twisted reactor
    to be installed.
    """

    def __init__(self, settings: BaseSettings) -> None:
        self._expiration_max_items = 100
        self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self._spider: Spider | None = None
        self._kvs: KeyValueStore | None = None
        self._fingerprinter: RequestFingerprinterProtocol | None = None
        self._async_thread: AsyncThread | None = None

    def open_spider(self, spider: Spider) -> None:
        """Open the cache storage for a spider."""
        logger.debug('Using Apify key value cache storage', extra={'spider': spider})
        self._spider = spider
        self._fingerprinter = spider.crawler.request_fingerprinter
        kvs_name = get_kvs_name(spider.name)

        async def open_kvs() -> KeyValueStore:
            config = Configuration.get_global_configuration()
            if config.is_at_home:
                storage_client = ApifyStorageClient.from_config(config)
                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
            return await KeyValueStore.open(name=kvs_name)

        logger.debug("Starting background thread for cache storage's event loop")
        self._async_thread = AsyncThread()
        logger.debug(f"Opening cache storage's {kvs_name!r} key value store")
        self._kvs = self._async_thread.run_coro(open_kvs())

    def close_spider(self, _: Spider, current_time: int | None = None) -> None:
        """Close the cache storage for a spider."""
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')

        logger.info(f'Cleaning up cache items (max {self._expiration_max_items})')
        if self._expiration_secs > 0:
            if current_time is None:
                current_time = int(time())

            async def expire_kvs() -> None:
                if self._kvs is None:
                    raise ValueError('Key value store not initialized')
                i = 0
                async for item in self._kvs.iterate_keys():
                    value = await self._kvs.get_value(item.key)
                    try:
                        gzip_time = read_gzip_time(value)
                    except Exception as e:
                        logger.warning(f'Malformed cache item {item.key}: {e}')
                        await self._kvs.set_value(item.key, None)
                    else:
                        if self._expiration_secs < current_time - gzip_time:
                            logger.debug(f'Expired cache item {item.key}')
                            await self._kvs.set_value(item.key, None)
                        else:
                            logger.debug(f'Valid cache item {item.key}')
                    if i == self._expiration_max_items:
                        break
                    i += 1

            self._async_thread.run_coro(expire_kvs())

        logger.debug('Closing cache storage')
        try:
            self._async_thread.close()
        except KeyboardInterrupt:
            logger.warning('Shutdown interrupted by KeyboardInterrupt!')
        except Exception:
            logger.exception('Exception occurred while shutting down cache storage')
        finally:
            logger.debug('Cache storage closed')

    def retrieve_response(self, _: Spider, request: Request, current_time: int | None = None) -> Response | None:
        """Retrieve a response from the cache storage."""
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')
        if self._kvs is None:
            raise ValueError('Key value store not initialized')
        if self._fingerprinter is None:
            raise ValueError('Request fingerprinter not initialized')

        key = self._fingerprinter.fingerprint(request).hex()
        value = self._async_thread.run_coro(self._kvs.get_value(key))

        if value is None:
            logger.debug('Cache miss', extra={'request': request})
            return None

        if current_time is None:
            current_time = int(time())
        if 0 < self._expiration_secs < current_time - read_gzip_time(value):
            logger.debug('Cache expired', extra={'request': request})
            return None

        data = from_gzip(value)
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url, body=body)

        logger.debug('Cache hit', extra={'request': request})
        return respcls(url=url, headers=headers, status=status, body=body)

    def store_response(self, _: Spider, request: Request, response: Response) -> None:
        """Store a response in the cache storage."""
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')
        if self._kvs is None:
            raise ValueError('Key value store not initialized')
        if self._fingerprinter is None:
            raise ValueError('Request fingerprinter not initialized')

        key = self._fingerprinter.fingerprint(request).hex()
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        value = to_gzip(data)
        self._async_thread.run_coro(self._kvs.set_value(key, value))


def to_gzip(data: dict, mtime: int | None = None) -> bytes:
    """Dump a dictionary to a gzip-compressed byte stream."""
    with io.BytesIO() as byte_stream:
        with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file:
            pickle.dump(data, gzip_file, protocol=4)
        return byte_stream.getvalue()


def from_gzip(gzip_bytes: bytes) -> dict:
    """Load a dictionary from a gzip-compressed byte stream."""
    with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='rb') as gzip_file:
        data: dict = pickle.load(gzip_file)
        return data


def read_gzip_time(gzip_bytes: bytes) -> int:
    """Read the modification time from a gzip-compressed byte stream without decompressing the data."""
    header = gzip_bytes[:10]
    header_components = struct.unpack('<HBBI2B', header)
    mtime: int = header_components[3]
    return mtime


def get_kvs_name(spider_name: str, max_length: int = 60) -> str:
    """Get the key value store name for a spider.

    The key value store name is derived from the spider name by replacing all special characters
    with hyphens and trimming leading and trailing hyphens. The resulting name is prefixed with
    'httpcache-' and truncated to the maximum length.

    The documentation
    [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
    mentions that names can be up to 63 characters long, so the default max length is set to 60.

    Such naming isn't unique per spider, but should be sufficiently unique for most use cases.
    The name of the key value store should indicate to which spider it belongs, e.g. in
    the listing in the Apify's console.

    Args:
        spider_name: Value of the Spider instance's name attribute.
        max_length: Maximum length of the key value store name.

    Returns: Key value store name.

    Raises:
        ValueError: If the spider name contains only special characters.
    """
    slug = re.sub(r'[^a-zA-Z0-9-]', '-', spider_name)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')
    if not slug:
        raise ValueError(f'Unsupported spider name: {spider_name!r} (slug: {slug!r})')
    return f'httpcache-{slug}'[:max_length]
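
The module-level helpers are self-contained and easy to sanity-check: `to_gzip` pickles a dict into a gzip stream whose header carries `mtime`, `read_gzip_time` reads that timestamp back from the fixed 10-byte gzip header without decompressing anything, and `from_gzip` restores the dict - which is exactly what the expiration logic in `close_spider` and `retrieve_response` relies on. A quick roundtrip, assuming the functions above:

```python
blob = to_gzip({'status': 200, 'body': b'<html/>'}, mtime=1_700_000_000)

# The write timestamp is recoverable from the gzip header alone...
assert read_gzip_time(blob) == 1_700_000_000

# ...and the payload survives the roundtrip.
assert from_gzip(blob) == {'status': 200, 'body': b'<html/>'}

# Key-value store names are slugified from the spider name.
assert get_kvs_name('My Spider!') == 'httpcache-My-Spider'
```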

src/apify/scrapy/requests.py
@@ -93,7 +93,7 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
     Returns:
         The converted Scrapy request.
     """
-    if not isinstance(cast(Any, apify_request), ApifyRequest):
+    if not isinstance(cast('Any', apify_request), ApifyRequest):
         raise TypeError('apify_request must be a crawlee.ScrapyRequest instance')

     logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...')

src/apify/scrapy/utils.py
@@ -44,6 +44,9 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
     settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750

+    # Set the default HTTPCache middleware storage backend to ApifyCacheStorage
+    settings['HTTPCACHE_STORAGE'] = 'apify.scrapy.extensions.ApifyCacheStorage'
+
     # Store the proxy configuration
     settings['APIFY_PROXY_SETTINGS'] = proxy_config

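
With this change, settings built through `apply_apify_settings` get the cache backend pre-wired, so only the `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` switches remain up to the user. A sketch (assuming `apply_apify_settings` returns the updated `Settings` object, as its usage here suggests):

```python
from apify.scrapy.utils import apply_apify_settings

settings = apply_apify_settings()
assert settings['HTTPCACHE_STORAGE'] == 'apify.scrapy.extensions.ApifyCacheStorage'
```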

src/apify/storages/_request_list.py
@@ -51,7 +51,7 @@ class RequestList(CrawleeRequestList):
         request_list_sources_input: list[dict[str, Any]] | None = None,
         http_client: HttpClient | None = None,
     ) -> RequestList:
-        """Creates RequestList from Actor input requestListSources.
+        """Initialize a new instance from request list source input.

         Args:
             name: Name of the returned RequestList.
@@ -108,9 +108,10 @@ class RequestList(CrawleeRequestList):

     @staticmethod
     async def _fetch_requests_from_url(
-        remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: HttpClient
+        remote_url_requests_inputs: list[_RequestsFromUrlInput],
+        http_client: HttpClient,
     ) -> list[Request]:
-        """Crete list of requests from url.
+        """Create list of requests from url.

         Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Run extracting
         callback on each response body and use URL_NO_COMMAS_REGEX regex to find all links. Create list of Requests from
@@ -119,7 +120,11 @@ class RequestList(CrawleeRequestList):
         created_requests: list[Request] = []

         def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
-            """Callback to scrape response body with regexp and create Requests from matches."""
+            """Extract links from response body and use them to create `Request` objects.
+
+            Use the regular expression to find all matching links in the response body, then create `Request`
+            objects from these links and the provided input attributes.
+            """
             matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
             created_requests.extend(
                 [

tests/integration/test_actor_scrapy.py
@@ -41,7 +41,8 @@ async def test_actor_scrapy_title_spider(

     items = await actor.last_run().dataset().list_items()

-    assert items.count >= 10
+    # CLOSESPIDER_PAGECOUNT is set to 10 in the spider settings.
+    assert items.count >= 9

     for item in items.items:
         assert 'url' in item

tests/unit/actor/test_actor_lifecycle.py
@@ -5,12 +5,11 @@ import contextlib
 import json
 import sys
 from datetime import datetime, timezone
-from typing import Any, Callable, cast
+from typing import TYPE_CHECKING, Any, Callable, cast
 from unittest.mock import AsyncMock, Mock

 import pytest
 import websockets.asyncio.server
-from lazy_object_proxy import Proxy

 from apify_shared.consts import ActorEnvVars, ApifyEnvVars
 from crawlee.events._types import Event, EventPersistStateData
@@ -19,12 +18,15 @@ import apify._actor
 from apify import Actor
 from apify._actor import _ActorType

+if TYPE_CHECKING:
+    from lazy_object_proxy import Proxy
+

 async def test_actor_properly_init_with_async() -> None:
     async with Actor:
-        assert cast(Proxy, apify._actor.Actor).__wrapped__ is not None
-        assert cast(Proxy, apify._actor.Actor).__wrapped__._is_initialized
-    assert not cast(Proxy, apify._actor.Actor).__wrapped__._is_initialized
+        assert cast('Proxy', apify._actor.Actor).__wrapped__ is not None
+        assert cast('Proxy', apify._actor.Actor).__wrapped__._is_initialized
+    assert not cast('Proxy', apify._actor.Actor).__wrapped__._is_initialized


 async def test_actor_init() -> None:

tests/unit/actor/test_configuration.py (new file)
@@ -0,0 +1,21 @@
import pytest

from apify import Configuration


@pytest.mark.parametrize(
    ('is_at_home', 'disable_browser_sandbox_in', 'disable_browser_sandbox_out'),
    [
        (False, False, False),
        (False, True, True),
        (True, False, True),
        (True, True, True),
    ],
)
def test_disable_browser_sandbox(
    *, is_at_home: bool, disable_browser_sandbox_in: bool, disable_browser_sandbox_out: bool
) -> None:
    assert (
        Configuration(is_at_home=is_at_home, disable_browser_sandbox=disable_browser_sandbox_in).disable_browser_sandbox
        == disable_browser_sandbox_out
    )