crawlee-1.0.0rc1.tar.gz → crawlee-1.0.1b1.tar.gz
This diff shows the changes between publicly available package versions as released to a supported registry, and is provided for informational purposes only.
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/build_and_deploy_docs.yaml +3 -3
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/check_pr_title.yaml +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/templates_e2e_tests.yaml +2 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/update_new_issue.yaml +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/CHANGELOG.md +15 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/PKG-INFO +12 -5
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/README.md +0 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_adaptive/handler.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_loaders/rl_basic_example.py +1 -0
- crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py +46 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_loaders/rl_tandem_example.py +13 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py +11 -0
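
  The new `rl_basic_example_with_persist.py` suggests `RequestList` gained a persistence option in this release. A minimal sketch of the basic pattern these `rl_*` examples build on (the persistence kwargs themselves are not visible in this listing, so they are omitted):

  ```python
  import asyncio

  from crawlee.request_loaders import RequestList


  async def main() -> None:
      # A static, in-memory list of start URLs; the new *_with_persist example
      # presumably layers state persistence on top of this same loop.
      request_list = RequestList(requests=['https://crawlee.dev', 'https://apify.com'])

      while not await request_list.is_finished():
          request = await request_list.fetch_next_request()
          if request is None:
              break
          print(f'Fetched: {request.url}')
          await request_list.mark_request_as_handled(request)


  if __name__ == '__main__':
      asyncio.run(main())
  ```
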
- crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_basic_example.py +30 -0
- crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py +45 -0
- crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py +53 -0
- crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py +54 -0
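
  These four new sitemap examples accompany the rewritten `_sitemap_request_loader.py` (see `src/crawlee/request_loaders/` below). A hedged sketch of the tandem pattern, assuming the constructor keeps the `sitemap_urls` and `http_client` arguments from the rc1 version:

  ```python
  import asyncio

  from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
  from crawlee.http_clients import ImpitHttpClient
  from crawlee.request_loaders import SitemapRequestLoader


  async def main() -> None:
      # Stream URLs discovered in a sitemap into a crawler. The tandem wraps
      # the read-only loader in a full request manager backed by a queue.
      sitemap_loader = SitemapRequestLoader(
          sitemap_urls=['https://crawlee.dev/sitemap.xml'],
          http_client=ImpitHttpClient(),
      )
      request_manager = await sitemap_loader.to_tandem()

      crawler = ParselCrawler(request_manager=request_manager, max_requests_per_crawl=10)

      @crawler.router.default_handler
      async def handler(context: ParselCrawlingContext) -> None:
          context.log.info(f'Processing {context.request.url}')

      await crawler.run()


  if __name__ == '__main__':
      asyncio.run(main())
  ```
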
- crawlee-1.0.1b1/docs/guides/code_examples/service_locator/service_storage_configuration.py +30 -0
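
  The reworked `service_storage_configuration.py` (+30 lines, replacing the 22-line rc1 version removed below) concerns configuring storages through the service locator. A minimal sketch of that mechanism as it exists in crawlee:

  ```python
  from crawlee import service_locator
  from crawlee.configuration import Configuration
  from crawlee.storage_clients import MemoryStorageClient


  def configure_services() -> None:
      # Register a global configuration and storage client before any storage
      # is opened; later Dataset.open() / RequestQueue.open() calls pick them up.
      service_locator.set_configuration(Configuration(purge_on_start=False))
      service_locator.set_storage_client(MemoryStorageClient())
  ```
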
- crawlee-1.0.1b1/docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py +12 -0
- crawlee-1.0.1b1/docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py +33 -0
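
  These two `sql_storage_client_*` examples document the new SQL storage backend (see the `_sql` package under `src/crawlee/storage_clients/` below). A hedged sketch, assuming the client is exported as `SqlStorageClient` and accepts a SQLAlchemy-style connection string with a local SQLite default:

  ```python
  import asyncio

  from crawlee.crawlers import ParselCrawler
  from crawlee.storage_clients import SqlStorageClient


  async def main() -> None:
      # Persist datasets, key-value stores and request queues in a SQL database
      # instead of the file system (the kwarg name is an assumption).
      async with SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db') as storage_client:
          crawler = ParselCrawler(storage_client=storage_client)
          await crawler.run(['https://crawlee.dev'])


  if __name__ == '__main__':
      asyncio.run(main())
  ```
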
- crawlee-1.0.1b1/docs/guides/code_examples/storages/opening.py +19 -0
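
  The new `storages/opening.py` example, together with the rewritten `_storage_instance_manager.py` further down, covers how storages are opened and cached. A short sketch of that open-and-cache behavior:

  ```python
  import asyncio

  from crawlee.storages import Dataset, KeyValueStore


  async def main() -> None:
      # Storages are opened by name and cached: opening the same name twice
      # yields the same instance, managed by the storage instance manager.
      dataset = await Dataset.open(name='products')
      same_dataset = await Dataset.open(name='products')
      assert dataset is same_dataset

      # The default (unnamed) key-value store.
      kvs = await KeyValueStore.open()
      await kvs.set_value('last-run', 'ok')


  if __name__ == '__main__':
      asyncio.run(main())
  ```
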
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/request_loaders.mdx +27 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/service_locator.mdx +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/storage_clients.mdx +188 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/storages.mdx +22 -9
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/03_adding_more_urls.mdx +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/upgrading/upgrading_to_v1.md +91 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/pyproject.toml +19 -10
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_autoscaling/snapshotter.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_request.py +2 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_service_locator.py +44 -24
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_types.py +76 -17
- crawlee-1.0.1b1/src/crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
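
  The new `raise_if_too_many_kwargs` helper is only 12 lines and its body is not shown here; judging by its use alongside the storages' `id`/`name`-style arguments, it is presumably a mutual-exclusivity guard along these lines (hypothetical reimplementation, not the actual source):

  ```python
  def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: object) -> None:
      # Hypothetical sketch: allow at most `max_kwargs` of the given keyword
      # arguments to be set (non-None) at the same time.
      defined = sorted(name for name, value in kwargs.items() if value is not None)
      if len(defined) > max_kwargs:
          raise ValueError(
              f'Expected at most {max_kwargs} of {sorted(kwargs)} to be set, '
              f'but got: {defined}'
          )


  # Example: a storage accepts either `id`, `name`, or `alias`, never several at once.
  raise_if_too_many_kwargs(id=None, name='products', alias=None)  # OK
  ```
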
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/sitemap.py +3 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/system.py +3 -3
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/configuration.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_basic/_basic_crawler.py +107 -27
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_basic/_logging_utils.py +5 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/events/_types.py +6 -6
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/_types.py +2 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/request_loaders/_request_list.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/request_loaders/_request_loader.py +5 -1
- crawlee-1.0.1b1/src/crawlee/request_loaders/_sitemap_request_loader.py +357 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/sessions/_models.py +2 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/statistics/_models.py +1 -1
- crawlee-1.0.1b1/src/crawlee/storage_clients/__init__.py +21 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_base/_storage_client.py +13 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_memory/_dataset_client.py +14 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/_db_models.py +269 -0
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
- crawlee-1.0.1b1/src/crawlee/storage_clients/_sql/_storage_client.py +282 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/models.py +10 -10
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storages/_base.py +3 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storages/_dataset.py +9 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storages/_key_value_store.py +9 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storages/_request_queue.py +7 -2
- crawlee-1.0.1b1/src/crawlee/storages/_storage_instance_manager.py +187 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_autoscaling/test_autoscaled_pool.py +4 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_autoscaling/test_snapshotter.py +6 -6
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_byte_size.py +2 -2
- crawlee-1.0.1b1/tests/unit/_utils/test_raise_if_too_many_kwargs.py +38 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/browsers/test_browser_pool.py +5 -5
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/browsers/test_playwright_browser_controller.py +2 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/browsers/test_playwright_browser_plugin.py +2 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/conftest.py +10 -15
- crawlee-1.0.1b1/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_adaptive_playwright/test_predictor.py +1 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_basic/test_basic_crawler.py +224 -13
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +110 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_http/test_http_crawler.py +7 -3
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_parsel/test_parsel_crawler.py +108 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_playwright/test_playwright_crawler.py +135 -1
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/events/test_event_manager.py +3 -3
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/fingerprint_suite/test_header_generator.py +2 -2
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/request_loaders/test_sitemap_request_loader.py +69 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/sessions/test_session_pool.py +5 -5
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +5 -17
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +3 -13
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +4 -10
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storage_clients/_memory/test_memory_dataset_client.py +0 -5
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +0 -4
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storage_clients/_memory/test_memory_rq_client.py +0 -5
- crawlee-1.0.1b1/tests/unit/storage_clients/_sql/test_sql_dataset_client.py +241 -0
- crawlee-1.0.1b1/tests/unit/storage_clients/_sql/test_sql_kvs_client.py +292 -0
- crawlee-1.0.1b1/tests/unit/storage_clients/_sql/test_sql_rq_client.py +244 -0
- crawlee-1.0.1b1/tests/unit/storages/conftest.py +21 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storages/test_dataset.py +511 -40
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storages/test_key_value_store.py +506 -41
- crawlee-1.0.1b1/tests/unit/storages/test_request_queue.py +1261 -0
- crawlee-1.0.1b1/tests/unit/storages/test_storage_instance_manager.py +143 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/test_service_locator.py +12 -16
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/uv.lock +482 -324
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/docusaurus.config.js +4 -4
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/package.json +12 -11
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/css/custom.css +4 -1
- crawlee-1.0.1b1/website/static/.nojekyll +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/yarn.lock +1483 -1102
- crawlee-1.0.0rc1/docs/guides/code_examples/request_loaders/sitemap_example.py +0 -28
- crawlee-1.0.0rc1/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py +0 -40
- crawlee-1.0.0rc1/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py +0 -43
- crawlee-1.0.0rc1/docs/guides/code_examples/service_locator/service_storage_configuration.py +0 -22
- crawlee-1.0.0rc1/src/crawlee/request_loaders/_sitemap_request_loader.py +0 -177
- crawlee-1.0.0rc1/src/crawlee/storage_clients/__init__.py +0 -9
- crawlee-1.0.0rc1/src/crawlee/storages/_storage_instance_manager.py +0 -133
- crawlee-1.0.0rc1/tests/unit/storages/test_request_queue.py +0 -644
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.editorconfig +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/CODEOWNERS +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/pull_request_template.md +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/pre_release.yaml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/release.yaml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/run_code_checks.yaml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.gitignore +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.markdownlint.yaml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.pre-commit-config.yaml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/CONTRIBUTING.md +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/LICENSE +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/Makefile +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/apify_platform.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/apify/crawler_as_actor_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/apify/get_public_url.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/apify/log_with_config_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/apify/proxy_advanced_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/apify/proxy_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/google/cloud_run_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/code_examples/google/google_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/google_cloud.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/deployment/google_cloud_run.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/add_data_to_dataset.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/beautifulsoup_crawler.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/capture_screenshot_using_playwright.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/capturing_page_snapshots_with_error_snapshotter.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/adaptive_playwright_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/add_data_to_dataset_bs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/add_data_to_dataset_dataset.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/add_data_to_dataset_pw.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/beautifulsoup_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/beautifulsoup_crawler_keep_alive.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/beautifulsoup_crawler_stop.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/capture_screenshot_using_playwright.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/configure_json_logging.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_all_links_on_website_bs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_all_links_on_website_pw.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_multiple_urls_bs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_multiple_urls_pw.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_specific_links_on_website_bs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_specific_links_on_website_pw.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_website_with_relative_links_all_links.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_website_with_relative_links_same_domain.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_website_with_relative_links_same_hostname.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/crawl_website_with_relative_links_same_origin.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/export_entire_dataset_to_file_csv.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/export_entire_dataset_to_file_json.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/fill_and_submit_web_form_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/fill_and_submit_web_form_request.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/parsel_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/parsel_crawler_with_error_snapshotter.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/playwright_block_requests.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/playwright_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/playwright_crawler_with_camoufox.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/playwright_crawler_with_error_snapshotter.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/respect_robots_on_skipped_request.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/respect_robots_txt_file.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/code_examples/resuming_paused_crawl.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/crawl_all_links_on_website.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/crawl_multiple_urls.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/crawl_specific_links_on_website.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/crawl_website_with_relative_links.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/crawler_keep_alive.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/crawler_stop.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/export_entire_dataset_to_file.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/fill_and_submit_web_form.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/json_logging.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/parsel_crawler.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/playwright_crawler.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/playwright_crawler_adaptive.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/playwright_crawler_with_block_requests.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/playwright_crawler_with_camoufox.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/playwright_crawler_with_fingerprint_generator.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/respect_robots_txt_file.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/examples/resuming_paused_crawl.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/architecture_overview.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/avoid_blocking.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/avoid_blocking/playwright_with_fingerprint_generator.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/creating_web_archive/manual_archiving_playwright_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/creating_web_archive/simple_pw_through_proxy_pywb_server.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/error_handling/change_handle_error_status.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/error_handling/disable_retry.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/error_handling/handle_proxy_error.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_clients/parsel_curl_impersonate_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_clients/parsel_httpx_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_clients/parsel_impit_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_crawlers/beautifulsoup_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_crawlers/custom_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_crawlers/http_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/http_crawlers/parsel_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/login_crawler/http_login.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/login_crawler/playwright_login.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler/browser_configuration_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler/multiple_launch_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler/plugin_browser_configuration_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_adaptive/init_beautifulsoup.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_adaptive/init_parsel.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_adaptive/init_prediction.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_adaptive/pre_nav_hooks.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_stagehand/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_stagehand/browser_classes.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_stagehand/stagehand_run.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_stagehand/support_classes.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/inspecting_bs_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/inspecting_pw_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/integration_bs_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/integration_pw_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/quick_start_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/session_bs_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/session_pw_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/tiers_bs_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/proxy_management/tiers_pw_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/adaptive_crawler_handlers.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/basic_request_handlers.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/custom_router_default_only.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/error_handler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/failed_request_handler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/http_pre_navigation.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/playwright_pre_navigation.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_router/simple_default_handler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/running_in_web_server/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/running_in_web_server/crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/running_in_web_server/server.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/scaling_crawlers/max_tasks_per_minute_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/scaling_crawlers/min_and_max_concurrency_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_conflicts.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_crawler_configuration.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_crawler_event_manager.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_crawler_storage_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_locator_configuration.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_locator_event_manager.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_locator_storage_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/service_locator/service_storage_storage_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/multi_sessions_http.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/one_session_http.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/sm_basic.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/sm_beautifulsoup.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/sm_http.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/sm_parsel.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/sm_playwright.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/session_management/sm_standalone.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storage_clients/custom_storage_client_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storage_clients/file_system_storage_client_basic_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storage_clients/file_system_storage_client_configuration_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storage_clients/memory_storage_client_basic_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storage_clients/registering_storage_clients_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/cleaning_do_not_purge_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/cleaning_purge_explicitly_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/dataset_basic_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/dataset_with_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/dataset_with_crawler_explicit_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/helper_add_requests_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/helper_enqueue_links_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/kvs_basic_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/kvs_with_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/kvs_with_crawler_explicit_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/rq_basic_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/rq_with_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/storages/rq_with_crawler_explicit_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/trace_and_monitor_crawlers/instrument_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/crawler_login.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/creating_web_archive.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/error_handling.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/http_clients.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/http_crawlers.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/playwright_crawler.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/playwright_crawler_adaptive.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/playwright_crawler_stagehand.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/proxy_management.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/request_router.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/running_in_web_server.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/scaling_crawlers.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/session_management.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/trace_and_monitor_crawlers.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/01_setting_up.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/02_first_crawler.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/04_real_world_project.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/05_crawling.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/06_scraping.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/07_saving_data.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/08_refactoring.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/09_running_in_cloud.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/02_bs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/02_bs_better.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/02_request_queue.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/03_enqueue_strategy.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/03_finding_new_links.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/03_globs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/03_original_code.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/03_transform_request.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/04_sanity_check.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/05_crawling_detail.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/05_crawling_listing.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/06_scraping.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/07_final_code.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/07_first_code.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/08_main.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/08_routes.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/09_apify_sdk.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/code_examples/routes.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/introduction/index.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/pyproject.toml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/quick-start/code_examples/beautifulsoup_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/quick-start/code_examples/parsel_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/quick-start/code_examples/playwright_crawler_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/quick-start/code_examples/playwright_crawler_headful_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/quick-start/index.mdx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/upgrading/upgrading_to_v0x.md +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/renovate.json +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_autoscaling/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_autoscaling/_types.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_autoscaling/autoscaled_pool.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_autoscaling/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_autoscaling/system_status.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_browserforge_workaround.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_cli.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_consts.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_log_config.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/blocked.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/byte_size.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/console.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/crypto.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/docs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/file.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/globs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/html_to_text.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/models.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/recoverable_state.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/recurring_task.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/requests.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/robots.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/time.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/try_import.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/urls.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/wait.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/_utils/web.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_browser_controller.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_browser_plugin.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_browser_pool.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_playwright_browser.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_playwright_browser_controller.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_playwright_browser_plugin.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/_types.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/browsers/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_abstract_http/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_abstract_http/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/_result_comparator.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_adaptive_playwright/_utils.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_basic/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_basic/_basic_crawling_context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_basic/_context_pipeline.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_basic/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_beautifulsoup/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawling_context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_beautifulsoup/_utils.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_beautifulsoup/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_http/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_http/_http_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_http/_http_parser.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_parsel/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_parsel/_parsel_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_parsel/_parsel_crawling_context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_parsel/_parsel_parser.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_parsel/_utils.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/_playwright_crawling_context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/_playwright_http_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/_types.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_playwright/_utils.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/_types.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/crawlers/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/errors.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/events/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/events/_event_manager.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/events/_local_event_manager.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/events/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/_browserforge_adapter.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/_consts.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/_header_generator.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/fingerprint_suite/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/http_clients/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/http_clients/_base.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/http_clients/_curl_impersonate.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/http_clients/_httpx.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/http_clients/_impit.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/otel/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/otel/crawler_instrumentor.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/cookiecutter.json +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/hooks/post_gen_project.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/hooks/pre_gen_project.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/main.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/main_beautifulsoup.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/main_parsel.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/main_playwright.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/main_playwright_camoufox.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/routes_beautifulsoup.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/routes_camoufox.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/routes_parsel.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/routes_playwright.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/templates/routes_playwright_camoufox.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/.dockerignore +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/README.md +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/project_template/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/proxy_configuration.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/request_loaders/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/request_loaders/_request_manager.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/request_loaders/_request_manager_tandem.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/router.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/sessions/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/sessions/_cookies.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/sessions/_session.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/sessions/_session_pool.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/sessions/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/statistics/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/statistics/_error_snapshotter.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/statistics/_error_tracker.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/statistics/_statistics.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_base/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_base/_dataset_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_base/_key_value_store_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_base/_request_queue_client.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_base/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_file_system/__init__.py +0 -0
- /crawlee-1.0.0rc1/src/crawlee/storage_clients/_file_system/py.typed → /crawlee-1.0.1b1/src/crawlee/storage_clients/_file_system/_utils.py +0 -0
- {crawlee-1.0.0rc1/src/crawlee/storage_clients/_memory → crawlee-1.0.1b1/src/crawlee/storage_clients/_file_system}/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storage_clients/_memory/__init__.py +0 -0
- {crawlee-1.0.0rc1/src/crawlee/storage_clients → crawlee-1.0.1b1/src/crawlee/storage_clients/_memory}/py.typed +0 -0
- {crawlee-1.0.0rc1/src/crawlee/storages → crawlee-1.0.1b1/src/crawlee/storage_clients/_sql}/py.typed +0 -0
- /crawlee-1.0.0rc1/tests/__init__.py → /crawlee-1.0.1b1/src/crawlee/storage_clients/py.typed +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/src/crawlee/storages/__init__.py +0 -0
- /crawlee-1.0.0rc1/tests/e2e/__init__.py → /crawlee-1.0.1b1/src/crawlee/storages/py.typed +0 -0
- {crawlee-1.0.0rc1/tests/unit → crawlee-1.0.1b1/tests}/__init__.py +0 -0
- /crawlee-1.0.0rc1/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawling_context.py → /crawlee-1.0.1b1/tests/e2e/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/e2e/conftest.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/e2e/project_template/test_static_crawlers_templates.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/e2e/project_template/utils.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/README.md +0 -0
- /crawlee-1.0.0rc1/website/static/.nojekyll → /crawlee-1.0.1b1/tests/unit/__init__.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_autoscaling/test_system_status.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_statistics/test_error_tracker.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_statistics/test_periodic_logging.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_statistics/test_persistence.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_statistics/test_request_processing_record.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_console.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_crypto.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_file.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_globs.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_html_to_text.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_measure_time.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_recurring_task.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_requests.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_robots.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_sitemap.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_system.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_timedelata_ms.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/_utils/test_urls.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/browsers/test_playwright_browser.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler_statistics.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/crawlers/_basic/test_context_pipeline.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/events/test_local_event_manager.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/fingerprint_suite/test_adapters.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/http_clients/test_http_clients.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/http_clients/test_httpx.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/otel/test_crawler_instrumentor.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/proxy_configuration/test_new_proxy_info.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/proxy_configuration/test_tiers.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/request_loaders/test_request_list.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/server.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/server_endpoints.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/sessions/test_cookies.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/sessions/test_models.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/sessions/test_session.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/storages/test_request_manager_tandem.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/test_cli.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/test_configuration.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/test_log_config.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/tests/unit/test_router.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/.eslintrc.json +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/.yarnrc.yml +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/babel.config.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/build_api_reference.sh +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/generate_module_shortcuts.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/patches/@docusaurus+core+3.4.0.patch +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/patches/@docusaurus+core+3.5.2.patch +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/roa-loader/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/roa-loader/package.json +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/sidebars.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/ApiLink.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Button.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Button.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/CopyButton.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/CopyButton.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Gradients.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Highlights.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Highlights.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/HomepageCliExample.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/HomepageCliExample.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/HomepageCtaSection.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/HomepageCtaSection.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/HomepageHeroSection.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/HomepageHeroSection.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/LanguageInfoWidget.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/LanguageInfoWidget.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/LanguageSwitch.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/LanguageSwitch.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/RiverSection.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/RiverSection.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/ThreeCardsWithIcon.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/ThreeCardsWithIcon.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/animated-crawlee-logo-dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/Homepage/animated-crawlee-logo-light.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/RunnableCodeBlock.jsx +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/components/RunnableCodeBlock.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/pages/home_page_example.py +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/pages/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/pages/index.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/ColorModeToggle/dark-mode-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/ColorModeToggle/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/ColorModeToggle/light-mode-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/ColorModeToggle/styles.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/DocItem/Layout/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/DocItem/Layout/styles.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Footer/LinkItem/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Footer/LinkItem/index.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Footer/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Footer/index.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/MDXComponents/A.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/Content/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/Content/styles.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/Logo/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/Logo/index.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/MobileSidebar/Header/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/MobileSidebar/Header/index.module.css +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/MobileSidebar/Layout/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/MobileSidebar/PrimaryMenu/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/Navbar/MobileSidebar/index.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/src/theme/NavbarItem/ComponentTypes.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/font/lota.woff +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/font/lota.woff2 +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/API.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/apify_logo.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/apify_og_SDK.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/apify_sdk.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/apify_sdk_white.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/arrow_right.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/auto-scaling-dark.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/auto-scaling-light.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/check.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/chrome-scrape-dark.gif +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/chrome-scrape-light.gif +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/cloud_icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/community-dark-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/community-light-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-dark-new.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-javascript-dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-javascript-light.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-light-new.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-light.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-logo-monocolor.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-logo.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-python-dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-python-light.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/crawlee-python-og.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/defaults-dark-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/defaults-light-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/discord-brand-dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/discord-brand.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/docusaurus.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/external-link.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/favicon.ico +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/favorite-tools-dark.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/favorite-tools-light.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/auto-scaling.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/automate-everything.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/fingerprints.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/node-requests.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/runs-on-py.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/storage.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/features/works-everywhere.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/fill-and-submit-web-form/00.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/fill-and-submit-web-form/01.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/fill-and-submit-web-form/02.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/fill-and-submit-web-form/03.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/getting-started/current-price.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/getting-started/scraping-practice.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/getting-started/select-an-element.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/getting-started/selected-element.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/getting-started/sku.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/getting-started/title.jpg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/github-brand-dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/github-brand.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/guides/jaeger_otel_search_view_example.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/guides/jaeger_otel_trace_example.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/hearth copy.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/hearth.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/javascript_logo.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/js_file.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/logo-big.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/logo-blur.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/logo-blur.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/logo-zoom.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/menu-arrows.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/oss_logo.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/puppeteer-live-view-dashboard.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/puppeteer-live-view-detail.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/queue-dark-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/queue-light-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/resuming-paused-crawl/00.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/resuming-paused-crawl/01.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/robot.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/routing-dark-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/routing-light-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/scraping-utils-dark-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/scraping-utils-light-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/smart-proxy-dark.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/smart-proxy-light.webp +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/source_code.png +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/system.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/triangles_dark.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/triangles_light.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/workflow.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/zero-setup-dark-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/img/zero-setup-light-icon.svg +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/js/custom.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/static/robots.txt +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/docs-prettier.config.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/utils/externalLink.js +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/website_gif/chrome-scrape-dark.gif +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/website_gif/chrome-scrape-dark.mp4 +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/website_gif/chrome-scrape-light.gif +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/website_gif/chrome-scrape-light.mp4 +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tools/website_gif/website_gif.mjs +0 -0
- {crawlee-1.0.0rc1 → crawlee-1.0.1b1}/website/tsconfig.eslint.json +0 -0
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/build_and_deploy_docs.yaml
RENAMED
@@ -30,12 +30,12 @@ jobs:
           ref: ${{ github.event_name == 'workflow_call' && inputs.ref || github.ref }}

       - name: Set up Node
-        uses: actions/setup-node@
+        uses: actions/setup-node@v5
         with:
           node-version: ${{ env.NODE_VERSION }}

       - name: Set up Python
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ env.PYTHON_VERSION }}

@@ -59,7 +59,7 @@ jobs:
         uses: actions/configure-pages@v5

       - name: Upload GitHub Pages artifact
-        uses: actions/upload-pages-artifact@
+        uses: actions/upload-pages-artifact@v4
         with:
           path: ./website/build

{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/.github/workflows/templates_e2e_tests.yaml
RENAMED
@@ -27,7 +27,7 @@ jobs:
         uses: actions/checkout@v5

       - name: Setup node
-        uses: actions/setup-node@
+        uses: actions/setup-node@v5
         with:
           node-version: ${{ env.NODE_VERSION }}

@@ -35,7 +35,7 @@ jobs:
         run: npm install -g apify-cli

       - name: Set up Python ${{ env.PYTHON_VERSION }}
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ env.PYTHON_VERSION }}
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/CHANGELOG.md
RENAMED
@@ -2,7 +2,12 @@

 All notable changes to this project will be documented in this file.

-
+<!-- git-cliff-unreleased-start -->
+## 1.0.1 - **not yet released**
+
+
+<!-- git-cliff-unreleased-end -->
+## [1.0.0](https://github.com/apify/crawlee-python/releases/tag/v1.0.0) (2025-09-29)

 ### 🚀 Features

@@ -17,6 +22,10 @@ All notable changes to this project will be documented in this file.
 - Add `impit` option for Crawlee CLI ([#1312](https://github.com/apify/crawlee-python/pull/1312)) ([508d7ce](https://github.com/apify/crawlee-python/commit/508d7ce4d998f37ab2adcf9c057c3c635a69f863)) by [@Mantisus](https://github.com/Mantisus)
 - Persist RequestList state ([#1274](https://github.com/apify/crawlee-python/pull/1274)) ([cc68014](https://github.com/apify/crawlee-python/commit/cc680147ba3cc8b35b9da70274e53e6f5dd92434)) by [@janbuchar](https://github.com/janbuchar), closes [#99](https://github.com/apify/crawlee-python/issues/99)
 - Persist `DefaultRenderingTypePredictor` state ([#1340](https://github.com/apify/crawlee-python/pull/1340)) ([fad4c25](https://github.com/apify/crawlee-python/commit/fad4c25fc712915c4a45b24e3290b6f5dbd8a683)) by [@Mantisus](https://github.com/Mantisus), closes [#1272](https://github.com/apify/crawlee-python/issues/1272)
+- Persist the `SitemapRequestLoader` state ([#1347](https://github.com/apify/crawlee-python/pull/1347)) ([27ef9ad](https://github.com/apify/crawlee-python/commit/27ef9ad194552ea9f1321d91a7a52054be9a8a51)) by [@Mantisus](https://github.com/Mantisus), closes [#1269](https://github.com/apify/crawlee-python/issues/1269)
+- Add support for NDU storages ([#1401](https://github.com/apify/crawlee-python/pull/1401)) ([5dbd212](https://github.com/apify/crawlee-python/commit/5dbd212663e7abc37535713f4c6e3a5bbf30a12e)) by [@vdusek](https://github.com/vdusek), closes [#1175](https://github.com/apify/crawlee-python/issues/1175)
+- Add RQ id, name, alias args to `add_requests` and `enqueue_links` methods ([#1413](https://github.com/apify/crawlee-python/pull/1413)) ([1cae2bc](https://github.com/apify/crawlee-python/commit/1cae2bca0b1508fcb3cb419dc239caf33e20a7ef)) by [@Mantisus](https://github.com/Mantisus), closes [#1402](https://github.com/apify/crawlee-python/issues/1402)
+- Add `SqlStorageClient` based on `sqlalchemy` v2+ ([#1339](https://github.com/apify/crawlee-python/pull/1339)) ([07c75a0](https://github.com/apify/crawlee-python/commit/07c75a078b443b58bfaaeb72eb2aa1439458dc47)) by [@Mantisus](https://github.com/Mantisus), closes [#307](https://github.com/apify/crawlee-python/issues/307)

 ### 🐛 Bug Fixes

@@ -27,6 +36,9 @@ All notable changes to this project will be documented in this file.
 - Fix `timeout` for `stream` method in `ImpitHttpClient` ([#1352](https://github.com/apify/crawlee-python/pull/1352)) ([54b693b](https://github.com/apify/crawlee-python/commit/54b693b838f135a596e1e9493b565bc558b19a3a)) by [@Mantisus](https://github.com/Mantisus)
 - Include reason in the session rotation warning logs ([#1363](https://github.com/apify/crawlee-python/pull/1363)) ([d6d7a45](https://github.com/apify/crawlee-python/commit/d6d7a45dd64a906419d9552c45062d726cbb1a0f)) by [@vdusek](https://github.com/vdusek), closes [#1318](https://github.com/apify/crawlee-python/issues/1318)
 - Improve crawler statistics logging ([#1364](https://github.com/apify/crawlee-python/pull/1364)) ([1eb6da5](https://github.com/apify/crawlee-python/commit/1eb6da5dd85870124593dcad877284ccaed9c0ce)) by [@vdusek](https://github.com/vdusek), closes [#1317](https://github.com/apify/crawlee-python/issues/1317)
+- Do not add a request that is already in progress to `MemoryRequestQueueClient` ([#1384](https://github.com/apify/crawlee-python/pull/1384)) ([3af326c](https://github.com/apify/crawlee-python/commit/3af326c9dfa8fffd56a42ca42981374613739e39)) by [@Mantisus](https://github.com/Mantisus), closes [#1383](https://github.com/apify/crawlee-python/issues/1383)
+- Save `RequestQueueState` for `FileSystemRequestQueueClient` in default KVS ([#1411](https://github.com/apify/crawlee-python/pull/1411)) ([6ee60a0](https://github.com/apify/crawlee-python/commit/6ee60a08ac1f9414e1b792f4935cc3799cb5089a)) by [@Mantisus](https://github.com/Mantisus), closes [#1410](https://github.com/apify/crawlee-python/issues/1410)
+- Set default desired concurrency for non-browser crawlers to 10 ([#1419](https://github.com/apify/crawlee-python/pull/1419)) ([1cc9401](https://github.com/apify/crawlee-python/commit/1cc940197600d2539bda967880d7f9d241eb8c3e)) by [@vdusek](https://github.com/vdusek)

 ### Refactor

@@ -36,6 +48,8 @@ All notable changes to this project will be documented in this file.
 - [**breaking**] Replace `HttpxHttpClient` with `ImpitHttpClient` as default HTTP client ([#1307](https://github.com/apify/crawlee-python/pull/1307)) ([c803a97](https://github.com/apify/crawlee-python/commit/c803a976776a76846866d533e3a3ee8144e248c4)) by [@Mantisus](https://github.com/Mantisus), closes [#1079](https://github.com/apify/crawlee-python/issues/1079)
 - [**breaking**] Change Dataset unwind parameter to accept list of strings ([#1357](https://github.com/apify/crawlee-python/pull/1357)) ([862a203](https://github.com/apify/crawlee-python/commit/862a20398f00fe91802fe7a1ccd58b05aee053a1)) by [@vdusek](https://github.com/vdusek)
 - [**breaking**] Remove `Request.id` field ([#1366](https://github.com/apify/crawlee-python/pull/1366)) ([32f3580](https://github.com/apify/crawlee-python/commit/32f3580e9775a871924ab1233085d0c549c4cd52)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1358](https://github.com/apify/crawlee-python/issues/1358)
+- [**breaking**] Refactor storage creation and caching, configuration and services ([#1386](https://github.com/apify/crawlee-python/pull/1386)) ([04649bd](https://github.com/apify/crawlee-python/commit/04649bde60d46b2bc18ae4f6e3fd9667d02a9cef)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1379](https://github.com/apify/crawlee-python/issues/1379)
+


 ## [0.6.12](https://github.com/apify/crawlee-python/releases/tag/v0.6.12) (2025-07-30)
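The last refactor group above swaps the default HTTP client. An illustrative migration sketch (not part of the package diff) for crawlers that want to keep the previous httpx-based behavior, assuming `HttpxHttpClient` remains importable from `crawlee.http_clients` and the `httpx` extra is installed:

```python
import asyncio

from crawlee.crawlers import ParselCrawler
from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # Opt back into the httpx-based client instead of the new
    # ImpitHttpClient default (assumes the `httpx` extra is installed).
    crawler = ParselCrawler(http_client=HttpxHttpClient())
    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```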
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.0.
+Version: 1.0.1b1
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog

@@ -227,12 +227,12 @@ Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.10
 Requires-Dist: cachetools>=5.5.0
 Requires-Dist: colorama>=0.4.0
-Requires-Dist: impit>=0.
+Requires-Dist: impit>=0.6.1
 Requires-Dist: more-itertools>=10.2.0
 Requires-Dist: protego>=0.5.0
 Requires-Dist: psutil>=6.0.0
-Requires-Dist: pydantic!=2.10.0,!=2.10.1,!=2.10.2,>=2.8.0
 Requires-Dist: pydantic-settings!=2.7.0,!=2.7.1,!=2.8.0,>=2.2.0
+Requires-Dist: pydantic>=2.11.0
 Requires-Dist: pyee>=9.0.0
 Requires-Dist: tldextract>=5.1.0
 Requires-Dist: typing-extensions>=4.1.0

@@ -244,7 +244,9 @@ Requires-Dist: jaro-winkler>=2.0.3; extra == 'adaptive-crawler'
 Requires-Dist: playwright>=1.27.0; extra == 'adaptive-crawler'
 Requires-Dist: scikit-learn>=1.6.0; extra == 'adaptive-crawler'
 Provides-Extra: all
+Requires-Dist: aiosqlite>=0.21.0; extra == 'all'
 Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'all'
+Requires-Dist: asyncpg>=0.24.0; extra == 'all'
 Requires-Dist: beautifulsoup4[lxml]>=4.12.0; extra == 'all'
 Requires-Dist: browserforge>=1.2.3; extra == 'all'
 Requires-Dist: cookiecutter>=2.6.0; extra == 'all'

@@ -263,6 +265,7 @@ Requires-Dist: parsel>=1.10.0; extra == 'all'
 Requires-Dist: playwright>=1.27.0; extra == 'all'
 Requires-Dist: rich>=13.9.0; extra == 'all'
 Requires-Dist: scikit-learn>=1.6.0; extra == 'all'
+Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'all'
 Requires-Dist: typer>=0.12.0; extra == 'all'
 Requires-Dist: wrapt>=1.17.0; extra == 'all'
 Provides-Extra: beautifulsoup

@@ -293,6 +296,12 @@ Provides-Extra: playwright
 Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'playwright'
 Requires-Dist: browserforge>=1.2.3; extra == 'playwright'
 Requires-Dist: playwright>=1.27.0; extra == 'playwright'
+Provides-Extra: sql-postgres
+Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
+Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'
+Provides-Extra: sql-sqlite
+Requires-Dist: aiosqlite>=0.21.0; extra == 'sql-sqlite'
+Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-sqlite'
 Description-Content-Type: text/markdown

 <h1 align="center">

@@ -327,8 +336,6 @@ Description-Content-Type: text/markdown

 Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**

-> 🚀 Crawlee for Python is open to early adopters!
-
 Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.

 > 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈
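The new `sql-postgres` and `sql-sqlite` extras above gate the `SqlStorageClient` backend. A minimal sketch (not part of the package diff) of what they unlock, assuming one of them is installed, e.g. `pip install 'crawlee[sql-sqlite]'`:

```python
import asyncio

from crawlee.storage_clients import SqlStorageClient


async def main() -> None:
    # With no arguments, this is expected to fall back to a local SQLite
    # database file, as in the bundled sql_storage_client_basic_example.py.
    async with SqlStorageClient() as storage_client:
        print(f'Using storage client: {storage_client!r}')


if __name__ == '__main__':
    asyncio.run(main())
```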
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/README.md
RENAMED
@@ -30,8 +30,6 @@

 Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**

-> 🚀 Crawlee for Python is open to early adopters!
-
 Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.

 > 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/playwright_crawler_adaptive/handler.py
RENAMED
@@ -5,7 +5,7 @@ from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawli


 async def main() -> None:
-    crawler = AdaptivePlaywrightCrawler.
+    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

     @crawler.router.default_handler
     async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_loaders/rl_basic_example.py
RENAMED
@@ -18,6 +18,7 @@ async def main() -> None:
     # Fetch and process requests from the queue.
     while request := await request_list.fetch_next_request():
         # Do something with it...
+        print(f'Processing {request.url}')

         # And mark it as handled.
         await request_list.mark_request_as_handled(request)
crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/rl_basic_example_with_persist.py
ADDED
@@ -0,0 +1,46 @@
+import asyncio
+import logging
+
+from crawlee import service_locator
+from crawlee.request_loaders import RequestList
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')
+logger = logging.getLogger(__name__)
+
+
+# Disable clearing the `KeyValueStore` on each run.
+# This is necessary so that the state keys are not cleared at startup.
+# The recommended way to achieve this behavior is setting the environment variable
+# `CRAWLEE_PURGE_ON_START=0`
+configuration = service_locator.get_configuration()
+configuration.purge_on_start = False
+
+
+async def main() -> None:
+    # Open the request list, if it does not exist, it will be created.
+    # Leave name empty to use the default request list.
+    request_list = RequestList(
+        name='my-request-list',
+        requests=[
+            'https://apify.com/',
+            'https://crawlee.dev/',
+            'https://crawlee.dev/python/',
+        ],
+        # Enable persistence
+        persist_state_key='my-persist-state',
+        persist_requests_key='my-persist-requests',
+    )
+
+    # We receive only one request.
+    # Each time you run it, it will be a new request until you exhaust the `RequestList`.
+    request = await request_list.fetch_next_request()
+    if request:
+        logger.info(f'Processing request: {request.url}')
+        # Do something with it...
+
+        # And mark it as handled.
+        await request_list.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_loaders/rl_tandem_example.py
RENAMED
@@ -8,9 +8,11 @@ async def main() -> None:
     # Create a static request list.
     request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])

+    # highlight-start
     # Convert the request list to a request manager using the to_tandem method.
     # It is a tandem with the default request queue.
     request_manager = await request_list.to_tandem()
+    # highlight-end

     # Create a crawler and pass the request manager to it.
     crawler = ParselCrawler(

@@ -20,9 +22,20 @@ async def main() -> None:

     @crawler.router.default_handler
     async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
         # New links will be enqueued directly to the queue.
         await context.enqueue_links()

+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
     await crawler.run()

{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py
RENAMED
@@ -23,9 +23,20 @@ async def main() -> None:

     @crawler.router.default_handler
     async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
         # New links will be enqueued directly to the queue.
         await context.enqueue_links()

+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
     await crawler.run()

crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_basic_example.py
ADDED
@@ -0,0 +1,30 @@
+import asyncio
+import re
+
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+async def main() -> None:
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # We work with the loader until we process all relevant links from the sitemap.
+    while request := await sitemap_loader.fetch_next_request():
+        # Do something with it...
+        print(f'Processing {request.url}')
+
+        # And mark it as handled.
+        await sitemap_loader.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_example_with_persist.py
ADDED
@@ -0,0 +1,45 @@
+import asyncio
+import logging
+
+from crawlee import service_locator
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s-%(levelname)s-%(message)s')
+logger = logging.getLogger(__name__)
+
+
+# Disable clearing the `KeyValueStore` on each run.
+# This is necessary so that the state keys are not cleared at startup.
+# The recommended way to achieve this behavior is setting the environment variable
+# `CRAWLEE_PURGE_ON_START=0`
+configuration = service_locator.get_configuration()
+configuration.purge_on_start = False
+
+
+async def main() -> None:
+    # Create an HTTP client for fetching sitemaps
+    # Use the context manager for `SitemapRequestLoader` to correctly save the state when
+    # the work is completed.
+    async with (
+        ImpitHttpClient() as http_client,
+        SitemapRequestLoader(
+            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+            http_client=http_client,
+            # Enable persistence
+            persist_state_key='my-persist-state',
+        ) as sitemap_loader,
+    ):
+        # We receive only one request.
+        # Each time you run it, it will be a new request until you exhaust the sitemap.
+        request = await sitemap_loader.fetch_next_request()
+        if request:
+            logger.info(f'Processing request: {request.url}')
+            # Do something with it...
+
+            # And mark it as handled.
+            await sitemap_loader.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_tandem_example.py
ADDED
@@ -0,0 +1,53 @@
+import asyncio
+import re
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+async def main() -> None:
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # highlight-start
+    # Convert the sitemap loader into a request manager linked
+    # to the default request queue.
+    request_manager = await sitemap_loader.to_tandem()
+    # highlight-end
+
+    # Create a crawler and pass the request manager to it.
+    crawler = ParselCrawler(
+        request_manager=request_manager,
+        max_requests_per_crawl=10,  # Limit the max requests per crawl.
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
+        # New links will be enqueued directly to the queue.
+        await context.enqueue_links()
+
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
+    await crawler.run()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
crawlee-1.0.1b1/docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py
ADDED
@@ -0,0 +1,54 @@
+import asyncio
+import re
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
+from crawlee.storages import RequestQueue
+
+
+async def main() -> None:
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # Open the default request queue.
+    request_queue = await RequestQueue.open()
+
+    # And combine them together to a single request manager.
+    request_manager = RequestManagerTandem(sitemap_loader, request_queue)
+
+    # Create a crawler and pass the request manager to it.
+    crawler = ParselCrawler(
+        request_manager=request_manager,
+        max_requests_per_crawl=10,  # Limit the max requests per crawl.
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
+        # New links will be enqueued directly to the queue.
+        await context.enqueue_links()
+
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
+    await crawler.run()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
crawlee-1.0.1b1/docs/guides/code_examples/service_locator/service_storage_configuration.py
ADDED
@@ -0,0 +1,30 @@
+import asyncio
+from datetime import timedelta
+
+from crawlee import service_locator
+from crawlee.configuration import Configuration
+from crawlee.storage_clients import MemoryStorageClient
+from crawlee.storages import Dataset
+
+
+async def main() -> None:
+    configuration = Configuration(
+        log_level='DEBUG',
+        headless=False,
+        persist_state_interval=timedelta(seconds=30),
+    )
+    # Set the custom configuration as the global default configuration.
+    service_locator.set_configuration(configuration)
+
+    # Use the global defaults when creating the dataset (or other storage).
+    dataset_1 = await Dataset.open()
+
+    # Or set explicitly specific configuration if
+    # you do not want to rely on global defaults.
+    dataset_2 = await Dataset.open(
+        storage_client=MemoryStorageClient(), configuration=configuration
+    )
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
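A short companion sketch (not part of the package diff) to the example above: once a configuration is registered through the service locator, other parts of the code can read the same instance back. Both calls appear verbatim in the examples in this diff.

```python
from crawlee import service_locator
from crawlee.configuration import Configuration

# Register a custom configuration as the global default...
service_locator.set_configuration(Configuration(purge_on_start=False))

# ...and read the very same instance back elsewhere in the code base.
configuration = service_locator.get_configuration()
assert configuration.purge_on_start is False
```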
crawlee-1.0.1b1/docs/guides/code_examples/storage_clients/sql_storage_client_basic_example.py
ADDED
@@ -0,0 +1,12 @@
+from crawlee.crawlers import ParselCrawler
+from crawlee.storage_clients import SqlStorageClient
+
+
+async def main() -> None:
+    # Create a new instance of storage client.
+    # This will create an SQLite database file crawlee.db or created tables in your
+    # database if you pass `connection_string` or `engine`
+    # Use the context manager to ensure that connections are properly cleaned up.
+    async with SqlStorageClient() as storage_client:
+        # And pass it to the crawler.
+        crawler = ParselCrawler(storage_client=storage_client)
crawlee-1.0.1b1/docs/guides/code_examples/storage_clients/sql_storage_client_configuration_example.py
ADDED
@@ -0,0 +1,33 @@
+from sqlalchemy.ext.asyncio import create_async_engine
+
+from crawlee.configuration import Configuration
+from crawlee.crawlers import ParselCrawler
+from crawlee.storage_clients import SqlStorageClient
+
+
+async def main() -> None:
+    # Create a new instance of storage client.
+    # On first run, also creates tables in your PostgreSQL database.
+    # Use the context manager to ensure that connections are properly cleaned up.
+    async with SqlStorageClient(
+        # Create an `engine` with the desired configuration
+        engine=create_async_engine(
+            'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres',
+            future=True,
+            pool_size=5,
+            max_overflow=10,
+            pool_recycle=3600,
+            pool_pre_ping=True,
+            echo=False,
+        )
+    ) as storage_client:
+        # Create a configuration with custom settings.
+        configuration = Configuration(
+            purge_on_start=False,
+        )
+
+        # And pass them to the crawler.
+        crawler = ParselCrawler(
+            storage_client=storage_client,
+            configuration=configuration,
+        )
crawlee-1.0.1b1/docs/guides/code_examples/storages/opening.py
ADDED
@@ -0,0 +1,19 @@
+import asyncio
+
+from crawlee.storages import Dataset
+
+
+async def main() -> None:
+    # Named storage (persists across runs)
+    dataset_named = await Dataset.open(name='my-persistent-dataset')
+
+    # Unnamed storage with alias (purged on start)
+    dataset_unnamed = await Dataset.open(alias='temporary-results')
+
+    # Default unnamed storage (both are equivalent and purged on start)
+    dataset_default = await Dataset.open()
+    dataset_default = await Dataset.open(alias='default')
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
{crawlee-1.0.0rc1 → crawlee-1.0.1b1}/docs/guides/request_loaders.mdx
RENAMED
@@ -10,11 +10,13 @@ import TabItem from '@theme/TabItem';
 import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

 import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
-import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/
+import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py';
 import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py';
 import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';
 import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';
 import SitemapExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example_explicit.py';
+import RlBasicPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example_with_persist.py';
+import SitemapPersistExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example_with_persist.py';

 The [`request_loaders`](https://github.com/apify/crawlee-python/tree/master/src/crawlee/request_loaders) sub-package extends the functionality of the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, providing additional tools for managing URLs and requests. If you are new to Crawlee and unfamiliar with the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, consider starting with the [Storages](https://crawlee.dev/python/docs/guides/storages) guide first. Request loaders define how requests are fetched and stored, enabling various use cases such as reading URLs from files, external APIs, or combining multiple sources together.

@@ -102,6 +104,10 @@ RequestManager --|> RequestManagerTandem

 The <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as <ApiLink to="class/RequestList">`RequestList`</ApiLink>, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> API reference.

+:::info NOTE
+To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below.
+:::
+
 ### Request list

 The <ApiLink to="class/RequestList">`RequestList`</ApiLink> can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs.
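The guide text above only states that `RequestList` accepts an asynchronous generator; a brief illustrative sketch (not part of the package diff) follows. The positional-argument form mirrors the tandem example in this guide, but treat the exact signature as an assumption and check the API reference.

```python
import asyncio
from collections.abc import AsyncGenerator

from crawlee.request_loaders import RequestList


async def stream_urls() -> AsyncGenerator[str, None]:
    # Hypothetical source: the URLs could come from a file or an external API.
    for i in range(3):
        yield f'https://crawlee.dev/page/{i}'


async def main() -> None:
    # Pass the generator instead of a fully materialized list of URLs.
    request_list = RequestList(stream_urls())

    while request := await request_list.fetch_next_request():
        print(f'Processing {request.url}')
        await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```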
@@ -112,6 +118,16 @@ Here is a basic example of working with the <ApiLink to="class/RequestList">`Req
 {RlBasicExample}
 </RunnableCodeBlock>

+### Request list with persistence
+
+The <ApiLink to="class/RequestList">`RequestList`</ApiLink> supports state persistence, allowing it to resume from where it left off after interruption. This is particularly useful for long-running crawls or when you need to pause and resume crawling later.
+
+To enable persistence, provide `persist_state_key` and optionally `persist_requests_key` parameters, and disable automatic cleanup by setting `purge_on_start = False` in the configuration. The `persist_state_key` saves the loader's progress, while `persist_requests_key` ensures that the request data doesn't change between runs. For more details on resuming interrupted crawls, see the [Resuming a paused crawl](../examples/resuming-paused-crawl) example.
+
+<RunnableCodeBlock className="language-python" language="python">
+{RlBasicPersistExample}
+</RunnableCodeBlock>
+
 ### Sitemap request loader

 The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is a specialized request loader that reads URLs from XML sitemaps. It's particularly useful when you want to crawl a website systematically by following its sitemap structure. The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.
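The description above also mentions glob patterns, while the bundled example filters with a regular expression. A glob-based variant is sketched below (not part of the package diff), assuming `Glob` is importable from the top-level `crawlee` package and accepted by the `include` parameter:

```python
import asyncio

from crawlee import Glob
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    async with ImpitHttpClient() as http_client:
        sitemap_loader = SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=http_client,
            # Glob filtering instead of a regular expression (assumed supported).
            include=[Glob('https://crawlee.dev/docs/**')],
        )
        while request := await sitemap_loader.fetch_next_request():
            print(f'Processing {request.url}')
            await sitemap_loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```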
@@ -120,6 +136,16 @@ The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is
 {SitemapExample}
 </RunnableCodeBlock>

+### Sitemap request loader with persistence
+
+Similarly, the <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> supports state persistence to resume processing from where it left off. This is especially valuable when processing large sitemaps that may take considerable time to complete.
+
+<RunnableCodeBlock className="language-python" language="python">
+{SitemapPersistExample}
+</RunnableCodeBlock>
+
+When using persistence with `SitemapRequestLoader`, make sure to use the context manager (`async with`) to properly save the state when the work is completed.
+
 ## Request managers

 The <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> extends `RequestLoader` with write capabilities. In addition to reading requests, a request manager can add and reclaim them. This is essential for dynamic crawling projects where new URLs may emerge during the crawl process, or when certain requests fail and need to be retried. For more details, refer to the <ApiLink to="class/RequestManager">`RequestManager`</ApiLink> API reference.
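As a closing illustration of those write capabilities, a minimal sketch (not part of the package diff) using `RequestQueue`, the canonical `RequestManager` implementation; the plain-string shorthand passed to `add_request` is assumed, matching its use elsewhere in the Crawlee docs:

```python
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    # Open the default request queue and write to it directly.
    request_queue = await RequestQueue.open()
    await request_queue.add_request('https://crawlee.dev/')


if __name__ == '__main__':
    asyncio.run(main())
```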