crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +2 -1
- crawlo/__version__.py +1 -1
- crawlo/cli.py +2 -2
- crawlo/commands/check.py +1 -1
- crawlo/commands/help.py +5 -3
- crawlo/commands/list.py +1 -1
- crawlo/commands/run.py +49 -11
- crawlo/commands/stats.py +1 -1
- crawlo/config.py +12 -4
- crawlo/config_validator.py +1 -1
- crawlo/core/engine.py +20 -7
- crawlo/core/processor.py +1 -1
- crawlo/core/scheduler.py +4 -5
- crawlo/crawler.py +51 -10
- crawlo/downloader/__init__.py +7 -3
- crawlo/downloader/aiohttp_downloader.py +18 -18
- crawlo/downloader/cffi_downloader.py +5 -2
- crawlo/downloader/httpx_downloader.py +9 -3
- crawlo/downloader/hybrid_downloader.py +2 -2
- crawlo/downloader/playwright_downloader.py +38 -15
- crawlo/downloader/selenium_downloader.py +16 -2
- crawlo/event.py +42 -8
- crawlo/exceptions.py +157 -24
- crawlo/extension/__init__.py +10 -9
- crawlo/extension/health_check.py +7 -7
- crawlo/extension/log_interval.py +6 -6
- crawlo/extension/log_stats.py +2 -2
- crawlo/extension/logging_extension.py +4 -12
- crawlo/extension/memory_monitor.py +5 -5
- crawlo/extension/performance_profiler.py +5 -5
- crawlo/extension/request_recorder.py +6 -6
- crawlo/factories/base.py +1 -1
- crawlo/factories/crawler.py +61 -60
- crawlo/factories/utils.py +135 -0
- crawlo/filters/__init__.py +19 -2
- crawlo/filters/aioredis_filter.py +133 -49
- crawlo/filters/memory_filter.py +6 -21
- crawlo/framework.py +22 -8
- crawlo/initialization/built_in.py +24 -67
- crawlo/initialization/core.py +65 -19
- crawlo/initialization/phases.py +83 -2
- crawlo/initialization/registry.py +5 -7
- crawlo/initialization/utils.py +49 -0
- crawlo/logging/__init__.py +6 -10
- crawlo/logging/config.py +106 -22
- crawlo/logging/factory.py +12 -8
- crawlo/logging/manager.py +19 -27
- crawlo/middleware/__init__.py +72 -9
- crawlo/middleware/default_header.py +2 -2
- crawlo/middleware/download_delay.py +2 -2
- crawlo/middleware/middleware_manager.py +6 -6
- crawlo/middleware/offsite.py +2 -2
- crawlo/middleware/proxy.py +2 -2
- crawlo/middleware/request_ignore.py +4 -4
- crawlo/middleware/response_code.py +2 -2
- crawlo/middleware/response_filter.py +2 -2
- crawlo/middleware/retry.py +1 -1
- crawlo/mode_manager.py +38 -4
- crawlo/network/request.py +54 -26
- crawlo/network/response.py +69 -135
- crawlo/pipelines/__init__.py +40 -9
- crawlo/pipelines/base_pipeline.py +452 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
- crawlo/pipelines/console_pipeline.py +2 -2
- crawlo/pipelines/csv_pipeline.py +4 -4
- crawlo/pipelines/database_dedup_pipeline.py +4 -5
- crawlo/pipelines/json_pipeline.py +4 -4
- crawlo/pipelines/memory_dedup_pipeline.py +4 -5
- crawlo/pipelines/mongo_pipeline.py +23 -14
- crawlo/pipelines/mysql_pipeline.py +31 -39
- crawlo/pipelines/pipeline_manager.py +8 -8
- crawlo/pipelines/redis_dedup_pipeline.py +13 -14
- crawlo/project.py +1 -1
- crawlo/queue/__init__.py +10 -0
- crawlo/queue/queue_manager.py +79 -13
- crawlo/queue/redis_priority_queue.py +196 -47
- crawlo/settings/default_settings.py +16 -6
- crawlo/spider/__init__.py +6 -5
- crawlo/stats_collector.py +2 -2
- crawlo/task_manager.py +1 -1
- crawlo/templates/crawlo.cfg.tmpl +3 -3
- crawlo/templates/project/__init__.py.tmpl +1 -3
- crawlo/templates/project/items.py.tmpl +2 -6
- crawlo/templates/project/middlewares.py.tmpl +1 -1
- crawlo/templates/project/pipelines.py.tmpl +1 -2
- crawlo/templates/project/settings.py.tmpl +12 -10
- crawlo/templates/project/settings_distributed.py.tmpl +14 -13
- crawlo/templates/project/settings_gentle.py.tmpl +21 -23
- crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
- crawlo/templates/project/settings_minimal.py.tmpl +10 -8
- crawlo/templates/project/settings_simple.py.tmpl +21 -23
- crawlo/templates/run.py.tmpl +1 -1
- crawlo/templates/spider/spider.py.tmpl +4 -12
- crawlo/templates/spiders_init.py.tmpl +3 -8
- crawlo/tools/__init__.py +0 -103
- crawlo/tools/scenario_adapter.py +1 -1
- crawlo/utils/__init__.py +25 -1
- crawlo/utils/batch_processor.py +23 -6
- crawlo/utils/config_manager.py +442 -0
- crawlo/utils/controlled_spider_mixin.py +1 -1
- crawlo/utils/db_helper.py +1 -1
- crawlo/utils/encoding_helper.py +190 -0
- crawlo/utils/error_handler.py +2 -2
- crawlo/utils/large_scale_helper.py +1 -1
- crawlo/utils/leak_detector.py +335 -0
- crawlo/utils/mongo_connection_pool.py +157 -0
- crawlo/utils/mysql_connection_pool.py +197 -0
- crawlo/utils/performance_monitor.py +1 -1
- crawlo/utils/redis_checker.py +91 -0
- crawlo/utils/redis_connection_pool.py +260 -70
- crawlo/utils/redis_key_validator.py +1 -1
- crawlo/utils/request.py +24 -2
- crawlo/utils/request_serializer.py +1 -1
- crawlo/utils/resource_manager.py +337 -0
- crawlo/utils/response_helper.py +113 -0
- crawlo/utils/selector_helper.py +3 -2
- crawlo/utils/singleton.py +70 -0
- crawlo/utils/spider_loader.py +1 -1
- crawlo/utils/text_helper.py +1 -1
- crawlo-1.4.8.dist-info/METADATA +831 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
- tests/advanced_tools_example.py +10 -68
- tests/distributed_dedup_test.py +467 -0
- tests/monitor_redis_dedup.sh +72 -0
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
- tests/simple_cli_test.py +55 -0
- tests/test_cli_arguments.py +119 -0
- tests/test_dedup_fix.py +10 -10
- crawlo/logging/async_handler.py +0 -181
- crawlo/logging/monitor.py +0 -153
- crawlo/logging/sampler.py +0 -167
- crawlo/tools/authenticated_proxy.py +0 -241
- crawlo/tools/data_formatter.py +0 -226
- crawlo/tools/data_validator.py +0 -181
- crawlo/tools/encoding_converter.py +0 -127
- crawlo/tools/network_diagnostic.py +0 -365
- crawlo/tools/request_tools.py +0 -83
- crawlo/tools/retry_mechanism.py +0 -224
- crawlo/utils/env_config.py +0 -143
- crawlo/utils/large_scale_config.py +0 -287
- crawlo/utils/log.py +0 -80
- crawlo/utils/system.py +0 -11
- crawlo/utils/tools.py +0 -5
- crawlo/utils/url.py +0 -40
- crawlo-1.4.6.dist-info/METADATA +0 -329
- tests/env_config_example.py +0 -134
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
- tests/test_authenticated_proxy.py +0 -142
- tests/test_comprehensive.py +0 -147
- tests/test_dynamic_downloaders_proxy.py +0 -125
- tests/test_dynamic_proxy.py +0 -93
- tests/test_dynamic_proxy_config.py +0 -147
- tests/test_dynamic_proxy_real.py +0 -110
- tests/test_env_config.py +0 -122
- tests/test_framework_env_usage.py +0 -104
- tests/test_large_scale_config.py +0 -113
- tests/test_proxy_api.py +0 -265
- tests/test_real_scenario_proxy.py +0 -196
- tests/tools_example.py +0 -261
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
|
@@ -1,158 +1,154 @@
|
|
|
1
|
-
crawlo/__init__.py,sha256=
|
|
2
|
-
crawlo/__version__.py,sha256=
|
|
3
|
-
crawlo/cli.py,sha256=
|
|
4
|
-
crawlo/config.py,sha256=
|
|
5
|
-
crawlo/config_validator.py,sha256=
|
|
6
|
-
crawlo/crawler.py,sha256=
|
|
7
|
-
crawlo/event.py,sha256=
|
|
8
|
-
crawlo/exceptions.py,sha256=
|
|
9
|
-
crawlo/framework.py,sha256=
|
|
1
|
+
crawlo/__init__.py,sha256=weXNnBkBDN3htDD9LyVo2pJWYW8cPDhC2MwzYIA-Y9Y,2205
|
|
2
|
+
crawlo/__version__.py,sha256=_9NYs3PyBz_2XOfnTsADFo2yWVgtsIhY2vyJebGs93E,23
|
|
3
|
+
crawlo/cli.py,sha256=zbhj9RJ06C18Lc6sQjehgM3ViCBRiWh3CxtYbJeOtQc,2428
|
|
4
|
+
crawlo/config.py,sha256=VRAgw00OnSFLtowD41DOm9faXxuQuymzKWDM3eeQnkY,10052
|
|
5
|
+
crawlo/config_validator.py,sha256=YLE5JUZ-hQrWhX8_4Iq8zf7BaYl-rhG7aO-edstVFpE,11514
|
|
6
|
+
crawlo/crawler.py,sha256=13eKFmXBZfJbxkOc1cKhfdENqiFDg86Aw6E0fpjzLYA,29331
|
|
7
|
+
crawlo/event.py,sha256=P3NtfWkgidoxEmJ4xo5uG_vhAUarnrGbi9GjkdH1pSc,1297
|
|
8
|
+
crawlo/exceptions.py,sha256=B1LQsvXyOBl3GkXaUc_jMkcqB4Z_E3DNdBE4z2n3PrM,5281
|
|
9
|
+
crawlo/framework.py,sha256=yJDOdEfK7Rldga6R1jWje1P2tfHhv_-PaGeokpQ35Wk,10037
|
|
10
10
|
crawlo/interfaces.py,sha256=q1vwMSiZLfLpPhFa9Y0hAcjYEKvLkW2fZ2fmoAZ-5TE,653
|
|
11
|
-
crawlo/mode_manager.py,sha256=
|
|
12
|
-
crawlo/project.py,sha256=
|
|
13
|
-
crawlo/stats_collector.py,sha256=
|
|
11
|
+
crawlo/mode_manager.py,sha256=bbv4sdpLXoIjdIuHrJlys-AGHVND1Fr7sJaoV6kQdyc,10026
|
|
12
|
+
crawlo/project.py,sha256=nNnhwIjYxx6R7M25Wr7QDloynYJHJS2cyUrd98Rr_gc,13996
|
|
13
|
+
crawlo/stats_collector.py,sha256=8BlR5SKMTuTpCO_mtTxSYfZ80bCEe8SUJrWbS7UyphE,2793
|
|
14
14
|
crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
|
|
15
|
-
crawlo/task_manager.py,sha256=
|
|
15
|
+
crawlo/task_manager.py,sha256=KuExi62Z_ns5qQk-TCVfMlVBcK6mBC4-RYxGlcOKxm4,5964
|
|
16
16
|
crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
|
|
17
|
-
crawlo/commands/check.py,sha256=
|
|
17
|
+
crawlo/commands/check.py,sha256=Qz7c5k7BeBgD9GmA6rgIS1VP5eq-gCyDKFsSpqPZyx0,23197
|
|
18
18
|
crawlo/commands/genspider.py,sha256=JB4ZuFpKsYwtjx3DSsxugH7e3kqxhDWPG5ZKfvM0isI,6041
|
|
19
|
-
crawlo/commands/help.py,sha256=
|
|
20
|
-
crawlo/commands/list.py,sha256=
|
|
21
|
-
crawlo/commands/run.py,sha256=
|
|
19
|
+
crawlo/commands/help.py,sha256=VzYmVpsTUZIFxJXu0iS0oVDCLmjdlNzVtYkqg4kIOzA,5322
|
|
20
|
+
crawlo/commands/list.py,sha256=MaiaJcrasvzDX-sawzCRk4j1VMJR4IgBT66aczpXGZ0,5875
|
|
21
|
+
crawlo/commands/run.py,sha256=VHtgSMtjvD6srw1un07iu54owWYbJWhG_yfZf_bv0QA,14722
|
|
22
22
|
crawlo/commands/startproject.py,sha256=boZrMyn6TgCi1jt3D3DQfui6hJitjwNO8mqlWKNOBns,17366
|
|
23
|
-
crawlo/commands/stats.py,sha256=
|
|
23
|
+
crawlo/commands/stats.py,sha256=TC_uVi9IYKuvBcGJtduUJPzaWGFN9ZWrNv_YUqg6b6s,6199
|
|
24
24
|
crawlo/commands/utils.py,sha256=YVNEEzlm_qNY3SVvU8h6o2lQMkVgypvoB4ZFrP4gln0,5578
|
|
25
25
|
crawlo/core/__init__.py,sha256=BWkj3AqZwp2Bk73UzUlC_qsqv_MH_HNrzy4DY1hosj4,1330
|
|
26
|
-
crawlo/core/engine.py,sha256=
|
|
27
|
-
crawlo/core/processor.py,sha256=
|
|
28
|
-
crawlo/core/scheduler.py,sha256=
|
|
26
|
+
crawlo/core/engine.py,sha256=qKUYlEB9BhdUngKX85IVQMGPcSDVRnuldLSdmJERSWM,20090
|
|
27
|
+
crawlo/core/processor.py,sha256=GpmhrtB-plp3XwckdrcibdkTn0REE9uhOl5aPcQzLtY,1555
|
|
28
|
+
crawlo/core/scheduler.py,sha256=ldkF1KwGo4LsSldGa-clS8n1UZQBItrn757ZChGyb20,13980
|
|
29
29
|
crawlo/data/__init__.py,sha256=UPqgioMdu3imSUmpLWzVlpvoBnEfaPSAT-crCcWd7iw,121
|
|
30
30
|
crawlo/data/user_agents.py,sha256=zjjFkldQkqtrn45j0WZplaZLannPxZDeAU0JofxQcBc,9891
|
|
31
|
-
crawlo/downloader/__init__.py,sha256=
|
|
32
|
-
crawlo/downloader/aiohttp_downloader.py,sha256
|
|
33
|
-
crawlo/downloader/cffi_downloader.py,sha256=
|
|
34
|
-
crawlo/downloader/httpx_downloader.py,sha256=
|
|
35
|
-
crawlo/downloader/hybrid_downloader.py,sha256=
|
|
36
|
-
crawlo/downloader/playwright_downloader.py,sha256=
|
|
37
|
-
crawlo/downloader/selenium_downloader.py,sha256=
|
|
38
|
-
crawlo/extension/__init__.py,sha256=
|
|
39
|
-
crawlo/extension/health_check.py,sha256=
|
|
40
|
-
crawlo/extension/log_interval.py,sha256=
|
|
41
|
-
crawlo/extension/log_stats.py,sha256
|
|
42
|
-
crawlo/extension/logging_extension.py,sha256=
|
|
43
|
-
crawlo/extension/memory_monitor.py,sha256=
|
|
44
|
-
crawlo/extension/performance_profiler.py,sha256=
|
|
45
|
-
crawlo/extension/request_recorder.py,sha256=
|
|
31
|
+
crawlo/downloader/__init__.py,sha256=wa5AY9XlWFTkZi2VBZ0hThwuPJZYdYjjHMU7nPvfmFw,9078
|
|
32
|
+
crawlo/downloader/aiohttp_downloader.py,sha256=aqmkFGcp0yFo9JdxB1UKWMVNDHGpvdRUZrM_IXZb8H4,9069
|
|
33
|
+
crawlo/downloader/cffi_downloader.py,sha256=410FOS3T6PXS4Vv6SH6I8tsOs4hVXsXny_oR4OP6M7s,10649
|
|
34
|
+
crawlo/downloader/httpx_downloader.py,sha256=BWqq-h63LYgnEWa7v0-Lwzof-lA9kREHWVkZRs7-PbY,12458
|
|
35
|
+
crawlo/downloader/hybrid_downloader.py,sha256=tXL7ceRWFrkh-YmUqVXBStjRqq0UsBjy-5NAltKOfvY,8203
|
|
36
|
+
crawlo/downloader/playwright_downloader.py,sha256=N1k7ZiRKnwhGSZnKbQ3M4SJzXJ5H-JG-nInpHHDhs3o,17622
|
|
37
|
+
crawlo/downloader/selenium_downloader.py,sha256=t5HU5utEmvLItn_Oou8a_ZGpzVZ-m_5QN7VJqh-TSdE,22151
|
|
38
|
+
crawlo/extension/__init__.py,sha256=viNq8fFDmSbvhmJUJX3-kUSikRKgFNDW8q-g8p2nsrg,2951
|
|
39
|
+
crawlo/extension/health_check.py,sha256=GL1JOuvnAPrdmmIY7uQ_YVdt0dIsYPtbof9OY75GLF8,5618
|
|
40
|
+
crawlo/extension/log_interval.py,sha256=eM6nN6-BQcGMtIAae9QSPJVK20ErzGtgriOZeVzq_j0,4462
|
|
41
|
+
crawlo/extension/log_stats.py,sha256=-CaNnBI4Ldaltjun6RyegSCIKuegfKO5xuA9nhsiXBo,2415
|
|
42
|
+
crawlo/extension/logging_extension.py,sha256=KALP8JxidozzYxD39bdkUL35HRTFdFqKYXatVKroNbU,2143
|
|
43
|
+
crawlo/extension/memory_monitor.py,sha256=dKSP6EkDnbGrv_ZSDjrHdNHMp9spDqJ9ZvvWC9dC-pk,4367
|
|
44
|
+
crawlo/extension/performance_profiler.py,sha256=SLzG9TioFO-SLyzd1-qNpsYWjc9sS4ybmza9jWzwdHg,5005
|
|
45
|
+
crawlo/extension/request_recorder.py,sha256=qRAaC9H67zr8Rp57cWqdtc21Re2XsCi49yhGWJk8uXU,4178
|
|
46
46
|
crawlo/factories/__init__.py,sha256=24dH70p05pZerO9-9gaKpTawRGeGvQYw7j5brvq8GUg,714
|
|
47
|
-
crawlo/factories/base.py,sha256=
|
|
48
|
-
crawlo/factories/crawler.py,sha256=
|
|
47
|
+
crawlo/factories/base.py,sha256=a8S5dhT7Pk2ybs34VAMmskIoa4svTcNy5KzF25tIELQ,1822
|
|
48
|
+
crawlo/factories/crawler.py,sha256=6fYbv-9UGLVmbnra-ZcXpDKYy7zKf9KhYw4GtPfxHp4,3170
|
|
49
49
|
crawlo/factories/registry.py,sha256=YU87CdsntOz609M0aQbGcCG9glPinUJxOn-_CaM4f-M,2595
|
|
50
|
-
crawlo/
|
|
51
|
-
crawlo/filters/
|
|
52
|
-
crawlo/filters/
|
|
50
|
+
crawlo/factories/utils.py,sha256=TIUCUjUTjh2lx5tmLdlKp1ai0C8YhF10--hY59YP-GY,4056
|
|
51
|
+
crawlo/filters/__init__.py,sha256=dKoMIZXl2W1qOsTGPW_RXB2o4DvHBFLDjkZnMxVdU5g,4982
|
|
52
|
+
crawlo/filters/aioredis_filter.py,sha256=BOiPalIMJRu_q85A6rfZKjq07m0Hi3KkDOZEIYffhBU,13752
|
|
53
|
+
crawlo/filters/memory_filter.py,sha256=CJX7P2QXy-k7wBycFfYL3_MUqqCWzIYIPpOqHUgrr6g,9227
|
|
53
54
|
crawlo/initialization/__init__.py,sha256=uNRMm9GccMYZi51scpvo-CPx_3ayp3Y81psBHlUoDfw,1132
|
|
54
|
-
crawlo/initialization/built_in.py,sha256=
|
|
55
|
+
crawlo/initialization/built_in.py,sha256=zODKOy6HQ1m3leG8MJg0WBKAmq-NTezL9FKY4mvSh6I,14833
|
|
55
56
|
crawlo/initialization/context.py,sha256=wG9t-M-Qttj7TN6gDumPX5Q5GHaPDUpLTZZDne2r3TE,4863
|
|
56
|
-
crawlo/initialization/core.py,sha256=
|
|
57
|
-
crawlo/initialization/phases.py,sha256=
|
|
58
|
-
crawlo/initialization/registry.py,sha256=
|
|
57
|
+
crawlo/initialization/core.py,sha256=sjjnzlivnuBkxVtOnEQL5YyaawYKXVDgUXyaj7dlfj4,8648
|
|
58
|
+
crawlo/initialization/phases.py,sha256=cqHBiN56a93OWrx1zRi_AsHiVoULG8-Xf9REA1Tr1NM,7171
|
|
59
|
+
crawlo/initialization/registry.py,sha256=u2AIjmsiyH13vD46RFFX5t86tOiDb1dQATOKa8JX5fI,4913
|
|
60
|
+
crawlo/initialization/utils.py,sha256=xuLIy9AUTgV23DB5UpDcRAtIMLRPWcdEUGYd0Exuf7k,1192
|
|
59
61
|
crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
|
|
60
62
|
crawlo/items/base.py,sha256=q0YTJlqUtizsqXwfWlk0ndcINV9dDyUckwMx8_JrkeY,602
|
|
61
63
|
crawlo/items/fields.py,sha256=l-DIwK6CCpdzNvf6ELz7Ckc7YCghZD9UCXA8vhNn2UE,1852
|
|
62
64
|
crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
|
|
63
|
-
crawlo/logging/__init__.py,sha256=
|
|
64
|
-
crawlo/logging/
|
|
65
|
-
crawlo/logging/
|
|
66
|
-
crawlo/logging/
|
|
67
|
-
crawlo/
|
|
68
|
-
crawlo/
|
|
69
|
-
crawlo/
|
|
70
|
-
crawlo/middleware/
|
|
71
|
-
crawlo/middleware/
|
|
72
|
-
crawlo/middleware/
|
|
73
|
-
crawlo/middleware/
|
|
74
|
-
crawlo/middleware/
|
|
75
|
-
crawlo/middleware/
|
|
76
|
-
crawlo/middleware/
|
|
77
|
-
crawlo/middleware/response_code.py,sha256=d5t0hmP8QliuvvtFOqW-ogCBtZxg2eyjsOtlQAEUxM8,4533
|
|
78
|
-
crawlo/middleware/response_filter.py,sha256=tVGr06bfJBR3xAHI2G5c3WimFsGHu8qoJtDcsVuCATU,4384
|
|
79
|
-
crawlo/middleware/retry.py,sha256=Acfo95B9wF8fQTCQIqluZOS2hHdnknQu_FOHvpGKJp0,4248
|
|
65
|
+
crawlo/logging/__init__.py,sha256=D9qhyaHHxf6a8syEeqSu8uiV1fjiu0wH6mrZb544N8s,932
|
|
66
|
+
crawlo/logging/config.py,sha256=Ozouc320Y9_lFoDNqf1huL4_hN07LxEJwx4r5_l9-9g,11299
|
|
67
|
+
crawlo/logging/factory.py,sha256=3WtZcdmoypbPVpWlNHCBb45Fd-WR6y_U9y4SLR_CWKI,7019
|
|
68
|
+
crawlo/logging/manager.py,sha256=fF24ZiEPqCafpFF4e8wtZPg7SE93IcmMnVGcezbqWNM,2721
|
|
69
|
+
crawlo/middleware/__init__.py,sha256=YdPoIaPTom2r7kbTYsaNq841Idn1Fi54bs0MrkI8N-c,2269
|
|
70
|
+
crawlo/middleware/default_header.py,sha256=MAFK29Z5AVDtP5McaxTeU45BYaFm0Cxs9fRyF0J0pKg,5151
|
|
71
|
+
crawlo/middleware/download_delay.py,sha256=OVJ0Mii8bvJgbv-faeHfKp_fs2lob2vvoKw7aAfnYUA,3529
|
|
72
|
+
crawlo/middleware/middleware_manager.py,sha256=bhVkfqE1q-VGuGax3pBgjQmsPfzNLz8-pnXF6Y7fDd4,6626
|
|
73
|
+
crawlo/middleware/offsite.py,sha256=ZfiYasOhKha_uPh72E_N4uL2NEvL4zO2RDJx7jJoSQU,4669
|
|
74
|
+
crawlo/middleware/proxy.py,sha256=gNeafGnYr21is4Lth5OrNLyH3aXxs9shBWMLZkM7APs,9578
|
|
75
|
+
crawlo/middleware/request_ignore.py,sha256=I2HwUQjRG95o7qksf0CfdhaWfypu7y9IhRo3dHKa0dM,2661
|
|
76
|
+
crawlo/middleware/response_code.py,sha256=mi20tfnaBCff4fmNJ4n8txgddNyK14hoYHYVWg0-y7Y,4520
|
|
77
|
+
crawlo/middleware/response_filter.py,sha256=pLoKadA4bANC1Hdw_QJkvsiF2jhepIlUAapVKhk7NDQ,4371
|
|
78
|
+
crawlo/middleware/retry.py,sha256=_xfqzirLsgFdxIIJrfjtxuW3mCnT6Ww1EF_VSh9jzAY,4246
|
|
80
79
|
crawlo/network/__init__.py,sha256=bvEnpEUBZJ79URfNZbsHhsBKna54hM2-x_BV8eotTA4,418
|
|
81
|
-
crawlo/network/request.py,sha256=
|
|
82
|
-
crawlo/network/response.py,sha256
|
|
83
|
-
crawlo/pipelines/__init__.py,sha256=
|
|
84
|
-
crawlo/pipelines/
|
|
85
|
-
crawlo/pipelines/
|
|
86
|
-
crawlo/pipelines/
|
|
87
|
-
crawlo/pipelines/
|
|
88
|
-
crawlo/pipelines/
|
|
89
|
-
crawlo/pipelines/
|
|
90
|
-
crawlo/pipelines/
|
|
91
|
-
crawlo/pipelines/
|
|
92
|
-
crawlo/pipelines/
|
|
93
|
-
crawlo/pipelines/
|
|
94
|
-
crawlo/
|
|
80
|
+
crawlo/network/request.py,sha256=VygsnSHZFdE1XqLdmF-An0_Eq2fC53mXNeCljuKpq7g,15344
|
|
81
|
+
crawlo/network/response.py,sha256=ckWUEYEkXlbWxoOslZSB1Rl4L9TFFouA7qkBaMAEj1c,21450
|
|
82
|
+
crawlo/pipelines/__init__.py,sha256=YJ5XQUJPZWR2yiNb4O7pTNfgrWnmNgZZlcgQ4wo9IOg,1476
|
|
83
|
+
crawlo/pipelines/base_pipeline.py,sha256=8bK9kV9-zfT-NaEi7vWDmMdryYeEWHjpypHs3RYQIi0,14839
|
|
84
|
+
crawlo/pipelines/bloom_dedup_pipeline.py,sha256=o36WsfaGrGu5pJL1ULFy3Ykb-YnBgWRvVyU7cM-inU0,5429
|
|
85
|
+
crawlo/pipelines/console_pipeline.py,sha256=1-rUglds9HRhPUHiWtP927-ryIOMPmgMWgEjzIpuBaY,1274
|
|
86
|
+
crawlo/pipelines/csv_pipeline.py,sha256=MP31vod41PU5DzPWVUl2uCp2GSH7W1NW0MES0U2iFOI,12392
|
|
87
|
+
crawlo/pipelines/database_dedup_pipeline.py,sha256=QHmb0vQC0ZkIEw6cq2F3mSxvRhkvDaP1BtNzGxP39oI,7260
|
|
88
|
+
crawlo/pipelines/json_pipeline.py,sha256=nW4BI6N7OCixcicdbbQquSgLUF3Z4dP3ktBRsLFRmOw,8487
|
|
89
|
+
crawlo/pipelines/memory_dedup_pipeline.py,sha256=6tgnBTr1zTs4uJQcwNthofg9QijNvgqTpqfvteIy0es,3562
|
|
90
|
+
crawlo/pipelines/mongo_pipeline.py,sha256=0iSlUlvjO5AsfuE5zHCKjL0YbhHR5OtZjIjvegNzr7c,6213
|
|
91
|
+
crawlo/pipelines/mysql_pipeline.py,sha256=ZCz1YE6OdjCPzhaCcFN2eBk37bxLXrFDZt-ceS93uks,23235
|
|
92
|
+
crawlo/pipelines/pipeline_manager.py,sha256=c1t-YcIspp7o09NEfaYSBSaUUIg2-0fJhjqvG7_3Jqw,4286
|
|
93
|
+
crawlo/pipelines/redis_dedup_pipeline.py,sha256=j_i9Tl3L7tt4Lkc4tLCIcY3tWMEvvVxDH1TYbbqUWzo,5914
|
|
94
|
+
crawlo/queue/__init__.py,sha256=wY-W_KLmjCbMGYUebmOcTh98JO28pTBH-7ifCAaXBXc,259
|
|
95
95
|
crawlo/queue/pqueue.py,sha256=bbgd3l1VfqYXfz-4VFaiWLmJit1LdB3qHalCtNqyrqI,1210
|
|
96
|
-
crawlo/queue/queue_manager.py,sha256=
|
|
97
|
-
crawlo/queue/redis_priority_queue.py,sha256=
|
|
96
|
+
crawlo/queue/queue_manager.py,sha256=CjaA0C6HMaET2n_osFr2mxO_fzt8Oq9pOqQIPnt63r4,25059
|
|
97
|
+
crawlo/queue/redis_priority_queue.py,sha256=B9QYTczt12kc4QlZSt1_XyvqvefwesOV-rCUPr_KyXo,22471
|
|
98
98
|
crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
|
|
99
|
-
crawlo/settings/default_settings.py,sha256=
|
|
99
|
+
crawlo/settings/default_settings.py,sha256=xTOZAfle2TUVcMkP1m1XAB6MSg-RvCrJp1YvOB8aUEI,13485
|
|
100
100
|
crawlo/settings/setting_manager.py,sha256=yI1tGaludevxKGGZO3Pn4aYofrg2cwYwvMZCFC5PPZw,8595
|
|
101
|
-
crawlo/spider/__init__.py,sha256=
|
|
102
|
-
crawlo/templates/crawlo.cfg.tmpl,sha256=
|
|
103
|
-
crawlo/templates/run.py.tmpl,sha256=
|
|
104
|
-
crawlo/templates/spiders_init.py.tmpl,sha256=
|
|
105
|
-
crawlo/templates/project/__init__.py.tmpl,sha256=
|
|
106
|
-
crawlo/templates/project/items.py.tmpl,sha256=
|
|
107
|
-
crawlo/templates/project/middlewares.py.tmpl,sha256=
|
|
108
|
-
crawlo/templates/project/pipelines.py.tmpl,sha256=
|
|
109
|
-
crawlo/templates/project/settings.py.tmpl,sha256=
|
|
110
|
-
crawlo/templates/project/settings_distributed.py.tmpl,sha256=
|
|
111
|
-
crawlo/templates/project/settings_gentle.py.tmpl,sha256=
|
|
112
|
-
crawlo/templates/project/settings_high_performance.py.tmpl,sha256=
|
|
113
|
-
crawlo/templates/project/settings_minimal.py.tmpl,sha256=
|
|
114
|
-
crawlo/templates/project/settings_simple.py.tmpl,sha256=
|
|
101
|
+
crawlo/spider/__init__.py,sha256=91NbcZGUiKOgPpvu8QfsSE6_A2Gw2XQoaiTPxwP-8k8,21855
|
|
102
|
+
crawlo/templates/crawlo.cfg.tmpl,sha256=DfmftICPPsopFGkmOqTWo55PCLboUk5iyFBtGqD_V1g,212
|
|
103
|
+
crawlo/templates/run.py.tmpl,sha256=jr2T6vARwMEJ3v4WXkLH1X0-wDTnoOlJTW-q2255z9o,499
|
|
104
|
+
crawlo/templates/spiders_init.py.tmpl,sha256=QvPw4DxiIjzyue6JDgfFtmuRKrE-jmjlBbvDK10zQwY,109
|
|
105
|
+
crawlo/templates/project/__init__.py.tmpl,sha256=WI5rG7-4rvwmwJWoGCzAgiw4hH2mL6qCYp5GbgiqxJY,54
|
|
106
|
+
crawlo/templates/project/items.py.tmpl,sha256=DrNj48b1W54DmntTLwU--ow2Fw4wSKw8MAmV1K8Vh2E,236
|
|
107
|
+
crawlo/templates/project/middlewares.py.tmpl,sha256=QQlb_bIhurQx4ZOUge3pwHrNIN0Z7RXNXCwA52Hs0Sw,1096
|
|
108
|
+
crawlo/templates/project/pipelines.py.tmpl,sha256=b_c_xLUzxLx3z2rMS0JG4JagM6_s38dx-VDHV47HaLQ,845
|
|
109
|
+
crawlo/templates/project/settings.py.tmpl,sha256=Wcchnbehzg-BY7XCxnrOz-pUwm6PDThN2mp42a-z3iA,3759
|
|
110
|
+
crawlo/templates/project/settings_distributed.py.tmpl,sha256=CpL3TkXkXEk90rNAnQRCTvu8jKlkhOckCin8245bdQM,5514
|
|
111
|
+
crawlo/templates/project/settings_gentle.py.tmpl,sha256=T6tPA91KtmOU_oyosrt-Z5LZ73jT_HWBu2mT0FrRxjg,5840
|
|
112
|
+
crawlo/templates/project/settings_high_performance.py.tmpl,sha256=kTdFI_FF_gzYxxASHiXEtfv2ptOHSD6X5K5VHMxPYJs,5960
|
|
113
|
+
crawlo/templates/project/settings_minimal.py.tmpl,sha256=CE-bGRFs12EY3psa6o-F3HB78nBF_YeuhMbgYN9efN8,3461
|
|
114
|
+
crawlo/templates/project/settings_simple.py.tmpl,sha256=Z5nKJXfKVBgBpp4B5DeuyopMPLOcorJJ4Bu6UG8pams,5692
|
|
115
115
|
crawlo/templates/project/spiders/__init__.py.tmpl,sha256=llhcIItXpm0TlEeumeLwp4fcYv2NHl8Iru7tLhDhxiE,216
|
|
116
|
-
crawlo/templates/spider/spider.py.tmpl,sha256=
|
|
117
|
-
crawlo/tools/__init__.py,sha256=
|
|
118
|
-
crawlo/tools/authenticated_proxy.py,sha256=ULCK0Cc9F2rGhRqu6kzKBdxzK9v2n1CsatSQ_PMxpAg,7272
|
|
119
|
-
crawlo/tools/data_formatter.py,sha256=iBDHpZBZvn9O7pLkTQilE1TzYJQEc3z3f6HXoVus0f0,7808
|
|
120
|
-
crawlo/tools/data_validator.py,sha256=bLWnkpFdclJuqjtSAgMI5nznN4vAuPwE1YaiFWKWenM,5490
|
|
116
|
+
crawlo/templates/spider/spider.py.tmpl,sha256=MpnTd69yawJ11rqyYRApj3_InQf_gMzNqPfL5bF7UgM,887
|
|
117
|
+
crawlo/tools/__init__.py,sha256=fatgJTaA06jvINxsqn7yJ-QtnRpxc2eDLnDFTmE_ykI,1768
|
|
121
118
|
crawlo/tools/date_tools.py,sha256=QOT3W5MqcEQhVM3cTZYxu1MRfgX-TI4aF1RI9s0QbdE,9195
|
|
122
119
|
crawlo/tools/distributed_coordinator.py,sha256=kkRbRoxz7iXKI3AQElyTptDpYl352ErbSkM3wjSHVwU,12574
|
|
123
|
-
crawlo/tools/
|
|
124
|
-
crawlo/tools/network_diagnostic.py,sha256=X1hSbUthIVbMHCU7ti43Zpu8XTaDJd5Oxr2zAkEuSB0,13013
|
|
125
|
-
crawlo/tools/request_tools.py,sha256=oXrk4yWMACVa65fDQCQgzsg6a94FH4_lS7qNR53FHYU,2420
|
|
126
|
-
crawlo/tools/retry_mechanism.py,sha256=4AQ_HLuYt4hYMI9XHoKFk2GQKEiDJB5pAnsMCfjc6Bk,7777
|
|
127
|
-
crawlo/tools/scenario_adapter.py,sha256=pzysL1B2uQ1ZSEncVHd9Hv2viHNgaxP44YAUcDcppfw,9660
|
|
120
|
+
crawlo/tools/scenario_adapter.py,sha256=gBe1nQiVO6c9Lt7GkA25zyidX_jaXUHWLjKWBtvXz5A,9658
|
|
128
121
|
crawlo/tools/text_cleaner.py,sha256=UrMGcgRnJaufjmDKIDsRYKMA8znCAArHDgouttWPygk,6690
|
|
129
|
-
crawlo/utils/__init__.py,sha256=
|
|
130
|
-
crawlo/utils/batch_processor.py,sha256=
|
|
131
|
-
crawlo/utils/
|
|
132
|
-
crawlo/utils/
|
|
133
|
-
crawlo/utils/
|
|
134
|
-
crawlo/utils/
|
|
122
|
+
crawlo/utils/__init__.py,sha256=FFMqPPGW9oLHDIJoF-ImxHwCLL_CKe5rMmi87CShKMs,1511
|
|
123
|
+
crawlo/utils/batch_processor.py,sha256=yGAS-Gp4ZAUEE1O8DHlSVSG2qEhfa2_nlDiFxIZUfyU,9882
|
|
124
|
+
crawlo/utils/config_manager.py,sha256=Qgrtm1v3rdlm-VMCjJOnllxQAizRu1oU6bGOhX_GEv0,14111
|
|
125
|
+
crawlo/utils/controlled_spider_mixin.py,sha256=U8fmo4aMEghmgoPZ8ZAZFutM-VaMFs33VwFS5rRR1XQ,16934
|
|
126
|
+
crawlo/utils/db_helper.py,sha256=auS7KOBvXolpfO1a6McWHBt1PgcTD6FbkkjNdvdjS7s,8700
|
|
127
|
+
crawlo/utils/encoding_helper.py,sha256=4MjF1Nzllt-kqIqLGp4p415KukouNZbxTOacTEFWi1M,6016
|
|
128
|
+
crawlo/utils/error_handler.py,sha256=Wm4fieywfR4M39v7GR0Tj2WTBcOQ8Mjf6PY5VF8VNTc,16234
|
|
135
129
|
crawlo/utils/fingerprint.py,sha256=3IbctH3zwyBjN_12SH7-vrFt-akA2WSo3iAzHc6u--s,3689
|
|
136
130
|
crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
|
|
137
|
-
crawlo/utils/
|
|
138
|
-
crawlo/utils/
|
|
139
|
-
crawlo/utils/log.py,sha256=KmUWVYq8t6fSGOC88nnYCDxwBUdoPWvaBmpOSHn2oWI,2914
|
|
131
|
+
crawlo/utils/large_scale_helper.py,sha256=pHu4597lVtycvNwJXbw1IWCF6U7L8pMzWwBSwEZl-Fk,12450
|
|
132
|
+
crawlo/utils/leak_detector.py,sha256=7ycQEpVRglYUdueOzY-Kkt1mQK8nk0wbDRVO9uo4jhw,11922
|
|
140
133
|
crawlo/utils/misc.py,sha256=m_TbfMf4Aoe70zmkv7XWyFg8Rz0qOYPXepwB6EcYr7Y,2519
|
|
141
|
-
crawlo/utils/
|
|
134
|
+
crawlo/utils/mongo_connection_pool.py,sha256=CCWsgF82LzQSBBlNJAW2AfWosw13sqOQbGXfKKNoFGk,6046
|
|
135
|
+
crawlo/utils/mysql_connection_pool.py,sha256=xNCGvZV1ytnejPmxISb4uDVqdCeGz4PNXXtpkcBz7yc,7254
|
|
136
|
+
crawlo/utils/performance_monitor.py,sha256=QevfmkIbu0Ox3kd7eiH_IIWiFy1zfNcqOlnBH8OhdLE,9830
|
|
142
137
|
crawlo/utils/queue_helper.py,sha256=gFmkh1jKlIcN1rmo2Jl6vYcLP5ByUWlfHO9eNlZPBLs,4918
|
|
143
|
-
crawlo/utils/
|
|
144
|
-
crawlo/utils/
|
|
145
|
-
crawlo/utils/
|
|
146
|
-
crawlo/utils/
|
|
147
|
-
crawlo/utils/
|
|
148
|
-
crawlo/utils/
|
|
149
|
-
crawlo/utils/
|
|
150
|
-
crawlo/utils/
|
|
151
|
-
crawlo/utils/
|
|
152
|
-
crawlo/utils/
|
|
138
|
+
crawlo/utils/redis_checker.py,sha256=XP05VK8dpFYYt8p3eK2tiNskmSCQXJAshuJ-BqsqV-U,2639
|
|
139
|
+
crawlo/utils/redis_connection_pool.py,sha256=DWqcusN6jfFn9YA7kuhVNsLZr4P0bOIFXUaYqTueewQ,21397
|
|
140
|
+
crawlo/utils/redis_key_validator.py,sha256=v7pat8g30MyqQDaXJbDDH4CS8irmu0ijTGZRfm7nh7g,5807
|
|
141
|
+
crawlo/utils/request.py,sha256=mMDo85uEbjLYJ-Np2VT0GMeXD7L8IYY4eIV98lIspYU,9438
|
|
142
|
+
crawlo/utils/request_serializer.py,sha256=zuH2_AhSZDVDmKUo6NrfG6kt4ZsbBict9pUf7CZvaM8,8929
|
|
143
|
+
crawlo/utils/resource_manager.py,sha256=SoZV7Z980HZaDazchUY8fc8J481VGPrgW18bqFzIZAs,11209
|
|
144
|
+
crawlo/utils/response_helper.py,sha256=YJYnK_NYWEIm7iF9rfgU-xB2WvGfJo28MGYRWao-Ghs,3392
|
|
145
|
+
crawlo/utils/selector_helper.py,sha256=kfqgy3ZV0RuGE_I8sNqQemr1XuJYBmM1Hkk7CD61HzU,4314
|
|
146
|
+
crawlo/utils/singleton.py,sha256=oXtNVB1_yTBv1Hvq8_jU_X3eKMKmQ21Kl0G9SS1vcWc,1892
|
|
147
|
+
crawlo/utils/spider_loader.py,sha256=CVyzuVmMFZ02ur8USna3jZNyMrhFUdPKoatnbDkOnSc,7675
|
|
148
|
+
crawlo/utils/text_helper.py,sha256=gYIrkH4_vFHbKZH9m6d1BVO5dqRqUccJqSWMdgVOb4g,2918
|
|
153
149
|
examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
|
|
154
150
|
tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
|
|
155
|
-
tests/advanced_tools_example.py,sha256=
|
|
151
|
+
tests/advanced_tools_example.py,sha256=W_QLm62QJGDSsjTi1ZrTfzrwfakk21PF_iOQvraBc94,7031
|
|
156
152
|
tests/authenticated_proxy_example.py,sha256=ZgLrU-1GaBhkJK1Wy0X93lHP1GT2sU2_wi3RI1CfrVc,3135
|
|
157
153
|
tests/baidu_performance_test.py,sha256=wxdaI7UwKboMYH_qcaqZLxAStvndH60bvKGzD8F-jaI,3974
|
|
158
154
|
tests/baidu_test.py,sha256=NKYnwDbPJX3tmKtRn7uQ_QWzUXiLTQC-Gdr1cQkJzEo,1874
|
|
@@ -171,11 +167,11 @@ tests/debug_log_levels.py,sha256=CZWG3KGDq-hYJ5TPhoZTyjKFKkkM-AoK3oP1w-JC1sc,216
|
|
|
171
167
|
tests/debug_pipelines.py,sha256=FMb36bH9lQxBLb-nM579hBRK1S16Vxu1t_BC3Dj8O2w,2164
|
|
172
168
|
tests/detailed_log_test.py,sha256=oTCFF_Un7Jq2gV4rpRDFOxlHJSthnQhvEf0CSItfB7I,7501
|
|
173
169
|
tests/direct_selector_helper_test.py,sha256=p7_x3x87JUnpKplmwYO4zN5ympcPJSPdHsviso-LmpI,2862
|
|
170
|
+
tests/distributed_dedup_test.py,sha256=EaRnWH3ADsJN67Kn7T5TYPiQyMvyRtV6OCPk1uUTXQM,16507
|
|
174
171
|
tests/distributed_test.py,sha256=u6cEiymZzCItaTClKTxwVjNmOj9_PZii4_eGNAVMDW8,1825
|
|
175
172
|
tests/distributed_test_debug.py,sha256=pUv6ZKEJ5pK9xOA7lgVk6WW3cBAtnb1bsuZzJ8oGLvY,2181
|
|
176
173
|
tests/dynamic_loading_example.py,sha256=7LdeQZFevrb-U1_dgr4oX3aYo2Da4HvE_0KIf1fw4Ew,18786
|
|
177
174
|
tests/dynamic_loading_test.py,sha256=dzDW7b66HeDsIYsYgvNRihE3V6b6gEbUGQpp-eJbcIM,3413
|
|
178
|
-
tests/env_config_example.py,sha256=_ZRDh_LR23ZKpy9E--y_KM0QIOiZF5vRT98QTn52TY8,4951
|
|
179
175
|
tests/error_handling_example.py,sha256=grTeo1X17rFz4lhgASb0g5yu4NWbmNz5neyuonnNR40,5294
|
|
180
176
|
tests/explain_mysql_update_behavior.py,sha256=uBrJwiYujTJF35oF1kYMRjYU5k5Y3YlqOfOni0oPQtY,2865
|
|
181
177
|
tests/final_comprehensive_test.py,sha256=szTNbtwKfYNmE0kzDPCsE_kvnTG7FNKl2JERakGhKIk,4314
|
|
@@ -185,6 +181,7 @@ tests/fix_log_test.py,sha256=hcRy0j3j0CT0oLN7KNA0VL-_o4M-uE1amR6GziBflfU,4440
|
|
|
185
181
|
tests/framework_performance_test.py,sha256=Qp47VrsCK0ylEhDkFOm7lnD8rVkaJ7u1MopsEhAomrE,6985
|
|
186
182
|
tests/log_buffering_test.py,sha256=0B5UY1yQuxnBU1pEyz3IBYweN__4fOkPXly-kYfOpNU,3226
|
|
187
183
|
tests/log_generation_timing_test.py,sha256=zHb_m2FqlpRCYw-wqFWFn8cbVH8UR3VvXKSM6nNnbgo,4681
|
|
184
|
+
tests/monitor_redis_dedup.sh,sha256=2nFs5zYiguVqL2YSw_XbhGb30a_EDg0wTIV7zOa0pNg,2284
|
|
188
185
|
tests/optimized_performance_test.py,sha256=bA0dN4j7ViyTSSiCJEjlkJ9Y7jspTFKs2xX7UXHE8Gs,7379
|
|
189
186
|
tests/performance_comparison.py,sha256=UevHOM_9z2ILedf_xZ_8F8QiPjb_M8WTfGQrxzKtgco,9266
|
|
190
187
|
tests/queue_blocking_test.py,sha256=hp-6hmTOO64oOAWVtlN8cFJ95GjbK3t9fj-4q_TKowk,3955
|
|
@@ -192,6 +189,7 @@ tests/queue_test.py,sha256=HeBiBXqAgIAbUkLVQ3McS6NdRselA30m3lnuxNBvZbk,2689
|
|
|
192
189
|
tests/redis_key_validation_demo.py,sha256=WD2jvuBwHhLYIb3lVFtvYSSnmXWn1EW4EPCEwFhfi6M,4467
|
|
193
190
|
tests/request_params_example.py,sha256=J50NdsnK1sDrqG-5m3oA-mu1_wHwVwHIfsWxGeQpz7o,4250
|
|
194
191
|
tests/response_improvements_example.py,sha256=t1cbG3nesp82bqog4_ku1GvQzNbhRyWa5EaKTmOPrSk,5402
|
|
192
|
+
tests/simple_cli_test.py,sha256=t-MyZIk65B3l_JT7Ocf2K3d_bQHBhzzkyx3lvpHw6eU,1594
|
|
195
193
|
tests/simple_command_test.py,sha256=8TowzW45ukKTPeaNC5uij3RR7rqPULiBr2PguSSMdP8,3688
|
|
196
194
|
tests/simple_crawlo_test.py,sha256=FYDn5cgAxHN81QSYa_wcJcxJit7aLnIopnkHKKr83dE,4801
|
|
197
195
|
tests/simple_follow_test.py,sha256=3vNT5Eqwza6fxAY9Xl_9xtFGdfrPwm6NnVHdRmJsH8A,1053
|
|
@@ -210,11 +208,10 @@ tests/test_all_commands.py,sha256=VgVa9SzU5Irvn5igHpC2W4E_6ZDWDt7jc-T4UPK_PFE,77
|
|
|
210
208
|
tests/test_all_pipeline_fingerprints.py,sha256=NDrBYr0f9CAhjmSezTS4NUrAdcotrSX3ElJTWqjXXbU,5308
|
|
211
209
|
tests/test_all_redis_key_configs.py,sha256=dWc4Dsr07_vuSpb4hwkMpyy6XO8SI7vglVjGuGvXoa4,5710
|
|
212
210
|
tests/test_asyncmy_usage.py,sha256=gxENdxrcLlDG2m8V-j4ZnSJYFc3x6CvKvgPAhOC13DE,1688
|
|
213
|
-
tests/test_authenticated_proxy.py,sha256=lnvmQwuf0zaZP_E05EzcNFR2VJbwTkLjOmZGNoJKaC4,4339
|
|
214
211
|
tests/test_batch_processor.py,sha256=4_nYlu9R1JkDCFHq0bYc9LUNqsg41r7sQ879hkrhEts,7212
|
|
215
212
|
tests/test_cleaners.py,sha256=HDK8_YU7GUj_3hGU415cxEeUR74mnDSk0yroLlgDI0I,1816
|
|
213
|
+
tests/test_cli_arguments.py,sha256=_wNtGNLnrrw3zVyickLcI9qI6ncjms_8AH0IrCNWk4U,5443
|
|
216
214
|
tests/test_component_factory.py,sha256=V3hO5pJHSDtViLAykXSUqkeH4g-GB4GczwutrTatS2U,5809
|
|
217
|
-
tests/test_comprehensive.py,sha256=dvRJeeVYc1cgXK9Y171hH9Y847zZpWSAFFH-EI3UepQ,5182
|
|
218
215
|
tests/test_config_consistency.py,sha256=RgSxyaypMpysltsGSh1vFMeOShiZZG0rmUKzEhNLpYw,2001
|
|
219
216
|
tests/test_config_merge.py,sha256=ts1j-TIKkFS0EO5q1I4O7f4YUKR5MLTmRSqOpOlv094,5606
|
|
220
217
|
tests/test_config_validator.py,sha256=Z4gBHkI0_fEx-xgiiG4T33F4BAuePuF81obpNTXfseY,6202
|
|
@@ -223,7 +220,7 @@ tests/test_crawler_process_import.py,sha256=iIPqSCpv2VRb_hWTu5euLME4PDFf7NwixeBy
|
|
|
223
220
|
tests/test_crawler_process_spider_modules.py,sha256=uMr4esj6ascVBzt0WrPd3ZOQfKD00O6tJrNhuWOdvV0,1395
|
|
224
221
|
tests/test_crawlo_proxy_integration.py,sha256=JFBI82ILXMwAIJ29C8uhu5r-hH3UhMC50jKr5-jy6Ng,3059
|
|
225
222
|
tests/test_date_tools.py,sha256=pcLDyhLrZ_jh-PhPm4CvLZEgNeH9kLMPKN5zacHwuWM,4053
|
|
226
|
-
tests/test_dedup_fix.py,sha256=
|
|
223
|
+
tests/test_dedup_fix.py,sha256=6gQKDatida54itwPtB1-HwUTKbdwwJA2Yc-HhhHj_wM,8747
|
|
227
224
|
tests/test_dedup_pipeline_consistency.py,sha256=dn5EAZSU5gQOV5EQwreHp76i5aQZ9tEdltSGO7dif5M,5176
|
|
228
225
|
tests/test_default_header_middleware.py,sha256=UDjEPIUCre1M6ndjV_uXLVCfY7WJwyN-1Xn15hzbKMo,13126
|
|
229
226
|
tests/test_distributed.py,sha256=78Pn4HPLIaO8t1IiaSkckBmuEVTcnC8IDw7znf9_Zcw,1790
|
|
@@ -231,28 +228,21 @@ tests/test_double_crawlo_fix.py,sha256=lZwrT5ij6Jbh0EzZswhw05FXwgKaEZsSHekLTrJJa
|
|
|
231
228
|
tests/test_double_crawlo_fix_simple.py,sha256=NDmCEeyvpf_D1tGQMA66iLPPKlAnSZcEg71e7GHYcjg,4768
|
|
232
229
|
tests/test_download_delay_middleware.py,sha256=Idc6KzhL3hY3aDKgn1j_v5-mLIHz7dTnV5c4tJVZh5Q,9107
|
|
233
230
|
tests/test_downloader_proxy_compatibility.py,sha256=NJJ-g_I665lHLsJZd7ONvKubHRxv82FADZR9WYzgyzA,9418
|
|
234
|
-
tests/test_dynamic_downloaders_proxy.py,sha256=t_aWpxOHi4h3_fg2ImtIq7IIJ0r3PTHtnXiopPe2ZlM,4450
|
|
235
|
-
tests/test_dynamic_proxy.py,sha256=zi7Ocbhc9GL1zCs0XhmG2NvBBeIZ2d2hPJVh18lH4Y0,3172
|
|
236
|
-
tests/test_dynamic_proxy_config.py,sha256=C_9CEjCJtrr0SxIXCyLDhSIi88ujF7UAT1F-FAphd0w,5853
|
|
237
|
-
tests/test_dynamic_proxy_real.py,sha256=krWnbFIH26mWNPhOfPMmx3ZxJfOreZxMZFGwVb_8-K8,3511
|
|
238
231
|
tests/test_edge_cases.py,sha256=460JtYR6yuTo8J4wqJScMzDkrrDUE2Q8R425AaUycIQ,11127
|
|
239
232
|
tests/test_encoding_core.py,sha256=k5fZET0R1KInhAlbbHEJv4m9d6NuibOxxfIcR43TS7Y,1681
|
|
240
233
|
tests/test_encoding_detection.py,sha256=Zb1KkF2CR57qa0Hr_Iv8msompGJZT2EIL_2mGp0zX9Q,4245
|
|
241
234
|
tests/test_enhanced_error_handler.py,sha256=Ku_86jv7iDe25v8ZxalcXxJJjIiIvQXWH8ZldbwdVm8,8581
|
|
242
235
|
tests/test_enhanced_error_handler_comprehensive.py,sha256=j_cxyIPGks9A3untKhAdj5HU0hrLbbzOLu0uAtGUlJo,9369
|
|
243
|
-
tests/test_env_config.py,sha256=Qu1sDeADs69dSr1x0QmEe8nJrMHneE_4JClt-N901e8,4867
|
|
244
236
|
tests/test_error_handler_compatibility.py,sha256=xJ43cmCwfBGh-qBwCGiMDPPlfNDLw4ZrmlrHN9IojkU,4241
|
|
245
237
|
tests/test_factories.py,sha256=wKFfr8YBXPs-AQ8YOFgDhINn5uivKqPBZQPUe5GL9Ig,8865
|
|
246
238
|
tests/test_factory_compatibility.py,sha256=zzTXd3ku3iedgxgB1DxTt3zfetiIl6wCjL9yXIUCpic,6260
|
|
247
239
|
tests/test_final_validation.py,sha256=OuZI01O0E68Pao--bD-BFDTRZFPc_Mt4W-OXUzlt6ZA,4966
|
|
248
240
|
tests/test_fingerprint_consistency.py,sha256=68V5u_2hNABI5pNWzXUrA1PJ08Xh9x3-JsMSNNjORMo,4956
|
|
249
241
|
tests/test_fingerprint_simple.py,sha256=qiSba8gF3Zl91QO_ijJO7KstLdjATs30V_GZCNHShig,1626
|
|
250
|
-
tests/test_framework_env_usage.py,sha256=bFb_ptdLeX2obdJWEqEHPWweiWR-wR2BpvEaJMQK7h4,4201
|
|
251
242
|
tests/test_get_component_logger.py,sha256=UKj5uT1F3L3atoJFmpk7QSDO2fZHgw-7Y84vMFbHRkM,2285
|
|
252
243
|
tests/test_hash_performance.py,sha256=4eVPwbu66Oun0LVyTTNd9d2cj2V1xq0YZkRg8Z0TO-Q,3211
|
|
253
244
|
tests/test_integration.py,sha256=lVEzKNAjFzFZHRNZAyJmXxa_5Ogf_qqL4APqs620o58,4839
|
|
254
245
|
tests/test_item_dedup_redis_key.py,sha256=dp_H59exJLaZHh5oMtmMEOWh-DNZwbnwIFYDjOpHgd0,3842
|
|
255
|
-
tests/test_large_scale_config.py,sha256=Ik32ilAOQXsyw2sHR53gDPNNjY0AXybQ9ya2JY-EeqM,4296
|
|
256
246
|
tests/test_large_scale_helper.py,sha256=0L6EKHcKgh7XHvoW4wRSkxmw8GolUwSOCgZ_-ZmCyDo,8371
|
|
257
247
|
tests/test_logging_enhancements.py,sha256=YHcYWC8PG_AP5wZnmOr6H7QuU7m-3xzxEhppM0Jubvg,12731
|
|
258
248
|
tests/test_logging_final.py,sha256=K9vxyODslXza05hElVEcvzbXgzthYKK5CRj4UJTftIw,6376
|
|
@@ -281,7 +271,6 @@ tests/test_pipeline_fingerprint_consistency.py,sha256=LL55oGSDGy0K8LxoyKa6ogNHXh
|
|
|
281
271
|
tests/test_priority_behavior.py,sha256=JQ5uv80cAUKV9Eh3S8j5zxYSSL-dmzhwhuKOINM26zU,9325
|
|
282
272
|
tests/test_priority_consistency.py,sha256=rVX7nku5N_QpB_ffDu3xqREkCWPX5aNNiXy112o9wpA,5756
|
|
283
273
|
tests/test_priority_consistency_fixed.py,sha256=MlYi5PIr5wxunC3Ku4ilnxOatKyRu2qIvhV7pjadkjg,10765
|
|
284
|
-
tests/test_proxy_api.py,sha256=XnmklS-xU4ke_560gV6AIlBsRmG8YLQTGFAZrTUZuhc,11013
|
|
285
274
|
tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
|
|
286
275
|
tests/test_proxy_middleware.py,sha256=MC2Hg88Pdpv6i_gTAy4ocIWOOxQ8bF7hYtszwpOzilE,8716
|
|
287
276
|
tests/test_proxy_middleware_enhanced.py,sha256=N7Ly3koCH2uRYk6pxhEJwWpChKdIucdrj0nKvq_E4bw,6896
|
|
@@ -301,7 +290,6 @@ tests/test_queue_type_redis_config_consistency.py,sha256=1ew7Zp9CxH1DQ0RUmsZMV-n
|
|
|
301
290
|
tests/test_random_headers_default.py,sha256=ulDb3_kRpnTCN1-TO3m6wVM-eMkZS_ezsSbd1ur8Xpg,12772
|
|
302
291
|
tests/test_random_headers_necessity.py,sha256=SSbNQIE347oCQvuG6yaAambFU-3MyQzTV5jN1kArRGY,11741
|
|
303
292
|
tests/test_random_user_agent.py,sha256=6HjU4iUcMk-J6bR2N5FhIkWDfnaFKAPNVyRzxmQQ14k,2302
|
|
304
|
-
tests/test_real_scenario_proxy.py,sha256=L2Mfwt47pvs6dYJDcazeyupoQ_DuvhdulCz6-2GFR9Y,7527
|
|
305
293
|
tests/test_redis_config.py,sha256=51_Fy1PqIhS0MMO2nR4q6oQjBFxfqcUPK_4NNf5s83g,903
|
|
306
294
|
tests/test_redis_connection_pool.py,sha256=pKfXdE3Cm_L_fNqI9zqFmqiidCwR0t7hiM_Fu_V1cNI,9328
|
|
307
295
|
tests/test_redis_key_naming.py,sha256=MTFk656JhiGVTsMctBDhBNOMFcBDZrsQA3UfPZ-Dgj4,6911
|
|
@@ -336,7 +324,6 @@ tests/test_template_redis_key.py,sha256=99-s0_-8MFJbIvGG_X__sH0qkXWTtJv8fdTdlfts
|
|
|
336
324
|
tests/test_tools.py,sha256=z50Bvq_q8FwpyxNkmh00_A3sXkSv2l1Q_EbK02FDYgk,5504
|
|
337
325
|
tests/test_user_agent_randomness.py,sha256=tE8_zh-BjMAQ9CTgScxZze6JarNher6COkdoLU68YfA,5681
|
|
338
326
|
tests/test_user_agents.py,sha256=e4haX-o8Janl-PawGJ9MemZyMqTX33_tBF_WnYSVoUw,3327
|
|
339
|
-
tests/tools_example.py,sha256=Rxu5vVKnj3CZ3mCx-EEotBWPtZs2S7ktyqq-SYeclxU,7999
|
|
340
327
|
tests/untested_features_report.md,sha256=31aUlsw_1OKe0_ijAjeH85kJ7HJ8qzKLJdOHDjWtYdY,4169
|
|
341
328
|
tests/verify_debug.py,sha256=iQ4Efwg9bQTHscr73VYAAZ8rBIe1u6mQfeaEK5YgneY,1564
|
|
342
329
|
tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4045
|
|
@@ -348,14 +335,13 @@ tests/ofweek_scrapy/ofweek_scrapy/items.py,sha256=Y_TwwHPAgOXTuCTdnhRxil7vYPk1_r
|
|
|
348
335
|
tests/ofweek_scrapy/ofweek_scrapy/middlewares.py,sha256=O4jVSXZgxtsRzU9O_O3YdkS7_QLndzv3uYP-Op8g254,3654
|
|
349
336
|
tests/ofweek_scrapy/ofweek_scrapy/pipelines.py,sha256=ZO6WqTqPpTwLvnwO7YL0E35OPp4zSfJ_GhMeshNRSow,379
|
|
350
337
|
tests/ofweek_scrapy/ofweek_scrapy/settings.py,sha256=X3Y6goZluAz0n2bepWAKEhZX0URFfe9_lBRBCPgtLPk,2933
|
|
351
|
-
tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py,sha256=
|
|
352
|
-
tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py,sha256=gcfKze-ipzP7JTDGCL3TgtjwIwfgI7dPL6GmdXVT0fs,6880
|
|
338
|
+
tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py,sha256=a-Jax6MTMZC2HOw3mUBcNu-T44dOUHDsb22Oly4HTnM,165
|
|
353
339
|
tests/scrapy_comparison/ofweek_scrapy.py,sha256=rhVds_WjYum1bLuWWe90HtXE51fZXEqhhPSc822ZasQ,5790
|
|
354
340
|
tests/scrapy_comparison/scrapy_test.py,sha256=-IsGUHPBgEL0TmXjeLZl-TUA01B7Dsc2nRo4JZbFwZA,5599
|
|
355
341
|
tests/test_spiders/__init__.py,sha256=Ws2DhfUA0Xh5Cxr9M46td7B6hyNoLTyAhZ60FnIh6D0,20
|
|
356
342
|
tests/test_spiders/test_spider.py,sha256=kNGEg80HMMFgzVseI1jJjljZEBy3QYKt_3SXGASffFM,168
|
|
357
|
-
crawlo-1.4.
|
|
358
|
-
crawlo-1.4.
|
|
359
|
-
crawlo-1.4.
|
|
360
|
-
crawlo-1.4.
|
|
361
|
-
crawlo-1.4.
|
|
343
|
+
crawlo-1.4.8.dist-info/METADATA,sha256=-3vWrJ0Mpd-DMRD4S0bRZwzUuKF17FyauuSI78eD7fo,24526
|
|
344
|
+
crawlo-1.4.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
345
|
+
crawlo-1.4.8.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
|
|
346
|
+
crawlo-1.4.8.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
|
|
347
|
+
crawlo-1.4.8.dist-info/RECORD,,
|
tests/advanced_tools_example.py
CHANGED
|
@@ -22,11 +22,6 @@ from crawlo.tools import (
|
|
|
22
22
|
handle_captcha,
|
|
23
23
|
detect_rate_limiting,
|
|
24
24
|
|
|
25
|
-
# 带认证代理工具
|
|
26
|
-
AuthenticatedProxy,
|
|
27
|
-
create_proxy_config,
|
|
28
|
-
format_proxy_for_request,
|
|
29
|
-
|
|
30
25
|
# 分布式协调工具
|
|
31
26
|
generate_pagination_tasks,
|
|
32
27
|
distribute_tasks,
|
|
@@ -125,32 +120,6 @@ def demo_anti_crawler_tools():
|
|
|
125
120
|
print()
|
|
126
121
|
|
|
127
122
|
|
|
128
|
-
def demo_authenticated_proxy_tools():
|
|
129
|
-
"""演示带认证代理工具的使用"""
|
|
130
|
-
print("=== 带认证代理工具演示 ===\n")
|
|
131
|
-
|
|
132
|
-
# 不同类型的代理URL
|
|
133
|
-
proxy_urls = [
|
|
134
|
-
"http://user:pass@proxy1.example.com:8080", # 带认证HTTP代理
|
|
135
|
-
"https://username:password@proxy2.example.com:443", # 带认证HTTPS代理
|
|
136
|
-
"http://proxy3.example.com:8080" # 不带认证代理
|
|
137
|
-
]
|
|
138
|
-
|
|
139
|
-
for proxy_url in proxy_urls:
|
|
140
|
-
print(f"处理代理: {proxy_url}")
|
|
141
|
-
|
|
142
|
-
# 创建代理对象
|
|
143
|
-
proxy = AuthenticatedProxy(proxy_url)
|
|
144
|
-
|
|
145
|
-
# 为不同下载器格式化代理配置
|
|
146
|
-
for downloader in ["aiohttp", "httpx", "curl_cffi"]:
|
|
147
|
-
config = create_proxy_config(proxy_url)
|
|
148
|
-
formatted = format_proxy_for_request(config, downloader)
|
|
149
|
-
print(f" {downloader}格式: {formatted}")
|
|
150
|
-
|
|
151
|
-
print()
|
|
152
|
-
|
|
153
|
-
|
|
154
123
|
def demo_distributed_coordinator_tools():
|
|
155
124
|
"""演示分布式协调工具的使用"""
|
|
156
125
|
print("=== 分布式协调工具演示 ===\n")
|
|
@@ -191,8 +160,7 @@ from crawlo.tools import (
|
|
|
191
160
|
validate_email,
|
|
192
161
|
AntiCrawler,
|
|
193
162
|
DistributedCoordinator,
|
|
194
|
-
retry
|
|
195
|
-
AuthenticatedProxy
|
|
163
|
+
retry
|
|
196
164
|
)
|
|
197
165
|
|
|
198
166
|
class AdvancedSpider(Spider):
|
|
@@ -200,40 +168,19 @@ class AdvancedSpider(Spider):
|
|
|
200
168
|
super().__init__()
|
|
201
169
|
self.anti_crawler = AntiCrawler()
|
|
202
170
|
self.coordinator = DistributedCoordinator()
|
|
203
|
-
# 代理列表
|
|
204
|
-
self.proxy_urls = [
|
|
205
|
-
"http://user1:pass1@proxy1.example.com:8080",
|
|
206
|
-
"http://user2:pass2@proxy2.example.com:8080",
|
|
207
|
-
"http://proxy3.example.com:8080" # 不带认证
|
|
208
|
-
]
|
|
209
171
|
|
|
210
172
|
def start_requests(self):
|
|
211
173
|
# 生成分页任务
|
|
212
174
|
base_url = "https://api.example.com/products"
|
|
213
175
|
pagination_tasks = self.coordinator.generate_pagination_tasks(base_url, 1, 100)
|
|
214
176
|
|
|
215
|
-
for
|
|
216
|
-
#
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
# 根据下载器类型设置代理
|
|
223
|
-
downloader_type = self.crawler.settings.get("DOWNLOADER_TYPE", "aiohttp")
|
|
224
|
-
if downloader_type == "aiohttp":
|
|
225
|
-
request.proxy = proxy.clean_url
|
|
226
|
-
auth = proxy.get_auth_credentials()
|
|
227
|
-
if auth:
|
|
228
|
-
request.meta["proxy_auth"] = auth
|
|
229
|
-
elif downloader_type == "httpx":
|
|
230
|
-
request.proxy = proxy.clean_url
|
|
231
|
-
elif downloader_type == "curl_cffi":
|
|
232
|
-
request.proxy = proxy.proxy_dict
|
|
233
|
-
auth_header = proxy.get_auth_header()
|
|
234
|
-
if auth_header:
|
|
235
|
-
request.headers["Proxy-Authorization"] = auth_header
|
|
236
|
-
|
|
177
|
+
for url in pagination_tasks:
|
|
178
|
+
# 直接使用带认证的代理URL(框架原生支持)
|
|
179
|
+
request = Request(
|
|
180
|
+
url,
|
|
181
|
+
callback=self.parse,
|
|
182
|
+
proxy="http://user:pass@proxy.example.com:8080" # 所有下载器都支持
|
|
183
|
+
)
|
|
237
184
|
yield request
|
|
238
185
|
|
|
239
186
|
@retry(max_retries=3)
|
|
@@ -248,19 +195,15 @@ class AdvancedSpider(Spider):
|
|
|
248
195
|
products = response.css('.product-item')
|
|
249
196
|
for product in products:
|
|
250
197
|
name = product.css('.product-name::text').get()
|
|
251
|
-
price_text = product.css('.price::text').get()
|
|
252
198
|
email = product.css('.contact-email::text').get()
|
|
253
199
|
|
|
254
200
|
# 数据清洗和验证
|
|
255
201
|
clean_name = clean_text(name) if name else None
|
|
256
|
-
clean_price = clean_text(price_text) if price_text else None
|
|
257
202
|
is_valid_email = validate_email(email) if email else False
|
|
258
203
|
|
|
259
204
|
# 检查数据是否重复
|
|
260
|
-
if not await self.coordinator.is_duplicate({"name": clean_name
|
|
261
|
-
|
|
262
|
-
await self.coordinator.add_to_dedup({"name": clean_name, "price": clean_price})
|
|
263
|
-
|
|
205
|
+
if not await self.coordinator.is_duplicate({"name": clean_name}):
|
|
206
|
+
await self.coordinator.add_to_dedup({"name": clean_name})
|
|
264
207
|
# 处理产品数据...
|
|
265
208
|
pass
|
|
266
209
|
""")
|
|
@@ -271,6 +214,5 @@ if __name__ == '__main__':
|
|
|
271
214
|
demo_data_processing_tools()
|
|
272
215
|
demo_retry_mechanism()
|
|
273
216
|
demo_anti_crawler_tools()
|
|
274
|
-
demo_authenticated_proxy_tools()
|
|
275
217
|
demo_distributed_coordinator_tools()
|
|
276
218
|
demo_in_spider()
|