crawlo 1.4.6__py3-none-any.whl → 1.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (162) hide show
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +2 -2
  4. crawlo/commands/check.py +1 -1
  5. crawlo/commands/help.py +5 -3
  6. crawlo/commands/list.py +1 -1
  7. crawlo/commands/run.py +49 -11
  8. crawlo/commands/stats.py +1 -1
  9. crawlo/config.py +12 -4
  10. crawlo/config_validator.py +1 -1
  11. crawlo/core/engine.py +20 -7
  12. crawlo/core/processor.py +1 -1
  13. crawlo/core/scheduler.py +4 -5
  14. crawlo/crawler.py +51 -10
  15. crawlo/downloader/__init__.py +7 -3
  16. crawlo/downloader/aiohttp_downloader.py +18 -18
  17. crawlo/downloader/cffi_downloader.py +5 -2
  18. crawlo/downloader/httpx_downloader.py +9 -3
  19. crawlo/downloader/hybrid_downloader.py +2 -2
  20. crawlo/downloader/playwright_downloader.py +38 -15
  21. crawlo/downloader/selenium_downloader.py +16 -2
  22. crawlo/event.py +42 -8
  23. crawlo/exceptions.py +157 -24
  24. crawlo/extension/__init__.py +10 -9
  25. crawlo/extension/health_check.py +7 -7
  26. crawlo/extension/log_interval.py +6 -6
  27. crawlo/extension/log_stats.py +2 -2
  28. crawlo/extension/logging_extension.py +4 -12
  29. crawlo/extension/memory_monitor.py +5 -5
  30. crawlo/extension/performance_profiler.py +5 -5
  31. crawlo/extension/request_recorder.py +6 -6
  32. crawlo/factories/base.py +1 -1
  33. crawlo/factories/crawler.py +61 -60
  34. crawlo/factories/utils.py +135 -0
  35. crawlo/filters/__init__.py +19 -2
  36. crawlo/filters/aioredis_filter.py +133 -49
  37. crawlo/filters/memory_filter.py +6 -21
  38. crawlo/framework.py +22 -8
  39. crawlo/initialization/built_in.py +24 -67
  40. crawlo/initialization/core.py +65 -19
  41. crawlo/initialization/phases.py +83 -2
  42. crawlo/initialization/registry.py +5 -7
  43. crawlo/initialization/utils.py +49 -0
  44. crawlo/logging/__init__.py +6 -10
  45. crawlo/logging/config.py +106 -22
  46. crawlo/logging/factory.py +12 -8
  47. crawlo/logging/manager.py +19 -27
  48. crawlo/middleware/__init__.py +72 -9
  49. crawlo/middleware/default_header.py +2 -2
  50. crawlo/middleware/download_delay.py +2 -2
  51. crawlo/middleware/middleware_manager.py +6 -6
  52. crawlo/middleware/offsite.py +2 -2
  53. crawlo/middleware/proxy.py +2 -2
  54. crawlo/middleware/request_ignore.py +4 -4
  55. crawlo/middleware/response_code.py +2 -2
  56. crawlo/middleware/response_filter.py +2 -2
  57. crawlo/middleware/retry.py +1 -1
  58. crawlo/mode_manager.py +38 -4
  59. crawlo/network/request.py +54 -26
  60. crawlo/network/response.py +69 -135
  61. crawlo/pipelines/__init__.py +40 -9
  62. crawlo/pipelines/base_pipeline.py +452 -0
  63. crawlo/pipelines/bloom_dedup_pipeline.py +4 -5
  64. crawlo/pipelines/console_pipeline.py +2 -2
  65. crawlo/pipelines/csv_pipeline.py +4 -4
  66. crawlo/pipelines/database_dedup_pipeline.py +4 -5
  67. crawlo/pipelines/json_pipeline.py +4 -4
  68. crawlo/pipelines/memory_dedup_pipeline.py +4 -5
  69. crawlo/pipelines/mongo_pipeline.py +23 -14
  70. crawlo/pipelines/mysql_pipeline.py +31 -39
  71. crawlo/pipelines/pipeline_manager.py +8 -8
  72. crawlo/pipelines/redis_dedup_pipeline.py +13 -14
  73. crawlo/project.py +1 -1
  74. crawlo/queue/__init__.py +10 -0
  75. crawlo/queue/queue_manager.py +79 -13
  76. crawlo/queue/redis_priority_queue.py +196 -47
  77. crawlo/settings/default_settings.py +16 -6
  78. crawlo/spider/__init__.py +6 -5
  79. crawlo/stats_collector.py +2 -2
  80. crawlo/task_manager.py +1 -1
  81. crawlo/templates/crawlo.cfg.tmpl +3 -3
  82. crawlo/templates/project/__init__.py.tmpl +1 -3
  83. crawlo/templates/project/items.py.tmpl +2 -6
  84. crawlo/templates/project/middlewares.py.tmpl +1 -1
  85. crawlo/templates/project/pipelines.py.tmpl +1 -2
  86. crawlo/templates/project/settings.py.tmpl +12 -10
  87. crawlo/templates/project/settings_distributed.py.tmpl +14 -13
  88. crawlo/templates/project/settings_gentle.py.tmpl +21 -23
  89. crawlo/templates/project/settings_high_performance.py.tmpl +21 -23
  90. crawlo/templates/project/settings_minimal.py.tmpl +10 -8
  91. crawlo/templates/project/settings_simple.py.tmpl +21 -23
  92. crawlo/templates/run.py.tmpl +1 -1
  93. crawlo/templates/spider/spider.py.tmpl +4 -12
  94. crawlo/templates/spiders_init.py.tmpl +3 -8
  95. crawlo/tools/__init__.py +0 -103
  96. crawlo/tools/scenario_adapter.py +1 -1
  97. crawlo/utils/__init__.py +25 -1
  98. crawlo/utils/batch_processor.py +23 -6
  99. crawlo/utils/config_manager.py +442 -0
  100. crawlo/utils/controlled_spider_mixin.py +1 -1
  101. crawlo/utils/db_helper.py +1 -1
  102. crawlo/utils/encoding_helper.py +190 -0
  103. crawlo/utils/error_handler.py +2 -2
  104. crawlo/utils/large_scale_helper.py +1 -1
  105. crawlo/utils/leak_detector.py +335 -0
  106. crawlo/utils/mongo_connection_pool.py +157 -0
  107. crawlo/utils/mysql_connection_pool.py +197 -0
  108. crawlo/utils/performance_monitor.py +1 -1
  109. crawlo/utils/redis_checker.py +91 -0
  110. crawlo/utils/redis_connection_pool.py +260 -70
  111. crawlo/utils/redis_key_validator.py +1 -1
  112. crawlo/utils/request.py +24 -2
  113. crawlo/utils/request_serializer.py +1 -1
  114. crawlo/utils/resource_manager.py +337 -0
  115. crawlo/utils/response_helper.py +113 -0
  116. crawlo/utils/selector_helper.py +3 -2
  117. crawlo/utils/singleton.py +70 -0
  118. crawlo/utils/spider_loader.py +1 -1
  119. crawlo/utils/text_helper.py +1 -1
  120. crawlo-1.4.8.dist-info/METADATA +831 -0
  121. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/RECORD +131 -145
  122. tests/advanced_tools_example.py +10 -68
  123. tests/distributed_dedup_test.py +467 -0
  124. tests/monitor_redis_dedup.sh +72 -0
  125. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  126. tests/simple_cli_test.py +55 -0
  127. tests/test_cli_arguments.py +119 -0
  128. tests/test_dedup_fix.py +10 -10
  129. crawlo/logging/async_handler.py +0 -181
  130. crawlo/logging/monitor.py +0 -153
  131. crawlo/logging/sampler.py +0 -167
  132. crawlo/tools/authenticated_proxy.py +0 -241
  133. crawlo/tools/data_formatter.py +0 -226
  134. crawlo/tools/data_validator.py +0 -181
  135. crawlo/tools/encoding_converter.py +0 -127
  136. crawlo/tools/network_diagnostic.py +0 -365
  137. crawlo/tools/request_tools.py +0 -83
  138. crawlo/tools/retry_mechanism.py +0 -224
  139. crawlo/utils/env_config.py +0 -143
  140. crawlo/utils/large_scale_config.py +0 -287
  141. crawlo/utils/log.py +0 -80
  142. crawlo/utils/system.py +0 -11
  143. crawlo/utils/tools.py +0 -5
  144. crawlo/utils/url.py +0 -40
  145. crawlo-1.4.6.dist-info/METADATA +0 -329
  146. tests/env_config_example.py +0 -134
  147. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  148. tests/test_authenticated_proxy.py +0 -142
  149. tests/test_comprehensive.py +0 -147
  150. tests/test_dynamic_downloaders_proxy.py +0 -125
  151. tests/test_dynamic_proxy.py +0 -93
  152. tests/test_dynamic_proxy_config.py +0 -147
  153. tests/test_dynamic_proxy_real.py +0 -110
  154. tests/test_env_config.py +0 -122
  155. tests/test_framework_env_usage.py +0 -104
  156. tests/test_large_scale_config.py +0 -113
  157. tests/test_proxy_api.py +0 -265
  158. tests/test_real_scenario_proxy.py +0 -196
  159. tests/tools_example.py +0 -261
  160. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  161. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  162. {crawlo-1.4.6.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -1,158 +1,154 @@
1
- crawlo/__init__.py,sha256=n5vFwi0iuYrpAIyoNJZzWHV1gvF-vh-Yze3jiuwEXqM,2180
2
- crawlo/__version__.py,sha256=C1PbImXkZPhAW7rUcTV61OKrbIa2DpoQJ2Kmga3lWwM,23
3
- crawlo/cli.py,sha256=AQnAB5NMI-Ic1VPw_Jjng8L4AI4-wMozOwzE6CfXkZU,2402
4
- crawlo/config.py,sha256=EQIT7WpkXAlr2ocd5SYJYOKTSWUlQx2AkTHX7ErEWxw,9798
5
- crawlo/config_validator.py,sha256=oY4-2bwXUlwHAnGgkI-EznviDfML_dcxbWSGXNSxC2k,11516
6
- crawlo/crawler.py,sha256=6f9eDeUEZVfnUywaZ6CnL5R3bHO4sG82z-Syl3zZKvE,27360
7
- crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
8
- crawlo/exceptions.py,sha256=YVIDnC1bKSMv3fXH_6tinWMuD9HmKHIaUfO4_fkX5sY,1247
9
- crawlo/framework.py,sha256=9gP6VN4MHqutGXaxnwpNMSULfVYbNp906UdZiJGywlQ,9458
1
+ crawlo/__init__.py,sha256=weXNnBkBDN3htDD9LyVo2pJWYW8cPDhC2MwzYIA-Y9Y,2205
2
+ crawlo/__version__.py,sha256=_9NYs3PyBz_2XOfnTsADFo2yWVgtsIhY2vyJebGs93E,23
3
+ crawlo/cli.py,sha256=zbhj9RJ06C18Lc6sQjehgM3ViCBRiWh3CxtYbJeOtQc,2428
4
+ crawlo/config.py,sha256=VRAgw00OnSFLtowD41DOm9faXxuQuymzKWDM3eeQnkY,10052
5
+ crawlo/config_validator.py,sha256=YLE5JUZ-hQrWhX8_4Iq8zf7BaYl-rhG7aO-edstVFpE,11514
6
+ crawlo/crawler.py,sha256=13eKFmXBZfJbxkOc1cKhfdENqiFDg86Aw6E0fpjzLYA,29331
7
+ crawlo/event.py,sha256=P3NtfWkgidoxEmJ4xo5uG_vhAUarnrGbi9GjkdH1pSc,1297
8
+ crawlo/exceptions.py,sha256=B1LQsvXyOBl3GkXaUc_jMkcqB4Z_E3DNdBE4z2n3PrM,5281
9
+ crawlo/framework.py,sha256=yJDOdEfK7Rldga6R1jWje1P2tfHhv_-PaGeokpQ35Wk,10037
10
10
  crawlo/interfaces.py,sha256=q1vwMSiZLfLpPhFa9Y0hAcjYEKvLkW2fZ2fmoAZ-5TE,653
11
- crawlo/mode_manager.py,sha256=e8QmwsnndFx_hGME_7w-hazKo0GOYjUr-7FBf7dWxgc,8903
12
- crawlo/project.py,sha256=9wnlHd-rYAC3TT1Fc1ftyUBx7mbDT6TQCqoaIP6N3iA,13998
13
- crawlo/stats_collector.py,sha256=mzNHu628a31PwqpkBXN90PhD-xhMSunNNxAm-ney5JU,2803
11
+ crawlo/mode_manager.py,sha256=bbv4sdpLXoIjdIuHrJlys-AGHVND1Fr7sJaoV6kQdyc,10026
12
+ crawlo/project.py,sha256=nNnhwIjYxx6R7M25Wr7QDloynYJHJS2cyUrd98Rr_gc,13996
13
+ crawlo/stats_collector.py,sha256=8BlR5SKMTuTpCO_mtTxSYfZ80bCEe8SUJrWbS7UyphE,2793
14
14
  crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
15
- crawlo/task_manager.py,sha256=Ic6PFUqZOhLXuZ_UEk_8Neb9FmqYv8I2RzV3vLzFNSU,5966
15
+ crawlo/task_manager.py,sha256=KuExi62Z_ns5qQk-TCVfMlVBcK6mBC4-RYxGlcOKxm4,5964
16
16
  crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
17
- crawlo/commands/check.py,sha256=TKDhI_sj7kErgiJpt2vCZ9QL-g6yWjrrPWKbgh8pgEU,23199
17
+ crawlo/commands/check.py,sha256=Qz7c5k7BeBgD9GmA6rgIS1VP5eq-gCyDKFsSpqPZyx0,23197
18
18
  crawlo/commands/genspider.py,sha256=JB4ZuFpKsYwtjx3DSsxugH7e3kqxhDWPG5ZKfvM0isI,6041
19
- crawlo/commands/help.py,sha256=8xPC0iNCg1rRBoK2bb6noAEANc1JwrdM35eF-j6yeZM,5111
20
- crawlo/commands/list.py,sha256=trzcd3kG6DhkOqYZADcl3yR7M8iJBgRw5fE-g9e0gVM,5877
21
- crawlo/commands/run.py,sha256=EjpIilgCTkXGVSV4rEISbJubdhqrok9nNe5-xDfDK5E,13169
19
+ crawlo/commands/help.py,sha256=VzYmVpsTUZIFxJXu0iS0oVDCLmjdlNzVtYkqg4kIOzA,5322
20
+ crawlo/commands/list.py,sha256=MaiaJcrasvzDX-sawzCRk4j1VMJR4IgBT66aczpXGZ0,5875
21
+ crawlo/commands/run.py,sha256=VHtgSMtjvD6srw1un07iu54owWYbJWhG_yfZf_bv0QA,14722
22
22
  crawlo/commands/startproject.py,sha256=boZrMyn6TgCi1jt3D3DQfui6hJitjwNO8mqlWKNOBns,17366
23
- crawlo/commands/stats.py,sha256=vlGJLyiXZtY0ASdzCK59JNereSsAel4W9JCGaOzCr-8,6201
23
+ crawlo/commands/stats.py,sha256=TC_uVi9IYKuvBcGJtduUJPzaWGFN9ZWrNv_YUqg6b6s,6199
24
24
  crawlo/commands/utils.py,sha256=YVNEEzlm_qNY3SVvU8h6o2lQMkVgypvoB4ZFrP4gln0,5578
25
25
  crawlo/core/__init__.py,sha256=BWkj3AqZwp2Bk73UzUlC_qsqv_MH_HNrzy4DY1hosj4,1330
26
- crawlo/core/engine.py,sha256=znJ0VDFBImYi6KkTD8GHNo-V9BDnPSv9iYfTYLPsVSc,19379
27
- crawlo/core/processor.py,sha256=hR5MrbeZvDUx0ShKntr4qwkeVZzjlPJ8EAKgIFkNVts,1555
28
- crawlo/core/scheduler.py,sha256=G9xtrvE1wsTSOTOFUKDEphJvy6Xk5icuCGXTScYy7nQ,14084
26
+ crawlo/core/engine.py,sha256=qKUYlEB9BhdUngKX85IVQMGPcSDVRnuldLSdmJERSWM,20090
27
+ crawlo/core/processor.py,sha256=GpmhrtB-plp3XwckdrcibdkTn0REE9uhOl5aPcQzLtY,1555
28
+ crawlo/core/scheduler.py,sha256=ldkF1KwGo4LsSldGa-clS8n1UZQBItrn757ZChGyb20,13980
29
29
  crawlo/data/__init__.py,sha256=UPqgioMdu3imSUmpLWzVlpvoBnEfaPSAT-crCcWd7iw,121
30
30
  crawlo/data/user_agents.py,sha256=zjjFkldQkqtrn45j0WZplaZLannPxZDeAU0JofxQcBc,9891
31
- crawlo/downloader/__init__.py,sha256=P5pl-BGYCkdKWgoIewcYPz7ocVLixVfYuCDFmYyuqIw,8966
32
- crawlo/downloader/aiohttp_downloader.py,sha256=-dIFucMOQhiiEmtgEpG2Lqh1vF-PvDddbIrZ8Hge0Ig,9556
33
- crawlo/downloader/cffi_downloader.py,sha256=aKmrooictEFNfsmM3t4dpkGEALI85E7eLOAxm4LPQAU,10585
34
- crawlo/downloader/httpx_downloader.py,sha256=MpgDeIdGqNsiSKLOEDBnr5Z0eUbhHnqVEmAuoIfJmFU,12296
35
- crawlo/downloader/hybrid_downloader.py,sha256=dNnFeegRnyLaOxTWI6XrWKqqVPx80AZBZNgmrcKRVBM,8240
36
- crawlo/downloader/playwright_downloader.py,sha256=L-TVzG7cYfuBlqW0XSZuz5C_r9fpJrmYNcoQ-cDEna4,16663
37
- crawlo/downloader/selenium_downloader.py,sha256=P8GuhEw6OYVeN3oeksuBLpUJCELXiu0mAR23X6IIOAA,21508
38
- crawlo/extension/__init__.py,sha256=wwaTTWYUzbg5b84sQn2JvBlyuhVGkw-REkhVlR2vymA,2980
39
- crawlo/extension/health_check.py,sha256=stDpyP4gOzAdbBlPbSf0rge0QounAhF8CtrGq5fa_7c,5657
40
- crawlo/extension/log_interval.py,sha256=N25aNjFkjh9br6g3ViFqRrz06C2geAdfGas-OT2oZh8,4497
41
- crawlo/extension/log_stats.py,sha256=CWjMb_V1o8j8uwGFvh9SZ7EYLl_OYzmuIsOT5V-_BE4,2452
42
- crawlo/extension/logging_extension.py,sha256=WnHVoC4aLHYLapAN0ylt3k5aanP_T1GOyJrAVy-6ePE,2415
43
- crawlo/extension/memory_monitor.py,sha256=fClPchpCkVjcIiU0AJHCKDd7HEiz5B4KqNqKTRZ2hcU,4394
44
- crawlo/extension/performance_profiler.py,sha256=BjWD3LOb4VwjQJQvQtWNg7GluEwFquI1CztNfgMzy3c,5032
45
- crawlo/extension/request_recorder.py,sha256=KA_RmcfscDxP5wPdolO76yKfRj-1jmHhG3jkVGO1pbc,4181
31
+ crawlo/downloader/__init__.py,sha256=wa5AY9XlWFTkZi2VBZ0hThwuPJZYdYjjHMU7nPvfmFw,9078
32
+ crawlo/downloader/aiohttp_downloader.py,sha256=aqmkFGcp0yFo9JdxB1UKWMVNDHGpvdRUZrM_IXZb8H4,9069
33
+ crawlo/downloader/cffi_downloader.py,sha256=410FOS3T6PXS4Vv6SH6I8tsOs4hVXsXny_oR4OP6M7s,10649
34
+ crawlo/downloader/httpx_downloader.py,sha256=BWqq-h63LYgnEWa7v0-Lwzof-lA9kREHWVkZRs7-PbY,12458
35
+ crawlo/downloader/hybrid_downloader.py,sha256=tXL7ceRWFrkh-YmUqVXBStjRqq0UsBjy-5NAltKOfvY,8203
36
+ crawlo/downloader/playwright_downloader.py,sha256=N1k7ZiRKnwhGSZnKbQ3M4SJzXJ5H-JG-nInpHHDhs3o,17622
37
+ crawlo/downloader/selenium_downloader.py,sha256=t5HU5utEmvLItn_Oou8a_ZGpzVZ-m_5QN7VJqh-TSdE,22151
38
+ crawlo/extension/__init__.py,sha256=viNq8fFDmSbvhmJUJX3-kUSikRKgFNDW8q-g8p2nsrg,2951
39
+ crawlo/extension/health_check.py,sha256=GL1JOuvnAPrdmmIY7uQ_YVdt0dIsYPtbof9OY75GLF8,5618
40
+ crawlo/extension/log_interval.py,sha256=eM6nN6-BQcGMtIAae9QSPJVK20ErzGtgriOZeVzq_j0,4462
41
+ crawlo/extension/log_stats.py,sha256=-CaNnBI4Ldaltjun6RyegSCIKuegfKO5xuA9nhsiXBo,2415
42
+ crawlo/extension/logging_extension.py,sha256=KALP8JxidozzYxD39bdkUL35HRTFdFqKYXatVKroNbU,2143
43
+ crawlo/extension/memory_monitor.py,sha256=dKSP6EkDnbGrv_ZSDjrHdNHMp9spDqJ9ZvvWC9dC-pk,4367
44
+ crawlo/extension/performance_profiler.py,sha256=SLzG9TioFO-SLyzd1-qNpsYWjc9sS4ybmza9jWzwdHg,5005
45
+ crawlo/extension/request_recorder.py,sha256=qRAaC9H67zr8Rp57cWqdtc21Re2XsCi49yhGWJk8uXU,4178
46
46
  crawlo/factories/__init__.py,sha256=24dH70p05pZerO9-9gaKpTawRGeGvQYw7j5brvq8GUg,714
47
- crawlo/factories/base.py,sha256=loB_vyc0CsQK0BgwRoSOFS8gPcmv-b9irtjC9UaBGA4,1832
48
- crawlo/factories/crawler.py,sha256=e9zl4qytByzsYbz66klY3cZTvQei0-9GjdFK4XCyXcg,3198
47
+ crawlo/factories/base.py,sha256=a8S5dhT7Pk2ybs34VAMmskIoa4svTcNy5KzF25tIELQ,1822
48
+ crawlo/factories/crawler.py,sha256=6fYbv-9UGLVmbnra-ZcXpDKYy7zKf9KhYw4GtPfxHp4,3170
49
49
  crawlo/factories/registry.py,sha256=YU87CdsntOz609M0aQbGcCG9glPinUJxOn-_CaM4f-M,2595
50
- crawlo/filters/__init__.py,sha256=noSe07tp2Ip_zXwAbS021BojrqNRaObDO-2YV6DOQfc,4381
51
- crawlo/filters/aioredis_filter.py,sha256=WglGW-XLjsy8r_NDrNsXk_nzwaIq081MBnooHqCCQZA,9841
52
- crawlo/filters/memory_filter.py,sha256=gIPXCw650v81XRiz0MhWXH-zcn24ERzDTzBQZRoy1YU,9890
50
+ crawlo/factories/utils.py,sha256=TIUCUjUTjh2lx5tmLdlKp1ai0C8YhF10--hY59YP-GY,4056
51
+ crawlo/filters/__init__.py,sha256=dKoMIZXl2W1qOsTGPW_RXB2o4DvHBFLDjkZnMxVdU5g,4982
52
+ crawlo/filters/aioredis_filter.py,sha256=BOiPalIMJRu_q85A6rfZKjq07m0Hi3KkDOZEIYffhBU,13752
53
+ crawlo/filters/memory_filter.py,sha256=CJX7P2QXy-k7wBycFfYL3_MUqqCWzIYIPpOqHUgrr6g,9227
53
54
  crawlo/initialization/__init__.py,sha256=uNRMm9GccMYZi51scpvo-CPx_3ayp3Y81psBHlUoDfw,1132
54
- crawlo/initialization/built_in.py,sha256=DlZf4k9FlU52tnwlFtKqWHqlFZpo-VHB0qP61rVqJzo,16259
55
+ crawlo/initialization/built_in.py,sha256=zODKOy6HQ1m3leG8MJg0WBKAmq-NTezL9FKY4mvSh6I,14833
55
56
  crawlo/initialization/context.py,sha256=wG9t-M-Qttj7TN6gDumPX5Q5GHaPDUpLTZZDne2r3TE,4863
56
- crawlo/initialization/core.py,sha256=GWc9QNSp2JmHlCAhgq1aqGDXHcO6QlxFAVfePKC1xeo,6872
57
- crawlo/initialization/phases.py,sha256=iWhGITh9eudfSmzf2G3DLPAIJkCDrv9TVBtnAoS1_3c,4176
58
- crawlo/initialization/registry.py,sha256=kKVegqWxtPCaZ1mTyVHN4yFecAGDOPFJfebkP-SoobE,4919
57
+ crawlo/initialization/core.py,sha256=sjjnzlivnuBkxVtOnEQL5YyaawYKXVDgUXyaj7dlfj4,8648
58
+ crawlo/initialization/phases.py,sha256=cqHBiN56a93OWrx1zRi_AsHiVoULG8-Xf9REA1Tr1NM,7171
59
+ crawlo/initialization/registry.py,sha256=u2AIjmsiyH13vD46RFFX5t86tOiDb1dQATOKa8JX5fI,4913
60
+ crawlo/initialization/utils.py,sha256=xuLIy9AUTgV23DB5UpDcRAtIMLRPWcdEUGYd0Exuf7k,1192
59
61
  crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
60
62
  crawlo/items/base.py,sha256=q0YTJlqUtizsqXwfWlk0ndcINV9dDyUckwMx8_JrkeY,602
61
63
  crawlo/items/fields.py,sha256=l-DIwK6CCpdzNvf6ELz7Ckc7YCghZD9UCXA8vhNn2UE,1852
62
64
  crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
63
- crawlo/logging/__init__.py,sha256=NlvL0sc9NZqy-Poplwpd2wsUleGyg33MyRb1wxyG-zs,1184
64
- crawlo/logging/async_handler.py,sha256=cMBVi9Ue1y5yZ7r0Uzwr7j_4nyxwmcGXwSE1J7_yIOw,5259
65
- crawlo/logging/config.py,sha256=0N7w542vlKyE5tSLAbDJYm9U4_1lQro_TQKVIJE_pA4,7167
66
- crawlo/logging/factory.py,sha256=b4Z0fBmP00GpvpJ7k4QxqYP32n_EqG5KD3ouUWU7L4U,6656
67
- crawlo/logging/manager.py,sha256=aem7yla0q83rf2CpwQEyg5YMbey4TzkquBVWiKtcqdQ,3182
68
- crawlo/logging/monitor.py,sha256=mzZWm3rQ2mGUoAmkEJPUkBmR0VWK66l14aqqhQ0zwE8,4935
69
- crawlo/logging/sampler.py,sha256=1BoRMpusP3wbXRnet5xl9_Yb_3_-AUq9WJhK9gYg7v4,5292
70
- crawlo/middleware/__init__.py,sha256=khNCstVcYlL14SbLZ8ys9ub1-C8k4FIiMQ99Vw9wA-0,635
71
- crawlo/middleware/default_header.py,sha256=Pw-ev8ffi16GeCh84R5L3hAZgp3G1QXS-H5kV3JEp4Q,5164
72
- crawlo/middleware/download_delay.py,sha256=2iWnJFtWDlqDy5MsAob8TPiJQoiz9v21yatkBI0eptg,3542
73
- crawlo/middleware/middleware_manager.py,sha256=H_o0nwo_xQ8aSRnnvEs2Ho3fS-3WNi_5AjChhqvRYnk,6645
74
- crawlo/middleware/offsite.py,sha256=4tUkPqXMMXsi1WwYnJ_e7wMd6sRgK19QHRCYq8-w8jk,4682
75
- crawlo/middleware/proxy.py,sha256=jfaM4gL78ga_F7LN891dULjjO2zqFmulwQMDs5eJD6k,9591
76
- crawlo/middleware/request_ignore.py,sha256=7qdX4zAimjSGwdod_aWUbOTfzLBWZ5KzLVFchGMCxCI,2663
77
- crawlo/middleware/response_code.py,sha256=d5t0hmP8QliuvvtFOqW-ogCBtZxg2eyjsOtlQAEUxM8,4533
78
- crawlo/middleware/response_filter.py,sha256=tVGr06bfJBR3xAHI2G5c3WimFsGHu8qoJtDcsVuCATU,4384
79
- crawlo/middleware/retry.py,sha256=Acfo95B9wF8fQTCQIqluZOS2hHdnknQu_FOHvpGKJp0,4248
65
+ crawlo/logging/__init__.py,sha256=D9qhyaHHxf6a8syEeqSu8uiV1fjiu0wH6mrZb544N8s,932
66
+ crawlo/logging/config.py,sha256=Ozouc320Y9_lFoDNqf1huL4_hN07LxEJwx4r5_l9-9g,11299
67
+ crawlo/logging/factory.py,sha256=3WtZcdmoypbPVpWlNHCBb45Fd-WR6y_U9y4SLR_CWKI,7019
68
+ crawlo/logging/manager.py,sha256=fF24ZiEPqCafpFF4e8wtZPg7SE93IcmMnVGcezbqWNM,2721
69
+ crawlo/middleware/__init__.py,sha256=YdPoIaPTom2r7kbTYsaNq841Idn1Fi54bs0MrkI8N-c,2269
70
+ crawlo/middleware/default_header.py,sha256=MAFK29Z5AVDtP5McaxTeU45BYaFm0Cxs9fRyF0J0pKg,5151
71
+ crawlo/middleware/download_delay.py,sha256=OVJ0Mii8bvJgbv-faeHfKp_fs2lob2vvoKw7aAfnYUA,3529
72
+ crawlo/middleware/middleware_manager.py,sha256=bhVkfqE1q-VGuGax3pBgjQmsPfzNLz8-pnXF6Y7fDd4,6626
73
+ crawlo/middleware/offsite.py,sha256=ZfiYasOhKha_uPh72E_N4uL2NEvL4zO2RDJx7jJoSQU,4669
74
+ crawlo/middleware/proxy.py,sha256=gNeafGnYr21is4Lth5OrNLyH3aXxs9shBWMLZkM7APs,9578
75
+ crawlo/middleware/request_ignore.py,sha256=I2HwUQjRG95o7qksf0CfdhaWfypu7y9IhRo3dHKa0dM,2661
76
+ crawlo/middleware/response_code.py,sha256=mi20tfnaBCff4fmNJ4n8txgddNyK14hoYHYVWg0-y7Y,4520
77
+ crawlo/middleware/response_filter.py,sha256=pLoKadA4bANC1Hdw_QJkvsiF2jhepIlUAapVKhk7NDQ,4371
78
+ crawlo/middleware/retry.py,sha256=_xfqzirLsgFdxIIJrfjtxuW3mCnT6Ww1EF_VSh9jzAY,4246
80
79
  crawlo/network/__init__.py,sha256=bvEnpEUBZJ79URfNZbsHhsBKna54hM2-x_BV8eotTA4,418
81
- crawlo/network/request.py,sha256=e6-YLgK7SU8D19n21mQwqt_b_aeRVJFOgWPIBPal2ys,14178
82
- crawlo/network/response.py,sha256=-URnNc_J7qBSG19uJbfuF6A_14MHLOtY78FvcZDzbsI,23418
83
- crawlo/pipelines/__init__.py,sha256=FDe2Pr5tiHtV8hFlheElRO_O1aVKvSWlkTcAl9BXAKA,637
84
- crawlo/pipelines/bloom_dedup_pipeline.py,sha256=vIF_6noJAdpotrJpnCmrVXCi59gRmHHn28mYW6VukbM,5465
85
- crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
86
- crawlo/pipelines/csv_pipeline.py,sha256=qbXZoqq4FIR9QkUGpC0ryWzmqGJSrM2bxmWLM4I1nXM,12490
87
- crawlo/pipelines/database_dedup_pipeline.py,sha256=IxahtD_mhni-Y21_idOMX58_Htf46A7n52enG9VR2PI,7296
88
- crawlo/pipelines/json_pipeline.py,sha256=wrCsh8YInmcPLAkhPrHObMx89VZfhf-c7qRrYsTixPE,8585
89
- crawlo/pipelines/memory_dedup_pipeline.py,sha256=lKkYPu6vkpPjfQ1-xOLvPFT4VdTI8QVx0yjqtVR0ZB0,3598
90
- crawlo/pipelines/mongo_pipeline.py,sha256=PohTKTGw3QRvuP-T6SrquwW3FAHSno8jQ2D2cH_d75U,5837
91
- crawlo/pipelines/mysql_pipeline.py,sha256=jlTP1X5QMrSVZjLD4lMS1BUTz-x6bagUEODddvHI2Vg,23702
92
- crawlo/pipelines/pipeline_manager.py,sha256=_DtWfxcTinIf5ApzUOVjZksd2tPbc7qeKi92IVd_kbs,4387
93
- crawlo/pipelines/redis_dedup_pipeline.py,sha256=RB1kXLr8ZuWNrgZKYwt--tlmnWsQTbuwTsSt3pafol8,6077
94
- crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
+ crawlo/network/request.py,sha256=VygsnSHZFdE1XqLdmF-An0_Eq2fC53mXNeCljuKpq7g,15344
81
+ crawlo/network/response.py,sha256=ckWUEYEkXlbWxoOslZSB1Rl4L9TFFouA7qkBaMAEj1c,21450
82
+ crawlo/pipelines/__init__.py,sha256=YJ5XQUJPZWR2yiNb4O7pTNfgrWnmNgZZlcgQ4wo9IOg,1476
83
+ crawlo/pipelines/base_pipeline.py,sha256=8bK9kV9-zfT-NaEi7vWDmMdryYeEWHjpypHs3RYQIi0,14839
84
+ crawlo/pipelines/bloom_dedup_pipeline.py,sha256=o36WsfaGrGu5pJL1ULFy3Ykb-YnBgWRvVyU7cM-inU0,5429
85
+ crawlo/pipelines/console_pipeline.py,sha256=1-rUglds9HRhPUHiWtP927-ryIOMPmgMWgEjzIpuBaY,1274
86
+ crawlo/pipelines/csv_pipeline.py,sha256=MP31vod41PU5DzPWVUl2uCp2GSH7W1NW0MES0U2iFOI,12392
87
+ crawlo/pipelines/database_dedup_pipeline.py,sha256=QHmb0vQC0ZkIEw6cq2F3mSxvRhkvDaP1BtNzGxP39oI,7260
88
+ crawlo/pipelines/json_pipeline.py,sha256=nW4BI6N7OCixcicdbbQquSgLUF3Z4dP3ktBRsLFRmOw,8487
89
+ crawlo/pipelines/memory_dedup_pipeline.py,sha256=6tgnBTr1zTs4uJQcwNthofg9QijNvgqTpqfvteIy0es,3562
90
+ crawlo/pipelines/mongo_pipeline.py,sha256=0iSlUlvjO5AsfuE5zHCKjL0YbhHR5OtZjIjvegNzr7c,6213
91
+ crawlo/pipelines/mysql_pipeline.py,sha256=ZCz1YE6OdjCPzhaCcFN2eBk37bxLXrFDZt-ceS93uks,23235
92
+ crawlo/pipelines/pipeline_manager.py,sha256=c1t-YcIspp7o09NEfaYSBSaUUIg2-0fJhjqvG7_3Jqw,4286
93
+ crawlo/pipelines/redis_dedup_pipeline.py,sha256=j_i9Tl3L7tt4Lkc4tLCIcY3tWMEvvVxDH1TYbbqUWzo,5914
94
+ crawlo/queue/__init__.py,sha256=wY-W_KLmjCbMGYUebmOcTh98JO28pTBH-7ifCAaXBXc,259
95
95
  crawlo/queue/pqueue.py,sha256=bbgd3l1VfqYXfz-4VFaiWLmJit1LdB3qHalCtNqyrqI,1210
96
- crawlo/queue/queue_manager.py,sha256=8rKygMxr6DgSjnGsKFmvlTI5XAARvQIN_ENkAruHGXs,21532
97
- crawlo/queue/redis_priority_queue.py,sha256=vLvg2toKaRrXD1QyEdu1ZjTmANv7clFaBF7mCtstBmI,15995
96
+ crawlo/queue/queue_manager.py,sha256=CjaA0C6HMaET2n_osFr2mxO_fzt8Oq9pOqQIPnt63r4,25059
97
+ crawlo/queue/redis_priority_queue.py,sha256=B9QYTczt12kc4QlZSt1_XyvqvefwesOV-rCUPr_KyXo,22471
98
98
  crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
99
- crawlo/settings/default_settings.py,sha256=TvtXgLzgc9_j_ITt8_xYhag29k6dCJiPU0Yq-snMkt4,12704
99
+ crawlo/settings/default_settings.py,sha256=xTOZAfle2TUVcMkP1m1XAB6MSg-RvCrJp1YvOB8aUEI,13485
100
100
  crawlo/settings/setting_manager.py,sha256=yI1tGaludevxKGGZO3Pn4aYofrg2cwYwvMZCFC5PPZw,8595
101
- crawlo/spider/__init__.py,sha256=QGhe_yNsnfnCF3G9nSoWEw23b8SkP5oSFU5W79C5DzI,21881
102
- crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
103
- crawlo/templates/run.py.tmpl,sha256=1ge0XILc3O5u7S8rsyg_rpe2B2ULokJcrKRVHMwPKj0,511
104
- crawlo/templates/spiders_init.py.tmpl,sha256=p6UK8KWr8FDydNxiAh6Iz29MY5WmgXIkf2z-buOGhOM,354
105
- crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
106
- crawlo/templates/project/items.py.tmpl,sha256=hpQ2AfUmhddnzMuKM5LF6t44dOfFXwJRAZlWFKUFOZw,343
107
- crawlo/templates/project/middlewares.py.tmpl,sha256=eEobZl8g_0DtiwLYbirQULqOacH-yUrrs4PUrGcJ2UE,1098
108
- crawlo/templates/project/pipelines.py.tmpl,sha256=7BeaQDMHbIjhKzRtzlCMiFlU8xgMzDs2PIHq3EVUAlQ,887
109
- crawlo/templates/project/settings.py.tmpl,sha256=fYK2NCJOc_jVRraKkEzH8beyax16KgNa-9s6TsQrdpI,3606
110
- crawlo/templates/project/settings_distributed.py.tmpl,sha256=ULXyi5GDsZggk1Z4SRkalm2g7kJQx9ul6bCARN2I-TM,5566
111
- crawlo/templates/project/settings_gentle.py.tmpl,sha256=NZjSqAqWmYlNE15Zt6-wY4rtxp7ID6HFUafoOvt7VAE,6039
112
- crawlo/templates/project/settings_high_performance.py.tmpl,sha256=QYN4hJqvGmL7oayJjLcx4Mr3jedqRSvdlWkivom2M2o,6129
113
- crawlo/templates/project/settings_minimal.py.tmpl,sha256=8XS_ButRDJxYRQSRHTc_l8ej2DbUnR0j891m0j-gjTY,3122
114
- crawlo/templates/project/settings_simple.py.tmpl,sha256=OmL4GCPpFseRIG0CgL7625IWipc6vG_Da5tefXv_MD0,5891
101
+ crawlo/spider/__init__.py,sha256=91NbcZGUiKOgPpvu8QfsSE6_A2Gw2XQoaiTPxwP-8k8,21855
102
+ crawlo/templates/crawlo.cfg.tmpl,sha256=DfmftICPPsopFGkmOqTWo55PCLboUk5iyFBtGqD_V1g,212
103
+ crawlo/templates/run.py.tmpl,sha256=jr2T6vARwMEJ3v4WXkLH1X0-wDTnoOlJTW-q2255z9o,499
104
+ crawlo/templates/spiders_init.py.tmpl,sha256=QvPw4DxiIjzyue6JDgfFtmuRKrE-jmjlBbvDK10zQwY,109
105
+ crawlo/templates/project/__init__.py.tmpl,sha256=WI5rG7-4rvwmwJWoGCzAgiw4hH2mL6qCYp5GbgiqxJY,54
106
+ crawlo/templates/project/items.py.tmpl,sha256=DrNj48b1W54DmntTLwU--ow2Fw4wSKw8MAmV1K8Vh2E,236
107
+ crawlo/templates/project/middlewares.py.tmpl,sha256=QQlb_bIhurQx4ZOUge3pwHrNIN0Z7RXNXCwA52Hs0Sw,1096
108
+ crawlo/templates/project/pipelines.py.tmpl,sha256=b_c_xLUzxLx3z2rMS0JG4JagM6_s38dx-VDHV47HaLQ,845
109
+ crawlo/templates/project/settings.py.tmpl,sha256=Wcchnbehzg-BY7XCxnrOz-pUwm6PDThN2mp42a-z3iA,3759
110
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=CpL3TkXkXEk90rNAnQRCTvu8jKlkhOckCin8245bdQM,5514
111
+ crawlo/templates/project/settings_gentle.py.tmpl,sha256=T6tPA91KtmOU_oyosrt-Z5LZ73jT_HWBu2mT0FrRxjg,5840
112
+ crawlo/templates/project/settings_high_performance.py.tmpl,sha256=kTdFI_FF_gzYxxASHiXEtfv2ptOHSD6X5K5VHMxPYJs,5960
113
+ crawlo/templates/project/settings_minimal.py.tmpl,sha256=CE-bGRFs12EY3psa6o-F3HB78nBF_YeuhMbgYN9efN8,3461
114
+ crawlo/templates/project/settings_simple.py.tmpl,sha256=Z5nKJXfKVBgBpp4B5DeuyopMPLOcorJJ4Bu6UG8pams,5692
115
115
  crawlo/templates/project/spiders/__init__.py.tmpl,sha256=llhcIItXpm0TlEeumeLwp4fcYv2NHl8Iru7tLhDhxiE,216
116
- crawlo/templates/spider/spider.py.tmpl,sha256=4E4DDoOfI0vN_zLjfmMX_QNmWCx8EbrOKWBg6zozVqs,1065
117
- crawlo/tools/__init__.py,sha256=sXDMZNP6EwZIFivGcRthxqD1DFMMS8UOJvULAzHD-w4,3927
118
- crawlo/tools/authenticated_proxy.py,sha256=ULCK0Cc9F2rGhRqu6kzKBdxzK9v2n1CsatSQ_PMxpAg,7272
119
- crawlo/tools/data_formatter.py,sha256=iBDHpZBZvn9O7pLkTQilE1TzYJQEc3z3f6HXoVus0f0,7808
120
- crawlo/tools/data_validator.py,sha256=bLWnkpFdclJuqjtSAgMI5nznN4vAuPwE1YaiFWKWenM,5490
116
+ crawlo/templates/spider/spider.py.tmpl,sha256=MpnTd69yawJ11rqyYRApj3_InQf_gMzNqPfL5bF7UgM,887
117
+ crawlo/tools/__init__.py,sha256=fatgJTaA06jvINxsqn7yJ-QtnRpxc2eDLnDFTmE_ykI,1768
121
118
  crawlo/tools/date_tools.py,sha256=QOT3W5MqcEQhVM3cTZYxu1MRfgX-TI4aF1RI9s0QbdE,9195
122
119
  crawlo/tools/distributed_coordinator.py,sha256=kkRbRoxz7iXKI3AQElyTptDpYl352ErbSkM3wjSHVwU,12574
123
- crawlo/tools/encoding_converter.py,sha256=CqHAsR2rwxuzsyR-TeQNb79HX5mH4KAUixEY-sX7204,4170
124
- crawlo/tools/network_diagnostic.py,sha256=X1hSbUthIVbMHCU7ti43Zpu8XTaDJd5Oxr2zAkEuSB0,13013
125
- crawlo/tools/request_tools.py,sha256=oXrk4yWMACVa65fDQCQgzsg6a94FH4_lS7qNR53FHYU,2420
126
- crawlo/tools/retry_mechanism.py,sha256=4AQ_HLuYt4hYMI9XHoKFk2GQKEiDJB5pAnsMCfjc6Bk,7777
127
- crawlo/tools/scenario_adapter.py,sha256=pzysL1B2uQ1ZSEncVHd9Hv2viHNgaxP44YAUcDcppfw,9660
120
+ crawlo/tools/scenario_adapter.py,sha256=gBe1nQiVO6c9Lt7GkA25zyidX_jaXUHWLjKWBtvXz5A,9658
128
121
  crawlo/tools/text_cleaner.py,sha256=UrMGcgRnJaufjmDKIDsRYKMA8znCAArHDgouttWPygk,6690
129
- crawlo/utils/__init__.py,sha256=nxLnfqcEGLnsfSEagoKNyu-pm2ByU9BwE5tLxcS71Qo,1003
130
- crawlo/utils/batch_processor.py,sha256=8LNy-K2SrQVUxmGEWxQyYw_j9M-erN4Ie7O4d3zpBvM,9142
131
- crawlo/utils/controlled_spider_mixin.py,sha256=8CuM3Cr2wQLHbaO_ohbCsPcImJnyfZHpERbSeMgQ-AQ,16936
132
- crawlo/utils/db_helper.py,sha256=zFr4BpEMbaY86DrR5Ol5-hfvkSXcG66prl00LPHLl8E,8702
133
- crawlo/utils/env_config.py,sha256=W-VD_WF63DHxsyJysvp1eJwRh3L_pBRl_PitQAY3nQY,4079
134
- crawlo/utils/error_handler.py,sha256=e2LeUGT_OMcNKcjiX9Pp-NuQh5spsHBqIPBd7VxA2IQ,16247
122
+ crawlo/utils/__init__.py,sha256=FFMqPPGW9oLHDIJoF-ImxHwCLL_CKe5rMmi87CShKMs,1511
123
+ crawlo/utils/batch_processor.py,sha256=yGAS-Gp4ZAUEE1O8DHlSVSG2qEhfa2_nlDiFxIZUfyU,9882
124
+ crawlo/utils/config_manager.py,sha256=Qgrtm1v3rdlm-VMCjJOnllxQAizRu1oU6bGOhX_GEv0,14111
125
+ crawlo/utils/controlled_spider_mixin.py,sha256=U8fmo4aMEghmgoPZ8ZAZFutM-VaMFs33VwFS5rRR1XQ,16934
126
+ crawlo/utils/db_helper.py,sha256=auS7KOBvXolpfO1a6McWHBt1PgcTD6FbkkjNdvdjS7s,8700
127
+ crawlo/utils/encoding_helper.py,sha256=4MjF1Nzllt-kqIqLGp4p415KukouNZbxTOacTEFWi1M,6016
128
+ crawlo/utils/error_handler.py,sha256=Wm4fieywfR4M39v7GR0Tj2WTBcOQ8Mjf6PY5VF8VNTc,16234
135
129
  crawlo/utils/fingerprint.py,sha256=3IbctH3zwyBjN_12SH7-vrFt-akA2WSo3iAzHc6u--s,3689
136
130
  crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
137
- crawlo/utils/large_scale_config.py,sha256=NZMsDj4qbVx06Fu0aHqNKX1yzo6WFT7CgrhVnvw1ZFs,8372
138
- crawlo/utils/large_scale_helper.py,sha256=4ORkZcIrwJ0SlKOUh7l7WIuERORuRhNBgHCM71Rz0n0,12452
139
- crawlo/utils/log.py,sha256=KmUWVYq8t6fSGOC88nnYCDxwBUdoPWvaBmpOSHn2oWI,2914
131
+ crawlo/utils/large_scale_helper.py,sha256=pHu4597lVtycvNwJXbw1IWCF6U7L8pMzWwBSwEZl-Fk,12450
132
+ crawlo/utils/leak_detector.py,sha256=7ycQEpVRglYUdueOzY-Kkt1mQK8nk0wbDRVO9uo4jhw,11922
140
133
  crawlo/utils/misc.py,sha256=m_TbfMf4Aoe70zmkv7XWyFg8Rz0qOYPXepwB6EcYr7Y,2519
141
- crawlo/utils/performance_monitor.py,sha256=32KspSo7RWvCX_fl0ZFn4ScWWOqbVVwEhPRd921Ez6I,9832
134
+ crawlo/utils/mongo_connection_pool.py,sha256=CCWsgF82LzQSBBlNJAW2AfWosw13sqOQbGXfKKNoFGk,6046
135
+ crawlo/utils/mysql_connection_pool.py,sha256=xNCGvZV1ytnejPmxISb4uDVqdCeGz4PNXXtpkcBz7yc,7254
136
+ crawlo/utils/performance_monitor.py,sha256=QevfmkIbu0Ox3kd7eiH_IIWiFy1zfNcqOlnBH8OhdLE,9830
142
137
  crawlo/utils/queue_helper.py,sha256=gFmkh1jKlIcN1rmo2Jl6vYcLP5ByUWlfHO9eNlZPBLs,4918
143
- crawlo/utils/redis_connection_pool.py,sha256=EsPZkmQctWkoYU2wcrqkgwnIWnE6nG4XCXECKn216JA,12575
144
- crawlo/utils/redis_key_validator.py,sha256=-UTTx0Ul184pzwSply8hVdH0lp-gkXXOc_gEHR_7VlU,5809
145
- crawlo/utils/request.py,sha256=RcINrLvShfZ5VHu1T_hJJRXp-viKWSo35C2JOgWyl2k,8641
146
- crawlo/utils/request_serializer.py,sha256=b5abcgjJk4IU6Wfg46AmOAU2wmzu_WqcpEbuAncRMGQ,8931
147
- crawlo/utils/selector_helper.py,sha256=BVczzsSzPL5zF5KHXK3hyuqEl9o0ADYEuCH7Aw8aj98,4332
148
- crawlo/utils/spider_loader.py,sha256=oxifl0p4SOFhvvnD38Em4zGtC7sRr_pw4dki01MoAq0,7677
149
- crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
150
- crawlo/utils/text_helper.py,sha256=TTZgQPayMFUOYj8syt47Gwa4AQVY15W1b56STJetAKE,2920
151
- crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
152
- crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
138
+ crawlo/utils/redis_checker.py,sha256=XP05VK8dpFYYt8p3eK2tiNskmSCQXJAshuJ-BqsqV-U,2639
139
+ crawlo/utils/redis_connection_pool.py,sha256=DWqcusN6jfFn9YA7kuhVNsLZr4P0bOIFXUaYqTueewQ,21397
140
+ crawlo/utils/redis_key_validator.py,sha256=v7pat8g30MyqQDaXJbDDH4CS8irmu0ijTGZRfm7nh7g,5807
141
+ crawlo/utils/request.py,sha256=mMDo85uEbjLYJ-Np2VT0GMeXD7L8IYY4eIV98lIspYU,9438
142
+ crawlo/utils/request_serializer.py,sha256=zuH2_AhSZDVDmKUo6NrfG6kt4ZsbBict9pUf7CZvaM8,8929
143
+ crawlo/utils/resource_manager.py,sha256=SoZV7Z980HZaDazchUY8fc8J481VGPrgW18bqFzIZAs,11209
144
+ crawlo/utils/response_helper.py,sha256=YJYnK_NYWEIm7iF9rfgU-xB2WvGfJo28MGYRWao-Ghs,3392
145
+ crawlo/utils/selector_helper.py,sha256=kfqgy3ZV0RuGE_I8sNqQemr1XuJYBmM1Hkk7CD61HzU,4314
146
+ crawlo/utils/singleton.py,sha256=oXtNVB1_yTBv1Hvq8_jU_X3eKMKmQ21Kl0G9SS1vcWc,1892
147
+ crawlo/utils/spider_loader.py,sha256=CVyzuVmMFZ02ur8USna3jZNyMrhFUdPKoatnbDkOnSc,7675
148
+ crawlo/utils/text_helper.py,sha256=gYIrkH4_vFHbKZH9m6d1BVO5dqRqUccJqSWMdgVOb4g,2918
153
149
  examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
154
150
  tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
155
- tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
151
+ tests/advanced_tools_example.py,sha256=W_QLm62QJGDSsjTi1ZrTfzrwfakk21PF_iOQvraBc94,7031
156
152
  tests/authenticated_proxy_example.py,sha256=ZgLrU-1GaBhkJK1Wy0X93lHP1GT2sU2_wi3RI1CfrVc,3135
157
153
  tests/baidu_performance_test.py,sha256=wxdaI7UwKboMYH_qcaqZLxAStvndH60bvKGzD8F-jaI,3974
158
154
  tests/baidu_test.py,sha256=NKYnwDbPJX3tmKtRn7uQ_QWzUXiLTQC-Gdr1cQkJzEo,1874
@@ -171,11 +167,11 @@ tests/debug_log_levels.py,sha256=CZWG3KGDq-hYJ5TPhoZTyjKFKkkM-AoK3oP1w-JC1sc,216
171
167
  tests/debug_pipelines.py,sha256=FMb36bH9lQxBLb-nM579hBRK1S16Vxu1t_BC3Dj8O2w,2164
172
168
  tests/detailed_log_test.py,sha256=oTCFF_Un7Jq2gV4rpRDFOxlHJSthnQhvEf0CSItfB7I,7501
173
169
  tests/direct_selector_helper_test.py,sha256=p7_x3x87JUnpKplmwYO4zN5ympcPJSPdHsviso-LmpI,2862
170
+ tests/distributed_dedup_test.py,sha256=EaRnWH3ADsJN67Kn7T5TYPiQyMvyRtV6OCPk1uUTXQM,16507
174
171
  tests/distributed_test.py,sha256=u6cEiymZzCItaTClKTxwVjNmOj9_PZii4_eGNAVMDW8,1825
175
172
  tests/distributed_test_debug.py,sha256=pUv6ZKEJ5pK9xOA7lgVk6WW3cBAtnb1bsuZzJ8oGLvY,2181
176
173
  tests/dynamic_loading_example.py,sha256=7LdeQZFevrb-U1_dgr4oX3aYo2Da4HvE_0KIf1fw4Ew,18786
177
174
  tests/dynamic_loading_test.py,sha256=dzDW7b66HeDsIYsYgvNRihE3V6b6gEbUGQpp-eJbcIM,3413
178
- tests/env_config_example.py,sha256=_ZRDh_LR23ZKpy9E--y_KM0QIOiZF5vRT98QTn52TY8,4951
179
175
  tests/error_handling_example.py,sha256=grTeo1X17rFz4lhgASb0g5yu4NWbmNz5neyuonnNR40,5294
180
176
  tests/explain_mysql_update_behavior.py,sha256=uBrJwiYujTJF35oF1kYMRjYU5k5Y3YlqOfOni0oPQtY,2865
181
177
  tests/final_comprehensive_test.py,sha256=szTNbtwKfYNmE0kzDPCsE_kvnTG7FNKl2JERakGhKIk,4314
@@ -185,6 +181,7 @@ tests/fix_log_test.py,sha256=hcRy0j3j0CT0oLN7KNA0VL-_o4M-uE1amR6GziBflfU,4440
185
181
  tests/framework_performance_test.py,sha256=Qp47VrsCK0ylEhDkFOm7lnD8rVkaJ7u1MopsEhAomrE,6985
186
182
  tests/log_buffering_test.py,sha256=0B5UY1yQuxnBU1pEyz3IBYweN__4fOkPXly-kYfOpNU,3226
187
183
  tests/log_generation_timing_test.py,sha256=zHb_m2FqlpRCYw-wqFWFn8cbVH8UR3VvXKSM6nNnbgo,4681
184
+ tests/monitor_redis_dedup.sh,sha256=2nFs5zYiguVqL2YSw_XbhGb30a_EDg0wTIV7zOa0pNg,2284
188
185
  tests/optimized_performance_test.py,sha256=bA0dN4j7ViyTSSiCJEjlkJ9Y7jspTFKs2xX7UXHE8Gs,7379
189
186
  tests/performance_comparison.py,sha256=UevHOM_9z2ILedf_xZ_8F8QiPjb_M8WTfGQrxzKtgco,9266
190
187
  tests/queue_blocking_test.py,sha256=hp-6hmTOO64oOAWVtlN8cFJ95GjbK3t9fj-4q_TKowk,3955
@@ -192,6 +189,7 @@ tests/queue_test.py,sha256=HeBiBXqAgIAbUkLVQ3McS6NdRselA30m3lnuxNBvZbk,2689
192
189
  tests/redis_key_validation_demo.py,sha256=WD2jvuBwHhLYIb3lVFtvYSSnmXWn1EW4EPCEwFhfi6M,4467
193
190
  tests/request_params_example.py,sha256=J50NdsnK1sDrqG-5m3oA-mu1_wHwVwHIfsWxGeQpz7o,4250
194
191
  tests/response_improvements_example.py,sha256=t1cbG3nesp82bqog4_ku1GvQzNbhRyWa5EaKTmOPrSk,5402
192
+ tests/simple_cli_test.py,sha256=t-MyZIk65B3l_JT7Ocf2K3d_bQHBhzzkyx3lvpHw6eU,1594
195
193
  tests/simple_command_test.py,sha256=8TowzW45ukKTPeaNC5uij3RR7rqPULiBr2PguSSMdP8,3688
196
194
  tests/simple_crawlo_test.py,sha256=FYDn5cgAxHN81QSYa_wcJcxJit7aLnIopnkHKKr83dE,4801
197
195
  tests/simple_follow_test.py,sha256=3vNT5Eqwza6fxAY9Xl_9xtFGdfrPwm6NnVHdRmJsH8A,1053
@@ -210,11 +208,10 @@ tests/test_all_commands.py,sha256=VgVa9SzU5Irvn5igHpC2W4E_6ZDWDt7jc-T4UPK_PFE,77
210
208
  tests/test_all_pipeline_fingerprints.py,sha256=NDrBYr0f9CAhjmSezTS4NUrAdcotrSX3ElJTWqjXXbU,5308
211
209
  tests/test_all_redis_key_configs.py,sha256=dWc4Dsr07_vuSpb4hwkMpyy6XO8SI7vglVjGuGvXoa4,5710
212
210
  tests/test_asyncmy_usage.py,sha256=gxENdxrcLlDG2m8V-j4ZnSJYFc3x6CvKvgPAhOC13DE,1688
213
- tests/test_authenticated_proxy.py,sha256=lnvmQwuf0zaZP_E05EzcNFR2VJbwTkLjOmZGNoJKaC4,4339
214
211
  tests/test_batch_processor.py,sha256=4_nYlu9R1JkDCFHq0bYc9LUNqsg41r7sQ879hkrhEts,7212
215
212
  tests/test_cleaners.py,sha256=HDK8_YU7GUj_3hGU415cxEeUR74mnDSk0yroLlgDI0I,1816
213
+ tests/test_cli_arguments.py,sha256=_wNtGNLnrrw3zVyickLcI9qI6ncjms_8AH0IrCNWk4U,5443
216
214
  tests/test_component_factory.py,sha256=V3hO5pJHSDtViLAykXSUqkeH4g-GB4GczwutrTatS2U,5809
217
- tests/test_comprehensive.py,sha256=dvRJeeVYc1cgXK9Y171hH9Y847zZpWSAFFH-EI3UepQ,5182
218
215
  tests/test_config_consistency.py,sha256=RgSxyaypMpysltsGSh1vFMeOShiZZG0rmUKzEhNLpYw,2001
219
216
  tests/test_config_merge.py,sha256=ts1j-TIKkFS0EO5q1I4O7f4YUKR5MLTmRSqOpOlv094,5606
220
217
  tests/test_config_validator.py,sha256=Z4gBHkI0_fEx-xgiiG4T33F4BAuePuF81obpNTXfseY,6202
@@ -223,7 +220,7 @@ tests/test_crawler_process_import.py,sha256=iIPqSCpv2VRb_hWTu5euLME4PDFf7NwixeBy
223
220
  tests/test_crawler_process_spider_modules.py,sha256=uMr4esj6ascVBzt0WrPd3ZOQfKD00O6tJrNhuWOdvV0,1395
224
221
  tests/test_crawlo_proxy_integration.py,sha256=JFBI82ILXMwAIJ29C8uhu5r-hH3UhMC50jKr5-jy6Ng,3059
225
222
  tests/test_date_tools.py,sha256=pcLDyhLrZ_jh-PhPm4CvLZEgNeH9kLMPKN5zacHwuWM,4053
226
- tests/test_dedup_fix.py,sha256=UFdm8lIi0ZIdp40W8ruxRD69bxzijuFUfNyJmB4Fwl0,8788
223
+ tests/test_dedup_fix.py,sha256=6gQKDatida54itwPtB1-HwUTKbdwwJA2Yc-HhhHj_wM,8747
227
224
  tests/test_dedup_pipeline_consistency.py,sha256=dn5EAZSU5gQOV5EQwreHp76i5aQZ9tEdltSGO7dif5M,5176
228
225
  tests/test_default_header_middleware.py,sha256=UDjEPIUCre1M6ndjV_uXLVCfY7WJwyN-1Xn15hzbKMo,13126
229
226
  tests/test_distributed.py,sha256=78Pn4HPLIaO8t1IiaSkckBmuEVTcnC8IDw7znf9_Zcw,1790
@@ -231,28 +228,21 @@ tests/test_double_crawlo_fix.py,sha256=lZwrT5ij6Jbh0EzZswhw05FXwgKaEZsSHekLTrJJa
231
228
  tests/test_double_crawlo_fix_simple.py,sha256=NDmCEeyvpf_D1tGQMA66iLPPKlAnSZcEg71e7GHYcjg,4768
232
229
  tests/test_download_delay_middleware.py,sha256=Idc6KzhL3hY3aDKgn1j_v5-mLIHz7dTnV5c4tJVZh5Q,9107
233
230
  tests/test_downloader_proxy_compatibility.py,sha256=NJJ-g_I665lHLsJZd7ONvKubHRxv82FADZR9WYzgyzA,9418
234
- tests/test_dynamic_downloaders_proxy.py,sha256=t_aWpxOHi4h3_fg2ImtIq7IIJ0r3PTHtnXiopPe2ZlM,4450
235
- tests/test_dynamic_proxy.py,sha256=zi7Ocbhc9GL1zCs0XhmG2NvBBeIZ2d2hPJVh18lH4Y0,3172
236
- tests/test_dynamic_proxy_config.py,sha256=C_9CEjCJtrr0SxIXCyLDhSIi88ujF7UAT1F-FAphd0w,5853
237
- tests/test_dynamic_proxy_real.py,sha256=krWnbFIH26mWNPhOfPMmx3ZxJfOreZxMZFGwVb_8-K8,3511
238
231
  tests/test_edge_cases.py,sha256=460JtYR6yuTo8J4wqJScMzDkrrDUE2Q8R425AaUycIQ,11127
239
232
  tests/test_encoding_core.py,sha256=k5fZET0R1KInhAlbbHEJv4m9d6NuibOxxfIcR43TS7Y,1681
240
233
  tests/test_encoding_detection.py,sha256=Zb1KkF2CR57qa0Hr_Iv8msompGJZT2EIL_2mGp0zX9Q,4245
241
234
  tests/test_enhanced_error_handler.py,sha256=Ku_86jv7iDe25v8ZxalcXxJJjIiIvQXWH8ZldbwdVm8,8581
242
235
  tests/test_enhanced_error_handler_comprehensive.py,sha256=j_cxyIPGks9A3untKhAdj5HU0hrLbbzOLu0uAtGUlJo,9369
243
- tests/test_env_config.py,sha256=Qu1sDeADs69dSr1x0QmEe8nJrMHneE_4JClt-N901e8,4867
244
236
  tests/test_error_handler_compatibility.py,sha256=xJ43cmCwfBGh-qBwCGiMDPPlfNDLw4ZrmlrHN9IojkU,4241
245
237
  tests/test_factories.py,sha256=wKFfr8YBXPs-AQ8YOFgDhINn5uivKqPBZQPUe5GL9Ig,8865
246
238
  tests/test_factory_compatibility.py,sha256=zzTXd3ku3iedgxgB1DxTt3zfetiIl6wCjL9yXIUCpic,6260
247
239
  tests/test_final_validation.py,sha256=OuZI01O0E68Pao--bD-BFDTRZFPc_Mt4W-OXUzlt6ZA,4966
248
240
  tests/test_fingerprint_consistency.py,sha256=68V5u_2hNABI5pNWzXUrA1PJ08Xh9x3-JsMSNNjORMo,4956
249
241
  tests/test_fingerprint_simple.py,sha256=qiSba8gF3Zl91QO_ijJO7KstLdjATs30V_GZCNHShig,1626
250
- tests/test_framework_env_usage.py,sha256=bFb_ptdLeX2obdJWEqEHPWweiWR-wR2BpvEaJMQK7h4,4201
251
242
  tests/test_get_component_logger.py,sha256=UKj5uT1F3L3atoJFmpk7QSDO2fZHgw-7Y84vMFbHRkM,2285
252
243
  tests/test_hash_performance.py,sha256=4eVPwbu66Oun0LVyTTNd9d2cj2V1xq0YZkRg8Z0TO-Q,3211
253
244
  tests/test_integration.py,sha256=lVEzKNAjFzFZHRNZAyJmXxa_5Ogf_qqL4APqs620o58,4839
254
245
  tests/test_item_dedup_redis_key.py,sha256=dp_H59exJLaZHh5oMtmMEOWh-DNZwbnwIFYDjOpHgd0,3842
255
- tests/test_large_scale_config.py,sha256=Ik32ilAOQXsyw2sHR53gDPNNjY0AXybQ9ya2JY-EeqM,4296
256
246
  tests/test_large_scale_helper.py,sha256=0L6EKHcKgh7XHvoW4wRSkxmw8GolUwSOCgZ_-ZmCyDo,8371
257
247
  tests/test_logging_enhancements.py,sha256=YHcYWC8PG_AP5wZnmOr6H7QuU7m-3xzxEhppM0Jubvg,12731
258
248
  tests/test_logging_final.py,sha256=K9vxyODslXza05hElVEcvzbXgzthYKK5CRj4UJTftIw,6376
@@ -281,7 +271,6 @@ tests/test_pipeline_fingerprint_consistency.py,sha256=LL55oGSDGy0K8LxoyKa6ogNHXh
281
271
  tests/test_priority_behavior.py,sha256=JQ5uv80cAUKV9Eh3S8j5zxYSSL-dmzhwhuKOINM26zU,9325
282
272
  tests/test_priority_consistency.py,sha256=rVX7nku5N_QpB_ffDu3xqREkCWPX5aNNiXy112o9wpA,5756
283
273
  tests/test_priority_consistency_fixed.py,sha256=MlYi5PIr5wxunC3Ku4ilnxOatKyRu2qIvhV7pjadkjg,10765
284
- tests/test_proxy_api.py,sha256=XnmklS-xU4ke_560gV6AIlBsRmG8YLQTGFAZrTUZuhc,11013
285
274
  tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
286
275
  tests/test_proxy_middleware.py,sha256=MC2Hg88Pdpv6i_gTAy4ocIWOOxQ8bF7hYtszwpOzilE,8716
287
276
  tests/test_proxy_middleware_enhanced.py,sha256=N7Ly3koCH2uRYk6pxhEJwWpChKdIucdrj0nKvq_E4bw,6896
@@ -301,7 +290,6 @@ tests/test_queue_type_redis_config_consistency.py,sha256=1ew7Zp9CxH1DQ0RUmsZMV-n
301
290
  tests/test_random_headers_default.py,sha256=ulDb3_kRpnTCN1-TO3m6wVM-eMkZS_ezsSbd1ur8Xpg,12772
302
291
  tests/test_random_headers_necessity.py,sha256=SSbNQIE347oCQvuG6yaAambFU-3MyQzTV5jN1kArRGY,11741
303
292
  tests/test_random_user_agent.py,sha256=6HjU4iUcMk-J6bR2N5FhIkWDfnaFKAPNVyRzxmQQ14k,2302
304
- tests/test_real_scenario_proxy.py,sha256=L2Mfwt47pvs6dYJDcazeyupoQ_DuvhdulCz6-2GFR9Y,7527
305
293
  tests/test_redis_config.py,sha256=51_Fy1PqIhS0MMO2nR4q6oQjBFxfqcUPK_4NNf5s83g,903
306
294
  tests/test_redis_connection_pool.py,sha256=pKfXdE3Cm_L_fNqI9zqFmqiidCwR0t7hiM_Fu_V1cNI,9328
307
295
  tests/test_redis_key_naming.py,sha256=MTFk656JhiGVTsMctBDhBNOMFcBDZrsQA3UfPZ-Dgj4,6911
@@ -336,7 +324,6 @@ tests/test_template_redis_key.py,sha256=99-s0_-8MFJbIvGG_X__sH0qkXWTtJv8fdTdlfts
336
324
  tests/test_tools.py,sha256=z50Bvq_q8FwpyxNkmh00_A3sXkSv2l1Q_EbK02FDYgk,5504
337
325
  tests/test_user_agent_randomness.py,sha256=tE8_zh-BjMAQ9CTgScxZze6JarNher6COkdoLU68YfA,5681
338
326
  tests/test_user_agents.py,sha256=e4haX-o8Janl-PawGJ9MemZyMqTX33_tBF_WnYSVoUw,3327
339
- tests/tools_example.py,sha256=Rxu5vVKnj3CZ3mCx-EEotBWPtZs2S7ktyqq-SYeclxU,7999
340
327
  tests/untested_features_report.md,sha256=31aUlsw_1OKe0_ijAjeH85kJ7HJ8qzKLJdOHDjWtYdY,4169
341
328
  tests/verify_debug.py,sha256=iQ4Efwg9bQTHscr73VYAAZ8rBIe1u6mQfeaEK5YgneY,1564
342
329
  tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4045
@@ -348,14 +335,13 @@ tests/ofweek_scrapy/ofweek_scrapy/items.py,sha256=Y_TwwHPAgOXTuCTdnhRxil7vYPk1_r
348
335
  tests/ofweek_scrapy/ofweek_scrapy/middlewares.py,sha256=O4jVSXZgxtsRzU9O_O3YdkS7_QLndzv3uYP-Op8g254,3654
349
336
  tests/ofweek_scrapy/ofweek_scrapy/pipelines.py,sha256=ZO6WqTqPpTwLvnwO7YL0E35OPp4zSfJ_GhMeshNRSow,379
350
337
  tests/ofweek_scrapy/ofweek_scrapy/settings.py,sha256=X3Y6goZluAz0n2bepWAKEhZX0URFfe9_lBRBCPgtLPk,2933
351
- tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py,sha256=ULwecZkx3_NTphkz7y_qiazBeUoHFnCCWnKSjoDCZj0,161
352
- tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py,sha256=gcfKze-ipzP7JTDGCL3TgtjwIwfgI7dPL6GmdXVT0fs,6880
338
+ tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py,sha256=a-Jax6MTMZC2HOw3mUBcNu-T44dOUHDsb22Oly4HTnM,165
353
339
  tests/scrapy_comparison/ofweek_scrapy.py,sha256=rhVds_WjYum1bLuWWe90HtXE51fZXEqhhPSc822ZasQ,5790
354
340
  tests/scrapy_comparison/scrapy_test.py,sha256=-IsGUHPBgEL0TmXjeLZl-TUA01B7Dsc2nRo4JZbFwZA,5599
355
341
  tests/test_spiders/__init__.py,sha256=Ws2DhfUA0Xh5Cxr9M46td7B6hyNoLTyAhZ60FnIh6D0,20
356
342
  tests/test_spiders/test_spider.py,sha256=kNGEg80HMMFgzVseI1jJjljZEBy3QYKt_3SXGASffFM,168
357
- crawlo-1.4.6.dist-info/METADATA,sha256=j66m-xE1oVuLE4WEnDbBjH6PXGbfbgM7yxSF616EOHo,9355
358
- crawlo-1.4.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
359
- crawlo-1.4.6.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
360
- crawlo-1.4.6.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
361
- crawlo-1.4.6.dist-info/RECORD,,
343
+ crawlo-1.4.8.dist-info/METADATA,sha256=-3vWrJ0Mpd-DMRD4S0bRZwzUuKF17FyauuSI78eD7fo,24526
344
+ crawlo-1.4.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
345
+ crawlo-1.4.8.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
346
+ crawlo-1.4.8.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
347
+ crawlo-1.4.8.dist-info/RECORD,,
@@ -22,11 +22,6 @@ from crawlo.tools import (
22
22
  handle_captcha,
23
23
  detect_rate_limiting,
24
24
 
25
- # 带认证代理工具
26
- AuthenticatedProxy,
27
- create_proxy_config,
28
- format_proxy_for_request,
29
-
30
25
  # 分布式协调工具
31
26
  generate_pagination_tasks,
32
27
  distribute_tasks,
@@ -125,32 +120,6 @@ def demo_anti_crawler_tools():
125
120
  print()
126
121
 
127
122
 
128
- def demo_authenticated_proxy_tools():
129
- """演示带认证代理工具的使用"""
130
- print("=== 带认证代理工具演示 ===\n")
131
-
132
- # 不同类型的代理URL
133
- proxy_urls = [
134
- "http://user:pass@proxy1.example.com:8080", # 带认证HTTP代理
135
- "https://username:password@proxy2.example.com:443", # 带认证HTTPS代理
136
- "http://proxy3.example.com:8080" # 不带认证代理
137
- ]
138
-
139
- for proxy_url in proxy_urls:
140
- print(f"处理代理: {proxy_url}")
141
-
142
- # 创建代理对象
143
- proxy = AuthenticatedProxy(proxy_url)
144
-
145
- # 为不同下载器格式化代理配置
146
- for downloader in ["aiohttp", "httpx", "curl_cffi"]:
147
- config = create_proxy_config(proxy_url)
148
- formatted = format_proxy_for_request(config, downloader)
149
- print(f" {downloader}格式: {formatted}")
150
-
151
- print()
152
-
153
-
154
123
  def demo_distributed_coordinator_tools():
155
124
  """演示分布式协调工具的使用"""
156
125
  print("=== 分布式协调工具演示 ===\n")
@@ -191,8 +160,7 @@ from crawlo.tools import (
191
160
  validate_email,
192
161
  AntiCrawler,
193
162
  DistributedCoordinator,
194
- retry,
195
- AuthenticatedProxy
163
+ retry
196
164
  )
197
165
 
198
166
  class AdvancedSpider(Spider):
@@ -200,40 +168,19 @@ class AdvancedSpider(Spider):
200
168
  super().__init__()
201
169
  self.anti_crawler = AntiCrawler()
202
170
  self.coordinator = DistributedCoordinator()
203
- # 代理列表
204
- self.proxy_urls = [
205
- "http://user1:pass1@proxy1.example.com:8080",
206
- "http://user2:pass2@proxy2.example.com:8080",
207
- "http://proxy3.example.com:8080" # 不带认证
208
- ]
209
171
 
210
172
  def start_requests(self):
211
173
  # 生成分页任务
212
174
  base_url = "https://api.example.com/products"
213
175
  pagination_tasks = self.coordinator.generate_pagination_tasks(base_url, 1, 100)
214
176
 
215
- for i, url in enumerate(pagination_tasks):
216
- # 轮换使用代理
217
- proxy_url = self.proxy_urls[i % len(self.proxy_urls)]
218
- proxy = AuthenticatedProxy(proxy_url)
219
-
220
- request = Request(url, callback=self.parse)
221
-
222
- # 根据下载器类型设置代理
223
- downloader_type = self.crawler.settings.get("DOWNLOADER_TYPE", "aiohttp")
224
- if downloader_type == "aiohttp":
225
- request.proxy = proxy.clean_url
226
- auth = proxy.get_auth_credentials()
227
- if auth:
228
- request.meta["proxy_auth"] = auth
229
- elif downloader_type == "httpx":
230
- request.proxy = proxy.clean_url
231
- elif downloader_type == "curl_cffi":
232
- request.proxy = proxy.proxy_dict
233
- auth_header = proxy.get_auth_header()
234
- if auth_header:
235
- request.headers["Proxy-Authorization"] = auth_header
236
-
177
+ for url in pagination_tasks:
178
+ # 直接使用带认证的代理URL(框架原生支持)
179
+ request = Request(
180
+ url,
181
+ callback=self.parse,
182
+ proxy="http://user:pass@proxy.example.com:8080" # 所有下载器都支持
183
+ )
237
184
  yield request
238
185
 
239
186
  @retry(max_retries=3)
@@ -248,19 +195,15 @@ class AdvancedSpider(Spider):
248
195
  products = response.css('.product-item')
249
196
  for product in products:
250
197
  name = product.css('.product-name::text').get()
251
- price_text = product.css('.price::text').get()
252
198
  email = product.css('.contact-email::text').get()
253
199
 
254
200
  # 数据清洗和验证
255
201
  clean_name = clean_text(name) if name else None
256
- clean_price = clean_text(price_text) if price_text else None
257
202
  is_valid_email = validate_email(email) if email else False
258
203
 
259
204
  # 检查数据是否重复
260
- if not await self.coordinator.is_duplicate({"name": clean_name, "price": clean_price}):
261
- # 添加到去重集合
262
- await self.coordinator.add_to_dedup({"name": clean_name, "price": clean_price})
263
-
205
+ if not await self.coordinator.is_duplicate({"name": clean_name}):
206
+ await self.coordinator.add_to_dedup({"name": clean_name})
264
207
  # 处理产品数据...
265
208
  pass
266
209
  """)
@@ -271,6 +214,5 @@ if __name__ == '__main__':
271
214
  demo_data_processing_tools()
272
215
  demo_retry_mechanism()
273
216
  demo_anti_crawler_tools()
274
- demo_authenticated_proxy_tools()
275
217
  demo_distributed_coordinator_tools()
276
218
  demo_in_spider()