crawlo 1.3.2__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (219) hide show
  1. crawlo/__init__.py +63 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +322 -314
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +2 -2
  16. crawlo/core/engine.py +365 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +256 -256
  19. crawlo/crawler.py +1166 -1168
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +226 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +52 -45
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/filters/__init__.py +154 -154
  40. crawlo/filters/aioredis_filter.py +234 -234
  41. crawlo/filters/memory_filter.py +269 -269
  42. crawlo/items/__init__.py +23 -23
  43. crawlo/items/base.py +21 -21
  44. crawlo/items/fields.py +52 -52
  45. crawlo/items/items.py +104 -104
  46. crawlo/middleware/__init__.py +21 -21
  47. crawlo/middleware/default_header.py +132 -132
  48. crawlo/middleware/download_delay.py +104 -104
  49. crawlo/middleware/middleware_manager.py +135 -135
  50. crawlo/middleware/offsite.py +123 -123
  51. crawlo/middleware/proxy.py +386 -386
  52. crawlo/middleware/request_ignore.py +86 -86
  53. crawlo/middleware/response_code.py +163 -163
  54. crawlo/middleware/response_filter.py +136 -136
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/middleware/simple_proxy.py +65 -65
  57. crawlo/mode_manager.py +187 -187
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +379 -379
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +318 -318
  70. crawlo/pipelines/pipeline_manager.py +75 -75
  71. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  72. crawlo/project.py +325 -297
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +379 -379
  75. crawlo/queue/redis_priority_queue.py +306 -306
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +225 -225
  78. crawlo/settings/setting_manager.py +198 -198
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +129 -129
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +118 -118
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/settings.py.tmpl +266 -266
  89. crawlo/templates/project/settings_distributed.py.tmpl +179 -179
  90. crawlo/templates/project/settings_gentle.py.tmpl +60 -60
  91. crawlo/templates/project/settings_high_performance.py.tmpl +130 -130
  92. crawlo/templates/project/settings_minimal.py.tmpl +34 -34
  93. crawlo/templates/project/settings_simple.py.tmpl +101 -101
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/run.py.tmpl +38 -38
  96. crawlo/templates/spider/spider.py.tmpl +143 -143
  97. crawlo/tools/__init__.py +200 -200
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/tools/data_formatter.py +225 -225
  101. crawlo/tools/data_validator.py +180 -180
  102. crawlo/tools/date_tools.py +289 -289
  103. crawlo/tools/distributed_coordinator.py +388 -388
  104. crawlo/tools/encoding_converter.py +127 -127
  105. crawlo/tools/request_tools.py +82 -82
  106. crawlo/tools/retry_mechanism.py +224 -224
  107. crawlo/tools/scenario_adapter.py +262 -262
  108. crawlo/tools/text_cleaner.py +232 -232
  109. crawlo/utils/__init__.py +34 -34
  110. crawlo/utils/batch_processor.py +259 -259
  111. crawlo/utils/controlled_spider_mixin.py +439 -439
  112. crawlo/utils/db_helper.py +343 -343
  113. crawlo/utils/enhanced_error_handler.py +356 -356
  114. crawlo/utils/env_config.py +142 -142
  115. crawlo/utils/error_handler.py +123 -123
  116. crawlo/utils/func_tools.py +82 -82
  117. crawlo/utils/large_scale_config.py +286 -286
  118. crawlo/utils/large_scale_helper.py +344 -344
  119. crawlo/utils/log.py +199 -146
  120. crawlo/utils/performance_monitor.py +285 -285
  121. crawlo/utils/queue_helper.py +175 -175
  122. crawlo/utils/redis_connection_pool.py +351 -351
  123. crawlo/utils/redis_key_validator.py +198 -198
  124. crawlo/utils/request.py +267 -267
  125. crawlo/utils/request_serializer.py +218 -218
  126. crawlo/utils/spider_loader.py +61 -61
  127. crawlo/utils/system.py +11 -11
  128. crawlo/utils/tools.py +4 -4
  129. crawlo/utils/url.py +39 -39
  130. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
  131. crawlo-1.3.3.dist-info/RECORD +219 -0
  132. examples/__init__.py +7 -7
  133. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  134. tests/__init__.py +7 -7
  135. tests/advanced_tools_example.py +275 -275
  136. tests/authenticated_proxy_example.py +107 -107
  137. tests/cleaners_example.py +160 -160
  138. tests/config_validation_demo.py +142 -142
  139. tests/controlled_spider_example.py +205 -205
  140. tests/date_tools_example.py +180 -180
  141. tests/debug_pipelines.py +66 -66
  142. tests/dynamic_loading_example.py +523 -523
  143. tests/dynamic_loading_test.py +104 -104
  144. tests/env_config_example.py +133 -133
  145. tests/error_handling_example.py +171 -171
  146. tests/redis_key_validation_demo.py +130 -130
  147. tests/request_params_example.py +150 -150
  148. tests/response_improvements_example.py +144 -144
  149. tests/test_advanced_tools.py +148 -148
  150. tests/test_all_redis_key_configs.py +145 -145
  151. tests/test_authenticated_proxy.py +141 -141
  152. tests/test_cleaners.py +54 -54
  153. tests/test_comprehensive.py +146 -146
  154. tests/test_config_consistency.py +80 -80
  155. tests/test_config_merge.py +152 -152
  156. tests/test_config_validator.py +182 -182
  157. tests/test_crawlo_proxy_integration.py +108 -108
  158. tests/test_date_tools.py +123 -123
  159. tests/test_default_header_middleware.py +158 -158
  160. tests/test_distributed.py +65 -65
  161. tests/test_double_crawlo_fix.py +207 -207
  162. tests/test_double_crawlo_fix_simple.py +124 -124
  163. tests/test_download_delay_middleware.py +221 -221
  164. tests/test_downloader_proxy_compatibility.py +268 -268
  165. tests/test_dynamic_downloaders_proxy.py +124 -124
  166. tests/test_dynamic_proxy.py +92 -92
  167. tests/test_dynamic_proxy_config.py +146 -146
  168. tests/test_dynamic_proxy_real.py +109 -109
  169. tests/test_edge_cases.py +303 -303
  170. tests/test_enhanced_error_handler.py +270 -270
  171. tests/test_env_config.py +121 -121
  172. tests/test_error_handler_compatibility.py +112 -112
  173. tests/test_final_validation.py +153 -153
  174. tests/test_framework_env_usage.py +103 -103
  175. tests/test_integration.py +169 -169
  176. tests/test_item_dedup_redis_key.py +122 -122
  177. tests/test_mode_consistency.py +51 -51
  178. tests/test_offsite_middleware.py +221 -221
  179. tests/test_parsel.py +29 -29
  180. tests/test_performance.py +327 -327
  181. tests/test_proxy_api.py +264 -264
  182. tests/test_proxy_health_check.py +32 -32
  183. tests/test_proxy_middleware.py +121 -121
  184. tests/test_proxy_middleware_enhanced.py +216 -216
  185. tests/test_proxy_middleware_integration.py +136 -136
  186. tests/test_proxy_middleware_refactored.py +184 -184
  187. tests/test_proxy_providers.py +56 -56
  188. tests/test_proxy_stats.py +19 -19
  189. tests/test_proxy_strategies.py +59 -59
  190. tests/test_queue_manager_double_crawlo.py +173 -173
  191. tests/test_queue_manager_redis_key.py +176 -176
  192. tests/test_random_user_agent.py +72 -72
  193. tests/test_real_scenario_proxy.py +195 -195
  194. tests/test_redis_config.py +28 -28
  195. tests/test_redis_connection_pool.py +294 -294
  196. tests/test_redis_key_naming.py +181 -181
  197. tests/test_redis_key_validator.py +123 -123
  198. tests/test_redis_queue.py +224 -224
  199. tests/test_request_ignore_middleware.py +182 -182
  200. tests/test_request_params.py +111 -111
  201. tests/test_request_serialization.py +70 -70
  202. tests/test_response_code_middleware.py +349 -349
  203. tests/test_response_filter_middleware.py +427 -427
  204. tests/test_response_improvements.py +152 -152
  205. tests/test_retry_middleware.py +241 -241
  206. tests/test_scheduler.py +252 -252
  207. tests/test_scheduler_config_update.py +133 -133
  208. tests/test_simple_response.py +61 -61
  209. tests/test_telecom_spider_redis_key.py +205 -205
  210. tests/test_template_content.py +87 -87
  211. tests/test_template_redis_key.py +134 -134
  212. tests/test_tools.py +159 -159
  213. tests/test_user_agents.py +96 -96
  214. tests/tools_example.py +260 -260
  215. tests/verify_distributed.py +117 -117
  216. crawlo-1.3.2.dist-info/RECORD +0 -219
  217. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
  218. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
  219. {crawlo-1.3.2.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,219 @@
1
+ crawlo/__init__.py,sha256=2H7llH-yRV3N5_DomJ02JYsd5wNJdUNZI3VowiTQvOc,1444
2
+ crawlo/__version__.py,sha256=9ap_Mho2n-5Wj2kAxLL8yqq57pG-v6Z_7an7VCKga44,23
3
+ crawlo/cli.py,sha256=AQnAB5NMI-Ic1VPw_Jjng8L4AI4-wMozOwzE6CfXkZU,2402
4
+ crawlo/config.py,sha256=EQIT7WpkXAlr2ocd5SYJYOKTSWUlQx2AkTHX7ErEWxw,9798
5
+ crawlo/config_validator.py,sha256=oY4-2bwXUlwHAnGgkI-EznviDfML_dcxbWSGXNSxC2k,11516
6
+ crawlo/crawler.py,sha256=rixy3qIy7e0vg1Ns4u1NC3S3Lbi-Mbqe_edPkXgV0yc,43600
7
+ crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
8
+ crawlo/exceptions.py,sha256=YVIDnC1bKSMv3fXH_6tinWMuD9HmKHIaUfO4_fkX5sY,1247
9
+ crawlo/mode_manager.py,sha256=h6ZWOK9U9WZXCLk1MXwBkpzMOw6l5royxfrst4vCtJg,6573
10
+ crawlo/project.py,sha256=Qw_Z8-ppYdv-aynBlzxH9CqABbrMIyGjtfLsM0werqc,12550
11
+ crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
12
+ crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
13
+ crawlo/task_manager.py,sha256=PScfEB03306Txa0l38AeQ_0WVhKzeWOFyT3bnrkbHW0,849
14
+ crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
15
+ crawlo/commands/check.py,sha256=TKDhI_sj7kErgiJpt2vCZ9QL-g6yWjrrPWKbgh8pgEU,23199
16
+ crawlo/commands/genspider.py,sha256=7YGZdv12G341SWmkGbyDeMde2RgqGYxYXRExFy7KKNc,5088
17
+ crawlo/commands/help.py,sha256=8xPC0iNCg1rRBoK2bb6noAEANc1JwrdM35eF-j6yeZM,5111
18
+ crawlo/commands/list.py,sha256=trzcd3kG6DhkOqYZADcl3yR7M8iJBgRw5fE-g9e0gVM,5877
19
+ crawlo/commands/run.py,sha256=ybATvNXjXdr6GczW6gujkaTK05dhfk1tUTA0qXCO_rg,12360
20
+ crawlo/commands/startproject.py,sha256=-Bo8vvDfIhqzGmWyhxMatBlPLhYpRwJC7l4fpbN8vVk,16506
21
+ crawlo/commands/stats.py,sha256=vlGJLyiXZtY0ASdzCK59JNereSsAel4W9JCGaOzCr-8,6201
22
+ crawlo/commands/utils.py,sha256=YVNEEzlm_qNY3SVvU8h6o2lQMkVgypvoB4ZFrP4gln0,5578
23
+ crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
24
+ crawlo/core/engine.py,sha256=d6L4Xwwjc1UQJY9QutqC_Uk88ZzBCyN9T_7z3lMSuIQ,14861
25
+ crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
26
+ crawlo/core/scheduler.py,sha256=AaZID01ovAbFzf1Urp55SPyUd7AOtHetX6R_GEYMTfA,12849
27
+ crawlo/data/__init__.py,sha256=UPqgioMdu3imSUmpLWzVlpvoBnEfaPSAT-crCcWd7iw,121
28
+ crawlo/data/user_agents.py,sha256=zjjFkldQkqtrn45j0WZplaZLannPxZDeAU0JofxQcBc,9891
29
+ crawlo/downloader/__init__.py,sha256=VZG5HiSHOmimiH9okQN3MBwgXsCzxr2awflVz5UiboY,8897
30
+ crawlo/downloader/aiohttp_downloader.py,sha256=GaUgR5WwG7VvMDQnL9tdwnLgu2bt8btdMuecWKyP2Uk,9195
31
+ crawlo/downloader/cffi_downloader.py,sha256=QxoeocCE2DsQCnhZla6-BjhplaTZDWMbEJmNrghWSDA,10488
32
+ crawlo/downloader/httpx_downloader.py,sha256=MpgDeIdGqNsiSKLOEDBnr5Z0eUbhHnqVEmAuoIfJmFU,12296
33
+ crawlo/downloader/hybrid_downloader.py,sha256=dNnFeegRnyLaOxTWI6XrWKqqVPx80AZBZNgmrcKRVBM,8240
34
+ crawlo/downloader/playwright_downloader.py,sha256=L-TVzG7cYfuBlqW0XSZuz5C_r9fpJrmYNcoQ-cDEna4,16663
35
+ crawlo/downloader/selenium_downloader.py,sha256=P8GuhEw6OYVeN3oeksuBLpUJCELXiu0mAR23X6IIOAA,21508
36
+ crawlo/extension/__init__.py,sha256=jOdyLjtf-JqEKN67x2haIhtMhy_5bGSMbdFIdsERU7o,1633
37
+ crawlo/extension/health_check.py,sha256=stDpyP4gOzAdbBlPbSf0rge0QounAhF8CtrGq5fa_7c,5657
38
+ crawlo/extension/log_interval.py,sha256=2R3XVdM1grDN8wh9TTHRB_WmQypCr5YSGvESNDnS16s,2474
39
+ crawlo/extension/log_stats.py,sha256=6Hoq0ASU8evjT5AsUuc0b018-vkzeeO6CyJrU9ZabWk,2989
40
+ crawlo/extension/logging_extension.py,sha256=hAi3hUbrVMRcE7b0tqybSRgnDYrgZYIDTsF-wxmezI0,1940
41
+ crawlo/extension/memory_monitor.py,sha256=fClPchpCkVjcIiU0AJHCKDd7HEiz5B4KqNqKTRZ2hcU,4394
42
+ crawlo/extension/performance_profiler.py,sha256=BjWD3LOb4VwjQJQvQtWNg7GluEwFquI1CztNfgMzy3c,5032
43
+ crawlo/extension/request_recorder.py,sha256=KA_RmcfscDxP5wPdolO76yKfRj-1jmHhG3jkVGO1pbc,4181
44
+ crawlo/filters/__init__.py,sha256=lX-QOCDTiTRFoiK1qrZ5HABo7LgZfcxScx_lELYEvJk,4395
45
+ crawlo/filters/aioredis_filter.py,sha256=aB1GPCALikvPUWdoACaGsvmnkzseKXxpR7l3gh1glsY,8479
46
+ crawlo/filters/memory_filter.py,sha256=ZojFhZ6gE76aQBC-rfImxSkSMwQtiotenx0pIcQOaFg,9561
47
+ crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
48
+ crawlo/items/base.py,sha256=hwGJEdFWOdaZfalFX8umRkh_HUWLEbCjvq4j70fplMQ,598
49
+ crawlo/items/fields.py,sha256=l-DIwK6CCpdzNvf6ELz7Ckc7YCghZD9UCXA8vhNn2UE,1852
50
+ crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
51
+ crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
52
+ crawlo/middleware/default_header.py,sha256=Pw-ev8ffi16GeCh84R5L3hAZgp3G1QXS-H5kV3JEp4Q,5164
53
+ crawlo/middleware/download_delay.py,sha256=2iWnJFtWDlqDy5MsAob8TPiJQoiz9v21yatkBI0eptg,3542
54
+ crawlo/middleware/middleware_manager.py,sha256=69l0QS6HJA2TmhdEHgyXMMhJ1nZlVUjODUFo3xhSth4,6413
55
+ crawlo/middleware/offsite.py,sha256=4tUkPqXMMXsi1WwYnJ_e7wMd6sRgK19QHRCYq8-w8jk,4682
56
+ crawlo/middleware/proxy.py,sha256=uKk5OSLIs7jv9bBgkZwsi1rIpthooxhMrGBC2BPRDCc,16022
57
+ crawlo/middleware/request_ignore.py,sha256=7qdX4zAimjSGwdod_aWUbOTfzLBWZ5KzLVFchGMCxCI,2663
58
+ crawlo/middleware/response_code.py,sha256=0_NbiCzLgJmSuSSF2_jqpypWYy0ES4GV-0iWQPLfYLc,5097
59
+ crawlo/middleware/response_filter.py,sha256=tVGr06bfJBR3xAHI2G5c3WimFsGHu8qoJtDcsVuCATU,4384
60
+ crawlo/middleware/retry.py,sha256=Acfo95B9wF8fQTCQIqluZOS2hHdnknQu_FOHvpGKJp0,4248
61
+ crawlo/middleware/simple_proxy.py,sha256=rQ4RkqewGvDRCw021nGrg8ngkBzg3wqrEVqvSmBgQ6M,2256
62
+ crawlo/network/__init__.py,sha256=bvEnpEUBZJ79URfNZbsHhsBKna54hM2-x_BV8eotTA4,418
63
+ crawlo/network/request.py,sha256=e6-YLgK7SU8D19n21mQwqt_b_aeRVJFOgWPIBPal2ys,14178
64
+ crawlo/network/response.py,sha256=QwJhL3xJfPVy_gwtGrg61oAgaqCoCmjyj1Ug7Zju7Pg,13060
65
+ crawlo/pipelines/__init__.py,sha256=FDe2Pr5tiHtV8hFlheElRO_O1aVKvSWlkTcAl9BXAKA,637
66
+ crawlo/pipelines/bloom_dedup_pipeline.py,sha256=NoqU0pCS8clRvdmR-7EsJEzBGn_RJvI5-Fz-iVpO5mc,5817
67
+ crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
68
+ crawlo/pipelines/csv_pipeline.py,sha256=qbXZoqq4FIR9QkUGpC0ryWzmqGJSrM2bxmWLM4I1nXM,12490
69
+ crawlo/pipelines/database_dedup_pipeline.py,sha256=L9lc6k62kUzwcDPgUJ0wT3KHhnC_lls_L5XMb08i_H8,8200
70
+ crawlo/pipelines/json_pipeline.py,sha256=wrCsh8YInmcPLAkhPrHObMx89VZfhf-c7qRrYsTixPE,8585
71
+ crawlo/pipelines/memory_dedup_pipeline.py,sha256=Wf_M7-FFmqXvcr3_Rpz97q0KcKebx8Ii2iRHv2A3orc,3952
72
+ crawlo/pipelines/mongo_pipeline.py,sha256=PohTKTGw3QRvuP-T6SrquwW3FAHSno8jQ2D2cH_d75U,5837
73
+ crawlo/pipelines/mysql_pipeline.py,sha256=fESKJ6qBcW3NZ9Gz1ACASL-PILvYAW3YagIZMg7H1h0,13818
74
+ crawlo/pipelines/pipeline_manager.py,sha256=wr79XeVDH-v7meSwB10W1qU3AZDh9IopxGWg5rWFerw,3154
75
+ crawlo/pipelines/redis_dedup_pipeline.py,sha256=kexmobW_JNSkaVRTQ4uhsKW5hGTXeqjGjOFmOP_wflw,6508
76
+ crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ crawlo/queue/pqueue.py,sha256=qTFOuvEXsYEZbm0ULjsOeZo0XtSsZ-SHpx7nFEtmluE,1095
78
+ crawlo/queue/queue_manager.py,sha256=xjodzF8Yjb1wJ3ut_Mu3eRFrqeCMo5O0RXW5tdw9o1M,15320
79
+ crawlo/queue/redis_priority_queue.py,sha256=J30HcC16E3FjxfUCbL_9HbFoaszBy7prFvG8xRvWd3M,13432
80
+ crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
81
+ crawlo/settings/default_settings.py,sha256=Wu1iUdhdgsFihCSuJJiXYE8DAHeV0HNuR663Hqsmg0U,9436
82
+ crawlo/settings/setting_manager.py,sha256=LTs4NQ_CXvfhnDmmdKGlvosIjHtZk_48v7BEa_O0ghQ,7710
83
+ crawlo/spider/__init__.py,sha256=I2_eb6NtgTQ-dckhQXZyDFQORUTx1OHqcn-9yleumkg,21074
84
+ crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
85
+ crawlo/templates/run.py.tmpl,sha256=vYCRPWpG2LxK3UvTxIyCDJh7qy43eoaU1CrJgBF-I6Y,1071
86
+ crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
87
+ crawlo/templates/project/items.py.tmpl,sha256=8_3DBA8HrS2XbfHzsMZNJiZbFY6fDJUUMFoFti_obJk,314
88
+ crawlo/templates/project/middlewares.py.tmpl,sha256=fxHqi-Sjec5GiHJciprOU-6SAUTzM728NlZckIqf9hM,4278
89
+ crawlo/templates/project/pipelines.py.tmpl,sha256=j9oqEhCezmmHlBhMWgYtlgup4jhWnMlv6AEiAOHODkg,2704
90
+ crawlo/templates/project/settings.py.tmpl,sha256=5J__P6nTNr-3-ziSPdbBe-aklP4G6OFglI7UgEEF1zE,10562
91
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=ci_vMOTwVwUeoqKGTa5tA6ygeruuKGt677liMYcp4Tw,7181
92
+ crawlo/templates/project/settings_gentle.py.tmpl,sha256=CJMn4gvTg8xNUJLgObh8OmBCyGGyeCcLrfEdBAw8I4I,2770
93
+ crawlo/templates/project/settings_high_performance.py.tmpl,sha256=rqPqIeChbfjEvQmMjAbuRml7pMhxTv2WBIkn21CZ6ew,5134
94
+ crawlo/templates/project/settings_minimal.py.tmpl,sha256=TIKOnEbE2A6P52oe_aVtVYpR6zN-4-ECwl-zjNSFGV4,1241
95
+ crawlo/templates/project/settings_simple.py.tmpl,sha256=8a0c1KTtnA4JzszhifIzG8wf9P7yEMjMhCMjoO0Qz2s,4500
96
+ crawlo/templates/project/spiders/__init__.py.tmpl,sha256=zMbePipgLsctQUEnda4WkHz8rDLUX--rc8ruI6zkpWc,111
97
+ crawlo/templates/spider/spider.py.tmpl,sha256=KvU-9YpN6MifDE7XzejjyyQS7RUjLDLZ8zqJcLwSsu0,5198
98
+ crawlo/tools/__init__.py,sha256=tOYfYPvZlrO8cmvnMWBjTma6UTLTFZN3qdC8pJwHrzI,4142
99
+ crawlo/tools/anti_crawler.py,sha256=LwLC6BkxDSkxc5H1hQ6kY9j7O0PZGAMPZECr7gbqw2M,9431
100
+ crawlo/tools/authenticated_proxy.py,sha256=ULCK0Cc9F2rGhRqu6kzKBdxzK9v2n1CsatSQ_PMxpAg,7272
101
+ crawlo/tools/data_formatter.py,sha256=iBDHpZBZvn9O7pLkTQilE1TzYJQEc3z3f6HXoVus0f0,7808
102
+ crawlo/tools/data_validator.py,sha256=bLWnkpFdclJuqjtSAgMI5nznN4vAuPwE1YaiFWKWenM,5490
103
+ crawlo/tools/date_tools.py,sha256=QOT3W5MqcEQhVM3cTZYxu1MRfgX-TI4aF1RI9s0QbdE,9195
104
+ crawlo/tools/distributed_coordinator.py,sha256=0Ej8hv5GA0UmUI7EXNpCNdgh-D-DC7Eapm_3O2POV0U,12711
105
+ crawlo/tools/encoding_converter.py,sha256=CqHAsR2rwxuzsyR-TeQNb79HX5mH4KAUixEY-sX7204,4170
106
+ crawlo/tools/request_tools.py,sha256=oXrk4yWMACVa65fDQCQgzsg6a94FH4_lS7qNR53FHYU,2420
107
+ crawlo/tools/retry_mechanism.py,sha256=4AQ_HLuYt4hYMI9XHoKFk2GQKEiDJB5pAnsMCfjc6Bk,7777
108
+ crawlo/tools/scenario_adapter.py,sha256=pzysL1B2uQ1ZSEncVHd9Hv2viHNgaxP44YAUcDcppfw,9660
109
+ crawlo/tools/text_cleaner.py,sha256=UrMGcgRnJaufjmDKIDsRYKMA8znCAArHDgouttWPygk,6690
110
+ crawlo/utils/__init__.py,sha256=8kMbOZf9bzOUjtvh2QvqXZmiZh3pYzxXH9YQhYcwcoY,597
111
+ crawlo/utils/batch_processor.py,sha256=8LNy-K2SrQVUxmGEWxQyYw_j9M-erN4Ie7O4d3zpBvM,9142
112
+ crawlo/utils/controlled_spider_mixin.py,sha256=8CuM3Cr2wQLHbaO_ohbCsPcImJnyfZHpERbSeMgQ-AQ,16936
113
+ crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,10902
114
+ crawlo/utils/enhanced_error_handler.py,sha256=fJC__rnYNKTNUHNbgjZtT846HoE31qyGbPft9bwyYLU,14214
115
+ crawlo/utils/env_config.py,sha256=W-VD_WF63DHxsyJysvp1eJwRh3L_pBRl_PitQAY3nQY,4079
116
+ crawlo/utils/error_handler.py,sha256=N6suB8Utcn7tp6WRJ8gKECr0RIAG86dcOXdwOr998OE,4367
117
+ crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
118
+ crawlo/utils/large_scale_config.py,sha256=lsraHTAQx3sMPjTnCBY_SVIpkuIBUb3zD9eFvmccOOM,8440
119
+ crawlo/utils/large_scale_helper.py,sha256=4ORkZcIrwJ0SlKOUh7l7WIuERORuRhNBgHCM71Rz0n0,12452
120
+ crawlo/utils/log.py,sha256=05-OrQW-qNsAJHoVytICss6oHVSIHVCMnfivg2qI6tQ,7829
121
+ crawlo/utils/performance_monitor.py,sha256=32KspSo7RWvCX_fl0ZFn4ScWWOqbVVwEhPRd921Ez6I,9832
122
+ crawlo/utils/queue_helper.py,sha256=gFmkh1jKlIcN1rmo2Jl6vYcLP5ByUWlfHO9eNlZPBLs,4918
123
+ crawlo/utils/redis_connection_pool.py,sha256=Czm0RoYmgJ5E5xIuVbBnm0IgSWH2AfeaFTGnYVWneYk,11401
124
+ crawlo/utils/redis_key_validator.py,sha256=-UTTx0Ul184pzwSply8hVdH0lp-gkXXOc_gEHR_7VlU,5809
125
+ crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
126
+ crawlo/utils/request_serializer.py,sha256=KIQBbQWCb5Ne3jFPMtqD96TNs5dTD85Ex3xr16vBrUM,8739
127
+ crawlo/utils/spider_loader.py,sha256=xNzQb7qhQ7TqZsfFtCLpuVcsGi-USriZosU0YSBr9II,2233
128
+ crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
129
+ crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
130
+ crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
131
+ examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
132
+ tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md,sha256=4W6HlT9Uc3cyu77T9pfbkrMxpAZ-xq_L9MU-GbukLV0,3427
133
+ tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
134
+ tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
135
+ tests/authenticated_proxy_example.py,sha256=GhmbISta3lDnNRl6Cr1P2ZEy2jXasz942cOeXiSbVXk,2971
136
+ tests/cleaners_example.py,sha256=blVqSJ7SeWUNd17JjHZJgVTzWH65XKevLyaMB_Wg8qA,5324
137
+ tests/config_validation_demo.py,sha256=jbZ7h-HGsJmuqBb1euB_AhmKjllkvPmItRF1K0MQrVM,4171
138
+ tests/controlled_spider_example.py,sha256=2SAQKoREGHe-OzVaSkGpopCcrou6QXmeW7rLdmsyopw,7981
139
+ tests/date_tools_example.py,sha256=XI3iFEzeo7Nb5YepK8WHytIaBegtxWVSISpqQgpV6M8,5042
140
+ tests/debug_pipelines.py,sha256=FMb36bH9lQxBLb-nM579hBRK1S16Vxu1t_BC3Dj8O2w,2164
141
+ tests/dynamic_loading_example.py,sha256=7LdeQZFevrb-U1_dgr4oX3aYo2Da4HvE_0KIf1fw4Ew,18786
142
+ tests/dynamic_loading_test.py,sha256=dzDW7b66HeDsIYsYgvNRihE3V6b6gEbUGQpp-eJbcIM,3413
143
+ tests/env_config_example.py,sha256=_ZRDh_LR23ZKpy9E--y_KM0QIOiZF5vRT98QTn52TY8,4951
144
+ tests/error_handling_example.py,sha256=grTeo1X17rFz4lhgASb0g5yu4NWbmNz5neyuonnNR40,5294
145
+ tests/redis_key_validation_demo.py,sha256=WD2jvuBwHhLYIb3lVFtvYSSnmXWn1EW4EPCEwFhfi6M,4467
146
+ tests/request_params_example.py,sha256=J50NdsnK1sDrqG-5m3oA-mu1_wHwVwHIfsWxGeQpz7o,4250
147
+ tests/response_improvements_example.py,sha256=t1cbG3nesp82bqog4_ku1GvQzNbhRyWa5EaKTmOPrSk,5402
148
+ tests/test_advanced_tools.py,sha256=HT_TcwfFzli-CavIJSqQqnCxnBn5FDMX09zL7AJ5tNY,5398
149
+ tests/test_all_redis_key_configs.py,sha256=dWc4Dsr07_vuSpb4hwkMpyy6XO8SI7vglVjGuGvXoa4,5710
150
+ tests/test_authenticated_proxy.py,sha256=lnvmQwuf0zaZP_E05EzcNFR2VJbwTkLjOmZGNoJKaC4,4339
151
+ tests/test_cleaners.py,sha256=HDK8_YU7GUj_3hGU415cxEeUR74mnDSk0yroLlgDI0I,1816
152
+ tests/test_comprehensive.py,sha256=dvRJeeVYc1cgXK9Y171hH9Y847zZpWSAFFH-EI3UepQ,5182
153
+ tests/test_config_consistency.py,sha256=RgSxyaypMpysltsGSh1vFMeOShiZZG0rmUKzEhNLpYw,2001
154
+ tests/test_config_merge.py,sha256=ts1j-TIKkFS0EO5q1I4O7f4YUKR5MLTmRSqOpOlv094,5606
155
+ tests/test_config_validator.py,sha256=Z4gBHkI0_fEx-xgiiG4T33F4BAuePuF81obpNTXfseY,6202
156
+ tests/test_crawlo_proxy_integration.py,sha256=miag_gufreZodFBwYGtyZNN-wfGyxUztg4w5-HEiBCY,2728
157
+ tests/test_date_tools.py,sha256=pcLDyhLrZ_jh-PhPm4CvLZEgNeH9kLMPKN5zacHwuWM,4053
158
+ tests/test_default_header_middleware.py,sha256=v-ei_1EY7cvFSsySrQPXF5-DmyGsq2yzjYhhrwFMOXs,6003
159
+ tests/test_distributed.py,sha256=78Pn4HPLIaO8t1IiaSkckBmuEVTcnC8IDw7znf9_Zcw,1790
160
+ tests/test_double_crawlo_fix.py,sha256=uT-PJLxGS4psOvVkJhurffV19hxhhlX5zHMPEyi59og,7977
161
+ tests/test_double_crawlo_fix_simple.py,sha256=NDmCEeyvpf_D1tGQMA66iLPPKlAnSZcEg71e7GHYcjg,4768
162
+ tests/test_download_delay_middleware.py,sha256=Idc6KzhL3hY3aDKgn1j_v5-mLIHz7dTnV5c4tJVZh5Q,9107
163
+ tests/test_downloader_proxy_compatibility.py,sha256=0hgIzWXIqd92YXEB5sNneyp4Sk7PaG76up2cd6N9QQY,8903
164
+ tests/test_dynamic_downloaders_proxy.py,sha256=t_aWpxOHi4h3_fg2ImtIq7IIJ0r3PTHtnXiopPe2ZlM,4450
165
+ tests/test_dynamic_proxy.py,sha256=zi7Ocbhc9GL1zCs0XhmG2NvBBeIZ2d2hPJVh18lH4Y0,3172
166
+ tests/test_dynamic_proxy_config.py,sha256=C_9CEjCJtrr0SxIXCyLDhSIi88ujF7UAT1F-FAphd0w,5853
167
+ tests/test_dynamic_proxy_real.py,sha256=krWnbFIH26mWNPhOfPMmx3ZxJfOreZxMZFGwVb_8-K8,3511
168
+ tests/test_edge_cases.py,sha256=1RnFaCebYTDNNz_LK8M0MepiSwPvJUk_FBK4nQTCUbg,10729
169
+ tests/test_enhanced_error_handler.py,sha256=Ku_86jv7iDe25v8ZxalcXxJJjIiIvQXWH8ZldbwdVm8,8581
170
+ tests/test_env_config.py,sha256=Qu1sDeADs69dSr1x0QmEe8nJrMHneE_4JClt-N901e8,4867
171
+ tests/test_error_handler_compatibility.py,sha256=xJ43cmCwfBGh-qBwCGiMDPPlfNDLw4ZrmlrHN9IojkU,4241
172
+ tests/test_final_validation.py,sha256=OuZI01O0E68Pao--bD-BFDTRZFPc_Mt4W-OXUzlt6ZA,4966
173
+ tests/test_framework_env_usage.py,sha256=bFb_ptdLeX2obdJWEqEHPWweiWR-wR2BpvEaJMQK7h4,4201
174
+ tests/test_integration.py,sha256=lVEzKNAjFzFZHRNZAyJmXxa_5Ogf_qqL4APqs620o58,4839
175
+ tests/test_item_dedup_redis_key.py,sha256=dp_H59exJLaZHh5oMtmMEOWh-DNZwbnwIFYDjOpHgd0,3842
176
+ tests/test_mode_consistency.py,sha256=hS9JwawnBvNwSu1l3DfArlGQGWPyVYXGuXCQtMuDHKs,1226
177
+ tests/test_offsite_middleware.py,sha256=1DYktO_D-hiLEB6dBnc0iOvnWimqOdE6kimnS8aof_s,7764
178
+ tests/test_parsel.py,sha256=wuZqRFIm9xx1tt6o3Xi_OjvwhT_MPmHiUEj2ax06zlo,701
179
+ tests/test_performance.py,sha256=Lqs2iu3dmWipZkBPARcwIjDLXsqe42ntz1M4RzqqXKo,11457
180
+ tests/test_proxy_api.py,sha256=XnmklS-xU4ke_560gV6AIlBsRmG8YLQTGFAZrTUZuhc,11013
181
+ tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
182
+ tests/test_proxy_middleware.py,sha256=EdQAfwwAJIBxw9JmUFTDEu_pdxapaTlcJr7KcrY6-AY,4021
183
+ tests/test_proxy_middleware_enhanced.py,sha256=QR-p26F63N7MxNjZ2QJUeerh_xdnCDejkrGPIh7Fh4U,7035
184
+ tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX61493Ew78WfTp-bYQ,4441
185
+ tests/test_proxy_middleware_refactored.py,sha256=VbkTWkmmomcyswobA_gf3p_bERl_eexY2e6ohJQS_A8,6960
186
+ tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
187
+ tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
188
+ tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
189
+ tests/test_queue_manager_double_crawlo.py,sha256=MijZ3JuyHMuqGbRC-8kclFr-4O7m_T8CqezP4qiWk-E,6957
190
+ tests/test_queue_manager_redis_key.py,sha256=B6JqScm_8FcriPb0UtBDLoEmLCGCI2z3NuqC0UMJsWo,6407
191
+ tests/test_random_user_agent.py,sha256=6HjU4iUcMk-J6bR2N5FhIkWDfnaFKAPNVyRzxmQQ14k,2302
192
+ tests/test_real_scenario_proxy.py,sha256=clmLvBfap5OpsaCE08MAWap-78jhVrxYfVfDNyoa4Hg,8454
193
+ tests/test_redis_config.py,sha256=51_Fy1PqIhS0MMO2nR4q6oQjBFxfqcUPK_4NNf5s83g,903
194
+ tests/test_redis_connection_pool.py,sha256=pKfXdE3Cm_L_fNqI9zqFmqiidCwR0t7hiM_Fu_V1cNI,9328
195
+ tests/test_redis_key_naming.py,sha256=MTFk656JhiGVTsMctBDhBNOMFcBDZrsQA3UfPZ-Dgj4,6911
196
+ tests/test_redis_key_validator.py,sha256=GszSzGADgk3uN6Bye1d8pS-AtMVgB8jwqW-22gPNM6M,4418
197
+ tests/test_redis_queue.py,sha256=WQV3MtGg8rJzHgC2kRfXM6lSMXpwXJVQZfqn2dVrhg0,6758
198
+ tests/test_request_ignore_middleware.py,sha256=QN81wgG_W_XfXCF9LvJNxCNwbOH6_tZnLIwLDTK2K5Q,6229
199
+ tests/test_request_params.py,sha256=l2etiDebqylPBym1e9DSDn4wxwTHv8DQHKq9AzlzlG0,4287
200
+ tests/test_request_serialization.py,sha256=Ikgec8tt_sPCK6jcZyK8vRw84zRNE6nxQy9rba1WKmE,2332
201
+ tests/test_response_code_middleware.py,sha256=wSe525bm-bk_iWMjPDzUu1LfOQrwJY8_MLKAspq2dzk,12193
202
+ tests/test_response_filter_middleware.py,sha256=YWrGzJ7wmftTjJXcNTtJl3b3EdJsO4oR22ZLWwgErhg,16327
203
+ tests/test_response_improvements.py,sha256=vNqHKyoEoYeEGAUiRzdsff2V6yvJ9QnDwGg7gmN38Ow,6028
204
+ tests/test_retry_middleware.py,sha256=RmSYSf0GagGPGAVi5TXJWc0bZlmAI_hwFr2FYhvuKrk,8097
205
+ tests/test_scheduler.py,sha256=1fCu35QgK5gzgrhD0aUZj5lxL0QbokzPav-yEJxz9Ig,8182
206
+ tests/test_scheduler_config_update.py,sha256=LuxjEbt20QrPyVkjSFxvTnFtUxwMaHB6TcqjFyo8bow,4261
207
+ tests/test_simple_response.py,sha256=_ui2PuVZvJcAuLY7HZ8xcsy_tDBimgBqX0ukj3kE5J0,1549
208
+ tests/test_telecom_spider_redis_key.py,sha256=c-gfixPul2VlYMQJGf0H5ZgYJ461fQgSKbCPrbAU45M,7625
209
+ tests/test_template_content.py,sha256=2RgCdOA3pMUSOqC_JbTGeW7KonbTqJ0ySYJNWegU-v0,2903
210
+ tests/test_template_redis_key.py,sha256=99-s0_-8MFJbIvGG_X__sH0qkXWTtJv8fdTdlftsq4I,4876
211
+ tests/test_tools.py,sha256=z50Bvq_q8FwpyxNkmh00_A3sXkSv2l1Q_EbK02FDYgk,5504
212
+ tests/test_user_agents.py,sha256=e4haX-o8Janl-PawGJ9MemZyMqTX33_tBF_WnYSVoUw,3327
213
+ tests/tools_example.py,sha256=Rxu5vVKnj3CZ3mCx-EEotBWPtZs2S7ktyqq-SYeclxU,7999
214
+ tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4045
215
+ crawlo-1.3.3.dist-info/METADATA,sha256=I5cs60iVGIq6oms6duJsGrlzs-WGdlVtUYbMkzYBXvE,27833
216
+ crawlo-1.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
217
+ crawlo-1.3.3.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
218
+ crawlo-1.3.3.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
219
+ crawlo-1.3.3.dist-info/RECORD,,
examples/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- # @Time : 2025-02-05 12:36
5
- # @Author : oscar
6
- # @Desc : None
7
- """
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ # @Time : 2025-02-05 12:36
5
+ # @Author : oscar
6
+ # @Desc : None
7
+ """
@@ -1,82 +1,82 @@
1
- # 双重 crawlo 前缀问题修复报告
2
-
3
- ## 问题描述
4
- 用户在使用分布式爬虫时发现Redis key中出现了双重`crawlo`前缀,例如`crawlo:crawlo:queue:processing:data`。这导致了Redis key命名不一致和潜在的混淆问题。
5
-
6
- ## 问题分析
7
- 经过代码分析,发现问题出在以下两个方面:
8
- 1. RedisPriorityQueue类在处理队列名称时会自动修改用户提供的队列名称
9
- 2. QueueManager类在提取项目名称时没有正确处理双重`crawlo`前缀的情况
10
-
11
- ## 修复方案
12
-
13
- ### 1. RedisPriorityQueue类修复
14
- 文件:`crawlo/queue/redis_priority_queue.py`
15
-
16
- **修复前**:
17
- ```python
18
- # 如果提供了 queue_name,确保符合命名规范
19
- # 处理可能的重复前缀问题
20
- if queue_name.startswith("crawlo:crawlo:"):
21
- # 修复双重 crawlo 前缀
22
- self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
23
- elif not queue_name.startswith("crawlo:"):
24
- # 如果没有 crawlo 前缀,添加它
25
- self.queue_name = f"crawlo:{module_name}:queue:requests"
26
- else:
27
- # 已经有正确的 crawlo 前缀
28
- self.queue_name = queue_name
29
- ```
30
-
31
- **修复后**:
32
- ```python
33
- # 保持用户提供的队列名称不变,不做修改
34
- self.queue_name = queue_name
35
- ```
36
-
37
- ### 2. QueueManager类修复
38
- 文件:`crawlo/queue/queue_manager.py`
39
-
40
- **修复后**:
41
- ```python
42
- # 处理可能的双重 crawlo 前缀
43
- if parts[0] == "crawlo" and parts[1] == "crawlo":
44
- # 双重 crawlo 前缀,取第三个部分作为项目名称
45
- if len(parts) >= 3:
46
- project_name = parts[2]
47
- else:
48
- project_name = "default"
49
- elif parts[0] == "crawlo":
50
- # 正常的 crawlo 前缀,取第二个部分作为项目名称
51
- project_name = parts[1]
52
- else:
53
- # 没有 crawlo 前缀,使用第一个部分作为项目名称
54
- project_name = parts[0]
55
- ```
56
-
57
- ## 测试验证
58
-
59
- ### 测试1:Redis队列命名修复测试
60
- 验证RedisPriorityQueue正确处理各种队列名称格式:
61
- - 正常命名:`crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
62
- - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
63
- - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
64
-
65
- ### 测试2:队列管理器项目名称提取测试
66
- 验证QueueManager正确提取项目名称:
67
- - 正常命名:`crawlo:test_project:queue:requests` → `test_project`
68
- - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → [queue](file://d:\dowell\projects\Crawlo\crawlo\core\processor.py#L13-L13)
69
- - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo`
70
-
71
- ### 测试3:队列管理器创建队列测试
72
- 验证整个流程的正确性,确保队列名称在传递过程中保持一致。
73
-
74
- 所有测试均已通过,表明双重`crawlo`前缀问题已得到解决。
75
-
76
- ## 结论
77
- 通过以上修复,我们成功解决了Redis key中出现双重`crawlo`前缀的问题。现在Redis队列名称将保持用户配置的一致性,processing和failed队列也会相应地保持相同的前缀结构。
78
-
79
- ## 建议
80
- 1. 建议用户在项目配置中使用标准的队列名称格式,如`crawlo:{project_name}:queue:requests`
81
- 2. 可以使用Redis key验证工具定期检查和规范Redis key命名
1
+ # 双重 crawlo 前缀问题修复报告
2
+
3
+ ## 问题描述
4
+ 用户在使用分布式爬虫时发现Redis key中出现了双重`crawlo`前缀,例如`crawlo:crawlo:queue:processing:data`。这导致了Redis key命名不一致和潜在的混淆问题。
5
+
6
+ ## 问题分析
7
+ 经过代码分析,发现问题出在以下两个方面:
8
+ 1. RedisPriorityQueue类在处理队列名称时会自动修改用户提供的队列名称
9
+ 2. QueueManager类在提取项目名称时没有正确处理双重`crawlo`前缀的情况
10
+
11
+ ## 修复方案
12
+
13
+ ### 1. RedisPriorityQueue类修复
14
+ 文件:`crawlo/queue/redis_priority_queue.py`
15
+
16
+ **修复前**:
17
+ ```python
18
+ # 如果提供了 queue_name,确保符合命名规范
19
+ # 处理可能的重复前缀问题
20
+ if queue_name.startswith("crawlo:crawlo:"):
21
+ # 修复双重 crawlo 前缀
22
+ self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
23
+ elif not queue_name.startswith("crawlo:"):
24
+ # 如果没有 crawlo 前缀,添加它
25
+ self.queue_name = f"crawlo:{module_name}:queue:requests"
26
+ else:
27
+ # 已经有正确的 crawlo 前缀
28
+ self.queue_name = queue_name
29
+ ```
30
+
31
+ **修复后**:
32
+ ```python
33
+ # 保持用户提供的队列名称不变,不做修改
34
+ self.queue_name = queue_name
35
+ ```
36
+
37
+ ### 2. QueueManager类修复
38
+ 文件:`crawlo/queue/queue_manager.py`
39
+
40
+ **修复后**:
41
+ ```python
42
+ # 处理可能的双重 crawlo 前缀
43
+ if parts[0] == "crawlo" and parts[1] == "crawlo":
44
+ # 双重 crawlo 前缀,取第三个部分作为项目名称
45
+ if len(parts) >= 3:
46
+ project_name = parts[2]
47
+ else:
48
+ project_name = "default"
49
+ elif parts[0] == "crawlo":
50
+ # 正常的 crawlo 前缀,取第二个部分作为项目名称
51
+ project_name = parts[1]
52
+ else:
53
+ # 没有 crawlo 前缀,使用第一个部分作为项目名称
54
+ project_name = parts[0]
55
+ ```
56
+
57
+ ## 测试验证
58
+
59
+ ### 测试1:Redis队列命名修复测试
60
+ 验证RedisPriorityQueue正确处理各种队列名称格式:
61
+ - 正常命名:`crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
62
+ - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
63
+ - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
64
+
65
+ ### 测试2:队列管理器项目名称提取测试
66
+ 验证QueueManager正确提取项目名称:
67
+ - 正常命名:`crawlo:test_project:queue:requests` → `test_project`
68
+ - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `queue`
69
+ - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo`
70
+
71
+ ### 测试3:队列管理器创建队列测试
72
+ 验证整个流程的正确性,确保队列名称在传递过程中保持一致。
73
+
74
+ 所有测试均已通过,表明双重`crawlo`前缀问题已得到解决。
75
+
76
+ ## 结论
77
+ 通过以上修复,我们成功解决了Redis key中出现双重`crawlo`前缀的问题。现在Redis队列名称将保持用户配置的一致性,processing和failed队列也会相应地保持相同的前缀结构。
78
+
79
+ ## 建议
80
+ 1. 建议用户在项目配置中使用标准的队列名称格式,如`crawlo:{project_name}:queue:requests`
81
+ 2. 可以使用Redis key验证工具定期检查和规范Redis key命名
82
82
  3. 如果需要统一的命名规范,可以在项目初始化时明确指定队列名称
tests/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- # @Time : 2025-08-24 12:36
5
- # @Author : crawl-coder
6
- # @Desc : None
7
- """
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ # @Time : 2025-08-24 12:36
5
+ # @Author : crawl-coder
6
+ # @Desc : None
7
+ """