crawlo 1.1.8__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (191)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +65 -65
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +132 -132
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +292 -292
  14. crawlo/commands/startproject.py +418 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +252 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -345
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -136
  23. crawlo/crawler.py +1027 -1027
  24. crawlo/downloader/__init__.py +266 -266
  25. crawlo/downloader/aiohttp_downloader.py +220 -220
  26. crawlo/downloader/cffi_downloader.py +256 -256
  27. crawlo/downloader/httpx_downloader.py +259 -259
  28. crawlo/downloader/hybrid_downloader.py +213 -213
  29. crawlo/downloader/playwright_downloader.py +402 -402
  30. crawlo/downloader/selenium_downloader.py +472 -472
  31. crawlo/event.py +11 -11
  32. crawlo/exceptions.py +81 -81
  33. crawlo/extension/__init__.py +37 -37
  34. crawlo/extension/health_check.py +141 -141
  35. crawlo/extension/log_interval.py +57 -57
  36. crawlo/extension/log_stats.py +81 -81
  37. crawlo/extension/logging_extension.py +43 -43
  38. crawlo/extension/memory_monitor.py +104 -104
  39. crawlo/extension/performance_profiler.py +133 -133
  40. crawlo/extension/request_recorder.py +107 -107
  41. crawlo/filters/__init__.py +154 -154
  42. crawlo/filters/aioredis_filter.py +280 -280
  43. crawlo/filters/memory_filter.py +269 -269
  44. crawlo/items/__init__.py +23 -23
  45. crawlo/items/base.py +21 -21
  46. crawlo/items/fields.py +53 -53
  47. crawlo/items/items.py +104 -104
  48. crawlo/middleware/__init__.py +21 -21
  49. crawlo/middleware/default_header.py +32 -32
  50. crawlo/middleware/download_delay.py +28 -28
  51. crawlo/middleware/middleware_manager.py +135 -135
  52. crawlo/middleware/proxy.py +272 -272
  53. crawlo/middleware/request_ignore.py +30 -30
  54. crawlo/middleware/response_code.py +18 -18
  55. crawlo/middleware/response_filter.py +26 -26
  56. crawlo/middleware/retry.py +124 -124
  57. crawlo/mode_manager.py +211 -211
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +338 -338
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +224 -224
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +316 -316
  70. crawlo/pipelines/pipeline_manager.py +61 -61
  71. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  72. crawlo/project.py +187 -187
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +337 -334
  75. crawlo/queue/redis_priority_queue.py +298 -298
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +219 -219
  78. crawlo/settings/setting_manager.py +122 -122
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +130 -130
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +109 -109
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/run.py.tmpl +45 -45
  89. crawlo/templates/project/settings.py.tmpl +326 -326
  90. crawlo/templates/project/settings_distributed.py.tmpl +119 -119
  91. crawlo/templates/project/settings_gentle.py.tmpl +94 -94
  92. crawlo/templates/project/settings_high_performance.py.tmpl +151 -151
  93. crawlo/templates/project/settings_simple.py.tmpl +68 -68
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/spider/spider.py.tmpl +141 -141
  96. crawlo/tools/__init__.py +182 -182
  97. crawlo/tools/anti_crawler.py +268 -268
  98. crawlo/tools/authenticated_proxy.py +240 -240
  99. crawlo/tools/data_validator.py +180 -180
  100. crawlo/tools/date_tools.py +35 -35
  101. crawlo/tools/distributed_coordinator.py +386 -386
  102. crawlo/tools/retry_mechanism.py +220 -220
  103. crawlo/tools/scenario_adapter.py +262 -262
  104. crawlo/utils/__init__.py +35 -35
  105. crawlo/utils/batch_processor.py +260 -260
  106. crawlo/utils/controlled_spider_mixin.py +439 -439
  107. crawlo/utils/date_tools.py +290 -290
  108. crawlo/utils/db_helper.py +343 -343
  109. crawlo/utils/enhanced_error_handler.py +359 -359
  110. crawlo/utils/env_config.py +105 -105
  111. crawlo/utils/error_handler.py +125 -125
  112. crawlo/utils/func_tools.py +82 -82
  113. crawlo/utils/large_scale_config.py +286 -286
  114. crawlo/utils/large_scale_helper.py +343 -343
  115. crawlo/utils/log.py +128 -128
  116. crawlo/utils/performance_monitor.py +284 -284
  117. crawlo/utils/queue_helper.py +175 -175
  118. crawlo/utils/redis_connection_pool.py +334 -334
  119. crawlo/utils/redis_key_validator.py +199 -199
  120. crawlo/utils/request.py +267 -267
  121. crawlo/utils/request_serializer.py +219 -219
  122. crawlo/utils/spider_loader.py +62 -62
  123. crawlo/utils/system.py +11 -11
  124. crawlo/utils/tools.py +4 -4
  125. crawlo/utils/url.py +39 -39
  126. crawlo-1.2.0.dist-info/METADATA +697 -0
  127. crawlo-1.2.0.dist-info/RECORD +190 -0
  128. examples/__init__.py +7 -7
  129. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  130. tests/__init__.py +7 -7
  131. tests/advanced_tools_example.py +275 -275
  132. tests/authenticated_proxy_example.py +236 -236
  133. tests/cleaners_example.py +160 -160
  134. tests/config_validation_demo.py +102 -102
  135. tests/controlled_spider_example.py +205 -205
  136. tests/date_tools_example.py +180 -180
  137. tests/dynamic_loading_example.py +523 -523
  138. tests/dynamic_loading_test.py +104 -104
  139. tests/env_config_example.py +133 -133
  140. tests/error_handling_example.py +171 -171
  141. tests/redis_key_validation_demo.py +130 -130
  142. tests/response_improvements_example.py +144 -144
  143. tests/test_advanced_tools.py +148 -148
  144. tests/test_all_redis_key_configs.py +145 -145
  145. tests/test_authenticated_proxy.py +141 -141
  146. tests/test_cleaners.py +54 -54
  147. tests/test_comprehensive.py +146 -146
  148. tests/test_config_validator.py +193 -193
  149. tests/test_date_tools.py +123 -123
  150. tests/test_double_crawlo_fix.py +207 -207
  151. tests/test_double_crawlo_fix_simple.py +124 -124
  152. tests/test_dynamic_downloaders_proxy.py +124 -124
  153. tests/test_dynamic_proxy.py +92 -92
  154. tests/test_dynamic_proxy_config.py +146 -146
  155. tests/test_dynamic_proxy_real.py +109 -109
  156. tests/test_edge_cases.py +303 -303
  157. tests/test_enhanced_error_handler.py +270 -270
  158. tests/test_env_config.py +121 -121
  159. tests/test_error_handler_compatibility.py +112 -112
  160. tests/test_final_validation.py +153 -153
  161. tests/test_framework_env_usage.py +103 -103
  162. tests/test_integration.py +356 -356
  163. tests/test_item_dedup_redis_key.py +122 -122
  164. tests/test_parsel.py +29 -29
  165. tests/test_performance.py +327 -327
  166. tests/test_proxy_health_check.py +32 -32
  167. tests/test_proxy_middleware_integration.py +136 -136
  168. tests/test_proxy_providers.py +56 -56
  169. tests/test_proxy_stats.py +19 -19
  170. tests/test_proxy_strategies.py +59 -59
  171. tests/test_queue_manager_double_crawlo.py +174 -231
  172. tests/test_queue_manager_redis_key.py +176 -176
  173. tests/test_redis_config.py +28 -28
  174. tests/test_redis_connection_pool.py +294 -294
  175. tests/test_redis_key_naming.py +181 -181
  176. tests/test_redis_key_validator.py +123 -123
  177. tests/test_redis_queue.py +224 -224
  178. tests/test_request_serialization.py +70 -70
  179. tests/test_response_improvements.py +152 -152
  180. tests/test_scheduler.py +241 -241
  181. tests/test_simple_response.py +61 -61
  182. tests/test_telecom_spider_redis_key.py +205 -205
  183. tests/test_template_content.py +87 -87
  184. tests/test_template_redis_key.py +134 -134
  185. tests/test_tools.py +153 -153
  186. tests/tools_example.py +257 -257
  187. crawlo-1.1.8.dist-info/METADATA +0 -626
  188. crawlo-1.1.8.dist-info/RECORD +0 -190
  189. {crawlo-1.1.8.dist-info → crawlo-1.2.0.dist-info}/WHEEL +0 -0
  190. {crawlo-1.1.8.dist-info → crawlo-1.2.0.dist-info}/entry_points.txt +0 -0
  191. {crawlo-1.1.8.dist-info → crawlo-1.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,190 @@
1
+ crawlo/__init__.py,sha256=1tc6uUDF1yRNU7K_k-Dl6h9FGy7Jp8fdhRsXu9PctFI,1312
2
+ crawlo/__version__.py,sha256=MpAT5hgNoHnTtG1XRD_GV_A7QrHVU6vJjGSw_8qMGA4,22
3
+ crawlo/cli.py,sha256=pbd9wR6evB4aHWNrTRG4WW2ScCN5p4kz44eAuohSdR0,2029
4
+ crawlo/config.py,sha256=cQTDYn2VCdlIs3Jb8mGwF6IWqj85BMr6HQaGS3XjZ7g,9535
5
+ crawlo/config_validator.py,sha256=Q2j9rGW2lZiaA1ka5cJWaabPr1W0fwYHzY_gv-qpPyY,9903
6
+ crawlo/crawler.py,sha256=Mjv4WrNduSTXv3hJuSynwEu-MyUhgx2TOMJ913LxkmY,36371
7
+ crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
8
+ crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
9
+ crawlo/mode_manager.py,sha256=cLHwKW_iupFQ-uTgyl90SFhrYtbpVIdqevMXgPOJpWE,7517
10
+ crawlo/project.py,sha256=FDAwiucebeck5Ftuc7M8mFCRunAyUd49FtKcCAE7R4c,6644
11
+ crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
12
+ crawlo/subscriber.py,sha256=gioTIqRdEwVG-bwIiQonbk1vWWAqTh9hzVkrqZ1AfP0,5006
13
+ crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
14
+ crawlo/cleaners/__init__.py,sha256=DVD4W09aJF-ih8vYx3DPZaXIE18MTEM36uo1SnqLMLk,1201
15
+ crawlo/cleaners/data_formatter.py,sha256=NEj3NqPiyG67V6qDgn2KNj9VNHWOLNwX-7p_nad0znc,7583
16
+ crawlo/cleaners/encoding_converter.py,sha256=uWyZMHJxC2y5cBRg8LnrBAYV3X4ESrkmKVE2ho8s1FY,4113
17
+ crawlo/cleaners/text_cleaner.py,sha256=WDcQdvoyaY3tSxYa7EdS6BGvZNggoJbfyE662Y8TiL8,6475
18
+ crawlo/commands/__init__.py,sha256=QbhGAmItiwVrtlTr9UUbEJMegLJo-SdzaKX2PUhBgfI,378
19
+ crawlo/commands/check.py,sha256=172OiAxnX5wwSlszUsyPgMZwAoIbGDTdfhtRz309ilc,22843
20
+ crawlo/commands/genspider.py,sha256=-jGJdfXLsefX_H1ydQ2wirdu6p6wmhClzVXY_0L-1aE,5050
21
+ crawlo/commands/help.py,sha256=EaU2v4A5cFFaUUG_HTp6Hby1p36777oK_OogGnrzBfI,4879
22
+ crawlo/commands/list.py,sha256=yByqQeZBgvjewOKxpnOobpeJ7Hnbs-CWsoyITqZu2ZY,5781
23
+ crawlo/commands/run.py,sha256=LL6P0eGgwCgMWLkGsTWjXDu6Vh74N8vq4m7-PUiNKWc,10751
24
+ crawlo/commands/startproject.py,sha256=aBBR5dNb7R-yEMVMisoKiEA86HUFB_yHa0VgPftDXko,15354
25
+ crawlo/commands/stats.py,sha256=6pAgkEi8MBnCer2rWmKpaTYr1jaM6HeMG9owAvEzJyY,6064
26
+ crawlo/commands/utils.py,sha256=nohMvUU2zLvX0XzXk6KeCNxP0EvSWj9DiVLxM_7tD5o,5106
27
+ crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
28
+ crawlo/core/engine.py,sha256=c65vwIPrwDzFvec2f1QJ2_hikBjj-CYjTGkYrjnWxto,13724
29
+ crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
30
+ crawlo/core/scheduler.py,sha256=ONdbmcjGyUoOhnXsO2pmTGU6no1RxTEM5FDDcthGLGA,5219
31
+ crawlo/downloader/__init__.py,sha256=MlstaKfW-WLXNuZs7tb7cG_wG2sQLw2hdWmUjZEIH7c,8299
32
+ crawlo/downloader/aiohttp_downloader.py,sha256=n2qP3Q13lOnvwB7cb3YxNyNKYVHKqofNNg7j9tV9h-E,8400
33
+ crawlo/downloader/cffi_downloader.py,sha256=IpQUqvls4mEYs_UwPvtN2L4uUIujqn-rf03NuZZkMl0,10710
34
+ crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU7m_Jl9wc,12037
35
+ crawlo/downloader/hybrid_downloader.py,sha256=otiIQBJYmzDeY9dB4Qrlu9LSSdJOpZVVae-HnNKBXik,8043
36
+ crawlo/downloader/playwright_downloader.py,sha256=Lnc7k5cXhVnURXSxgZFCYMJkBxLg5F_OE67rtf3G7Ig,16261
37
+ crawlo/downloader/selenium_downloader.py,sha256=vykWifXoSSs61k8BB6PtcTDDBKp5uZ9-rYYFR4_pgSQ,21036
38
+ crawlo/extension/__init__.py,sha256=Sg588p6UhyrwFNTiD2wqGW-i3xgLX6HlLuQPKT7mayE,1526
39
+ crawlo/extension/health_check.py,sha256=IVaaVo_0CcZtf1LoCAYXIBvs3wZ7hdmT6U4-NYWAgP0,5527
40
+ crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
41
+ crawlo/extension/log_stats.py,sha256=Ssxz6R1YpWIj5WJvQ2cJ9F5oR7FUFdj-ITc9lV92SSU,2908
42
+ crawlo/extension/logging_extension.py,sha256=ET6VAu1J2qNMz4NnG1G3zQLRhbsvV7l6xRIuQLE6DaE,1626
43
+ crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
44
+ crawlo/extension/performance_profiler.py,sha256=EPiNuXuPPDU0Jtgy8arYHpr_8ASK13cCI2BytdJnu_I,4899
45
+ crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074
46
+ crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
47
+ crawlo/filters/aioredis_filter.py,sha256=E8BBvzGJfiLuwEZrlfI3O-jANudizT9xcFWrxxFHpgk,10028
48
+ crawlo/filters/memory_filter.py,sha256=VJO0UFRYGxmV8dj4G1subsQ-FtvPcGLbvd7IVtqXnOs,9260
49
+ crawlo/items/__init__.py,sha256=bqekZrRlDhxfWie0UbCs656TptYseoe9QJ67I4E7Elk,386
50
+ crawlo/items/base.py,sha256=tAYrPJgblp3ZEihDXvappdYc6pGdim6x2_9QSmMKI2o,577
51
+ crawlo/items/fields.py,sha256=wMlakQTsEwyrlLzMt1gI4pScLQZMqd3E1xcfH4dbSqk,1801
52
+ crawlo/items/items.py,sha256=e-3nXI9ckD64vcDxxQiAU6ufbtJMs09gbZQcYjxgwHY,3374
53
+ crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
54
+ crawlo/middleware/default_header.py,sha256=i_Uj07JObyeZFxL7ZAZmvZsHvA1HGtkNab1sA0d-nWI,1067
55
+ crawlo/middleware/download_delay.py,sha256=2M-TchDA7MwyTfYy0Hzh_bW9wlHlpiP-oQlys7crTj0,966
56
+ crawlo/middleware/middleware_manager.py,sha256=j1hkWRFB5rnC5SnB7oXWE5eUNv8blS9krDIDM5fIDs8,6213
57
+ crawlo/middleware/proxy.py,sha256=qLPA9_iualWBdZhIRJJLlSZVMbwOophLEEpzS8yBG5M,11089
58
+ crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
59
+ crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
60
+ crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
61
+ crawlo/middleware/retry.py,sha256=-7zpRURugiTTm4QYUSUlbnURD5mcT2Ji0yHvCgY1wGc,4124
62
+ crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,397
63
+ crawlo/network/request.py,sha256=63aBgH6H3ZZakBt6b_c-HUKE-c70AHxhggNOzmxjA2g,12295
64
+ crawlo/network/response.py,sha256=EZiG4LjuIb7PxdGou4H-oSOQhec1ZdBRTkO-5fl8JTo,12701
65
+ crawlo/pipelines/__init__.py,sha256=lrdVDjeHLNkA4_MAwI1auk_I9xfeU1SlBWXiammb6lc,616
66
+ crawlo/pipelines/bloom_dedup_pipeline.py,sha256=QQxGFGEoMHN4Vx2kq7G_i1o9pmuXp8clZebilOar3fk,5642
67
+ crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
68
+ crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
69
+ crawlo/pipelines/database_dedup_pipeline.py,sha256=wVBXEGArFR3uxoN7yfJSOarBmtGrJpOqowAqa7OUs98,8000
70
+ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
71
+ crawlo/pipelines/memory_dedup_pipeline.py,sha256=5jeL2jEq7sioYmXlzfkx-LNSbWyChrXeWx8d15YEZOA,3839
72
+ crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
73
+ crawlo/pipelines/mysql_pipeline.py,sha256=cwgJvRORTRea_Eep2coBaMf3G8PQVTQA1qrnIlDZApc,13480
74
+ crawlo/pipelines/pipeline_manager.py,sha256=vK87pAEmpGR24yl6Cr7ovCKag2oB5mruijfYT8nnG5o,2358
75
+ crawlo/pipelines/redis_dedup_pipeline.py,sha256=sgrBSVdxPWgh8HQxvGsazz1MSyBERJF5jd1yoeYo0lE,6166
76
+ crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
78
+ crawlo/queue/queue_manager.py,sha256=_oO5Taqj5WTSKtj9UuLrGZcWxMg2liqaa-kceXyF4GI,12874
79
+ crawlo/queue/redis_priority_queue.py,sha256=RbwKsVxzk31B1VRvyve4vHKe2DesL7K37IZAA31kdd0,12783
80
+ crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
81
+ crawlo/settings/default_settings.py,sha256=aTYgtWJUGmFF93SV4qeh3pmMpcTkkPd9bJkK486yTik,8781
82
+ crawlo/settings/setting_manager.py,sha256=0RYAk07qoJ5WTw_mvV4ECWGS2QNpCnGmBZVTHjqOVIg,3707
83
+ crawlo/spider/__init__.py,sha256=Z_rK23l5yt-DuwJPg8bcqodM_FIs4-iHLaKOimGumcE,20452
84
+ crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
85
+ crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
86
+ crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
87
+ crawlo/templates/project/middlewares.py.tmpl,sha256=Dyp_tyNgF9U4w1oMAvOAOfSepWsjXWOvH__qQ3BOkSQ,3731
88
+ crawlo/templates/project/pipelines.py.tmpl,sha256=GBHYU0Jx8sKDCdGJp44FMSH7u2slxoFg6a-R9Uwg_-I,2608
89
+ crawlo/templates/project/run.py.tmpl,sha256=KaLhqngKEzKtvDI2KCHe00yFRXnxMwbdq7YoXPR4kEg,1226
90
+ crawlo/templates/project/settings.py.tmpl,sha256=JpK0Zq9TdxwucizUliOSLNYs9KgnFwJuai8GJEGlt6o,11710
91
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=6xG7ehu86qAXscWOAFbrtXL5mzGNarCgYpbW5LXzExg,4202
92
+ crawlo/templates/project/settings_gentle.py.tmpl,sha256=IPE9H9WqRQWVdj9supFutt9gvypYQRVhRFOwZyYxem0,3226
93
+ crawlo/templates/project/settings_high_performance.py.tmpl,sha256=UNDx9f76kF2je8hE4ZE73wQcLcApjrEzpNLHWp9DvaI,5283
94
+ crawlo/templates/project/settings_simple.py.tmpl,sha256=qk0EnX4xncVBPVlFmqS1yDn56Cz6TBEQhgDdY8TABJ8,2297
95
+ crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
96
+ crawlo/templates/spider/spider.py.tmpl,sha256=CzZr8SNzkuEQGA8F11CfMxYvOpwnHOWCG-1MiLqN6D0,4896
97
+ crawlo/tools/__init__.py,sha256=5H6rAhjfNSqRMjjlLDVq-vEJWRFyCO-J6HN2kexnXJU,3671
98
+ crawlo/tools/anti_crawler.py,sha256=MU6KEPT0q85e_-Px8Rmw1fxdwlmOdpXfo0KYVpPlivU,9163
99
+ crawlo/tools/authenticated_proxy.py,sha256=L93WeXajIZ3si8xNcE7bBPv34FvqSyTvVfD78fJAKQE,7032
100
+ crawlo/tools/data_validator.py,sha256=hxPN28YtJDFFLjBBYhDjHmR8ShNTEjgIsv-cmcDKIu8,5310
101
+ crawlo/tools/date_tools.py,sha256=icbx6x08t-Aj8j0jAB-93hDdSpBV9Rd-aaNt4UpyjNA,600
102
+ crawlo/tools/distributed_coordinator.py,sha256=ZN-zYsDPw0JWSBF7oHqL2vpUDgRElkDTVW7TcY1ArcI,12477
103
+ crawlo/tools/retry_mechanism.py,sha256=27iIuLCg1ZHjog6Gcr7dz0APkeTZ5cIlcnZ0pF18ZVA,7801
104
+ crawlo/tools/scenario_adapter.py,sha256=JouFxI3513PRe1ObwHWc72vBvptNpNv0Ew3pRaEKjQQ,9398
105
+ crawlo/utils/__init__.py,sha256=OVjs7qjyd7sn1vrh4MWykO4sMsAPLIWkq-74dQ2Pzak,557
106
+ crawlo/utils/batch_processor.py,sha256=Bim6iSinjFKuKI2c1Ss6PQ1R-zJEvLdVxPiBDTa75z4,8895
107
+ crawlo/utils/controlled_spider_mixin.py,sha256=VjT30pNW_YIgmTD0nb7DDl2D3HvpnAYFzgSgV3fxFN0,16475
108
+ crawlo/utils/date_tools.py,sha256=6zNU23vQ8p3-xHknlIp5airCLOREETaqw4YElyMO_lQ,8907
109
+ crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,10559
110
+ crawlo/utils/enhanced_error_handler.py,sha256=qcFR3o2HyEyBQf0n1l6-hSOiFGytYGmhAzseqeidwHA,13906
111
+ crawlo/utils/env_config.py,sha256=Rn5DOUoshtIfCDqWh5CNpTfiXo1bF_zEb0tLo09DLMk,2869
112
+ crawlo/utils/error_handler.py,sha256=v-UVJfIp9k-lOXq5DBukJfE_nb-kv83tlr28RXrIUYQ,4286
113
+ crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
114
+ crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
115
+ crawlo/utils/large_scale_helper.py,sha256=JJqcGSI6VaVe3MSL6IWjmCp8XQIu6T4U-BvBLSttr_s,12157
116
+ crawlo/utils/log.py,sha256=A3lPyhD8kD88cV23KOL-_eT8g69xGQ5L1toDB2AO0mc,4005
117
+ crawlo/utils/performance_monitor.py,sha256=mygXJ0FNxHyunb_aQSgmcBv0bkT4ISG5QqlOXANtvlk,9595
118
+ crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
119
+ crawlo/utils/redis_connection_pool.py,sha256=9WprN-HbtYGLhAu0AXjXeHcVbJ9jd3dS3U4e_stdCMY,10331
120
+ crawlo/utils/redis_key_validator.py,sha256=mDqZROAlrnMvKX7ToXdaSjQGfu2J5tpCqo1tQ75n5RA,5637
121
+ crawlo/utils/request.py,sha256=yoLB2rY8d78vgPjIWpdhY5SalIKjyLIvTG_UH6EMdVI,8798
122
+ crawlo/utils/request_serializer.py,sha256=bPoSQqE2ksiMyP3WiPB3w3UqZs4f_LgkAw4Pj0qyBDo,8565
123
+ crawlo/utils/spider_loader.py,sha256=pEDUsYOTGjszA6KgjiMlYN4GS5fP4uakkhcp3JTFFQY,2187
124
+ crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
125
+ crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
126
+ crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
127
+ examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
128
+ tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md,sha256=HjMZv6RjN1o5D1mfgEydP8Mcc9T_4ScR6lG3xVxs8P8,3346
129
+ tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
130
+ tests/advanced_tools_example.py,sha256=7nlFLRVMVYzDz_CAdgQa9fJu7o0M6jBMo7PTvUsRbo0,9065
131
+ tests/authenticated_proxy_example.py,sha256=8I8bxAIeYDdsh5JoGIK1nxaKEpIdOFnCtufVLjpuq_o,8241
132
+ tests/cleaners_example.py,sha256=i90lrmno16qSq_qjc6WOVOw1Mx4_vzoBgVeqJYC7Vro,5170
133
+ tests/config_validation_demo.py,sha256=iPe_BWBTa-gRvRrnYYkkRDqyKqLzAhSCoWB_yFSkhrg,3116
134
+ tests/controlled_spider_example.py,sha256=SP_k4mdKPvD1JCPs9UCm68jcy2Frg84vvXv9-14RC6I,7776
135
+ tests/date_tools_example.py,sha256=x_-duqnVZ-Hrk-SaNplIfcIV6W3c6u6MTxW35u1i0F0,4862
136
+ tests/dynamic_loading_example.py,sha256=NI0SCg4lPME0RCcNpDDw1HjErjmCgJntCN0ahAEw61g,18263
137
+ tests/dynamic_loading_test.py,sha256=DYbMrEewerx0VGXixci3p9VYgDDQvCPevA92CNjq1Jo,3309
138
+ tests/env_config_example.py,sha256=sKE8DvMBhM3uy439LpgLHd4wF7MGUrUc-X6E7g9qsz0,4818
139
+ tests/error_handling_example.py,sha256=219H1dOm6TZaQ0tmd1zXZlo0ODcwQSBL7m4jZmCFoCo,5161
140
+ tests/redis_key_validation_demo.py,sha256=z1R7ZW6oiG3T0mSZJfP6FRqp4IsOjgbCPxvPuEL7z0I,4345
141
+ tests/response_improvements_example.py,sha256=wnYGJO6MKj5_jbwKLDlbXu_Dli5XC7vlWdzByi82_5Y,5258
142
+ tests/test_advanced_tools.py,sha256=3R8EfKVyBHEb6FA5TP3ieaWeHZhobVgSx8t3phipCrE,5250
143
+ tests/test_all_redis_key_configs.py,sha256=uMLlgtJ7tglzFO_Xbnwol723Lu6bjyIeHaLHtemphaQ,5660
144
+ tests/test_authenticated_proxy.py,sha256=s4pr5JyBTHYQgRq_IymiVKE08vyW1MwR27pSwrrVLVk,4198
145
+ tests/test_cleaners.py,sha256=7KRAjqXF7lWXe-9Hj_5CSFlbRJ6SB7XyMNuMXx36Jmg,1765
146
+ tests/test_comprehensive.py,sha256=kGNcJ9UkQxysYqvsBu0YxAaPleOvN9_hztLy7ljkfc4,5036
147
+ tests/test_config_validator.py,sha256=eZCnZyh34rpbYbzXMWvQkLNl7gu7_Js0fE7iFatXyB4,6632
148
+ tests/test_date_tools.py,sha256=CQdAmIS6bpAdwQH9ETDH__06l2gGL7EHUQuh7mdTF-A,3930
149
+ tests/test_double_crawlo_fix.py,sha256=t7EZsKIfE49ZL3FM0bu1AZ1hOMygwALUN5UtqDGtLUs,7826
150
+ tests/test_double_crawlo_fix_simple.py,sha256=0OPbW8c71n_oB7pt10rRZi-W9eFCO_JOKTquWdIgTUU,4683
151
+ tests/test_dynamic_downloaders_proxy.py,sha256=PtEW-pnVijeX2yX34UcoXYEY23yTBxb-kyNYh-WDljQ,4326
152
+ tests/test_dynamic_proxy.py,sha256=YL2sghNKG7k27-SaHMh4boNLVBHhfSttUwUqiSsOEX4,3080
153
+ tests/test_dynamic_proxy_config.py,sha256=uYXZ804ULI9qYMF-uNjMbi3L_NGzoMqLJcEZAl7aZ2I,5707
154
+ tests/test_dynamic_proxy_real.py,sha256=DTjP8JnSwBnNZ3Ls1BjDAmt6xSuye_6CxwZ4LBisPTM,3402
155
+ tests/test_edge_cases.py,sha256=fFRL3-ChBylg4e-73L9ZSc3u72dPmSfg3jpAg_6GOGA,10553
156
+ tests/test_enhanced_error_handler.py,sha256=E6HHzqGA6U9frzQvy0iVqFHi6LvRJobRwkFWk37tA10,8435
157
+ tests/test_env_config.py,sha256=nfP4nCG9ZHeJUfxo1JKUmiihYbhSeWx_oNW5mMfDHfQ,4746
158
+ tests/test_error_handler_compatibility.py,sha256=o5JLLLdo25Sl_3hpMx6I2fqSgZFAcnI4E6Ci-KxAxwA,4129
159
+ tests/test_final_validation.py,sha256=fBxf_6YcAEa_HyV_oGAXmmVHY4i6FdA4J6klCmc36hQ,4925
160
+ tests/test_framework_env_usage.py,sha256=HYpTwORXeaJHMffCYAGHGvc_a6ax4lo28xP8BYOaKxk,4098
161
+ tests/test_integration.py,sha256=8Bky_n5fbJDbb2pbKqc6IwJgpV0B9iau84C-PwcbsLA,11321
162
+ tests/test_item_dedup_redis_key.py,sha256=GOmmJ7hdTV_2XifRa_PQDrQpBt7Rnt3ViYNjNpflGgw,3755
163
+ tests/test_parsel.py,sha256=KYskaN_4HBc1XDTltjVo12v1i7JAThB2UIwcWZ-mwbY,672
164
+ tests/test_performance.py,sha256=heeFoS3R9n_t7ri0MyUL8kE0FY9sYJiC7BSQNI2C5HQ,11286
165
+ tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
166
+ tests/test_proxy_middleware_integration.py,sha256=zcl7fR9Toc-I-stSUTzKZPwcfh3kgrpjI5SbkZ6AVmE,4305
167
+ tests/test_proxy_providers.py,sha256=XwWZCywTYguSsUxSm6fsbaoH1p9dKjqSIx9-sqKZehA,1693
168
+ tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
169
+ tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
170
+ tests/test_queue_manager_double_crawlo.py,sha256=pdoWUFgjfiPGCsuCypUaxMzicXPQnKpZqVh1o4LuaSM,6822
171
+ tests/test_queue_manager_redis_key.py,sha256=-dnqs2p50zpf09-Z7vo74s8hLQ6cxXd8WCdHM0l17qM,6278
172
+ tests/test_redis_config.py,sha256=TqzFRojc6esGXjGhUCvSLYQDUTAgEJsty9vRVuNraMU,893
173
+ tests/test_redis_connection_pool.py,sha256=ORBU0k-6htQ2VVfyQkqTKVLGQWX04XRxdEGsZKeerQA,9177
174
+ tests/test_redis_key_naming.py,sha256=ojHhBED2DEf0VzqQgtzefygSn6hZiBhPFyVUBs_4rdI,6776
175
+ tests/test_redis_key_validator.py,sha256=kud0AhnS_6NP9s8G5HB5L9rob5rv2W6WLy6ypHYQDbk,4304
176
+ tests/test_redis_queue.py,sha256=o6xViXxJcdx-1eMcG3vhAQEIm8h346HnZb7JXs7ZjwM,6622
177
+ tests/test_request_serialization.py,sha256=8sVdppAsohJ5u-m1WvablCndwL-M_36YPLdGKwgeznM,2289
178
+ tests/test_response_improvements.py,sha256=zvbkTkWhgdlFYtRu_ckgq6wGDGwpe_PTECYqpLDM3BU,5876
179
+ tests/test_scheduler.py,sha256=-FOkTWzaMdr6yfO1Msu74hI_GgSfD7iRxO-cFA-9Iyk,7442
180
+ tests/test_simple_response.py,sha256=6RYOBRzAtyNvJ9a5JVTNubM-rvxnuX8jQOvq3sUZxwo,1488
181
+ tests/test_telecom_spider_redis_key.py,sha256=R581J-kvjh4CpDCCIZKPnV0vY6MvtkvTzmSIbwhXSXs,7470
182
+ tests/test_template_content.py,sha256=5QAnhKZFDKg-_uFryllLMpCk3a1nCS44hMmYfXm8gWk,2878
183
+ tests/test_template_redis_key.py,sha256=U6L5HtnDyGp3s6-O4F_yG2Q2nNIGTqB_Q-ESv2EMeOU,4812
184
+ tests/test_tools.py,sha256=IWiu9JGV-5Ow0ivFtiDw62REht-8Hn7NfyR9rnYSlbU,5113
185
+ tests/tools_example.py,sha256=MfVBYemKvHs6MUbydgrJfhiGnygp5dRoIE-eIXCOR7M,7669
186
+ crawlo-1.2.0.dist-info/METADATA,sha256=NwPsIy1QiXk07HBs2iVrgX1_GM8GvNmUVQr-BX08W1g,20004
187
+ crawlo-1.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
188
+ crawlo-1.2.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
189
+ crawlo-1.2.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
190
+ crawlo-1.2.0.dist-info/RECORD,,
examples/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- # @Time : 2025-02-05 12:36
5
- # @Author : oscar
6
- # @Desc : None
7
- """
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ # @Time : 2025-02-05 12:36
5
+ # @Author : oscar
6
+ # @Desc : None
7
+ """
@@ -1,82 +1,82 @@
1
- # 双重 crawlo 前缀问题修复报告
2
-
3
- ## 问题描述
4
- 用户在使用分布式爬虫时发现Redis key中出现了双重`crawlo`前缀,例如`crawlo:crawlo:queue:processing:data`。这导致了Redis key命名不一致和潜在的混淆问题。
5
-
6
- ## 问题分析
7
- 经过代码分析,发现问题出在以下两个方面:
8
- 1. RedisPriorityQueue类在处理队列名称时会自动修改用户提供的队列名称
9
- 2. QueueManager类在提取项目名称时没有正确处理双重`crawlo`前缀的情况
10
-
11
- ## 修复方案
12
-
13
- ### 1. RedisPriorityQueue类修复
14
- 文件:`crawlo/queue/redis_priority_queue.py`
15
-
16
- **修复前**:
17
- ```python
18
- # 如果提供了 queue_name,确保符合命名规范
19
- # 处理可能的重复前缀问题
20
- if queue_name.startswith("crawlo:crawlo:"):
21
- # 修复双重 crawlo 前缀
22
- self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
23
- elif not queue_name.startswith("crawlo:"):
24
- # 如果没有 crawlo 前缀,添加它
25
- self.queue_name = f"crawlo:{module_name}:queue:requests"
26
- else:
27
- # 已经有正确的 crawlo 前缀
28
- self.queue_name = queue_name
29
- ```
30
-
31
- **修复后**:
32
- ```python
33
- # 保持用户提供的队列名称不变,不做修改
34
- self.queue_name = queue_name
35
- ```
36
-
37
- ### 2. QueueManager类修复
38
- 文件:`crawlo/queue/queue_manager.py`
39
-
40
- **修复后**:
41
- ```python
42
- # 处理可能的双重 crawlo 前缀
43
- if parts[0] == "crawlo" and parts[1] == "crawlo":
44
- # 双重 crawlo 前缀,取第三个部分作为项目名称
45
- if len(parts) >= 3:
46
- project_name = parts[2]
47
- else:
48
- project_name = "default"
49
- elif parts[0] == "crawlo":
50
- # 正常的 crawlo 前缀,取第二个部分作为项目名称
51
- project_name = parts[1]
52
- else:
53
- # 没有 crawlo 前缀,使用第一个部分作为项目名称
54
- project_name = parts[0]
55
- ```
56
-
57
- ## 测试验证
58
-
59
- ### 测试1:Redis队列命名修复测试
60
- 验证RedisPriorityQueue正确处理各种队列名称格式:
61
- - 正常命名:`crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
62
- - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
63
- - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
64
-
65
- ### 测试2:队列管理器项目名称提取测试
66
- 验证QueueManager正确提取项目名称:
67
- - 正常命名:`crawlo:test_project:queue:requests` → `test_project`
68
- - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `queue`
69
- - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo`
70
-
71
- ### 测试3:队列管理器创建队列测试
72
- 验证整个流程的正确性,确保队列名称在传递过程中保持一致。
73
-
74
- 所有测试均已通过,表明双重`crawlo`前缀问题已得到解决。
75
-
76
- ## 结论
77
- 通过以上修复,我们成功解决了Redis key中出现双重`crawlo`前缀的问题。现在Redis队列名称将保持用户配置的一致性,processing和failed队列也会相应地保持相同的前缀结构。
78
-
79
- ## 建议
80
- 1. 建议用户在项目配置中使用标准的队列名称格式,如`crawlo:{project_name}:queue:requests`
81
- 2. 可以使用Redis key验证工具定期检查和规范Redis key命名
1
+ # 双重 crawlo 前缀问题修复报告
2
+
3
+ ## 问题描述
4
+ 用户在使用分布式爬虫时发现Redis key中出现了双重`crawlo`前缀,例如`crawlo:crawlo:queue:processing:data`。这导致了Redis key命名不一致和潜在的混淆问题。
5
+
6
+ ## 问题分析
7
+ 经过代码分析,发现问题出在以下两个方面:
8
+ 1. RedisPriorityQueue类在处理队列名称时会自动修改用户提供的队列名称
9
+ 2. QueueManager类在提取项目名称时没有正确处理双重`crawlo`前缀的情况
10
+
11
+ ## 修复方案
12
+
13
+ ### 1. RedisPriorityQueue类修复
14
+ 文件:`crawlo/queue/redis_priority_queue.py`
15
+
16
+ **修复前**:
17
+ ```python
18
+ # 如果提供了 queue_name,确保符合命名规范
19
+ # 处理可能的重复前缀问题
20
+ if queue_name.startswith("crawlo:crawlo:"):
21
+ # 修复双重 crawlo 前缀
22
+ self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
23
+ elif not queue_name.startswith("crawlo:"):
24
+ # 如果没有 crawlo 前缀,添加它
25
+ self.queue_name = f"crawlo:{module_name}:queue:requests"
26
+ else:
27
+ # 已经有正确的 crawlo 前缀
28
+ self.queue_name = queue_name
29
+ ```
30
+
31
+ **修复后**:
32
+ ```python
33
+ # 保持用户提供的队列名称不变,不做修改
34
+ self.queue_name = queue_name
35
+ ```
36
+
37
+ ### 2. QueueManager类修复
38
+ 文件:`crawlo/queue/queue_manager.py`
39
+
40
+ **修复后**:
41
+ ```python
42
+ # 处理可能的双重 crawlo 前缀
43
+ if parts[0] == "crawlo" and parts[1] == "crawlo":
44
+ # 双重 crawlo 前缀,取第三个部分作为项目名称
45
+ if len(parts) >= 3:
46
+ project_name = parts[2]
47
+ else:
48
+ project_name = "default"
49
+ elif parts[0] == "crawlo":
50
+ # 正常的 crawlo 前缀,取第二个部分作为项目名称
51
+ project_name = parts[1]
52
+ else:
53
+ # 没有 crawlo 前缀,使用第一个部分作为项目名称
54
+ project_name = parts[0]
55
+ ```
56
+
57
+ ## 测试验证
58
+
59
+ ### 测试1:Redis队列命名修复测试
60
+ 验证RedisPriorityQueue正确处理各种队列名称格式:
61
+ - 正常命名:`crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
62
+ - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
63
+ - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
64
+
65
+ ### 测试2:队列管理器项目名称提取测试
66
+ 验证QueueManager正确提取项目名称:
67
+ - 正常命名:`crawlo:test_project:queue:requests` → `test_project`
68
+ - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `queue`
69
+ - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo`
70
+
71
+ ### 测试3:队列管理器创建队列测试
72
+ 验证整个流程的正确性,确保队列名称在传递过程中保持一致。
73
+
74
+ 所有测试均已通过,表明双重`crawlo`前缀问题已得到解决。
75
+
76
+ ## 结论
77
+ 通过以上修复,我们成功解决了Redis key中出现双重`crawlo`前缀的问题。现在Redis队列名称将保持用户配置的一致性,processing和failed队列也会相应地保持相同的前缀结构。
78
+
79
+ ## 建议
80
+ 1. 建议用户在项目配置中使用标准的队列名称格式,如`crawlo:{project_name}:queue:requests`
81
+ 2. 可以使用Redis key验证工具定期检查和规范Redis key命名
82
82
  3. 如果需要统一的命名规范,可以在项目初始化时明确指定队列名称
tests/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- # @Time : 2025-08-24 12:36
5
- # @Author : crawl-coder
6
- # @Desc : None
7
- """
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ # @Time : 2025-08-24 12:36
5
+ # @Author : crawl-coder
6
+ # @Desc : None
7
+ """