crawlo 1.2.8__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (221) hide show
  1. crawlo/__init__.py +63 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +314 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +186 -186
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -251
  15. crawlo/core/__init__.py +2 -2
  16. crawlo/core/engine.py +365 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +256 -251
  19. crawlo/crawler.py +1097 -1099
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -107
  22. crawlo/downloader/__init__.py +273 -266
  23. crawlo/downloader/aiohttp_downloader.py +226 -228
  24. crawlo/downloader/cffi_downloader.py +245 -256
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +45 -43
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/filters/__init__.py +154 -154
  40. crawlo/filters/aioredis_filter.py +234 -234
  41. crawlo/filters/memory_filter.py +269 -269
  42. crawlo/items/__init__.py +23 -23
  43. crawlo/items/base.py +21 -21
  44. crawlo/items/fields.py +52 -52
  45. crawlo/items/items.py +104 -104
  46. crawlo/middleware/__init__.py +21 -21
  47. crawlo/middleware/default_header.py +132 -132
  48. crawlo/middleware/download_delay.py +104 -104
  49. crawlo/middleware/middleware_manager.py +136 -136
  50. crawlo/middleware/offsite.py +114 -114
  51. crawlo/middleware/proxy.py +386 -368
  52. crawlo/middleware/request_ignore.py +86 -86
  53. crawlo/middleware/response_code.py +163 -163
  54. crawlo/middleware/response_filter.py +136 -136
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/middleware/simple_proxy.py +65 -0
  57. crawlo/mode_manager.py +212 -211
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +379 -338
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +157 -157
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +223 -223
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +317 -317
  70. crawlo/pipelines/pipeline_manager.py +74 -62
  71. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  72. crawlo/project.py +284 -315
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +379 -378
  75. crawlo/queue/redis_priority_queue.py +306 -306
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +216 -220
  78. crawlo/settings/setting_manager.py +175 -122
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +129 -129
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +118 -118
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/settings.py.tmpl +261 -288
  89. crawlo/templates/project/settings_distributed.py.tmpl +174 -157
  90. crawlo/templates/project/settings_gentle.py.tmpl +95 -100
  91. crawlo/templates/project/settings_high_performance.py.tmpl +125 -134
  92. crawlo/templates/project/settings_minimal.py.tmpl +30 -0
  93. crawlo/templates/project/settings_simple.py.tmpl +96 -98
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/run.py.tmpl +47 -47
  96. crawlo/templates/spider/spider.py.tmpl +143 -143
  97. crawlo/tools/__init__.py +200 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/{cleaners → tools}/data_formatter.py +225 -225
  101. crawlo/tools/data_validator.py +180 -180
  102. crawlo/tools/date_tools.py +290 -36
  103. crawlo/tools/distributed_coordinator.py +388 -387
  104. crawlo/{cleaners → tools}/encoding_converter.py +127 -126
  105. crawlo/tools/request_tools.py +83 -0
  106. crawlo/tools/retry_mechanism.py +224 -221
  107. crawlo/tools/scenario_adapter.py +262 -262
  108. crawlo/{cleaners → tools}/text_cleaner.py +232 -232
  109. crawlo/utils/__init__.py +35 -35
  110. crawlo/utils/batch_processor.py +259 -259
  111. crawlo/utils/controlled_spider_mixin.py +439 -439
  112. crawlo/utils/db_helper.py +343 -343
  113. crawlo/utils/enhanced_error_handler.py +356 -356
  114. crawlo/utils/env_config.py +142 -142
  115. crawlo/utils/error_handler.py +123 -123
  116. crawlo/utils/func_tools.py +82 -82
  117. crawlo/utils/large_scale_config.py +286 -286
  118. crawlo/utils/large_scale_helper.py +344 -344
  119. crawlo/utils/log.py +146 -128
  120. crawlo/utils/performance_monitor.py +285 -285
  121. crawlo/utils/queue_helper.py +175 -175
  122. crawlo/utils/redis_connection_pool.py +351 -351
  123. crawlo/utils/redis_key_validator.py +198 -198
  124. crawlo/utils/request.py +267 -267
  125. crawlo/utils/request_serializer.py +218 -218
  126. crawlo/utils/spider_loader.py +61 -61
  127. crawlo/utils/system.py +11 -11
  128. crawlo/utils/tools.py +4 -4
  129. crawlo/utils/url.py +39 -39
  130. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/METADATA +1011 -764
  131. crawlo-1.3.0.dist-info/RECORD +219 -0
  132. examples/__init__.py +7 -7
  133. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  134. tests/__init__.py +7 -7
  135. tests/advanced_tools_example.py +275 -275
  136. tests/authenticated_proxy_example.py +107 -237
  137. tests/cleaners_example.py +160 -160
  138. tests/config_validation_demo.py +143 -103
  139. tests/controlled_spider_example.py +205 -205
  140. tests/date_tools_example.py +180 -180
  141. tests/debug_pipelines.py +67 -0
  142. tests/dynamic_loading_example.py +523 -523
  143. tests/dynamic_loading_test.py +104 -104
  144. tests/env_config_example.py +133 -133
  145. tests/error_handling_example.py +171 -171
  146. tests/redis_key_validation_demo.py +130 -130
  147. tests/request_params_example.py +151 -0
  148. tests/response_improvements_example.py +144 -144
  149. tests/test_advanced_tools.py +148 -148
  150. tests/test_all_redis_key_configs.py +145 -145
  151. tests/test_authenticated_proxy.py +141 -141
  152. tests/test_cleaners.py +54 -54
  153. tests/test_comprehensive.py +146 -146
  154. tests/test_config_consistency.py +80 -80
  155. tests/test_config_merge.py +153 -0
  156. tests/test_config_validator.py +182 -193
  157. tests/test_crawlo_proxy_integration.py +109 -173
  158. tests/test_date_tools.py +123 -123
  159. tests/test_default_header_middleware.py +158 -158
  160. tests/test_distributed.py +65 -0
  161. tests/test_double_crawlo_fix.py +207 -207
  162. tests/test_double_crawlo_fix_simple.py +124 -124
  163. tests/test_download_delay_middleware.py +221 -221
  164. tests/test_downloader_proxy_compatibility.py +268 -268
  165. tests/test_dynamic_downloaders_proxy.py +124 -124
  166. tests/test_dynamic_proxy.py +92 -92
  167. tests/test_dynamic_proxy_config.py +146 -146
  168. tests/test_dynamic_proxy_real.py +109 -109
  169. tests/test_edge_cases.py +303 -303
  170. tests/test_enhanced_error_handler.py +270 -270
  171. tests/test_env_config.py +121 -121
  172. tests/test_error_handler_compatibility.py +112 -112
  173. tests/test_final_validation.py +153 -153
  174. tests/test_framework_env_usage.py +103 -103
  175. tests/test_integration.py +169 -357
  176. tests/test_item_dedup_redis_key.py +122 -122
  177. tests/test_mode_consistency.py +51 -51
  178. tests/test_offsite_middleware.py +221 -221
  179. tests/test_parsel.py +29 -29
  180. tests/test_performance.py +327 -327
  181. tests/test_proxy_api.py +264 -264
  182. tests/test_proxy_health_check.py +32 -32
  183. tests/test_proxy_middleware.py +121 -121
  184. tests/test_proxy_middleware_enhanced.py +216 -216
  185. tests/test_proxy_middleware_integration.py +136 -136
  186. tests/test_proxy_middleware_refactored.py +185 -0
  187. tests/test_proxy_providers.py +56 -56
  188. tests/test_proxy_stats.py +19 -19
  189. tests/test_proxy_strategies.py +59 -59
  190. tests/test_queue_manager_double_crawlo.py +173 -173
  191. tests/test_queue_manager_redis_key.py +176 -176
  192. tests/test_random_user_agent.py +73 -0
  193. tests/test_real_scenario_proxy.py +195 -195
  194. tests/test_redis_config.py +28 -28
  195. tests/test_redis_connection_pool.py +294 -294
  196. tests/test_redis_key_naming.py +181 -181
  197. tests/test_redis_key_validator.py +123 -123
  198. tests/test_redis_queue.py +224 -224
  199. tests/test_request_ignore_middleware.py +182 -182
  200. tests/test_request_params.py +112 -0
  201. tests/test_request_serialization.py +70 -70
  202. tests/test_response_code_middleware.py +349 -349
  203. tests/test_response_filter_middleware.py +427 -427
  204. tests/test_response_improvements.py +152 -152
  205. tests/test_retry_middleware.py +241 -241
  206. tests/test_scheduler.py +252 -252
  207. tests/test_scheduler_config_update.py +133 -133
  208. tests/test_simple_response.py +61 -61
  209. tests/test_telecom_spider_redis_key.py +205 -205
  210. tests/test_template_content.py +87 -87
  211. tests/test_template_redis_key.py +134 -134
  212. tests/test_tools.py +159 -153
  213. tests/test_user_agents.py +97 -0
  214. tests/tools_example.py +260 -257
  215. tests/verify_distributed.py +117 -0
  216. crawlo/cleaners/__init__.py +0 -61
  217. crawlo/utils/date_tools.py +0 -290
  218. crawlo-1.2.8.dist-info/RECORD +0 -209
  219. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/WHEEL +0 -0
  220. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/entry_points.txt +0 -0
  221. {crawlo-1.2.8.dist-info → crawlo-1.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,219 @@
1
+ crawlo/__init__.py,sha256=qZzTmb7hw5h_qcP2EYGUZcoSScxlKZFJ76CjSeS7UfA,1381
2
+ crawlo/__version__.py,sha256=zi_LaUT_OsChAtsPXbOeRpQkCohSsOyeXfavQPM0GoE,22
3
+ crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
4
+ crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
5
+ crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
6
+ crawlo/crawler.py,sha256=rxyjA5pXOd709bujgniqYG9tR3eoNaok6wJaeZOgzmo,39451
7
+ crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
8
+ crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
9
+ crawlo/mode_manager.py,sha256=soEgZNBt6jA0qtC1WH-MG_2WngDk2RfmQckLsK3NzmQ,7510
10
+ crawlo/project.py,sha256=830PPRUD6ldE8MKPdkFkPiUcecHhlWP3fUXYC96_T0Y,10506
11
+ crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
12
+ crawlo/subscriber.py,sha256=D3hzE7Pc_zJjc-zR7lct5pt32bz6LsDYeC8uHlS4Hso,4986
13
+ crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
14
+ crawlo/commands/__init__.py,sha256=QbhGAmItiwVrtlTr9UUbEJMegLJo-SdzaKX2PUhBgfI,378
15
+ crawlo/commands/check.py,sha256=7pD43s97DD-fSLO9OEOuNcNr7o-2g94rJULL8fUzdaI,22605
16
+ crawlo/commands/genspider.py,sha256=HhtvBLkIuhYtJUzom6PquItiC22vU9LNpOkjDUiqdM4,4937
17
+ crawlo/commands/help.py,sha256=gwfHibRpdYDmZO6waUMOEn8SMJ_ubdjL-prD5fiuVY8,4973
18
+ crawlo/commands/list.py,sha256=BqlPjBa5FLotjAlyZ3-nGmXg5cWcCNbHi8U5znb2_D8,5722
19
+ crawlo/commands/run.py,sha256=KcJ4h4D7lavB6qQDpYMrbgJMgY5vCSLHaLckos5EUNY,11793
20
+ crawlo/commands/startproject.py,sha256=aqKRJarKqTf5XjJnGXwjRpp0uYF16LreFbwwQLGpK-0,16070
21
+ crawlo/commands/stats.py,sha256=8wTubR1RQ1JPTlpOKArcGcQ39bM-0cuH27lDpndnwPQ,6014
22
+ crawlo/commands/utils.py,sha256=pXiFzwVIVXdSPO2Fty_u19P1lsE8HmuE8gTMamKZZUs,5047
23
+ crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
24
+ crawlo/core/engine.py,sha256=Hy0K_g9My6aQ3CPkxAcCiPsumdwh4O8qRhmFlNoErd4,14496
25
+ crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
26
+ crawlo/core/scheduler.py,sha256=D-YzXVvnP6DEkovmz9hThhzIe2UgRrQLNt9pJCPEPwY,12593
27
+ crawlo/data/__init__.py,sha256=8MgDxcMhx-emFARcLAw_ODOZNz0neYBcx7kEbzothd8,116
28
+ crawlo/data/user_agents.py,sha256=6V34lYHREWV5ZR5wH-1pCnr1Y3ZYC7iMLfC6vZHyhZQ,9697
29
+ crawlo/downloader/__init__.py,sha256=PB8oluLFMX2PBmeb3NBKkM6GaceX0ujFId8t2URy1ks,8624
30
+ crawlo/downloader/aiohttp_downloader.py,sha256=KZY8xJ8jubrlfZNQugf8lpSeJ_Axk5-klpPSSfb4j1w,8969
31
+ crawlo/downloader/cffi_downloader.py,sha256=BpA1q6Udz7sSXJ0gX94xGnzy8cdgK-vlr_Q6YA4QIxE,10243
32
+ crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU7m_Jl9wc,12037
33
+ crawlo/downloader/hybrid_downloader.py,sha256=4SzOPEwBlSZVzUAWR3DyxMx2Tsx15YrpBvQS4it4Vps,8028
34
+ crawlo/downloader/playwright_downloader.py,sha256=Lnc7k5cXhVnURXSxgZFCYMJkBxLg5F_OE67rtf3G7Ig,16261
35
+ crawlo/downloader/selenium_downloader.py,sha256=B_0muNi-GQ_hgoYHcf7wgu01V68q7xKnSh-0kzlUiio,21036
36
+ crawlo/extension/__init__.py,sha256=FbOwJ4jh60xCbSh7P9CUGJsGAC-VH4MyOtCftRMlxbk,1594
37
+ crawlo/extension/health_check.py,sha256=0GveZgUtFwjYEKlm3qbwIvCmb4FR0qrIKc8cEF1yQV8,5516
38
+ crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
39
+ crawlo/extension/log_stats.py,sha256=vrChs3bj_Dvay3kxxkBOp4-w0K-IG-2XZ0PoSUahTPs,2908
40
+ crawlo/extension/logging_extension.py,sha256=RfL1wI4nz-1Xtg420Ktp7RXnOPnZSHwO0Zpg1w4fO4M,1726
41
+ crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
42
+ crawlo/extension/performance_profiler.py,sha256=EPiNuXuPPDU0Jtgy8arYHpr_8ASK13cCI2BytdJnu_I,4899
43
+ crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074
44
+ crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
45
+ crawlo/filters/aioredis_filter.py,sha256=XixK3DD5QbCLOw3Me2YdtMkxQpXOT75FE-GiVr_PUGc,8245
46
+ crawlo/filters/memory_filter.py,sha256=mO4oBPV5_uAiBQF3a16tU5tcD8244dOjKoNX_MU8cEw,9292
47
+ crawlo/items/__init__.py,sha256=bqekZrRlDhxfWie0UbCs656TptYseoe9QJ67I4E7Elk,386
48
+ crawlo/items/base.py,sha256=tAYrPJgblp3ZEihDXvappdYc6pGdim6x2_9QSmMKI2o,577
49
+ crawlo/items/fields.py,sha256=jCG0-PS8mVO48lP_ioTZCQCa0vjP5Sfv-sAyvYQqr-s,1800
50
+ crawlo/items/items.py,sha256=e-3nXI9ckD64vcDxxQiAU6ufbtJMs09gbZQcYjxgwHY,3374
51
+ crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
52
+ crawlo/middleware/default_header.py,sha256=wQ7BrUHd-hRosFoKsReV9hwNNr_jwK6V0ZfxL6MOGrk,5032
53
+ crawlo/middleware/download_delay.py,sha256=zt9R5g2HWErWA_MAOnGcw_D8l6HD769Kyaw-Hv-vcTc,3438
54
+ crawlo/middleware/middleware_manager.py,sha256=9Sj9rrWK6R9NZq9eT38sWRGuBKLKfjSgEAxu-5NCWgU,6278
55
+ crawlo/middleware/offsite.py,sha256=b3BMwNKGR41YGJGHt1S0H7yXujEkztVvonUQGO05hoM,4026
56
+ crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
57
+ crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577
58
+ crawlo/middleware/response_code.py,sha256=-Aa9Mm9nJN-WdddN7iTanJRMA83_LYYgSEz3XLQGvMo,4934
59
+ crawlo/middleware/response_filter.py,sha256=6VBUe04mu8C7XxmOak6XyhGMWZPYEm3AMo5Kt_r1SXY,4248
60
+ crawlo/middleware/retry.py,sha256=HxeIf7DibeLCpZ_y4rNARWMyzlrsdq5UR2CaFZInA3s,4124
61
+ crawlo/middleware/simple_proxy.py,sha256=V_v28L-faiMJtt8vi-u5O4za-aU77_JTqNTCYSfWzCE,2191
62
+ crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,397
63
+ crawlo/network/request.py,sha256=9kV-gqb_d6aCsSBAwyzxnP9a70cAViwX8qvpyYV7Ym4,13799
64
+ crawlo/network/response.py,sha256=EZiG4LjuIb7PxdGou4H-oSOQhec1ZdBRTkO-5fl8JTo,12701
65
+ crawlo/pipelines/__init__.py,sha256=lrdVDjeHLNkA4_MAwI1auk_I9xfeU1SlBWXiammb6lc,616
66
+ crawlo/pipelines/bloom_dedup_pipeline.py,sha256=omB_gHtoacbco0sn_c6HO6PHCh6xylSecK7UbJIeLq8,5661
67
+ crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
68
+ crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
69
+ crawlo/pipelines/database_dedup_pipeline.py,sha256=Ao_5jvVPl5QikxXhPeIrcB7_3tinR9bPNRV5Fu5zfDU,7978
70
+ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
71
+ crawlo/pipelines/memory_dedup_pipeline.py,sha256=oIksbIrmSw9s9jMh6JJMfVbv6hzseVMV_g9S8UHQUP4,3837
72
+ crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
73
+ crawlo/pipelines/mysql_pipeline.py,sha256=G2DMhdh0ihBBolIul4YVTDz2JbrZGJauDtWF-gqRW0w,13473
74
+ crawlo/pipelines/pipeline_manager.py,sha256=vCgfbhgsKMLm_7jCnr3cE5GemIYkG9u4oF8u4Ta_7so,3013
75
+ crawlo/pipelines/redis_dedup_pipeline.py,sha256=POYRiWAOp1pqDW9iTPJ8h3VcpLALeLrpw74MvJJqPiM,6342
76
+ crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
78
+ crawlo/queue/queue_manager.py,sha256=XqS_oVbNQJWdtokOuDDPK-FzMrVdnZ3UKp1MF_DMJww,14941
79
+ crawlo/queue/redis_priority_queue.py,sha256=k1OChSMRovSMkbbJ9388axfhpYeMevuJTe-3N1oYhbA,13126
80
+ crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
81
+ crawlo/settings/default_settings.py,sha256=98URrj6QBrx_pmJ1yvK-MSAW8VrZ-pl0FfiZEHV0ZnI,9183
82
+ crawlo/settings/setting_manager.py,sha256=V3nVJEPtusadoz5eILXFeNyDXX1u_MgIiKIFIWVDY1s,6189
83
+ crawlo/spider/__init__.py,sha256=ZnSAL9PXLZSIH-Jdv-P6RuWmQUdukr8KPLQK6SXZZaU,20435
84
+ crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
85
+ crawlo/templates/run.py.tmpl,sha256=v_g-LQMYJ6pC8TZgyWj0yB2yTTKrwy9lEJufAYCXyxY,1228
86
+ crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
87
+ crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
88
+ crawlo/templates/project/middlewares.py.tmpl,sha256=T67p8j0laL4NJJ_3xzPM9yivgZRjTEMiEtEWLPwbkmw,4160
89
+ crawlo/templates/project/pipelines.py.tmpl,sha256=GBHYU0Jx8sKDCdGJp44FMSH7u2slxoFg6a-R9Uwg_-I,2608
90
+ crawlo/templates/project/settings.py.tmpl,sha256=K0WOyCJsiykbZjoZRhzmTVssoahETkYS2zb2q3Ai5Ts,9998
91
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=BsSqtYl69NtFUTq-lDkXTzU7cwhqZhkna3x__pQx7oc,6692
92
+ crawlo/templates/project/settings_gentle.py.tmpl,sha256=9S6l-v0yJOPDy3oxsCIpSDieXjxtHNPHhBqfGP28CG4,3975
93
+ crawlo/templates/project/settings_high_performance.py.tmpl,sha256=zjlGESHvt3m_vsvAYDc5oE2Eui3eI9QEn3-uKFyHXpc,4706
94
+ crawlo/templates/project/settings_minimal.py.tmpl,sha256=dFoz39BGkSzLDTwBN-mQh242SLzcP6g8MhI8Zk49jvw,909
95
+ crawlo/templates/project/settings_simple.py.tmpl,sha256=iWQFaw1WxTJA2QF-kXH4nspDGuXvnGEAuqpGQHCfuew,4101
96
+ crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
97
+ crawlo/templates/spider/spider.py.tmpl,sha256=jMhzyxpIpV_KigB-pmN-5mGMiYtu4mfQIOvpZcCGGJI,5055
98
+ crawlo/tools/__init__.py,sha256=8igeUXLD0vJ5ta2X91QyTvna6dOioKCn0z7EF4oHvHI,3942
99
+ crawlo/tools/anti_crawler.py,sha256=MU6KEPT0q85e_-Px8Rmw1fxdwlmOdpXfo0KYVpPlivU,9163
100
+ crawlo/tools/authenticated_proxy.py,sha256=L93WeXajIZ3si8xNcE7bBPv34FvqSyTvVfD78fJAKQE,7032
101
+ crawlo/tools/data_formatter.py,sha256=NEj3NqPiyG67V6qDgn2KNj9VNHWOLNwX-7p_nad0znc,7583
102
+ crawlo/tools/data_validator.py,sha256=hxPN28YtJDFFLjBBYhDjHmR8ShNTEjgIsv-cmcDKIu8,5310
103
+ crawlo/tools/date_tools.py,sha256=jjP5xA0-aDgm9UIK1RG2qaNagBzHFQ-BBDMo_YzSlLQ,8906
104
+ crawlo/tools/distributed_coordinator.py,sha256=Au20nZ4qUiAZUD2A1yfwD3soaHADpkEZt1hRyegp6M4,12323
105
+ crawlo/tools/encoding_converter.py,sha256=7P9Z7J1ALw_PPNApmjFsHZDpRxgxzduiViluenlSLEU,4043
106
+ crawlo/tools/request_tools.py,sha256=CjyFBtRQf_vFjQhaVwgHSGai4ZaWS8IIaF1flSfJxDs,2338
107
+ crawlo/tools/retry_mechanism.py,sha256=aT5hEs5O7B09K1IaNFZEOWR9e_mX52Dtq4gx-onsyRI,7553
108
+ crawlo/tools/scenario_adapter.py,sha256=JouFxI3513PRe1ObwHWc72vBvptNpNv0Ew3pRaEKjQQ,9398
109
+ crawlo/tools/text_cleaner.py,sha256=SOgT9frD6Cg-2D7ZIzrixrxFYfYisLPU48ir9U2ZbA0,6458
110
+ crawlo/utils/__init__.py,sha256=to1N8t0rNoczU9pteGt_RxhNrvfjtDxQidRwsTKcIjI,563
111
+ crawlo/utils/batch_processor.py,sha256=_J-dKj98csB9LdhTBHh_dKvV4OzHiP22-5OWxavDglQ,8883
112
+ crawlo/utils/controlled_spider_mixin.py,sha256=RVRAf9Wbi7z9NAlog4763xhHUEjl5r33aVMk7Oj4HCA,16497
113
+ crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,10559
114
+ crawlo/utils/enhanced_error_handler.py,sha256=hj5AElt3ajfqnP4csQnEfEnzkbIep9k65DNQiCbmTFo,13858
115
+ crawlo/utils/env_config.py,sha256=HbZOEKkeQ0FMdZYJu9SgmSNEmfPJrmAzA7lHu5Du1DA,3937
116
+ crawlo/utils/error_handler.py,sha256=q6NqHxjYrKdswfmhshMYMmfBIr0M2YWPYxts4ScHl4Y,4244
117
+ crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
118
+ crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
119
+ crawlo/utils/large_scale_helper.py,sha256=Kxdy3WMuqjzQTyCc6z4xEYxXDi4xnYKJzsVwaBYZrrg,12108
120
+ crawlo/utils/log.py,sha256=xZe3UU78yr10lK0hxALBQB0Uv9cXShOPPksoe5n_qKI,5229
121
+ crawlo/utils/performance_monitor.py,sha256=Q9xxuXBIfFoig_U-FQPOUuPAh1axO3MzYgpielDyku0,9547
122
+ crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
123
+ crawlo/utils/redis_connection_pool.py,sha256=amGjhaKpodMrw9X56qxZ6f3OTZhjrI89sSVGqgwAQGU,11050
124
+ crawlo/utils/redis_key_validator.py,sha256=M461uMU5mRZfYRSwf-fXJUi4UITNKUAZmLe-cvytm9c,5611
125
+ crawlo/utils/request.py,sha256=yoLB2rY8d78vgPjIWpdhY5SalIKjyLIvTG_UH6EMdVI,8798
126
+ crawlo/utils/request_serializer.py,sha256=k7PQG_Wa1S1k9qTvcKDeLOlX1aaa_0jo9sFUCQZBKBk,8521
127
+ crawlo/utils/spider_loader.py,sha256=WK9gL99sOeIrFC-a0Y10lygtryQR7-wfdGks-uwMYTM,2172
128
+ crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
129
+ crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
130
+ crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
131
+ examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
132
+ tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md,sha256=HjMZv6RjN1o5D1mfgEydP8Mcc9T_4ScR6lG3xVxs8P8,3346
133
+ tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
134
+ tests/advanced_tools_example.py,sha256=7nlFLRVMVYzDz_CAdgQa9fJu7o0M6jBMo7PTvUsRbo0,9065
135
+ tests/authenticated_proxy_example.py,sha256=gebJn8x_haztbFbizAL5CUosEAlNRsQhnmD-jV0glDk,2864
136
+ tests/cleaners_example.py,sha256=J6rT4rTbNzeN2YWf7IfLVwCGm3-UcSxE4LhH5AV-CE0,5164
137
+ tests/config_validation_demo.py,sha256=5MzW5P7ZX6xoMW_zC6XmIA50KWMTu0iB5H2hTe42Sb8,4029
138
+ tests/controlled_spider_example.py,sha256=SP_k4mdKPvD1JCPs9UCm68jcy2Frg84vvXv9-14RC6I,7776
139
+ tests/date_tools_example.py,sha256=x_-duqnVZ-Hrk-SaNplIfcIV6W3c6u6MTxW35u1i0F0,4862
140
+ tests/debug_pipelines.py,sha256=VpUmoYlt6Ci7foIGuQIotUu42xp6TzoA1cBDeagBzDk,2098
141
+ tests/dynamic_loading_example.py,sha256=NI0SCg4lPME0RCcNpDDw1HjErjmCgJntCN0ahAEw61g,18263
142
+ tests/dynamic_loading_test.py,sha256=DYbMrEewerx0VGXixci3p9VYgDDQvCPevA92CNjq1Jo,3309
143
+ tests/env_config_example.py,sha256=sKE8DvMBhM3uy439LpgLHd4wF7MGUrUc-X6E7g9qsz0,4818
144
+ tests/error_handling_example.py,sha256=goF8fnTXxU3CgHcX4ALEcidVPd-zACn2tDqqQislRPA,5123
145
+ tests/redis_key_validation_demo.py,sha256=FxqEXRgJllkgjyIyEuegQrLDuXAvi9N-dfMlvFotRZ4,4337
146
+ tests/request_params_example.py,sha256=bjHxK_ca6UO7kBff88nmoxXY1odiLQCGC36okjEi7gM,4100
147
+ tests/response_improvements_example.py,sha256=wnYGJO6MKj5_jbwKLDlbXu_Dli5XC7vlWdzByi82_5Y,5258
148
+ tests/test_advanced_tools.py,sha256=3R8EfKVyBHEb6FA5TP3ieaWeHZhobVgSx8t3phipCrE,5250
149
+ tests/test_all_redis_key_configs.py,sha256=SGoip8M7oB2LNWC_31aJ4ECcDRmx0psr7i7DGzuaH7c,5565
150
+ tests/test_authenticated_proxy.py,sha256=s4pr5JyBTHYQgRq_IymiVKE08vyW1MwR27pSwrrVLVk,4198
151
+ tests/test_cleaners.py,sha256=UD-X_eLnQic6GYbtFzYnAKqG4XKOSGIDd1X2fAl7Jso,1762
152
+ tests/test_comprehensive.py,sha256=kGNcJ9UkQxysYqvsBu0YxAaPleOvN9_hztLy7ljkfc4,5036
153
+ tests/test_config_consistency.py,sha256=DJaAQxGL7RXHs-DWF_B4yhHFGSGHWHUoDmLFiMi4aJg,1921
154
+ tests/test_config_merge.py,sha256=d8i8sU1XKS3egNKEYPZ2a6CBnJRx2M3p6q04wYufAcw,5454
155
+ tests/test_config_validator.py,sha256=5ivB71KstHGNi2BPzcclf9hBukXEgt_B8N4l1HRjBFc,6020
156
+ tests/test_crawlo_proxy_integration.py,sha256=_L62_soaHRYy_0fShjiZSmv-RtGICw7_kzhTNRoyFfc,2620
157
+ tests/test_date_tools.py,sha256=CQdAmIS6bpAdwQH9ETDH__06l2gGL7EHUQuh7mdTF-A,3930
158
+ tests/test_default_header_middleware.py,sha256=7kpONSsGMsmWgTX2pCpseme54_-82Baak0xVz6gclJk,5845
159
+ tests/test_distributed.py,sha256=RQHUpDfRNG2x_1Cdr9DLk25IBcgapm_u0xSBMObE0Xc,1725
160
+ tests/test_double_crawlo_fix.py,sha256=ZNkRDgWW2WN-QRNZhvIgTHonY-T_U_R_MOIBLuyJd_I,7770
161
+ tests/test_double_crawlo_fix_simple.py,sha256=MlWUqo51kOQ7Gu6Neoler8FVyRs0jpmQWoORHMBENz0,4644
162
+ tests/test_download_delay_middleware.py,sha256=Va79gsH_8BVrVVLA8gSwFEbrRJ7qwJMCC1cDJN6il_0,8886
163
+ tests/test_downloader_proxy_compatibility.py,sha256=3Jn7RJd1R2ywuitHp2Jju1yYNg57R4QmKwjuHGojDUE,8635
164
+ tests/test_dynamic_downloaders_proxy.py,sha256=PtEW-pnVijeX2yX34UcoXYEY23yTBxb-kyNYh-WDljQ,4326
165
+ tests/test_dynamic_proxy.py,sha256=YL2sghNKG7k27-SaHMh4boNLVBHhfSttUwUqiSsOEX4,3080
166
+ tests/test_dynamic_proxy_config.py,sha256=uYXZ804ULI9qYMF-uNjMbi3L_NGzoMqLJcEZAl7aZ2I,5707
167
+ tests/test_dynamic_proxy_real.py,sha256=DTjP8JnSwBnNZ3Ls1BjDAmt6xSuye_6CxwZ4LBisPTM,3402
168
+ tests/test_edge_cases.py,sha256=4XZIUPOtNM9WCoAV1dJYAK8T6NiWp18rcwLLwnpxILE,10426
169
+ tests/test_enhanced_error_handler.py,sha256=YYKyjT9ARcIcyKDOObaQTws18HfsHN923BOTAzaxYF8,8311
170
+ tests/test_env_config.py,sha256=nfP4nCG9ZHeJUfxo1JKUmiihYbhSeWx_oNW5mMfDHfQ,4746
171
+ tests/test_error_handler_compatibility.py,sha256=o5JLLLdo25Sl_3hpMx6I2fqSgZFAcnI4E6Ci-KxAxwA,4129
172
+ tests/test_final_validation.py,sha256=aAiWLzhDCcv-GEXg9sauaVIfq5rz3s2vm67Gk2_lmBI,4813
173
+ tests/test_framework_env_usage.py,sha256=HYpTwORXeaJHMffCYAGHGvc_a6ax4lo28xP8BYOaKxk,4098
174
+ tests/test_integration.py,sha256=OCkjyv76Wop7CrXEko6rfoDsIK6SESA18KgCaTwL7Q4,4670
175
+ tests/test_item_dedup_redis_key.py,sha256=QxLuXHUx0xqT6y7lQzOWcrLkRui7Qs7C6NgRvjzIypA,3720
176
+ tests/test_mode_consistency.py,sha256=X12X4496OoepOkRLz5OkJcJfFUeChnP9TiRWcR2J5p4,1175
177
+ tests/test_offsite_middleware.py,sha256=L5YT9ZqcQwBunUv0Ddj-sLZcW4IMlAlgaJCwICHFWxI,7543
178
+ tests/test_parsel.py,sha256=KYskaN_4HBc1XDTltjVo12v1i7JAThB2UIwcWZ-mwbY,672
179
+ tests/test_performance.py,sha256=gOJ1EpU9uGynIxETLAroe98OA4QPcX1wchCDJoO41Kc,11130
180
+ tests/test_proxy_api.py,sha256=dVqGElyL3K0_9IqkXzn7Ka2jSuhvYfR1BfZgyVukNM0,10749
181
+ tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
182
+ tests/test_proxy_middleware.py,sha256=qm2B0lepBZqzUpXNi4t1gjrQxUV4MQ2wvpmcaYV6O5A,3900
183
+ tests/test_proxy_middleware_enhanced.py,sha256=YQRZs4bniU2CR-eKRc-sS4zZ6cjdSHwijUWL0T4Tq1w,6819
184
+ tests/test_proxy_middleware_integration.py,sha256=zcl7fR9Toc-I-stSUTzKZPwcfh3kgrpjI5SbkZ6AVmE,4305
185
+ tests/test_proxy_middleware_refactored.py,sha256=QiV9OodRb6hUcPnjDs-jraV8hlBBVLsUJE04geWHoD8,6776
186
+ tests/test_proxy_providers.py,sha256=XwWZCywTYguSsUxSm6fsbaoH1p9dKjqSIx9-sqKZehA,1693
187
+ tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
188
+ tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
189
+ tests/test_queue_manager_double_crawlo.py,sha256=YzM6PnoyRSST-f2NVyI97bpPcoYWL06HUwf08Fyx3Qg,6784
190
+ tests/test_queue_manager_redis_key.py,sha256=nCCMnpKPNP5fyd4zb4LG2kmJAUcLoa8ODhBGcz4GcCU,6231
191
+ tests/test_random_user_agent.py,sha256=LuyR8WaKfqOap9WBQl4WEBcZDmKxhW80T-_wXbuo2Qw,2230
192
+ tests/test_real_scenario_proxy.py,sha256=LGtxEvCiTgn6aTPGd7ZuqaCjApsjosD2DunJrd8-jFE,8259
193
+ tests/test_redis_config.py,sha256=DBrqURBQt517Rt1h1l2iIKrKDfbkJzQSRUEYYbapcy4,875
194
+ tests/test_redis_connection_pool.py,sha256=WIUQlI6K3IINan14vknI4oFf9a8wpHCWi87KSfoB7_E,9034
195
+ tests/test_redis_key_naming.py,sha256=7_X_PSzFQn5m0n_7qLlCjFvY4ZKScC36cqWFu1PAFRw,6730
196
+ tests/test_redis_key_validator.py,sha256=VFuawmaA0G7VSHueCvZEQNKY-L2IdDGlEcyuJ9nZu7Q,4295
197
+ tests/test_redis_queue.py,sha256=2OZJHn5fN9b6XEgEs4Ht1AL6TOJ_H-IR9JxPzzvqMpg,6534
198
+ tests/test_request_ignore_middleware.py,sha256=8_2E6JU27eOWI3iHeh3YscLnp3SIHaubWdA477Ki6PE,6047
199
+ tests/test_request_params.py,sha256=9vNksaOrFbuSb0UffruPxUHhJXZxVYyjQw9J69FSzH8,4176
200
+ tests/test_request_serialization.py,sha256=TPBIzjaifcAjFWCFSFZ5ewRn814jSGPL28MGTwvrr_w,2262
201
+ tests/test_response_code_middleware.py,sha256=EAPHsNN3J4ShJ5UfpzcZpOPjDvYgYJgcpNxpRgpQBaE,11844
202
+ tests/test_response_filter_middleware.py,sha256=yVXl7LYxKxziGQg-YpdbVc6CVhmfmNOAZ86SaEVfMyI,15900
203
+ tests/test_response_improvements.py,sha256=zvbkTkWhgdlFYtRu_ckgq6wGDGwpe_PTECYqpLDM3BU,5876
204
+ tests/test_retry_middleware.py,sha256=g7QPDDLaX7hu3jqlWbDsjfuCbQXisTvs7YrcMLmn1Hw,7856
205
+ tests/test_scheduler.py,sha256=j1t-ItXkCHZpoLI4T_2SFzl0Wbcg81LYeOOfnjsYKuk,7930
206
+ tests/test_scheduler_config_update.py,sha256=eVE9lr7pTHlPCra9rtDge5rp4csiY3veYlG43qc34Fg,4128
207
+ tests/test_simple_response.py,sha256=6RYOBRzAtyNvJ9a5JVTNubM-rvxnuX8jQOvq3sUZxwo,1488
208
+ tests/test_telecom_spider_redis_key.py,sha256=hL_gjLJQbv6Tnli6Id7jXJcP-bgIQzhuWLUTBok6iv0,7420
209
+ tests/test_template_content.py,sha256=q3OYI26e-qpmCYb1qmg5qg0kl7A8BhYbK2QjvKGaS0o,2816
210
+ tests/test_template_redis_key.py,sha256=wJGAgWGO3hpSWoAUHHpBexXF7J2UP_tM6Z_PBjJl96Q,4742
211
+ tests/test_tools.py,sha256=9t9FXZ61MfdB70nck9NYzCq97yd3SLVlLiMybEAlClk,5345
212
+ tests/test_user_agents.py,sha256=rUotyuE2iJDi2LQBrUh980U-dAMTs4ARPMJxICOoQFY,3231
213
+ tests/tools_example.py,sha256=MtIypR-OFiWwi-skurwmq4fM0cGTt-GUX4hSekYs7BY,7739
214
+ tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3928
215
+ crawlo-1.3.0.dist-info/METADATA,sha256=5BRT0EE3J1yUtWZ0l_pZqEWxTgGA1p3laxJjTSu7980,26298
216
+ crawlo-1.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
217
+ crawlo-1.3.0.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
218
+ crawlo-1.3.0.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
219
+ crawlo-1.3.0.dist-info/RECORD,,
examples/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- # @Time : 2025-02-05 12:36
5
- # @Author : oscar
6
- # @Desc : None
7
- """
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ # @Time : 2025-02-05 12:36
5
+ # @Author : oscar
6
+ # @Desc : None
7
+ """
@@ -1,82 +1,82 @@
1
- # 双重 crawlo 前缀问题修复报告
2
-
3
- ## 问题描述
4
- 用户在使用分布式爬虫时发现Redis key中出现了双重`crawlo`前缀,例如`crawlo:crawlo:queue:processing:data`。这导致了Redis key命名不一致和潜在的混淆问题。
5
-
6
- ## 问题分析
7
- 经过代码分析,发现问题出在以下两个方面:
8
- 1. RedisPriorityQueue类在处理队列名称时会自动修改用户提供的队列名称
9
- 2. QueueManager类在提取项目名称时没有正确处理双重`crawlo`前缀的情况
10
-
11
- ## 修复方案
12
-
13
- ### 1. RedisPriorityQueue类修复
14
- 文件:`crawlo/queue/redis_priority_queue.py`
15
-
16
- **修复前**:
17
- ```python
18
- # 如果提供了 queue_name,确保符合命名规范
19
- # 处理可能的重复前缀问题
20
- if queue_name.startswith("crawlo:crawlo:"):
21
- # 修复双重 crawlo 前缀
22
- self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
23
- elif not queue_name.startswith("crawlo:"):
24
- # 如果没有 crawlo 前缀,添加它
25
- self.queue_name = f"crawlo:{module_name}:queue:requests"
26
- else:
27
- # 已经有正确的 crawlo 前缀
28
- self.queue_name = queue_name
29
- ```
30
-
31
- **修复后**:
32
- ```python
33
- # 保持用户提供的队列名称不变,不做修改
34
- self.queue_name = queue_name
35
- ```
36
-
37
- ### 2. QueueManager类修复
38
- 文件:`crawlo/queue/queue_manager.py`
39
-
40
- **修复后**:
41
- ```python
42
- # 处理可能的双重 crawlo 前缀
43
- if parts[0] == "crawlo" and parts[1] == "crawlo":
44
- # 双重 crawlo 前缀,取第三个部分作为项目名称
45
- if len(parts) >= 3:
46
- project_name = parts[2]
47
- else:
48
- project_name = "default"
49
- elif parts[0] == "crawlo":
50
- # 正常的 crawlo 前缀,取第二个部分作为项目名称
51
- project_name = parts[1]
52
- else:
53
- # 没有 crawlo 前缀,使用第一个部分作为项目名称
54
- project_name = parts[0]
55
- ```
56
-
57
- ## 测试验证
58
-
59
- ### 测试1:Redis队列命名修复测试
60
- 验证RedisPriorityQueue正确处理各种队列名称格式:
61
- - 正常命名:`crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
62
- - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
63
- - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
64
-
65
- ### 测试2:队列管理器项目名称提取测试
66
- 验证QueueManager正确提取项目名称:
67
- - 正常命名:`crawlo:test_project:queue:requests` → `test_project`
68
- - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `queue`
69
- - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo`
70
-
71
- ### 测试3:队列管理器创建队列测试
72
- 验证整个流程的正确性,确保队列名称在传递过程中保持一致。
73
-
74
- 所有测试均已通过,表明双重`crawlo`前缀问题已得到解决。
75
-
76
- ## 结论
77
- 通过以上修复,我们成功解决了Redis key中出现双重`crawlo`前缀的问题。现在Redis队列名称将保持用户配置的一致性,processing和failed队列也会相应地保持相同的前缀结构。
78
-
79
- ## 建议
80
- 1. 建议用户在项目配置中使用标准的队列名称格式,如`crawlo:{project_name}:queue:requests`
81
- 2. 可以使用Redis key验证工具定期检查和规范Redis key命名
1
+ # 双重 crawlo 前缀问题修复报告
2
+
3
+ ## 问题描述
4
+ 用户在使用分布式爬虫时发现Redis key中出现了双重`crawlo`前缀,例如`crawlo:crawlo:queue:processing:data`。这导致了Redis key命名不一致和潜在的混淆问题。
5
+
6
+ ## 问题分析
7
+ 经过代码分析,发现问题出在以下两个方面:
8
+ 1. RedisPriorityQueue类在处理队列名称时会自动修改用户提供的队列名称
9
+ 2. QueueManager类在提取项目名称时没有正确处理双重`crawlo`前缀的情况
10
+
11
+ ## 修复方案
12
+
13
+ ### 1. RedisPriorityQueue类修复
14
+ 文件:`crawlo/queue/redis_priority_queue.py`
15
+
16
+ **修复前**:
17
+ ```python
18
+ # 如果提供了 queue_name,确保符合命名规范
19
+ # 处理可能的重复前缀问题
20
+ if queue_name.startswith("crawlo:crawlo:"):
21
+ # 修复双重 crawlo 前缀
22
+ self.queue_name = queue_name.replace("crawlo:crawlo:", "crawlo:", 1)
23
+ elif not queue_name.startswith("crawlo:"):
24
+ # 如果没有 crawlo 前缀,添加它
25
+ self.queue_name = f"crawlo:{module_name}:queue:requests"
26
+ else:
27
+ # 已经有正确的 crawlo 前缀
28
+ self.queue_name = queue_name
29
+ ```
30
+
31
+ **修复后**:
32
+ ```python
33
+ # 保持用户提供的队列名称不变,不做修改
34
+ self.queue_name = queue_name
35
+ ```
36
+
37
+ ### 2. QueueManager类修复
38
+ 文件:`crawlo/queue/queue_manager.py`
39
+
40
+ **修复后**:
41
+ ```python
42
+ # 处理可能的双重 crawlo 前缀
43
+ if parts[0] == "crawlo" and parts[1] == "crawlo":
44
+ # 双重 crawlo 前缀,取第三个部分作为项目名称
45
+ if len(parts) >= 3:
46
+ project_name = parts[2]
47
+ else:
48
+ project_name = "default"
49
+ elif parts[0] == "crawlo":
50
+ # 正常的 crawlo 前缀,取第二个部分作为项目名称
51
+ project_name = parts[1]
52
+ else:
53
+ # 没有 crawlo 前缀,使用第一个部分作为项目名称
54
+ project_name = parts[0]
55
+ ```
56
+
57
+ ## 测试验证
58
+
59
+ ### 测试1:Redis队列命名修复测试
60
+ 验证RedisPriorityQueue正确处理各种队列名称格式:
61
+ - 正常命名:`crawlo:test_project:queue:requests` → `crawlo:test_project:queue:requests`
62
+ - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `crawlo:crawlo:queue:requests`
63
+ - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo:crawlo:crawlo:queue:requests`
64
+
65
+ ### 测试2:队列管理器项目名称提取测试
66
+ 验证QueueManager正确提取项目名称:
67
+ - 正常命名:`crawlo:test_project:queue:requests` → `test_project`
68
+ - 双重 crawlo 前缀:`crawlo:crawlo:queue:requests` → `queue`
69
+ - 三重 crawlo 前缀:`crawlo:crawlo:crawlo:queue:requests` → `crawlo`
70
+
71
+ ### 测试3:队列管理器创建队列测试
72
+ 验证整个流程的正确性,确保队列名称在传递过程中保持一致。
73
+
74
+ 所有测试均已通过,表明双重`crawlo`前缀问题已得到解决。
75
+
76
+ ## 结论
77
+ 通过以上修复,我们成功解决了Redis key中出现双重`crawlo`前缀的问题。现在Redis队列名称将保持用户配置的一致性,processing和failed队列也会相应地保持相同的前缀结构。
78
+
79
+ ## 建议
80
+ 1. 建议用户在项目配置中使用标准的队列名称格式,如`crawlo:{project_name}:queue:requests`
81
+ 2. 可以使用Redis key验证工具定期检查和规范Redis key命名
82
82
  3. 如果需要统一的命名规范,可以在项目初始化时明确指定队列名称
tests/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- """
4
- # @Time : 2025-08-24 12:36
5
- # @Author : crawl-coder
6
- # @Desc : None
7
- """
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ """
4
+ # @Time : 2025-08-24 12:36
5
+ # @Author : crawl-coder
6
+ # @Desc : None
7
+ """