crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (68)
  1. crawlo/__init__.py +2 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/genspider.py +68 -42
  4. crawlo/commands/list.py +102 -93
  5. crawlo/commands/startproject.py +89 -4
  6. crawlo/commands/utils.py +187 -0
  7. crawlo/config.py +280 -0
  8. crawlo/core/engine.py +16 -3
  9. crawlo/core/enhanced_engine.py +190 -0
  10. crawlo/core/scheduler.py +113 -8
  11. crawlo/crawler.py +840 -307
  12. crawlo/downloader/__init__.py +181 -17
  13. crawlo/downloader/aiohttp_downloader.py +15 -2
  14. crawlo/downloader/cffi_downloader.py +11 -1
  15. crawlo/downloader/httpx_downloader.py +14 -3
  16. crawlo/filters/__init__.py +122 -5
  17. crawlo/filters/aioredis_filter.py +128 -36
  18. crawlo/filters/memory_filter.py +99 -32
  19. crawlo/middleware/proxy.py +11 -8
  20. crawlo/middleware/retry.py +40 -5
  21. crawlo/mode_manager.py +201 -0
  22. crawlo/network/__init__.py +17 -3
  23. crawlo/network/request.py +118 -10
  24. crawlo/network/response.py +131 -28
  25. crawlo/pipelines/__init__.py +1 -1
  26. crawlo/pipelines/csv_pipeline.py +317 -0
  27. crawlo/pipelines/json_pipeline.py +219 -0
  28. crawlo/queue/__init__.py +0 -0
  29. crawlo/queue/pqueue.py +37 -0
  30. crawlo/queue/queue_manager.py +304 -0
  31. crawlo/queue/redis_priority_queue.py +192 -0
  32. crawlo/settings/default_settings.py +68 -9
  33. crawlo/spider/__init__.py +576 -66
  34. crawlo/task_manager.py +4 -1
  35. crawlo/templates/project/middlewares.py.tmpl +56 -45
  36. crawlo/templates/project/pipelines.py.tmpl +308 -36
  37. crawlo/templates/project/run.py.tmpl +239 -0
  38. crawlo/templates/project/settings.py.tmpl +211 -17
  39. crawlo/templates/spider/spider.py.tmpl +153 -7
  40. crawlo/utils/controlled_spider_mixin.py +336 -0
  41. crawlo/utils/large_scale_config.py +287 -0
  42. crawlo/utils/large_scale_helper.py +344 -0
  43. crawlo/utils/queue_helper.py +176 -0
  44. crawlo/utils/request_serializer.py +220 -0
  45. crawlo-1.1.2.dist-info/METADATA +567 -0
  46. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/RECORD +54 -46
  47. tests/test_final_validation.py +154 -0
  48. tests/test_redis_config.py +29 -0
  49. tests/test_redis_queue.py +225 -0
  50. tests/test_request_serialization.py +71 -0
  51. tests/test_scheduler.py +242 -0
  52. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  53. crawlo/utils/pqueue.py +0 -174
  54. crawlo-1.1.1.dist-info/METADATA +0 -220
  55. examples/baidu_spider/__init__.py +0 -7
  56. examples/baidu_spider/demo.py +0 -94
  57. examples/baidu_spider/items.py +0 -46
  58. examples/baidu_spider/middleware.py +0 -49
  59. examples/baidu_spider/pipeline.py +0 -55
  60. examples/baidu_spider/run.py +0 -27
  61. examples/baidu_spider/settings.py +0 -121
  62. examples/baidu_spider/spiders/__init__.py +0 -7
  63. examples/baidu_spider/spiders/bai_du.py +0 -61
  64. examples/baidu_spider/spiders/miit.py +0 -159
  65. examples/baidu_spider/spiders/sina.py +0 -79
  66. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  67. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  68. {crawlo-1.1.1.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,35 +1,39 @@
- crawlo/__init__.py,sha256=zOGI9hnWawIl0QA0Hnjmqo7vfd-WXNevOP9nLCq04XA,811
- crawlo/__version__.py,sha256=q8_5C0f-8mHWNb6mMw02zlYPnEGXBqvOmP3z0CEwZKM,22
+ crawlo/__init__.py,sha256=esOolburYDjtF43D5N9Kh6TSQW2yKcz888ilhBSinBc,825
+ crawlo/__version__.py,sha256=5SgGjThsHu_ITn8V83BvCziqCwxdXxTQqcC3KQMHPfM,22
  crawlo/cli.py,sha256=CtR2Pfa7SyRxEKPaXqt-6E6K5Vq5z3rfdAI95UO4cbU,1166
- crawlo/crawler.py,sha256=xwViGsJutKjAvfrYlMUd0NQKQtBX2r5qNMvpWkujxTs,19558
+ crawlo/config.py,sha256=i0Amz6wNPgv_aVcdCBRRlcwuZLSa87cH9OEmTQvB97Q,8329
+ crawlo/crawler.py,sha256=v6i5tjgSOtbMoqiw1qdgKx1cY4kcVcd5l5bUTWtJNNU,36461
  crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
  crawlo/exceptions.py,sha256=pthF1lJlJHyRZm-mE6NAo5WzK3GYJqmRqIuIlK1Odx8,1129
+ crawlo/mode_manager.py,sha256=WIxrq9S3EAH0D71LH1AxvcqXomeABqoXgtUN4A--DKY,6702
  crawlo/project.py,sha256=xWN2eTAjf_Pza-wWvvV4JjScQRWxe9hXlztX81ccUMc,5182
  crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
  crawlo/subscriber.py,sha256=3d4eYtkSgPj-18-mTZM6RQLSil-ux13FUcmfFxr3sMk,3730
- crawlo/task_manager.py,sha256=AS7Xu_8Q_eb3jg9QSkK_wv6W1rRXaI6WjDp8p6h9ltU,721
+ crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
  crawlo/commands/__init__.py,sha256=AMYjXG7ulE8dPVmgWVo0uqXsaCYUUZYmmu2-7kFzH1M,342
  crawlo/commands/check.py,sha256=172OiAxnX5wwSlszUsyPgMZwAoIbGDTdfhtRz309ilc,22843
- crawlo/commands/genspider.py,sha256=In7X463vFCDhow73Netb4S1Vug_E0VC-1sevDc5nmSA,4267
- crawlo/commands/list.py,sha256=P4O9zma8RA0061B2t8hRgz6FtRzINJRPbeWH7m6TZRg,5091
+ crawlo/commands/genspider.py,sha256=-jGJdfXLsefX_H1ydQ2wirdu6p6wmhClzVXY_0L-1aE,5050
+ crawlo/commands/list.py,sha256=yByqQeZBgvjewOKxpnOobpeJ7Hnbs-CWsoyITqZu2ZY,5781
  crawlo/commands/run.py,sha256=8Qngjsl8Q4RBdO39a__wKGsheY2PFuPit2hds_jwEbM,10524
- crawlo/commands/startproject.py,sha256=lu7_E_ygnM-S5LsViuHoaCdJubFGIY5ecWLL-g3R8A8,3953
+ crawlo/commands/startproject.py,sha256=bzNgpkKzUEggY2m7Iw810mSPe8wOPFBqSCO0jZX3z_g,7138
  crawlo/commands/stats.py,sha256=6pAgkEi8MBnCer2rWmKpaTYr1jaM6HeMG9owAvEzJyY,6064
+ crawlo/commands/utils.py,sha256=nohMvUU2zLvX0XzXk6KeCNxP0EvSWj9DiVLxM_7tD5o,5106
  crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
- crawlo/core/engine.py,sha256=xBYi-V1O3IfZU9Qo1TJynOpoMjdP_h8kmHC4iDbVfwE,5868
+ crawlo/core/engine.py,sha256=8Dcew1XyxChW5Fz1wFEWpJlPrQb2hKDWKul8e61S-Q0,6662
+ crawlo/core/enhanced_engine.py,sha256=9I9Uxdy2oAz8zDGTzEiytuKu__VDVmIN8zwZKfrD8bw,6254
  crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
- crawlo/core/scheduler.py,sha256=oxVzdXvU0CDRfUFkv2MP0NvTRfTGbcvoz-__dNy_rzU,1885
- crawlo/downloader/__init__.py,sha256=ukrDBULCaoDWoMLCO3XcQhDoasF0oUzj0PHnJ_ACJaE,2306
- crawlo/downloader/aiohttp_downloader.py,sha256=ZxeKvWyAYzKCwu_yRfduX0hSwU3TEqvhBQaB9UBpzNE,7476
- crawlo/downloader/cffi_downloader.py,sha256=j_LHvGxrQ9Ynod9RYp79vPgHlwKPPNc6NH8cN5hk-pk,10052
- crawlo/downloader/httpx_downloader.py,sha256=fuienQkkn2yQ9R4RlvrxxWqX7OlChJSBEP11-5wNboY,11508
+ crawlo/core/scheduler.py,sha256=iNZl47PvurnLLXRZN8Jfj1un6YP2q9m5n56X3Eb7_A8,5615
+ crawlo/downloader/__init__.py,sha256=tl0mE54reR-PuJYSsXsKP2VY5uzvq4lITxZwKKjNzPs,7663
+ crawlo/downloader/aiohttp_downloader.py,sha256=UKupGYPOWrscAVsjhFgKYElTa9tbEeltqV7nuWqjIeE,8005
+ crawlo/downloader/cffi_downloader.py,sha256=-GVfSIhi1Ip56suSiGf8jnUE2EBF1P56vw0uxLh_T6I,10440
+ crawlo/downloader/httpx_downloader.py,sha256=FJcdE3BoxUYFQDPm3quaveSjweAKfAQRaeJm1hQViLg,12003
  crawlo/extension/__init__.py,sha256=PEBbMxi6ULBZ9ivEJ4T7IH_R376BQJ6Rz_D9Ce-Cqbs,1133
  crawlo/extension/log_interval.py,sha256=S-hSoiz9GdmgHrac4vDQ52fleoBcH-kzdPUD8YRAons,1922
  crawlo/extension/log_stats.py,sha256=VinABjzuFa-JmNXSX86gQ_3Oyx1y3vUhpWTEu9HRRqg,1677
  crawlo/extension/logging_extension.py,sha256=Jce2HHc9ejsIWQitkHZuhnLbT_1MIfcvywfzYkkJ5eY,1202
- crawlo/filters/__init__.py,sha256=BCZl86BHiTfDGRe_b1TlNSr6pfNbMKTu0Uq0j4gX_1Q,977
- crawlo/filters/aioredis_filter.py,sha256=tlkKe7pdXxFEjFBl8UN07JHn_QbOJNY3y3BRj6uFXpg,5471
- crawlo/filters/memory_filter.py,sha256=bs2WUe7CdHiXgr344vzDqMfBv1b3RwXJMnwxpDb64Pw,6639
+ crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
+ crawlo/filters/aioredis_filter.py,sha256=3vbPOY3L_H7TUgL8K6AtaP6QDtoIg5xpE70Kxpil6dM,8376
+ crawlo/filters/memory_filter.py,sha256=VJO0UFRYGxmV8dj4G1subsQ-FtvPcGLbvd7IVtqXnOs,9260
  crawlo/items/__init__.py,sha256=bqekZrRlDhxfWie0UbCs656TptYseoe9QJ67I4E7Elk,386
  crawlo/items/base.py,sha256=tAYrPJgblp3ZEihDXvappdYc6pGdim6x2_9QSmMKI2o,577
  crawlo/items/fields.py,sha256=wMlakQTsEwyrlLzMt1gI4pScLQZMqd3E1xcfH4dbSqk,1801
@@ -38,63 +42,67 @@ crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0
  crawlo/middleware/default_header.py,sha256=i_Uj07JObyeZFxL7ZAZmvZsHvA1HGtkNab1sA0d-nWI,1067
  crawlo/middleware/download_delay.py,sha256=2M-TchDA7MwyTfYy0Hzh_bW9wlHlpiP-oQlys7crTj0,966
  crawlo/middleware/middleware_manager.py,sha256=j1hkWRFB5rnC5SnB7oXWE5eUNv8blS9krDIDM5fIDs8,6213
- crawlo/middleware/proxy.py,sha256=3msCHiPgaaKt6vRjnNswkwSzUFrO4a8OgL9BeV42ySg,9555
+ crawlo/middleware/proxy.py,sha256=m2ZZ50En9hUtgrqSqA6hItGT74xMqccHFPhZshutIco,9811
  crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
  crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
  crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
- crawlo/middleware/retry.py,sha256=6I0FXJ4zCnJ-pMBxoc8Cg4OVCcH4Buyfoy6ivSu9bro,3369
- crawlo/network/__init__.py,sha256=VaD0GmsgDYJ8UMgrtjeOc1Wc7lDGee1uAF3neRpyug0,123
- crawlo/network/request.py,sha256=JD4p9e0osjAFb40Ux57NVJliUe7NQ6_IxoMwj4faczs,7044
- crawlo/network/response.py,sha256=_oaElq7_Y-5gz2k8lhfZIss7hxFh6dJ77OpuXZXr9oI,6034
- crawlo/pipelines/__init__.py,sha256=Hk-M6X0VCGLp6OEdgnhXGhGhKS5TjKf6dkg8bU9pvUE,260
+ crawlo/middleware/retry.py,sha256=pH4fdW71DwQzMY7APutX7yqtvSTAoznYeM3ux2EOT7E,4146
+ crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,397
+ crawlo/network/request.py,sha256=5bzXJJWcAgGngNos-k4eIh9QaJdnGw0_SLgdrNt_0-4,11103
+ crawlo/network/response.py,sha256=M0iG8ggcEpFzN7aAinKwVbxHAhANLYj8Zp4-FwaTpYo,9812
+ crawlo/pipelines/__init__.py,sha256=Tvs9FQ_2qz8GcmoEqLrz_YFZrRpL3gGIssiI5Ce9b5I,254
  crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
+ crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
+ crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
  crawlo/pipelines/mongo_pipeline.py,sha256=lv-Zn_mWdE_jVy7Nh30Lzqm3YhtLRV5rMy-m4rBWYe0,4442
- crawlo/pipelines/mysql_batch_pipline.py,sha256=C7EPzAluY-Kplc6MB2UYLLDdahYsnZMZf3cxQ5vloeQ,10339
  crawlo/pipelines/mysql_pipeline.py,sha256=6g6PGTwAyzH5VStlPAg2SdG5t1lPw1Zu-cY7x6Mz16s,7861
  crawlo/pipelines/pipeline_manager.py,sha256=VrbebOYiqrobtKhp5II18w-odCICdWkmRg5WPK0Emz4,2112
+ crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
+ crawlo/queue/queue_manager.py,sha256=OQ9YBbc9Y4uE_5WbQhe233FFdhaeiOhDVbqWF7ev42U,11303
+ crawlo/queue/redis_priority_queue.py,sha256=hf7hiIiRS-2DIq1bIJbs1yLEZjuMnHwc-n2_I7PlE9A,7294
  crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
- crawlo/settings/default_settings.py,sha256=BDpjZkPYQhuutxZomWYS6WvBHomKfHdNVSvTmyz1bKY,7100
+ crawlo/settings/default_settings.py,sha256=jRM_Cqt3tQ7V4mgqBTAIQY3BcfGsLkKdvW1TcbWHv48,9627
  crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
- crawlo/spider/__init__.py,sha256=kNBJ02QB0EZcGvzMH6GYh5UssI1HzOREIHz3FVo8LOA,3854
+ crawlo/spider/__init__.py,sha256=Z_rK23l5yt-DuwJPg8bcqodM_FIs4-iHLaKOimGumcE,20452
  crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
  crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
  crawlo/templates/project/items.py.tmpl,sha256=3h-4nuneUoCAGUzSLyLNsdgeAAUXzz4rV0QctzCkIJA,301
- crawlo/templates/project/middlewares.py.tmpl,sha256=oy5RCkRqLeBpvtYA1NGVlbYfl8rWuS3VDgQSx5opnHQ,2102
- crawlo/templates/project/pipelines.py.tmpl,sha256=unEag7_qMN5DiffIzTKRmTVUCiTkHGCEad-jRQ6eH4I,1817
- crawlo/templates/project/settings.py.tmpl,sha256=By0YaZPxAP0pha_oY7J-vm9ntuvEfJO1jTiUhNddGic,1716
+ crawlo/templates/project/middlewares.py.tmpl,sha256=Ua2vG3WXliMBSmwsbYaSwzx2lZPw9vLrG2dLqtZWGRg,3157
+ crawlo/templates/project/pipelines.py.tmpl,sha256=-dxK7T396AVTbh0FhivLDZq4lKxw3p1-5Y6Ij4jgImE,12041
+ crawlo/templates/project/run.py.tmpl,sha256=RQ_k_B-Pp1R853-N_58yUGjIouXa6ipBtUPQenLjswE,7519
+ crawlo/templates/project/settings.py.tmpl,sha256=NxPvXGnomTV5NkczPIC7Tscn2vjmHUE4CYmbQB2SmDQ,7884
  crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
- crawlo/templates/spider/spider.py.tmpl,sha256=nyFMfk6qsmwfGuuNEcu6N6VnrOtq1GHVcoJOYRHrZFA,838
+ crawlo/templates/spider/spider.py.tmpl,sha256=Wx1yMBsqDQvJGb9HoEVecX6XZcVrJiBj9blXYrernTo,6167
  crawlo/utils/__init__.py,sha256=BDORpyjMN7VGPKImnCDKSkprS-petgD7ezc9rMlBvb0,123
+ crawlo/utils/controlled_spider_mixin.py,sha256=BpH76XxBJtYQ0rmCX6vrG-8gQXT9-LP65ibetzfqncU,12500
  crawlo/utils/date_tools.py,sha256=0yG0tzGb1VFgWDJJ_cow2LJfz3kj_w2MqSjmfKKESl8,6961
  crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,10559
  crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
+ crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
+ crawlo/utils/large_scale_helper.py,sha256=JJqcGSI6VaVe3MSL6IWjmCp8XQIu6T4U-BvBLSttr_s,12157
  crawlo/utils/log.py,sha256=A3lPyhD8kD88cV23KOL-_eT8g69xGQ5L1toDB2AO0mc,4005
- crawlo/utils/pqueue.py,sha256=4Ymkm38fRFqEcSJeD_ULkuBaCk6QdYvJdnYvtJjh-Tk,5386
+ crawlo/utils/queue_helper.py,sha256=xpUUTOqlU1xz2Pb9NKAVGo3AfAO-7Xvx8Lm1q65Dgck,4743
  crawlo/utils/request.py,sha256=yoLB2rY8d78vgPjIWpdhY5SalIKjyLIvTG_UH6EMdVI,8798
+ crawlo/utils/request_serializer.py,sha256=bPoSQqE2ksiMyP3WiPB3w3UqZs4f_LgkAw4Pj0qyBDo,8565
  crawlo/utils/spider_loader.py,sha256=pEDUsYOTGjszA6KgjiMlYN4GS5fP4uakkhcp3JTFFQY,2187
  crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
  crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
  crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
  examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
- examples/baidu_spider/__init__.py,sha256=xlj0-TBQBhcKglllla_bQbufNiv10UFE0KsWMLvzFz4,123
- examples/baidu_spider/demo.py,sha256=MTEHkm7U4Kyx5QULCgR6to391xn4XPay6fmuV1c1uRc,24278
- examples/baidu_spider/items.py,sha256=e_LNs13SFiKkqwP2sdKhX72XkiX-E-ythMQOsY5P8IA,1264
- examples/baidu_spider/middleware.py,sha256=I71ZMmWTiDBFq4t2zfTE7IIXCqwaaeQ1DvKGW70q2Yg,1397
- examples/baidu_spider/pipeline.py,sha256=TUK_LnrU818UYmCn2_gKeNaTZjaj9qjrlndRLsR4wf0,1437
- examples/baidu_spider/run.py,sha256=7ifJVryShJ41rnlcdzU6rfex8L0Av1XpeSo4DhRRH6w,564
- examples/baidu_spider/settings.py,sha256=GOnHyYCSyOacCTn-sFAptNR3oSBpWxrGf1XkVHLaeng,3925
- examples/baidu_spider/spiders/__init__.py,sha256=eJ_ih4GiGfwQzPILeouy1Hnc4BrPz0KNPYlLHYvrvoc,123
- examples/baidu_spider/spiders/bai_du.py,sha256=pw4WccbmBR07CuSqCgm_7x9SH63FDJS_sXSaN5Ew5Tw,1589
- examples/baidu_spider/spiders/miit.py,sha256=RStlK5GdrILhqTRNpeWlP_jCijWPUpmmN7Sq6ua_K3s,6338
- examples/baidu_spider/spiders/sina.py,sha256=Q1TwvwutfGdFZzMIDyppOlwQIdr4bQSd2m42iNP8T5o,2418
  tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
+ tests/test_final_validation.py,sha256=fBxf_6YcAEa_HyV_oGAXmmVHY4i6FdA4J6klCmc36hQ,4925
  tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
  tests/test_proxy_middleware_integration.py,sha256=zcl7fR9Toc-I-stSUTzKZPwcfh3kgrpjI5SbkZ6AVmE,4305
  tests/test_proxy_providers.py,sha256=XwWZCywTYguSsUxSm6fsbaoH1p9dKjqSIx9-sqKZehA,1693
  tests/test_proxy_stats.py,sha256=Til_yksrRz2yBVw-yJi5-36LhNW3vTwpXTm4BdR9PUM,507
  tests/test_proxy_strategies.py,sha256=ZkziozkvZd3KWOQnpHQ8Upd3WpyoX7gN0qFGluNm348,1809
- crawlo-1.1.1.dist-info/METADATA,sha256=t26VSUHs-38nd7L3JdSmD4zCICOUCv8aHFUSsvvLgIs,5347
- crawlo-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.1.1.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
- crawlo-1.1.1.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
- crawlo-1.1.1.dist-info/RECORD,,
+ tests/test_redis_config.py,sha256=TqzFRojc6esGXjGhUCvSLYQDUTAgEJsty9vRVuNraMU,893
+ tests/test_redis_queue.py,sha256=o6xViXxJcdx-1eMcG3vhAQEIm8h346HnZb7JXs7ZjwM,6622
+ tests/test_request_serialization.py,sha256=8sVdppAsohJ5u-m1WvablCndwL-M_36YPLdGKwgeznM,2289
+ tests/test_scheduler.py,sha256=-FOkTWzaMdr6yfO1Msu74hI_GgSfD7iRxO-cFA-9Iyk,7442
+ crawlo-1.1.2.dist-info/METADATA,sha256=fIXPyz2xk3GN82z5693BePRT1OWvul312NOh41CBRXA,17061
+ crawlo-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.1.2.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+ crawlo-1.1.2.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+ crawlo-1.1.2.dist-info/RECORD,,
@@ -0,0 +1,154 @@ tests/test_final_validation.py (new file)
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Final validation test: confirm that the distributed queue's logger serialization problem is fully resolved
+ """
+ import asyncio
+ import pickle
+ import sys
+ sys.path.insert(0, "..")
+
+ from crawlo.network.request import Request
+ from crawlo.spider import Spider
+ from crawlo.core.scheduler import Scheduler
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+ from crawlo.utils.log import get_logger
+ from unittest.mock import Mock
+
+
+ class TestSpider(Spider):
+     """Test spider"""
+     name = "validation_spider"
+
+     def __init__(self):
+         super().__init__()
+         # Deliberately attach several loggers to exercise the cleanup
+         self.custom_logger = get_logger("custom")
+         self.debug_logger = get_logger("debug")
+         self.nested_data = {
+             'logger': get_logger("nested"),
+             'sub': {
+                 'logger_ref': get_logger("sub_logger")
+             }
+         }
+
+     def parse(self, response):
+         # Verify the main logger is still present
+         self.logger.info(f"✅ Main logger works: {response.url}")
+         return {"url": response.url, "status": "success"}
+
+
+ def test_scheduler_cleaning():
+     """Test the scheduler's logger cleanup"""
+     print("🔍 Testing scheduler logger cleanup...")
+
+     spider = TestSpider()
+     request = Request(
+         url="https://scheduler-test.com",
+         callback=spider.parse,
+         meta={"logger": get_logger("meta_logger")}
+     )
+
+     # Mock crawler and scheduler
+     class MockCrawler:
+         def __init__(self):
+             self.spider = spider
+
+     class MockScheduler(Scheduler):
+         def __init__(self):
+             self.crawler = MockCrawler()
+             self.logger = get_logger("MockScheduler")
+
+     scheduler = MockScheduler()
+
+     # Checks before cleanup
+     print(f"   🔧 Before cleanup - spider.logger: {spider.logger is not None}")
+     print(f"   🔧 Before cleanup - spider.custom_logger: {spider.custom_logger is not None}")
+     print(f"   🔧 Before cleanup - request.callback: {request.callback is not None}")
+
+     # Run the cleanup
+     cleaned_request = scheduler._deep_clean_loggers(request)
+
+     # Checks after cleanup
+     print(f"   ✅ After cleanup - spider.logger: {spider.logger is not None}")
+     print(f"   ✅ After cleanup - spider.custom_logger: {spider.custom_logger is None}")
+     print(f"   ✅ After cleanup - request.callback: {cleaned_request.callback is None}")
+
+     # Serialization test
+     try:
+         serialized = pickle.dumps(cleaned_request)
+         print(f"   ✅ Serialization after scheduler cleanup succeeded, size: {len(serialized)} bytes")
+         return True
+     except Exception as e:
+         print(f"   ❌ Serialization after scheduler cleanup failed: {e}")
+         return False
+
+
+ async def test_redis_queue_cleaning():
+     """Test the Redis queue's logger cleanup"""
+     print("\n🔍 Testing Redis queue logger cleanup...")
+
+     spider = TestSpider()
+     request = Request(
+         url="https://redis-test.com",
+         callback=spider.parse,
+         meta={"logger": get_logger("meta_logger")}
+     )
+
+     try:
+         queue = RedisPriorityQueue(redis_url="redis://127.0.0.1:6379/0")
+         await queue.connect()
+
+         # Enqueue test
+         success = await queue.put(request, priority=0)
+         print(f"   ✅ Redis queue enqueue succeeded: {success}")
+
+         if success:
+             # Dequeue test
+             retrieved = await queue.get(timeout=2.0)
+             if retrieved:
+                 print(f"   ✅ Redis queue dequeue succeeded: {retrieved.url}")
+                 print(f"   ✅ callback info preserved: {'_callback_info' in retrieved.meta}")
+                 await queue.close()
+                 return True
+             else:
+                 print("   ❌ Dequeue failed")
+                 await queue.close()
+                 return False
+         else:
+             await queue.close()
+             return False
+
+     except Exception as e:
+         print(f"   ❌ Redis queue test failed: {e}")
+         return False
+
+
+ async def main():
+     """Main test entry point"""
+     print("🚀 Starting final validation tests...")
+     print("=" * 60)
+
+     # Test 1: scheduler cleanup
+     scheduler_ok = test_scheduler_cleaning()
+
+     # Test 2: Redis queue cleanup
+     redis_ok = await test_redis_queue_cleaning()
+
+     print("\n" + "=" * 60)
+     print("📊 Test result summary:")
+     print(f"   Scheduler logger cleanup: {'✅ passed' if scheduler_ok else '❌ failed'}")
+     print(f"   Redis queue cleanup: {'✅ passed' if redis_ok else '❌ failed'}")
+
+     if scheduler_ok and redis_ok:
+         print("\n🎉 All tests passed!")
+         print("✅ The distributed queue's logger serialization problem is fully fixed!")
+         print("✅ Crawlo can now use the Redis distributed queue normally!")
+         return True
+     else:
+         print("\n❌ Some tests failed and need further fixes")
+         return False
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
@@ -0,0 +1,29 @@ tests/test_redis_config.py (new file)
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Quick test for the Redis connection configuration fix
+ """
+ import asyncio
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+ from crawlo.settings.default_settings import REDIS_URL
+
+ async def test_redis_config():
+     """Test the fixed Redis configuration"""
+     print(f"🔍 Testing Redis configuration: {REDIS_URL}")
+
+     try:
+         queue = RedisPriorityQueue(redis_url=REDIS_URL)
+         await queue.connect()
+         print("✅ Redis connection succeeded!")
+         await queue.close()
+         return True
+     except Exception as e:
+         print(f"❌ Redis connection failed: {e}")
+         return False
+
+ if __name__ == "__main__":
+     success = asyncio.run(test_redis_config())
+     if success:
+         print("🎉 Configuration fix works! You can run your spider now.")
+     else:
+         print("❌ The configuration still has problems, please check the Redis service status.")
@@ -0,0 +1,225 @@ tests/test_redis_queue.py (new file)
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Redis distributed queue test script
+ Used to diagnose and fix distributed queue problems
+ """
+ import asyncio
+ import sys
+ import traceback
+ import time
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+ from crawlo.network.request import Request
+
+
+ async def test_redis_connection():
+     """Test the Redis connection"""
+     print("🔍 1. Testing Redis connection...")
+
+     # Try different Redis URL formats
+     test_urls = [
+         "redis://localhost:6379/0",
+         "redis://:oscar&0503@127.0.0.1:6379/0",  # with password
+         "redis://127.0.0.1:6379/0",  # without password
+     ]
+
+     for redis_url in test_urls:
+         try:
+             print(f"   Trying to connect: {redis_url}")
+             queue = RedisPriorityQueue(redis_url=redis_url)
+             await queue.connect()
+             print(f"   ✅ Connection succeeded: {redis_url}")
+             await queue.close()
+             return redis_url
+         except Exception as e:
+             print(f"   ❌ Connection failed: {redis_url} - {e}")
+
+     raise ConnectionError("All Redis URLs failed to connect")
+
+
+ async def test_queue_operations(redis_url):
+     """Test basic queue operations"""
+     print("🔍 2. Testing basic queue operations...")
+
+     queue = RedisPriorityQueue(
+         redis_url=redis_url,
+         queue_name="test:crawlo:requests",
+         max_retries=2
+     )
+
+     try:
+         await queue.connect()
+
+         # Test the put operation
+         test_request = Request(url="https://example.com", priority=5)
+         print(f"   📤 Inserting request: {test_request.url}")
+
+         success = await queue.put(test_request, priority=5)
+         if success:
+             print("   ✅ Insert succeeded")
+         else:
+             print("   ❌ Insert failed")
+             return False
+
+         # Test the queue size
+         size = await queue.qsize()
+         print(f"   📊 Queue size: {size}")
+
+         # Test the get operation
+         print("   📥 Fetching request...")
+         retrieved_request = await queue.get(timeout=2.0)
+
+         if retrieved_request:
+             print(f"   ✅ Fetch succeeded: {retrieved_request.url}")
+             # Test ack
+             await queue.ack(retrieved_request)
+             print("   ✅ ACK succeeded")
+         else:
+             print("   ❌ Fetch failed (timeout)")
+             return False
+
+         return True
+
+     except Exception as e:
+         print(f"   ❌ Queue operation failed: {e}")
+         traceback.print_exc()
+         return False
+     finally:
+         await queue.close()
+
+
+ async def test_serialization():
+     """Test serialization issues"""
+     print("🔍 3. Testing Request serialization...")
+
+     try:
+         import pickle
+         from crawlo.network.request import Request
+
+         # Create a test request
+         request = Request(
+             url="https://example.com",
+             method="GET",
+             headers={"User-Agent": "Test"},
+             meta={"test": "data"},
+             priority=5
+         )
+
+         # Test serialization
+         serialized = pickle.dumps(request)
+         print(f"   ✅ Serialization succeeded, size: {len(serialized)} bytes")
+
+         # Test deserialization
+         deserialized = pickle.loads(serialized)
+         print(f"   ✅ Deserialization succeeded: {deserialized.url}")
+
+         return True
+
+     except Exception as e:
+         print(f"   ❌ Serialization failed: {e}")
+         traceback.print_exc()
+         return False
+
+
+ async def test_concurrent_operations(redis_url):
+     """Test concurrent operations"""
+     print("🔍 4. Testing concurrent operations...")
+
+     async def producer(queue, start_id):
+         """Producer"""
+         try:
+             for i in range(5):
+                 request = Request(url=f"https://example{start_id + i}.com", priority=i)
+                 await queue.put(request, priority=i)
+                 await asyncio.sleep(0.1)
+             print(f"   ✅ Producer {start_id} finished")
+         except Exception as e:
+             print(f"   ❌ Producer {start_id} failed: {e}")
+
+     async def consumer(queue, consumer_id):
+         """Consumer"""
+         consumed = 0
+         try:
+             for _ in range(3):  # each consumer handles 3 requests
+                 request = await queue.get(timeout=5.0)
+                 if request:
+                     await queue.ack(request)
+                     consumed += 1
+                     await asyncio.sleep(0.05)
+                 else:
+                     break
+             print(f"   ✅ Consumer {consumer_id} handled {consumed} requests")
+         except Exception as e:
+             print(f"   ❌ Consumer {consumer_id} failed: {e}")
+
+     queue = RedisPriorityQueue(
+         redis_url=redis_url,
+         queue_name="test:concurrent:requests"
+     )
+
+     try:
+         await queue.connect()
+
+         # Run producers and consumers concurrently
+         tasks = [
+             producer(queue, 0),
+             producer(queue, 10),
+             consumer(queue, 1),
+             consumer(queue, 2),
+         ]
+
+         await asyncio.gather(*tasks, return_exceptions=True)
+
+         # Check the remaining queue size
+         final_size = await queue.qsize()
+         print(f"   📊 Final queue size: {final_size}")
+
+         return True
+
+     except Exception as e:
+         print(f"   ❌ Concurrency test failed: {e}")
+         return False
+     finally:
+         await queue.close()
+
+
+ async def main():
+     """Main test entry point"""
+     print("🚀 Starting Redis distributed queue diagnostics...")
+     print("=" * 50)
+
+     try:
+         # 1. Test the connection
+         redis_url = await test_redis_connection()
+
+         # 2. Test serialization
+         if not await test_serialization():
+             return
+
+         # 3. Test basic operations
+         if not await test_queue_operations(redis_url):
+             return
+
+         # 4. Test concurrent operations
+         if not await test_concurrent_operations(redis_url):
+             return
+
+         print("=" * 50)
+         print("🎉 All tests passed! The Redis queue works correctly")
+
+     except Exception as e:
+         print("=" * 50)
+         print(f"❌ Diagnostics failed: {e}")
+         traceback.print_exc()
+
+         # Suggested fixes
+         print("\n🔧 Possible solutions:")
+         print("1. Check whether the Redis service is running: redis-server")
+         print("2. Check the Redis password configuration")
+         print("3. Check the firewall and port 6379")
+         print("4. Install Redis: pip install redis")
+         print("5. Check the bind setting in the Redis configuration file")
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
@@ -0,0 +1,71 @@ tests/test_request_serialization.py (new file)
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Test the fix for the Request serialization problem
+ """
+ import pickle
+ import sys
+ sys.path.insert(0, "..")
+
+ from crawlo.network.request import Request
+ from crawlo.core.scheduler import Scheduler
+ from unittest.mock import Mock
+
+ # Simulate a Request that carries a logger
+ class TestRequest(Request):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         # Attach a logger attribute to reproduce the problem
+         from crawlo.utils.log import get_logger
+         self.logger = get_logger("test_request")
+         self.meta['spider_logger'] = get_logger("spider_logger")
+
+ def test_request_serialization():
+     """Test Request serialization"""
+     print("🔍 Testing the Request serialization fix...")
+
+     # Create a request that carries a logger
+     request = TestRequest(
+         url="https://example.com",
+         meta={"test": "data"}  # Mock objects removed
+     )
+
+     print(f"   📦 Original request: {request}")
+     print(f"   🔧 Request has logger: {hasattr(request, 'logger')}")
+     print(f"   🔧 meta has logger: {'spider_logger' in request.meta}")
+
+     # Create a mock scheduler to test the cleanup
+     class MockScheduler:
+         def _deep_clean_loggers(self, request):
+             return Scheduler._deep_clean_loggers(self, request)
+         def _remove_logger_from_dict(self, d):
+             return Scheduler._remove_logger_from_dict(self, d)
+
+     scheduler = MockScheduler()
+
+     # Run the cleanup
+     scheduler._deep_clean_loggers(request)
+
+     print(f"   🧹 Has logger after cleanup: {hasattr(request, 'logger')}")
+     print(f"   🧹 meta has logger after cleanup: {'spider_logger' in request.meta}")
+
+     # Test serialization
+     try:
+         serialized = pickle.dumps(request)
+         print(f"   ✅ Serialization succeeded, size: {len(serialized)} bytes")
+
+         # Test deserialization
+         deserialized = pickle.loads(serialized)
+         print(f"   ✅ Deserialization succeeded: {deserialized}")
+         return True
+
+     except Exception as e:
+         print(f"   ❌ Serialization failed: {e}")
+         return False
+
+ if __name__ == "__main__":
+     success = test_request_serialization()
+     if success:
+         print("🎉 Request serialization fix works!")
+     else:
+         print("❌ The serialization problem is still not resolved")