crawlo 1.1.2-py3-none-any.whl → 1.1.4-py3-none-any.whl
This diff compares the contents of publicly available package versions as released to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/core/scheduler.py +20 -16
- crawlo/downloader/httpx_downloader.py +14 -12
- crawlo/exceptions.py +4 -0
- crawlo/extension/__init__.py +17 -10
- crawlo/extension/health_check.py +142 -0
- crawlo/extension/log_interval.py +27 -18
- crawlo/extension/log_stats.py +62 -24
- crawlo/extension/logging_extension.py +18 -9
- crawlo/extension/memory_monitor.py +89 -0
- crawlo/extension/performance_profiler.py +118 -0
- crawlo/extension/request_recorder.py +108 -0
- crawlo/filters/aioredis_filter.py +2 -2
- crawlo/middleware/retry.py +3 -3
- crawlo/network/request.py +2 -2
- crawlo/network/response.py +25 -23
- crawlo/pipelines/__init__.py +9 -0
- crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
- crawlo/pipelines/database_dedup_pipeline.py +225 -0
- crawlo/pipelines/memory_dedup_pipeline.py +116 -0
- crawlo/pipelines/mongo_pipeline.py +81 -66
- crawlo/pipelines/mysql_pipeline.py +165 -43
- crawlo/pipelines/redis_dedup_pipeline.py +163 -0
- crawlo/queue/queue_manager.py +4 -0
- crawlo/queue/redis_priority_queue.py +20 -3
- crawlo/settings/default_settings.py +119 -66
- crawlo/subscriber.py +62 -37
- crawlo/templates/project/items.py.tmpl +1 -1
- crawlo/templates/project/middlewares.py.tmpl +73 -49
- crawlo/templates/project/pipelines.py.tmpl +52 -290
- crawlo/templates/project/run.py.tmpl +20 -7
- crawlo/templates/project/settings.py.tmpl +35 -3
- crawlo/templates/spider/spider.py.tmpl +1 -37
- crawlo/utils/controlled_spider_mixin.py +109 -5
- crawlo-1.1.4.dist-info/METADATA +403 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/RECORD +40 -31
- examples/controlled_spider_example.py +205 -0
- crawlo-1.1.2.dist-info/METADATA +0 -567
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/WHEEL +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/top_level.txt +0 -0
{crawlo-1.1.2.dist-info → crawlo-1.1.4.dist-info}/RECORD

@@ -1,14 +1,14 @@
 crawlo/__init__.py,sha256=esOolburYDjtF43D5N9Kh6TSQW2yKcz888ilhBSinBc,825
-crawlo/__version__.py,sha256=
+crawlo/__version__.py,sha256=XxXhu8-QnuD9hA8Ah0WX5rgpt_DwOQmAwcK-FtpngyQ,22
 crawlo/cli.py,sha256=CtR2Pfa7SyRxEKPaXqt-6E6K5Vq5z3rfdAI95UO4cbU,1166
 crawlo/config.py,sha256=i0Amz6wNPgv_aVcdCBRRlcwuZLSa87cH9OEmTQvB97Q,8329
 crawlo/crawler.py,sha256=v6i5tjgSOtbMoqiw1qdgKx1cY4kcVcd5l5bUTWtJNNU,36461
 crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
-crawlo/exceptions.py,sha256=
+crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
 crawlo/mode_manager.py,sha256=WIxrq9S3EAH0D71LH1AxvcqXomeABqoXgtUN4A--DKY,6702
 crawlo/project.py,sha256=xWN2eTAjf_Pza-wWvvV4JjScQRWxe9hXlztX81ccUMc,5182
 crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
-crawlo/subscriber.py,sha256=
+crawlo/subscriber.py,sha256=gioTIqRdEwVG-bwIiQonbk1vWWAqTh9hzVkrqZ1AfP0,5006
 crawlo/task_manager.py,sha256=19InAxS9oJ9EMj20Aw2urN-v6BeC22dkgPuW-B9-4UI,819
 crawlo/commands/__init__.py,sha256=AMYjXG7ulE8dPVmgWVo0uqXsaCYUUZYmmu2-7kFzH1M,342
 crawlo/commands/check.py,sha256=172OiAxnX5wwSlszUsyPgMZwAoIbGDTdfhtRz309ilc,22843
@@ -22,17 +22,21 @@ crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
 crawlo/core/engine.py,sha256=8Dcew1XyxChW5Fz1wFEWpJlPrQb2hKDWKul8e61S-Q0,6662
 crawlo/core/enhanced_engine.py,sha256=9I9Uxdy2oAz8zDGTzEiytuKu__VDVmIN8zwZKfrD8bw,6254
 crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
-crawlo/core/scheduler.py,sha256=
+crawlo/core/scheduler.py,sha256=fiU-Q-lzyC3B6ih8NSWqjP1Xw_ryNVb_4dLUARtWRBE,5804
 crawlo/downloader/__init__.py,sha256=tl0mE54reR-PuJYSsXsKP2VY5uzvq4lITxZwKKjNzPs,7663
 crawlo/downloader/aiohttp_downloader.py,sha256=UKupGYPOWrscAVsjhFgKYElTa9tbEeltqV7nuWqjIeE,8005
 crawlo/downloader/cffi_downloader.py,sha256=-GVfSIhi1Ip56suSiGf8jnUE2EBF1P56vw0uxLh_T6I,10440
-crawlo/downloader/httpx_downloader.py,sha256=
-crawlo/extension/__init__.py,sha256=
-crawlo/extension/
-crawlo/extension/
-crawlo/extension/
+crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU7m_Jl9wc,12037
+crawlo/extension/__init__.py,sha256=Sg588p6UhyrwFNTiD2wqGW-i3xgLX6HlLuQPKT7mayE,1526
+crawlo/extension/health_check.py,sha256=IVaaVo_0CcZtf1LoCAYXIBvs3wZ7hdmT6U4-NYWAgP0,5527
+crawlo/extension/log_interval.py,sha256=VCIeNqXcWDnxj4m6l77cjqgRzV8LfsPMb22X0Xc1Vwc,2417
+crawlo/extension/log_stats.py,sha256=Ssxz6R1YpWIj5WJvQ2cJ9F5oR7FUFdj-ITc9lV92SSU,2908
+crawlo/extension/logging_extension.py,sha256=ET6VAu1J2qNMz4NnG1G3zQLRhbsvV7l6xRIuQLE6DaE,1626
+crawlo/extension/memory_monitor.py,sha256=gg-GK5RD9XhnrN_zp3KTmPKyWDmKLMv_OTY-HxSxBNI,3664
+crawlo/extension/performance_profiler.py,sha256=NvQuuvE83dXJ-1URpN8OF9za9J1l7xhVbV22JynPQpA,4235
+crawlo/extension/request_recorder.py,sha256=RC23yzXClnVv9j2ljQvjBkUfWznfnDHsrQejKhE9y5E,4074
 crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
-crawlo/filters/aioredis_filter.py,sha256=
+crawlo/filters/aioredis_filter.py,sha256=WhkFZcVAym9wLSUa8WTVctYfEibjxG42umtmacO1IY0,8370
 crawlo/filters/memory_filter.py,sha256=VJO0UFRYGxmV8dj4G1subsQ-FtvPcGLbvd7IVtqXnOs,9260
 crawlo/items/__init__.py,sha256=bqekZrRlDhxfWie0UbCs656TptYseoe9QJ67I4E7Elk,386
 crawlo/items/base.py,sha256=tAYrPJgblp3ZEihDXvappdYc6pGdim6x2_9QSmMKI2o,577
@@ -46,36 +50,40 @@ crawlo/middleware/proxy.py,sha256=m2ZZ50En9hUtgrqSqA6hItGT74xMqccHFPhZshutIco,98
 crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
 crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
 crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
-crawlo/middleware/retry.py,sha256
+crawlo/middleware/retry.py,sha256=-7zpRURugiTTm4QYUSUlbnURD5mcT2Ji0yHvCgY1wGc,4124
 crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,397
-crawlo/network/request.py,sha256=
-crawlo/network/response.py,sha256=
-crawlo/pipelines/__init__.py,sha256=
+crawlo/network/request.py,sha256=tPAiOVJyF3Kk-midqydTGXgv5M5tsYJRtwUUJTrUsrE,11075
+crawlo/network/response.py,sha256=cUvdjsB2cQ-qWEKHNGIkwWGgCg-EnQ81xTrjrUOVno0,9738
+crawlo/pipelines/__init__.py,sha256=lrdVDjeHLNkA4_MAwI1auk_I9xfeU1SlBWXiammb6lc,616
+crawlo/pipelines/bloom_dedup_pipeline.py,sha256=QQxGFGEoMHN4Vx2kq7G_i1o9pmuXp8clZebilOar3fk,5642
 crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
 crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
+crawlo/pipelines/database_dedup_pipeline.py,sha256=wVBXEGArFR3uxoN7yfJSOarBmtGrJpOqowAqa7OUs98,8000
 crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
-crawlo/pipelines/
-crawlo/pipelines/
+crawlo/pipelines/memory_dedup_pipeline.py,sha256=5jeL2jEq7sioYmXlzfkx-LNSbWyChrXeWx8d15YEZOA,3839
+crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
+crawlo/pipelines/mysql_pipeline.py,sha256=cwgJvRORTRea_Eep2coBaMf3G8PQVTQA1qrnIlDZApc,13480
 crawlo/pipelines/pipeline_manager.py,sha256=VrbebOYiqrobtKhp5II18w-odCICdWkmRg5WPK0Emz4,2112
+crawlo/pipelines/redis_dedup_pipeline.py,sha256=TaokJ4wP5-Cxf-ueFJdh4SX58hchT0QzZ5RBDXHDN64,6003
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlo/queue/pqueue.py,sha256=yzF900ko2RReVNZtfk_VV3PzSXEUALI6SHf97geuu6s,1058
-crawlo/queue/queue_manager.py,sha256=
-crawlo/queue/redis_priority_queue.py,sha256=
+crawlo/queue/queue_manager.py,sha256=NMR0Fo8XFBg6_g7htq4D80cS6Ilo0EKt5QtyF-KxNuM,11467
+crawlo/queue/redis_priority_queue.py,sha256=boJCKqcKxRw9XCCzaHy5qmrX9DvzPiQBzOkBHI5odfE,8116
 crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
-crawlo/settings/default_settings.py,sha256=
+crawlo/settings/default_settings.py,sha256=B4_61tNJvqzVvyqt9AtRV7Iq5q8G4pJOExcN0ve7S_A,11559
 crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
 crawlo/spider/__init__.py,sha256=Z_rK23l5yt-DuwJPg8bcqodM_FIs4-iHLaKOimGumcE,20452
 crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
 crawlo/templates/project/__init__.py.tmpl,sha256=f3ETIXw_O6K-lkL6lXM5znMPJW1FZYGFrwDs2BnHcnQ,58
-crawlo/templates/project/items.py.tmpl,sha256=
-crawlo/templates/project/middlewares.py.tmpl,sha256=
-crawlo/templates/project/pipelines.py.tmpl,sha256
-crawlo/templates/project/run.py.tmpl,sha256=
-crawlo/templates/project/settings.py.tmpl,sha256=
+crawlo/templates/project/items.py.tmpl,sha256=mt1Mm--H2Ouos3r7JPkYh0r33rgYJf1YOMz0OZy8TYs,297
+crawlo/templates/project/middlewares.py.tmpl,sha256=jpmj7b7Zb7d3nVyxcaVNdp4KqSts6l2cPSqn_oJUSrM,3775
+crawlo/templates/project/pipelines.py.tmpl,sha256=k_4MJnwZ6GPqVwJSEDURUlTxWybmts4vHrF0de2vgAk,2620
+crawlo/templates/project/run.py.tmpl,sha256=ktkYOslcCh9mpklg6yE5VqfATx3Frj_jNT5z5gHjQ4o,8177
+crawlo/templates/project/settings.py.tmpl,sha256=O_teIARjzRD3aMvPnuIgjaDHdjwW-3beyzfo1QH-Hr8,9580
 crawlo/templates/project/spiders/__init__.py.tmpl,sha256=j_YKsw6HQMJyqlk3WUouP3bsr-XVxshRoSNakHBc00g,106
-crawlo/templates/spider/spider.py.tmpl,sha256=
+crawlo/templates/spider/spider.py.tmpl,sha256=a8S9j43z5gE4auMhf_OnnuVHSZN3JbMDu8Bczu8zIZY,4944
 crawlo/utils/__init__.py,sha256=BDORpyjMN7VGPKImnCDKSkprS-petgD7ezc9rMlBvb0,123
-crawlo/utils/controlled_spider_mixin.py,sha256=
+crawlo/utils/controlled_spider_mixin.py,sha256=VjT30pNW_YIgmTD0nb7DDl2D3HvpnAYFzgSgV3fxFN0,16475
 crawlo/utils/date_tools.py,sha256=0yG0tzGb1VFgWDJJ_cow2LJfz3kj_w2MqSjmfKKESl8,6961
 crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,10559
 crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
@@ -90,6 +98,7 @@ crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
 crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
 crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
 examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
+examples/controlled_spider_example.py,sha256=SP_k4mdKPvD1JCPs9UCm68jcy2Frg84vvXv9-14RC6I,7776
 tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
 tests/test_final_validation.py,sha256=fBxf_6YcAEa_HyV_oGAXmmVHY4i6FdA4J6klCmc36hQ,4925
 tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
@@ -101,8 +110,8 @@ tests/test_redis_config.py,sha256=TqzFRojc6esGXjGhUCvSLYQDUTAgEJsty9vRVuNraMU,89
 tests/test_redis_queue.py,sha256=o6xViXxJcdx-1eMcG3vhAQEIm8h346HnZb7JXs7ZjwM,6622
 tests/test_request_serialization.py,sha256=8sVdppAsohJ5u-m1WvablCndwL-M_36YPLdGKwgeznM,2289
 tests/test_scheduler.py,sha256=-FOkTWzaMdr6yfO1Msu74hI_GgSfD7iRxO-cFA-9Iyk,7442
-crawlo-1.1.
-crawlo-1.1.
-crawlo-1.1.
-crawlo-1.1.
-crawlo-1.1.
+crawlo-1.1.4.dist-info/METADATA,sha256=2I2NA0BR-MWoPZmRUkWrUQYMjuPiUi9mrogIYPWpASU,19781
+crawlo-1.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.1.4.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.1.4.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.1.4.dist-info/RECORD,,
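Each RECORD row has the form `path,sha256=<digest>,<size>`: the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped, and the size is in bytes (the RECORD file itself carries empty hash and size fields, hence the trailing `,,`). A minimal sketch for checking one of the new entries against a downloaded wheel — `verify_record_entry` is a hypothetical helper written for this page, not part of crawlo:

```python
# Sketch: verify a single RECORD entry from the 1.1.4 wheel (a wheel is a zip archive).
import base64
import hashlib
import zipfile

def verify_record_entry(wheel_path: str, member: str, expected_digest: str) -> bool:
    """Check that `member` inside the wheel matches its RECORD sha256 digest."""
    with zipfile.ZipFile(wheel_path) as wheel:
        data = wheel.read(member)
    # RECORD digests are urlsafe base64 without the trailing "=" padding.
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return digest == expected_digest

# e.g. the crawlo/__version__.py entry added above:
# verify_record_entry("crawlo-1.1.4-py3-none-any.whl", "crawlo/__version__.py",
#                     "XxXhu8-QnuD9hA8Ah0WX5rgpt_DwOQmAwcK-FtpngyQ")
```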
examples/controlled_spider_example.py

@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Usage example for the controlled-spider mixins.
+Demonstrates how to use ControlledRequestMixin and AsyncControlledRequestMixin to control large-scale request generation.
+"""
+
+import asyncio
+from crawlo.spider import Spider
+from crawlo.network import Request
+from crawlo.utils.controlled_spider_mixin import ControlledRequestMixin, AsyncControlledRequestMixin
+
+
+class LargeScaleSpider(Spider, ControlledRequestMixin):
+    """
+    Synchronous controlled-spider example.
+    Suited to scenarios that generate large numbers of requests but must bound concurrency.
+    """
+    name = 'large_scale_spider'
+
+    def __init__(self):
+        Spider.__init__(self)
+        ControlledRequestMixin.__init__(self)
+
+        # Controlled-generation parameters
+        self.max_pending_requests = 150    # maximum number of pending requests
+        self.batch_size = 75               # requests generated per batch
+        self.generation_interval = 0.02    # generation interval (seconds)
+        self.backpressure_threshold = 300  # backpressure threshold
+
+    def _original_start_requests(self):
+        """
+        Provide the raw, large stream of requests.
+        Simulates crawling the product pages of an e-commerce site.
+        """
+        # Simulate crawling 10,000 product pages
+        base_urls = [
+            "https://example-shop.com/products",
+            "https://example-shop.com/electronics",
+            "https://example-shop.com/clothing",
+            "https://example-shop.com/books",
+            "https://example-shop.com/home"
+        ]
+
+        for category in base_urls:
+            # Crawl 2000 pages per category
+            for page in range(1, 2001):
+                yield Request(
+                    url=f"{category}?page={page}",
+                    callback=self.parse_product_list,
+                    meta={'category': category.split('/')[-1], 'page': page}
+                )
+
+    def _process_request_before_yield(self, request):
+        """
+        Process each request before it is yielded.
+        Deduplication, priority assignment, header setup, etc. can go here.
+        """
+        # Set request headers
+        request.headers.setdefault('User-Agent',
+                                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36')
+
+        # Assign priority by category
+        category = request.meta.get('category', '')
+        if category == 'electronics':
+            request.priority = 10  # electronics get the highest priority
+        elif category == 'clothing':
+            request.priority = 8
+        else:
+            request.priority = 5
+
+        # Deduplication logic could be added here
+        # if self.is_duplicate_request(request):
+        #     return None  # skip duplicate requests
+
+        return request
+
+    async def parse_product_list(self, response):
+        """Parse a product list page."""
+        # Extract product links
+        product_links = response.css('a.product-link::attr(href)').getall()
+
+        for link in product_links:
+            # Generate detail-page requests
+            yield Request(
+                url=response.urljoin(link),
+                callback=self.parse_product_detail,
+                meta={'category': response.meta['category']}
+            )
+
+        # Extract pagination
+        next_page = response.css('a.next-page::attr(href)').get()
+        if next_page:
+            yield Request(
+                url=response.urljoin(next_page),
+                callback=self.parse_product_list,
+                meta=response.meta
+            )
+
+    async def parse_product_detail(self, response):
+        """Parse a product detail page."""
+        yield {
+            'title': response.css('h1.product-title::text').get(),
+            'price': response.css('.price::text').re_first(r'\d+\.?\d*'),
+            'description': response.css('.product-description::text').get(),
+            'category': response.meta['category'],
+            'url': response.url,
+            'in_stock': 'in-stock' in response.css('.availability::attr(class)').get(''),
+            'rating': response.css('.rating::attr(data-rating)').get(),
+            'reviews_count': response.css('.reviews-count::text').re_first(r'\d+')
+        }
+
+
+class AsyncLargeScaleSpider(Spider, AsyncControlledRequestMixin):
+    """
+    Asynchronous controlled-spider example.
+    Uses async control for more precise concurrency management.
+    """
+    name = 'async_large_scale_spider'
+
+    def __init__(self):
+        Spider.__init__(self)
+        AsyncControlledRequestMixin.__init__(self)
+
+        # Async control parameters
+        self.max_concurrent_generations = 15  # maximum simultaneous generations
+        self.queue_monitor_interval = 0.5     # queue-monitoring interval
+
+    def _original_start_requests(self):
+        """
+        Provide the raw, large stream of requests.
+        Simulates crawling articles from news sites.
+        """
+        # Simulate crawling 50,000 news articles
+        news_sites = [
+            "https://news-site-1.com",
+            "https://news-site-2.com",
+            "https://news-site-3.com",
+            "https://tech-news.com",
+            "https://finance-news.com"
+        ]
+
+        categories = ['tech', 'finance', 'sports', 'politics', 'entertainment']
+
+        for site in news_sites:
+            for category in categories:
+                # Crawl 2000 pages per category
+                for page in range(1, 2001):
+                    yield Request(
+                        url=f"{site}/{category}?page={page}",
+                        callback=self.parse_article_list,
+                        meta={'site': site, 'category': category, 'page': page}
+                    )
+
+    def _process_request_before_yield(self, request):
+        """Request preprocessing for the async version."""
+        # Set request headers
+        request.headers.setdefault('User-Agent',
+                                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+
+        # Assign priority by news category
+        category = request.meta.get('category', '')
+        if category in ['tech', 'finance']:
+            request.priority = 10  # tech and finance news get the highest priority
+        elif category in ['sports', 'politics']:
+            request.priority = 8
+        else:
+            request.priority = 5
+
+        # Set a delay (avoid putting too much pressure on the server)
+        site = request.meta.get('site', '')
+        if 'tech-news.com' in site:
+            request.meta['download_delay'] = 0.5  # this site is sensitive, so add a delay
+
+        return request
+
+    async def parse_article_list(self, response):
+        """Parse an article list page."""
+        # Extract article links
+        article_links = response.css('a.article-link::attr(href)').getall()
+
+        for link in article_links:
+            yield Request(
+                url=response.urljoin(link),
+                callback=self.parse_article_detail,
+                meta={
+                    'site': response.meta['site'],
+                    'category': response.meta['category']
+                }
+            )
+
+    async def parse_article_detail(self, response):
+        """Parse an article detail page."""
+        yield {
+            'title': response.css('h1.article-title::text').get(),
+            'content': ' '.join(response.css('.article-content p::text').getall()),
+            'author': response.css('.author::text').get(),
+            'publish_date': response.css('.publish-date::attr(datetime)').get(),
+            'category': response.meta['category'],
+            'site': response.meta['site'],
+            'url': response.url,
+            'tags': response.css('.tags a::text').getall(),
+            'views': response.css('.views-count::text').re_first(r'\d+'),
+            'comments': response.css('.comments-count::text').re_first(r'\d+')
+        }
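The dedup hook in `_process_request_before_yield` is left commented out in the example above, and no `is_duplicate_request` helper ships in this release. As a minimal sketch, assuming an in-memory set of seen URLs is acceptable, the hypothetical `SimpleDedupMixin` below shows one way a spider could back that hook (it is not part of crawlo):

```python
# Hypothetical sketch: not part of crawlo 1.1.4. Backs the commented-out
# is_duplicate_request() hook with an in-memory set of seen URLs, so that
# _process_request_before_yield() can return None for repeated URLs.
class SimpleDedupMixin:
    def __init__(self):
        self._seen_urls = set()

    def is_duplicate_request(self, request) -> bool:
        """Return True if this request's URL was already generated; record it otherwise."""
        if request.url in self._seen_urls:
            return True
        self._seen_urls.add(request.url)
        return False
```

An in-memory set only deduplicates within a single process; for the shipped alternatives, this release also adds memory-, Redis-, Bloom-filter-, and database-backed dedup modules under crawlo/pipelines/ (see the file list above), which deduplicate at the pipeline stage.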