crawlo 1.4.0__py3-none-any.whl → 1.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +9 -4
- crawlo/__version__.py +1 -1
- crawlo/core/__init__.py +8 -2
- crawlo/core/scheduler.py +2 -2
- crawlo/downloader/aiohttp_downloader.py +7 -2
- crawlo/extension/log_interval.py +44 -7
- crawlo/initialization/__init__.py +6 -2
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/mode_manager.py +13 -7
- crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
- crawlo/pipelines/database_dedup_pipeline.py +5 -8
- crawlo/pipelines/memory_dedup_pipeline.py +5 -15
- crawlo/pipelines/redis_dedup_pipeline.py +2 -15
- crawlo/project.py +18 -7
- crawlo/settings/default_settings.py +114 -150
- crawlo/settings/setting_manager.py +14 -9
- crawlo/tools/distributed_coordinator.py +4 -8
- crawlo/utils/fingerprint.py +123 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/METADATA +1 -1
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/RECORD +45 -29
- examples/test_project/__init__.py +7 -0
- examples/test_project/run.py +35 -0
- examples/test_project/test_project/__init__.py +4 -0
- examples/test_project/test_project/items.py +18 -0
- examples/test_project/test_project/middlewares.py +119 -0
- examples/test_project/test_project/pipelines.py +97 -0
- examples/test_project/test_project/settings.py +170 -0
- examples/test_project/test_project/spiders/__init__.py +10 -0
- examples/test_project/test_project/spiders/of_week_dis.py +144 -0
- tests/debug_framework_logger.py +1 -1
- tests/debug_log_levels.py +1 -1
- tests/test_all_pipeline_fingerprints.py +134 -0
- tests/test_default_header_middleware.py +242 -87
- tests/test_fingerprint_consistency.py +136 -0
- tests/test_fingerprint_simple.py +52 -0
- tests/test_framework_logger.py +1 -1
- tests/test_framework_startup.py +1 -1
- tests/test_hash_performance.py +100 -0
- tests/test_mode_change.py +1 -1
- tests/test_offsite_middleware.py +185 -162
- tests/test_offsite_middleware_simple.py +204 -0
- tests/test_pipeline_fingerprint_consistency.py +87 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/WHEEL +0 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.0.dist-info → crawlo-1.4.2.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
crawlo/__init__.py,sha256=
|
|
2
|
-
crawlo/__version__.py,sha256=
|
|
1
|
+
crawlo/__init__.py,sha256=WD-F_HxUb9ad2v7J_yipTK4wwENCzIYwa7AF14eL4sg,2288
|
|
2
|
+
crawlo/__version__.py,sha256=Gom1_ABKKkZoIghfDiYvSntLG2OKXC64sJD2Xm8bUKc,22
|
|
3
3
|
crawlo/cli.py,sha256=OXprmcTUbFK02ptw_Gq8Gk4-ZCU-WEMJgzU1ztgP6Bk,2327
|
|
4
4
|
crawlo/config.py,sha256=dNoNyTkXLe2msQ7bZx3YTQItk1m49nIg5-g89FQDNwE,9486
|
|
5
5
|
crawlo/config_validator.py,sha256=gsiLqf5swWd9ISDvoLqCdG7iSXr-ZdBPD4iT6ug1ua4,11239
|
|
@@ -7,8 +7,8 @@ crawlo/crawler.py,sha256=E83JhClOe58XVX1ma0f-HAF1BJ7Ej9Zs0w51ERs3fgA,26348
|
|
|
7
7
|
crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
|
|
8
8
|
crawlo/exceptions.py,sha256=sMay0wnWLfc_FXWslqxm60qz6b66LXs3EdN_w8ygE9k,1166
|
|
9
9
|
crawlo/framework.py,sha256=1RVBwj_VBzfJiMB3lq6XcfFHCjRBHyT4D_T2X4fU_6g,9166
|
|
10
|
-
crawlo/mode_manager.py,sha256=
|
|
11
|
-
crawlo/project.py,sha256=
|
|
10
|
+
crawlo/mode_manager.py,sha256=j5Kez-KwZQWbEWeYlyUZov18kk8ZRQFYEBUzbkIc-DY,7549
|
|
11
|
+
crawlo/project.py,sha256=LofpNMP1D5AHMe987jfgkSFTjRmrfF5zUBkXl6Cvp3U,12730
|
|
12
12
|
crawlo/stats_collector.py,sha256=copzmfWTArYZCkMeZJsJfJcdC36s7_LM88hxAYttoeE,2306
|
|
13
13
|
crawlo/subscriber.py,sha256=D3hzE7Pc_zJjc-zR7lct5pt32bz6LsDYeC8uHlS4Hso,4986
|
|
14
14
|
crawlo/task_manager.py,sha256=a9JWpqiozFEhReH4PwD9HsDs050HERwi9X9LNGdOp0E,5828
|
|
@@ -21,14 +21,14 @@ crawlo/commands/run.py,sha256=gQ14PN3ZxsRNapRsyGZ4qdhbqzh70EnuS2YPaIUA8q0,12828
|
|
|
21
21
|
crawlo/commands/startproject.py,sha256=aqKRJarKqTf5XjJnGXwjRpp0uYF16LreFbwwQLGpK-0,16070
|
|
22
22
|
crawlo/commands/stats.py,sha256=8wTubR1RQ1JPTlpOKArcGcQ39bM-0cuH27lDpndnwPQ,6014
|
|
23
23
|
crawlo/commands/utils.py,sha256=Psfu2tKrmDloMq0WnfXLaxx0lJFitMZ-FWS3HAIrziQ,5382
|
|
24
|
-
crawlo/core/__init__.py,sha256=
|
|
24
|
+
crawlo/core/__init__.py,sha256=Mcf4BKnjz-AP72eLCSh5ryLoZchxJnr7Jqv0W_OpkiI,1278
|
|
25
25
|
crawlo/core/engine.py,sha256=0l7TVNf2R8EHJAZ4ktj71j-qysrq84cYqf_7LEzzYJM,19096
|
|
26
26
|
crawlo/core/processor.py,sha256=wO6DMU-Azr0yaMLJw8LSTG19a6ZAvPuT3J7wNLfbpu4,1577
|
|
27
|
-
crawlo/core/scheduler.py,sha256=
|
|
27
|
+
crawlo/core/scheduler.py,sha256=C2V23SIm1VGdi3A41TThB3zE99QFxTnsJp4SDOvSaBE,12561
|
|
28
28
|
crawlo/data/__init__.py,sha256=8MgDxcMhx-emFARcLAw_ODOZNz0neYBcx7kEbzothd8,116
|
|
29
29
|
crawlo/data/user_agents.py,sha256=6V34lYHREWV5ZR5wH-1pCnr1Y3ZYC7iMLfC6vZHyhZQ,9697
|
|
30
30
|
crawlo/downloader/__init__.py,sha256=PB8oluLFMX2PBmeb3NBKkM6GaceX0ujFId8t2URy1ks,8624
|
|
31
|
-
crawlo/downloader/aiohttp_downloader.py,sha256=
|
|
31
|
+
crawlo/downloader/aiohttp_downloader.py,sha256=nlsq2CE-NcmT7nsNEENWC66YlWDx_JzS-eA-P2cvw_Q,9323
|
|
32
32
|
crawlo/downloader/cffi_downloader.py,sha256=BpA1q6Udz7sSXJ0gX94xGnzy8cdgK-vlr_Q6YA4QIxE,10243
|
|
33
33
|
crawlo/downloader/httpx_downloader.py,sha256=7jfQfvAtfk8yD_mvwUbWLhYOxMM7r1nudiU7m_Jl9wc,12037
|
|
34
34
|
crawlo/downloader/hybrid_downloader.py,sha256=4SzOPEwBlSZVzUAWR3DyxMx2Tsx15YrpBvQS4it4Vps,8028
|
|
@@ -36,7 +36,7 @@ crawlo/downloader/playwright_downloader.py,sha256=Lnc7k5cXhVnURXSxgZFCYMJkBxLg5F
|
|
|
36
36
|
crawlo/downloader/selenium_downloader.py,sha256=B_0muNi-GQ_hgoYHcf7wgu01V68q7xKnSh-0kzlUiio,21036
|
|
37
37
|
crawlo/extension/__init__.py,sha256=7HxWQKBuiVphZUBLIBVCtIjgFIbzTa5QDOQp6WH4HhU,2923
|
|
38
38
|
crawlo/extension/health_check.py,sha256=0GveZgUtFwjYEKlm3qbwIvCmb4FR0qrIKc8cEF1yQV8,5516
|
|
39
|
-
crawlo/extension/log_interval.py,sha256=
|
|
39
|
+
crawlo/extension/log_interval.py,sha256=gWCcUCvMzpmx1IE_HvUGAvhfUIqd-UjARNNQLqvjyow,4403
|
|
40
40
|
crawlo/extension/log_stats.py,sha256=X90Y_E6USAdm55yvRN5t59HNLmyN9QMKglhbPPxtehA,2382
|
|
41
41
|
crawlo/extension/logging_extension.py,sha256=8KT-WJRK5tocS2kBOiSquree53L11qD1vLg-P8ob40U,2354
|
|
42
42
|
crawlo/extension/memory_monitor.py,sha256=4aszl3C0GMQbqFhGZjZq5iQuXQR1sOz06VHjjEHgkyE,4290
|
|
@@ -49,7 +49,7 @@ crawlo/factories/registry.py,sha256=LrtH7wMGQ2ZrswxnHDM9s43ckJ1isJKL7R8uyMQ8hCc,
|
|
|
49
49
|
crawlo/filters/__init__.py,sha256=XC_Q4ykZtSNYizYlAcehVwBBNO3lZ2zuWwafzXiuWyQ,4241
|
|
50
50
|
crawlo/filters/aioredis_filter.py,sha256=ZEApX23-S-7ruO_TSTKI0Noh8SEwjdZznf8TySeEtbQ,9255
|
|
51
51
|
crawlo/filters/memory_filter.py,sha256=mO4oBPV5_uAiBQF3a16tU5tcD8244dOjKoNX_MU8cEw,9292
|
|
52
|
-
crawlo/initialization/__init__.py,sha256=
|
|
52
|
+
crawlo/initialization/__init__.py,sha256=ML9fKhttJC3TRAtp5n9MTevLBH-vdPUXwKihQdxEUlE,1088
|
|
53
53
|
crawlo/initialization/built_in.py,sha256=EkZIPBrqsvFf0CuBL6POk2IJiDFf8q30eRGMqcL2N8M,15333
|
|
54
54
|
crawlo/initialization/context.py,sha256=SL2ge47EmyLHzB5yldISA-xr__ZOV1xnQP_-1RF5v0Y,4722
|
|
55
55
|
crawlo/initialization/core.py,sha256=ZR6veRDkKU5erQGXGKzgX6TU2_i6YkWsuFUeWnOEVjo,6679
|
|
@@ -66,7 +66,7 @@ crawlo/logging/manager.py,sha256=hbdwyFGnnyRFrVDXoqzHs8oERx72NHrf7KqwCf4oPc4,307
|
|
|
66
66
|
crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
|
|
67
67
|
crawlo/middleware/default_header.py,sha256=wQ7BrUHd-hRosFoKsReV9hwNNr_jwK6V0ZfxL6MOGrk,5032
|
|
68
68
|
crawlo/middleware/download_delay.py,sha256=zt9R5g2HWErWA_MAOnGcw_D8l6HD769Kyaw-Hv-vcTc,3438
|
|
69
|
-
crawlo/middleware/middleware_manager.py,sha256=
|
|
69
|
+
crawlo/middleware/middleware_manager.py,sha256=1-qeqlyjBDvLhojt8aOT-EASe9gCcwzJ2q53iRagOHI,6280
|
|
70
70
|
crawlo/middleware/offsite.py,sha256=FIWZvkkzlDJfvQc7Ud7BdfDZ78Sa85qlEEwAR76hSBk,4559
|
|
71
71
|
crawlo/middleware/proxy.py,sha256=NquB6tqHAgHs3-2_1_5220kJYfjNG5JyHRJyo_2j4wo,15636
|
|
72
72
|
crawlo/middleware/request_ignore.py,sha256=xcyZ1c7r_HhbzR3r9pfjsLGW7L7FBVeYvlNt8cpP2wY,2577
|
|
@@ -78,23 +78,23 @@ crawlo/network/__init__.py,sha256=BLPERYPo22g1BXrW--wUnlolrdFUmOPjgOB8XQQJlck,39
|
|
|
78
78
|
crawlo/network/request.py,sha256=9kV-gqb_d6aCsSBAwyzxnP9a70cAViwX8qvpyYV7Ym4,13799
|
|
79
79
|
crawlo/network/response.py,sha256=EZiG4LjuIb7PxdGou4H-oSOQhec1ZdBRTkO-5fl8JTo,12701
|
|
80
80
|
crawlo/pipelines/__init__.py,sha256=lrdVDjeHLNkA4_MAwI1auk_I9xfeU1SlBWXiammb6lc,616
|
|
81
|
-
crawlo/pipelines/bloom_dedup_pipeline.py,sha256=
|
|
81
|
+
crawlo/pipelines/bloom_dedup_pipeline.py,sha256=yNs7ON4icJK0nDGpDyZzghbPyf350bMEi1gOn4r_ya8,5317
|
|
82
82
|
crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
|
|
83
83
|
crawlo/pipelines/csv_pipeline.py,sha256=6FBT2AoU6iNU-5NfgWRq7-JpF9dK2nBokjxx-y4jIas,12174
|
|
84
|
-
crawlo/pipelines/database_dedup_pipeline.py,sha256=
|
|
84
|
+
crawlo/pipelines/database_dedup_pipeline.py,sha256=EfX_y9JiqD5EissBZNWfDYJavMcCYQ72swzfe7RsNZc,7099
|
|
85
85
|
crawlo/pipelines/json_pipeline.py,sha256=vlu1nqbD2mtqtExt9cL5nibx1CwJM1RNqd4WGjZRHAY,8367
|
|
86
|
-
crawlo/pipelines/memory_dedup_pipeline.py,sha256=
|
|
86
|
+
crawlo/pipelines/memory_dedup_pipeline.py,sha256=sHoihpQiMZ7teiH4Ctl8OIPJs1x1bRRgzdy89JXFm7U,3493
|
|
87
87
|
crawlo/pipelines/mongo_pipeline.py,sha256=k7gNqAO-g2MtIfArphC6z5ZzkKVRkBKcv-2ImziPFA0,5706
|
|
88
88
|
crawlo/pipelines/mysql_pipeline.py,sha256=_oRfIvlEiOsTKkr4v-yPTcL8nG9O9coRmke2ZSkkKII,13871
|
|
89
89
|
crawlo/pipelines/pipeline_manager.py,sha256=BX17CU9JK2xJeIdzQ4FeK7kwpwew1k-BEVMk9oviqTQ,3682
|
|
90
|
-
crawlo/pipelines/redis_dedup_pipeline.py,sha256=
|
|
90
|
+
crawlo/pipelines/redis_dedup_pipeline.py,sha256=zNk7WJxtgWNfRzKnyxOM2OyfsKhPNsOmy_8IGT1iZi0,5921
|
|
91
91
|
crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
92
92
|
crawlo/queue/pqueue.py,sha256=j2ISmyays5t1tuI36xM6EcELwSpq2xIjAScSBWSRZms,1220
|
|
93
93
|
crawlo/queue/queue_manager.py,sha256=JfkjtOD04e_OZZvEEvp3O_W3lfGXhHslZHrCgw90amY,20693
|
|
94
94
|
crawlo/queue/redis_priority_queue.py,sha256=Evmo514OFL0a7Xu2SdCiz6klFUGH1gmjlxCc01vX1tQ,15400
|
|
95
95
|
crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
|
|
96
|
-
crawlo/settings/default_settings.py,sha256=
|
|
97
|
-
crawlo/settings/setting_manager.py,sha256=
|
|
96
|
+
crawlo/settings/default_settings.py,sha256=BBJkTYe2ViIuGzr6KtWnJRDe0Yj6dkcBo56VxDeJIkE,12552
|
|
97
|
+
crawlo/settings/setting_manager.py,sha256=hfmZfRcCQjTCSFWOelrzsIjLBIGDPmy_vlWDm6rhHeM,8376
|
|
98
98
|
crawlo/spider/__init__.py,sha256=oi9LEYq9xaCSjktAIRUgjpGQQI7rTtN61ESdHeWb1x4,21224
|
|
99
99
|
crawlo/templates/crawlo.cfg.tmpl,sha256=9BAmwEibS5Tvy6HIcGXPb0BGeuesmibebmTW0iAEkmo,230
|
|
100
100
|
crawlo/templates/run.py.tmpl,sha256=j8xRI4IEI_YiDFvMyfYG0XD2GgBM9eXgSqpvBniferU,927
|
|
@@ -117,7 +117,7 @@ crawlo/tools/authenticated_proxy.py,sha256=L93WeXajIZ3si8xNcE7bBPv34FvqSyTvVfD78
|
|
|
117
117
|
crawlo/tools/data_formatter.py,sha256=NEj3NqPiyG67V6qDgn2KNj9VNHWOLNwX-7p_nad0znc,7583
|
|
118
118
|
crawlo/tools/data_validator.py,sha256=hxPN28YtJDFFLjBBYhDjHmR8ShNTEjgIsv-cmcDKIu8,5310
|
|
119
119
|
crawlo/tools/date_tools.py,sha256=jjP5xA0-aDgm9UIK1RG2qaNagBzHFQ-BBDMo_YzSlLQ,8906
|
|
120
|
-
crawlo/tools/distributed_coordinator.py,sha256=
|
|
120
|
+
crawlo/tools/distributed_coordinator.py,sha256=eUAQudIaz96WGIPVJxF_W0BOKDpRBX3ZmPLsDmiBKXM,12190
|
|
121
121
|
crawlo/tools/encoding_converter.py,sha256=7P9Z7J1ALw_PPNApmjFsHZDpRxgxzduiViluenlSLEU,4043
|
|
122
122
|
crawlo/tools/network_diagnostic.py,sha256=92diB7Ppo_TKGDYCRLzy7uvQMGApgGLwv7P5w4OpCms,12649
|
|
123
123
|
crawlo/tools/request_tools.py,sha256=CjyFBtRQf_vFjQhaVwgHSGai4ZaWS8IIaF1flSfJxDs,2338
|
|
@@ -132,6 +132,7 @@ crawlo/utils/db_helper.py,sha256=3ib5-agrlwf2t5S_QtLRYH75wvJDlYbRqRmDEbpH5Bo,105
|
|
|
132
132
|
crawlo/utils/enhanced_error_handler.py,sha256=hj5AElt3ajfqnP4csQnEfEnzkbIep9k65DNQiCbmTFo,13858
|
|
133
133
|
crawlo/utils/env_config.py,sha256=HbZOEKkeQ0FMdZYJu9SgmSNEmfPJrmAzA7lHu5Du1DA,3937
|
|
134
134
|
crawlo/utils/error_handler.py,sha256=nDfDA99q2sirE2pe7OT2bcA54GqUiAYgtdAh38uFEX4,5290
|
|
135
|
+
crawlo/utils/fingerprint.py,sha256=5XTH8ma0HSHc8arvre_sVv2g0Unxte1KLScmf3bHPDA,3565
|
|
135
136
|
crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
|
|
136
137
|
crawlo/utils/large_scale_config.py,sha256=j7wQ5ty7pQlBRygw2vhRJ7OI19RYBZKPfYMP3WeF2WI,8154
|
|
137
138
|
crawlo/utils/large_scale_helper.py,sha256=Kxdy3WMuqjzQTyCc6z4xEYxXDi4xnYKJzsVwaBYZrrg,12108
|
|
@@ -147,6 +148,15 @@ crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
|
|
|
147
148
|
crawlo/utils/tools.py,sha256=5Uv25Wy4m_ndZY0-n-eX-t3PxvaZ6wR3-Wvx-o7_Vrs,271
|
|
148
149
|
crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
|
|
149
150
|
examples/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
|
|
151
|
+
examples/test_project/__init__.py,sha256=sBO0-NPkGiKKO44U9OYknIdkPGKgJGn7ne2M7v_YOSU,129
|
|
152
|
+
examples/test_project/run.py,sha256=5kADlYJYVp2seoxM5KPCX8qwSB-YzyLf-SGm66FcvyQ,919
|
|
153
|
+
examples/test_project/test_project/__init__.py,sha256=-8hHXHKxttgUB58mL95faujZIsubNXuA_0-VzU-Mmy8,54
|
|
154
|
+
examples/test_project/test_project/items.py,sha256=f4GN_2OChAdDI0zunbWIA6dpP7ou6ySt4mY98GGZzos,293
|
|
155
|
+
examples/test_project/test_project/middlewares.py,sha256=qh_UvYaxH_mK1h8oNL405NQ2SYLEWAKs7r9Fyw-Jw5M,4152
|
|
156
|
+
examples/test_project/test_project/pipelines.py,sha256=k1Z4M5StwGF4eHW8JoLCNLSVi57FiAxRRO-ABFynJ1k,2600
|
|
157
|
+
examples/test_project/test_project/settings.py,sha256=Aidvz43WZYu7LAJ3N9gXdpddWCP_656kIJGJMe7J4NA,6946
|
|
158
|
+
examples/test_project/test_project/spiders/__init__.py,sha256=tWZ6jEMrDw1KoRBw12F54srI1-YX7RNYe0dtU4OP0aQ,203
|
|
159
|
+
examples/test_project/test_project/spiders/of_week_dis.py,sha256=zKulqu_lc7NS0ewDh_MCttwpXMBp0P2fPostkxxM3Xc,5035
|
|
150
160
|
tests/__init__.py,sha256=scL1IPVT1iucth7v8ffrjRdeW7QrC_Y7AMmFVMdTY1Y,129
|
|
151
161
|
tests/advanced_tools_example.py,sha256=7nlFLRVMVYzDz_CAdgQa9fJu7o0M6jBMo7PTvUsRbo0,9065
|
|
152
162
|
tests/authenticated_proxy_example.py,sha256=rsLmILsrf9PpR77ekGi8KpB1dAYZdF26hlxkBjm4rSQ,2913
|
|
@@ -160,9 +170,9 @@ tests/config_validation_demo.py,sha256=5MzW5P7ZX6xoMW_zC6XmIA50KWMTu0iB5H2hTe42S
|
|
|
160
170
|
tests/controlled_spider_example.py,sha256=SP_k4mdKPvD1JCPs9UCm68jcy2Frg84vvXv9-14RC6I,7776
|
|
161
171
|
tests/date_tools_example.py,sha256=x_-duqnVZ-Hrk-SaNplIfcIV6W3c6u6MTxW35u1i0F0,4862
|
|
162
172
|
tests/debug_configure.py,sha256=E-6Djz8kk7tf2pzEqrGdekW2W20vrJeZN7iNm9ArWKk,2144
|
|
163
|
-
tests/debug_framework_logger.py,sha256=
|
|
173
|
+
tests/debug_framework_logger.py,sha256=lAxT1esheIyNtgjXf6VJj8Lv0TNw5KUHdjaKHTYrF7Q,3329
|
|
164
174
|
tests/debug_log_config.py,sha256=F5MrUmpkaD9Dc1eSbajaOrw3nKo9Sp55BTnNxmQkUWc,3588
|
|
165
|
-
tests/debug_log_levels.py,sha256=
|
|
175
|
+
tests/debug_log_levels.py,sha256=39ugttR13gfotsS3I4XVoWRRjTXPpLtuJOyU8V_ijEs,2105
|
|
166
176
|
tests/debug_pipelines.py,sha256=VpUmoYlt6Ci7foIGuQIotUu42xp6TzoA1cBDeagBzDk,2098
|
|
167
177
|
tests/detailed_log_test.py,sha256=M6yXjQypKmIjihgTSBffkgOzC7Nn0_4ZdQLrBN-L8i0,7268
|
|
168
178
|
tests/distributed_test.py,sha256=qZpFAMQTFcg0KUEdp2RUpkuYauSCf4C3lbbosyIDqgw,1759
|
|
@@ -197,6 +207,7 @@ tests/simple_test.py,sha256=kzMspCmfJxdnAIXXJv9tmDW1gpodkD9pznW5vA_gL84,1211
|
|
|
197
207
|
tests/spider_log_timing_test.py,sha256=ngZQ_v3o9oHYcs_BtZgxH1N-N2tZUDPu-cnTnsHEpP8,5396
|
|
198
208
|
tests/test_advanced_tools.py,sha256=3R8EfKVyBHEb6FA5TP3ieaWeHZhobVgSx8t3phipCrE,5250
|
|
199
209
|
tests/test_all_commands.py,sha256=yGPw8zMrB5Z5w5LkaymSzKRLOcZsBPBXLvllCkgEY4I,7488
|
|
210
|
+
tests/test_all_pipeline_fingerprints.py,sha256=4dtn-APwK2JIg-c4J-7u9J1PF9L_pdMh4CmIyVP266I,5175
|
|
200
211
|
tests/test_all_redis_key_configs.py,sha256=SGoip8M7oB2LNWC_31aJ4ECcDRmx0psr7i7DGzuaH7c,5565
|
|
201
212
|
tests/test_authenticated_proxy.py,sha256=s4pr5JyBTHYQgRq_IymiVKE08vyW1MwR27pSwrrVLVk,4198
|
|
202
213
|
tests/test_batch_processor.py,sha256=gMPoQcnUMm2-G_d7Zt9QnrRjCx1urzT31tYqoFNEklc,7034
|
|
@@ -210,7 +221,7 @@ tests/test_controlled_spider_mixin.py,sha256=7t6VGWr6Hxw0xtIFyToLH8_deSagUtsdqSJ
|
|
|
210
221
|
tests/test_crawlo_proxy_integration.py,sha256=SvdBuZjS6N2vuvFkTnc59U5n3dHV3E4dmFayxtmjCm4,2625
|
|
211
222
|
tests/test_date_tools.py,sha256=CQdAmIS6bpAdwQH9ETDH__06l2gGL7EHUQuh7mdTF-A,3930
|
|
212
223
|
tests/test_dedup_fix.py,sha256=9rFzzsDJKQbFaohzKJAlqo3Mm4wFy8-wAm3fWmw8jb4,8568
|
|
213
|
-
tests/test_default_header_middleware.py,sha256=
|
|
224
|
+
tests/test_default_header_middleware.py,sha256=5zYSZmUR_02UChFZfoCqurabNdAZLy7t4yNZ8I7Bw8E,12813
|
|
214
225
|
tests/test_distributed.py,sha256=RQHUpDfRNG2x_1Cdr9DLk25IBcgapm_u0xSBMObE0Xc,1725
|
|
215
226
|
tests/test_double_crawlo_fix.py,sha256=E5NxWHnQkwRTIrJGoag8G29fZqVMnsN6eCPuv17gGq0,7652
|
|
216
227
|
tests/test_double_crawlo_fix_simple.py,sha256=MlWUqo51kOQ7Gu6Neoler8FVyRs0jpmQWoORHMBENz0,4644
|
|
@@ -227,21 +238,26 @@ tests/test_env_config.py,sha256=nfP4nCG9ZHeJUfxo1JKUmiihYbhSeWx_oNW5mMfDHfQ,4746
|
|
|
227
238
|
tests/test_error_handler_compatibility.py,sha256=o5JLLLdo25Sl_3hpMx6I2fqSgZFAcnI4E6Ci-KxAxwA,4129
|
|
228
239
|
tests/test_factories.py,sha256=vXI8tx42iuBivCKQoY2kH7G6c0i_QCmCq77krEgQiGU,8613
|
|
229
240
|
tests/test_final_validation.py,sha256=aAiWLzhDCcv-GEXg9sauaVIfq5rz3s2vm67Gk2_lmBI,4813
|
|
241
|
+
tests/test_fingerprint_consistency.py,sha256=LI0hDokBtW5OJ6kwn9-x1WiiS_euBqXGNsD1iwngBhQ,4821
|
|
242
|
+
tests/test_fingerprint_simple.py,sha256=H-8C-ae8TDx29BY9xQGQzZXbpk6szj-kWK8zknR1jwk,1575
|
|
230
243
|
tests/test_framework_env_usage.py,sha256=HYpTwORXeaJHMffCYAGHGvc_a6ax4lo28xP8BYOaKxk,4098
|
|
231
|
-
tests/test_framework_logger.py,sha256=
|
|
232
|
-
tests/test_framework_startup.py,sha256=
|
|
244
|
+
tests/test_framework_logger.py,sha256=RBwf0aOV1TibMqOajg-KYiq_Oe7gxLfvRAurpyY4rnI,2494
|
|
245
|
+
tests/test_framework_startup.py,sha256=zvpzWwozMxGaCfdEER9A2qJuMYytW5mg3q4T86mJ6to,2203
|
|
233
246
|
tests/test_get_component_logger.py,sha256=f7nmmSGxoD1i3d17FlSicOmMGLTcyJxcujoS5eJFbAI,2202
|
|
247
|
+
tests/test_hash_performance.py,sha256=pHJpxF09JCCxk_32hRGXx221ySQROCx2OK7Yk6-B1bk,3112
|
|
234
248
|
tests/test_integration.py,sha256=OCkjyv76Wop7CrXEko6rfoDsIK6SESA18KgCaTwL7Q4,4670
|
|
235
249
|
tests/test_item_dedup_redis_key.py,sha256=QxLuXHUx0xqT6y7lQzOWcrLkRui7Qs7C6NgRvjzIypA,3720
|
|
236
250
|
tests/test_large_scale_config.py,sha256=wyeMOMjGYhbZ6mrcnLH3Eh6GfspJwhavwWoyOy1y90c,4184
|
|
237
251
|
tests/test_large_scale_helper.py,sha256=spvL0MPyXMAUDpzI2fY6-OQdSxOHtgJ1yuSUIbydyHY,8136
|
|
238
252
|
tests/test_logging_system.py,sha256=_LRdgprZFrChA26JJgkjVyf6S6qRIyi6BRajK13l_Q8,8924
|
|
239
|
-
tests/test_mode_change.py,sha256=
|
|
253
|
+
tests/test_mode_change.py,sha256=zNVdmvFOeKD6LMVnCwjtrIyhTw95n1ZQJPjNhOanUHw,2593
|
|
240
254
|
tests/test_mode_consistency.py,sha256=YJXf0SqAYVnFXy8eeBLC-zGTFAyO2fnsR4qLB76gZts,1225
|
|
241
|
-
tests/test_offsite_middleware.py,sha256=
|
|
255
|
+
tests/test_offsite_middleware.py,sha256=K5NREsWwpJda4OWgf9J4NXIqpbG9jNlgpbALevuGLzc,10358
|
|
256
|
+
tests/test_offsite_middleware_simple.py,sha256=CVBS7dvjpJ_fWBae7t9GtAQF_etn0U_cZ4UQFdu8eh8,7715
|
|
242
257
|
tests/test_parsel.py,sha256=KYskaN_4HBc1XDTltjVo12v1i7JAThB2UIwcWZ-mwbY,672
|
|
243
258
|
tests/test_performance.py,sha256=gOJ1EpU9uGynIxETLAroe98OA4QPcX1wchCDJoO41Kc,11130
|
|
244
259
|
tests/test_performance_monitor.py,sha256=5oEHPJfjZXdtDK2nW_2MuGbOFgTTZyEhLapV9Ug1iHY,4072
|
|
260
|
+
tests/test_pipeline_fingerprint_consistency.py,sha256=WV-1UC6PSXb43B5ATyRDSjW9FcHSBMmHDCFFJwQqE_s,2761
|
|
245
261
|
tests/test_proxy_api.py,sha256=dVqGElyL3K0_9IqkXzn7Ka2jSuhvYfR1BfZgyVukNM0,10749
|
|
246
262
|
tests/test_proxy_health_check.py,sha256=xo3QMP1YNw9hu7JDHZOYCUZmFFKLJpHSh4SbxXhCRPQ,1091
|
|
247
263
|
tests/test_proxy_middleware.py,sha256=qm2B0lepBZqzUpXNi4t1gjrQxUV4MQ2wvpmcaYV6O5A,3900
|
|
@@ -286,8 +302,8 @@ tests/verify_distributed.py,sha256=krnYYA5Qx9xXDMWc9YF5DxPSplGvawDg2n0l-3CAqoM,3
|
|
|
286
302
|
tests/verify_log_fix.py,sha256=TD7M1R22NxLqQPufvgE-H33u9tUjyz-rSR2ayIXozRU,4225
|
|
287
303
|
tests/scrapy_comparison/ofweek_scrapy.py,sha256=2Hvpi6DRTubUxBy6RyJApQxMQONPLc1zWjKTQO_i5U4,5652
|
|
288
304
|
tests/scrapy_comparison/scrapy_test.py,sha256=5sw7jOHhaTmQ8bsUd1TiolAUTRQYQOe-f49HPfysqbI,5466
|
|
289
|
-
crawlo-1.4.
|
|
290
|
-
crawlo-1.4.
|
|
291
|
-
crawlo-1.4.
|
|
292
|
-
crawlo-1.4.
|
|
293
|
-
crawlo-1.4.
|
|
305
|
+
crawlo-1.4.2.dist-info/METADATA,sha256=IwT9XfhtTQOUvMyKss9cne9PjX5RuWpiOkwMVu6zV6c,33235
|
|
306
|
+
crawlo-1.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
307
|
+
crawlo-1.4.2.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
|
|
308
|
+
crawlo-1.4.2.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
|
|
309
|
+
crawlo-1.4.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
test_project 项目运行脚本
|
|
5
|
+
============================
|
|
6
|
+
基于 Crawlo 框架的简化爬虫启动器。
|
|
7
|
+
|
|
8
|
+
框架会自动处理爬虫模块的导入和注册,用户无需手动导入。
|
|
9
|
+
只需指定spider_modules参数,框架会自动扫描并导入所有爬虫。
|
|
10
|
+
"""
|
|
11
|
+
import sys
|
|
12
|
+
import asyncio
|
|
13
|
+
|
|
14
|
+
from crawlo.crawler import CrawlerProcess
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main():
|
|
18
|
+
"""主函数:运行爬虫"""
|
|
19
|
+
try:
|
|
20
|
+
# 指定爬虫模块路径,框架会自动导入并注册所有爬虫
|
|
21
|
+
spider_modules = ['test_project.spiders']
|
|
22
|
+
process = CrawlerProcess(spider_modules=spider_modules)
|
|
23
|
+
|
|
24
|
+
# TODO 运行指定的爬虫
|
|
25
|
+
asyncio.run(process.crawl('of_week_dis'))
|
|
26
|
+
|
|
27
|
+
except Exception as e:
|
|
28
|
+
print(f"❌ 运行失败: {e}")
|
|
29
|
+
import traceback
|
|
30
|
+
traceback.print_exc()
|
|
31
|
+
sys.exit(1)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
if __name__ == '__main__':
|
|
35
|
+
main()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
test_project.items
|
|
4
|
+
======================
|
|
5
|
+
定义你抓取的数据结构。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from crawlo.items import Item, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ExampleItem(Item):
|
|
12
|
+
"""
|
|
13
|
+
一个示例数据项。
|
|
14
|
+
"""
|
|
15
|
+
id = Field()
|
|
16
|
+
# price = Field()
|
|
17
|
+
# description = Field()
|
|
18
|
+
pass
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
test_project.middlewares
|
|
4
|
+
============================
|
|
5
|
+
自定义中间件,用于在请求/响应/异常处理过程中插入自定义逻辑。
|
|
6
|
+
|
|
7
|
+
这是一个简单的示例中间件,您可以根据需要添加更多中间件。
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import random
|
|
11
|
+
from crawlo import Request, Response
|
|
12
|
+
from crawlo.utils.log import get_logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ExampleMiddleware:
|
|
16
|
+
"""
|
|
17
|
+
示例中间件,演示如何处理请求、响应和异常。
|
|
18
|
+
|
|
19
|
+
此中间件会:
|
|
20
|
+
1. 为请求添加随机 User-Agent
|
|
21
|
+
2. 记录请求和响应信息
|
|
22
|
+
3. 处理异常情况
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def __init__(self):
|
|
26
|
+
self.logger = get_logger(self.__class__.__name__)
|
|
27
|
+
self.user_agents = [
|
|
28
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
|
|
29
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
|
|
30
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
|
|
31
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:135.0) Gecko/20100101 Firefox/135.0',
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
def process_request(self, request, spider):
|
|
35
|
+
"""
|
|
36
|
+
在请求被下载器执行前调用。
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
request: 请求对象
|
|
40
|
+
spider: 爬虫实例
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
None: 继续处理请求
|
|
44
|
+
Response: 返回响应对象(短路处理)
|
|
45
|
+
Request: 返回新请求对象(替换原请求)
|
|
46
|
+
"""
|
|
47
|
+
# 为请求添加随机 User-Agent
|
|
48
|
+
if 'User-Agent' not in request.headers:
|
|
49
|
+
ua = random.choice(self.user_agents)
|
|
50
|
+
request.headers['User-Agent'] = ua
|
|
51
|
+
self.logger.debug(f"为请求 {request.url} 设置 User-Agent: {ua[:50]}...")
|
|
52
|
+
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
def process_response(self, request, response, spider):
|
|
56
|
+
"""
|
|
57
|
+
在响应被 Spider 处理前调用。
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
request: 原始请求对象
|
|
61
|
+
response: 响应对象
|
|
62
|
+
spider: 爬虫实例
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
Response: 处理后的响应对象
|
|
66
|
+
"""
|
|
67
|
+
# 记录响应信息
|
|
68
|
+
self.logger.info(f"收到响应: {request.url} - 状态码: {response.status_code}")
|
|
69
|
+
|
|
70
|
+
# 可以在这里处理特殊状态码
|
|
71
|
+
if response.status_code == 403:
|
|
72
|
+
self.logger.warning(f"访问被拒绝: {request.url}")
|
|
73
|
+
|
|
74
|
+
return response
|
|
75
|
+
|
|
76
|
+
def process_exception(self, request, exception, spider):
|
|
77
|
+
"""
|
|
78
|
+
在下载或处理过程中发生异常时调用。
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
request: 请求对象
|
|
82
|
+
exception: 异常对象
|
|
83
|
+
spider: 爬虫实例
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
None: 异常将继续传播
|
|
87
|
+
Response: 返回响应对象(处理异常)
|
|
88
|
+
Request: 返回新请求对象(重试请求)
|
|
89
|
+
"""
|
|
90
|
+
self.logger.error(f"请求异常: {request.url} - {exception}")
|
|
91
|
+
return None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ======================== 使用说明 ========================
|
|
95
|
+
#
|
|
96
|
+
# 在 settings.py 中启用中间件:
|
|
97
|
+
# MIDDLEWARES = [
|
|
98
|
+
# 'test_project.middlewares.ExampleMiddleware',
|
|
99
|
+
# ]
|
|
100
|
+
#
|
|
101
|
+
# 您可以根据需要添加更多中间件,例如:
|
|
102
|
+
# 1. 请求处理中间件(修改请求头、设置代理等)
|
|
103
|
+
# 2. 响应处理中间件(解析、过滤等)
|
|
104
|
+
# 3. 异常处理中间件(重试、记录等)
|
|
105
|
+
#
|
|
106
|
+
# 每个中间件可以实现以下方法:
|
|
107
|
+
# - process_request: 处理请求
|
|
108
|
+
# - process_response: 处理响应
|
|
109
|
+
# - process_exception: 处理异常
|
|
110
|
+
#
|
|
111
|
+
# 注意:Crawlo框架提供了许多内置中间件,您可以直接使用:
|
|
112
|
+
# - DownloadDelayMiddleware: 控制请求延迟
|
|
113
|
+
# - ResponseCodeMiddleware: 处理HTTP状态码并记录统计信息
|
|
114
|
+
# - ResponseFilterMiddleware: 过滤特定状态码的响应
|
|
115
|
+
# - DefaultHeaderMiddleware: 添加默认请求头
|
|
116
|
+
# - ProxyMiddleware: 设置代理
|
|
117
|
+
# - RetryMiddleware: 处理重试逻辑
|
|
118
|
+
# - OffsiteMiddleware: 过滤站外请求
|
|
119
|
+
# ======================== 使用说明 ========================
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
test_project.pipelines
|
|
4
|
+
==========================
|
|
5
|
+
数据管道,用于处理 Spider 返回的 Item。
|
|
6
|
+
例如:清理、验证、去重、保存到数据库等。
|
|
7
|
+
|
|
8
|
+
这是一个简单的示例管道,您可以根据需要添加更多管道。
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from crawlo.exceptions import DropItem
|
|
13
|
+
from crawlo.utils.log import get_logger
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ExamplePipeline:
|
|
17
|
+
"""
|
|
18
|
+
示例管道,演示如何处理数据项。
|
|
19
|
+
|
|
20
|
+
此管道会:
|
|
21
|
+
1. 验证必要字段
|
|
22
|
+
2. 清理数据
|
|
23
|
+
3. 添加时间戳
|
|
24
|
+
4. 记录处理日志
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
def __init__(self):
|
|
28
|
+
self.logger = get_logger(self.__class__.__name__)
|
|
29
|
+
self.item_count = 0
|
|
30
|
+
|
|
31
|
+
def process_item(self, item, spider):
|
|
32
|
+
"""
|
|
33
|
+
处理数据项。
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
item: 要处理的数据项
|
|
37
|
+
spider: 爬虫实例
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
处理后的数据项
|
|
41
|
+
|
|
42
|
+
Raises:
|
|
43
|
+
DropItem: 如果数据项无效则抛出此异常
|
|
44
|
+
"""
|
|
45
|
+
# 验证必要字段
|
|
46
|
+
if not item.get('title') or not item.get('url'):
|
|
47
|
+
raise DropItem("缺少必要字段: title 或 url")
|
|
48
|
+
|
|
49
|
+
# 数据清理
|
|
50
|
+
item['title'] = str(item['title']).strip()
|
|
51
|
+
|
|
52
|
+
# 添加处理时间戳
|
|
53
|
+
item['processed_at'] = datetime.now().isoformat()
|
|
54
|
+
|
|
55
|
+
# 计数器
|
|
56
|
+
self.item_count += 1
|
|
57
|
+
|
|
58
|
+
# 记录日志
|
|
59
|
+
self.logger.info(f"处理第 {self.item_count} 个数据项: {item['title']}")
|
|
60
|
+
|
|
61
|
+
return item
|
|
62
|
+
|
|
63
|
+
def open_spider(self, spider):
|
|
64
|
+
"""
|
|
65
|
+
爬虫启动时调用。
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
spider: 爬虫实例
|
|
69
|
+
"""
|
|
70
|
+
self.logger.info(f"管道已启动,准备处理爬虫 '{spider.name}' 的数据")
|
|
71
|
+
|
|
72
|
+
def close_spider(self, spider):
|
|
73
|
+
"""
|
|
74
|
+
爬虫关闭时调用。
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
spider: 爬虫实例
|
|
78
|
+
"""
|
|
79
|
+
self.logger.info(f"管道已关闭,共处理了 {self.item_count} 个数据项")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ======================== 使用说明 ========================
|
|
83
|
+
#
|
|
84
|
+
# 在 settings.py 中启用管道:
|
|
85
|
+
# PIPELINES = [
|
|
86
|
+
# 'test_project.pipelines.ExamplePipeline',
|
|
87
|
+
# ]
|
|
88
|
+
#
|
|
89
|
+
# 您可以根据需要添加更多管道,例如:
|
|
90
|
+
# 1. 数据验证管道
|
|
91
|
+
# 2. 去重管道
|
|
92
|
+
# 3. 数据存储管道(数据库、文件等)
|
|
93
|
+
# 4. 数据转换管道
|
|
94
|
+
#
|
|
95
|
+
# 每个管道都应该实现 process_item 方法,
|
|
96
|
+
# 可选实现 open_spider 和 close_spider 方法。
|
|
97
|
+
# ======================== 使用说明 ========================
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# -*- coding: UTF-8 -*-
"""
test_project project configuration (distributed edition).
=============================
Distributed crawler configuration for a project built on the Crawlo framework.
Intended for large-scale data collection and multi-node deployment.
"""

import os

# ============================== Project basics ==============================
PROJECT_NAME = 'test_project'

# ============================== Run mode ==============================
# 'distributed' mode schedules work through the shared Redis queue configured below.
RUN_MODE = 'distributed'

# ============================== Concurrency ==============================
CONCURRENCY = 16            # concurrent download slots
MAX_RUNNING_SPIDERS = 5     # maximum number of spiders running at once
DOWNLOAD_DELAY = 1.0        # delay between requests, in seconds

# ============================== Downloader ==============================
# Available downloaders:
# DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'
# DOWNLOADER = 'crawlo.downloader.httpx_downloader.HttpXDownloader'
# DOWNLOADER = 'crawlo.downloader.cffi_downloader.CurlCffiDownloader'
DOWNLOADER = 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader'

# ============================== Queue ==============================
QUEUE_TYPE = 'redis'
# When using the Redis queue, the queue name can be customized.
# Queue names follow the unified naming convention: crawlo:{PROJECT_NAME}:queue:requests
# SCHEDULER_QUEUE_NAME = f'crawlo:{PROJECT_NAME}:queue:requests'

# ============================== Dedup filter ==============================
FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'

# ============================== Default dedup pipeline ==============================
DEFAULT_DEDUP_PIPELINE = 'crawlo.pipelines.redis_dedup_pipeline.RedisDedupPipeline'

# ============================== Spider modules ==============================
SPIDER_MODULES = ['test_project.spiders']

# ============================== Middlewares ==============================
# MIDDLEWARES = [
#     'crawlo.middleware.simple_proxy.SimpleProxyMiddleware',
# ]

# ============================== Default request headers ==============================
# Default headers used by DefaultHeaderMiddleware.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
}

# ============================== Allowed domains ==============================
# Domains permitted by OffsiteMiddleware.
# ALLOWED_DOMAINS = ['example.com']

# ============================== Item pipelines ==============================
# PIPELINES = [
#     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # MySQL storage (async, via asyncmy)
# ]

# ============================== Extensions ==============================
# EXTENSIONS = [
#     'crawlo.extension.log_interval.LogIntervalExtension',
#     'crawlo.extension.log_stats.LogStats',
#     'crawlo.extension.logging_extension.CustomLoggerExtension',
# ]

# ============================== Logging ==============================
LOG_LEVEL = 'INFO'
LOG_FILE = 'logs/test_project.log'
LOG_ENCODING = 'utf-8'  # explicit log-file encoding
STATS_DUMP = True

# ============================== Output ==============================
OUTPUT_DIR = 'output'

# ============================== Redis ==============================
# Connection parameters are taken from the environment, with local defaults.
REDIS_HOST = os.getenv('REDIS_HOST', '127.0.0.1')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '')
REDIS_DB = int(os.getenv('REDIS_DB', 0))

# Build the connection URL depending on whether a password is configured.
if REDIS_PASSWORD:
    REDIS_URL = f'redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'
else:
    REDIS_URL = f'redis://{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}'

# ============================== MySQL ==============================
MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
MYSQL_PORT = int(os.getenv('MYSQL_PORT', 3306))
MYSQL_USER = os.getenv('MYSQL_USER', 'root')
MYSQL_PASSWORD = os.getenv('MYSQL_PASSWORD', '123456')
MYSQL_DB = os.getenv('MYSQL_DB', 'test_project')
MYSQL_TABLE = 'test_project_data'
MYSQL_BATCH_SIZE = 100
MYSQL_USE_BATCH = True  # whether batched inserts are enabled

# ============================== MongoDB ==============================
MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017')
MONGO_DATABASE = 'test_project_db'
MONGO_COLLECTION = 'test_project_items'
MONGO_MAX_POOL_SIZE = 200
MONGO_MIN_POOL_SIZE = 20
MONGO_BATCH_SIZE = 100  # number of documents per batched insert
MONGO_USE_BATCH = True  # whether batched inserts are enabled

# ============================== Proxy ==============================
# Proxy support is disabled by default; enable and configure it in the
# project settings file when needed.
PROXY_ENABLED = False  # whether proxying is enabled

# Simple proxy configuration (for SimpleProxyMiddleware).
PROXY_LIST = []  # proxy list, e.g.: ["http://proxy1:8080", "http://proxy2:8080"]

# Advanced proxy configuration (for ProxyMiddleware).
PROXY_API_URL = ""  # proxy-fetch API endpoint (replace with a real address)

# Proxy extraction method (field path or function).
# Example: "proxy" matches {"proxy": "http://1.1.1.1:8080"}
# Example: "data.proxy" matches {"data": {"proxy": "http://1.1.1.1:8080"}}
PROXY_EXTRACTOR = "proxy"

# Proxy refresh control.
PROXY_REFRESH_INTERVAL = 60  # proxy refresh interval (seconds)
PROXY_API_TIMEOUT = 10  # timeout for proxy API requests (seconds)

# ============================== Curl-Cffi specifics ==============================
# Browser fingerprint emulation (only effective with the CurlCffi downloader).
CURL_BROWSER_TYPE = "chrome"  # options: chrome, edge, safari, firefox, or a version such as chrome136

# Custom browser-version mapping (overrides the default behavior).
CURL_BROWSER_VERSION_MAP = {
    "chrome": "chrome136",
    "edge": "edge101",
    "safari": "safari184",
    "firefox": "firefox135",
}

# ============================== Downloader tuning ==============================
# Downloader health checks.
DOWNLOADER_HEALTH_CHECK = True  # whether downloader health checks are enabled
HEALTH_CHECK_INTERVAL = 60  # health-check interval (seconds)

# Request statistics.
REQUEST_STATS_ENABLED = True  # whether request statistics are enabled
STATS_RESET_ON_START = False  # whether statistics are reset on startup

# HttpX downloader specifics.
HTTPX_HTTP2 = True  # whether HTTP/2 support is enabled
HTTPX_FOLLOW_REDIRECTS = True  # whether redirects are followed automatically

# AioHttp downloader specifics.
AIOHTTP_AUTO_DECOMPRESS = True  # whether responses are decompressed automatically
AIOHTTP_FORCE_CLOSE = False  # whether connections are force-closed

# Shared connection tuning.
CONNECTION_TTL_DNS_CACHE = 300  # DNS cache TTL (seconds)
CONNECTION_KEEPALIVE_TIMEOUT = 15  # keep-alive timeout (seconds)

# ============================== Memory monitoring ==============================
# The memory-monitor extension is disabled by default; enable it in the
# project settings file when needed.
MEMORY_MONITOR_ENABLED = False  # whether memory monitoring is enabled
MEMORY_MONITOR_INTERVAL = 60  # memory-monitor check interval (seconds)
MEMORY_WARNING_THRESHOLD = 80.0  # memory-usage warning threshold (percent)
MEMORY_CRITICAL_THRESHOLD = 90.0  # memory-usage critical threshold (percent)