crawlo 1.4.5-py3-none-any.whl → 1.4.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic.
- crawlo/__version__.py +1 -1
- crawlo/downloader/cffi_downloader.py +3 -1
- crawlo/middleware/proxy.py +171 -348
- crawlo/pipelines/mysql_pipeline.py +339 -188
- crawlo/settings/default_settings.py +38 -30
- crawlo/stats_collector.py +10 -1
- crawlo/templates/project/settings.py.tmpl +10 -55
- crawlo/templates/project/settings_distributed.py.tmpl +20 -22
- crawlo/templates/project/settings_gentle.py.tmpl +5 -0
- crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
- crawlo/templates/project/settings_minimal.py.tmpl +25 -1
- crawlo/templates/project/settings_simple.py.tmpl +5 -0
- crawlo/templates/run.py.tmpl +1 -8
- crawlo/templates/spider/spider.py.tmpl +5 -108
- crawlo/utils/db_helper.py +11 -5
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/METADATA +1 -1
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD +43 -29
- tests/authenticated_proxy_example.py +10 -6
- tests/explain_mysql_update_behavior.py +77 -0
- tests/simulate_mysql_update_test.py +140 -0
- tests/test_asyncmy_usage.py +57 -0
- tests/test_crawlo_proxy_integration.py +8 -2
- tests/test_downloader_proxy_compatibility.py +24 -20
- tests/test_mysql_pipeline_config.py +165 -0
- tests/test_mysql_pipeline_error.py +99 -0
- tests/test_mysql_pipeline_init_log.py +83 -0
- tests/test_mysql_pipeline_integration.py +133 -0
- tests/test_mysql_pipeline_refactor.py +144 -0
- tests/test_mysql_pipeline_refactor_simple.py +86 -0
- tests/test_mysql_pipeline_robustness.py +196 -0
- tests/test_mysql_pipeline_types.py +89 -0
- tests/test_mysql_update_columns.py +94 -0
- tests/test_proxy_middleware.py +104 -8
- tests/test_proxy_middleware_enhanced.py +1 -5
- tests/test_proxy_middleware_integration.py +7 -2
- tests/test_proxy_middleware_refactored.py +25 -2
- tests/test_proxy_only.py +84 -0
- tests/test_proxy_with_downloader.py +153 -0
- tests/test_real_scenario_proxy.py +17 -17
- tests/verify_mysql_warnings.py +110 -0
- crawlo/middleware/simple_proxy.py +0 -65
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/WHEEL +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/entry_points.txt +0 -0
- {crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/top_level.txt +0 -0
{crawlo-1.4.5.dist-info → crawlo-1.4.6.dist-info}/RECORD

@@ -1,5 +1,5 @@
 crawlo/__init__.py,sha256=n5vFwi0iuYrpAIyoNJZzWHV1gvF-vh-Yze3jiuwEXqM,2180
-crawlo/__version__.py,sha256=
+crawlo/__version__.py,sha256=C1PbImXkZPhAW7rUcTV61OKrbIa2DpoQJ2Kmga3lWwM,23
 crawlo/cli.py,sha256=AQnAB5NMI-Ic1VPw_Jjng8L4AI4-wMozOwzE6CfXkZU,2402
 crawlo/config.py,sha256=EQIT7WpkXAlr2ocd5SYJYOKTSWUlQx2AkTHX7ErEWxw,9798
 crawlo/config_validator.py,sha256=oY4-2bwXUlwHAnGgkI-EznviDfML_dcxbWSGXNSxC2k,11516
@@ -10,7 +10,7 @@ crawlo/framework.py,sha256=9gP6VN4MHqutGXaxnwpNMSULfVYbNp906UdZiJGywlQ,9458
 crawlo/interfaces.py,sha256=q1vwMSiZLfLpPhFa9Y0hAcjYEKvLkW2fZ2fmoAZ-5TE,653
 crawlo/mode_manager.py,sha256=e8QmwsnndFx_hGME_7w-hazKo0GOYjUr-7FBf7dWxgc,8903
 crawlo/project.py,sha256=9wnlHd-rYAC3TT1Fc1ftyUBx7mbDT6TQCqoaIP6N3iA,13998
-crawlo/stats_collector.py,sha256=
+crawlo/stats_collector.py,sha256=mzNHu628a31PwqpkBXN90PhD-xhMSunNNxAm-ney5JU,2803
 crawlo/subscriber.py,sha256=h8fx69NJZeWem0ZkCmfHAi2kgfDGFObHpwN0aGNUM6Y,5115
 crawlo/task_manager.py,sha256=Ic6PFUqZOhLXuZ_UEk_8Neb9FmqYv8I2RzV3vLzFNSU,5966
 crawlo/commands/__init__.py,sha256=orvY6wLOBwGUEJKeF3h_T1fxj8AaQLjngBDd-3xKOE4,392
@@ -30,7 +30,7 @@ crawlo/data/__init__.py,sha256=UPqgioMdu3imSUmpLWzVlpvoBnEfaPSAT-crCcWd7iw,121
 crawlo/data/user_agents.py,sha256=zjjFkldQkqtrn45j0WZplaZLannPxZDeAU0JofxQcBc,9891
 crawlo/downloader/__init__.py,sha256=P5pl-BGYCkdKWgoIewcYPz7ocVLixVfYuCDFmYyuqIw,8966
 crawlo/downloader/aiohttp_downloader.py,sha256=-dIFucMOQhiiEmtgEpG2Lqh1vF-PvDddbIrZ8Hge0Ig,9556
-crawlo/downloader/cffi_downloader.py,sha256=
+crawlo/downloader/cffi_downloader.py,sha256=aKmrooictEFNfsmM3t4dpkGEALI85E7eLOAxm4LPQAU,10585
 crawlo/downloader/httpx_downloader.py,sha256=MpgDeIdGqNsiSKLOEDBnr5Z0eUbhHnqVEmAuoIfJmFU,12296
 crawlo/downloader/hybrid_downloader.py,sha256=dNnFeegRnyLaOxTWI6XrWKqqVPx80AZBZNgmrcKRVBM,8240
 crawlo/downloader/playwright_downloader.py,sha256=L-TVzG7cYfuBlqW0XSZuz5C_r9fpJrmYNcoQ-cDEna4,16663
@@ -72,12 +72,11 @@ crawlo/middleware/default_header.py,sha256=Pw-ev8ffi16GeCh84R5L3hAZgp3G1QXS-H5kV
 crawlo/middleware/download_delay.py,sha256=2iWnJFtWDlqDy5MsAob8TPiJQoiz9v21yatkBI0eptg,3542
 crawlo/middleware/middleware_manager.py,sha256=H_o0nwo_xQ8aSRnnvEs2Ho3fS-3WNi_5AjChhqvRYnk,6645
 crawlo/middleware/offsite.py,sha256=4tUkPqXMMXsi1WwYnJ_e7wMd6sRgK19QHRCYq8-w8jk,4682
-crawlo/middleware/proxy.py,sha256=
+crawlo/middleware/proxy.py,sha256=jfaM4gL78ga_F7LN891dULjjO2zqFmulwQMDs5eJD6k,9591
 crawlo/middleware/request_ignore.py,sha256=7qdX4zAimjSGwdod_aWUbOTfzLBWZ5KzLVFchGMCxCI,2663
 crawlo/middleware/response_code.py,sha256=d5t0hmP8QliuvvtFOqW-ogCBtZxg2eyjsOtlQAEUxM8,4533
 crawlo/middleware/response_filter.py,sha256=tVGr06bfJBR3xAHI2G5c3WimFsGHu8qoJtDcsVuCATU,4384
 crawlo/middleware/retry.py,sha256=Acfo95B9wF8fQTCQIqluZOS2hHdnknQu_FOHvpGKJp0,4248
-crawlo/middleware/simple_proxy.py,sha256=rQ4RkqewGvDRCw021nGrg8ngkBzg3wqrEVqvSmBgQ6M,2256
 crawlo/network/__init__.py,sha256=bvEnpEUBZJ79URfNZbsHhsBKna54hM2-x_BV8eotTA4,418
 crawlo/network/request.py,sha256=e6-YLgK7SU8D19n21mQwqt_b_aeRVJFOgWPIBPal2ys,14178
 crawlo/network/response.py,sha256=-URnNc_J7qBSG19uJbfuF6A_14MHLOtY78FvcZDzbsI,23418
@@ -89,7 +88,7 @@ crawlo/pipelines/database_dedup_pipeline.py,sha256=IxahtD_mhni-Y21_idOMX58_Htf46
 crawlo/pipelines/json_pipeline.py,sha256=wrCsh8YInmcPLAkhPrHObMx89VZfhf-c7qRrYsTixPE,8585
 crawlo/pipelines/memory_dedup_pipeline.py,sha256=lKkYPu6vkpPjfQ1-xOLvPFT4VdTI8QVx0yjqtVR0ZB0,3598
 crawlo/pipelines/mongo_pipeline.py,sha256=PohTKTGw3QRvuP-T6SrquwW3FAHSno8jQ2D2cH_d75U,5837
-crawlo/pipelines/mysql_pipeline.py,sha256=
+crawlo/pipelines/mysql_pipeline.py,sha256=jlTP1X5QMrSVZjLD4lMS1BUTz-x6bagUEODddvHI2Vg,23702
 crawlo/pipelines/pipeline_manager.py,sha256=_DtWfxcTinIf5ApzUOVjZksd2tPbc7qeKi92IVd_kbs,4387
 crawlo/pipelines/redis_dedup_pipeline.py,sha256=RB1kXLr8ZuWNrgZKYwt--tlmnWsQTbuwTsSt3pafol8,6077
 crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -97,24 +96,24 @@ crawlo/queue/pqueue.py,sha256=bbgd3l1VfqYXfz-4VFaiWLmJit1LdB3qHalCtNqyrqI,1210
 crawlo/queue/queue_manager.py,sha256=8rKygMxr6DgSjnGsKFmvlTI5XAARvQIN_ENkAruHGXs,21532
 crawlo/queue/redis_priority_queue.py,sha256=vLvg2toKaRrXD1QyEdu1ZjTmANv7clFaBF7mCtstBmI,15995
 crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
-crawlo/settings/default_settings.py,sha256=
+crawlo/settings/default_settings.py,sha256=TvtXgLzgc9_j_ITt8_xYhag29k6dCJiPU0Yq-snMkt4,12704
 crawlo/settings/setting_manager.py,sha256=yI1tGaludevxKGGZO3Pn4aYofrg2cwYwvMZCFC5PPZw,8595
 crawlo/spider/__init__.py,sha256=QGhe_yNsnfnCF3G9nSoWEw23b8SkP5oSFU5W79C5DzI,21881
 crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
-crawlo/templates/run.py.tmpl,sha256=
+crawlo/templates/run.py.tmpl,sha256=1ge0XILc3O5u7S8rsyg_rpe2B2ULokJcrKRVHMwPKj0,511
 crawlo/templates/spiders_init.py.tmpl,sha256=p6UK8KWr8FDydNxiAh6Iz29MY5WmgXIkf2z-buOGhOM,354
 crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
 crawlo/templates/project/items.py.tmpl,sha256=hpQ2AfUmhddnzMuKM5LF6t44dOfFXwJRAZlWFKUFOZw,343
 crawlo/templates/project/middlewares.py.tmpl,sha256=eEobZl8g_0DtiwLYbirQULqOacH-yUrrs4PUrGcJ2UE,1098
 crawlo/templates/project/pipelines.py.tmpl,sha256=7BeaQDMHbIjhKzRtzlCMiFlU8xgMzDs2PIHq3EVUAlQ,887
-crawlo/templates/project/settings.py.tmpl,sha256=
-crawlo/templates/project/settings_distributed.py.tmpl,sha256=
-crawlo/templates/project/settings_gentle.py.tmpl,sha256=
-crawlo/templates/project/settings_high_performance.py.tmpl,sha256=
-crawlo/templates/project/settings_minimal.py.tmpl,sha256=
-crawlo/templates/project/settings_simple.py.tmpl,sha256=
+crawlo/templates/project/settings.py.tmpl,sha256=fYK2NCJOc_jVRraKkEzH8beyax16KgNa-9s6TsQrdpI,3606
+crawlo/templates/project/settings_distributed.py.tmpl,sha256=ULXyi5GDsZggk1Z4SRkalm2g7kJQx9ul6bCARN2I-TM,5566
+crawlo/templates/project/settings_gentle.py.tmpl,sha256=NZjSqAqWmYlNE15Zt6-wY4rtxp7ID6HFUafoOvt7VAE,6039
+crawlo/templates/project/settings_high_performance.py.tmpl,sha256=QYN4hJqvGmL7oayJjLcx4Mr3jedqRSvdlWkivom2M2o,6129
+crawlo/templates/project/settings_minimal.py.tmpl,sha256=8XS_ButRDJxYRQSRHTc_l8ej2DbUnR0j891m0j-gjTY,3122
+crawlo/templates/project/settings_simple.py.tmpl,sha256=OmL4GCPpFseRIG0CgL7625IWipc6vG_Da5tefXv_MD0,5891
 crawlo/templates/project/spiders/__init__.py.tmpl,sha256=llhcIItXpm0TlEeumeLwp4fcYv2NHl8Iru7tLhDhxiE,216
-crawlo/templates/spider/spider.py.tmpl,sha256=
+crawlo/templates/spider/spider.py.tmpl,sha256=4E4DDoOfI0vN_zLjfmMX_QNmWCx8EbrOKWBg6zozVqs,1065
 crawlo/tools/__init__.py,sha256=sXDMZNP6EwZIFivGcRthxqD1DFMMS8UOJvULAzHD-w4,3927
 crawlo/tools/authenticated_proxy.py,sha256=ULCK0Cc9F2rGhRqu6kzKBdxzK9v2n1CsatSQ_PMxpAg,7272
 crawlo/tools/data_formatter.py,sha256=iBDHpZBZvn9O7pLkTQilE1TzYJQEc3z3f6HXoVus0f0,7808
@@ -130,7 +129,7 @@ crawlo/tools/text_cleaner.py,sha256=UrMGcgRnJaufjmDKIDsRYKMA8znCAArHDgouttWPygk,
 crawlo/utils/__init__.py,sha256=nxLnfqcEGLnsfSEagoKNyu-pm2ByU9BwE5tLxcS71Qo,1003
 crawlo/utils/batch_processor.py,sha256=8LNy-K2SrQVUxmGEWxQyYw_j9M-erN4Ie7O4d3zpBvM,9142
 crawlo/utils/controlled_spider_mixin.py,sha256=8CuM3Cr2wQLHbaO_ohbCsPcImJnyfZHpERbSeMgQ-AQ,16936
-crawlo/utils/db_helper.py,sha256=
+crawlo/utils/db_helper.py,sha256=zFr4BpEMbaY86DrR5Ol5-hfvkSXcG66prl00LPHLl8E,8702
 crawlo/utils/env_config.py,sha256=W-VD_WF63DHxsyJysvp1eJwRh3L_pBRl_PitQAY3nQY,4079
 crawlo/utils/error_handler.py,sha256=e2LeUGT_OMcNKcjiX9Pp-NuQh5spsHBqIPBd7VxA2IQ,16247
 crawlo/utils/fingerprint.py,sha256=3IbctH3zwyBjN_12SH7-vrFt-akA2WSo3iAzHc6u--s,3689
@@ -154,7 +153,7 @@ crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
 examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
 tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
 tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
-tests/authenticated_proxy_example.py,sha256=
+tests/authenticated_proxy_example.py,sha256=ZgLrU-1GaBhkJK1Wy0X93lHP1GT2sU2_wi3RI1CfrVc,3135
 tests/baidu_performance_test.py,sha256=wxdaI7UwKboMYH_qcaqZLxAStvndH60bvKGzD8F-jaI,3974
 tests/baidu_test.py,sha256=NKYnwDbPJX3tmKtRn7uQ_QWzUXiLTQC-Gdr1cQkJzEo,1874
 tests/bug_check_test.py,sha256=EIDOUk_QgtBOWKuBLm_WHbgJ0fsDuJACJ-nuxnBIdkQ,8056
@@ -178,6 +177,7 @@ tests/dynamic_loading_example.py,sha256=7LdeQZFevrb-U1_dgr4oX3aYo2Da4HvE_0KIf1fw
 tests/dynamic_loading_test.py,sha256=dzDW7b66HeDsIYsYgvNRihE3V6b6gEbUGQpp-eJbcIM,3413
 tests/env_config_example.py,sha256=_ZRDh_LR23ZKpy9E--y_KM0QIOiZF5vRT98QTn52TY8,4951
 tests/error_handling_example.py,sha256=grTeo1X17rFz4lhgASb0g5yu4NWbmNz5neyuonnNR40,5294
+tests/explain_mysql_update_behavior.py,sha256=uBrJwiYujTJF35oF1kYMRjYU5k5Y3YlqOfOni0oPQtY,2865
 tests/final_comprehensive_test.py,sha256=szTNbtwKfYNmE0kzDPCsE_kvnTG7FNKl2JERakGhKIk,4314
 tests/final_log_test.py,sha256=CpZ4ZvvuvFiBvz1a50qN599XIU086ett_I0bSX42BLU,9367
 tests/final_validation_test.py,sha256=4cuTr58i46JI6M4Tz54e7vrVFrOr3R7HSWgyQPKmM9M,5244
@@ -203,11 +203,13 @@ tests/simple_selector_helper_test.py,sha256=l9FsVhQ-z-ICqqetLIyeSaI8Dn6bXNCD8sLd
 tests/simple_selector_test.py,sha256=XzOYzpEzr0yaioLV6v-4XC60VZMd5jRthlyp7Ud02o4,6630
 tests/simple_spider_test.py,sha256=RzziJg-fbIVJ6_CgbismfkwrLwpJp4WWp2RLgG7Tpws,1168
 tests/simple_url_test.py,sha256=g9RBn46V7fHZTU0BrB5pl5AGCbw6QuKOXClVACb-MEQ,2297
+tests/simulate_mysql_update_test.py,sha256=7BEFdQkYjgCdLN5vnieTf-ByosCcSj2QJUMOUeYlLgQ,4597
 tests/spider_log_timing_test.py,sha256=pvYpKZemClr4mCR76xywhsiWbT5sPdzD_taZKFjlgvM,5573
 tests/test_advanced_tools.py,sha256=HT_TcwfFzli-CavIJSqQqnCxnBn5FDMX09zL7AJ5tNY,5398
 tests/test_all_commands.py,sha256=VgVa9SzU5Irvn5igHpC2W4E_6ZDWDt7jc-T4UPK_PFE,7718
 tests/test_all_pipeline_fingerprints.py,sha256=NDrBYr0f9CAhjmSezTS4NUrAdcotrSX3ElJTWqjXXbU,5308
 tests/test_all_redis_key_configs.py,sha256=dWc4Dsr07_vuSpb4hwkMpyy6XO8SI7vglVjGuGvXoa4,5710
+tests/test_asyncmy_usage.py,sha256=gxENdxrcLlDG2m8V-j4ZnSJYFc3x6CvKvgPAhOC13DE,1688
 tests/test_authenticated_proxy.py,sha256=lnvmQwuf0zaZP_E05EzcNFR2VJbwTkLjOmZGNoJKaC4,4339
 tests/test_batch_processor.py,sha256=4_nYlu9R1JkDCFHq0bYc9LUNqsg41r7sQ879hkrhEts,7212
 tests/test_cleaners.py,sha256=HDK8_YU7GUj_3hGU415cxEeUR74mnDSk0yroLlgDI0I,1816
@@ -219,7 +221,7 @@ tests/test_config_validator.py,sha256=Z4gBHkI0_fEx-xgiiG4T33F4BAuePuF81obpNTXfse
 tests/test_controlled_spider_mixin.py,sha256=AQ493ic6AxZAKd7QCgnUES92BBWCMNteTd5DjoQlhwo,2864
 tests/test_crawler_process_import.py,sha256=iIPqSCpv2VRb_hWTu5euLME4PDFf7NwixeBypRuv39Y,1175
 tests/test_crawler_process_spider_modules.py,sha256=uMr4esj6ascVBzt0WrPd3ZOQfKD00O6tJrNhuWOdvV0,1395
-tests/test_crawlo_proxy_integration.py,sha256=
+tests/test_crawlo_proxy_integration.py,sha256=JFBI82ILXMwAIJ29C8uhu5r-hH3UhMC50jKr5-jy6Ng,3059
 tests/test_date_tools.py,sha256=pcLDyhLrZ_jh-PhPm4CvLZEgNeH9kLMPKN5zacHwuWM,4053
 tests/test_dedup_fix.py,sha256=UFdm8lIi0ZIdp40W8ruxRD69bxzijuFUfNyJmB4Fwl0,8788
 tests/test_dedup_pipeline_consistency.py,sha256=dn5EAZSU5gQOV5EQwreHp76i5aQZ9tEdltSGO7dif5M,5176
@@ -228,7 +230,7 @@ tests/test_distributed.py,sha256=78Pn4HPLIaO8t1IiaSkckBmuEVTcnC8IDw7znf9_Zcw,179
 tests/test_double_crawlo_fix.py,sha256=lZwrT5ij6Jbh0EzZswhw05FXwgKaEZsSHekLTrJJajg,7856
 tests/test_double_crawlo_fix_simple.py,sha256=NDmCEeyvpf_D1tGQMA66iLPPKlAnSZcEg71e7GHYcjg,4768
 tests/test_download_delay_middleware.py,sha256=Idc6KzhL3hY3aDKgn1j_v5-mLIHz7dTnV5c4tJVZh5Q,9107
-tests/test_downloader_proxy_compatibility.py,sha256=
+tests/test_downloader_proxy_compatibility.py,sha256=NJJ-g_I665lHLsJZd7ONvKubHRxv82FADZR9WYzgyzA,9418
 tests/test_dynamic_downloaders_proxy.py,sha256=t_aWpxOHi4h3_fg2ImtIq7IIJ0r3PTHtnXiopPe2ZlM,4450
 tests/test_dynamic_proxy.py,sha256=zi7Ocbhc9GL1zCs0XhmG2NvBBeIZ2d2hPJVh18lH4Y0,3172
 tests/test_dynamic_proxy_config.py,sha256=C_9CEjCJtrr0SxIXCyLDhSIi88ujF7UAT1F-FAphd0w,5853
@@ -260,6 +262,15 @@ tests/test_middleware_debug.py,sha256=gtiaWCxBSTcaNkdqXirM7CsThr_HfiCueBdQCpp7rq
 tests/test_mode_consistency.py,sha256=t72WX0etC_AayaL2AT6e2lIgbfP-zxTgYAiTARSN2Jk,1276
 tests/test_multi_directory.py,sha256=sH9Y3B-fuESlc7J1aICa-AlBcCW8HFR-Q5j2anUr8l0,2196
 tests/test_multiple_spider_modules.py,sha256=M0wPyQW7HMasbMIgn_R78wjZEj4A_DgqaGHp0qF9Y0c,2567
+tests/test_mysql_pipeline_config.py,sha256=5Yveo4cPiGOG22EO5493QkC2m3ocKfv0Y2jK9m_4aZU,6793
+tests/test_mysql_pipeline_error.py,sha256=htqZBnEIF3kIML53u8Sv4_PnyRep-0JJFApuD8FpkFQ,3529
+tests/test_mysql_pipeline_init_log.py,sha256=-x9M2wqfa5g3jZ-y7iIPIOqEle0HouC28YECWfSE5OQ,2516
+tests/test_mysql_pipeline_integration.py,sha256=fhBwU0ewH3nc1ol1JH4xpVTGrqlIttBghkqtxtOgMF0,4208
+tests/test_mysql_pipeline_refactor.py,sha256=yJzBBgoIavQjXWQtivP0j8kAwmbb8zybypHqdLbfd_c,5804
+tests/test_mysql_pipeline_refactor_simple.py,sha256=QmF2Zv-0FyWMs6SYNXQPC3GW1rVyPnKmM_2rGOtxCps,3724
+tests/test_mysql_pipeline_robustness.py,sha256=cmjDOv9FX1OAFHJaY3WkveCSOTZiiZKu5ehjHaI-QW0,6138
+tests/test_mysql_pipeline_types.py,sha256=dIs4aYlV9vsGfhvmDHOc-LCx-jDqUzoAkn-v8i2ae7Y,2474
+tests/test_mysql_update_columns.py,sha256=CyEshc7b_yprIXcQtNOaWvCC2ZDb0kzjLOfmd8r3sOY,3458
 tests/test_offsite_middleware.py,sha256=njpXTdngOqBs60Wj6xgo5EEXlJnMHd7vtYGi9dVauW0,10602
 tests/test_offsite_middleware_simple.py,sha256=4MfDKSXGHcoFLYnnxCH2rmnzztWyN0xByYLoHtepyiA,7918
 tests/test_optimized_selector_naming.py,sha256=fbmlB5S2kBwtQWpWoQ4lQ7rUQm2_DeWK-t6KqvIRTUQ,2787
@@ -272,13 +283,15 @@ tests/test_priority_consistency.py,sha256=rVX7nku5N_QpB_ffDu3xqREkCWPX5aNNiXy112
 tests/test_priority_consistency_fixed.py,sha256=MlYi5PIr5wxunC3Ku4ilnxOatKyRu2qIvhV7pjadkjg,10765
 tests/test_proxy_api.py,sha256=XnmklS-xU4ke_560gV6AIlBsRmG8YLQTGFAZrTUZuhc,11013
 tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
-tests/test_proxy_middleware.py,sha256=
-tests/test_proxy_middleware_enhanced.py,sha256=
-tests/test_proxy_middleware_integration.py,sha256=
-tests/test_proxy_middleware_refactored.py,sha256=
+tests/test_proxy_middleware.py,sha256=MC2Hg88Pdpv6i_gTAy4ocIWOOxQ8bF7hYtszwpOzilE,8716
+tests/test_proxy_middleware_enhanced.py,sha256=N7Ly3koCH2uRYk6pxhEJwWpChKdIucdrj0nKvq_E4bw,6896
+tests/test_proxy_middleware_integration.py,sha256=PQhJKM1uGtQTlBh7XlKWAMwNwQ6K8of-P15KHDF2dJg,4729
+tests/test_proxy_middleware_refactored.py,sha256=Z4szCDqyjAwWtgDoddgfeNIVsVefPcrfsZP57gCMrJQ,8272
+tests/test_proxy_only.py,sha256=OqF3An_s9VY4mfLX7kDRz_LMtLpNzC6LS2kQkEyiBRw,2563
 tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
 tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
 tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
+tests/test_proxy_with_downloader.py,sha256=6OqyLcIM9nPMhL9bCYUIeSvYValKPw72XC-Up8jRri8,4597
 tests/test_queue_empty_check.py,sha256=ZJC6jOgZq0Wb0-ubrB1ZNcCaUiWeCxoNZmjkd6PY6t0,1182
 tests/test_queue_manager_double_crawlo.py,sha256=MijZ3JuyHMuqGbRC-8kclFr-4O7m_T8CqezP4qiWk-E,6957
 tests/test_queue_manager_redis_key.py,sha256=txHLq5XUZZN7h9HUlqlUCEVCTe2IXdf9r7F_P2zNVdY,7117
@@ -288,7 +301,7 @@ tests/test_queue_type_redis_config_consistency.py,sha256=1ew7Zp9CxH1DQ0RUmsZMV-n
 tests/test_random_headers_default.py,sha256=ulDb3_kRpnTCN1-TO3m6wVM-eMkZS_ezsSbd1ur8Xpg,12772
 tests/test_random_headers_necessity.py,sha256=SSbNQIE347oCQvuG6yaAambFU-3MyQzTV5jN1kArRGY,11741
 tests/test_random_user_agent.py,sha256=6HjU4iUcMk-J6bR2N5FhIkWDfnaFKAPNVyRzxmQQ14k,2302
-tests/test_real_scenario_proxy.py,sha256=
+tests/test_real_scenario_proxy.py,sha256=L2Mfwt47pvs6dYJDcazeyupoQ_DuvhdulCz6-2GFR9Y,7527
 tests/test_redis_config.py,sha256=51_Fy1PqIhS0MMO2nR4q6oQjBFxfqcUPK_4NNf5s83g,903
 tests/test_redis_connection_pool.py,sha256=pKfXdE3Cm_L_fNqI9zqFmqiidCwR0t7hiM_Fu_V1cNI,9328
 tests/test_redis_key_naming.py,sha256=MTFk656JhiGVTsMctBDhBNOMFcBDZrsQA3UfPZ-Dgj4,6911
@@ -328,6 +341,7 @@ tests/untested_features_report.md,sha256=31aUlsw_1OKe0_ijAjeH85kJ7HJ8qzKLJdOHDjW
 tests/verify_debug.py,sha256=iQ4Efwg9bQTHscr73VYAAZ8rBIe1u6mQfeaEK5YgneY,1564
 tests/verify_distributed.py,sha256=0IolM4ymuPOz_uTfHSWFO3Vxzp7Lo6i0zhSbzJhHFtI,4045
 tests/verify_log_fix.py,sha256=7reyVl3MXTDASyChgU5BAYuzuxvFjSLG9HywAHso0qg,4336
+tests/verify_mysql_warnings.py,sha256=TMPsB1yp7R_c3S6LllgPJ-n_4He6gHVygAC81zbeQrc,4106
 tests/ofweek_scrapy/scrapy.cfg,sha256=D_8rsW65iTbH7nG1kI25jYTCpoQKBVa2shajrsC6fBw,280
 tests/ofweek_scrapy/ofweek_scrapy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/ofweek_scrapy/ofweek_scrapy/items.py,sha256=Y_TwwHPAgOXTuCTdnhRxil7vYPk1_rzj1ZatTq4AX-I,280
@@ -340,8 +354,8 @@ tests/scrapy_comparison/ofweek_scrapy.py,sha256=rhVds_WjYum1bLuWWe90HtXE51fZXEqh
 tests/scrapy_comparison/scrapy_test.py,sha256=-IsGUHPBgEL0TmXjeLZl-TUA01B7Dsc2nRo4JZbFwZA,5599
 tests/test_spiders/__init__.py,sha256=Ws2DhfUA0Xh5Cxr9M46td7B6hyNoLTyAhZ60FnIh6D0,20
 tests/test_spiders/test_spider.py,sha256=kNGEg80HMMFgzVseI1jJjljZEBy3QYKt_3SXGASffFM,168
-crawlo-1.4.
-crawlo-1.4.
-crawlo-1.4.
-crawlo-1.4.
-crawlo-1.4.
+crawlo-1.4.6.dist-info/METADATA,sha256=j66m-xE1oVuLE4WEnDbBjH6PXGbfbgM7yxSF616EOHo,9355
+crawlo-1.4.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+crawlo-1.4.6.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
+crawlo-1.4.6.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
+crawlo-1.4.6.dist-info/RECORD,,
tests/authenticated_proxy_example.py

@@ -74,14 +74,18 @@ async def main():
     config = CrawloConfig.standalone(
         concurrency=2,
         download_delay=1.0,
-
-        #
-
-        #
+        # Proxy configuration
+        # Advanced proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy API URL is configured
+        PROXY_API_URL="http://proxy-api.example.com/get",  # proxy API endpoint
+
+        # Proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy list is configured
         # PROXY_LIST=[
-        #     "http://
-        #     "http://
+        #     "http://user:pass@proxy1.example.com:8080",
+        #     "http://user:pass@proxy2.example.com:8080"
         # ],
+
         LOG_LEVEL='INFO'
     )
 
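Taken together, the hunk above shows the two mutually exclusive ways this release expects proxies to be configured: a proxy API URL or a static proxy list, either of which auto-enables ProxyMiddleware. A minimal sketch under that reading (the import path and the URLs are assumptions, not taken from the diff):

# Sketch only: CrawloConfig's import path is assumed; URLs are placeholders.
from crawlo.config import CrawloConfig  # assumed import path

config = CrawloConfig.standalone(
    concurrency=2,
    # Option 1: fetch proxies from an API; ProxyMiddleware auto-enables
    PROXY_API_URL="http://proxy-api.example.com/get",
    # Option 2: supply a static list instead (also auto-enables the middleware)
    # PROXY_LIST=["http://user:pass@proxy1.example.com:8080"],
    LOG_LEVEL='INFO',
)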
tests/explain_mysql_update_behavior.py

@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+"""
+Explains the behavior of MySQL ON DUPLICATE KEY UPDATE.
+Helps explain why "no new record inserted" may be reported when MYSQL_UPDATE_COLUMNS is used.
+"""
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.db_helper import SQLBuilder
+
+
+def explain_mysql_behavior():
+    """Explain the behavior of MySQL ON DUPLICATE KEY UPDATE."""
+    print("=== MySQL ON DUPLICATE KEY UPDATE behavior ===\n")
+
+    # Data mirroring actual usage
+    table = "news_items"
+    item_data = {
+        'title': '新一代OLED屏下光谱颜色传感技术:解锁显示新密码,重塑视觉新体验',
+        'publish_time': '2025-10-09 09:57',
+        'url': 'https://ee.ofweek.com/2025-10/ART-8460-2806-30671544.html',
+        'source': '',
+        'content': '在全球智能手机市场竞争日趋白热化的当下,消费者对手机屏幕显示效果的要求愈发严苛...'
+    }
+
+    print("With the MYSQL_UPDATE_COLUMNS setting:")
+    print("MYSQL_UPDATE_COLUMNS = ('title', 'publish_time')")
+    print()
+
+    # Build the SQL
+    sql = SQLBuilder.make_insert(
+        table=table,
+        data=item_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print("Generated SQL statement:")
+    print(sql)
+    print()
+
+    print("MySQL behavior:")
+    print("1. If this is a new record (no primary/unique key conflict):")
+    print("   - MySQL inserts the record normally")
+    print("   - The affected row count is 1")
+    print()
+    print("2. If a primary/unique key conflict occurs:")
+    print("   - MySQL executes the ON DUPLICATE KEY UPDATE clause")
+    print("   - Only the listed columns are updated: title and publish_time")
+    print()
+    print("3. Key point - if the updated values are identical to the existing row:")
+    print("   - MySQL does not actually change any data")
+    print("   - The affected row count is 0")
+    print("   - This is why you see 'SQL executed successfully but no new record inserted'")
+    print()
+
+    print("How to verify whether data was really updated:")
+    print("1. Check whether the record in the database changed")
+    print("2. A changed column that is not listed in update_columns is not updated")
+    print("3. Add more columns to update_columns to make sure they are updated")
+    print()
+
+    print("Suggested configuration:")
+    print("# To update more columns on conflict, configure:")
+    print("MYSQL_UPDATE_COLUMNS = ('title', 'publish_time', 'content')")
+    print()
+    print("# Or, to replace the record entirely:")
+    print("MYSQL_AUTO_UPDATE = True")
+    print("MYSQL_UPDATE_COLUMNS = ()  # empty tuple")
+
+
+if __name__ == "__main__":
+    explain_mysql_behavior()
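For reference, a minimal sketch of the kind of statement the script describes (this is not the actual SQLBuilder implementation; the placeholder style and column order are assumptions):

columns = ('title', 'publish_time', 'url', 'source', 'content')
update_columns = ('title', 'publish_time')

sql = (
    f"INSERT INTO news_items ({', '.join(columns)}) "
    f"VALUES ({', '.join(['%s'] * len(columns))}) "
    "ON DUPLICATE KEY UPDATE "
    + ", ".join(f"{c}=VALUES({c})" for c in update_columns)
)
# -> INSERT INTO news_items (title, publish_time, url, source, content)
#    VALUES (%s, %s, %s, %s, %s)
#    ON DUPLICATE KEY UPDATE title=VALUES(title), publish_time=VALUES(publish_time)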
tests/simulate_mysql_update_test.py

@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+"""
+Simulates the behavior of MySQL ON DUPLICATE KEY UPDATE.
+Demonstrates the affected-row counts in different scenarios.
+"""
+import sys
+import os
+
+# Add the project root to the Python path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawlo.utils.db_helper import SQLBuilder
+
+
+def simulate_mysql_scenarios():
+    """Simulate different MySQL scenarios."""
+    print("=== MySQL scenario simulation ===\n")
+
+    table = "news_items"
+
+    # Scenario 1: insert a new record
+    print("Scenario 1: insert a new record")
+    new_data = {
+        'title': 'New article title',
+        'publish_time': '2025-10-09 10:00',
+        'url': 'https://example.com/new-article',
+        'source': 'New source',
+        'content': 'New article content'
+    }
+
+    sql1 = SQLBuilder.make_insert(
+        table=table,
+        data=new_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql1[:100]}...")
+    print("Expected behavior: normal insert, affected rows = 1")
+    print()
+
+    # Scenario 2: key conflict, updated column values identical
+    print("Scenario 2: primary key conflict, updated column values identical")
+    duplicate_data = {
+        'title': 'Existing article title',  # assume a record with the same title already exists
+        'publish_time': '2025-10-09 09:00',  # same publish time as the stored record
+        'url': 'https://example.com/existing-article',
+        'source': 'Source',
+        'content': 'Article content'
+    }
+
+    sql2 = SQLBuilder.make_insert(
+        table=table,
+        data=duplicate_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql2[:100]}...")
+    print("Expected behavior: ON DUPLICATE KEY UPDATE fires, but values are unchanged, affected rows = 0")
+    print()
+
+    # Scenario 3: key conflict, updated column values differ
+    print("Scenario 3: primary key conflict, updated column values differ")
+    updated_data = {
+        'title': 'Existing article title',  # same as the stored record
+        'publish_time': '2025-10-09 11:00',  # publish time differs from the stored record
+        'url': 'https://example.com/existing-article',
+        'source': 'Source',
+        'content': 'Article content'
+    }
+
+    sql3 = SQLBuilder.make_insert(
+        table=table,
+        data=updated_data,
+        auto_update=False,
+        update_columns=('title', 'publish_time'),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql3[:100]}...")
+    print("Expected behavior: ON DUPLICATE KEY UPDATE fires, values change, affected rows = 2")
+    print("(In MySQL 5.7+, an update reports an affected-row count of 2)")
+    print()
+
+    # Scenario 4: using INSERT IGNORE
+    print("Scenario 4: using INSERT IGNORE")
+    ignore_data = {
+        'title': 'Ignored duplicate title',  # assume a record with the same title already exists
+        'publish_time': '2025-10-09 12:00',
+        'url': 'https://example.com/ignore-article',
+        'source': 'Ignored source',
+        'content': 'Ignored content'
+    }
+
+    sql4 = SQLBuilder.make_insert(
+        table=table,
+        data=ignore_data,
+        auto_update=False,
+        update_columns=(),
+        insert_ignore=True
+    )
+
+    print(f"SQL: {sql4[:100]}...")
+    print("Expected behavior: duplicate records are skipped, affected rows = 0")
+    print()
+
+    # Scenario 5: using REPLACE INTO
+    print("Scenario 5: using REPLACE INTO")
+    replace_data = {
+        'title': 'Replaced article title',  # assume a record with the same title already exists
+        'publish_time': '2025-10-09 13:00',
+        'url': 'https://example.com/replace-article',
+        'source': 'Replaced source',
+        'content': 'Replaced content'
+    }
+
+    sql5 = SQLBuilder.make_insert(
+        table=table,
+        data=replace_data,
+        auto_update=True,  # use REPLACE INTO
+        update_columns=(),
+        insert_ignore=False
+    )
+
+    print(f"SQL: {sql5[:100]}...")
+    print("Expected behavior: the old record is deleted and a new one inserted, affected rows = 2")
+    print()
+
+    print("=== Summary ===")
+    print("1. With MYSQL_UPDATE_COLUMNS, an affected-row count of 0 is not an error")
+    print("2. It may simply mean the updated values match the existing record")
+    print("3. To make sure updates take effect, include more columns in update_columns")
+    print("4. To replace the record entirely, use MYSQL_AUTO_UPDATE = True")
+
+
+if __name__ == "__main__":
+    simulate_mysql_scenarios()
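The affected-row counts the script walks through match MySQL's documented semantics for INSERT ... ON DUPLICATE KEY UPDATE: 1 for a fresh insert, 2 for an update that changes values, 0 for a duplicate whose values are identical (with the CLIENT_FOUND_ROWS connection flag, that last case reports 1 instead of 0). A hypothetical helper, not part of crawlo, that makes the interpretation explicit:

def classify_upsert_result(rowcount: int) -> str:
    """Interpret cursor.rowcount after INSERT ... ON DUPLICATE KEY UPDATE."""
    if rowcount == 1:
        return "inserted"   # a new row was created
    if rowcount == 2:
        return "updated"    # an existing row was changed
    if rowcount == 0:
        return "unchanged"  # duplicate key, values identical to the stored row
    return f"unexpected rowcount: {rowcount}"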
tests/test_asyncmy_usage.py

@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+import asyncio
+from asyncmy import create_pool
+
+async def test_asyncmy_usage():
+    """Test the correct way to use the asyncmy library."""
+    try:
+        # Create a connection pool
+        pool = await create_pool(
+            host='127.0.0.1',
+            port=3306,
+            user='root',
+            password='123456',
+            db='test',
+            minsize=1,
+            maxsize=5
+        )
+
+        # Acquire a connection
+        conn = await pool.acquire()
+        try:
+            # Get a cursor
+            cursor = await conn.cursor()
+            try:
+                # Execute SQL
+                result = cursor.execute("SELECT 1")
+                print(f"execute return type: {type(result)}")
+                print(f"execute return value: {result}")
+
+                # Check whether it needs to be awaited
+                if hasattr(result, '__await__'):
+                    print("execute returned a coroutine object; await is required")
+                    result = await result
+                else:
+                    print("execute did not return a coroutine object; no await needed")
+
+                # Commit the transaction
+                await conn.commit()
+
+            finally:
+                await cursor.close()
+        finally:
+            pool.release(conn)
+
+        # Close the connection pool
+        pool.close()
+        await pool.wait_closed()
+
+        print("Test finished")
+
+    except Exception as e:
+        print(f"Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    asyncio.run(test_asyncmy_usage())
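The manual acquire/release and close calls above can usually be replaced with async context managers; asyncmy mirrors aiomysql's API here. A sketch under that assumption (the credentials are the test file's placeholders):

import asyncio
from asyncmy import create_pool

async def main():
    pool = await create_pool(host='127.0.0.1', port=3306,
                             user='root', password='123456', db='test')
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            await cursor.execute("SELECT 1")  # execute is awaitable
            print(await cursor.fetchone())
        await conn.commit()
    pool.close()
    await pool.wait_closed()

asyncio.run(main())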
tests/test_crawlo_proxy_integration.py

@@ -61,8 +61,14 @@ async def test_proxy_integration():
     config = CrawloConfig.standalone(
         concurrency=1,
         download_delay=0.1,
-
-
+        # Proxy configuration
+        # Advanced proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy API URL is configured
+        PROXY_API_URL="https://proxy-api.example.com/get",  # mock proxy API
+
+        # Proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy list is configured
+        # PROXY_LIST=["http://proxy1:8080", "http://proxy2:8080"],
         LOG_LEVEL='WARNING'  # reduce log output
     )
 
tests/test_downloader_proxy_compatibility.py

@@ -35,7 +35,7 @@ class MockCrawler:
         self.spider = MockSpider(self)  # add a spider attribute
 
 
-def create_test_settings(proxy_url=None):
+def create_test_settings(proxy_url=None, proxy_list=None):
     """Create test settings."""
     settings = SettingManager()
     settings.set("LOG_LEVEL", "DEBUG")
@@ -47,12 +47,13 @@ def create_test_settings(proxy_url=None):
 
     # Proxy-related settings
     if proxy_url:
-
+        # Advanced proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy API URL is configured
        settings.set("PROXY_API_URL", proxy_url)
-
-
-
-        settings.set("
+    elif proxy_list:
+        # Proxy configuration (for ProxyMiddleware)
+        # The middleware is enabled automatically once a proxy list is configured
+        settings.set("PROXY_LIST", proxy_list)
 
     return settings
 
@@ -65,7 +66,7 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
 
     try:
         # Create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_url=proxy_url)
         crawler = MockCrawler(settings)
 
         # Create the downloader
@@ -73,6 +74,7 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
         downloader.open()
 
         # Create the proxy middleware
+        from crawlo.middleware.proxy import ProxyMiddleware
         proxy_middleware = ProxyMiddleware(settings, "DEBUG")
 
         # Create the request
@@ -115,15 +117,15 @@ async def test_aiohttp_with_proxy(proxy_url, target_url):
             pass
 
 
-async def test_httpx_with_proxy_async(
+async def test_httpx_with_proxy_async(proxy_list, target_url):
     """Test the httpx downloader's proxy compatibility."""
     print(f"\n=== Testing the httpx downloader with a proxy ===")
-    print(f"
+    print(f"Proxy list: {proxy_list}")
     print(f"Target URL: {target_url}")
 
     try:
         # Create settings
-        settings = create_test_settings(
+        settings = create_test_settings(proxy_list=proxy_list)
         crawler = MockCrawler(settings)
 
         # Create the downloader
@@ -131,7 +133,8 @@ async def test_httpx_with_proxy_async(proxy_url, target_url):
         downloader.open()
 
         # Create the proxy middleware
-
+        from crawlo.middleware.simple_proxy import SimpleProxyMiddleware
+        proxy_middleware = SimpleProxyMiddleware(settings, "DEBUG")
 
         # Create the request
         request = Request(url=target_url)
@@ -168,7 +171,6 @@ async def test_httpx_with_proxy_async(proxy_url, target_url):
         # Clean up resources
         try:
             await downloader.close()
-            await proxy_middleware.close()
         except:
             pass
 
@@ -181,7 +183,7 @@ async def test_curl_cffi_with_proxy_async(proxy_url, target_url):
 
     try:
         # Create settings
-        settings = create_test_settings(proxy_url)
+        settings = create_test_settings(proxy_url=proxy_url)
         crawler = MockCrawler(settings)
 
         # Create the downloader
@@ -238,26 +240,28 @@ async def main():
     # Use a test proxy URL (a public test proxy here)
     # Note: replace this with a valid proxy URL in real use
     test_proxy_url = "http://test.proxy.api:8080/proxy/getitem/"
+    test_proxy_list = ["http://proxy1:8080", "http://proxy2:8080"]
     test_target_url = "https://httpbin.org/ip"  # a test site that returns IP information
 
     print(f"Test proxy API: {test_proxy_url}")
+    print(f"Test proxy list: {test_proxy_list}")
     print(f"Test target URL: {test_target_url}")
 
-    # Test aiohttp
+    # Test the aiohttp downloader (advanced proxy)
     aiohttp_result = await test_aiohttp_with_proxy(test_proxy_url, test_target_url)
 
-    # Test httpx
-    httpx_result = await test_httpx_with_proxy_async(
+    # Test the httpx downloader (simple proxy)
+    httpx_result = await test_httpx_with_proxy_async(test_proxy_list, test_target_url)
 
-    # Test curl-cffi
+    # Test the curl-cffi downloader (advanced proxy)
     curl_cffi_result = await test_curl_cffi_with_proxy_async(test_proxy_url, test_target_url)
 
     # Summarize the results
     print("\n" + "="*50)
     print("Test result summary:")
-    print(f"aiohttp
-    print(f"httpx
-    print(f"curl-cffi
+    print(f"aiohttp downloader (advanced proxy): {'✓ passed' if aiohttp_result else '✗ failed'}")
+    print(f"httpx downloader (simple proxy): {'✓ passed' if httpx_result else '✗ failed'}")
+    print(f"curl-cffi downloader (advanced proxy): {'✓ passed' if curl_cffi_result else '✗ failed'}")
 
     overall_result = all([aiohttp_result, httpx_result, curl_cffi_result])
     print(f"\nOverall result: {'✓ all downloaders work with the proxy middleware' if overall_result else '✗ some downloaders are incompatible'}")