crawlo 1.1.3__py3-none-any.whl → 1.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (115)
  1. crawlo/__init__.py +28 -1
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +61 -0
  4. crawlo/cleaners/data_formatter.py +226 -0
  5. crawlo/cleaners/encoding_converter.py +126 -0
  6. crawlo/cleaners/text_cleaner.py +233 -0
  7. crawlo/commands/startproject.py +117 -13
  8. crawlo/config.py +30 -0
  9. crawlo/config_validator.py +253 -0
  10. crawlo/core/engine.py +185 -11
  11. crawlo/core/scheduler.py +49 -78
  12. crawlo/crawler.py +6 -6
  13. crawlo/downloader/__init__.py +24 -0
  14. crawlo/downloader/aiohttp_downloader.py +8 -0
  15. crawlo/downloader/cffi_downloader.py +5 -0
  16. crawlo/downloader/hybrid_downloader.py +214 -0
  17. crawlo/downloader/playwright_downloader.py +403 -0
  18. crawlo/downloader/selenium_downloader.py +473 -0
  19. crawlo/extension/__init__.py +17 -10
  20. crawlo/extension/health_check.py +142 -0
  21. crawlo/extension/log_interval.py +27 -18
  22. crawlo/extension/log_stats.py +62 -24
  23. crawlo/extension/logging_extension.py +18 -9
  24. crawlo/extension/memory_monitor.py +105 -0
  25. crawlo/extension/performance_profiler.py +134 -0
  26. crawlo/extension/request_recorder.py +108 -0
  27. crawlo/filters/aioredis_filter.py +50 -12
  28. crawlo/middleware/proxy.py +26 -2
  29. crawlo/mode_manager.py +24 -19
  30. crawlo/network/request.py +30 -3
  31. crawlo/network/response.py +114 -25
  32. crawlo/pipelines/mongo_pipeline.py +81 -66
  33. crawlo/pipelines/mysql_pipeline.py +165 -43
  34. crawlo/pipelines/redis_dedup_pipeline.py +7 -3
  35. crawlo/queue/queue_manager.py +15 -2
  36. crawlo/queue/redis_priority_queue.py +144 -76
  37. crawlo/settings/default_settings.py +93 -121
  38. crawlo/subscriber.py +62 -37
  39. crawlo/templates/project/items.py.tmpl +1 -1
  40. crawlo/templates/project/middlewares.py.tmpl +73 -49
  41. crawlo/templates/project/pipelines.py.tmpl +51 -295
  42. crawlo/templates/project/settings.py.tmpl +93 -17
  43. crawlo/templates/project/settings_distributed.py.tmpl +120 -0
  44. crawlo/templates/project/settings_gentle.py.tmpl +95 -0
  45. crawlo/templates/project/settings_high_performance.py.tmpl +152 -0
  46. crawlo/templates/project/settings_simple.py.tmpl +69 -0
  47. crawlo/templates/spider/spider.py.tmpl +2 -38
  48. crawlo/tools/__init__.py +183 -0
  49. crawlo/tools/anti_crawler.py +269 -0
  50. crawlo/tools/authenticated_proxy.py +241 -0
  51. crawlo/tools/data_validator.py +181 -0
  52. crawlo/tools/date_tools.py +36 -0
  53. crawlo/tools/distributed_coordinator.py +387 -0
  54. crawlo/tools/retry_mechanism.py +221 -0
  55. crawlo/tools/scenario_adapter.py +263 -0
  56. crawlo/utils/__init__.py +29 -1
  57. crawlo/utils/batch_processor.py +261 -0
  58. crawlo/utils/date_tools.py +58 -1
  59. crawlo/utils/enhanced_error_handler.py +360 -0
  60. crawlo/utils/env_config.py +106 -0
  61. crawlo/utils/error_handler.py +126 -0
  62. crawlo/utils/performance_monitor.py +285 -0
  63. crawlo/utils/redis_connection_pool.py +335 -0
  64. crawlo/utils/redis_key_validator.py +200 -0
  65. crawlo-1.1.5.dist-info/METADATA +401 -0
  66. crawlo-1.1.5.dist-info/RECORD +185 -0
  67. tests/advanced_tools_example.py +276 -0
  68. tests/authenticated_proxy_example.py +237 -0
  69. tests/cleaners_example.py +161 -0
  70. tests/config_validation_demo.py +103 -0
  71. tests/date_tools_example.py +181 -0
  72. tests/dynamic_loading_example.py +524 -0
  73. tests/dynamic_loading_test.py +105 -0
  74. tests/env_config_example.py +134 -0
  75. tests/error_handling_example.py +172 -0
  76. tests/redis_key_validation_demo.py +131 -0
  77. tests/response_improvements_example.py +145 -0
  78. tests/test_advanced_tools.py +149 -0
  79. tests/test_all_redis_key_configs.py +146 -0
  80. tests/test_authenticated_proxy.py +142 -0
  81. tests/test_cleaners.py +55 -0
  82. tests/test_comprehensive.py +147 -0
  83. tests/test_config_validator.py +194 -0
  84. tests/test_date_tools.py +124 -0
  85. tests/test_dynamic_downloaders_proxy.py +125 -0
  86. tests/test_dynamic_proxy.py +93 -0
  87. tests/test_dynamic_proxy_config.py +147 -0
  88. tests/test_dynamic_proxy_real.py +110 -0
  89. tests/test_edge_cases.py +304 -0
  90. tests/test_enhanced_error_handler.py +271 -0
  91. tests/test_env_config.py +122 -0
  92. tests/test_error_handler_compatibility.py +113 -0
  93. tests/test_framework_env_usage.py +104 -0
  94. tests/test_integration.py +357 -0
  95. tests/test_item_dedup_redis_key.py +123 -0
  96. tests/test_parsel.py +30 -0
  97. tests/test_performance.py +328 -0
  98. tests/test_queue_manager_redis_key.py +177 -0
  99. tests/test_redis_connection_pool.py +295 -0
  100. tests/test_redis_key_naming.py +182 -0
  101. tests/test_redis_key_validator.py +124 -0
  102. tests/test_response_improvements.py +153 -0
  103. tests/test_simple_response.py +62 -0
  104. tests/test_telecom_spider_redis_key.py +206 -0
  105. tests/test_template_content.py +88 -0
  106. tests/test_template_redis_key.py +135 -0
  107. tests/test_tools.py +154 -0
  108. tests/tools_example.py +258 -0
  109. crawlo/core/enhanced_engine.py +0 -190
  110. crawlo-1.1.3.dist-info/METADATA +0 -635
  111. crawlo-1.1.3.dist-info/RECORD +0 -113
  112. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/WHEEL +0 -0
  113. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/entry_points.txt +0 -0
  114. {crawlo-1.1.3.dist-info → crawlo-1.1.5.dist-info}/top_level.txt +0 -0
  115. {examples → tests}/controlled_spider_example.py +0 -0
@@ -0,0 +1,185 @@
1
+ crawlo/__init__.py,sha256=jSOsZbDJ_Q5wZV8onSXx5LgNM7Z1q3zCROGdImBDr2I,1373
2
+ crawlo/__version__.py,sha256=neh7i8wZ1x6FcsvBBU2qQY3vJro_j6McS8uQFuJdY2M,23
3
+ crawlo/cli.py,sha256=hjAJKx9pba375sATvvcy-dtZyBIgXj8fRBq9RFIZHA4,1206
4
+ crawlo/config.py,sha256=JYz4xL2Av5t41Bw90CVS4SSYg18-MXIxwXfKu0WuBjI,9690
5
+ crawlo/config_validator.py,sha256=M118EATR-tITzRSe2oSinV5oh2QsooMCkEJ5WS8ma_0,10155
6
+ crawlo/crawler.py,sha256=24EE7zFPByeYLJnf1K_R9fhJMqaFUjBSa6TuUhlY4TI,37398
7
+ crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
8
+ crawlo/exceptions.py,sha256=YVIDnC1bKSMv3fXH_6tinWMuD9HmKHIaUfO4_fkX5sY,1247
9
+ crawlo/mode_manager.py,sha256=9AsDGhigYqohqiE3iYscISyRSoANlTdy4WRzXFAMPiI,7283
10
+ crawlo/project.py,sha256=DOf_zzdA_A_nilff6Dp5KJXA6KphHYMalAYv336-cO8,5335
11
+ crawlo/stats_collector.py,sha256=v4jC9BAe-23w93hWzbeMCCgQ9VuFPyxw5JV9ItbGH8w,1636
12
+ crawlo/subscriber.py,sha256=Aj0kPpbBYlzOb1uViDFraoaThsQEVlqOSYUaFT3jSDs,5136
13
+ crawlo/task_manager.py,sha256=PScfEB03306Txa0l38AeQ_0WVhKzeWOFyT3bnrkbHW0,849
14
+ crawlo/cleaners/__init__.py,sha256=lxL-ZWDKW-DdobdgKUQ27wNmBiUhGnD0CVG6HWkX3_o,1261
15
+ crawlo/cleaners/data_formatter.py,sha256=iBDHpZBZvn9O7pLkTQilE1TzYJQEc3z3f6HXoVus0f0,7808
16
+ crawlo/cleaners/encoding_converter.py,sha256=G3khLlk0uBeTwIutsWxVUeSuyc1GMC1BDNJDwsU9ryg,4238
17
+ crawlo/cleaners/text_cleaner.py,sha256=16e6WqIIb9qANMiK-vCEl4TvgkId19Aa2W1NMLU-jFQ,6707
18
+ crawlo/commands/__init__.py,sha256=kZ3qATqDPmMUCNUQSFfBfIA8fp_1dgBwIAWbmFN3_To,355
19
+ crawlo/commands/check.py,sha256=jW8SgfkOS35j4VS7nRZBZdFCBX9CVFez5LR2sfP_H1U,23437
20
+ crawlo/commands/genspider.py,sha256=_3GwFMYK79BuKk__5L0ljuwWwOzN80MeuhRkL4Ql11A,5201
21
+ crawlo/commands/list.py,sha256=octTk0QZhapiyM7WgCPersP2v3MesthbJeG9vMqVFOs,5936
22
+ crawlo/commands/run.py,sha256=m7SFTxmw4mZJ_eS1a9fHG-c6FvQcRHXfW71xenYBYYc,10809
23
+ crawlo/commands/startproject.py,sha256=1oTDgfdIQQBHa9P_1te0siQG4MNeWnAHv_2J7v4a2po,11305
24
+ crawlo/commands/stats.py,sha256=iEKdxHoqsJuTkn8zAF9ekBVO1--8__BeD7xohYG5NwE,6252
25
+ crawlo/commands/utils.py,sha256=b7yW6UlOLprR3gN9oOdhcl3fsCwWRE3-_gDxWz5xhMo,5292
26
+ crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
27
+ crawlo/core/engine.py,sha256=wfuiGJJbEOlSjtTC3yrcugSFnvWQBVhk9A7ynWap-0o,13490
28
+ crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
29
+ crawlo/core/scheduler.py,sha256=CdHeVNJbCzRkivGMCiLpVHttJSEbIz5P6qywXAR_cw4,5089
30
+ crawlo/downloader/__init__.py,sha256=8-r4_Wc_X64FJtKzNQamwsZsc428creKeFo19VxF33o,8565
31
+ crawlo/downloader/aiohttp_downloader.py,sha256=Ck4ybwpox3sC7v4IGc8RIcvmIBf9v6jAdyNGHByY1m4,8620
32
+ crawlo/downloader/cffi_downloader.py,sha256=U2CMYQ8ZyKMQNNPvyI2USqI-Dclq2d7ZRgp3LxLjTFs,10966
33
+ crawlo/downloader/httpx_downloader.py,sha256=MpgDeIdGqNsiSKLOEDBnr5Z0eUbhHnqVEmAuoIfJmFU,12296
34
+ crawlo/downloader/hybrid_downloader.py,sha256=nn_cD4SOKj0_TE2zuLfXfJhQppeftZDL5u8DZuITwVQ,8256
35
+ crawlo/downloader/playwright_downloader.py,sha256=L-TVzG7cYfuBlqW0XSZuz5C_r9fpJrmYNcoQ-cDEna4,16663
36
+ crawlo/downloader/selenium_downloader.py,sha256=ho0ovIVU7e99xMT2_Clgj6SIH3GjxPx7yPN85C0_w9o,21508
37
+ crawlo/extension/__init__.py,sha256=g7Nj1vm8L8n9ZmIbTzOXMYTkMSAJj4pw8TefaDyd52k,1563
38
+ crawlo/extension/health_check.py,sha256=Yiu50Dkn_rIS6NnzurIoQmwDM2d-SmEetO1d5wor2Xc,5668
39
+ crawlo/extension/log_interval.py,sha256=2R3XVdM1grDN8wh9TTHRB_WmQypCr5YSGvESNDnS16s,2474
40
+ crawlo/extension/log_stats.py,sha256=5CpcTHj0lCtFvhBTI3pG41gkiL_hPtv5FWY87L6EVjM,2989
41
+ crawlo/extension/logging_extension.py,sha256=euecjvj71aK6dElMzYCuNXiNb9jO8qQmL5QspQ1xoz8,1669
42
+ crawlo/extension/memory_monitor.py,sha256=fClPchpCkVjcIiU0AJHCKDd7HEiz5B4KqNqKTRZ2hcU,4394
43
+ crawlo/extension/performance_profiler.py,sha256=BjWD3LOb4VwjQJQvQtWNg7GluEwFquI1CztNfgMzy3c,5032
44
+ crawlo/extension/request_recorder.py,sha256=KA_RmcfscDxP5wPdolO76yKfRj-1jmHhG3jkVGO1pbc,4181
45
+ crawlo/filters/__init__.py,sha256=lX-QOCDTiTRFoiK1qrZ5HABo7LgZfcxScx_lELYEvJk,4395
46
+ crawlo/filters/aioredis_filter.py,sha256=UVT2ezSnKfYFGn9L0ia512JBoGwoI8djGZ5DDvBT3P8,10173
47
+ crawlo/filters/memory_filter.py,sha256=FzGJPhVKfZ8P23kP6de-VSfE8oVMjjpfWzKJIdiMtZU,9529
48
+ crawlo/items/__init__.py,sha256=rFpx1qFBo0Ik7bSdnXC8EVTJUOQdoJYGVdhYjaH00nk,409
49
+ crawlo/items/base.py,sha256=hwGJEdFWOdaZfalFX8umRkh_HUWLEbCjvq4j70fplMQ,598
50
+ crawlo/items/fields.py,sha256=fpS0vlRPpZYjTaMDgI9Q8z_YQqruwf6fi4Dgm6R2oEk,1854
51
+ crawlo/items/items.py,sha256=OmVEvMmgofMU95GkaiWkfNQ2fjsH2fY9sw3SKcmUhLs,3478
52
+ crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
53
+ crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
54
+ crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
55
+ crawlo/middleware/middleware_manager.py,sha256=G9R9SnDVey_zS2k1zm_358EPHDxRWfP953I4QavL7P0,6348
56
+ crawlo/middleware/proxy.py,sha256=GxsqSs_ZtM_HQlb8OSikmhBJMzJ1j1oPEV-9PJLJ2r8,11361
57
+ crawlo/middleware/request_ignore.py,sha256=jdybWFVXuA5YsAPfZJFzLTWkYhEAewNgxuhFqczPW9M,1027
58
+ crawlo/middleware/response_code.py,sha256=vgXWv3mMu_v9URvhKA9myIFH4u6L4EwNme80wL4DCGc,677
59
+ crawlo/middleware/response_filter.py,sha256=O2gkV_Yjart8kmmXTGzrtZnb_Uuefap4uL2Cu01iRs4,863
60
+ crawlo/middleware/retry.py,sha256=D_v4-8grofGsLkGJIhLKklMbdHO0gtij7DIS4G52NJc,4248
61
+ crawlo/network/__init__.py,sha256=bvEnpEUBZJ79URfNZbsHhsBKna54hM2-x_BV8eotTA4,418
62
+ crawlo/network/request.py,sha256=amZdRvDJvFpx82thtWwv6uo5uv9vtCwbvw3jAc3sCrY,12633
63
+ crawlo/network/response.py,sha256=QwJhL3xJfPVy_gwtGrg61oAgaqCoCmjyj1Ug7Zju7Pg,13060
64
+ crawlo/pipelines/__init__.py,sha256=FDe2Pr5tiHtV8hFlheElRO_O1aVKvSWlkTcAl9BXAKA,637
65
+ crawlo/pipelines/bloom_dedup_pipeline.py,sha256=n0Ay7MtIEJ8L4Otiha4zRvI9toFUSNFugTNubi-Q3aw,5798
66
+ crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
67
+ crawlo/pipelines/csv_pipeline.py,sha256=qbXZoqq4FIR9QkUGpC0ryWzmqGJSrM2bxmWLM4I1nXM,12490
68
+ crawlo/pipelines/database_dedup_pipeline.py,sha256=6_zKtCNgFBPJyTI3Mk4l75fMQ2UQQGKFfNTpZqGs_zI,8224
69
+ crawlo/pipelines/json_pipeline.py,sha256=wrCsh8YInmcPLAkhPrHObMx89VZfhf-c7qRrYsTixPE,8585
70
+ crawlo/pipelines/memory_dedup_pipeline.py,sha256=oQcBODO-I2p6B7Nm_klXvuhzSMIHP-JWwC4_o6Gkgcc,3954
71
+ crawlo/pipelines/mongo_pipeline.py,sha256=PohTKTGw3QRvuP-T6SrquwW3FAHSno8jQ2D2cH_d75U,5837
72
+ crawlo/pipelines/mysql_pipeline.py,sha256=RRKMuO-7BTomyRFYCmDpfvjBTcU4SGdjGDV4wBeKWck,13796
73
+ crawlo/pipelines/pipeline_manager.py,sha256=Kw37RC2GESWDnDJ6qIN1MA0qc27Uyhu77ebm1r-FgeU,2168
74
+ crawlo/pipelines/redis_dedup_pipeline.py,sha256=iniHsb5KGpxkshSFXY9eOKRK8eqVfO62Hzz6kFnPdDQ,6342
75
+ crawlo/queue/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
+ crawlo/queue/pqueue.py,sha256=qTFOuvEXsYEZbm0ULjsOeZo0XtSsZ-SHpx7nFEtmluE,1095
77
+ crawlo/queue/queue_manager.py,sha256=CIYlzOaaN855jP33NHieQ-eciL5vVjYC_bzCUX1GtIY,12501
78
+ crawlo/queue/redis_priority_queue.py,sha256=7jB0atlml-g5i0HVyt7_6k-SFYc5ZimcSZgKUKk9EB0,11623
79
+ crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
80
+ crawlo/settings/default_settings.py,sha256=qzOOVXycjSpUMvx861vE_vD4dZu22psTIBSzYuWqqRo,8908
81
+ crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
82
+ crawlo/spider/__init__.py,sha256=xAH6NfE_6K2aY_VSL9DoGjcmMHJDd5Nxr7TG1Y8vQAE,21091
83
+ crawlo/templates/crawlo.cfg.tmpl,sha256=lwiUVe5sFixJgHFEjn1OtbAeyWsECOrz37uheuVtulk,240
84
+ crawlo/templates/project/__init__.py.tmpl,sha256=aQnHaOjMSkTviOC8COUX0fKymuyf8lx2tGduxkMkXEE,61
85
+ crawlo/templates/project/items.py.tmpl,sha256=8_3DBA8HrS2XbfHzsMZNJiZbFY6fDJUUMFoFti_obJk,314
86
+ crawlo/templates/project/middlewares.py.tmpl,sha256=BMTAAFhmZQBn0D3lgWwepcOlXgtzt2lXYEG0XhXFTDI,3885
87
+ crawlo/templates/project/pipelines.py.tmpl,sha256=ZIgQjtwMtdP7vyeeg-PSlOnKI46A9d6ew8bGAMx3Dxc,2717
88
+ crawlo/templates/project/run.py.tmpl,sha256=IBD0F0XEgBR6gR34PhYAjKiZDdvLufZJkABHapTsoYo,8428
89
+ crawlo/templates/project/settings.py.tmpl,sha256=XplJLfRgDdsN_tALmYM_wsDqA8tPd0D1j_UYzHCSxuA,11991
90
+ crawlo/templates/project/settings_distributed.py.tmpl,sha256=Nli-qR-UB4TXAq4mXjx17y-yAv46NwwpcVidjGeM00A,4321
91
+ crawlo/templates/project/settings_gentle.py.tmpl,sha256=ljXf9vKV2c-cN8yvf5U4UWsPWrMVmJUfbXladIdS2mg,3320
92
+ crawlo/templates/project/settings_high_performance.py.tmpl,sha256=3wf8fFYZ5EVE2742JlcwwrPF794vEIEmbxFSbqyGnJQ,5434
93
+ crawlo/templates/project/settings_simple.py.tmpl,sha256=UKfSEZFAXDywswwws9SvR5vBMng2maJ37DJMfK0nAmI,2365
94
+ crawlo/templates/project/spiders/__init__.py.tmpl,sha256=zMbePipgLsctQUEnda4WkHz8rDLUX--rc8ruI6zkpWc,111
95
+ crawlo/templates/spider/spider.py.tmpl,sha256=hXmdhYmBIWuebSAVGI37371RzGSZyd9EwWn57mMNmVA,5037
96
+ crawlo/tools/__init__.py,sha256=6BcoEE7t119VzdwqnULrWnYDNpXu86FHkWgx_WL1-Sk,3853
97
+ crawlo/tools/anti_crawler.py,sha256=LwLC6BkxDSkxc5H1hQ6kY9j7O0PZGAMPZECr7gbqw2M,9431
98
+ crawlo/tools/authenticated_proxy.py,sha256=ULCK0Cc9F2rGhRqu6kzKBdxzK9v2n1CsatSQ_PMxpAg,7272
99
+ crawlo/tools/data_validator.py,sha256=bLWnkpFdclJuqjtSAgMI5nznN4vAuPwE1YaiFWKWenM,5490
100
+ crawlo/tools/date_tools.py,sha256=dNm06OwJYQ7KeVkZoFcfFhMwJPsHlB3JJfSplOA0XfY,635
101
+ crawlo/tools/distributed_coordinator.py,sha256=QM4ECyhTba9jtV7-AkOuT_lDJObeKVeBsRLZZFS_A7Y,12863
102
+ crawlo/tools/retry_mechanism.py,sha256=VbHo9-pT8ADbwyPuKcRfeUo4BtVZQy6yvrSvugoiZRA,8021
103
+ crawlo/tools/scenario_adapter.py,sha256=pzysL1B2uQ1ZSEncVHd9Hv2viHNgaxP44YAUcDcppfw,9660
104
+ crawlo/utils/__init__.py,sha256=5unISyKR7pZdj7EwP5tadGYKvyb_W1uoC52YzqSbxis,592
105
+ crawlo/utils/batch_processor.py,sha256=wnpWL41Su6gSvbzNgRWv7sRxKzBFV0J27V9gC3WsLJc,9155
106
+ crawlo/utils/controlled_spider_mixin.py,sha256=os5v8eFX08Nbqm48W-xWdY04LboqDgOgOL-93IX9zvg,16914
107
+ crawlo/utils/date_tools.py,sha256=gsgHweZFOzeW-K91wRlBEg_XLKBCXl7UZJfnjI_W-J0,9197
108
+ crawlo/utils/db_helper.py,sha256=ZqOt1d3mErVv4TOvoWlov0niUxORB9aHByTmMoNFIDw,10902
109
+ crawlo/utils/enhanced_error_handler.py,sha256=o6oxCtWFcAGslON3FZNRgxvHutd9z0tqbc0R5kmF_lk,14265
110
+ crawlo/utils/env_config.py,sha256=9hJ4T-ufAzWPZugytgLMZStZxD17JaE900S8xpGoyn0,2974
111
+ crawlo/utils/error_handler.py,sha256=7AHwZHJD-CTq0DNOwbDjUvzdbushDI_xWJiwe4KzBeQ,4411
112
+ crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
113
+ crawlo/utils/large_scale_config.py,sha256=lsraHTAQx3sMPjTnCBY_SVIpkuIBUb3zD9eFvmccOOM,8440
114
+ crawlo/utils/large_scale_helper.py,sha256=ZazAI7KV3V-3hzc4a3BWxTXfEO2XIEBMzxTLM9S1l_Q,12500
115
+ crawlo/utils/log.py,sha256=YD2FfXuuE2MC9ZdQQZ0H7KysE7l_LHZqQepaTPlcApo,4133
116
+ crawlo/utils/performance_monitor.py,sha256=ku_QAx3MX8U-Dfd5fqMr-YV5j1srYJbtF_5jdVJx6tY,9879
117
+ crawlo/utils/queue_helper.py,sha256=gFmkh1jKlIcN1rmo2Jl6vYcLP5ByUWlfHO9eNlZPBLs,4918
118
+ crawlo/utils/redis_connection_pool.py,sha256=rGaKcn2YVeP_bA38svkl3A-lU3DpfX6x0TzdaxWvIlo,10665
119
+ crawlo/utils/redis_key_validator.py,sha256=5lft8SPbHkBf9NdrQaW13R-fXLJoMBkJlc1w38wuPmU,5836
120
+ crawlo/utils/request.py,sha256=ejdKpTwc-HE04HQybafhOVywzz57IV3pY0YMkSLyGUo,9065
121
+ crawlo/utils/request_serializer.py,sha256=bZhuonZV2AdB_X0aje7sDljqWAIrEzUYwEaxXytaWsg,8784
122
+ crawlo/utils/spider_loader.py,sha256=V0CBTicJBYBZafhwLfDEfuEc_hJ2mSoiptT6qKufI9U,2249
123
+ crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
124
+ crawlo/utils/tools.py,sha256=uy7qw5Z1BIhyEgiHENvtM7WoGCJxlS8EX3PmOA7ouCo,275
125
+ crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
126
+ examples/__init__.py,sha256=NkRbV8_S1tb8S2AW6BE2U6P2-eGOPwMR1k0YQAwQpSE,130
127
+ tests/__init__.py,sha256=409aRX8hsPffiZCVjOogtxwhACzBp8G2UTJyUQSxhK0,136
128
+ tests/advanced_tools_example.py,sha256=1_iitECKCuWUYMNNGo61l3lmwMRrWdA8F_Xw56UaGZY,9340
129
+ tests/authenticated_proxy_example.py,sha256=b307_RybOtxXVQK0ITboLvHmOTwIN4yTF9aup4dYF7Q,8477
130
+ tests/cleaners_example.py,sha256=tEGf-x8CNlXpKurgmNxTmmPjQa45dRUSBjnOJVGGSeU,5330
131
+ tests/config_validation_demo.py,sha256=ah4cLZnozMdwKzACOoa5R52dWfya282WyulCGjadioM,3218
132
+ tests/controlled_spider_example.py,sha256=2SAQKoREGHe-OzVaSkGpopCcrou6QXmeW7rLdmsyopw,7981
133
+ tests/date_tools_example.py,sha256=XI3iFEzeo7Nb5YepK8WHytIaBegtxWVSISpqQgpV6M8,5042
134
+ tests/dynamic_loading_example.py,sha256=7LdeQZFevrb-U1_dgr4oX3aYo2Da4HvE_0KIf1fw4Ew,18786
135
+ tests/dynamic_loading_test.py,sha256=dzDW7b66HeDsIYsYgvNRihE3V6b6gEbUGQpp-eJbcIM,3413
136
+ tests/env_config_example.py,sha256=_ZRDh_LR23ZKpy9E--y_KM0QIOiZF5vRT98QTn52TY8,4951
137
+ tests/error_handling_example.py,sha256=sqvSoogDefyDoEBz3AVPG4pe49gRqBxT5BDFkijmYaA,5332
138
+ tests/redis_key_validation_demo.py,sha256=R0ZTJJSeMB-ptVfzwegw7bw83FDDcR2XUNnUZqstSPE,4475
139
+ tests/response_improvements_example.py,sha256=t1cbG3nesp82bqog4_ku1GvQzNbhRyWa5EaKTmOPrSk,5402
140
+ tests/test_advanced_tools.py,sha256=HT_TcwfFzli-CavIJSqQqnCxnBn5FDMX09zL7AJ5tNY,5398
141
+ tests/test_all_redis_key_configs.py,sha256=Uk91dnGwGaGsdukpvbTiXd4NnrsOa-THWD2cwSZkqag,5805
142
+ tests/test_authenticated_proxy.py,sha256=lnvmQwuf0zaZP_E05EzcNFR2VJbwTkLjOmZGNoJKaC4,4339
143
+ tests/test_cleaners.py,sha256=qyd20RNuBHIVHz7X5JjLwlIZedn7yHZ4uB3X78BpaF4,1819
144
+ tests/test_comprehensive.py,sha256=dvRJeeVYc1cgXK9Y171hH9Y847zZpWSAFFH-EI3UepQ,5182
145
+ tests/test_config_validator.py,sha256=Ec1h8Mw-fVz1d9JoATIWZb0nTc8pYjhTCSjPm3tvkTQ,6825
146
+ tests/test_date_tools.py,sha256=pcLDyhLrZ_jh-PhPm4CvLZEgNeH9kLMPKN5zacHwuWM,4053
147
+ tests/test_dynamic_downloaders_proxy.py,sha256=t_aWpxOHi4h3_fg2ImtIq7IIJ0r3PTHtnXiopPe2ZlM,4450
148
+ tests/test_dynamic_proxy.py,sha256=zi7Ocbhc9GL1zCs0XhmG2NvBBeIZ2d2hPJVh18lH4Y0,3172
149
+ tests/test_dynamic_proxy_config.py,sha256=C_9CEjCJtrr0SxIXCyLDhSIi88ujF7UAT1F-FAphd0w,5853
150
+ tests/test_dynamic_proxy_real.py,sha256=krWnbFIH26mWNPhOfPMmx3ZxJfOreZxMZFGwVb_8-K8,3511
151
+ tests/test_edge_cases.py,sha256=kd5irijfGB0MkL-eukmc-WFm_xfbzQ30tPq-FMJaxsg,10856
152
+ tests/test_enhanced_error_handler.py,sha256=cROcikmLvL11NgjCj-fGfVlKV3sLq7VN4DY_9U54IBU,8705
153
+ tests/test_env_config.py,sha256=Qu1sDeADs69dSr1x0QmEe8nJrMHneE_4JClt-N901e8,4867
154
+ tests/test_error_handler_compatibility.py,sha256=xJ43cmCwfBGh-qBwCGiMDPPlfNDLw4ZrmlrHN9IojkU,4241
155
+ tests/test_final_validation.py,sha256=UNHMOkcOBx9jPdnYuYCF4Cx5GlXakBeHybOP27lpbAg,5078
156
+ tests/test_framework_env_usage.py,sha256=bFb_ptdLeX2obdJWEqEHPWweiWR-wR2BpvEaJMQK7h4,4201
157
+ tests/test_integration.py,sha256=l-G_z_btCtK0W4VvP1yNJFaZIZtKXDTeD1Ns2C-v6Nw,11677
158
+ tests/test_item_dedup_redis_key.py,sha256=GIdyFBm56EbFzTvgnYcimgQFoMgvEZW8z1KFVUu3HJ4,3877
159
+ tests/test_parsel.py,sha256=wuZqRFIm9xx1tt6o3Xi_OjvwhT_MPmHiUEj2ax06zlo,701
160
+ tests/test_performance.py,sha256=cjmWwIXBsHibgVyjyIt30iiXSDVse2HD6lFb0YlZsAQ,11613
161
+ tests/test_proxy_health_check.py,sha256=_tDlxa_6TdL3M5RLkHF82roXJ8WIuG5hELBp2GADyKQ,1123
162
+ tests/test_proxy_middleware_integration.py,sha256=mTPK_XvbmLCV_QoVZzA3ybWOOX61493Ew78WfTp-bYQ,4441
163
+ tests/test_proxy_providers.py,sha256=u_R2fhab90vqvQEaOAztpAOe9tJXvUMIdoDxmStmXJ4,1749
164
+ tests/test_proxy_stats.py,sha256=ES00CEoDITYPFBGPk8pecFzD3ItYIv6NSpcqNd8-kvo,526
165
+ tests/test_proxy_strategies.py,sha256=9Z1pXmTNyw-eIhGXlf2abZbJx6igLohYq-_3hldQ5uE,1868
166
+ tests/test_queue_manager_redis_key.py,sha256=tvy3qkmB6XNpnJ4SOgjKvxE83hltCdL5Z32CupQ2VZ0,6454
167
+ tests/test_redis_config.py,sha256=Kbl3PURGNM1BUIspakEOA-ZOl2xxTHb_8KbftwjYOsg,921
168
+ tests/test_redis_connection_pool.py,sha256=wLPmJ94jUajpShNrnnl4pTbX9ZIGqCEZgzzecasAC4s,9471
169
+ tests/test_redis_key_naming.py,sha256=44A174UkKdj2ipjJa4N0RWIN8CYun6w08yylg8Yu9ac,6957
170
+ tests/test_redis_key_validator.py,sha256=Yw0EsDQuI3Gzt_s86BubBP2VU_s7fyI-TthEX5J5C-o,4427
171
+ tests/test_redis_queue.py,sha256=5LTc86A5qqF5VbmkvkF2OnLAxlJ7ClfJPw0dODxekFk,6846
172
+ tests/test_request_serialization.py,sha256=Jf7Kr7edL0ENwxh8ABa1W_O3dWyqNlvoSfQM1Mykpys,2359
173
+ tests/test_response_improvements.py,sha256=vNqHKyoEoYeEGAUiRzdsff2V6yvJ9QnDwGg7gmN38Ow,6028
174
+ tests/test_scheduler.py,sha256=elAPFh-Ph49bbJQlTBEsRwzhoX82EdryqQbpc_wsobU,7683
175
+ tests/test_simple_response.py,sha256=_ui2PuVZvJcAuLY7HZ8xcsy_tDBimgBqX0ukj3kE5J0,1549
176
+ tests/test_telecom_spider_redis_key.py,sha256=SUCUK_CgHgl1wj59o2a1g4qJpLUyW4reXv-5Bjuk6Ko,7675
177
+ tests/test_template_content.py,sha256=URwjlAzMCdUN0sW_OupUcuSNMxp1OKgW79JOpkLPXnw,2965
178
+ tests/test_template_redis_key.py,sha256=dOFutic8CL3tOzGbYhWbMrYiXZ8R3fhNoF5VKax5Iy0,4946
179
+ tests/test_tools.py,sha256=fgzXL2L7eBV_nGjeMxH8IMhfc0dviQ80XgzZkJp_4dA,5266
180
+ tests/tools_example.py,sha256=uXNS4xXJ-OD_xInAn2zjKLG_nlbgVGXZLoJtfhaG9lI,7926
181
+ crawlo-1.1.5.dist-info/METADATA,sha256=24KOm7Z64Y-moon13p7n7jV4XBelSFZEfwOs0LtCBDI,20068
182
+ crawlo-1.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
183
+ crawlo-1.1.5.dist-info/entry_points.txt,sha256=5HoVoTSPxI8SCa5B7pQYxLSrkOdiunyO9tqNsLMv52g,43
184
+ crawlo-1.1.5.dist-info/top_level.txt,sha256=keG_67pbZ_wZL2dmDRA9RMaNHTaV_x_oxZ9DKNgwvR0,22
185
+ crawlo-1.1.5.dist-info/RECORD,,
@@ -0,0 +1,276 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Crawlo框架高级工具使用示例
5
+ """
6
+ from crawlo.tools import (
7
+ # 数据处理工具
8
+ clean_text,
9
+ format_currency,
10
+ validate_email,
11
+ validate_url,
12
+ check_data_integrity,
13
+
14
+ # 重试机制
15
+ RetryMechanism,
16
+ retry,
17
+ exponential_backoff,
18
+
19
+ # 反爬虫应对工具
20
+ AntiCrawler,
21
+ rotate_proxy,
22
+ handle_captcha,
23
+ detect_rate_limiting,
24
+
25
+ # 带认证代理工具
26
+ AuthenticatedProxy,
27
+ create_proxy_config,
28
+ format_proxy_for_request,
29
+
30
+ # 分布式协调工具
31
+ generate_pagination_tasks,
32
+ distribute_tasks,
33
+ DistributedCoordinator
34
+ )
35
+
36
+
37
def demo_data_processing_tools():
    """Demonstrate the data-processing helpers imported from ``crawlo.tools``.

    Runs, in order: ``clean_text`` on an HTML fragment, ``format_currency``
    on a float, ``validate_email`` / ``validate_url`` on sample values, and
    ``check_data_integrity`` on a small record. Every result is printed;
    the function itself returns nothing.
    """
    print("=== 数据处理工具演示 ===\n")

    # Text cleaning: the sample contains tags and HTML entities.
    raw_html = "<p>这是一个&nbsp;<b>测试</b>&amp;文本</p>"
    cleaned = clean_text(raw_html)
    print(f"清洗文本: {raw_html} -> {cleaned}")

    # Currency formatting: symbol "¥" and 2 passed as the extra arguments
    # (presumably currency symbol and decimal places -- confirm in crawlo.tools).
    amount = 1234.567
    pretty_amount = format_currency(amount, "¥", 2)
    print(f"格式化货币: {amount} -> {pretty_amount}")

    # Field validation: map the truthiness of each validator to a label.
    address = "test@example.com"
    if validate_email(address):
        email_label = "有效"
    else:
        email_label = "无效"
    print(f"验证邮箱: {address} -> {email_label}")

    link = "https://example.com"
    if validate_url(link):
        url_label = "有效"
    else:
        url_label = "无效"
    print(f"验证URL: {link} -> {url_label}")

    # Integrity check: "address" is listed as required but is absent from
    # the record, so the check is exercised with a missing field.
    record = {
        "name": "张三",
        "email": "test@example.com",
        "phone": "13812345678",
    }
    must_have = ["name", "email", "phone", "address"]
    report = check_data_integrity(record, must_have)
    print(f"数据完整性检查: {report}")

    print()
71
+
72
+
73
def demo_retry_mechanism():
    """Demonstrate ``exponential_backoff`` and the ``retry`` decorator.

    First prints the delay returned by ``exponential_backoff`` for attempts
    0 through 4, then calls a deliberately flaky function wrapped with
    ``@retry(max_retries=3)`` and prints either its result or the error
    that escaped. Returns nothing.
    """
    print("=== 重试机制演示 ===\n")

    # Exponential backoff: show the delay computed for each attempt index.
    for attempt in range(5):
        print(f"重试次数 {attempt}: 延迟 {exponential_backoff(attempt):.2f} 秒")

    # Retry decorator: the wrapped function raises on roughly 70% of calls.
    # Presumably ``retry`` re-invokes it on failure -- confirm in crawlo.tools.
    @retry(max_retries=3)
    def unreliable_function():
        import random
        if random.random() >= 0.7:
            return "成功执行"
        raise ConnectionError("网络连接失败")

    # Best-effort demo: report whatever exception escapes instead of crashing.
    try:
        outcome = unreliable_function()
        print(f"函数执行结果: {outcome}")
    except Exception as exc:
        print(f"函数执行失败: {exc}")

    print()
97
+
98
+
99
def demo_anti_crawler_tools():
    """Demonstrate the ``AntiCrawler`` helper from ``crawlo.tools``.

    Creates one ``AntiCrawler`` instance and prints the results of
    ``get_random_user_agent``, ``rotate_proxy``, ``handle_captcha`` and
    ``detect_rate_limiting`` on sample inputs. Returns nothing.
    """
    print("=== 反爬虫应对工具演示 ===\n")

    helper = AntiCrawler()

    # Random User-Agent: only the first 50 characters are shown.
    agent = helper.get_random_user_agent()
    print(f"随机User-Agent: {agent[:50]}...")

    # Proxy rotation.
    next_proxy = helper.rotate_proxy()
    print(f"轮换代理: {next_proxy}")

    # Captcha detection on a sample response text.
    sample_body = "请输入验证码进行验证"
    if helper.handle_captcha(sample_body):
        captcha_label = "需要验证码"
    else:
        captcha_label = "无需验证码"
    print(f"检测验证码: {sample_body} -> {captcha_label}")

    # Rate-limit detection using HTTP 429 (Too Many Requests) plus a
    # Retry-After header.
    http_status = 429
    headers = {"Retry-After": "60"}
    if helper.detect_rate_limiting(http_status, headers):
        limit_label = "被限制"
    else:
        limit_label = "未限制"
    print(f"检测频率限制: 状态码{http_status} -> {limit_label}")

    print()
126
+
127
+
128
def demo_authenticated_proxy_tools():
    """Demonstrate the authenticated-proxy helpers from ``crawlo.tools``.

    For each sample proxy URL, builds an ``AuthenticatedProxy``, then for
    each downloader name calls ``create_proxy_config`` followed by
    ``format_proxy_for_request`` and prints the formatted result.
    Returns nothing.
    """
    print("=== 带认证代理工具演示 ===\n")

    downloader_names = ["aiohttp", "httpx", "curl_cffi"]

    # Sample proxy URLs of different kinds.
    sample_proxies = (
        "http://user:pass@proxy1.example.com:8080",  # HTTP proxy with credentials
        "https://username:password@proxy2.example.com:443",  # HTTPS proxy with credentials
        "http://proxy3.example.com:8080",  # proxy without credentials
    )

    for proxy_url in sample_proxies:
        print(f"处理代理: {proxy_url}")

        # The proxy object is constructed but not read again in this demo.
        proxy = AuthenticatedProxy(proxy_url)

        # Format the proxy configuration for each downloader type.
        for downloader in downloader_names:
            formatted = format_proxy_for_request(
                create_proxy_config(proxy_url), downloader
            )
            print(f" {downloader}格式: {formatted}")

    # NOTE(review): placement assumed at function level, matching the other
    # demos; confirm against the upstream file.
    print()
152
+
153
+
154
def demo_distributed_coordinator_tools():
    """Demonstrate the distributed-coordination helpers from ``crawlo.tools``.

    Prints a preview of the tasks returned by ``generate_pagination_tasks``,
    the per-worker split returned by ``distribute_tasks``, and the value of
    ``DistributedCoordinator().get_cluster_info()``. Returns nothing.
    """
    print("=== 分布式协调工具演示 ===\n")

    # Pagination tasks: 1 and 5 are passed after the base URL (presumably
    # first and last page -- confirm in crawlo.tools).
    preview_limit = 3
    base_url = "https://example.com/products"
    pagination_tasks = generate_pagination_tasks(base_url, 1, 5)
    print(f"生成分页任务 ({len(pagination_tasks)} 个):")
    # Only the first few tasks are listed; the rest are summarised.
    for position, task in enumerate(pagination_tasks[:preview_limit], start=1):
        print(f" {position}. {task}")
    remaining = len(pagination_tasks) - preview_limit
    if remaining > 0:
        print(f" ... 还有 {remaining} 个任务")

    # Task distribution: the header is built from the actual task and worker
    # counts so it cannot drift from the values passed to distribute_tasks.
    tasks = list(range(1, 21))
    worker_count = 4
    distributed = distribute_tasks(tasks, worker_count)
    print(f"\n任务分发 ({len(tasks)}个任务分发给{worker_count}个工作节点):")
    for worker_no, worker_tasks in enumerate(distributed, start=1):
        print(f" 工作节点 {worker_no}: {len(worker_tasks)} 个任务 -> {worker_tasks}")

    # Coordinator: print whatever cluster information it reports.
    coordinator = DistributedCoordinator()
    cluster_info = coordinator.get_cluster_info()
    print(f"\n集群信息: {cluster_info}")

    print()
180
+
181
+
182
def demo_in_spider():
    """Print a sample spider showing the advanced tools used together.

    Nothing is executed from the sample: it is a string literal that is
    written to stdout after two heading lines. Returns nothing.
    """
    # The sample source is kept verbatim as runtime text.
    sample_source = """
import asyncio
from crawlo import Spider, Request
from crawlo.tools import (
    clean_text,
    validate_email,
    AntiCrawler,
    DistributedCoordinator,
    retry,
    AuthenticatedProxy
)

class AdvancedSpider(Spider):
    def __init__(self):
        super().__init__()
        self.anti_crawler = AntiCrawler()
        self.coordinator = DistributedCoordinator()
        # 代理列表
        self.proxy_urls = [
            "http://user1:pass1@proxy1.example.com:8080",
            "http://user2:pass2@proxy2.example.com:8080",
            "http://proxy3.example.com:8080" # 不带认证
        ]

    def start_requests(self):
        # 生成分页任务
        base_url = "https://api.example.com/products"
        pagination_tasks = self.coordinator.generate_pagination_tasks(base_url, 1, 100)

        for i, url in enumerate(pagination_tasks):
            # 轮换使用代理
            proxy_url = self.proxy_urls[i % len(self.proxy_urls)]
            proxy = AuthenticatedProxy(proxy_url)

            request = Request(url, callback=self.parse)

            # 根据下载器类型设置代理
            downloader_type = self.crawler.settings.get("DOWNLOADER_TYPE", "aiohttp")
            if downloader_type == "aiohttp":
                request.proxy = proxy.clean_url
                auth = proxy.get_auth_credentials()
                if auth:
                    request.meta["proxy_auth"] = auth
            elif downloader_type == "httpx":
                request.proxy = proxy.clean_url
            elif downloader_type == "curl_cffi":
                request.proxy = proxy.proxy_dict
                auth_header = proxy.get_auth_header()
                if auth_header:
                    request.headers["Proxy-Authorization"] = auth_header

            yield request

    @retry(max_retries=3)
    async def parse(self, response):
        # 检查是否遇到验证码
        if self.anti_crawler.handle_captcha(response.text):
            # 处理验证码逻辑
            print("遇到验证码,需要处理")
            return

        # 提取数据
        products = response.css('.product-item')
        for product in products:
            name = product.css('.product-name::text').get()
            price_text = product.css('.price::text').get()
            email = product.css('.contact-email::text').get()

            # 数据清洗和验证
            clean_name = clean_text(name) if name else None
            clean_price = clean_text(price_text) if price_text else None
            is_valid_email = validate_email(email) if email else False

            # 检查数据是否重复
            if not await self.coordinator.is_duplicate({"name": clean_name, "price": clean_price}):
                # 添加到去重集合
                await self.coordinator.add_to_dedup({"name": clean_name, "price": clean_price})

                # 处理产品数据...
                pass
"""

    print("=== 在爬虫中使用高级工具 ===\n")
    print("在爬虫项目中,您可以这样使用高级工具:")
    print(sample_source)
267
+
268
+
269
if __name__ == '__main__':
    # Script entry point: run every demo once, in the order listed.
    for run_demo in (
        demo_data_processing_tools,
        demo_retry_mechanism,
        demo_anti_crawler_tools,
        demo_authenticated_proxy_tools,
        demo_distributed_coordinator_tools,
        demo_in_spider,
    ):
        run_demo()