crawlo 1.4.2.tar.gz → 1.4.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (346)
  1. {crawlo-1.4.2 → crawlo-1.4.4}/LICENSE +22 -22
  2. {crawlo-1.4.2 → crawlo-1.4.4}/MANIFEST.in +16 -16
  3. crawlo-1.4.4/PKG-INFO +190 -0
  4. crawlo-1.4.4/README.md +140 -0
  5. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/__init__.py +93 -93
  6. crawlo-1.4.4/crawlo/__version__.py +1 -0
  7. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/cli.py +75 -75
  8. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/__init__.py +14 -14
  9. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/check.py +594 -594
  10. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/genspider.py +186 -151
  11. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/help.py +138 -138
  12. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/list.py +155 -155
  13. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/run.py +341 -341
  14. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/startproject.py +436 -436
  15. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/stats.py +187 -187
  16. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/commands/utils.py +196 -196
  17. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/config.py +312 -312
  18. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/config_validator.py +277 -277
  19. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/core/__init__.py +52 -52
  20. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/core/engine.py +438 -439
  21. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/core/processor.py +47 -47
  22. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/core/scheduler.py +291 -257
  23. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/crawler.py +656 -650
  24. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/data/__init__.py +5 -5
  25. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/data/user_agents.py +194 -194
  26. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/__init__.py +273 -273
  27. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/aiohttp_downloader.py +233 -233
  28. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/cffi_downloader.py +245 -245
  29. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/httpx_downloader.py +259 -259
  30. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/hybrid_downloader.py +212 -212
  31. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/playwright_downloader.py +402 -402
  32. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/downloader/selenium_downloader.py +472 -472
  33. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/event.py +11 -11
  34. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/exceptions.py +81 -81
  35. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/__init__.py +63 -63
  36. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/health_check.py +141 -141
  37. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/log_interval.py +94 -94
  38. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/log_stats.py +70 -70
  39. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/logging_extension.py +61 -61
  40. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/memory_monitor.py +104 -104
  41. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/performance_profiler.py +133 -133
  42. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/extension/request_recorder.py +107 -107
  43. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/factories/__init__.py +27 -27
  44. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/factories/base.py +68 -68
  45. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/factories/crawler.py +103 -103
  46. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/factories/registry.py +84 -84
  47. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/filters/__init__.py +154 -154
  48. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/filters/aioredis_filter.py +257 -257
  49. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/filters/memory_filter.py +269 -269
  50. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/framework.py +292 -292
  51. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/initialization/__init__.py +44 -44
  52. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/initialization/built_in.py +425 -425
  53. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/initialization/context.py +141 -141
  54. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/initialization/core.py +193 -193
  55. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/initialization/phases.py +148 -148
  56. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/initialization/registry.py +145 -145
  57. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/items/__init__.py +23 -23
  58. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/items/base.py +23 -23
  59. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/items/fields.py +52 -52
  60. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/items/items.py +104 -104
  61. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/logging/__init__.py +45 -37
  62. crawlo-1.4.4/crawlo/logging/async_handler.py +181 -0
  63. crawlo-1.4.4/crawlo/logging/config.py +197 -0
  64. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/logging/factory.py +171 -128
  65. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/logging/manager.py +111 -111
  66. crawlo-1.4.4/crawlo/logging/monitor.py +153 -0
  67. crawlo-1.4.4/crawlo/logging/sampler.py +167 -0
  68. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/__init__.py +21 -21
  69. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/default_header.py +132 -132
  70. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/download_delay.py +104 -104
  71. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/middleware_manager.py +135 -135
  72. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/offsite.py +123 -123
  73. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/proxy.py +386 -386
  74. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/request_ignore.py +86 -86
  75. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/response_code.py +150 -150
  76. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/response_filter.py +136 -136
  77. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/retry.py +124 -124
  78. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/middleware/simple_proxy.py +65 -65
  79. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/mode_manager.py +219 -219
  80. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/network/__init__.py +21 -21
  81. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/network/request.py +379 -379
  82. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/network/response.py +359 -359
  83. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/__init__.py +21 -21
  84. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  85. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/console_pipeline.py +39 -39
  86. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/csv_pipeline.py +316 -316
  87. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/database_dedup_pipeline.py +197 -197
  88. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/json_pipeline.py +218 -218
  89. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  90. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/mongo_pipeline.py +131 -131
  91. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/mysql_pipeline.py +325 -325
  92. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/pipeline_manager.py +100 -84
  93. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/pipelines/redis_dedup_pipeline.py +156 -156
  94. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/project.py +349 -338
  95. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/queue/pqueue.py +38 -42
  96. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/queue/queue_manager.py +525 -522
  97. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/queue/redis_priority_queue.py +370 -367
  98. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/settings/__init__.py +7 -7
  99. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/settings/default_settings.py +265 -284
  100. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/settings/setting_manager.py +219 -219
  101. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/spider/__init__.py +657 -657
  102. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/stats_collector.py +73 -73
  103. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/subscriber.py +129 -129
  104. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/task_manager.py +138 -138
  105. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/crawlo.cfg.tmpl +10 -10
  106. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/__init__.py.tmpl +3 -3
  107. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/items.py.tmpl +17 -17
  108. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/middlewares.py.tmpl +118 -118
  109. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/pipelines.py.tmpl +96 -96
  110. crawlo-1.4.4/crawlo/templates/project/settings.py.tmpl +157 -0
  111. crawlo-1.4.4/crawlo/templates/project/settings_distributed.py.tmpl +162 -0
  112. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/settings_gentle.py.tmpl +171 -166
  113. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/settings_high_performance.py.tmpl +172 -167
  114. crawlo-1.4.4/crawlo/templates/project/settings_minimal.py.tmpl +77 -0
  115. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/settings_simple.py.tmpl +169 -164
  116. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  117. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/run.py.tmpl +30 -34
  118. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/spider/spider.py.tmpl +143 -143
  119. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/templates/spiders_init.py.tmpl +9 -9
  120. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/__init__.py +200 -200
  121. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/anti_crawler.py +268 -268
  122. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/authenticated_proxy.py +240 -240
  123. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/data_formatter.py +225 -225
  124. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/data_validator.py +180 -180
  125. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/date_tools.py +289 -289
  126. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/distributed_coordinator.py +384 -384
  127. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/encoding_converter.py +127 -127
  128. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/network_diagnostic.py +364 -364
  129. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/request_tools.py +82 -82
  130. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/retry_mechanism.py +224 -224
  131. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/scenario_adapter.py +262 -262
  132. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/tools/text_cleaner.py +232 -232
  133. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/__init__.py +34 -34
  134. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/batch_processor.py +259 -259
  135. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/class_loader.py +25 -25
  136. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/controlled_spider_mixin.py +439 -439
  137. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/db_helper.py +343 -343
  138. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/enhanced_error_handler.py +356 -356
  139. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/env_config.py +142 -142
  140. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/error_handler.py +165 -165
  141. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/fingerprint.py +122 -122
  142. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/func_tools.py +82 -82
  143. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/large_scale_config.py +286 -286
  144. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/large_scale_helper.py +344 -344
  145. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/log.py +79 -79
  146. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/performance_monitor.py +285 -285
  147. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/queue_helper.py +175 -175
  148. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/redis_connection_pool.py +388 -388
  149. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/redis_key_validator.py +198 -198
  150. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/request.py +267 -267
  151. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/request_serializer.py +225 -225
  152. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/spider_loader.py +61 -61
  153. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/system.py +11 -11
  154. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/tools.py +4 -4
  155. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/utils/url.py +39 -39
  156. crawlo-1.4.4/crawlo.egg-info/PKG-INFO +190 -0
  157. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo.egg-info/SOURCES.txt +20 -9
  158. {crawlo-1.4.2 → crawlo-1.4.4}/examples/__init__.py +7 -7
  159. {crawlo-1.4.2 → crawlo-1.4.4}/pyproject.toml +2 -2
  160. {crawlo-1.4.2 → crawlo-1.4.4}/requirements.txt +34 -33
  161. {crawlo-1.4.2 → crawlo-1.4.4}/setup.cfg +71 -71
  162. {crawlo-1.4.2 → crawlo-1.4.4}/tests/__init__.py +7 -7
  163. {crawlo-1.4.2 → crawlo-1.4.4}/tests/advanced_tools_example.py +275 -275
  164. {crawlo-1.4.2 → crawlo-1.4.4}/tests/authenticated_proxy_example.py +106 -106
  165. {crawlo-1.4.2 → crawlo-1.4.4}/tests/baidu_performance_test.py +108 -108
  166. {crawlo-1.4.2 → crawlo-1.4.4}/tests/baidu_test.py +59 -59
  167. {crawlo-1.4.2 → crawlo-1.4.4}/tests/cleaners_example.py +160 -160
  168. {crawlo-1.4.2 → crawlo-1.4.4}/tests/comprehensive_framework_test.py +212 -212
  169. {crawlo-1.4.2 → crawlo-1.4.4}/tests/comprehensive_test.py +81 -81
  170. {crawlo-1.4.2 → crawlo-1.4.4}/tests/comprehensive_testing_summary.md +186 -186
  171. {crawlo-1.4.2 → crawlo-1.4.4}/tests/config_validation_demo.py +142 -142
  172. {crawlo-1.4.2 → crawlo-1.4.4}/tests/controlled_spider_example.py +205 -205
  173. {crawlo-1.4.2 → crawlo-1.4.4}/tests/date_tools_example.py +180 -180
  174. {crawlo-1.4.2 → crawlo-1.4.4}/tests/debug_configure.py +69 -69
  175. {crawlo-1.4.2 → crawlo-1.4.4}/tests/debug_framework_logger.py +84 -84
  176. {crawlo-1.4.2 → crawlo-1.4.4}/tests/debug_log_config.py +126 -126
  177. {crawlo-1.4.2 → crawlo-1.4.4}/tests/debug_log_levels.py +63 -63
  178. {crawlo-1.4.2 → crawlo-1.4.4}/tests/debug_pipelines.py +66 -66
  179. {crawlo-1.4.2 → crawlo-1.4.4}/tests/detailed_log_test.py +233 -233
  180. {crawlo-1.4.2 → crawlo-1.4.4}/tests/distributed_test.py +66 -66
  181. {crawlo-1.4.2 → crawlo-1.4.4}/tests/distributed_test_debug.py +76 -76
  182. {crawlo-1.4.2 → crawlo-1.4.4}/tests/dynamic_loading_example.py +523 -523
  183. {crawlo-1.4.2 → crawlo-1.4.4}/tests/dynamic_loading_test.py +104 -104
  184. {crawlo-1.4.2 → crawlo-1.4.4}/tests/env_config_example.py +133 -133
  185. {crawlo-1.4.2 → crawlo-1.4.4}/tests/error_handling_example.py +171 -171
  186. {crawlo-1.4.2 → crawlo-1.4.4}/tests/final_comprehensive_test.py +151 -151
  187. {crawlo-1.4.2 → crawlo-1.4.4}/tests/final_log_test.py +260 -260
  188. {crawlo-1.4.2 → crawlo-1.4.4}/tests/final_validation_test.py +182 -182
  189. {crawlo-1.4.2 → crawlo-1.4.4}/tests/fix_log_test.py +142 -142
  190. {crawlo-1.4.2 → crawlo-1.4.4}/tests/framework_performance_test.py +202 -202
  191. {crawlo-1.4.2 → crawlo-1.4.4}/tests/log_buffering_test.py +111 -111
  192. {crawlo-1.4.2 → crawlo-1.4.4}/tests/log_generation_timing_test.py +153 -153
  193. {crawlo-1.4.2 → crawlo-1.4.4}/tests/optimized_performance_test.py +211 -211
  194. {crawlo-1.4.2 → crawlo-1.4.4}/tests/performance_comparison.py +245 -245
  195. {crawlo-1.4.2 → crawlo-1.4.4}/tests/queue_blocking_test.py +113 -113
  196. {crawlo-1.4.2 → crawlo-1.4.4}/tests/queue_test.py +89 -89
  197. {crawlo-1.4.2 → crawlo-1.4.4}/tests/redis_key_validation_demo.py +130 -130
  198. {crawlo-1.4.2 → crawlo-1.4.4}/tests/request_params_example.py +150 -150
  199. {crawlo-1.4.2 → crawlo-1.4.4}/tests/response_improvements_example.py +144 -144
  200. {crawlo-1.4.2 → crawlo-1.4.4}/tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. {crawlo-1.4.2 → crawlo-1.4.4}/tests/scrapy_comparison/scrapy_test.py +133 -133
  202. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_command_test.py +119 -119
  203. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_crawlo_test.py +127 -127
  204. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_log_test.py +57 -57
  205. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_log_test2.py +137 -137
  206. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_optimization_test.py +128 -128
  207. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_queue_type_test.py +41 -41
  208. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_spider_test.py +49 -49
  209. {crawlo-1.4.2 → crawlo-1.4.4}/tests/simple_test.py +47 -47
  210. {crawlo-1.4.2 → crawlo-1.4.4}/tests/spider_log_timing_test.py +177 -177
  211. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_advanced_tools.py +148 -148
  212. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_all_commands.py +230 -230
  213. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_all_pipeline_fingerprints.py +133 -133
  214. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_all_redis_key_configs.py +145 -145
  215. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_authenticated_proxy.py +141 -141
  216. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_batch_processor.py +178 -178
  217. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_cleaners.py +54 -54
  218. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_component_factory.py +174 -174
  219. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_comprehensive.py +146 -146
  220. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_config_consistency.py +80 -80
  221. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_config_merge.py +152 -152
  222. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_config_validator.py +182 -182
  223. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_controlled_spider_mixin.py +79 -79
  224. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_crawlo_proxy_integration.py +108 -108
  225. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_date_tools.py +123 -123
  226. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_dedup_fix.py +220 -220
  227. crawlo-1.4.4/tests/test_dedup_pipeline_consistency.py +125 -0
  228. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_default_header_middleware.py +313 -313
  229. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_distributed.py +65 -65
  230. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_double_crawlo_fix.py +204 -204
  231. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_double_crawlo_fix_simple.py +124 -124
  232. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_download_delay_middleware.py +221 -221
  233. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_downloader_proxy_compatibility.py +268 -268
  234. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_dynamic_downloaders_proxy.py +124 -124
  235. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_dynamic_proxy.py +92 -92
  236. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_dynamic_proxy_config.py +146 -146
  237. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_dynamic_proxy_real.py +109 -109
  238. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_edge_cases.py +303 -303
  239. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_enhanced_error_handler.py +270 -270
  240. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_enhanced_error_handler_comprehensive.py +245 -245
  241. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_env_config.py +121 -121
  242. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_error_handler_compatibility.py +112 -112
  243. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_factories.py +252 -252
  244. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_final_validation.py +153 -153
  245. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_fingerprint_consistency.py +135 -135
  246. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_fingerprint_simple.py +51 -51
  247. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_framework_env_usage.py +103 -103
  248. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_framework_logger.py +66 -66
  249. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_framework_startup.py +64 -64
  250. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_get_component_logger.py +83 -83
  251. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_hash_performance.py +99 -99
  252. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_integration.py +169 -169
  253. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_item_dedup_redis_key.py +122 -122
  254. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_large_scale_config.py +112 -112
  255. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_large_scale_helper.py +235 -235
  256. crawlo-1.4.4/tests/test_logging_enhancements.py +375 -0
  257. crawlo-1.4.4/tests/test_logging_final.py +185 -0
  258. crawlo-1.4.4/tests/test_logging_integration.py +313 -0
  259. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_logging_system.py +282 -282
  260. crawlo-1.4.4/tests/test_middleware_debug.py +142 -0
  261. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_mode_change.py +72 -72
  262. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_mode_consistency.py +51 -51
  263. crawlo-1.4.4/tests/test_multi_directory.py +68 -0
  264. crawlo-1.4.4/tests/test_multiple_spider_modules.py +81 -0
  265. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_offsite_middleware.py +244 -244
  266. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_offsite_middleware_simple.py +203 -203
  267. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_parsel.py +29 -29
  268. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_performance.py +327 -327
  269. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_performance_monitor.py +115 -115
  270. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_pipeline_fingerprint_consistency.py +86 -86
  271. crawlo-1.4.4/tests/test_priority_behavior.py +212 -0
  272. crawlo-1.4.4/tests/test_priority_consistency.py +152 -0
  273. crawlo-1.4.4/tests/test_priority_consistency_fixed.py +250 -0
  274. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_api.py +264 -264
  275. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_health_check.py +32 -32
  276. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_middleware.py +121 -121
  277. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_middleware_enhanced.py +216 -216
  278. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_middleware_integration.py +136 -136
  279. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_middleware_refactored.py +184 -184
  280. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_providers.py +56 -56
  281. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_stats.py +19 -19
  282. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_proxy_strategies.py +59 -59
  283. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_queue_empty_check.py +41 -41
  284. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_queue_manager_double_crawlo.py +173 -173
  285. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_queue_manager_redis_key.py +179 -179
  286. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_queue_naming.py +154 -154
  287. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_queue_type.py +106 -106
  288. crawlo-1.4.4/tests/test_queue_type_redis_config_consistency.py +131 -0
  289. crawlo-1.4.4/tests/test_random_headers_default.py +323 -0
  290. crawlo-1.4.4/tests/test_random_headers_necessity.py +309 -0
  291. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_random_user_agent.py +72 -72
  292. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_real_scenario_proxy.py +195 -195
  293. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_redis_config.py +28 -28
  294. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_redis_connection_pool.py +294 -294
  295. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_redis_key_naming.py +181 -181
  296. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_redis_key_validator.py +123 -123
  297. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_redis_queue.py +224 -224
  298. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_redis_queue_name_fix.py +175 -175
  299. crawlo-1.4.4/tests/test_redis_queue_type_fallback.py +130 -0
  300. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_request_ignore_middleware.py +182 -182
  301. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_request_params.py +111 -111
  302. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_request_serialization.py +70 -70
  303. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_response_code_middleware.py +349 -349
  304. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_response_filter_middleware.py +427 -427
  305. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_response_improvements.py +152 -152
  306. crawlo-1.4.4/tests/test_retry_middleware.py +334 -0
  307. crawlo-1.4.4/tests/test_retry_middleware_realistic.py +274 -0
  308. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_scheduler.py +252 -252
  309. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_scheduler_config_update.py +133 -133
  310. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_simple_response.py +61 -61
  311. crawlo-1.4.4/tests/test_spider_modules.py +85 -0
  312. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_telecom_spider_redis_key.py +205 -205
  313. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_template_content.py +87 -87
  314. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_template_redis_key.py +134 -134
  315. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_tools.py +159 -159
  316. crawlo-1.4.4/tests/test_user_agent_randomness.py +177 -0
  317. {crawlo-1.4.2 → crawlo-1.4.4}/tests/test_user_agents.py +96 -96
  318. {crawlo-1.4.2 → crawlo-1.4.4}/tests/tools_example.py +260 -260
  319. {crawlo-1.4.2 → crawlo-1.4.4}/tests/untested_features_report.md +138 -138
  320. {crawlo-1.4.2 → crawlo-1.4.4}/tests/verify_debug.py +51 -51
  321. {crawlo-1.4.2 → crawlo-1.4.4}/tests/verify_distributed.py +117 -117
  322. {crawlo-1.4.2 → crawlo-1.4.4}/tests/verify_log_fix.py +111 -111
  323. crawlo-1.4.2/PKG-INFO +0 -1199
  324. crawlo-1.4.2/README.md +0 -1149
  325. crawlo-1.4.2/crawlo/__version__.py +0 -1
  326. crawlo-1.4.2/crawlo/logging/config.py +0 -97
  327. crawlo-1.4.2/crawlo/templates/project/settings.py.tmpl +0 -171
  328. crawlo-1.4.2/crawlo/templates/project/settings_distributed.py.tmpl +0 -170
  329. crawlo-1.4.2/crawlo/templates/project/settings_minimal.py.tmpl +0 -66
  330. crawlo-1.4.2/crawlo.egg-info/PKG-INFO +0 -1199
  331. crawlo-1.4.2/examples/test_project/__init__.py +0 -7
  332. crawlo-1.4.2/examples/test_project/run.py +0 -35
  333. crawlo-1.4.2/examples/test_project/test_project/__init__.py +0 -4
  334. crawlo-1.4.2/examples/test_project/test_project/items.py +0 -18
  335. crawlo-1.4.2/examples/test_project/test_project/middlewares.py +0 -119
  336. crawlo-1.4.2/examples/test_project/test_project/pipelines.py +0 -97
  337. crawlo-1.4.2/examples/test_project/test_project/settings.py +0 -170
  338. crawlo-1.4.2/examples/test_project/test_project/spiders/__init__.py +0 -10
  339. crawlo-1.4.2/examples/test_project/test_project/spiders/of_week_dis.py +0 -144
  340. crawlo-1.4.2/tests/test_retry_middleware.py +0 -242
  341. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo/queue/__init__.py +0 -0
  342. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo.egg-info/dependency_links.txt +0 -0
  343. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo.egg-info/entry_points.txt +0 -0
  344. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo.egg-info/requires.txt +0 -0
  345. {crawlo-1.4.2 → crawlo-1.4.4}/crawlo.egg-info/top_level.txt +0 -0
  346. {crawlo-1.4.2 → crawlo-1.4.4}/tests/final_command_test_report.md +0 -0
{crawlo-1.4.2 → crawlo-1.4.4}/LICENSE
@@ -1,23 +1,23 @@
- MIT License
-
- Modifications:
-
- Copyright (c) 2020 crawl-coder <2251018029@qq.com>
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ MIT License
+
+ Modifications:
+
+ Copyright (c) 2020 crawl-coder <2251018029@qq.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
{crawlo-1.4.2 → crawlo-1.4.4}/MANIFEST.in
@@ -1,17 +1,17 @@
- include README.md
- include LICENSE
- include requirements.txt # if the repository root has a global requirements.txt
- include VERSION # if the repository root has a global VERSION file
-
- # Files included inside the package
- recursive-include crawlo/utils/js *
- recursive-include crawlo/templates *
-
- # Test files (if tests should be shipped in the distribution)
- recursive-include tests *
-
- # Exclusions
- global-exclude __pycache__ *.py[cod] .DS_Store *.so
- global-exclude *.bak *.swp *.orig *.rej
- prune samples # exclude the samples directory
+ include README.md
+ include LICENSE
+ include requirements.txt # if the repository root has a global requirements.txt
+ include VERSION # if the repository root has a global VERSION file
+
+ # Files included inside the package
+ recursive-include crawlo/utils/js *
+ recursive-include crawlo/templates *
+
+ # Test files (if tests should be shipped in the distribution)
+ recursive-include tests *
+
+ # Exclusions
+ global-exclude __pycache__ *.py[cod] .DS_Store *.so
+ global-exclude *.bak *.swp *.orig *.rej
+ prune samples # exclude the samples directory
  prune docs # exclude the docs directory
crawlo-1.4.4/PKG-INFO ADDED
@@ -0,0 +1,190 @@
+ Metadata-Version: 2.4
+ Name: crawlo
+ Version: 1.4.4
+ Summary: Crawlo is a high-performance Python crawler framework built on asynchronous IO, with support for distributed crawling.
+ Home-page: https://github.com/crawl-coder/Crawlo.git
+ Author: crawl-coder
+ Author-email: crawlo@qq.com
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.12.14
+ Requires-Dist: aiomysql>=0.2.0
+ Requires-Dist: aioredis>=2.0.1
+ Requires-Dist: asyncmy>=0.2.10
+ Requires-Dist: cssselect>=1.2.0
+ Requires-Dist: dateparser>=1.2.2
+ Requires-Dist: httpx[http2]>=0.27.0
+ Requires-Dist: curl-cffi>=0.13.0
+ Requires-Dist: lxml>=5.2.1
+ Requires-Dist: motor>=3.7.0
+ Requires-Dist: parsel>=1.9.1
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pymongo>=4.11
+ Requires-Dist: PyMySQL>=1.1.1
+ Requires-Dist: python-dateutil>=2.9.0.post0
+ Requires-Dist: redis>=6.2.0
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: six>=1.17.0
+ Requires-Dist: ujson>=5.9.0
+ Requires-Dist: urllib3>=2.5.0
+ Requires-Dist: w3lib>=2.1.2
+ Requires-Dist: rich>=14.1.0
+ Requires-Dist: astor>=0.8.1
+ Requires-Dist: watchdog>=6.0.0
+ Provides-Extra: render
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+ Requires-Dist: playwright; extra == "render"
+ Requires-Dist: selenium>=3.141.0; extra == "render"
+ Provides-Extra: all
+ Requires-Dist: bitarray>=1.5.3; extra == "all"
+ Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+ Requires-Dist: pymongo>=3.10.1; extra == "all"
+ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+ Requires-Dist: playwright; extra == "all"
+ Requires-Dist: selenium>=3.141.0; extra == "all"
+
+ # Crawlo Crawler Framework
+
+ Crawlo is a high-performance, extensible Python crawler framework that supports both standalone and distributed deployment.
+
+ ## Features
+
+ - High-performance asynchronous crawling
+ - Multiple downloaders supported (aiohttp, httpx, curl-cffi)
+ - Built-in data cleaning and validation
+ - Distributed crawling support
+ - Flexible middleware system
+ - Powerful configuration management
+ - Detailed logging and monitoring
+ - Compatible with Windows and Linux
+
+ ## Installation
+
+ ```bash
+ pip install crawlo
+ ```
+
+ Or install from source:
+
+ ```bash
+ git clone https://github.com/your-username/crawlo.git
+ cd crawlo
+ pip install -r requirements.txt
+ pip install .
+ ```
+
+ ## Quick Start
+
+ ```python
+ from crawlo import Spider
+
+ class MySpider(Spider):
+     name = 'example'
+
+     def parse(self, response):
+         # parsing logic
+         pass
+
+ # Run the spider
+ # crawlo run example
+ ```
+
+ ## Logging System
+
+ Crawlo ships with a powerful logging system that supports a range of configuration options:
+
+ ### Basic Configuration
+
+ ```python
+ from crawlo.logging import configure_logging, get_logger
+
+ # Configure the logging system
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_FILE='logs/app.log',
+     LOG_MAX_BYTES=10*1024*1024,  # 10MB
+     LOG_BACKUP_COUNT=5
+ )
+
+ # Get a logger
+ logger = get_logger('my_module')
+ logger.info('This is a log message')
+ ```
+
+ ### Advanced Configuration
+
+ ```python
+ # Configure console and file log levels separately
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_CONSOLE_LEVEL='WARNING',  # console shows only WARNING and above
+     LOG_FILE_LEVEL='DEBUG',       # file records DEBUG and above
+     LOG_FILE='logs/app.log',
+     LOG_INCLUDE_THREAD_ID=True,   # include thread ID
+     LOG_INCLUDE_PROCESS_ID=True   # include process ID
+ )
+
+ # Module-specific log levels
+ configure_logging(
+     LOG_LEVEL='WARNING',
+     LOG_LEVELS={
+         'my_module.debug': 'DEBUG',
+         'my_module.info': 'INFO'
+     }
+ )
+ ```
+
+ ### Performance Monitoring
+
+ ```python
+ from crawlo.logging import get_monitor
+
+ # Enable logging performance monitoring
+ monitor = get_monitor()
+ monitor.enable_monitoring()
+
+ # Get a performance report
+ report = monitor.get_performance_report()
+ print(report)
+ ```
+
+ ### Log Sampling
+
+ ```python
+ from crawlo.logging import get_sampler
+
+ # Set a sampling rate (record only 30% of log messages)
+ sampler = get_sampler()
+ sampler.set_sample_rate('my_module', 0.3)
+
+ # Set a rate limit (at most 100 log messages per second)
+ sampler.set_rate_limit('my_module', 100)
+ ```
+
+ ## Windows Compatibility Notes
+
+ When using log rotation on Windows you may run into file-locking problems. To avoid this, install the `concurrent-log-handler` library:
+
+ ```bash
+ pip install concurrent-log-handler
+ ```
+
+ Crawlo automatically detects and uses this library to provide better Windows compatibility.
+
+ If `concurrent-log-handler` is not installed, running on Windows may produce the following error:
+ ```
+ PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
+ ```
+
+ ## Documentation
+
+ See the [documentation](https://your-docs-url.com) for more information.
+
+ ## License
+
+ MIT
crawlo-1.4.4/README.md ADDED
@@ -0,0 +1,140 @@
+ # Crawlo Crawler Framework
+
+ Crawlo is a high-performance, extensible Python crawler framework that supports both standalone and distributed deployment.
+
+ ## Features
+
+ - High-performance asynchronous crawling
+ - Multiple downloaders supported (aiohttp, httpx, curl-cffi)
+ - Built-in data cleaning and validation
+ - Distributed crawling support
+ - Flexible middleware system
+ - Powerful configuration management
+ - Detailed logging and monitoring
+ - Compatible with Windows and Linux
+
+ ## Installation
+
+ ```bash
+ pip install crawlo
+ ```
+
+ Or install from source:
+
+ ```bash
+ git clone https://github.com/your-username/crawlo.git
+ cd crawlo
+ pip install -r requirements.txt
+ pip install .
+ ```
+
+ ## Quick Start
+
+ ```python
+ from crawlo import Spider
+
+ class MySpider(Spider):
+     name = 'example'
+
+     def parse(self, response):
+         # parsing logic
+         pass
+
+ # Run the spider
+ # crawlo run example
+ ```
+
+ ## Logging System
+
+ Crawlo ships with a powerful logging system that supports a range of configuration options:
+
+ ### Basic Configuration
+
+ ```python
+ from crawlo.logging import configure_logging, get_logger
+
+ # Configure the logging system
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_FILE='logs/app.log',
+     LOG_MAX_BYTES=10*1024*1024,  # 10MB
+     LOG_BACKUP_COUNT=5
+ )
+
+ # Get a logger
+ logger = get_logger('my_module')
+ logger.info('This is a log message')
+ ```
+
+ ### Advanced Configuration
+
+ ```python
+ # Configure console and file log levels separately
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_CONSOLE_LEVEL='WARNING',  # console shows only WARNING and above
+     LOG_FILE_LEVEL='DEBUG',       # file records DEBUG and above
+     LOG_FILE='logs/app.log',
+     LOG_INCLUDE_THREAD_ID=True,   # include thread ID
+     LOG_INCLUDE_PROCESS_ID=True   # include process ID
+ )
+
+ # Module-specific log levels
+ configure_logging(
+     LOG_LEVEL='WARNING',
+     LOG_LEVELS={
+         'my_module.debug': 'DEBUG',
+         'my_module.info': 'INFO'
+     }
+ )
+ ```
+
+ ### Performance Monitoring
+
+ ```python
+ from crawlo.logging import get_monitor
+
+ # Enable logging performance monitoring
+ monitor = get_monitor()
+ monitor.enable_monitoring()
+
+ # Get a performance report
+ report = monitor.get_performance_report()
+ print(report)
+ ```
+
+ ### Log Sampling
+
+ ```python
+ from crawlo.logging import get_sampler
+
+ # Set a sampling rate (record only 30% of log messages)
+ sampler = get_sampler()
+ sampler.set_sample_rate('my_module', 0.3)
+
+ # Set a rate limit (at most 100 log messages per second)
+ sampler.set_rate_limit('my_module', 100)
+ ```
+
+ ## Windows Compatibility Notes
+
+ When using log rotation on Windows you may run into file-locking problems. To avoid this, install the `concurrent-log-handler` library:
+
+ ```bash
+ pip install concurrent-log-handler
+ ```
+
+ Crawlo automatically detects and uses this library to provide better Windows compatibility.
+
+ If `concurrent-log-handler` is not installed, running on Windows may produce the following error:
+ ```
+ PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
+ ```
+
+ ## Documentation
+
+ See the [documentation](https://your-docs-url.com) for more information.
+
+ ## License
+
+ MIT
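
The Windows compatibility section of the new README says Crawlo auto-detects `concurrent-log-handler` and falls back when it is absent. As a rough, hypothetical sketch only (not taken from the crawlo source; the helper name `build_rotating_handler` is made up), such detection is usually just a guarded import around the handler class:

```python
import logging

# Prefer the optional concurrent-log-handler package (avoids Windows file-locking
# problems during rotation); otherwise fall back to the standard library handler.
try:
    from concurrent_log_handler import ConcurrentRotatingFileHandler as _RotatingHandler
except ImportError:
    from logging.handlers import RotatingFileHandler as _RotatingHandler


def build_rotating_handler(path, max_bytes=10 * 1024 * 1024, backup_count=5):
    """Return a size-based rotating file handler using whichever class is available."""
    handler = _RotatingHandler(path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
    return handler
```

Keeping `max_bytes` and `backup_count` in line with the `LOG_MAX_BYTES` and `LOG_BACKUP_COUNT` values from the `configure_logging` example above keeps both rotation paths consistent.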
{crawlo-1.4.2 → crawlo-1.4.4}/crawlo/__init__.py
@@ -1,93 +1,93 @@
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
- """
- Crawlo - an asynchronous crawler framework
- """
- from typing import TYPE_CHECKING
-
- from crawlo.spider import Spider
- from crawlo.items import Item, Field
- from crawlo.network.request import Request
- from crawlo.network.response import Response
- from crawlo.downloader import DownloaderBase
- from crawlo.middleware import BaseMiddleware
- from crawlo.utils import (
-     TimeUtils,
-     parse_time,
-     format_time,
-     time_diff,
-     to_timestamp,
-     to_datetime,
-     now,
-     to_timezone,
-     to_utc,
-     to_local,
-     from_timestamp_with_tz
- )
- from crawlo import tools
-
- # Framework core modules - use TYPE_CHECKING to avoid circular imports
- if TYPE_CHECKING:
-     from crawlo.initialization import get_framework_initializer, initialize_framework
-
- # For backwards compatibility, expose the cleaners-related functionality from tools
- import crawlo.tools as cleaners
-
-
- # Lazy-import helper functions
- def get_framework_initializer():
-     """Lazily import get_framework_initializer to avoid circular dependencies"""
-     from crawlo.initialization import get_framework_initializer as _get_framework_initializer
-     return _get_framework_initializer()
-
-
- def initialize_framework(custom_settings=None):
-     """Lazily import initialize_framework to avoid circular dependencies"""
-     from crawlo.initialization import initialize_framework as _initialize_framework
-     return _initialize_framework(custom_settings)
-
-
- # Backwards-compatible alias
- def get_bootstrap_manager():
-     """Backwards-compatible alias"""
-     return get_framework_initializer()
-
-
- # Version number: prefer reading from package metadata
- try:
-     from importlib.metadata import version
-
-     __version__ = version("crawlo")
- except Exception:
-     # May not be installed in development mode; fall back to __version__.py or "dev"
-     try:
-         from crawlo.__version__ import __version__
-     except ImportError:
-         __version__ = "dev"
-
- # Public API definition
- __all__ = [
-     'Spider',
-     'Item',
-     'Field',
-     'Request',
-     'Response',
-     'DownloaderBase',
-     'BaseMiddleware',
-     'TimeUtils',
-     'parse_time',
-     'format_time',
-     'time_diff',
-     'to_timestamp',
-     'to_datetime',
-     'now',
-     'to_timezone',
-     'to_utc',
-     'to_local',
-     'from_timestamp_with_tz',
-     'cleaners',
-     'tools',
-     'get_framework_initializer',
-     'get_bootstrap_manager',
-     '__version__',
- ]
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Crawlo - an asynchronous crawler framework
+ """
+ from typing import TYPE_CHECKING
+
+ from crawlo.spider import Spider
+ from crawlo.items import Item, Field
+ from crawlo.network.request import Request
+ from crawlo.network.response import Response
+ from crawlo.downloader import DownloaderBase
+ from crawlo.middleware import BaseMiddleware
+ from crawlo.utils import (
+     TimeUtils,
+     parse_time,
+     format_time,
+     time_diff,
+     to_timestamp,
+     to_datetime,
+     now,
+     to_timezone,
+     to_utc,
+     to_local,
+     from_timestamp_with_tz
+ )
+ from crawlo import tools
+
+ # Framework core modules - use TYPE_CHECKING to avoid circular imports
+ if TYPE_CHECKING:
+     from crawlo.initialization import get_framework_initializer, initialize_framework
+
+ # For backwards compatibility, expose the cleaners-related functionality from tools
+ import crawlo.tools as cleaners
+
+
+ # Lazy-import helper functions
+ def get_framework_initializer():
+     """Lazily import get_framework_initializer to avoid circular dependencies"""
+     from crawlo.initialization import get_framework_initializer as _get_framework_initializer
+     return _get_framework_initializer()
+
+
+ def initialize_framework(custom_settings=None):
+     """Lazily import initialize_framework to avoid circular dependencies"""
+     from crawlo.initialization import initialize_framework as _initialize_framework
+     return _initialize_framework(custom_settings)
+
+
+ # Backwards-compatible alias
+ def get_bootstrap_manager():
+     """Backwards-compatible alias"""
+     return get_framework_initializer()
+
+
+ # Version number: prefer reading from package metadata
+ try:
+     from importlib.metadata import version
+
+     __version__ = version("crawlo")
+ except Exception:
+     # May not be installed in development mode; fall back to __version__.py or "dev"
+     try:
+         from crawlo.__version__ import __version__
+     except ImportError:
+         __version__ = "dev"
+
+ # Public API definition
+ __all__ = [
+     'Spider',
+     'Item',
+     'Field',
+     'Request',
+     'Response',
+     'DownloaderBase',
+     'BaseMiddleware',
+     'TimeUtils',
+     'parse_time',
+     'format_time',
+     'time_diff',
+     'to_timestamp',
+     'to_datetime',
+     'now',
+     'to_timezone',
+     'to_utc',
+     'to_local',
+     'from_timestamp_with_tz',
+     'cleaners',
+     'tools',
+     'get_framework_initializer',
+     'get_bootstrap_manager',
+     '__version__',
+ ]
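
For reference, a minimal, hypothetical usage sketch of the API re-exported by this `__init__.py` (the spider mirrors the README quick-start example; none of this is part of the package):

```python
import crawlo

# Version resolution order shown above: importlib.metadata -> crawlo/__version__.py -> "dev".
print(crawlo.__version__)


# Spider, Item, Field, Request and Response are importable from the package root.
class DemoSpider(crawlo.Spider):
    name = "demo"

    def parse(self, response):
        # parsing logic goes here
        pass


# get_framework_initializer() lazily imports crawlo.initialization on first call.
initializer = crawlo.get_framework_initializer()
```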
crawlo-1.4.4/crawlo/__version__.py ADDED
@@ -0,0 +1 @@
+ __version__ = '1.4.4'