crawlo 1.4.4__tar.gz → 1.4.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (392)
  1. crawlo-1.4.6/PKG-INFO +329 -0
  2. crawlo-1.4.6/README.md +279 -0
  3. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/__init__.py +11 -15
  4. crawlo-1.4.6/crawlo/__version__.py +1 -0
  5. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/startproject.py +24 -0
  6. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/core/engine.py +2 -2
  7. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/core/scheduler.py +4 -4
  8. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/crawler.py +8 -7
  9. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/__init__.py +5 -2
  10. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/cffi_downloader.py +3 -1
  11. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/__init__.py +2 -2
  12. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/filters/aioredis_filter.py +8 -1
  13. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/filters/memory_filter.py +8 -1
  14. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/initialization/built_in.py +13 -4
  15. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/initialization/core.py +5 -4
  16. crawlo-1.4.6/crawlo/interfaces.py +24 -0
  17. crawlo-1.4.6/crawlo/middleware/__init__.py +24 -0
  18. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/middleware_manager.py +15 -8
  19. crawlo-1.4.6/crawlo/middleware/proxy.py +209 -0
  20. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/mode_manager.py +45 -11
  21. crawlo-1.4.6/crawlo/network/response.py +665 -0
  22. crawlo-1.4.6/crawlo/pipelines/mysql_pipeline.py +477 -0
  23. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/pipeline_manager.py +2 -2
  24. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/project.py +2 -4
  25. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/settings/default_settings.py +42 -30
  26. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/stats_collector.py +10 -1
  27. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/task_manager.py +2 -2
  28. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/items.py.tmpl +2 -2
  29. crawlo-1.4.6/crawlo/templates/project/middlewares.py.tmpl +39 -0
  30. crawlo-1.4.6/crawlo/templates/project/pipelines.py.tmpl +37 -0
  31. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/settings.py.tmpl +10 -55
  32. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/settings_distributed.py.tmpl +20 -22
  33. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/settings_gentle.py.tmpl +5 -0
  34. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/settings_high_performance.py.tmpl +5 -0
  35. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/settings_minimal.py.tmpl +25 -1
  36. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/settings_simple.py.tmpl +5 -0
  37. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/run.py.tmpl +1 -8
  38. crawlo-1.4.6/crawlo/templates/spider/spider.py.tmpl +41 -0
  39. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/__init__.py +0 -11
  40. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/__init__.py +17 -1
  41. crawlo-1.4.6/crawlo/utils/db_helper.py +251 -0
  42. crawlo-1.4.4/crawlo/utils/enhanced_error_handler.py → crawlo-1.4.6/crawlo/utils/error_handler.py +57 -3
  43. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/fingerprint.py +3 -4
  44. crawlo-1.4.6/crawlo/utils/misc.py +82 -0
  45. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/request.py +55 -66
  46. crawlo-1.4.6/crawlo/utils/selector_helper.py +138 -0
  47. crawlo-1.4.6/crawlo/utils/spider_loader.py +202 -0
  48. crawlo-1.4.6/crawlo/utils/text_helper.py +95 -0
  49. crawlo-1.4.6/crawlo.egg-info/PKG-INFO +329 -0
  50. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo.egg-info/SOURCES.txt +52 -11
  51. {crawlo-1.4.4 → crawlo-1.4.6}/tests/authenticated_proxy_example.py +10 -6
  52. crawlo-1.4.6/tests/bug_check_test.py +251 -0
  53. crawlo-1.4.6/tests/direct_selector_helper_test.py +97 -0
  54. crawlo-1.4.6/tests/explain_mysql_update_behavior.py +77 -0
  55. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  56. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  57. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  58. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  59. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  60. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +162 -0
  61. crawlo-1.4.6/tests/ofweek_scrapy/scrapy.cfg +11 -0
  62. {crawlo-1.4.4 → crawlo-1.4.6}/tests/performance_comparison.py +4 -5
  63. {crawlo-1.4.4 → crawlo-1.4.6}/tests/simple_crawlo_test.py +1 -2
  64. crawlo-1.4.6/tests/simple_follow_test.py +39 -0
  65. crawlo-1.4.6/tests/simple_response_selector_test.py +95 -0
  66. crawlo-1.4.6/tests/simple_selector_helper_test.py +155 -0
  67. crawlo-1.4.6/tests/simple_selector_test.py +208 -0
  68. crawlo-1.4.6/tests/simple_url_test.py +74 -0
  69. crawlo-1.4.6/tests/simulate_mysql_update_test.py +140 -0
  70. crawlo-1.4.6/tests/test_asyncmy_usage.py +57 -0
  71. crawlo-1.4.6/tests/test_crawler_process_import.py +39 -0
  72. crawlo-1.4.6/tests/test_crawler_process_spider_modules.py +48 -0
  73. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_crawlo_proxy_integration.py +8 -2
  74. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_downloader_proxy_compatibility.py +24 -20
  75. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_edge_cases.py +7 -5
  76. crawlo-1.4.6/tests/test_encoding_core.py +57 -0
  77. crawlo-1.4.6/tests/test_encoding_detection.py +127 -0
  78. crawlo-1.4.6/tests/test_factory_compatibility.py +197 -0
  79. crawlo-1.4.6/tests/test_mysql_pipeline_config.py +165 -0
  80. crawlo-1.4.6/tests/test_mysql_pipeline_error.py +99 -0
  81. crawlo-1.4.6/tests/test_mysql_pipeline_init_log.py +83 -0
  82. crawlo-1.4.6/tests/test_mysql_pipeline_integration.py +133 -0
  83. crawlo-1.4.6/tests/test_mysql_pipeline_refactor.py +144 -0
  84. crawlo-1.4.6/tests/test_mysql_pipeline_refactor_simple.py +86 -0
  85. crawlo-1.4.6/tests/test_mysql_pipeline_robustness.py +196 -0
  86. crawlo-1.4.6/tests/test_mysql_pipeline_types.py +89 -0
  87. crawlo-1.4.6/tests/test_mysql_update_columns.py +94 -0
  88. crawlo-1.4.6/tests/test_optimized_selector_naming.py +101 -0
  89. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_priority_behavior.py +18 -18
  90. crawlo-1.4.6/tests/test_proxy_middleware.py +218 -0
  91. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_middleware_enhanced.py +1 -5
  92. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_middleware_integration.py +7 -2
  93. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_middleware_refactored.py +25 -2
  94. crawlo-1.4.6/tests/test_proxy_only.py +84 -0
  95. crawlo-1.4.6/tests/test_proxy_with_downloader.py +153 -0
  96. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_real_scenario_proxy.py +17 -17
  97. crawlo-1.4.6/tests/test_response_follow.py +105 -0
  98. crawlo-1.4.6/tests/test_response_selector_methods.py +93 -0
  99. crawlo-1.4.6/tests/test_response_url_methods.py +71 -0
  100. crawlo-1.4.6/tests/test_response_urljoin.py +87 -0
  101. crawlo-1.4.6/tests/test_scrapy_style_encoding.py +113 -0
  102. crawlo-1.4.6/tests/test_selector_helper.py +101 -0
  103. crawlo-1.4.6/tests/test_selector_optimizations.py +147 -0
  104. crawlo-1.4.6/tests/test_spider_loader.py +50 -0
  105. crawlo-1.4.6/tests/test_spider_loader_comprehensive.py +70 -0
  106. crawlo-1.4.6/tests/test_spiders/__init__.py +1 -0
  107. crawlo-1.4.6/tests/test_spiders/test_spider.py +10 -0
  108. crawlo-1.4.6/tests/verify_mysql_warnings.py +110 -0
  109. crawlo-1.4.4/PKG-INFO +0 -190
  110. crawlo-1.4.4/README.md +0 -140
  111. crawlo-1.4.4/crawlo/__version__.py +0 -1
  112. crawlo-1.4.4/crawlo/middleware/__init__.py +0 -21
  113. crawlo-1.4.4/crawlo/middleware/proxy.py +0 -386
  114. crawlo-1.4.4/crawlo/middleware/simple_proxy.py +0 -65
  115. crawlo-1.4.4/crawlo/network/response.py +0 -360
  116. crawlo-1.4.4/crawlo/pipelines/mysql_pipeline.py +0 -326
  117. crawlo-1.4.4/crawlo/templates/project/middlewares.py.tmpl +0 -119
  118. crawlo-1.4.4/crawlo/templates/project/pipelines.py.tmpl +0 -97
  119. crawlo-1.4.4/crawlo/templates/spider/spider.py.tmpl +0 -144
  120. crawlo-1.4.4/crawlo/tools/anti_crawler.py +0 -269
  121. crawlo-1.4.4/crawlo/utils/class_loader.py +0 -26
  122. crawlo-1.4.4/crawlo/utils/db_helper.py +0 -344
  123. crawlo-1.4.4/crawlo/utils/error_handler.py +0 -165
  124. crawlo-1.4.4/crawlo/utils/spider_loader.py +0 -62
  125. crawlo-1.4.4/crawlo.egg-info/PKG-INFO +0 -190
  126. crawlo-1.4.4/tests/simple_log_test.py +0 -58
  127. crawlo-1.4.4/tests/simple_test.py +0 -48
  128. crawlo-1.4.4/tests/test_framework_logger.py +0 -67
  129. crawlo-1.4.4/tests/test_framework_startup.py +0 -65
  130. crawlo-1.4.4/tests/test_mode_change.py +0 -73
  131. crawlo-1.4.4/tests/test_proxy_middleware.py +0 -122
  132. {crawlo-1.4.4 → crawlo-1.4.6}/LICENSE +0 -0
  133. {crawlo-1.4.4 → crawlo-1.4.6}/MANIFEST.in +0 -0
  134. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/cli.py +0 -0
  135. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/__init__.py +0 -0
  136. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/check.py +0 -0
  137. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/genspider.py +0 -0
  138. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/help.py +0 -0
  139. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/list.py +0 -0
  140. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/run.py +0 -0
  141. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/stats.py +0 -0
  142. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/utils.py +0 -0
  143. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/config.py +0 -0
  144. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/config_validator.py +0 -0
  145. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/core/__init__.py +0 -0
  146. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/core/processor.py +0 -0
  147. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/data/__init__.py +0 -0
  148. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/data/user_agents.py +0 -0
  149. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/aiohttp_downloader.py +0 -0
  150. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/httpx_downloader.py +0 -0
  151. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/hybrid_downloader.py +0 -0
  152. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/playwright_downloader.py +0 -0
  153. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/downloader/selenium_downloader.py +0 -0
  154. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/event.py +0 -0
  155. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/exceptions.py +0 -0
  156. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/health_check.py +0 -0
  157. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/log_interval.py +0 -0
  158. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/log_stats.py +0 -0
  159. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/logging_extension.py +0 -0
  160. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/memory_monitor.py +0 -0
  161. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/performance_profiler.py +0 -0
  162. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/extension/request_recorder.py +0 -0
  163. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/factories/__init__.py +0 -0
  164. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/factories/base.py +0 -0
  165. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/factories/crawler.py +0 -0
  166. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/factories/registry.py +0 -0
  167. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/filters/__init__.py +0 -0
  168. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/framework.py +0 -0
  169. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/initialization/__init__.py +0 -0
  170. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/initialization/context.py +0 -0
  171. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/initialization/phases.py +0 -0
  172. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/initialization/registry.py +0 -0
  173. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/items/__init__.py +0 -0
  174. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/items/base.py +0 -0
  175. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/items/fields.py +0 -0
  176. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/items/items.py +0 -0
  177. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/__init__.py +0 -0
  178. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/async_handler.py +0 -0
  179. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/config.py +0 -0
  180. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/factory.py +0 -0
  181. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/manager.py +0 -0
  182. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/monitor.py +0 -0
  183. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/logging/sampler.py +0 -0
  184. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/default_header.py +0 -0
  185. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/download_delay.py +0 -0
  186. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/offsite.py +0 -0
  187. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/request_ignore.py +0 -0
  188. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/response_code.py +0 -0
  189. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/response_filter.py +0 -0
  190. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/middleware/retry.py +0 -0
  191. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/network/__init__.py +0 -0
  192. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/network/request.py +0 -0
  193. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/__init__.py +0 -0
  194. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  195. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/console_pipeline.py +0 -0
  196. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/csv_pipeline.py +0 -0
  197. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  198. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/json_pipeline.py +0 -0
  199. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  200. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/mongo_pipeline.py +0 -0
  201. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  202. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/queue/__init__.py +0 -0
  203. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/queue/pqueue.py +0 -0
  204. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/queue/queue_manager.py +0 -0
  205. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/queue/redis_priority_queue.py +0 -0
  206. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/settings/__init__.py +0 -0
  207. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/settings/setting_manager.py +0 -0
  208. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/spider/__init__.py +0 -0
  209. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/subscriber.py +0 -0
  210. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  211. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/__init__.py.tmpl +0 -0
  212. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  213. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/templates/spiders_init.py.tmpl +0 -0
  214. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/authenticated_proxy.py +0 -0
  215. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/data_formatter.py +0 -0
  216. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/data_validator.py +0 -0
  217. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/date_tools.py +0 -0
  218. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/distributed_coordinator.py +0 -0
  219. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/encoding_converter.py +0 -0
  220. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/network_diagnostic.py +0 -0
  221. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/request_tools.py +0 -0
  222. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/retry_mechanism.py +0 -0
  223. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/scenario_adapter.py +0 -0
  224. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/tools/text_cleaner.py +0 -0
  225. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/batch_processor.py +0 -0
  226. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/controlled_spider_mixin.py +0 -0
  227. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/env_config.py +0 -0
  228. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/func_tools.py +0 -0
  229. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/large_scale_config.py +0 -0
  230. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/large_scale_helper.py +0 -0
  231. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/log.py +0 -0
  232. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/performance_monitor.py +0 -0
  233. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/queue_helper.py +0 -0
  234. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/redis_connection_pool.py +0 -0
  235. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/redis_key_validator.py +0 -0
  236. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/request_serializer.py +0 -0
  237. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/system.py +0 -0
  238. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/tools.py +0 -0
  239. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo/utils/url.py +0 -0
  240. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo.egg-info/dependency_links.txt +0 -0
  241. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo.egg-info/entry_points.txt +0 -0
  242. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo.egg-info/requires.txt +0 -0
  243. {crawlo-1.4.4 → crawlo-1.4.6}/crawlo.egg-info/top_level.txt +0 -0
  244. {crawlo-1.4.4 → crawlo-1.4.6}/examples/__init__.py +0 -0
  245. {crawlo-1.4.4 → crawlo-1.4.6}/pyproject.toml +0 -0
  246. {crawlo-1.4.4 → crawlo-1.4.6}/requirements.txt +0 -0
  247. {crawlo-1.4.4 → crawlo-1.4.6}/setup.cfg +0 -0
  248. {crawlo-1.4.4 → crawlo-1.4.6}/tests/__init__.py +0 -0
  249. {crawlo-1.4.4 → crawlo-1.4.6}/tests/advanced_tools_example.py +0 -0
  250. {crawlo-1.4.4 → crawlo-1.4.6}/tests/baidu_performance_test.py +0 -0
  251. {crawlo-1.4.4 → crawlo-1.4.6}/tests/baidu_test.py +0 -0
  252. {crawlo-1.4.4 → crawlo-1.4.6}/tests/cleaners_example.py +0 -0
  253. {crawlo-1.4.4 → crawlo-1.4.6}/tests/comprehensive_framework_test.py +0 -0
  254. {crawlo-1.4.4 → crawlo-1.4.6}/tests/comprehensive_test.py +0 -0
  255. {crawlo-1.4.4 → crawlo-1.4.6}/tests/comprehensive_testing_summary.md +0 -0
  256. {crawlo-1.4.4 → crawlo-1.4.6}/tests/config_validation_demo.py +0 -0
  257. {crawlo-1.4.4 → crawlo-1.4.6}/tests/controlled_spider_example.py +0 -0
  258. {crawlo-1.4.4 → crawlo-1.4.6}/tests/date_tools_example.py +0 -0
  259. {crawlo-1.4.4 → crawlo-1.4.6}/tests/debug_configure.py +0 -0
  260. {crawlo-1.4.4 → crawlo-1.4.6}/tests/debug_framework_logger.py +0 -0
  261. {crawlo-1.4.4 → crawlo-1.4.6}/tests/debug_log_config.py +0 -0
  262. {crawlo-1.4.4 → crawlo-1.4.6}/tests/debug_log_levels.py +0 -0
  263. {crawlo-1.4.4 → crawlo-1.4.6}/tests/debug_pipelines.py +0 -0
  264. {crawlo-1.4.4 → crawlo-1.4.6}/tests/detailed_log_test.py +0 -0
  265. {crawlo-1.4.4 → crawlo-1.4.6}/tests/distributed_test.py +0 -0
  266. {crawlo-1.4.4 → crawlo-1.4.6}/tests/distributed_test_debug.py +0 -0
  267. {crawlo-1.4.4 → crawlo-1.4.6}/tests/dynamic_loading_example.py +0 -0
  268. {crawlo-1.4.4 → crawlo-1.4.6}/tests/dynamic_loading_test.py +0 -0
  269. {crawlo-1.4.4 → crawlo-1.4.6}/tests/env_config_example.py +0 -0
  270. {crawlo-1.4.4 → crawlo-1.4.6}/tests/error_handling_example.py +0 -0
  271. {crawlo-1.4.4 → crawlo-1.4.6}/tests/final_comprehensive_test.py +0 -0
  272. {crawlo-1.4.4 → crawlo-1.4.6}/tests/final_log_test.py +0 -0
  273. {crawlo-1.4.4 → crawlo-1.4.6}/tests/final_validation_test.py +0 -0
  274. {crawlo-1.4.4 → crawlo-1.4.6}/tests/fix_log_test.py +0 -0
  275. {crawlo-1.4.4 → crawlo-1.4.6}/tests/framework_performance_test.py +0 -0
  276. {crawlo-1.4.4 → crawlo-1.4.6}/tests/log_buffering_test.py +0 -0
  277. {crawlo-1.4.4 → crawlo-1.4.6}/tests/log_generation_timing_test.py +0 -0
  278. /crawlo-1.4.4/tests/final_command_test_report.md → /crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/__init__.py +0 -0
  279. {crawlo-1.4.4 → crawlo-1.4.6}/tests/optimized_performance_test.py +0 -0
  280. {crawlo-1.4.4 → crawlo-1.4.6}/tests/queue_blocking_test.py +0 -0
  281. {crawlo-1.4.4 → crawlo-1.4.6}/tests/queue_test.py +0 -0
  282. {crawlo-1.4.4 → crawlo-1.4.6}/tests/redis_key_validation_demo.py +0 -0
  283. {crawlo-1.4.4 → crawlo-1.4.6}/tests/request_params_example.py +0 -0
  284. {crawlo-1.4.4 → crawlo-1.4.6}/tests/response_improvements_example.py +0 -0
  285. {crawlo-1.4.4 → crawlo-1.4.6}/tests/scrapy_comparison/ofweek_scrapy.py +0 -0
  286. {crawlo-1.4.4 → crawlo-1.4.6}/tests/scrapy_comparison/scrapy_test.py +0 -0
  287. {crawlo-1.4.4 → crawlo-1.4.6}/tests/simple_command_test.py +0 -0
  288. {crawlo-1.4.4 → crawlo-1.4.6}/tests/simple_log_test2.py +0 -0
  289. {crawlo-1.4.4 → crawlo-1.4.6}/tests/simple_optimization_test.py +0 -0
  290. {crawlo-1.4.4 → crawlo-1.4.6}/tests/simple_queue_type_test.py +0 -0
  291. {crawlo-1.4.4 → crawlo-1.4.6}/tests/simple_spider_test.py +0 -0
  292. {crawlo-1.4.4 → crawlo-1.4.6}/tests/spider_log_timing_test.py +0 -0
  293. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_advanced_tools.py +0 -0
  294. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_all_commands.py +0 -0
  295. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_all_pipeline_fingerprints.py +0 -0
  296. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_all_redis_key_configs.py +0 -0
  297. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_authenticated_proxy.py +0 -0
  298. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_batch_processor.py +0 -0
  299. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_cleaners.py +0 -0
  300. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_component_factory.py +0 -0
  301. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_comprehensive.py +0 -0
  302. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_config_consistency.py +0 -0
  303. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_config_merge.py +0 -0
  304. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_config_validator.py +0 -0
  305. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_controlled_spider_mixin.py +0 -0
  306. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_date_tools.py +0 -0
  307. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_dedup_fix.py +0 -0
  308. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_dedup_pipeline_consistency.py +0 -0
  309. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_default_header_middleware.py +0 -0
  310. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_distributed.py +0 -0
  311. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_double_crawlo_fix.py +0 -0
  312. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_double_crawlo_fix_simple.py +0 -0
  313. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_download_delay_middleware.py +0 -0
  314. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_dynamic_downloaders_proxy.py +0 -0
  315. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_dynamic_proxy.py +0 -0
  316. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_dynamic_proxy_config.py +0 -0
  317. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_dynamic_proxy_real.py +0 -0
  318. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_enhanced_error_handler.py +0 -0
  319. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_enhanced_error_handler_comprehensive.py +0 -0
  320. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_env_config.py +0 -0
  321. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_error_handler_compatibility.py +0 -0
  322. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_factories.py +0 -0
  323. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_final_validation.py +0 -0
  324. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_fingerprint_consistency.py +0 -0
  325. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_fingerprint_simple.py +0 -0
  326. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_framework_env_usage.py +0 -0
  327. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_get_component_logger.py +0 -0
  328. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_hash_performance.py +0 -0
  329. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_integration.py +0 -0
  330. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_item_dedup_redis_key.py +0 -0
  331. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_large_scale_config.py +0 -0
  332. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_large_scale_helper.py +0 -0
  333. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_logging_enhancements.py +0 -0
  334. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_logging_final.py +0 -0
  335. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_logging_integration.py +0 -0
  336. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_logging_system.py +0 -0
  337. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_middleware_debug.py +0 -0
  338. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_mode_consistency.py +0 -0
  339. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_multi_directory.py +0 -0
  340. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_multiple_spider_modules.py +0 -0
  341. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_offsite_middleware.py +0 -0
  342. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_offsite_middleware_simple.py +0 -0
  343. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_parsel.py +0 -0
  344. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_performance.py +0 -0
  345. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_performance_monitor.py +0 -0
  346. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_pipeline_fingerprint_consistency.py +0 -0
  347. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_priority_consistency.py +0 -0
  348. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_priority_consistency_fixed.py +0 -0
  349. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_api.py +0 -0
  350. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_health_check.py +0 -0
  351. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_providers.py +0 -0
  352. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_stats.py +0 -0
  353. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_proxy_strategies.py +0 -0
  354. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_queue_empty_check.py +0 -0
  355. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_queue_manager_double_crawlo.py +0 -0
  356. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_queue_manager_redis_key.py +0 -0
  357. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_queue_naming.py +0 -0
  358. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_queue_type.py +0 -0
  359. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_queue_type_redis_config_consistency.py +0 -0
  360. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_random_headers_default.py +0 -0
  361. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_random_headers_necessity.py +0 -0
  362. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_random_user_agent.py +0 -0
  363. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_config.py +0 -0
  364. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_connection_pool.py +0 -0
  365. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_key_naming.py +0 -0
  366. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_key_validator.py +0 -0
  367. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_queue.py +0 -0
  368. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_queue_name_fix.py +0 -0
  369. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_redis_queue_type_fallback.py +0 -0
  370. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_request_ignore_middleware.py +0 -0
  371. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_request_params.py +0 -0
  372. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_request_serialization.py +0 -0
  373. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_response_code_middleware.py +0 -0
  374. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_response_filter_middleware.py +0 -0
  375. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_response_improvements.py +0 -0
  376. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_retry_middleware.py +0 -0
  377. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_retry_middleware_realistic.py +0 -0
  378. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_scheduler.py +0 -0
  379. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_scheduler_config_update.py +0 -0
  380. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_simple_response.py +0 -0
  381. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_spider_modules.py +0 -0
  382. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_telecom_spider_redis_key.py +0 -0
  383. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_template_content.py +0 -0
  384. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_template_redis_key.py +0 -0
  385. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_tools.py +0 -0
  386. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_user_agent_randomness.py +0 -0
  387. {crawlo-1.4.4 → crawlo-1.4.6}/tests/test_user_agents.py +0 -0
  388. {crawlo-1.4.4 → crawlo-1.4.6}/tests/tools_example.py +0 -0
  389. {crawlo-1.4.4 → crawlo-1.4.6}/tests/untested_features_report.md +0 -0
  390. {crawlo-1.4.4 → crawlo-1.4.6}/tests/verify_debug.py +0 -0
  391. {crawlo-1.4.4 → crawlo-1.4.6}/tests/verify_distributed.py +0 -0
  392. {crawlo-1.4.4 → crawlo-1.4.6}/tests/verify_log_fix.py +0 -0
crawlo-1.4.6/PKG-INFO ADDED
@@ -0,0 +1,329 @@
+ Metadata-Version: 2.4
+ Name: crawlo
+ Version: 1.4.6
+ Summary: Crawlo is a high-performance Python crawler framework built on asynchronous I/O, with support for distributed crawling.
+ Home-page: https://github.com/crawl-coder/Crawlo.git
+ Author: crawl-coder
+ Author-email: crawlo@qq.com
+ License: MIT
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.12.14
+ Requires-Dist: aiomysql>=0.2.0
+ Requires-Dist: aioredis>=2.0.1
+ Requires-Dist: asyncmy>=0.2.10
+ Requires-Dist: cssselect>=1.2.0
+ Requires-Dist: dateparser>=1.2.2
+ Requires-Dist: httpx[http2]>=0.27.0
+ Requires-Dist: curl-cffi>=0.13.0
+ Requires-Dist: lxml>=5.2.1
+ Requires-Dist: motor>=3.7.0
+ Requires-Dist: parsel>=1.9.1
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pymongo>=4.11
+ Requires-Dist: PyMySQL>=1.1.1
+ Requires-Dist: python-dateutil>=2.9.0.post0
+ Requires-Dist: redis>=6.2.0
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: six>=1.17.0
+ Requires-Dist: ujson>=5.9.0
+ Requires-Dist: urllib3>=2.5.0
+ Requires-Dist: w3lib>=2.1.2
+ Requires-Dist: rich>=14.1.0
+ Requires-Dist: astor>=0.8.1
+ Requires-Dist: watchdog>=6.0.0
+ Provides-Extra: render
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
+ Requires-Dist: playwright; extra == "render"
+ Requires-Dist: selenium>=3.141.0; extra == "render"
+ Provides-Extra: all
+ Requires-Dist: bitarray>=1.5.3; extra == "all"
+ Requires-Dist: PyExecJS>=1.5.1; extra == "all"
+ Requires-Dist: pymongo>=3.10.1; extra == "all"
+ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
+ Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
+ Requires-Dist: playwright; extra == "all"
+ Requires-Dist: selenium>=3.141.0; extra == "all"
+
+ # Crawlo Crawler Framework
+
+ Crawlo is a high-performance, extensible Python crawler framework that supports both standalone and distributed deployment.
+
+ ## Features
+
+ - High-performance asynchronous crawling
+ - Multiple downloaders supported (aiohttp, httpx, curl-cffi)
+ - Built-in data cleaning and validation
+ - Distributed crawling support
+ - Flexible middleware system
+ - Powerful configuration management
+ - Detailed logging and monitoring
+ - Compatible with Windows and Linux
+
+ ## Installation
+
+ ```bash
+ pip install crawlo
+ ```
+
+ Or install from source:
+
+ ```bash
+ git clone git@github.com:crawl-coder/Crawlo.git
+ cd crawlo
+ pip install -r requirements.txt
+ pip install .
+ ```
+
+ ## Quick Start
+
+ ```python
+ from crawlo import Spider
+
+ class MySpider(Spider):
+     name = 'example'
+
+     def parse(self, response):
+         # parsing logic
+         pass
+
+ # Run the spider:
+ # crawlo run example
+ ```
+
+ ## Response Object Features
+
+ The Crawlo framework enhances the Response object with a number of convenience methods:
+
+ ### URL Handling
+
+ The URL-handling methods wrapped on the Response object make common URL operations straightforward, with no need to import functions from `urllib.parse` by hand:
+
+ ```python
+ class MySpider(Spider):
+     def parse(self, response):
+         # 1. Resolve relative and absolute URLs
+         absolute_url = response.urljoin('/relative/path')
+
+         # 2. Parse URL components
+         parsed = response.urlparse()  # parse the current response URL
+         scheme = parsed.scheme
+         domain = parsed.netloc
+         path = parsed.path
+
+         # 3. Parse query parameters
+         query_params = response.parse_qs()  # parse the current URL's query string
+
+         # 4. Encode query parameters
+         new_query = response.urlencode({'key': 'value', 'name': '测试'})
+
+         # 5. URL-encode / decode strings
+         encoded = response.quote('hello world 你好')
+         decoded = response.unquote(encoded)
+
+         # 6. Strip the URL fragment
+         url_without_fragment, fragment = response.urldefrag('http://example.com/path#section')
+
+         yield Request(url=absolute_url, callback=self.parse_detail)
+ ```
+
+ ### Improved Encoding Detection
+
+ Following Scrapy's design, the Crawlo framework has reworked the Response object's encoding detection for more accurate and reliable results:
+
+ ```python
+ class MySpider(Spider):
+     def parse(self, response):
+         # Automatically detect the response encoding
+         encoding = response.encoding
+
+         # Get the declared encoding (Request encoding > BOM > HTTP headers > HTML meta tag)
+         declared_encoding = response._declared_encoding()
+
+         # The response text is decoded with the correct encoding automatically
+         text = response.text
+
+         # Work with the decoded content
+         # ...
+ ```
+
+ Encoding detection priority:
+ 1. Encoding specified on the Request
+ 2. BOM (byte order mark)
+ 3. HTTP Content-Type header
+ 4. HTML meta tag declaration
+ 5. Automatic content-based detection
+ 6. Default encoding (utf-8)
+
+ ### Streamlined Selector Methods
+
+ The Crawlo framework streamlines the Response object's selector methods, providing more convenient data extraction under intuitive, consistent method names:
+
+ ```python
+ class MySpider(Spider):
+     def parse(self, response):
+         # 1. Extract the text of a single element (CSS and XPath both work)
+         title = response.extract_text('title')    # CSS selector
+         title = response.extract_text('//title')  # XPath selector
+
+         # 2. Extract the text of multiple elements
+         paragraphs = response.extract_texts('.content p')                  # CSS selector
+         paragraphs = response.extract_texts('//div[@class="content"]//p')  # XPath selector
+
+         # 3. Extract an attribute from a single element
+         link_href = response.extract_attr('a', 'href')                   # CSS selector
+         link_href = response.extract_attr('//a[@class="link"]', 'href')  # XPath selector
+
+         # 4. Extract an attribute from multiple elements
+         all_links = response.extract_attrs('a', 'href')                   # CSS selector
+         all_links = response.extract_attrs('//a[@class="link"]', 'href')  # XPath selector
+
+         yield {
+             'title': title,
+             'paragraphs': paragraphs,
+             'links': all_links
+         }
+ ```
+
+ All selector methods follow a concise, intuitive naming style that is easy to remember and use.
+
+ ### Utility Modules
+
+ The Crawlo framework ships a rich set of utility modules for common tasks. The selector helper functions now live in the `crawlo.utils.selector_helper` module:
+
+ ```python
+ from crawlo.utils import (
+     extract_text,
+     extract_texts,
+     extract_attr,
+     extract_attrs,
+     is_xpath
+ )
+
+ # Use these helpers in your own code
+ title_elements = response.css('title')
+ title_text = extract_text(title_elements)
+
+ li_elements = response.css('.list li')
+ li_texts = extract_texts(li_elements)
+
+ link_elements = response.css('.link')
+ link_href = extract_attr(link_elements, 'href')
+
+ all_links = response.css('a')
+ all_hrefs = extract_attrs(all_links, 'href')
+ ```
+
+ ## Logging System
+
+ Crawlo has a powerful logging system with a wide range of configuration options:
+
+ ### Basic Configuration
+
+ ```python
+ from crawlo.logging import configure_logging, get_logger
+
+ # Configure the logging system
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_FILE='logs/app.log',
+     LOG_MAX_BYTES=10*1024*1024,  # 10MB
+     LOG_BACKUP_COUNT=5
+ )
+
+ # Get a logger
+ logger = get_logger('my_module')
+ logger.info('This is a log message')
+ ```
+
+ ### Advanced Configuration
+
+ ```python
+ # Configure console and file log levels separately
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_CONSOLE_LEVEL='WARNING',  # console shows only WARNING and above
+     LOG_FILE_LEVEL='DEBUG',       # file records DEBUG and above
+     LOG_FILE='logs/app.log',
+     LOG_INCLUDE_THREAD_ID=True,   # include the thread ID
+     LOG_INCLUDE_PROCESS_ID=True   # include the process ID
+ )
+
+ # Module-specific log levels
+ configure_logging(
+     LOG_LEVEL='WARNING',
+     LOG_LEVELS={
+         'my_module.debug': 'DEBUG',
+         'my_module.info': 'INFO'
+     }
+ )
+ ```
+
+ ### Performance Monitoring
+
+ ```python
+ from crawlo.logging import get_monitor
+
+ # Enable logging performance monitoring
+ monitor = get_monitor()
+ monitor.enable_monitoring()
+
+ # Fetch a performance report
+ report = monitor.get_performance_report()
+ print(report)
+ ```
+
+ ### Log Sampling
+
+ ```python
+ from crawlo.logging import get_sampler
+
+ # Set a sampling rate (record only 30% of log messages)
+ sampler = get_sampler()
+ sampler.set_sample_rate('my_module', 0.3)
+
+ # Set a rate limit (at most 100 log messages per second)
+ sampler.set_rate_limit('my_module', 100)
+ ```
+
+ ## Windows Compatibility Notes
+
+ Log rotation on Windows can run into file-locking problems. To avoid them, install the `concurrent-log-handler` library:
+
+ ```bash
+ pip install concurrent-log-handler
+ ```
+
+ The Crawlo framework detects and uses this library automatically for better Windows compatibility.
+
+ Without `concurrent-log-handler`, running on Windows may raise the following error:
+ ```
+ PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
+ ```
+
+ ## Spider Auto-Discovery
+
+ The Crawlo framework can discover and load spiders automatically via the `SPIDER_MODULES` setting, much like Scrapy:
+
+ ```python
+ # settings.py
+ SPIDER_MODULES = [
+     'myproject.spiders',
+     'myproject.more_spiders',
+ ]
+
+ SPIDER_LOADER_WARN_ONLY = True  # warn instead of raising on loader errors
+ ```
+
+ The framework scans the configured module paths, discovering and registering the spider classes it finds.
+
+ ## Documentation
+
+ See the [documentation](https://your-docs-url.com) for more information.
+
+ ## License
+
+ MIT
crawlo-1.4.6/README.md ADDED
@@ -0,0 +1,279 @@
+ # Crawlo Crawler Framework
+
+ Crawlo is a high-performance, extensible Python crawler framework that supports both standalone and distributed deployment.
+
+ ## Features
+
+ - High-performance asynchronous crawling
+ - Multiple downloaders supported (aiohttp, httpx, curl-cffi)
+ - Built-in data cleaning and validation
+ - Distributed crawling support
+ - Flexible middleware system
+ - Powerful configuration management
+ - Detailed logging and monitoring
+ - Compatible with Windows and Linux
+
+ ## Installation
+
+ ```bash
+ pip install crawlo
+ ```
+
+ Or install from source:
+
+ ```bash
+ git clone git@github.com:crawl-coder/Crawlo.git
+ cd crawlo
+ pip install -r requirements.txt
+ pip install .
+ ```
+
+ ## Quick Start
+
+ ```python
+ from crawlo import Spider
+
+ class MySpider(Spider):
+     name = 'example'
+
+     def parse(self, response):
+         # parsing logic
+         pass
+
+ # Run the spider:
+ # crawlo run example
+ ```
+
+ ## Response Object Features
+
+ The Crawlo framework enhances the Response object with a number of convenience methods:
+
+ ### URL Handling
+
+ The URL-handling methods wrapped on the Response object make common URL operations straightforward, with no need to import functions from `urllib.parse` by hand:
+
+ ```python
+ class MySpider(Spider):
+     def parse(self, response):
+         # 1. Resolve relative and absolute URLs
+         absolute_url = response.urljoin('/relative/path')
+
+         # 2. Parse URL components
+         parsed = response.urlparse()  # parse the current response URL
+         scheme = parsed.scheme
+         domain = parsed.netloc
+         path = parsed.path
+
+         # 3. Parse query parameters
+         query_params = response.parse_qs()  # parse the current URL's query string
+
+         # 4. Encode query parameters
+         new_query = response.urlencode({'key': 'value', 'name': '测试'})
+
+         # 5. URL-encode / decode strings
+         encoded = response.quote('hello world 你好')
+         decoded = response.unquote(encoded)
+
+         # 6. Strip the URL fragment
+         url_without_fragment, fragment = response.urldefrag('http://example.com/path#section')
+
+         yield Request(url=absolute_url, callback=self.parse_detail)
+ ```
+
+ ### Improved Encoding Detection
+
+ Following Scrapy's design, the Crawlo framework has reworked the Response object's encoding detection for more accurate and reliable results:
+
+ ```python
+ class MySpider(Spider):
+     def parse(self, response):
+         # Automatically detect the response encoding
+         encoding = response.encoding
+
+         # Get the declared encoding (Request encoding > BOM > HTTP headers > HTML meta tag)
+         declared_encoding = response._declared_encoding()
+
+         # The response text is decoded with the correct encoding automatically
+         text = response.text
+
+         # Work with the decoded content
+         # ...
+ ```
+
+ Encoding detection priority:
+ 1. Encoding specified on the Request
+ 2. BOM (byte order mark)
+ 3. HTTP Content-Type header
+ 4. HTML meta tag declaration
+ 5. Automatic content-based detection
+ 6. Default encoding (utf-8)
+
+ ### Streamlined Selector Methods
+
+ The Crawlo framework streamlines the Response object's selector methods, providing more convenient data extraction under intuitive, consistent method names:
+
+ ```python
+ class MySpider(Spider):
+     def parse(self, response):
+         # 1. Extract the text of a single element (CSS and XPath both work)
+         title = response.extract_text('title')    # CSS selector
+         title = response.extract_text('//title')  # XPath selector
+
+         # 2. Extract the text of multiple elements
+         paragraphs = response.extract_texts('.content p')                  # CSS selector
+         paragraphs = response.extract_texts('//div[@class="content"]//p')  # XPath selector
+
+         # 3. Extract an attribute from a single element
+         link_href = response.extract_attr('a', 'href')                   # CSS selector
+         link_href = response.extract_attr('//a[@class="link"]', 'href')  # XPath selector
+
+         # 4. Extract an attribute from multiple elements
+         all_links = response.extract_attrs('a', 'href')                   # CSS selector
+         all_links = response.extract_attrs('//a[@class="link"]', 'href')  # XPath selector
+
+         yield {
+             'title': title,
+             'paragraphs': paragraphs,
+             'links': all_links
+         }
+ ```
+
+ All selector methods follow a concise, intuitive naming style that is easy to remember and use.
+
+ ### Utility Modules
+
+ The Crawlo framework ships a rich set of utility modules for common tasks. The selector helper functions now live in the `crawlo.utils.selector_helper` module:
+
+ ```python
+ from crawlo.utils import (
+     extract_text,
+     extract_texts,
+     extract_attr,
+     extract_attrs,
+     is_xpath
+ )
+
+ # Use these helpers in your own code
+ title_elements = response.css('title')
+ title_text = extract_text(title_elements)
+
+ li_elements = response.css('.list li')
+ li_texts = extract_texts(li_elements)
+
+ link_elements = response.css('.link')
+ link_href = extract_attr(link_elements, 'href')
+
+ all_links = response.css('a')
+ all_hrefs = extract_attrs(all_links, 'href')
+ ```
+
+ ## Logging System
+
+ Crawlo has a powerful logging system with a wide range of configuration options:
+
+ ### Basic Configuration
+
+ ```python
+ from crawlo.logging import configure_logging, get_logger
+
+ # Configure the logging system
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_FILE='logs/app.log',
+     LOG_MAX_BYTES=10*1024*1024,  # 10MB
+     LOG_BACKUP_COUNT=5
+ )
+
+ # Get a logger
+ logger = get_logger('my_module')
+ logger.info('This is a log message')
+ ```
+
+ ### Advanced Configuration
+
+ ```python
+ # Configure console and file log levels separately
+ configure_logging(
+     LOG_LEVEL='INFO',
+     LOG_CONSOLE_LEVEL='WARNING',  # console shows only WARNING and above
+     LOG_FILE_LEVEL='DEBUG',       # file records DEBUG and above
+     LOG_FILE='logs/app.log',
+     LOG_INCLUDE_THREAD_ID=True,   # include the thread ID
+     LOG_INCLUDE_PROCESS_ID=True   # include the process ID
+ )
+
+ # Module-specific log levels
+ configure_logging(
+     LOG_LEVEL='WARNING',
+     LOG_LEVELS={
+         'my_module.debug': 'DEBUG',
+         'my_module.info': 'INFO'
+     }
+ )
+ ```
+
+ ### Performance Monitoring
+
+ ```python
+ from crawlo.logging import get_monitor
+
+ # Enable logging performance monitoring
+ monitor = get_monitor()
+ monitor.enable_monitoring()
+
+ # Fetch a performance report
+ report = monitor.get_performance_report()
+ print(report)
+ ```
+
+ ### Log Sampling
+
+ ```python
+ from crawlo.logging import get_sampler
+
+ # Set a sampling rate (record only 30% of log messages)
+ sampler = get_sampler()
+ sampler.set_sample_rate('my_module', 0.3)
+
+ # Set a rate limit (at most 100 log messages per second)
+ sampler.set_rate_limit('my_module', 100)
+ ```
+
+ ## Windows Compatibility Notes
+
+ Log rotation on Windows can run into file-locking problems. To avoid them, install the `concurrent-log-handler` library:
+
+ ```bash
+ pip install concurrent-log-handler
+ ```
+
+ The Crawlo framework detects and uses this library automatically for better Windows compatibility.
+
+ Without `concurrent-log-handler`, running on Windows may raise the following error:
+ ```
+ PermissionError: [WinError 32] The process cannot access the file because it is being used by another process.
+ ```
+
+ ## Spider Auto-Discovery
+
+ The Crawlo framework can discover and load spiders automatically via the `SPIDER_MODULES` setting, much like Scrapy:
+
+ ```python
+ # settings.py
+ SPIDER_MODULES = [
+     'myproject.spiders',
+     'myproject.more_spiders',
+ ]
+
+ SPIDER_LOADER_WARN_ONLY = True  # warn instead of raising on loader errors
+ ```
+
+ The framework scans the configured module paths, discovering and registering the spider classes it finds.
+
+ ## Documentation
+
+ See the [documentation](https://your-docs-url.com) for more information.
+
+ ## License
+
+ MIT
{crawlo-1.4.4 → crawlo-1.4.6}/crawlo/__init__.py CHANGED
@@ -3,14 +3,17 @@
  """
  Crawlo - an asynchronous crawler framework
  """
- from typing import TYPE_CHECKING
 
- from crawlo.spider import Spider
+ # For backward compatibility, re-export the cleaners functionality from tools
+ import crawlo.tools as cleaners
+ from crawlo import tools
+ from crawlo.crawler import CrawlerProcess
+ from crawlo.downloader import DownloaderBase
  from crawlo.items import Item, Field
+ from crawlo.middleware import BaseMiddleware
  from crawlo.network.request import Request
  from crawlo.network.response import Response
- from crawlo.downloader import DownloaderBase
- from crawlo.middleware import BaseMiddleware
+ from crawlo.spider import Spider
  from crawlo.utils import (
      TimeUtils,
      parse_time,
@@ -24,21 +27,13 @@ from crawlo.utils import (
      to_local,
      from_timestamp_with_tz
  )
- from crawlo import tools
-
- # Framework core modules - use TYPE_CHECKING to avoid circular imports
- if TYPE_CHECKING:
-     from crawlo.initialization import get_framework_initializer, initialize_framework
-
- # For backward compatibility, re-export the cleaners functionality from tools
- import crawlo.tools as cleaners
 
 
  # Helper functions for deferred imports
  def get_framework_initializer():
-     """Import get_framework_initializer lazily to avoid a circular dependency"""
-     from crawlo.initialization import get_framework_initializer as _get_framework_initializer
-     return _get_framework_initializer()
+     """Import CoreInitializer lazily to avoid a circular dependency"""
+     from crawlo.initialization import CoreInitializer
+     return CoreInitializer()
 
 
  def initialize_framework(custom_settings=None):
@@ -87,6 +82,7 @@ __all__ = [
      'from_timestamp_with_tz',
      'cleaners',
      'tools',
+     'CrawlerProcess',
      'get_framework_initializer',
      'get_bootstrap_manager',
      '__version__',
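The rewritten helper above replaces the earlier `TYPE_CHECKING` import with an import performed at call time, the standard way to break a cycle between a package's `__init__` and a submodule that imports the package back. A minimal sketch of the pattern, using a hypothetical package `pkg` rather than Crawlo's actual module layout:

```python
# pkg/initialization.py (hypothetical)
# ------------------------------------
# import pkg                      # imports the package that exposes the helper below
#
# class CoreInitializer:
#     def initialize(self):
#         return 'ready'

# pkg/__init__.py (hypothetical)
# ------------------------------
def get_framework_initializer():
    """Defer the import to call time: importing pkg no longer executes
    pkg.initialization at module load, so the import cycle never forms."""
    from pkg.initialization import CoreInitializer  # resolved lazily, on first call
    return CoreInitializer()
```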
crawlo-1.4.6/crawlo/__version__.py ADDED
@@ -0,0 +1 @@
+ __version__ = '1.4.6'
{crawlo-1.4.4 → crawlo-1.4.6}/crawlo/commands/startproject.py CHANGED
@@ -92,8 +92,32 @@ def _render_template(tmpl_path, context):
      """Read the template file and replace {{key}} with the matching values from context"""
      with open(tmpl_path, 'r', encoding='utf-8') as f:
          content = f.read()
+
+     # Handle the simple filter syntax {{key|filter}}
+     import re
+
+     def apply_filter(value, filter_name):
+         if filter_name == 'title':
+             # Convert snake_case to TitleCase
+             words = value.replace('_', ' ').split()
+             return ''.join(word.capitalize() for word in words)
+         return value
+
+     # Find and replace placeholders of the form {{key|filter}}
+     pattern = r'\{\{([^}|]+)\|([^}]+)\}\}'
+     def replace_filter_match(match):
+         key = match.group(1).strip()
+         filter_name = match.group(2).strip()
+         if key in context:
+             return str(apply_filter(context[key], filter_name))
+         return match.group(0)  # keep the placeholder unchanged if the key is unknown
+
+     content = re.sub(pattern, replace_filter_match, content)
+
+     # Handle plain {{key}} placeholders
      for key, value in context.items():
          content = content.replace(f'{{{{{key}}}}}', str(value))
+
      return content
 
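To make the effect of the new `{{key|filter}}` pass concrete, here is the same substitution logic lifted into a standalone, runnable sketch; the `render` wrapper and the sample template are illustrative only, not part of the `startproject` API, which takes a template file path:

```python
import re

def apply_filter(value, filter_name):
    # 'title' converts snake_case to TitleCase, mirroring the logic added above
    if filter_name == 'title':
        words = value.replace('_', ' ').split()
        return ''.join(word.capitalize() for word in words)
    return value

def render(content, context):
    # First pass: {{key|filter}} placeholders
    pattern = r'\{\{([^}|]+)\|([^}]+)\}\}'
    def sub(match):
        key, filter_name = match.group(1).strip(), match.group(2).strip()
        return str(apply_filter(context[key], filter_name)) if key in context else match.group(0)
    content = re.sub(pattern, sub, content)
    # Second pass: plain {{key}} placeholders
    for key, value in context.items():
        content = content.replace(f'{{{{{key}}}}}', str(value))
    return content

template = "class {{project_name|title}}Spider:\n    name = '{{project_name}}'"
print(render(template, {'project_name': 'news_crawler'}))
# class NewsCrawlerSpider:
#     name = 'news_crawler'
```

Running the filter pass before the plain-placeholder pass matters: otherwise the literal `{{project_name}}` substring inside `{{project_name|title}}` would never match, but a plain replacement could still corrupt the filtered form.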