crawlo 1.3.9.tar.gz → 1.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of crawlo as potentially problematic.

Files changed (322)
  1. {crawlo-1.3.9/crawlo.egg-info → crawlo-1.4.1}/PKG-INFO +1 -1
  2. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/__init__.py +9 -4
  3. crawlo-1.4.1/crawlo/__version__.py +1 -0
  4. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/run.py +1 -1
  5. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/core/__init__.py +8 -2
  6. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/core/processor.py +11 -3
  7. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/core/scheduler.py +2 -2
  8. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/crawler.py +12 -0
  9. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/extension/__init__.py +25 -0
  10. crawlo-1.4.1/crawlo/extension/log_interval.py +95 -0
  11. crawlo-1.4.1/crawlo/extension/log_stats.py +71 -0
  12. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/initialization/__init__.py +6 -2
  13. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/middleware_manager.py +1 -1
  14. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/response_code.py +1 -14
  15. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/mode_manager.py +13 -7
  16. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/bloom_dedup_pipeline.py +5 -15
  17. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/database_dedup_pipeline.py +5 -8
  18. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/memory_dedup_pipeline.py +5 -15
  19. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/pipeline_manager.py +15 -7
  20. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/redis_dedup_pipeline.py +7 -17
  21. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/project.py +18 -7
  22. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/settings/default_settings.py +114 -150
  23. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/settings/setting_manager.py +14 -9
  24. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/distributed_coordinator.py +4 -8
  25. crawlo-1.4.1/crawlo/utils/fingerprint.py +123 -0
  26. {crawlo-1.3.9 → crawlo-1.4.1/crawlo.egg-info}/PKG-INFO +1 -1
  27. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo.egg-info/SOURCES.txt +16 -0
  28. crawlo-1.4.1/examples/test_project/__init__.py +7 -0
  29. crawlo-1.4.1/examples/test_project/run.py +35 -0
  30. crawlo-1.4.1/examples/test_project/test_project/__init__.py +4 -0
  31. crawlo-1.4.1/examples/test_project/test_project/items.py +18 -0
  32. crawlo-1.4.1/examples/test_project/test_project/middlewares.py +119 -0
  33. crawlo-1.4.1/examples/test_project/test_project/pipelines.py +97 -0
  34. crawlo-1.4.1/examples/test_project/test_project/settings.py +170 -0
  35. crawlo-1.4.1/examples/test_project/test_project/spiders/__init__.py +10 -0
  36. crawlo-1.4.1/examples/test_project/test_project/spiders/of_week_dis.py +144 -0
  37. {crawlo-1.3.9 → crawlo-1.4.1}/tests/debug_framework_logger.py +1 -1
  38. {crawlo-1.3.9 → crawlo-1.4.1}/tests/debug_log_levels.py +1 -1
  39. crawlo-1.4.1/tests/test_all_pipeline_fingerprints.py +134 -0
  40. crawlo-1.4.1/tests/test_default_header_middleware.py +314 -0
  41. crawlo-1.4.1/tests/test_fingerprint_consistency.py +136 -0
  42. crawlo-1.4.1/tests/test_fingerprint_simple.py +52 -0
  43. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_framework_logger.py +1 -1
  44. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_framework_startup.py +1 -1
  45. crawlo-1.4.1/tests/test_hash_performance.py +100 -0
  46. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_mode_change.py +1 -1
  47. crawlo-1.4.1/tests/test_offsite_middleware.py +245 -0
  48. crawlo-1.4.1/tests/test_offsite_middleware_simple.py +204 -0
  49. crawlo-1.4.1/tests/test_pipeline_fingerprint_consistency.py +87 -0
  50. crawlo-1.3.9/crawlo/__version__.py +0 -1
  51. crawlo-1.3.9/crawlo/extension/log_interval.py +0 -58
  52. crawlo-1.3.9/crawlo/extension/log_stats.py +0 -82
  53. crawlo-1.3.9/tests/test_default_header_middleware.py +0 -159
  54. crawlo-1.3.9/tests/test_offsite_middleware.py +0 -222
  55. {crawlo-1.3.9 → crawlo-1.4.1}/LICENSE +0 -0
  56. {crawlo-1.3.9 → crawlo-1.4.1}/MANIFEST.in +0 -0
  57. {crawlo-1.3.9 → crawlo-1.4.1}/README.md +0 -0
  58. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/cli.py +0 -0
  59. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/__init__.py +0 -0
  60. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/check.py +0 -0
  61. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/genspider.py +0 -0
  62. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/help.py +0 -0
  63. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/list.py +0 -0
  64. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/startproject.py +0 -0
  65. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/stats.py +0 -0
  66. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/commands/utils.py +0 -0
  67. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/config.py +0 -0
  68. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/config_validator.py +0 -0
  69. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/core/engine.py +0 -0
  70. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/data/__init__.py +0 -0
  71. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/data/user_agents.py +0 -0
  72. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/__init__.py +0 -0
  73. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/aiohttp_downloader.py +0 -0
  74. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/cffi_downloader.py +0 -0
  75. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/httpx_downloader.py +0 -0
  76. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/hybrid_downloader.py +0 -0
  77. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/playwright_downloader.py +0 -0
  78. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/downloader/selenium_downloader.py +0 -0
  79. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/event.py +0 -0
  80. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/exceptions.py +0 -0
  81. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/extension/health_check.py +0 -0
  82. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/extension/logging_extension.py +0 -0
  83. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/extension/memory_monitor.py +0 -0
  84. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/extension/performance_profiler.py +0 -0
  85. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/extension/request_recorder.py +0 -0
  86. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/factories/__init__.py +0 -0
  87. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/factories/base.py +0 -0
  88. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/factories/crawler.py +0 -0
  89. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/factories/registry.py +0 -0
  90. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/filters/__init__.py +0 -0
  91. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/filters/aioredis_filter.py +0 -0
  92. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/filters/memory_filter.py +0 -0
  93. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/framework.py +0 -0
  94. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/initialization/built_in.py +0 -0
  95. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/initialization/context.py +0 -0
  96. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/initialization/core.py +0 -0
  97. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/initialization/phases.py +0 -0
  98. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/initialization/registry.py +0 -0
  99. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/items/__init__.py +0 -0
  100. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/items/base.py +0 -0
  101. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/items/fields.py +0 -0
  102. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/items/items.py +0 -0
  103. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/logging/__init__.py +0 -0
  104. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/logging/config.py +0 -0
  105. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/logging/factory.py +0 -0
  106. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/logging/manager.py +0 -0
  107. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/__init__.py +0 -0
  108. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/default_header.py +0 -0
  109. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/download_delay.py +0 -0
  110. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/offsite.py +0 -0
  111. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/proxy.py +0 -0
  112. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/request_ignore.py +0 -0
  113. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/response_filter.py +0 -0
  114. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/retry.py +0 -0
  115. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/middleware/simple_proxy.py +0 -0
  116. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/network/__init__.py +0 -0
  117. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/network/request.py +0 -0
  118. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/network/response.py +0 -0
  119. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/__init__.py +0 -0
  120. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/console_pipeline.py +0 -0
  121. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/csv_pipeline.py +0 -0
  122. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/json_pipeline.py +0 -0
  123. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/mongo_pipeline.py +0 -0
  124. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/pipelines/mysql_pipeline.py +0 -0
  125. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/queue/__init__.py +0 -0
  126. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/queue/pqueue.py +0 -0
  127. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/queue/queue_manager.py +0 -0
  128. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/queue/redis_priority_queue.py +0 -0
  129. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/settings/__init__.py +0 -0
  130. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/spider/__init__.py +0 -0
  131. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/stats_collector.py +0 -0
  132. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/subscriber.py +0 -0
  133. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/task_manager.py +0 -0
  134. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  135. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/__init__.py.tmpl +0 -0
  136. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/items.py.tmpl +0 -0
  137. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  138. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  139. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/settings.py.tmpl +0 -0
  140. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
  141. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
  142. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
  143. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
  144. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
  145. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  146. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/run.py.tmpl +0 -0
  147. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/spider/spider.py.tmpl +0 -0
  148. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/templates/spiders_init.py.tmpl +0 -0
  149. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/__init__.py +0 -0
  150. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/anti_crawler.py +0 -0
  151. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/authenticated_proxy.py +0 -0
  152. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/data_formatter.py +0 -0
  153. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/data_validator.py +0 -0
  154. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/date_tools.py +0 -0
  155. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/encoding_converter.py +0 -0
  156. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/network_diagnostic.py +0 -0
  157. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/request_tools.py +0 -0
  158. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/retry_mechanism.py +0 -0
  159. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/scenario_adapter.py +0 -0
  160. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/tools/text_cleaner.py +0 -0
  161. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/__init__.py +0 -0
  162. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/batch_processor.py +0 -0
  163. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/class_loader.py +0 -0
  164. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/controlled_spider_mixin.py +0 -0
  165. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/db_helper.py +0 -0
  166. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/enhanced_error_handler.py +0 -0
  167. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/env_config.py +0 -0
  168. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/error_handler.py +0 -0
  169. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/func_tools.py +0 -0
  170. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/large_scale_config.py +0 -0
  171. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/large_scale_helper.py +0 -0
  172. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/log.py +0 -0
  173. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/performance_monitor.py +0 -0
  174. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/queue_helper.py +0 -0
  175. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/redis_connection_pool.py +0 -0
  176. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/redis_key_validator.py +0 -0
  177. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/request.py +0 -0
  178. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/request_serializer.py +0 -0
  179. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/spider_loader.py +0 -0
  180. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/system.py +0 -0
  181. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/tools.py +0 -0
  182. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo/utils/url.py +0 -0
  183. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo.egg-info/dependency_links.txt +0 -0
  184. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo.egg-info/entry_points.txt +0 -0
  185. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo.egg-info/requires.txt +0 -0
  186. {crawlo-1.3.9 → crawlo-1.4.1}/crawlo.egg-info/top_level.txt +0 -0
  187. {crawlo-1.3.9 → crawlo-1.4.1}/examples/__init__.py +0 -0
  188. {crawlo-1.3.9 → crawlo-1.4.1}/pyproject.toml +0 -0
  189. {crawlo-1.3.9 → crawlo-1.4.1}/requirements.txt +0 -0
  190. {crawlo-1.3.9 → crawlo-1.4.1}/setup.cfg +0 -0
  191. {crawlo-1.3.9 → crawlo-1.4.1}/tests/__init__.py +0 -0
  192. {crawlo-1.3.9 → crawlo-1.4.1}/tests/advanced_tools_example.py +0 -0
  193. {crawlo-1.3.9 → crawlo-1.4.1}/tests/authenticated_proxy_example.py +0 -0
  194. {crawlo-1.3.9 → crawlo-1.4.1}/tests/baidu_performance_test.py +0 -0
  195. {crawlo-1.3.9 → crawlo-1.4.1}/tests/baidu_test.py +0 -0
  196. {crawlo-1.3.9 → crawlo-1.4.1}/tests/cleaners_example.py +0 -0
  197. {crawlo-1.3.9 → crawlo-1.4.1}/tests/comprehensive_framework_test.py +0 -0
  198. {crawlo-1.3.9 → crawlo-1.4.1}/tests/comprehensive_test.py +0 -0
  199. {crawlo-1.3.9 → crawlo-1.4.1}/tests/comprehensive_testing_summary.md +0 -0
  200. {crawlo-1.3.9 → crawlo-1.4.1}/tests/config_validation_demo.py +0 -0
  201. {crawlo-1.3.9 → crawlo-1.4.1}/tests/controlled_spider_example.py +0 -0
  202. {crawlo-1.3.9 → crawlo-1.4.1}/tests/date_tools_example.py +0 -0
  203. {crawlo-1.3.9 → crawlo-1.4.1}/tests/debug_configure.py +0 -0
  204. {crawlo-1.3.9 → crawlo-1.4.1}/tests/debug_log_config.py +0 -0
  205. {crawlo-1.3.9 → crawlo-1.4.1}/tests/debug_pipelines.py +0 -0
  206. {crawlo-1.3.9 → crawlo-1.4.1}/tests/detailed_log_test.py +0 -0
  207. {crawlo-1.3.9 → crawlo-1.4.1}/tests/distributed_test.py +0 -0
  208. {crawlo-1.3.9 → crawlo-1.4.1}/tests/distributed_test_debug.py +0 -0
  209. {crawlo-1.3.9 → crawlo-1.4.1}/tests/dynamic_loading_example.py +0 -0
  210. {crawlo-1.3.9 → crawlo-1.4.1}/tests/dynamic_loading_test.py +0 -0
  211. {crawlo-1.3.9 → crawlo-1.4.1}/tests/env_config_example.py +0 -0
  212. {crawlo-1.3.9 → crawlo-1.4.1}/tests/error_handling_example.py +0 -0
  213. {crawlo-1.3.9 → crawlo-1.4.1}/tests/final_command_test_report.md +0 -0
  214. {crawlo-1.3.9 → crawlo-1.4.1}/tests/final_comprehensive_test.py +0 -0
  215. {crawlo-1.3.9 → crawlo-1.4.1}/tests/final_log_test.py +0 -0
  216. {crawlo-1.3.9 → crawlo-1.4.1}/tests/final_validation_test.py +0 -0
  217. {crawlo-1.3.9 → crawlo-1.4.1}/tests/fix_log_test.py +0 -0
  218. {crawlo-1.3.9 → crawlo-1.4.1}/tests/framework_performance_test.py +0 -0
  219. {crawlo-1.3.9 → crawlo-1.4.1}/tests/log_buffering_test.py +0 -0
  220. {crawlo-1.3.9 → crawlo-1.4.1}/tests/log_generation_timing_test.py +0 -0
  221. {crawlo-1.3.9 → crawlo-1.4.1}/tests/optimized_performance_test.py +0 -0
  222. {crawlo-1.3.9 → crawlo-1.4.1}/tests/performance_comparison.py +0 -0
  223. {crawlo-1.3.9 → crawlo-1.4.1}/tests/queue_blocking_test.py +0 -0
  224. {crawlo-1.3.9 → crawlo-1.4.1}/tests/queue_test.py +0 -0
  225. {crawlo-1.3.9 → crawlo-1.4.1}/tests/redis_key_validation_demo.py +0 -0
  226. {crawlo-1.3.9 → crawlo-1.4.1}/tests/request_params_example.py +0 -0
  227. {crawlo-1.3.9 → crawlo-1.4.1}/tests/response_improvements_example.py +0 -0
  228. {crawlo-1.3.9 → crawlo-1.4.1}/tests/scrapy_comparison/ofweek_scrapy.py +0 -0
  229. {crawlo-1.3.9 → crawlo-1.4.1}/tests/scrapy_comparison/scrapy_test.py +0 -0
  230. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_command_test.py +0 -0
  231. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_crawlo_test.py +0 -0
  232. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_log_test.py +0 -0
  233. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_log_test2.py +0 -0
  234. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_optimization_test.py +0 -0
  235. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_queue_type_test.py +0 -0
  236. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_spider_test.py +0 -0
  237. {crawlo-1.3.9 → crawlo-1.4.1}/tests/simple_test.py +0 -0
  238. {crawlo-1.3.9 → crawlo-1.4.1}/tests/spider_log_timing_test.py +0 -0
  239. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_advanced_tools.py +0 -0
  240. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_all_commands.py +0 -0
  241. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_all_redis_key_configs.py +0 -0
  242. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_authenticated_proxy.py +0 -0
  243. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_batch_processor.py +0 -0
  244. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_cleaners.py +0 -0
  245. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_component_factory.py +0 -0
  246. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_comprehensive.py +0 -0
  247. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_config_consistency.py +0 -0
  248. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_config_merge.py +0 -0
  249. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_config_validator.py +0 -0
  250. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_controlled_spider_mixin.py +0 -0
  251. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_crawlo_proxy_integration.py +0 -0
  252. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_date_tools.py +0 -0
  253. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_dedup_fix.py +0 -0
  254. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_distributed.py +0 -0
  255. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_double_crawlo_fix.py +0 -0
  256. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_double_crawlo_fix_simple.py +0 -0
  257. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_download_delay_middleware.py +0 -0
  258. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_downloader_proxy_compatibility.py +0 -0
  259. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_dynamic_downloaders_proxy.py +0 -0
  260. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_dynamic_proxy.py +0 -0
  261. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_dynamic_proxy_config.py +0 -0
  262. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_dynamic_proxy_real.py +0 -0
  263. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_edge_cases.py +0 -0
  264. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_enhanced_error_handler.py +0 -0
  265. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_enhanced_error_handler_comprehensive.py +0 -0
  266. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_env_config.py +0 -0
  267. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_error_handler_compatibility.py +0 -0
  268. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_factories.py +0 -0
  269. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_final_validation.py +0 -0
  270. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_framework_env_usage.py +0 -0
  271. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_get_component_logger.py +0 -0
  272. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_integration.py +0 -0
  273. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_item_dedup_redis_key.py +0 -0
  274. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_large_scale_config.py +0 -0
  275. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_large_scale_helper.py +0 -0
  276. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_logging_system.py +0 -0
  277. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_mode_consistency.py +0 -0
  278. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_parsel.py +0 -0
  279. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_performance.py +0 -0
  280. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_performance_monitor.py +0 -0
  281. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_api.py +0 -0
  282. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_health_check.py +0 -0
  283. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_middleware.py +0 -0
  284. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_middleware_enhanced.py +0 -0
  285. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_middleware_integration.py +0 -0
  286. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_middleware_refactored.py +0 -0
  287. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_providers.py +0 -0
  288. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_stats.py +0 -0
  289. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_proxy_strategies.py +0 -0
  290. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_queue_empty_check.py +0 -0
  291. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_queue_manager_double_crawlo.py +0 -0
  292. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_queue_manager_redis_key.py +0 -0
  293. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_queue_naming.py +0 -0
  294. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_queue_type.py +0 -0
  295. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_random_user_agent.py +0 -0
  296. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_real_scenario_proxy.py +0 -0
  297. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_redis_config.py +0 -0
  298. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_redis_connection_pool.py +0 -0
  299. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_redis_key_naming.py +0 -0
  300. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_redis_key_validator.py +0 -0
  301. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_redis_queue.py +0 -0
  302. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_redis_queue_name_fix.py +0 -0
  303. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_request_ignore_middleware.py +0 -0
  304. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_request_params.py +0 -0
  305. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_request_serialization.py +0 -0
  306. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_response_code_middleware.py +0 -0
  307. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_response_filter_middleware.py +0 -0
  308. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_response_improvements.py +0 -0
  309. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_retry_middleware.py +0 -0
  310. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_scheduler.py +0 -0
  311. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_scheduler_config_update.py +0 -0
  312. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_simple_response.py +0 -0
  313. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_telecom_spider_redis_key.py +0 -0
  314. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_template_content.py +0 -0
  315. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_template_redis_key.py +0 -0
  316. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_tools.py +0 -0
  317. {crawlo-1.3.9 → crawlo-1.4.1}/tests/test_user_agents.py +0 -0
  318. {crawlo-1.3.9 → crawlo-1.4.1}/tests/tools_example.py +0 -0
  319. {crawlo-1.3.9 → crawlo-1.4.1}/tests/untested_features_report.md +0 -0
  320. {crawlo-1.3.9 → crawlo-1.4.1}/tests/verify_debug.py +0 -0
  321. {crawlo-1.3.9 → crawlo-1.4.1}/tests/verify_distributed.py +0 -0
  322. {crawlo-1.3.9 → crawlo-1.4.1}/tests/verify_log_fix.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.3.9
+ Version: 1.4.1
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder

crawlo/__init__.py
@@ -28,30 +28,35 @@ from crawlo import tools
  
  # 框架核心模块 - 使用TYPE_CHECKING避免循环导入
  if TYPE_CHECKING:
-     from crawlo.core.framework_initializer import get_framework_initializer, initialize_framework
+     from crawlo.initialization import get_framework_initializer, initialize_framework
  
  # 为了向后兼容,从tools中导入cleaners相关的功能
  import crawlo.tools as cleaners
  
+ 
  # 延迟导入的辅助函数
  def get_framework_initializer():
      """延迟导入get_framework_initializer以避免循环依赖"""
-     from crawlo.core.framework_initializer import get_framework_initializer as _get_framework_initializer
+     from crawlo.initialization import get_framework_initializer as _get_framework_initializer
      return _get_framework_initializer()
  
+ 
  def initialize_framework(custom_settings=None):
      """延迟导入initialize_framework以避免循环依赖"""
-     from crawlo.core.framework_initializer import initialize_framework as _initialize_framework
+     from crawlo.initialization import initialize_framework as _initialize_framework
      return _initialize_framework(custom_settings)
  
+ 
  # 向后兼容的别名
  def get_bootstrap_manager():
      """向后兼容的别名"""
      return get_framework_initializer()
  
+ 
  # 版本号:优先从元数据读取
  try:
      from importlib.metadata import version
+ 
      __version__ = version("crawlo")
  except Exception:
      # 开发模式下可能未安装,回退到 __version__.py 或 dev
@@ -85,4 +90,4 @@ __all__ = [
      'get_framework_initializer',
      'get_bootstrap_manager',
      '__version__',
- ]
+ ]

crawlo/__version__.py (new file)
@@ -0,0 +1 @@
+ __version__ = '1.4.1'

crawlo/commands/run.py
@@ -23,7 +23,7 @@ from crawlo.crawler import CrawlerProcess
  from crawlo.project import get_settings, _find_project_root
  # 使用新的统一初始化系统
  from crawlo.initialization import initialize_framework
- from crawlo.core import get_framework_initializer
+ from crawlo.initialization import get_framework_initializer
  from crawlo.utils.log import get_logger
  
  # 延迟获取logger,确保在日志系统配置之后获取

crawlo/core/__init__.py
@@ -10,37 +10,43 @@ from ..initialization import (
      is_framework_ready
  )
  
+ 
  # 向后兼容的别名
  def async_initialize_framework(*args, **kwargs):
      """Async wrapper for framework initialization"""
      return initialize_framework(*args, **kwargs)
  
+ 
  def get_framework_initializer():
      """Get framework initializer - compatibility function"""
      from ..initialization.core import CoreInitializer
      return CoreInitializer()
  
+ 
  def get_framework_logger(name='crawlo.core'):
      """Get framework logger - compatibility function"""
      from ..logging import get_logger
      return get_logger(name)
  
+ 
  # 向后兼容
  def bootstrap_framework(*args, **kwargs):
      """Bootstrap framework - compatibility function"""
      return initialize_framework(*args, **kwargs)
  
+ 
  def get_bootstrap_manager():
      """Get bootstrap manager - compatibility function"""
      return get_framework_initializer()
  
+ 
  __all__ = [
      'initialize_framework',
-     'async_initialize_framework',
+     'async_initialize_framework',
      'get_framework_initializer',
      'is_framework_ready',
      'get_framework_logger',
      # 向后兼容
      'bootstrap_framework',
      'get_bootstrap_manager'
- ]
+ ]

crawlo/core/processor.py
@@ -1,10 +1,12 @@
  #!/usr/bin/python
  # -*- coding:UTF-8 -*-
- from asyncio import Queue
+ from asyncio import Queue, create_task
  from typing import Union, Optional
  
  from crawlo import Request, Item
  from crawlo.pipelines.pipeline_manager import PipelineManager
+ from crawlo.exceptions import ItemDiscard
+ from crawlo.event import item_discard
  
  
  class Processor(object):
@@ -27,7 +29,13 @@ class Processor(object):
              await self._process_item(result)
  
      async def _process_item(self, item):
-         await self.pipelines.process_item(item=item)
+         try:
+             await self.pipelines.process_item(item=item)
+         except ItemDiscard as exc:
+             # Item was discarded by a pipeline (e.g., deduplication pipeline)
+             # We simply ignore this item and don't pass it to subsequent pipelines
+             # The statistics system has already been notified in PipelineManager, so we don't need to notify again
+             pass
  
      async def enqueue(self, output: Union[Request, Item]):
          await self.queue.put(output)
@@ -37,4 +45,4 @@
          return len(self) == 0
  
      def __len__(self):
-         return self.queue.qsize()
+         return self.queue.qsize()
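
The reworked _process_item above swallows ItemDiscard, which dedup pipelines raise for repeated items. A minimal sketch of a pipeline that exercises this path; the class name and the fingerprint choice are illustrative, not part of the package:

# Illustrative only: a dedup pipeline that raises ItemDiscard for duplicates.
from crawlo.exceptions import ItemDiscard


class DemoDedupPipeline:
    def __init__(self):
        self.seen = set()

    async def process_item(self, item, spider):
        key = repr(item)  # a real pipeline would build a proper fingerprint
        if key in self.seen:
            # PipelineManager notifies the item_discard event and re-raises;
            # Processor._process_item then silently drops the item.
            raise ItemDiscard('duplicate item')
        self.seen.add(key)
        return item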

crawlo/core/scheduler.py
@@ -77,8 +77,8 @@ class Scheduler:
          # 只有在确实需要更新配置时才重新创建过滤器实例
          # 检查是否真的进行了配置更新
          filter_updated = (
-             (self.queue_manager._queue_type == QueueType.REDIS and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
-             (self.queue_manager._queue_type == QueueType.MEMORY and ('aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '') or 'redis_filter' in self.crawler.settings.get('FILTER_CLASS', '')))
+             (self.queue_manager._queue_type == QueueType.REDIS and 'aioredis_filter' in self.crawler.settings.get('FILTER_CLASS', '')) or
+             (self.queue_manager._queue_type == QueueType.MEMORY and 'memory_filter' in self.crawler.settings.get('FILTER_CLASS', ''))
          )
  
          if needs_config_update or filter_updated:

crawlo/crawler.py
@@ -308,6 +308,18 @@ class ModernCrawler:
          except Exception as e:
              self._logger.warning(f"Spider cleanup failed: {e}")
  
+         # 调用StatsCollector的close_spider方法,设置reason和spider_name
+         if self._stats and hasattr(self._stats, 'close_spider'):
+             try:
+                 # 使用默认的'finished'作为reason
+                 self._stats.close_spider(self._spider, reason='finished')
+             except Exception as e:
+                 self._logger.warning(f"Stats close_spider failed: {e}")
+ 
+         # 触发spider_closed事件,通知所有订阅者(包括扩展)
+         # 传递reason参数,这里使用默认的'finished'作为reason
+         await self.subscriber.notify("spider_closed", reason='finished')
+ 
          if self._stats and hasattr(self._stats, 'close'):
              try:
                  close_result = self._stats.close()

crawlo/extension/__init__.py
@@ -16,6 +16,7 @@ class ExtensionManager(object):
          extensions = self.crawler.settings.get_list('EXTENSIONS')
          self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
          self._add_extensions(extensions)
+         self._subscribe_extensions()
  
      @classmethod
      def create_instance(cls, *args: Any, **kwargs: Any) -> 'ExtensionManager':
@@ -37,3 +38,27 @@
          if extensions:
              # 恢复INFO级别日志,保留关键的启用信息
              self.logger.info(f"Enabled extensions: \n{pformat(extensions)}")
+ 
+     def _subscribe_extensions(self) -> None:
+         """订阅扩展方法到相应的事件"""
+         for extension in self.extensions:
+             # 订阅 spider_closed 方法
+             if hasattr(extension, 'spider_closed'):
+                 self.crawler.subscriber.subscribe(extension.spider_closed, event="spider_closed")
+ 
+             # 订阅 item_successful 方法
+             if hasattr(extension, 'item_successful'):
+                 self.crawler.subscriber.subscribe(extension.item_successful, event="item_successful")
+ 
+             # 订阅 item_discard 方法
+             if hasattr(extension, 'item_discard'):
+                 self.crawler.subscriber.subscribe(extension.item_discard, event="item_discard")
+ 
+             # 订阅 response_received 方法
+             if hasattr(extension, 'response_received'):
+                 # 修复:将事件名称从 "request_received" 更正为 "response_received"
+                 self.crawler.subscriber.subscribe(extension.response_received, event="response_received")
+ 
+             # 订阅 request_scheduled 方法
+             if hasattr(extension, 'request_scheduled'):
+                 self.crawler.subscriber.subscribe(extension.request_scheduled, event="request_scheduled")
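
With _subscribe_extensions in place, an extension only needs to define methods named after the events and the manager wires them up through crawler.subscriber.subscribe. A hedged sketch under those assumptions; the class name, stat key, and use of create_instance mirror LogIntervalExtension but are illustrative:

class CounterExtension:
    """Illustrative extension; hook methods are auto-subscribed by ExtensionManager."""

    def __init__(self, crawler):
        self.stats = crawler.stats

    @classmethod
    def create_instance(cls, crawler):
        return cls(crawler)

    async def item_successful(self, item, spider):
        self.stats.inc_value('counter/items')   # fired on the item_successful event

    async def spider_closed(self, reason='finished'):
        pass                                    # fired once on the spider_closed event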

crawlo/extension/log_interval.py (new file)
@@ -0,0 +1,95 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from typing import Any, Optional
+ 
+ from crawlo.utils.log import get_logger
+ from crawlo.event import spider_opened, spider_closed
+ 
+ 
+ class LogIntervalExtension(object):
+ 
+     def __init__(self, crawler: Any):
+         self.task: Optional[asyncio.Task] = None
+         self.stats = crawler.stats
+         self.item_count = 0
+         self.response_count = 0
+         self.seconds = crawler.settings.get('INTERVAL', 60)  # 默认60秒
+ 
+         # 修复时间单位计算逻辑
+         if self.seconds % 60 == 0:
+             self.interval = int(self.seconds / 60)
+             self.unit = 'min'
+         else:
+             self.interval = self.seconds
+             self.unit = 's'
+ 
+         # 处理单数情况
+         if self.interval == 1 and self.unit == 'min':
+             self.interval_display = ""
+         else:
+             self.interval_display = str(self.interval)
+ 
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+         self.logger.info(f"LogIntervalExtension initialized with interval: {self.seconds} seconds")
+ 
+     @classmethod
+     def create_instance(cls, crawler: Any) -> 'LogIntervalExtension':
+         o = cls(crawler)
+         crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
+         crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+         return o
+ 
+     async def spider_opened(self) -> None:
+         self.logger.info("Spider opened, starting interval logging task")
+         self.task = asyncio.create_task(self.interval_log())
+         self.logger.info("Interval logging task started")
+ 
+     async def spider_closed(self) -> None:
+         self.logger.info("Spider closed, stopping interval logging task")
+         if self.task:
+             self.task.cancel()
+             try:
+                 await self.task
+             except asyncio.CancelledError:
+                 pass
+             self.task = None
+ 
+     async def interval_log(self) -> None:
+         iteration = 0
+         while True:
+             try:
+                 iteration += 1
+                 self.logger.debug(f"Interval log iteration {iteration} starting")
+                 last_item_count = self.stats.get_value('item_successful_count', default=0)
+                 last_response_count = self.stats.get_value('response_received_count', default=0)
+                 item_rate = last_item_count - self.item_count
+                 response_rate = last_response_count - self.response_count
+ 
+                 # 添加调试信息
+                 self.logger.debug(f"Debug info - Iteration: {iteration}, Last item count: {last_item_count}, Last response count: {last_response_count}")
+                 self.logger.debug(f"Debug info - Previous item count: {self.item_count}, Previous response count: {self.response_count}")
+                 self.logger.debug(f"Debug info - Item rate: {item_rate}, Response rate: {response_rate}")
+ 
+                 self.item_count, self.response_count = last_item_count, last_response_count
+ 
+                 # 修复效率计算,确保使用正确的单位
+                 if self.unit == 'min' and self.seconds > 0:
+                     # 转换为每分钟速率
+                     pages_per_min = response_rate * 60 / self.seconds if self.seconds > 0 else 0
+                     items_per_min = item_rate * 60 / self.seconds if self.seconds > 0 else 0
+                     self.logger.info(
+                         f'Crawled {last_response_count} pages (at {pages_per_min:.0f} pages/min),'
+                         f' Got {last_item_count} items (at {items_per_min:.0f} items/min).'
+                     )
+                 else:
+                     # 使用原始单位
+                     self.logger.info(
+                         f'Crawled {last_response_count} pages (at {response_rate} pages/{self.interval_display}{self.unit}),'
+                         f' Got {last_item_count} items (at {item_rate} items/{self.interval_display}{self.unit}).'
+                     )
+                 self.logger.debug(f"Interval log iteration {iteration} completed, sleeping for {self.seconds} seconds")
+                 await asyncio.sleep(self.seconds)
+             except Exception as e:
+                 self.logger.error(f"Error in interval logging: {e}")
+                 await asyncio.sleep(self.seconds)  # 即使出错也继续执行
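
A worked example of the interval/unit logic above, with values picked purely for illustration:

seconds = 120                      # INTERVAL = 120
if seconds % 60 == 0:
    interval, unit = int(seconds / 60), 'min'   # -> 2, 'min'
else:
    interval, unit = seconds, 's'

response_rate = 300                # responses counted in the last window
pages_per_min = response_rate * 60 / seconds    # 300 * 60 / 120 = 150.0
print(f'at {pages_per_min:.0f} pages/min')      # logged as "at 150 pages/min"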

crawlo/extension/log_stats.py (new file)
@@ -0,0 +1,71 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ 日志统计扩展
+ 提供详细的爬虫运行统计信息
+ """
+ import asyncio
+ from typing import Any
+ 
+ from crawlo.utils.log import get_logger
+ from crawlo.utils import now, time_diff
+ 
+ 
+ class LogStats:
+     """
+     日志统计扩展,记录和输出爬虫运行过程中的各种统计信息
+     """
+ 
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+         self._stats = crawler.stats
+         self._stats['start_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
+ 
+     @classmethod
+     def from_crawler(cls, crawler):
+         return cls(crawler)
+ 
+     @classmethod
+     def create_instance(cls, crawler):
+         return cls.from_crawler(crawler)
+ 
+     async def spider_closed(self, reason: str = 'finished') -> None:
+         try:
+             self._stats['end_time'] = now(fmt='%Y-%m-%d %H:%M:%S')
+             self._stats['cost_time(s)'] = time_diff(start=self._stats['start_time'], end=self._stats['end_time'])
+             self._stats['reason'] = reason
+         except Exception as e:
+             # 添加日志以便调试
+             self.logger.error(f"Error in spider_closed: {e}")
+             # 静默处理,避免影响爬虫运行
+             pass
+ 
+     async def item_successful(self, _item: Any, _spider: Any) -> None:
+         try:
+             self._stats.inc_value('item_successful_count')
+         except Exception as e:
+             # 静默处理,避免影响爬虫运行
+             pass
+ 
+     async def item_discard(self, _item: Any, exc: Any, _spider: Any) -> None:
+         try:
+             # 只增加总的丢弃计数,不记录每个丢弃项目的原因详情
+             self._stats.inc_value('item_discard_count')
+         except Exception as e:
+             # 静默处理,避免影响爬虫运行
+             pass
+ 
+     async def response_received(self, _response: Any, _spider: Any) -> None:
+         try:
+             self._stats.inc_value('response_received_count')
+         except Exception as e:
+             # 静默处理,避免影响爬虫运行
+             pass
+ 
+     async def request_scheduled(self, _request: Any, _spider: Any) -> None:
+         try:
+             self._stats.inc_value('request_scheduler_count')
+         except Exception as e:
+             # 静默处理,避免影响爬虫运行
+             pass
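
Taken together, LogStats maintains roughly the following keys in crawler.stats; the values below are only an example run, not output from the package:

# Illustrative shape of the stats collected by LogStats:
stats = {
    'start_time': '2024-01-01 10:00:00',
    'end_time': '2024-01-01 10:05:00',
    'cost_time(s)': 300,
    'reason': 'finished',
    'request_scheduler_count': 120,
    'response_received_count': 118,
    'item_successful_count': 95,
    'item_discard_count': 3,
}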

crawlo/initialization/__init__.py
@@ -16,25 +16,29 @@ from .context import InitializationContext
  from .core import CoreInitializer
  from .phases import InitializationPhase
  
+ 
  # 公共接口
  def initialize_framework(settings=None, **kwargs):
      """初始化框架的主要入口"""
      return CoreInitializer().initialize(settings, **kwargs)
  
+ 
  def is_framework_ready():
      """检查框架是否已准备就绪"""
      return CoreInitializer().is_ready
  
+ 
  def get_framework_context():
      """获取框架初始化上下文"""
      return CoreInitializer().context
  
+ 
  __all__ = [
      'InitializerRegistry',
-     'InitializationContext',
+     'InitializationContext',
      'CoreInitializer',
      'InitializationPhase',
      'initialize_framework',
      'is_framework_ready',
      'get_framework_context'
- ]
+ ]

crawlo/middleware/middleware_manager.py
@@ -86,7 +86,7 @@ class MiddlewareManager:
              response = await self._process_exception(request, exp)
          else:
              create_task(self.crawler.subscriber.notify(response_received, response, self.crawler.spider))
-             # self.crawler.stats.inc_value('response_received_count')
+             self._stats.inc_value('response_received_count')
              if isinstance(response, Response):
                  response = await self._process_response(request, response)
              if isinstance(response, Request):

crawlo/middleware/response_code.py
@@ -127,9 +127,7 @@ class ResponseCodeMiddleware(object):
          """
          status_code = response.status_code
  
-         # 记录具体状态码统计
-         self.stats.inc_value(f'response_status_code/count/{status_code}')
- 
+         # 只记录总的统计信息,不记录每个域名和每个状态码的详细信息
          # 记录状态码分类统计
          category = self._get_status_category(status_code)
          self.stats.inc_value(f'response_status_code/category/{category}')
@@ -144,17 +142,6 @@
          if hasattr(response, 'content_length') and response.content_length:
              self.stats.inc_value('response_total_bytes', response.content_length)
  
-         # 记录域名统计
-         try:
-             from urllib.parse import urlparse
-             parsed_url = urlparse(response.url)
-             domain = parsed_url.netloc
-             if domain:
-                 self.stats.inc_value(f'response_status_code/domain/{domain}/count/{status_code}')
-                 self.stats.inc_value(f'response_status_code/domain/{domain}/category/{category}')
-         except Exception:
-             self.stats.inc_value('response_status_code/domain/invalid_url/count/{status_code}')
- 
          # 详细日志记录
          self.logger.debug(
              f'收到响应: {status_code} {response.url} '

crawlo/mode_manager.py
@@ -7,7 +7,7 @@
  
  支持的运行模式:
  1. standalone - 单机模式(默认)
- 2. distributed - 分布式模式
+ 2. distributed - 分布式模式
  3. auto - 自动检测模式
  """
  import os
@@ -29,7 +29,7 @@ class ModeManager:
          # 延迟初始化logger,避免循环依赖
          self._logger = None
          self._debug("运行模式管理器初始化完成")
- 
+ 
      def _get_logger(self):
          """延迟获取logger实例"""
          if self._logger is None:
@@ -40,7 +40,7 @@
                  # 如果日志系统尚未初始化,返回None
                  pass
          return self._logger
- 
+ 
      def _debug(self, message: str):
          """调试日志"""
          logger = self._get_logger()
@@ -73,7 +73,7 @@ class ModeManager:
              redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
          else:
              redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
- 
+ 
          return {
              'QUEUE_TYPE': 'redis',
              'FILTER_CLASS': 'crawlo.filters.aioredis_filter.AioRedisFilter',
@@ -119,6 +119,7 @@ class ModeManager:
  
          if mode == RunMode.STANDALONE:
              mode_info = "使用单机模式 - 简单快速,适合开发和中小规模爬取"
+             # 对于单机模式,如果用户设置了QUEUE_TYPE为'auto',应该保留用户的设置
              settings = self.get_standalone_settings()
              self._debug("应用单机模式配置")
  
@@ -142,8 +143,13 @@
              raise ValueError(f"不支持的运行模式: {mode}")
  
          # 合并用户自定义配置
-         user_settings = {k: v for k, v in kwargs.items()
-                          if k not in ['redis_host', 'redis_port', 'redis_password', 'project_name']}
+         user_settings = {
+             k: v for k,
+             v in kwargs.items() if k not in [
+                 'redis_host',
+                 'redis_port',
+                 'redis_password',
+                 'project_name']}
          settings.update(user_settings)
          self._debug(f"合并用户自定义配置: {list(user_settings.keys())}")
  
@@ -210,4 +216,4 @@ def auto_mode(**kwargs) -> Dict[str, Any]:
  def from_env(default_mode: str = 'standalone') -> Dict[str, Any]:
      """从环境变量创建配置"""
      # 移除直接使用 os.getenv(),要求通过 settings 配置
-     raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
+     raise RuntimeError("环境变量配置已移除,请在 settings 中配置相关参数")
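
For reference, the Redis URL construction shown in the @@ -73 hunk produces URLs of this shape; the host, password, and database values below are examples only:

redis_host, redis_port, redis_db = '127.0.0.1', 6379, 0
redis_password = 'secret'
if redis_password:
    redis_url = f'redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}'
else:
    redis_url = f'redis://{redis_host}:{redis_port}/{redis_db}'
assert redis_url == 'redis://:secret@127.0.0.1:6379/0'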

crawlo/pipelines/bloom_dedup_pipeline.py
@@ -38,6 +38,7 @@ except ImportError:
  
  from crawlo import Item
  from crawlo.spider import Spider
+ from crawlo.utils.fingerprint import FingerprintGenerator
  from crawlo.utils.log import get_logger
  from crawlo.exceptions import DropItem, ItemDiscard
  
@@ -109,6 +110,9 @@ class BloomDedupPipeline:
                  self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
                  return item
  
+         except ItemDiscard:
+             # 重新抛出ItemDiscard异常,确保管道管理器能正确处理
+             raise
          except Exception as e:
              self.logger.error(f"Error processing item: {e}")
              # 在错误时继续处理,避免丢失数据
@@ -123,21 +127,7 @@
          :param item: 数据项
          :return: 指纹字符串
          """
-         # 将数据项转换为可序列化的字典
-         try:
-             item_dict = item.to_dict()
-         except AttributeError:
-             # 兼容没有to_dict方法的Item实现
-             item_dict = dict(item)
- 
-         # 对字典进行排序以确保一致性
-         sorted_items = sorted(item_dict.items())
- 
-         # 生成指纹字符串
-         fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
- 
-         # 使用 SHA256 生成固定长度的指纹
-         return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+         return FingerprintGenerator.item_fingerprint(item)
  
      def close_spider(self, spider: Spider) -> None:
          """

crawlo/pipelines/database_dedup_pipeline.py
@@ -17,6 +17,7 @@ import aiomysql
  from crawlo import Item
  from crawlo.exceptions import DropItem, ItemDiscard
  from crawlo.spider import Spider
+ from crawlo.utils.fingerprint import FingerprintGenerator
  from crawlo.utils.log import get_logger
  
  
@@ -140,6 +141,9 @@ class DatabaseDedupPipeline:
                  self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
                  return item
  
+         except ItemDiscard:
+             # 重新抛出ItemDiscard异常,确保管道管理器能正确处理
+             raise
          except Exception as e:
              self.logger.error(f"Error processing item: {e}")
              # 在错误时继续处理,避免丢失数据
@@ -190,11 +194,4 @@ class DatabaseDedupPipeline:
          :param item: 数据项
          :return: 指纹字符串
          """
-         # 将数据项转换为可序列化的字典
-         try:
-             item_dict = item.to_dict()
-         except AttributeError:
-             # 兼容没有to_dict方法的Item实现
-             item_dict = dict(item)
- 
-         # 对字典进行排序以确保一致性
+         return FingerprintGenerator.item_fingerprint(item)

crawlo/pipelines/memory_dedup_pipeline.py
@@ -18,6 +18,7 @@ from typing import Set
  from crawlo import Item
  from crawlo.exceptions import DropItem, ItemDiscard
  from crawlo.spider import Spider
+ from crawlo.utils.fingerprint import FingerprintGenerator
  from crawlo.utils.log import get_logger
  
  
@@ -71,6 +72,9 @@ class MemoryDedupPipeline:
                  self.logger.debug(f"Processing new item: {fingerprint[:20]}...")
                  return item
  
+         except ItemDiscard:
+             # 重新抛出ItemDiscard异常,确保管道管理器能正确处理
+             raise
          except Exception as e:
              self.logger.error(f"Error processing item: {e}")
              # 在错误时继续处理,避免丢失数据
@@ -85,21 +89,7 @@
          :param item: 数据项
          :return: 指纹字符串
          """
-         # 将数据项转换为可序列化的字典
-         try:
-             item_dict = item.to_dict()
-         except AttributeError:
-             # 兼容没有to_dict方法的Item实现
-             item_dict = dict(item)
- 
-         # 对字典进行排序以确保一致性
-         sorted_items = sorted(item_dict.items())
- 
-         # 生成指纹字符串
-         fingerprint_string = '|'.join([f"{k}={v}" for k, v in sorted_items if v is not None])
- 
-         # 使用 SHA256 生成固定长度的指纹
-         return hashlib.sha256(fingerprint_string.encode('utf-8')).hexdigest()
+         return FingerprintGenerator.item_fingerprint(item)
  
      def close_spider(self, spider: Spider) -> None:
          """

crawlo/pipelines/pipeline_manager.py
@@ -66,11 +66,19 @@ class PipelineManager:
  
      async def process_item(self, item):
          try:
-             for method in self.methods:
-                 item = await common_call(method, item, self.crawler.spider)
-                 if item is None:
-                     raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
-         except (ItemDiscard, DropItem) as exc:  # 同时捕获两种异常类型
-             create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+             for i, method in enumerate(self.methods):
+                 self.logger.debug(f"Processing item with pipeline method {i}: {method.__qualname__}")
+                 try:
+                     item = await common_call(method, item, self.crawler.spider)
+                     if item is None:
+                         raise InvalidOutputError(f"{method.__qualname__} return None is not supported.")
+                 except (ItemDiscard, DropItem) as exc:  # 同时捕获两种异常类型
+                     self.logger.debug(f"Item discarded by pipeline: {exc}")
+                     create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+                     # 重新抛出异常,确保上层调用者也能捕获到,并停止执行后续管道
+                     raise
+         except (ItemDiscard, DropItem):
+             # 异常已经被处理和通知,这里只需要重新抛出
+             raise
          else:
              create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))