crawlo 1.2.9__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (228)
  1. {crawlo-1.2.9/crawlo.egg-info → crawlo-1.3.0}/PKG-INFO +1 -1
  2. crawlo-1.3.0/crawlo/__version__.py +1 -0
  3. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/run.py +26 -35
  4. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/engine.py +1 -2
  5. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/crawler.py +48 -53
  6. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/logging_extension.py +4 -2
  7. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/middleware_manager.py +1 -1
  8. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/mode_manager.py +37 -36
  9. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/pipeline_manager.py +13 -1
  10. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/project.py +28 -34
  11. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/settings/setting_manager.py +31 -19
  12. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/log.py +20 -61
  13. {crawlo-1.2.9 → crawlo-1.3.0/crawlo.egg-info}/PKG-INFO +1 -1
  14. crawlo-1.2.9/crawlo/__version__.py +0 -1
  15. {crawlo-1.2.9 → crawlo-1.3.0}/LICENSE +0 -0
  16. {crawlo-1.2.9 → crawlo-1.3.0}/MANIFEST.in +0 -0
  17. {crawlo-1.2.9 → crawlo-1.3.0}/README.md +0 -0
  18. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/__init__.py +0 -0
  19. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/cli.py +0 -0
  20. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/__init__.py +0 -0
  21. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/check.py +0 -0
  22. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/genspider.py +0 -0
  23. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/help.py +0 -0
  24. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/list.py +0 -0
  25. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/startproject.py +0 -0
  26. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/stats.py +0 -0
  27. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/utils.py +0 -0
  28. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/config.py +0 -0
  29. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/config_validator.py +0 -0
  30. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/__init__.py +0 -0
  31. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/processor.py +0 -0
  32. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/scheduler.py +0 -0
  33. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/data/__init__.py +0 -0
  34. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/data/user_agents.py +0 -0
  35. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/__init__.py +0 -0
  36. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/aiohttp_downloader.py +0 -0
  37. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/cffi_downloader.py +0 -0
  38. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/httpx_downloader.py +0 -0
  39. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/hybrid_downloader.py +0 -0
  40. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/playwright_downloader.py +0 -0
  41. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/downloader/selenium_downloader.py +0 -0
  42. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/event.py +0 -0
  43. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/exceptions.py +0 -0
  44. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/__init__.py +0 -0
  45. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/health_check.py +0 -0
  46. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/log_interval.py +0 -0
  47. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/log_stats.py +0 -0
  48. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/memory_monitor.py +0 -0
  49. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/performance_profiler.py +0 -0
  50. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/request_recorder.py +0 -0
  51. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/filters/__init__.py +0 -0
  52. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/filters/aioredis_filter.py +0 -0
  53. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/filters/memory_filter.py +0 -0
  54. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/__init__.py +0 -0
  55. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/base.py +0 -0
  56. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/fields.py +0 -0
  57. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/items/items.py +0 -0
  58. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/__init__.py +0 -0
  59. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/default_header.py +0 -0
  60. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/download_delay.py +0 -0
  61. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/offsite.py +0 -0
  62. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/proxy.py +0 -0
  63. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/request_ignore.py +0 -0
  64. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/response_code.py +0 -0
  65. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/response_filter.py +0 -0
  66. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/retry.py +0 -0
  67. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/simple_proxy.py +0 -0
  68. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/network/__init__.py +0 -0
  69. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/network/request.py +0 -0
  70. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/network/response.py +0 -0
  71. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/__init__.py +0 -0
  72. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  73. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/console_pipeline.py +0 -0
  74. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/csv_pipeline.py +0 -0
  75. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  76. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/json_pipeline.py +0 -0
  77. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  78. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/mongo_pipeline.py +0 -0
  79. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/mysql_pipeline.py +0 -0
  80. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  81. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/__init__.py +0 -0
  82. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/pqueue.py +0 -0
  83. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/queue_manager.py +0 -0
  84. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/queue/redis_priority_queue.py +0 -0
  85. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/settings/__init__.py +0 -0
  86. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/settings/default_settings.py +0 -0
  87. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/spider/__init__.py +0 -0
  88. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/stats_collector.py +0 -0
  89. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/subscriber.py +0 -0
  90. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/task_manager.py +0 -0
  91. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  92. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/__init__.py.tmpl +0 -0
  93. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/items.py.tmpl +0 -0
  94. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  95. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  96. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings.py.tmpl +0 -0
  97. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
  98. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
  99. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
  100. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
  101. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
  102. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  103. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/run.py.tmpl +0 -0
  104. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/templates/spider/spider.py.tmpl +0 -0
  105. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/__init__.py +0 -0
  106. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/anti_crawler.py +0 -0
  107. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/authenticated_proxy.py +0 -0
  108. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/data_formatter.py +0 -0
  109. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/data_validator.py +0 -0
  110. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/date_tools.py +0 -0
  111. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/distributed_coordinator.py +0 -0
  112. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/encoding_converter.py +0 -0
  113. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/request_tools.py +0 -0
  114. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/retry_mechanism.py +0 -0
  115. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/scenario_adapter.py +0 -0
  116. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/tools/text_cleaner.py +0 -0
  117. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/__init__.py +0 -0
  118. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/batch_processor.py +0 -0
  119. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/controlled_spider_mixin.py +0 -0
  120. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/db_helper.py +0 -0
  121. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/enhanced_error_handler.py +0 -0
  122. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/env_config.py +0 -0
  123. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/error_handler.py +0 -0
  124. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/func_tools.py +0 -0
  125. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/large_scale_config.py +0 -0
  126. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/large_scale_helper.py +0 -0
  127. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/performance_monitor.py +0 -0
  128. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/queue_helper.py +0 -0
  129. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/redis_connection_pool.py +0 -0
  130. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/redis_key_validator.py +0 -0
  131. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/request.py +0 -0
  132. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/request_serializer.py +0 -0
  133. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/spider_loader.py +0 -0
  134. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/system.py +0 -0
  135. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/tools.py +0 -0
  136. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo/utils/url.py +0 -0
  137. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/SOURCES.txt +0 -0
  138. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/dependency_links.txt +0 -0
  139. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/entry_points.txt +0 -0
  140. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/requires.txt +0 -0
  141. {crawlo-1.2.9 → crawlo-1.3.0}/crawlo.egg-info/top_level.txt +0 -0
  142. {crawlo-1.2.9 → crawlo-1.3.0}/examples/__init__.py +0 -0
  143. {crawlo-1.2.9 → crawlo-1.3.0}/pyproject.toml +0 -0
  144. {crawlo-1.2.9 → crawlo-1.3.0}/requirements.txt +0 -0
  145. {crawlo-1.2.9 → crawlo-1.3.0}/setup.cfg +0 -0
  146. {crawlo-1.2.9 → crawlo-1.3.0}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
  147. {crawlo-1.2.9 → crawlo-1.3.0}/tests/__init__.py +0 -0
  148. {crawlo-1.2.9 → crawlo-1.3.0}/tests/advanced_tools_example.py +0 -0
  149. {crawlo-1.2.9 → crawlo-1.3.0}/tests/authenticated_proxy_example.py +0 -0
  150. {crawlo-1.2.9 → crawlo-1.3.0}/tests/cleaners_example.py +0 -0
  151. {crawlo-1.2.9 → crawlo-1.3.0}/tests/config_validation_demo.py +0 -0
  152. {crawlo-1.2.9 → crawlo-1.3.0}/tests/controlled_spider_example.py +0 -0
  153. {crawlo-1.2.9 → crawlo-1.3.0}/tests/date_tools_example.py +0 -0
  154. {crawlo-1.2.9 → crawlo-1.3.0}/tests/debug_pipelines.py +0 -0
  155. {crawlo-1.2.9 → crawlo-1.3.0}/tests/dynamic_loading_example.py +0 -0
  156. {crawlo-1.2.9 → crawlo-1.3.0}/tests/dynamic_loading_test.py +0 -0
  157. {crawlo-1.2.9 → crawlo-1.3.0}/tests/env_config_example.py +0 -0
  158. {crawlo-1.2.9 → crawlo-1.3.0}/tests/error_handling_example.py +0 -0
  159. {crawlo-1.2.9 → crawlo-1.3.0}/tests/redis_key_validation_demo.py +0 -0
  160. {crawlo-1.2.9 → crawlo-1.3.0}/tests/request_params_example.py +0 -0
  161. {crawlo-1.2.9 → crawlo-1.3.0}/tests/response_improvements_example.py +0 -0
  162. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_advanced_tools.py +0 -0
  163. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_all_redis_key_configs.py +0 -0
  164. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_authenticated_proxy.py +0 -0
  165. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_cleaners.py +0 -0
  166. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_comprehensive.py +0 -0
  167. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_config_consistency.py +0 -0
  168. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_config_merge.py +0 -0
  169. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_config_validator.py +0 -0
  170. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_crawlo_proxy_integration.py +0 -0
  171. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_date_tools.py +0 -0
  172. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_default_header_middleware.py +0 -0
  173. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_distributed.py +0 -0
  174. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_double_crawlo_fix.py +0 -0
  175. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_double_crawlo_fix_simple.py +0 -0
  176. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_download_delay_middleware.py +0 -0
  177. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_downloader_proxy_compatibility.py +0 -0
  178. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_downloaders_proxy.py +0 -0
  179. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_proxy.py +0 -0
  180. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_proxy_config.py +0 -0
  181. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_dynamic_proxy_real.py +0 -0
  182. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_edge_cases.py +0 -0
  183. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_enhanced_error_handler.py +0 -0
  184. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_env_config.py +0 -0
  185. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_error_handler_compatibility.py +0 -0
  186. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_final_validation.py +0 -0
  187. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_framework_env_usage.py +0 -0
  188. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_integration.py +0 -0
  189. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_item_dedup_redis_key.py +0 -0
  190. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_mode_consistency.py +0 -0
  191. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_offsite_middleware.py +0 -0
  192. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_parsel.py +0 -0
  193. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_performance.py +0 -0
  194. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_api.py +0 -0
  195. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_health_check.py +0 -0
  196. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware.py +0 -0
  197. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware_enhanced.py +0 -0
  198. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware_integration.py +0 -0
  199. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_middleware_refactored.py +0 -0
  200. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_providers.py +0 -0
  201. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_stats.py +0 -0
  202. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_proxy_strategies.py +0 -0
  203. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_queue_manager_double_crawlo.py +0 -0
  204. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_queue_manager_redis_key.py +0 -0
  205. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_random_user_agent.py +0 -0
  206. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_real_scenario_proxy.py +0 -0
  207. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_config.py +0 -0
  208. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_connection_pool.py +0 -0
  209. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_key_naming.py +0 -0
  210. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_key_validator.py +0 -0
  211. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_redis_queue.py +0 -0
  212. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_request_ignore_middleware.py +0 -0
  213. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_request_params.py +0 -0
  214. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_request_serialization.py +0 -0
  215. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_response_code_middleware.py +0 -0
  216. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_response_filter_middleware.py +0 -0
  217. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_response_improvements.py +0 -0
  218. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_retry_middleware.py +0 -0
  219. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_scheduler.py +0 -0
  220. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_scheduler_config_update.py +0 -0
  221. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_simple_response.py +0 -0
  222. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_telecom_spider_redis_key.py +0 -0
  223. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_template_content.py +0 -0
  224. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_template_redis_key.py +0 -0
  225. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_tools.py +0 -0
  226. {crawlo-1.2.9 → crawlo-1.3.0}/tests/test_user_agents.py +0 -0
  227. {crawlo-1.2.9 → crawlo-1.3.0}/tests/tools_example.py +0 -0
  228. {crawlo-1.2.9 → crawlo-1.3.0}/tests/verify_distributed.py +0 -0
{crawlo-1.2.9/crawlo.egg-info → crawlo-1.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlo
- Version: 1.2.9
+ Version: 1.3.0
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
  Home-page: https://github.com/crawl-coder/Crawlo.git
  Author: crawl-coder

crawlo-1.3.0/crawlo/__version__.py
@@ -0,0 +1 @@
+ __version__ = '1.3.0'

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/commands/run.py
@@ -5,26 +5,27 @@
  # @Author : crawl-coder
  # @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
  """
+ import os
  import sys
  import asyncio
  import configparser
- import os
- from pathlib import Path
  from importlib import import_module

+ from rich import box
  from rich.console import Console
  from rich.panel import Panel
+ from rich.progress import Progress, SpinnerColumn, TextColumn
  from rich.table import Table
  from rich.text import Text
- from rich import box
- from rich.progress import Progress, SpinnerColumn, TextColumn

+ from crawlo.commands.stats import record_stats
  from crawlo.crawler import CrawlerProcess
- from crawlo.utils.log import get_logger
  from crawlo.project import get_settings, _find_project_root
- from crawlo.commands.stats import record_stats
+ # 使用自定义日志系统
+ from crawlo.utils.log import get_logger

  logger = get_logger(__name__)
+
  console = Console()


@@ -77,6 +78,9 @@ def main(args):
  用法:
  crawlo run <spider_name>|all [--json] [--no-stats]
  """
+ # 添加调试信息
+ logger.debug("DEBUG: 进入main函数")
+
  if len(args) < 1:
  console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
  console.print("示例:")
@@ -187,21 +191,7 @@ def main(args):
  return 1

  # 显示即将运行的爬虫列表
- table = Table(
- title=f"启动全部 {len(spider_names)} 个爬虫",
- box=box.ROUNDED,
- show_header=True,
- header_style="bold magenta"
- )
- table.add_column("名称", style="cyan")
- table.add_column("类名", style="green")
-
- for name in sorted(spider_names):
- cls = process.get_spider_class(name)
- table.add_row(name, cls.__name__)
-
- console.print(table)
- console.print()
+ # 根据用户要求,不再显示详细的爬虫列表信息

  # 注册 stats 记录(除非 --no-stats)
  if not no_stats:
@@ -260,20 +250,21 @@ def main(args):
  spider_class = process.get_spider_class(spider_name)

  # 显示启动信息
- if not show_json:
- info_table = Table(
- title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
- box=box.SIMPLE,
- show_header=False,
- title_style="bold green"
- )
- info_table.add_column("Key", style="yellow")
- info_table.add_column("Value", style="cyan")
- info_table.add_row("Project", project_package)
- info_table.add_row("Class", spider_class.__name__)
- info_table.add_row("Module", spider_class.__module__)
- console.print(info_table)
- console.print()
+ # 根据用户要求,不再显示项目启动信息
+ # if not show_json:
+ #     info_table = Table(
+ #         title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
+ #         box=box.SIMPLE,
+ #         show_header=False,
+ #         title_style="bold green"
+ #     )
+ #     info_table.add_column("Key", style="yellow")
+ #     info_table.add_column("Value", style="cyan")
+ #     info_table.add_row("Project", project_package)
+ #     info_table.add_row("Class", spider_class.__name__)
+ #     info_table.add_row("Module", spider_class.__module__)
+ #     console.print(info_table)
+ #     console.print()

  # 注册 stats 记录
  if not no_stats:
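
The two hunks above drop the Rich startup tables that run.py used to print (the spider-list table is removed, the per-spider info table is commented out). For reference, a minimal standalone sketch of the removed table-building pattern, using the rich package with placeholder values rather than Crawlo's actual startup data:

    from rich import box
    from rich.console import Console
    from rich.table import Table

    console = Console()

    # Mirrors the removed startup-info table: two key/value columns, no header row.
    info_table = Table(title="Starting spider: demo_spider", box=box.SIMPLE,
                       show_header=False, title_style="bold green")
    info_table.add_column("Key", style="yellow")
    info_table.add_column("Value", style="cyan")
    info_table.add_row("Project", "demo_project")       # placeholder values
    info_table.add_row("Class", "DemoSpider")
    info_table.add_row("Module", "demo_project.spiders.demo")
    console.print(info_table)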

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/core/engine.py
@@ -75,8 +75,7 @@ class Engine(object):
  version = '1.0.0'
  # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
  self.logger.debug(
- f"Crawlo Started version {version} . "
- # f"(project name : {self.settings.get('PROJECT_NAME')})"
+ f"Crawlo Started version {version}"
  )

  async def start_spider(self, spider):

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/crawler.py
@@ -21,7 +21,7 @@ Example Usage:
  # Single crawler run
  crawler = Crawler(MySpider, settings)
  await crawler.crawl()
-
+
  # Multi-crawler concurrent management
  process = CrawlerProcess()
  await process.crawl([Spider1, Spider2])
@@ -34,7 +34,6 @@ import threading
  from typing import Type, Optional, Set, List, Union, Dict, Any
  from .spider import Spider, get_global_spider_registry
  from .core.engine import Engine
- from .utils.log import get_logger
  from .subscriber import Subscriber
  from .extension import ExtensionManager
  from .stats_collector import StatsCollector
@@ -42,16 +41,9 @@ from .event import spider_opened, spider_closed
  from .settings.setting_manager import SettingManager
  from crawlo.project import merge_settings, get_settings

- # 延迟初始化logger,在需要时通过get_logger获取
- logger = None
-
-
- def _get_logger():
- """延迟获取logger实例,确保在配置加载后创建"""
- global logger
- if logger is None:
- logger = get_logger(__name__)
- return logger
+ # 使用自定义日志系统
+ from crawlo.utils.log import get_logger
+ logger = get_logger(__name__)


  class CrawlerContext:
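
The hunk above is the core logging change in crawler.py for 1.3.0: the lazily created module logger is replaced by one built at import time, and the remaining crawler.py hunks below update the `_get_logger()` call sites to use `logger` directly. A minimal standalone sketch of the two patterns, using the standard library logging module as a stand-in for crawlo.utils.log.get_logger (whose real configuration behaviour is assumed here):

    import logging

    def get_logger(name: str) -> logging.Logger:
        # Stand-in for crawlo.utils.log.get_logger; configuration details are assumed.
        return logging.getLogger(name)

    # 1.2.9 style: defer creating the logger until first use, so settings can be loaded first.
    _logger = None

    def _get_logger() -> logging.Logger:
        global _logger
        if _logger is None:
            _logger = get_logger(__name__)
        return _logger

    # 1.3.0 style: create the logger once at import time and reference it directly.
    logger = get_logger(__name__)

    _get_logger().debug("lazy accessor (old call style)")
    logger.debug("module-level logger (new call style)")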
@@ -110,7 +102,7 @@ class CrawlerContext:
  class Crawler:
  """
  Single crawler runtime instance, managing Spider and engine lifecycle
-
+
  Provides functionality:
  - Spider lifecycle management (initialization, running, closing)
  - Engine component coordination management
@@ -148,7 +140,7 @@ class Crawler:
  async def crawl(self):
  """
  Start the crawler core process
-
+
  Includes the following stages:
  1. Initialization stage: Create all components
  2. Validation stage: Check configuration and state
@@ -190,12 +182,12 @@ class Crawler:
  # Update context status
  self.context.increment_completed()

- _get_logger().info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")
+ logger.info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")

  except Exception as e:
  self._performance_metrics['error_count'] += 1
  self.context.increment_failed(str(e))
- _get_logger().error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
+ logger.error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
  raise
  finally:
  self.context.decrement_active()
@@ -213,7 +205,7 @@ class Crawler:
  else:
  spider_name = 'Unknown'

- _get_logger().info(f"Starting running {spider_name}")
+ logger.info(f"Starting running {spider_name}")

  def _validate_crawler_state(self):
  """
@@ -233,7 +225,7 @@ class Crawler:
  if not self.spider.name:
  raise ValueError("Spider name cannot be empty")

- _get_logger().debug(f"Spider {self.spider.name} state validation passed")
+ logger.debug(f"Spider {self.spider.name} state validation passed")

  def _get_total_duration(self) -> float:
  """Get total runtime"""
@@ -247,7 +239,7 @@ class Crawler:
  if not self._closed:
  await self.close()
  except Exception as e:
- _get_logger().warning(f"Error cleaning up resources: {e}")
+ logger.warning(f"Error cleaning up resources: {e}")

  def get_performance_metrics(self) -> Dict[str, Any]:
  """Get performance metrics"""
@@ -267,7 +259,7 @@ class Crawler:
  def _create_spider(self) -> Spider:
  """
  Create and validate spider instance (enhanced version)
-
+
  Performs the following validations:
  - Spider name must exist
  - start_requests method must be callable
@@ -300,7 +292,7 @@ class Crawler:

  # parse method check (warning instead of error)
  if not callable(getattr(spider, 'parse', None)):
- _get_logger().warning(
+ logger.warning(
  f"Spider '{spider.name}' does not define 'parse' method.\n"
  f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
  )
@@ -308,27 +300,29 @@ class Crawler:
  # Set spider configuration
  self._set_spider(spider)

- _get_logger().debug(f"Spider '{spider.name}' initialized successfully")
+ logger.debug(f"Spider '{spider.name}' initialized successfully")
  return spider

  def _create_engine(self) -> Engine:
  """Create and initialize engine"""
  engine = Engine(self)
  engine.engine_start()
- _get_logger().debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return engine

  def _create_stats(self) -> StatsCollector:
  """Create stats collector"""
  stats = StatsCollector(self)
- _get_logger().debug(f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(
+ f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return stats

  def _create_extension(self) -> ExtensionManager:
  """Create extension manager"""
  # Modify extension manager creation method, delay initialization until needed
  extension = ExtensionManager.create_instance(self)
- _get_logger().debug(f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
+ logger.debug(
+ f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
  return extension

  def _set_spider(self, spider: Spider):
@@ -343,12 +337,12 @@ class Crawler:
  # Merge spider custom configuration
  merge_settings(spider, self.settings)

- _get_logger().debug(f"Spider '{spider.name}' configuration merged successfully")
+ logger.debug(f"Spider '{spider.name}' configuration merged successfully")

  async def close(self, reason='finished') -> None:
  """
  Close crawler and clean up resources (enhanced version)
-
+
  Ensure closing only once and handle all cleanup operations
  """
  async with self._close_lock:
@@ -371,15 +365,15 @@ class Crawler:
  from crawlo.commands.stats import record_stats
  record_stats(self)
  except ImportError:
- _get_logger().debug("Statistics recording module does not exist, skipping statistics recording")
+ logger.debug("Statistics recording module does not exist, skipping statistics recording")

- _get_logger().info(
+ logger.info(
  f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
  f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
  )

  except Exception as e:
- _get_logger().error(f"Error closing crawler: {e}", exc_info=True)
+ logger.error(f"Error closing crawler: {e}", exc_info=True)
  finally:
  # Ensure resource cleanup
  await self._cleanup_resources()
@@ -413,13 +407,13 @@ class Crawler:
  if cleanup_tasks:
  await asyncio.gather(*cleanup_tasks, return_exceptions=True)

- _get_logger().debug("Resource cleanup completed")
+ logger.debug("Resource cleanup completed")


  class CrawlerProcess:
  """
  Crawler process manager
-
+
  Supported features:
  - Multi-crawler concurrent scheduling and resource management
  - Automatic module discovery and spider registration
@@ -428,15 +422,15 @@ class CrawlerProcess:
  - Real-time status monitoring and statistics
  - Error recovery and retry mechanism
  - Large-scale crawler optimization support
-
+
  Usage example:
  # Basic usage
  process = CrawlerProcess()
  await process.crawl(MySpider)
-
+
  # Multi-crawler concurrency
  await process.crawl([Spider1, Spider2, 'spider_name'])
-
+
  # Custom concurrency
  process = CrawlerProcess(max_concurrency=8)
  """
@@ -563,7 +557,7 @@ class CrawlerProcess:
  def auto_discover(modules: List[str]):
  """
  Automatically import modules, trigger Spider class definition and registration (enhanced version)
-
+
  Supports recursive scanning and error recovery
  """
  import importlib
@@ -617,7 +611,7 @@ class CrawlerProcess:
  async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
  """
  Start one or more crawlers
-
+
  Enhanced features:
  - Intelligent concurrency control
  - Real-time monitoring and statistics
@@ -639,7 +633,7 @@ class CrawlerProcess:
  await self.start_monitoring()

  try:
- # Phase 3: Sort by class name to ensure predictable startup order
+ # Phase 3: Initialize context and monitoring
  spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())

  logger.debug(
@@ -738,7 +732,7 @@ class CrawlerProcess:
  ) -> List[Type[Spider]]:
  """
  Resolve input to spider class list
-
+
  Supports various input formats and validates uniqueness
  """
  inputs = self._normalize_inputs(spiders_input)
@@ -762,7 +756,8 @@ class CrawlerProcess:
  seen_spider_names.add(spider_name)
  spider_classes.append(spider_cls)

- logger.debug(f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")
+ logger.debug(
+ f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")

  except Exception as e:
  logger.error(f"Failed to resolve spider: {item} - {e}")
@@ -774,7 +769,7 @@ class CrawlerProcess:
  def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
  """
  Normalize input to list
-
+
  Supports more input types and provides better error information
  """
  if isinstance(spiders_input, (type, str)):
@@ -793,7 +788,7 @@ class CrawlerProcess:
  def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
  """
  Resolve single input item to spider class
-
+
  Provides better error prompts and debugging information
  """
  if isinstance(item, type) and issubclass(item, Spider):
@@ -820,7 +815,7 @@ class CrawlerProcess:
  async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
  """
  Spider running function limited by semaphore
-
+
  Includes enhanced error handling and monitoring functionality
  """
  task = asyncio.current_task()
@@ -888,7 +883,7 @@ class CrawlerProcess:
  def _shutdown(self, _signum, _frame):
  """
  Graceful shutdown signal handling
-
+
  Provides better shutdown experience and resource cleanup
  """
  signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
@@ -913,7 +908,7 @@ class CrawlerProcess:
  async def _wait_for_shutdown(self):
  """
  Wait for all active tasks to complete
-
+
  Provides better shutdown time control and progress feedback
  """
  try:
@@ -967,15 +962,15 @@ class CrawlerProcess:
  def _get_default_settings(cls) -> SettingManager:
  """
  Load default configuration
-
+
  Provides better error handling and fallback strategy
  """
  try:
  settings = get_settings()
- _get_logger().debug("Default configuration loaded successfully")
+ logger.debug("Default configuration loaded successfully")
  return settings
  except Exception as e:
- _get_logger().warning(f"Unable to load default configuration: {e}, using empty configuration")
+ logger.warning(f"Unable to load default configuration: {e}, using empty configuration")
  return SettingManager()

  def _log_startup_info(self):
@@ -990,7 +985,7 @@ class CrawlerProcess:

  # Build startup info log
  startup_info = [
- f"Crawlo Framework Started v{version}"
+ f"Crawlo Framework Started {version}"
  ]

  # Get actual queue type
@@ -1018,7 +1013,7 @@ class CrawlerProcess:
  else:
  startup_info.append(f"Run Mode: {run_mode}")

- # Print startup information
+ # Print startup information at INFO level
  for info in startup_info:
  logger.info(info)

@@ -1032,7 +1027,7 @@ def create_crawler_with_optimizations(
  ) -> Crawler:
  """
  Create an optimized crawler instance
-
+
  :param spider_cls: Spider class
  :param settings: Settings manager
  :param optimization_kwargs: Optimization parameters
@@ -1056,7 +1051,7 @@ def create_process_with_large_scale_config(
  ) -> CrawlerProcess:
  """
  Create a process manager that supports large-scale optimization
-
+
  :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
  :param concurrency: Concurrency count
  :param kwargs: Other parameters
@@ -1100,4 +1095,4 @@ __all__ = [
  'CrawlerContext',
  'create_crawler_with_optimizations',
  'create_process_with_large_scale_config'
- ]
+ ]

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/extension/logging_extension.py
@@ -1,8 +1,10 @@
  from typing import Any
  from crawlo.exceptions import NotConfigured
- from crawlo.utils.log import get_logger
  from crawlo.utils.log import LoggerManager

+ # 使用全局logger,避免每个模块都创建自己的延迟初始化函数
+ logger = LoggerManager.get_logger(__name__)
+

  class CustomLoggerExtension:
  """
@@ -32,7 +34,7 @@ class CustomLoggerExtension:
  return cls(crawler.settings)

  def spider_opened(self, spider: Any) -> None:
- logger = get_logger(__name__)
+ logger = LoggerManager.get_logger(__name__)
  try:
  logger.info(
  f"CustomLoggerExtension: Logging initialized. "

{crawlo-1.2.9 → crawlo-1.3.0}/crawlo/middleware/middleware_manager.py
@@ -133,4 +133,4 @@ class MiddlewareManager:
  def _validate_middleware_method(method_name, middleware) -> bool:
  method = getattr(type(middleware), method_name)
  base_method = getattr(BaseMiddleware, method_name)
- return False if method == base_method else True
+ return False if method == base_method else True
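
The final hunk touches only the last line of `_validate_middleware_method` (the statement itself is unchanged). That method checks whether a middleware class actually overrides a hook inherited from BaseMiddleware. A minimal sketch of that override check, using simplified stand-in classes rather than the actual crawlo classes:

    class BaseMiddleware:
        def process_request(self, request):
            # Default no-op hook; concrete middlewares override it when they need to act.
            return None

    class ExampleMiddleware(BaseMiddleware):
        def process_request(self, request):
            return request

    def overrides_hook(middleware, method_name, base=BaseMiddleware):
        # True only when the middleware's class defines its own version of the hook,
        # mirroring the comparison of getattr(type(middleware), name) with the base method.
        return getattr(type(middleware), method_name) is not getattr(base, method_name)

    print(overrides_hook(ExampleMiddleware(), "process_request"))   # True
    print(overrides_hook(BaseMiddleware(), "process_request"))      # False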