crawlo 1.2.9.tar.gz → 1.3.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (228)
  1. {crawlo-1.2.9/crawlo.egg-info → crawlo-1.3.1}/PKG-INFO +13 -4
  2. {crawlo-1.2.9 → crawlo-1.3.1}/README.md +12 -3
  3. crawlo-1.3.1/crawlo/__version__.py +1 -0
  4. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/run.py +26 -35
  5. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/utils.py +12 -2
  6. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/engine.py +1 -2
  7. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/crawler.py +135 -69
  8. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/logging_extension.py +4 -2
  9. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/middleware_manager.py +1 -1
  10. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/offsite.py +2 -1
  11. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/mode_manager.py +37 -100
  12. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/mysql_pipeline.py +5 -4
  13. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/pipeline_manager.py +15 -2
  14. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/project.py +44 -37
  15. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/settings/default_settings.py +13 -4
  16. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/settings/setting_manager.py +55 -20
  17. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/log.py +21 -62
  18. {crawlo-1.2.9 → crawlo-1.3.1/crawlo.egg-info}/PKG-INFO +13 -4
  19. crawlo-1.2.9/crawlo/__version__.py +0 -1
  20. {crawlo-1.2.9 → crawlo-1.3.1}/LICENSE +0 -0
  21. {crawlo-1.2.9 → crawlo-1.3.1}/MANIFEST.in +0 -0
  22. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/__init__.py +0 -0
  23. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/cli.py +0 -0
  24. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/__init__.py +0 -0
  25. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/check.py +0 -0
  26. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/genspider.py +0 -0
  27. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/help.py +0 -0
  28. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/list.py +0 -0
  29. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/startproject.py +0 -0
  30. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/commands/stats.py +0 -0
  31. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/config.py +0 -0
  32. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/config_validator.py +0 -0
  33. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/__init__.py +0 -0
  34. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/processor.py +0 -0
  35. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/core/scheduler.py +0 -0
  36. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/data/__init__.py +0 -0
  37. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/data/user_agents.py +0 -0
  38. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/__init__.py +0 -0
  39. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/aiohttp_downloader.py +0 -0
  40. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/cffi_downloader.py +0 -0
  41. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/httpx_downloader.py +0 -0
  42. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/hybrid_downloader.py +0 -0
  43. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/playwright_downloader.py +0 -0
  44. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/downloader/selenium_downloader.py +0 -0
  45. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/event.py +0 -0
  46. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/exceptions.py +0 -0
  47. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/__init__.py +0 -0
  48. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/health_check.py +0 -0
  49. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/log_interval.py +0 -0
  50. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/log_stats.py +0 -0
  51. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/memory_monitor.py +0 -0
  52. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/performance_profiler.py +0 -0
  53. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/extension/request_recorder.py +0 -0
  54. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/filters/__init__.py +0 -0
  55. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/filters/aioredis_filter.py +0 -0
  56. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/filters/memory_filter.py +0 -0
  57. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/__init__.py +0 -0
  58. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/base.py +0 -0
  59. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/fields.py +0 -0
  60. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/items/items.py +0 -0
  61. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/__init__.py +0 -0
  62. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/default_header.py +0 -0
  63. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/download_delay.py +0 -0
  64. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/proxy.py +0 -0
  65. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/request_ignore.py +0 -0
  66. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/response_code.py +0 -0
  67. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/response_filter.py +0 -0
  68. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/retry.py +0 -0
  69. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/middleware/simple_proxy.py +0 -0
  70. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/network/__init__.py +0 -0
  71. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/network/request.py +0 -0
  72. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/network/response.py +0 -0
  73. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/__init__.py +0 -0
  74. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/bloom_dedup_pipeline.py +0 -0
  75. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/console_pipeline.py +0 -0
  76. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/csv_pipeline.py +0 -0
  77. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/database_dedup_pipeline.py +0 -0
  78. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/json_pipeline.py +0 -0
  79. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/memory_dedup_pipeline.py +0 -0
  80. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/mongo_pipeline.py +0 -0
  81. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/pipelines/redis_dedup_pipeline.py +0 -0
  82. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/__init__.py +0 -0
  83. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/pqueue.py +0 -0
  84. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/queue_manager.py +0 -0
  85. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/queue/redis_priority_queue.py +0 -0
  86. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/settings/__init__.py +0 -0
  87. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/spider/__init__.py +0 -0
  88. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/stats_collector.py +0 -0
  89. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/subscriber.py +0 -0
  90. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/task_manager.py +0 -0
  91. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/crawlo.cfg.tmpl +0 -0
  92. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/__init__.py.tmpl +0 -0
  93. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/items.py.tmpl +0 -0
  94. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/middlewares.py.tmpl +0 -0
  95. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/pipelines.py.tmpl +0 -0
  96. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings.py.tmpl +0 -0
  97. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_distributed.py.tmpl +0 -0
  98. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_gentle.py.tmpl +0 -0
  99. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_high_performance.py.tmpl +0 -0
  100. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_minimal.py.tmpl +0 -0
  101. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/settings_simple.py.tmpl +0 -0
  102. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/project/spiders/__init__.py.tmpl +0 -0
  103. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/run.py.tmpl +0 -0
  104. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/templates/spider/spider.py.tmpl +0 -0
  105. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/__init__.py +0 -0
  106. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/anti_crawler.py +0 -0
  107. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/authenticated_proxy.py +0 -0
  108. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/data_formatter.py +0 -0
  109. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/data_validator.py +0 -0
  110. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/date_tools.py +0 -0
  111. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/distributed_coordinator.py +0 -0
  112. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/encoding_converter.py +0 -0
  113. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/request_tools.py +0 -0
  114. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/retry_mechanism.py +0 -0
  115. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/scenario_adapter.py +0 -0
  116. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/tools/text_cleaner.py +0 -0
  117. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/__init__.py +0 -0
  118. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/batch_processor.py +0 -0
  119. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/controlled_spider_mixin.py +0 -0
  120. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/db_helper.py +0 -0
  121. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/enhanced_error_handler.py +0 -0
  122. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/env_config.py +0 -0
  123. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/error_handler.py +0 -0
  124. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/func_tools.py +0 -0
  125. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/large_scale_config.py +0 -0
  126. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/large_scale_helper.py +0 -0
  127. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/performance_monitor.py +0 -0
  128. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/queue_helper.py +0 -0
  129. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/redis_connection_pool.py +0 -0
  130. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/redis_key_validator.py +0 -0
  131. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/request.py +0 -0
  132. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/request_serializer.py +0 -0
  133. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/spider_loader.py +0 -0
  134. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/system.py +0 -0
  135. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/tools.py +0 -0
  136. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo/utils/url.py +0 -0
  137. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/SOURCES.txt +0 -0
  138. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/dependency_links.txt +0 -0
  139. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/entry_points.txt +0 -0
  140. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/requires.txt +0 -0
  141. {crawlo-1.2.9 → crawlo-1.3.1}/crawlo.egg-info/top_level.txt +0 -0
  142. {crawlo-1.2.9 → crawlo-1.3.1}/examples/__init__.py +0 -0
  143. {crawlo-1.2.9 → crawlo-1.3.1}/pyproject.toml +0 -0
  144. {crawlo-1.2.9 → crawlo-1.3.1}/requirements.txt +0 -0
  145. {crawlo-1.2.9 → crawlo-1.3.1}/setup.cfg +0 -0
  146. {crawlo-1.2.9 → crawlo-1.3.1}/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -0
  147. {crawlo-1.2.9 → crawlo-1.3.1}/tests/__init__.py +0 -0
  148. {crawlo-1.2.9 → crawlo-1.3.1}/tests/advanced_tools_example.py +0 -0
  149. {crawlo-1.2.9 → crawlo-1.3.1}/tests/authenticated_proxy_example.py +0 -0
  150. {crawlo-1.2.9 → crawlo-1.3.1}/tests/cleaners_example.py +0 -0
  151. {crawlo-1.2.9 → crawlo-1.3.1}/tests/config_validation_demo.py +0 -0
  152. {crawlo-1.2.9 → crawlo-1.3.1}/tests/controlled_spider_example.py +0 -0
  153. {crawlo-1.2.9 → crawlo-1.3.1}/tests/date_tools_example.py +0 -0
  154. {crawlo-1.2.9 → crawlo-1.3.1}/tests/debug_pipelines.py +0 -0
  155. {crawlo-1.2.9 → crawlo-1.3.1}/tests/dynamic_loading_example.py +0 -0
  156. {crawlo-1.2.9 → crawlo-1.3.1}/tests/dynamic_loading_test.py +0 -0
  157. {crawlo-1.2.9 → crawlo-1.3.1}/tests/env_config_example.py +0 -0
  158. {crawlo-1.2.9 → crawlo-1.3.1}/tests/error_handling_example.py +0 -0
  159. {crawlo-1.2.9 → crawlo-1.3.1}/tests/redis_key_validation_demo.py +0 -0
  160. {crawlo-1.2.9 → crawlo-1.3.1}/tests/request_params_example.py +0 -0
  161. {crawlo-1.2.9 → crawlo-1.3.1}/tests/response_improvements_example.py +0 -0
  162. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_advanced_tools.py +0 -0
  163. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_all_redis_key_configs.py +0 -0
  164. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_authenticated_proxy.py +0 -0
  165. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_cleaners.py +0 -0
  166. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_comprehensive.py +0 -0
  167. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_config_consistency.py +0 -0
  168. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_config_merge.py +0 -0
  169. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_config_validator.py +0 -0
  170. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_crawlo_proxy_integration.py +0 -0
  171. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_date_tools.py +0 -0
  172. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_default_header_middleware.py +0 -0
  173. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_distributed.py +0 -0
  174. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_double_crawlo_fix.py +0 -0
  175. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_double_crawlo_fix_simple.py +0 -0
  176. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_download_delay_middleware.py +0 -0
  177. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_downloader_proxy_compatibility.py +0 -0
  178. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_downloaders_proxy.py +0 -0
  179. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_proxy.py +0 -0
  180. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_proxy_config.py +0 -0
  181. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_dynamic_proxy_real.py +0 -0
  182. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_edge_cases.py +0 -0
  183. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_enhanced_error_handler.py +0 -0
  184. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_env_config.py +0 -0
  185. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_error_handler_compatibility.py +0 -0
  186. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_final_validation.py +0 -0
  187. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_framework_env_usage.py +0 -0
  188. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_integration.py +0 -0
  189. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_item_dedup_redis_key.py +0 -0
  190. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_mode_consistency.py +0 -0
  191. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_offsite_middleware.py +0 -0
  192. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_parsel.py +0 -0
  193. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_performance.py +0 -0
  194. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_api.py +0 -0
  195. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_health_check.py +0 -0
  196. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware.py +0 -0
  197. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware_enhanced.py +0 -0
  198. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware_integration.py +0 -0
  199. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_middleware_refactored.py +0 -0
  200. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_providers.py +0 -0
  201. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_stats.py +0 -0
  202. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_proxy_strategies.py +0 -0
  203. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_queue_manager_double_crawlo.py +0 -0
  204. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_queue_manager_redis_key.py +0 -0
  205. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_random_user_agent.py +0 -0
  206. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_real_scenario_proxy.py +0 -0
  207. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_config.py +0 -0
  208. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_connection_pool.py +0 -0
  209. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_key_naming.py +0 -0
  210. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_key_validator.py +0 -0
  211. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_redis_queue.py +0 -0
  212. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_request_ignore_middleware.py +0 -0
  213. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_request_params.py +0 -0
  214. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_request_serialization.py +0 -0
  215. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_response_code_middleware.py +0 -0
  216. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_response_filter_middleware.py +0 -0
  217. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_response_improvements.py +0 -0
  218. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_retry_middleware.py +0 -0
  219. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_scheduler.py +0 -0
  220. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_scheduler_config_update.py +0 -0
  221. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_simple_response.py +0 -0
  222. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_telecom_spider_redis_key.py +0 -0
  223. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_template_content.py +0 -0
  224. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_template_redis_key.py +0 -0
  225. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_tools.py +0 -0
  226. {crawlo-1.2.9 → crawlo-1.3.1}/tests/test_user_agents.py +0 -0
  227. {crawlo-1.2.9 → crawlo-1.3.1}/tests/tools_example.py +0 -0
  228. {crawlo-1.2.9 → crawlo-1.3.1}/tests/verify_distributed.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlo
3
- Version: 1.2.9
3
+ Version: 1.3.1
4
4
  Summary: Crawlo 是一款基于异步IO的高性能Python爬虫框架,支持分布式抓取。
5
5
  Home-page: https://github.com/crawl-coder/Crawlo.git
6
6
  Author: crawl-coder
@@ -56,7 +56,7 @@ Requires-Dist: selenium>=3.141.0; extra == "all"
56
56
 
57
57
  <p align="center">
58
58
  <a href="https://www.python.org/downloads/">
59
- <img src="https://img.shields.io/badge/python-3.8%2B-blue" alt="Python Version">
59
+ <img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
60
60
  </a>
61
61
  <a href="LICENSE">
62
62
  <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
@@ -429,13 +429,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
429
429
 
430
430
  用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
431
431
 
432
- ```python
432
+ ``python
433
433
  # settings.py
434
434
  CUSTOM_MIDDLEWARES = [
435
435
  'myproject.middlewares.CustomMiddleware',
436
436
  ]
437
437
  ```
438
438
 
439
+ > **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
440
+ > - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
441
+ > - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
442
+ >
443
+ > 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
444
+
445
+ > **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
446
+ > 这样可以在所有默认中间件处理后再应用代理设置。
447
+
439
448
  #### 管道配置
440
449
 
441
450
  框架默认加载以下管道:
@@ -930,7 +939,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
930
939
 
931
940
  如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
932
941
 
933
- ```python
942
+ ``python
934
943
  # settings.py
935
944
  MIDDLEWARES = [
936
945
  # 注释掉复杂版代理中间件
@@ -6,7 +6,7 @@
6
6
 
7
7
  <p align="center">
8
8
  <a href="https://www.python.org/downloads/">
9
- <img src="https://img.shields.io/badge/python-3.8%2B-blue" alt="Python Version">
9
+ <img src="https://img.shields.io/badge/python-%3C%3D3.12-blue" alt="Python Version">
10
10
  </a>
11
11
  <a href="LICENSE">
12
12
  <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
@@ -379,13 +379,22 @@ Crawlo框架的中间件、管道和扩展组件采用模块化设计,框架
379
379
 
380
380
  用户可以通过`CUSTOM_MIDDLEWARES`配置自定义中间件:
381
381
 
382
- ```python
382
+ ``python
383
383
  # settings.py
384
384
  CUSTOM_MIDDLEWARES = [
385
385
  'myproject.middlewares.CustomMiddleware',
386
386
  ]
387
387
  ```
388
388
 
389
+ > **注意**:DefaultHeaderMiddleware 和 OffsiteMiddleware 需要相应的配置才能启用:
390
+ > - DefaultHeaderMiddleware 需要配置 `DEFAULT_REQUEST_HEADERS` 或 `USER_AGENT` 参数
391
+ > - OffsiteMiddleware 需要配置 `ALLOWED_DOMAINS` 参数
392
+ >
393
+ > 如果未配置相应参数,这些中间件会因为 NotConfiguredError 而被禁用。
394
+
395
+ > **注意**:中间件的顺序很重要。SimpleProxyMiddleware 通常放在列表末尾,
396
+ > 这样可以在所有默认中间件处理后再应用代理设置。
397
+
389
398
  #### 管道配置
390
399
 
391
400
  框架默认加载以下管道:
@@ -880,7 +889,7 @@ Crawlo框架内置了多种中间件,其中代理中间件有两种实现:
880
889
 
881
890
  如果需要使用简化版代理中间件,可以在配置文件中替换默认的代理中间件:
882
891
 
883
- ```python
892
+ ``python
884
893
  # settings.py
885
894
  MIDDLEWARES = [
886
895
  # 注释掉复杂版代理中间件
@@ -0,0 +1 @@
1
+ __version__ = '1.3.1'
@@ -5,26 +5,27 @@
5
5
  # @Author : crawl-coder
6
6
  # @Desc : 命令行入口:crawlo run <spider_name>|all,用于运行指定爬虫。
7
7
  """
8
+ import os
8
9
  import sys
9
10
  import asyncio
10
11
  import configparser
11
- import os
12
- from pathlib import Path
13
12
  from importlib import import_module
14
13
 
14
+ from rich import box
15
15
  from rich.console import Console
16
16
  from rich.panel import Panel
17
+ from rich.progress import Progress, SpinnerColumn, TextColumn
17
18
  from rich.table import Table
18
19
  from rich.text import Text
19
- from rich import box
20
- from rich.progress import Progress, SpinnerColumn, TextColumn
21
20
 
21
+ from crawlo.commands.stats import record_stats
22
22
  from crawlo.crawler import CrawlerProcess
23
- from crawlo.utils.log import get_logger
24
23
  from crawlo.project import get_settings, _find_project_root
25
- from crawlo.commands.stats import record_stats
24
+ # 使用自定义日志系统
25
+ from crawlo.utils.log import get_logger
26
26
 
27
27
  logger = get_logger(__name__)
28
+
28
29
  console = Console()
29
30
 
30
31
 
@@ -77,6 +78,9 @@ def main(args):
77
78
  用法:
78
79
  crawlo run <spider_name>|all [--json] [--no-stats]
79
80
  """
81
+ # 添加调试信息
82
+ logger.debug("DEBUG: 进入main函数")
83
+
80
84
  if len(args) < 1:
81
85
  console.print("[bold red]用法:[/bold red] [blue]crawlo run[/blue] <爬虫名称>|all [bold yellow][--json] [--no-stats][/bold yellow]")
82
86
  console.print("示例:")
@@ -187,21 +191,7 @@ def main(args):
187
191
  return 1
188
192
 
189
193
  # 显示即将运行的爬虫列表
190
- table = Table(
191
- title=f"启动全部 {len(spider_names)} 个爬虫",
192
- box=box.ROUNDED,
193
- show_header=True,
194
- header_style="bold magenta"
195
- )
196
- table.add_column("名称", style="cyan")
197
- table.add_column("类名", style="green")
198
-
199
- for name in sorted(spider_names):
200
- cls = process.get_spider_class(name)
201
- table.add_row(name, cls.__name__)
202
-
203
- console.print(table)
204
- console.print()
194
+ # 根据用户要求,不再显示详细的爬虫列表信息
205
195
 
206
196
  # 注册 stats 记录(除非 --no-stats)
207
197
  if not no_stats:
@@ -260,20 +250,21 @@ def main(args):
260
250
  spider_class = process.get_spider_class(spider_name)
261
251
 
262
252
  # 显示启动信息
263
- if not show_json:
264
- info_table = Table(
265
- title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
266
- box=box.SIMPLE,
267
- show_header=False,
268
- title_style="bold green"
269
- )
270
- info_table.add_column("Key", style="yellow")
271
- info_table.add_column("Value", style="cyan")
272
- info_table.add_row("Project", project_package)
273
- info_table.add_row("Class", spider_class.__name__)
274
- info_table.add_row("Module", spider_class.__module__)
275
- console.print(info_table)
276
- console.print()
253
+ # 根据用户要求,不再显示项目启动信息
254
+ # if not show_json:
255
+ # info_table = Table(
256
+ # title=f"启动爬虫: [bold cyan]{spider_name}[/bold cyan]",
257
+ # box=box.SIMPLE,
258
+ # show_header=False,
259
+ # title_style="bold green"
260
+ # )
261
+ # info_table.add_column("Key", style="yellow")
262
+ # info_table.add_column("Value", style="cyan")
263
+ # info_table.add_row("Project", project_package)
264
+ # info_table.add_row("Class", spider_class.__name__)
265
+ # info_table.add_row("Module", spider_class.__module__)
266
+ # console.print(info_table)
267
+ # console.print()
277
268
 
278
269
  # 注册 stats 记录
279
270
  if not no_stats:
@@ -133,8 +133,11 @@ def validate_spider_name(spider_name: str) -> bool:
133
133
  bool: 是否有效
134
134
  """
135
135
  import re
136
+ # 清理爬虫名称中的不可见字符
137
+ cleaned_name = ''.join(c for c in spider_name if not unicodedata.category(c).startswith('C'))
138
+
136
139
  # 爬虫名称应该是有效的Python标识符
137
- return spider_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', spider_name)
140
+ return cleaned_name.isidentifier() and re.match(r'^[a-z][a-z0-9_]*$', cleaned_name)
138
141
 
139
142
 
140
143
  def format_file_size(size_bytes: int) -> str:
@@ -181,7 +184,14 @@ def is_valid_domain(domain: str) -> bool:
181
184
  bool: 是否有效
182
185
  """
183
186
  import re
187
+ # 清理域名中的不可见字符
188
+ cleaned_domain = ''.join(c for c in domain if not unicodedata.category(c).startswith('C'))
189
+
184
190
  pattern = re.compile(
185
191
  r'^(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)*[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'
186
192
  )
187
- return bool(pattern.match(domain))
193
+ return bool(pattern.match(cleaned_domain))
194
+
195
+
196
+ # 添加导入
197
+ import unicodedata
@@ -75,8 +75,7 @@ class Engine(object):
75
75
  version = '1.0.0'
76
76
  # Change INFO level log to DEBUG level to avoid duplication with CrawlerProcess startup log
77
77
  self.logger.debug(
78
- f"Crawlo Started version {version} . "
79
- # f"(project name : {self.settings.get('PROJECT_NAME')})"
78
+ f"Crawlo Started version {version}"
80
79
  )
81
80
 
82
81
  async def start_spider(self, spider):