crawlo 1.3.2__tar.gz → 1.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (475)
  1. {crawlo-1.3.2 → crawlo-1.5.4}/LICENSE +22 -22
  2. {crawlo-1.3.2 → crawlo-1.5.4}/MANIFEST.in +16 -16
  3. crawlo-1.5.4/PKG-INFO +997 -0
  4. crawlo-1.5.4/README.md +947 -0
  5. crawlo-1.5.4/crawlo/__init__.py +89 -0
  6. crawlo-1.5.4/crawlo/__version__.py +1 -0
  7. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/cli.py +75 -75
  8. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/__init__.py +14 -14
  9. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/check.py +594 -594
  10. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/genspider.py +186 -151
  11. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/help.py +140 -138
  12. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/list.py +155 -155
  13. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/run.py +379 -315
  14. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/startproject.py +460 -436
  15. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/stats.py +187 -187
  16. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/commands/utils.py +196 -196
  17. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/config.py +450 -312
  18. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/config_validator.py +277 -277
  19. crawlo-1.5.4/crawlo/core/__init__.py +52 -0
  20. crawlo-1.5.4/crawlo/core/engine.py +515 -0
  21. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/core/processor.py +47 -40
  22. crawlo-1.5.4/crawlo/core/scheduler.py +441 -0
  23. crawlo-1.5.4/crawlo/crawler.py +941 -0
  24. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/data/__init__.py +5 -5
  25. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/data/user_agents.py +194 -194
  26. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/__init__.py +304 -273
  27. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/aiohttp_downloader.py +295 -226
  28. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/cffi_downloader.py +251 -245
  29. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/httpx_downloader.py +313 -259
  30. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/hybrid_downloader.py +217 -212
  31. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/playwright_downloader.py +429 -402
  32. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/downloader/selenium_downloader.py +487 -472
  33. crawlo-1.5.4/crawlo/event.py +45 -0
  34. crawlo-1.5.4/crawlo/exceptions.py +215 -0
  35. crawlo-1.5.4/crawlo/extension/__init__.py +65 -0
  36. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/extension/health_check.py +143 -141
  37. crawlo-1.5.4/crawlo/extension/log_interval.py +95 -0
  38. crawlo-1.5.4/crawlo/extension/log_stats.py +73 -0
  39. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/extension/logging_extension.py +53 -45
  40. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/extension/memory_monitor.py +104 -104
  41. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/extension/performance_profiler.py +133 -133
  42. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/extension/request_recorder.py +107 -107
  43. crawlo-1.5.4/crawlo/factories/__init__.py +28 -0
  44. crawlo-1.5.4/crawlo/factories/base.py +69 -0
  45. crawlo-1.5.4/crawlo/factories/crawler.py +105 -0
  46. crawlo-1.5.4/crawlo/factories/registry.py +85 -0
  47. crawlo-1.5.4/crawlo/factories/utils.py +135 -0
  48. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/filters/__init__.py +171 -154
  49. crawlo-1.5.4/crawlo/filters/aioredis_filter.py +504 -0
  50. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/filters/memory_filter.py +271 -269
  51. crawlo-1.5.4/crawlo/framework.py +307 -0
  52. crawlo-1.5.4/crawlo/initialization/__init__.py +44 -0
  53. crawlo-1.5.4/crawlo/initialization/built_in.py +392 -0
  54. crawlo-1.5.4/crawlo/initialization/context.py +142 -0
  55. crawlo-1.5.4/crawlo/initialization/core.py +241 -0
  56. crawlo-1.5.4/crawlo/initialization/phases.py +230 -0
  57. crawlo-1.5.4/crawlo/initialization/registry.py +144 -0
  58. crawlo-1.5.4/crawlo/initialization/utils.py +49 -0
  59. crawlo-1.5.4/crawlo/interfaces.py +46 -0
  60. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/items/__init__.py +23 -23
  61. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/items/base.py +23 -22
  62. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/items/fields.py +52 -52
  63. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/items/items.py +104 -104
  64. crawlo-1.5.4/crawlo/logging/__init__.py +42 -0
  65. crawlo-1.5.4/crawlo/logging/config.py +281 -0
  66. crawlo-1.5.4/crawlo/logging/factory.py +176 -0
  67. crawlo-1.5.4/crawlo/logging/manager.py +104 -0
  68. crawlo-1.5.4/crawlo/middleware/__init__.py +101 -0
  69. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/default_header.py +131 -132
  70. crawlo-1.5.4/crawlo/middleware/download_attachment_middleware.py +280 -0
  71. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/download_delay.py +109 -105
  72. crawlo-1.5.4/crawlo/middleware/middleware_manager.py +197 -0
  73. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/offsite.py +122 -123
  74. crawlo-1.5.4/crawlo/middleware/proxy.py +168 -0
  75. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/request_ignore.py +85 -86
  76. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/response_code.py +149 -163
  77. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/response_filter.py +134 -136
  78. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/middleware/retry.py +126 -124
  79. crawlo-1.5.4/crawlo/mode_manager.py +308 -0
  80. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/network/__init__.py +21 -21
  81. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/network/request.py +532 -380
  82. crawlo-1.5.4/crawlo/network/response.py +798 -0
  83. crawlo-1.5.4/crawlo/pipelines/__init__.py +53 -0
  84. crawlo-1.5.4/crawlo/pipelines/base_pipeline.py +692 -0
  85. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/pipelines/bloom_dedup_pipeline.py +136 -156
  86. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/pipelines/console_pipeline.py +39 -39
  87. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/pipelines/csv_pipeline.py +316 -316
  88. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/pipelines/database_dedup_pipeline.py +178 -223
  89. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/pipelines/json_pipeline.py +218 -218
  90. crawlo-1.5.4/crawlo/pipelines/memory_dedup_pipeline.py +93 -0
  91. crawlo-1.5.4/crawlo/pipelines/mongo_pipeline.py +186 -0
  92. crawlo-1.5.4/crawlo/pipelines/mysql_pipeline.py +799 -0
  93. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/pipelines/pipeline_manager.py +99 -75
  94. crawlo-1.5.4/crawlo/pipelines/redis_dedup_pipeline.py +210 -0
  95. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/project.py +347 -297
  96. crawlo-1.5.4/crawlo/queue/__init__.py +10 -0
  97. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/queue/pqueue.py +43 -37
  98. crawlo-1.5.4/crawlo/queue/queue_manager.py +671 -0
  99. crawlo-1.5.4/crawlo/queue/redis_priority_queue.py +487 -0
  100. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/settings/__init__.py +7 -7
  101. crawlo-1.5.4/crawlo/settings/default_settings.py +273 -0
  102. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/settings/setting_manager.py +258 -198
  103. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/spider/__init__.py +714 -639
  104. crawlo-1.5.4/crawlo/stats_collector.py +85 -0
  105. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/subscriber.py +129 -129
  106. crawlo-1.5.4/crawlo/task_manager.py +142 -0
  107. crawlo-1.5.4/crawlo/templates/crawlo.cfg.tmpl +11 -0
  108. crawlo-1.5.4/crawlo/templates/project/__init__.py.tmpl +2 -0
  109. crawlo-1.5.4/crawlo/templates/project/items.py.tmpl +14 -0
  110. crawlo-1.5.4/crawlo/templates/project/middlewares.py.tmpl +39 -0
  111. crawlo-1.5.4/crawlo/templates/project/pipelines.py.tmpl +36 -0
  112. crawlo-1.5.4/crawlo/templates/project/settings.py.tmpl +109 -0
  113. crawlo-1.5.4/crawlo/templates/project/settings_distributed.py.tmpl +152 -0
  114. crawlo-1.5.4/crawlo/templates/project/settings_gentle.py.tmpl +176 -0
  115. crawlo-1.5.4/crawlo/templates/project/settings_high_performance.py.tmpl +177 -0
  116. crawlo-1.5.4/crawlo/templates/project/settings_minimal.py.tmpl +103 -0
  117. crawlo-1.5.4/crawlo/templates/project/settings_simple.py.tmpl +174 -0
  118. crawlo-1.5.4/crawlo/templates/project/spiders/__init__.py.tmpl +10 -0
  119. crawlo-1.5.4/crawlo/templates/run.py.tmpl +24 -0
  120. crawlo-1.5.4/crawlo/templates/spider/spider.py.tmpl +33 -0
  121. crawlo-1.5.4/crawlo/templates/spiders_init.py.tmpl +5 -0
  122. crawlo-1.5.4/crawlo/tools/__init__.py +95 -0
  123. crawlo-1.5.4/crawlo/tools/attachment_downloader.py +335 -0
  124. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/tools/date_tools.py +289 -289
  125. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/tools/distributed_coordinator.py +384 -388
  126. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/tools/scenario_adapter.py +262 -262
  127. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/tools/text_cleaner.py +232 -232
  128. crawlo-1.5.4/crawlo/utils/__init__.py +75 -0
  129. crawlo-1.3.2/crawlo/utils/batch_processor.py → crawlo-1.5.4/crawlo/utils/batch_manager.py +303 -260
  130. crawlo-1.5.4/crawlo/utils/batch_processor.py +165 -0
  131. crawlo-1.5.4/crawlo/utils/config_manager.py +442 -0
  132. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/utils/controlled_spider_mixin.py +439 -439
  133. crawlo-1.5.4/crawlo/utils/database_connection_pool.py +109 -0
  134. crawlo-1.5.4/crawlo/utils/db_helper.py +231 -0
  135. crawlo-1.5.4/crawlo/utils/encoding_helper.py +190 -0
  136. crawlo-1.3.2/crawlo/utils/enhanced_error_handler.py → crawlo-1.5.4/crawlo/utils/error_handler.py +410 -356
  137. crawlo-1.5.4/crawlo/utils/fingerprint.py +122 -0
  138. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/utils/func_tools.py +82 -82
  139. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/utils/large_scale_helper.py +344 -344
  140. crawlo-1.5.4/crawlo/utils/misc.py +129 -0
  141. crawlo-1.5.4/crawlo/utils/mongo_connection_pool.py +221 -0
  142. crawlo-1.5.4/crawlo/utils/mysql_connection_pool.py +451 -0
  143. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/utils/performance_monitor.py +285 -285
  144. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/utils/queue_helper.py +175 -175
  145. crawlo-1.5.4/crawlo/utils/redis_key_manager.py +200 -0
  146. crawlo-1.5.4/crawlo/utils/redis_manager.py +809 -0
  147. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo/utils/request.py +278 -267
  148. crawlo-1.5.4/crawlo/utils/request_serializer.py +426 -0
  149. crawlo-1.5.4/crawlo/utils/resource_manager.py +339 -0
  150. crawlo-1.5.4/crawlo/utils/response_helper.py +113 -0
  151. crawlo-1.5.4/crawlo/utils/selector_helper.py +139 -0
  152. crawlo-1.5.4/crawlo/utils/singleton.py +70 -0
  153. crawlo-1.5.4/crawlo/utils/spider_loader.py +202 -0
  154. crawlo-1.5.4/crawlo/utils/text_helper.py +95 -0
  155. crawlo-1.5.4/crawlo.egg-info/PKG-INFO +997 -0
  156. crawlo-1.5.4/crawlo.egg-info/SOURCES.txt +390 -0
  157. {crawlo-1.3.2 → crawlo-1.5.4}/examples/__init__.py +7 -7
  158. crawlo-1.5.4/examples/attachment_download_example.py +200 -0
  159. {crawlo-1.3.2 → crawlo-1.5.4}/pyproject.toml +2 -2
  160. {crawlo-1.3.2 → crawlo-1.5.4}/requirements.txt +39 -33
  161. {crawlo-1.3.2 → crawlo-1.5.4}/setup.cfg +71 -71
  162. crawlo-1.5.4/tests/RESOURCE_LEAK_TEST_REPORT.md +128 -0
  163. {crawlo-1.3.2 → crawlo-1.5.4}/tests/__init__.py +7 -7
  164. {crawlo-1.3.2 → crawlo-1.5.4}/tests/advanced_tools_example.py +217 -275
  165. {crawlo-1.3.2 → crawlo-1.5.4}/tests/authenticated_proxy_example.py +111 -107
  166. crawlo-1.5.4/tests/baidu_performance_test.py +109 -0
  167. crawlo-1.5.4/tests/baidu_test.py +60 -0
  168. crawlo-1.5.4/tests/bug_check_test.py +251 -0
  169. {crawlo-1.3.2 → crawlo-1.5.4}/tests/cleaners_example.py +160 -160
  170. crawlo-1.5.4/tests/comprehensive_framework_test.py +213 -0
  171. crawlo-1.5.4/tests/comprehensive_test.py +82 -0
  172. crawlo-1.5.4/tests/comprehensive_testing_summary.md +187 -0
  173. {crawlo-1.3.2 → crawlo-1.5.4}/tests/config_validation_demo.py +142 -142
  174. {crawlo-1.3.2 → crawlo-1.5.4}/tests/controlled_spider_example.py +205 -205
  175. {crawlo-1.3.2 → crawlo-1.5.4}/tests/date_tools_example.py +180 -180
  176. crawlo-1.5.4/tests/debug_configure.py +70 -0
  177. crawlo-1.5.4/tests/debug_framework_logger.py +85 -0
  178. crawlo-1.5.4/tests/debug_log_config.py +127 -0
  179. crawlo-1.5.4/tests/debug_log_levels.py +64 -0
  180. {crawlo-1.3.2 → crawlo-1.5.4}/tests/debug_pipelines.py +66 -66
  181. crawlo-1.5.4/tests/detailed_log_test.py +234 -0
  182. crawlo-1.5.4/tests/direct_selector_helper_test.py +97 -0
  183. crawlo-1.5.4/tests/distributed_dedup_test.py +467 -0
  184. crawlo-1.5.4/tests/distributed_test.py +67 -0
  185. crawlo-1.5.4/tests/distributed_test_debug.py +77 -0
  186. {crawlo-1.3.2 → crawlo-1.5.4}/tests/dynamic_loading_example.py +523 -523
  187. {crawlo-1.3.2 → crawlo-1.5.4}/tests/dynamic_loading_test.py +104 -104
  188. {crawlo-1.3.2 → crawlo-1.5.4}/tests/error_handling_example.py +171 -171
  189. crawlo-1.5.4/tests/explain_mysql_update_behavior.py +77 -0
  190. crawlo-1.5.4/tests/final_comprehensive_test.py +152 -0
  191. crawlo-1.5.4/tests/final_log_test.py +261 -0
  192. crawlo-1.5.4/tests/final_validation_test.py +183 -0
  193. crawlo-1.5.4/tests/final_verification.py +383 -0
  194. crawlo-1.5.4/tests/fix_log_test.py +143 -0
  195. crawlo-1.5.4/tests/framework_performance_test.py +203 -0
  196. crawlo-1.5.4/tests/log_buffering_test.py +112 -0
  197. crawlo-1.5.4/tests/log_generation_timing_test.py +154 -0
  198. crawlo-1.5.4/tests/monitor_redis_dedup.sh +72 -0
  199. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -0
  200. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -0
  201. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -0
  202. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/settings.py +85 -0
  203. crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -0
  204. crawlo-1.5.4/tests/ofweek_scrapy/scrapy.cfg +11 -0
  205. crawlo-1.5.4/tests/optimized_performance_test.py +212 -0
  206. crawlo-1.5.4/tests/performance_comparison.py +245 -0
  207. crawlo-1.5.4/tests/queue_blocking_test.py +114 -0
  208. crawlo-1.5.4/tests/queue_test.py +90 -0
  209. {crawlo-1.3.2 → crawlo-1.5.4}/tests/redis_key_validation_demo.py +130 -130
  210. {crawlo-1.3.2 → crawlo-1.5.4}/tests/request_params_example.py +150 -150
  211. {crawlo-1.3.2 → crawlo-1.5.4}/tests/response_improvements_example.py +144 -144
  212. crawlo-1.5.4/tests/run_all_leak_tests.py +156 -0
  213. crawlo-1.5.4/tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  214. crawlo-1.5.4/tests/scrapy_comparison/scrapy_test.py +134 -0
  215. crawlo-1.5.4/tests/simple_cli_test.py +55 -0
  216. crawlo-1.5.4/tests/simple_command_test.py +120 -0
  217. crawlo-1.5.4/tests/simple_crawlo_test.py +127 -0
  218. crawlo-1.5.4/tests/simple_follow_test.py +39 -0
  219. crawlo-1.5.4/tests/simple_log_test2.py +138 -0
  220. crawlo-1.5.4/tests/simple_optimization_test.py +129 -0
  221. crawlo-1.5.4/tests/simple_queue_type_test.py +42 -0
  222. crawlo-1.5.4/tests/simple_response_selector_test.py +95 -0
  223. crawlo-1.5.4/tests/simple_selector_helper_test.py +155 -0
  224. crawlo-1.5.4/tests/simple_selector_test.py +208 -0
  225. crawlo-1.5.4/tests/simple_spider_test.py +50 -0
  226. crawlo-1.5.4/tests/simple_url_test.py +74 -0
  227. crawlo-1.5.4/tests/simulate_mysql_update_test.py +140 -0
  228. crawlo-1.5.4/tests/spider_log_timing_test.py +178 -0
  229. crawlo-1.5.4/tests/test_ack_call_analysis.py +91 -0
  230. crawlo-1.5.4/tests/test_ack_call_fix.py +237 -0
  231. crawlo-1.5.4/tests/test_ack_method_debug.py +177 -0
  232. crawlo-1.5.4/tests/test_ack_method_fix.py +131 -0
  233. crawlo-1.5.4/tests/test_ack_method_verification.py +211 -0
  234. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_advanced_tools.py +153 -148
  235. crawlo-1.5.4/tests/test_all_commands.py +236 -0
  236. crawlo-1.5.4/tests/test_all_pipeline_fingerprints.py +139 -0
  237. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_all_redis_key_configs.py +150 -145
  238. crawlo-1.5.4/tests/test_asyncmy_usage.py +62 -0
  239. crawlo-1.5.4/tests/test_batch_processor.py +184 -0
  240. crawlo-1.5.4/tests/test_browser_leak.py +88 -0
  241. crawlo-1.5.4/tests/test_cache_leak.py +79 -0
  242. crawlo-1.5.4/tests/test_circular_reference_leak.py +82 -0
  243. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_cleaners.py +59 -54
  244. crawlo-1.5.4/tests/test_cli_arguments.py +124 -0
  245. crawlo-1.5.4/tests/test_complete_ack_solution.py +144 -0
  246. crawlo-1.5.4/tests/test_component_factory.py +180 -0
  247. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_config_consistency.py +85 -80
  248. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_config_merge.py +157 -152
  249. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_config_validator.py +187 -182
  250. crawlo-1.5.4/tests/test_controlled_spider_mixin.py +85 -0
  251. crawlo-1.5.4/tests/test_coroutine_leak.py +78 -0
  252. crawlo-1.5.4/tests/test_crawler_process_import.py +44 -0
  253. crawlo-1.5.4/tests/test_crawler_process_spider_modules.py +53 -0
  254. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_crawlo_proxy_integration.py +119 -108
  255. crawlo-1.5.4/tests/test_database_connection_leak.py +92 -0
  256. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_date_tools.py +128 -123
  257. crawlo-1.5.4/tests/test_dedup_fix.py +226 -0
  258. crawlo-1.5.4/tests/test_dedup_pipeline_consistency.py +130 -0
  259. crawlo-1.5.4/tests/test_default_header_middleware.py +319 -0
  260. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_distributed.py +70 -65
  261. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_double_crawlo_fix.py +209 -207
  262. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_double_crawlo_fix_simple.py +129 -124
  263. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_download_delay_middleware.py +226 -221
  264. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_downloader_proxy_compatibility.py +277 -268
  265. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_edge_cases.py +310 -303
  266. crawlo-1.5.4/tests/test_encoding_core.py +62 -0
  267. crawlo-1.5.4/tests/test_encoding_detection.py +132 -0
  268. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_enhanced_error_handler.py +275 -270
  269. crawlo-1.5.4/tests/test_enhanced_error_handler_comprehensive.py +251 -0
  270. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_error_handler_compatibility.py +117 -112
  271. crawlo-1.5.4/tests/test_extract_spider_name.py +60 -0
  272. crawlo-1.5.4/tests/test_factories.py +258 -0
  273. crawlo-1.5.4/tests/test_factory_compatibility.py +202 -0
  274. crawlo-1.5.4/tests/test_file_handle_leak.py +74 -0
  275. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_final_validation.py +158 -153
  276. crawlo-1.5.4/tests/test_fingerprint_consistency.py +141 -0
  277. crawlo-1.5.4/tests/test_fingerprint_simple.py +57 -0
  278. crawlo-1.5.4/tests/test_get_component_logger.py +89 -0
  279. crawlo-1.5.4/tests/test_hash_performance.py +105 -0
  280. crawlo-1.5.4/tests/test_http_connection_leak.py +68 -0
  281. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_integration.py +174 -169
  282. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_item_dedup_redis_key.py +127 -122
  283. crawlo-1.5.4/tests/test_key_format_fix.py +165 -0
  284. crawlo-1.5.4/tests/test_key_format_fix_verification.py +179 -0
  285. crawlo-1.5.4/tests/test_large_scale_helper.py +241 -0
  286. crawlo-1.5.4/tests/test_logging_enhancements.py +380 -0
  287. crawlo-1.5.4/tests/test_logging_final.py +190 -0
  288. crawlo-1.5.4/tests/test_logging_integration.py +318 -0
  289. crawlo-1.5.4/tests/test_logging_system.py +288 -0
  290. crawlo-1.5.4/tests/test_middleware_debug.py +147 -0
  291. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_mode_consistency.py +56 -51
  292. crawlo-1.5.4/tests/test_multi_directory.py +73 -0
  293. crawlo-1.5.4/tests/test_multiple_spider_modules.py +86 -0
  294. crawlo-1.5.4/tests/test_mysql_optimizations.py +305 -0
  295. crawlo-1.5.4/tests/test_mysql_pipeline.py +257 -0
  296. crawlo-1.5.4/tests/test_mysql_pipeline_config.py +170 -0
  297. crawlo-1.5.4/tests/test_mysql_pipeline_error.py +104 -0
  298. crawlo-1.5.4/tests/test_mysql_pipeline_init_log.py +88 -0
  299. crawlo-1.5.4/tests/test_mysql_pipeline_integration.py +138 -0
  300. crawlo-1.5.4/tests/test_mysql_pipeline_refactor.py +149 -0
  301. crawlo-1.5.4/tests/test_mysql_pipeline_refactor_simple.py +91 -0
  302. crawlo-1.5.4/tests/test_mysql_pipeline_robustness.py +201 -0
  303. crawlo-1.5.4/tests/test_mysql_pipeline_types.py +94 -0
  304. crawlo-1.5.4/tests/test_mysql_update_columns.py +99 -0
  305. crawlo-1.5.4/tests/test_offsite_middleware.py +250 -0
  306. crawlo-1.5.4/tests/test_offsite_middleware_simple.py +209 -0
  307. crawlo-1.5.4/tests/test_optimized_selector_naming.py +106 -0
  308. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_parsel.py +34 -29
  309. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_performance.py +331 -327
  310. crawlo-1.5.4/tests/test_performance_monitor.py +121 -0
  311. crawlo-1.5.4/tests/test_pipeline_fingerprint_consistency.py +92 -0
  312. crawlo-1.5.4/tests/test_priority_behavior.py +217 -0
  313. crawlo-1.5.4/tests/test_priority_consistency.py +157 -0
  314. crawlo-1.5.4/tests/test_priority_consistency_fixed.py +255 -0
  315. crawlo-1.5.4/tests/test_processing_queue_debug.py +161 -0
  316. crawlo-1.5.4/tests/test_processing_queue_simple.py +209 -0
  317. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_health_check.py +37 -32
  318. crawlo-1.5.4/tests/test_proxy_middleware.py +223 -0
  319. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_middleware_enhanced.py +217 -216
  320. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_middleware_integration.py +147 -137
  321. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_middleware_refactored.py +212 -184
  322. crawlo-1.5.4/tests/test_proxy_only.py +89 -0
  323. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_providers.py +61 -56
  324. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_stats.py +24 -19
  325. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_proxy_strategies.py +64 -59
  326. crawlo-1.5.4/tests/test_proxy_with_downloader.py +158 -0
  327. crawlo-1.5.4/tests/test_queue_empty_check.py +47 -0
  328. crawlo-1.5.4/tests/test_queue_leak.py +99 -0
  329. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_queue_manager_double_crawlo.py +178 -173
  330. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_queue_manager_redis_key.py +184 -176
  331. crawlo-1.5.4/tests/test_queue_naming.py +160 -0
  332. crawlo-1.5.4/tests/test_queue_scores.py +128 -0
  333. crawlo-1.5.4/tests/test_queue_type.py +112 -0
  334. crawlo-1.5.4/tests/test_queue_type_redis_config_consistency.py +137 -0
  335. crawlo-1.5.4/tests/test_random_headers_default.py +328 -0
  336. crawlo-1.5.4/tests/test_random_headers_necessity.py +314 -0
  337. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_random_user_agent.py +77 -72
  338. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_redis_config.py +33 -28
  339. crawlo-1.5.4/tests/test_redis_connection_leak.py +79 -0
  340. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_redis_connection_pool.py +299 -294
  341. crawlo-1.5.4/tests/test_redis_key_consistency.py +103 -0
  342. crawlo-1.5.4/tests/test_redis_key_integration.py +107 -0
  343. crawlo-1.5.4/tests/test_redis_key_manager.py +126 -0
  344. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_redis_key_naming.py +186 -181
  345. crawlo-1.5.4/tests/test_redis_key_structure.py +56 -0
  346. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_redis_key_validator.py +128 -123
  347. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_redis_queue.py +229 -224
  348. crawlo-1.5.4/tests/test_redis_queue_name_fix.py +181 -0
  349. crawlo-1.5.4/tests/test_redis_queue_type_fallback.py +135 -0
  350. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_request_ignore_middleware.py +187 -182
  351. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_request_params.py +116 -111
  352. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_request_serialization.py +75 -70
  353. crawlo-1.5.4/tests/test_resource_leak_detection.py +150 -0
  354. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_response_code_middleware.py +354 -349
  355. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_response_filter_middleware.py +432 -427
  356. crawlo-1.5.4/tests/test_response_follow.py +110 -0
  357. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_response_improvements.py +157 -152
  358. crawlo-1.5.4/tests/test_response_selector_methods.py +98 -0
  359. crawlo-1.5.4/tests/test_response_url_methods.py +76 -0
  360. crawlo-1.5.4/tests/test_response_urljoin.py +92 -0
  361. crawlo-1.5.4/tests/test_retry_middleware.py +339 -0
  362. crawlo-1.5.4/tests/test_retry_middleware_realistic.py +279 -0
  363. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_scheduler.py +257 -252
  364. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_scheduler_config_update.py +138 -133
  365. crawlo-1.5.4/tests/test_scrapy_style_encoding.py +118 -0
  366. crawlo-1.5.4/tests/test_selector_helper.py +106 -0
  367. crawlo-1.5.4/tests/test_selector_optimizations.py +152 -0
  368. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_simple_response.py +66 -61
  369. crawlo-1.5.4/tests/test_spider_loader.py +55 -0
  370. crawlo-1.5.4/tests/test_spider_loader_comprehensive.py +75 -0
  371. crawlo-1.5.4/tests/test_spider_modules.py +90 -0
  372. crawlo-1.5.4/tests/test_spider_name_in_redis_keys.py +83 -0
  373. crawlo-1.5.4/tests/test_spiders/__init__.py +1 -0
  374. crawlo-1.5.4/tests/test_spiders/test_spider.py +15 -0
  375. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_telecom_spider_redis_key.py +210 -205
  376. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_template_content.py +92 -87
  377. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_template_redis_key.py +139 -134
  378. crawlo-1.5.4/tests/test_thread_leak.py +78 -0
  379. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_tools.py +164 -159
  380. crawlo-1.5.4/tests/test_user_agent_randomness.py +182 -0
  381. {crawlo-1.3.2 → crawlo-1.5.4}/tests/test_user_agents.py +101 -96
  382. crawlo-1.5.4/tests/untested_features_report.md +139 -0
  383. crawlo-1.5.4/tests/verify_debug.py +52 -0
  384. {crawlo-1.3.2 → crawlo-1.5.4}/tests/verify_distributed.py +117 -117
  385. crawlo-1.5.4/tests/verify_log_fix.py +112 -0
  386. crawlo-1.5.4/tests/verify_mysql_warnings.py +110 -0
  387. crawlo-1.3.2/PKG-INFO +0 -1020
  388. crawlo-1.3.2/README.md +0 -970
  389. crawlo-1.3.2/crawlo/__init__.py +0 -64
  390. crawlo-1.3.2/crawlo/__version__.py +0 -1
  391. crawlo-1.3.2/crawlo/core/__init__.py +0 -2
  392. crawlo-1.3.2/crawlo/core/engine.py +0 -366
  393. crawlo-1.3.2/crawlo/core/scheduler.py +0 -257
  394. crawlo-1.3.2/crawlo/crawler.py +0 -1169
  395. crawlo-1.3.2/crawlo/event.py +0 -11
  396. crawlo-1.3.2/crawlo/exceptions.py +0 -82
  397. crawlo-1.3.2/crawlo/extension/__init__.py +0 -39
  398. crawlo-1.3.2/crawlo/extension/log_interval.py +0 -58
  399. crawlo-1.3.2/crawlo/extension/log_stats.py +0 -82
  400. crawlo-1.3.2/crawlo/filters/aioredis_filter.py +0 -234
  401. crawlo-1.3.2/crawlo/middleware/__init__.py +0 -21
  402. crawlo-1.3.2/crawlo/middleware/middleware_manager.py +0 -136
  403. crawlo-1.3.2/crawlo/middleware/proxy.py +0 -386
  404. crawlo-1.3.2/crawlo/middleware/simple_proxy.py +0 -65
  405. crawlo-1.3.2/crawlo/mode_manager.py +0 -188
  406. crawlo-1.3.2/crawlo/network/response.py +0 -360
  407. crawlo-1.3.2/crawlo/pipelines/__init__.py +0 -22
  408. crawlo-1.3.2/crawlo/pipelines/memory_dedup_pipeline.py +0 -116
  409. crawlo-1.3.2/crawlo/pipelines/mongo_pipeline.py +0 -132
  410. crawlo-1.3.2/crawlo/pipelines/mysql_pipeline.py +0 -319
  411. crawlo-1.3.2/crawlo/pipelines/redis_dedup_pipeline.py +0 -167
  412. crawlo-1.3.2/crawlo/queue/queue_manager.py +0 -379
  413. crawlo-1.3.2/crawlo/queue/redis_priority_queue.py +0 -306
  414. crawlo-1.3.2/crawlo/settings/default_settings.py +0 -225
  415. crawlo-1.3.2/crawlo/stats_collector.py +0 -59
  416. crawlo-1.3.2/crawlo/task_manager.py +0 -30
  417. crawlo-1.3.2/crawlo/templates/crawlo.cfg.tmpl +0 -11
  418. crawlo-1.3.2/crawlo/templates/project/__init__.py.tmpl +0 -4
  419. crawlo-1.3.2/crawlo/templates/project/items.py.tmpl +0 -18
  420. crawlo-1.3.2/crawlo/templates/project/middlewares.py.tmpl +0 -119
  421. crawlo-1.3.2/crawlo/templates/project/pipelines.py.tmpl +0 -97
  422. crawlo-1.3.2/crawlo/templates/project/settings.py.tmpl +0 -267
  423. crawlo-1.3.2/crawlo/templates/project/settings_distributed.py.tmpl +0 -180
  424. crawlo-1.3.2/crawlo/templates/project/settings_gentle.py.tmpl +0 -61
  425. crawlo-1.3.2/crawlo/templates/project/settings_high_performance.py.tmpl +0 -131
  426. crawlo-1.3.2/crawlo/templates/project/settings_minimal.py.tmpl +0 -35
  427. crawlo-1.3.2/crawlo/templates/project/settings_simple.py.tmpl +0 -102
  428. crawlo-1.3.2/crawlo/templates/project/spiders/__init__.py.tmpl +0 -6
  429. crawlo-1.3.2/crawlo/templates/run.py.tmpl +0 -39
  430. crawlo-1.3.2/crawlo/templates/spider/spider.py.tmpl +0 -144
  431. crawlo-1.3.2/crawlo/tools/__init__.py +0 -201
  432. crawlo-1.3.2/crawlo/tools/anti_crawler.py +0 -269
  433. crawlo-1.3.2/crawlo/tools/authenticated_proxy.py +0 -241
  434. crawlo-1.3.2/crawlo/tools/data_formatter.py +0 -226
  435. crawlo-1.3.2/crawlo/tools/data_validator.py +0 -181
  436. crawlo-1.3.2/crawlo/tools/encoding_converter.py +0 -127
  437. crawlo-1.3.2/crawlo/tools/request_tools.py +0 -83
  438. crawlo-1.3.2/crawlo/tools/retry_mechanism.py +0 -224
  439. crawlo-1.3.2/crawlo/utils/__init__.py +0 -35
  440. crawlo-1.3.2/crawlo/utils/db_helper.py +0 -344
  441. crawlo-1.3.2/crawlo/utils/env_config.py +0 -143
  442. crawlo-1.3.2/crawlo/utils/error_handler.py +0 -124
  443. crawlo-1.3.2/crawlo/utils/large_scale_config.py +0 -287
  444. crawlo-1.3.2/crawlo/utils/log.py +0 -147
  445. crawlo-1.3.2/crawlo/utils/redis_connection_pool.py +0 -352
  446. crawlo-1.3.2/crawlo/utils/redis_key_validator.py +0 -199
  447. crawlo-1.3.2/crawlo/utils/request_serializer.py +0 -219
  448. crawlo-1.3.2/crawlo/utils/spider_loader.py +0 -62
  449. crawlo-1.3.2/crawlo/utils/system.py +0 -11
  450. crawlo-1.3.2/crawlo/utils/tools.py +0 -5
  451. crawlo-1.3.2/crawlo/utils/url.py +0 -40
  452. crawlo-1.3.2/crawlo.egg-info/PKG-INFO +0 -1020
  453. crawlo-1.3.2/crawlo.egg-info/SOURCES.txt +0 -226
  454. crawlo-1.3.2/tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  455. crawlo-1.3.2/tests/env_config_example.py +0 -134
  456. crawlo-1.3.2/tests/test_authenticated_proxy.py +0 -142
  457. crawlo-1.3.2/tests/test_comprehensive.py +0 -147
  458. crawlo-1.3.2/tests/test_default_header_middleware.py +0 -159
  459. crawlo-1.3.2/tests/test_dynamic_downloaders_proxy.py +0 -125
  460. crawlo-1.3.2/tests/test_dynamic_proxy.py +0 -93
  461. crawlo-1.3.2/tests/test_dynamic_proxy_config.py +0 -147
  462. crawlo-1.3.2/tests/test_dynamic_proxy_real.py +0 -110
  463. crawlo-1.3.2/tests/test_env_config.py +0 -122
  464. crawlo-1.3.2/tests/test_framework_env_usage.py +0 -104
  465. crawlo-1.3.2/tests/test_offsite_middleware.py +0 -222
  466. crawlo-1.3.2/tests/test_proxy_api.py +0 -265
  467. crawlo-1.3.2/tests/test_proxy_middleware.py +0 -122
  468. crawlo-1.3.2/tests/test_real_scenario_proxy.py +0 -196
  469. crawlo-1.3.2/tests/test_retry_middleware.py +0 -242
  470. crawlo-1.3.2/tests/tools_example.py +0 -261
  471. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo.egg-info/dependency_links.txt +0 -0
  472. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo.egg-info/entry_points.txt +0 -0
  473. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo.egg-info/requires.txt +0 -0
  474. {crawlo-1.3.2 → crawlo-1.5.4}/crawlo.egg-info/top_level.txt +0 -0
  475. {crawlo-1.3.2/crawlo/queue → crawlo-1.5.4/tests/ofweek_scrapy/ofweek_scrapy}/__init__.py +0 -0
@@ -1,23 +1,23 @@
1
- MIT License
2
-
3
- Modifications:
4
-
5
- Copyright (c) 2020 crawl-coder <2251018029@qq.com>
6
-
7
- Permission is hereby granted, free of charge, to any person obtaining a copy
8
- of this software and associated documentation files (the "Software"), to deal
9
- in the Software without restriction, including without limitation the rights
10
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- copies of the Software, and to permit persons to whom the Software is
12
- furnished to do so, subject to the following conditions:
13
-
14
- The above copyright notice and this permission notice shall be included in all
15
- copies or substantial portions of the Software.
16
-
17
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1
+ MIT License
2
+
3
+ Modifications:
4
+
5
+ Copyright (c) 2020 crawl-coder <2251018029@qq.com>
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all
15
+ copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
23
  SOFTWARE.
@@ -1,17 +1,17 @@
1
- include README.md
2
- include LICENSE
3
- include requirements.txt # 如果根目录有全局requirements.txt
4
- include VERSION # 如果根目录有全局VERSION文件
5
-
6
- # 包内文件包含
7
- recursive-include crawlo/utils/js *
8
- recursive-include crawlo/templates *
9
-
10
- # 测试文件(如果需要在分发包中包含测试)
11
- recursive-include tests *
12
-
13
- # 排除项
14
- global-exclude __pycache__ *.py[cod] .DS_Store *.so
15
- global-exclude *.bak *.swp *.orig *.rej
16
- prune samples # 排除示例目录
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt # 如果根目录有全局requirements.txt
4
+ include VERSION # 如果根目录有全局VERSION文件
5
+
6
+ # 包内文件包含
7
+ recursive-include crawlo/utils/js *
8
+ recursive-include crawlo/templates *
9
+
10
+ # 测试文件(如果需要在分发包中包含测试)
11
+ recursive-include tests *
12
+
13
+ # 排除项
14
+ global-exclude __pycache__ *.py[cod] .DS_Store *.so
15
+ global-exclude *.bak *.swp *.orig *.rej
16
+ prune samples # 排除示例目录
17
17
  prune docs # 排除文档目录