crawlo 1.4.5__tar.gz → 1.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (404)
  1. {crawlo-1.4.5 → crawlo-1.4.7}/LICENSE +22 -22
  2. {crawlo-1.4.5 → crawlo-1.4.7}/MANIFEST.in +16 -16
  3. crawlo-1.4.7/PKG-INFO +689 -0
  4. crawlo-1.4.7/README.md +639 -0
  5. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/__init__.py +90 -89
  6. crawlo-1.4.7/crawlo/__version__.py +1 -0
  7. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/cli.py +75 -75
  8. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/__init__.py +14 -14
  9. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/check.py +594 -594
  10. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/genspider.py +186 -186
  11. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/help.py +140 -138
  12. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/list.py +155 -155
  13. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/run.py +379 -341
  14. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/startproject.py +460 -460
  15. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/stats.py +187 -187
  16. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/commands/utils.py +196 -196
  17. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/config.py +320 -312
  18. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/config_validator.py +277 -277
  19. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/core/__init__.py +52 -52
  20. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/core/engine.py +451 -438
  21. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/core/processor.py +47 -47
  22. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/core/scheduler.py +290 -291
  23. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/crawler.py +698 -657
  24. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/data/__init__.py +5 -5
  25. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/data/user_agents.py +194 -194
  26. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/__init__.py +280 -276
  27. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/aiohttp_downloader.py +233 -233
  28. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/cffi_downloader.py +250 -245
  29. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/httpx_downloader.py +265 -259
  30. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/hybrid_downloader.py +212 -212
  31. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/playwright_downloader.py +425 -402
  32. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/downloader/selenium_downloader.py +486 -472
  33. crawlo-1.4.7/crawlo/event.py +45 -0
  34. crawlo-1.4.7/crawlo/exceptions.py +215 -0
  35. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/__init__.py +65 -64
  36. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/health_check.py +141 -141
  37. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/log_interval.py +94 -94
  38. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/log_stats.py +70 -70
  39. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/logging_extension.py +53 -61
  40. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/memory_monitor.py +104 -104
  41. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/performance_profiler.py +133 -133
  42. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/extension/request_recorder.py +107 -107
  43. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/factories/__init__.py +27 -27
  44. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/factories/base.py +68 -68
  45. crawlo-1.4.7/crawlo/factories/crawler.py +105 -0
  46. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/factories/registry.py +84 -84
  47. crawlo-1.4.7/crawlo/factories/utils.py +135 -0
  48. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/filters/__init__.py +170 -153
  49. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/filters/aioredis_filter.py +348 -264
  50. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/filters/memory_filter.py +261 -276
  51. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/framework.py +306 -292
  52. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/initialization/__init__.py +44 -44
  53. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/initialization/built_in.py +391 -434
  54. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/initialization/context.py +141 -141
  55. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/initialization/core.py +240 -194
  56. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/initialization/phases.py +230 -149
  57. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/initialization/registry.py +143 -145
  58. crawlo-1.4.7/crawlo/initialization/utils.py +49 -0
  59. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/interfaces.py +23 -23
  60. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/items/__init__.py +23 -23
  61. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/items/base.py +23 -23
  62. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/items/fields.py +52 -52
  63. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/items/items.py +104 -104
  64. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/logging/__init__.py +42 -46
  65. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/logging/config.py +277 -197
  66. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/logging/factory.py +175 -171
  67. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/logging/manager.py +104 -112
  68. crawlo-1.4.7/crawlo/middleware/__init__.py +87 -0
  69. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/default_header.py +132 -132
  70. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/download_delay.py +104 -104
  71. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/middleware_manager.py +142 -142
  72. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/offsite.py +123 -123
  73. crawlo-1.4.7/crawlo/middleware/proxy.py +209 -0
  74. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/request_ignore.py +86 -86
  75. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/response_code.py +150 -150
  76. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/response_filter.py +136 -136
  77. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/middleware/retry.py +124 -124
  78. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/mode_manager.py +287 -253
  79. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/network/__init__.py +21 -21
  80. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/network/request.py +375 -379
  81. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/network/response.py +569 -664
  82. crawlo-1.4.7/crawlo/pipelines/__init__.py +53 -0
  83. crawlo-1.4.7/crawlo/pipelines/base_pipeline.py +452 -0
  84. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  85. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/console_pipeline.py +39 -39
  86. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/csv_pipeline.py +316 -316
  87. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/database_dedup_pipeline.py +197 -197
  88. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/json_pipeline.py +218 -218
  89. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  90. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/mongo_pipeline.py +140 -132
  91. crawlo-1.4.7/crawlo/pipelines/mysql_pipeline.py +470 -0
  92. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/pipeline_manager.py +100 -100
  93. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  94. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/project.py +347 -347
  95. crawlo-1.4.7/crawlo/queue/__init__.py +10 -0
  96. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/queue/pqueue.py +38 -38
  97. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/queue/queue_manager.py +591 -525
  98. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/queue/redis_priority_queue.py +519 -370
  99. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/settings/__init__.py +7 -7
  100. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/settings/default_settings.py +285 -270
  101. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/settings/setting_manager.py +219 -219
  102. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/spider/__init__.py +657 -657
  103. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/stats_collector.py +82 -73
  104. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/subscriber.py +129 -129
  105. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/task_manager.py +138 -138
  106. crawlo-1.4.7/crawlo/templates/crawlo.cfg.tmpl +11 -0
  107. crawlo-1.4.7/crawlo/templates/project/__init__.py.tmpl +2 -0
  108. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/items.py.tmpl +13 -17
  109. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/middlewares.py.tmpl +38 -38
  110. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/pipelines.py.tmpl +35 -36
  111. crawlo-1.4.7/crawlo/templates/project/settings.py.tmpl +110 -0
  112. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/settings_distributed.py.tmpl +156 -161
  113. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/settings_gentle.py.tmpl +170 -171
  114. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/settings_high_performance.py.tmpl +171 -172
  115. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/settings_minimal.py.tmpl +99 -77
  116. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/settings_simple.py.tmpl +168 -169
  117. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  118. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/templates/run.py.tmpl +23 -30
  119. crawlo-1.4.7/crawlo/templates/spider/spider.py.tmpl +33 -0
  120. crawlo-1.4.7/crawlo/templates/spiders_init.py.tmpl +5 -0
  121. crawlo-1.4.7/crawlo/tools/__init__.py +87 -0
  122. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/tools/date_tools.py +289 -289
  123. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/tools/distributed_coordinator.py +384 -384
  124. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/tools/scenario_adapter.py +262 -262
  125. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/tools/text_cleaner.py +232 -232
  126. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/__init__.py +50 -50
  127. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/batch_processor.py +276 -259
  128. crawlo-1.4.7/crawlo/utils/config_manager.py +442 -0
  129. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/controlled_spider_mixin.py +439 -439
  130. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/db_helper.py +250 -244
  131. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/error_handler.py +410 -410
  132. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/fingerprint.py +121 -121
  133. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/func_tools.py +82 -82
  134. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/large_scale_helper.py +344 -344
  135. crawlo-1.4.7/crawlo/utils/leak_detector.py +335 -0
  136. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/log.py +79 -79
  137. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/misc.py +81 -81
  138. crawlo-1.4.7/crawlo/utils/mongo_connection_pool.py +157 -0
  139. crawlo-1.4.7/crawlo/utils/mysql_connection_pool.py +197 -0
  140. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/performance_monitor.py +285 -285
  141. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/queue_helper.py +175 -175
  142. crawlo-1.4.7/crawlo/utils/redis_checker.py +91 -0
  143. crawlo-1.4.7/crawlo/utils/redis_connection_pool.py +579 -0
  144. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/redis_key_validator.py +198 -198
  145. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/request.py +278 -256
  146. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/request_serializer.py +225 -225
  147. crawlo-1.4.7/crawlo/utils/resource_manager.py +337 -0
  148. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/selector_helper.py +137 -137
  149. crawlo-1.4.7/crawlo/utils/singleton.py +70 -0
  150. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/spider_loader.py +201 -201
  151. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo/utils/text_helper.py +94 -94
  152. crawlo-1.4.5/crawlo/utils/url.py → crawlo-1.4.7/crawlo/utils/url_utils.py +39 -39
  153. crawlo-1.4.7/crawlo.egg-info/PKG-INFO +689 -0
  154. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo.egg-info/SOURCES.txt +30 -30
  155. {crawlo-1.4.5 → crawlo-1.4.7}/examples/__init__.py +7 -7
  156. {crawlo-1.4.5 → crawlo-1.4.7}/pyproject.toml +2 -2
  157. {crawlo-1.4.5 → crawlo-1.4.7}/requirements.txt +33 -33
  158. {crawlo-1.4.5 → crawlo-1.4.7}/setup.cfg +71 -71
  159. {crawlo-1.4.5 → crawlo-1.4.7}/tests/__init__.py +7 -7
  160. {crawlo-1.4.5 → crawlo-1.4.7}/tests/advanced_tools_example.py +217 -275
  161. {crawlo-1.4.5 → crawlo-1.4.7}/tests/authenticated_proxy_example.py +110 -106
  162. {crawlo-1.4.5 → crawlo-1.4.7}/tests/baidu_performance_test.py +108 -108
  163. {crawlo-1.4.5 → crawlo-1.4.7}/tests/baidu_test.py +59 -59
  164. {crawlo-1.4.5 → crawlo-1.4.7}/tests/bug_check_test.py +250 -250
  165. {crawlo-1.4.5 → crawlo-1.4.7}/tests/cleaners_example.py +160 -160
  166. {crawlo-1.4.5 → crawlo-1.4.7}/tests/comprehensive_framework_test.py +212 -212
  167. {crawlo-1.4.5 → crawlo-1.4.7}/tests/comprehensive_test.py +81 -81
  168. {crawlo-1.4.5 → crawlo-1.4.7}/tests/comprehensive_testing_summary.md +186 -186
  169. {crawlo-1.4.5 → crawlo-1.4.7}/tests/config_validation_demo.py +142 -142
  170. {crawlo-1.4.5 → crawlo-1.4.7}/tests/controlled_spider_example.py +205 -205
  171. {crawlo-1.4.5 → crawlo-1.4.7}/tests/date_tools_example.py +180 -180
  172. {crawlo-1.4.5 → crawlo-1.4.7}/tests/debug_configure.py +69 -69
  173. {crawlo-1.4.5 → crawlo-1.4.7}/tests/debug_framework_logger.py +84 -84
  174. {crawlo-1.4.5 → crawlo-1.4.7}/tests/debug_log_config.py +126 -126
  175. {crawlo-1.4.5 → crawlo-1.4.7}/tests/debug_log_levels.py +63 -63
  176. {crawlo-1.4.5 → crawlo-1.4.7}/tests/debug_pipelines.py +66 -66
  177. {crawlo-1.4.5 → crawlo-1.4.7}/tests/detailed_log_test.py +233 -233
  178. {crawlo-1.4.5 → crawlo-1.4.7}/tests/direct_selector_helper_test.py +96 -96
  179. crawlo-1.4.7/tests/distributed_dedup_test.py +467 -0
  180. {crawlo-1.4.5 → crawlo-1.4.7}/tests/distributed_test.py +66 -66
  181. {crawlo-1.4.5 → crawlo-1.4.7}/tests/distributed_test_debug.py +76 -76
  182. {crawlo-1.4.5 → crawlo-1.4.7}/tests/dynamic_loading_example.py +523 -523
  183. {crawlo-1.4.5 → crawlo-1.4.7}/tests/dynamic_loading_test.py +104 -104
  184. {crawlo-1.4.5 → crawlo-1.4.7}/tests/error_handling_example.py +171 -171
  185. crawlo-1.4.7/tests/explain_mysql_update_behavior.py +77 -0
  186. {crawlo-1.4.5 → crawlo-1.4.7}/tests/final_comprehensive_test.py +151 -151
  187. {crawlo-1.4.5 → crawlo-1.4.7}/tests/final_log_test.py +260 -260
  188. {crawlo-1.4.5 → crawlo-1.4.7}/tests/final_validation_test.py +182 -182
  189. {crawlo-1.4.5 → crawlo-1.4.7}/tests/fix_log_test.py +142 -142
  190. {crawlo-1.4.5 → crawlo-1.4.7}/tests/framework_performance_test.py +202 -202
  191. {crawlo-1.4.5 → crawlo-1.4.7}/tests/log_buffering_test.py +111 -111
  192. {crawlo-1.4.5 → crawlo-1.4.7}/tests/log_generation_timing_test.py +153 -153
  193. crawlo-1.4.7/tests/monitor_redis_dedup.sh +72 -0
  194. {crawlo-1.4.5 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  195. {crawlo-1.4.5 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  196. {crawlo-1.4.5 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  197. {crawlo-1.4.5 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  198. {crawlo-1.4.5 → crawlo-1.4.7}/tests/ofweek_scrapy/scrapy.cfg +11 -11
  199. {crawlo-1.4.5 → crawlo-1.4.7}/tests/optimized_performance_test.py +211 -211
  200. {crawlo-1.4.5 → crawlo-1.4.7}/tests/performance_comparison.py +244 -244
  201. {crawlo-1.4.5 → crawlo-1.4.7}/tests/queue_blocking_test.py +113 -113
  202. {crawlo-1.4.5 → crawlo-1.4.7}/tests/queue_test.py +89 -89
  203. {crawlo-1.4.5 → crawlo-1.4.7}/tests/redis_key_validation_demo.py +130 -130
  204. {crawlo-1.4.5 → crawlo-1.4.7}/tests/request_params_example.py +150 -150
  205. {crawlo-1.4.5 → crawlo-1.4.7}/tests/response_improvements_example.py +144 -144
  206. {crawlo-1.4.5 → crawlo-1.4.7}/tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  207. {crawlo-1.4.5 → crawlo-1.4.7}/tests/scrapy_comparison/scrapy_test.py +133 -133
  208. crawlo-1.4.7/tests/simple_cli_test.py +55 -0
  209. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_command_test.py +119 -119
  210. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_crawlo_test.py +126 -126
  211. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_follow_test.py +38 -38
  212. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_log_test2.py +137 -137
  213. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_optimization_test.py +128 -128
  214. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_queue_type_test.py +41 -41
  215. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_response_selector_test.py +94 -94
  216. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_selector_helper_test.py +154 -154
  217. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_selector_test.py +207 -207
  218. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_spider_test.py +49 -49
  219. {crawlo-1.4.5 → crawlo-1.4.7}/tests/simple_url_test.py +73 -73
  220. crawlo-1.4.7/tests/simulate_mysql_update_test.py +140 -0
  221. {crawlo-1.4.5 → crawlo-1.4.7}/tests/spider_log_timing_test.py +177 -177
  222. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_advanced_tools.py +148 -148
  223. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_all_commands.py +230 -230
  224. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_all_pipeline_fingerprints.py +133 -133
  225. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_all_redis_key_configs.py +145 -145
  226. crawlo-1.4.7/tests/test_asyncmy_usage.py +57 -0
  227. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_batch_processor.py +178 -178
  228. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_cleaners.py +54 -54
  229. crawlo-1.4.7/tests/test_cli_arguments.py +119 -0
  230. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_component_factory.py +174 -174
  231. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_config_consistency.py +80 -80
  232. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_config_merge.py +152 -152
  233. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_config_validator.py +182 -182
  234. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_controlled_spider_mixin.py +79 -79
  235. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_crawler_process_import.py +38 -38
  236. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_crawler_process_spider_modules.py +47 -47
  237. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_crawlo_proxy_integration.py +114 -108
  238. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_date_tools.py +123 -123
  239. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_dedup_fix.py +220 -220
  240. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_dedup_pipeline_consistency.py +124 -124
  241. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_default_header_middleware.py +313 -313
  242. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_distributed.py +65 -65
  243. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_double_crawlo_fix.py +204 -204
  244. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_double_crawlo_fix_simple.py +124 -124
  245. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_download_delay_middleware.py +221 -221
  246. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_downloader_proxy_compatibility.py +272 -268
  247. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_edge_cases.py +305 -305
  248. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_encoding_core.py +56 -56
  249. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_encoding_detection.py +126 -126
  250. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_enhanced_error_handler.py +270 -270
  251. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_enhanced_error_handler_comprehensive.py +245 -245
  252. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_error_handler_compatibility.py +112 -112
  253. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_factories.py +252 -252
  254. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_factory_compatibility.py +196 -196
  255. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_final_validation.py +153 -153
  256. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_fingerprint_consistency.py +135 -135
  257. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_fingerprint_simple.py +51 -51
  258. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_get_component_logger.py +83 -83
  259. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_hash_performance.py +99 -99
  260. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_integration.py +169 -169
  261. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_item_dedup_redis_key.py +122 -122
  262. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_large_scale_helper.py +235 -235
  263. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_logging_enhancements.py +374 -374
  264. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_logging_final.py +184 -184
  265. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_logging_integration.py +312 -312
  266. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_logging_system.py +282 -282
  267. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_middleware_debug.py +141 -141
  268. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_mode_consistency.py +51 -51
  269. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_multi_directory.py +67 -67
  270. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_multiple_spider_modules.py +80 -80
  271. crawlo-1.4.7/tests/test_mysql_pipeline_config.py +165 -0
  272. crawlo-1.4.7/tests/test_mysql_pipeline_error.py +99 -0
  273. crawlo-1.4.7/tests/test_mysql_pipeline_init_log.py +83 -0
  274. crawlo-1.4.7/tests/test_mysql_pipeline_integration.py +133 -0
  275. crawlo-1.4.7/tests/test_mysql_pipeline_refactor.py +144 -0
  276. crawlo-1.4.7/tests/test_mysql_pipeline_refactor_simple.py +86 -0
  277. crawlo-1.4.7/tests/test_mysql_pipeline_robustness.py +196 -0
  278. crawlo-1.4.7/tests/test_mysql_pipeline_types.py +89 -0
  279. crawlo-1.4.7/tests/test_mysql_update_columns.py +94 -0
  280. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_offsite_middleware.py +244 -244
  281. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_offsite_middleware_simple.py +203 -203
  282. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_optimized_selector_naming.py +100 -100
  283. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_parsel.py +29 -29
  284. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_performance.py +327 -327
  285. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_performance_monitor.py +115 -115
  286. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_pipeline_fingerprint_consistency.py +86 -86
  287. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_priority_behavior.py +211 -211
  288. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_priority_consistency.py +151 -151
  289. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_priority_consistency_fixed.py +249 -249
  290. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_health_check.py +32 -32
  291. crawlo-1.4.7/tests/test_proxy_middleware.py +218 -0
  292. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_middleware_enhanced.py +212 -216
  293. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_middleware_integration.py +142 -137
  294. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_middleware_refactored.py +207 -184
  295. crawlo-1.4.7/tests/test_proxy_only.py +84 -0
  296. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_providers.py +56 -56
  297. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_stats.py +19 -19
  298. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_proxy_strategies.py +59 -59
  299. crawlo-1.4.7/tests/test_proxy_with_downloader.py +153 -0
  300. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_queue_empty_check.py +41 -41
  301. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_queue_manager_double_crawlo.py +173 -173
  302. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_queue_manager_redis_key.py +179 -179
  303. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_queue_naming.py +154 -154
  304. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_queue_type.py +106 -106
  305. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_queue_type_redis_config_consistency.py +130 -130
  306. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_random_headers_default.py +322 -322
  307. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_random_headers_necessity.py +308 -308
  308. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_random_user_agent.py +72 -72
  309. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_config.py +28 -28
  310. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_connection_pool.py +294 -294
  311. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_key_naming.py +181 -181
  312. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_key_validator.py +123 -123
  313. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_queue.py +224 -224
  314. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_queue_name_fix.py +175 -175
  315. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_redis_queue_type_fallback.py +129 -129
  316. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_request_ignore_middleware.py +182 -182
  317. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_request_params.py +111 -111
  318. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_request_serialization.py +70 -70
  319. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_code_middleware.py +349 -349
  320. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_filter_middleware.py +427 -427
  321. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_follow.py +104 -104
  322. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_improvements.py +152 -152
  323. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_selector_methods.py +92 -92
  324. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_url_methods.py +70 -70
  325. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_response_urljoin.py +86 -86
  326. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_retry_middleware.py +333 -333
  327. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_retry_middleware_realistic.py +273 -273
  328. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_scheduler.py +252 -252
  329. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_scheduler_config_update.py +133 -133
  330. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_scrapy_style_encoding.py +112 -112
  331. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_selector_helper.py +100 -100
  332. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_selector_optimizations.py +146 -146
  333. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_simple_response.py +61 -61
  334. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_spider_loader.py +49 -49
  335. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_spider_loader_comprehensive.py +69 -69
  336. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_spider_modules.py +84 -84
  337. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_spiders/test_spider.py +9 -9
  338. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_telecom_spider_redis_key.py +205 -205
  339. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_template_content.py +87 -87
  340. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_template_redis_key.py +134 -134
  341. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_tools.py +159 -159
  342. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_user_agent_randomness.py +176 -176
  343. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_user_agents.py +96 -96
  344. {crawlo-1.4.5 → crawlo-1.4.7}/tests/untested_features_report.md +138 -138
  345. {crawlo-1.4.5 → crawlo-1.4.7}/tests/verify_debug.py +51 -51
  346. {crawlo-1.4.5 → crawlo-1.4.7}/tests/verify_distributed.py +117 -117
  347. {crawlo-1.4.5 → crawlo-1.4.7}/tests/verify_log_fix.py +111 -111
  348. crawlo-1.4.7/tests/verify_mysql_warnings.py +110 -0
  349. crawlo-1.4.5/PKG-INFO +0 -329
  350. crawlo-1.4.5/README.md +0 -279
  351. crawlo-1.4.5/crawlo/__version__.py +0 -1
  352. crawlo-1.4.5/crawlo/event.py +0 -11
  353. crawlo-1.4.5/crawlo/exceptions.py +0 -82
  354. crawlo-1.4.5/crawlo/factories/crawler.py +0 -104
  355. crawlo-1.4.5/crawlo/logging/async_handler.py +0 -181
  356. crawlo-1.4.5/crawlo/logging/monitor.py +0 -153
  357. crawlo-1.4.5/crawlo/logging/sampler.py +0 -167
  358. crawlo-1.4.5/crawlo/middleware/__init__.py +0 -24
  359. crawlo-1.4.5/crawlo/middleware/proxy.py +0 -386
  360. crawlo-1.4.5/crawlo/middleware/simple_proxy.py +0 -65
  361. crawlo-1.4.5/crawlo/pipelines/__init__.py +0 -22
  362. crawlo-1.4.5/crawlo/pipelines/mysql_pipeline.py +0 -326
  363. crawlo-1.4.5/crawlo/templates/crawlo.cfg.tmpl +0 -11
  364. crawlo-1.4.5/crawlo/templates/project/__init__.py.tmpl +0 -4
  365. crawlo-1.4.5/crawlo/templates/project/settings.py.tmpl +0 -157
  366. crawlo-1.4.5/crawlo/templates/spider/spider.py.tmpl +0 -144
  367. crawlo-1.4.5/crawlo/templates/spiders_init.py.tmpl +0 -10
  368. crawlo-1.4.5/crawlo/tools/__init__.py +0 -190
  369. crawlo-1.4.5/crawlo/tools/authenticated_proxy.py +0 -241
  370. crawlo-1.4.5/crawlo/tools/data_formatter.py +0 -226
  371. crawlo-1.4.5/crawlo/tools/data_validator.py +0 -181
  372. crawlo-1.4.5/crawlo/tools/encoding_converter.py +0 -127
  373. crawlo-1.4.5/crawlo/tools/network_diagnostic.py +0 -365
  374. crawlo-1.4.5/crawlo/tools/request_tools.py +0 -83
  375. crawlo-1.4.5/crawlo/tools/retry_mechanism.py +0 -224
  376. crawlo-1.4.5/crawlo/utils/env_config.py +0 -143
  377. crawlo-1.4.5/crawlo/utils/large_scale_config.py +0 -287
  378. crawlo-1.4.5/crawlo/utils/redis_connection_pool.py +0 -389
  379. crawlo-1.4.5/crawlo/utils/system.py +0 -11
  380. crawlo-1.4.5/crawlo/utils/tools.py +0 -5
  381. crawlo-1.4.5/crawlo.egg-info/PKG-INFO +0 -329
  382. crawlo-1.4.5/tests/env_config_example.py +0 -134
  383. crawlo-1.4.5/tests/ofweek_scrapy/ofweek_scrapy/__init__.py +0 -0
  384. crawlo-1.4.5/tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  385. crawlo-1.4.5/tests/test_authenticated_proxy.py +0 -142
  386. crawlo-1.4.5/tests/test_comprehensive.py +0 -147
  387. crawlo-1.4.5/tests/test_dynamic_downloaders_proxy.py +0 -125
  388. crawlo-1.4.5/tests/test_dynamic_proxy.py +0 -93
  389. crawlo-1.4.5/tests/test_dynamic_proxy_config.py +0 -147
  390. crawlo-1.4.5/tests/test_dynamic_proxy_real.py +0 -110
  391. crawlo-1.4.5/tests/test_env_config.py +0 -122
  392. crawlo-1.4.5/tests/test_framework_env_usage.py +0 -104
  393. crawlo-1.4.5/tests/test_large_scale_config.py +0 -113
  394. crawlo-1.4.5/tests/test_proxy_api.py +0 -265
  395. crawlo-1.4.5/tests/test_proxy_middleware.py +0 -122
  396. crawlo-1.4.5/tests/test_real_scenario_proxy.py +0 -196
  397. crawlo-1.4.5/tests/tools_example.py +0 -261
  398. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo.egg-info/dependency_links.txt +0 -0
  399. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo.egg-info/entry_points.txt +0 -0
  400. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo.egg-info/requires.txt +0 -0
  401. {crawlo-1.4.5 → crawlo-1.4.7}/crawlo.egg-info/top_level.txt +0 -0
  402. {crawlo-1.4.5/crawlo/queue → crawlo-1.4.7/tests/ofweek_scrapy/ofweek_scrapy}/__init__.py +0 -0
  403. {crawlo-1.4.5 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +0 -0
  404. {crawlo-1.4.5 → crawlo-1.4.7}/tests/test_spiders/__init__.py +0 -0
@@ -1,23 +1,23 @@
1
- MIT License
2
-
3
- Modifications:
4
-
5
- Copyright (c) 2020 crawl-coder <2251018029@qq.com>
6
-
7
- Permission is hereby granted, free of charge, to any person obtaining a copy
8
- of this software and associated documentation files (the "Software"), to deal
9
- in the Software without restriction, including without limitation the rights
10
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- copies of the Software, and to permit persons to whom the Software is
12
- furnished to do so, subject to the following conditions:
13
-
14
- The above copyright notice and this permission notice shall be included in all
15
- copies or substantial portions of the Software.
16
-
17
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1
+ MIT License
2
+
3
+ Modifications:
4
+
5
+ Copyright (c) 2020 crawl-coder <2251018029@qq.com>
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all
15
+ copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
23
  SOFTWARE.
@@ -1,17 +1,17 @@
1
- include README.md
2
- include LICENSE
3
- include requirements.txt # 如果根目录有全局requirements.txt
4
- include VERSION # 如果根目录有全局VERSION文件
5
-
6
- # 包内文件包含
7
- recursive-include crawlo/utils/js *
8
- recursive-include crawlo/templates *
9
-
10
- # 测试文件(如果需要在分发包中包含测试)
11
- recursive-include tests *
12
-
13
- # 排除项
14
- global-exclude __pycache__ *.py[cod] .DS_Store *.so
15
- global-exclude *.bak *.swp *.orig *.rej
16
- prune samples # 排除示例目录
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt # 如果根目录有全局requirements.txt
4
+ include VERSION # 如果根目录有全局VERSION文件
5
+
6
+ # 包内文件包含
7
+ recursive-include crawlo/utils/js *
8
+ recursive-include crawlo/templates *
9
+
10
+ # 测试文件(如果需要在分发包中包含测试)
11
+ recursive-include tests *
12
+
13
+ # 排除项
14
+ global-exclude __pycache__ *.py[cod] .DS_Store *.so
15
+ global-exclude *.bak *.swp *.orig *.rej
16
+ prune samples # 排除示例目录
17
17
  prune docs # 排除文档目录