crawlo 1.4.6__tar.gz → 1.4.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (398)
  1. {crawlo-1.4.6 → crawlo-1.4.7}/LICENSE +22 -22
  2. {crawlo-1.4.6 → crawlo-1.4.7}/MANIFEST.in +16 -16
  3. crawlo-1.4.7/PKG-INFO +689 -0
  4. crawlo-1.4.7/README.md +639 -0
  5. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/__init__.py +90 -89
  6. crawlo-1.4.7/crawlo/__version__.py +1 -0
  7. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/cli.py +75 -75
  8. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/__init__.py +14 -14
  9. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/check.py +594 -594
  10. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/genspider.py +186 -186
  11. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/help.py +140 -138
  12. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/list.py +155 -155
  13. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/run.py +379 -341
  14. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/startproject.py +460 -460
  15. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/stats.py +187 -187
  16. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/commands/utils.py +196 -196
  17. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/config.py +320 -312
  18. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/config_validator.py +277 -277
  19. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/core/__init__.py +52 -52
  20. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/core/engine.py +451 -438
  21. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/core/processor.py +47 -47
  22. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/core/scheduler.py +290 -291
  23. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/crawler.py +698 -657
  24. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/data/__init__.py +5 -5
  25. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/data/user_agents.py +194 -194
  26. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/__init__.py +280 -276
  27. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/aiohttp_downloader.py +233 -233
  28. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/cffi_downloader.py +250 -247
  29. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/httpx_downloader.py +265 -259
  30. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/hybrid_downloader.py +212 -212
  31. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/playwright_downloader.py +425 -402
  32. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/downloader/selenium_downloader.py +486 -472
  33. crawlo-1.4.7/crawlo/event.py +45 -0
  34. crawlo-1.4.7/crawlo/exceptions.py +215 -0
  35. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/__init__.py +65 -64
  36. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/health_check.py +141 -141
  37. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/log_interval.py +94 -94
  38. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/log_stats.py +70 -70
  39. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/logging_extension.py +53 -61
  40. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/memory_monitor.py +104 -104
  41. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/performance_profiler.py +133 -133
  42. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/extension/request_recorder.py +107 -107
  43. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/factories/__init__.py +27 -27
  44. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/factories/base.py +68 -68
  45. crawlo-1.4.7/crawlo/factories/crawler.py +105 -0
  46. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/factories/registry.py +84 -84
  47. crawlo-1.4.7/crawlo/factories/utils.py +135 -0
  48. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/filters/__init__.py +170 -153
  49. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/filters/aioredis_filter.py +348 -264
  50. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/filters/memory_filter.py +261 -276
  51. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/framework.py +306 -292
  52. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/initialization/__init__.py +44 -44
  53. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/initialization/built_in.py +391 -434
  54. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/initialization/context.py +141 -141
  55. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/initialization/core.py +240 -194
  56. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/initialization/phases.py +230 -149
  57. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/initialization/registry.py +143 -145
  58. crawlo-1.4.7/crawlo/initialization/utils.py +49 -0
  59. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/interfaces.py +23 -23
  60. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/items/__init__.py +23 -23
  61. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/items/base.py +23 -23
  62. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/items/fields.py +52 -52
  63. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/items/items.py +104 -104
  64. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/logging/__init__.py +42 -46
  65. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/logging/config.py +277 -197
  66. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/logging/factory.py +175 -171
  67. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/logging/manager.py +104 -112
  68. crawlo-1.4.7/crawlo/middleware/__init__.py +87 -0
  69. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/default_header.py +132 -132
  70. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/download_delay.py +104 -104
  71. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/middleware_manager.py +142 -142
  72. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/offsite.py +123 -123
  73. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/proxy.py +209 -209
  74. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/request_ignore.py +86 -86
  75. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/response_code.py +150 -150
  76. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/response_filter.py +136 -136
  77. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/middleware/retry.py +124 -124
  78. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/mode_manager.py +287 -253
  79. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/network/__init__.py +21 -21
  80. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/network/request.py +375 -379
  81. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/network/response.py +569 -664
  82. crawlo-1.4.7/crawlo/pipelines/__init__.py +53 -0
  83. crawlo-1.4.7/crawlo/pipelines/base_pipeline.py +452 -0
  84. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  85. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/console_pipeline.py +39 -39
  86. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/csv_pipeline.py +316 -316
  87. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/database_dedup_pipeline.py +197 -197
  88. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/json_pipeline.py +218 -218
  89. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  90. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/mongo_pipeline.py +140 -132
  91. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/mysql_pipeline.py +469 -476
  92. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/pipeline_manager.py +100 -100
  93. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  94. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/project.py +347 -347
  95. crawlo-1.4.7/crawlo/queue/__init__.py +10 -0
  96. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/queue/pqueue.py +38 -38
  97. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/queue/queue_manager.py +591 -525
  98. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/queue/redis_priority_queue.py +519 -370
  99. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/settings/__init__.py +7 -7
  100. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/settings/default_settings.py +284 -277
  101. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/settings/setting_manager.py +219 -219
  102. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/spider/__init__.py +657 -657
  103. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/stats_collector.py +81 -81
  104. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/subscriber.py +129 -129
  105. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/task_manager.py +138 -138
  106. crawlo-1.4.7/crawlo/templates/crawlo.cfg.tmpl +11 -0
  107. crawlo-1.4.7/crawlo/templates/project/__init__.py.tmpl +2 -0
  108. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/items.py.tmpl +13 -17
  109. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/middlewares.py.tmpl +38 -38
  110. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/pipelines.py.tmpl +35 -36
  111. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/settings.py.tmpl +109 -111
  112. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/settings_distributed.py.tmpl +156 -159
  113. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/settings_gentle.py.tmpl +170 -176
  114. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/settings_high_performance.py.tmpl +171 -177
  115. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/settings_minimal.py.tmpl +98 -100
  116. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/settings_simple.py.tmpl +168 -174
  117. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  118. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/run.py.tmpl +23 -23
  119. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/templates/spider/spider.py.tmpl +32 -40
  120. crawlo-1.4.7/crawlo/templates/spiders_init.py.tmpl +5 -0
  121. crawlo-1.4.7/crawlo/tools/__init__.py +87 -0
  122. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/tools/date_tools.py +289 -289
  123. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/tools/distributed_coordinator.py +384 -384
  124. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/tools/scenario_adapter.py +262 -262
  125. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/tools/text_cleaner.py +232 -232
  126. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/__init__.py +50 -50
  127. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/batch_processor.py +276 -259
  128. crawlo-1.4.7/crawlo/utils/config_manager.py +442 -0
  129. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/controlled_spider_mixin.py +439 -439
  130. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/db_helper.py +250 -250
  131. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/error_handler.py +410 -410
  132. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/fingerprint.py +121 -121
  133. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/func_tools.py +82 -82
  134. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/large_scale_helper.py +344 -344
  135. crawlo-1.4.7/crawlo/utils/leak_detector.py +335 -0
  136. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/log.py +79 -79
  137. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/misc.py +81 -81
  138. crawlo-1.4.7/crawlo/utils/mongo_connection_pool.py +157 -0
  139. crawlo-1.4.7/crawlo/utils/mysql_connection_pool.py +197 -0
  140. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/performance_monitor.py +285 -285
  141. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/queue_helper.py +175 -175
  142. crawlo-1.4.7/crawlo/utils/redis_checker.py +91 -0
  143. crawlo-1.4.7/crawlo/utils/redis_connection_pool.py +579 -0
  144. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/redis_key_validator.py +198 -198
  145. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/request.py +278 -256
  146. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/request_serializer.py +225 -225
  147. crawlo-1.4.7/crawlo/utils/resource_manager.py +337 -0
  148. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/selector_helper.py +137 -137
  149. crawlo-1.4.7/crawlo/utils/singleton.py +70 -0
  150. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/spider_loader.py +201 -201
  151. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo/utils/text_helper.py +94 -94
  152. crawlo-1.4.6/crawlo/utils/url.py → crawlo-1.4.7/crawlo/utils/url_utils.py +39 -39
  153. crawlo-1.4.7/crawlo.egg-info/PKG-INFO +689 -0
  154. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo.egg-info/SOURCES.txt +15 -29
  155. {crawlo-1.4.6 → crawlo-1.4.7}/examples/__init__.py +7 -7
  156. {crawlo-1.4.6 → crawlo-1.4.7}/pyproject.toml +2 -2
  157. {crawlo-1.4.6 → crawlo-1.4.7}/requirements.txt +33 -33
  158. {crawlo-1.4.6 → crawlo-1.4.7}/setup.cfg +71 -71
  159. {crawlo-1.4.6 → crawlo-1.4.7}/tests/__init__.py +7 -7
  160. {crawlo-1.4.6 → crawlo-1.4.7}/tests/advanced_tools_example.py +217 -275
  161. {crawlo-1.4.6 → crawlo-1.4.7}/tests/authenticated_proxy_example.py +110 -110
  162. {crawlo-1.4.6 → crawlo-1.4.7}/tests/baidu_performance_test.py +108 -108
  163. {crawlo-1.4.6 → crawlo-1.4.7}/tests/baidu_test.py +59 -59
  164. {crawlo-1.4.6 → crawlo-1.4.7}/tests/bug_check_test.py +250 -250
  165. {crawlo-1.4.6 → crawlo-1.4.7}/tests/cleaners_example.py +160 -160
  166. {crawlo-1.4.6 → crawlo-1.4.7}/tests/comprehensive_framework_test.py +212 -212
  167. {crawlo-1.4.6 → crawlo-1.4.7}/tests/comprehensive_test.py +81 -81
  168. {crawlo-1.4.6 → crawlo-1.4.7}/tests/comprehensive_testing_summary.md +186 -186
  169. {crawlo-1.4.6 → crawlo-1.4.7}/tests/config_validation_demo.py +142 -142
  170. {crawlo-1.4.6 → crawlo-1.4.7}/tests/controlled_spider_example.py +205 -205
  171. {crawlo-1.4.6 → crawlo-1.4.7}/tests/date_tools_example.py +180 -180
  172. {crawlo-1.4.6 → crawlo-1.4.7}/tests/debug_configure.py +69 -69
  173. {crawlo-1.4.6 → crawlo-1.4.7}/tests/debug_framework_logger.py +84 -84
  174. {crawlo-1.4.6 → crawlo-1.4.7}/tests/debug_log_config.py +126 -126
  175. {crawlo-1.4.6 → crawlo-1.4.7}/tests/debug_log_levels.py +63 -63
  176. {crawlo-1.4.6 → crawlo-1.4.7}/tests/debug_pipelines.py +66 -66
  177. {crawlo-1.4.6 → crawlo-1.4.7}/tests/detailed_log_test.py +233 -233
  178. {crawlo-1.4.6 → crawlo-1.4.7}/tests/direct_selector_helper_test.py +96 -96
  179. crawlo-1.4.7/tests/distributed_dedup_test.py +467 -0
  180. {crawlo-1.4.6 → crawlo-1.4.7}/tests/distributed_test.py +66 -66
  181. {crawlo-1.4.6 → crawlo-1.4.7}/tests/distributed_test_debug.py +76 -76
  182. {crawlo-1.4.6 → crawlo-1.4.7}/tests/dynamic_loading_example.py +523 -523
  183. {crawlo-1.4.6 → crawlo-1.4.7}/tests/dynamic_loading_test.py +104 -104
  184. {crawlo-1.4.6 → crawlo-1.4.7}/tests/error_handling_example.py +171 -171
  185. {crawlo-1.4.6 → crawlo-1.4.7}/tests/explain_mysql_update_behavior.py +76 -76
  186. {crawlo-1.4.6 → crawlo-1.4.7}/tests/final_comprehensive_test.py +151 -151
  187. {crawlo-1.4.6 → crawlo-1.4.7}/tests/final_log_test.py +260 -260
  188. {crawlo-1.4.6 → crawlo-1.4.7}/tests/final_validation_test.py +182 -182
  189. {crawlo-1.4.6 → crawlo-1.4.7}/tests/fix_log_test.py +142 -142
  190. {crawlo-1.4.6 → crawlo-1.4.7}/tests/framework_performance_test.py +202 -202
  191. {crawlo-1.4.6 → crawlo-1.4.7}/tests/log_buffering_test.py +111 -111
  192. {crawlo-1.4.6 → crawlo-1.4.7}/tests/log_generation_timing_test.py +153 -153
  193. crawlo-1.4.7/tests/monitor_redis_dedup.sh +72 -0
  194. {crawlo-1.4.6 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  195. {crawlo-1.4.6 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  196. {crawlo-1.4.6 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  197. {crawlo-1.4.6 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  198. {crawlo-1.4.6 → crawlo-1.4.7}/tests/ofweek_scrapy/scrapy.cfg +11 -11
  199. {crawlo-1.4.6 → crawlo-1.4.7}/tests/optimized_performance_test.py +211 -211
  200. {crawlo-1.4.6 → crawlo-1.4.7}/tests/performance_comparison.py +244 -244
  201. {crawlo-1.4.6 → crawlo-1.4.7}/tests/queue_blocking_test.py +113 -113
  202. {crawlo-1.4.6 → crawlo-1.4.7}/tests/queue_test.py +89 -89
  203. {crawlo-1.4.6 → crawlo-1.4.7}/tests/redis_key_validation_demo.py +130 -130
  204. {crawlo-1.4.6 → crawlo-1.4.7}/tests/request_params_example.py +150 -150
  205. {crawlo-1.4.6 → crawlo-1.4.7}/tests/response_improvements_example.py +144 -144
  206. {crawlo-1.4.6 → crawlo-1.4.7}/tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  207. {crawlo-1.4.6 → crawlo-1.4.7}/tests/scrapy_comparison/scrapy_test.py +133 -133
  208. crawlo-1.4.7/tests/simple_cli_test.py +55 -0
  209. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_command_test.py +119 -119
  210. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_crawlo_test.py +126 -126
  211. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_follow_test.py +38 -38
  212. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_log_test2.py +137 -137
  213. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_optimization_test.py +128 -128
  214. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_queue_type_test.py +41 -41
  215. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_response_selector_test.py +94 -94
  216. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_selector_helper_test.py +154 -154
  217. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_selector_test.py +207 -207
  218. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_spider_test.py +49 -49
  219. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simple_url_test.py +73 -73
  220. {crawlo-1.4.6 → crawlo-1.4.7}/tests/simulate_mysql_update_test.py +139 -139
  221. {crawlo-1.4.6 → crawlo-1.4.7}/tests/spider_log_timing_test.py +177 -177
  222. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_advanced_tools.py +148 -148
  223. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_all_commands.py +230 -230
  224. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_all_pipeline_fingerprints.py +133 -133
  225. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_all_redis_key_configs.py +145 -145
  226. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_asyncmy_usage.py +56 -56
  227. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_batch_processor.py +178 -178
  228. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_cleaners.py +54 -54
  229. crawlo-1.4.7/tests/test_cli_arguments.py +119 -0
  230. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_component_factory.py +174 -174
  231. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_config_consistency.py +80 -80
  232. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_config_merge.py +152 -152
  233. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_config_validator.py +182 -182
  234. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_controlled_spider_mixin.py +79 -79
  235. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_crawler_process_import.py +38 -38
  236. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_crawler_process_spider_modules.py +47 -47
  237. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_crawlo_proxy_integration.py +114 -114
  238. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_date_tools.py +123 -123
  239. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_dedup_fix.py +220 -220
  240. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_dedup_pipeline_consistency.py +124 -124
  241. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_default_header_middleware.py +313 -313
  242. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_distributed.py +65 -65
  243. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_double_crawlo_fix.py +204 -204
  244. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_double_crawlo_fix_simple.py +124 -124
  245. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_download_delay_middleware.py +221 -221
  246. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_downloader_proxy_compatibility.py +272 -272
  247. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_edge_cases.py +305 -305
  248. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_encoding_core.py +56 -56
  249. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_encoding_detection.py +126 -126
  250. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_enhanced_error_handler.py +270 -270
  251. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_enhanced_error_handler_comprehensive.py +245 -245
  252. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_error_handler_compatibility.py +112 -112
  253. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_factories.py +252 -252
  254. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_factory_compatibility.py +196 -196
  255. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_final_validation.py +153 -153
  256. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_fingerprint_consistency.py +135 -135
  257. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_fingerprint_simple.py +51 -51
  258. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_get_component_logger.py +83 -83
  259. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_hash_performance.py +99 -99
  260. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_integration.py +169 -169
  261. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_item_dedup_redis_key.py +122 -122
  262. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_large_scale_helper.py +235 -235
  263. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_logging_enhancements.py +374 -374
  264. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_logging_final.py +184 -184
  265. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_logging_integration.py +312 -312
  266. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_logging_system.py +282 -282
  267. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_middleware_debug.py +141 -141
  268. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mode_consistency.py +51 -51
  269. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_multi_directory.py +67 -67
  270. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_multiple_spider_modules.py +80 -80
  271. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_config.py +164 -164
  272. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_error.py +98 -98
  273. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_init_log.py +82 -82
  274. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_integration.py +132 -132
  275. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_refactor.py +143 -143
  276. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_refactor_simple.py +85 -85
  277. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_robustness.py +195 -195
  278. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_pipeline_types.py +88 -88
  279. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_mysql_update_columns.py +93 -93
  280. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_offsite_middleware.py +244 -244
  281. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_offsite_middleware_simple.py +203 -203
  282. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_optimized_selector_naming.py +100 -100
  283. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_parsel.py +29 -29
  284. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_performance.py +327 -327
  285. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_performance_monitor.py +115 -115
  286. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_pipeline_fingerprint_consistency.py +86 -86
  287. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_priority_behavior.py +211 -211
  288. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_priority_consistency.py +151 -151
  289. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_priority_consistency_fixed.py +249 -249
  290. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_health_check.py +32 -32
  291. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_middleware.py +217 -217
  292. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_middleware_enhanced.py +212 -212
  293. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_middleware_integration.py +142 -142
  294. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_middleware_refactored.py +207 -207
  295. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_only.py +83 -83
  296. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_providers.py +56 -56
  297. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_stats.py +19 -19
  298. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_strategies.py +59 -59
  299. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_proxy_with_downloader.py +152 -152
  300. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_queue_empty_check.py +41 -41
  301. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_queue_manager_double_crawlo.py +173 -173
  302. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_queue_manager_redis_key.py +179 -179
  303. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_queue_naming.py +154 -154
  304. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_queue_type.py +106 -106
  305. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_queue_type_redis_config_consistency.py +130 -130
  306. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_random_headers_default.py +322 -322
  307. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_random_headers_necessity.py +308 -308
  308. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_random_user_agent.py +72 -72
  309. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_config.py +28 -28
  310. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_connection_pool.py +294 -294
  311. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_key_naming.py +181 -181
  312. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_key_validator.py +123 -123
  313. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_queue.py +224 -224
  314. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_queue_name_fix.py +175 -175
  315. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_redis_queue_type_fallback.py +129 -129
  316. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_request_ignore_middleware.py +182 -182
  317. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_request_params.py +111 -111
  318. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_request_serialization.py +70 -70
  319. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_code_middleware.py +349 -349
  320. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_filter_middleware.py +427 -427
  321. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_follow.py +104 -104
  322. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_improvements.py +152 -152
  323. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_selector_methods.py +92 -92
  324. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_url_methods.py +70 -70
  325. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_response_urljoin.py +86 -86
  326. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_retry_middleware.py +333 -333
  327. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_retry_middleware_realistic.py +273 -273
  328. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_scheduler.py +252 -252
  329. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_scheduler_config_update.py +133 -133
  330. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_scrapy_style_encoding.py +112 -112
  331. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_selector_helper.py +100 -100
  332. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_selector_optimizations.py +146 -146
  333. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_simple_response.py +61 -61
  334. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_spider_loader.py +49 -49
  335. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_spider_loader_comprehensive.py +69 -69
  336. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_spider_modules.py +84 -84
  337. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_spiders/test_spider.py +9 -9
  338. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_telecom_spider_redis_key.py +205 -205
  339. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_template_content.py +87 -87
  340. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_template_redis_key.py +134 -134
  341. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_tools.py +159 -159
  342. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_user_agent_randomness.py +176 -176
  343. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_user_agents.py +96 -96
  344. {crawlo-1.4.6 → crawlo-1.4.7}/tests/untested_features_report.md +138 -138
  345. {crawlo-1.4.6 → crawlo-1.4.7}/tests/verify_debug.py +51 -51
  346. {crawlo-1.4.6 → crawlo-1.4.7}/tests/verify_distributed.py +117 -117
  347. {crawlo-1.4.6 → crawlo-1.4.7}/tests/verify_log_fix.py +111 -111
  348. {crawlo-1.4.6 → crawlo-1.4.7}/tests/verify_mysql_warnings.py +109 -109
  349. crawlo-1.4.6/PKG-INFO +0 -329
  350. crawlo-1.4.6/README.md +0 -279
  351. crawlo-1.4.6/crawlo/__version__.py +0 -1
  352. crawlo-1.4.6/crawlo/event.py +0 -11
  353. crawlo-1.4.6/crawlo/exceptions.py +0 -82
  354. crawlo-1.4.6/crawlo/factories/crawler.py +0 -104
  355. crawlo-1.4.6/crawlo/logging/async_handler.py +0 -181
  356. crawlo-1.4.6/crawlo/logging/monitor.py +0 -153
  357. crawlo-1.4.6/crawlo/logging/sampler.py +0 -167
  358. crawlo-1.4.6/crawlo/middleware/__init__.py +0 -24
  359. crawlo-1.4.6/crawlo/pipelines/__init__.py +0 -22
  360. crawlo-1.4.6/crawlo/templates/crawlo.cfg.tmpl +0 -11
  361. crawlo-1.4.6/crawlo/templates/project/__init__.py.tmpl +0 -4
  362. crawlo-1.4.6/crawlo/templates/spiders_init.py.tmpl +0 -10
  363. crawlo-1.4.6/crawlo/tools/__init__.py +0 -190
  364. crawlo-1.4.6/crawlo/tools/authenticated_proxy.py +0 -241
  365. crawlo-1.4.6/crawlo/tools/data_formatter.py +0 -226
  366. crawlo-1.4.6/crawlo/tools/data_validator.py +0 -181
  367. crawlo-1.4.6/crawlo/tools/encoding_converter.py +0 -127
  368. crawlo-1.4.6/crawlo/tools/network_diagnostic.py +0 -365
  369. crawlo-1.4.6/crawlo/tools/request_tools.py +0 -83
  370. crawlo-1.4.6/crawlo/tools/retry_mechanism.py +0 -224
  371. crawlo-1.4.6/crawlo/utils/env_config.py +0 -143
  372. crawlo-1.4.6/crawlo/utils/large_scale_config.py +0 -287
  373. crawlo-1.4.6/crawlo/utils/redis_connection_pool.py +0 -389
  374. crawlo-1.4.6/crawlo/utils/system.py +0 -11
  375. crawlo-1.4.6/crawlo/utils/tools.py +0 -5
  376. crawlo-1.4.6/crawlo.egg-info/PKG-INFO +0 -329
  377. crawlo-1.4.6/tests/env_config_example.py +0 -134
  378. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/__init__.py +0 -0
  379. crawlo-1.4.6/tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  380. crawlo-1.4.6/tests/test_authenticated_proxy.py +0 -142
  381. crawlo-1.4.6/tests/test_comprehensive.py +0 -147
  382. crawlo-1.4.6/tests/test_dynamic_downloaders_proxy.py +0 -125
  383. crawlo-1.4.6/tests/test_dynamic_proxy.py +0 -93
  384. crawlo-1.4.6/tests/test_dynamic_proxy_config.py +0 -147
  385. crawlo-1.4.6/tests/test_dynamic_proxy_real.py +0 -110
  386. crawlo-1.4.6/tests/test_env_config.py +0 -122
  387. crawlo-1.4.6/tests/test_framework_env_usage.py +0 -104
  388. crawlo-1.4.6/tests/test_large_scale_config.py +0 -113
  389. crawlo-1.4.6/tests/test_proxy_api.py +0 -265
  390. crawlo-1.4.6/tests/test_real_scenario_proxy.py +0 -196
  391. crawlo-1.4.6/tests/tools_example.py +0 -261
  392. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo.egg-info/dependency_links.txt +0 -0
  393. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo.egg-info/entry_points.txt +0 -0
  394. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo.egg-info/requires.txt +0 -0
  395. {crawlo-1.4.6 → crawlo-1.4.7}/crawlo.egg-info/top_level.txt +0 -0
  396. {crawlo-1.4.6/crawlo/queue → crawlo-1.4.7/tests/ofweek_scrapy/ofweek_scrapy}/__init__.py +0 -0
  397. {crawlo-1.4.6 → crawlo-1.4.7}/tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +0 -0
  398. {crawlo-1.4.6 → crawlo-1.4.7}/tests/test_spiders/__init__.py +0 -0
@@ -1,23 +1,23 @@
1
- MIT License
2
-
3
- Modifications:
4
-
5
- Copyright (c) 2020 crawl-coder <2251018029@qq.com>
6
-
7
- Permission is hereby granted, free of charge, to any person obtaining a copy
8
- of this software and associated documentation files (the "Software"), to deal
9
- in the Software without restriction, including without limitation the rights
10
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
- copies of the Software, and to permit persons to whom the Software is
12
- furnished to do so, subject to the following conditions:
13
-
14
- The above copyright notice and this permission notice shall be included in all
15
- copies or substantial portions of the Software.
16
-
17
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1
+ MIT License
2
+
3
+ Modifications:
4
+
5
+ Copyright (c) 2020 crawl-coder <2251018029@qq.com>
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all
15
+ copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
23
  SOFTWARE.
@@ -1,17 +1,17 @@
1
- include README.md
2
- include LICENSE
3
- include requirements.txt # 如果根目录有全局requirements.txt
4
- include VERSION # 如果根目录有全局VERSION文件
5
-
6
- # 包内文件包含
7
- recursive-include crawlo/utils/js *
8
- recursive-include crawlo/templates *
9
-
10
- # 测试文件(如果需要在分发包中包含测试)
11
- recursive-include tests *
12
-
13
- # 排除项
14
- global-exclude __pycache__ *.py[cod] .DS_Store *.so
15
- global-exclude *.bak *.swp *.orig *.rej
16
- prune samples # 排除示例目录
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt # 如果根目录有全局requirements.txt
4
+ include VERSION # 如果根目录有全局VERSION文件
5
+
6
+ # 包内文件包含
7
+ recursive-include crawlo/utils/js *
8
+ recursive-include crawlo/templates *
9
+
10
+ # 测试文件(如果需要在分发包中包含测试)
11
+ recursive-include tests *
12
+
13
+ # 排除项
14
+ global-exclude __pycache__ *.py[cod] .DS_Store *.so
15
+ global-exclude *.bak *.swp *.orig *.rej
16
+ prune samples # 排除示例目录
17
17
  prune docs # 排除文档目录