aio-scrapy 2.1.8__tar.gz → 2.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {aio_scrapy-2.1.8/aio_scrapy.egg-info → aio_scrapy-2.1.9}/PKG-INFO +11 -4
  2. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/README.md +2 -2
  3. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9/aio_scrapy.egg-info}/PKG-INFO +11 -4
  4. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/SOURCES.txt +2 -0
  5. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/requires.txt +8 -0
  6. aio_scrapy-2.1.9/aioscrapy/VERSION +1 -0
  7. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/curl_cffi.py +13 -3
  8. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/__init__.py +1 -0
  9. aio_scrapy-2.1.9/aioscrapy/core/downloader/handlers/webdriver/sbcdp.py +404 -0
  10. aio_scrapy-2.1.9/aioscrapy/libs/pipelines/redis.py +122 -0
  11. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/settings/default_settings.py +7 -0
  12. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/setup.py +4 -1
  13. aio_scrapy-2.1.8/aioscrapy/VERSION +0 -1
  14. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/LICENSE +0 -0
  15. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/MANIFEST.in +0 -0
  16. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/dependency_links.txt +0 -0
  17. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/entry_points.txt +0 -0
  18. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/not-zip-safe +0 -0
  19. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/top_level.txt +0 -0
  20. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/__init__.py +0 -0
  21. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/__main__.py +0 -0
  22. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/cmdline.py +0 -0
  23. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/__init__.py +0 -0
  24. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/crawl.py +0 -0
  25. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/genspider.py +0 -0
  26. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/list.py +0 -0
  27. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/runspider.py +0 -0
  28. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/settings.py +0 -0
  29. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/startproject.py +0 -0
  30. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/commands/version.py +0 -0
  31. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/__init__.py +0 -0
  32. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/__init__.py +0 -0
  33. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
  34. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/aiohttp.py +0 -0
  35. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/httpx.py +0 -0
  36. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/pyhttpx.py +0 -0
  37. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/requests.py +0 -0
  38. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +0 -0
  39. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/driverpool.py +0 -0
  40. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/playwright.py +0 -0
  41. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/engine.py +0 -0
  42. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/scheduler.py +0 -0
  43. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/scraper.py +0 -0
  44. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/crawler.py +0 -0
  45. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/__init__.py +0 -0
  46. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/absmanager.py +0 -0
  47. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/aiomongo.py +0 -0
  48. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/aiomysql.py +0 -0
  49. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/aiopg.py +0 -0
  50. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/aiorabbitmq.py +0 -0
  51. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/db/aioredis.py +0 -0
  52. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/dupefilters/__init__.py +0 -0
  53. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/dupefilters/disk.py +0 -0
  54. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/dupefilters/redis.py +0 -0
  55. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/exceptions.py +0 -0
  56. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/__init__.py +0 -0
  57. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/headers.py +0 -0
  58. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/request/__init__.py +0 -0
  59. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/request/form.py +0 -0
  60. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/request/json_request.py +0 -0
  61. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/response/__init__.py +0 -0
  62. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/response/html.py +0 -0
  63. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/response/text.py +0 -0
  64. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/response/web_driver.py +0 -0
  65. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/http/response/xml.py +0 -0
  66. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/__init__.py +0 -0
  67. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/__init__.py +0 -0
  68. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
  69. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
  70. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
  71. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/retry.py +0 -0
  72. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/stats.py +0 -0
  73. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/useragent.py +0 -0
  74. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/__init__.py +0 -0
  75. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/closespider.py +0 -0
  76. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/corestats.py +0 -0
  77. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/logstats.py +0 -0
  78. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/metric.py +0 -0
  79. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/throttle.py +0 -0
  80. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/__init__.py +0 -0
  81. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/csv.py +0 -0
  82. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/excel.py +0 -0
  83. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/mongo.py +0 -0
  84. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/mysql.py +0 -0
  85. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/pg.py +0 -0
  86. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/__init__.py +0 -0
  87. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/depth.py +0 -0
  88. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/httperror.py +0 -0
  89. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/offsite.py +0 -0
  90. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/referer.py +0 -0
  91. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/urllength.py +0 -0
  92. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/link.py +0 -0
  93. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/logformatter.py +0 -0
  94. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/middleware/__init__.py +0 -0
  95. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/middleware/absmanager.py +0 -0
  96. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/middleware/downloader.py +0 -0
  97. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/middleware/extension.py +0 -0
  98. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/middleware/itempipeline.py +0 -0
  99. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/middleware/spider.py +0 -0
  100. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/process.py +0 -0
  101. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/proxy/__init__.py +0 -0
  102. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/proxy/redis.py +0 -0
  103. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/queue/__init__.py +0 -0
  104. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/queue/memory.py +0 -0
  105. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/queue/rabbitmq.py +0 -0
  106. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/queue/redis.py +0 -0
  107. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/scrapyd/__init__.py +0 -0
  108. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/scrapyd/runner.py +0 -0
  109. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/serializer.py +0 -0
  110. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/settings/__init__.py +0 -0
  111. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/signalmanager.py +0 -0
  112. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/signals.py +0 -0
  113. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/spiderloader.py +0 -0
  114. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/spiders/__init__.py +0 -0
  115. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/statscollectors.py +0 -0
  116. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
  117. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/__init__.py +0 -0
  118. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
  119. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
  120. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
  121. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
  122. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/spiders/basic.tmpl +0 -0
  123. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/templates/spiders/single.tmpl +0 -0
  124. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/__init__.py +0 -0
  125. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/conf.py +0 -0
  126. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/curl.py +0 -0
  127. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/decorators.py +0 -0
  128. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/deprecate.py +0 -0
  129. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/httpobj.py +0 -0
  130. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/log.py +0 -0
  131. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/misc.py +0 -0
  132. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/ossignal.py +0 -0
  133. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/project.py +0 -0
  134. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/python.py +0 -0
  135. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/reqser.py +0 -0
  136. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/request.py +0 -0
  137. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/response.py +0 -0
  138. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/signal.py +0 -0
  139. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/spider.py +0 -0
  140. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/template.py +0 -0
  141. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/tools.py +0 -0
  142. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/trackref.py +0 -0
  143. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/utils/url.py +0 -0
  144. {aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/setup.cfg +0 -0
{aio_scrapy-2.1.8/aio_scrapy.egg-info → aio_scrapy-2.1.9}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: aio-scrapy
- Version: 2.1.8
+ Version: 2.1.9
  Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
  Home-page: https://github.com/conlin-huang/aio-scrapy.git
  Author: conlin
@@ -41,6 +41,8 @@ Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
  Requires-Dist: pillow>=9.4.0; extra == "all"
  Requires-Dist: requests>=2.28.2; extra == "all"
  Requires-Dist: curl_cffi; extra == "all"
+ Requires-Dist: sbcdp; extra == "all"
+ Requires-Dist: DrissionPage; extra == "all"
  Provides-Extra: aiomysql
  Requires-Dist: aiomysql>=0.1.1; extra == "aiomysql"
  Requires-Dist: cryptography; extra == "aiomysql"
@@ -52,6 +54,10 @@ Provides-Extra: mongo
  Requires-Dist: motor>=2.1.0; extra == "mongo"
  Provides-Extra: playwright
  Requires-Dist: playwright>=1.31.1; extra == "playwright"
+ Provides-Extra: sbcdp
+ Requires-Dist: sbcdp; extra == "sbcdp"
+ Provides-Extra: dp
+ Requires-Dist: DrissionPage; extra == "dp"
  Provides-Extra: pyhttpx
  Requires-Dist: pyhttpx>=2.10.4; extra == "pyhttpx"
  Provides-Extra: curl-cffi
@@ -71,6 +77,7 @@ Dynamic: description-content-type
  Dynamic: home-page
  Dynamic: keywords
  Dynamic: license
+ Dynamic: license-file
  Dynamic: provides-extra
  Dynamic: requires-dist
  Dynamic: requires-python
@@ -84,7 +91,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
  ## 特性 | Features
 
  - **完全异步**:基于Python的asyncio库,实现高效的并发爬取
- - **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和playwright
+ - **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage、playwright和sbcdp
  - **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
  - **强大的数据处理管道**:支持多种数据库存储选项
  - **内置信号系统**:方便的事件处理机制
@@ -94,7 +101,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
 
 
  - **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
- - **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and playwright
+ - **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage, playwright and sbcdp
  - **Flexible Middleware System**: Easily add custom functionality and processing logic
  - **Powerful Data Processing Pipelines**: Support for various database storage options
  - **Built-in Signal System**: Convenient event handling mechanism
{aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/README.md
@@ -6,7 +6,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
  ## 特性 | Features
 
  - **完全异步**:基于Python的asyncio库,实现高效的并发爬取
- - **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和playwright
+ - **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage、playwright和sbcdp
  - **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
  - **强大的数据处理管道**:支持多种数据库存储选项
  - **内置信号系统**:方便的事件处理机制
@@ -16,7 +16,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
 
 
  - **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
- - **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and playwright
+ - **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage, playwright and sbcdp
  - **Flexible Middleware System**: Easily add custom functionality and processing logic
  - **Powerful Data Processing Pipelines**: Support for various database storage options
  - **Built-in Signal System**: Convenient event handling mechanism
{aio_scrapy-2.1.8 → aio_scrapy-2.1.9/aio_scrapy.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: aio-scrapy
- Version: 2.1.8
+ Version: 2.1.9
  Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
  Home-page: https://github.com/conlin-huang/aio-scrapy.git
  Author: conlin
@@ -41,6 +41,8 @@ Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
  Requires-Dist: pillow>=9.4.0; extra == "all"
  Requires-Dist: requests>=2.28.2; extra == "all"
  Requires-Dist: curl_cffi; extra == "all"
+ Requires-Dist: sbcdp; extra == "all"
+ Requires-Dist: DrissionPage; extra == "all"
  Provides-Extra: aiomysql
  Requires-Dist: aiomysql>=0.1.1; extra == "aiomysql"
  Requires-Dist: cryptography; extra == "aiomysql"
@@ -52,6 +54,10 @@ Provides-Extra: mongo
  Requires-Dist: motor>=2.1.0; extra == "mongo"
  Provides-Extra: playwright
  Requires-Dist: playwright>=1.31.1; extra == "playwright"
+ Provides-Extra: sbcdp
+ Requires-Dist: sbcdp; extra == "sbcdp"
+ Provides-Extra: dp
+ Requires-Dist: DrissionPage; extra == "dp"
  Provides-Extra: pyhttpx
  Requires-Dist: pyhttpx>=2.10.4; extra == "pyhttpx"
  Provides-Extra: curl-cffi
@@ -71,6 +77,7 @@ Dynamic: description-content-type
  Dynamic: home-page
  Dynamic: keywords
  Dynamic: license
+ Dynamic: license-file
  Dynamic: provides-extra
  Dynamic: requires-dist
  Dynamic: requires-python
@@ -84,7 +91,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
  ## 特性 | Features
 
  - **完全异步**:基于Python的asyncio库,实现高效的并发爬取
- - **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和playwright
+ - **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage、playwright和sbcdp
  - **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
  - **强大的数据处理管道**:支持多种数据库存储选项
  - **内置信号系统**:方便的事件处理机制
@@ -94,7 +101,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
 
 
  - **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
- - **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and playwright
+ - **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage, playwright and sbcdp
  - **Flexible Middleware System**: Easily add custom functionality and processing logic
  - **Powerful Data Processing Pipelines**: Support for various database storage options
  - **Built-in Signal System**: Convenient event handling mechanism
{aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/SOURCES.txt
@@ -46,6 +46,7 @@ aioscrapy/core/downloader/handlers/webdriver/__init__.py
  aioscrapy/core/downloader/handlers/webdriver/drissionpage.py
  aioscrapy/core/downloader/handlers/webdriver/driverpool.py
  aioscrapy/core/downloader/handlers/webdriver/playwright.py
+ aioscrapy/core/downloader/handlers/webdriver/sbcdp.py
  aioscrapy/db/__init__.py
  aioscrapy/db/absmanager.py
  aioscrapy/db/aiomongo.py
@@ -86,6 +87,7 @@ aioscrapy/libs/pipelines/excel.py
  aioscrapy/libs/pipelines/mongo.py
  aioscrapy/libs/pipelines/mysql.py
  aioscrapy/libs/pipelines/pg.py
+ aioscrapy/libs/pipelines/redis.py
  aioscrapy/libs/spider/__init__.py
  aioscrapy/libs/spider/depth.py
  aioscrapy/libs/spider/httperror.py
{aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/requires.txt
@@ -28,10 +28,15 @@ XlsxWriter>=3.1.2
  pillow>=9.4.0
  requests>=2.28.2
  curl_cffi
+ sbcdp
+ DrissionPage
 
  [curl_cffi]
  curl_cffi>=0.6.1
 
+ [dp]
+ DrissionPage
+
  [execl]
  XlsxWriter>=3.1.2
  pillow>=9.4.0
@@ -53,3 +58,6 @@ pyhttpx>=2.10.4
 
  [requests]
  requests>=2.28.2
+
+ [sbcdp]
+ sbcdp
aio_scrapy-2.1.9/aioscrapy/VERSION
@@ -0,0 +1 @@
+ 2.1.9
{aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/curl_cffi.py
@@ -8,8 +8,9 @@ It supports features like browser impersonation, proxies, and cookies.
  它支持浏览器模拟、代理和Cookie等功能。
  """
 
+ import asyncio
  from curl_cffi.curl import CurlError
- from curl_cffi.requests import AsyncSession
+ from curl_cffi.requests import AsyncSession, Session
 
  from aioscrapy import Request
  from aioscrapy.core.downloader.handlers import BaseDownloadHandler
@@ -50,6 +51,9 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
          # SSL验证设置
          self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
 
+         # 是否在线程中执行
+         self.use_thread: bool = self.settings.get("USE_THREAD", False)
+
      @classmethod
      def from_settings(cls, settings: Settings):
          """
@@ -160,8 +164,14 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
 
          # Perform the request
          # 执行请求
-         async with AsyncSession(**session_args) as session:
-             response = await session.request(request.method, request.url, **kwargs)
+         if self.use_thread:
+             with Session(**session_args) as session:
+                 # Run the synchronous curl-cffi request in a thread pool
+                 # 在线程池中运行同步的curl-cffi请求
+                 response = await asyncio.to_thread(session.request, request.method, request.url, **kwargs)
+         else:
+             async with AsyncSession(**session_args) as session:
+                 response = await session.request(request.method, request.url, **kwargs)
 
          # Convert curl_cffi response to HtmlResponse
          # 将curl_cffi响应转换为HtmlResponse
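The change above gives the curl_cffi handler an optional thread-backed code path: when the `USE_THREAD` setting is true, it opens a synchronous `curl_cffi.requests.Session` and pushes the blocking call onto a worker thread with `asyncio.to_thread`, instead of using `AsyncSession`. A minimal standalone sketch of that pattern follows; the function name and target URL are illustrative and not part of the package.

```python
import asyncio

from curl_cffi.requests import Session  # synchronous client, as in the USE_THREAD branch


async def fetch_in_thread(url: str) -> str:
    # Run the blocking curl_cffi request on a worker thread so the
    # asyncio event loop stays free to serve other downloads.
    with Session() as session:
        response = await asyncio.to_thread(session.request, "GET", url)
    return response.text


if __name__ == "__main__":
    # Illustrative usage only.
    print(asyncio.run(fetch_in_thread("https://example.com")))
```

In the handler itself this path is toggled by `self.settings.get("USE_THREAD", False)`, so it stays off unless a project enables it explicitly.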
{aio_scrapy-2.1.8 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/__init__.py
@@ -1,2 +1,3 @@
  from .playwright import PlaywrightDownloadHandler, PlaywrightDriver
  from .drissionpage import DrissionPageDownloadHandler, DrissionPageDriver
+ from .sbcdp import SbcdpDownloadHandler, SbcdpDriver
aio_scrapy-2.1.9/aioscrapy/core/downloader/handlers/webdriver/sbcdp.py
@@ -0,0 +1,404 @@
+ """
+ Download handler implementation using Playwright.
+ 使用Playwright的下载处理程序实现。
+
+ This module provides a download handler that uses Playwright to perform browser-based HTTP requests.
+ It supports full browser automation, JavaScript execution, and event handling.
+ 此模块提供了一个使用Playwright执行基于浏览器的HTTP请求的下载处理程序。
+ 它支持完整的浏览器自动化、JavaScript执行和事件处理。
+ """
+ from functools import wraps
+ from typing import Optional, Literal
+
+ from sbcdp import AsyncChrome, NetHttp
+
+ from aioscrapy import Request, Spider
+ from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+ from aioscrapy.exceptions import DownloadError
+ from aioscrapy.http import WebDriverResponse
+ from aioscrapy.settings import Settings
+ from aioscrapy.utils.tools import call_helper
+ from .driverpool import WebDriverPool, WebDriverBase
+
+
+ class SbcdpDriver(WebDriverBase):
+     """
+     A wrapper around sbcdp's browser automation API.
+     对sbcdp浏览器自动化API的包装。
+
+     This class provides a simplified interface for working with sbcdp browsers,
+     handling initialization, proxy configuration, and browser lifecycle management.
+     此类提供了一个简化的接口来使用sbcdp浏览器,处理初始化、代理配置和浏览器生命周期管理。
+     """
+
+     def __init__(
+             self,
+             *,
+             driver_type: Literal["google-chrome", "edge"] = "google-chrome",
+             proxy: Optional[str] = None,
+             user_agent: str = None,
+             max_uses: Optional[int] = None,
+             **kwargs  # Additional arguments (not used directly)
+             # 其他参数(不直接使用)
+     ):
+         """
+         Initialize the PlaywrightDriver.
+         初始化PlaywrightDriver。
+
+         Args:
+             driver_type: The type of browser to use ("chromium", "firefox", or "webkit").
+                 要使用的浏览器类型("chromium"、"firefox"或"webkit")。
+             proxy: Optional proxy URL to use for browser connections.
+                 用于浏览器连接的可选代理URL。
+             browser_args: Optional arguments to pass to browser.launch().
+                 传递给browser.launch()的可选参数。
+             context_args: Optional arguments to pass to browser.new_context().
+                 传递给browser.new_context()的可选参数。
+             window_size: Optional tuple of (width, height) for the browser window size.
+                 浏览器窗口大小的可选元组(width, height)。
+             user_agent: Optional user agent string to use.
+                 要使用的可选用户代理字符串。
+             max_uses: Optional count of uses after which the browser should be recycled.
+                 浏览器应该被回收的使用次数的可选计数。
+             **kwargs: Additional arguments (not used directly).
+                 其他参数(不直接使用)。
+         """
+         # Browser configuration
+         # 浏览器配置
+         self.driver_type = driver_type  # Type of browser to use
+         # 要使用的浏览器类型
+         self.proxy = proxy  # Formatted proxy settings
+         # 代理设置
+         self.user_agent = user_agent  # User agent string
+         # 用户代理字符串
+
+         # sbcdp components (initialized in setup())
+         # sbcdp组件(在setup()中初始化)
+         self.browser: Optional[AsyncChrome] = None  # sbcdp instance
+         # sbcdp实例
+         self.url = None  # Current URL (used for cookie management)
+         # 当前URL(用于Cookie管理)
+         self.max_uses = max_uses  # Counter for browser recycling
+         # 浏览器回收计数器
+
+         self.had_set_event_http = False
+         self.cache_response = None
+
+     async def setup(self):
+         """
+         Initialize the Playwright browser and page.
+         初始化Playwright浏览器和页面。
+
+         This method starts Playwright, launches the browser, creates a browser context,
+         and opens a new page. It applies all configuration options such as proxy settings,
+         viewport size, and user agent.
+         此方法启动Playwright,启动浏览器,创建浏览器上下文,并打开新页面。
+         它应用所有配置选项,如代理设置、视口大小和用户代理。
+
+         Returns:
+             None
+         """
+         # Start Playwright and launch browser
+         # 启动Playwright和浏览器
+         self.browser = AsyncChrome(url=self.url, user_agent=self.user_agent, proxy=self.proxy)
+         await self.browser.start()
+
+
+     async def quit(self):
+         """
+         Close the browser and clean up resources.
+         关闭浏览器并清理资源。
+
+         This method closes the page, browser context, browser, and stops the
+         Playwright instance, releasing all associated resources.
+         此方法关闭页面、浏览器上下文、浏览器,并停止Playwright实例,
+         释放所有相关资源。
+
+         Returns:
+             None
+         """
+         self.cache_response = None
+         await self.browser.stop()
+
+     async def get_cookies(self):
+         """
+         Get all cookies from the browser context.
+         从浏览器上下文获取所有Cookie。
+
+         This method retrieves all cookies from the current browser context
+         and returns them as a dictionary of name-value pairs.
+         此方法从当前浏览器上下文检索所有Cookie,并将它们作为名称-值对的字典返回。
+
+         Returns:
+             dict: A dictionary of cookie name-value pairs.
+                 Cookie名称-值对的字典。
+         """
+         # Convert the list of cookie objects to a name-value dictionary
+         # 将Cookie对象列表转换为名称-值字典
+         return {
+             cookie.name: cookie.value
+             for cookie in await self.browser.get_all_cookies()
+         }
+
+     async def set_cookies(self, cookies: dict):
+         """
+         Set cookies in the browser context.
+         在浏览器上下文中设置Cookie。
+
+         This method adds the provided cookies to the browser context,
+         associating them with the current URL.
+         此方法将提供的Cookie添加到浏览器上下文中,将它们与当前URL关联。
+
+         Args:
+             cookies: A dictionary of cookie name-value pairs to set.
+                 要设置的Cookie名称-值对的字典。
+
+         Returns:
+             None
+         """
+         # Convert the dictionary to the format expected by sbcdp
+         # 将字典转换为sbcdp期望的格式
+         u = self.url or await self.browser.get_origin()
+         await self.browser.set_all_cookies([
+             {
+                 "name": key,
+                 "value": value,
+                 # Use the stored URL or current page URL
+                 # 使用存储的URL或当前页面URL
+                 "url": u
+             }
+             for key, value in cookies.items()
+         ])
+
+
+ class SbcdpDownloadHandler(BaseDownloadHandler):
+     """
+     Download handler that uses Playwright to perform browser-based HTTP requests.
+     使用Playwright执行基于浏览器的HTTP请求的下载处理程序。
+
+     This handler implements the BaseDownloadHandler interface using Playwright,
+     which provides a high-level API to control browsers. It supports full browser
+     automation, JavaScript execution, and event handling.
+     此处理程序使用Playwright实现BaseDownloadHandler接口,Playwright提供了控制浏览器的
+     高级API。它支持完整的浏览器自动化、JavaScript执行和事件处理。
+     """
+
+     def __init__(self, settings: Settings):
+         """
+         Initialize the PlaywrightHandler.
+         初始化PlaywrightHandler。
+
+         Args:
+             settings: The settings object containing configuration for the handler.
+                 包含处理程序配置的设置对象。
+         """
+         self.settings = settings
+
+         # Get Playwright client arguments from settings
+         # 从设置中获取Playwright客户端参数
+         sbcdp_client_args = settings.getdict('SBCDP_ARGS')
+
+         # Configure the pool size for browser instances
+         # 配置浏览器实例的池大小
+         pool_size = sbcdp_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
+
+         # Initialize the WebDriver pool
+         # 初始化WebDriver池
+         self._webdriver_pool = WebDriverPool(SbcdpDriver, pool_size=pool_size, **sbcdp_client_args)
+
+     @classmethod
+     def from_settings(cls, settings: Settings):
+         """
+         Create a download handler from settings.
+         从设置创建下载处理程序。
+
+         This is a factory method that creates a new PlaywrightHandler
+         instance with the given settings.
+         这是一个工厂方法,使用给定的设置创建一个新的PlaywrightHandler实例。
+
+         Args:
+             settings: The settings to use for the handler.
+                 用于处理程序的设置。
+
+         Returns:
+             PlaywrightHandler: A new download handler instance.
+                 一个新的下载处理程序实例。
+         """
+         return cls(settings)
+
+     async def download_request(self, request: Request, spider: Spider) -> WebDriverResponse:
+         """
+         Download a request using Playwright.
+         使用Playwright下载请求。
+
+         This method implements the BaseDownloadHandler.download_request interface.
+         It wraps the actual download logic in _download_request and handles
+         Playwright-specific exceptions.
+         此方法实现了BaseDownloadHandler.download_request接口。
+         它将实际的下载逻辑包装在_download_request中,并处理Playwright特定的异常。
+
+         Args:
+             request: The request to download.
+                 要下载的请求。
+             spider: The spider that initiated the request.
+                 发起请求的爬虫。
+
+         Returns:
+             PlaywrightResponse: The response from the browser.
+                 来自浏览器的响应。
+
+         Raises:
+             DownloadError: If a Playwright error or any other exception occurs during the download.
+                 如果在下载过程中发生Playwright错误或任何其他异常。
+         """
+         try:
+             return await self._download_request(request, spider)
+         # except Error as e:
+         #     # Wrap Playwright-specific exceptions in a generic DownloadError
+         #     # 将Playwright特定的异常包装在通用的DownloadError中
+         #     raise DownloadError(real_error=e) from e
+         except Exception as e:
+             # Wrap any other exceptions in a generic DownloadError
+             # 将任何其他异常包装在通用的DownloadError中
+             raise DownloadError(real_error=e) from e
+
+     async def _download_request(self, request: Request, spider) -> WebDriverResponse:
+         """
+         Internal method to perform the actual download using Playwright.
+         使用Playwright执行实际下载的内部方法。
+
+         This method configures and uses a Playwright browser to perform the request,
+         handling cookies, user agent, proxies, and event listeners. It also supports
+         custom browser actions defined in the spider.
+         此方法配置并使用Playwright浏览器执行请求,处理Cookie、用户代理、代理和事件监听器。
+         它还支持在爬虫中定义的自定义浏览器操作。
+
+         Args:
+             request: The request to download.
+                 要下载的请求。
+             spider: The spider that initiated the request.
+                 发起请求的爬虫。
+
+         Returns:
+             PlaywrightResponse: The response from the browser.
+                 来自浏览器的响应。
+
+         Raises:
+             Exception: If any error occurs during the browser automation.
+                 如果在浏览器自动化过程中发生任何错误。
+         """
+         # Extract request parameters
+         # 提取请求参数
+         cookies = dict(request.cookies)
+         timeout = request.meta.get('download_timeout', 30) * 1000  # Convert to milliseconds
+         # 转换为毫秒
+         user_agent = request.headers.get("User-Agent")
+         proxy: str = request.meta.get("proxy")
+         url = request.url
+
+         # Configure browser options
+         # 配置浏览器选项
+         kwargs = dict()
+         if proxy:
+             kwargs['proxy'] = proxy
+         if user_agent:
+             kwargs['user_agent'] = user_agent
+
+         # Get a browser instance from the pool
+         # 从池中获取浏览器实例
+         driver: SbcdpDriver = await self._webdriver_pool.get(**kwargs)
+
+         # Dictionary to store responses from event listeners
+         # 存储来自事件监听器的响应的字典
+         driver.cache_response = {}
+
+         # Wrapper for event handlers to capture their return values
+         # 包装事件处理程序以捕获其返回值
+         # 为了获取监听事件中的响应结果
+         def on_event_wrap_handler(func):
+             @wraps(func)
+             async def inner(*a, **kw):
+                 ret = await func(*a, **kw)
+                 if ret:
+                     driver.cache_response[ret[0]] = ret[1]
+
+             return inner
+
+         # Set up event listeners from spider methods
+         # 从爬虫方法设置事件监听器
+         if (not driver.had_set_event_http) and (monitor_cb:=getattr(spider, "on_event_http", None)):
+             intercept_cb = getattr(spider, "on_event_http_intercept", None)
+             driver.browser.http_monitor(
+                 monitor_cb=on_event_wrap_handler(monitor_cb),
+                 intercept_cb=intercept_cb,
+                 delay_response_body=True
+             )
+             driver.had_set_event_http = True
+
+         try:
+             # Set cookies if provided
+             # 如果提供了Cookie,则设置Cookie
+             if cookies:
+                 driver.url = url
+                 await driver.set_cookies(cookies)
+
+             # Navigate to the URL
+             # 导航到URL
+             await driver.browser.get(url, timeout=timeout)
+
+             # Execute custom actions if defined in the spider
+             # 如果在爬虫中定义了自定义操作,则执行
+             if process_action_fn := getattr(spider, 'process_action', None):
+                 action_result = await call_helper(process_action_fn, driver, request)
+                 if action_result:
+                     driver.cache_response[action_result[0]] = action_result[1]
+
+             # Process any event responses
+             # 处理任何事件响应
+             for cache_key in list(driver.cache_response.keys()):
+                 if isinstance(driver.cache_response[cache_key], NetHttp):
+                     cache_ret = driver.cache_response[cache_key]
+                     # Convert sbcdp response to WebDriverResponse
+                     # 将sbcdp响应转换为WebDriverResponse
+                     driver.cache_response[cache_key] = WebDriverResponse(
+                         url=cache_ret.url,
+                         request=request,
+                         intercept_request=dict(
+                             url=cache_ret.request.url,
+                             headers=cache_ret.request.headers,
+                             data=cache_ret.request.post_data,
+                         ),
+                         headers=cache_ret.headers,
+                         body=(await cache_ret.get_response_body()).encode(),
+                         status=200,
+                     )
+
+             # Create and return the final response
+             # 创建并返回最终响应
+             return WebDriverResponse(
+                 url=await driver.browser.get_current_url(),
+                 status=200,
+                 text=await driver.browser.get_page_source(),
+                 cookies=await driver.get_cookies(),
+                 cache_response=driver.cache_response,
+                 driver=driver,
+                 driver_pool=self._webdriver_pool
+             )
+         except Exception as e:
+             # Remove the driver from the pool on error
+             # 出错时从池中移除驱动程序
+             await self._webdriver_pool.remove(driver)
+             raise e
+
+     async def close(self):
+         """
+         Close the download handler and release resources.
+         关闭下载处理程序并释放资源。
+
+         This method is called when the spider is closing. It closes all browser
+         instances in the pool and releases associated resources.
+         当爬虫关闭时调用此方法。它关闭池中的所有浏览器实例并释放相关资源。
+         """
+         # Close all browser instances in the pool
+         # 关闭池中的所有浏览器实例
+         await self._webdriver_pool.close()
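The new handler resolves two optional spider hooks at download time: `process_action(driver, request)` is awaited after navigation, and `on_event_http` (with an optional `on_event_http_intercept`) is wired into `driver.browser.http_monitor(...)`; a `(key, value)` pair returned from either ends up in the response's `cache_response`. Below is a hedged sketch of a spider using those hooks. The spider name, start URL, the callback arguments sbcdp passes to `on_event_http`, and the `response.cache_response` attribute access are assumptions made for illustration; the handler also still has to be enabled for the relevant scheme via the project's download-handler settings (extended in this release's `default_settings.py`).

```python
from aioscrapy import Request, Spider
from aioscrapy.core.downloader.handlers.webdriver import SbcdpDriver


class SbcdpDemoSpider(Spider):
    name = "sbcdp_demo"                    # illustrative
    start_urls = ["https://example.com"]   # illustrative

    async def process_action(self, driver: SbcdpDriver, request: Request):
        # Called (via call_helper) after driver.browser.get(url, ...);
        # the sbcdp AsyncChrome instance is available as driver.browser.
        html = await driver.browser.get_page_source()
        # Returning a (key, value) pair stores the value in cache_response.
        return "rendered_html", html

    async def on_event_http(self, *args, **kwargs):
        # Wrapped by on_event_wrap_handler and passed to http_monitor();
        # the exact callback arguments come from sbcdp, so they are left
        # generic here. A (key, value) return would also be cached, and
        # NetHttp values get converted to WebDriverResponse objects.
        return None

    async def parse(self, response):
        # Assumes the WebDriverResponse exposes the cached hook results
        # as response.cache_response.
        rendered = response.cache_response.get("rendered_html")
        yield {"url": response.url, "length": len(rendered or "")}
```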