aio-scrapy 2.1.6.tar.gz → 2.1.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {aio_scrapy-2.1.6/aio_scrapy.egg-info → aio_scrapy-2.1.8}/PKG-INFO +67 -128
  2. aio_scrapy-2.1.8/README.md +112 -0
  3. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8/aio_scrapy.egg-info}/PKG-INFO +67 -128
  4. aio_scrapy-2.1.8/aioscrapy/VERSION +1 -0
  5. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/aiohttp.py +3 -3
  6. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/curl_cffi.py +2 -2
  7. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/httpx.py +2 -2
  8. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/pyhttpx.py +5 -2
  9. aio_scrapy-2.1.8/aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  10. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +2 -2
  11. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/webdriver/playwright.py +2 -2
  12. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/request/__init__.py +89 -5
  13. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/settings/default_settings.py +7 -7
  14. aio_scrapy-2.1.6/README.md +0 -173
  15. aio_scrapy-2.1.6/aioscrapy/VERSION +0 -1
  16. aio_scrapy-2.1.6/aioscrapy/core/downloader/handlers/webdriver/__init__.py +0 -2
  17. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/LICENSE +0 -0
  18. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/MANIFEST.in +0 -0
  19. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aio_scrapy.egg-info/SOURCES.txt +0 -0
  20. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aio_scrapy.egg-info/dependency_links.txt +0 -0
  21. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aio_scrapy.egg-info/entry_points.txt +0 -0
  22. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aio_scrapy.egg-info/not-zip-safe +0 -0
  23. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aio_scrapy.egg-info/requires.txt +0 -0
  24. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aio_scrapy.egg-info/top_level.txt +0 -0
  25. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/__init__.py +0 -0
  26. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/__main__.py +0 -0
  27. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/cmdline.py +0 -0
  28. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/__init__.py +0 -0
  29. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/crawl.py +0 -0
  30. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/genspider.py +0 -0
  31. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/list.py +0 -0
  32. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/runspider.py +0 -0
  33. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/settings.py +0 -0
  34. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/startproject.py +0 -0
  35. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/commands/version.py +0 -0
  36. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/__init__.py +0 -0
  37. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/__init__.py +0 -0
  38. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
  39. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/requests.py +0 -0
  40. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/downloader/handlers/webdriver/driverpool.py +0 -0
  41. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/engine.py +0 -0
  42. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/scheduler.py +0 -0
  43. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/core/scraper.py +0 -0
  44. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/crawler.py +0 -0
  45. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/__init__.py +0 -0
  46. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/absmanager.py +0 -0
  47. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/aiomongo.py +0 -0
  48. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/aiomysql.py +0 -0
  49. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/aiopg.py +0 -0
  50. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/aiorabbitmq.py +0 -0
  51. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/db/aioredis.py +0 -0
  52. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/dupefilters/__init__.py +0 -0
  53. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/dupefilters/disk.py +0 -0
  54. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/dupefilters/redis.py +0 -0
  55. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/exceptions.py +0 -0
  56. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/__init__.py +0 -0
  57. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/headers.py +0 -0
  58. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/request/form.py +0 -0
  59. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/request/json_request.py +0 -0
  60. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/response/__init__.py +0 -0
  61. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/response/html.py +0 -0
  62. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/response/text.py +0 -0
  63. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/response/web_driver.py +0 -0
  64. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/http/response/xml.py +0 -0
  65. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/__init__.py +0 -0
  66. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/__init__.py +0 -0
  67. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
  68. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
  69. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
  70. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/retry.py +0 -0
  71. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/stats.py +0 -0
  72. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/downloader/useragent.py +0 -0
  73. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/extensions/__init__.py +0 -0
  74. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/extensions/closespider.py +0 -0
  75. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/extensions/corestats.py +0 -0
  76. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/extensions/logstats.py +0 -0
  77. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/extensions/metric.py +0 -0
  78. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/extensions/throttle.py +0 -0
  79. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/pipelines/__init__.py +0 -0
  80. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/pipelines/csv.py +0 -0
  81. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/pipelines/excel.py +0 -0
  82. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/pipelines/mongo.py +0 -0
  83. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/pipelines/mysql.py +0 -0
  84. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/pipelines/pg.py +0 -0
  85. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/spider/__init__.py +0 -0
  86. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/spider/depth.py +0 -0
  87. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/spider/httperror.py +0 -0
  88. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/spider/offsite.py +0 -0
  89. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/spider/referer.py +0 -0
  90. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/libs/spider/urllength.py +0 -0
  91. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/link.py +0 -0
  92. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/logformatter.py +0 -0
  93. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/middleware/__init__.py +0 -0
  94. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/middleware/absmanager.py +0 -0
  95. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/middleware/downloader.py +0 -0
  96. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/middleware/extension.py +0 -0
  97. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/middleware/itempipeline.py +0 -0
  98. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/middleware/spider.py +0 -0
  99. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/process.py +0 -0
  100. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/proxy/__init__.py +0 -0
  101. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/proxy/redis.py +0 -0
  102. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/queue/__init__.py +0 -0
  103. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/queue/memory.py +0 -0
  104. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/queue/rabbitmq.py +0 -0
  105. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/queue/redis.py +0 -0
  106. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/scrapyd/__init__.py +0 -0
  107. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/scrapyd/runner.py +0 -0
  108. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/serializer.py +0 -0
  109. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/settings/__init__.py +0 -0
  110. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/signalmanager.py +0 -0
  111. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/signals.py +0 -0
  112. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/spiderloader.py +0 -0
  113. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/spiders/__init__.py +0 -0
  114. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/statscollectors.py +0 -0
  115. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
  116. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/project/module/__init__.py +0 -0
  117. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
  118. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
  119. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
  120. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
  121. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/spiders/basic.tmpl +0 -0
  122. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/templates/spiders/single.tmpl +0 -0
  123. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/__init__.py +0 -0
  124. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/conf.py +0 -0
  125. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/curl.py +0 -0
  126. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/decorators.py +0 -0
  127. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/deprecate.py +0 -0
  128. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/httpobj.py +0 -0
  129. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/log.py +0 -0
  130. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/misc.py +0 -0
  131. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/ossignal.py +0 -0
  132. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/project.py +0 -0
  133. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/python.py +0 -0
  134. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/reqser.py +0 -0
  135. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/request.py +0 -0
  136. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/response.py +0 -0
  137. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/signal.py +0 -0
  138. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/spider.py +0 -0
  139. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/template.py +0 -0
  140. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/tools.py +0 -0
  141. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/trackref.py +0 -0
  142. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/aioscrapy/utils/url.py +0 -0
  143. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/setup.cfg +0 -0
  144. {aio_scrapy-2.1.6 → aio_scrapy-2.1.8}/setup.py +0 -0
--- aio_scrapy-2.1.6/aio_scrapy.egg-info/PKG-INFO
+++ aio_scrapy-2.1.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: aio-scrapy
-Version: 2.1.6
+Version: 2.1.8
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -76,115 +76,58 @@ Dynamic: requires-dist
 Dynamic: requires-python
 Dynamic: summary
 
-<!--
-![aio-scrapy](./doc/images/aio-scrapy.png)
--->
-### aio-scrapy
+# AioScrapy
 
-An asyncio + aiolibs crawler imitate scrapy framework
+AioScrapy是一个基于Python异步IO的强大网络爬虫框架。它的设计理念源自Scrapy,但完全基于异步IO实现,提供更高的性能和更灵活的配置选项。</br>
+AioScrapy is a powerful asynchronous web crawling framework built on Python's asyncio library. It is inspired by Scrapy but completely reimplemented with asynchronous IO, offering higher performance and more flexible configuration options.
 
-English | [中文](./doc/README_ZH.md)
+## 特性 | Features
 
-### Overview
-- aio-scrapy framework is base on opensource project Scrapy & scrapy_redis.
-- aio-scrapy implements compatibility with scrapyd.
-- aio-scrapy implements redis queue and rabbitmq queue.
-- aio-scrapy is a fast high-level web crawling and web scraping framework, used to crawl websites and extract structured data from their pages.
-- Distributed crawling/scraping.
-### Requirements
+- **完全异步**:基于Python的asyncio库,实现高效的并发爬取
+- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和playwright
+- **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
+- **强大的数据处理管道**:支持多种数据库存储选项
+- **内置信号系统**:方便的事件处理机制
+- **丰富的配置选项**:高度可定制的爬虫行为
+- **分布式爬取**:支持使用Redis和RabbitMQ进行分布式爬取
+- **数据库集成**:内置支持Redis、MySQL、MongoDB、PostgreSQL和RabbitMQ
 
-- Python 3.9+
-- Works on Linux, Windows, macOS, BSD
-
-### Install
-
-The quick way:
-
-```shell
-# Install the latest aio-scrapy
-pip install git+https://github.com/ConlinH/aio-scrapy
-
-# default
-pip install aio-scrapy
-
-# Install all dependencies
-pip install aio-scrapy[all]
-
-# When you need to use mysql/httpx/rabbitmq/mongo
-pip install aio-scrapy[aiomysql,httpx,aio-pika,mongo]
-```
-
-### Usage
-
-#### create project spider:
-
-```shell
-aioscrapy startproject project_quotes
-```
-
-```
-cd project_quotes
-aioscrapy genspider quotes
-```
-
-quotes.py
-
-```python
-from aioscrapy.spiders import Spider
-
-
-class QuotesMemorySpider(Spider):
-    name = 'QuotesMemorySpider'
-
-    start_urls = ['https://quotes.toscrape.com']
-
-    async def parse(self, response):
-        for quote in response.css('div.quote'):
-            yield {
-                'author': quote.xpath('span/small/text()').get(),
-                'text': quote.css('span.text::text').get(),
-            }
 
-            next_page = response.css('li.next a::attr("href")').get()
-            if next_page is not None:
-                yield response.follow(next_page, self.parse)
+- **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
+- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and playwright
+- **Flexible Middleware System**: Easily add custom functionality and processing logic
+- **Powerful Data Processing Pipelines**: Support for various database storage options
+- **Built-in Signal System**: Convenient event handling mechanism
+- **Rich Configuration Options**: Highly customizable crawler behavior
+- **Distributed Crawling**: Support for distributed crawling using Redis and RabbitMQ
+- **Database Integration**: Built-in support for Redis, MySQL, MongoDB, PostgreSQL, and RabbitMQ
 
+## 安装 | Installation
 
-if __name__ == '__main__':
-    QuotesMemorySpider.start()
+### 要求 | Requirements
 
-```
-
-run the spider:
+- Python 3.9+
 
-```shell
-aioscrapy crawl quotes
-```
+### 使用pip安装 | Install with pip
 
-#### create single script spider:
+```bash
+pip install aio-scrapy
 
-```shell
-aioscrapy genspider single_quotes -t single
+# Install the latest aio-scrapy
+# pip install git+https://github.com/ConlinH/aio-scrapy
 ```
 
-single_quotes.py:
-
+### 开始 | Start
 ```python
-from aioscrapy.spiders import Spider
+from aioscrapy import Spider, logger
 
 
-class QuotesMemorySpider(Spider):
-    name = 'QuotesMemorySpider'
+class MyspiderSpider(Spider):
+    name = 'myspider'
     custom_settings = {
-        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
-        'CLOSE_SPIDER_ON_IDLE': True,
-        # 'DOWNLOAD_DELAY': 3,
-        # 'RANDOMIZE_DOWNLOAD_DELAY': True,
-        # 'CONCURRENT_REQUESTS': 1,
-        # 'LOG_LEVEL': 'INFO'
+        "CLOSE_SPIDER_ON_IDLE": True
     }
-
-    start_urls = ['https://quotes.toscrape.com']
+    start_urls = ["https://quotes.toscrape.com"]
 
     @staticmethod
     async def process_request(request, spider):
@@ -203,49 +146,45 @@ class QuotesMemorySpider(Spider):
 
     async def parse(self, response):
         for quote in response.css('div.quote'):
-            yield {
+            item = {
                 'author': quote.xpath('span/small/text()').get(),
                 'text': quote.css('span.text::text').get(),
             }
-
-            next_page = response.css('li.next a::attr("href")').get()
-            if next_page is not None:
-                yield response.follow(next_page, self.parse)
+            yield item
 
     async def process_item(self, item):
-        print(item)
+        logger.info(item)
 
 
 if __name__ == '__main__':
-    QuotesMemorySpider.start()
-
+    MyspiderSpider.start()
 ```
 
-run the spider:
-
-```shell
-aioscrapy runspider quotes.py
-```
-
-
-### more commands:
-
-```shell
-aioscrapy -h
-```
-
-#### [more example](./example)
-
-### Documentation
-[doc](./doc/documentation.md)
-
-### Ready
-
-Please submit your suggestions to the owner by creating an issue
-
-## Thanks
-
-[aiohttp](https://github.com/aio-libs/aiohttp/)
-
-[scrapy](https://github.com/scrapy/scrapy)
-
+## 文档 | Documentation
+
+## 文档目录 | Documentation Contents
+- [安装指南 | Installation Guide](docs/installation.md)
+- [快速入门 | Quick Start](docs/quickstart.md)
+- [核心概念 | Core Concepts](docs/concepts.md)
+- [爬虫指南 | Spider Guide](docs/spiders.md)
+- [下载器 | Downloaders](docs/downloaders.md)
+- [中间件 | Middlewares](docs/middlewares.md)
+- [管道 | Pipelines](docs/pipelines.md)
+- [队列 | Queues](docs/queues.md)
+- [请求过滤器 | Request Filters](docs/dupefilters.md)
+- [代理 | Proxy](docs/proxy.md)
+- [数据库连接 | Database Connections](docs/databases.md)
+- [分布式部署 | Distributed Deployment](docs/distributed.md)
+- [配置参考 | Settings Reference](docs/settings.md)
+- [API参考 | API Reference](docs/api.md)
+- [示例 | Example](example)
+
+## 许可证 | License
+
+本项目采用MIT许可证 - 详情请查看LICENSE文件。</br>
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+
+## 联系
+QQ: 995018884 </br>
+WeChat: h995018884
--- /dev/null
+++ aio_scrapy-2.1.8/README.md
@@ -0,0 +1,112 @@
+# AioScrapy
+
+AioScrapy是一个基于Python异步IO的强大网络爬虫框架。它的设计理念源自Scrapy,但完全基于异步IO实现,提供更高的性能和更灵活的配置选项。</br>
+AioScrapy is a powerful asynchronous web crawling framework built on Python's asyncio library. It is inspired by Scrapy but completely reimplemented with asynchronous IO, offering higher performance and more flexible configuration options.
+
+## 特性 | Features
+
+- **完全异步**:基于Python的asyncio库,实现高效的并发爬取
+- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和playwright
+- **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
+- **强大的数据处理管道**:支持多种数据库存储选项
+- **内置信号系统**:方便的事件处理机制
+- **丰富的配置选项**:高度可定制的爬虫行为
+- **分布式爬取**:支持使用Redis和RabbitMQ进行分布式爬取
+- **数据库集成**:内置支持Redis、MySQL、MongoDB、PostgreSQL和RabbitMQ
+
+
+- **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
+- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and playwright
+- **Flexible Middleware System**: Easily add custom functionality and processing logic
+- **Powerful Data Processing Pipelines**: Support for various database storage options
+- **Built-in Signal System**: Convenient event handling mechanism
+- **Rich Configuration Options**: Highly customizable crawler behavior
+- **Distributed Crawling**: Support for distributed crawling using Redis and RabbitMQ
+- **Database Integration**: Built-in support for Redis, MySQL, MongoDB, PostgreSQL, and RabbitMQ
+
+## 安装 | Installation
+
+### 要求 | Requirements
+
+- Python 3.9+
+
+### 使用pip安装 | Install with pip
+
+```bash
+pip install aio-scrapy
+
+# Install the latest aio-scrapy
+# pip install git+https://github.com/ConlinH/aio-scrapy
+```
+
+### 开始 | Start
+```python
+from aioscrapy import Spider, logger
+
+
+class MyspiderSpider(Spider):
+    name = 'myspider'
+    custom_settings = {
+        "CLOSE_SPIDER_ON_IDLE": True
+    }
+    start_urls = ["https://quotes.toscrape.com"]
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response):
+        for quote in response.css('div.quote'):
+            item = {
+                'author': quote.xpath('span/small/text()').get(),
+                'text': quote.css('span.text::text').get(),
+            }
+            yield item
+
+    async def process_item(self, item):
+        logger.info(item)
+
+
+if __name__ == '__main__':
+    MyspiderSpider.start()
+```
+
+## 文档 | Documentation
+
+## 文档目录 | Documentation Contents
+- [安装指南 | Installation Guide](docs/installation.md)
+- [快速入门 | Quick Start](docs/quickstart.md)
+- [核心概念 | Core Concepts](docs/concepts.md)
+- [爬虫指南 | Spider Guide](docs/spiders.md)
+- [下载器 | Downloaders](docs/downloaders.md)
+- [中间件 | Middlewares](docs/middlewares.md)
+- [管道 | Pipelines](docs/pipelines.md)
+- [队列 | Queues](docs/queues.md)
+- [请求过滤器 | Request Filters](docs/dupefilters.md)
+- [代理 | Proxy](docs/proxy.md)
+- [数据库连接 | Database Connections](docs/databases.md)
+- [分布式部署 | Distributed Deployment](docs/distributed.md)
+- [配置参考 | Settings Reference](docs/settings.md)
+- [API参考 | API Reference](docs/api.md)
+- [示例 | Example](example)
+
+## 许可证 | License
+
+本项目采用MIT许可证 - 详情请查看LICENSE文件。</br>
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+
+## 联系
+QQ: 995018884 </br>
+WeChat: h995018884
--- aio_scrapy-2.1.6/PKG-INFO
+++ aio_scrapy-2.1.8/aio_scrapy.egg-info/PKG-INFO
(identical hunks to the PKG-INFO diff above)
--- /dev/null
+++ aio_scrapy-2.1.8/aioscrapy/VERSION
@@ -0,0 +1 @@
+2.1.8
--- aio_scrapy-2.1.6/aioscrapy/core/downloader/handlers/aiohttp.py
+++ aio_scrapy-2.1.8/aioscrapy/core/downloader/handlers/aiohttp.py
@@ -50,7 +50,7 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
 
         # Arguments to pass to aiohttp.ClientSession constructor
         # 传递给aiohttp.ClientSession构造函数的参数
-        self.aiohttp_client_session_args: dict = settings.getdict('AIOHTTP_CLIENT_SESSION_ARGS')
+        self.aiohttp_args: dict = settings.getdict('AIOHTTP_ARGS')
 
         # SSL verification setting
         # SSL验证设置
@@ -228,13 +228,13 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         if self.use_session:
             # Not recommended to use session, The abnormal phenomena will occurs when using tunnel proxy
             # 不建议使用会话,使用隧道代理时会出现异常现象
-            session = self.get_session(**self.aiohttp_client_session_args)
+            session = self.get_session(**self.aiohttp_args)
             async with session.request(request.method, request.url, **kwargs) as response:
                 content: bytes = await response.read()
         else:
             # Create a new session for each request (recommended)
             # 为每个请求创建一个新会话(推荐)
-            async with aiohttp.ClientSession(**self.aiohttp_client_session_args) as session:
+            async with aiohttp.ClientSession(**self.aiohttp_args) as session:
                 async with session.request(request.method, request.url, **kwargs) as response:
                     content: bytes = await response.read()
 
--- aio_scrapy-2.1.6/aioscrapy/core/downloader/handlers/curl_cffi.py
+++ aio_scrapy-2.1.8/aioscrapy/core/downloader/handlers/curl_cffi.py
@@ -44,7 +44,7 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
 
         # Arguments to pass to curl_cffi AsyncSession constructor
         # 传递给curl_cffi AsyncSession构造函数的参数
-        self.httpx_client_session_args: dict = self.settings.get('CURL_CFFI_CLIENT_SESSION_ARGS', {})
+        self.curl_cffi_args: dict = self.settings.get('CURL_CFFI_ARGS', {})
 
         # SSL verification setting
         # SSL验证设置
@@ -156,7 +156,7 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
 
         # Configure curl_cffi session
         # 配置curl_cffi会话
-        session_args = self.httpx_client_session_args.copy()
+        session_args = self.curl_cffi_args.copy()
 
         # Perform the request
         # 执行请求
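
The aiohttp and curl_cffi hunks are the same kind of change: the settings key that feeds keyword arguments into the underlying client is renamed (AIOHTTP_CLIENT_SESSION_ARGS → AIOHTTP_ARGS, CURL_CFFI_CLIENT_SESSION_ARGS → CURL_CFFI_ARGS), and the curl_cffi handler also drops the copy-pasted attribute name httpx_client_session_args in favour of curl_cffi_args. A minimal sketch of how a project upgrading to 2.1.8 might carry its aiohttp session arguments across; only the key names come from this diff, while the spider name and the trust_env value are illustrative assumptions:

```python
# Hypothetical migration sketch for aio-scrapy 2.1.8.
# Only the setting name AIOHTTP_ARGS is taken from this diff; values are examples.
from aioscrapy import Spider, logger


class UpgradedSpider(Spider):
    name = 'upgraded'
    custom_settings = {
        'CLOSE_SPIDER_ON_IDLE': True,
        # 2.1.6 read 'AIOHTTP_CLIENT_SESSION_ARGS'; 2.1.8 reads 'AIOHTTP_ARGS'
        # and passes the dict to aiohttp.ClientSession(**AIOHTTP_ARGS).
        'AIOHTTP_ARGS': {
            'trust_env': True,  # honour proxy environment variables (illustrative value)
        },
    }
    start_urls = ['https://quotes.toscrape.com']

    async def parse(self, response):
        # Yield a trivial item so the sketch is a complete, runnable spider.
        yield {'title': response.xpath('//title/text()').get()}

    async def process_item(self, item):
        logger.info(item)


if __name__ == '__main__':
    UpgradedSpider.start()
```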
--- aio_scrapy-2.1.6/aioscrapy/core/downloader/handlers/httpx.py
+++ aio_scrapy-2.1.8/aioscrapy/core/downloader/handlers/httpx.py
@@ -46,7 +46,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
 
         # Arguments to pass to httpx AsyncClient constructor
         # 传递给httpx AsyncClient构造函数的参数
-        self.httpx_client_session_args: dict = self.settings.get('HTTPX_CLIENT_SESSION_ARGS', {})
+        self.httpx_args: dict = self.settings.get('HTTPX_ARGS', {})
 
         # SSL verification setting
         # SSL验证设置
@@ -147,7 +147,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
 
         # Configure httpx client session
         # 配置httpx客户端会话
-        session_args = self.httpx_client_session_args.copy()
+        session_args = self.httpx_args.copy()
         session_args.setdefault('http2', True)  # Enable HTTP/2 by default
         # 默认启用HTTP/2
         session_args.update({
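
The httpx handler follows the same pattern: its constructor arguments now come from HTTPX_ARGS instead of HTTPX_CLIENT_SESSION_ARGS, and the second hunk shows that the dict is copied into the httpx AsyncClient with http2 defaulting to on. Under the renamed keys, per-client arguments might be declared in a project's settings roughly as follows; the key names come from this diff, while the impersonate and http2 values are illustrative examples rather than package defaults:

```python
# Hypothetical settings fragment for aio-scrapy 2.1.8.
# CURL_CFFI_ARGS / HTTPX_ARGS are the renamed keys shown in this diff.
CURL_CFFI_ARGS = {
    'impersonate': 'chrome110',  # curl_cffi browser-fingerprint option (illustrative)
}

HTTPX_ARGS = {
    'http2': False,  # override the handler's http2=True default if HTTP/2 is unwanted
}
```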