aio-scrapy 2.0.10__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff shows the content changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: aio-scrapy
- Version: 2.0.10
+ Version: 2.1.0
  Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
  Home-page: https://github.com/conlin-huang/aio-scrapy.git
  Author: conlin
@@ -44,6 +44,9 @@ Requires-Dist: asyncpg >=0.27.0 ; extra == 'all'
  Requires-Dist: XlsxWriter >=3.1.2 ; extra == 'all'
  Requires-Dist: pillow >=9.4.0 ; extra == 'all'
  Requires-Dist: requests >=2.28.2 ; extra == 'all'
+ Requires-Dist: curl-cffi ; extra == 'all'
+ Provides-Extra: curl_cffi
+ Requires-Dist: curl-cffi >=0.6.1 ; extra == 'curl_cffi'
  Provides-Extra: execl
  Requires-Dist: XlsxWriter >=3.1.2 ; extra == 'execl'
  Requires-Dist: pillow >=9.4.0 ; extra == 'execl'
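
Note: the metadata now declares a curl_cffi extra and adds curl-cffi to the all extra, so the optional dependency can be pulled in with pip install "aio-scrapy[curl_cffi]" (or "aio-scrapy[all]").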
@@ -1,8 +1,8 @@
- aioscrapy/VERSION,sha256=bkksF7-FeZMTR8EfltCUKJZNQaHaQkySSXYbwvc2qdw,6
+ aioscrapy/VERSION,sha256=gkj3dyaHr_CxA7NomJCN64ISYzf5M-SaWsBIKQk8WP8,5
  aioscrapy/__init__.py,sha256=esJeH66Mz9WV7XbotvZEjNn49jc589YZ_L2DKoD0JvA,858
  aioscrapy/__main__.py,sha256=rvTdJ0cQwbi29aucPj3jJRpccx5SBzvRcV7qvxvX2NQ,80
  aioscrapy/cmdline.py,sha256=1qhNg2Edl-Obmf2re2K4V8pJG7ubGfZZCzcHdKtdE_s,5159
- aioscrapy/crawler.py,sha256=k24cWw8tev93obQHNqhjdLMTPX3jVGoHRfS29n56etk,10109
+ aioscrapy/crawler.py,sha256=6-ptivIjIGKdojOlZqXV0hV3x1Gont81tOC5u5JqIME,10330
  aioscrapy/exceptions.py,sha256=NjA2Rx1KZsjMgH7IOdNpxuRkh-RwylRCYvEhwgXKIb8,2027
  aioscrapy/link.py,sha256=fXMqsHvYEzsuYi-sNDcElS7jV6Lusq0tjPkPUGOlyZw,1867
  aioscrapy/logformatter.py,sha256=y3etd28ACbpTbcGprJ_cQ086gxQY3k_QX_yxYFoF1AU,3028
@@ -21,12 +21,13 @@ aioscrapy/commands/settings.py,sha256=sc0rwwfBQNySKX8uV3iJqv3i7SelFwNcrlHYxDupKO
  aioscrapy/commands/startproject.py,sha256=Rcc7JkN75Jp2t2aZIxBzPsWbLXChNAUSByDhcW_6Ig8,4001
  aioscrapy/commands/version.py,sha256=yqqTMlZkkiQhtbU9w_IqUWLMOAjqYlv24friEkPRQYM,485
  aioscrapy/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- aioscrapy/core/engine.py,sha256=zW3GPigPqyrWJ_Jk7SUxD0ueV1HTuxUvweFUU4WFG-0,10926
- aioscrapy/core/scheduler.py,sha256=YCRw9j79ZOL8bijDa3IdRaw0YlMTrwXuJGzaApkN7lc,5737
+ aioscrapy/core/engine.py,sha256=h02-K2lQqlCxvNIlURgPpnhHCbyiJRIWrFJt5Ys7vZY,9843
+ aioscrapy/core/scheduler.py,sha256=sHrTfzSkqCVZTVw7zWAyv3vDd6iUwE9xbIsnePUVeZk,7408
  aioscrapy/core/scraper.py,sha256=M_bcizLUzWuECe7sIIZ_HJLNrPzL7dX2o-tN5nvFnCs,10304
  aioscrapy/core/downloader/__init__.py,sha256=22TC0z49BX3YvDUPl6DKMrOonECpY5tjaWJGGEV7RbU,9574
  aioscrapy/core/downloader/handlers/__init__.py,sha256=CriaX2Cp4jUqzDDGZDB7HiIEgUWt2pnYVho6HMV6sJ0,3198
  aioscrapy/core/downloader/handlers/aiohttp.py,sha256=dFVVeGgJ1WZcE1zI4fQOZIzmrkC6l1WZcYstHmB3qYg,3942
+ aioscrapy/core/downloader/handlers/curl_cffi.py,sha256=6jBp9WrGU0PCWd3HfXLD6P3MkMIG_zmKNCKieORVPas,2250
  aioscrapy/core/downloader/handlers/httpx.py,sha256=-DfjYgfrjxMhaMpTgEOFlQRONasCXV0g6UgH3WmWcfs,3041
  aioscrapy/core/downloader/handlers/pyhttpx.py,sha256=fgD6Kz_gfB17KHbnkFtUHJDjfYR-c9P2LhuYX4hcva8,2228
  aioscrapy/core/downloader/handlers/requests.py,sha256=I49YnAxFGf-_a_YR-1AOG8vPLMmKiMtdmP4Xn-c0dPw,1996
@@ -58,7 +59,7 @@ aioscrapy/libs/downloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
  aioscrapy/libs/downloader/defaultheaders.py,sha256=tg_ULA0Y-41bZKG607mowFJQGVfnZ45LdR044DsjA_A,563
  aioscrapy/libs/downloader/downloadtimeout.py,sha256=hNh3OEj7rC0ceQrv_yrhR5lb5AvfxJ6cspj3qsQWj4o,704
  aioscrapy/libs/downloader/ja3fingerprint.py,sha256=DgTw74GXC_Bp94eD_bwoG6A_DphUHTt7bH4glBNXyV8,1058
- aioscrapy/libs/downloader/retry.py,sha256=eaMig7JpSyr6QQBD6FNYpcttuGK811Dm4tJGTUIi3q8,5191
+ aioscrapy/libs/downloader/retry.py,sha256=nNhAqudTBhYJES1CEuzo0a-ucmS2WKcj8bOvs3PwPjw,5306
  aioscrapy/libs/downloader/stats.py,sha256=FlkS8Zm4j3SBjHb6caXwq08HvvZ37VKORGCAjlA2U38,1376
  aioscrapy/libs/downloader/useragent.py,sha256=E5x5dk9AxsSCGDDICJlTXwWXRkqAibWgesqG0VhAG8M,743
  aioscrapy/libs/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,7 +68,7 @@ aioscrapy/libs/extensions/corestats.py,sha256=WCZ4nnk6LUP7AdGx9mnuVm96iWMxHozxdN
  aioscrapy/libs/extensions/logstats.py,sha256=wSLbN9tmsw5I1FBxHjLfIdQo85fxJI7TmOefispaxc4,1844
  aioscrapy/libs/extensions/metric.py,sha256=cx9UnSdj6akzrPe_uwWHh_QKTNzD82VRrEjiiHOoAuc,5479
  aioscrapy/libs/extensions/throttle.py,sha256=yos2D3XZgH40G52kltMKv5_GeAK4MqpRwTu6FCErUh0,3512
- aioscrapy/libs/pipelines/__init__.py,sha256=x24am2am-aUjeX4XlYJxWQT0IS-jhKkQOL1MM-iWwzs,5709
+ aioscrapy/libs/pipelines/__init__.py,sha256=XW5Ur6bhvGLo-w-tdUeIB4jkFpZxqUU9mbajfAAztb0,5642
  aioscrapy/libs/pipelines/csv.py,sha256=-PEZOt-3ndF0ePO7EnqjEqeCYMJR9wHv3XcpSq6QswI,2454
  aioscrapy/libs/pipelines/execl.py,sha256=a8sfgQCHUc0MIja9cPP4TZ6ghfkxYZuAzLDIK4_nQuo,6284
  aioscrapy/libs/pipelines/mongo.py,sha256=jiPyC3C0mNb-zlS0ecEBgl883gBtBQBFEeBR8DOcmmI,2001
@@ -94,8 +95,8 @@ aioscrapy/queue/redis.py,sha256=KU31ZNciLI9xxZDxsDhtOPLtmkxZQlRPOx_1z8afdwY,4788
  aioscrapy/scrapyd/__init__.py,sha256=Ey14RVLUP7typ2XqP8RWcUum2fuFyigdhuhBBiEheIo,68
  aioscrapy/scrapyd/runner.py,sha256=tewEkdNTMrBoredCbhmdrswSrF-GWsU3MLgC__ntnzQ,1777
  aioscrapy/settings/__init__.py,sha256=GuiVhezV8U2J1B-WJwSvxxeH_1YWYD_Wighr9owC4HU,15781
- aioscrapy/settings/default_settings.py,sha256=ffGA1SKEBQtmRC7UaFcNBlZrVW9PjUwukDiARqVfTXs,5432
- aioscrapy/spiders/__init__.py,sha256=vAfod_sqXs85E-QRNji_Qhf7SyWx1kXgJD8n3AhAj1g,3934
+ aioscrapy/settings/default_settings.py,sha256=PrUOFYNnPIS8eCdqvRylMLBK-4tT-2MYuU6Nn8dQrx0,5639
+ aioscrapy/spiders/__init__.py,sha256=KoM3RMOtvWqN5Qfh6AATTWFmj9DIVmhQqrU_EhE1EdI,4010
  aioscrapy/templates/project/aioscrapy.cfg,sha256=_nRHP5wtPnZaBi7wCmjWv5BgUu5NYFJZhvCTRVSipyM,112
  aioscrapy/templates/project/module/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aioscrapy/templates/project/module/middlewares.py.tmpl,sha256=0eEf2LC0vYcWPH82HNqieYSORyUuIo3Bgl5t-neRAJ4,3469
@@ -124,9 +125,9 @@ aioscrapy/utils/template.py,sha256=HR97X4lpv2WuqhuPfzTgaBN66fYnzHVpP6zQ5IoTwcI,8
  aioscrapy/utils/tools.py,sha256=WJowViZB8XEs2CFqjVvbqXK3H5Uvf4BgWgBD_RcHMaM,2319
  aioscrapy/utils/trackref.py,sha256=0nIpelT1d5WYxALl8SGA8vHNYsh-jS0Z2lwVEAhwx8E,2019
  aioscrapy/utils/url.py,sha256=8W8tAhU7lgfPOfzKp3ejJGEcLj1i_PnA_53Jv5LpxiY,5464
- aio_scrapy-2.0.10.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
- aio_scrapy-2.0.10.dist-info/METADATA,sha256=qMfSjJmZpj8xAaoGdjEC-oNULa4wYcWFwgJJm8wBQ3U,6385
- aio_scrapy-2.0.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
- aio_scrapy-2.0.10.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
- aio_scrapy-2.0.10.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
- aio_scrapy-2.0.10.dist-info/RECORD,,
+ aio_scrapy-2.1.0.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
+ aio_scrapy-2.1.0.dist-info/METADATA,sha256=ZZlawN0H5Ngxljj10IgIy9O7RU-y_RxJ9iTEkc_YNR8,6511
+ aio_scrapy-2.1.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+ aio_scrapy-2.1.0.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
+ aio_scrapy-2.1.0.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
+ aio_scrapy-2.1.0.dist-info/RECORD,,
aioscrapy/VERSION CHANGED
@@ -1 +1 @@
- 2.0.10
+ 2.1.0
aioscrapy/core/downloader/handlers/curl_cffi.py ADDED
@@ -0,0 +1,61 @@
+ import ssl
+
+ from curl_cffi.requests import AsyncSession
+
+ from aioscrapy import Request
+ from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+ from aioscrapy.http import HtmlResponse
+ from aioscrapy.settings import Settings
+ from aioscrapy.utils.log import logger
+
+
+ class CurlCffiDownloadHandler(BaseDownloadHandler):
+
+     def __init__(self, settings):
+         self.settings: Settings = settings
+         self.httpx_client_session_args: dict = self.settings.get('CURL_CFFI_CLIENT_SESSION_ARGS', {})
+         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
+
+     @classmethod
+     def from_settings(cls, settings: Settings):
+         return cls(settings)
+
+     async def download_request(self, request: Request, _) -> HtmlResponse:
+         kwargs = {
+             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
+             'cookies': dict(request.cookies),
+             'verify': request.meta.get('verify_ssl', self.verify_ssl),
+             'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
+                 'dont_redirect') is None else request.meta.get('dont_redirect'),
+             'impersonate': request.meta.get('impersonate'),
+         }
+         post_data = request.body or None
+         if isinstance(post_data, dict):
+             kwargs['json'] = post_data
+         else:
+             kwargs['data'] = post_data
+
+         headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
+         kwargs['headers'] = headers
+
+         proxy = request.meta.get("proxy")
+         if proxy:
+             kwargs["proxies"] = {'http': proxy, 'https': proxy}
+             logger.debug(f"use proxy {proxy}: {request.url}")
+
+         session_args = self.httpx_client_session_args.copy()
+
+         async with AsyncSession(**session_args) as session:
+             response = await session.request(request.method, request.url, **kwargs)
+
+         return HtmlResponse(
+             str(response.url),
+             status=response.status_code,
+             headers=response.headers,
+             body=response.text,
+             cookies=dict(response.cookies),
+             encoding=response.encoding
+         )
+
+     async def close(self):
+         pass
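
For orientation, a minimal spider-side sketch of the options this handler reads: CURL_CFFI_CLIENT_SESSION_ARGS and VERIFY_SSL from settings, and impersonate / verify_ssl / proxy from request.meta. The spider, URLs and proxy below are placeholders, and wiring the handler into the downloader (see the DOWNLOAD_HANDLERS_MAP entry further down) is assumed to be done in the project settings, so treat this as a sketch rather than documented package usage:

    from aioscrapy import Request
    from aioscrapy.spiders import Spider


    class ImpersonateSpider(Spider):
        # Placeholder spider, not part of the package
        name = 'impersonate_demo'
        start_urls = ['https://example.com/']

        custom_settings = {
            'VERIFY_SSL': True,                   # default when a request has no 'verify_ssl' meta key
            'CURL_CFFI_CLIENT_SESSION_ARGS': {},  # kwargs forwarded to curl_cffi.requests.AsyncSession
        }

        async def parse(self, response):
            # Per-request options read by CurlCffiDownloadHandler.download_request
            yield Request(
                'https://example.com/detail',
                meta={
                    'impersonate': 'chrome110',        # browser fingerprint name passed through to curl_cffi
                    'verify_ssl': False,               # overrides VERIFY_SSL for this request only
                    'proxy': 'http://127.0.0.1:8888',  # mapped to curl_cffi's proxies argument
                },
                callback=self.parse_detail,
            )

        async def parse_detail(self, response):
            print(response.status)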
aioscrapy/core/engine.py CHANGED
@@ -42,8 +42,6 @@ class ExecutionEngine(object):
          self.signals = crawler.signals
          self.logformatter = crawler.logformatter

-         self.enqueue_cache_num = self.settings.getint("ENQUEUE_CACHE_NUM")
-         self.enqueue_cache: Queue = Queue(self.enqueue_cache_num)
          self.slot: Optional[Slot] = None
          self.spider: Optional[Spider] = None
          self.downloader: Optional[DownloaderTV] = None
@@ -53,7 +51,6 @@ class ExecutionEngine(object):
          self.running: bool = False
          self.unlock: bool = True
          self.finish: bool = False
-         self.enqueue_unlock: bool = True

      async def start(
              self,
@@ -70,7 +67,6 @@ class ExecutionEngine(object):
          while not self.finish:
              self.running and await self._next_request()
              await asyncio.sleep(1)
-             self.enqueue_cache_num != 1 and create_task(self._crawl())
              self.running and await self._spider_idle(self.spider)

      async def stop(self, reason: str = 'shutdown') -> None:
@@ -81,7 +77,6 @@ class ExecutionEngine(object):

          while not self.is_idle():
              await asyncio.sleep(0.2)
-             self.enqueue_cache_num != 1 and create_task(self._crawl())
          await self.close_spider(self.spider, reason=reason)
          await self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
          self.finish = True
@@ -212,27 +207,8 @@ class ExecutionEngine(object):
          return True

      async def crawl(self, request: Request) -> None:
-         if self.enqueue_cache_num == 1:
-             await self.scheduler.enqueue_request(request)
-             create_task(self._next_request())
-         else:
-             await self.enqueue_cache.put(request)
-
-     async def _crawl(self) -> None:
-         if not self.enqueue_unlock:
-             return
-         self.enqueue_unlock = False
-         requests = []
-         for _ in range(self.enqueue_cache.qsize()):
-             try:
-                 request = self.enqueue_cache.get_nowait()
-                 requests.append(request)
-             except QueueEmpty:
-                 break
-         if requests:
-             await call_helper(self.scheduler.enqueue_request_batch, requests)
-             create_task(self._next_request())
-         self.enqueue_unlock = True
+         await self.scheduler.enqueue_request(request)
+         # create_task(self._next_request())

      async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None:
          """Close (cancel) spider and clear all its outstanding requests"""
@@ -276,7 +252,6 @@ class ExecutionEngine(object):
          # method of 'has_pending_requests' has IO, so method of 'is_idle' execute twice
          if self.is_idle() \
                  and self.slot.start_requests is None \
-                 and self.enqueue_unlock and self.enqueue_cache.empty() \
                  and not await self.scheduler.has_pending_requests() \
                  and self.is_idle():
              await self.stop(reason='finished')
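
Note: the engine no longer keeps its own enqueue buffer (ENQUEUE_CACHE_NUM, the enqueue_cache queue and _crawl() are gone); crawl() now hands requests straight to the scheduler, and optional buffering moves into the scheduler's new cache queue shown in the scheduler.py changes below.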
aioscrapy/core/scheduler.py CHANGED
@@ -31,7 +31,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
      @classmethod
      async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler":
          """
-         Factory method which receives the current :class:`~scrapy.crawler.Crawler` object as argument.
+         Factory method which receives the current :class:`~aioscrapy.crawler.Crawler` object as argument.
          """
          return cls()

@@ -103,20 +103,27 @@ class Scheduler(BaseScheduler):
              queue: AbsQueue,
              spider: aioscrapy.Spider,
              stats=Optional[StatsCollector],
-             persist: bool = True
+             persist: bool = True,
+             cache_queue: Optional[AbsQueue] = None
      ):
+
          self.queue = queue
+         self.cache_queue = cache_queue
          self.spider = spider
          self.stats = stats
          self.persist = persist

      @classmethod
      async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV:
+         cache_queue = None
+         if crawler.settings.getbool('USE_SCHEDULER_QUEUE_CACHE', False):
+             cache_queue = await load_instance('aioscrapy.queue.memory.SpiderPriorityQueue', spider=crawler.spider)
          instance = cls(
              await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider),
              crawler.spider,
              stats=crawler.stats,
-             persist=crawler.settings.getbool('SCHEDULER_PERSIST', True)
+             persist=crawler.settings.getbool('SCHEDULER_PERSIST', True),
+             cache_queue=cache_queue
          )

          if crawler.settings.getbool('SCHEDULER_FLUSH_ON_START', False):
@@ -128,8 +135,20 @@ class Scheduler(BaseScheduler):
          return instance

      async def close(self, reason: str) -> None:
+
          if not self.persist:
              await self.flush()
+             return
+
+         # If persisting, push the requests left in the cache queue back to the distributed queue (e.g. redis)
+         if self.cache_queue is not None:
+             while True:
+                 temp = []
+                 async for request in self.cache_queue.pop(2000):
+                     temp.append(request)
+                 temp and await self.queue.push_batch(temp)
+                 if len(temp) < 2000:
+                     break

      async def flush(self) -> None:
          await call_helper(self.queue.clear)
@@ -141,16 +160,38 @@ class Scheduler(BaseScheduler):
          return True

      async def enqueue_request(self, request: aioscrapy.Request) -> bool:
-         await call_helper(self.queue.push, request)
+         """
+         If the cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), push the request to the cache queue first
+         """
+         if self.cache_queue is not None:
+             await call_helper(self.cache_queue.push, request)
+         else:
+             await call_helper(self.queue.push, request)
          if self.stats:
              self.stats.inc_value(self.queue.inc_key, spider=self.spider)
          return True

      async def next_request(self, count: int = 1) -> Optional[aioscrapy.Request]:
+         """
+         If the cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), take requests from the cache queue first, then fall back to the distributed queue (e.g. redis)
+         """
+         flag = False
+         if self.cache_queue is not None:
+             async for request in self.cache_queue.pop(count):
+                 if request and self.stats:
+                     self.stats.inc_value(self.queue.inc_key, spider=self.spider)
+                 yield request
+                 flag = True
+
+         if flag:
+             return
+
          async for request in self.queue.pop(count):
              if request and self.stats:
                  self.stats.inc_value(self.queue.inc_key, spider=self.spider)
              yield request

+
      async def has_pending_requests(self) -> bool:
-         return await call_helper(self.queue.len) > 0
+         return await call_helper(self.queue.len) if self.cache_queue is None \
+             else (await call_helper(self.queue.len) + await call_helper(self.cache_queue.len)) > 0
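
A short sketch of switching the new cache on from project settings. USE_SCHEDULER_QUEUE_CACHE is the only key introduced by this change; the other names already appear in the hunk above, the queue class path is hypothetical and the values are illustrative:

    # settings.py (illustrative values)
    USE_SCHEDULER_QUEUE_CACHE = True   # buffer new requests in an in-memory SpiderPriorityQueue first
    SCHEDULER_PERSIST = True           # on close(), cached requests are pushed back to the shared queue in batches of 2000
    SCHEDULER_FLUSH_ON_START = False
    SCHEDULER_QUEUE_CLASS = 'aioscrapy.queue.redis.SpiderPriorityQueue'  # hypothetical distributed queue class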
aioscrapy/crawler.py CHANGED
@@ -234,9 +234,12 @@ class CrawlerProcess(CrawlerRunner):
          finally:
              await self.recycle_db_connect()

-     def start(self) -> None:
+     def start(self, use_windows_selector_eventLoop: bool = False) -> None:
          if sys.platform.startswith('win'):
-             asyncio.set_event_loop(asyncio.windows_events.ProactorEventLoop())
+             if use_windows_selector_eventLoop:
+                 asyncio.set_event_loop_policy(asyncio.windows_events.WindowsSelectorEventLoopPolicy())
+             else:
+                 asyncio.set_event_loop(asyncio.windows_events.ProactorEventLoop())
          else:
              try:
                  import uvloop
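
A sketch of the new switch from the CrawlerProcess side; MySpider stands in for any project spider and the keyword only has an effect on Windows:

    from aioscrapy.crawler import CrawlerProcess
    from aioscrapy.utils.project import get_project_settings

    cp = CrawlerProcess(get_project_settings())
    cp.crawl(MySpider)  # MySpider: placeholder spider class defined elsewhere
    # Run on the selector event loop on Windows instead of the default ProactorEventLoop,
    # e.g. when a dependency does not support the proactor loop.
    cp.start(use_windows_selector_eventLoop=True)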
aioscrapy/libs/downloader/retry.py CHANGED
@@ -61,6 +61,14 @@ try:
  except ImportError:
      pass

+
+ try:
+     from curl_cffi.curl import CurlError
+
+     NEED_RETRY_ERROR += (CurlError,)
+ except ImportError:
+     pass
+
  from aioscrapy.exceptions import NotConfigured
  from aioscrapy.http.request import Request
  from aioscrapy.spiders import Spider
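
This adds curl_cffi's CurlError to NEED_RETRY_ERROR, so transport errors raised by the new curl_cffi handler are treated as retryable in the same way as the errors of the other optional handlers; the import stays guarded, so the module still loads when curl_cffi is not installed.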
aioscrapy/libs/pipelines/__init__.py CHANGED
@@ -8,15 +8,13 @@ class SqlFormat:

      @staticmethod
      def pg_insert(table: str, fields: list, *args) -> str:
-         fields = ','.join(fields)
          placeholder = ','.join([f'${i + 1}' for i in range(len(fields))])
-         return f'''INSERT INTO {table} ({fields}) VALUES ({placeholder})'''
+         return f'''INSERT INTO {table} ({",".join(fields)}) VALUES ({placeholder})'''

      @staticmethod
      def pg_ignore_insert(table: str, fields: list, *args) -> str:
          placeholder = ','.join([f'${i + 1}' for i in range(len(fields))])
-         fields = ','.join(fields)
-         return f'INSERT INTO {table} ({fields}) VALUES ({placeholder}) ON CONFLICT DO NOTHING'
+         return f'''INSERT INTO {table} ({",".join(fields)}) VALUES ({placeholder}) ON CONFLICT DO NOTHING'''

      @staticmethod
      def pg_update_insert(table: str, fields: list, update_fields: list, on_conflict: str, *args) -> str:
@@ -25,8 +23,7 @@ class SqlFormat:
          if not update_fields:
              update_fields = fields
          update_fields = ','.join([f"{key} = excluded.{key}" for key in update_fields])
-         fields = ','.join(fields)
-         return f'INSERT INTO {table} ({fields}) VALUES ({placeholder}) ON CONFLICT({on_conflict}) DO UPDATE SET {update_fields}'
+         return f'''INSERT INTO {table} ({",".join(fields)}) VALUES ({placeholder}) ON CONFLICT({on_conflict}) DO UPDATE SET {update_fields}'''

      @staticmethod
      def mysql_insert(table: str, fields: list, *args) -> str:
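
The helpers now join the column names inline instead of rebinding fields to a string; for pg_insert this also means the number of $n placeholders is derived from the number of columns rather than from the length of the joined string. A quick sketch of the generated SQL (table and column names are made up):

    from aioscrapy.libs.pipelines import SqlFormat

    # INSERT INTO article (title,author) VALUES ($1,$2)
    print(SqlFormat.pg_insert('article', ['title', 'author']))

    # INSERT INTO article (title,author) VALUES ($1,$2) ON CONFLICT DO NOTHING
    print(SqlFormat.pg_ignore_insert('article', ['title', 'author']))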
aioscrapy/settings/default_settings.py CHANGED
@@ -72,6 +72,10 @@ DOWNLOAD_HANDLERS_MAP = {
          'http': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
          'https': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
      },
+     'curl_cffi': {
+         'http': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
+         'https': 'aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler',
+     },
  }

  DOWNLOAD_TIMEOUT = 180 # 3mins
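
The new map entry only registers the handler classes under the 'curl_cffi' key; selecting it is project configuration that this hunk does not show. A hedged sketch, assuming a handler-type setting keyed by the map names (the DOWNLOAD_HANDLERS_TYPE name is an assumption, not confirmed by this diff):

    # settings.py (sketch)
    DOWNLOAD_HANDLERS_TYPE = 'curl_cffi'  # assumed selector for DOWNLOAD_HANDLERS_MAP entries
    CURL_CFFI_CLIENT_SESSION_ARGS = {}    # forwarded to curl_cffi.requests.AsyncSession by the handler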
aioscrapy/spiders/__init__.py CHANGED
@@ -106,7 +106,7 @@ class Spider(object):
      __repr__ = __str__

      @classmethod
-     def start(cls, setting_path=None):
+     def start(cls, setting_path=None, use_windows_selector_eventLoop: bool = False):
          from aioscrapy.crawler import CrawlerProcess
          from aioscrapy.utils.project import get_project_settings

@@ -115,7 +115,7 @@ class Spider(object):
              settings.setmodule(setting_path)
          cp = CrawlerProcess(settings)
          cp.crawl(cls)
-         cp.start()
+         cp.start(use_windows_selector_eventLoop)

      def spider_idle(self):
          if not self.close_on_idle:
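
The same switch is exposed through the Spider.start shortcut changed above; DemoSpider is a placeholder:

    from aioscrapy.spiders import Spider


    class DemoSpider(Spider):
        # Placeholder spider
        name = 'demo'
        start_urls = ['https://example.com/']

        async def parse(self, response):
            print(response.status)


    if __name__ == '__main__':
        # On Windows this now runs the crawl on a selector event loop instead of the proactor loop.
        DemoSpider.start(use_windows_selector_eventLoop=True)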