coocan 0.3.3__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coocan-0.3.3 → coocan-0.4.3}/PKG-INFO +1 -1
- coocan-0.4.3/coocan/cmd/__init__.py +0 -0
- coocan-0.4.3/coocan/cmd/cli.py +29 -0
- coocan-0.4.3/coocan/cmd/templates/__init__.py +0 -0
- coocan-0.4.3/coocan/cmd/templates/spider.py +12 -0
- coocan-0.4.3/coocan/gen.py +33 -0
- coocan-0.4.3/coocan/spider/base.py +143 -0
- {coocan-0.3.3 → coocan-0.4.3}/coocan/url/request.py +6 -1
- {coocan-0.3.3 → coocan-0.4.3}/coocan.egg-info/PKG-INFO +1 -1
- {coocan-0.3.3 → coocan-0.4.3}/coocan.egg-info/SOURCES.txt +7 -0
- coocan-0.4.3/coocan.egg-info/entry_points.txt +2 -0
- coocan-0.4.3/coocan.egg-info/requires.txt +1 -0
- {coocan-0.3.3 → coocan-0.4.3}/setup.py +9 -1
- coocan-0.3.3/coocan/spider/base.py +0 -98
- {coocan-0.3.3 → coocan-0.4.3}/coocan/__init__.py +0 -0
- {coocan-0.3.3 → coocan-0.4.3}/coocan/spider/__init__.py +0 -0
- {coocan-0.3.3 → coocan-0.4.3}/coocan/url/__init__.py +0 -0
- {coocan-0.3.3 → coocan-0.4.3}/coocan/url/response.py +0 -0
- {coocan-0.3.3 → coocan-0.4.3}/coocan.egg-info/dependency_links.txt +0 -0
- {coocan-0.3.3 → coocan-0.4.3}/coocan.egg-info/top_level.txt +0 -0
- {coocan-0.3.3 → coocan-0.4.3}/setup.cfg +0 -0
File without changes
|
@@ -0,0 +1,29 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
|
3
|
+
import click
|
4
|
+
|
5
|
+
TEMPLATE_DIR = Path(__file__).parent / "templates"
|
6
|
+
|
7
|
+
|
8
|
+
@click.command()
@click.option('-s', '--spider', required=True, help='新建爬虫')
def main(spider):
    """Create ``<spider>.py`` in the working directory from the bundled template.

    The template is ``spider.py`` inside ``TEMPLATE_DIR``; its text is copied
    verbatim into the new file.

    Args:
        spider: Base name of the spider module to create (without ``.py``).

    Raises:
        click.ClickException: If the template cannot be read or the new file
            cannot be written. The underlying error is echoed first and kept
            as the exception's cause.
    """
    spider_file_name = "{}.py".format(spider)
    template_path = TEMPLATE_DIR / "spider.py"

    try:
        # Read and write with an explicit encoding so the copy does not depend
        # on the platform's locale default.
        content = template_path.read_text(encoding="utf-8")
        with open(spider_file_name, 'w', encoding="utf-8") as f:
            f.write(content)
    except Exception as e:
        click.echo(str(e))
        raise click.ClickException("Failed Create Spider {}".format(spider_file_name)) from e

    # Reported outside the try block so only file errors count as a failure.
    click.echo("Success Create Spider {}".format(spider_file_name))
|
26
|
+
|
27
|
+
|
28
|
+
if __name__ == '__main__':
|
29
|
+
main()
|
File without changes
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from coocan import Request, Response, MiniSpider
|
2
|
+
|
3
|
+
|
4
|
+
class Spider(MiniSpider):
    """Template spider: one start URL, a Referer header, and an empty parse hook."""

    max_requests = 10
    start_urls = ['https://github.com/markadc/coocan']

    def middleware(self, request: Request):
        """Set the Referer header on the outgoing request.

        NOTE(review): this override does not call ``super().middleware``, so
        whatever the base class does in its own ``middleware`` is skipped for
        this spider -- confirm that is intended.
        """
        request.headers["Referer"] = "https://github.com"

    def parse(self, response: Response):
        """Callback for the start URL; left empty for the user to fill in."""
|
@@ -0,0 +1,33 @@
|
|
1
|
+
import random
|
2
|
+
|
3
|
+
|
4
|
+
def gen_random_os() -> str:
    """Return a randomly chosen operating-system token."""
    candidates = (
        "Windows NT 10.0; Win64; x64",
        "Windows NT 6.1; WOW64",
        "Macintosh; Intel Mac OS X 10_15_6",
        "X11; Linux x86_64",
        "Windows NT 6.3; Trident/7.0",
    )
    picked = random.choice(candidates)
    return picked
|
14
|
+
|
15
|
+
|
16
|
+
def gen_random_browser() -> str:
    """Return a random ``"<Browser>/<major>.0"`` browser-and-version token.

    The browser family is chosen uniformly, then a single major version is
    drawn from that family's inclusive range, so no random draws are made for
    the families that were not selected.

    Returns:
        A string such as ``"Chrome/87.0"``.
    """
    # Inclusive (lowest, highest) major version for each browser family.
    version_ranges = {
        "Chrome": (70, 100),
        "Firefox": (70, 100),
        "Edge": (80, 100),
        "Safari": (10, 14),
        "Opera": (60, 80),
    }
    browser = random.choice(list(version_ranges))
    lowest, highest = version_ranges[browser]
    version = random.randint(lowest, highest)
    return f"{browser}/{version}.0"
|
27
|
+
|
28
|
+
|
29
|
+
def gen_random_ua() -> str:
    """Build a random User-Agent string from a random OS token and browser token."""
    platform_token = gen_random_os()
    browser_token = gen_random_browser()
    return f"Mozilla/5.0 ({platform_token}) AppleWebKit/537.36 (KHTML, like Gecko) {browser_token} Safari/537.36"
|
@@ -0,0 +1,143 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Iterator
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from coocan.gen import gen_random_ua
|
7
|
+
from coocan.url import Request, Response
|
8
|
+
|
9
|
+
|
10
|
+
class IgnoreRequest(Exception):
    """Ignore this request; it will not be retried."""
|
13
|
+
|
14
|
+
|
15
|
+
class IgnoreResponse(Exception):
    """Ignore this response; the callback is not invoked for it."""
|
18
|
+
|
19
|
+
|
20
|
+
class MiniSpider:
    """Small asyncio spider: worker tasks drain a priority queue of requests.

    Subclasses usually set ``start_urls`` and override ``parse``. The other
    hooks (``middleware``, ``validator``, ``handle_request_excetpion``,
    ``handle_callback_excetpion``) have working defaults.
    """

    # URLs that ``start_requests`` turns into the initial requests.
    start_urls = []
    # Number of worker tasks; also the size of the concurrency semaphore.
    max_requests = 5
    # Extra attempts after a failed send (total attempts = max_retry_times + 1).
    max_retry_times = 3
    # When truthy, ``middleware`` fills in a random User-Agent if none is set.
    enable_random_ua = True
    # Headers merged into every request by ``middleware``.
    headers_extra_field = {}
    # Seconds slept before every send attempt.
    delay = 0

    def start_requests(self):
        """Yield the initial requests, one per entry of ``start_urls``.

        Raises:
            AssertionError: If ``start_urls`` is empty.
        """
        assert self.start_urls, "没有起始 URL 列表"
        for url in self.start_urls:
            yield Request(url, self.parse)

    def middleware(self, request: Request):
        """Adjust a request in place before it is sent (runs on every attempt)."""
        # Random User-Agent, only when the request does not already carry one.
        if self.enable_random_ua:
            request.headers.setdefault("User-Agent", gen_random_ua())

        # Merge the spider-wide extra header fields.
        if self.headers_extra_field:
            request.headers.update(self.headers_extra_field)

    def validator(self, response: Response):
        """Check a response before its callback runs.

        Raise ``IgnoreResponse`` to skip the callback for this response. Any
        other exception is logged and the callback still runs. The default
        implementation accepts every response.
        """
        pass

    def parse(self, response: Response):
        """Default callback; subclasses must override it."""
        raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))

    def handle_request_excetpion(self, e: Exception, request: Request):
        """Handle an exception raised while preparing or sending a request.

        Return a ``Request`` to enqueue it in place of further retries, raise
        ``IgnoreRequest`` to drop the request without retrying, or return
        ``None`` (the default, after logging) to let the retry loop continue.
        """
        logger.error("{} {}".format(type(e).__name__, request.url))

    def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
        """Handle an exception raised by a callback; the default logs it."""
        logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))

    async def _process_request(self, req: Request, queue: asyncio.PriorityQueue) -> None:
        """Send one request with retries, then validate it and run its callback."""
        for attempt in range(self.max_retry_times + 1):
            # Past the first attempt we are retrying.
            if attempt > 0:
                logger.debug("正在重试第{}次... {}".format(attempt, req.url))

            # Send the request.
            try:
                self.middleware(req)
                await asyncio.sleep(self.delay)
                resp = await req.send()
            except Exception as e:
                try:
                    result = self.handle_request_excetpion(e, req)
                except IgnoreRequest as ignored:
                    logger.debug("{} 忽略请求 {}".format(ignored, req.url))
                    return
                except Exception as handler_error:
                    logger.error("`处理异常函数`异常了 | {} | {}".format(handler_error, req.url))
                    continue
                if isinstance(result, Request):
                    # The handler supplied a replacement; stop retrying this one.
                    await queue.put(result)
                    return
                continue

            # The send succeeded: validate the response.
            try:
                self.validator(resp)
            except IgnoreResponse as ignored:
                logger.debug("{} 忽略响应 {}".format(ignored, req.url))
                return
            except Exception as e:
                # A broken validator does not block the callback.
                logger.error("`校验器`函数异常了 | {} | {}".format(e, req.url))

            # Run the callback and enqueue any follow-up requests it yields.
            try:
                cached = req.callback(Response(resp), **req.cb_kwargs)
                if isinstance(cached, Iterator):
                    for next_request in cached:
                        await queue.put(next_request)
            except Exception as e:
                try:
                    self.handle_callback_excetpion(e, req, resp)
                except Exception as handler_error:
                    # Previously swallowed silently by ``finally: break``.
                    logger.error("`回调异常处理函数`异常了 | {} | {}".format(handler_error, req.url))
            # A successful send is never retried, whatever the callback did.
            # Plain control flow is used instead of ``finally: break`` so that
            # KeyboardInterrupt and task cancellation are not discarded.
            return

    async def worker(self, queue: asyncio.PriorityQueue, semaphore: asyncio.Semaphore):
        """Worker coroutine: take requests off the queue and process them."""
        while True:
            req: Request = await queue.get()

            # A request with an empty URL is the stop signal.
            if req.url == "":
                break

            try:
                # Limit concurrency.
                async with semaphore:
                    await self._process_request(req, queue)
            finally:
                # Always mark the item done so ``queue.join()`` cannot hang.
                queue.task_done()

    async def run(self):
        """Crawl entry point: start workers, feed the queue, wait, shut down."""
        queue = asyncio.PriorityQueue()
        semaphore = asyncio.Semaphore(self.max_requests)

        # Start the worker tasks.
        workers = [
            asyncio.create_task(self.worker(queue, semaphore))
            for _ in range(self.max_requests)
        ]

        try:
            # Enqueue the initial requests.
            for req in self.start_requests():
                await queue.put(req)

            # Wait until every queued request has been processed.
            await queue.join()

            # Send one stop signal per worker.
            for _ in range(self.max_requests):
                await queue.put(Request(url=""))

            # Wait for all workers to exit.
            await asyncio.gather(*workers)
        finally:
            # On failure, do not leave workers blocked on ``queue.get()``.
            for task in workers:
                if not task.done():
                    task.cancel()

    def go(self):
        """Run the spider to completion in a fresh event loop."""
        asyncio.run(self.run())
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import time
|
1
2
|
from typing import Callable
|
2
3
|
|
3
4
|
import httpx
|
@@ -6,7 +7,7 @@ cli = httpx.AsyncClient()
|
|
6
7
|
|
7
8
|
|
8
9
|
class Request:
|
9
|
-
def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
|
10
|
+
def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, priority=None):
|
10
11
|
self.url = url
|
11
12
|
self.callback = callback
|
12
13
|
self.params = params
|
@@ -15,6 +16,7 @@ class Request:
|
|
15
16
|
self.json = json
|
16
17
|
self.timeout = timeout
|
17
18
|
self.cb_kwargs = cb_kwargs or {}
|
19
|
+
self.priority = priority or time.time()
|
18
20
|
|
19
21
|
async def send(self):
|
20
22
|
if (self.data and self.json) is None:
|
@@ -24,3 +26,6 @@ class Request:
|
|
24
26
|
else:
|
25
27
|
raise Exception("仅支持 GET 和 POST 请求")
|
26
28
|
return response
|
29
|
+
|
30
|
+
def __lt__(self, other):
    """Order by ``priority``; a lower value sorts first.

    Returns ``NotImplemented`` when ``other`` has no ``priority`` attribute,
    so Python raises its standard ``TypeError`` for unsupported comparisons
    instead of an ``AttributeError`` leaking out of this method.
    """
    try:
        other_priority = other.priority
    except AttributeError:
        return NotImplemented
    return self.priority < other_priority
|
@@ -1,9 +1,16 @@
|
|
1
1
|
setup.py
|
2
2
|
coocan/__init__.py
|
3
|
+
coocan/gen.py
|
3
4
|
coocan.egg-info/PKG-INFO
|
4
5
|
coocan.egg-info/SOURCES.txt
|
5
6
|
coocan.egg-info/dependency_links.txt
|
7
|
+
coocan.egg-info/entry_points.txt
|
8
|
+
coocan.egg-info/requires.txt
|
6
9
|
coocan.egg-info/top_level.txt
|
10
|
+
coocan/cmd/__init__.py
|
11
|
+
coocan/cmd/cli.py
|
12
|
+
coocan/cmd/templates/__init__.py
|
13
|
+
coocan/cmd/templates/spider.py
|
7
14
|
coocan/spider/__init__.py
|
8
15
|
coocan/spider/base.py
|
9
16
|
coocan/url/__init__.py
|
@@ -0,0 +1 @@
|
|
1
|
+
click>=8.0.0
|
@@ -2,10 +2,18 @@ from setuptools import setup, find_packages
|
|
2
2
|
|
3
3
|
setup(
    name="coocan",
    version="0.4.3",
    author="wauo",
    author_email="markadc@126.com",
    description="Air Spider Framework",
    packages=find_packages(),
    python_requires=">=3.10",
    # Every third-party package the library imports at runtime must be listed,
    # otherwise a clean install fails on import: click (CLI), httpx (request
    # sending) and loguru (spider logging).
    install_requires=[
        'click>=8.0.0',
        'httpx',
        'loguru',
    ],
    entry_points={
        'console_scripts': [
            # NOTE(review): `cc` is also the usual name of the system C
            # compiler driver; inside an activated environment this script may
            # shadow it on PATH -- confirm the name is intended.
            'cc=coocan.cmd.cli:main',
        ],
    },
)
|
@@ -1,98 +0,0 @@
|
|
1
|
-
import asyncio
|
2
|
-
from collections.abc import Iterator
|
3
|
-
|
4
|
-
from loguru import logger
|
5
|
-
|
6
|
-
from coocan.url import Request, Response
|
7
|
-
|
8
|
-
|
9
|
-
class IgnoreRequest(Exception):
|
10
|
-
pass
|
11
|
-
|
12
|
-
|
13
|
-
class MiniSpider:
|
14
|
-
start_urls = []
|
15
|
-
max_requests = 5
|
16
|
-
max_retry_times = 3
|
17
|
-
|
18
|
-
def start_requests(self):
|
19
|
-
"""初始请求"""
|
20
|
-
assert self.start_urls, "没有起始 URL 列表"
|
21
|
-
for url in self.start_urls:
|
22
|
-
yield Request(url, self.parse)
|
23
|
-
|
24
|
-
def middleware(self, request: Request):
|
25
|
-
request.headers.setdefault("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0")
|
26
|
-
|
27
|
-
async def get_response(self, request: Request):
|
28
|
-
"""发送请求,获取响应"""
|
29
|
-
self.middleware(request)
|
30
|
-
response = await request.send()
|
31
|
-
return response
|
32
|
-
|
33
|
-
def parse(self, response: Response):
|
34
|
-
raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))
|
35
|
-
|
36
|
-
def when_request_error(self, e: Exception, request: Request):
|
37
|
-
logger.error("{} {}".format(type(e).__name__, request.url))
|
38
|
-
|
39
|
-
async def worker(self, queue, semaphore):
|
40
|
-
"""工作协程,从队列中获取请求并处理"""
|
41
|
-
while True:
|
42
|
-
request = await queue.get()
|
43
|
-
|
44
|
-
# 结束信号
|
45
|
-
if request is None:
|
46
|
-
break
|
47
|
-
|
48
|
-
# 控制并发
|
49
|
-
async with semaphore:
|
50
|
-
for i in range(self.max_retry_times + 1):
|
51
|
-
if i > 0:
|
52
|
-
logger.debug("正在重试第{}次...{}".format(i, request.url))
|
53
|
-
try:
|
54
|
-
response = await self.get_response(request)
|
55
|
-
cached = request.callback(Response(response), **request.cb_kwargs)
|
56
|
-
if isinstance(cached, Iterator):
|
57
|
-
for next_request in cached:
|
58
|
-
await queue.put(next_request) # 将后续请求加入队列
|
59
|
-
except Exception as e:
|
60
|
-
try:
|
61
|
-
result = self.when_request_error(e, request)
|
62
|
-
if isinstance(result, Request):
|
63
|
-
await queue.put(result)
|
64
|
-
logger.debug("新的请求 {}".format(result.url))
|
65
|
-
except IgnoreRequest as e:
|
66
|
-
logger.debug("{} 忽略请求 {}".format(e, request.url))
|
67
|
-
break
|
68
|
-
else:
|
69
|
-
break
|
70
|
-
queue.task_done()
|
71
|
-
|
72
|
-
async def run(self):
|
73
|
-
"""爬取入口"""
|
74
|
-
queue = asyncio.Queue()
|
75
|
-
semaphore = asyncio.Semaphore(self.max_requests)
|
76
|
-
|
77
|
-
# 工作协程启动...
|
78
|
-
workers = [
|
79
|
-
asyncio.create_task(self.worker(queue, semaphore))
|
80
|
-
for _ in range(self.max_requests)
|
81
|
-
]
|
82
|
-
|
83
|
-
# 将初始请求加入队列
|
84
|
-
for req in self.start_requests():
|
85
|
-
await queue.put(req)
|
86
|
-
|
87
|
-
# 等待队列中的所有任务完成
|
88
|
-
await queue.join()
|
89
|
-
|
90
|
-
# ...停止工作协程
|
91
|
-
for _ in range(self.max_requests):
|
92
|
-
await queue.put(None)
|
93
|
-
|
94
|
-
# 等待所有工作协程完成
|
95
|
-
await asyncio.gather(*workers)
|
96
|
-
|
97
|
-
def go(self):
|
98
|
-
asyncio.run(self.run())
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|