coocan 0.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coocan/__init__.py +2 -0
- coocan/cmd/__init__.py +0 -0
- coocan/cmd/cli.py +72 -0
- coocan/cmd/templates/spider.txt +18 -0
- coocan/gen.py +33 -0
- coocan/spider/__init__.py +1 -0
- coocan/spider/base.py +143 -0
- coocan/url/__init__.py +2 -0
- coocan/url/request.py +31 -0
- coocan/url/response.py +50 -0
- coocan-0.4.9.dist-info/METADATA +93 -0
- coocan-0.4.9.dist-info/RECORD +15 -0
- coocan-0.4.9.dist-info/WHEEL +5 -0
- coocan-0.4.9.dist-info/entry_points.txt +2 -0
- coocan-0.4.9.dist-info/top_level.txt +1 -0
coocan/__init__.py
ADDED
coocan/cmd/__init__.py
ADDED
File without changes
|
coocan/cmd/cli.py
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
import os
|
2
|
+
import re
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
import click
|
6
|
+
|
7
|
+
TEMPLATE_DIR = Path(__file__).parent / "templates"
|
8
|
+
|
9
|
+
help_info = """
|
10
|
+
██████╗ ██████╗ ██████╗ ██████╗ █████╗ ███╗ ██╗
|
11
|
+
██╔════╝██╔═══██╗██╔═══██╗██╔════╝██╔══██╗████╗ ██║
|
12
|
+
██║ ██║ ██║██║ ██║██║ ███████║██╔██╗ ██║
|
13
|
+
██║ ██║ ██║██║ ██║██║ ██╔══██║██║╚██╗██║
|
14
|
+
╚██████╗╚██████╔╝╚██████╔╝╚██████╗██║ ██║██║ ╚████║
|
15
|
+
╚═════╝ ╚═════╝ ╚═════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝
|
16
|
+
"""
|
17
|
+
|
18
|
+
|
19
|
+
def show_help_info():
    """Print the ASCII-art banner stored in ``help_info`` to standard output."""
    print(help_info)
|
21
|
+
|
22
|
+
|
23
|
+
def snake_to_pascal(snake_str: str):
    """Convert a snake_case name to PascalCase.

    The input is split on underscores and every piece goes through
    ``str.capitalize`` (first character upper-cased, the remainder
    lower-cased) before the pieces are concatenated again.
    """
    return ''.join(map(str.capitalize, snake_str.split('_')))
|
28
|
+
|
29
|
+
|
30
|
+
@click.group(invoke_without_command=True)
@click.pass_context
def main(ctx):
    # Root command group. No docstring on purpose: click would surface it as
    # help text, which the original command does not have.
    if ctx.invoked_subcommand is not None:
        # A subcommand was requested; click dispatches to it after this returns.
        return

    # Bare invocation: show the banner followed by a one-line usage hint.
    show_help_info()
    click.echo("cc new -s <spider_file_name>")
|
36
|
+
|
37
|
+
|
38
|
+
@main.command()
@click.option('-s', '--spider', required=True, help='爬虫文件名字')
def new(spider):
    """新建"""
    # The docstring above is the help text click displays for this command,
    # so it is kept verbatim.
    #
    # Generates "<spider>.py" in the current working directory from the
    # bundled "spider.txt" template, replacing the "{SpiderClassName}"
    # placeholder with the PascalCase form of the name (suffixed with "Spider"
    # when it does not already end with it).

    # fullmatch + "+" rejects the empty name (which used to produce a file
    # literally called ".py") and a trailing newline, both of which the
    # previous "^...*$" search let through.
    if not re.fullmatch(r"[a-zA-Z0-9_]+", spider):
        click.echo("只支持字母、数字、下划线")
        return

    pascal = snake_to_pascal(spider)
    if not pascal.endswith("Spider"):
        pascal += "Spider"

    try:
        template_path = TEMPLATE_DIR / "spider.txt"
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(template_path, 'r', encoding='utf-8') as f:
            text = f.read()
        spider_py_text = text.replace("{SpiderClassName}", pascal)

        py_file = "{}.py".format(spider)
        # Never overwrite an existing file.
        if os.path.exists(py_file):
            click.echo("File {} already exists".format(py_file))
            return

        with open(py_file, 'w', encoding='utf-8') as f:
            f.write(spider_py_text)

        click.echo("Success")

    except Exception as e:
        # Show the underlying reason, then let click exit with a non-zero
        # status; chaining keeps the original traceback available.
        click.echo(str(e))
        raise click.ClickException("Failed") from e
|
69
|
+
|
70
|
+
|
71
|
+
if __name__ == '__main__':
|
72
|
+
main()
|
@@ -0,0 +1,18 @@
|
|
1
|
+
from coocan import Request, Response, MiniSpider
|
2
|
+
|
3
|
+
|
4
|
+
class {SpiderClassName}(MiniSpider):
|
5
|
+
start_urls = ['https://github.com/markadc/coocan']
|
6
|
+
max_requests = 10
|
7
|
+
|
8
|
+
def middleware(self, request: Request):
|
9
|
+
request.headers["Referer"] = "https://github.com"
|
10
|
+
|
11
|
+
def parse(self, response: Response):
|
12
|
+
print(response.status_code)
|
13
|
+
print(response.get_one("//title/text()"))
|
14
|
+
|
15
|
+
|
16
|
+
if __name__ == '__main__':
|
17
|
+
s = {SpiderClassName}()
|
18
|
+
s.go()
|
coocan/gen.py
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
import random
|
2
|
+
|
3
|
+
|
4
|
+
def gen_random_os() -> str:
    """Return one randomly chosen platform token for a User-Agent string."""
    candidates = (
        "Windows NT 10.0; Win64; x64",
        "Windows NT 6.1; WOW64",
        "Macintosh; Intel Mac OS X 10_15_6",
        "X11; Linux x86_64",
        "Windows NT 6.3; Trident/7.0",
    )
    return random.choice(candidates)
|
14
|
+
|
15
|
+
|
16
|
+
def gen_random_browser() -> str:
    """Return a random ``"<Browser>/<major>.0"`` product token.

    A major version is drawn for every browser first (inclusive ranges
    below), and only then is one (browser, version) pair picked.
    """
    version_ranges = (
        ("Chrome", 70, 100),
        ("Firefox", 70, 100),
        ("Edge", 80, 100),
        ("Safari", 10, 14),
        ("Opera", 60, 80),
    )
    # Draw in table order so the sequence of random calls is unchanged.
    candidates = [
        (name, random.randint(low, high))
        for name, low, high in version_ranges
    ]
    name, major = random.choice(candidates)
    return "{}/{}.0".format(name, major)
|
27
|
+
|
28
|
+
|
29
|
+
def gen_random_ua() -> str:
    """Build a random User-Agent string.

    Combines the platform token from ``gen_random_os`` with the product token
    from ``gen_random_browser`` inside a fixed Mozilla/AppleWebKit/Safari
    frame.
    """
    platform_token = gen_random_os()
    browser_token = gen_random_browser()
    return f"Mozilla/5.0 ({platform_token}) AppleWebKit/537.36 (KHTML, like Gecko) {browser_token} Safari/537.36"
|
@@ -0,0 +1 @@
|
|
1
|
+
from coocan.spider.base import MiniSpider, IgnoreRequest
|
coocan/spider/base.py
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Iterator
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
from coocan.gen import gen_random_ua
|
7
|
+
from coocan.url import Request, Response
|
8
|
+
|
9
|
+
|
10
|
+
class IgnoreRequest(Exception):
    """Signal that the current request should be dropped and not retried."""
|
13
|
+
|
14
|
+
|
15
|
+
class IgnoreResponse(Exception):
    """Signal that the current response should be dropped without running its callback."""
|
18
|
+
|
19
|
+
|
20
|
+
class MiniSpider:
    """Minimal asynchronous spider base class.

    Subclasses normally set ``start_urls`` and override ``parse``. ``go()``
    runs ``run()`` in a new event loop: ``max_requests`` worker coroutines
    pull ``Request`` objects from a shared priority queue, send them, and put
    every ``Request`` yielded by a callback back on the queue.
    """

    # URLs turned into the initial requests by ``start_requests``.
    start_urls = []
    # Number of worker coroutines; also the size of the concurrency semaphore.
    max_requests = 5
    # Extra attempts after the first one (max_retry_times + 1 attempts in total).
    max_retry_times = 3
    # When True, ``middleware`` fills in a random User-Agent if none is set.
    enable_random_ua = True
    # Header fields merged into every request by ``middleware``.
    headers_extra_field = {}
    # Seconds slept before each attempt is sent.
    delay = 0

    def start_requests(self):
        """Yield one ``Request`` per entry of ``start_urls``, with ``parse`` as callback.

        Raises:
            AssertionError: if ``start_urls`` is empty.
        """
        assert self.start_urls, "没有起始 URL 列表"
        for url in self.start_urls:
            yield Request(url, self.parse)

    def middleware(self, request: Request):
        """Adjust ``request`` in place before every attempt (retries included)."""
        # Random User-Agent, only when the request does not carry one already.
        if self.enable_random_ua is True:
            request.headers.setdefault("User-Agent", gen_random_ua())

        # Merge the extra header fields; these overwrite existing keys.
        if self.headers_extra_field:
            request.headers.update(self.headers_extra_field)

    def validator(self, response: Response):
        """Hook for checking a response before its callback runs.

        Raising ``IgnoreResponse`` skips the callback. Any other exception is
        only logged by ``worker``; the callback still runs afterwards.
        """
        pass

    def parse(self, response: Response):
        """Default callback; subclasses must override it."""
        raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))

    def handle_request_excetpion(self, e: Exception, request: Request):
        """Hook called when sending ``request`` (or ``middleware``) raised ``e``.

        ``worker`` interprets the outcome as follows: a returned ``Request``
        is enqueued and the current request is abandoned; raising
        ``IgnoreRequest`` abandons it without a replacement; anything else
        lets the retry loop continue. The default only logs.
        """
        logger.error("{} {}".format(type(e).__name__, request.url))

    def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
        """Hook called when the callback (or draining its generator) raised ``e``."""
        logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))

    async def worker(self, queue: asyncio.PriorityQueue, semaphore: asyncio.Semaphore):
        """Worker coroutine: take requests from ``queue`` and process them until the sentinel arrives."""
        while True:
            req: Request = await queue.get()

            # A request with an empty URL is the shutdown sentinel put by ``run``.
            if req.url == "":
                break

            try:
                # Limit the number of requests in flight.
                async with semaphore:
                    for i in range(self.max_retry_times + 1):
                        # i > 0 means this attempt is a retry.
                        if i > 0:
                            logger.debug("正在重试第{}次... {}".format(i, req.url))

                        # Send the request.
                        try:
                            self.middleware(req)
                            await asyncio.sleep(self.delay)
                            resp = await req.send()

                        # Request failed.
                        except Exception as e:
                            try:
                                result = self.handle_request_excetpion(e, req)
                                if isinstance(result, Request):
                                    # The handler supplied a replacement request.
                                    await queue.put(result)
                                    break
                            except IgnoreRequest as e:
                                logger.debug("{} 忽略请求 {}".format(e, req.url))
                                break
                            except Exception as e:
                                logger.error("`处理异常函数`异常了 | {} | {}".format(e, req.url))

                        # Request succeeded.
                        else:
                            # Validate the response.
                            try:
                                self.validator(resp)
                            except IgnoreResponse as e:
                                logger.debug("{} 忽略响应 {}".format(e, req.url))
                                break
                            except Exception as e:
                                logger.error("`校验器`函数异常了 | {} | {}".format(e, req.url))

                            # Run the callback and enqueue follow-up requests.
                            try:
                                cached = req.callback(Response(resp), **req.cb_kwargs)
                                if isinstance(cached, Iterator):
                                    for next_request in cached:
                                        if isinstance(next_request, Request):
                                            await queue.put(next_request)
                                        else:
                                            # Anything else would crash a worker
                                            # later on ``req.url``.
                                            logger.warning("`回调`产出了非 Request 对象,已忽略 | {} | {}".format(type(next_request).__name__, req.url))
                            except Exception as e:
                                try:
                                    self.handle_callback_excetpion(e, req, resp)
                                except Exception as handler_error:
                                    # A failing handler must not kill the worker.
                                    logger.error("`处理回调异常函数`异常了 | {} | {}".format(handler_error, req.url))

                            # A response was obtained, so never retry. A plain
                            # ``break`` replaces ``finally: break``, which also
                            # swallowed cancellation and KeyboardInterrupt.
                            break
            finally:
                # Always balance the ``get()`` above, otherwise ``queue.join()``
                # in ``run`` could wait forever after an unexpected error.
                queue.task_done()

    async def run(self):
        """Crawl entry point: start the workers, feed the initial requests, wait, then shut down."""
        queue = asyncio.PriorityQueue()
        semaphore = asyncio.Semaphore(self.max_requests)

        # Start the worker coroutines.
        workers = [
            asyncio.create_task(self.worker(queue, semaphore))
            for _ in range(self.max_requests)
        ]

        # Enqueue the initial requests.
        for req in self.start_requests():
            await queue.put(req)

        # Wait until every queued request has been marked done.
        await queue.join()

        # Stop the workers: one empty-URL sentinel per worker.
        for _ in range(self.max_requests):
            await queue.put(Request(url=""))

        # Wait for all workers to exit.
        await asyncio.gather(*workers)

    def go(self):
        """Run the spider to completion in a new asyncio event loop."""
        asyncio.run(self.run())
|
coocan/url/__init__.py
ADDED
coocan/url/request.py
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
import time
|
2
|
+
from typing import Callable
|
3
|
+
|
4
|
+
import httpx
|
5
|
+
|
6
|
+
cli = httpx.AsyncClient()
|
7
|
+
|
8
|
+
|
9
|
+
class Request:
    """Description of one HTTP request plus the callback that receives its response.

    Instances are ordered by ``priority`` (smaller first) so they can be
    stored in a priority queue.
    """

    def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, priority=None):
        """Store the request parameters.

        ``headers`` and ``cb_kwargs`` default to fresh empty dicts. When
        ``priority`` is not given, the creation timestamp is used, so requests
        are ordered by creation time.
        """
        self.url = url
        self.callback = callback
        self.params = params
        self.headers = headers or {}
        self.data = data
        self.json = json
        self.timeout = timeout
        self.cb_kwargs = cb_kwargs or {}
        # ``is None`` rather than ``or`` so an explicit priority of 0 is kept.
        self.priority = time.time() if priority is None else priority

    async def send(self):
        """Send the request through the module-level client and return its response.

        A GET is sent when neither ``data`` nor ``json`` was supplied; a POST
        when at least one of them is non-empty.

        Raises:
            Exception: if a body was supplied but is empty (e.g. ``data={}``).
        """
        # The previous test, ``(self.data and self.json) is None``, was true
        # whenever either value was None, so a POST carrying only ``data`` or
        # only ``json`` was silently sent as a GET.
        if self.data is None and self.json is None:
            response = await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
        elif self.data or self.json:
            response = await cli.post(self.url, params=self.params, headers=self.headers, data=self.data, json=self.json, timeout=self.timeout)
        else:
            raise Exception("仅支持 GET 和 POST 请求")
        return response

    def __lt__(self, other):
        """Order requests by ascending ``priority``."""
        return self.priority < other.priority
|
coocan/url/response.py
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
from typing import Callable
|
2
|
+
|
3
|
+
from cocoman.spider.errors import ResponseCodeError, ResponseTextError
|
4
|
+
from httpx import Response
|
5
|
+
from parsel import Selector
|
6
|
+
|
7
|
+
|
8
|
+
class SelectorResponse(Response):
    """An httpx ``Response`` that can also be queried with XPath and CSS selectors."""

    def __init__(self, response: Response):
        """Wrap ``response``: copy its state and build a selector from its text."""
        super().__init__(response.status_code)
        # Take over the wrapped response's attributes wholesale.
        self.__dict__.update(response.__dict__)
        self.selector = Selector(text=response.text)

    def __str__(self):
        return "<Response {}>".format(self.status_code)

    def xpath(self, query: str):
        """Return the selector result of the XPath ``query``."""
        return self.selector.xpath(query)

    def css(self, query: str):
        """Return the selector result of the CSS ``query``."""
        return self.selector.css(query)

    def get_one(self, query: str, default=None, strip=True):
        """Return the first XPath match, or ``default`` when nothing matches.

        String results are stripped of surrounding whitespace unless
        ``strip`` is false.
        """
        v = self.selector.xpath(query).get(default=default)
        return v.strip() if strip and isinstance(v, str) else v

    def get_all(self, query: str, strip=True):
        """Return every XPath match as a list, stripped unless ``strip`` is false."""
        return [v.strip() if strip else v for v in self.selector.xpath(query).getall()]

    def raise_for_status(self, codes: list = None):
        """Raise ``ResponseCodeError`` unless the status code is in ``codes`` (default ``[200]``)."""
        codes = codes or [200]
        if self.status_code not in codes:
            raise ResponseCodeError("{} not in {}".format(self.status_code, codes))

    def raise_for_text(self, validate: Callable[[str], bool]):
        """Raise ``ResponseTextError`` when ``validate(self.text)`` returns exactly ``False``."""
        if validate(self.text) is False:
            raise ResponseTextError("not ideal text")

    def raise_has_text(self, text: str):
        """Raise ``ResponseTextError`` when the body contains ``text``."""
        # Explicit raise instead of ``assert cond, ResponseTextError(...)``:
        # the assert raised AssertionError (with the error object merely as its
        # message) and vanished entirely under ``python -O``.
        if text in self.text:
            raise ResponseTextError("has text: {}".format(text))

    def raise_no_text(self, text: str):
        """Raise ``ResponseTextError`` when the body does not contain ``text``."""
        # Same reasoning as in ``raise_has_text``: raise the intended error type.
        if text not in self.text:
            raise ResponseTextError("no text: {}".format(text))
|
@@ -0,0 +1,93 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: coocan
|
3
|
+
Version: 0.4.9
|
4
|
+
Summary: Air Spider Framework
|
5
|
+
Author: wauo
|
6
|
+
Author-email: markadc@126.com
|
7
|
+
Requires-Python: >=3.10
|
8
|
+
Description-Content-Type: text/markdown
|
9
|
+
Requires-Dist: click>=8.0.0
|
10
|
+
Requires-Dist: httpx
|
11
|
+
Requires-Dist: loguru
|
12
|
+
|
13
|
+
# 项目说明
|
14
|
+
|
15
|
+
- 一个非常轻量的异步爬虫框架
|
16
|
+
|
17
|
+
# 项目地址
|
18
|
+
|
19
|
+
- https://github.com/markadc/coocan
|
20
|
+
|
21
|
+
## demo
|
22
|
+
|
23
|
+
- 效果
|
24
|
+
<br>
|
25
|
+

|
26
|
+
|
27
|
+
|
28
|
+
- 代码
|
29
|
+
|
30
|
+
```python
|
31
|
+
import json
|
32
|
+
|
33
|
+
from loguru import logger
|
34
|
+
|
35
|
+
import coocan
|
36
|
+
from coocan import Request, MiniSpider
|
37
|
+
|
38
|
+
|
39
|
+
class CSDNDetailSpider(MiniSpider):
|
40
|
+
start_urls = ['http://www.csdn.net']
|
41
|
+
max_requests = 10
|
42
|
+
|
43
|
+
def middleware(self, request: Request):
|
44
|
+
request.headers["Referer"] = "http://www.csdn.net/"
|
45
|
+
|
46
|
+
def parse(self, response):
|
47
|
+
api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
|
48
|
+
params = {
|
49
|
+
"page": "1",
|
50
|
+
"size": "20",
|
51
|
+
"businessType": "lately",
|
52
|
+
"noMore": "false",
|
53
|
+
"username": "markadc"
|
54
|
+
}
|
55
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
56
|
+
|
57
|
+
def parse_page(self, response, api, params):
|
58
|
+
current_page = params["page"]
|
59
|
+
data = json.loads(response.text)
|
60
|
+
some = data["data"]["list"]
|
61
|
+
|
62
|
+
if not some:
|
63
|
+
logger.warning("没有第 {} 页".format(current_page))
|
64
|
+
return
|
65
|
+
|
66
|
+
for one in some:
|
67
|
+
date = one["formatTime"]
|
68
|
+
name = one["title"]
|
69
|
+
detail_url = one["url"]
|
70
|
+
logger.info(
|
71
|
+
"""
|
72
|
+
{}
|
73
|
+
{}
|
74
|
+
{}
|
75
|
+
""".format(date, name, detail_url)
|
76
|
+
)
|
77
|
+
yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
|
78
|
+
|
79
|
+
logger.info("第 {} 页抓取成功".format(params["page"]))
|
80
|
+
|
81
|
+
# 抓取下一页
|
82
|
+
next_page = int(current_page) + 1
|
83
|
+
params["page"] = str(next_page)
|
84
|
+
yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
|
85
|
+
|
86
|
+
def parse_detail(self, response, title):
|
87
|
+
logger.success("{} 已访问 {}".format(response.status_code, title))
|
88
|
+
|
89
|
+
|
90
|
+
if __name__ == '__main__':
|
91
|
+
s = CSDNDetailSpider()
|
92
|
+
s.go()
|
93
|
+
```
|
@@ -0,0 +1,15 @@
|
|
1
|
+
coocan/__init__.py,sha256=UqFmE7ucuR_xR3OyyBU8pxqLfCJ5AdH_HsDdTsYPf6g,55
|
2
|
+
coocan/gen.py,sha256=J6QWXkBVbiCQqey8i0BDqleRNpBswI8AyvrYmkDVQPw,1028
|
3
|
+
coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
coocan/cmd/cli.py,sha256=d_sG63wz8RHHa5u7HabOz36yai_D8S3bSr7VprVpTck,2420
|
5
|
+
coocan/cmd/templates/spider.txt,sha256=Htd7nOs1EeKbc8LNRUX7xyHkLWz3S0kaTPRW0M3NuUw,480
|
6
|
+
coocan/spider/__init__.py,sha256=kMDCGeqtN50raCzwfCn18s_W8xV6KO_Ny9Xol4I48Ag,58
|
7
|
+
coocan/spider/base.py,sha256=WMTnMQd7Dnv2aC7rnmkAo_WJu33p9g3GN07A2DnbdLI,5104
|
8
|
+
coocan/url/__init__.py,sha256=rEMx66XDy5AIJ9mF_2UVzHW5mRLBAWZEyQ3txrZzuZA,102
|
9
|
+
coocan/url/request.py,sha256=seZaQXQRvRMIf9WnCp3mAgNA-kxsj9P2JzAvuIt2Dx8,1116
|
10
|
+
coocan/url/response.py,sha256=AnC0xsF34q68r62EVlcHYmDH6skm9RBwRHITTb4iBbU,1785
|
11
|
+
coocan-0.4.9.dist-info/METADATA,sha256=WZhIMdcypGrPcE1_bAMxMUa3puvkngstEU4PSTR5UXo,2374
|
12
|
+
coocan-0.4.9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
13
|
+
coocan-0.4.9.dist-info/entry_points.txt,sha256=tOLQN_TVhl_9f2YBASTGBE_ClmG-iQ4rKmyhE2WAOY0,43
|
14
|
+
coocan-0.4.9.dist-info/top_level.txt,sha256=VwB-Q4zEljgb9v1Ms1E59B-1pBYORXuhKjgZb-LHOhk,7
|
15
|
+
coocan-0.4.9.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
coocan
|