coocan 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coocan/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from coocan.spider import *
2
+ from coocan.url import *
coocan/cmd/__init__.py ADDED
File without changes
coocan/cmd/cli.py ADDED
@@ -0,0 +1,72 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+
5
+ import click
6
+
7
+ TEMPLATE_DIR = Path(__file__).parent / "templates"
8
+
9
+ help_info = """
10
+ ██████╗ ██████╗ ██████╗ ██████╗ █████╗ ███╗ ██╗
11
+ ██╔════╝██╔═══██╗██╔═══██╗██╔════╝██╔══██╗████╗ ██║
12
+ ██║ ██║ ██║██║ ██║██║ ███████║██╔██╗ ██║
13
+ ██║ ██║ ██║██║ ██║██║ ██╔══██║██║╚██╗██║
14
+ ╚██████╗╚██████╔╝╚██████╔╝╚██████╗██║ ██║██║ ╚████║
15
+ ╚═════╝ ╚═════╝ ╚═════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝
16
+ """
17
+
18
+
19
def show_help_info():
    """Print the ASCII-art banner stored in the module-level ``help_info``."""
    print(help_info)
21
+
22
+
23
def snake_to_pascal(snake_str: str):
    """Convert a snake_case name to PascalCase.

    The name is split on underscores and the first character of every piece
    is upper-cased. The remaining characters are left untouched, so capitals
    that are already present survive (``"my_HTTP_spider"`` becomes
    ``"MyHTTPSpider"``); ``str.capitalize`` would lower-case them.

    Args:
        snake_str: Name to convert. May be empty or contain repeated,
            leading or trailing underscores.

    Returns:
        The PascalCase name; ``""`` when the input has no characters other
        than underscores.
    """
    # word[:1] (not word[0]) keeps empty pieces from "__" or "" harmless.
    return ''.join(word[:1].upper() + word[1:] for word in snake_str.split('_'))
28
+
29
+
30
# Root command group of the CLI. No docstring is used here on purpose: click
# would display it as the group's help text.
@click.group(invoke_without_command=True)
@click.pass_context
def main(ctx):
    # Invoked without a subcommand: show the banner and a one-line usage hint.
    if ctx.invoked_subcommand is None:
        show_help_info()
        click.echo("cc new -s <spider_file_name>")
36
+
37
+
38
@main.command()
@click.option('-s', '--spider', required=True, help='爬虫文件名字')
def new(spider):
    """新建"""
    # NOTE: click prints the docstring above as this command's help text, so
    # it is runtime output and is intentionally left unchanged.
    #
    # Create ``<spider>.py`` in the current directory from the bundled
    # ``spider.txt`` template, substituting the PascalCase class name.

    # fullmatch with "+" also rejects the empty name and a trailing newline,
    # both of which "^[a-zA-Z0-9_]*$" with re.search let through.
    if not re.fullmatch(r"[a-zA-Z0-9_]+", spider):
        click.echo("只支持字母、数字、下划线")
        return

    # The name is turned into a class name; a leading digit would make the
    # generated file a syntax error.
    if spider[0].isdigit():
        click.echo("Spider name must not start with a digit")
        return

    pascal = snake_to_pascal(spider)
    if not pascal.endswith("Spider"):
        pascal += "Spider"

    py_file = "{}.py".format(spider)

    try:
        template_path = TEMPLATE_DIR / "spider.txt"
        with open(template_path, 'r', encoding="utf-8") as f:
            text = f.read()
        spider_py_text = text.replace("{SpiderClassName}", pascal)

        try:
            # Mode 'x' fails if the file exists, so the existence check and
            # the creation are a single step and nothing is ever overwritten.
            with open(py_file, 'x', encoding="utf-8") as f:
                f.write(spider_py_text)
        except FileExistsError:
            click.echo("File {} already exists".format(py_file))
            return

        click.echo("Success")

    except Exception as e:
        click.echo(str(e))
        # Chain the cause so the original traceback is not lost.
        raise click.ClickException("Failed") from e
69
+
70
+
71
+ if __name__ == '__main__':
72
+ main()
@@ -0,0 +1,18 @@
1
+ from coocan import Request, Response, MiniSpider
2
+
3
+
4
class {SpiderClassName}(MiniSpider):
    # URLs that MiniSpider.start_requests turns into the initial requests.
    start_urls = ['https://github.com/markadc/coocan']
    # Number of worker coroutines / size of the concurrency semaphore.
    max_requests = 10

    def middleware(self, request: Request):
        # Runs before every send. This override replaces MiniSpider.middleware,
        # so the base random User-Agent and extra-header handling only runs if
        # super().middleware(request) is called here.
        request.headers["Referer"] = "https://github.com"

    def parse(self, response: Response):
        # Callback for the requests created from start_urls.
        print(response.status_code)
        print(response.get_one("//title/text()"))
14
+
15
+
16
+ if __name__ == '__main__':
17
+ s = {SpiderClassName}()
18
+ s.go()
coocan/gen.py ADDED
@@ -0,0 +1,33 @@
1
+ import random
2
+
3
+
4
def gen_random_os() -> str:
    """Return one randomly chosen platform token for a User-Agent string."""
    platforms = (
        "Windows NT 10.0; Win64; x64",
        "Windows NT 6.1; WOW64",
        "Macintosh; Intel Mac OS X 10_15_6",
        "X11; Linux x86_64",
        "Windows NT 6.3; Trident/7.0",
    )
    return random.choice(platforms)
14
+
15
+
16
def gen_random_browser() -> str:
    """Return a random ``<browser>/<major>.0`` product token."""
    # (name, lowest major version, highest major version), bounds inclusive.
    version_ranges = (
        ("Chrome", 70, 100),
        ("Firefox", 70, 100),
        ("Edge", 80, 100),
        ("Safari", 10, 14),
        ("Opera", 60, 80),
    )
    # A version is drawn for every browser, in this order, before one pair is
    # picked.
    candidates = [
        (name, random.randint(low, high))
        for name, low, high in version_ranges
    ]
    name, major = random.choice(candidates)
    return f"{name}/{major}.0"
27
+
28
+
29
def gen_random_ua() -> str:
    """Build a random User-Agent string from a platform and a browser token."""
    # The platform is drawn before the browser; the order fixes how the
    # random generator is consumed.
    platform = gen_random_os()
    product = gen_random_browser()
    return f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) {product} Safari/537.36"
@@ -0,0 +1 @@
1
+ from coocan.spider.base import MiniSpider, IgnoreRequest
coocan/spider/base.py ADDED
@@ -0,0 +1,143 @@
1
+ import asyncio
2
+ from collections.abc import Iterator
3
+
4
+ from loguru import logger
5
+
6
+ from coocan.gen import gen_random_ua
7
+ from coocan.url import Request, Response
8
+
9
+
10
class IgnoreRequest(Exception):
    """Signal that a request should be dropped and not retried."""
13
+
14
+
15
class IgnoreResponse(Exception):
    """Signal that a response should be dropped without running the callback."""
18
+
19
+
20
class MiniSpider:
    """Minimal asyncio-based spider.

    Subclasses normally set ``start_urls`` and override ``parse``; ``go()``
    runs the crawl until the request queue has been drained.
    """

    # URLs turned into the initial requests by start_requests().
    # NOTE: mutable class attribute shared by all subclasses that do not
    # reassign it; assign a new list instead of appending to this one.
    start_urls = []
    # Number of worker coroutines and size of the concurrency semaphore.
    max_requests = 5
    # Extra attempts made after the first failed send.
    max_retry_times = 3
    # When True, middleware() sets a random User-Agent unless one is present.
    enable_random_ua = True
    # Headers merged into every request by middleware().
    headers_extra_field = {}
    # Seconds slept before each send.
    delay = 0

    def start_requests(self):
        """Yield the initial requests, one per entry of ``start_urls``.

        Raises:
            AssertionError: if ``start_urls`` is empty.
        """
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        if not self.start_urls:
            raise AssertionError("没有起始 URL 列表")
        for url in self.start_urls:
            yield Request(url, self.parse)

    def middleware(self, request: Request):
        """Prepare ``request`` in place; called before every send attempt."""
        # Random User-Agent, only if the request does not already carry one.
        if self.enable_random_ua is True:
            request.headers.setdefault("User-Agent", gen_random_ua())

        # Merge the spider-wide extra headers (these override existing keys).
        if self.headers_extra_field:
            request.headers.update(self.headers_extra_field)

    def validator(self, response: Response):
        """Hook to validate a response; raise ``IgnoreResponse`` to drop it."""
        pass

    def parse(self, response: Response):
        """Default callback; subclasses must override it."""
        raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))

    def handle_request_excetpion(self, e: Exception, request: Request):
        """Hook called when sending ``request`` raised ``e``.

        Return a ``Request`` to queue it in place of the failed one, raise
        ``IgnoreRequest`` to give up, or return ``None`` to let the worker
        retry.
        """
        logger.error("{} {}".format(type(e).__name__, request.url))

    def handle_callback_excetpion(self, e: Exception, request: Request, response: Response):
        """Hook called when the callback of ``request`` raised ``e``."""
        logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))

    async def _process_request(self, req: Request, queue: asyncio.PriorityQueue):
        """Send one request (with retries) and feed its callback output to ``queue``."""
        for attempt in range(self.max_retry_times + 1):
            if attempt > 0:
                logger.debug("正在重试第{}次... {}".format(attempt, req.url))

            try:
                self.middleware(req)
                await asyncio.sleep(self.delay)
                resp = await req.send()
            except Exception as e:
                try:
                    result = self.handle_request_excetpion(e, req)
                    if isinstance(result, Request):
                        # The hook supplied a replacement request.
                        await queue.put(result)
                        return
                except IgnoreRequest as ignored:
                    logger.debug("{} 忽略请求 {}".format(ignored, req.url))
                    return
                except Exception as hook_error:
                    logger.error("`处理异常函数`异常了 | {} | {}".format(hook_error, req.url))
                # Nothing decided by the hook: try again.
                continue

            # NOTE(review): the validator receives the raw response returned
            # by req.send(), while the callback receives Response(resp);
            # confirm this difference is intended.
            try:
                self.validator(resp)
            except IgnoreResponse as ignored:
                logger.debug("{} 忽略响应 {}".format(ignored, req.url))
                return
            except Exception as validator_error:
                # A failing validator is logged; the callback still runs.
                logger.error("`校验器`函数异常了 | {} | {}".format(validator_error, req.url))

            try:
                cached = req.callback(Response(resp), **req.cb_kwargs)
                if isinstance(cached, Iterator):
                    for next_request in cached:
                        await queue.put(next_request)  # schedule follow-up requests
            except Exception as e:
                # The original used `finally: break`, which silently swallowed
                # every exception, including cancellation and errors raised by
                # the hook itself. Hook failures are now logged instead, and
                # non-Exception signals propagate.
                try:
                    self.handle_callback_excetpion(e, req, resp)
                except Exception as hook_error:
                    logger.error("`handle_callback_excetpion` raised | {} | {}".format(hook_error, req.url))

            # A response was received: never retry past this point.
            return

    async def worker(self, queue: asyncio.PriorityQueue, semaphore: asyncio.Semaphore):
        """Worker coroutine: take requests from ``queue`` until a sentinel arrives."""
        while True:
            req: Request = await queue.get()

            # A request with an empty URL is the shutdown sentinel from run().
            if req.url == "":
                break

            try:
                # Limit the number of requests processed at the same time.
                async with semaphore:
                    await self._process_request(req, queue)
            finally:
                # Always balance queue.get(); otherwise queue.join() in run()
                # would wait forever after an unexpected error.
                queue.task_done()

    async def run(self):
        """Crawl entry point: start workers, seed the queue, wait, shut down."""
        queue = asyncio.PriorityQueue()
        semaphore = asyncio.Semaphore(self.max_requests)

        # Start the worker coroutines.
        workers = [
            asyncio.create_task(self.worker(queue, semaphore))
            for _ in range(self.max_requests)
        ]

        # Seed the queue with the initial requests.
        for req in self.start_requests():
            await queue.put(req)

        # Wait until every queued request has been processed.
        await queue.join()

        # One sentinel per worker tells it to stop.
        for _ in range(self.max_requests):
            await queue.put(Request(url=""))

        # Wait for all workers to exit.
        await asyncio.gather(*workers)

    def go(self):
        """Run the crawl to completion in a new event loop."""
        asyncio.run(self.run())
coocan/url/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from coocan.url.request import Request
2
+ from coocan.url.response import SelectorResponse as Response
coocan/url/request.py ADDED
@@ -0,0 +1,31 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ import httpx
5
+
6
+ cli = httpx.AsyncClient()
7
+
8
+
9
class Request:
    """An HTTP request together with the callback that handles its response.

    Instances are ordered by ``priority`` (smaller first) so they can be
    stored in a priority queue.

    Args:
        url: Target URL.
        callback: Callable invoked with the response and ``cb_kwargs``.
        params: Query parameters.
        headers: Request headers; defaults to a new empty dict.
        data: Form/body payload. Passing it makes the request a POST.
        json: JSON payload. Passing it makes the request a POST.
        timeout: Timeout forwarded to the HTTP client.
        cb_kwargs: Extra keyword arguments for ``callback``.
        priority: Sort key; defaults to the creation time, which gives
            first-in-first-out ordering.
    """

    def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, priority=None):
        self.url = url
        self.callback = callback
        self.params = params
        self.headers = headers or {}
        self.data = data
        self.json = json
        self.timeout = timeout
        self.cb_kwargs = cb_kwargs or {}
        # `priority or time.time()` would throw away an explicit priority of 0.
        self.priority = time.time() if priority is None else priority

    @property
    def method(self) -> str:
        """``"POST"`` when a body (``data`` or ``json``) was given, else ``"GET"``."""
        # The original test `(self.data and self.json) is None` was true as
        # soon as EITHER value was None, so a request with only `data` or only
        # `json` was sent as a GET without its body.
        if self.data is None and self.json is None:
            return "GET"
        return "POST"

    async def send(self):
        """Send the request with the module-level client and return its response."""
        # NOTE(review): `timeout=None` is forwarded as-is to the client;
        # confirm that requests without an explicit timeout are meant to use
        # whatever the client does for None.
        if self.method == "GET":
            return await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
        return await cli.post(self.url, params=self.params, headers=self.headers, data=self.data, json=self.json, timeout=self.timeout)

    def __lt__(self, other):
        return self.priority < other.priority
coocan/url/response.py ADDED
@@ -0,0 +1,50 @@
1
+ from typing import Callable
2
+
3
+ from cocoman.spider.errors import ResponseCodeError, ResponseTextError
4
+ from httpx import Response
5
+ from parsel import Selector
6
+
7
+
8
class SelectorResponse(Response):
    """Response wrapper that adds XPath / CSS querying and validation helpers."""

    def __init__(self, response: Response):
        """Wrap an already-received ``response``."""
        super().__init__(response.status_code)
        # Take over the state of the wrapped response so every attribute it
        # carries is available on this object as well.
        self.__dict__.update(response.__dict__)
        self.selector = Selector(text=response.text)

    def __str__(self):
        return "<Response {}>".format(self.status_code)

    def xpath(self, query: str):
        """Run an XPath query and return the resulting selector list."""
        sel = self.selector.xpath(query)
        return sel

    def css(self, query: str):
        """Run a CSS query and return the resulting selector list."""
        sel = self.selector.css(query)
        return sel

    def get_one(self, query: str, default=None, strip=True):
        """Return the first XPath match, or ``default`` when nothing matches.

        String results are stripped of surrounding whitespace when ``strip``
        is true; non-string values (such as a ``None`` default) are returned
        unchanged.
        """
        v = self.selector.xpath(query).get(default=default)
        return v.strip() if strip and isinstance(v, str) else v

    def get_all(self, query: str, strip=True):
        """Return all XPath matches as a list, stripped when ``strip`` is true."""
        vs = [v.strip() if strip else v for v in self.selector.xpath(query).getall()]
        return vs

    def raise_for_status(self, codes: list = None):
        """Raise ``ResponseCodeError`` unless the status code is in ``codes``.

        ``codes`` defaults to ``[200]``.
        """
        codes = codes or [200]
        if self.status_code not in codes:
            raise ResponseCodeError("{} not in {}".format(self.status_code, codes))

    def raise_for_text(self, validate: Callable[[str], bool]):
        """Raise ``ResponseTextError`` when ``validate(text)`` returns ``False``."""
        if validate(self.text) is False:
            raise ResponseTextError("not ideal text")

    def raise_has_text(self, text: str):
        """Raise ``ResponseTextError`` when the body contains ``text``."""
        # Previously `assert ..., ResponseTextError(...)`: that raised an
        # AssertionError instead of ResponseTextError and did nothing under -O.
        if text in self.text:
            raise ResponseTextError("has text: {}".format(text))

    def raise_no_text(self, text: str):
        """Raise ``ResponseTextError`` when the body does not contain ``text``."""
        if text not in self.text:
            raise ResponseTextError("no text: {}".format(text))
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.1
2
+ Name: coocan
3
+ Version: 0.4.9
4
+ Summary: Air Spider Framework
5
+ Author: wauo
6
+ Author-email: markadc@126.com
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: click>=8.0.0
10
+ Requires-Dist: httpx
11
+ Requires-Dist: loguru
12
+
13
+ # 项目说明
14
+
15
+ - 一个非常轻量的异步爬虫框架
16
+
17
+ # 项目地址
18
+
19
+ - https://github.com/markadc/coocan
20
+
21
+ ## demo
22
+
23
+ - 效果
24
+ <br>
25
+ ![效果](coocan/demo.gif)
26
+
27
+
28
+ - 代码
29
+
30
+ ```python
31
+ import json
32
+
33
+ from loguru import logger
34
+
35
+ import coocan
36
+ from coocan import Request, MiniSpider
37
+
38
+
39
+ class CSDNDetailSpider(MiniSpider):
40
+ start_urls = ['http://www.csdn.net']
41
+ max_requests = 10
42
+
43
+ def middleware(self, request: Request):
44
+ request.headers["Referer"] = "http://www.csdn.net/"
45
+
46
+ def parse(self, response):
47
+ api = "https://blog.csdn.net/community/home-api/v1/get-business-list"
48
+ params = {
49
+ "page": "1",
50
+ "size": "20",
51
+ "businessType": "lately",
52
+ "noMore": "false",
53
+ "username": "markadc"
54
+ }
55
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
56
+
57
+ def parse_page(self, response, api, params):
58
+ current_page = params["page"]
59
+ data = json.loads(response.text)
60
+ some = data["data"]["list"]
61
+
62
+ if not some:
63
+ logger.warning("没有第 {} 页".format(current_page))
64
+ return
65
+
66
+ for one in some:
67
+ date = one["formatTime"]
68
+ name = one["title"]
69
+ detail_url = one["url"]
70
+ logger.info(
71
+ """
72
+ {}
73
+ {}
74
+ {}
75
+ """.format(date, name, detail_url)
76
+ )
77
+ yield coocan.Request(detail_url, self.parse_detail, cb_kwargs={"title": name})
78
+
79
+ logger.info("第 {} 页抓取成功".format(params["page"]))
80
+
81
+ # 抓取下一页
82
+ next_page = int(current_page) + 1
83
+ params["page"] = str(next_page)
84
+ yield Request(api, self.parse_page, params=params, cb_kwargs={"api": api, "params": params})
85
+
86
+ def parse_detail(self, response, title):
87
+ logger.success("{} 已访问 {}".format(response.status_code, title))
88
+
89
+
90
+ if __name__ == '__main__':
91
+ s = CSDNDetailSpider()
92
+ s.go()
93
+ ```
@@ -0,0 +1,15 @@
1
+ coocan/__init__.py,sha256=UqFmE7ucuR_xR3OyyBU8pxqLfCJ5AdH_HsDdTsYPf6g,55
2
+ coocan/gen.py,sha256=J6QWXkBVbiCQqey8i0BDqleRNpBswI8AyvrYmkDVQPw,1028
3
+ coocan/cmd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ coocan/cmd/cli.py,sha256=d_sG63wz8RHHa5u7HabOz36yai_D8S3bSr7VprVpTck,2420
5
+ coocan/cmd/templates/spider.txt,sha256=Htd7nOs1EeKbc8LNRUX7xyHkLWz3S0kaTPRW0M3NuUw,480
6
+ coocan/spider/__init__.py,sha256=kMDCGeqtN50raCzwfCn18s_W8xV6KO_Ny9Xol4I48Ag,58
7
+ coocan/spider/base.py,sha256=WMTnMQd7Dnv2aC7rnmkAo_WJu33p9g3GN07A2DnbdLI,5104
8
+ coocan/url/__init__.py,sha256=rEMx66XDy5AIJ9mF_2UVzHW5mRLBAWZEyQ3txrZzuZA,102
9
+ coocan/url/request.py,sha256=seZaQXQRvRMIf9WnCp3mAgNA-kxsj9P2JzAvuIt2Dx8,1116
10
+ coocan/url/response.py,sha256=AnC0xsF34q68r62EVlcHYmDH6skm9RBwRHITTb4iBbU,1785
11
+ coocan-0.4.9.dist-info/METADATA,sha256=WZhIMdcypGrPcE1_bAMxMUa3puvkngstEU4PSTR5UXo,2374
12
+ coocan-0.4.9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
13
+ coocan-0.4.9.dist-info/entry_points.txt,sha256=tOLQN_TVhl_9f2YBASTGBE_ClmG-iQ4rKmyhE2WAOY0,43
14
+ coocan-0.4.9.dist-info/top_level.txt,sha256=VwB-Q4zEljgb9v1Ms1E59B-1pBYORXuhKjgZb-LHOhk,7
15
+ coocan-0.4.9.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.45.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ cc = coocan.cmd.cli:main
@@ -0,0 +1 @@
1
+ coocan