coocan 0.3.3__tar.gz → 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: coocan
3
- Version: 0.3.3
3
+ Version: 0.4.4
4
4
  Summary: Air Spider Framework
5
5
  Author: wauo
6
6
  Author-email: markadc@126.com
File without changes
@@ -0,0 +1,41 @@
1
+ from pathlib import Path
2
+
3
+ import click
4
+
5
+ TEMPLATE_DIR = Path(__file__).parent / "templates"
6
+
7
+
8
# Root command group of the CLI; sub-commands attach to it via @main.command().
# The docstring below is rendered by click as the --help text, so it is
# user-facing output and is kept in its original language.
# Each "\b" marker must be the FIRST line of a paragraph (i.e. preceded by a
# blank line) for click to leave that paragraph un-wrapped; without the blank
# lines the command list and the example are reflowed into a single line.
@click.group()
def main():
    """爬虫者的贴心助手

    \b
    可用命令:
        new  - 创建新的爬虫文件

    \b
    示例:
        cc new -s demo
    """
18
+
19
+
20
# Sub-command `new`: copy the bundled spider template into ./<spider>.py.
# The docstring is shown by click as the command's help text, so the
# explanatory notes live in comments instead.
@main.command()
@click.option('-s', '--spider', required=True, help='爬虫文件名')
def new(spider):
    """新建"""
    spider_file_name = "{}.py".format(spider)
    template_path = TEMPLATE_DIR / "spider.py"

    try:
        # Explicit UTF-8 so the copy does not depend on the platform's
        # default encoding.
        content = template_path.read_text(encoding="utf-8")

        # "x" (exclusive creation) refuses to clobber an existing file, so
        # running the command twice cannot wipe out a spider that has
        # already been edited.
        with open(spider_file_name, "x", encoding="utf-8") as f:
            f.write(content)

    except FileExistsError as e:
        raise click.ClickException(
            "File already exists: {}".format(spider_file_name)
        ) from e
    except (OSError, UnicodeError) as e:
        # Show the underlying reason, then let click exit with a non-zero
        # status; the original exception stays attached as the cause.
        click.echo(str(e))
        raise click.ClickException("Failed") from e

    click.echo("Success create spider file {}".format(spider_file_name))
38
+
39
+
40
+ if __name__ == '__main__':
41
+ main()
File without changes
@@ -0,0 +1,12 @@
1
+ from coocan import Request, Response, MiniSpider
2
+
3
+
4
class Spider(MiniSpider):
    """Starter spider: one start URL, a Referer header, and an empty parser."""

    start_urls = ['https://github.com/markadc/coocan']
    # Upper bound on simultaneous requests for this spider.
    max_requests = 10

    def middleware(self, request: Request):
        """Adjust *request* just before it is sent."""
        # Run the base implementation first; overriding without it would
        # silently drop the base class's own header handling
        # (random User-Agent and `headers_extra_field`).
        super().middleware(request)
        request.headers["Referer"] = "https://github.com"

    def parse(self, response: Response):
        """Default callback for `start_urls`; fill in the extraction logic."""
        pass
@@ -0,0 +1,33 @@
1
+ import random
2
+
3
+
4
def gen_random_os() -> str:
    """Return one randomly chosen operating-system token for a User-Agent."""
    candidates = (
        "Windows NT 10.0; Win64; x64",
        "Windows NT 6.1; WOW64",
        "Macintosh; Intel Mac OS X 10_15_6",
        "X11; Linux x86_64",
        "Windows NT 6.3; Trident/7.0",
    )
    return random.choice(candidates)
14
+
15
+
16
def gen_random_browser() -> str:
    """Return a random ``"<Browser>/<major>.0"`` product token."""
    # (name, lowest major version, highest major version) -- bounds inclusive.
    version_ranges = (
        ("Chrome", 70, 100),
        ("Firefox", 70, 100),
        ("Edge", 80, 100),
        ("Safari", 10, 14),
        ("Opera", 60, 80),
    )
    # A version is drawn for every browser, in table order, before one pair
    # is picked, so the sequence of random draws is fixed per call.
    candidates = [
        (name, random.randint(low, high)) for name, low, high in version_ranges
    ]
    name, major = random.choice(candidates)
    return "{}/{}.0".format(name, major)
27
+
28
+
29
def gen_random_ua() -> str:
    """Build a random User-Agent string from a random OS and browser token."""
    # The OS token is drawn before the browser token.
    platform_token = gen_random_os()
    browser_token = gen_random_browser()
    return (
        "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, like Gecko) {} Safari/537.36"
    ).format(platform_token, browser_token)
@@ -0,0 +1,143 @@
1
+ import asyncio
2
+ from collections.abc import Iterator
3
+
4
+ from loguru import logger
5
+
6
+ from coocan.gen import gen_random_ua
7
+ from coocan.url import Request, Response
8
+
9
+
10
class IgnoreRequest(Exception):
    """Raised to drop the current request without any further retries."""
13
+
14
+
15
class IgnoreResponse(Exception):
    """Raised to discard a response so that its callback is never invoked."""
18
+
19
+
20
class MiniSpider:
    """Small asyncio spider: a priority queue of requests drained by workers.

    Subclasses normally set ``start_urls`` and override ``parse``; the other
    hooks (``middleware``, ``validator``, ``handle_request_excetpion``,
    ``handle_callback_excetpion``) have working defaults.
    """

    # URLs turned into the initial requests by ``start_requests``.
    # NOTE: class-level mutable; assign a new list in subclasses rather than
    # mutating this one in place, or the change is shared by every spider.
    start_urls = []
    # Number of worker tasks and size of the concurrency semaphore.
    max_requests = 5
    # Extra attempts after the first failed one (total = max_retry_times + 1).
    max_retry_times = 3
    # When True, ``middleware`` fills in a random User-Agent if none is set.
    enable_random_ua = True
    # Headers merged into every request by ``middleware`` (same sharing
    # caveat as ``start_urls``).
    headers_extra_field = {}
    # Value passed to ``asyncio.sleep`` before each attempt.
    delay = 0

    def start_requests(self):
        """Yield one request per entry of ``start_urls``, with ``parse`` as callback.

        Raises:
            AssertionError: if ``start_urls`` is empty.
        """
        # Raised explicitly (instead of via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if not self.start_urls:
            raise AssertionError("没有起始 URL 列表")
        for url in self.start_urls:
            yield Request(url, self.parse)

    def middleware(self, request: "Request"):
        """Prepare *request* in place before each attempt to send it."""
        # Random User-Agent; setdefault keeps a value that is already present.
        if self.enable_random_ua is True:
            request.headers.setdefault("User-Agent", gen_random_ua())

        # Merge the extra header fields (these overwrite existing keys).
        if self.headers_extra_field:
            request.headers.update(self.headers_extra_field)

    def validator(self, response: "Response"):
        """Hook to check a response; raise ``IgnoreResponse`` to skip its callback.

        The default accepts everything.
        """
        pass

    def parse(self, response: "Response"):
        """Default callback; subclasses must override it.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))

    def handle_request_excetpion(self, e: Exception, request: "Request"):
        """Hook called when sending *request* (or its middleware) raised *e*.

        Return a ``Request`` to enqueue it in place of the failed one, raise
        ``IgnoreRequest`` to stop retrying, or return ``None`` to let the
        retry loop continue. The default only logs the error.
        """
        logger.error("{} {}".format(type(e).__name__, request.url))

    def handle_callback_excetpion(self, e: Exception, request: "Request", response: "Response"):
        """Hook called when the callback of *request* raised *e*.

        *response* is the object returned by ``request.send()``. The default
        only logs the error.
        """
        logger.error("{} `回调`时出现异常 | {} | {} | {}".format(response.status_code, e, request.callback.__name__, request.url))

    async def worker(self, queue: asyncio.PriorityQueue, semaphore: asyncio.Semaphore):
        """Worker task: take requests from *queue* and process them until told to stop.

        A request whose ``url`` is the empty string is the stop signal.
        """
        while True:
            req: "Request" = await queue.get()

            # Stop signal.
            if req.url == "":
                break

            try:
                # Bound the number of requests being processed at once.
                async with semaphore:
                    for i in range(self.max_retry_times + 1):
                        # From the second iteration on this is a retry.
                        if i > 0:
                            logger.debug("正在重试第{}次... {}".format(i, req.url))

                        # Send the request...
                        try:
                            self.middleware(req)
                            await asyncio.sleep(self.delay)
                            resp = await req.send()

                        # ...which failed.
                        except Exception as e:
                            try:
                                result = self.handle_request_excetpion(e, req)
                                if isinstance(result, Request):
                                    # The handler supplied a replacement
                                    # request; stop retrying this one.
                                    await queue.put(result)
                                    break
                            except IgnoreRequest as ignored:
                                logger.debug("{} 忽略请求 {}".format(ignored, req.url))
                                break
                            except Exception as handler_error:
                                # A broken handler must not kill the worker;
                                # log it and move on to the next retry.
                                logger.error("`处理异常函数`异常了 | {} | {}".format(handler_error, req.url))

                        # ...which succeeded.
                        else:
                            # Validate the response.
                            try:
                                self.validator(resp)
                            except IgnoreResponse as ignored:
                                logger.debug("{} 忽略响应 {}".format(ignored, req.url))
                                break
                            except Exception as validator_error:
                                # A failing validator is only logged; the
                                # callback below still runs.
                                logger.error("`校验器`函数异常了 | {} | {}".format(validator_error, req.url))

                            # Run the callback.
                            try:
                                cached = req.callback(Response(resp), **req.cb_kwargs)
                                if isinstance(cached, Iterator):
                                    for next_request in cached:
                                        # Enqueue follow-up requests.
                                        await queue.put(next_request)
                            except Exception as e:
                                try:
                                    self.handle_callback_excetpion(e, req, resp)
                                except Exception as handler_error:
                                    logger.error("`处理异常函数`异常了 | {} | {}".format(handler_error, req.url))

                            # A request that got a response is never retried.
                            # This is a plain ``break`` rather than
                            # ``finally: break`` so that KeyboardInterrupt,
                            # cancellation and other non-Exception errors are
                            # not silently discarded.
                            break
            finally:
                # Every get() needs a matching task_done(), even when an
                # error propagates, otherwise queue.join() never returns.
                queue.task_done()

    async def run(self):
        """Crawl entry point: start workers, feed the queue, wait, then stop them."""
        queue = asyncio.PriorityQueue()
        semaphore = asyncio.Semaphore(self.max_requests)

        # Start the worker tasks.
        workers = [
            asyncio.create_task(self.worker(queue, semaphore))
            for _ in range(self.max_requests)
        ]

        # Enqueue the initial requests.
        for req in self.start_requests():
            await queue.put(req)

        # Wait until every queued request has been processed.
        await queue.join()

        # Send one stop signal per worker.
        for _ in range(self.max_requests):
            await queue.put(Request(url=""))

        # Wait for the workers to exit.
        await asyncio.gather(*workers)

    def go(self):
        """Run the spider to completion on a fresh event loop."""
        asyncio.run(self.run())
@@ -1,3 +1,4 @@
1
+ import time
1
2
  from typing import Callable
2
3
 
3
4
  import httpx
@@ -6,7 +7,7 @@ cli = httpx.AsyncClient()
6
7
 
7
8
 
8
9
  class Request:
9
- def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
10
+ def __init__(self, url: str, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, priority=None):
10
11
  self.url = url
11
12
  self.callback = callback
12
13
  self.params = params
@@ -15,6 +16,7 @@ class Request:
15
16
  self.json = json
16
17
  self.timeout = timeout
17
18
  self.cb_kwargs = cb_kwargs or {}
19
+ self.priority = priority or time.time()
18
20
 
19
21
  async def send(self):
20
22
  if (self.data and self.json) is None:
@@ -24,3 +26,6 @@ class Request:
24
26
  else:
25
27
  raise Exception("仅支持 GET 和 POST 请求")
26
28
  return response
29
+
30
+ def __lt__(self, other):
31
+ return self.priority < other.priority
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: coocan
3
- Version: 0.3.3
3
+ Version: 0.4.4
4
4
  Summary: Air Spider Framework
5
5
  Author: wauo
6
6
  Author-email: markadc@126.com
@@ -1,9 +1,16 @@
1
1
  setup.py
2
2
  coocan/__init__.py
3
+ coocan/gen.py
3
4
  coocan.egg-info/PKG-INFO
4
5
  coocan.egg-info/SOURCES.txt
5
6
  coocan.egg-info/dependency_links.txt
7
+ coocan.egg-info/entry_points.txt
8
+ coocan.egg-info/requires.txt
6
9
  coocan.egg-info/top_level.txt
10
+ coocan/cmd/__init__.py
11
+ coocan/cmd/cli.py
12
+ coocan/cmd/templates/__init__.py
13
+ coocan/cmd/templates/spider.py
7
14
  coocan/spider/__init__.py
8
15
  coocan/spider/base.py
9
16
  coocan/url/__init__.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ cc = coocan.cmd.cli:main
@@ -0,0 +1 @@
1
+ click>=8.0.0
@@ -2,10 +2,18 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="coocan",
5
- version="0.3.3",
5
+ version="0.4.4",
6
6
  author="wauo",
7
7
  author_email="markadc@126.com",
8
8
  description="Air Spider Framework",
9
9
  packages=find_packages(),
10
10
  python_requires=">=3.10",
11
+ install_requires=[
12
+ 'click>=8.0.0',
13
+ ],
14
+ entry_points={
15
+ 'console_scripts': [
16
+ 'cc=coocan.cmd.cli:main',
17
+ ],
18
+ }
11
19
  )
@@ -1,98 +0,0 @@
1
- import asyncio
2
- from collections.abc import Iterator
3
-
4
- from loguru import logger
5
-
6
- from coocan.url import Request, Response
7
-
8
-
9
- class IgnoreRequest(Exception):
10
- pass
11
-
12
-
13
- class MiniSpider:
14
- start_urls = []
15
- max_requests = 5
16
- max_retry_times = 3
17
-
18
- def start_requests(self):
19
- """初始请求"""
20
- assert self.start_urls, "没有起始 URL 列表"
21
- for url in self.start_urls:
22
- yield Request(url, self.parse)
23
-
24
- def middleware(self, request: Request):
25
- request.headers.setdefault("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0")
26
-
27
- async def get_response(self, request: Request):
28
- """发送请求,获取响应"""
29
- self.middleware(request)
30
- response = await request.send()
31
- return response
32
-
33
- def parse(self, response: Response):
34
- raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))
35
-
36
- def when_request_error(self, e: Exception, request: Request):
37
- logger.error("{} {}".format(type(e).__name__, request.url))
38
-
39
- async def worker(self, queue, semaphore):
40
- """工作协程,从队列中获取请求并处理"""
41
- while True:
42
- request = await queue.get()
43
-
44
- # 结束信号
45
- if request is None:
46
- break
47
-
48
- # 控制并发
49
- async with semaphore:
50
- for i in range(self.max_retry_times + 1):
51
- if i > 0:
52
- logger.debug("正在重试第{}次...{}".format(i, request.url))
53
- try:
54
- response = await self.get_response(request)
55
- cached = request.callback(Response(response), **request.cb_kwargs)
56
- if isinstance(cached, Iterator):
57
- for next_request in cached:
58
- await queue.put(next_request) # 将后续请求加入队列
59
- except Exception as e:
60
- try:
61
- result = self.when_request_error(e, request)
62
- if isinstance(result, Request):
63
- await queue.put(result)
64
- logger.debug("新的请求 {}".format(result.url))
65
- except IgnoreRequest as e:
66
- logger.debug("{} 忽略请求 {}".format(e, request.url))
67
- break
68
- else:
69
- break
70
- queue.task_done()
71
-
72
- async def run(self):
73
- """爬取入口"""
74
- queue = asyncio.Queue()
75
- semaphore = asyncio.Semaphore(self.max_requests)
76
-
77
- # 工作协程启动...
78
- workers = [
79
- asyncio.create_task(self.worker(queue, semaphore))
80
- for _ in range(self.max_requests)
81
- ]
82
-
83
- # 将初始请求加入队列
84
- for req in self.start_requests():
85
- await queue.put(req)
86
-
87
- # 等待队列中的所有任务完成
88
- await queue.join()
89
-
90
- # ...停止工作协程
91
- for _ in range(self.max_requests):
92
- await queue.put(None)
93
-
94
- # 等待所有工作协程完成
95
- await asyncio.gather(*workers)
96
-
97
- def go(self):
98
- asyncio.run(self.run())
File without changes
File without changes
File without changes
File without changes
File without changes