coocan 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coocan-0.0.1/PKG-INFO +12 -0
- coocan-0.0.1/coocan/__init__.py +2 -0
- coocan-0.0.1/coocan/spider/__init__.py +1 -0
- coocan-0.0.1/coocan/spider/base.py +84 -0
- coocan-0.0.1/coocan/url/__init__.py +2 -0
- coocan-0.0.1/coocan/url/request.py +21 -0
- coocan-0.0.1/coocan/url/response.py +51 -0
- coocan-0.0.1/coocan.egg-info/PKG-INFO +12 -0
- coocan-0.0.1/coocan.egg-info/SOURCES.txt +11 -0
- coocan-0.0.1/coocan.egg-info/dependency_links.txt +1 -0
- coocan-0.0.1/coocan.egg-info/top_level.txt +1 -0
- coocan-0.0.1/setup.cfg +4 -0
- coocan-0.0.1/setup.py +16 -0
coocan-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
from coocan.spider.base import Spider
|
@@ -0,0 +1,84 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Iterator
|
3
|
+
|
4
|
+
from loguru import logger
|
5
|
+
|
6
|
+
import coocan
|
7
|
+
from coocan.url import Request
|
8
|
+
|
9
|
+
|
10
|
+
class Spider:
    """Minimal asynchronous crawler.

    A pool of worker coroutines drains an ``asyncio.Queue`` of ``Request``
    objects; a semaphore bounds how many requests are in flight at once.
    Callbacks may yield further ``Request`` objects, which are fed back
    into the queue.
    """

    # Seed URLs — subclasses must override this.
    start_urls = []
    # Number of worker coroutines AND the concurrency limit for fetches.
    max_requests = 5

    def start_requests(self):
        """Yield one initial Request per start URL, parsed by ``self.parse``.

        Raises:
            ValueError: if ``start_urls`` is empty.  (Was an ``assert``,
            which is silently stripped under ``python -O``.)
        """
        if not self.start_urls:
            raise ValueError("没有起始 URL 列表")
        for url in self.start_urls:
            yield coocan.Request(url, self.parse)

    def middleware(self, request: Request):
        """Hook run on every request before sending; default sets a browser UA."""
        request.headers.setdefault("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0")

    async def get_response(self, request: Request):
        """Send one request; return the response, or None on any failure.

        Errors are logged and swallowed so a single bad URL cannot kill a
        worker.  NOTE(review): the failed request is dropped, not retried.
        """
        try:
            self.middleware(request)
            response = await request.send()
        except Exception as e:
            logger.error("{} {}".format(request.url, e))
            return None
        return response

    def parse(self, response):
        """Default callback — must be overridden (or bypassed via explicit callbacks)."""
        raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))

    async def worker(self, queue, semaphore):
        """Worker loop: consume requests from *queue* until a ``None`` sentinel.

        ``task_done()`` is called only for real requests, never for the
        sentinel, so ``queue.join()`` (which runs before sentinels are
        enqueued) balances exactly.
        """
        while True:
            request = await queue.get()

            if request is None:  # shutdown sentinel
                break

            async with semaphore:  # bound concurrent fetches
                response = await self.get_response(request)
                if response:
                    try:
                        produced = request.callback(response, **request.cb_kwargs)
                        # A generator/iterator callback yields follow-up requests.
                        if isinstance(produced, Iterator):
                            for next_request in produced:
                                await queue.put(next_request)
                    except Exception as e:
                        logger.error(e)

            queue.task_done()

    async def run(self):
        """Crawl entry point: start workers, seed the queue, wait, shut down."""
        queue = asyncio.Queue()
        semaphore = asyncio.Semaphore(self.max_requests)

        # Launch the fixed worker pool first so seeds are consumed immediately.
        workers = [
            asyncio.create_task(self.worker(queue, semaphore))
            for _ in range(self.max_requests)
        ]

        # Enqueue the initial requests.
        for req in self.start_requests():
            await queue.put(req)

        # Block until every enqueued request (including follow-ups) is done.
        await queue.join()

        # One sentinel per worker so every loop exits.
        for _ in range(self.max_requests):
            await queue.put(None)

        # Reap the workers.
        await asyncio.gather(*workers)

    def go(self):
        """Synchronous convenience wrapper around :meth:`run`."""
        asyncio.run(self.run())
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from typing import Callable
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
|
5
|
+
cli = httpx.AsyncClient()
|
6
|
+
|
7
|
+
|
8
|
+
class Request:
    """One HTTP request plus the callback that will handle its response.

    Args:
        url: target URL.
        callback: invoked as ``callback(response, **cb_kwargs)``.
        params: query-string parameters.
        headers: HTTP headers; defaults to a fresh empty dict per request.
        data: form body — when given, the request is sent as a POST.
        json: JSON body — when given, the request is sent as a POST.
        timeout: per-request timeout forwarded to the client.
        cb_kwargs: extra keyword arguments for the callback.
    """

    def __init__(self, url, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None):
        self.url = url
        self.callback = callback
        self.params = params
        self.headers = headers or {}
        self.data = data
        self.json = json
        self.timeout = timeout
        self.cb_kwargs = cb_kwargs or {}

    async def send(self):
        """Send the request via the shared async client and return the response.

        Fix: ``data``/``json`` were previously stored but never transmitted —
        ``send`` always issued a bare GET.  A body now triggers a POST.
        """
        if self.data is not None or self.json is not None:
            return await cli.post(
                self.url,
                params=self.params,
                headers=self.headers,
                data=self.data,
                json=self.json,
                timeout=self.timeout,
            )
        return await cli.get(self.url, params=self.params, headers=self.headers, timeout=self.timeout)
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from typing import Callable

from parsel import Selector
from requests import Response

# FIX: was "from cocoman.spider.errors import ..." — "cocoman" is a typo for
# "coocan".  NOTE(review): coocan/spider/errors.py is not listed in
# SOURCES.txt, so this module cannot import until that file ships — confirm.
from coocan.spider.errors import ResponseCodeError, ResponseTextError


class SelectorResponse(Response):
    """A ``requests.Response`` wrapper exposing XPath/CSS selection via parsel.

    NOTE(review): ``Request.send()`` in coocan/url/request.py uses httpx and
    returns an ``httpx.Response``, not a ``requests.Response`` — confirm which
    client this wrapper is meant to adapt.
    """

    def __init__(self, response: Response):
        super().__init__()
        # Adopt the wrapped response's state wholesale, then attach a selector
        # built from its decoded body text.
        self.__dict__.update(response.__dict__)
        self.selector = Selector(text=response.text)

    def __str__(self):
        return "<Response {}>".format(self.status_code)

    def xpath(self, query: str):
        """Run an XPath *query* against the body and return the SelectorList."""
        return self.selector.xpath(query)

    def css(self, query: str):
        """Run a CSS *query* against the body and return the SelectorList."""
        return self.selector.css(query)

    def get_one(self, query: str, default=None, strip=True):
        """Return the first XPath match (stripped by default), else *default*."""
        v = self.selector.xpath(query).get(default=default)
        return v.strip() if strip and isinstance(v, str) else v

    def get_all(self, query: str, strip=True):
        """Return every XPath match, each stripped by default."""
        return [v.strip() if strip else v for v in self.selector.xpath(query).getall()]

    def raise_for_status(self, codes: list = None):
        """Raise ResponseCodeError unless ``status_code`` is in *codes* (default ``[200]``)."""
        codes = codes or [200]
        if self.status_code not in codes:
            raise ResponseCodeError("{} not in {}".format(self.status_code, codes))

    def raise_for_text(self, validate: Callable[[str], bool] = None):
        """Raise ResponseTextError when *validate* explicitly rejects the body.

        Fix: a ``None`` validator previously crashed with TypeError; it is now
        treated as "no validation".  Only a literal ``False`` return rejects.
        """
        if validate is not None and validate(self.text) is False:
            raise ResponseTextError("not ideal text")

    def raise_has_text(self, text: str):
        """Raise ResponseTextError if *text* occurs in the body.

        Fix: previously used ``assert``, which raised AssertionError instead
        of ResponseTextError and is stripped entirely under ``python -O``.
        """
        if text in self.text:
            raise ResponseTextError("has text: {}".format(text))

    def raise_no_text(self, text: str):
        """Raise ResponseTextError if *text* does NOT occur in the body.

        Fix: same ``assert`` problem as :meth:`raise_has_text`.
        """
        if text not in self.text:
            raise ResponseTextError("no text: {}".format(text))
|
@@ -0,0 +1,11 @@
|
|
1
|
+
setup.py
|
2
|
+
coocan/__init__.py
|
3
|
+
coocan.egg-info/PKG-INFO
|
4
|
+
coocan.egg-info/SOURCES.txt
|
5
|
+
coocan.egg-info/dependency_links.txt
|
6
|
+
coocan.egg-info/top_level.txt
|
7
|
+
coocan/spider/__init__.py
|
8
|
+
coocan/spider/base.py
|
9
|
+
coocan/url/__init__.py
|
10
|
+
coocan/url/request.py
|
11
|
+
coocan/url/response.py
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
coocan
|
coocan-0.0.1/setup.cfg
ADDED
coocan-0.0.1/setup.py
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
from setuptools import setup, find_packages

# The PyPI long description is taken verbatim from the repository readme.
with open("readme.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="coocan",
    version="0.0.1",
    author="wauo",
    author_email="markadc@126.com",
    description="Air Spider Framework",
    packages=find_packages(),
    python_requires=">=3.10",
    # Fix: runtime dependencies were missing entirely, although the package
    # imports all of these (coocan/spider/base.py and coocan/url/*.py).
    install_requires=[
        "httpx",
        "loguru",
        "parsel",
        "requests",
    ],
    long_description=long_description,
    long_description_content_type="text/markdown",
)
|