coocan 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coocan-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.1
2
+ Name: coocan
3
+ Version: 0.0.1
4
+ Summary: Air Spider Framework
5
+ Author: wauo
6
+ Author-email: markadc@126.com
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+
10
+ # 项目说明
11
+
12
+ - 一个轻量爬虫框架
@@ -0,0 +1,2 @@
1
+ from coocan.spider import *
2
+ from coocan.url import *
@@ -0,0 +1 @@
1
+ from coocan.spider.base import Spider
@@ -0,0 +1,84 @@
1
+ import asyncio
2
+ from collections.abc import Iterator
3
+
4
+ from loguru import logger
5
+
6
+ import coocan
7
+ from coocan.url import Request
8
+
9
+
10
class Spider:
    """Base class for a small asyncio-based crawler.

    Subclasses list their entry points in ``start_urls`` and override
    ``parse``. ``go()`` runs the crawl until the request queue is drained.
    """

    # Entry-point URLs. Subclasses should assign their own list instead of
    # mutating this one, because a class-level list is shared.
    start_urls = []
    # Number of worker coroutines, and the size of the request semaphore.
    max_requests = 5

    def start_requests(self):
        """Yield the initial requests, one per entry in ``start_urls``.

        Each request uses ``self.parse`` as its callback.

        Raises:
            AssertionError: if ``start_urls`` is empty.
        """
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O.
        if not self.start_urls:
            raise AssertionError("没有起始 URL 列表")
        for url in self.start_urls:
            yield Request(url, self.parse)

    def middleware(self, request: "Request"):
        """Adjust ``request`` in place before it is sent.

        The default implementation only fills in a browser User-Agent when
        the request does not already carry one.
        """
        request.headers.setdefault("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0")

    async def get_response(self, request: "Request"):
        """Run the middleware, send ``request`` and return its response.

        Returns:
            The response, or ``None`` when the middleware or the send raised;
            the error is logged together with the request URL.
        """
        try:
            self.middleware(request)
            return await request.send()
        except Exception as e:
            logger.error("{} {}".format(request.url, e))
            return None

    def parse(self, response):
        """Default callback; subclasses must override it.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError("没有定义回调函数 {}.parse ".format(self.__class__.__name__))

    async def worker(self, queue, semaphore):
        """Consume requests from ``queue`` until a ``None`` sentinel arrives.

        For each request the response is fetched and handed to the request's
        callback. If the callback returns an iterator (e.g. it is a
        generator), every item it produces is queued as a follow-up request.
        """
        while True:
            request = await queue.get()

            # Shutdown signal sent by run().
            if request is None:
                break

            try:
                # Limit concurrency.
                async with semaphore:
                    response = await self.get_response(request)
                    # Compare against None explicitly: get_response() signals
                    # failure with None, and a response type may define its
                    # own truthiness.
                    if response is not None:
                        try:
                            produced = request.callback(response, **request.cb_kwargs)
                            if isinstance(produced, Iterator):
                                for next_request in produced:
                                    # None is reserved as the shutdown
                                    # sentinel; queuing one yielded by a
                                    # callback would stop a worker.
                                    if next_request is None:
                                        continue
                                    await queue.put(next_request)
                        except Exception as e:
                            logger.error(e)
            finally:
                # Always balance queue.get(); otherwise queue.join() in run()
                # would wait forever.
                queue.task_done()

    async def run(self):
        """Crawl entry point: start workers, seed the queue, wait, shut down."""
        queue = asyncio.Queue()
        semaphore = asyncio.Semaphore(self.max_requests)

        # Start the worker coroutines.
        workers = [
            asyncio.create_task(self.worker(queue, semaphore))
            for _ in range(self.max_requests)
        ]

        try:
            # Seed the queue with the initial requests.
            for req in self.start_requests():
                await queue.put(req)

            # Wait until every queued request has been processed.
            await queue.join()

            # Tell each worker to stop.
            for _ in workers:
                await queue.put(None)

            # Wait for the workers to exit.
            await asyncio.gather(*workers)
        finally:
            # If anything above failed, do not leave worker tasks pending.
            # Cancelling a task that already finished is a no-op.
            for task in workers:
                task.cancel()
            await asyncio.gather(*workers, return_exceptions=True)

    def go(self):
        """Run the crawl to completion in a new event loop."""
        asyncio.run(self.run())
@@ -0,0 +1,2 @@
1
+ from coocan.url.request import Request
2
+ from coocan.url.response import SelectorResponse
@@ -0,0 +1,21 @@
1
+ from typing import Callable
2
+
3
+ import httpx
4
+
5
+ cli = httpx.AsyncClient()  # module-level async client, created at import time and used by Request.send()
6
+
7
+
8
class Request:
    """Description of one HTTP request plus the callback that handles its response."""

    def __init__(self, url, callback: Callable = None, params=None, headers=None, data=None, json=None, timeout=None, cb_kwargs=None, method: str = None):
        """Store the request parameters.

        Args:
            url: Target URL.
            callback: Callable invoked with the response and ``cb_kwargs``.
            params: Query-string parameters.
            headers: Request headers; copied, so later changes to the request
                do not leak into the caller's mapping.
            data: Form body.
            json: JSON body.
            timeout: Forwarded to the HTTP client unchanged. Per the httpx
                documentation, ``None`` disables the timeout.
            cb_kwargs: Extra keyword arguments for ``callback``; copied.
            method: HTTP method. When omitted, "POST" is used if ``data`` or
                ``json`` is given and "GET" otherwise.
        """
        self.url = url
        self.callback = callback
        self.params = params
        # Copy instead of aliasing: the headers are mutated before sending
        # (e.g. a default User-Agent is added), which must not touch a dict
        # the caller shares between requests.
        self.headers = dict(headers) if headers else {}
        self.data = data
        self.json = json
        self.timeout = timeout
        self.cb_kwargs = dict(cb_kwargs) if cb_kwargs else {}
        has_body = data is not None or json is not None
        self.method = (method or ("POST" if has_body else "GET")).upper()

    async def send(self):
        """Send the request with the shared client and return the response."""
        # cli.request() is used instead of cli.get() so that a body passed as
        # ``data`` or ``json`` is actually transmitted.
        return await cli.request(
            self.method,
            self.url,
            params=self.params,
            headers=self.headers,
            data=self.data,
            json=self.json,
            timeout=self.timeout,
        )
@@ -0,0 +1,51 @@
1
+ from typing import Callable
2
+
3
+ from parsel import Selector
4
+ from requests import Response
5
+
6
+ from cocoman.spider.errors import ResponseCodeError, ResponseTextError
7
+
8
+
9
class SelectorResponse(Response):
    """Response that can also be queried with XPath and CSS expressions."""

    def __init__(self, response: Response):
        """Adopt the state of ``response`` and build a selector over its text."""
        super().__init__()
        # Copy every attribute of the wrapped response onto this object.
        self.__dict__.update(response.__dict__)
        self.selector = Selector(text=response.text)

    def __str__(self):
        return "<Response {}>".format(self.status_code)

    def xpath(self, query: str):
        """Return the selector result of the XPath ``query``."""
        return self.selector.xpath(query)

    def css(self, query: str):
        """Return the selector result of the CSS ``query``."""
        return self.selector.css(query)

    def get_one(self, query: str, default=None, strip=True):
        """Return the first XPath match, or ``default`` when nothing matches.

        String results are stripped of surrounding whitespace when ``strip``
        is true; non-string values (such as a ``None`` default) are returned
        unchanged.
        """
        v = self.selector.xpath(query).get(default=default)
        return v.strip() if strip and isinstance(v, str) else v

    def get_all(self, query: str, strip=True):
        """Return every XPath match as a list, stripped when ``strip`` is true."""
        return [v.strip() if strip else v for v in self.selector.xpath(query).getall()]

    def raise_for_status(self, codes: list = None):
        """Raise ``ResponseCodeError`` unless the status code is in ``codes``.

        ``codes`` defaults to ``[200]``.
        """
        codes = codes or [200]
        if self.status_code not in codes:
            raise ResponseCodeError("{} not in {}".format(self.status_code, codes))

    def raise_for_text(self, validate: Callable[[str], bool] = None):
        """Raise ``ResponseTextError`` when ``validate(self.text)`` returns ``False``.

        Without a validator there is nothing to check, so the call is a
        no-op instead of failing on ``None(...)``.
        """
        if validate is None:
            return
        if validate(self.text) is False:
            raise ResponseTextError("not ideal text")

    def raise_has_text(self, text: str):
        """Raise ``ResponseTextError`` if the body contains ``text``."""
        # An explicit raise: ``assert cond, ResponseTextError(...)`` would
        # raise AssertionError instead, and is removed entirely under -O.
        if self.text.find(text) != -1:
            raise ResponseTextError("has text: {}".format(text))

    def raise_no_text(self, text: str):
        """Raise ``ResponseTextError`` if the body does not contain ``text``."""
        if self.text.find(text) == -1:
            raise ResponseTextError("no text: {}".format(text))
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.1
2
+ Name: coocan
3
+ Version: 0.0.1
4
+ Summary: Air Spider Framework
5
+ Author: wauo
6
+ Author-email: markadc@126.com
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+
10
+ # 项目说明
11
+
12
+ - 一个轻量爬虫框架
@@ -0,0 +1,11 @@
1
+ setup.py
2
+ coocan/__init__.py
3
+ coocan.egg-info/PKG-INFO
4
+ coocan.egg-info/SOURCES.txt
5
+ coocan.egg-info/dependency_links.txt
6
+ coocan.egg-info/top_level.txt
7
+ coocan/spider/__init__.py
8
+ coocan/spider/base.py
9
+ coocan/url/__init__.py
10
+ coocan/url/request.py
11
+ coocan/url/response.py
@@ -0,0 +1 @@
1
+ coocan
coocan-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
coocan-0.0.1/setup.py ADDED
@@ -0,0 +1,16 @@
1
from setuptools import setup, find_packages

# The README is used as the long description of the distribution.
with open("readme.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="coocan",
    version="0.0.1",
    author="wauo",
    author_email="markadc@126.com",
    description="Air Spider Framework",
    packages=find_packages(),
    python_requires=">=3.10",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # Third-party packages imported by the coocan modules; without this list
    # a fresh install cannot import the package.
    # NOTE(review): coocan/url/response.py also imports from `cocoman` --
    # confirm whether that is a real dependency or a typo for `coocan`.
    install_requires=[
        "httpx",
        "loguru",
        "parsel",
        "requests",
    ],
)