crawlo 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their public registries; it is provided for informational purposes only and reflects the changes between those versions.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +9 -6
- crawlo/__version__.py +1 -2
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -59
- crawlo/crawler.py +242 -222
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +259 -96
- crawlo/downloader/httpx_downloader.py +187 -48
- crawlo/downloader/playwright_downloader.py +160 -160
- crawlo/event.py +11 -11
- crawlo/exceptions.py +64 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -130
- crawlo/filters/memory_filter.py +202 -203
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +118 -118
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +140 -140
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +204 -233
- crawlo/network/response.py +166 -162
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +133 -133
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +94 -89
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +36 -36
- crawlo/stats_collector.py +59 -47
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +177 -177
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +39 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +122 -85
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +302 -302
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/METADATA +48 -48
- crawlo-1.0.4.dist-info/RECORD +79 -0
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/top_level.txt +1 -0
- tests/__init__.py +7 -0
- tests/baidu_spider/__init__.py +7 -0
- tests/baidu_spider/demo.py +94 -0
- tests/baidu_spider/items.py +25 -0
- tests/baidu_spider/middleware.py +49 -0
- tests/baidu_spider/pipeline.py +55 -0
- tests/baidu_spider/request_fingerprints.txt +9 -0
- tests/baidu_spider/run.py +27 -0
- tests/baidu_spider/settings.py +80 -0
- tests/baidu_spider/spiders/__init__.py +7 -0
- tests/baidu_spider/spiders/bai_du.py +61 -0
- tests/baidu_spider/spiders/sina.py +79 -0
- crawlo/filters/redis_filter.py +0 -120
- crawlo-1.0.2.dist-info/RECORD +0 -68
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/WHEEL +0 -0
- {crawlo-1.0.2.dist-info → crawlo-1.0.4.dist-info}/entry_points.txt +0 -0
crawlo/utils/project.py
CHANGED
@@ -1,59 +1,59 @@
All 59 lines are removed and re-added with identical text, so no content-level change is visible in this hunk (whatever changed, e.g. whitespace or line endings, is not recoverable from this extraction). The file, shown once, reads:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
import os
import sys
from importlib import import_module
from inspect import iscoroutinefunction
from typing import Callable

from crawlo.settings.setting_manager import SettingManager


def _get_closest(path='.'):
    path = os.path.abspath(path)
    return path


def _init_env():
    closest = _get_closest()
    if closest:
        sys.path.append(closest)
        # project_dir = os.path.dirname(closest)
        # sys.path.append(project_dir)


def get_settings(settings='settings'):
    _settings = SettingManager()
    _init_env()
    _settings.set_settings(settings)
    return _settings


def merge_settings(spider, settings):
    if hasattr(spider, 'custom_settings'):
        custom_settings = getattr(spider, 'custom_settings')
        settings.update_attributes(custom_settings)


def load_class(_path):
    if not isinstance(_path, str):
        if callable(_path):
            return _path
        else:
            raise TypeError(f"args expect str or object, got {_path}")

    module_name, class_name = _path.rsplit('.', 1)
    module = import_module(module_name)

    try:
        cls = getattr(module, class_name)
    except AttributeError:
        raise NameError(f"Module {module_name!r} has no class named {class_name!r}")
    return cls


async def common_call(func: Callable, *args, **kwargs):
    if iscoroutinefunction(func):
        return await func(*args, **kwargs)
    else:
        return func(*args, **kwargs)
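For orientation, here is a minimal usage sketch (not part of the package) of the two most reusable helpers above, load_class and common_call: resolving an object from a dotted import path and invoking a callable that may or may not be a coroutine function. The stdlib path 'json.JSONDecoder' and the hook functions are illustrative stand-ins, not crawlo APIs.

import asyncio

from crawlo.utils.project import load_class, common_call


async def demo():
    # load_class resolves "module.ClassName" via import_module + getattr;
    # a stdlib class is used here purely as an illustration.
    decoder_cls = load_class('json.JSONDecoder')
    print(decoder_cls.__name__)  # JSONDecoder

    # common_call lets callers ignore whether a hook is sync or async.
    def sync_hook(x):
        return x * 2

    async def async_hook(x):
        return x * 2

    print(await common_call(sync_hook, 21))   # 42
    print(await common_call(async_hook, 21))  # 42


if __name__ == '__main__':
    asyncio.run(demo())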
crawlo/utils/request.py
CHANGED
@@ -1,85 +1,122 @@
The removed 85-line version shares the same shebang, encoding line, module docstring, imports, and the opening of to_bytes, except that its signature was `def to_bytes(data: Any, encoding='utf-8') -> bytes:` (no type annotation on `encoding`); the rest of its body is blank in this extraction and cannot be recovered. The new 122-line version (comments and docstrings translated from Chinese) reads:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
"""
# @Time    : 2025-07-08 08:55
# @Author  : crawl-coder
# @Desc    : None
"""
import json
import hashlib
from typing import Any, Optional, Iterable, Union
from w3lib.url import canonicalize_url

from crawlo import Request


def to_bytes(data: Any, encoding: str = 'utf-8') -> bytes:
    """
    Convert values of various types to bytes.

    Args:
        data: the value to convert; str, bytes, dict, int, float, bool, None and similar types are supported
        encoding: string encoding, defaults to 'utf-8'

    Returns:
        bytes: the converted byte data

    Raises:
        TypeError: if the data type cannot be converted
        UnicodeEncodeError: if encoding fails
        ValueError: if JSON serialization fails

    Examples:
        >>> to_bytes("hello")
        b'hello'
        >>> to_bytes({"key": "value"})
        b'{"key": "value"}'
        >>> to_bytes(123)
        b'123'
        >>> to_bytes(None)
        b'null'
    """
    # Validate the encoding argument up front
    if not isinstance(encoding, str):
        raise TypeError(f"encoding must be str, not {type(encoding).__name__}")

    try:
        if isinstance(data, bytes):
            return data
        elif isinstance(data, str):
            return data.encode(encoding)
        elif isinstance(data, dict):
            return json.dumps(data, sort_keys=True, ensure_ascii=False, separators=(',', ':')).encode(encoding)
        elif isinstance(data, (int, float, bool)):
            return str(data).encode(encoding)
        elif data is None:
            return b'null'
        elif hasattr(data, '__str__'):
            # Handle other objects that can be converted to a string
            return str(data).encode(encoding)
        else:
            raise TypeError(
                f"`data` must be str, dict, bytes, int, float, bool, or None, "
                f"not {type(data).__name__}"
            )
    except (UnicodeEncodeError, ValueError) as e:
        raise type(e)(f"Failed to convert {type(data).__name__} to bytes: {str(e)}") from e


def request_fingerprint(
    request: Request,
    include_headers: Optional[Iterable[Union[bytes, str]]] = None
) -> str:
    """
    Generate a request fingerprint from the method, the canonicalized URL, the body
    and, optionally, selected headers. SHA256 is used for better security.

    :param request: Request object (must expose method, url, body, headers)
    :param include_headers: header names (str or bytes) to include in the fingerprint
    :return: the request fingerprint (hex string)
    """
    hash_func = hashlib.sha256()

    # Core fields
    hash_func.update(to_bytes(request.method))
    hash_func.update(to_bytes(canonicalize_url(request.url)))
    hash_func.update(request.body or b'')

    # Optional headers
    if include_headers:
        headers = request.headers  # assumed to be a dict-like or MultiDict-like structure
        for header_name in include_headers:
            name_bytes = to_bytes(header_name).lower()  # lowercase for consistent matching
            value = b''

            # Support both MultiDict-style and plain-dict access
            if hasattr(headers, 'get_all'):
                # e.g. the get_all method of scrapy.http.Headers
                values = headers.get_all(name_bytes)
                value = b';'.join(values) if values else b''
            elif hasattr(headers, '__getitem__'):
                # plain dict
                try:
                    raw_value = headers[name_bytes]
                    if isinstance(raw_value, list):
                        value = b';'.join(to_bytes(v) for v in raw_value)
                    else:
                        value = to_bytes(raw_value)
                except (KeyError, TypeError):
                    value = b''
            else:
                value = b''

            hash_func.update(name_bytes + b':' + value)

    return hash_func.hexdigest()


def set_request(request: Request, priority: int) -> None:
    request.meta['depth'] = request.meta.setdefault('depth', 0) + 1
    if priority:
        request.priority -= request.meta['depth'] * priority
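As a quick illustration of how the new request_fingerprint might be used for deduplication, here is a sketch that is not taken from the package. Because the Request constructor signature is not shown in this diff, a SimpleNamespace stand-in exposing only the attributes the function reads when include_headers is omitted (method, url, body) is used instead; query-string order does not matter because the URL is canonicalized before hashing.

from types import SimpleNamespace

from crawlo.utils.request import request_fingerprint

seen = set()


def is_duplicate(req) -> bool:
    # Hash method + canonicalized URL + body, then check the fingerprint set.
    fp = request_fingerprint(req)
    if fp in seen:
        return True
    seen.add(fp)
    return False


# Hypothetical request-like objects; only .method, .url and .body are accessed
# when include_headers is not given.
r1 = SimpleNamespace(method='GET', url='https://example.com/?a=1&b=2', body=b'')
r2 = SimpleNamespace(method='GET', url='https://example.com/?b=2&a=1', body=b'')

print(is_duplicate(r1))  # False (first occurrence)
print(is_duplicate(r2))  # True  (canonicalized URL matches r1)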
crawlo/utils/system.py
CHANGED
@@ -1,11 +1,11 @@
All 11 lines are removed and re-added with identical text, so no content-level change is visible in this hunk. The file reads:

#!/usr/bin/python
# -*- coding:UTF-8 -*-
import platform

system_name = platform.system().lower()
if system_name == 'windows':
    import asyncio
    asyncio.set_event_loop_policy(
        asyncio.WindowsSelectorEventLoopPolicy()
    )