crawlo 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (95)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/__init__.py +2 -2
  4. crawlo/core/engine.py +158 -158
  5. crawlo/core/processor.py +40 -40
  6. crawlo/core/scheduler.py +57 -57
  7. crawlo/crawler.py +424 -242
  8. crawlo/downloader/__init__.py +78 -78
  9. crawlo/downloader/aiohttp_downloader.py +200 -259
  10. crawlo/downloader/cffi_downloader.py +277 -0
  11. crawlo/downloader/httpx_downloader.py +246 -187
  12. crawlo/event.py +11 -11
  13. crawlo/exceptions.py +73 -64
  14. crawlo/extension/__init__.py +31 -31
  15. crawlo/extension/log_interval.py +49 -49
  16. crawlo/extension/log_stats.py +44 -44
  17. crawlo/extension/logging_extension.py +35 -0
  18. crawlo/filters/__init__.py +37 -37
  19. crawlo/filters/aioredis_filter.py +150 -150
  20. crawlo/filters/memory_filter.py +202 -202
  21. crawlo/items/__init__.py +62 -62
  22. crawlo/items/items.py +115 -119
  23. crawlo/middleware/__init__.py +21 -21
  24. crawlo/middleware/default_header.py +32 -32
  25. crawlo/middleware/download_delay.py +28 -28
  26. crawlo/middleware/middleware_manager.py +135 -140
  27. crawlo/middleware/proxy.py +246 -0
  28. crawlo/middleware/request_ignore.py +30 -30
  29. crawlo/middleware/response_code.py +18 -18
  30. crawlo/middleware/response_filter.py +26 -26
  31. crawlo/middleware/retry.py +90 -90
  32. crawlo/network/__init__.py +7 -7
  33. crawlo/network/request.py +203 -204
  34. crawlo/network/response.py +166 -166
  35. crawlo/pipelines/__init__.py +13 -13
  36. crawlo/pipelines/console_pipeline.py +39 -39
  37. crawlo/pipelines/mongo_pipeline.py +116 -116
  38. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  39. crawlo/pipelines/mysql_pipeline.py +195 -195
  40. crawlo/pipelines/pipeline_manager.py +56 -56
  41. crawlo/settings/__init__.py +7 -7
  42. crawlo/settings/default_settings.py +169 -94
  43. crawlo/settings/setting_manager.py +99 -99
  44. crawlo/spider/__init__.py +41 -36
  45. crawlo/stats_collector.py +59 -59
  46. crawlo/subscriber.py +106 -106
  47. crawlo/task_manager.py +27 -27
  48. crawlo/templates/item_template.tmpl +21 -21
  49. crawlo/templates/project_template/main.py +32 -32
  50. crawlo/templates/project_template/setting.py +189 -189
  51. crawlo/templates/spider_template.tmpl +30 -30
  52. crawlo/utils/__init__.py +7 -7
  53. crawlo/utils/concurrency_manager.py +124 -124
  54. crawlo/utils/date_tools.py +233 -177
  55. crawlo/utils/db_helper.py +344 -0
  56. crawlo/utils/func_tools.py +82 -82
  57. crawlo/utils/log.py +129 -39
  58. crawlo/utils/pqueue.py +173 -173
  59. crawlo/utils/project.py +59 -59
  60. crawlo/utils/request.py +267 -122
  61. crawlo/utils/system.py +11 -11
  62. crawlo/utils/tools.py +5 -303
  63. crawlo/utils/url.py +39 -39
  64. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
  65. crawlo-1.0.5.dist-info/RECORD +84 -0
  66. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
  67. examples/__init__.py +0 -0
  68. examples/gxb/__init__.py +0 -0
  69. examples/gxb/items.py +36 -0
  70. examples/gxb/run.py +15 -0
  71. examples/gxb/settings.py +71 -0
  72. examples/gxb/spider/__init__.py +0 -0
  73. examples/gxb/spider/miit_spider.py +180 -0
  74. examples/gxb/spider/telecom_device_licenses.py +129 -0
  75. tests/__init__.py +7 -7
  76. tests/test_proxy_health_check.py +33 -0
  77. tests/test_proxy_middleware_integration.py +137 -0
  78. tests/test_proxy_providers.py +57 -0
  79. tests/test_proxy_stats.py +20 -0
  80. tests/test_proxy_strategies.py +60 -0
  81. crawlo/downloader/playwright_downloader.py +0 -161
  82. crawlo-1.0.4.dist-info/RECORD +0 -79
  83. tests/baidu_spider/__init__.py +0 -7
  84. tests/baidu_spider/demo.py +0 -94
  85. tests/baidu_spider/items.py +0 -25
  86. tests/baidu_spider/middleware.py +0 -49
  87. tests/baidu_spider/pipeline.py +0 -55
  88. tests/baidu_spider/request_fingerprints.txt +0 -9
  89. tests/baidu_spider/run.py +0 -27
  90. tests/baidu_spider/settings.py +0 -80
  91. tests/baidu_spider/spiders/__init__.py +0 -7
  92. tests/baidu_spider/spiders/bai_du.py +0 -61
  93. tests/baidu_spider/spiders/sina.py +0 -79
  94. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
  95. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
@@ -1,166 +1,166 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- import re
4
- import ujson
5
- from typing import Dict, Any, List, Optional
6
- from parsel import Selector, SelectorList
7
- from http.cookies import SimpleCookie
8
- from urllib.parse import urljoin as _urljoin
9
-
10
- from crawlo import Request
11
- from crawlo.exceptions import DecodeError
12
-
13
-
14
- class Response:
15
- """
16
- HTTP响应的封装,提供数据解析的便捷方法。
17
- """
18
-
19
- def __init__(
20
- self,
21
- url: str,
22
- *,
23
- headers: Dict[str, Any],
24
- body: bytes = b"",
25
- method: str = 'GET',
26
- request: Request = None,
27
- status_code: int = 200,
28
- ):
29
- self.url = url
30
- self.headers = headers
31
- self.body = body
32
- self.method = method
33
- self.request = request
34
- self.status_code = status_code
35
- self.encoding = self.request.encoding if self.request else None
36
- self._text_cache = None
37
- self._json_cache = None
38
- self._selector_instance = None # 修改变量名,避免与 @property 冲突
39
-
40
- @property
41
- def text(self) -> str:
42
- """将响应体(body)以正确的编码解码为字符串,并缓存结果。"""
43
- if self._text_cache is not None:
44
- return self._text_cache
45
-
46
- encoding = self.encoding
47
- try:
48
- # 优先使用 request 提供的编码
49
- if encoding:
50
- self._text_cache = self.body.decode(encoding)
51
- return self._text_cache
52
-
53
- # 从 Content-Type 中提取编码
54
- content_type = self.headers.get("Content-Type", "")
55
- charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
56
- if charset_match:
57
- encoding = charset_match.group(1)
58
- self._text_cache = self.body.decode(encoding)
59
- return self._text_cache
60
-
61
- # 默认尝试 UTF-8
62
- self._text_cache = self.body.decode("utf-8")
63
- return self._text_cache
64
-
65
- except UnicodeDecodeError as e:
66
- raise DecodeError(f"Failed to decode response from {self.url}: {e}")
67
-
68
- def json(self) -> Any:
69
- """将响应文本解析为 JSON 对象。"""
70
- if self._json_cache:
71
- return self._json_cache
72
- self._json_cache = ujson.loads(self.text)
73
- return self._json_cache
74
-
75
- def urljoin(self, url: str) -> str:
76
- """拼接 URL,自动处理相对路径。"""
77
- return _urljoin(self.url, url)
78
-
79
- @property
80
- def _selector(self) -> Selector:
81
- """懒加载 Selector 实例"""
82
- if self._selector_instance is None:
83
- self._selector_instance = Selector(self.text)
84
- return self._selector_instance
85
-
86
- def xpath(self, query: str) -> SelectorList:
87
- """使用 XPath 选择器查询文档。"""
88
- return self._selector.xpath(query)
89
-
90
- def css(self, query: str) -> SelectorList:
91
- """使用 CSS 选择器查询文档。"""
92
- return self._selector.css(query)
93
-
94
- def xpath_text(self, query: str) -> str:
95
- """使用 XPath 提取并返回纯文本。"""
96
- fragments = self.xpath(f"{query}//text()").getall()
97
- return " ".join(text.strip() for text in fragments if text.strip())
98
-
99
- def css_text(self, query: str) -> str:
100
- """使用 CSS 选择器提取并返回纯文本。"""
101
- fragments = self.css(f"{query} ::text").getall()
102
- return " ".join(text.strip() for text in fragments if text.strip())
103
-
104
- def get_text(self, xpath_or_css: str, join_str: str = " ") -> str:
105
- """
106
- 获取指定节点的纯文本(自动拼接子节点文本)
107
-
108
- 参数:
109
- xpath_or_css: XPath或CSS选择器
110
- join_str: 文本拼接分隔符(默认为空格)
111
-
112
- 返回:
113
- 拼接后的纯文本字符串
114
- """
115
- elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
116
- texts = elements.xpath('.//text()').getall()
117
- return join_str.join(text.strip() for text in texts if text.strip())
118
-
119
- def get_all_text(self, xpath_or_css: str, join_str: str = " ") -> List[str]:
120
- """
121
- 获取多个节点的纯文本列表
122
-
123
- 参数:
124
- xpath_or_css: XPath或CSS选择器
125
- join_str: 单个节点内文本拼接分隔符
126
-
127
- 返回:
128
- 纯文本列表(每个元素对应一个节点的文本)
129
- """
130
- elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
131
- result = []
132
- for element in elements:
133
- texts = element.xpath('.//text()').getall()
134
- clean_text = join_str.join(text.strip() for text in texts if text.strip())
135
- if clean_text:
136
- result.append(clean_text)
137
- return result
138
-
139
- def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
140
- """在响应文本上执行正则表达式搜索。"""
141
- if not isinstance(pattern, str):
142
- raise TypeError("Pattern must be a string")
143
- return re.search(pattern, self.text, flags=flags)
144
-
145
- def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
146
- """在响应文本上执行正则表达式查找。"""
147
- if not isinstance(pattern, str):
148
- raise TypeError("Pattern must be a string")
149
- return re.findall(pattern, self.text, flags=flags)
150
-
151
- def get_cookies(self) -> Dict[str, str]:
152
- """从响应头中解析并返回Cookies。"""
153
- cookie_header = self.headers.get("Set-Cookie", "")
154
- if isinstance(cookie_header, list):
155
- cookie_header = ", ".join(cookie_header)
156
- cookies = SimpleCookie()
157
- cookies.load(cookie_header)
158
- return {key: morsel.value for key, morsel in cookies.items()}
159
-
160
- @property
161
- def meta(self) -> Dict:
162
- """获取关联的 Request 对象的 meta 字典。"""
163
- return self.request.meta if self.request else {}
164
-
165
- def __str__(self):
166
- return f"<{self.status_code} {self.url}>"
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ import re
4
+ import ujson
5
+ from http.cookies import SimpleCookie
6
+ from parsel import Selector, SelectorList
7
+ from typing import Dict, Any, List, Optional
8
+ from urllib.parse import urljoin as _urljoin
9
+
10
+ from crawlo import Request
11
+ from crawlo.exceptions import DecodeError
12
+
13
+
14
+ class Response:
15
+ """
16
+ HTTP响应的封装,提供数据解析的便捷方法。
17
+ """
18
+
19
+ def __init__(
20
+ self,
21
+ url: str,
22
+ *,
23
+ headers: Dict[str, Any],
24
+ body: bytes = b"",
25
+ method: str = 'GET',
26
+ request: Request = None,
27
+ status_code: int = 200,
28
+ ):
29
+ self.url = url
30
+ self.headers = headers
31
+ self.body = body
32
+ self.method = method
33
+ self.request = request
34
+ self.status_code = status_code
35
+ self.encoding = self.request.encoding if self.request else None
36
+ self._text_cache = None
37
+ self._json_cache = None
38
+ self._selector_instance = None # 修改变量名,避免与 @property 冲突
39
+
40
+ @property
41
+ def text(self) -> str:
42
+ """将响应体(body)以正确的编码解码为字符串,并缓存结果。"""
43
+ if self._text_cache is not None:
44
+ return self._text_cache
45
+
46
+ encoding = self.encoding
47
+ try:
48
+ # 优先使用 request 提供的编码
49
+ if encoding:
50
+ self._text_cache = self.body.decode(encoding)
51
+ return self._text_cache
52
+
53
+ # 从 Content-Type 中提取编码
54
+ content_type = self.headers.get("Content-Type", "")
55
+ charset_match = re.search(r"charset=([\w-]+)", content_type, re.I)
56
+ if charset_match:
57
+ encoding = charset_match.group(1)
58
+ self._text_cache = self.body.decode(encoding)
59
+ return self._text_cache
60
+
61
+ # 默认尝试 UTF-8
62
+ self._text_cache = self.body.decode("utf-8")
63
+ return self._text_cache
64
+
65
+ except UnicodeDecodeError as e:
66
+ raise DecodeError(f"Failed to decode response from {self.url}: {e}")
67
+
68
+ def json(self) -> Any:
69
+ """将响应文本解析为 JSON 对象。"""
70
+ if self._json_cache:
71
+ return self._json_cache
72
+ self._json_cache = ujson.loads(self.text)
73
+ return self._json_cache
74
+
75
+ def urljoin(self, url: str) -> str:
76
+ """拼接 URL,自动处理相对路径。"""
77
+ return _urljoin(self.url, url)
78
+
79
+ @property
80
+ def _selector(self) -> Selector:
81
+ """懒加载 Selector 实例"""
82
+ if self._selector_instance is None:
83
+ self._selector_instance = Selector(self.text)
84
+ return self._selector_instance
85
+
86
+ def xpath(self, query: str) -> SelectorList:
87
+ """使用 XPath 选择器查询文档。"""
88
+ return self._selector.xpath(query)
89
+
90
+ def css(self, query: str) -> SelectorList:
91
+ """使用 CSS 选择器查询文档。"""
92
+ return self._selector.css(query)
93
+
94
+ def xpath_text(self, query: str) -> str:
95
+ """使用 XPath 提取并返回纯文本。"""
96
+ fragments = self.xpath(f"{query}//text()").getall()
97
+ return " ".join(text.strip() for text in fragments if text.strip())
98
+
99
+ def css_text(self, query: str) -> str:
100
+ """使用 CSS 选择器提取并返回纯文本。"""
101
+ fragments = self.css(f"{query} ::text").getall()
102
+ return " ".join(text.strip() for text in fragments if text.strip())
103
+
104
+ def get_text(self, xpath_or_css: str, join_str: str = " ") -> str:
105
+ """
106
+ 获取指定节点的纯文本(自动拼接子节点文本)
107
+
108
+ 参数:
109
+ xpath_or_css: XPath或CSS选择器
110
+ join_str: 文本拼接分隔符(默认为空格)
111
+
112
+ 返回:
113
+ 拼接后的纯文本字符串
114
+ """
115
+ elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
116
+ texts = elements.xpath('.//text()').getall()
117
+ return join_str.join(text.strip() for text in texts if text.strip())
118
+
119
+ def get_all_text(self, xpath_or_css: str, join_str: str = " ") -> List[str]:
120
+ """
121
+ 获取多个节点的纯文本列表
122
+
123
+ 参数:
124
+ xpath_or_css: XPath或CSS选择器
125
+ join_str: 单个节点内文本拼接分隔符
126
+
127
+ 返回:
128
+ 纯文本列表(每个元素对应一个节点的文本)
129
+ """
130
+ elements = self.xpath(xpath_or_css) if xpath_or_css.startswith(('/', '//', './')) else self.css(xpath_or_css)
131
+ result = []
132
+ for element in elements:
133
+ texts = element.xpath('.//text()').getall()
134
+ clean_text = join_str.join(text.strip() for text in texts if text.strip())
135
+ if clean_text:
136
+ result.append(clean_text)
137
+ return result
138
+
139
+ def re_search(self, pattern: str, flags: int = re.DOTALL) -> Optional[re.Match]:
140
+ """在响应文本上执行正则表达式搜索。"""
141
+ if not isinstance(pattern, str):
142
+ raise TypeError("Pattern must be a string")
143
+ return re.search(pattern, self.text, flags=flags)
144
+
145
+ def re_findall(self, pattern: str, flags: int = re.DOTALL) -> List[Any]:
146
+ """在响应文本上执行正则表达式查找。"""
147
+ if not isinstance(pattern, str):
148
+ raise TypeError("Pattern must be a string")
149
+ return re.findall(pattern, self.text, flags=flags)
150
+
151
+ def get_cookies(self) -> Dict[str, str]:
152
+ """从响应头中解析并返回Cookies。"""
153
+ cookie_header = self.headers.get("Set-Cookie", "")
154
+ if isinstance(cookie_header, list):
155
+ cookie_header = ", ".join(cookie_header)
156
+ cookies = SimpleCookie()
157
+ cookies.load(cookie_header)
158
+ return {key: morsel.value for key, morsel in cookies.items()}
159
+
160
+ @property
161
+ def meta(self) -> Dict:
162
+ """获取关联的 Request 对象的 meta 字典。"""
163
+ return self.request.meta if self.request else {}
164
+
165
+ def __str__(self):
166
+ return f"<{self.status_code} {self.url}>"
@@ -1,13 +1,13 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- from crawlo.items.items import Item
4
-
5
-
6
- class BasePipeline:
7
-
8
- def process_item(self, item: Item, spider):
9
- raise NotImplementedError
10
-
11
- @classmethod
12
- def create_instance(cls, crawler):
13
- return cls()
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ from crawlo.items.items import Item
4
+
5
+
6
+ class BasePipeline:
7
+
8
+ def process_item(self, item: Item, spider):
9
+ raise NotImplementedError
10
+
11
+ @classmethod
12
+ def create_instance(cls, crawler):
13
+ return cls()
@@ -1,40 +1,40 @@
1
- #!/usr/bin/python
2
- # -*- coding:UTF-8 -*-
3
- from typing import Dict, Any
4
-
5
- from crawlo import Item
6
- from crawlo.spider import Spider
7
- from crawlo.utils.log import get_logger
8
-
9
-
10
- class ConsolePipeline:
11
- """将Item内容输出到控制台的管道"""
12
-
13
- def __init__(self, log_level: str = "DEBUG"):
14
- self.logger = get_logger(self.__class__.__name__, log_level)
15
-
16
- @classmethod
17
- def from_crawler(cls, crawler):
18
- """从crawler实例创建管道"""
19
- return cls(
20
- log_level=crawler.settings.get('LOG_LEVEL', 'DEBUG')
21
- )
22
-
23
- async def process_item(self, item: Item, spider: Spider) -> Item:
24
- """处理Item并输出到日志"""
25
- try:
26
- item_dict = self._convert_to_serializable(item)
27
- self.logger.info(f"Item processed: {item_dict}")
28
- return item
29
- except Exception as e:
30
- self.logger.error(f"Error processing item: {e}", exc_info=True)
31
- raise
32
-
33
- @staticmethod
34
- def _convert_to_serializable(item: Item) -> Dict[str, Any]:
35
- """将Item转换为可序列化的字典"""
36
- try:
37
- return item.to_dict()
38
- except AttributeError:
39
- # 兼容没有to_dict方法的Item实现
1
+ #!/usr/bin/python
2
+ # -*- coding:UTF-8 -*-
3
+ from typing import Dict, Any
4
+
5
+ from crawlo import Item
6
+ from crawlo.spider import Spider
7
+ from crawlo.utils.log import get_logger
8
+
9
+
10
+ class ConsolePipeline:
11
+ """将Item内容输出到控制台的管道"""
12
+
13
+ def __init__(self, log_level: str = "DEBUG"):
14
+ self.logger = get_logger(self.__class__.__name__, log_level)
15
+
16
+ @classmethod
17
+ def from_crawler(cls, crawler):
18
+ """从crawler实例创建管道"""
19
+ return cls(
20
+ log_level=crawler.settings.get('LOG_LEVEL', 'DEBUG')
21
+ )
22
+
23
+ async def process_item(self, item: Item, spider: Spider) -> Item:
24
+ """处理Item并输出到日志"""
25
+ try:
26
+ item_dict = self._convert_to_serializable(item)
27
+ self.logger.info(f"Item processed: {item_dict}")
28
+ return item
29
+ except Exception as e:
30
+ self.logger.error(f"Error processing item: {e}", exc_info=True)
31
+ raise
32
+
33
+ @staticmethod
34
+ def _convert_to_serializable(item: Item) -> Dict[str, Any]:
35
+ """将Item转换为可序列化的字典"""
36
+ try:
37
+ return item.to_dict()
38
+ except AttributeError:
39
+ # 兼容没有to_dict方法的Item实现
40
40
  return dict(item)