crawlo 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (80)
  1. crawlo/__init__.py +9 -6
  2. crawlo/__version__.py +1 -2
  3. crawlo/core/__init__.py +2 -2
  4. crawlo/core/engine.py +158 -158
  5. crawlo/core/processor.py +40 -40
  6. crawlo/core/scheduler.py +57 -59
  7. crawlo/crawler.py +242 -107
  8. crawlo/downloader/__init__.py +78 -78
  9. crawlo/downloader/aiohttp_downloader.py +259 -96
  10. crawlo/downloader/httpx_downloader.py +187 -48
  11. crawlo/downloader/playwright_downloader.py +160 -160
  12. crawlo/event.py +11 -11
  13. crawlo/exceptions.py +64 -64
  14. crawlo/extension/__init__.py +31 -31
  15. crawlo/extension/log_interval.py +49 -49
  16. crawlo/extension/log_stats.py +44 -44
  17. crawlo/filters/__init__.py +37 -37
  18. crawlo/filters/aioredis_filter.py +157 -129
  19. crawlo/filters/memory_filter.py +202 -203
  20. crawlo/filters/redis_filter.py +119 -119
  21. crawlo/items/__init__.py +62 -62
  22. crawlo/items/items.py +118 -118
  23. crawlo/middleware/__init__.py +21 -21
  24. crawlo/middleware/default_header.py +32 -32
  25. crawlo/middleware/download_delay.py +28 -28
  26. crawlo/middleware/middleware_manager.py +140 -140
  27. crawlo/middleware/request_ignore.py +30 -30
  28. crawlo/middleware/response_code.py +18 -18
  29. crawlo/middleware/response_filter.py +26 -26
  30. crawlo/middleware/retry.py +90 -89
  31. crawlo/network/__init__.py +7 -7
  32. crawlo/network/request.py +205 -155
  33. crawlo/network/response.py +166 -93
  34. crawlo/pipelines/__init__.py +13 -13
  35. crawlo/pipelines/console_pipeline.py +39 -39
  36. crawlo/pipelines/mongo_pipeline.py +116 -116
  37. crawlo/pipelines/mysql_batch_pipline.py +133 -133
  38. crawlo/pipelines/mysql_pipeline.py +195 -176
  39. crawlo/pipelines/pipeline_manager.py +56 -56
  40. crawlo/settings/__init__.py +7 -7
  41. crawlo/settings/default_settings.py +93 -89
  42. crawlo/settings/setting_manager.py +99 -99
  43. crawlo/spider/__init__.py +36 -36
  44. crawlo/stats_collector.py +59 -47
  45. crawlo/subscriber.py +106 -27
  46. crawlo/task_manager.py +27 -27
  47. crawlo/templates/item_template.tmpl +21 -21
  48. crawlo/templates/project_template/main.py +32 -32
  49. crawlo/templates/project_template/setting.py +189 -189
  50. crawlo/templates/spider_template.tmpl +30 -30
  51. crawlo/utils/__init__.py +7 -7
  52. crawlo/utils/concurrency_manager.py +125 -0
  53. crawlo/utils/date_tools.py +177 -177
  54. crawlo/utils/func_tools.py +82 -82
  55. crawlo/utils/log.py +39 -39
  56. crawlo/utils/pqueue.py +173 -173
  57. crawlo/utils/project.py +59 -59
  58. crawlo/utils/request.py +122 -85
  59. crawlo/utils/system.py +11 -11
  60. crawlo/utils/tools.py +303 -0
  61. crawlo/utils/url.py +39 -39
  62. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/METADATA +48 -36
  63. crawlo-1.0.3.dist-info/RECORD +80 -0
  64. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/top_level.txt +1 -0
  65. tests/__init__.py +7 -0
  66. tests/baidu_spider/__init__.py +7 -0
  67. tests/baidu_spider/demo.py +94 -0
  68. tests/baidu_spider/items.py +25 -0
  69. tests/baidu_spider/middleware.py +49 -0
  70. tests/baidu_spider/pipeline.py +55 -0
  71. tests/baidu_spider/request_fingerprints.txt +9 -0
  72. tests/baidu_spider/run.py +27 -0
  73. tests/baidu_spider/settings.py +78 -0
  74. tests/baidu_spider/spiders/__init__.py +7 -0
  75. tests/baidu_spider/spiders/bai_du.py +61 -0
  76. tests/baidu_spider/spiders/sina.py +79 -0
  77. crawlo-1.0.1.dist-info/RECORD +0 -67
  78. crawlo-1.0.1.dist-info/licenses/LICENSE +0 -23
  79. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/WHEEL +0 -0
  80. {crawlo-1.0.1.dist-info → crawlo-1.0.3.dist-info}/entry_points.txt +0 -0
crawlo/downloader/playwright_downloader.py CHANGED
@@ -1,161 +1,161 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-from typing import Optional, Dict, Any
-from playwright.async_api import Browser, Page, Response as PlaywrightResponse
-from crawlo import Response, Request
-from crawlo.downloader import DownloaderBase
-
-
-class PlaywrightDownloader(DownloaderBase):
-    def __init__(self, crawler):
-        super().__init__(crawler)
-        # Playwright core objects
-        self.browser: Optional[Browser] = None  # browser instance
-        self.context: Optional[Any] = None  # browser context (isolates cookies, etc.)
-
-        # Configurable parameters (defaults can be overridden via crawler.settings)
-        self._browser_type: str = "chromium"  # browser type (chromium/firefox/webkit)
-        self._headless: bool = True  # headless mode
-        self._timeout: int = 30000  # operation timeout (milliseconds)
-        self._viewport: Dict[str, int] = {"width": 1280, "height": 720}  # viewport size
-        self._extra_launch_args: Dict[str, Any] = {}  # extra browser launch arguments
-
-    async def _init_browser(self):
-        """Initialize the Playwright browser instance."""
-        from playwright.async_api import async_playwright
-
-        # Start the Playwright engine
-        playwright = await async_playwright().start()
-
-        # Choose the browser type according to configuration
-        browser_launcher = {
-            "chromium": playwright.chromium,
-            "firefox": playwright.firefox,
-            "webkit": playwright.webkit
-        }.get(self._browser_type, playwright.chromium)  # default: chromium
-
-        # Launch the browser (with launch arguments)
-        self.browser = await browser_launcher.launch(
-            headless=self._headless,  # headless switch
-            timeout=self._timeout,  # launch timeout
-            **self._extra_launch_args  # pass through extra arguments (e.g. proxy configuration)
-        )
-
-        # Create a browser context (isolated environment)
-        self.context = await self.browser.new_context(
-            viewport=self._viewport,  # window size
-            user_agent=self.crawler.settings.get("USER_AGENT")  # custom UA
-        )
-
-    def open(self):
-        """Load parameters from the crawler configuration."""
-        super().open()  # parent-class initialization
-
-        # Read configuration (can be overridden in settings.py)
-        self._browser_type = self.crawler.settings.get("PLAYWRIGHT_BROWSER", "chromium")
-        self._headless = self.crawler.settings.get_bool("HEADLESS", True)
-        self._timeout = self.crawler.settings.get_int("PLAYWRIGHT_TIMEOUT", 30000)
-        self._viewport = self.crawler.settings.get_dict("VIEWPORT", {"width": 1280, "height": 720})
-        self._extra_launch_args = self.crawler.settings.get_dict("PLAYWRIGHT_LAUNCH_ARGS", {})
-
-    async def download(self, request: Request) -> Response:
-        """
-        Core download method:
-        1. create a new page (tab)
-        2. load the target URL
-        3. capture the rendered content
-        """
-        if not self.browser:
-            await self._init_browser()  # lazily start the browser
-
-        page = await self.context.new_page()  # a separate Page per request (automatic isolation)
-
-        try:
-            # Set request headers (mimic a browser)
-            if request.headers:
-                await page.set_extra_http_headers(request.headers)
-
-            # Navigate to the target URL (wait strategy is configurable)
-            response = await page.goto(
-                request.url,
-                timeout=self._timeout,
-                wait_until="domcontentloaded"  # wait strategy: domcontentloaded/networkidle/load
-            )
-
-            # Special handling for POST requests (Playwright limitation, must go through the API)
-            if request.method.lower() == "post":
-                return await self._handle_post_request(request, page)
-
-            # Run custom JavaScript (for extracting dynamic data)
-            if request.meta.get("execute_js"):
-                result = await page.evaluate(request.meta["execute_js"])
-                request.meta["js_result"] = result  # store the JS result
-
-            # Get the fully rendered HTML (including dynamically generated content)
-            body = await page.content()
-
-            # Take a screenshot in debug mode (to diagnose page issues)
-            if self.crawler.settings.get_bool("DEBUG"):
-                screenshot = await page.screenshot(type="png")
-                request.meta["screenshot"] = screenshot  # store the screenshot in request.meta
-
-            # Build the unified response object
-            return self._structure_response(request, response, body)
-
-        except Exception as e:
-            self.logger.error(f"Page download failed: {str(e)}")
-            raise
-        finally:
-            await page.close()  # always close the page to avoid resource leaks
-
-    async def _handle_post_request(self, request: Request, page: Page) -> Response:
-        """
-        Special handling for POST requests:
-        send the POST via the in-page fetch API and listen for the response.
-        """
-        async with page.expect_response(request.url) as response_info:
-            # Execute fetch in the page context
-            await page.evaluate(
-                """async ({url, headers, body}) => {
-                    await fetch(url, {
-                        method: 'POST',
-                        headers: headers,
-                        body: body
-                    });
-                }""",
-                {
-                    "url": request.url,
-                    "headers": request.headers or {},
-                    "body": request.body or ""
-                }
-            )
-
-        response = await response_info.value  # the API response
-        body = await response.text()  # read the response body
-        return self._structure_response(request, response, body)
-
-    @staticmethod
-    def _structure_response(
-            request: Request,
-            response: PlaywrightResponse,
-            body: str
-    ) -> Response:
-        """
-        Normalize the response format:
-        convert the Playwright response into crawlo's unified Response object.
-        """
-        return Response(
-            url=str(response.url),  # final URL (after redirects)
-            headers=response.headers,  # response headers
-            status_code=response.status,  # HTTP status code
-            body=body.encode('utf-8'),  # response body (as bytes)
-            request=request  # the associated request object
-        )
-
-    async def close(self) -> None:
-        """Clean up resources: close the browser instance and context."""
-        if self.context:
-            await self.context.close()
-        if self.browser:
-            await self.browser.close()
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+from typing import Optional, Dict, Any
+from playwright.async_api import Browser, Page, Response as PlaywrightResponse
+from crawlo import Response, Request
+from crawlo.downloader import DownloaderBase
+
+
+class PlaywrightDownloader(DownloaderBase):
+    def __init__(self, crawler):
+        super().__init__(crawler)
+        # Playwright core objects
+        self.browser: Optional[Browser] = None  # browser instance
+        self.context: Optional[Any] = None  # browser context (isolates cookies, etc.)
+
+        # Configurable parameters (defaults can be overridden via crawler.settings)
+        self._browser_type: str = "chromium"  # browser type (chromium/firefox/webkit)
+        self._headless: bool = True  # headless mode
+        self._timeout: int = 30000  # operation timeout (milliseconds)
+        self._viewport: Dict[str, int] = {"width": 1280, "height": 720}  # viewport size
+        self._extra_launch_args: Dict[str, Any] = {}  # extra browser launch arguments
+
+    async def _init_browser(self):
+        """Initialize the Playwright browser instance."""
+        from playwright.async_api import async_playwright
+
+        # Start the Playwright engine
+        playwright = await async_playwright().start()
+
+        # Choose the browser type according to configuration
+        browser_launcher = {
+            "chromium": playwright.chromium,
+            "firefox": playwright.firefox,
+            "webkit": playwright.webkit
+        }.get(self._browser_type, playwright.chromium)  # default: chromium
+
+        # Launch the browser (with launch arguments)
+        self.browser = await browser_launcher.launch(
+            headless=self._headless,  # headless switch
+            timeout=self._timeout,  # launch timeout
+            **self._extra_launch_args  # pass through extra arguments (e.g. proxy configuration)
+        )
+
+        # Create a browser context (isolated environment)
+        self.context = await self.browser.new_context(
+            viewport=self._viewport,  # window size
+            user_agent=self.crawler.settings.get("USER_AGENT")  # custom UA
+        )
+
+    def open(self):
+        """Load parameters from the crawler configuration."""
+        super().open()  # parent-class initialization
+
+        # Read configuration (can be overridden in settings.py)
+        self._browser_type = self.crawler.settings.get("PLAYWRIGHT_BROWSER", "chromium")
+        self._headless = self.crawler.settings.get_bool("HEADLESS", True)
+        self._timeout = self.crawler.settings.get_int("PLAYWRIGHT_TIMEOUT", 30000)
+        self._viewport = self.crawler.settings.get_dict("VIEWPORT", {"width": 1280, "height": 720})
+        self._extra_launch_args = self.crawler.settings.get_dict("PLAYWRIGHT_LAUNCH_ARGS", {})
+
+    async def download(self, request: Request) -> Response:
+        """
+        Core download method:
+        1. create a new page (tab)
+        2. load the target URL
+        3. capture the rendered content
+        """
+        if not self.browser:
+            await self._init_browser()  # lazily start the browser
+
+        page = await self.context.new_page()  # a separate Page per request (automatic isolation)
+
+        try:
+            # Set request headers (mimic a browser)
+            if request.headers:
+                await page.set_extra_http_headers(request.headers)
+
+            # Navigate to the target URL (wait strategy is configurable)
+            response = await page.goto(
+                request.url,
+                timeout=self._timeout,
+                wait_until="domcontentloaded"  # wait strategy: domcontentloaded/networkidle/load
+            )
+
+            # Special handling for POST requests (Playwright limitation, must go through the API)
+            if request.method.lower() == "post":
+                return await self._handle_post_request(request, page)
+
+            # Run custom JavaScript (for extracting dynamic data)
+            if request.meta.get("execute_js"):
+                result = await page.evaluate(request.meta["execute_js"])
+                request.meta["js_result"] = result  # store the JS result
+
+            # Get the fully rendered HTML (including dynamically generated content)
+            body = await page.content()
+
+            # Take a screenshot in debug mode (to diagnose page issues)
+            if self.crawler.settings.get_bool("DEBUG"):
+                screenshot = await page.screenshot(type="png")
+                request.meta["screenshot"] = screenshot  # store the screenshot in request.meta
+
+            # Build the unified response object
+            return self._structure_response(request, response, body)
+
+        except Exception as e:
+            self.logger.error(f"Page download failed: {str(e)}")
+            raise
+        finally:
+            await page.close()  # always close the page to avoid resource leaks
+
+    async def _handle_post_request(self, request: Request, page: Page) -> Response:
+        """
+        Special handling for POST requests:
+        send the POST via the in-page fetch API and listen for the response.
+        """
+        async with page.expect_response(request.url) as response_info:
+            # Execute fetch in the page context
+            await page.evaluate(
+                """async ({url, headers, body}) => {
+                    await fetch(url, {
+                        method: 'POST',
+                        headers: headers,
+                        body: body
+                    });
+                }""",
+                {
+                    "url": request.url,
+                    "headers": request.headers or {},
+                    "body": request.body or ""
+                }
+            )
+
+        response = await response_info.value  # the API response
+        body = await response.text()  # read the response body
+        return self._structure_response(request, response, body)
+
+    @staticmethod
+    def _structure_response(
+            request: Request,
+            response: PlaywrightResponse,
+            body: str
+    ) -> Response:
+        """
+        Normalize the response format:
+        convert the Playwright response into crawlo's unified Response object.
+        """
+        return Response(
+            url=str(response.url),  # final URL (after redirects)
+            headers=response.headers,  # response headers
+            status_code=response.status,  # HTTP status code
+            body=body.encode('utf-8'),  # response body (as bytes)
+            request=request  # the associated request object
+        )
+
+    async def close(self) -> None:
+        """Clean up resources: close the browser instance and context."""
+        if self.context:
+            await self.context.close()
+        if self.browser:
+            await self.browser.close()
         await super().close()  # parent-class cleanup
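The open() method above pulls its configuration from crawler.settings, so the keys visible in this diff (PLAYWRIGHT_BROWSER, HEADLESS, PLAYWRIGHT_TIMEOUT, VIEWPORT, PLAYWRIGHT_LAUNCH_ARGS, plus USER_AGENT and DEBUG) can be overridden in a project's settings module. A minimal sketch, assuming a flat settings.py like the project template listed above; the values themselves are illustrative:

# settings.py -- illustrative overrides for PlaywrightDownloader
PLAYWRIGHT_BROWSER = "firefox"              # chromium / firefox / webkit
HEADLESS = False                            # show the browser window
PLAYWRIGHT_TIMEOUT = 60000                  # launch/navigation timeout in milliseconds
VIEWPORT = {"width": 1920, "height": 1080}
PLAYWRIGHT_LAUNCH_ARGS = {"proxy": {"server": "http://127.0.0.1:8080"}}  # forwarded to launch()
USER_AGENT = "Mozilla/5.0 (compatible; crawlo)"
DEBUG = True                                # stores a PNG screenshot in request.meta["screenshot"]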
crawlo/event.py CHANGED
@@ -1,11 +1,11 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-
-spider_error = "spider_error"
-spider_opened = "spider_open"
-spider_closed = "spider_closed"
-ignore_request = "ignore_request"
-request_scheduled = "request_scheduled"
-response_received = "request_received"
-item_successful = "item_successful"
-item_discard = "item_discard"
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+
+spider_error = "spider_error"
+spider_opened = "spider_open"
+spider_closed = "spider_closed"
+ignore_request = "ignore_request"
+request_scheduled = "request_scheduled"
+response_received = "request_received"
+item_successful = "item_successful"
+item_discard = "item_discard"
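These constants are the event keys that components subscribe to through crawler.subscriber; LogIntervalExtension further down in this diff uses exactly this pattern. A minimal hedged sketch of hooking into two of them (only the create_instance hook and the subscribe(callback, event=...) signature are taken from the code in this diff; the class itself is illustrative):

from crawlo.event import spider_opened, spider_closed


class LifecycleLogger:
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def create_instance(cls, crawler):
        o = cls(crawler)
        # Same subscription pattern used by LogIntervalExtension below
        crawler.subscriber.subscribe(o.on_open, event=spider_opened)
        crawler.subscriber.subscribe(o.on_close, event=spider_closed)
        return o

    async def on_open(self):
        print("spider opened")

    async def on_close(self):
        print("spider closed")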
crawlo/exceptions.py CHANGED
@@ -1,64 +1,64 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-class TransformTypeError(TypeError):
-    pass
-
-
-class OutputError(Exception):
-    pass
-
-
-class SpiderTypeError(TypeError):
-    pass
-
-
-class ItemInitError(Exception):
-    pass
-
-
-class ItemAttributeError(Exception):
-    pass
-
-
-class DecodeError(Exception):
-    pass
-
-
-class MiddlewareInitError(Exception):
-    pass
-
-
-class PipelineInitError(Exception):
-    pass
-
-
-class InvalidOutputError(Exception):
-    pass
-
-
-class RequestMethodError(Exception):
-    pass
-
-
-class IgnoreRequestError(Exception):
-    def __init__(self, msg):
-        self.msg = msg
-        super(IgnoreRequestError, self).__init__(msg)
-
-
-class ItemDiscard(Exception):
-    def __init__(self, msg):
-        self.msg = msg
-        super(ItemDiscard, self).__init__(msg)
-
-
-class NotConfiguredError(Exception):
-    pass
-
-
-class ExtensionInitError(Exception):
-    pass
-
-
-class ReceiverTypeError(Exception):
-    pass
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+class TransformTypeError(TypeError):
+    pass
+
+
+class OutputError(Exception):
+    pass
+
+
+class SpiderTypeError(TypeError):
+    pass
+
+
+class ItemInitError(Exception):
+    pass
+
+
+class ItemAttributeError(Exception):
+    pass
+
+
+class DecodeError(Exception):
+    pass
+
+
+class MiddlewareInitError(Exception):
+    pass
+
+
+class PipelineInitError(Exception):
+    pass
+
+
+class InvalidOutputError(Exception):
+    pass
+
+
+class RequestMethodError(Exception):
+    pass
+
+
+class IgnoreRequestError(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+        super(IgnoreRequestError, self).__init__(msg)
+
+
+class ItemDiscard(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+        super(ItemDiscard, self).__init__(msg)
+
+
+class NotConfiguredError(Exception):
+    pass
+
+
+class ExtensionInitError(Exception):
+    pass
+
+
+class ReceiverTypeError(Exception):
+    pass
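IgnoreRequestError and ItemDiscard are the two exceptions that carry a message; judging by the middleware and pipeline modules listed above (request_ignore.py, pipeline_manager.py), they act as control-flow signals for skipping requests and dropping items. A hedged sketch of raising ItemDiscard from a pipeline; the process_item signature and dict-style item access are assumptions, since those interfaces are not part of this diff:

from crawlo.exceptions import ItemDiscard


class DropEmptyTitlePipeline:
    async def process_item(self, item, spider):  # signature assumed for illustration
        if not item.get("title"):
            # The pipeline manager is expected to treat this as a discarded item
            raise ItemDiscard("missing title")
        return item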
crawlo/extension/__init__.py CHANGED
@@ -1,31 +1,31 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-from typing import List
-from pprint import pformat
-
-from crawlo.utils.log import get_logger
-from crawlo.utils.project import load_class
-from crawlo.exceptions import ExtensionInitError
-
-
-class ExtensionManager(object):
-
-    def __init__(self, crawler):
-        self.crawler = crawler
-        self.extensions: List = []
-        extensions = self.crawler.settings.get_list('EXTENSIONS')
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
-        self._add_extensions(extensions)
-
-    @classmethod
-    def create_instance(cls, *args, **kwargs):
-        return cls(*args, **kwargs)
-
-    def _add_extensions(self, extensions):
-        for extension in extensions:
-            extension_cls = load_class(extension)
-            if not hasattr(extension_cls, 'create_instance'):
-                raise ExtensionInitError(f"extension init failed, Must have method 'create_instance()")
-            self.extensions.append(extension_cls.create_instance(self.crawler))
-        if extensions:
-            self.logger.info(f"enabled extensions: \n {pformat(extensions)}")
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+from typing import List
+from pprint import pformat
+
+from crawlo.utils.log import get_logger
+from crawlo.utils.project import load_class
+from crawlo.exceptions import ExtensionInitError
+
+
+class ExtensionManager(object):
+
+    def __init__(self, crawler):
+        self.crawler = crawler
+        self.extensions: List = []
+        extensions = self.crawler.settings.get_list('EXTENSIONS')
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+        self._add_extensions(extensions)
+
+    @classmethod
+    def create_instance(cls, *args, **kwargs):
+        return cls(*args, **kwargs)
+
+    def _add_extensions(self, extensions):
+        for extension in extensions:
+            extension_cls = load_class(extension)
+            if not hasattr(extension_cls, 'create_instance'):
+                raise ExtensionInitError(f"extension init failed, Must have method 'create_instance()")
+            self.extensions.append(extension_cls.create_instance(self.crawler))
+        if extensions:
+            self.logger.info(f"enabled extensions: \n {pformat(extensions)}")
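_add_extensions resolves each entry of the EXTENSIONS setting with load_class and rejects any class that lacks a create_instance() hook. Enabling an extension is therefore, roughly, a matter of listing its dotted path in settings; a sketch, with the settings layout assumed from the templates above:

# settings.py -- illustrative
EXTENSIONS = [
    'crawlo.extension.log_interval.LogIntervalExtension',  # shipped with the package (see below)
    'myproject.extensions.MyExtension',  # hypothetical custom class; must define create_instance(crawler)
]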
crawlo/extension/log_interval.py CHANGED
@@ -1,49 +1,49 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-import asyncio
-
-from crawlo.utils.log import get_logger
-from crawlo.event import spider_opened, spider_closed
-
-
-class LogIntervalExtension(object):
-
-    def __init__(self, crawler):
-        self.task = None
-        self.stats = crawler.stats
-        self.item_count = 0
-        self.response_count = 0
-        self.seconds = crawler.settings.get('INTERVAL')
-        self.interval = int(self.seconds / 60) if self.seconds % 60 == 0 else self.seconds
-        self.interval = "" if self.interval == 1 else self.interval
-        self.unit = 'min' if self.seconds % 60 == 0 else 's'
-
-        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
-
-    @classmethod
-    def create_instance(cls, crawler):
-        o = cls(crawler)
-        crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
-        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
-        return o
-
-    async def spider_opened(self):
-        self.task = asyncio.create_task(self.interval_log())
-        await self.task
-
-    async def spider_closed(self):
-        if self.task:
-            self.task.cancel()
-
-    async def interval_log(self):
-        while True:
-            last_item_count = self.stats.get_value('item_successful_count', default=0)
-            last_response_count = self.stats.get_value('response_received_count', default=0)
-            item_rate = last_item_count - self.item_count
-            response_rate = last_response_count - self.response_count
-            self.item_count, self.response_count = last_item_count, last_response_count
-            self.logger.info(
-                f'Crawled {last_response_count} pages (at {response_rate} pages/{self.interval}{self.unit}),'
-                f' Got {last_item_count} items (at {item_rate} items/{self.interval}{self.unit}).'
-            )
-            await asyncio.sleep(self.seconds)
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+import asyncio
+
+from crawlo.utils.log import get_logger
+from crawlo.event import spider_opened, spider_closed
+
+
+class LogIntervalExtension(object):
+
+    def __init__(self, crawler):
+        self.task = None
+        self.stats = crawler.stats
+        self.item_count = 0
+        self.response_count = 0
+        self.seconds = crawler.settings.get('INTERVAL')
+        self.interval = int(self.seconds / 60) if self.seconds % 60 == 0 else self.seconds
+        self.interval = "" if self.interval == 1 else self.interval
+        self.unit = 'min' if self.seconds % 60 == 0 else 's'
+
+        self.logger = get_logger(self.__class__.__name__, crawler.settings.get('LOG_LEVEL'))
+
+    @classmethod
+    def create_instance(cls, crawler):
+        o = cls(crawler)
+        crawler.subscriber.subscribe(o.spider_opened, event=spider_opened)
+        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
+        return o
+
+    async def spider_opened(self):
+        self.task = asyncio.create_task(self.interval_log())
+        await self.task
+
+    async def spider_closed(self):
+        if self.task:
+            self.task.cancel()
+
+    async def interval_log(self):
+        while True:
+            last_item_count = self.stats.get_value('item_successful_count', default=0)
+            last_response_count = self.stats.get_value('response_received_count', default=0)
+            item_rate = last_item_count - self.item_count
+            response_rate = last_response_count - self.response_count
+            self.item_count, self.response_count = last_item_count, last_response_count
+            self.logger.info(
+                f'Crawled {last_response_count} pages (at {response_rate} pages/{self.interval}{self.unit}),'
+                f' Got {last_item_count} items (at {item_rate} items/{self.interval}{self.unit}).'
+            )
+            await asyncio.sleep(self.seconds)
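A worked example of the interval label computed in __init__: with INTERVAL=60, seconds % 60 == 0, so interval becomes int(60 / 60) == 1, which is then blanked out, and unit is 'min', producing rates like "12 pages/min"; with INTERVAL=180 the label is "pages/3min"; with INTERVAL=45 the modulo test fails, so the label is "pages/45s".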