crawlo 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo has been flagged as possibly problematic.

Files changed (112)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +41 -0
  4. crawlo/commands/__init__.py +10 -0
  5. crawlo/commands/genspider.py +111 -0
  6. crawlo/commands/run.py +149 -0
  7. crawlo/commands/startproject.py +101 -0
  8. crawlo/core/__init__.py +2 -2
  9. crawlo/core/engine.py +158 -158
  10. crawlo/core/processor.py +40 -40
  11. crawlo/core/scheduler.py +57 -57
  12. crawlo/crawler.py +219 -242
  13. crawlo/downloader/__init__.py +78 -78
  14. crawlo/downloader/aiohttp_downloader.py +200 -259
  15. crawlo/downloader/cffi_downloader.py +277 -0
  16. crawlo/downloader/httpx_downloader.py +246 -187
  17. crawlo/event.py +11 -11
  18. crawlo/exceptions.py +78 -64
  19. crawlo/extension/__init__.py +31 -31
  20. crawlo/extension/log_interval.py +49 -49
  21. crawlo/extension/log_stats.py +44 -44
  22. crawlo/extension/logging_extension.py +35 -0
  23. crawlo/filters/__init__.py +37 -37
  24. crawlo/filters/aioredis_filter.py +150 -150
  25. crawlo/filters/memory_filter.py +202 -202
  26. crawlo/items/__init__.py +22 -62
  27. crawlo/items/base.py +31 -0
  28. crawlo/items/fields.py +54 -0
  29. crawlo/items/items.py +105 -119
  30. crawlo/middleware/__init__.py +21 -21
  31. crawlo/middleware/default_header.py +32 -32
  32. crawlo/middleware/download_delay.py +28 -28
  33. crawlo/middleware/middleware_manager.py +135 -140
  34. crawlo/middleware/proxy.py +246 -0
  35. crawlo/middleware/request_ignore.py +30 -30
  36. crawlo/middleware/response_code.py +18 -18
  37. crawlo/middleware/response_filter.py +26 -26
  38. crawlo/middleware/retry.py +90 -90
  39. crawlo/network/__init__.py +7 -7
  40. crawlo/network/request.py +203 -204
  41. crawlo/network/response.py +166 -166
  42. crawlo/pipelines/__init__.py +13 -13
  43. crawlo/pipelines/console_pipeline.py +39 -39
  44. crawlo/pipelines/mongo_pipeline.py +116 -116
  45. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  46. crawlo/pipelines/mysql_pipeline.py +195 -195
  47. crawlo/pipelines/pipeline_manager.py +56 -56
  48. crawlo/settings/__init__.py +7 -7
  49. crawlo/settings/default_settings.py +169 -94
  50. crawlo/settings/setting_manager.py +99 -99
  51. crawlo/spider/__init__.py +41 -36
  52. crawlo/stats_collector.py +59 -59
  53. crawlo/subscriber.py +106 -106
  54. crawlo/task_manager.py +27 -27
  55. crawlo/templates/crawlo.cfg.tmpl +11 -0
  56. crawlo/templates/project/__init__.py.tmpl +4 -0
  57. crawlo/templates/project/items.py.tmpl +18 -0
  58. crawlo/templates/project/middlewares.py.tmpl +76 -0
  59. crawlo/templates/project/pipelines.py.tmpl +64 -0
  60. crawlo/templates/project/settings.py.tmpl +54 -0
  61. crawlo/templates/project/spiders/__init__.py.tmpl +6 -0
  62. crawlo/templates/spider/spider.py.tmpl +32 -0
  63. crawlo/utils/__init__.py +7 -7
  64. crawlo/utils/concurrency_manager.py +124 -124
  65. crawlo/utils/date_tools.py +233 -177
  66. crawlo/utils/db_helper.py +344 -0
  67. crawlo/utils/func_tools.py +82 -82
  68. crawlo/utils/log.py +129 -39
  69. crawlo/utils/pqueue.py +173 -173
  70. crawlo/utils/project.py +199 -59
  71. crawlo/utils/request.py +267 -122
  72. crawlo/utils/spider_loader.py +63 -0
  73. crawlo/utils/system.py +11 -11
  74. crawlo/utils/tools.py +5 -303
  75. crawlo/utils/url.py +39 -39
  76. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/METADATA +49 -48
  77. crawlo-1.0.6.dist-info/RECORD +94 -0
  78. crawlo-1.0.6.dist-info/entry_points.txt +2 -0
  79. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/top_level.txt +1 -0
  80. examples/gxb/items.py +36 -0
  81. examples/gxb/run.py +16 -0
  82. examples/gxb/settings.py +72 -0
  83. examples/gxb/spider/__init__.py +0 -0
  84. examples/gxb/spider/miit_spider.py +180 -0
  85. examples/gxb/spider/telecom_device.py +129 -0
  86. tests/__init__.py +7 -7
  87. tests/test_proxy_health_check.py +33 -0
  88. tests/test_proxy_middleware_integration.py +137 -0
  89. tests/test_proxy_providers.py +57 -0
  90. tests/test_proxy_stats.py +20 -0
  91. tests/test_proxy_strategies.py +60 -0
  92. crawlo/downloader/playwright_downloader.py +0 -161
  93. crawlo/templates/item_template.tmpl +0 -22
  94. crawlo/templates/project_template/main.py +0 -33
  95. crawlo/templates/project_template/setting.py +0 -190
  96. crawlo/templates/spider_template.tmpl +0 -31
  97. crawlo-1.0.4.dist-info/RECORD +0 -79
  98. crawlo-1.0.4.dist-info/entry_points.txt +0 -2
  99. tests/baidu_spider/__init__.py +0 -7
  100. tests/baidu_spider/demo.py +0 -94
  101. tests/baidu_spider/items.py +0 -25
  102. tests/baidu_spider/middleware.py +0 -49
  103. tests/baidu_spider/pipeline.py +0 -55
  104. tests/baidu_spider/request_fingerprints.txt +0 -9
  105. tests/baidu_spider/run.py +0 -27
  106. tests/baidu_spider/settings.py +0 -80
  107. tests/baidu_spider/spiders/__init__.py +0 -7
  108. tests/baidu_spider/spiders/bai_du.py +0 -61
  109. tests/baidu_spider/spiders/sina.py +0 -79
  110. {crawlo-1.0.4.dist-info → crawlo-1.0.6.dist-info}/WHEEL +0 -0
  111. {crawlo/templates/project_template/items → examples}/__init__.py +0 -0
  112. {crawlo/templates/project_template/spiders → examples/gxb}/__init__.py +0 -0
crawlo/middleware/response_filter.py
@@ -1,26 +1,26 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-from crawlo.utils.log import get_logger
-from crawlo.exceptions import IgnoreRequestError
-
-
-class ResponseFilterMiddleware:
-
-    def __init__(self, allowed_codes, log_level):
-        self.allowed_codes = allowed_codes
-        self.logger = get_logger(self.__class__.__name__, log_level)
-
-    @classmethod
-    def create_instance(cls, crawler):
-        o = cls(
-            allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
-            log_level=crawler.settings.get('LOG_LEVEL')
-        )
-        return o
-
-    def process_response(self, request, response, spider):
-        if 200 <= response.status_code < 300:
-            return response
-        if response.status_code in self.allowed_codes:
-            return response
-        raise IgnoreRequestError(f"response status_code/non-200")
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+from crawlo.utils.log import get_logger
+from crawlo.exceptions import IgnoreRequestError
+
+
+class ResponseFilterMiddleware:
+
+    def __init__(self, allowed_codes, log_level):
+        self.allowed_codes = allowed_codes
+        self.logger = get_logger(self.__class__.__name__, log_level)
+
+    @classmethod
+    def create_instance(cls, crawler):
+        o = cls(
+            allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
+            log_level=crawler.settings.get('LOG_LEVEL')
+        )
+        return o
+
+    def process_response(self, request, response, spider):
+        if 200 <= response.status_code < 300:
+            return response
+        if response.status_code in self.allowed_codes:
+            return response
+        raise IgnoreRequestError(f"response status_code/non-200")
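The two sides of this hunk are identical (likely a whitespace or line-ending change), so the behaviour is unchanged: 2xx responses and any status code listed in ALLOWED_CODES pass through, everything else raises IgnoreRequestError. A minimal sketch of that contract, driven outside a crawler; the module path is inferred from the file list above, and the allowed codes, the string log level, and the SimpleNamespace request/response stand-ins are illustrative assumptions rather than crawlo API:

from types import SimpleNamespace

from crawlo.exceptions import IgnoreRequestError
from crawlo.middleware.response_filter import ResponseFilterMiddleware

# Instantiated directly rather than via create_instance(crawler); the allowed
# codes and the "INFO" log level are illustrative values, not crawlo defaults.
mw = ResponseFilterMiddleware(allowed_codes=[301, 302], log_level="INFO")

# Bare stand-ins: process_response only reads response.status_code.
request = SimpleNamespace(url="https://example.com")
ok = SimpleNamespace(status_code=200)
redirect = SimpleNamespace(status_code=302)
error = SimpleNamespace(status_code=500)

assert mw.process_response(request, ok, spider=None) is ok              # 2xx passes through
assert mw.process_response(request, redirect, spider=None) is redirect  # explicitly allowed code

try:
    mw.process_response(request, error, spider=None)
except IgnoreRequestError:
    print("non-2xx, non-allowed response dropped")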
crawlo/middleware/retry.py
@@ -1,90 +1,90 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-from typing import List
-from anyio import EndOfStream
-from httpcore import ReadError
-from asyncio.exceptions import TimeoutError
-from httpx import RemoteProtocolError, ConnectError, ReadTimeout
-from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
-from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
-
-from crawlo.utils.log import get_logger
-from crawlo.stats_collector import StatsCollector
-
-_retry_exceptions = [
-    EndOfStream,
-    ReadError,
-    TimeoutError,
-    ConnectError,
-    ReadTimeout,
-    ClientConnectorError,
-    ClientResponseError,
-    RemoteProtocolError,
-    ClientTimeout,
-    ClientConnectorSSLError,
-    ClientPayloadError,
-    ClientConnectionError
-]
-
-
-class RetryMiddleware(object):
-
-    def __init__(
-        self,
-        *,
-        retry_http_codes: List,
-        ignore_http_codes: List,
-        max_retry_times: int,
-        retry_exceptions: List,
-        stats: StatsCollector,
-        retry_priority: int
-    ):
-        self.retry_http_codes = retry_http_codes
-        self.ignore_http_codes = ignore_http_codes
-        self.max_retry_times = max_retry_times
-        self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
-        self.retry_priority = retry_priority
-        self.stats = stats
-        self.logger = get_logger(self.__class__.__name__)
-
-    @classmethod
-    def create_instance(cls, crawler):
-        o = cls(
-            retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
-            ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
-            max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
-            retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
-            stats=crawler.stats,
-            retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
-        )
-        return o
-
-    def process_response(self, request, response, spider):
-        if request.meta.get('dont_retry', False):
-            return response
-        if response.status_code in self.ignore_http_codes:
-            return response
-        if response.status_code in self.retry_http_codes:
-            # retry logic
-            reason = f"response code {response.status_code}"
-            return self._retry(request, reason, spider) or response
-        return response
-
-    def process_exception(self, request, exc, spider):
-        if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
-            return self._retry(request=request, reason=type(exc).__name__, spider=spider)
-
-    def _retry(self, request, reason, spider):
-        retry_times = request.meta.get('retry_times', 0)
-        if retry_times < self.max_retry_times:
-            retry_times += 1
-            self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
-            request.meta['retry_times'] = retry_times
-            # request.dont_retry = True
-            request.meta['dont_retry'] = True
-            request.retry_priority = request.priority + self.retry_priority
-            self.stats.inc_value("retry_count")
-            return request
-        else:
-            self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
-            return None
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+from typing import List
+from anyio import EndOfStream
+from httpcore import ReadError
+from asyncio.exceptions import TimeoutError
+from httpx import RemoteProtocolError, ConnectError, ReadTimeout
+from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
+from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
+
+from crawlo.utils.log import get_logger
+from crawlo.stats_collector import StatsCollector
+
+_retry_exceptions = [
+    EndOfStream,
+    ReadError,
+    TimeoutError,
+    ConnectError,
+    ReadTimeout,
+    ClientConnectorError,
+    ClientResponseError,
+    RemoteProtocolError,
+    ClientTimeout,
+    ClientConnectorSSLError,
+    ClientPayloadError,
+    ClientConnectionError
+]
+
+
+class RetryMiddleware(object):
+
+    def __init__(
+        self,
+        *,
+        retry_http_codes: List,
+        ignore_http_codes: List,
+        max_retry_times: int,
+        retry_exceptions: List,
+        stats: StatsCollector,
+        retry_priority: int
+    ):
+        self.retry_http_codes = retry_http_codes
+        self.ignore_http_codes = ignore_http_codes
+        self.max_retry_times = max_retry_times
+        self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
+        self.retry_priority = retry_priority
+        self.stats = stats
+        self.logger = get_logger(self.__class__.__name__)
+
+    @classmethod
+    def create_instance(cls, crawler):
+        o = cls(
+            retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
+            ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
+            max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
+            retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
+            stats=crawler.stats,
+            retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
+        )
+        return o
+
+    def process_response(self, request, response, spider):
+        if request.meta.get('dont_retry', False):
+            return response
+        if response.status_code in self.ignore_http_codes:
+            return response
+        if response.status_code in self.retry_http_codes:
+            # retry logic
+            reason = f"response code {response.status_code}"
+            return self._retry(request, reason, spider) or response
+        return response
+
+    def process_exception(self, request, exc, spider):
+        if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
+            return self._retry(request=request, reason=type(exc).__name__, spider=spider)
+
+    def _retry(self, request, reason, spider):
+        retry_times = request.meta.get('retry_times', 0)
+        if retry_times < self.max_retry_times:
+            retry_times += 1
+            self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
+            request.meta['retry_times'] = retry_times
+            # request.dont_retry = True
+            request.meta['dont_retry'] = True
+            request.priority = request.priority + self.retry_priority
+            self.stats.inc_value("retry_count")
+            return request
+        else:
+            self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
+            return None
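The only functional change in this hunk is the priority assignment inside _retry: 1.0.4 wrote the boosted value to a separate request.retry_priority attribute, while 1.0.6 assigns it back to request.priority, the field a priority-ordered scheduler would actually sort on. A minimal sketch of why the attribute name matters; the heapq queue, the toy Request class, and the RETRY_PRIORITY value are illustrative stand-ins, not crawlo's scheduler (crawlo/utils/pqueue.py) or its settings, and whether a larger priority value means earlier or later dispatch depends on that queue's ordering:

import heapq
import itertools
from dataclasses import dataclass, field

@dataclass
class Request:
    url: str
    priority: int = 0
    meta: dict = field(default_factory=dict)

_counter = itertools.count()
_heap = []

def push(req):
    # Ordering keys on req.priority only; any other attribute is invisible here.
    heapq.heappush(_heap, (req.priority, next(_counter), req))

RETRY_PRIORITY = 10  # illustrative stand-in for the RETRY_PRIORITY setting

retried = Request("https://example.com/flaky")

# 1.0.4 behaviour: the bump lands on a side attribute, so queue order never changes.
retried.retry_priority = retried.priority + RETRY_PRIORITY

# 1.0.6 behaviour: the bump lands on the field the queue actually sorts by.
retried.priority = retried.priority + RETRY_PRIORITY

push(Request("https://example.com/fresh"))
push(retried)

# With this min-heap, the request holding the lower priority value is popped first,
# so the retried request is deferred behind fresh ones.
print(heapq.heappop(_heap)[2].url)  # https://example.com/fresh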
@@ -1,7 +1,7 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time    : 2025-02-05 14:07
-# @Author  : oscar
-# @Desc    : None
-"""
+#!/usr/bin/python
+# -*- coding:UTF-8 -*-
+"""
+# @Time    : 2025-02-05 14:07
+# @Author  : oscar
+# @Desc    : None
+"""