crawlo 1.0.3-py3-none-any.whl → 1.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo might be problematic.
Files changed (96)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/__init__.py +2 -2
  4. crawlo/core/engine.py +158 -158
  5. crawlo/core/processor.py +40 -40
  6. crawlo/core/scheduler.py +57 -57
  7. crawlo/crawler.py +424 -242
  8. crawlo/downloader/__init__.py +78 -78
  9. crawlo/downloader/aiohttp_downloader.py +200 -259
  10. crawlo/downloader/cffi_downloader.py +277 -0
  11. crawlo/downloader/httpx_downloader.py +246 -187
  12. crawlo/event.py +11 -11
  13. crawlo/exceptions.py +73 -64
  14. crawlo/extension/__init__.py +31 -31
  15. crawlo/extension/log_interval.py +49 -49
  16. crawlo/extension/log_stats.py +44 -44
  17. crawlo/extension/logging_extension.py +35 -0
  18. crawlo/filters/__init__.py +37 -37
  19. crawlo/filters/aioredis_filter.py +150 -158
  20. crawlo/filters/memory_filter.py +202 -202
  21. crawlo/items/__init__.py +62 -62
  22. crawlo/items/items.py +115 -119
  23. crawlo/middleware/__init__.py +21 -21
  24. crawlo/middleware/default_header.py +32 -32
  25. crawlo/middleware/download_delay.py +28 -28
  26. crawlo/middleware/middleware_manager.py +135 -140
  27. crawlo/middleware/proxy.py +246 -0
  28. crawlo/middleware/request_ignore.py +30 -30
  29. crawlo/middleware/response_code.py +18 -18
  30. crawlo/middleware/response_filter.py +26 -26
  31. crawlo/middleware/retry.py +90 -90
  32. crawlo/network/__init__.py +7 -7
  33. crawlo/network/request.py +203 -204
  34. crawlo/network/response.py +166 -166
  35. crawlo/pipelines/__init__.py +13 -13
  36. crawlo/pipelines/console_pipeline.py +39 -39
  37. crawlo/pipelines/mongo_pipeline.py +116 -116
  38. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  39. crawlo/pipelines/mysql_pipeline.py +195 -195
  40. crawlo/pipelines/pipeline_manager.py +56 -56
  41. crawlo/settings/__init__.py +7 -7
  42. crawlo/settings/default_settings.py +169 -93
  43. crawlo/settings/setting_manager.py +99 -99
  44. crawlo/spider/__init__.py +41 -36
  45. crawlo/stats_collector.py +59 -59
  46. crawlo/subscriber.py +106 -106
  47. crawlo/task_manager.py +27 -27
  48. crawlo/templates/item_template.tmpl +21 -21
  49. crawlo/templates/project_template/main.py +32 -32
  50. crawlo/templates/project_template/setting.py +189 -189
  51. crawlo/templates/spider_template.tmpl +30 -30
  52. crawlo/utils/__init__.py +7 -7
  53. crawlo/utils/concurrency_manager.py +124 -124
  54. crawlo/utils/date_tools.py +233 -177
  55. crawlo/utils/db_helper.py +344 -0
  56. crawlo/utils/func_tools.py +82 -82
  57. crawlo/utils/log.py +129 -39
  58. crawlo/utils/pqueue.py +173 -173
  59. crawlo/utils/project.py +59 -59
  60. crawlo/utils/request.py +267 -122
  61. crawlo/utils/system.py +11 -11
  62. crawlo/utils/tools.py +5 -303
  63. crawlo/utils/url.py +39 -39
  64. {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
  65. crawlo-1.0.5.dist-info/RECORD +84 -0
  66. {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
  67. examples/__init__.py +0 -0
  68. examples/gxb/__init__.py +0 -0
  69. examples/gxb/items.py +36 -0
  70. examples/gxb/run.py +15 -0
  71. examples/gxb/settings.py +71 -0
  72. examples/gxb/spider/__init__.py +0 -0
  73. examples/gxb/spider/miit_spider.py +180 -0
  74. examples/gxb/spider/telecom_device_licenses.py +129 -0
  75. tests/__init__.py +7 -7
  76. tests/test_proxy_health_check.py +33 -0
  77. tests/test_proxy_middleware_integration.py +137 -0
  78. tests/test_proxy_providers.py +57 -0
  79. tests/test_proxy_stats.py +20 -0
  80. tests/test_proxy_strategies.py +60 -0
  81. crawlo/downloader/playwright_downloader.py +0 -161
  82. crawlo/filters/redis_filter.py +0 -120
  83. crawlo-1.0.3.dist-info/RECORD +0 -80
  84. tests/baidu_spider/__init__.py +0 -7
  85. tests/baidu_spider/demo.py +0 -94
  86. tests/baidu_spider/items.py +0 -25
  87. tests/baidu_spider/middleware.py +0 -49
  88. tests/baidu_spider/pipeline.py +0 -55
  89. tests/baidu_spider/request_fingerprints.txt +0 -9
  90. tests/baidu_spider/run.py +0 -27
  91. tests/baidu_spider/settings.py +0 -78
  92. tests/baidu_spider/spiders/__init__.py +0 -7
  93. tests/baidu_spider/spiders/bai_du.py +0 -61
  94. tests/baidu_spider/spiders/sina.py +0 -79
  95. {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
  96. {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
--- crawlo-1.0.3/crawlo/middleware/response_filter.py
+++ crawlo-1.0.5/crawlo/middleware/response_filter.py
@@ -1,26 +1,26 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from crawlo.utils.log import get_logger
- from crawlo.exceptions import IgnoreRequestError
-
-
- class ResponseFilterMiddleware:
-
-     def __init__(self, allowed_codes, log_level):
-         self.allowed_codes = allowed_codes
-         self.logger = get_logger(self.__class__.__name__, log_level)
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(
-             allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
-             log_level=crawler.settings.get('LOG_LEVEL')
-         )
-         return o
-
-     def process_response(self, request, response, spider):
-         if 200 <= response.status_code < 300:
-             return response
-         if response.status_code in self.allowed_codes:
-             return response
-         raise IgnoreRequestError(f"response status_code/non-200")
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo.utils.log import get_logger
+ from crawlo.exceptions import IgnoreRequestError
+
+
+ class ResponseFilterMiddleware:
+
+     def __init__(self, allowed_codes, log_level):
+         self.allowed_codes = allowed_codes
+         self.logger = get_logger(self.__class__.__name__, log_level)
+
+     @classmethod
+     def create_instance(cls, crawler):
+         o = cls(
+             allowed_codes=crawler.settings.get_list('ALLOWED_CODES'),
+             log_level=crawler.settings.get('LOG_LEVEL')
+         )
+         return o
+
+     def process_response(self, request, response, spider):
+         if 200 <= response.status_code < 300:
+             return response
+         if response.status_code in self.allowed_codes:
+             return response
+         raise IgnoreRequestError(f"response status_code/non-200")
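The hunk above shows how ResponseFilterMiddleware decides whether a response is kept: 2xx responses always pass, responses whose status code appears in ALLOWED_CODES pass, and everything else is dropped via IgnoreRequestError. The following standalone sketch mirrors that decision path; IgnoreRequest and filter_response are illustrative stand-ins, not crawlo's real IgnoreRequestError or Response objects, so the snippet runs without the framework installed.

# Minimal standalone sketch of the decision path in process_response above.
class IgnoreRequest(Exception):
    pass


def filter_response(status_code, allowed_codes):
    if 200 <= status_code < 300:
        return "kept"
    if status_code in allowed_codes:
        return "kept"
    raise IgnoreRequest(f"dropped: non-2xx status_code {status_code}")


if __name__ == "__main__":
    allowed = [301, 302]  # e.g. the ALLOWED_CODES setting read by create_instance
    for code in (200, 302, 404):
        try:
            print(code, filter_response(code, allowed))
        except IgnoreRequest as exc:
            print(code, exc)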
--- crawlo-1.0.3/crawlo/middleware/retry.py
+++ crawlo-1.0.5/crawlo/middleware/retry.py
@@ -1,90 +1,90 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- from typing import List
- from anyio import EndOfStream
- from httpcore import ReadError
- from asyncio.exceptions import TimeoutError
- from httpx import RemoteProtocolError, ConnectError, ReadTimeout
- from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
- from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
-
- from crawlo.utils.log import get_logger
- from crawlo.stats_collector import StatsCollector
-
- _retry_exceptions = [
-     EndOfStream,
-     ReadError,
-     TimeoutError,
-     ConnectError,
-     ReadTimeout,
-     ClientConnectorError,
-     ClientResponseError,
-     RemoteProtocolError,
-     ClientTimeout,
-     ClientConnectorSSLError,
-     ClientPayloadError,
-     ClientConnectionError
- ]
-
-
- class RetryMiddleware(object):
-
-     def __init__(
-         self,
-         *,
-         retry_http_codes: List,
-         ignore_http_codes: List,
-         max_retry_times: int,
-         retry_exceptions: List,
-         stats: StatsCollector,
-         retry_priority: int
-     ):
-         self.retry_http_codes = retry_http_codes
-         self.ignore_http_codes = ignore_http_codes
-         self.max_retry_times = max_retry_times
-         self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
-         self.retry_priority = retry_priority
-         self.stats = stats
-         self.logger = get_logger(self.__class__.__name__)
-
-     @classmethod
-     def create_instance(cls, crawler):
-         o = cls(
-             retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
-             ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
-             max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
-             retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
-             stats=crawler.stats,
-             retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
-         )
-         return o
-
-     def process_response(self, request, response, spider):
-         if request.meta.get('dont_retry', False):
-             return response
-         if response.status_code in self.ignore_http_codes:
-             return response
-         if response.status_code in self.retry_http_codes:
-             # retry logic
-             reason = f"response code {response.status_code}"
-             return self._retry(request, reason, spider) or response
-         return response
-
-     def process_exception(self, request, exc, spider):
-         if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
-             return self._retry(request=request, reason=type(exc).__name__, spider=spider)
-
-     def _retry(self, request, reason, spider):
-         retry_times = request.meta.get('retry_times', 0)
-         if retry_times < self.max_retry_times:
-             retry_times += 1
-             self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
-             request.meta['retry_times'] = retry_times
-             # request.dont_retry = True
-             request.meta['dont_retry'] = True
-             request.retry_priority = request.priority + self.retry_priority
-             self.stats.inc_value("retry_count")
-             return request
-         else:
-             self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
-             return None
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import List
+ from anyio import EndOfStream
+ from httpcore import ReadError
+ from asyncio.exceptions import TimeoutError
+ from httpx import RemoteProtocolError, ConnectError, ReadTimeout
+ from aiohttp.client_exceptions import ClientConnectionError, ClientPayloadError
+ from aiohttp import ClientConnectorError, ClientTimeout, ClientConnectorSSLError, ClientResponseError
+
+ from crawlo.utils.log import get_logger
+ from crawlo.stats_collector import StatsCollector
+
+ _retry_exceptions = [
+     EndOfStream,
+     ReadError,
+     TimeoutError,
+     ConnectError,
+     ReadTimeout,
+     ClientConnectorError,
+     ClientResponseError,
+     RemoteProtocolError,
+     ClientTimeout,
+     ClientConnectorSSLError,
+     ClientPayloadError,
+     ClientConnectionError
+ ]
+
+
+ class RetryMiddleware(object):
+
+     def __init__(
+         self,
+         *,
+         retry_http_codes: List,
+         ignore_http_codes: List,
+         max_retry_times: int,
+         retry_exceptions: List,
+         stats: StatsCollector,
+         retry_priority: int
+     ):
+         self.retry_http_codes = retry_http_codes
+         self.ignore_http_codes = ignore_http_codes
+         self.max_retry_times = max_retry_times
+         self.retry_exceptions = tuple(retry_exceptions + _retry_exceptions)
+         self.retry_priority = retry_priority
+         self.stats = stats
+         self.logger = get_logger(self.__class__.__name__)
+
+     @classmethod
+     def create_instance(cls, crawler):
+         o = cls(
+             retry_http_codes=crawler.settings.get_list('RETRY_HTTP_CODES'),
+             ignore_http_codes=crawler.settings.get_list('IGNORE_HTTP_CODES'),
+             max_retry_times=crawler.settings.get_int('MAX_RETRY_TIMES'),
+             retry_exceptions=crawler.settings.get_list('RETRY_EXCEPTIONS'),
+             stats=crawler.stats,
+             retry_priority=crawler.settings.get_int('RETRY_PRIORITY')
+         )
+         return o
+
+     def process_response(self, request, response, spider):
+         if request.meta.get('dont_retry', False):
+             return response
+         if response.status_code in self.ignore_http_codes:
+             return response
+         if response.status_code in self.retry_http_codes:
+             # retry logic
+             reason = f"response code {response.status_code}"
+             return self._retry(request, reason, spider) or response
+         return response
+
+     def process_exception(self, request, exc, spider):
+         if isinstance(exc, self.retry_exceptions) and not request.meta.get('dont_retry', False):
+             return self._retry(request=request, reason=type(exc).__name__, spider=spider)
+
+     def _retry(self, request, reason, spider):
+         retry_times = request.meta.get('retry_times', 0)
+         if retry_times < self.max_retry_times:
+             retry_times += 1
+             self.logger.info(f"{spider} {request} {reason} retrying {retry_times} time...")
+             request.meta['retry_times'] = retry_times
+             # request.dont_retry = True
+             request.meta['dont_retry'] = True
+             request.priority = request.priority + self.retry_priority
+             self.stats.inc_value("retry_count")
+             return request
+         else:
+             self.logger.warning(f"{spider} {request} {reason} retry max {self.max_retry_times} times, give up.")
+             return None
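The only substantive change visible in this hunk is inside _retry: in 1.0.3 the bumped value was assigned to a separate request.retry_priority attribute, while in 1.0.5 it is written back to request.priority itself, so a re-queued retry actually carries the adjusted priority. The sketch below reproduces that retry bookkeeping with a plain dict instead of crawlo's Request object; MAX_RETRY_TIMES and RETRY_PRIORITY are assumed example values (crawlo reads them from the crawler settings).

# Minimal standalone sketch of the retry bookkeeping in _retry above.
MAX_RETRY_TIMES = 3
RETRY_PRIORITY = -1  # assumed example values, for illustration only


def retry(request, reason):
    retry_times = request["meta"].get("retry_times", 0)
    if retry_times < MAX_RETRY_TIMES:
        request["meta"]["retry_times"] = retry_times + 1
        # 1.0.5 behaviour: write the bumped value back to the request's own
        # priority (1.0.3 stored it on a separate retry_priority attribute).
        request["priority"] += RETRY_PRIORITY
        print(f"{reason}: retrying {retry_times + 1}/{MAX_RETRY_TIMES}, priority={request['priority']}")
        return request  # the returned request would be re-queued by the engine
    print(f"{reason}: reached max of {MAX_RETRY_TIMES} retries, giving up")
    return None  # caller falls back to the original response or drops the request


if __name__ == "__main__":
    req = {"meta": {}, "priority": 0}
    while retry(req, "response code 503") is not None:
        pass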
@@ -1,7 +1,7 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 14:07
- # @Author : oscar
- # @Desc : None
- """
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time : 2025-02-05 14:07
+ # @Author : oscar
+ # @Desc : None
+ """