crawlo 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (120) hide show
  1. crawlo/__init__.py +34 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -155
  6. crawlo/commands/genspider.py +152 -111
  7. crawlo/commands/list.py +156 -119
  8. crawlo/commands/run.py +285 -170
  9. crawlo/commands/startproject.py +196 -101
  10. crawlo/commands/stats.py +188 -167
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +162 -57
  18. crawlo/crawler.py +1028 -493
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +252 -277
  22. crawlo/downloader/httpx_downloader.py +257 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +78 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +269 -166
  49. crawlo/pipelines/__init__.py +13 -13
  50. crawlo/pipelines/console_pipeline.py +39 -39
  51. crawlo/pipelines/csv_pipeline.py +317 -0
  52. crawlo/pipelines/json_pipeline.py +219 -0
  53. crawlo/pipelines/mongo_pipeline.py +116 -116
  54. crawlo/pipelines/mysql_pipeline.py +195 -195
  55. crawlo/pipelines/pipeline_manager.py +56 -56
  56. crawlo/project.py +153 -0
  57. crawlo/queue/pqueue.py +37 -0
  58. crawlo/queue/queue_manager.py +304 -0
  59. crawlo/queue/redis_priority_queue.py +192 -0
  60. crawlo/settings/__init__.py +7 -7
  61. crawlo/settings/default_settings.py +226 -169
  62. crawlo/settings/setting_manager.py +99 -99
  63. crawlo/spider/__init__.py +639 -129
  64. crawlo/stats_collector.py +59 -59
  65. crawlo/subscriber.py +106 -106
  66. crawlo/task_manager.py +30 -27
  67. crawlo/templates/crawlo.cfg.tmpl +10 -10
  68. crawlo/templates/project/__init__.py.tmpl +3 -3
  69. crawlo/templates/project/items.py.tmpl +17 -17
  70. crawlo/templates/project/middlewares.py.tmpl +87 -76
  71. crawlo/templates/project/pipelines.py.tmpl +336 -64
  72. crawlo/templates/project/run.py.tmpl +239 -0
  73. crawlo/templates/project/settings.py.tmpl +248 -54
  74. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  75. crawlo/templates/spider/spider.py.tmpl +178 -32
  76. crawlo/utils/__init__.py +7 -7
  77. crawlo/utils/controlled_spider_mixin.py +336 -0
  78. crawlo/utils/date_tools.py +233 -233
  79. crawlo/utils/db_helper.py +343 -343
  80. crawlo/utils/func_tools.py +82 -82
  81. crawlo/utils/large_scale_config.py +287 -0
  82. crawlo/utils/large_scale_helper.py +344 -0
  83. crawlo/utils/log.py +128 -128
  84. crawlo/utils/queue_helper.py +176 -0
  85. crawlo/utils/request.py +267 -267
  86. crawlo/utils/request_serializer.py +220 -0
  87. crawlo/utils/spider_loader.py +62 -62
  88. crawlo/utils/system.py +11 -11
  89. crawlo/utils/tools.py +4 -4
  90. crawlo/utils/url.py +39 -39
  91. crawlo-1.1.2.dist-info/METADATA +567 -0
  92. crawlo-1.1.2.dist-info/RECORD +108 -0
  93. examples/__init__.py +7 -0
  94. tests/__init__.py +7 -7
  95. tests/test_final_validation.py +154 -0
  96. tests/test_proxy_health_check.py +32 -32
  97. tests/test_proxy_middleware_integration.py +136 -136
  98. tests/test_proxy_providers.py +56 -56
  99. tests/test_proxy_stats.py +19 -19
  100. tests/test_proxy_strategies.py +59 -59
  101. tests/test_redis_config.py +29 -0
  102. tests/test_redis_queue.py +225 -0
  103. tests/test_request_serialization.py +71 -0
  104. tests/test_scheduler.py +242 -0
  105. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  106. crawlo/utils/concurrency_manager.py +0 -125
  107. crawlo/utils/pqueue.py +0 -174
  108. crawlo/utils/project.py +0 -197
  109. crawlo-1.1.0.dist-info/METADATA +0 -49
  110. crawlo-1.1.0.dist-info/RECORD +0 -97
  111. examples/gxb/items.py +0 -36
  112. examples/gxb/run.py +0 -16
  113. examples/gxb/settings.py +0 -72
  114. examples/gxb/spider/__init__.py +0 -2
  115. examples/gxb/spider/miit_spider.py +0 -180
  116. examples/gxb/spider/telecom_device.py +0 -129
  117. {examples/gxb → crawlo/queue}/__init__.py +0 -0
  118. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  119. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  120. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,100 +1,100 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- import json
4
- from copy import deepcopy
5
- from importlib import import_module
6
- from collections.abc import MutableMapping
7
-
8
- from crawlo.settings import default_settings
9
-
10
-
11
- class SettingManager(MutableMapping):
12
-
13
- def __init__(self, values=None):
14
- self.attributes = {}
15
- self.set_settings(default_settings)
16
- self.update_attributes(values)
17
-
18
- def get(self, key, default=None):
19
- """安全获取值,不触发递归"""
20
- value = self.attributes.get(key, default)
21
- return value if value is not None else default
22
-
23
- def get_int(self, key, default=0):
24
- return int(self.get(key, default=default))
25
-
26
- def get_float(self, key, default=0.0):
27
- return float(self.get(key, default=default))
28
-
29
- def get_bool(self, key, default=False):
30
- got = self.get(key, default=default)
31
- if isinstance(got, bool):
32
- return got
33
- if isinstance(got, (int, float)):
34
- return bool(got)
35
- got_lower = str(got).strip().lower()
36
- if got_lower in ('1', 'true'):
37
- return True
38
- if got_lower in ('0', 'false'):
39
- return False
40
- raise ValueError(
41
- f"Unsupported value for boolean setting: {got}. "
42
- "Supported values are: 0/1, True/False, '0'/'1', 'True'/'False' (case-insensitive)."
43
- )
44
-
45
- def get_list(self, key, default=None):
46
- values = self.get(key, default or [])
47
- if isinstance(values, str):
48
- return [v.strip() for v in values.split(',') if v.strip()]
49
- try:
50
- return list(values)
51
- except TypeError:
52
- return [values]
53
-
54
- def get_dict(self, key, default=None):
55
- value = self.get(key, default or {})
56
- if isinstance(value, str):
57
- value = json.loads(value)
58
- try:
59
- return dict(value)
60
- except TypeError:
61
- return value
62
-
63
- def set(self, key, value):
64
- self.attributes[key] = value
65
-
66
- def set_settings(self, module):
67
- if isinstance(module, str):
68
- module = import_module(module)
69
- for key in dir(module):
70
- if key.isupper():
71
- self.set(key, getattr(module, key))
72
-
73
- # 实现 MutableMapping 必须的方法
74
- def __getitem__(self, item):
75
- return self.attributes[item]
76
-
77
- def __setitem__(self, key, value):
78
- self.set(key, value)
79
-
80
- def __delitem__(self, key):
81
- del self.attributes[key]
82
-
83
- def __iter__(self):
84
- return iter(self.attributes)
85
-
86
- def __len__(self):
87
- return len(self.attributes)
88
-
89
- def __str__(self):
90
- return f'<Settings: {self.attributes}>'
91
-
92
- __repr__ = __str__
93
-
94
- def update_attributes(self, attributes):
95
- if attributes is not None:
96
- for key, value in attributes.items():
97
- self.set(key, value)
98
-
99
- def copy(self):
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ import json
4
+ from copy import deepcopy
5
+ from importlib import import_module
6
+ from collections.abc import MutableMapping
7
+
8
+ from crawlo.settings import default_settings
9
+
10
+
11
+ class SettingManager(MutableMapping):
12
+
13
+ def __init__(self, values=None):
14
+ self.attributes = {}
15
+ self.set_settings(default_settings)
16
+ self.update_attributes(values)
17
+
18
+ def get(self, key, default=None):
19
+ """安全获取值,不触发递归"""
20
+ value = self.attributes.get(key, default)
21
+ return value if value is not None else default
22
+
23
+ def get_int(self, key, default=0):
24
+ return int(self.get(key, default=default))
25
+
26
+ def get_float(self, key, default=0.0):
27
+ return float(self.get(key, default=default))
28
+
29
+ def get_bool(self, key, default=False):
30
+ got = self.get(key, default=default)
31
+ if isinstance(got, bool):
32
+ return got
33
+ if isinstance(got, (int, float)):
34
+ return bool(got)
35
+ got_lower = str(got).strip().lower()
36
+ if got_lower in ('1', 'true'):
37
+ return True
38
+ if got_lower in ('0', 'false'):
39
+ return False
40
+ raise ValueError(
41
+ f"Unsupported value for boolean setting: {got}. "
42
+ "Supported values are: 0/1, True/False, '0'/'1', 'True'/'False' (case-insensitive)."
43
+ )
44
+
45
+ def get_list(self, key, default=None):
46
+ values = self.get(key, default or [])
47
+ if isinstance(values, str):
48
+ return [v.strip() for v in values.split(',') if v.strip()]
49
+ try:
50
+ return list(values)
51
+ except TypeError:
52
+ return [values]
53
+
54
+ def get_dict(self, key, default=None):
55
+ value = self.get(key, default or {})
56
+ if isinstance(value, str):
57
+ value = json.loads(value)
58
+ try:
59
+ return dict(value)
60
+ except TypeError:
61
+ return value
62
+
63
+ def set(self, key, value):
64
+ self.attributes[key] = value
65
+
66
+ def set_settings(self, module):
67
+ if isinstance(module, str):
68
+ module = import_module(module)
69
+ for key in dir(module):
70
+ if key.isupper():
71
+ self.set(key, getattr(module, key))
72
+
73
+ # 实现 MutableMapping 必须的方法
74
+ def __getitem__(self, item):
75
+ return self.attributes[item]
76
+
77
+ def __setitem__(self, key, value):
78
+ self.set(key, value)
79
+
80
+ def __delitem__(self, key):
81
+ del self.attributes[key]
82
+
83
+ def __iter__(self):
84
+ return iter(self.attributes)
85
+
86
+ def __len__(self):
87
+ return len(self.attributes)
88
+
89
+ def __str__(self):
90
+ return f'<Settings: {self.attributes}>'
91
+
92
+ __repr__ = __str__
93
+
94
+ def update_attributes(self, attributes):
95
+ if attributes is not None:
96
+ for key, value in attributes.items():
97
+ self.set(key, value)
98
+
99
+ def copy(self):
100
100
  return deepcopy(self)