bricks-py 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bricks-py-0.0.2 → bricks-py-0.0.4}/PKG-INFO +1 -1
- {bricks-py-0.0.2 → bricks-py-0.0.4}/README.md +1 -1
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/const.py +1 -1
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/genesis.py +1 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/downloader/genesis.py +8 -3
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/context.py +1 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/proxies.py +1 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/spider/air.py +2 -1
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/spider/form.py +50 -78
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/utils/pandora.py +6 -6
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/PKG-INFO +1 -1
- {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/air_spider_demo.py +19 -19
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/dispatch.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/events.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/signals.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/db/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/db/redis_.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/db/sqllite.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/downloader/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/downloader/cffi.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/counter.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/extractors.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/headers.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/items.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/queues.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/request.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/response.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/plugins.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/spider/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/utils/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/SOURCES.txt +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/dependency_links.txt +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/requires.txt +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/top_level.txt +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/__init__.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/dispatcher_test.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/form_spider_demo.py +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/setup.cfg +0 -0
- {bricks-py-0.0.2 → bricks-py-0.0.4}/setup.py +0 -0
|
@@ -108,12 +108,17 @@ class Downloader(metaclass=genesis.MetaClass):
|
|
|
108
108
|
# 获取请求头中的Content-Type
|
|
109
109
|
content_type = request.headers.get('Content-Type', '').lower()
|
|
110
110
|
|
|
111
|
-
|
|
111
|
+
# 如果 body 本来就是字符串 / bytes -> 直接使用, 不需要转换
|
|
112
|
+
if isinstance(request.body, (str, bytes)):
|
|
112
113
|
return {
|
|
113
114
|
"data": request.body,
|
|
114
|
-
"type": "
|
|
115
|
+
"type": "raw"
|
|
115
116
|
}
|
|
116
117
|
|
|
118
|
+
# 没有传 content-type, 并且 body 不为字符串, 默认设置为 application/json
|
|
119
|
+
if not content_type:
|
|
120
|
+
content_type = 'application/json'
|
|
121
|
+
|
|
117
122
|
# 根据Content-Type判断并处理请求体
|
|
118
123
|
if 'application/json' in content_type:
|
|
119
124
|
try:
|
|
@@ -143,4 +148,4 @@ class Downloader(metaclass=genesis.MetaClass):
|
|
|
143
148
|
}
|
|
144
149
|
|
|
145
150
|
else:
|
|
146
|
-
raise ValueError(f"
|
|
151
|
+
raise ValueError(f"Unsupported Content-Type: {content_type}")
|
|
@@ -35,6 +35,7 @@ class MetaClass(type):
|
|
|
35
35
|
def __new__(cls, name, bases, dct): # noqa
|
|
36
36
|
def wrapper(raw_method):
|
|
37
37
|
def inner(self, *args, **kwargs):
|
|
38
|
+
self: BaseProxy
|
|
38
39
|
proxy = raw_method(self, *args, **kwargs)
|
|
39
40
|
proxy.proxy = self.fmt(proxy=proxy.proxy)
|
|
40
41
|
proxy.auth = self.auth
|
|
@@ -137,6 +137,7 @@ class Spider(Pangu):
|
|
|
137
137
|
return {
|
|
138
138
|
self.on_consume: self.on_seeds,
|
|
139
139
|
self.on_seeds: self.on_request,
|
|
140
|
+
self.on_retry: self.on_request,
|
|
140
141
|
self.on_request: self.on_response,
|
|
141
142
|
self.on_response: self.on_pipeline,
|
|
142
143
|
self.on_pipeline: None
|
|
@@ -535,7 +536,7 @@ class Spider(Pangu):
|
|
|
535
536
|
}
|
|
536
537
|
)
|
|
537
538
|
ret = prepared.func(*prepared.args, **prepared.kwargs)
|
|
538
|
-
context.flow(
|
|
539
|
+
context.flow()
|
|
539
540
|
return ret
|
|
540
541
|
|
|
541
542
|
return wrapper
|
|
@@ -28,69 +28,26 @@ FORMAT_REGEX = re.compile(r'{(\w+)(?::(\w+))?}')
|
|
|
28
28
|
class Node:
|
|
29
29
|
|
|
30
30
|
@classmethod
|
|
31
|
-
def format(cls, value, base, errors: str = "raise"):
|
|
31
|
+
def format(cls, value, base: dict, errors: str = "raise"):
|
|
32
32
|
if isinstance(value, str):
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# return value
|
|
51
|
-
#
|
|
52
|
-
# else:
|
|
53
|
-
# return ""
|
|
54
|
-
#
|
|
55
|
-
# elif len(placeholders) == 0:
|
|
56
|
-
# raise
|
|
57
|
-
# else:
|
|
58
|
-
# for placeholder, type_str in placeholders:
|
|
59
|
-
# try:
|
|
60
|
-
# placeholder_value = base[placeholder]
|
|
61
|
-
# except KeyError as e:
|
|
62
|
-
# if errors == "raise":
|
|
63
|
-
# raise ValueError(f"Missing key in base: {e}")
|
|
64
|
-
# elif errors == 'ignore':
|
|
65
|
-
# placeholder_value = f"{{{placeholder}:{type_str}}}"
|
|
66
|
-
# else:
|
|
67
|
-
# placeholder_value = ""
|
|
68
|
-
#
|
|
69
|
-
# value = value.replace(f"{{{placeholder}:{type_str}}}", str(placeholder_value))
|
|
70
|
-
#
|
|
71
|
-
# else:
|
|
72
|
-
# return value
|
|
73
|
-
#
|
|
74
|
-
# except KeyError as e:
|
|
75
|
-
# if errors == "raise":
|
|
76
|
-
# raise ValueError(f"Missing key in base: {e}")
|
|
77
|
-
#
|
|
78
|
-
# elif errors == 'ignore':
|
|
79
|
-
# return value
|
|
80
|
-
#
|
|
81
|
-
# else:
|
|
82
|
-
# return ""
|
|
83
|
-
# 查找所有占位符
|
|
33
|
+
while True:
|
|
34
|
+
try:
|
|
35
|
+
return value.format(**base)
|
|
36
|
+
except ValueError:
|
|
37
|
+
|
|
38
|
+
placeholders = FORMAT_REGEX.findall(value)
|
|
39
|
+
# 有多个, 那最终肯定还是字符串
|
|
40
|
+
convert_value = len(placeholders) == 1
|
|
41
|
+
for placeholder, type_str in placeholders:
|
|
42
|
+
|
|
43
|
+
if placeholder not in base:
|
|
44
|
+
if errors == 'raise':
|
|
45
|
+
raise ValueError(f"Missing key in base: {placeholder}")
|
|
46
|
+
elif errors == 'ignore':
|
|
47
|
+
return value
|
|
48
|
+
else:
|
|
49
|
+
base.setdefault(placeholder, "")
|
|
84
50
|
|
|
85
|
-
try:
|
|
86
|
-
return value.format(**base)
|
|
87
|
-
except ValueError:
|
|
88
|
-
|
|
89
|
-
placeholders = FORMAT_REGEX.findall(value)
|
|
90
|
-
# 有多个, 那最终肯定还是字符串
|
|
91
|
-
convert_value = len(placeholders) == 1
|
|
92
|
-
for placeholder, type_str in placeholders:
|
|
93
|
-
if placeholder in base:
|
|
94
51
|
placeholder_value = base[placeholder]
|
|
95
52
|
if type_str:
|
|
96
53
|
placeholder_value = cls.convert(placeholder_value, type_str)
|
|
@@ -101,23 +58,17 @@ class Node:
|
|
|
101
58
|
if convert_value:
|
|
102
59
|
value = cls.convert(value, type(placeholder_value))
|
|
103
60
|
|
|
104
|
-
|
|
105
|
-
raise ValueError(f"Missing key in base: {placeholder}")
|
|
106
|
-
elif errors == 'ignore':
|
|
107
|
-
return value
|
|
108
|
-
else:
|
|
109
|
-
return ""
|
|
110
|
-
return value
|
|
61
|
+
return value
|
|
111
62
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
63
|
+
except KeyError as e:
|
|
64
|
+
if errors == "raise":
|
|
65
|
+
raise ValueError(f"Missing key in base: {e}")
|
|
115
66
|
|
|
116
|
-
|
|
117
|
-
|
|
67
|
+
elif errors == 'ignore':
|
|
68
|
+
return value
|
|
118
69
|
|
|
119
|
-
|
|
120
|
-
|
|
70
|
+
else:
|
|
71
|
+
base.setdefault(e.args[0], "")
|
|
121
72
|
|
|
122
73
|
elif isinstance(value, list):
|
|
123
74
|
return [cls.format(item, base, errors=errors) for item in value]
|
|
@@ -251,6 +202,7 @@ class Spider(air.Spider):
|
|
|
251
202
|
self.on_consume: self.on_flow,
|
|
252
203
|
self.on_seeds: self.on_request,
|
|
253
204
|
self.on_request: self.on_flow,
|
|
205
|
+
self.on_retry: self.on_flow,
|
|
254
206
|
self.on_response: self.on_flow,
|
|
255
207
|
self.on_pipeline: self.on_flow,
|
|
256
208
|
}
|
|
@@ -260,23 +212,43 @@ class Spider(air.Spider):
|
|
|
260
212
|
raise NotImplementedError
|
|
261
213
|
|
|
262
214
|
def on_flow(self, context: Context):
|
|
263
|
-
context.signpost = context.install("signpost", 0, True)
|
|
264
215
|
if not self.config.spider:
|
|
265
216
|
logger.warning('没有配置 Spider 节点流程..')
|
|
266
217
|
raise signals.Exit
|
|
267
218
|
|
|
219
|
+
context.signpost.setdefault("cursor", 0)
|
|
220
|
+
# 这是重试回来了
|
|
221
|
+
if context.signpost.pop('retry', False):
|
|
222
|
+
# 找到之前下载节点的位置
|
|
223
|
+
bookmark = context.signpost.get('bookmark', 0)
|
|
224
|
+
# 没有下载节点 / 下载节点就在第一个 -> cursor 指向最起点
|
|
225
|
+
if bookmark == 0:
|
|
226
|
+
context.signpost['cursor'] = 1
|
|
227
|
+
|
|
228
|
+
# 找到下载节点前面不是 Task 的节点, 但是如果是两个
|
|
229
|
+
else:
|
|
230
|
+
for i in range(bookmark - 1, -1, -1):
|
|
231
|
+
node = self.config.spider[i]
|
|
232
|
+
if not isinstance(node, Task):
|
|
233
|
+
context.signpost['cursor'] = i + 1
|
|
234
|
+
break
|
|
235
|
+
else:
|
|
236
|
+
context.signpost['cursor'] = bookmark
|
|
237
|
+
|
|
268
238
|
while True:
|
|
269
239
|
try:
|
|
270
|
-
node: Union[Download, Task, Parse, Pipeline] = self.config.spider[context.signpost]
|
|
240
|
+
node: Union[Download, Task, Parse, Pipeline] = self.config.spider[context.signpost['cursor']]
|
|
271
241
|
context.node = node
|
|
272
242
|
except IndexError:
|
|
273
243
|
context.flow({"next": None})
|
|
274
244
|
raise signals.Switch
|
|
275
245
|
else:
|
|
276
|
-
context.signpost += 1
|
|
246
|
+
context.signpost['cursor'] += 1
|
|
277
247
|
|
|
278
248
|
# 种子 -> Request
|
|
279
249
|
if isinstance(node, Download):
|
|
250
|
+
# 记录下载节点的位置
|
|
251
|
+
context.signpost['bookmark'] = context.signpost['cursor'] - 1
|
|
280
252
|
context.flow({"next": self.on_seeds})
|
|
281
253
|
raise signals.Switch
|
|
282
254
|
|
|
@@ -116,6 +116,10 @@ def prepare(func, args=None, kwargs: dict = None, annotations: dict = None, name
|
|
|
116
116
|
if name in kwargs:
|
|
117
117
|
value = kwargs[name]
|
|
118
118
|
|
|
119
|
+
# 参数在 namespace 里面 -> 从 namespace 里面取
|
|
120
|
+
elif param.name in namespace:
|
|
121
|
+
value = namespace[param.name]
|
|
122
|
+
|
|
119
123
|
# 参数类型存在于 annotations, 并且还可以从 args 里面取值, 并且刚好取到的对应的值也是当前类型 -> 直接从 args 里面取
|
|
120
124
|
elif param.annotation in annotations and index < len(args) and type(args[index]) == param.annotation:
|
|
121
125
|
value = args[index]
|
|
@@ -130,20 +134,16 @@ def prepare(func, args=None, kwargs: dict = None, annotations: dict = None, name
|
|
|
130
134
|
value = args[index]
|
|
131
135
|
index += 1
|
|
132
136
|
|
|
133
|
-
elif param.name in namespace:
|
|
134
|
-
|
|
135
|
-
value = namespace[param.name]
|
|
136
|
-
|
|
137
137
|
elif param.default != inspect.Parameter.empty:
|
|
138
138
|
continue
|
|
139
139
|
|
|
140
140
|
# 没有传这个参数, 并且也没有可以备选的 annotations -> 报错
|
|
141
141
|
else:
|
|
142
142
|
raise TypeError(f"missing required argument: {name}, signature: {dict(parameters)}")
|
|
143
|
-
if param.kind in [inspect.Parameter.POSITIONAL_ONLY
|
|
143
|
+
if param.kind in [inspect.Parameter.POSITIONAL_ONLY]:
|
|
144
144
|
new_args.append(value)
|
|
145
145
|
|
|
146
|
-
if param.kind
|
|
146
|
+
if param.kind in [inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD]:
|
|
147
147
|
new_kwargs[name] = value
|
|
148
148
|
|
|
149
149
|
return prepared(func=func, args=new_args, kwargs=new_kwargs)
|
|
@@ -85,27 +85,27 @@ if __name__ == '__main__':
|
|
|
85
85
|
spider = MySpider(
|
|
86
86
|
# # 设置代理模式 1, 该模式适用于: 你已经将代理提取至 Redis 的 proxy 里面
|
|
87
87
|
# # 这样设置的话就会自动去取
|
|
88
|
-
# proxy={
|
|
89
|
-
# "ref": "bricks.lib.proxies.RedisProxy", # 指向 Redis
|
|
90
|
-
# "key": "proxy", # 指向代理 Key
|
|
91
|
-
# # 这个不写默认指向本地 Redis, 无密码的
|
|
92
|
-
# "options": {
|
|
93
|
-
# "host": "127.0.0.1",
|
|
94
|
-
# "port": 6379,
|
|
95
|
-
# "password": "xsxsxax"
|
|
96
|
-
# },
|
|
97
|
-
# "threshold": 100, # 一个代理最多使用多少次, 到这个次数之后就会归还到Redis, 然后重新拿, 默认不归还
|
|
98
|
-
# "scheme": "socks5" # 代理协议, 默认是 http
|
|
99
|
-
# }
|
|
100
|
-
|
|
101
|
-
# 设置代理模式 2, 该模式适用于: 指向固定代理, 如 http://127.0.0.1:7890
|
|
102
|
-
# 这样设置的话就会自动去取
|
|
103
88
|
proxy={
|
|
104
|
-
"ref": "bricks.lib.proxies.
|
|
105
|
-
"key": "
|
|
89
|
+
"ref": "bricks.lib.proxies.RedisProxy", # 指向 Redis
|
|
90
|
+
"key": "proxy", # 指向代理 Key
|
|
91
|
+
# 这个不写默认指向本地 Redis, 无密码的
|
|
92
|
+
"options": {
|
|
93
|
+
"host": "127.0.0.1",
|
|
94
|
+
"port": 6379,
|
|
95
|
+
# "password": "xsxsxax"
|
|
96
|
+
},
|
|
106
97
|
"threshold": 100, # 一个代理最多使用多少次, 到这个次数之后就会归还到Redis, 然后重新拿, 默认不归还
|
|
107
|
-
"scheme": "
|
|
108
|
-
}
|
|
98
|
+
"scheme": "socks5" # 代理协议, 默认是 http
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
# # 设置代理模式 2, 该模式适用于: 指向固定代理, 如 http://127.0.0.1:7890
|
|
102
|
+
# # 这样设置的话就会自动去取
|
|
103
|
+
# proxy={
|
|
104
|
+
# "ref": "bricks.lib.proxies.CustomProxy", # 指向 Redis
|
|
105
|
+
# "key": "127.0.0.1:7890", # 指向代理 Key
|
|
106
|
+
# "threshold": 100, # 一个代理最多使用多少次, 到这个次数之后就会归还到Redis, 然后重新拿, 默认不归还
|
|
107
|
+
# # "scheme": "http" # 代理协议, 默认是 http
|
|
108
|
+
# },
|
|
109
109
|
|
|
110
110
|
# # 设置代理模式 3, 该模式适用于: 你有一个提取 api,访问就会获取代理
|
|
111
111
|
# # 这样设置的话就会自动去取
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|