bricks-py 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {bricks-py-0.0.2 → bricks-py-0.0.4}/PKG-INFO +1 -1
  2. {bricks-py-0.0.2 → bricks-py-0.0.4}/README.md +1 -1
  3. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/const.py +1 -1
  4. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/genesis.py +1 -0
  5. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/downloader/genesis.py +8 -3
  6. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/context.py +1 -0
  7. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/proxies.py +1 -0
  8. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/spider/air.py +2 -1
  9. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/spider/form.py +50 -78
  10. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/utils/pandora.py +6 -6
  11. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/PKG-INFO +1 -1
  12. {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/air_spider_demo.py +19 -19
  13. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/__init__.py +0 -0
  14. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/__init__.py +0 -0
  15. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/dispatch.py +0 -0
  16. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/events.py +0 -0
  17. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/core/signals.py +0 -0
  18. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/db/__init__.py +0 -0
  19. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/db/redis_.py +0 -0
  20. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/db/sqllite.py +0 -0
  21. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/downloader/__init__.py +0 -0
  22. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/downloader/cffi.py +0 -0
  23. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/__init__.py +0 -0
  24. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/counter.py +0 -0
  25. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/extractors.py +0 -0
  26. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/headers.py +0 -0
  27. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/items.py +0 -0
  28. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/queues.py +0 -0
  29. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/request.py +0 -0
  30. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/lib/response.py +0 -0
  31. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/plugins.py +0 -0
  32. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/spider/__init__.py +0 -0
  33. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks/utils/__init__.py +0 -0
  34. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/SOURCES.txt +0 -0
  35. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/dependency_links.txt +0 -0
  36. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/requires.txt +0 -0
  37. {bricks-py-0.0.2 → bricks-py-0.0.4}/bricks_py.egg-info/top_level.txt +0 -0
  38. {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/__init__.py +0 -0
  39. {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/dispatcher_test.py +0 -0
  40. {bricks-py-0.0.2 → bricks-py-0.0.4}/demos/form_spider_demo.py +0 -0
  41. {bricks-py-0.0.2 → bricks-py-0.0.4}/setup.cfg +0 -0
  42. {bricks-py-0.0.2 → bricks-py-0.0.4}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bricks-py
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: quickly build your crawler
5
5
  Author: Kem
6
6
  Author-email: 531144129@qq.com
@@ -16,7 +16,7 @@ Bricks 拥有以下特性
16
16
  安装
17
17
 
18
18
  ```
19
- pip install bricks-py==0.0.1
19
+ pip install bricks-py
20
20
  ```
21
21
 
22
22
  # 简单上手
@@ -9,7 +9,7 @@ import uuid
9
9
  MACHINE_ID = hashlib.sha256(uuid.UUID(int=uuid.getnode()).hex[-12:].encode()).hexdigest()
10
10
 
11
11
  # 当前框架版本
12
- VERSION = "0.0.2"
12
+ VERSION = "0.0.4"
13
13
 
14
14
  # 事件类型
15
15
  ERROR_OCCURRED = 'ERROR_OCCURRED'
@@ -199,6 +199,7 @@ class Pangu(Chaos):
199
199
  pass
200
200
 
201
201
  except signals.Retry:
202
+ context.signpost['retry'] = True
202
203
  context.retry()
203
204
 
204
205
  except signals.Success:
@@ -108,12 +108,17 @@ class Downloader(metaclass=genesis.MetaClass):
108
108
  # 获取请求头中的Content-Type
109
109
  content_type = request.headers.get('Content-Type', '').lower()
110
110
 
111
- if not content_type or not isinstance(request.body, str):
111
+ # 如果 body 本来就是字符串 / bytes -> 直接使用, 不需要转换
112
+ if isinstance(request.body, (str, bytes)):
112
113
  return {
113
114
  "data": request.body,
114
- "type": "unknown"
115
+ "type": "raw"
115
116
  }
116
117
 
118
+ # 没有传 content-type, 并且 body 不为字符串, 默认设置为 application/json
119
+ if not content_type:
120
+ content_type = 'application/json'
121
+
117
122
  # 根据Content-Type判断并处理请求体
118
123
  if 'application/json' in content_type:
119
124
  try:
@@ -143,4 +148,4 @@ class Downloader(metaclass=genesis.MetaClass):
143
148
  }
144
149
 
145
150
  else:
146
- raise ValueError(f"Invalid JSON format, raw: {request.body}")
151
+ raise ValueError(f"Unsupported Content-Type: {content_type}")
@@ -67,6 +67,7 @@ class Flow(Context):
67
67
  super().__init__(form, target, **kwargs)
68
68
  self.doing: deque = deque([self])
69
69
  self.pending: deque = deque([])
70
+ self.signpost: dict = self.install("signpost", {}, True)
70
71
 
71
72
  def _set_next(self, value):
72
73
  if isinstance(value, Node):
@@ -35,6 +35,7 @@ class MetaClass(type):
35
35
  def __new__(cls, name, bases, dct): # noqa
36
36
  def wrapper(raw_method):
37
37
  def inner(self, *args, **kwargs):
38
+ self: BaseProxy
38
39
  proxy = raw_method(self, *args, **kwargs)
39
40
  proxy.proxy = self.fmt(proxy=proxy.proxy)
40
41
  proxy.auth = self.auth
@@ -137,6 +137,7 @@ class Spider(Pangu):
137
137
  return {
138
138
  self.on_consume: self.on_seeds,
139
139
  self.on_seeds: self.on_request,
140
+ self.on_retry: self.on_request,
140
141
  self.on_request: self.on_response,
141
142
  self.on_response: self.on_pipeline,
142
143
  self.on_pipeline: None
@@ -535,7 +536,7 @@ class Spider(Pangu):
535
536
  }
536
537
  )
537
538
  ret = prepared.func(*prepared.args, **prepared.kwargs)
538
- context.flow({"next": self.on_request})
539
+ context.flow()
539
540
  return ret
540
541
 
541
542
  return wrapper
@@ -28,69 +28,26 @@ FORMAT_REGEX = re.compile(r'{(\w+)(?::(\w+))?}')
28
28
  class Node:
29
29
 
30
30
  @classmethod
31
- def format(cls, value, base, errors: str = "raise"):
31
+ def format(cls, value, base: dict, errors: str = "raise"):
32
32
  if isinstance(value, str):
33
- # 使用正则表达式提取占位符和类型
34
- # try:
35
- # return value.format(**base)
36
- # except ValueError:
37
- # placeholders = FORMAT_REGEX.findall(value)
38
- #
39
- # # 只有一个
40
- # if len(placeholders) == 1:
41
- # placeholder, type_str = placeholders[0]
42
- # try:
43
- # placeholder_value = base[placeholder]
44
- # return cls.convert(placeholder_value, type_str)
45
- # except KeyError as e:
46
- # if errors == "raise":
47
- # raise ValueError(f"Missing key in base: {e}")
48
- #
49
- # elif errors == 'ignore':
50
- # return value
51
- #
52
- # else:
53
- # return ""
54
- #
55
- # elif len(placeholders) == 0:
56
- # raise
57
- # else:
58
- # for placeholder, type_str in placeholders:
59
- # try:
60
- # placeholder_value = base[placeholder]
61
- # except KeyError as e:
62
- # if errors == "raise":
63
- # raise ValueError(f"Missing key in base: {e}")
64
- # elif errors == 'ignore':
65
- # placeholder_value = f"{{{placeholder}:{type_str}}}"
66
- # else:
67
- # placeholder_value = ""
68
- #
69
- # value = value.replace(f"{{{placeholder}:{type_str}}}", str(placeholder_value))
70
- #
71
- # else:
72
- # return value
73
- #
74
- # except KeyError as e:
75
- # if errors == "raise":
76
- # raise ValueError(f"Missing key in base: {e}")
77
- #
78
- # elif errors == 'ignore':
79
- # return value
80
- #
81
- # else:
82
- # return ""
83
- # 查找所有占位符
33
+ while True:
34
+ try:
35
+ return value.format(**base)
36
+ except ValueError:
37
+
38
+ placeholders = FORMAT_REGEX.findall(value)
39
+ # 有多个, 那最终肯定还是字符串
40
+ convert_value = len(placeholders) == 1
41
+ for placeholder, type_str in placeholders:
42
+
43
+ if placeholder not in base:
44
+ if errors == 'raise':
45
+ raise ValueError(f"Missing key in base: {placeholder}")
46
+ elif errors == 'ignore':
47
+ return value
48
+ else:
49
+ base.setdefault(placeholder, "")
84
50
 
85
- try:
86
- return value.format(**base)
87
- except ValueError:
88
-
89
- placeholders = FORMAT_REGEX.findall(value)
90
- # 有多个, 那最终肯定还是字符串
91
- convert_value = len(placeholders) == 1
92
- for placeholder, type_str in placeholders:
93
- if placeholder in base:
94
51
  placeholder_value = base[placeholder]
95
52
  if type_str:
96
53
  placeholder_value = cls.convert(placeholder_value, type_str)
@@ -101,23 +58,17 @@ class Node:
101
58
  if convert_value:
102
59
  value = cls.convert(value, type(placeholder_value))
103
60
 
104
- elif errors == 'raise':
105
- raise ValueError(f"Missing key in base: {placeholder}")
106
- elif errors == 'ignore':
107
- return value
108
- else:
109
- return ""
110
- return value
61
+ return value
111
62
 
112
- except KeyError as e:
113
- if errors == "raise":
114
- raise ValueError(f"Missing key in base: {e}")
63
+ except KeyError as e:
64
+ if errors == "raise":
65
+ raise ValueError(f"Missing key in base: {e}")
115
66
 
116
- elif errors == 'ignore':
117
- return value
67
+ elif errors == 'ignore':
68
+ return value
118
69
 
119
- else:
120
- return ""
70
+ else:
71
+ base.setdefault(e.args[0], "")
121
72
 
122
73
  elif isinstance(value, list):
123
74
  return [cls.format(item, base, errors=errors) for item in value]
@@ -251,6 +202,7 @@ class Spider(air.Spider):
251
202
  self.on_consume: self.on_flow,
252
203
  self.on_seeds: self.on_request,
253
204
  self.on_request: self.on_flow,
205
+ self.on_retry: self.on_flow,
254
206
  self.on_response: self.on_flow,
255
207
  self.on_pipeline: self.on_flow,
256
208
  }
@@ -260,23 +212,43 @@ class Spider(air.Spider):
260
212
  raise NotImplementedError
261
213
 
262
214
  def on_flow(self, context: Context):
263
- context.signpost = context.install("signpost", 0, True)
264
215
  if not self.config.spider:
265
216
  logger.warning('没有配置 Spider 节点流程..')
266
217
  raise signals.Exit
267
218
 
219
+ context.signpost.setdefault("cursor", 0)
220
+ # 这是重试回来了
221
+ if context.signpost.pop('retry', False):
222
+ # 找到之前下载节点的位置
223
+ bookmark = context.signpost.get('bookmark', 0)
224
+ # 没有下载节点 / 下载节点就在第一个 -> cursor 指向最起点
225
+ if bookmark == 0:
226
+ context.signpost['cursor'] = 1
227
+
228
+ # 找到下载节点前面不是 Task 的节点, 但是如果是两个
229
+ else:
230
+ for i in range(bookmark - 1, -1, -1):
231
+ node = self.config.spider[i]
232
+ if not isinstance(node, Task):
233
+ context.signpost['cursor'] = i + 1
234
+ break
235
+ else:
236
+ context.signpost['cursor'] = bookmark
237
+
268
238
  while True:
269
239
  try:
270
- node: Union[Download, Task, Parse, Pipeline] = self.config.spider[context.signpost]
240
+ node: Union[Download, Task, Parse, Pipeline] = self.config.spider[context.signpost['cursor']]
271
241
  context.node = node
272
242
  except IndexError:
273
243
  context.flow({"next": None})
274
244
  raise signals.Switch
275
245
  else:
276
- context.signpost += 1
246
+ context.signpost['cursor'] += 1
277
247
 
278
248
  # 种子 -> Request
279
249
  if isinstance(node, Download):
250
+ # 记录下载节点的位置
251
+ context.signpost['bookmark'] = context.signpost['cursor'] - 1
280
252
  context.flow({"next": self.on_seeds})
281
253
  raise signals.Switch
282
254
 
@@ -116,6 +116,10 @@ def prepare(func, args=None, kwargs: dict = None, annotations: dict = None, name
116
116
  if name in kwargs:
117
117
  value = kwargs[name]
118
118
 
119
+ # 参数在 namespace 里面 -> 从 namespace 里面取
120
+ elif param.name in namespace:
121
+ value = namespace[param.name]
122
+
119
123
  # 参数类型存在于 annotations, 并且还可以从 args 里面取值, 并且刚好取到的对应的值也是当前类型 -> 直接从 args 里面取
120
124
  elif param.annotation in annotations and index < len(args) and type(args[index]) == param.annotation:
121
125
  value = args[index]
@@ -130,20 +134,16 @@ def prepare(func, args=None, kwargs: dict = None, annotations: dict = None, name
130
134
  value = args[index]
131
135
  index += 1
132
136
 
133
- elif param.name in namespace:
134
-
135
- value = namespace[param.name]
136
-
137
137
  elif param.default != inspect.Parameter.empty:
138
138
  continue
139
139
 
140
140
  # 没有传这个参数, 并且也没有可以备选的 annotations -> 报错
141
141
  else:
142
142
  raise TypeError(f"missing required argument: {name}, signature: {dict(parameters)}")
143
- if param.kind in [inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD]:
143
+ if param.kind in [inspect.Parameter.POSITIONAL_ONLY]:
144
144
  new_args.append(value)
145
145
 
146
- if param.kind == inspect.Parameter.KEYWORD_ONLY:
146
+ if param.kind in [inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD]:
147
147
  new_kwargs[name] = value
148
148
 
149
149
  return prepared(func=func, args=new_args, kwargs=new_kwargs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bricks-py
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: quickly build your crawler
5
5
  Author: Kem
6
6
  Author-email: 531144129@qq.com
@@ -85,27 +85,27 @@ if __name__ == '__main__':
85
85
  spider = MySpider(
86
86
  # # 设置代理模式 1, 该模式适用于: 你已经将代理提取至 Redis 的 proxy 里面
87
87
  # # 这样设置的话就会自动去取
88
- # proxy={
89
- # "ref": "bricks.lib.proxies.RedisProxy", # 指向 Redis
90
- # "key": "proxy", # 指向代理 Key
91
- # # 这个不写默认指向本地 Redis, 无密码的
92
- # "options": {
93
- # "host": "127.0.0.1",
94
- # "port": 6379,
95
- # "password": "xsxsxax"
96
- # },
97
- # "threshold": 100, # 一个代理最多使用多少次, 到这个次数之后就会归还到Redis, 然后重新拿, 默认不归还
98
- # "scheme": "socks5" # 代理协议, 默认是 http
99
- # }
100
-
101
- # 设置代理模式 2, 该模式适用于: 指向固定代理, 如 http://127.0.0.1:7890
102
- # 这样设置的话就会自动去取
103
88
  proxy={
104
- "ref": "bricks.lib.proxies.CustomProxy", # 指向 Redis
105
- "key": "127.0.0.1:7890", # 指向代理 Key
89
+ "ref": "bricks.lib.proxies.RedisProxy", # 指向 Redis
90
+ "key": "proxy", # 指向代理 Key
91
+ # 这个不写默认指向本地 Redis, 无密码的
92
+ "options": {
93
+ "host": "127.0.0.1",
94
+ "port": 6379,
95
+ # "password": "xsxsxax"
96
+ },
106
97
  "threshold": 100, # 一个代理最多使用多少次, 到这个次数之后就会归还到Redis, 然后重新拿, 默认不归还
107
- "scheme": "http" # 代理协议, 默认是 http
108
- },
98
+ "scheme": "socks5" # 代理协议, 默认是 http
99
+ }
100
+
101
+ # # 设置代理模式 2, 该模式适用于: 指向固定代理, 如 http://127.0.0.1:7890
102
+ # # 这样设置的话就会自动去取
103
+ # proxy={
104
+ # "ref": "bricks.lib.proxies.CustomProxy", # 指向 Redis
105
+ # "key": "127.0.0.1:7890", # 指向代理 Key
106
+ # "threshold": 100, # 一个代理最多使用多少次, 到这个次数之后就会归还到Redis, 然后重新拿, 默认不归还
107
+ # # "scheme": "http" # 代理协议, 默认是 http
108
+ # },
109
109
 
110
110
  # # 设置代理模式 3, 该模式适用于: 你有一个提取 api,访问就会获取代理
111
111
  # # 这样设置的话就会自动去取
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes