crawlo 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)
  1. crawlo/__init__.py +1 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/engine.py +9 -7
  4. crawlo/core/processor.py +1 -1
  5. crawlo/core/scheduler.py +32 -8
  6. crawlo/crawler.py +133 -18
  7. crawlo/downloader/playwright_downloader.py +161 -0
  8. crawlo/extension/log_stats.py +4 -4
  9. crawlo/filters/__init__.py +37 -0
  10. crawlo/filters/aioredis_filter.py +130 -0
  11. crawlo/filters/memory_filter.py +203 -0
  12. crawlo/filters/redis_filter.py +120 -0
  13. crawlo/items/__init__.py +40 -2
  14. crawlo/items/items.py +36 -5
  15. crawlo/middleware/retry.py +8 -2
  16. crawlo/network/request.py +215 -33
  17. crawlo/network/response.py +122 -53
  18. crawlo/pipelines/console_pipeline.py +28 -8
  19. crawlo/pipelines/mongo_pipeline.py +114 -2
  20. crawlo/pipelines/mysql_batch_pipline.py +134 -0
  21. crawlo/pipelines/mysql_pipeline.py +192 -2
  22. crawlo/pipelines/pipeline_manager.py +3 -3
  23. crawlo/settings/default_settings.py +51 -1
  24. crawlo/spider/__init__.py +2 -2
  25. crawlo/subscriber.py +90 -11
  26. crawlo/utils/concurrency_manager.py +125 -0
  27. crawlo/utils/date_tools.py +165 -8
  28. crawlo/utils/func_tools.py +74 -14
  29. crawlo/utils/pqueue.py +166 -8
  30. crawlo/utils/project.py +3 -2
  31. crawlo/utils/request.py +85 -0
  32. crawlo/utils/tools.py +303 -0
  33. crawlo/utils/url.py +40 -0
  34. {crawlo-1.0.0.dist-info → crawlo-1.0.2.dist-info}/METADATA +23 -11
  35. crawlo-1.0.2.dist-info/RECORD +68 -0
  36. crawlo-1.0.0.dist-info/RECORD +0 -59
  37. crawlo-1.0.0.dist-info/licenses/LICENSE +0 -23
  38. {crawlo-1.0.0.dist-info → crawlo-1.0.2.dist-info}/WHEEL +0 -0
  39. {crawlo-1.0.0.dist-info → crawlo-1.0.2.dist-info}/entry_points.txt +0 -0
  40. {crawlo-1.0.0.dist-info → crawlo-1.0.2.dist-info}/top_level.txt +0 -0
crawlo/utils/tools.py ADDED
@@ -0,0 +1,303 @@
+ import json
+ import re
+ from pprint import pformat
+ from datetime import date, time, datetime
+
+ from crawlo.utils.log import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def make_insert_sql(
+     table, data, auto_update=False, update_columns=(), insert_ignore=False
+ ):
+     """
+     @summary: For MySQL.
+     ---------
+     @param table:
+     @param data: row data as a dict
+     @param auto_update: use REPLACE INTO, fully overwriting any existing row
+     @param update_columns: columns to update on duplicate key conflict (defaults to all); when set, auto_update is ignored
+     @param insert_ignore: skip rows that already exist
+     ---------
+     @result:
+     """
+
+     keys = ["`{}`".format(key) for key in data.keys()]
+     keys = list2str(keys).replace("'", "")
+
+     values = [format_sql_value(value) for value in data.values()]
+     values = list2str(values)
+
+     if update_columns:
+         if not isinstance(update_columns, (tuple, list)):
+             update_columns = [update_columns]
+         update_columns_ = ", ".join(
+             ["{key}=values({key})".format(key=key) for key in update_columns]
+         )
+         sql = (
+             "insert%s into `{table}` {keys} values {values} on duplicate key update %s"
+             % (" ignore" if insert_ignore else "", update_columns_)
+         )
+
+     elif auto_update:
+         sql = "replace into `{table}` {keys} values {values}"
+     else:
+         sql = "insert%s into `{table}` {keys} values {values}" % (
+             " ignore" if insert_ignore else ""
+         )
+
+     sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
+     return sql
+
+
+ def make_update_sql(table, data, condition):
+     """
+     @summary: For MySQL; Oracle date/time values would need to_date handling (TODO).
+     ---------
+     @param table:
+     @param data: row data as a dict
+     @param condition: WHERE clause
+     ---------
+     @result:
+     """
+     key_values = []
+
+     for key, value in data.items():
+         value = format_sql_value(value)
+         if isinstance(value, str):
+             key_values.append("`{}`={}".format(key, repr(value)))
+         elif value is None:
+             key_values.append("`{}`={}".format(key, "null"))
+         else:
+             key_values.append("`{}`={}".format(key, value))
+
+     key_values = ", ".join(key_values)
+
+     sql = "update `{table}` set {key_values} where {condition}"
+     sql = sql.format(table=table, key_values=key_values, condition=condition)
+     return sql
+
+
+ def make_batch_sql(
+     table, datas, auto_update=False, update_columns=(), update_columns_value=()
+ ):
+     """
+     @summary: Build a batched SQL statement.
+     ---------
+     @param table:
+     @param datas: row data, [{...}]
+     @param auto_update: use REPLACE INTO, fully overwriting any existing rows
+     @param update_columns: columns to update on duplicate key conflict (defaults to all); when set, auto_update is ignored
+     @param update_columns_value: values for the updated columns, defaulting to the corresponding values in datas; string literals must carry their own quotes, e.g. update_columns_value=("'test'",)
+     ---------
+     @result:
+     """
+     if not datas:
+         return
+
+     keys = list(set([key for data in datas for key in data]))
+     values = []
+     for data in datas:
+         # Skip anything that is not a dict and move on to the next row.
+         if not isinstance(data, dict):
+             # logger.error(f"Expected a dict but got: {data} (type: {type(data)})")
+             continue
+
+         value = []
+         for key in keys:
+             current_data = data.get(key)
+             try:
+                 # Normalise the value for parameter binding.
+                 current_data = format_sql_value(current_data)
+                 value.append(current_data)
+             except Exception as e:
+                 # Log the offending key/value if formatting fails.
+                 logger.error(f"{key}: {current_data} (type: {type(current_data)}) -> {e}")
+
+         values.append(value)
+     keys_str = ", ".join(["`{}`".format(key) for key in keys])
+     placeholders_str = ", ".join(["%s"] * len(keys))
+
+     if update_columns:
+         if not isinstance(update_columns, (tuple, list)):
+             update_columns = [update_columns]
+         if update_columns_value:
+             update_columns_ = ", ".join(
+                 [
+                     "`{key}`={value}".format(key=key, value=value)
+                     for key, value in zip(update_columns, update_columns_value)
+                 ]
+             )
+         else:
+             # Use the VALUES() function to reference the freshly inserted values.
+             update_columns_ = ", ".join(
+                 ["`{key}`=VALUES(`{key}`)".format(key=key) for key in update_columns]
+             )
+
+         sql = f"INSERT INTO `{table}` ({keys_str}) VALUES ({placeholders_str}) ON DUPLICATE KEY UPDATE {update_columns_}"
+     elif auto_update:
+         sql = "REPLACE INTO `{table}` ({keys}) VALUES ({values_placeholder})".format(
+             table=table, keys=keys_str, values_placeholder=placeholders_str
+         )
+     else:
+         sql = "INSERT IGNORE INTO `{table}` ({keys}) VALUES ({values_placeholder})".format(
+             table=table, keys=keys_str, values_placeholder=placeholders_str
+         )
+     return sql, values
+
+
+ def format_sql_value(value):
+     """
+     Format a value for use in SQL.
+     """
+     if value is None:
+         return None  # maps to NULL
+
+     # Strings are trimmed of surrounding whitespace.
+     if isinstance(value, str):
+         return value.strip()
+
+     # Lists and tuples are serialised to JSON strings.
+     elif isinstance(value, (list, tuple)):
+         try:
+             return dumps_json(value)
+         except Exception as e:
+             raise ValueError(f"Failed to serialize list/tuple to JSON: {value}, error: {e}")
+
+     # Dicts are serialised to JSON strings as well.
+     elif isinstance(value, dict):
+         try:
+             return dumps_json(value)
+         except Exception as e:
+             raise ValueError(f"Failed to serialize dict to JSON: {value}, error: {e}")
+
+     # Booleans become integers (checked before int, since bool subclasses int).
+     elif isinstance(value, bool):
+         return int(value)
+
+     # Numeric types pass through unchanged.
+     elif isinstance(value, (int, float)):
+         return value
+
+     # Dates and times are stored via their string representation.
+     elif isinstance(value, (date, time, datetime)):
+         return str(value)
+
+     # Anything else is unsupported.
+     else:
+         raise TypeError(f"Unsupported value type: {type(value)}, value: {value}")
+
+
+ def list2str(datas):
+     """
+     Render a list as a SQL tuple literal.
+     :param datas: [1, 2]
+     :return: (1, 2)
+     """
+     data_str = str(tuple(datas))
+     data_str = re.sub(r",\)$", ")", data_str)
+     return data_str
+
+
+ _REGEXPS = {}
+
+
+ def get_info(html, regexps, allow_repeat=True, fetch_one=False, split=None):
+     regexps = isinstance(regexps, str) and [regexps] or regexps
+
+     infos = []
+     for regex in regexps:
+         if regex == "":
+             continue
+
+         # Compile each pattern once and cache it.
+         if regex not in _REGEXPS:
+             _REGEXPS[regex] = re.compile(regex, re.S)
+
+         if fetch_one:
+             infos = _REGEXPS[regex].search(html)
+             if infos:
+                 infos = infos.groups()
+             else:
+                 continue
+         else:
+             infos = _REGEXPS[regex].findall(str(html))
+
+         if len(infos) > 0:
+             break
+
+     if fetch_one:
+         infos = infos if infos else ("",)
+         return infos if len(infos) > 1 else infos[0]
+     else:
+         infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
+         infos = split.join(infos) if split else infos
+         return infos
+
+
+ def get_json(json_str):
+     """
+     @summary: Parse a JSON object.
+     ---------
+     @param json_str: a JSON-formatted string
+     ---------
+     @result: the parsed JSON object
+     """
+     try:
+         return json.loads(json_str) if json_str else {}
+     except Exception as e1:
+         try:
+             # Fall back to repairing common problems: single quotes and unquoted keys.
+             json_str = json_str.strip()
+             json_str = json_str.replace("'", '"')
+             keys = get_info(json_str, r"(\w+):")
+             for key in keys:
+                 json_str = json_str.replace(key, '"%s"' % key)
+
+             return json.loads(json_str) if json_str else {}
+
+         except Exception as e2:
+             logger.error(
+                 """
+                 e1: %s
+                 format json_str: %s
+                 e2: %s
+                 """
+                 % (e1, json_str, e2)
+             )
+
+         return {}
+
+
+ def dumps_json(data, indent=4, sort_keys=False):
+     """
+     @summary: Pretty-print JSON, e.g. for logging.
+     ---------
+     @param data: a JSON-formatted string or a JSON-serialisable object
+     @param indent:
+     @param sort_keys:
+     ---------
+     @result: the formatted string
+     """
+     try:
+         if isinstance(data, str):
+             data = get_json(data)
+
+         data = json.dumps(
+             data,
+             ensure_ascii=False,
+             indent=indent,
+             skipkeys=True,
+             sort_keys=sort_keys,
+             default=str,
+         )
+
+     except Exception:
+         # Fall back to pprint for objects json cannot serialise.
+         data = pformat(data)
+
+     return data
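To make the new helpers concrete, here is a minimal usage sketch (not part of the diff). The `user` table and its columns are made up for illustration, and the expected-output comments are inferred from the string templates above rather than from running the released wheel:

    from crawlo.utils.tools import make_insert_sql, make_update_sql, make_batch_sql

    row = {"id": 1, "name": "alice", "active": True}

    # Single-row insert: values are inlined into the statement, booleans become 0/1.
    print(make_insert_sql("user", row))
    # insert into `user` (`id`, `name`, `active`) values (1, 'alice', 1)

    # Update with an explicit WHERE clause.
    print(make_update_sql("user", {"name": "bob"}, "id=1"))
    # update `user` set `name`='bob' where id=1

    # Batch insert: returns a parameterised statement plus the value rows,
    # suitable for cursor.executemany(). Column order comes from a set, so it
    # may vary between runs.
    sql, values = make_batch_sql("user", [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])
    print(sql)     # INSERT IGNORE INTO `user` (`id`, `name`) VALUES (%s, %s)
    print(values)  # [[1, 'a'], [2, 'b']]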
crawlo/utils/url.py ADDED
@@ -0,0 +1,40 @@
+ from urllib.parse import urldefrag
+ from w3lib.url import add_or_replace_parameter
+
+
+ def escape_ajax(url: str) -> str:
+     """
+     Convert a URL per Google's AJAX crawling scheme (handling #! fragments):
+     https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+
+     Rules:
+     1. Only URLs containing `#!` are converted (marking the page as AJAX-crawlable)
+     2. `#!key=value` becomes `?_escaped_fragment_=key%3Dvalue`
+     3. Existing query parameters are preserved
+
+     Examples:
+     >>> escape_ajax("www.example.com/ajax.html#!key=value")
+     'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
+     >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
+     'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
+     >>> escape_ajax("www.example.com/ajax.html#!")
+     'www.example.com/ajax.html?_escaped_fragment_='
+
+     URLs that are not AJAX-crawlable (no #!) are returned unchanged:
+     >>> escape_ajax("www.example.com/ajax.html#normal")
+     'www.example.com/ajax.html#normal'
+     """
+     # Split the URL into its base part and the fragment.
+     de_frag, frag = urldefrag(url)
+
+     # Only fragments starting with "!" are handled (per the Google scheme).
+     if not frag.startswith("!"):
+         return url  # anything else is returned as-is
+
+     # Delegate to w3lib to append the `_escaped_fragment_` parameter.
+     return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])
+
+
+ if __name__ == '__main__':
+     f = escape_ajax('http://example.com/page#!')
+     print(f)
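A short sketch of how escape_ajax behaves on a few URLs (hypothetical caller code, not part of the diff); the expected outputs follow from urldefrag plus w3lib's add_or_replace_parameter:

    from crawlo.utils.url import escape_ajax

    print(escape_ajax("http://example.com/app#!page=2"))
    # http://example.com/app?_escaped_fragment_=page%3D2

    print(escape_ajax("http://example.com/app?k=v#!page=2"))
    # http://example.com/app?k=v&_escaped_fragment_=page%3D2

    print(escape_ajax("http://example.com/app#section"))
    # http://example.com/app#section  (plain fragment, returned unchanged)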
{crawlo-1.0.0.dist-info → crawlo-1.0.2.dist-info}/METADATA
@@ -1,7 +1,7 @@
Metadata-Version: 2.4
Name: crawlo
- Version: 1.0.0
- Summary: feapder is an async-capable Python crawler framework
+ Version: 1.0.2
+ Summary: Crawlo is a high-performance Python crawler framework built on async IO, with support for distributed crawling.
Home-page: https://github.com/crawl-coder/Crawlo.git
Author: crawl-coder
Author-email: crawlo@qq.com
@@ -11,13 +11,26 @@ Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: aiohttp>=3.12.6
- Requires-Dist: httpx>=0.28.1
- Requires-Dist: DBUtils>=2.0
- Requires-Dist: parsel>=1.10.0
- Requires-Dist: pymysql>=1.1.1
- Requires-Dist: ujson>=5.10.0
+ Requires-Dist: aiohttp>=3.12.14
+ Requires-Dist: aiomysql>=0.2.0
+ Requires-Dist: aioredis>=2.0.1
+ Requires-Dist: asyncmy>=0.2.10
+ Requires-Dist: cssselect>=1.2.0
+ Requires-Dist: dateparser>=1.2.2
+ Requires-Dist: httpx>=0.27.0
+ Requires-Dist: lxml>=5.2.1
+ Requires-Dist: motor>=3.7.0
+ Requires-Dist: parsel>=1.9.1
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pymongo>=4.11
+ Requires-Dist: PyMySQL>=1.1.1
+ Requires-Dist: python-dateutil>=2.9.0.post0
+ Requires-Dist: redis>=6.2.0
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: six>=1.17.0
+ Requires-Dist: ujson>=5.9.0
+ Requires-Dist: urllib3>=2.5.0
+ Requires-Dist: w3lib>=2.1.2
Provides-Extra: render
Requires-Dist: webdriver-manager>=4.0.0; extra == "render"
Requires-Dist: playwright; extra == "render"
@@ -30,7 +43,6 @@ Requires-Dist: redis-py-cluster>=2.1.0; extra == "all"
Requires-Dist: webdriver-manager>=4.0.0; extra == "all"
Requires-Dist: playwright; extra == "all"
Requires-Dist: selenium>=3.141.0; extra == "all"
- Dynamic: license-file

# Crawlo
- An asynchronous general-purpose crawler framework
+ Crawlo is a high-performance Python crawler framework built on async IO, with support for distributed crawling and data pipelines.
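The dependency set grows considerably between 1.0.0 and 1.0.2 (Redis, MongoDB, and async MySQL drivers, among others). A quick way to confirm what an installed copy actually declares, using only the standard library (this assumes crawlo is installed in the current environment):

    from importlib.metadata import version, requires

    print(version("crawlo"))  # expected: 1.0.2

    # Core and extra requirements as declared in METADATA.
    for req in requires("crawlo") or []:
        print(req)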
crawlo-1.0.2.dist-info/RECORD ADDED
@@ -0,0 +1,68 @@
+ crawlo/__init__.py,sha256=BoRtaB19VFlByP3JKzXQbmg4Jb6i6yVnpTR3jCSrCig,208
+ crawlo/__version__.py,sha256=FlR3yFykEvKzEITpEQq_qx6Uq29lYlhxcnSHeRP1LgI,23
+ crawlo/crawler.py,sha256=XLiDyFyoiJr8BzhLbCB15zbccVHQiiLGDM1zFUkdrAI,8544
+ crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
+ crawlo/exceptions.py,sha256=trxM2c0jw50QsGSoFAKC2RrKpapOFHQDq0wQuLWqmKE,980
+ crawlo/stats_collector.py,sha256=jhAW8k0SzjqelkpiWpfGmMw2DBkgTjpwnObqTNDOp6A,1286
+ crawlo/subscriber.py,sha256=udlHeTR0ymGQhCDxVUGwUzeeeR4TYCEJrJwFnkgr0cU,3836
+ crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
+ crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
+ crawlo/core/engine.py,sha256=JFHooPp-5cfHSyxEh87nOOR5NMaPLVDfNSqAsbtx4PM,6030
+ crawlo/core/processor.py,sha256=oHLs-cno0bJGTNc9NGD2S7_2-grI3ruvggO0SY2mf3Q,1180
+ crawlo/core/scheduler.py,sha256=ZwPoU_QRjs9wwrxdt-MGPwsSmKhvvhgmcnBllDrXnhg,2014
+ crawlo/downloader/__init__.py,sha256=72u2Hef4HaMfs9VCqEjbMtiaRXbaXmgNiJn6qy09LHs,2384
+ crawlo/downloader/aiohttp_downloader.py,sha256=4C2BDloKzwss16kfD7tH0WPugPbSSFxl-5-_DLWB0vM,3676
+ crawlo/downloader/httpx_downloader.py,sha256=ra6Ae_lv8pNyvLzPQYBgTNuBdMVBYi86kNt2OdZlcSo,1704
+ crawlo/downloader/playwright_downloader.py,sha256=mEGlSd6A6sN0Wyq-TDkownIElOgxnwVfY3rS5wtLoYY,6726
+ crawlo/extension/__init__.py,sha256=O2BVK1U3WwmurZb-PaYVz3g1tZ_iYUjCwilmUKf6844,1170
+ crawlo/extension/log_interval.py,sha256=FOWeTOuWtOpCz2UPV5F_--QIa8yomltSpjxbw3F7bkU,1971
+ crawlo/extension/log_stats.py,sha256=-V7ipdIfYMQdp1ZDc4kvNEAIHIR74U1ZHV5FhlLyGGU,1786
+ crawlo/filters/__init__.py,sha256=9fJQRVkxWWPChajYbAGe1O6UYB639xWt0hiLUGBs4hQ,1014
+ crawlo/filters/aioredis_filter.py,sha256=H_HAFfE9mHnPrzQcjyXXCseQ77iN4nT9a3lukHiu8M8,4874
+ crawlo/filters/memory_filter.py,sha256=pk2o0kbX0zrGLJ6poKhSa-cfOmsp68fA65hXEGQCQ5M,6895
+ crawlo/filters/redis_filter.py,sha256=m1nRsf_3slnWSb80RtTEURi5kwjdI0xotoFxnscx974,4211
+ crawlo/items/__init__.py,sha256=o5BSpS1Byivr-bpdfFgc9GCoGi8ThNuPJiTW7lz85-I,2125
+ crawlo/items/items.py,sha256=myOOjWaSByKW0r8SxIQ0bxS3PXjUDE1c-Pe38z6fSF8,4108
+ crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
+ crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
+ crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
+ crawlo/middleware/middleware_manager.py,sha256=T4axTY89Z0BOwaWDWcUTABeDNTvyPFiyrbwj-H4sbSA,6629
+ crawlo/middleware/request_ignore.py,sha256=jdybWFVXuA5YsAPfZJFzLTWkYhEAewNgxuhFqczPW9M,1027
+ crawlo/middleware/response_code.py,sha256=vgXWv3mMu_v9URvhKA9myIFH4u6L4EwNme80wL4DCGc,677
+ crawlo/middleware/response_filter.py,sha256=O2gkV_Yjart8kmmXTGzrtZnb_Uuefap4uL2Cu01iRs4,863
+ crawlo/middleware/retry.py,sha256=9SnE7l3Nhh143AqCiL0gfE6dl_gF1Kr6CjoNxvMH_Ps,3465
+ crawlo/network/__init__.py,sha256=DVz1JpasjxCgOlXvm76gz-S18OXr4emG_J39yi5iVuA,130
+ crawlo/network/request.py,sha256=yUAL6oecm1TniD9dsBmRedEaEHzh3rtr_6p3dMK2EfQ,8195
+ crawlo/network/response.py,sha256=amnLEExKq11dLh6m_YdSqdKLl0srh1eeY5uPz15fwyo,6055
+ crawlo/pipelines/__init__.py,sha256=IbXJ6B8LqxVVjeLNgL_12AxV6zbV8hNRQxAfMLjjSaw,273
+ crawlo/pipelines/console_pipeline.py,sha256=bwe5hZgaVSWmh3R8XpOaaeAjJme-Ttrpo6G6f1cnLIg,1287
+ crawlo/pipelines/mongo_pipeline.py,sha256=Yr48D0T61-_Y-EpgWXf7BUn9w8e-Pj5P07QDSPZ0pYU,4558
+ crawlo/pipelines/mysql_batch_pipline.py,sha256=7KXd0IUV0h3IViD8R0iruyWv5XdZR1pANB8EY9z6iMI,5022
+ crawlo/pipelines/mysql_pipeline.py,sha256=TzyaBg1oBj9pgzuVUrb5VVtkiwZH6gqP_6IYM2QCkLs,8052
+ crawlo/pipelines/pipeline_manager.py,sha256=k-Rg0os0Havrov99D-Jn3ROpnz154K30tf7aARE5W3k,2174
+ crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
+ crawlo/settings/default_settings.py,sha256=JS1QKYe7jkdFlOjqZ-eOeOcVS3AXCZynoNH95GuEnds,2556
+ crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
+ crawlo/spider/__init__.py,sha256=1tmKkr2-oJi0w9r2ho9nn6Z_VDn18pjXHXU0Hv2eheY,941
+ crawlo/templates/item_template.tmpl,sha256=0bGFnlwJRqstxMNEj1H_pEICybwoueRhs31QaDPXrS0,372
+ crawlo/templates/spider_template.tmpl,sha256=JzphuA87Yl_F1xR9zOIi_ZSazyT8eSNPxYYPMv3Uiko,835
+ crawlo/templates/project_template/main.py,sha256=BcCP294ycCPsHi_AMN7OAJtcrLvQdf91meH93PqbQgs,626
+ crawlo/templates/project_template/setting.py,sha256=Ce4nMbrdhL1ioRdTcB0vV_vK_50cfnwVqSvt49QsNkA,9395
+ crawlo/templates/project_template/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ crawlo/templates/project_template/spiders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ crawlo/utils/__init__.py,sha256=XCYumI8wJ1jU_Myn_K0LT-LVygPDUCdETCbXM3EWvlo,130
+ crawlo/utils/concurrency_manager.py,sha256=o-_cfeUHdlBOM3eAXF857MtekSrRcVTBJ2jWZvY6weQ,5230
+ crawlo/utils/date_tools.py,sha256=Y7pnGNn1-5vkiHtydAgmQ-qR3pSO30k5WEYigOPifPQ,5496
+ crawlo/utils/func_tools.py,sha256=y-TYP9H3X67MS_foWy9Z2LIS6GP7Y4Cy3T168ulq3Jc,2451
+ crawlo/utils/log.py,sha256=LU0J3boPCL-Kynx3wR_CAryRgScNmPPn4pBitLrrsX4,1028
+ crawlo/utils/pqueue.py,sha256=HDgX4HAkc7RqYUtX6q51tzI1ZRTACf8P_4jLqC4-uC0,5559
+ crawlo/utils/project.py,sha256=FfBaMfxcau4yL59O-DfD7FAii8k6gXWQmQ1YU6aaUCE,1544
+ crawlo/utils/request.py,sha256=BEBtxwejvQw5euEiSclHCbqkNcBwUMY7KPGGkvj8BjE,2936
+ crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
+ crawlo/utils/tools.py,sha256=U7xGKgXe2PmMyvNxZ1vlJYcv4Ei1WhIBMf8XcJZ7XCY,9764
+ crawlo/utils/url.py,sha256=RKe_iqdjafsNcp-P2GVLYpsL1qbxiuZLiFc-SqOQkcs,1521
+ crawlo-1.0.2.dist-info/METADATA,sha256=kzNv4kckIbSZCADon5g2Ik5QJhubNxWra0-aXX6bMug,1784
+ crawlo-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ crawlo-1.0.2.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
+ crawlo-1.0.2.dist-info/top_level.txt,sha256=Dwuv-Y1aGSJD3mjFrCdNGQ8EHroMj7RgVcxDdcczx4k,7
+ crawlo-1.0.2.dist-info/RECORD,,
crawlo-1.0.0.dist-info/RECORD REMOVED
@@ -1,59 +0,0 @@
- crawlo/__init__.py,sha256=yzg6DkuCNkz27Tn5YDdaboZ1Ah96FwV6eGNEDO48lxA,170
- crawlo/__version__.py,sha256=j4YpqEiBQsl3obE5LH55xbuejIMjUwUNp-k4JXOjEFU,23
- crawlo/crawler.py,sha256=lIx3-_TKXdCDhpfw4lYcizpyoclapjAHfGchBJ9DmTU,3679
- crawlo/event.py,sha256=ZhoPW5CglCEuZNFEwviSCBIw0pT5O6jT98bqYrDFd3E,324
- crawlo/exceptions.py,sha256=trxM2c0jw50QsGSoFAKC2RrKpapOFHQDq0wQuLWqmKE,980
- crawlo/stats_collector.py,sha256=jhAW8k0SzjqelkpiWpfGmMw2DBkgTjpwnObqTNDOp6A,1286
- crawlo/subscriber.py,sha256=4stxeXqNK6RB7oqo0wKJdUw2Ym3b6UBMhZ4pRrjSMEU,1001
- crawlo/task_manager.py,sha256=D9m-nqnGj-FZPtGk4CdwZX3Gw7IWyYvTS7CHpRGWc_w,748
- crawlo/core/__init__.py,sha256=JYSAn15r8yWgRK_Nc69t_8tZCyb70MiPZKssA8wrYz0,43
- crawlo/core/engine.py,sha256=mQfzY4Bm0Ysb5bPjoH3lMVFbn9WJuQ1UjBTReuOljzQ,5873
- crawlo/core/processor.py,sha256=5RQPfffE8AnKnuajL9J2oc7QAeoSjATxwdxRU3_EoKM,1183
- crawlo/core/scheduler.py,sha256=2z8qzZmj6zn9w5pE1yux6Vi7fDMq8Cto6n9Wyyo_DP8,1074
- crawlo/downloader/__init__.py,sha256=72u2Hef4HaMfs9VCqEjbMtiaRXbaXmgNiJn6qy09LHs,2384
- crawlo/downloader/aiohttp_downloader.py,sha256=4C2BDloKzwss16kfD7tH0WPugPbSSFxl-5-_DLWB0vM,3676
- crawlo/downloader/httpx_downloader.py,sha256=ra6Ae_lv8pNyvLzPQYBgTNuBdMVBYi86kNt2OdZlcSo,1704
- crawlo/extension/__init__.py,sha256=O2BVK1U3WwmurZb-PaYVz3g1tZ_iYUjCwilmUKf6844,1170
- crawlo/extension/log_interval.py,sha256=FOWeTOuWtOpCz2UPV5F_--QIa8yomltSpjxbw3F7bkU,1971
- crawlo/extension/log_stats.py,sha256=ZIIB6WKvYSxDIw7Mr52QkOSiX-tMGWU8Ifpd1EXiTUY,1677
- crawlo/items/__init__.py,sha256=Ir04T486XuJJ-OCItxqO-78Jyt7B8JGLBsrgCO605PU,535
- crawlo/items/items.py,sha256=7w0uojKu1GzY5SDfrlJtAhqVv6alHQjwkBkthSoJ-04,2856
- crawlo/middleware/__init__.py,sha256=PSwpRLdBUopaQzBp1S0zK_TZbrRagQ4yzvgyLy4tBk8,570
- crawlo/middleware/default_header.py,sha256=OVW4vpRPp3Y6qYXtiEYlGqVjCYcbuv1Iecc7zEgwCsI,1099
- crawlo/middleware/download_delay.py,sha256=P2eyAJXwdLdC4yYuLhvKZVa1b5YQvQD0GpsR8aDW8-8,994
- crawlo/middleware/middleware_manager.py,sha256=T4axTY89Z0BOwaWDWcUTABeDNTvyPFiyrbwj-H4sbSA,6629
- crawlo/middleware/request_ignore.py,sha256=jdybWFVXuA5YsAPfZJFzLTWkYhEAewNgxuhFqczPW9M,1027
- crawlo/middleware/response_code.py,sha256=vgXWv3mMu_v9URvhKA9myIFH4u6L4EwNme80wL4DCGc,677
- crawlo/middleware/response_filter.py,sha256=O2gkV_Yjart8kmmXTGzrtZnb_Uuefap4uL2Cu01iRs4,863
- crawlo/middleware/retry.py,sha256=sCRO5uh9cG8Ui3PODJEg_6PlxDD_lAoopaF2P7u6bUo,3148
- crawlo/network/__init__.py,sha256=DVz1JpasjxCgOlXvm76gz-S18OXr4emG_J39yi5iVuA,130
- crawlo/network/request.py,sha256=YokEk6fEBQF2ii-Oe18ZzVc262Ih6KcUZPLOpxcs6Mg,1474
- crawlo/network/response.py,sha256=QikuOCgxS5yMh4Uh0QuMpqWfyR78vossCs-Va-sQ2YE,2993
- crawlo/pipelines/__init__.py,sha256=IbXJ6B8LqxVVjeLNgL_12AxV6zbV8hNRQxAfMLjjSaw,273
- crawlo/pipelines/console_pipeline.py,sha256=1zorg1xrIiCctiTkd38jm7Q-miKMnRFnJBjv7Tg0gyo,537
- crawlo/pipelines/mongo_pipeline.py,sha256=DfXD10Ee-3q685EdS6du3Nn-f0EaBKJgdgsVlpcYhd8,67
- crawlo/pipelines/mysql_pipeline.py,sha256=QAZToJAmy3xyTLOh7DkwCfxMDWQTLiTOLdfvZhkMP6E,67
- crawlo/pipelines/pipeline_manager.py,sha256=FvpMuHwB5DWGzPsy1mZFdcyxzPN2mVL2aivUWlzrHx4,2183
- crawlo/settings/__init__.py,sha256=NgYFLfk_Bw7h6KSoepJn_lMBSqVbCHebjKxaE3_eMgw,130
- crawlo/settings/default_settings.py,sha256=4J8Kzc3dk09ogy5_FeaGA1KHUzHPMYg9LFIVFIrhuXY,979
- crawlo/settings/setting_manager.py,sha256=4xXOzKwZCgAp8ybwvVcs2R--CsOD7c6dBIkj6DJHB3c,2998
- crawlo/spider/__init__.py,sha256=9haJYfxX4Gx2iV3KK5Fry2fO9R-gEmAkR_pQjZTIErE,905
- crawlo/templates/item_template.tmpl,sha256=0bGFnlwJRqstxMNEj1H_pEICybwoueRhs31QaDPXrS0,372
- crawlo/templates/spider_template.tmpl,sha256=JzphuA87Yl_F1xR9zOIi_ZSazyT8eSNPxYYPMv3Uiko,835
- crawlo/templates/project_template/main.py,sha256=BcCP294ycCPsHi_AMN7OAJtcrLvQdf91meH93PqbQgs,626
- crawlo/templates/project_template/setting.py,sha256=Ce4nMbrdhL1ioRdTcB0vV_vK_50cfnwVqSvt49QsNkA,9395
- crawlo/templates/project_template/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- crawlo/templates/project_template/spiders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- crawlo/utils/__init__.py,sha256=XCYumI8wJ1jU_Myn_K0LT-LVygPDUCdETCbXM3EWvlo,130
- crawlo/utils/date_tools.py,sha256=bW3j9LxxuAUwykzuzaOYR3iY8U0g7wBPZL1ELf4lXVo,482
- crawlo/utils/func_tools.py,sha256=sWnoa2SDAMK2NBdoIO2o3Ew6SqeJXcSNEzmWZUJcE6o,616
- crawlo/utils/log.py,sha256=LU0J3boPCL-Kynx3wR_CAryRgScNmPPn4pBitLrrsX4,1028
- crawlo/utils/pqueue.py,sha256=W1n23t9roVvDB84jQ4j7sk_Z4FoD8q3LbRRbH4B6Ej4,446
- crawlo/utils/project.py,sha256=JHMV7SLDWMaPCwuwVY9KQqvu2VT1ZlXEQvpIzzNaSeo,1506
- crawlo/utils/system.py,sha256=24zGmtHNhDFMGVo7ftMV-Pqg6_5d63zsyNey9udvJJk,248
- crawlo-1.0.0.dist-info/licenses/LICENSE,sha256=f96mrub4oLJnuMcNwMjHWMksci5brQ2a2Fu0R7YEgnk,1125
- crawlo-1.0.0.dist-info/METADATA,sha256=wS6yr3avxrdzm4x4WPN7fBKgO5jobpQ8UpvrCUC1aQw,1266
- crawlo-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- crawlo-1.0.0.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
- crawlo-1.0.0.dist-info/top_level.txt,sha256=Dwuv-Y1aGSJD3mjFrCdNGQ8EHroMj7RgVcxDdcczx4k,7
- crawlo-1.0.0.dist-info/RECORD,,
crawlo-1.0.0.dist-info/licenses/LICENSE REMOVED
@@ -1,23 +0,0 @@
- MIT License
-
- Modifications:
-
- Copyright (c) 2020 crawl-coder <2251018029@qq.com>
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
{crawlo-1.0.0.dist-info → crawlo-1.0.2.dist-info}/WHEEL, entry_points.txt, top_level.txt: contents unchanged (dist-info directory renamed)