crawlo 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +9 -6
- crawlo/__version__.py +1 -2
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -59
- crawlo/crawler.py +242 -222
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +259 -96
- crawlo/downloader/httpx_downloader.py +187 -48
- crawlo/downloader/playwright_downloader.py +160 -160
- crawlo/event.py +11 -11
- crawlo/exceptions.py +64 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +157 -129
- crawlo/filters/memory_filter.py +202 -203
- crawlo/filters/redis_filter.py +119 -119
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +118 -118
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +140 -140
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +204 -233
- crawlo/network/response.py +166 -162
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +133 -133
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +93 -89
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +36 -36
- crawlo/stats_collector.py +59 -47
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +177 -177
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +39 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +122 -85
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +302 -302
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.2.dist-info → crawlo-1.0.3.dist-info}/METADATA +48 -48
- crawlo-1.0.3.dist-info/RECORD +80 -0
- {crawlo-1.0.2.dist-info → crawlo-1.0.3.dist-info}/top_level.txt +1 -0
- tests/__init__.py +7 -0
- tests/baidu_spider/__init__.py +7 -0
- tests/baidu_spider/demo.py +94 -0
- tests/baidu_spider/items.py +25 -0
- tests/baidu_spider/middleware.py +49 -0
- tests/baidu_spider/pipeline.py +55 -0
- tests/baidu_spider/request_fingerprints.txt +9 -0
- tests/baidu_spider/run.py +27 -0
- tests/baidu_spider/settings.py +78 -0
- tests/baidu_spider/spiders/__init__.py +7 -0
- tests/baidu_spider/spiders/bai_du.py +61 -0
- tests/baidu_spider/spiders/sina.py +79 -0
- crawlo-1.0.2.dist-info/RECORD +0 -68
- {crawlo-1.0.2.dist-info → crawlo-1.0.3.dist-info}/WHEEL +0 -0
- {crawlo-1.0.2.dist-info → crawlo-1.0.3.dist-info}/entry_points.txt +0 -0
crawlo/utils/tools.py
CHANGED
|
@@ -1,303 +1,303 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import re
|
|
3
|
-
from pprint import pformat
|
|
4
|
-
from datetime import date, time, datetime
|
|
5
|
-
|
|
6
|
-
from crawlo.utils.log import get_logger
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
logger = get_logger(__name__)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def make_insert_sql(
|
|
13
|
-
table, data, auto_update=False, update_columns=(), insert_ignore=False
|
|
14
|
-
):
|
|
15
|
-
"""
|
|
16
|
-
@summary: 适用于mysql
|
|
17
|
-
---------
|
|
18
|
-
@param table:
|
|
19
|
-
@param data: 表数据 json格式
|
|
20
|
-
@param auto_update: 使用的是replace into, 为完全覆盖已存在的数据
|
|
21
|
-
@param update_columns: 需要更新的列 默认全部,当指定值时,auto_update设置无效,当duplicate key冲突时更新指定的列
|
|
22
|
-
@param insert_ignore: 数据存在忽略
|
|
23
|
-
---------
|
|
24
|
-
@result:
|
|
25
|
-
"""
|
|
26
|
-
|
|
27
|
-
keys = ["`{}`".format(key) for key in data.keys()]
|
|
28
|
-
keys = list2str(keys).replace("'", "")
|
|
29
|
-
|
|
30
|
-
values = [format_sql_value(value) for value in data.values()]
|
|
31
|
-
values = list2str(values)
|
|
32
|
-
|
|
33
|
-
if update_columns:
|
|
34
|
-
if not isinstance(update_columns, (tuple, list)):
|
|
35
|
-
update_columns = [update_columns]
|
|
36
|
-
update_columns_ = ", ".join(
|
|
37
|
-
["{key}=values({key})".format(key=key) for key in update_columns]
|
|
38
|
-
)
|
|
39
|
-
sql = (
|
|
40
|
-
"insert%s into `{table}` {keys} values {values} on duplicate key update %s"
|
|
41
|
-
% (" ignore" if insert_ignore else "", update_columns_)
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
elif auto_update:
|
|
45
|
-
sql = "replace into `{table}` {keys} values {values}"
|
|
46
|
-
else:
|
|
47
|
-
sql = "insert%s into `{table}` {keys} values {values}" % (
|
|
48
|
-
" ignore" if insert_ignore else ""
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
|
|
52
|
-
return sql
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def make_update_sql(table, data, condition):
|
|
56
|
-
"""
|
|
57
|
-
@summary: 适用于mysql, oracle数据库时间需要to_date 处理(TODO)
|
|
58
|
-
---------
|
|
59
|
-
@param table:
|
|
60
|
-
@param data: 表数据 json格式
|
|
61
|
-
@param condition: where 条件
|
|
62
|
-
---------
|
|
63
|
-
@result:
|
|
64
|
-
"""
|
|
65
|
-
key_values = []
|
|
66
|
-
|
|
67
|
-
for key, value in data.items():
|
|
68
|
-
value = format_sql_value(value)
|
|
69
|
-
if isinstance(value, str):
|
|
70
|
-
key_values.append("`{}`={}".format(key, repr(value)))
|
|
71
|
-
elif value is None:
|
|
72
|
-
key_values.append("`{}`={}".format(key, "null"))
|
|
73
|
-
else:
|
|
74
|
-
key_values.append("`{}`={}".format(key, value))
|
|
75
|
-
|
|
76
|
-
key_values = ", ".join(key_values)
|
|
77
|
-
|
|
78
|
-
sql = "update `{table}` set {key_values} where {condition}"
|
|
79
|
-
sql = sql.format(table=table, key_values=key_values, condition=condition)
|
|
80
|
-
return sql
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def make_batch_sql(
|
|
84
|
-
table, datas, auto_update=False, update_columns=(), update_columns_value=()
|
|
85
|
-
):
|
|
86
|
-
"""
|
|
87
|
-
@summary: 生成批量的SQL
|
|
88
|
-
---------
|
|
89
|
-
@param table:
|
|
90
|
-
@param datas: 表数据 [{...}]
|
|
91
|
-
@param auto_update: 使用的是replace into,为完全覆盖已存在的数据
|
|
92
|
-
@param update_columns: 需要更新的列,默认全部,当指定值时,auto_update设置无效,当duplicate key冲突时更新指定的列
|
|
93
|
-
@param update_columns_value: 需要更新的列的值,默认为datas里边对应的值,注意如果值为字符串类型需要主动加单引号,如 update_columns_value=("'test'",)
|
|
94
|
-
---------
|
|
95
|
-
@result:
|
|
96
|
-
"""
|
|
97
|
-
if not datas:
|
|
98
|
-
return
|
|
99
|
-
|
|
100
|
-
keys = list(set([key for data in datas for key in data]))
|
|
101
|
-
# values_placeholder = ["%s"] * len(keys)
|
|
102
|
-
values = []
|
|
103
|
-
for data in datas:
|
|
104
|
-
# 检查 data 是否是字典类型
|
|
105
|
-
if not isinstance(data, dict):
|
|
106
|
-
# 如果 data 不是字典,记录错误日志并打印 data 的内容和类型
|
|
107
|
-
# logger.error(f"期望的数据类型是字典,但实际得到: {data} (类型: {type(data)})")
|
|
108
|
-
continue # 跳过非字典类型的 data,继续处理下一个数据
|
|
109
|
-
|
|
110
|
-
value = []
|
|
111
|
-
for key in keys:
|
|
112
|
-
# 从字典中获取当前 key 对应的值
|
|
113
|
-
current_data = data.get(key)
|
|
114
|
-
try:
|
|
115
|
-
# 对值进行格式化处理
|
|
116
|
-
current_data = format_sql_value(current_data)
|
|
117
|
-
value.append(current_data) # 将处理后的值添加到列表中
|
|
118
|
-
except Exception as e:
|
|
119
|
-
# 如果格式化失败,记录错误日志
|
|
120
|
-
logger.error(f"{key}: {current_data} (类型: {type(current_data)}) -> {e}")
|
|
121
|
-
|
|
122
|
-
# 将处理后的值列表添加到 values 中
|
|
123
|
-
values.append(value)
|
|
124
|
-
keys_str = ", ".join(["`{}`".format(key) for key in keys])
|
|
125
|
-
placeholders_str = ", ".join(["%s"] * len(keys))
|
|
126
|
-
|
|
127
|
-
if update_columns:
|
|
128
|
-
if not isinstance(update_columns, (tuple, list)):
|
|
129
|
-
update_columns = [update_columns]
|
|
130
|
-
if update_columns_value:
|
|
131
|
-
update_columns_ = ", ".join(
|
|
132
|
-
[
|
|
133
|
-
"`{key}`={value}".format(key=key, value=value)
|
|
134
|
-
for key, value in zip(update_columns, update_columns_value)
|
|
135
|
-
]
|
|
136
|
-
)
|
|
137
|
-
else:
|
|
138
|
-
# 修改这里,使用 VALUES() 函数来引用插入的值
|
|
139
|
-
update_columns_ = ", ".join(
|
|
140
|
-
["`{key}`=VALUES(`{key}`)".format(key=key) for key in update_columns]
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
sql = f"INSERT INTO `{table}` ({keys_str}) VALUES ({placeholders_str}) ON DUPLICATE KEY UPDATE {update_columns_}"
|
|
144
|
-
elif auto_update:
|
|
145
|
-
sql = "REPLACE INTO `{table}` ({keys}) VALUES ({values_placeholder})".format(
|
|
146
|
-
table=table, keys=keys_str, values_placeholder=placeholders_str
|
|
147
|
-
)
|
|
148
|
-
else:
|
|
149
|
-
sql = "INSERT IGNORE INTO `{table}` ({keys}) VALUES ({values_placeholder})".format(
|
|
150
|
-
table=table, keys=keys_str, values_placeholder=placeholders_str
|
|
151
|
-
)
|
|
152
|
-
return sql, values
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def format_sql_value(value):
|
|
156
|
-
"""
|
|
157
|
-
格式化 SQL 值
|
|
158
|
-
"""
|
|
159
|
-
if value is None:
|
|
160
|
-
return None # 处理 NULL 值
|
|
161
|
-
|
|
162
|
-
# 确保处理字符串
|
|
163
|
-
if isinstance(value, str):
|
|
164
|
-
return value.strip() # 去除首尾空格
|
|
165
|
-
|
|
166
|
-
# 处理列表或元组类型
|
|
167
|
-
elif isinstance(value, (list, tuple)):
|
|
168
|
-
try:
|
|
169
|
-
return dumps_json(value) # 将其转为 JSON 字符串
|
|
170
|
-
except Exception as e:
|
|
171
|
-
raise ValueError(f"Failed to serialize list/tuple to JSON: {value}, error: {e}")
|
|
172
|
-
|
|
173
|
-
# 处理字典类型
|
|
174
|
-
elif isinstance(value, dict):
|
|
175
|
-
try:
|
|
176
|
-
return dumps_json(value) # 将其转为 JSON 字符串
|
|
177
|
-
except Exception as e:
|
|
178
|
-
raise ValueError(f"Failed to serialize dict to JSON: {value}, error: {e}")
|
|
179
|
-
|
|
180
|
-
# 处理布尔类型
|
|
181
|
-
elif isinstance(value, bool):
|
|
182
|
-
return int(value) # 转为整数
|
|
183
|
-
|
|
184
|
-
# 确保数值类型优先匹配
|
|
185
|
-
elif isinstance(value, (int, float)):
|
|
186
|
-
return value # 返回数值
|
|
187
|
-
|
|
188
|
-
# 处理日期、时间类型
|
|
189
|
-
elif isinstance(value, (date, time, datetime)):
|
|
190
|
-
return str(value) # 转换为字符串表示
|
|
191
|
-
|
|
192
|
-
# 如果遇到无法处理的类型,抛出异常
|
|
193
|
-
else:
|
|
194
|
-
raise TypeError(f"Unsupported value type: {type(value)}, value: {value}")
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def list2str(datas):
|
|
200
|
-
"""
|
|
201
|
-
列表转字符串
|
|
202
|
-
:param datas: [1, 2]
|
|
203
|
-
:return: (1, 2)
|
|
204
|
-
"""
|
|
205
|
-
data_str = str(tuple(datas))
|
|
206
|
-
data_str = re.sub(r",\)$", ")", data_str)
|
|
207
|
-
return data_str
|
|
208
|
-
|
|
209
|
-
_REGEXPS = {}
|
|
210
|
-
|
|
211
|
-
def get_info(html, regexps, allow_repeat=True, fetch_one=False, split=None):
|
|
212
|
-
regexps = isinstance(regexps, str) and [regexps] or regexps
|
|
213
|
-
|
|
214
|
-
infos = []
|
|
215
|
-
for regex in regexps:
|
|
216
|
-
if regex == "":
|
|
217
|
-
continue
|
|
218
|
-
|
|
219
|
-
if regex not in _REGEXPS.keys():
|
|
220
|
-
_REGEXPS[regex] = re.compile(regex, re.S)
|
|
221
|
-
|
|
222
|
-
if fetch_one:
|
|
223
|
-
infos = _REGEXPS[regex].search(html)
|
|
224
|
-
if infos:
|
|
225
|
-
infos = infos.groups()
|
|
226
|
-
else:
|
|
227
|
-
continue
|
|
228
|
-
else:
|
|
229
|
-
infos = _REGEXPS[regex].findall(str(html))
|
|
230
|
-
|
|
231
|
-
if len(infos) > 0:
|
|
232
|
-
break
|
|
233
|
-
|
|
234
|
-
if fetch_one:
|
|
235
|
-
infos = infos if infos else ("",)
|
|
236
|
-
return infos if len(infos) > 1 else infos[0]
|
|
237
|
-
else:
|
|
238
|
-
infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
|
|
239
|
-
infos = split.join(infos) if split else infos
|
|
240
|
-
return infos
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
def get_json(json_str):
|
|
244
|
-
"""
|
|
245
|
-
@summary: 取json对象
|
|
246
|
-
---------
|
|
247
|
-
@param json_str: json格式的字符串
|
|
248
|
-
---------
|
|
249
|
-
@result: 返回json对象
|
|
250
|
-
"""
|
|
251
|
-
|
|
252
|
-
try:
|
|
253
|
-
return json.loads(json_str) if json_str else {}
|
|
254
|
-
except Exception as e1:
|
|
255
|
-
try:
|
|
256
|
-
json_str = json_str.strip()
|
|
257
|
-
json_str = json_str.replace("'", '"')
|
|
258
|
-
keys = get_info(json_str, r"(\w+):")
|
|
259
|
-
for key in keys:
|
|
260
|
-
json_str = json_str.replace(key, '"%s"' % key)
|
|
261
|
-
|
|
262
|
-
return json.loads(json_str) if json_str else {}
|
|
263
|
-
|
|
264
|
-
except Exception as e2:
|
|
265
|
-
logger.error(
|
|
266
|
-
"""
|
|
267
|
-
e1: %s
|
|
268
|
-
format json_str: %s
|
|
269
|
-
e2: %s
|
|
270
|
-
"""
|
|
271
|
-
% (e1, json_str, e2)
|
|
272
|
-
)
|
|
273
|
-
|
|
274
|
-
return {}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
def dumps_json(data, indent=4, sort_keys=False):
|
|
278
|
-
"""
|
|
279
|
-
@summary: 格式化json 用于打印
|
|
280
|
-
---------
|
|
281
|
-
@param data: json格式的字符串或json对象
|
|
282
|
-
@param indent:
|
|
283
|
-
@param sort_keys:
|
|
284
|
-
---------
|
|
285
|
-
@result: 格式化后的字符串
|
|
286
|
-
"""
|
|
287
|
-
try:
|
|
288
|
-
if isinstance(data, str):
|
|
289
|
-
data = get_json(data)
|
|
290
|
-
|
|
291
|
-
data = json.dumps(
|
|
292
|
-
data,
|
|
293
|
-
ensure_ascii=False,
|
|
294
|
-
indent=indent,
|
|
295
|
-
skipkeys=True,
|
|
296
|
-
sort_keys=sort_keys,
|
|
297
|
-
default=str,
|
|
298
|
-
)
|
|
299
|
-
|
|
300
|
-
except Exception as e:
|
|
301
|
-
data = pformat(data)
|
|
302
|
-
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from pprint import pformat
|
|
4
|
+
from datetime import date, time, datetime
|
|
5
|
+
|
|
6
|
+
from crawlo.utils.log import get_logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def make_insert_sql(
    table, data, auto_update=False, update_columns=(), insert_ignore=False
):
    """
    @summary: Build a single-row MySQL insert/replace statement with the
              values rendered inline.
    ---------
    @param table: table name; it is wrapped in backticks
    @param data: mapping of column name -> value for the row
    @param auto_update: emit ``replace into`` so an existing row is fully overwritten
    @param update_columns: column name, or list/tuple of names, to refresh on a
                           duplicate key; when given, auto_update is ignored
    @param insert_ignore: emit ``insert ignore`` so existing rows are left alone
    ---------
    @result: the SQL statement as a string

    NOTE: values are inlined as Python ``repr`` literals, which is not SQL
    escaping. Use make_batch_sql (placeholders) for untrusted input.
    """
    columns = "(" + ", ".join("`{}`".format(key) for key in data) + ")"

    # Render NULL per value. The previous implementation replaced the text
    # "None" across the whole finished statement, which also rewrote string
    # values such as 'None of the above'.
    literals = []
    for raw in data.values():
        formatted = format_sql_value(raw)
        literals.append("null" if formatted is None else repr(formatted))
    values = "(" + ", ".join(literals) + ")"

    ignore = " ignore" if insert_ignore else ""

    if update_columns:
        if not isinstance(update_columns, (tuple, list)):
            update_columns = [update_columns]
        assignments = ", ".join(
            "{key}=values({key})".format(key=key) for key in update_columns
        )
        return (
            "insert{ignore} into `{table}` {keys} values {values} "
            "on duplicate key update {assignments}"
        ).format(
            ignore=ignore,
            table=table,
            keys=columns,
            values=values,
            assignments=assignments,
        )

    if auto_update:
        return "replace into `{table}` {keys} values {values}".format(
            table=table, keys=columns, values=values
        )

    return "insert{ignore} into `{table}` {keys} values {values}".format(
        ignore=ignore, table=table, keys=columns, values=values
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def make_update_sql(table, data, condition):
    """
    @summary: Build a MySQL update statement with the values rendered inline.
    ---------
    @param table: table name; it is wrapped in backticks
    @param data: mapping of column name -> new value
    @param condition: text placed verbatim after ``where``
    ---------
    @result: the SQL statement as a string
    """
    assignments = []
    for column, raw in data.items():
        formatted = format_sql_value(raw)
        if formatted is None:
            literal = "null"
        elif isinstance(formatted, str):
            # strings are quoted through their Python repr
            literal = repr(formatted)
        else:
            literal = formatted
        assignments.append("`{}`={}".format(column, literal))

    template = "update `{table}` set {key_values} where {condition}"
    return template.format(
        table=table, key_values=", ".join(assignments), condition=condition
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def make_batch_sql(
    table, datas, auto_update=False, update_columns=(), update_columns_value=()
):
    """
    @summary: Build a placeholder-based batch statement plus its parameter rows.
    ---------
    @param table: table name; it is wrapped in backticks
    @param datas: list of row mappings, [{...}]; entries that are not dicts are skipped
    @param auto_update: emit ``REPLACE INTO`` so existing rows are fully overwritten
    @param update_columns: column name, or list/tuple of names, to refresh on a
                           duplicate key; when given, auto_update is ignored
    @param update_columns_value: explicit SQL expressions for update_columns, paired
                                 by position; string literals must carry their own
                                 quotes, e.g. update_columns_value=("'test'",)
    ---------
    @result: None when datas is empty, otherwise a tuple (sql, values) where
             values holds one list per row, ordered like the column list
    """
    if not datas:
        return

    # Filter first: collecting keys from a non-dict entry would either raise
    # (e.g. an int) or invent bogus columns (e.g. the characters of a string).
    rows = [data for data in datas if isinstance(data, dict)]

    # dict.fromkeys keeps first-seen order, so the column order is stable
    # between runs (a set is not).
    keys = list(dict.fromkeys(key for row in rows for key in row))

    values = []
    for row in rows:
        value = []
        for key in keys:
            current_data = row.get(key)
            try:
                value.append(format_sql_value(current_data))
            except Exception as e:
                logger.error(f"{key}: {current_data} (类型: {type(current_data)}) -> {e}")
                # Keep the row aligned with the placeholders: dropping the
                # value would leave the row one parameter short.
                value.append(None)
        values.append(value)

    keys_str = ", ".join(["`{}`".format(key) for key in keys])
    placeholders_str = ", ".join(["%s"] * len(keys))

    if update_columns:
        if not isinstance(update_columns, (tuple, list)):
            update_columns = [update_columns]
        if update_columns_value:
            # NOTE: zip stops at the shorter sequence, so surplus columns or
            # values are ignored.
            update_columns_ = ", ".join(
                "`{key}`={value}".format(key=key, value=value)
                for key, value in zip(update_columns, update_columns_value)
            )
        else:
            # VALUES(col) refers to the value the insert attempted for that column
            update_columns_ = ", ".join(
                "`{key}`=VALUES(`{key}`)".format(key=key) for key in update_columns
            )
        sql = f"INSERT INTO `{table}` ({keys_str}) VALUES ({placeholders_str}) ON DUPLICATE KEY UPDATE {update_columns_}"
    elif auto_update:
        sql = "REPLACE INTO `{table}` ({keys}) VALUES ({values_placeholder})".format(
            table=table, keys=keys_str, values_placeholder=placeholders_str
        )
    else:
        sql = "INSERT IGNORE INTO `{table}` ({keys}) VALUES ({values_placeholder})".format(
            table=table, keys=keys_str, values_placeholder=placeholders_str
        )
    return sql, values
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def format_sql_value(value):
    """
    Normalise a Python value before it is handed to SQL.

    None stays None, strings are stripped, list/tuple/dict values are
    serialised through dumps_json, bools become ints, ints and floats pass
    through, and date/time/datetime values become their str() form.
    Any other type raises TypeError.
    """
    if value is None:
        return None

    if isinstance(value, str):
        return value.strip()

    if isinstance(value, (list, tuple)):
        try:
            return dumps_json(value)
        except Exception as e:
            raise ValueError(f"Failed to serialize list/tuple to JSON: {value}, error: {e}")

    if isinstance(value, dict):
        try:
            return dumps_json(value)
        except Exception as e:
            raise ValueError(f"Failed to serialize dict to JSON: {value}, error: {e}")

    # bool must be tested before int because bool is a subclass of int
    if isinstance(value, bool):
        return int(value)

    if isinstance(value, (int, float)):
        return value

    if isinstance(value, (date, time, datetime)):
        return str(value)

    raise TypeError(f"Unsupported value type: {type(value)}, value: {value}")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def list2str(datas):
    """
    Render an iterable as a parenthesised, comma-separated string.

    :param datas: [1, 2]
    :return: (1, 2)
    """
    rendered = str(tuple(datas))
    # a one-element tuple prints as "(x,)"; drop that trailing comma
    if rendered.endswith(",)"):
        rendered = rendered[:-2] + ")"
    return rendered
|
|
208
|
+
|
|
209
|
+
# Cache of compiled patterns, keyed by the pattern source text.
_REGEXPS = {}


def get_info(html, regexps, allow_repeat=True, fetch_one=False, split=None):
    """
    Extract text with the first regular expression that yields a result.

    :param html: text to search; it is converted with str() first
    :param regexps: one pattern string or an iterable of them, tried in order;
                    empty patterns are skipped. Patterns are compiled with re.S.
    :param allow_repeat: when False, duplicates are removed from the findall
                         result, keeping first-seen order
    :param fetch_one: when True, use search() and return the groups of the
                      first match instead of every match
    :param split: optional separator used to join the findall result into one string
    :return: fetch_one: the tuple of groups, the single group itself when there
             is only one, or "" when nothing matched.
             otherwise: the findall list, or a joined string when split is given.
    """
    if isinstance(regexps, str):
        regexps = [regexps]

    # Both search modes work on the same text. Previously only the findall
    # path converted with str(), so fetch_one raised on non-string input.
    text = str(html)

    infos = []
    for regex in regexps:
        if regex == "":
            continue

        pattern = _REGEXPS.get(regex)
        if pattern is None:
            pattern = _REGEXPS[regex] = re.compile(regex, re.S)

        if fetch_one:
            match = pattern.search(text)
            if match is None:
                continue
            infos = match.groups()
        else:
            infos = pattern.findall(text)

        if len(infos) > 0:
            break

    if fetch_one:
        infos = infos if infos else ("",)
        return infos if len(infos) > 1 else infos[0]

    if not allow_repeat:
        # order-preserving de-duplication in a single pass
        infos = list(dict.fromkeys(infos))
    return split.join(infos) if split else infos
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def get_json(json_str):
    """
    @summary: Parse a JSON string, tolerating single quotes and unquoted keys.
    ---------
    @param json_str: JSON text; an empty or None value yields {}
    ---------
    @result: the parsed object, or {} when the text cannot be parsed even
             after the repair attempt (the failure is logged)
    """

    try:
        return json.loads(json_str) if json_str else {}
    except Exception as e1:
        try:
            json_str = json_str.strip()
            json_str = json_str.replace("'", '"')
            # Quote only bare keys, i.e. a word that directly follows "{" or
            # "," and precedes ":". The previous str.replace on every matched
            # key also rewrote values, already-quoted keys and keys that are
            # substrings of other keys.
            # Limitation: text shaped like ", word:" inside a string value is
            # still rewritten.
            json_str = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', json_str)

            return json.loads(json_str) if json_str else {}

        except Exception as e2:
            logger.error(
                """
                e1: %s
                format json_str: %s
                e2: %s
                """
                % (e1, json_str, e2)
            )

        return {}
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def dumps_json(data, indent=4, sort_keys=False):
    """
    @summary: Serialise data to JSON text, mainly for printing.
    ---------
    @param data: a JSON string (parsed with get_json first) or a JSON-like object
    @param indent: indent passed to json.dumps
    @param sort_keys: sort_keys passed to json.dumps
    ---------
    @result: the JSON text, or the pprint.pformat rendering when serialising fails
    """
    payload = data
    try:
        if isinstance(payload, str):
            payload = get_json(payload)

        # non-ASCII text is kept as-is, unsupported keys are skipped and
        # unknown value types fall back to str()
        return json.dumps(
            payload,
            ensure_ascii=False,
            indent=indent,
            skipkeys=True,
            sort_keys=sort_keys,
            default=str,
        )
    except Exception:
        return pformat(payload)
|