pyxllib 0.3.197__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyxllib/__init__.py +21 -21
- pyxllib/algo/__init__.py +8 -8
- pyxllib/algo/disjoint.py +54 -54
- pyxllib/algo/geo.py +541 -541
- pyxllib/algo/intervals.py +964 -964
- pyxllib/algo/matcher.py +389 -389
- pyxllib/algo/newbie.py +166 -166
- pyxllib/algo/pupil.py +629 -629
- pyxllib/algo/shapelylib.py +67 -67
- pyxllib/algo/specialist.py +241 -241
- pyxllib/algo/stat.py +494 -494
- pyxllib/algo/treelib.py +149 -149
- pyxllib/algo/unitlib.py +66 -66
- pyxllib/autogui/__init__.py +5 -5
- pyxllib/autogui/activewin.py +246 -246
- pyxllib/autogui/all.py +9 -9
- pyxllib/autogui/autogui.py +852 -852
- pyxllib/autogui/uiautolib.py +362 -362
- pyxllib/autogui/virtualkey.py +102 -102
- pyxllib/autogui/wechat.py +827 -827
- pyxllib/autogui/wechat_msg.py +421 -421
- pyxllib/autogui/wxautolib.py +84 -84
- pyxllib/cv/__init__.py +5 -5
- pyxllib/cv/expert.py +267 -267
- pyxllib/cv/imfile.py +159 -159
- pyxllib/cv/imhash.py +39 -39
- pyxllib/cv/pupil.py +9 -9
- pyxllib/cv/rgbfmt.py +1525 -1525
- pyxllib/cv/slidercaptcha.py +137 -137
- pyxllib/cv/trackbartools.py +251 -251
- pyxllib/cv/xlcvlib.py +1040 -1040
- pyxllib/cv/xlpillib.py +423 -423
- pyxllib/data/echarts.py +240 -240
- pyxllib/data/jsonlib.py +89 -89
- pyxllib/data/oss.py +72 -72
- pyxllib/data/pglib.py +1127 -1127
- pyxllib/data/sqlite.py +568 -568
- pyxllib/data/sqllib.py +297 -297
- pyxllib/ext/JLineViewer.py +505 -505
- pyxllib/ext/__init__.py +6 -6
- pyxllib/ext/demolib.py +246 -246
- pyxllib/ext/drissionlib.py +277 -277
- pyxllib/ext/kq5034lib.py +12 -12
- pyxllib/ext/old.py +663 -663
- pyxllib/ext/qt.py +449 -449
- pyxllib/ext/robustprocfile.py +497 -497
- pyxllib/ext/seleniumlib.py +76 -76
- pyxllib/ext/tk.py +173 -173
- pyxllib/ext/unixlib.py +827 -827
- pyxllib/ext/utools.py +351 -351
- pyxllib/ext/webhook.py +124 -119
- pyxllib/ext/win32lib.py +40 -40
- pyxllib/ext/wjxlib.py +88 -88
- pyxllib/ext/wpsapi.py +124 -124
- pyxllib/ext/xlwork.py +9 -9
- pyxllib/ext/yuquelib.py +1105 -1105
- pyxllib/file/__init__.py +17 -17
- pyxllib/file/docxlib.py +761 -761
- pyxllib/file/gitlib.py +309 -309
- pyxllib/file/libreoffice.py +165 -165
- pyxllib/file/movielib.py +148 -148
- pyxllib/file/newbie.py +10 -10
- pyxllib/file/onenotelib.py +1469 -1469
- pyxllib/file/packlib/__init__.py +330 -330
- pyxllib/file/packlib/zipfile.py +2441 -2441
- pyxllib/file/pdflib.py +426 -426
- pyxllib/file/pupil.py +185 -185
- pyxllib/file/specialist/__init__.py +685 -685
- pyxllib/file/specialist/dirlib.py +799 -799
- pyxllib/file/specialist/download.py +193 -193
- pyxllib/file/specialist/filelib.py +2829 -2829
- pyxllib/file/xlsxlib.py +3131 -3131
- pyxllib/file/xlsyncfile.py +341 -341
- pyxllib/prog/__init__.py +5 -5
- pyxllib/prog/cachetools.py +64 -64
- pyxllib/prog/deprecatedlib.py +233 -233
- pyxllib/prog/filelock.py +42 -42
- pyxllib/prog/ipyexec.py +253 -253
- pyxllib/prog/multiprogs.py +940 -940
- pyxllib/prog/newbie.py +451 -451
- pyxllib/prog/pupil.py +1197 -1197
- pyxllib/prog/sitepackages.py +33 -33
- pyxllib/prog/specialist/__init__.py +391 -391
- pyxllib/prog/specialist/bc.py +203 -203
- pyxllib/prog/specialist/browser.py +497 -497
- pyxllib/prog/specialist/common.py +347 -347
- pyxllib/prog/specialist/datetime.py +198 -198
- pyxllib/prog/specialist/tictoc.py +240 -240
- pyxllib/prog/specialist/xllog.py +180 -180
- pyxllib/prog/xlosenv.py +108 -108
- pyxllib/stdlib/__init__.py +17 -17
- pyxllib/stdlib/tablepyxl/__init__.py +10 -10
- pyxllib/stdlib/tablepyxl/style.py +303 -303
- pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
- pyxllib/text/__init__.py +8 -8
- pyxllib/text/ahocorasick.py +39 -39
- pyxllib/text/airscript.js +744 -744
- pyxllib/text/charclasslib.py +121 -121
- pyxllib/text/jiebalib.py +267 -267
- pyxllib/text/jinjalib.py +32 -32
- pyxllib/text/jsa_ai_prompt.md +271 -271
- pyxllib/text/jscode.py +922 -922
- pyxllib/text/latex/__init__.py +158 -158
- pyxllib/text/levenshtein.py +303 -303
- pyxllib/text/nestenv.py +1215 -1215
- pyxllib/text/newbie.py +300 -300
- pyxllib/text/pupil/__init__.py +8 -8
- pyxllib/text/pupil/common.py +1121 -1121
- pyxllib/text/pupil/xlalign.py +326 -326
- pyxllib/text/pycode.py +47 -47
- pyxllib/text/specialist/__init__.py +8 -8
- pyxllib/text/specialist/common.py +112 -112
- pyxllib/text/specialist/ptag.py +186 -186
- pyxllib/text/spellchecker.py +172 -172
- pyxllib/text/templates/echart_base.html +10 -10
- pyxllib/text/templates/highlight_code.html +16 -16
- pyxllib/text/templates/latex_editor.html +102 -102
- pyxllib/text/vbacode.py +17 -17
- pyxllib/text/xmllib.py +747 -747
- pyxllib/xl.py +42 -39
- pyxllib/xlcv.py +17 -17
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/METADATA +1 -1
- pyxllib-0.3.200.dist-info/RECORD +126 -0
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/licenses/LICENSE +190 -190
- pyxllib-0.3.197.dist-info/RECORD +0 -126
- {pyxllib-0.3.197.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +0 -0
pyxllib/file/specialist/__init__.py
@@ -1,685 +1,685 @@
(all 685 lines are removed and re-added unchanged; the file content is shown once below)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author : 陈坤泽
# @Email : 877362867@qq.com
# @Date : 2021/06/06 17:46
import re

from pyxllib.prog.pupil import check_install_package

check_install_package('joblib', 'joblib>=1.3.2')

from collections import OrderedDict
import sqlite3

from joblib import Parallel, delayed

from pyxllib.file.specialist.filelib import *
from pyxllib.file.specialist.dirlib import *
from pyxllib.file.specialist.download import *


def merge_jsonl(*infiles):
    data = []
    for f in infiles:
        data += XlPath(f).read_jsonl()
    return data


class JsonlDataFile:
    """ General-purpose class for handling jsonl files """

    def __init__(self, filepath=None, num_records=None):
        """
        Read data from the given jsonl file, either all of it or only the first N records.

        :param str filepath: path to the jsonl file
        :param int num_records: number of records to read; if None, read everything
        """
        self.infile = None
        self.records = []

        if filepath is not None:
            filepath = XlPath(filepath)
            if '?k' in filepath.name:  # a '?' in the file name means pattern matching is needed
                new_name = filepath.name.replace('?k', '*')
                filepaths = list(filepath.parent.glob(new_name))
                if filepaths:
                    filepath = filepaths[0]  # take the first match
                self.infile = XlPath(filepath)
            else:
                self.infile = filepath

        if self.infile and self.infile.is_file():  # by design the file may not exist yet; it may only be a planned location
            if num_records is None:
                # read all the data
                if self.infile.is_file():
                    self.records = self.infile.read_jsonl()
            else:
                # read only part of the data
                self.read_partial_records(num_records)

    def __len__(self):
        return len(self.records)

    def yield_record(self, start=0, end=None, step=1, batch_size=None):
        """ Yield the records in the given range

        :param int start: index of the first record, 0 by default
        :param int end: index to stop at, None by default (read to the end)
        :param int step: step size, 1 by default
        :param int batch_size: number of records per batch; if None, yield records one at a time
        """
        total_records = len(self.records)  # total number of records

        # handle negative indices
        if start < 0 or (end is not None and end < 0):
            if start < 0:
                start = total_records + start
            if end is not None and end < 0:
                end = total_records + end

        iterator = islice(self.records, start, end, step)
        while True:
            batch = list(islice(iterator, batch_size))
            if not batch:
                break
            if batch_size is None:
                yield from batch
            else:
                yield batch

    def yield_group(self, key, sort_mode='keep'):
        """ Yield the records group by group

        :param key: a function mapped over each record; its result defines the groups
        :param sort_mode:
            keep: keep the original relative order
            id: sort by the 'id' value
            sort: sort by the key value
        """
        # 1 an ordered dict holds the groups
        grouped_data = OrderedDict()

        records = self.records
        if sort_mode == 'id':
            records = sorted(records, key=lambda x: x['id'])

        # 2 group the data
        for record in records:
            k = key(record)
            if k not in grouped_data:
                grouped_data[k] = [record]
            else:
                grouped_data[k].append(record)

        # 3 re-sort the grouped data into a new mapping
        # group-level statistics could also be computed here
        if sort_mode == 'sort':
            grouped_data = {k: grouped_data[k] for k in sorted(grouped_data.keys())}

        # 4 yield the groups
        yield from grouped_data.values()

    def read_partial_records(self, num_records):
        """ Read only the given number of records from the jsonl file """
        if self.infile and self.infile.is_file():
            try:
                lines = next(self.infile.yield_line(batch_size=num_records))
                for line in lines:
                    self.records.append(json.loads(line))
            except StopIteration:
                self.records = []

    def save(self, outfile=None, ensure_ascii=False, json_encoder=None):
        """ Save the current records to the given jsonl file """
        if outfile is None:  # save back to the source file by default
            outfile = self.infile
        p = XlPath(outfile)

        # if the file name contains '?k', replace the placeholder with the record count
        if m := re.search(r'\?k', p.name):
            n = len(self.records)
            if n < 500:
                replace_str = f'{n}'  # fewer than 500 records: use the exact count
            else:
                v = int(round(n / 1000))  # 500 or more: round to the nearest thousand, with 'k' as the unit
                replace_str = f'{v}k'
            # substitute the new string into the name
            new_name = re.sub(r'\?k', replace_str, p.name)
            p = p.with_name(new_name)  # apply the new file name

        p.parent.mkdir(parents=True, exist_ok=True)
        p.write_jsonl(self.records, ensure_ascii=ensure_ascii, default=json_encoder)

    def browse_record(self, index=None, paths=None, **kwargs):
        """ Show the given record in a browser """
        from pyxllib.prog.specialist import browser

        # without an explicit index, use the query arguments to find the first matching record
        if index is None:
            index = self.find_index(paths, **kwargs)
            if index is None:
                raise ValueError('No matching record found')

        record = self.records[index]
        html_content = ['<html><body><pre>',
                        json.dumps(record, ensure_ascii=False, indent=4),
                        '</pre></body></html>']
        html_file = (XlPath.tempdir() / f'{self.__class__.__name__}_{index}.html')
        html_file.write_text('\n'.join(html_content))
        browser.html(html_file)

    def browse_records(self, indices=None, paths=None, **kwargs):
        """ Show all matching records in a browser """
        from pyxllib.prog.specialist import browser

        if indices is None:
            indices = list(self.find_indexs(paths, **kwargs))
            if not indices:
                raise ValueError('No matching records found')

        html_content = ['<html><body><h1>Matching Records: {}</h1>'.format(len(indices))]

        for index in indices:
            record = self.records[index]
            html_content.extend([
                '<h2>Record {}</h2>'.format(index),
                '<pre>',
                json.dumps(record, ensure_ascii=False, indent=4),
                '</pre>'
            ])

        html_content.append('</body></html>')
        html_file = (XlPath.tempdir() / f'{self.__class__.__name__}_matched.html')
        html_file.write_text('\n'.join(html_content))
        browser.html(html_file)

    def find_indexs(self, paths=None, **kwargs):
        """ Find the indices of the records that satisfy the given conditions, yielding every match """
        paths = paths or {}

        for i, record in enumerate(self.records):
            # check every condition in kwargs
            for key, value in kwargs.items():
                if callable(value):
                    if not value(record.get(key)):
                        break
                elif record.get(key) != value:
                    break
            else:
                # check every condition in paths
                for path, value in paths.items():
                    try:
                        actual_value = eval(f'record{path}')
                    except Exception:
                        break

                    if callable(value):
                        if not value(actual_value):
                            break
                    elif actual_value != value:
                        break
                else:
                    # the record satisfies every condition, so yield its index
                    yield i

    def find_index(self, paths=None, **kwargs):
        """
        :param dict paths: rules for the more complex cases that kwargs cannot express
            key: lookup pattern
            value: the value it must match
            example: find_index({"['messages'][0]['role']": 'user'})
        :param kwargs: names of direct child nodes and the values they must match
            example: find_index(id=2023071320000003)

        Notes:
            1. paths and kwargs can be combined; every rule from both must hold at the same time
            2. a value may also be a function def func(v) -> bool that receives the value and reports whether it qualifies
        """
        return next(self.find_indexs(paths, **kwargs), None)

    def add_record_basic(self, **kwargs):
        """ The most basic interface for appending a single record """
        record = kwargs
        self.records.append(record)
        return record

    @classmethod
    def read_from_files(cls, src_files):
        """ Read and merge data from several files, returning a new JsonlDataFile instance """
        merged_records = []
        for file in src_files:
            jsonl_file = cls(file)
            merged_records.extend(jsonl_file.records)
        # build and return the new instance
        new_instance = cls()
        new_instance.records = merged_records
        return new_instance

    @classmethod
    def read_from_dir(cls, src_dir):
        """ Read and merge every jsonl file under a directory, returning a new JsonlDataFile instance """
        src_dir = XlPath(src_dir)
        src_files = [str(file_path) for file_path in src_dir.glob('*.jsonl')]
        return cls.read_from_files(src_files)

    def __add__(self, other):
        """ Addition: merge the records of two JsonlDataFile objects """
        if not isinstance(other, JsonlDataFile):
            raise TypeError(f'Unsupported operand type: {type(other)}')
        result = JsonlDataFile()
        result.records = self.records + other.records
        return result

    def __iadd__(self, other):
        """ In-place addition, i.e. += """
        if not isinstance(other, JsonlDataFile):
            raise TypeError(f'Unsupported operand type: {type(other)}')
        self.records += other.records
        return self

    def process_each_record(self, func, *,
                            inplace=False,
                            timeout=None,
                            print_mode=0,
                            threads_num=1,
                            **kwargs):
        """ Apply func to every record, optionally in place and optionally with a progress bar

        :param function func: the per-record function; it receives a record and returns the processed record, or None to drop it
        :param bool inplace: whether to replace self.records with the result; False (default) leaves self.records untouched
        :param int print_mode: whether to show a progress bar; 0 hides it (default), 1 shows it
        :param int threads_num: number of threads, 1 (single-threaded) by default
        :return list: the new records; entries for which func returned a falsy value are dropped

        Iterates over self.records, running func on each record; records for which func returns None are excluded from the new records.
        """
        backend = 'threading' if threads_num != 1 else 'sequential'

        if print_mode:
            parallel = Parallel(n_jobs=threads_num, backend=backend,
                                timeout=timeout, return_as='generator')
            tasks = [delayed(func)(record) for record in self.records]
            new_records = []
            for y in tqdm(parallel(tasks), total=len(self.records), **kwargs):
                if y:
                    new_records.append(y)
        else:
            parallel = Parallel(n_jobs=threads_num, backend=backend, timeout=timeout)
            tasks = [delayed(func)(record) for record in self.records]
            new_records = parallel(tasks)
            new_records = [y for y in new_records if y]

        if inplace:
            self.records = new_records

        return new_records

    def update_each_record(self, func,
                           timeout=None,
                           print_mode=0,
                           threads_num=1):
        """ Iterate over the records and modify the original data in place """
        return self.process_each_record(func,
                                        inplace=True,
                                        timeout=timeout,
                                        print_mode=print_mode,
                                        threads_num=threads_num)


class JsonlDataDir:
    """ The design goal of this class is to mimic JsonlDataFile as closely as possible, so downstream code can handle both uniformly """

    def __init__(self, root):
        """ Typically used for a large jsonl file that has been moved into a directory and split into several jsonl files

        Note: the files to process are recognized by the 01.jsonl, 02.jsonl, ... naming pattern; do not change this rule
        """
        self.root = XlPath(root)
        self.files = []
        self.update_subfiles()

    def update_subfiles(self):
        self.files = []
        for f in self.root.glob_files('*.jsonl'):
            if re.match(r'_?\d+$', f.stem):  # the '_?' keeps old versions working for now; eventually only _\d+ should be matched
                self.files.append(f)

    def __bool__(self):
        if self.root.is_dir() and self.files:
            return True
        else:
            return False

    def count_records(self):
        total = 0
        for f in self.files:
            total += f.get_total_lines(skip_blank=True)
        return total

    def check(self, title=''):
        """ Report some statistics about the data """
        print(title, 'files:', len(self.files), 'records:', self.count_records())

    @classmethod
    def init_from_file(cls, file, lines_per_file=10000):
        """ Initialize a JsonlDataDir object from a single jsonl file """
        file = XlPath(file)
        dst_dir = file.parent / file.stem
        if not dst_dir.is_dir() and file.is_file():
            file.split_to_dir(lines_per_file, dst_dir)
        c = cls(dst_dir)
        return c

    def _rearrange_group(self, lines_per_file=10000,
                         group_key=None, sort_mode='keep',
                         print_mode=1):
        # 1 store the data and the grouping information with sqlite3
        # create a temporary file to serve as the SQLite database
        temp_db_file = self.root / 'data.sqlite3'
        temp_db_file.delete()

        # open the SQLite connection on the temporary file
        conn = sqlite3.connect(temp_db_file)
        cursor = conn.cursor()

        # create a temporary table for the jsonl data
        cursor.execute('CREATE TABLE records (id INTEGER PRIMARY KEY AUTOINCREMENT,'
                       'data TEXT, group_key TEXT)')
        # index the group_key column
        cursor.execute('CREATE INDEX idx_group_key ON records(group_key)')

        # load the jsonl data into the SQLite database
        commit_interval = 2000  # how many inserts between commits
        count = 0
        for record in tqdm(self.yield_record(), desc='computing the group of each record', disable=not print_mode):
            count += 1
            group = group_key(record) if group_key else count
            group = str(group)
            cursor.execute('INSERT INTO records (data, group_key) VALUES (?, ?)',
                           (json.dumps(record, ensure_ascii=False), group))
            if count % commit_interval == 0:
                conn.commit()
        conn.commit()

        # 2 query the database to sort and group, writing the results to new jsonl files
        new_file_count = 0
        lines_written = 0
        current_file = None
        sort_sql = ''
        if sort_mode == 'id':
            sort_sql = 'ORDER BY id'
        elif sort_mode == 'sort':
            sort_sql = f'ORDER BY {group_key}'

        for group, in tqdm(cursor.execute('SELECT DISTINCT group_key FROM records').fetchall(),
                           desc='extracting each group',
                           disable=not print_mode):
            query = f'SELECT data FROM records WHERE group_key = ? {sort_sql}'
            cursor.execute(query, (group,))

            if current_file is None or lines_written >= lines_per_file:
                if current_file:
                    current_file.close()
                new_file_name = f'temp_{new_file_count}.jsonl'
                new_file_path = self.root / new_file_name
                current_file = new_file_path.open('w', encoding='utf-8')
                new_file_count += 1
                lines_written = 0

            while True:
                row = cursor.fetchone()
                if row is None:
                    break

                current_file.write(row[0] + '\n')
                lines_written += 1

        if current_file:
            current_file.close()

        # 3 close the database connection and delete the temporary file
        conn.close()
        temp_db_file.delete()

        # 4 delete the old files and rename the new ones
        for f in self.files:
            f.delete()

        widths = len(str(new_file_count))
        for temp_file in self.root.glob('temp_*.jsonl'):
            n = int(re.search(r'\d+', temp_file.name).group())
            temp_file.rename(self.root / f'_{n:0{widths}}.jsonl')

    def rearrange(self, lines_per_file=10000, group_key=None,
                  sort_mode='keep', print_mode=1):
        """ Re-split the files

        :param int lines_per_file: number of lines per file
        :param func group_key: grouping function; guarantees that records with the same key land in the same file
        :param str sort_mode:
            keep: keep the original relative order
            id: sort by the 'id' value
            sort: sort by the key value
        """
        if group_key is not None or sort_mode != 'keep':
            return self._rearrange_group(lines_per_file, group_key, sort_mode, print_mode)

        output_dir = self.root
        temp_prefix = 'temp_'

        new_file_count = 0
        new_file = None
        line_count = 0

        # count the total lines to work out how many leading zeros the file names need
        total_lines = sum(1 for file in self.files for _ in file.open('r', encoding='utf-8'))
        num_digits = len(str((total_lines + lines_per_file - 1) // lines_per_file))

        for file in self.files:
            with file.open('r', encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue
                    if line_count == 0:
                        if new_file is not None:
                            new_file.close()
                        new_file_name = f'{temp_prefix}{new_file_count:0{num_digits}d}.jsonl'
                        new_file_path = output_dir / new_file_name
                        new_file = new_file_path.open('w', encoding='utf-8')
                        new_file_count += 1

                    new_file.write(line)
                    line_count += 1

                    if line_count == lines_per_file:
                        line_count = 0

        if new_file is not None:
            new_file.close()

        # delete the old files
        for file in self.files:
            os.remove(file)

        # rename the temporary files to their final names
        for temp_file in output_dir.glob(f'{temp_prefix}*.jsonl'):
            final_name = temp_file.name[len(temp_prefix) - 1:]
            temp_file.rename(output_dir / final_name)

    def yield_record(self, batch_size=None):
        """ Yield the data records

        :param int batch_size: number of records per batch; if None, yield records one at a time
        """
        for i, file in enumerate(self.files):
            data = file.read_jsonl()
            iterator = iter(data)
            while True:
                batch = list(islice(iterator, batch_size))
                if not batch:
                    break
                if batch_size is None:
                    yield from batch
                else:
                    yield batch

    def yield_group(self, key, sort_mode='keep'):
        """ Yield the records group by group

        :param key: a function mapped over each record; its result defines the groups

        Note: the grouping runs on each sub-file independently; it is not a global scan.
        Usually self.rearrange should be used first to reorder the files globally.
        """
        for filepath in self.files:
            jdf = JsonlDataFile(filepath)
            yield from jdf.yield_group(key, sort_mode)

    def process_each_file(self, func=None, *,
                          print_mode=0, desc='process_each_file',
                          processes_num=1,
                          subfiles=None,
                          **kwargs):
        # 1 backend
        backend = 'loky' if processes_num != 1 else 'sequential'

        # 2 tasks
        if subfiles is None:
            subfiles = [0, len(self.files)]
        elif not isinstance(subfiles, (list, tuple)):
            subfiles = [subfiles, subfiles + 1]
        a, b = subfiles
        tasks = [delayed(func)(file) for file in self.files[a:b]]

        # 3 run
        if print_mode:
            parallel = Parallel(n_jobs=processes_num, backend=backend, return_as='generator')
            list(tqdm(parallel(tasks), total=len(self.files), desc=desc, **kwargs))
        else:
            parallel = Parallel(n_jobs=processes_num, backend=backend)
            parallel(tasks)

    def process_each_record(self, func, *,
                            inplace=False, reset=False,
                            print_mode=2, desc=None,
                            timeout=None,
                            processes_num=1, threads_num=1,
                            dst_dir=None, json_encoder=None,
                            subfiles=None):
        """ Wrapper that applies an operation to every record

        :param func: the processing function supplied by the caller
        :param inplace: whether to modify the original data
        :param reset: whether to re-process files that have already been processed
        :param print_mode:
            0 no output
            1 only a file-level progress bar
            2 (default) a more detailed per-record progress bar inside each file
        :param desc: progress-bar title for print_mode=1
        :param timeout: time limit; unusable in some contexts (e.g. signal cannot be used inside Linux subprocesses),
            in which case mechanisms such as the timeout built into requests can impose the limit instead
        :param processes_num: number of processes; each file runs in its own process
        :param threads_num: number of threads used while processing each file
        :param dst_dir: target directory to save to; nothing is saved when unset
        :param json_encoder: how to handle data that is not standard json; usually set to str when needed
        :param subfiles: run only some of the sub-files
            a: run only the file numbered a
            [a, b]: run the files in the half-open interval [a, b)
        """
        files_num = len(self.files)

        def process_jsonl_file(srcfile):
            # 1 without reset, skip when dstfile already exists
            srcfile = XlPath(srcfile)
            if dst_dir:
                dstfile = XlPath(dst_dir) / srcfile.name
            else:
                dstfile = None
            if not reset and dstfile and dstfile.is_file():
                return

            # 2 process the records of this particular file
            jdf = JsonlDataFile(srcfile)
            new_records = jdf.process_each_record(func,
                                                  inplace=inplace,
                                                  print_mode=print_mode == 2,
                                                  desc=f'{jdf.infile.name}/{files_num}',
                                                  timeout=timeout,
                                                  threads_num=threads_num,
                                                  mininterval=processes_num * 3,
                                                  )

            # 3 optionally overwrite the source file and/or save to dst_dir
            if inplace:
                jdf.save()

            if dstfile:
                jdf = JsonlDataFile()
                jdf.records = new_records
                jdf.save(dstfile, json_encoder=json_encoder)

        self.process_each_file(process_jsonl_file, subfiles=subfiles,
                               processes_num=processes_num,
                               print_mode=print_mode == 1, desc=desc)

    def process_each_group(self, func, group_key, sort_mode='keep', *,
                           inplace=False, reset=False,
                           print_mode=1, desc=None,
                           processes_num=1,
                           dst_dir=None,
                           json_encoder=None):
        """ Wrapper that processes the records one group at a time

        todo 230909 Sat 14:00: some details, such as the inner progress bar and multithreading, may still be incomplete; refine them later as the function sees use
        """

        def process_jsonl_file(srcfile):
            # 1 without reset, skip when dstfile already exists
            srcfile = XlPath(srcfile)
            if dst_dir:
                dstfile = XlPath(dst_dir) / srcfile.name
            else:
                dstfile = None
            if not reset and dstfile and dstfile.is_file():
                return

            # 2 process the records of this particular file
            jdf = JsonlDataFile(srcfile)
            new_records = []
            for records in jdf.yield_group(group_key, sort_mode):
                records2 = func(records)
                if records2:
                    new_records.extend(records2)

            # 3 optionally overwrite the source file and/or save to dst_dir
            if inplace:
                jdf.records = new_records
                jdf.save()

            if dstfile:
                jdf = JsonlDataFile()
                jdf.records = new_records
                jdf.save(dstfile, json_encoder=json_encoder)

        self.process_each_file(process_jsonl_file,
                               processes_num=processes_num,
                               print_mode=print_mode == 1, desc=desc)

    def save(self, dst_path=None):
        """ Merge the data into a single jsonl file """
        if not dst_path:
            dst_path = self.root.parent / f'{self.root.name}.jsonl'
        dst_path = XlPath(dst_path)
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        with dst_path.open('w', encoding='utf8') as f:
            for file in tqdm(self.files, desc=f'merge and save {dst_path.name}'):
                with file.open('r', encoding='utf8') as f2:
                    for line in f2:
                        if line.strip():  # skip blank lines
                            f.write(line)

    def clear(self):
        for f in self.files:
            f.delete()
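
The query and processing interfaces of JsonlDataFile compose naturally. A minimal usage sketch, not part of the diff; chat.jsonl and its id/text/messages fields are hypothetical:

from pyxllib.file.specialist import JsonlDataFile

jdf = JsonlDataFile('chat.jsonl')  # hypothetical input file

# kwargs match direct child fields; a paths dict uses the bracket syntax from the find_index docstring
i = jdf.find_index(id=2023071320000003)
j = jdf.find_index({"['messages'][0]['role']": 'user'})

# drop every record whose 'text' field is empty, then write back to chat.jsonl
jdf.update_each_record(lambda r: r if r.get('text') else None)
jdf.save()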
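
For a jsonl file too large to handle in one piece, the JsonlDataDir workflow is split, rearrange, process, merge. A minimal sketch under the same caveat; big.jsonl and the user_id/text fields are hypothetical:

from pyxllib.file.specialist import JsonlDataDir

jdd = JsonlDataDir.init_from_file('big.jsonl', lines_per_file=10000)  # split into numbered sub-files under big/
jdd.rearrange(group_key=lambda r: r['user_id'])  # records with the same user_id land in the same sub-file
jdd.update_subfiles()  # refresh the sub-file list after the renaming
jdd.check('after rearrange')  # prints the file and record counts

def add_length(record):  # hypothetical per-record transform
    record['n_chars'] = len(record.get('text', ''))
    return record

jdd.process_each_record(add_length, inplace=True, processes_num=4)
jdd.save('big_merged.jsonl')  # merge the sub-files back into a single jsonl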