xlin-0.1.23-py2.py3-none-any.whl → xlin-0.1.25-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlin/jsonl.py +8 -3
- xlin/multiprocess_mapping.py +95 -66
- {xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/METADATA +1 -1
- {xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/RECORD +6 -6
- {xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/LICENSE +0 -0
- {xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/WHEEL +0 -0
xlin/jsonl.py
CHANGED
@@ -224,11 +224,16 @@ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dic
 
 
 
-def append_to_json_list(data: list[dict], file_path: str):
+def append_to_json_list(data: list[dict], file_path: Union[str, Path]):
     """Append a list of dictionaries to a JSON file."""
-
+    file_path = Path(file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    if file_path.exists() and file_path.is_dir():
+        print(f"{file_path} is a directory, not a file.")
+        return
+    with open(file_path, "a", encoding="utf-8") as f:
         for item in data:
-            f.write(json.dumps(item) + "\n")
+            f.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")
 
 
 def row_to_json(row: dict) -> dict:
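For orientation, a minimal call of the updated helper (not part of the diff; the import path is assumed from the file location and out/records.jsonl is a placeholder):

from pathlib import Path
from xlin.jsonl import append_to_json_list  # assumed import path

# Appends two compact JSON lines; parent directories are created automatically,
# and ensure_ascii=False keeps non-ASCII text readable in the output file.
append_to_json_list([{"id": 1}, {"text": "中文"}], Path("out/records.jsonl"))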
xlin/multiprocess_mapping.py
CHANGED
@@ -21,15 +21,19 @@ def element_mapping(
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
     rows = []
+    # Convert to a list to get the length for the progress bar
+    items = list(iterator)
+    total = len(items)
+
     if use_multiprocessing:
         pool = ThreadPool(thread_pool_size)
-
-        pool.
-        for ok, row in results:
+        # Use imap instead of map, combined with tqdm to show progress
+        for ok, row in tqdm(pool.imap(mapping_func, items), total=total, desc="Processing"):
             if ok:
                 rows.append(row)
+        pool.close()
     else:
-        for row in tqdm(
+        for row in tqdm(items, desc="Processing"):
             ok, row = mapping_func(row)
             if ok:
                 rows.append(row)
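The change above replaces the eager map-style call with pool.imap wrapped in tqdm, so results stream back while a progress bar advances. A self-contained sketch of that pattern (not part of the diff; square and the pool size of 5 are placeholders):

from multiprocessing.pool import ThreadPool
from tqdm import tqdm

def square(x):
    # Placeholder mapping function in the (ok, row) convention used above
    return True, x * x

items = list(range(100))
rows = []
pool = ThreadPool(5)
# imap yields results lazily, so tqdm can report progress as each item finishes
for ok, row in tqdm(pool.imap(square, items), total=len(items), desc="Processing"):
    if ok:
        rows.append(row)
pool.close()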
@@ -70,57 +74,111 @@ def dataframe_with_row_mapping(
 
 def multiprocessing_mapping_jsonlist(
     jsonlist: List[Any],
-
-
+    partial_func: Callable[[Any], dict],
+    output_path: Optional[Union[str, Path]]=None,  # output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    use_process_pool=True,  # set True for CPU-bound tasks
+    preserve_order=True,  # whether to preserve result order
+    chunksize=None,  # None auto-computes the best chunk size
+    retry_count=0,  # number of retries on failure
 ):
-    """
+    """Efficiently process a JSON list, with multiprocessing/multithreading support
 
     Args:
-
-        output_path (Path):
-        partial_func (
+        jsonlist (List[Any]): list of JSON objects to process
+        output_path (Optional[Union[str, Path]]): output path; None disables caching
+        partial_func (Callable): processing function that takes a dict and returns a dict
+        batch_size (int): batch size
+        cache_batch_num (int): number of batches between cache writes
+        thread_pool_size (int): thread/process pool size
+        use_process_pool (bool): whether to use a process pool (CPU-bound tasks)
+        preserve_order (bool): whether to preserve result order
+        chunksize (Optional[int]): chunk size per task; None auto-computes it
+        retry_count (int): number of retries when a task fails
     """
     need_caching = output_path is not None
-
+    output_list = []
     start_idx = 0
+
+    # Auto-compute the best chunksize
+    if chunksize is None:
+        chunksize = max(1, min(batch_size // thread_pool_size, 100))
+
+    # Handle caching
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
            output_list = load_json_list(output_path)
            start_idx = len(output_list)
-            logger.
-            logger.warning(f"Cache {output_path} already has {start_idx} rows. This run will resume from row {start_idx}.")
+            logger.info(f"Resuming: {start_idx} records already processed out of {len(jsonlist)}")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
- [old lines 98-121 removed; their content is not rendered in this diff view]
+
+    # Choose a thread pool or a process pool
+    if use_process_pool:
+        pool_cls = multiprocessing.Pool
+        logger.info(f"Using a process pool (ProcessPool), suited to CPU-bound tasks")
+    else:
+        pool_cls = ThreadPool
+        logger.info(f"Using a thread pool (ThreadPool), suited to IO-bound tasks")
+
+    with pool_cls(thread_pool_size) as pool:
+        logger.info(f"Pool size: {thread_pool_size}, batch size: {batch_size}, chunk size: {chunksize}")
+
+        # Prepare the data to process
+        remaining_items = jsonlist[start_idx:]
+        total_items = len(remaining_items)
+
+        # Batch-processing logic
+        def process_batch(items_batch, retry_remaining=retry_count):
+            try:
+                # Pick the appropriate mapping method
+                map_func = pool.imap_unordered if not preserve_order else pool.imap
+                return list(map_func(partial_func, items_batch, chunksize))
+            except Exception as e:
+                if retry_remaining > 0:
+                    logger.warning(f"Batch failed, retrying ({retry_count-retry_remaining+1}/{retry_count}): {e}")
+                    return process_batch(items_batch, retry_remaining - 1)
+                else:
+                    logger.error(f"Batch failed: {e}")
+                    raise
+
+        # Process the data
+        with tqdm(total=total_items, desc="Processing data", unit="item") as pbar:
+            # Skip items that were already processed
+            pbar.update(start_idx)
+
+            # Process in batches
+            for i in range(0, total_items, batch_size):
+                batch = remaining_items[i : i + batch_size]
+
+                # Process the current batch
+                batch_start_time = time.time()
+                results = process_batch(batch)
+                batch_time = time.time() - batch_start_time
+
+                # Collect results
+                output_list.extend(results)
+                pbar.update(len(batch))
+
+                # Throughput stats
+                items_per_second = len(batch) / batch_time if batch_time > 0 else 0
+                pbar.set_postfix_str(f"Rate: {items_per_second:.1f} items/s")
+
+                # Caching logic
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    # Only write the cache when processing is slow enough, so IO does not become the bottleneck
+                    if batch_time > 3 or i + batch_size >= total_items:
+                        save_json_list(output_list, output_path)
+                        logger.debug(f"Saved {len(output_list)} records to {output_path}")
+
+    # Final save
     if need_caching:
         save_json_list(output_list, output_path)
+        logger.info(f"Finished processing; saved {len(output_list)} records")
+
     return output_list
 
 
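As reworked, the function takes the worker before the optional cache path and exposes pool choice, result ordering, chunking, and retries as keyword arguments. A minimal, hedged usage sketch (not part of the diff; the worker, paths, and import path are placeholders, and a process pool additionally requires the worker to be picklable, i.e. defined at module level):

from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist  # assumed import path

def add_length(row: dict) -> dict:
    # Placeholder worker: takes a dict, returns a dict
    row["text_len"] = len(row.get("text", ""))
    return row

rows = [{"text": "hello"}, {"text": "world"}]
result = multiprocessing_mapping_jsonlist(
    rows,
    add_length,
    output_path="cache/processed.json",  # placeholder; None disables caching/resume
    use_process_pool=False,              # a thread pool is enough for this toy example
    preserve_order=True,
    retry_count=1,
)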
@@ -186,35 +244,6 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def continue_run(
-    jsonfiles: List[str],
-    save_dir: str,
-    mapping_func,
-    load_func=load_json,
-    save_func=save_json,
-    batch_size=1024,
-    cache_size=8,
-):
-    save_dir: Path = Path(save_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    new_jsonfiles = []
-    for jsonfile in ls(jsonfiles):
-        jsonlist = load_func(jsonfile)
-        output_filepath = save_dir / jsonfile.name
-        for row in jsonlist:
-            row["来源"] = jsonfile.name
-        new_jsonlist = multiprocessing_mapping_jsonlist(
-            jsonlist,
-            output_filepath,
-            mapping_func,
-            batch_size,
-            cache_size,
-        )
-        save_func(new_jsonlist, output_filepath)
-        new_jsonfiles.append(output_filepath)
-    return new_jsonfiles
-
-
 def dataframe_mapping(
     df: pd.DataFrame,
     row_func: Callable[[dict], dict],
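continue_run is removed in 0.1.25. Callers that depended on it can get roughly the same behavior by looping over files and calling the reworked multiprocessing_mapping_jsonlist directly; note that the worker now comes before the output path in the argument list. A hedged sketch, not part of the package (file names, the worker, and the load/save import paths are assumptions):

from pathlib import Path
from xlin.jsonl import load_json_list, save_json_list  # assumed import path
from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist

def tag_row(row: dict) -> dict:
    # Placeholder worker
    row["processed"] = True
    return row

save_dir = Path("outputs")
save_dir.mkdir(parents=True, exist_ok=True)
for jsonfile in [Path("data/a.json"), Path("data/b.json")]:  # placeholder inputs
    jsonlist = load_json_list(jsonfile)
    output_filepath = save_dir / jsonfile.name
    new_jsonlist = multiprocessing_mapping_jsonlist(
        jsonlist,
        tag_row,                         # worker comes first in the new signature
        output_path=output_filepath,     # per-file cache/resume path
        batch_size=1024,
        cache_batch_num=8,
    )
    save_json_list(new_jsonlist, output_filepath)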
{xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/RECORD
CHANGED
@@ -1,15 +1,15 @@
 xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
-xlin/jsonl.py,sha256=
+xlin/jsonl.py,sha256=IDRydHh2x-8iAGCxt9ScK2wfNLNA40PxNxR5hhr4v6k,7903
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=
+xlin/multiprocess_mapping.py,sha256=ppSNidDLb6pI7_thCcqZBpYtKGTTS4osoPIIbWBu0d4,15893
 xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
 xlin/statistic.py,sha256=2DCUgzf7xkMFH4Pk9v82bFDNeSxCTjwPh9Y4IPJBHCE,9300
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
+xlin-0.1.25.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.25.dist-info/METADATA,sha256=4xqcaW20xkdlge7nsCWw5yRByrTyXsxZAgPca2TVFpY,1098
+xlin-0.1.25.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.25.dist-info/RECORD,,
{xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/LICENSE
File without changes
{xlin-0.1.23.dist-info → xlin-0.1.25.dist-info}/WHEEL
File without changes