xlin-0.1.24-py2.py3-none-any.whl → xlin-0.1.26-py2.py3-none-any.whl
This diff shows the content of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
xlin/multiprocess_mapping.py
CHANGED
@@ -15,21 +15,25 @@ from xlin.util import ls
 
 
 def element_mapping(
-    iterator:
+    iterator: list[Any],
     mapping_func: Callable[[Any], Tuple[bool, Any]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
     rows = []
+    # convert to a list so the length is known for the progress bar
+    items = list(iterator)
+    total = len(items)
+
     if use_multiprocessing:
         pool = ThreadPool(thread_pool_size)
-
-        pool.
-        for ok, row in results:
+        # use imap instead of map, wrapped in tqdm to show progress
+        for ok, row in tqdm(pool.imap(mapping_func, items), total=total, desc="Processing"):
             if ok:
                 rows.append(row)
+        pool.close()
     else:
-        for row in tqdm(
+        for row in tqdm(items, desc="Processing"):
             ok, row = mapping_func(row)
             if ok:
                 rows.append(row)
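
For reference, a minimal usage sketch of element_mapping after this change; the toy square_even worker is illustrative and not part of the package, and the import path simply follows the changed module xlin/multiprocess_mapping.py:

    from xlin.multiprocess_mapping import element_mapping

    def square_even(x):
        # the worker returns (ok, result); rows with ok == False are dropped
        return (x % 2 == 0, x * x)

    # the iterator argument is now typed list[Any] and is materialized
    # internally, so the progress bar can show a total
    rows = element_mapping(list(range(10)), square_even, thread_pool_size=4)
    print(rows)  # expected: [0, 4, 16, 36, 64]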
@@ -37,8 +41,8 @@ def element_mapping(
 
 
 def batch_mapping(
-    iterator:
-    mapping_func: Callable[[
+    iterator: list[Any],
+    mapping_func: Callable[[list[Any]], Tuple[bool, list[Any]]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
     batch_size=4,
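
A hedged sketch of calling batch_mapping with the updated signature; the double_batch worker is invented for illustration, and its shape follows only what the type hints above state (the worker receives a list and returns (ok, transformed list)):

    from xlin.multiprocess_mapping import batch_mapping

    def double_batch(batch: list) -> tuple[bool, list]:
        # batch-level worker: one call per batch of up to batch_size items
        return True, [x * 2 for x in batch]

    rows = batch_mapping(list(range(10)), double_batch, batch_size=4)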
@@ -68,59 +72,124 @@ def dataframe_with_row_mapping(
     return df
 
 
-def
-    jsonlist:
-
-
+def xmap(
+    jsonlist: list[Any],
+    work_func: Union[Callable[[Any], dict], Callable[[list[Any]], list[dict]]],
+    output_path: Optional[Union[str, Path]]=None,  # output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE",
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 8)),
+    use_process_pool=True,  # set True for CPU-bound tasks
+    preserve_order=True,  # whether to preserve result order
+    chunksize=None,  # auto-compute the best chunk size
+    retry_count=0,  # number of retries on failure
+    force_overwrite=False,  # whether to force-overwrite the output file
+    is_batch_work_func=False,  # whether work_func processes whole batches
 ):
-    """
+    """Efficiently process a JSON list, with multiprocessing/multithreading support
 
     Args:
-
-        output_path (Path):
-
+        jsonlist (list[Any]): list of JSON objects to process
+        output_path (Optional[Union[str, Path]]): output path; None disables caching
+        work_func (Callable): worker function that takes a dict and returns a dict
+        batch_size (int): batch size
+        cache_batch_num (int): number of batches between cache writes
+        thread_pool_size (int): thread/process pool size
+        use_process_pool (bool): whether to use a process pool (CPU-bound tasks)
+        preserve_order (bool): whether to preserve result order
+        chunksize (Optional[int]): chunk size per task; None means auto-computed
+        retry_count (int): number of retries when a task fails
     """
     need_caching = output_path is not None
-
+    output_list = []
     start_idx = 0
+
+    # auto-compute the best chunksize
+    if chunksize is None:
+        chunksize = max(1, min(batch_size // thread_pool_size, 100))
+
+    # handle cached output
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
-
-
-
-
+            if force_overwrite:
+                logger.warning(f"force-overwriting output file: {output_path}")
+                output_path.unlink()
+            else:
+                output_list = load_json_list(output_path)
+                start_idx = len(output_list)
+                logger.info(f"resuming: {start_idx} of {len(jsonlist)} records already processed")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # choose a thread pool or a process pool
+    if use_process_pool:
+        pool_cls = multiprocessing.Pool
+        logger.info(f"using a process pool (ProcessPool), suited to CPU-bound tasks")
+    else:
+        pool_cls = ThreadPool
+        logger.info(f"using a thread pool (ThreadPool), suited to IO-bound tasks")
+
+    with pool_cls(thread_pool_size) as pool:
+        logger.info(f"pool size: {thread_pool_size}, batch size: {batch_size}, chunk size: {chunksize}")
+
+        # prepare the data to process
+        remaining_items = jsonlist[start_idx:]
+        total_items = len(remaining_items)
+
+        # batch-processing logic
+        def process_batch(items_batch, retry_remaining=retry_count):
+            try:
+                if is_batch_work_func:
+                    # work_func handles a whole batch at once
+                    return work_func(items_batch)
+                else:
+                    # pick the appropriate mapping method
+                    map_func = pool.imap_unordered if not preserve_order else pool.imap
+                    return list(map_func(work_func, items_batch, chunksize))
+            except Exception as e:
+                if retry_remaining > 0:
+                    logger.warning(f"batch failed, retrying ({retry_count-retry_remaining+1}/{retry_count}): {e}")
+                    return process_batch(items_batch, retry_remaining - 1)
+                else:
+                    logger.error(f"batch failed: {e}")
+                    raise
+
+        # process the data
+        with tqdm(total=total_items, desc="Processing", unit="item") as pbar:
+            # skip items that were already processed
+            pbar.update(start_idx)
+
+            # process in batches
+            for i in range(0, total_items, batch_size):
+                batch = remaining_items[i : i + batch_size]
+
+                # process the current batch
+                batch_start_time = time.time()
+                results = process_batch(batch)
+                batch_time = time.time() - batch_start_time
+
+                # update results
+                output_list.extend(results)
+                pbar.update(len(batch))
+
+                # performance stats
+                items_per_second = len(batch) / batch_time if batch_time > 0 else 0
+                pbar.set_postfix_str(f"rate: {items_per_second:.1f} items/s")
+
+                # caching logic
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    # only write the cache when batches are slow enough, so I/O does not become the bottleneck
+                    if batch_time > 3 or i + batch_size >= total_items:
+                        save_json_list(output_list, output_path)
+                        logger.debug(f"saved {len(output_list)} records to {output_path}")
+
+    # final save
     if need_caching:
         save_json_list(output_list, output_path)
+        drop_count = len(jsonlist) - len(output_list)
+        logger.info(f"done, processed {len(jsonlist)} records" + ", dropped {len(jsonlist) - len(output_list)} records" if drop_count > 0 else "")
+
     return output_list
 
 
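
A hedged usage sketch of the new xmap entry point, assuming only the signature and behavior visible in this hunk; the enrich worker and the cache path are hypothetical. Per the code above, when output_path already exists and force_overwrite is False, processing resumes from the cached records, and output_path=None disables caching entirely:

    from pathlib import Path
    from xlin.multiprocess_mapping import xmap

    def enrich(record: dict) -> dict:
        # per-record worker: takes a dict, returns a dict
        record["text_len"] = len(record.get("text", ""))
        return record

    data = [{"text": "hello"}, {"text": "world"}]
    results = xmap(
        data,
        enrich,
        output_path=Path("cache/enriched.json"),  # hypothetical cache file
        use_process_pool=False,  # ThreadPool, per the IO-bound branch above
        preserve_order=True,
        retry_count=1,
    )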
@@ -186,35 +255,6 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def continue_run(
-    jsonfiles: List[str],
-    save_dir: str,
-    mapping_func,
-    load_func=load_json,
-    save_func=save_json,
-    batch_size=1024,
-    cache_size=8,
-):
-    save_dir: Path = Path(save_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    new_jsonfiles = []
-    for jsonfile in ls(jsonfiles):
-        jsonlist = load_func(jsonfile)
-        output_filepath = save_dir / jsonfile.name
-        for row in jsonlist:
-            row["来源"] = jsonfile.name
-        new_jsonlist = multiprocessing_mapping_jsonlist(
-            jsonlist,
-            output_filepath,
-            mapping_func,
-            batch_size,
-            cache_size,
-        )
-        save_func(new_jsonlist, output_filepath)
-        new_jsonfiles.append(output_filepath)
-    return new_jsonfiles
-
-
 def dataframe_mapping(
     df: pd.DataFrame,
     row_func: Callable[[dict], dict],
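
The removed continue_run helper iterated over JSON files, tagged each row with its source file, ran multiprocessing_mapping_jsonlist per file, and saved the results. A hedged sketch of equivalent per-file processing built on the new xmap; it assumes load_json and save_json live in xlin.jsonl and that xmap's output_path caching supplies the resume behavior, and run_files itself is not part of the package:

    from pathlib import Path
    from xlin.jsonl import load_json, save_json  # assumed location of the loaders
    from xlin.multiprocess_mapping import xmap
    from xlin.util import ls

    def run_files(jsonfiles, save_dir, mapping_func):
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        outputs = []
        for jsonfile in ls(jsonfiles):
            rows = load_json(jsonfile)
            for row in rows:
                row["来源"] = jsonfile.name  # same provenance tag the old helper set
            out_path = save_dir / jsonfile.name
            new_rows = xmap(rows, mapping_func, output_path=out_path)
            save_json(new_rows, out_path)
            outputs.append(out_path)
        return outputs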
xlin-0.1.24.dist-info/RECORD → xlin-0.1.26.dist-info/RECORD
CHANGED
@@ -2,14 +2,14 @@ xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
 xlin/jsonl.py,sha256=IDRydHh2x-8iAGCxt9ScK2wfNLNA40PxNxR5hhr4v6k,7903
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=
+xlin/multiprocess_mapping.py,sha256=M_d8G-apwZFcWuQxYexOpN8a4SX3OMUuqdJ4JHHgQHw,16480
 xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
 xlin/statistic.py,sha256=2DCUgzf7xkMFH4Pk9v82bFDNeSxCTjwPh9Y4IPJBHCE,9300
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
+xlin-0.1.26.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.26.dist-info/METADATA,sha256=suP1OEbJd7a0-mhqkh3thQVNBTGUi2amRQVaSGbYafk,1098
+xlin-0.1.26.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.26.dist-info/RECORD,,
File without changes
|
File without changes
|