xlin 0.1.25__tar.gz → 0.1.27__tar.gz
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- {xlin-0.1.25 → xlin-0.1.27}/PKG-INFO +1 -1
- {xlin-0.1.25 → xlin-0.1.27}/pyproject.toml +1 -1
- {xlin-0.1.25 → xlin-0.1.27}/xlin/multiprocess_mapping.py +43 -23
- {xlin-0.1.25 → xlin-0.1.27}/LICENSE +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/README.md +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/__init__.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/ischinese.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/jsonl.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/metric.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/read_as_dataframe.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/statistic.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/timing.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/util.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.25 → xlin-0.1.27}/xlin/yaml.py +0 -0
@@ -15,7 +15,7 @@ from xlin.util import ls
|
|
15
15
|
|
16
16
|
|
17
17
|
def element_mapping(
|
18
|
-
iterator:
|
18
|
+
iterator: list[Any],
|
19
19
|
mapping_func: Callable[[Any], Tuple[bool, Any]],
|
20
20
|
use_multiprocessing=True,
|
21
21
|
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
|
@@ -41,8 +41,8 @@ def element_mapping(
|
|
41
41
|
|
42
42
|
|
43
43
|
def batch_mapping(
|
44
|
-
iterator:
|
45
|
-
mapping_func: Callable[[
|
44
|
+
iterator: list[Any],
|
45
|
+
mapping_func: Callable[[list[Any]], Tuple[bool, list[Any]]],
|
46
46
|
use_multiprocessing=True,
|
47
47
|
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
|
48
48
|
batch_size=4,
|
@@ -72,24 +72,27 @@ def dataframe_with_row_mapping(
|
|
72
72
|
return df
|
73
73
|
|
74
74
|
|
75
|
-
def
|
76
|
-
jsonlist:
|
77
|
-
|
75
|
+
def xmap(
|
76
|
+
jsonlist: list[Any],
|
77
|
+
work_func: Union[Callable[[Any], dict], Callable[[list[Any]], list[dict]]],
|
78
78
|
output_path: Optional[Union[str, Path]]=None, # 输出路径,None表示不缓存
|
79
79
|
batch_size=multiprocessing.cpu_count(),
|
80
80
|
cache_batch_num=1,
|
81
|
-
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE",
|
81
|
+
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 8)),
|
82
82
|
use_process_pool=True, # CPU密集型任务时设为True
|
83
83
|
preserve_order=True, # 是否保持结果顺序
|
84
84
|
chunksize=None, # 自动计算最佳分块大小
|
85
85
|
retry_count=0, # 失败重试次数
|
86
|
+
force_overwrite=False, # 是否强制覆盖输出文件
|
87
|
+
is_batch_work_func=False, # 是否批量处理函数
|
88
|
+
verbose=False, # 是否打印详细信息
|
86
89
|
):
|
87
90
|
"""高效处理JSON列表,支持多进程/多线程
|
88
91
|
|
89
92
|
Args:
|
90
|
-
jsonlist (
|
93
|
+
jsonlist (list[Any]): 要处理的JSON对象列表
|
91
94
|
output_path (Optional[Union[str, Path]]): 输出路径,None表示不缓存
|
92
|
-
|
95
|
+
work_func (Callable): 处理函数,接收dict返回dict
|
93
96
|
batch_size (int): 批处理大小
|
94
97
|
cache_batch_num (int): 缓存批次数量
|
95
98
|
thread_pool_size (int): 线程/进程池大小
|
@@ -110,22 +113,31 @@ def multiprocessing_mapping_jsonlist(
|
|
110
113
|
if need_caching:
|
111
114
|
output_path = Path(output_path)
|
112
115
|
if output_path.exists():
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
+
if force_overwrite:
|
117
|
+
if verbose:
|
118
|
+
logger.warning(f"强制覆盖输出文件: {output_path}")
|
119
|
+
output_path.unlink()
|
120
|
+
else:
|
121
|
+
output_list = load_json_list(output_path)
|
122
|
+
start_idx = len(output_list)
|
123
|
+
if verbose:
|
124
|
+
logger.info(f"继续处理: 已有{start_idx}条记录,共{len(jsonlist)}条")
|
116
125
|
else:
|
117
126
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
118
127
|
|
119
128
|
# 选择线程池或进程池
|
120
129
|
if use_process_pool:
|
121
130
|
pool_cls = multiprocessing.Pool
|
122
|
-
|
131
|
+
if verbose:
|
132
|
+
logger.info(f"使用进程池(ProcessPool),适用于CPU密集型任务")
|
123
133
|
else:
|
124
134
|
pool_cls = ThreadPool
|
125
|
-
|
135
|
+
if verbose:
|
136
|
+
logger.info(f"使用线程池(ThreadPool),适用于IO密集型任务")
|
126
137
|
|
127
138
|
with pool_cls(thread_pool_size) as pool:
|
128
|
-
|
139
|
+
if verbose:
|
140
|
+
logger.info(f"池大小: {thread_pool_size}, 批处理大小: {batch_size}, 分块大小: {chunksize}")
|
129
141
|
|
130
142
|
# 准备要处理的数据
|
131
143
|
remaining_items = jsonlist[start_idx:]
|
@@ -134,15 +146,21 @@ def multiprocessing_mapping_jsonlist(
|
|
134
146
|
# 批量处理逻辑
|
135
147
|
def process_batch(items_batch, retry_remaining=retry_count):
|
136
148
|
try:
|
137
|
-
|
138
|
-
|
139
|
-
|
149
|
+
if is_batch_work_func:
|
150
|
+
# 批量处理函数
|
151
|
+
return work_func(items_batch)
|
152
|
+
else:
|
153
|
+
# 选择合适的映射方法
|
154
|
+
map_func = pool.imap_unordered if not preserve_order else pool.imap
|
155
|
+
return list(map_func(work_func, items_batch, chunksize))
|
140
156
|
except Exception as e:
|
141
157
|
if retry_remaining > 0:
|
142
|
-
|
158
|
+
if verbose:
|
159
|
+
logger.warning(f"批处理失败,重试中 ({retry_count-retry_remaining+1}/{retry_count}): {e}")
|
143
160
|
return process_batch(items_batch, retry_remaining - 1)
|
144
161
|
else:
|
145
|
-
|
162
|
+
if verbose:
|
163
|
+
logger.error(f"批处理失败: {e}")
|
146
164
|
raise
|
147
165
|
|
148
166
|
# 处理数据
|
@@ -165,19 +183,21 @@ def multiprocessing_mapping_jsonlist(
|
|
165
183
|
|
166
184
|
# 性能统计
|
167
185
|
items_per_second = len(batch) / batch_time if batch_time > 0 else 0
|
168
|
-
pbar.set_postfix_str(f"速率: {items_per_second:.1f}项/秒")
|
186
|
+
pbar.set_postfix_str(f"速率: {items_per_second:.1f} 项/秒")
|
169
187
|
|
170
188
|
# 缓存逻辑
|
171
189
|
if need_caching and (i // batch_size) % cache_batch_num == 0:
|
172
190
|
# 仅当处理速度足够慢时才保存缓存,避免IO成为瓶颈
|
173
191
|
if batch_time > 3 or i + batch_size >= total_items:
|
174
192
|
save_json_list(output_list, output_path)
|
175
|
-
logger.debug(f"已保存{len(output_list)}条记录到{output_path}")
|
193
|
+
logger.debug(f"已保存 {len(output_list)} 条记录到 {output_path}")
|
176
194
|
|
177
195
|
# 最终保存
|
178
196
|
if need_caching:
|
179
197
|
save_json_list(output_list, output_path)
|
180
|
-
|
198
|
+
if verbose:
|
199
|
+
drop_count = len(jsonlist) - len(output_list)
|
200
|
+
logger.info(f"处理完成,共处理 {len(jsonlist)} 条记录" + f", 丢弃 {drop_count} 条记录" if drop_count > 0 else "")
|
181
201
|
|
182
202
|
return output_list
|
183
203
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|