xlin 0.1.25__tar.gz → 0.1.26__tar.gz
This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {xlin-0.1.25 → xlin-0.1.26}/PKG-INFO +1 -1
- {xlin-0.1.25 → xlin-0.1.26}/pyproject.toml +1 -1
- {xlin-0.1.25 → xlin-0.1.26}/xlin/multiprocess_mapping.py +27 -16
- {xlin-0.1.25 → xlin-0.1.26}/LICENSE +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/README.md +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/__init__.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/ischinese.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/jsonl.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/metric.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/read_as_dataframe.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/statistic.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/timing.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/util.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.25 → xlin-0.1.26}/xlin/yaml.py +0 -0
@@ -15,7 +15,7 @@ from xlin.util import ls
|
|
15
15
|
|
16
16
|
|
17
17
|
def element_mapping(
|
18
|
-
iterator:
|
18
|
+
iterator: list[Any],
|
19
19
|
mapping_func: Callable[[Any], Tuple[bool, Any]],
|
20
20
|
use_multiprocessing=True,
|
21
21
|
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
|
@@ -41,8 +41,8 @@ def element_mapping(
|
|
41
41
|
|
42
42
|
|
43
43
|
def batch_mapping(
|
44
|
-
iterator:
|
45
|
-
mapping_func: Callable[[
|
44
|
+
iterator: list[Any],
|
45
|
+
mapping_func: Callable[[list[Any]], Tuple[bool, list[Any]]],
|
46
46
|
use_multiprocessing=True,
|
47
47
|
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
|
48
48
|
batch_size=4,
|
@@ -72,24 +72,26 @@ def dataframe_with_row_mapping(
|
|
72
72
|
return df
|
73
73
|
|
74
74
|
|
75
|
-
def
|
76
|
-
jsonlist:
|
77
|
-
|
75
|
+
def xmap(
|
76
|
+
jsonlist: list[Any],
|
77
|
+
work_func: Union[Callable[[Any], dict], Callable[[list[Any]], list[dict]]],
|
78
78
|
output_path: Optional[Union[str, Path]]=None, # 输出路径,None表示不缓存
|
79
79
|
batch_size=multiprocessing.cpu_count(),
|
80
80
|
cache_batch_num=1,
|
81
|
-
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE",
|
81
|
+
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 8)),
|
82
82
|
use_process_pool=True, # CPU密集型任务时设为True
|
83
83
|
preserve_order=True, # 是否保持结果顺序
|
84
84
|
chunksize=None, # 自动计算最佳分块大小
|
85
85
|
retry_count=0, # 失败重试次数
|
86
|
+
force_overwrite=False, # 是否强制覆盖输出文件
|
87
|
+
is_batch_work_func=False, # 是否批量处理函数
|
86
88
|
):
|
87
89
|
"""高效处理JSON列表,支持多进程/多线程
|
88
90
|
|
89
91
|
Args:
|
90
|
-
jsonlist (
|
92
|
+
jsonlist (list[Any]): 要处理的JSON对象列表
|
91
93
|
output_path (Optional[Union[str, Path]]): 输出路径,None表示不缓存
|
92
|
-
|
94
|
+
work_func (Callable): 处理函数,接收dict返回dict
|
93
95
|
batch_size (int): 批处理大小
|
94
96
|
cache_batch_num (int): 缓存批次数量
|
95
97
|
thread_pool_size (int): 线程/进程池大小
|
@@ -110,9 +112,13 @@ def multiprocessing_mapping_jsonlist(
|
|
110
112
|
if need_caching:
|
111
113
|
output_path = Path(output_path)
|
112
114
|
if output_path.exists():
|
113
|
-
|
114
|
-
|
115
|
-
|
115
|
+
if force_overwrite:
|
116
|
+
logger.warning(f"强制覆盖输出文件: {output_path}")
|
117
|
+
output_path.unlink()
|
118
|
+
else:
|
119
|
+
output_list = load_json_list(output_path)
|
120
|
+
start_idx = len(output_list)
|
121
|
+
logger.info(f"继续处理: 已有{start_idx}条记录,共{len(jsonlist)}条")
|
116
122
|
else:
|
117
123
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
118
124
|
|
@@ -134,9 +140,13 @@ def multiprocessing_mapping_jsonlist(
|
|
134
140
|
# 批量处理逻辑
|
135
141
|
def process_batch(items_batch, retry_remaining=retry_count):
|
136
142
|
try:
|
137
|
-
|
138
|
-
|
139
|
-
|
143
|
+
if is_batch_work_func:
|
144
|
+
# 批量处理函数
|
145
|
+
return work_func(items_batch)
|
146
|
+
else:
|
147
|
+
# 选择合适的映射方法
|
148
|
+
map_func = pool.imap_unordered if not preserve_order else pool.imap
|
149
|
+
return list(map_func(work_func, items_batch, chunksize))
|
140
150
|
except Exception as e:
|
141
151
|
if retry_remaining > 0:
|
142
152
|
logger.warning(f"批处理失败,重试中 ({retry_count-retry_remaining+1}/{retry_count}): {e}")
|
@@ -177,7 +187,8 @@ def multiprocessing_mapping_jsonlist(
|
|
177
187
|
# 最终保存
|
178
188
|
if need_caching:
|
179
189
|
save_json_list(output_list, output_path)
|
180
|
-
|
190
|
+
drop_count = len(jsonlist) - len(output_list)
|
191
|
+
logger.info(f"处理完成,共处理{len(jsonlist)}条记录" + ", 丢弃{len(jsonlist) - len(output_list)}条记录" if drop_count > 0 else "")
|
181
192
|
|
182
193
|
return output_list
|
183
194
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|