xlin-0.1.25.tar.gz → xlin-0.1.27.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
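
At a glance, 0.1.27 renames the multiprocessing_mapping_jsonlist helper to xmap and its partial_func parameter to work_func (which may now be either a per-item function or a batch function), adds force_overwrite, is_batch_work_func, and verbose flags, raises the default THREAD_POOL_SIZE fallback from 5 to 8, switches type hints from typing.List to the built-in list generics, and gates most progress logging behind verbose.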
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.25
+Version: 0.1.27
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: LinXueyuanStdio
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "xlin"
-version = "0.1.25"
+version = "0.1.27"
 description = "toolbox for LinXueyuan"
 authors = ["LinXueyuanStdio <23211526+LinXueyuanStdio@users.noreply.github.com>"]
 license = "MIT"
@@ -15,7 +15,7 @@ from xlin.util import ls
 
 
 def element_mapping(
-    iterator: List[Any],
+    iterator: list[Any],
     mapping_func: Callable[[Any], Tuple[bool, Any]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
@@ -41,8 +41,8 @@ def element_mapping(
 
 
 def batch_mapping(
-    iterator: List[Any],
-    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
+    iterator: list[Any],
+    mapping_func: Callable[[list[Any]], Tuple[bool, list[Any]]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
     batch_size=4,
@@ -72,24 +72,27 @@ def dataframe_with_row_mapping(
     return df
 
 
-def multiprocessing_mapping_jsonlist(
-    jsonlist: List[Any],
-    partial_func: Callable[[Any], dict],
+def xmap(
+    jsonlist: list[Any],
+    work_func: Union[Callable[[Any], dict], Callable[[list[Any]], list[dict]]],
     output_path: Optional[Union[str, Path]]=None,  # output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 8)),
     use_process_pool=True,  # set to True for CPU-bound tasks
     preserve_order=True,  # whether to preserve result order
     chunksize=None,  # automatically compute the optimal chunk size
     retry_count=0,  # number of retries on failure
+    force_overwrite=False,  # whether to overwrite the output file
+    is_batch_work_func=False,  # whether work_func is a batch function
+    verbose=False,  # whether to print detailed information
 ):
     """Efficiently process a JSON list, with multiprocessing/multithreading support.
 
     Args:
-        jsonlist (List[Any]): list of JSON objects to process
+        jsonlist (list[Any]): list of JSON objects to process
         output_path (Optional[Union[str, Path]]): output path; None disables caching
-        partial_func (Callable): worker function; takes a dict and returns a dict
+        work_func (Callable): worker function; takes a dict and returns a dict
         batch_size (int): batch size
         cache_batch_num (int): number of batches per cache write
         thread_pool_size (int): thread/process pool size
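
For orientation, here is a minimal usage sketch for the renamed function with a per-item worker, based only on the signature shown in this hunk; the top-level import path (from xlin import xmap) and the toy work function are assumptions, not something the diff confirms.

from xlin import xmap  # assumed import path; adjust to the actual module

def add_length(item: dict) -> dict:
    # per-item work function: takes one dict, returns one dict
    return {**item, "text_len": len(item.get("text", ""))}

if __name__ == "__main__":  # guard needed when use_process_pool=True spawns workers
    data = [{"text": "hello"}, {"text": "world"}]
    results = xmap(
        data,
        add_length,              # work_func, applied to one item per call by default
        use_process_pool=False,  # a thread pool is enough for this toy example
        preserve_order=True,
    )
    print(results)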
@@ -110,22 +113,31 @@ def multiprocessing_mapping_jsonlist(
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
-            output_list = load_json_list(output_path)
-            start_idx = len(output_list)
-            logger.info(f"继续处理: 已有{start_idx}条记录,共{len(jsonlist)}条")
+            if force_overwrite:
+                if verbose:
+                    logger.warning(f"强制覆盖输出文件: {output_path}")
+                output_path.unlink()
+            else:
+                output_list = load_json_list(output_path)
+                start_idx = len(output_list)
+                if verbose:
+                    logger.info(f"继续处理: 已有{start_idx}条记录,共{len(jsonlist)}条")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
 
     # choose between a thread pool and a process pool
     if use_process_pool:
         pool_cls = multiprocessing.Pool
-        logger.info(f"使用进程池(ProcessPool),适用于CPU密集型任务")
+        if verbose:
+            logger.info(f"使用进程池(ProcessPool),适用于CPU密集型任务")
     else:
         pool_cls = ThreadPool
-        logger.info(f"使用线程池(ThreadPool),适用于IO密集型任务")
+        if verbose:
+            logger.info(f"使用线程池(ThreadPool),适用于IO密集型任务")
 
     with pool_cls(thread_pool_size) as pool:
-        logger.info(f"池大小: {thread_pool_size}, 批处理大小: {batch_size}, 分块大小: {chunksize}")
+        if verbose:
+            logger.info(f"池大小: {thread_pool_size}, 批处理大小: {batch_size}, 分块大小: {chunksize}")
 
         # prepare the data to be processed
         remaining_items = jsonlist[start_idx:]
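
The branch above makes caching resumable: an existing output file is reloaded and processing continues after the records it already holds, unless force_overwrite removes it first. A hedged sketch of the caller's side, again with an assumed import path and a hypothetical output file:

from pathlib import Path

from xlin import xmap  # assumed import path

def enrich(item: dict) -> dict:
    return {**item, "ok": True}

if __name__ == "__main__":
    data = [{"id": i} for i in range(1000)]
    out = Path("outputs/enriched.json")  # hypothetical cache file

    # First call writes results to `out` in batches; calling again with the
    # same path resumes from the records already saved.
    results = xmap(data, enrich, output_path=out, verbose=True)

    # force_overwrite=True deletes the existing file and reprocesses everything.
    results = xmap(data, enrich, output_path=out, force_overwrite=True, verbose=True)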
@@ -134,15 +146,21 @@
         # batch-processing logic
         def process_batch(items_batch, retry_remaining=retry_count):
             try:
-                # choose the appropriate mapping method
-                map_func = pool.imap_unordered if not preserve_order else pool.imap
-                return list(map_func(partial_func, items_batch, chunksize))
+                if is_batch_work_func:
+                    # work_func processes the whole batch at once
+                    return work_func(items_batch)
+                else:
+                    # choose the appropriate mapping method
+                    map_func = pool.imap_unordered if not preserve_order else pool.imap
+                    return list(map_func(work_func, items_batch, chunksize))
             except Exception as e:
                 if retry_remaining > 0:
-                    logger.warning(f"批处理失败,重试中 ({retry_count-retry_remaining+1}/{retry_count}): {e}")
+                    if verbose:
+                        logger.warning(f"批处理失败,重试中 ({retry_count-retry_remaining+1}/{retry_count}): {e}")
                     return process_batch(items_batch, retry_remaining - 1)
                 else:
-                    logger.error(f"批处理失败: {e}")
+                    if verbose:
+                        logger.error(f"批处理失败: {e}")
                     raise
 
         # process the data
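
With is_batch_work_func=True, the new branch above passes each batch to work_func as a whole list instead of mapping items through the pool one by one. A sketch of that mode, with the same assumed import path and an illustrative batch function:

from xlin import xmap  # assumed import path

def score_batch(items: list[dict]) -> list[dict]:
    # one call per batch, e.g. to amortise a vectorised or remote operation
    return [{**item, "score": len(item.get("text", ""))} for item in items]

if __name__ == "__main__":
    data = [{"text": t} for t in ("a", "bb", "ccc", "dddd")]
    results = xmap(
        data,
        score_batch,
        is_batch_work_func=True,  # hand work_func a list of items per call
        batch_size=2,             # items per batch
        use_process_pool=False,
    )
    print(results)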
@@ -165,19 +183,21 @@
 
             # performance statistics
             items_per_second = len(batch) / batch_time if batch_time > 0 else 0
-            pbar.set_postfix_str(f"速率: {items_per_second:.1f}项/秒")
+            pbar.set_postfix_str(f"速率: {items_per_second:.1f} 项/秒")
 
             # caching logic
             if need_caching and (i // batch_size) % cache_batch_num == 0:
                 # only write the cache when processing is slow enough, so IO does not become the bottleneck
                 if batch_time > 3 or i + batch_size >= total_items:
                     save_json_list(output_list, output_path)
-                    logger.debug(f"已保存{len(output_list)}条记录到{output_path}")
+                    logger.debug(f"已保存 {len(output_list)} 条记录到 {output_path}")
 
     # final save
     if need_caching:
         save_json_list(output_list, output_path)
-        logger.info(f"已完成处理并保存{len(output_list)}条记录")
+        if verbose:
+            drop_count = len(jsonlist) - len(output_list)
+            logger.info(f"处理完成,共处理 {len(jsonlist)} 条记录" + f", 丢弃 {drop_count} 条记录" if drop_count > 0 else "")
 
     return output_list
 
Remaining files: no changes.