xlin-0.1.25-py2.py3-none-any.whl → xlin-0.1.26-py2.py3-none-any.whl

--- xlin/multiprocess_mapping.py
+++ xlin/multiprocess_mapping.py
@@ -15,7 +15,7 @@ from xlin.util import ls
 
 
 def element_mapping(
-    iterator: List[Any],
+    iterator: list[Any],
     mapping_func: Callable[[Any], Tuple[bool, Any]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
@@ -41,8 +41,8 @@ def element_mapping(
 
 
 def batch_mapping(
-    iterator: List[Any],
-    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
+    iterator: list[Any],
+    mapping_func: Callable[[list[Any]], Tuple[bool, list[Any]]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
     batch_size=4,
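`batch_mapping` instead takes a batch-level function, per the `Callable[[list[Any]], Tuple[bool, list[Any]]]` annotation above. A hedged sketch with the same caveats about the import path and the boolean flag:

```python
# Sketch only: assumes batch_mapping is importable from xlin.multiprocess_mapping.
from typing import Any, Tuple

from xlin.multiprocess_mapping import batch_mapping


def square_batch(batch: list[Any]) -> Tuple[bool, list[Any]]:
    """Process one batch at a time and return a (success, results) pair."""
    return True, [x * x for x in batch]


results = batch_mapping(
    list(range(16)),
    square_batch,
    use_multiprocessing=False,
    batch_size=4,  # default shown in the diff
)
print(results)
```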
@@ -72,24 +72,26 @@ def dataframe_with_row_mapping(
     return df
 
 
-def multiprocessing_mapping_jsonlist(
-    jsonlist: List[Any],
-    partial_func: Callable[[Any], dict],
+def xmap(
+    jsonlist: list[Any],
+    work_func: Union[Callable[[Any], dict], Callable[[list[Any]], list[dict]]],
     output_path: Optional[Union[str, Path]]=None,  # output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 8)),
     use_process_pool=True,  # set to True for CPU-bound tasks
     preserve_order=True,  # whether to preserve result order
     chunksize=None,  # best chunk size computed automatically
     retry_count=0,  # number of retries on failure
+    force_overwrite=False,  # whether to force-overwrite the output file
+    is_batch_work_func=False,  # whether work_func is a batch-processing function
 ):
     """Efficiently process a list of JSON objects; supports multiprocessing/multithreading.
 
     Args:
-        jsonlist (List[Any]): list of JSON objects to process
+        jsonlist (list[Any]): list of JSON objects to process
         output_path (Optional[Union[str, Path]]): output path; None disables caching
-        partial_func (Callable): worker function; takes a dict and returns a dict
+        work_func (Callable): worker function; takes a dict and returns a dict
         batch_size (int): batch size
        cache_batch_num (int): number of batches to cache
        thread_pool_size (int): thread/process pool size
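Putting the new signature together, here is a hedged usage sketch of `xmap` with a per-item `work_func` (the docstring above says it takes a dict and returns a dict). The import path follows the module listed in the RECORD below; the output path and the `"text"` field are purely illustrative.

```python
# Sketch only: parameter names follow the signature shown in the diff;
# the output path and record fields are made up for illustration.
from pathlib import Path

from xlin.multiprocess_mapping import xmap


def enrich(row: dict) -> dict:
    # per-item worker: receives one JSON object (dict), returns a dict
    row["text_length"] = len(row.get("text", ""))
    return row


data = [{"text": "hello"}, {"text": "world"}]

results = xmap(
    data,
    enrich,                                  # work_func (renamed from partial_func)
    output_path=Path("out/enriched.jsonl"),  # None would disable caching
    thread_pool_size=8,                      # new default per the diff
    use_process_pool=False,                  # threads suffice for this toy example
    preserve_order=True,
    retry_count=1,
)
```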
@@ -110,9 +112,13 @@ def multiprocessing_mapping_jsonlist(
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
-            output_list = load_json_list(output_path)
-            start_idx = len(output_list)
-            logger.info(f"继续处理: 已有{start_idx}条记录,共{len(jsonlist)}条")
+            if force_overwrite:
+                logger.warning(f"强制覆盖输出文件: {output_path}")
+                output_path.unlink()
+            else:
+                output_list = load_json_list(output_path)
+                start_idx = len(output_list)
+                logger.info(f"继续处理: 已有{start_idx}条记录,共{len(jsonlist)}条")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
 
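The branch above decides, when `output_path` already exists, between resuming from the cached records and, with the new `force_overwrite=True` flag, deleting the file and starting over. A short sketch of both call patterns, reusing the hypothetical `data`, `enrich`, and output path from the previous example:

```python
# Resume from an existing cache file (default behaviour shown in the diff):
results = xmap(data, enrich, output_path="out/enriched.jsonl")

# Discard the existing cache and recompute everything:
results = xmap(data, enrich, output_path="out/enriched.jsonl", force_overwrite=True)
```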
@@ -134,9 +140,13 @@ def multiprocessing_mapping_jsonlist(
     # batch-processing logic
     def process_batch(items_batch, retry_remaining=retry_count):
         try:
-            # choose the appropriate mapping method
-            map_func = pool.imap_unordered if not preserve_order else pool.imap
-            return list(map_func(partial_func, items_batch, chunksize))
+            if is_batch_work_func:
+                # batch-processing work function
+                return work_func(items_batch)
+            else:
+                # choose the appropriate mapping method
+                map_func = pool.imap_unordered if not preserve_order else pool.imap
+                return list(map_func(work_func, items_batch, chunksize))
         except Exception as e:
             if retry_remaining > 0:
                 logger.warning(f"批处理失败,重试中 ({retry_count-retry_remaining+1}/{retry_count}): {e}")
@@ -177,7 +187,8 @@ def multiprocessing_mapping_jsonlist(
     # final save
     if need_caching:
         save_json_list(output_list, output_path)
-        logger.info(f"已完成处理并保存{len(output_list)}条记录")
+        drop_count = len(jsonlist) - len(output_list)
+        logger.info(f"处理完成,共处理{len(jsonlist)}条记录" + (f", 丢弃{drop_count}条记录" if drop_count > 0 else ""))
 
     return output_list
 
--- xlin-0.1.25.dist-info/METADATA
+++ xlin-0.1.26.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.25
+Version: 0.1.26
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: LinXueyuanStdio
--- xlin-0.1.25.dist-info/RECORD
+++ xlin-0.1.26.dist-info/RECORD
@@ -2,14 +2,14 @@ xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
 xlin/jsonl.py,sha256=IDRydHh2x-8iAGCxt9ScK2wfNLNA40PxNxR5hhr4v6k,7903
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=ppSNidDLb6pI7_thCcqZBpYtKGTTS4osoPIIbWBu0d4,15893
+xlin/multiprocess_mapping.py,sha256=M_d8G-apwZFcWuQxYexOpN8a4SX3OMUuqdJ4JHHgQHw,16480
 xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
 xlin/statistic.py,sha256=2DCUgzf7xkMFH4Pk9v82bFDNeSxCTjwPh9Y4IPJBHCE,9300
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.25.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
-xlin-0.1.25.dist-info/METADATA,sha256=4xqcaW20xkdlge7nsCWw5yRByrTyXsxZAgPca2TVFpY,1098
-xlin-0.1.25.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
-xlin-0.1.25.dist-info/RECORD,,
+xlin-0.1.26.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.26.dist-info/METADATA,sha256=suP1OEbJd7a0-mhqkh3thQVNBTGUi2amRQVaSGbYafk,1098
+xlin-0.1.26.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.26.dist-info/RECORD,,