xlin 0.1.24__py2.py3-none-any.whl → 0.1.26__py2.py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -15,21 +15,25 @@ from xlin.util import ls
 
 
 def element_mapping(
-    iterator: List[Any],
+    iterator: list[Any],
     mapping_func: Callable[[Any], Tuple[bool, Any]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
     rows = []
+    # materialize the iterator into a list so its length is known for the progress bar
+    items = list(iterator)
+    total = len(items)
+
     if use_multiprocessing:
         pool = ThreadPool(thread_pool_size)
-        results = pool.map(mapping_func, iterator)
-        pool.close()
-        for ok, row in results:
+        # use imap instead of map so tqdm can show progress as results arrive
+        for ok, row in tqdm(pool.imap(mapping_func, items), total=total, desc="Processing"):
             if ok:
                 rows.append(row)
+        pool.close()
     else:
-        for row in tqdm(iterator):
+        for row in tqdm(items, desc="Processing"):
             ok, row = mapping_func(row)
             if ok:
                 rows.append(row)
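
The element_mapping change above materializes the iterator into a list so tqdm can display a total, and switches pool.map to pool.imap so the bar advances as results arrive. A minimal usage sketch (the import path follows the xlin/multiprocess_mapping.py entry in the RECORD diff below and is an assumption):

    from xlin.multiprocess_mapping import element_mapping

    def keep_even_square(x):
        # mapping_func returns (ok, value); rows with ok=False are dropped
        return x % 2 == 0, x * x

    rows = element_mapping(list(range(10)), keep_even_square)
    print(rows)  # [0, 4, 16, 36, 64], with a "Processing" bar showing total=10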
@@ -37,8 +41,8 @@ def element_mapping(
 
 
 def batch_mapping(
-    iterator: List[Any],
-    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
+    iterator: list[Any],
+    mapping_func: Callable[[list[Any]], Tuple[bool, list[Any]]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
     batch_size=4,
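
batch_mapping only picks up the builtin-generic type hints (List -> list) in this release. For reference, a hedged sketch of a mapping_func matching the new annotation Callable[[list[Any]], Tuple[bool, list[Any]]]:

    def double_batch(batch: list[int]) -> tuple[bool, list[int]]:
        # receives a whole batch (default batch_size=4) and returns (ok, results)
        return True, [x * 2 for x in batch]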
@@ -68,59 +72,124 @@ def dataframe_with_row_mapping(
     return df
 
 
-def multiprocessing_mapping_jsonlist(
-    jsonlist: List[Any],
-    output_path: Optional[Union[str, Path]],
-    partial_func,
+def xmap(
+    jsonlist: list[Any],
+    work_func: Union[Callable[[Any], dict], Callable[[list[Any]], list[dict]]],
+    output_path: Optional[Union[str, Path]] = None,  # output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 8)),
+    use_process_pool=True,  # set True for CPU-bound tasks
+    preserve_order=True,  # whether to preserve result order
+    chunksize=None,  # None: compute a good chunk size automatically
+    retry_count=0,  # number of retries on failure
+    force_overwrite=False,  # whether to force-overwrite the output file
+    is_batch_work_func=False,  # whether work_func processes a whole batch
 ):
-    """mapping a column to another column
+    """Efficiently process a JSON list with multiprocessing/multithreading support.
 
     Args:
-        df (DataFrame): [description]
-        output_path (Path): caching is needed when the dataset is large
-        partial_func (function): (Dict[str, str]) -> Dict[str, str]
+        jsonlist (list[Any]): list of JSON objects to process
+        output_path (Optional[Union[str, Path]]): output path; None disables caching
+        work_func (Callable): worker function that takes a dict and returns a dict
+        batch_size (int): batch size
+        cache_batch_num (int): number of batches between cache writes
+        thread_pool_size (int): thread/process pool size
+        use_process_pool (bool): whether to use a process pool (CPU-bound tasks)
+        preserve_order (bool): whether to preserve result order
+        chunksize (Optional[int]): chunk size per task; None computes it automatically
+        retry_count (int): number of retries for failed tasks
     """
     need_caching = output_path is not None
-    tmp_list, output_list = list(), list()
+    output_list = []
     start_idx = 0
+
+    # compute a good chunksize automatically
+    if chunksize is None:
+        chunksize = max(1, min(batch_size // thread_pool_size, 100))
+
+    # handle the cache
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
-            output_list = load_json_list(output_path)
-            start_idx = len(output_list)
-            logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
-            logger.warning(f"Cache {output_path} contains {start_idx} rows; this run resumes from row {start_idx}.")
+            if force_overwrite:
+                logger.warning(f"force-overwriting output file: {output_path}")
+                output_path.unlink()
+            else:
+                output_list = load_json_list(output_path)
+                start_idx = len(output_list)
+                logger.info(f"resuming: {start_idx} of {len(jsonlist)} records already processed")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
-    pool = ThreadPool(thread_pool_size)
-    logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-    start_time = time.time()
-    last_save_time = start_time
-    for i, line in tqdm(list(enumerate(jsonlist))):
-        if i < start_idx:
-            continue
-        tmp_list.append(line)
-        if len(tmp_list) == batch_size:
-            results = pool.map(partial_func, tmp_list)
-            output_list.extend([x for x in results])
-            tmp_list = list()
-            if need_caching and (i // batch_size) % cache_batch_num == 0:
-                current_time = time.time()
-                if current_time - last_save_time < 3:
-                    # if the workers finish too quickly, skip batches under 3 seconds and do not cache
-                    # intermediate results, so that IO does not become the bottleneck and slow progress
-                    last_save_time = current_time
-                    continue
-                save_json_list(output_list, output_path)
-                last_save_time = time.time()
-    if len(tmp_list) > 0:
-        results = pool.map(partial_func, tmp_list)
-        output_list.extend([x for x in results])
-    pool.close()
+
+    # choose a thread pool or a process pool
+    if use_process_pool:
+        pool_cls = multiprocessing.Pool
+        logger.info("using a process pool (ProcessPool), suited to CPU-bound tasks")
+    else:
+        pool_cls = ThreadPool
+        logger.info("using a thread pool (ThreadPool), suited to IO-bound tasks")
+
+    with pool_cls(thread_pool_size) as pool:
+        logger.info(f"pool size: {thread_pool_size}, batch size: {batch_size}, chunk size: {chunksize}")
+
+        # data still to be processed
+        remaining_items = jsonlist[start_idx:]
+        total_items = len(remaining_items)
+
+        # batch-processing logic
+        def process_batch(items_batch, retry_remaining=retry_count):
+            try:
+                if is_batch_work_func:
+                    # work_func handles a whole batch at once
+                    return work_func(items_batch)
+                else:
+                    # pick the mapping method
+                    map_func = pool.imap_unordered if not preserve_order else pool.imap
+                    return list(map_func(work_func, items_batch, chunksize))
+            except Exception as e:
+                if retry_remaining > 0:
+                    logger.warning(f"batch failed, retrying ({retry_count - retry_remaining + 1}/{retry_count}): {e}")
+                    return process_batch(items_batch, retry_remaining - 1)
+                else:
+                    logger.error(f"batch failed: {e}")
+                    raise
+
+        # process the data (total covers the whole list so resumed runs show overall progress)
+        with tqdm(total=len(jsonlist), desc="processing", unit="item") as pbar:
+            # account for items already processed in a previous run
+            pbar.update(start_idx)
+
+            # process in batches
+            for i in range(0, total_items, batch_size):
+                batch = remaining_items[i : i + batch_size]
+
+                # run the current batch
+                batch_start_time = time.time()
+                results = process_batch(batch)
+                batch_time = time.time() - batch_start_time
+
+                # collect results
+                output_list.extend(results)
+                pbar.update(len(batch))
+
+                # throughput statistics
+                items_per_second = len(batch) / batch_time if batch_time > 0 else 0
+                pbar.set_postfix_str(f"rate: {items_per_second:.1f} items/s")
+
+                # caching logic
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    # only write the cache when batches are slow enough that IO cannot become the bottleneck
+                    if batch_time > 3 or i + batch_size >= total_items:
+                        save_json_list(output_list, output_path)
+                        logger.debug(f"saved {len(output_list)} records to {output_path}")
+
+    # final save
     if need_caching:
         save_json_list(output_list, output_path)
+    drop_count = len(jsonlist) - len(output_list)
+    logger.info(f"done: processed {len(jsonlist)} records" + (f", dropped {drop_count} records" if drop_count > 0 else ""))
+
     return output_list
 
 
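
Besides the rename from multiprocessing_mapping_jsonlist to xmap, this hunk adds process-pool support, resumable caching, retries, and throughput reporting. A hedged usage sketch (import path assumed from the RECORD diff; with use_process_pool=True the work_func must be picklable, i.e. defined at module top level):

    from xlin.multiprocess_mapping import xmap

    def enrich(row: dict) -> dict:
        return {**row, "text_len": len(row.get("text", ""))}

    data = [{"text": "hello"}, {"text": "world"}]
    # threads avoid pickling the worker; rerunning resumes from output.jsonl
    # unless force_overwrite=True
    result = xmap(data, enrich, output_path="output.jsonl",
                  use_process_pool=False, retry_count=1)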
@@ -186,35 +255,6 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def continue_run(
-    jsonfiles: List[str],
-    save_dir: str,
-    mapping_func,
-    load_func=load_json,
-    save_func=save_json,
-    batch_size=1024,
-    cache_size=8,
-):
-    save_dir: Path = Path(save_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    new_jsonfiles = []
-    for jsonfile in ls(jsonfiles):
-        jsonlist = load_func(jsonfile)
-        output_filepath = save_dir / jsonfile.name
-        for row in jsonlist:
-            row["来源"] = jsonfile.name
-        new_jsonlist = multiprocessing_mapping_jsonlist(
-            jsonlist,
-            output_filepath,
-            mapping_func,
-            batch_size,
-            cache_size,
-        )
-        save_func(new_jsonlist, output_filepath)
-        new_jsonfiles.append(output_filepath)
-    return new_jsonfiles
-
-
 def dataframe_mapping(
     df: pd.DataFrame,
     row_func: Callable[[dict], dict],
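
continue_run is removed together with its callee multiprocessing_mapping_jsonlist. A hedged migration sketch built on xmap, mirroring the removed loop (load_func/save_func are passed in instead of defaulted, and map(Path, ...) stands in for xlin.util.ls; both substitutions are assumptions):

    from pathlib import Path
    from xlin.multiprocess_mapping import xmap  # import path assumed

    def continue_run_with_xmap(jsonfiles, save_dir, work_func, load_func, save_func):
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        new_jsonfiles = []
        for jsonfile in map(Path, jsonfiles):
            jsonlist = load_func(jsonfile)
            output_filepath = save_dir / jsonfile.name
            for row in jsonlist:
                row["来源"] = jsonfile.name  # tag each row with its source file, as before
            new_jsonlist = xmap(jsonlist, work_func, output_path=output_filepath)
            save_func(new_jsonlist, output_filepath)
            new_jsonfiles.append(output_filepath)
        return new_jsonfiles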
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.24
+Version: 0.1.26
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: LinXueyuanStdio
@@ -2,14 +2,14 @@ xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
 xlin/jsonl.py,sha256=IDRydHh2x-8iAGCxt9ScK2wfNLNA40PxNxR5hhr4v6k,7903
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
+xlin/multiprocess_mapping.py,sha256=M_d8G-apwZFcWuQxYexOpN8a4SX3OMUuqdJ4JHHgQHw,16480
 xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
 xlin/statistic.py,sha256=2DCUgzf7xkMFH4Pk9v82bFDNeSxCTjwPh9Y4IPJBHCE,9300
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.24.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
-xlin-0.1.24.dist-info/METADATA,sha256=Xm8lU2owbTVPZt1G9baKLiWMIqUZB3THuoCgGGJrQ58,1098
-xlin-0.1.24.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
-xlin-0.1.24.dist-info/RECORD,,
+xlin-0.1.26.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.26.dist-info/METADATA,sha256=suP1OEbJd7a0-mhqkh3thQVNBTGUi2amRQVaSGbYafk,1098
+xlin-0.1.26.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.26.dist-info/RECORD,,