xlin-0.1.23-py2.py3-none-any.whl → xlin-0.1.25-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin/jsonl.py CHANGED
@@ -224,11 +224,16 @@ def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dic
 
 
 
-def append_to_json_list(data: list[dict], file_path: str):
+def append_to_json_list(data: list[dict], file_path: Union[str, Path]):
     """Append a list of dictionaries to a JSON file."""
-    with open(file_path, "a") as f:
+    file_path = Path(file_path)
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    if file_path.exists() and file_path.is_dir():
+        print(f"{file_path} is a directory, not a file.")
+        return
+    with open(file_path, "a", encoding="utf-8") as f:
         for item in data:
-            f.write(json.dumps(item) + "\n")
+            f.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")
 
 
 def row_to_json(row: dict) -> dict:
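For reference, a minimal usage sketch of the updated helper; the sample records and output path below are hypothetical:

from pathlib import Path
from xlin.jsonl import append_to_json_list

records = [{"id": 1, "text": "你好"}, {"id": 2, "text": "world"}]
# str and Path are both accepted now; parent directories are created on demand,
# and ensure_ascii=False keeps non-ASCII text readable in the .jsonl output.
append_to_json_list(records, Path("out/data.jsonl"))
append_to_json_list(records, "out/data.jsonl")  # appends two more lines to the same file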
xlin/multiprocess_mapping.py CHANGED
@@ -21,15 +21,19 @@ def element_mapping(
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
     rows = []
+    # Convert to a list so the length is known for the progress bar
+    items = list(iterator)
+    total = len(items)
+
     if use_multiprocessing:
         pool = ThreadPool(thread_pool_size)
-        results = pool.map(mapping_func, iterator)
-        pool.close()
-        for ok, row in results:
+        # Use imap instead of map, combined with tqdm to show progress
+        for ok, row in tqdm(pool.imap(mapping_func, items), total=total, desc="Processing"):
             if ok:
                 rows.append(row)
+        pool.close()
     else:
-        for row in tqdm(iterator):
+        for row in tqdm(items, desc="Processing"):
             ok, row = mapping_func(row)
             if ok:
                 rows.append(row)
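A sketch of the contract mapping_func must satisfy under this loop: it receives one item and returns an (ok, row) tuple, and rows with a falsy ok are dropped. The helper below is hypothetical:

def keep_non_empty(item: dict):
    # Returns (ok, row); element_mapping keeps the row only when ok is truthy.
    text = item.get("text", "").strip()
    return bool(text), {**item, "text": text}

# Hypothetical call; parameters other than those visible in this hunk
# (use_multiprocessing, thread_pool_size) are assumptions.
# rows = element_mapping(data, keep_non_empty, use_multiprocessing=True)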
@@ -70,57 +74,111 @@ def dataframe_with_row_mapping(
 
 def multiprocessing_mapping_jsonlist(
     jsonlist: List[Any],
-    output_path: Optional[Union[str, Path]],
-    partial_func,
+    partial_func: Callable[[Any], dict],
+    output_path: Optional[Union[str, Path]]=None,  # Output path; None disables caching
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    use_process_pool=True,  # Set to True for CPU-bound tasks
+    preserve_order=True,  # Whether to preserve result order
+    chunksize=None,  # Automatically compute the best chunk size
+    retry_count=0,  # Number of retries on failure
 ):
-    """mapping a column to another column
+    """Efficiently process a JSON list with multiprocessing/multithreading
 
     Args:
-        df (DataFrame): [description]
-        output_path (Path): caching is needed when the data volume is large
-        partial_func (function): (Dict[str, str]) -> Dict[str, str]
+        jsonlist (List[Any]): list of JSON objects to process
+        output_path (Optional[Union[str, Path]]): output path; None disables caching
+        partial_func (Callable): processing function that takes a dict and returns a dict
+        batch_size (int): batch size
+        cache_batch_num (int): number of batches between cache writes
+        thread_pool_size (int): thread/process pool size
+        use_process_pool (bool): whether to use a process pool (CPU-bound tasks)
+        preserve_order (bool): whether to preserve result order
+        chunksize (Optional[int]): chunk size per task; None means auto-computed
+        retry_count (int): number of retries on task failure
     """
     need_caching = output_path is not None
-    tmp_list, output_list = list(), list()
+    output_list = []
     start_idx = 0
+
+    # Automatically compute the best chunksize
+    if chunksize is None:
+        chunksize = max(1, min(batch_size // thread_pool_size, 100))
+
+    # Handle caching
     if need_caching:
         output_path = Path(output_path)
         if output_path.exists():
             output_list = load_json_list(output_path)
             start_idx = len(output_list)
-            logger.warning(f"Cache found {output_path} has {start_idx} rows. This process will continue at row index {start_idx}.")
-            logger.warning(f"Cache {output_path} has {start_idx} rows. This run will resume from row {start_idx}.")
+            logger.info(f"Resuming: {start_idx} records done, {len(jsonlist)} total")
         else:
             output_path.parent.mkdir(parents=True, exist_ok=True)
-    pool = ThreadPool(thread_pool_size)
-    logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-    start_time = time.time()
-    last_save_time = start_time
-    for i, line in tqdm(list(enumerate(jsonlist))):
-        if i < start_idx:
-            continue
-        tmp_list.append(line)
-        if len(tmp_list) == batch_size:
-            results = pool.map(partial_func, tmp_list)
-            output_list.extend([x for x in results])
-            tmp_list = list()
-            if need_caching and (i // batch_size) % cache_batch_num == 0:
-                current_time = time.time()
-                if current_time - last_save_time < 3:
-                    # If multiprocessing finishes batches too quickly, skip batches under 3 seconds and do not cache intermediate results, so IO does not become the bottleneck
-                    last_save_time = current_time
-                    continue
-                save_json_list(output_list, output_path)
-                last_save_time = time.time()
-    if len(tmp_list) > 0:
-        results = pool.map(partial_func, tmp_list)
-        output_list.extend([x for x in results])
-    pool.close()
+
+    # Choose a thread pool or a process pool
+    if use_process_pool:
+        pool_cls = multiprocessing.Pool
+        logger.info(f"Using a process pool (ProcessPool), suited to CPU-bound tasks")
+    else:
+        pool_cls = ThreadPool
+        logger.info(f"Using a thread pool (ThreadPool), suited to IO-bound tasks")
+
+    with pool_cls(thread_pool_size) as pool:
+        logger.info(f"Pool size: {thread_pool_size}, batch size: {batch_size}, chunk size: {chunksize}")
+
+        # Prepare the data to process
+        remaining_items = jsonlist[start_idx:]
+        total_items = len(remaining_items)
+
+        # Batch processing logic
+        def process_batch(items_batch, retry_remaining=retry_count):
+            try:
+                # Pick the appropriate mapping method
+                map_func = pool.imap_unordered if not preserve_order else pool.imap
+                return list(map_func(partial_func, items_batch, chunksize))
+            except Exception as e:
+                if retry_remaining > 0:
+                    logger.warning(f"Batch failed, retrying ({retry_count-retry_remaining+1}/{retry_count}): {e}")
+                    return process_batch(items_batch, retry_remaining - 1)
+                else:
+                    logger.error(f"Batch failed: {e}")
+                    raise
+
+        # Process the data
+        with tqdm(total=total_items, desc="Processing", unit="item") as pbar:
+            # Skip items that were already processed
+            pbar.update(start_idx)
+
+            # Process in batches
+            for i in range(0, total_items, batch_size):
+                batch = remaining_items[i : i + batch_size]
+
+                # Process the current batch
+                batch_start_time = time.time()
+                results = process_batch(batch)
+                batch_time = time.time() - batch_start_time
+
+                # Collect the results
+                output_list.extend(results)
+                pbar.update(len(batch))
+
+                # Performance statistics
+                items_per_second = len(batch) / batch_time if batch_time > 0 else 0
+                pbar.set_postfix_str(f"rate: {items_per_second:.1f} items/s")
+
+                # Caching logic
+                if need_caching and (i // batch_size) % cache_batch_num == 0:
+                    # Only save the cache when processing is slow enough, so IO does not become the bottleneck
+                    if batch_time > 3 or i + batch_size >= total_items:
+                        save_json_list(output_list, output_path)
+                        logger.debug(f"Saved {len(output_list)} records to {output_path}")
+
+    # Final save
     if need_caching:
         save_json_list(output_list, output_path)
+        logger.info(f"Finished processing and saved {len(output_list)} records")
+
     return output_list
 
 
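Note the argument order change: partial_func is now the second positional parameter and output_path has become a keyword argument defaulting to None. A hedged usage sketch, assuming the function lives in xlin/multiprocess_mapping.py as the RECORD entry below suggests; the worker and paths are hypothetical. With use_process_pool=True the worker must be picklable (a module-level function) and the call should sit under an if __name__ == "__main__" guard on spawn-based platforms:

from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist

def enrich(row: dict) -> dict:
    # Must be a top-level function so a process pool can pickle it.
    row["length"] = len(row.get("text", ""))
    return row

if __name__ == "__main__":
    jsonlist = [{"text": "hello"}, {"text": "world"}]
    results = multiprocessing_mapping_jsonlist(
        jsonlist,
        enrich,                              # partial_func is now the 2nd positional argument
        output_path="cache/enriched.jsonl",  # None disables intermediate caching
        use_process_pool=False,              # ThreadPool is enough for IO-bound work
        preserve_order=True,
    )
    print(len(results))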
@@ -186,35 +244,6 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def continue_run(
-    jsonfiles: List[str],
-    save_dir: str,
-    mapping_func,
-    load_func=load_json,
-    save_func=save_json,
-    batch_size=1024,
-    cache_size=8,
-):
-    save_dir: Path = Path(save_dir)
-    save_dir.mkdir(parents=True, exist_ok=True)
-    new_jsonfiles = []
-    for jsonfile in ls(jsonfiles):
-        jsonlist = load_func(jsonfile)
-        output_filepath = save_dir / jsonfile.name
-        for row in jsonlist:
-            row["来源"] = jsonfile.name
-        new_jsonlist = multiprocessing_mapping_jsonlist(
-            jsonlist,
-            output_filepath,
-            mapping_func,
-            batch_size,
-            cache_size,
-        )
-        save_func(new_jsonlist, output_filepath)
-        new_jsonfiles.append(output_filepath)
-    return new_jsonfiles
-
-
 def dataframe_mapping(
     df: pd.DataFrame,
     row_func: Callable[[dict], dict],
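continue_run is removed in 0.1.25, so callers have to reproduce its per-file loop themselves. A rough migration sketch against the new multiprocessing_mapping_jsonlist signature; the import locations of ls, load_json and save_json are assumptions based on the module names in the RECORD below:

from pathlib import Path
from xlin.util import ls                      # assumed location of ls
from xlin.jsonl import load_json, save_json   # assumed location of the JSON helpers
from xlin.multiprocess_mapping import multiprocessing_mapping_jsonlist

def run_over_files(jsonfiles, save_dir, mapping_func):
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    new_jsonfiles = []
    for jsonfile in ls(jsonfiles):
        jsonlist = load_json(jsonfile)
        output_filepath = save_dir / jsonfile.name
        for row in jsonlist:
            row["来源"] = jsonfile.name
        # partial_func now precedes output_path, unlike the removed positional call above.
        new_jsonlist = multiprocessing_mapping_jsonlist(
            jsonlist, mapping_func, output_path=output_filepath
        )
        save_json(new_jsonlist, output_filepath)
        new_jsonfiles.append(output_filepath)
    return new_jsonfiles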
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.23
+Version: 0.1.25
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: LinXueyuanStdio
@@ -1,15 +1,15 @@
 xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
-xlin/jsonl.py,sha256=Ogn_9eIx1NPmI_hMvBVwuDTooJYDEJ8FTtViQ8zTVlQ,7618
+xlin/jsonl.py,sha256=IDRydHh2x-8iAGCxt9ScK2wfNLNA40PxNxR5hhr4v6k,7903
 xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=dRXQoLaG1dK_qZ8B3bJblV0RKM2gqIeSW1EaOZbIdD0,14251
+xlin/multiprocess_mapping.py,sha256=ppSNidDLb6pI7_thCcqZBpYtKGTTS4osoPIIbWBu0d4,15893
 xlin/read_as_dataframe.py,sha256=MqY57L7Wp9UoWTRlZLSBKQNaZa-dKw51-ufrKvHKf8s,9041
 xlin/statistic.py,sha256=2DCUgzf7xkMFH4Pk9v82bFDNeSxCTjwPh9Y4IPJBHCE,9300
 xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
 xlin/util.py,sha256=TTWJaqF5D_r-gAZ_fj0kyHomvCagjwHXQZ2OPSgwd54,10976
 xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.23.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
-xlin-0.1.23.dist-info/METADATA,sha256=b0fZmt4pTd0U3NZd0N3P3CfHNIjdsbebm2Km-IvX_-E,1098
-xlin-0.1.23.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
-xlin-0.1.23.dist-info/RECORD,,
+xlin-0.1.25.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.1.25.dist-info/METADATA,sha256=4xqcaW20xkdlge7nsCWw5yRByrTyXsxZAgPca2TVFpY,1098
+xlin-0.1.25.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.1.25.dist-info/RECORD,,