xlin 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,18 +12,11 @@ from loguru import logger
12
12
  from xlin.jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json
13
13
 
14
14
 
15
- cpu_count = multiprocessing.cpu_count()
16
- # pool = ThreadPool(cpu_count) # 大模型接口辣鸡,太快会截断答案
17
- thread_pool_size = int(os.getenv("THREAD_POOL_SIZE", 5))
18
- pool = ThreadPool(thread_pool_size)
19
- logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
20
-
21
-
22
15
  def multiprocessing_mapping_jsonlist(
23
16
  jsonlist: List[Any],
24
17
  output_path: Optional[Union[str, Path]],
25
18
  partial_func,
26
- batch_size=cpu_count * 2,
19
+ batch_size=multiprocessing.cpu_count(),
27
20
  cache_batch_num=1,
28
21
  thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
29
22
  ):
@@ -47,7 +40,7 @@ def multiprocessing_mapping_jsonlist(
47
40
  else:
48
41
  output_path.parent.mkdir(parents=True, exist_ok=True)
49
42
  pool = ThreadPool(thread_pool_size)
50
- logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
43
+ logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
51
44
  start_time = time.time()
52
45
  last_save_time = start_time
53
46
  for i, line in tqdm(list(enumerate(jsonlist))):
@@ -78,7 +71,7 @@ def multiprocessing_mapping(
78
71
  df: pd.DataFrame,
79
72
  output_path: Optional[Union[str, Path]],
80
73
  partial_func,
81
- batch_size=cpu_count * 2,
74
+ batch_size=multiprocessing.cpu_count(),
82
75
  cache_batch_num=1,
83
76
  thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
84
77
  ):
@@ -104,7 +97,7 @@ def multiprocessing_mapping(
104
97
  else:
105
98
  output_path.parent.mkdir(parents=True, exist_ok=True)
106
99
  pool = ThreadPool(thread_pool_size)
107
- logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
100
+ logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
108
101
  start_time = time.time()
109
102
  last_save_time = start_time
110
103
  for i, line in tqdm(list(df.iterrows())):
@@ -135,29 +128,51 @@ def multiprocessing_mapping(
135
128
  return output_df, output_list
136
129
 
137
130
 
138
- def dataframe_by_row_mapping(
131
def dataframe_with_row_mapping(
    df: pd.DataFrame,
    mapping_func: Callable[[Tuple[int, dict]], Tuple[bool, dict]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """Map every row of ``df`` through ``mapping_func`` and keep the accepted rows.

    Args:
        df: Source dataframe. It is converted with ``dataframe_to_json_list``
            (presumably one dict per row -- confirm against ``xlin.jsonl``).
        mapping_func: Called with ONE argument, the tuple ``(position, row)``,
            and must return ``(ok, new_row)``. Rows whose ``ok`` is falsy are
            dropped.
        use_multiprocessing: When true, run the calls on a ``ThreadPool``;
            otherwise run them one by one with a ``tqdm`` progress bar.
        thread_pool_size: Number of worker threads. The default is read from
            the ``THREAD_POOL_SIZE`` environment variable once, when the
            function is defined.

    Returns:
        A new ``pd.DataFrame`` built from the accepted ``new_row`` values, in
        input order.
    """
    # Both branches consume the same (position, record) pairs so that
    # mapping_func sees an identical argument whichever branch runs. The
    # previous sequential branch called mapping_func(i, row) with two
    # positional arguments and a pandas Series, which did not match the
    # single-tuple contract used by pool.map.
    indexed_records = list(enumerate(dataframe_to_json_list(df)))

    if use_multiprocessing:
        # The context manager releases the worker threads once map() returns.
        with ThreadPool(thread_pool_size) as pool:
            logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
            results = pool.map(mapping_func, indexed_records)
    else:
        # A materialised list lets tqdm display a total.
        results = [mapping_func(item) for item in tqdm(indexed_records)]

    rows = [row for ok, row in results if ok]
    return pd.DataFrame(rows)
159
152
 
160
153
 
154
def list_with_element_mapping(
    iterator: List[Any],
    mapping_func: Callable[[Tuple[int, Any]], Tuple[bool, Any]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """Map every element of ``iterator`` through ``mapping_func`` and keep the accepted results.

    Args:
        iterator: The elements to process.
        mapping_func: Called with ONE argument, the tuple ``(index, element)``,
            and must return ``(ok, result)``. Results whose ``ok`` is falsy are
            dropped.
        use_multiprocessing: When true, run the calls on a ``ThreadPool``;
            otherwise run them one by one with a ``tqdm`` progress bar.
        thread_pool_size: Number of worker threads. The default is read from
            the ``THREAD_POOL_SIZE`` environment variable once, when the
            function is defined.

    Returns:
        A list of the accepted ``result`` values, in input order.
    """
    if use_multiprocessing:
        # The context manager releases the worker threads once map() returns.
        with ThreadPool(thread_pool_size) as pool:
            logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
            results = pool.map(mapping_func, enumerate(iterator))
        return [row for ok, row in results if ok]

    # Give tqdm a total when the input is sized so the bar shows real progress.
    total = len(iterator) if hasattr(iterator, "__len__") else None
    rows = []
    for indexed_element in tqdm(enumerate(iterator), total=total):
        # Pass the (index, element) pair as a single tuple, exactly as
        # pool.map does above; the previous code called mapping_func(i, row)
        # with two positional arguments, which broke callbacks written to the
        # annotated single-argument contract.
        ok, row = mapping_func(indexed_element)
        if ok:
            rows.append(row)
    return rows
174
+
175
+
161
176
  def continue_run(
162
177
  jsonfiles: List[str],
163
178
  save_dir: str,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xlin
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: toolbox for LinXueyuan
5
5
  License: MIT
6
6
  Author: XiChen
@@ -1,7 +1,7 @@
1
1
  xlin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
3
3
  xlin/jsonl.py,sha256=oE8w8IFVEnBQdWUCMGYF9BlE3wtEhFsmjaLpZPKwSXg,6605
4
- xlin/multiprocess_mapping.py,sha256=DE-liX94UlWJUuDAgIb_FpM6oaiNZ5Hq-HapdbU-DAg,6971
4
+ xlin/multiprocess_mapping.py,sha256=bJKYAvhbeRICn7p3iXYJTh-NRloxgvCa4rpGJSRLh6Y,7516
5
5
  xlin/read_as_dataframe.py,sha256=ir3HUT6dt3crqa3xnlcNn8j3wqjSIGJgiIVLP3KkBaQ,8678
6
6
  xlin/statistic.py,sha256=BLj8hszlbBT5xDIfd70_YtOb8QgZEvYXiFJDGXBwCfw,881
7
7
  xlin/terminal_color.py,sha256=nfE-CY2BzjY2eZbm9yk8r-AuyJ-hchmLXhASCb4HAIA,191
@@ -9,7 +9,7 @@ xlin/util.py,sha256=hme7Zl4Sa_-FTA9TEVzr1qTdaKW1eq5dTWZgd4owcDc,11303
9
9
  xlin/uuid.py,sha256=gouvm7_DL22sIhXl-g4e6S2qzIZtmE3SEp00xy1upyg,271
10
10
  xlin/xls2xlsx.py,sha256=5zfcM0gmunFQOcOj9nYd9Dj0HMhU7-cPKnPIy6Ot9iU,930
11
11
  xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
12
- xlin-0.1.2.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
13
- xlin-0.1.2.dist-info/METADATA,sha256=PKad7J7_9fQ2148ciupAUuSS8Js_3_1C2ZPmM2hnt44,772
14
- xlin-0.1.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
15
- xlin-0.1.2.dist-info/RECORD,,
12
+ xlin-0.1.4.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
13
+ xlin-0.1.4.dist-info/METADATA,sha256=ryDOmmS9Tt8A_ig_5zx6Gcme0Wwg1Wb4V0HPo_h2PNs,772
14
+ xlin-0.1.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
15
+ xlin-0.1.4.dist-info/RECORD,,
File without changes
File without changes