xlin 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.4
+Version: 0.1.6
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: XiChen
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "xlin"
-version = "0.1.4"
+version = "0.1.6"
 description = "toolbox for LinXueyuan"
 authors = ["XiChen <23211526+LinXueyuanStdio@users.noreply.github.com>"]
 license = "MIT"
@@ -9,7 +9,62 @@ from pathlib import Path
 from tqdm import tqdm
 from loguru import logger
 
-from xlin.jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json
+from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
+from xlin.util import ls
+
+
+def element_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[Any], Tuple[bool, Any]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = []
+    if use_multiprocessing:
+        pool = ThreadPool(thread_pool_size)
+        results = pool.map(mapping_func, iterator)
+        pool.close()
+        for ok, row in results:
+            if ok:
+                rows.append(row)
+    else:
+        for row in tqdm(iterator):
+            ok, row = mapping_func(row)
+            if ok:
+                rows.append(row)
+    return rows
+
+
+def batch_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    batch_size=4,
+):
+    batch_iterator = []
+    batch = []
+    for i, item in enumerate(iterator):
+        batch.append(item)
+        if len(batch) == batch_size:
+            batch_iterator.append(batch)
+            batch = []
+    if len(batch) > 0:
+        batch_iterator.append(batch)
+    rows = element_mapping(batch_iterator, mapping_func, use_multiprocessing, thread_pool_size)
+    rows = [row for batch in rows for row in batch]
+    return rows
+
+
+def dataframe_with_row_mapping(
+    df: pd.DataFrame,
+    mapping_func: Callable[[dict], Tuple[bool, dict]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = element_mapping(df.iterrows(), lambda x: mapping_func(x[1]), use_multiprocessing, thread_pool_size)
+    df = pd.DataFrame(rows)
+    return df
 
 
 def multiprocessing_mapping_jsonlist(
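The new helpers added above share one contract: `mapping_func` returns an `(ok, result)` tuple, and only results whose `ok` flag is true are kept. A minimal usage sketch of that contract (the data, functions, and top-level import path below are assumptions for illustration, not shown in this diff):

```python
# Sketch of the (ok, value) contract used by the new 0.1.6 helpers.
# NOTE: the import path is an assumption; the diff does not name the module.
from xlin import element_mapping, batch_mapping

def keep_even_squares(x):
    # Return (ok, result); items with ok == False are dropped.
    return x % 2 == 0, x * x

# use_multiprocessing=False takes the sequential tqdm branch.
print(element_mapping(list(range(10)), keep_even_squares, use_multiprocessing=False))
# -> [0, 4, 16, 36, 64]

def sum_batch(batch):
    # batch_mapping groups items into lists of batch_size and flattens the results.
    return True, [sum(batch)]

print(batch_mapping(list(range(10)), sum_batch, use_multiprocessing=False, batch_size=4))
# -> [6, 22, 17]  (sums of [0,1,2,3], [4,5,6,7], [8,9])
```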
@@ -62,6 +117,7 @@ def multiprocessing_mapping_jsonlist(
     if len(tmp_list) > 0:
         results = pool.map(partial_func, tmp_list)
         output_list.extend([x for x in results])
+    pool.close()
    if need_caching:
        save_json_list(output_list, output_path)
    return output_list
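The `pool.close()` added here (and in `multiprocessing_mapping` below) fixes a resource leak: each call previously left its `ThreadPool` worker threads alive after `map` returned. A standalone illustration of the pattern, not package code:

```python
# Standalone illustration of the ThreadPool lifecycle the fix enforces.
from multiprocessing.pool import ThreadPool

pool = ThreadPool(5)
results = pool.map(lambda x: x + 1, [1, 2, 3])
pool.close()  # no further tasks may be submitted; idle workers can exit
pool.join()   # optional: block until all workers have terminated
print(results)  # [2, 3, 4]
```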
@@ -122,57 +178,13 @@ def multiprocessing_mapping(
     if len(tmp_list) > 0:
         results = pool.map(partial_func, tmp_list)
         output_list.extend([x for x in results])
+    pool.close()
     output_df = pd.DataFrame(output_list)
     if need_caching:
         output_df.to_excel(output_path, index=False)
     return output_df, output_list
 
 
-def dataframe_with_row_mapping(
-    df: pd.DataFrame,
-    mapping_func: Callable[[Tuple[int, dict]], Tuple[bool, dict]],
-    use_multiprocessing=True,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
-):
-    rows = []
-    if use_multiprocessing:
-        pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-        results = pool.map(mapping_func, enumerate(dataframe_to_json_list(df)))
-        for ok, row in results:
-            if ok:
-                rows.append(row)
-    else:
-        for i, row in tqdm(df.iterrows()):
-            ok, row = mapping_func(i, row)
-            if ok:
-                rows.append(row)
-    df = pd.DataFrame(rows)
-    return df
-
-
-def list_with_element_mapping(
-    iterator: List[Any],
-    mapping_func: Callable[[Tuple[int, Any]], Tuple[bool, Any]],
-    use_multiprocessing=True,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
-):
-    rows = []
-    if use_multiprocessing:
-        pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-        results = pool.map(mapping_func, enumerate(iterator))
-        for ok, row in results:
-            if ok:
-                rows.append(row)
-    else:
-        for i, row in tqdm(enumerate(iterator)):
-            ok, row = mapping_func(i, row)
-            if ok:
-                rows.append(row)
-    return rows
-
-
 def continue_run(
     jsonfiles: List[str],
     save_dir: str,
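Beyond the leak fix, this hunk removes the old index-based variants in favor of the helpers added earlier in the file: `list_with_element_mapping` is superseded by `element_mapping`, and `dataframe_with_row_mapping` is re-implemented on top of it. The visible contract change for callers is that `mapping_func` no longer receives `enumerate`-style `(index, item)` pairs. (The removed code was also internally inconsistent: its threaded branch passed a single tuple, while its sequential branch called `mapping_func(i, row)` with two arguments.) A before/after sketch, with illustrative function names:

```python
# 0.1.4 style: mapping_func was mapped over enumerate(...), so it received
# an (index, item) tuple.
def old_style(pair):
    i, item = pair
    return True, item

# 0.1.6 style: mapping_func receives the item (or row dict) directly.
def new_style(item):
    return True, item
```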
@@ -185,8 +197,7 @@ def continue_run(
     save_dir: Path = Path(save_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
     new_jsonfiles = []
-    for jsonfile in jsonfiles:
-        jsonfile = Path(jsonfile)
+    for jsonfile in ls(jsonfiles):
         jsonlist = load_func(jsonfile)
         output_filepath = save_dir / jsonfile.name
         for row in jsonlist:
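`continue_run` now delegates input normalization to `ls` from `xlin.util` (imported in the first hunk of this file). Its implementation is not shown in this diff; from the call site it must at least turn each entry into something with a `.name` attribute, i.e. roughly the `Path`-coercion loop it replaces. A hypothetical stand-in:

```python
# Hypothetical stand-in for xlin.util.ls, inferred only from its call site;
# the real function may also expand directories or glob patterns.
from pathlib import Path
from typing import List, Union

def ls_sketch(paths: List[Union[str, Path]]) -> List[Path]:
    return [Path(p) for p in paths]
```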
@@ -1,5 +1,5 @@
-from collections import defaultdict
 from typing import *
+from collections import defaultdict
 from pathlib import Path
 import pandas as pd
 import os