xlin 0.1.4__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xlin-0.1.4 → xlin-0.1.6}/PKG-INFO +1 -1
- {xlin-0.1.4 → xlin-0.1.6}/pyproject.toml +1 -1
- {xlin-0.1.4 → xlin-0.1.6}/xlin/multiprocess_mapping.py +59 -48
- {xlin-0.1.4 → xlin-0.1.6}/LICENSE +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/README.md +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/__init__.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/ischinese.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/jsonl.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/read_as_dataframe.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/statistic.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/terminal_color.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/util.py +1 -1
- {xlin-0.1.4 → xlin-0.1.6}/xlin/uuid.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.4 → xlin-0.1.6}/xlin/yaml.py +0 -0
@@ -9,7 +9,62 @@ from pathlib import Path
|
|
9
9
|
from tqdm import tqdm
|
10
10
|
from loguru import logger
|
11
11
|
|
12
|
-
from xlin.jsonl import
|
12
|
+
from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
|
13
|
+
from xlin.util import ls
|
14
|
+
|
15
|
+
|
16
|
+
def element_mapping(
    iterator: List[Any],
    mapping_func: Callable[[Any], Tuple[bool, Any]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """Apply ``mapping_func`` to every element and collect the accepted results.

    Args:
        iterator: Elements to process. Any iterable is accepted; the threaded
            path hands it to ``ThreadPool.map`` and the serial path iterates
            it directly.
        mapping_func: Called once per element. Must return an ``(ok, value)``
            pair; ``value`` is kept only when ``ok`` is truthy.
        use_multiprocessing: When true, the work is spread over a
            ``ThreadPool`` (threads, despite the parameter name). When false,
            elements are processed one by one under a ``tqdm`` progress bar.
        thread_pool_size: Number of worker threads for the threaded path. The
            default is read from the ``THREAD_POOL_SIZE`` environment variable
            once, when the function is defined, falling back to 5.

    Returns:
        A list of the accepted values, in input order.
    """
    if use_multiprocessing:
        # The context manager shuts the pool down even if mapping_func raises,
        # so worker threads are not leaked on the error path. pool.map blocks
        # until every result is available, so nothing is cut short on exit.
        with ThreadPool(thread_pool_size) as pool:
            results = pool.map(mapping_func, iterator)
    else:
        # Lazily evaluated below, so the progress bar advances per element.
        results = (mapping_func(item) for item in tqdm(iterator))
    return [row for ok, row in results if ok]
|
36
|
+
|
37
|
+
|
38
|
+
def batch_mapping(
    iterator: List[Any],
    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
    batch_size=4,
):
    """Group elements into batches, map each batch, and flatten the results.

    Args:
        iterator: Elements to process, consumed in order.
        mapping_func: Called once per batch (a list of up to ``batch_size``
            elements). Must return an ``(ok, values)`` pair; ``values`` is
            kept only when ``ok`` is truthy.
        use_multiprocessing: Forwarded unchanged to ``element_mapping``.
        thread_pool_size: Forwarded unchanged to ``element_mapping``.
        batch_size: Number of elements per batch; the final batch may be
            shorter. With a value below 1 the size check never matches, so
            all elements end up in one single batch.

    Returns:
        A flat list of the values from every accepted batch, in batch order.
    """
    batches = []
    current = []
    for item in iterator:
        current.append(item)
        if len(current) != batch_size:
            continue
        # The batch is full: hand it over and start a fresh one.
        batches.append(current)
        current = []
    if current:
        # Leftover elements form a final, shorter batch.
        batches.append(current)

    mapped_batches = element_mapping(batches, mapping_func, use_multiprocessing, thread_pool_size)

    flattened = []
    for mapped in mapped_batches:
        flattened.extend(mapped)
    return flattened
|
57
|
+
|
58
|
+
|
59
|
+
def dataframe_with_row_mapping(
    df: pd.DataFrame,
    mapping_func: Callable[[dict], Tuple[bool, dict]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """Map every row of ``df`` and build a new DataFrame from the accepted rows.

    Args:
        df: Source frame; its rows are visited via ``df.iterrows()``.
        mapping_func: Called once per row with the row object only (the index
            is dropped). Must return an ``(ok, row)`` pair; ``row`` is kept
            only when ``ok`` is truthy.
            NOTE(review): the annotation says ``dict``, but ``iterrows``
            yields a pandas Series per row - confirm what callers expect.
        use_multiprocessing: Forwarded unchanged to ``element_mapping``.
        thread_pool_size: Forwarded unchanged to ``element_mapping``.

    Returns:
        A DataFrame constructed from the list of accepted rows.
    """

    def _apply_to_row(indexed_row):
        # Each item from iterrows() is an (index, row) pair; only the row is used.
        _, row = indexed_row
        return mapping_func(row)

    kept_rows = element_mapping(df.iterrows(), _apply_to_row, use_multiprocessing, thread_pool_size)
    return pd.DataFrame(kept_rows)
|
13
68
|
|
14
69
|
|
15
70
|
def multiprocessing_mapping_jsonlist(
|
@@ -62,6 +117,7 @@ def multiprocessing_mapping_jsonlist(
|
|
62
117
|
if len(tmp_list) > 0:
|
63
118
|
results = pool.map(partial_func, tmp_list)
|
64
119
|
output_list.extend([x for x in results])
|
120
|
+
pool.close()
|
65
121
|
if need_caching:
|
66
122
|
save_json_list(output_list, output_path)
|
67
123
|
return output_list
|
@@ -122,57 +178,13 @@ def multiprocessing_mapping(
|
|
122
178
|
if len(tmp_list) > 0:
|
123
179
|
results = pool.map(partial_func, tmp_list)
|
124
180
|
output_list.extend([x for x in results])
|
181
|
+
pool.close()
|
125
182
|
output_df = pd.DataFrame(output_list)
|
126
183
|
if need_caching:
|
127
184
|
output_df.to_excel(output_path, index=False)
|
128
185
|
return output_df, output_list
|
129
186
|
|
130
187
|
|
131
|
-
def dataframe_with_row_mapping(
|
132
|
-
df: pd.DataFrame,
|
133
|
-
mapping_func: Callable[[Tuple[int, dict]], Tuple[bool, dict]],
|
134
|
-
use_multiprocessing=True,
|
135
|
-
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
|
136
|
-
):
|
137
|
-
rows = []
|
138
|
-
if use_multiprocessing:
|
139
|
-
pool = ThreadPool(thread_pool_size)
|
140
|
-
logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
|
141
|
-
results = pool.map(mapping_func, enumerate(dataframe_to_json_list(df)))
|
142
|
-
for ok, row in results:
|
143
|
-
if ok:
|
144
|
-
rows.append(row)
|
145
|
-
else:
|
146
|
-
for i, row in tqdm(df.iterrows()):
|
147
|
-
ok, row = mapping_func(i, row)
|
148
|
-
if ok:
|
149
|
-
rows.append(row)
|
150
|
-
df = pd.DataFrame(rows)
|
151
|
-
return df
|
152
|
-
|
153
|
-
|
154
|
-
def list_with_element_mapping(
|
155
|
-
iterator: List[Any],
|
156
|
-
mapping_func: Callable[[Tuple[int, Any]], Tuple[bool, Any]],
|
157
|
-
use_multiprocessing=True,
|
158
|
-
thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
|
159
|
-
):
|
160
|
-
rows = []
|
161
|
-
if use_multiprocessing:
|
162
|
-
pool = ThreadPool(thread_pool_size)
|
163
|
-
logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
|
164
|
-
results = pool.map(mapping_func, enumerate(iterator))
|
165
|
-
for ok, row in results:
|
166
|
-
if ok:
|
167
|
-
rows.append(row)
|
168
|
-
else:
|
169
|
-
for i, row in tqdm(enumerate(iterator)):
|
170
|
-
ok, row = mapping_func(i, row)
|
171
|
-
if ok:
|
172
|
-
rows.append(row)
|
173
|
-
return rows
|
174
|
-
|
175
|
-
|
176
188
|
def continue_run(
|
177
189
|
jsonfiles: List[str],
|
178
190
|
save_dir: str,
|
@@ -185,8 +197,7 @@ def continue_run(
|
|
185
197
|
save_dir: Path = Path(save_dir)
|
186
198
|
save_dir.mkdir(parents=True, exist_ok=True)
|
187
199
|
new_jsonfiles = []
|
188
|
-
for jsonfile in jsonfiles:
|
189
|
-
jsonfile = Path(jsonfile)
|
200
|
+
for jsonfile in ls(jsonfiles):
|
190
201
|
jsonlist = load_func(jsonfile)
|
191
202
|
output_filepath = save_dir / jsonfile.name
|
192
203
|
for row in jsonlist:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|