xlin 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xlin-0.1.4 → xlin-0.1.5}/PKG-INFO +1 -1
- {xlin-0.1.4 → xlin-0.1.5}/pyproject.toml +1 -1
- {xlin-0.1.4 → xlin-0.1.5}/xlin/multiprocess_mapping.py +56 -48
- {xlin-0.1.4 → xlin-0.1.5}/LICENSE +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/README.md +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/__init__.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/ischinese.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/jsonl.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/read_as_dataframe.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/statistic.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/terminal_color.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/util.py +1 -1
- {xlin-0.1.4 → xlin-0.1.5}/xlin/uuid.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.4 → xlin-0.1.5}/xlin/yaml.py +0 -0
@@ -9,7 +9,61 @@ from pathlib import Path
 from tqdm import tqdm
 from loguru import logger
 
-from xlin.jsonl import
+from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
+from xlin.util import ls
+
+
+def element_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[Any], Tuple[bool, Any]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = []
+    if use_multiprocessing:
+        pool = ThreadPool(thread_pool_size)
+        results = pool.map(mapping_func, iterator)
+        for ok, row in results:
+            if ok:
+                rows.append(row)
+    else:
+        for row in tqdm(iterator):
+            ok, row = mapping_func(row)
+            if ok:
+                rows.append(row)
+    return rows
+
+
+def batch_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+    batch_size=4,
+):
+    batch_iterator = []
+    batch = []
+    for i, item in enumerate(iterator):
+        batch.append(item)
+        if len(batch) == batch_size:
+            batch_iterator.append(batch)
+            batch = []
+    if len(batch) > 0:
+        batch_iterator.append(batch)
+    rows = element_mapping(batch_iterator, mapping_func, use_multiprocessing, thread_pool_size)
+    rows = [row for batch in rows for row in batch]
+    return rows
+
+
+def dataframe_with_row_mapping(
+    df: pd.DataFrame,
+    mapping_func: Callable[[dict], Tuple[bool, dict]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = element_mapping(df.iterrows(), lambda x: mapping_func(x[1]), use_multiprocessing, thread_pool_size)
+    df = pd.DataFrame(rows)
+    return df
 
 
 def multiprocessing_mapping_jsonlist(
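
The hunk above introduces three filter-and-map helpers sharing one convention: `mapping_func` returns an `(ok, result)` tuple, and results with `ok=False` are dropped. A minimal usage sketch follows; it is hypothetical, assuming the helpers are importable from `xlin.multiprocess_mapping` and behave exactly as the added code reads:

```python
# Hypothetical usage sketch; import path and behavior assumed from the diff above.
from xlin.multiprocess_mapping import element_mapping, batch_mapping

def double_even(x):
    # mapping_func returns (keep, value); items with keep=False are dropped
    return x % 2 == 0, x * 2

print(element_mapping(list(range(10)), double_even, use_multiprocessing=False))
# [0, 4, 8, 12, 16]

def double_batch(batch):
    # batch variant: maps a list of items to (keep, list_of_results)
    return True, [x * 2 for x in batch]

print(batch_mapping(list(range(10)), double_batch, use_multiprocessing=False, batch_size=4))
# [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
```

Note that `thread_pool_size` defaults to the `THREAD_POOL_SIZE` environment variable (falling back to 5), and since it is a default argument it is evaluated once at import time.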
@@ -128,51 +182,6 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def dataframe_with_row_mapping(
-    df: pd.DataFrame,
-    mapping_func: Callable[[Tuple[int, dict]], Tuple[bool, dict]],
-    use_multiprocessing=True,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
-):
-    rows = []
-    if use_multiprocessing:
-        pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-        results = pool.map(mapping_func, enumerate(dataframe_to_json_list(df)))
-        for ok, row in results:
-            if ok:
-                rows.append(row)
-    else:
-        for i, row in tqdm(df.iterrows()):
-            ok, row = mapping_func(i, row)
-            if ok:
-                rows.append(row)
-    df = pd.DataFrame(rows)
-    return df
-
-
-def list_with_element_mapping(
-    iterator: List[Any],
-    mapping_func: Callable[[Tuple[int, Any]], Tuple[bool, Any]],
-    use_multiprocessing=True,
-    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
-):
-    rows = []
-    if use_multiprocessing:
-        pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
-        results = pool.map(mapping_func, enumerate(iterator))
-        for ok, row in results:
-            if ok:
-                rows.append(row)
-    else:
-        for i, row in tqdm(enumerate(iterator)):
-            ok, row = mapping_func(i, row)
-            if ok:
-                rows.append(row)
-    return rows
-
-
 def continue_run(
     jsonfiles: List[str],
     save_dir: str,
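
Both removed helpers passed `(index, item)` pairs into `mapping_func` (inconsistently: the pool branch passed one tuple, the sequential branch passed two arguments); their replacements in the first hunk drop the index entirely. A hypothetical migration sketch for callers that still need the index, again assuming the `xlin.multiprocess_mapping` import path:

```python
# Hypothetical migration sketch: list_with_element_mapping(iterator, f) called
# f((index, item)); the new element_mapping(iterator, f) calls f(item).
# Wrapping the input with enumerate() recovers the old call shape.
from xlin.multiprocess_mapping import element_mapping  # assumed import path

def old_style_func(pair):
    i, item = pair  # old signature: mapping_func receives (index, item)
    return True, {"index": i, "value": item}

rows = element_mapping(list(enumerate(["a", "b"])), old_style_func,
                       use_multiprocessing=False)
# rows == [{'index': 0, 'value': 'a'}, {'index': 1, 'value': 'b'}]
```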
@@ -185,8 +194,7 @@ def continue_run(
     save_dir: Path = Path(save_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
     new_jsonfiles = []
-    for jsonfile in jsonfiles:
-        jsonfile = Path(jsonfile)
+    for jsonfile in ls(jsonfiles):
         jsonlist = load_func(jsonfile)
         output_filepath = save_dir / jsonfile.name
         for row in jsonlist:
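
The loop now delegates path handling to `ls` from `xlin.util` (imported in the first hunk; `xlin/util.py` itself changed by +1 -1). Its implementation is not shown in this diff; for `jsonfile.name` to keep working it presumably yields `pathlib.Path` objects, roughly like this hypothetical stand-in:

```python
# Hypothetical stand-in for xlin.util.ls, inferred from usage only; the real
# implementation is not shown in this diff and may do more (e.g. expand dirs).
from pathlib import Path
from typing import List

def ls_stand_in(paths: List[str]) -> List[Path]:
    # normalize each input string to a pathlib.Path so .name etc. work
    return [Path(p) for p in paths]
```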