xlin 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,61 @@ from pathlib import Path
9
9
  from tqdm import tqdm
10
10
  from loguru import logger
11
11
 
12
- from xlin.jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json
12
+ from xlin.jsonl import load_json_list, save_json_list, load_json, save_json
13
+ from xlin.util import ls
14
+
15
+
16
def element_mapping(
    iterator: List[Any],
    mapping_func: Callable[[Any], Tuple[bool, Any]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """Apply ``mapping_func`` to every element and keep the accepted results.

    Args:
        iterator: Elements to process. Any iterable works; it is consumed once.
        mapping_func: Called with a single element and must return an
            ``(ok, value)`` pair. ``value`` is kept only when ``ok`` is truthy.
        use_multiprocessing: When true, the calls are dispatched to a
            ``ThreadPool``; otherwise they run sequentially in the calling
            thread behind a ``tqdm`` progress bar.
        thread_pool_size: Number of worker threads for the pooled path. The
            default is read from the ``THREAD_POOL_SIZE`` environment variable
            when this function is defined, falling back to 5.

    Returns:
        A list with the ``value`` of every ``(ok, value)`` result whose ``ok``
        was truthy, in the same order as the input elements.
    """
    if use_multiprocessing:
        # Use the pool as a context manager so its worker threads are always
        # shut down, even if mapping_func raises. ``pool.map`` blocks until
        # every result is available, so nothing is lost when the pool exits.
        with ThreadPool(thread_pool_size) as pool:
            results = pool.map(mapping_func, iterator)
    else:
        results = [mapping_func(item) for item in tqdm(iterator)]
    return [row for ok, row in results if ok]
35
+
36
+
37
def batch_mapping(
    iterator: List[Any],
    mapping_func: Callable[[List[Any]], Tuple[bool, List[Any]]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
    batch_size=4,
):
    """Apply ``mapping_func`` to consecutive batches and flatten the results.

    The input is cut into lists of ``batch_size`` elements (the last list may
    be shorter). Each list is passed to ``mapping_func`` through
    ``element_mapping``, so ``mapping_func`` must return an ``(ok, rows)``
    pair; the ``rows`` of every accepted batch are concatenated into a single
    flat list.

    Args:
        iterator: Elements to group into batches.
        mapping_func: Called with one batch (a list of elements).
        use_multiprocessing: Forwarded to ``element_mapping``.
        thread_pool_size: Forwarded to ``element_mapping``.
        batch_size: Number of elements that completes a batch.

    Returns:
        A flat list of all rows produced by the accepted batches.
    """
    chunks = []
    current = []
    for element in iterator:
        current.append(element)
        if len(current) != batch_size:
            continue
        # The chunk just reached batch_size: store it and start a new one.
        chunks.append(current)
        current = []
    if current:
        # Trailing elements that did not fill a whole batch.
        chunks.append(current)

    mapped = element_mapping(chunks, mapping_func, use_multiprocessing, thread_pool_size)

    flattened = []
    for group in mapped:
        flattened.extend(group)
    return flattened
56
+
57
+
58
def dataframe_with_row_mapping(
    df: pd.DataFrame,
    mapping_func: Callable[[dict], Tuple[bool, dict]],
    use_multiprocessing=True,
    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
):
    """Build a new DataFrame by mapping every row of ``df``.

    Each pair yielded by ``df.iterrows()`` is unpacked and only its second
    element (the row data) is handed to ``mapping_func``; the index is
    dropped. ``mapping_func`` must return an ``(ok, row)`` pair, and only rows
    whose ``ok`` is truthy end up in the result.

    Args:
        df: Source DataFrame.
        mapping_func: Called once per row.
        use_multiprocessing: Forwarded to ``element_mapping``.
        thread_pool_size: Forwarded to ``element_mapping``.

    Returns:
        A DataFrame constructed from the accepted rows.
    """

    def _map_row(indexed_row):
        # indexed_row is one (index, row) pair from iterrows(); keep the row.
        return mapping_func(indexed_row[1])

    accepted = element_mapping(df.iterrows(), _map_row, use_multiprocessing, thread_pool_size)
    return pd.DataFrame(accepted)
13
67
 
14
68
 
15
69
  def multiprocessing_mapping_jsonlist(
@@ -128,51 +182,6 @@ def multiprocessing_mapping(
128
182
  return output_df, output_list
129
183
 
130
184
 
131
- def dataframe_with_row_mapping(
132
- df: pd.DataFrame,
133
- mapping_func: Callable[[int, dict], Tuple[bool, dict]],
134
- use_multiprocessing=True,
135
- thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
136
- ):
137
- rows = []
138
- if use_multiprocessing:
139
- pool = ThreadPool(thread_pool_size)
140
- logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
141
- results = pool.map(mapping_func, enumerate(dataframe_to_json_list(df)))
142
- for ok, row in results:
143
- if ok:
144
- rows.append(row)
145
- else:
146
- for i, row in tqdm(df.iterrows()):
147
- ok, row = mapping_func(i, row)
148
- if ok:
149
- rows.append(row)
150
- df = pd.DataFrame(rows)
151
- return df
152
-
153
-
154
- def list_with_element_mapping(
155
- iterator: List[Any],
156
- mapping_func: Callable[[int, Any], Tuple[bool, Any]],
157
- use_multiprocessing=True,
158
- thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
159
- ):
160
- rows = []
161
- if use_multiprocessing:
162
- pool = ThreadPool(thread_pool_size)
163
- logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
164
- results = pool.map(mapping_func, enumerate(iterator))
165
- for ok, row in results:
166
- if ok:
167
- rows.append(row)
168
- else:
169
- for i, row in tqdm(enumerate(iterator)):
170
- ok, row = mapping_func(i, row)
171
- if ok:
172
- rows.append(row)
173
- return rows
174
-
175
-
176
185
  def continue_run(
177
186
  jsonfiles: List[str],
178
187
  save_dir: str,
@@ -185,8 +194,7 @@ def continue_run(
185
194
  save_dir: Path = Path(save_dir)
186
195
  save_dir.mkdir(parents=True, exist_ok=True)
187
196
  new_jsonfiles = []
188
- for jsonfile in jsonfiles:
189
- jsonfile = Path(jsonfile)
197
+ for jsonfile in ls(jsonfiles):
190
198
  jsonlist = load_func(jsonfile)
191
199
  output_filepath = save_dir / jsonfile.name
192
200
  for row in jsonlist:
xlin/util.py CHANGED
@@ -1,5 +1,5 @@
1
- from collections import defaultdict
2
1
  from typing import *
2
+ from collections import defaultdict
3
3
  from pathlib import Path
4
4
  import pandas as pd
5
5
  import os
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: xlin
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: toolbox for LinXueyuan
5
5
  License: MIT
6
6
  Author: XiChen
@@ -1,15 +1,15 @@
1
1
  xlin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
3
3
  xlin/jsonl.py,sha256=oE8w8IFVEnBQdWUCMGYF9BlE3wtEhFsmjaLpZPKwSXg,6605
4
- xlin/multiprocess_mapping.py,sha256=mvkxra4uPvHfwHfI38FMEZU4dVYf6mu3gMVMAC1-P4o,7502
4
+ xlin/multiprocess_mapping.py,sha256=lLw0sBFDymk38DdvNV65IPV3_3Vqw2SiYtRpwTgCbr0,7597
5
5
  xlin/read_as_dataframe.py,sha256=ir3HUT6dt3crqa3xnlcNn8j3wqjSIGJgiIVLP3KkBaQ,8678
6
6
  xlin/statistic.py,sha256=BLj8hszlbBT5xDIfd70_YtOb8QgZEvYXiFJDGXBwCfw,881
7
7
  xlin/terminal_color.py,sha256=nfE-CY2BzjY2eZbm9yk8r-AuyJ-hchmLXhASCb4HAIA,191
8
- xlin/util.py,sha256=hme7Zl4Sa_-FTA9TEVzr1qTdaKW1eq5dTWZgd4owcDc,11303
8
+ xlin/util.py,sha256=SOQUh506GQlljJYLYuI6nScSTOrgRQnMq2xfxSvKIlI,11303
9
9
  xlin/uuid.py,sha256=gouvm7_DL22sIhXl-g4e6S2qzIZtmE3SEp00xy1upyg,271
10
10
  xlin/xls2xlsx.py,sha256=5zfcM0gmunFQOcOj9nYd9Dj0HMhU7-cPKnPIy6Ot9iU,930
11
11
  xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
12
- xlin-0.1.3.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
13
- xlin-0.1.3.dist-info/METADATA,sha256=Z0r6K1HueSI2T-U8I3xxV5Vl-92NZuwnQVuztiXClIk,772
14
- xlin-0.1.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
15
- xlin-0.1.3.dist-info/RECORD,,
12
+ xlin-0.1.5.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
13
+ xlin-0.1.5.dist-info/METADATA,sha256=SpRGumS27m1HewBbF_pPDuT8SlGnIuKwD9C2nNvhGaQ,772
14
+ xlin-0.1.5.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
15
+ xlin-0.1.5.dist-info/RECORD,,
File without changes
File without changes