xlin 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin/jsonl.py
CHANGED
@@ -47,6 +47,28 @@ def jsonlist_to_dataframe(json_list: List[Dict[str, str]]):
     return pd.DataFrame(json_list)
 
 
+def is_jsonl(filepath: str):
+    with open(filepath) as f:
+        try:
+            l = next(f)  # read one line to tell whether the file is JSON or JSONL
+            f.seek(0)
+        except:
+            return False
+
+        try:
+            _ = json.loads(l)
+        except ValueError:
+            return False  # the first line is not valid JSON on its own, so the file is plain JSON
+        else:
+            return True  # the first line is valid JSON, so the file is JSONL
+
+
+def load_json_or_jsonl(filepath: str):
+    if is_jsonl(filepath):
+        return load_json_list(filepath)
+    return load_json(filepath)
+
+
 def load_json(filename: str):
     with open(filename, "r", encoding="utf-8") as f:
         return json.load(f)
@@ -66,6 +88,8 @@ def load_json_list(filename: str):
         try:
             obj = json.loads(i.strip())
         except:
+            print("malformed data, cannot be loaded")
+            print(i)
             continue
         json_list.append(obj)
     return json_list
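Taken together, these two hunks add format auto-detection to the loaders. A minimal usage sketch follows (the file paths are placeholders, not part of the package). Note that the heuristic only inspects the first line: a compact single-line JSON document would be classified as JSONL, which `load_json_list` still loads, just wrapped in a one-element list.

```python
from xlin.jsonl import load_json_or_jsonl

# "records.jsonl" and "config.json" are hypothetical paths
records = load_json_or_jsonl("records.jsonl")  # first line parses as JSON -> read line by line
config = load_json_or_jsonl("config.json")     # pretty-printed JSON -> read with json.load
```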
xlin/multiprocess_mapping.py
CHANGED
@@ -12,18 +12,11 @@ from loguru import logger
 from xlin.jsonl import dataframe_to_json_list, load_json_list, save_json_list, load_json, save_json
 
 
-cpu_count = multiprocessing.cpu_count()
-# pool = ThreadPool(cpu_count)  # the LLM API is flaky; hitting it too fast truncates answers
-thread_pool_size = int(os.getenv("THREAD_POOL_SIZE", 5))
-pool = ThreadPool(thread_pool_size)
-logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
-
-
 def multiprocessing_mapping_jsonlist(
     jsonlist: List[Any],
     output_path: Optional[Union[str, Path]],
     partial_func,
-    batch_size=cpu_count
+    batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
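The point of this hunk is to stop creating a ThreadPool as an import-time side effect; pools are now built inside each function. A sketch of the pattern under that reading (illustrative code, not from the package):

```python
import os
from multiprocessing.pool import ThreadPool

def mapping(items, thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5))):
    pool = ThreadPool(thread_pool_size)  # created per call, not when the module is imported
    try:
        return pool.map(str.upper, items)
    finally:
        pool.close()
        pool.join()
```

One caveat: `batch_size=multiprocessing.cpu_count()` is still evaluated once, when the `def` statement runs, not on every call; the change removes the module-level global, not the early binding.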
@@ -47,7 +40,7 @@ def multiprocessing_mapping_jsonlist(
     else:
         output_path.parent.mkdir(parents=True, exist_ok=True)
     pool = ThreadPool(thread_pool_size)
-    logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
+    logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
     start_time = time.time()
     last_save_time = start_time
     for i, line in tqdm(list(enumerate(jsonlist))):
@@ -78,7 +71,7 @@ def multiprocessing_mapping(
     df: pd.DataFrame,
     output_path: Optional[Union[str, Path]],
     partial_func,
-    batch_size=cpu_count
+    batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
@@ -104,7 +97,7 @@ def multiprocessing_mapping(
     else:
         output_path.parent.mkdir(parents=True, exist_ok=True)
     pool = ThreadPool(thread_pool_size)
-    logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
+    logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
     start_time = time.time()
     last_save_time = start_time
     for i, line in tqdm(list(df.iterrows())):
@@ -135,29 +128,51 @@ def multiprocessing_mapping(
     return output_df, output_list
 
 
-def dataframe_with_row_mapping(
+def dataframe_with_row_mapping(
     df: pd.DataFrame,
-    mapping_func: Callable[[dict], Tuple[bool, dict]],
+    mapping_func: Callable[[int, dict], Tuple[bool, dict]],
     use_multiprocessing=True,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
 ):
     rows = []
     if use_multiprocessing:
         pool = ThreadPool(thread_pool_size)
-        logger.debug(f"pool size: {thread_pool_size}, cpu count: {cpu_count}")
-        results = pool.map(mapping_func, dataframe_to_json_list(df))
+        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
+        results = pool.map(mapping_func, enumerate(dataframe_to_json_list(df)))
         for ok, row in results:
             if ok:
                 rows.append(row)
     else:
-        for i, row in df.iterrows():
-            ok, row = mapping_func(row)
+        for i, row in tqdm(df.iterrows()):
+            ok, row = mapping_func(i, row)
             if ok:
                 rows.append(row)
     df = pd.DataFrame(rows)
     return df
 
 
+def list_with_element_mapping(
+    iterator: List[Any],
+    mapping_func: Callable[[int, Any], Tuple[bool, Any]],
+    use_multiprocessing=True,
+    thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
+):
+    rows = []
+    if use_multiprocessing:
+        pool = ThreadPool(thread_pool_size)
+        logger.debug(f"pool size: {thread_pool_size}, cpu count: {multiprocessing.cpu_count()}")
+        results = pool.map(mapping_func, enumerate(iterator))
+        for ok, row in results:
+            if ok:
+                rows.append(row)
+    else:
+        for i, row in tqdm(enumerate(iterator)):
+            ok, row = mapping_func(i, row)
+            if ok:
+                rows.append(row)
+    return rows
+
+
 def continue_run(
     jsonfiles: List[str],
     save_dir: str,
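Both mappers now pass an element index to `mapping_func`, but note the two branches call it differently: the sequential branch calls `mapping_func(i, row)`, while the pooled branch uses `pool.map`, which hands each `(index, element)` tuple to the function as a single argument. A usage sketch under that reading (the filter function and data below are illustrative, not from the package):

```python
from xlin.multiprocess_mapping import list_with_element_mapping

def keep_nonempty(i, row):
    # returns (ok, new_row); elements with ok=False are dropped
    return bool(row.get("text")), {**row, "idx": i}

data = [{"text": "hello"}, {"text": ""}]

# sequential branch: mapping_func is called as mapping_func(i, row)
rows = list_with_element_mapping(data, keep_nonempty, use_multiprocessing=False)

# pooled branch: each (i, row) tuple arrives as one argument, so a
# tuple-unpacking wrapper is needed for a two-parameter function
rows = list_with_element_mapping(
    data, lambda pair: keep_nonempty(*pair), use_multiprocessing=True
)
```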
xlin-0.1.3.dist-info/RECORD
CHANGED
@@ -1,7 +1,7 @@
 xlin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
-xlin/jsonl.py,sha256=
-xlin/multiprocess_mapping.py,sha256=
+xlin/jsonl.py,sha256=oE8w8IFVEnBQdWUCMGYF9BlE3wtEhFsmjaLpZPKwSXg,6605
+xlin/multiprocess_mapping.py,sha256=mvkxra4uPvHfwHfI38FMEZU4dVYf6mu3gMVMAC1-P4o,7502
 xlin/read_as_dataframe.py,sha256=ir3HUT6dt3crqa3xnlcNn8j3wqjSIGJgiIVLP3KkBaQ,8678
 xlin/statistic.py,sha256=BLj8hszlbBT5xDIfd70_YtOb8QgZEvYXiFJDGXBwCfw,881
 xlin/terminal_color.py,sha256=nfE-CY2BzjY2eZbm9yk8r-AuyJ-hchmLXhASCb4HAIA,191
@@ -9,7 +9,7 @@ xlin/util.py,sha256=hme7Zl4Sa_-FTA9TEVzr1qTdaKW1eq5dTWZgd4owcDc,11303
 xlin/uuid.py,sha256=gouvm7_DL22sIhXl-g4e6S2qzIZtmE3SEp00xy1upyg,271
 xlin/xls2xlsx.py,sha256=5zfcM0gmunFQOcOj9nYd9Dj0HMhU7-cPKnPIy6Ot9iU,930
 xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
-xlin-0.1.
+xlin-0.1.3.dist-info/LICENSE,sha256=KX0dDCYlO4DskqMZY8qeY94EZMrDRNnNqlGLkXVlKyM,1063
+xlin-0.1.3.dist-info/METADATA,sha256=Z0r6K1HueSI2T-U8I3xxV5Vl-92NZuwnQVuztiXClIk,772
+xlin-0.1.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+xlin-0.1.3.dist-info/RECORD,,
File without changes
|
File without changes
|