xlin 0.1.37.tar.gz → 0.1.38.tar.gz
This diff shows the changes between package versions as they were published to a supported public registry. It is provided for informational purposes only.
- {xlin-0.1.37 → xlin-0.1.38}/PKG-INFO +1 -1
- {xlin-0.1.37 → xlin-0.1.38}/pyproject.toml +1 -1
- {xlin-0.1.37 → xlin-0.1.38}/xlin/jsonl.py +99 -24
- {xlin-0.1.37 → xlin-0.1.38}/xlin/multiprocess_mapping.py +14 -1
- {xlin-0.1.37 → xlin-0.1.38}/xlin/read_as_dataframe.py +3 -1
- {xlin-0.1.37 → xlin-0.1.38}/xlin/util.py +3 -2
- {xlin-0.1.37 → xlin-0.1.38}/LICENSE +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/README.md +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/__init__.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/ischinese.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/metric.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/statistic.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/timing.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/xls2xlsx.py +0 -0
- {xlin-0.1.37 → xlin-0.1.38}/xlin/yaml.py +0 -0
{xlin-0.1.37 → xlin-0.1.38}/xlin/jsonl.py
@@ -4,6 +4,10 @@ from typing import *
 from pathlib import Path
 from loguru import logger
 import pandas as pd
+import pyexcel
+
+from xlin.util import ls
+from xlin.xls2xlsx import is_xslx
 
 
 def dataframe_to_json_list(df: pd.DataFrame):
@@ -68,11 +72,91 @@ def load_text(filename):
 
 
 def load_json_or_jsonl(filepath: str):
+    """
+    read_as_json_list is more convenient; you can switch to it seamlessly: read_as_json_list(filepath)
+    """
     if is_jsonl(filepath):
         return load_json_list(filepath)
     return load_json(filepath)
 
 
+def read_as_json_list(
+    filepath: Union[str, Path, List[str], List[Path]],
+    sheet_name: Optional[str] = None,
+    skip_None: bool = True,
+    skip_blank: bool = True,
+    filter: Callable[[Path], bool] = lambda x: True,
+) -> List[Dict]:
+    """
+    Read a file, or recursively read the files under a directory, into a JSON list (List[Dict]).
+    Supported formats: json, jsonl, xlsx, xls, csv, parquet, feather, pkl, h5, txt, tsv, xml, html, db
+    """
+    if isinstance(filepath, list):
+        json_list = []
+        for path in filepath:
+            try:
+                sub_list = read_as_json_list(path, sheet_name, skip_None, skip_blank, filter)
+                for obj in sub_list:
+                    if isinstance(obj, dict):
+                        obj["数据来源"] = Path(path).name  # key means "data source"
+                json_list.extend(sub_list)
+            except Exception as e:
+                print(f"Failed to read {path}: {e}")
+        return json_list
+
+    filepath = Path(filepath)
+    if filepath.is_dir():
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
+        return read_as_json_list(paths, sheet_name, skip_None, skip_blank, filter)
+
+    filename = filepath.name
+    if filename.endswith(".json") or filename.endswith(".jsonl"):
+        if is_jsonl(filepath):
+            return load_json_list(filepath)
+        else:
+            return [load_json(filepath)]
+
+    elif filename.endswith(".xlsx"):
+        if sheet_name is None:
+            df = pd.read_excel(filepath)
+        else:
+            df = pd.read_excel(filepath, sheet_name)
+    elif filename.endswith(".xls"):
+        if is_xslx(filepath):
+            if sheet_name is None:
+                df = pd.read_excel(filepath)
+            else:
+                df = pd.read_excel(filepath, sheet_name)
+        else:
+            df = pyexcel.get_sheet(file_name=filepath)
+    elif filename.endswith(".csv"):
+        df = pd.read_csv(filepath)
+    elif filename.endswith(".parquet"):
+        df = pd.read_parquet(filepath)
+    elif filename.endswith(".feather"):
+        df = pd.read_feather(filepath)
+    elif filename.endswith(".pkl"):
+        df = pd.read_pickle(filepath)
+    elif filename.endswith(".h5"):
+        df = pd.read_hdf(filepath)
+    elif filename.endswith(".txt"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".tsv"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".xml"):
+        df = pd.read_xml(filepath)
+    elif filename.endswith(".html"):
+        df = pd.read_html(filepath)[0]
+    elif filename.endswith(".db"):
+        if sheet_name is None:
+            raise ValueError("Reading a .db file requires sheet_name as the table name")
+        df = pd.read_sql_table(sheet_name, f"sqlite:///{filepath}")
+    else:
+        raise ValueError(f"Unsupported file type: {filepath}")
+
+    return df.to_dict(orient="records")
+
+
 def load_json(filename: str):
     with open(filename, "r", encoding="utf-8") as f:
         return json.load(f)
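The new `read_as_json_list` accepts a single file, a list of files, or a directory (expanded recursively through `ls`). A minimal usage sketch; the `data/` paths are placeholders, and the exact semantics of the `filter` callback are taken from the signature above, not verified against `ls`:

```python
from xlin.jsonl import read_as_json_list

# One file: a .jsonl yields one dict per line, a tabular file one dict per row.
rows = read_as_json_list("data/records.jsonl")

# A list of files: each dict gains a "数据来源" ("data source") key holding
# the name of the file it came from, per the diff above.
merged = read_as_json_list(["data/a.jsonl", "data/b.xlsx"])

# A directory: expanded recursively, keeping only files that pass the filter.
csvs = read_as_json_list("data", filter=lambda p: p.name.endswith(".csv"))
```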
@@ -84,16 +168,24 @@ def save_json(json_list: Union[Dict[str, str], List[Dict[str, str]]], filename:
         return json.dump(json_list, f, ensure_ascii=False, separators=(",", ":"), indent=2)
 
 
-def load_json_list(filename: str):
+def load_json_list(filename: str, skip_None=True, skip_blank=True) -> List[Dict[str, str]]:
     with open(filename, "r", encoding="utf-8") as f:
         lines = f.readlines()
     json_list = []
-    for i in lines:
+    for i, line in enumerate(lines):
+        line = line.strip()
+        if line == "":
+            if not skip_blank:
+                json_list.append("")
+            continue
+        if line == "None":
+            if not skip_None:
+                json_list.append(None)
+            continue
         try:
-            obj = json.loads(i)
+            obj = json.loads(line)
         except:
-            print("
-            print(i)
+            print(f"Malformed JSON, skipping line {i}: {repr(line)}")
             continue
         json_list.append(obj)
     return json_list
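The new `skip_None` / `skip_blank` flags control whether blank lines and literal `None` lines are dropped (the default) or kept as placeholders. A self-contained sketch of the behavior the diff implies, using a throwaway temp file rather than anything shipped with the package:

```python
import tempfile
from xlin.jsonl import load_json_list

# Hypothetical scratch file with one blank line and one literal "None" line.
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write('{"a": 1}\n\nNone\n{"b": 2}\n')
    path = f.name

# Defaults drop the blank line and the "None" line.
print(load_json_list(path))
# [{'a': 1}, {'b': 2}]

# Keeping them inserts "" and None placeholders at their original positions.
print(load_json_list(path, skip_None=False, skip_blank=False))
# [{'a': 1}, '', None, {'b': 2}]
```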
@@ -176,7 +268,7 @@ def apply_changes_to_paths(
 ):
     total_updated = 0
     total_deleted = 0
-    for path in paths:
+    for path in ls(paths):
         if verbose:
             print("checking", path)
         jsonlist = load_json(path)
@@ -199,25 +291,8 @@ def apply_changes_to_paths(
     print(f"total: updated {total_updated}, deleted {total_deleted}")
 
 
-def backup_current_output(row: Dict[str, str], output_key="output"):
-    if "old_output" in row:
-        for i in range(1, 10):
-            if f"old_output{i}" not in row:
-                row[f"old_output{i}"] = row[output_key]
-                break
-    else:
-        row["old_output"] = row[output_key]
-    return row
-
-
-def backup_and_set_output(row: Dict[str, str], output: str):
-    backup_current_output(row)
-    row["output"] = output
-    return row
-
-
 def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dict[str, Any]]] = load_json):
-    for path in paths:
+    for path in ls(paths):
         jsonlist: List[Dict[str, Any]] = load_data(path)
         for row in jsonlist:
             yield path, row
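Both `apply_changes_to_paths` and `generator_from_paths` now route their path argument through `xlin.util.ls`, so a directory (or a mix of files and directories) should work where only a pre-expanded file list did before. A hedged sketch; `data/` is a placeholder, and the assumption that `ls` accepts a plain string comes from its use elsewhere in this diff, not from its own source:

```python
from xlin.jsonl import generator_from_paths

# With the default load_data=load_json, each file is expected to hold a
# JSON array of objects; ls() expands "data" into the files inside it.
for path, row in generator_from_paths("data"):
    print(path, row.get("id"))
```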
{xlin-0.1.37 → xlin-0.1.38}/xlin/multiprocess_mapping.py
@@ -100,6 +100,16 @@ def xmap(
         preserve_order (bool): whether to preserve the order of results
         chunksize (Optional[int]): chunk size per task; None means it is computed automatically
         retry_count (int): number of retries for failed tasks
+
+    Example:
+        >>> from xlin.multiprocess_mapping import xmap
+        >>> jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+        >>> def work_func(item):
+        ...     item["text"] = item["text"].upper()
+        ...     return item
+        >>> results = xmap(jsonlist, work_func, output_path="output.jsonl", batch_size=2)
+        >>> print(results)
+        [{'id': 1, 'text': 'HELLO'}, {'id': 2, 'text': 'WORLD'}]
     """
     need_caching = output_path is not None
     output_list = []
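The doctest above shows the basic call; the documented knobs compose with it. A hedged sketch — parameter names come from the docstring in this diff, and their exact semantics are not verified against the implementation:

```python
from xlin.multiprocess_mapping import xmap

jsonlist = [{"id": i} for i in range(100)]

def work_func(item):
    item["square"] = item["id"] ** 2
    return item

# Per the docstring: output_path enables caching of finished results,
# preserve_order keeps results aligned with the input order, and
# retry_count re-runs failed items.
results = xmap(
    jsonlist,
    work_func,
    output_path="squares.jsonl",
    preserve_order=True,
    retry_count=2,
)
```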
@@ -204,7 +214,7 @@ def xmap(
 def multiprocessing_mapping(
     df: pd.DataFrame,
     output_path: Optional[Union[str, Path]],
-    partial_func,
+    partial_func: Callable[[Dict[str, str]], Dict[str, str]],
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
@@ -215,6 +225,9 @@ def multiprocessing_mapping(
         df (DataFrame): [description]
         output_path (Path): caching is needed when the data volume is large
         partial_func (function): (Dict[str, str]) -> Dict[str, str]
+        batch_size (int): batch size
+        cache_batch_num (int): cache batch num
+        thread_pool_size (int): thread pool size
     """
     need_caching = output_path is not None
     tmp_list, output_list = list(), list()
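The new annotation makes explicit that `multiprocessing_mapping` maps a dict-in, dict-out callable over DataFrame rows. A sketch of a conforming callable, with a hypothetical column name; passing `output_path=None` skips the caching path per the `need_caching` check above:

```python
import pandas as pd
from xlin.multiprocess_mapping import multiprocessing_mapping

df = pd.DataFrame({"text": ["hello", "world"]})  # hypothetical input

def to_upper(row: dict) -> dict:
    # Matches the annotated Callable[[Dict[str, str]], Dict[str, str]].
    row["upper"] = row["text"].upper()
    return row

multiprocessing_mapping(df, None, to_upper, batch_size=2)
```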
{xlin-0.1.37 → xlin-0.1.38}/xlin/read_as_dataframe.py
@@ -77,7 +77,9 @@ def read_as_dataframe(
     elif filename.endswith(".html"):
         df = pd.read_html(filepath)[0]
     elif filename.endswith(".db"):
-
+        if sheet_name is None:
+            raise ValueError("Reading a .db file requires sheet_name as the table name")
+        df = pd.read_sql_table(sheet_name, f"sqlite:///{filepath}")
     else:
         raise ValueError(
             (
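`.db` files are read with pandas' SQL reader: `sheet_name` doubles as the table name and the path is wrapped into an `sqlite:///` URL. A standalone sketch of that underlying pandas call outside of xlin (requires SQLAlchemy; `demo.db` and the `users` table are hypothetical):

```python
import sqlite3
import pandas as pd

# Build a throwaway SQLite database with one table.
con = sqlite3.connect("demo.db")
con.execute("CREATE TABLE IF NOT EXISTS users (id INTEGER, name TEXT)")
con.execute("INSERT INTO users VALUES (1, 'alice'), (2, 'bob')")
con.commit()
con.close()

# The call the diff adds: table name plus an sqlite:/// SQLAlchemy URL.
df = pd.read_sql_table("users", "sqlite:///demo.db")
print(df.to_dict(orient="records"))
# [{'id': 1, 'name': 'alice'}, {'id': 2, 'name': 'bob'}]
```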
{xlin-0.1.37 → xlin-0.1.38}/xlin/util.py
@@ -1,14 +1,15 @@
 from typing import *
 from collections import defaultdict
 from pathlib import Path
-import pandas as pd
 import os
 import asyncio
 import datetime
-from loguru import logger
 import shutil
 import random
 
+import pandas as pd
+from loguru import logger
+
 
 date_str = datetime.datetime.now().strftime("%Y%m%d")
 datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")