xlin 0.1.36__tar.gz → 0.1.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xlin
-Version: 0.1.36
+Version: 0.1.38
 Summary: toolbox for LinXueyuan
 License: MIT
 Author: LinXueyuanStdio
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "xlin"
-version = "0.1.36"
+version = "0.1.38"
 description = "toolbox for LinXueyuan"
 authors = ["LinXueyuanStdio <23211526+LinXueyuanStdio@users.noreply.github.com>"]
 license = "MIT"
@@ -4,6 +4,10 @@ from typing import *
 from pathlib import Path
 from loguru import logger
 import pandas as pd
+import pyexcel
+
+from xlin.util import ls
+from xlin.xls2xlsx import is_xslx


 def dataframe_to_json_list(df: pd.DataFrame):
@@ -68,11 +72,91 @@ def load_text(filename):


 def load_json_or_jsonl(filepath: str):
+    """
+    read_as_json_list is more convenient and is a drop-in replacement: read_as_json_list(filepath)
+    """
     if is_jsonl(filepath):
         return load_json_list(filepath)
     return load_json(filepath)


+def read_as_json_list(
+    filepath: Union[str, Path, List[str], List[Path]],
+    sheet_name: Optional[str] = None,
+    skip_None: bool = True,
+    skip_blank: bool = True,
+    filter: Callable[[Path], bool] = lambda x: True,
+) -> List[Dict]:
+    """
+    Read a file, or recursively read the files in a directory, as a JSON list (List[Dict]).
+    Supported formats: json, jsonl, xlsx, xls, csv, parquet, feather, pkl, h5, txt, tsv, xml, html, db
+    """
+    if isinstance(filepath, list):
+        json_list = []
+        for path in filepath:
+            try:
+                sub_list = read_as_json_list(path, sheet_name, skip_None, skip_blank, filter)
+                for obj in sub_list:
+                    if isinstance(obj, dict):
+                        obj["数据来源"] = Path(path).name
+                json_list.extend(sub_list)
+            except Exception as e:
+                print(f"Failed to read {path}: {e}")
+        return json_list
+
+    filepath = Path(filepath)
+    if filepath.is_dir():
+        paths = ls(filepath, filter=filter, expand_all_subdir=True)
+        return read_as_json_list(paths, sheet_name, skip_None, skip_blank, filter)
+
+    filename = filepath.name
+    if filename.endswith(".json") or filename.endswith(".jsonl"):
+        if is_jsonl(filepath):
+            return load_json_list(filepath)
+        else:
+            return [load_json(filepath)]
+
+    elif filename.endswith(".xlsx"):
+        if sheet_name is None:
+            df = pd.read_excel(filepath)
+        else:
+            df = pd.read_excel(filepath, sheet_name)
+    elif filename.endswith(".xls"):
+        if is_xslx(filepath):
+            if sheet_name is None:
+                df = pd.read_excel(filepath)
+            else:
+                df = pd.read_excel(filepath, sheet_name)
+        else:
+            df = pyexcel.get_sheet(file_name=filepath)
+    elif filename.endswith(".csv"):
+        df = pd.read_csv(filepath)
+    elif filename.endswith(".parquet"):
+        df = pd.read_parquet(filepath)
+    elif filename.endswith(".feather"):
+        df = pd.read_feather(filepath)
+    elif filename.endswith(".pkl"):
+        df = pd.read_pickle(filepath)
+    elif filename.endswith(".h5"):
+        df = pd.read_hdf(filepath)
+    elif filename.endswith(".txt"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".tsv"):
+        df = pd.read_csv(filepath, delimiter="\t")
+    elif filename.endswith(".xml"):
+        df = pd.read_xml(filepath)
+    elif filename.endswith(".html"):
+        df = pd.read_html(filepath)[0]
+    elif filename.endswith(".db"):
+        if sheet_name is None:
+            raise ValueError("Reading a .db file requires sheet_name as the table name")
+        df = pd.read_sql_table(sheet_name, f"sqlite:///{filepath}")
+    else:
+        raise ValueError(f"Unsupported file type: {filepath}")
+
+    return df.to_dict(orient="records")
+
+
 def load_json(filename: str):
     with open(filename, "r", encoding="utf-8") as f:
         return json.load(f)
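The new read_as_json_list accepts a single path, a list of paths, or a directory (expanded recursively via ls) and normalizes every supported format to List[Dict], tagging each row with its source file. A minimal usage sketch; the module path and data paths below are hypothetical, since the diff does not show where xlin exposes the function:

from xlin.jsonfile import read_as_json_list  # hypothetical import path

# Single JSONL file -> list of dicts
rows = read_as_json_list("data/examples.jsonl")

# Directory: every matching file under it is read recursively, and each
# dict gains a "数据来源" key recording its source file name
rows = read_as_json_list("data/", filter=lambda p: p.name.endswith((".jsonl", ".xlsx")))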
@@ -84,16 +168,24 @@ def save_json(json_list: Union[Dict[str, str], List[Dict[str, str]]], filename:
         return json.dump(json_list, f, ensure_ascii=False, separators=(",", ":"), indent=2)


-def load_json_list(filename: str):
+def load_json_list(filename: str, skip_None=True, skip_blank=True) -> List[Dict[str, str]]:
     with open(filename, "r", encoding="utf-8") as f:
         lines = f.readlines()
     json_list = []
-    for i in lines:
+    for i, line in enumerate(lines):
+        line = line.strip()
+        if line == "":
+            if not skip_blank:
+                json_list.append("")
+            continue
+        if line == "None":
+            if not skip_None:
+                json_list.append(None)
+            continue
         try:
-            obj = json.loads(i.strip())
+            obj = json.loads(line)
         except:
-            print("Malformed data, cannot load")
-            print(i)
+            print(f"Malformed JSON, skipping line {i}: {repr(line)}")
             continue
         json_list.append(obj)
     return json_list
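A quick illustration of the new skip flags on load_json_list, using a throwaway file (the file name is made up). By default blank lines and literal "None" lines are dropped, and malformed lines are skipped with a warning:

from pathlib import Path

Path("demo.jsonl").write_text('{"a": 1}\n\nNone\n{"a": 2}\nnot json\n', encoding="utf-8")

load_json_list("demo.jsonl")                    # [{"a": 1}, {"a": 2}]  (warns about "not json")
load_json_list("demo.jsonl", skip_blank=False)  # blank lines kept as ""
load_json_list("demo.jsonl", skip_None=False)   # "None" lines kept as None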
@@ -176,7 +268,7 @@ def apply_changes_to_paths(
 ):
     total_updated = 0
     total_deleted = 0
-    for path in paths:
+    for path in ls(paths):
         if verbose:
             print("checking", path)
         jsonlist = load_json(path)
@@ -199,25 +291,8 @@ def apply_changes_to_paths(
     print(f"total: updated {total_updated}, deleted {total_deleted}")


-def backup_current_output(row: Dict[str, str], output_key="output"):
-    if "old_output" in row:
-        for i in range(1, 10):
-            if f"old_output{i}" not in row:
-                row[f"old_output{i}"] = row[output_key]
-                break
-    else:
-        row["old_output"] = row[output_key]
-    return row
-
-
-def backup_and_set_output(row: Dict[str, str], output: str):
-    backup_current_output(row)
-    row["output"] = output
-    return row
-
-
 def generator_from_paths(paths: List[Path], load_data: Callable[[Path], List[Dict[str, Any]]] = load_json):
-    for path in paths:
+    for path in ls(paths):
         jsonlist: List[Dict[str, Any]] = load_data(path)
         for row in jsonlist:
             yield path, row
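Both loops now route paths through ls from xlin.util, which expands a directory (or a mixed list of files and directories) into concrete file paths, so callers no longer have to enumerate files themselves. A hedged sketch, assuming ls also accepts a plain str/Path (the directory name is hypothetical):

# Previously `paths` had to be a pre-built list of files;
# with ls() a directory works directly:
for path, row in generator_from_paths("outputs/"):
    print(path, row)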
@@ -100,6 +100,16 @@ def xmap(
         preserve_order (bool): whether to preserve result order
         chunksize (Optional[int]): chunk size per task; None means auto-computed
         retry_count (int): number of retries for failed tasks
+
+    Example:
+        >>> from xlin.multiprocess_mapping import xmap
+        >>> jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+        >>> def work_func(item):
+        ...     item["text"] = item["text"].upper()
+        ...     return item
+        >>> results = xmap(jsonlist, work_func, output_path="output.jsonl", batch_size=2)
+        >>> print(results)
+        [{'id': 1, 'text': 'HELLO'}, {'id': 2, 'text': 'WORLD'}]
     """
     need_caching = output_path is not None
     output_list = []
@@ -204,7 +214,7 @@ def xmap(
 def multiprocessing_mapping(
     df: pd.DataFrame,
     output_path: Optional[Union[str, Path]],
-    partial_func,
+    partial_func: Callable[[Dict[str, str]], Dict[str, str]],
     batch_size=multiprocessing.cpu_count(),
     cache_batch_num=1,
     thread_pool_size=int(os.getenv("THREAD_POOL_SIZE", 5)),
@@ -215,6 +225,9 @@ def multiprocessing_mapping(
         df (DataFrame): [description]
         output_path (Path): cache path; needed when the dataset is large
         partial_func (function): (Dict[str, str]) -> Dict[str, str]
+        batch_size (int): batch size
+        cache_batch_num (int): number of batches per cache write
+        thread_pool_size (int): thread pool size
     """
     need_caching = output_path is not None
     tmp_list, output_list = list(), list()
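A usage sketch matching the newly annotated signature. The row transform below is illustrative, and the function's return value is not shown in this diff, so it is called for its side effects only:

from typing import Dict
import pandas as pd
from xlin.multiprocess_mapping import multiprocessing_mapping  # module path inferred from the xmap doctest above

def add_output(row: Dict[str, str]) -> Dict[str, str]:
    # partial_func maps one row-dict to one row-dict
    row["output"] = row["input"].upper()
    return row

df = pd.DataFrame([{"input": "a"}, {"input": "b"}])
multiprocessing_mapping(df, "cache.jsonl", add_output, batch_size=2)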
@@ -77,7 +77,9 @@ def read_as_dataframe(
     elif filename.endswith(".html"):
         df = pd.read_html(filepath)[0]
     elif filename.endswith(".db"):
-        df = pd.read_sql_table(sheet_name, filepath)
+        if sheet_name is None:
+            raise ValueError("Reading a .db file requires sheet_name as the table name")
+        df = pd.read_sql_table(sheet_name, f"sqlite:///{filepath}")
     else:
         raise ValueError(
             (
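For .db files, read_as_dataframe (like read_as_json_list above) now treats sheet_name as the SQLite table name and reads it through pandas' SQLAlchemy-backed read_sql_table, which previously failed on a bare file path. A sketch with a hypothetical table name (requires SQLAlchemy to be installed):

# Reads the "users" table out of data.db into a DataFrame
df = read_as_dataframe("data.db", sheet_name="users")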
@@ -355,6 +355,7 @@ def generate_classification_report(predictions: List[str], labels: List[str]) ->
     tail = pd.DataFrame([p], index=["precision"], columns=confusion_matrix.columns)
     confusion_matrix = pd.concat([confusion_matrix, tail], axis=0)
     confusion_matrix.index.name = "True \\ Pred"
+    confusion_matrix["sum"] = class_df["support"].values.tolist() + [class_df["support"].sum()]
     report["confusion_matrix"] = confusion_matrix

     micro_precision = micro_tp / (micro_tp + micro_fp) if (micro_tp + micro_fp) > 0 else 0
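The confusion matrix gains a trailing "sum" column: per-class support on each true-label row, plus the total support on the appended "precision" row. A hedged sketch of reading it back (the import path for generate_classification_report is not shown in this diff):

report = generate_classification_report(
    predictions=["cat", "dog", "cat"],
    labels=["cat", "dog", "dog"],
)
cm = report["confusion_matrix"]
print(cm["sum"])  # per-class support, with the overall total in the "precision" row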
@@ -1,14 +1,15 @@
 from typing import *
 from collections import defaultdict
 from pathlib import Path
-import pandas as pd
 import os
 import asyncio
 import datetime
-from loguru import logger
 import shutil
 import random

+import pandas as pd
+from loguru import logger
+

 date_str = datetime.datetime.now().strftime("%Y%m%d")
 datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")