xlin 0.1.38__py2.py3-none-any.whl → 0.1.39__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xlin/__init__.py CHANGED
@@ -1,10 +1,12 @@
- from .ischinese import *
- from .jsonl import *
+ from .dataframe_util import *
+ from .datetime_util import *
+ from .file_util import *
+ from .image_util import *
+ from .jsonlist_util import *
  from .metric import *
- from .multiprocess_mapping import *
- from .read_as_dataframe import *
+ from .multiprocess_util import *
  from .statistic import *
- from .timing import *
- from .util import *
- from .xls2xlsx import *
- from .yaml import *
+ from .text_util import *
+ from .timing_util import *
+ from .xlsx_util import *
+ from .yaml_util import *
xlin/read_as_dataframe.py → xlin/dataframe_util.py RENAMED
@@ -1,3 +1,4 @@
+ import asyncio
  from collections import defaultdict
  import os
  from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -7,9 +8,9 @@ from loguru import logger
  import pandas as pd
  import pyexcel

- from xlin.util import ls
- from xlin.jsonl import dataframe_to_json_list, load_json, load_json_list, save_json_list
- from xlin.xls2xlsx import is_xslx
+ from xlin.file_util import ls
+ from xlin.jsonlist_util import dataframe_to_json_list, load_json, load_json_list, save_json_list
+ from xlin.xlsx_util import is_xslx


  def read_as_dataframe(
@@ -269,3 +270,39 @@ def split_dataframe(
          df_i.to_excel(filepath, index=False)
          df_list.append(df_i)
      return df_list
+
+ def append_column(df: pd.DataFrame, query_column: str, output_column: str, transform):
+     query = df[query_column].tolist()
+     loop = asyncio.get_event_loop()
+     result = loop.run_until_complete(transform(query))
+     df[output_column] = [str(r) for r in result]
+     return df
+
+ def grouped_col_list(df: pd.DataFrame, key_col="query", value_col="output"):
+     grouped = defaultdict(list)
+     if key_col not in df.columns:
+         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+         return grouped
+     for i, row in df.iterrows():
+         grouped[row[key_col]].append(row[value_col])
+     return grouped
+
+
+ def grouped_col(df: pd.DataFrame, key_col="query", value_col="output"):
+     grouped = {}
+     if key_col not in df.columns:
+         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+         return grouped
+     for i, row in df.iterrows():
+         grouped[row[key_col]] = row[value_col]
+     return grouped
+
+
+ def grouped_row(df: pd.DataFrame, key_col="query"):
+     grouped = defaultdict(list)
+     if key_col not in df.columns:
+         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+         return grouped
+     for i, row in df.iterrows():
+         grouped[row[key_col]].append(row)
+     return grouped
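The relocated `append_column` drives an async batch transform through `asyncio.get_event_loop().run_until_complete`, so it has to be called from code with no event loop already running. A minimal sketch of a call site, with `upper_all` as a hypothetical batch coroutine (not part of the package):

```python
import pandas as pd

from xlin import append_column

async def upper_all(queries: list[str]) -> list[str]:
    # hypothetical transform: receives the whole column, returns one result per row
    return [q.upper() for q in queries]

df = pd.DataFrame({"query": ["alpha", "beta"]})
df = append_column(df, "query", "output", upper_all)
print(df["output"].tolist())  # expected: ['ALPHA', 'BETA']
```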
xlin/datetime_util.py ADDED
@@ -0,0 +1,21 @@
+
+
+ import datetime
+ import random
+
+
+ date_str = datetime.datetime.now().strftime("%Y%m%d")
+ datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")
+
+
+ def random_timestamp(start_timestamp=None, end_timestamp=None):
+     if start_timestamp is None:
+         start_timestamp = datetime.datetime(2024, 1, 1).timestamp()
+     if end_timestamp is None:
+         end_timestamp = datetime.datetime.now().timestamp()
+     return random.uniform(start_timestamp, end_timestamp)
+
+
+ def random_timestamp_str(start_timestamp=None, end_timestamp=None, format="%Y年%m月%d日%H时%M分"):
+     return datetime.datetime.fromtimestamp(random_timestamp(start_timestamp, end_timestamp)).strftime(format)
+
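Both helpers are re-exported through the new `from .datetime_util import *` line in `__init__.py`, so a quick smoke test could look like this (output varies because the values are random):

```python
import datetime

from xlin import random_timestamp, random_timestamp_str

# a float timestamp between 2024-01-01 and now
print(random_timestamp())

# the same range, rendered with the default Chinese date format
print(random_timestamp_str())

# custom bounds and strftime format are also accepted
start = datetime.datetime(2024, 6, 1).timestamp()
print(random_timestamp_str(start, format="%Y-%m-%d %H:%M"))
```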
xlin/util.py → xlin/file_util.py RENAMED
@@ -3,30 +3,11 @@ from collections import defaultdict
  from pathlib import Path
  import os
  import asyncio
- import datetime
  import shutil
- import random

  import pandas as pd
  from loguru import logger

-
- date_str = datetime.datetime.now().strftime("%Y%m%d")
- datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")
-
-
- def random_timestamp(start_timestamp=None, end_timestamp=None):
-     if start_timestamp is None:
-         start_timestamp = datetime.datetime(2024, 1, 1).timestamp()
-     if end_timestamp is None:
-         end_timestamp = datetime.datetime.now().timestamp()
-     return random.uniform(start_timestamp, end_timestamp)
-
-
- def random_timestamp_str(start_timestamp=None, end_timestamp=None, format="%Y年%m月%d日%H时%M分"):
-     return datetime.datetime.fromtimestamp(random_timestamp(start_timestamp, end_timestamp)).strftime(format)
-
-
  def auto_retry_to_get_data(retry_times, request, data_key="data", *args, **kwargs):
      if retry_times == 0:
          return {}
@@ -40,13 +21,6 @@ def auto_retry_to_get_data(retry_times, request, data_key="data", *args, **kwarg
          return auto_retry_to_get_data(retry_times - 1, request, data_key, *args, **kwargs)


- def append_column(df: pd.DataFrame, query_column: str, output_column: str, transform):
-     query = df[query_column].tolist()
-     loop = asyncio.get_event_loop()
-     result = loop.run_until_complete(transform(query))
-     df[output_column] = [str(r) for r in result]
-     return df
-

  def request_wrapper(request_num=10):
      def request_wrapper_body(func):
@@ -202,47 +176,6 @@ def clean_empty_folder(dir_path):
          clean_empty_folder(path)


- def grouped_col_list(df: pd.DataFrame, key_col="query", value_col="output"):
-     grouped = defaultdict(list)
-     if key_col not in df.columns:
-         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
-         return grouped
-     for i, row in df.iterrows():
-         grouped[row[key_col]].append(row[value_col])
-     return grouped
-
-
- def grouped_col(df: pd.DataFrame, key_col="query", value_col="output"):
-     grouped = {}
-     if key_col not in df.columns:
-         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
-         return grouped
-     for i, row in df.iterrows():
-         grouped[row[key_col]] = row[value_col]
-     return grouped
-
-
- def grouped_row(df: pd.DataFrame, key_col="query"):
-     grouped = defaultdict(list)
-     if key_col not in df.columns:
-         logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
-         return grouped
-     for i, row in df.iterrows():
-         grouped[row[key_col]].append(row)
-     return grouped
-
-
- def grouped_row_in_jsonlist(jsonlist: List[Dict[str, Any]], key_col="query"):
-     grouped = defaultdict(list)
-     for i, row in enumerate(jsonlist):
-         if key_col not in row:
-             logger.warning(f"`{key_col}` not in row: {row}")
-             notfound_key = f"NotFound:{key_col}"
-             grouped[notfound_key].append(row)
-             continue
-         grouped[row[key_col]].append(row)
-     return grouped
-

  def submit_file(path: Union[str, Path], target_dir: Union[str, Path]):
      p = Path(path).absolute()
@@ -267,44 +200,3 @@ def submit_file(path: Union[str, Path], target_dir: Union[str, Path]):
      logger.info("现在目标文件夹下的文件有:\n" + "\n".join(filenames))


- def pretty_limited_text(text: str, limited_length: int = 300, language="zh"):
-     text = str(text).strip()
-     if len(text) > limited_length:
-         # if language == "zh":
-         #     tail = f"...(共{len(text)}字)"
-         # else:
-         #     tail = f"...({len(text)} words in total)"
-         # return text[: limited_length - len(tail)] + tail
-         return text[: limited_length // 2] + text[-limited_length // 2 :]
-     return text
-
-
- def bucket_count(length):
-     grouped_count = []
-     j = 0
-     for i in range(0, max(length), 50):
-         grouped_count.append(0)
-         while length[j] < i:
-             grouped_count[i // 50] += 1
-             j += 1
-     for i, j in enumerate(grouped_count):
-         if i == 0 or j == 0:
-             continue
-         print(f"[{(i-1)*50}, {i*50}) {j} {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
-
-
- def sortedCounter(obj, by="key", reverse=False, return_list=False):
-     c = Counter(obj)
-     c_list = [(k, c[k]) for k in c]
-     if by == "key":
-         c_list = sorted(c_list, key=lambda x: x[0], reverse=reverse)
-     elif by in ["value", "count"]:
-         c_list = sorted(c_list, key=lambda x: x[1], reverse=reverse)
-     else:
-         raise Exception(f"unsupported by: {by}")
-     c = Counter()
-     for k, v in c_list:
-         c[k] = v
-     if return_list:
-         return c, c_list
-     return c
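Only fragments of `auto_retry_to_get_data` appear in these hunks: the visible lines show that it returns `{}` once `retry_times` reaches 0 and otherwise retries recursively. A hedged usage sketch, with `fetch` as a hypothetical request function, assuming the function pulls `data_key` out of the response envelope:

```python
from xlin import auto_retry_to_get_data

def fetch(url):
    # hypothetical request function returning an API-style envelope
    return {"code": 0, "data": {"items": [1, 2, 3]}}

# up to 3 attempts; falls back to {} if every attempt fails
data = auto_retry_to_get_data(3, fetch, "data", "https://example.com/api")
print(data)
```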
xlin/image_util.py ADDED
@@ -0,0 +1,237 @@
+
+ import base64
+ from io import BytesIO
+ from loguru import logger
+ from PIL import Image, ImageDraw, ImageFont
+ import uuid
+ import os
+
+
+ def image_to_base64(image: Image.Image) -> str:
+     buffered = BytesIO()
+     image.save(buffered, format="PNG")
+     b64 = base64.b64encode(buffered.getvalue()).decode()
+     return f"data:image/png;base64,{b64}"
+
+
+ def base64_to_image(base64_str: str) -> Image.Image:
+     """
+     Convert a base64 string to an image.
+     """
+     prefix_list = [
+         "data:image/png;base64,",
+         "data:image/jpeg;base64,",
+         "data:image/gif;base64,",
+         "data:image/webp;base64,",
+     ]
+     for prefix in prefix_list:
+         if base64_str.startswith(prefix):
+             base64_str = base64_str[len(prefix):]
+             break
+     image_data = base64.b64decode(base64_str)
+     image = Image.open(BytesIO(image_data))
+     return image
+
+
+ def generate_short_uuid(length=8):
+     # generate a standard UUID
+     uuid_value = uuid.uuid4().bytes
+
+     # encode it with URL-safe Base64
+     encoded = base64.urlsafe_b64encode(uuid_value).decode("ascii")
+
+     # strip any '=' padding characters
+     encoded = encoded.rstrip("=")
+
+     # truncate to the requested length
+     return encoded[:length]
+
+
+
+ def scale_to_fit(image: Image.Image, target_size: tuple[int, int]=(512, 512)) -> Image.Image:
+     """
+     Scale an image to fit within the target size while keeping the original aspect ratio.
+
+     args:
+         image: PIL.Image.Image
+             The image to scale.
+         target_size: tuple[int, int]
+             The target size as (width, height).
+
+     return: PIL.Image.Image
+         The scaled image.
+     """
+     original_width, original_height = image.size
+     target_width, target_height = target_size
+
+     # compute the scale ratio
+     width_ratio = target_width / original_width
+     height_ratio = target_height / original_height
+     scale_ratio = min(width_ratio, height_ratio)
+     if scale_ratio >= 1:
+         # the image already fits within the target size; no scaling needed
+         return image
+
+     # compute the new dimensions
+     new_width = round(original_width * scale_ratio)
+     new_height = round(original_height * scale_ratio)
+
+     # resize the image
+     resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+     return resized_image
+
+
+ def add_scale_bar(
+     image: Image.Image,
+     spacing=64,
+     color=(0, 0, 0),
+     font_size=12,
+     left_margin=50,
+     top_margin=50,
+     tick_length=8,
+     tick_width=2,
+     text_offset=2,
+     origin_size: tuple[int, int] = None,
+ ):
+     """
+     Add top and left rulers to an image, placing the labels in the blank margins so they do not overlap the image.
+
+     args:
+         image: PIL.Image.Image
+             The image to annotate.
+         spacing: int
+             Interval between ticks, in pixels.
+         color: tuple
+             Color of the ticks and labels, as RGB.
+         font_size: int
+             Font size of the labels.
+         left_margin: int
+             Width of the left margin, in pixels.
+         top_margin: int
+             Height of the top margin, in pixels.
+         tick_length: int
+             Length of the ticks, in pixels.
+         tick_width: int
+             Width of the ticks, in pixels.
+         text_offset: int
+             Gap between the labels and the ticks, in pixels.
+         origin_size: tuple[int, int]
+             Size of the original image as (width, height). If not given, the actual image size is used.
+     return: PIL.Image.Image
+
+     Example usage
+     ```
+     img = Image.open("/Pictures/example.png")
+     out = add_scale_bar(
+         img,
+         spacing=100,
+         color=(0, 0, 0),
+         font_size=12,
+         left_margin=50,
+         top_margin=50,
+         tick_length=8,
+         text_offset=4,
+         origin_size=(img.width, img.height)  # optional: the original image size
+     )
+     out
+     ```
+     """
+     # load a font
+     try:
+         font_path = "C:/Windows/Fonts/arial.ttf"
+         if not os.path.exists(font_path):
+             font_path = "/System/Library/Fonts/Supplemental/Arial.ttf"
+         if not os.path.exists(font_path):
+             font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
+         if not os.path.exists(font_path):
+             font_path = "/usr/share/fonts/truetype/freefont/FreeMono.ttf"
+         if not os.path.exists(font_path):
+             font_path = "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf"
+         if not os.path.exists(font_path):
+             font_path = "/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf"
+         if not os.path.exists(font_path):
+             font_path = "/usr/share/fonts/truetype/ubuntu/Ubuntu-C.ttf"
+         font = ImageFont.truetype(font_path, font_size)
+     except:
+         font = ImageFont.load_default()
+
+     w, h = image.size
+     new_w, new_h = w + left_margin, h + top_margin
+
+     # create the background canvas and paste the original image
+     mode = image.mode
+     bg = (255, 255, 255) if mode == "RGB" else (255,)
+     canvas = Image.new(mode, (new_w, new_h), bg)
+     canvas.paste(image, (left_margin, top_margin))
+
+     draw = ImageDraw.Draw(canvas)
+
+     # helper to measure text width and height
+     def text_dimensions(txt):
+         bbox = draw.textbbox((0, 0), txt, font=font)
+         return bbox[2] - bbox[0], bbox[3] - bbox[1]
+
+     origin_width, origin_height = origin_size if origin_size else (w, h)
+
+     # top ticks and labels
+     x_ticks = range(0, w + 1, spacing)
+     for i, x in enumerate(x_ticks):
+         # x coordinate of the tick line
+         px = left_margin + x
+         if i == len(x_ticks) - 1:
+             # the last tick sits on the right boundary
+             px = new_w - tick_width
+         # tick line
+         draw.line([(px, top_margin), (px, top_margin - tick_length)], width=tick_width, fill=color)
+         # label
+         origin_x = x * origin_width // w  # map the tick value onto the original size
+         if i == len(x_ticks) - 1:
+             origin_x = origin_width  # make sure the last label equals the original width
+         txt = str(origin_x)
+         tw, th = text_dimensions(txt)
+         tx = px - tw / 2
+         if i == len(x_ticks) - 1:
+             # place the last label to the left of its tick
+             tx = tx - tw / 2
+         ty = top_margin - tick_length - th - text_offset
+         draw.text((tx, ty), txt, fill=color, font=font)
+
+     # left ticks and labels
+     y_ticks = range(0, h + 1, spacing)
+     for i, y in enumerate(y_ticks):
+         # y coordinate of the tick line
+         py = top_margin + y
+         if i == len(y_ticks) - 1:
+             # the last tick sits on the bottom boundary
+             py = new_h - tick_width
+         # tick line
+         draw.line([(left_margin, py), (left_margin - tick_length, py)], width=tick_width, fill=color)
+         # label
+         origin_y = y * origin_height // h  # map the tick value onto the original size
+         if i == len(y_ticks) - 1:
+             origin_y = origin_height
+         txt = str(origin_y)
+         tw, th = text_dimensions(txt)
+         tx = left_margin - tick_length - tw - text_offset
+         ty = py - th / 2
+         if i == len(y_ticks) - 1:
+             # place the last label above its tick
+             ty = ty - th / 3 * 2
+         draw.text((tx, ty), txt, fill=color, font=font)
+
+     return canvas
+
+
+
+ def scale_to_fit_and_add_scale_bar(image: Image.Image, debug=False) -> Image.Image:
+     origin_width, origin_height = image.size
+     target_width, target_height = 512, 512
+     if debug:
+         logger.debug(f"原图尺寸: {origin_width}x{origin_height}, 目标尺寸: {target_width}x{target_height}")
+     image = scale_to_fit(image, target_size=(target_width, target_height))  # scale the image down to save image tokens
+     if debug:
+         logger.debug(f"缩放后图片尺寸: {image.size[0]}x{image.size[1]}")
+     image = add_scale_bar(image, origin_size=(origin_width, origin_height))  # keep the ruler in original-image coordinates so a model can locate positions and crop boxes on the original
+     if debug:
+         logger.debug(f"添加比例尺后图片尺寸: {image.size[0]}x{image.size[1]}")
+     return image
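The three image helpers compose: `scale_to_fit_and_add_scale_bar` shrinks an image to fit 512×512 and draws rulers labeled in original-image coordinates. A small round-trip sketch (the file path is a placeholder):

```python
from PIL import Image

from xlin import base64_to_image, image_to_base64, scale_to_fit_and_add_scale_bar

img = Image.open("example.png")  # placeholder path
annotated = scale_to_fit_and_add_scale_bar(img, debug=True)

# round-trip through the data-URL helpers
b64 = image_to_base64(annotated)
restored = base64_to_image(b64)
assert restored.size == annotated.size
```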
xlin/jsonl.py → xlin/jsonlist_util.py RENAMED
@@ -1,3 +1,4 @@
+ from collections import defaultdict
  import json
  from typing import *

@@ -6,8 +7,8 @@ from loguru import logger
  import pandas as pd
  import pyexcel

- from xlin.util import ls
- from xlin.xls2xlsx import is_xslx
+ from xlin.file_util import ls
+ from xlin.xlsx_util import is_xslx


  def dataframe_to_json_list(df: pd.DataFrame):
@@ -337,3 +338,15 @@ def generator_from_jsonl(path):
      jsonlist = load_json_list(path)
      for line in jsonlist:
          yield line
+
+ def grouped_row_in_jsonlist(jsonlist: List[Dict[str, Any]], key_col="query"):
+     grouped = defaultdict(list)
+     for i, row in enumerate(jsonlist):
+         if key_col not in row:
+             logger.warning(f"`{key_col}` not in row: {row}")
+             notfound_key = f"NotFound:{key_col}"
+             grouped[notfound_key].append(row)
+             continue
+         grouped[row[key_col]].append(row)
+     return grouped
+
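Unlike the DataFrame grouping helpers, `grouped_row_in_jsonlist` does not bail out when the key is missing; rows lacking `key_col` are collected under a `NotFound:<key_col>` bucket instead. For example:

```python
from xlin import grouped_row_in_jsonlist

rows = [
    {"query": "a", "output": 1},
    {"output": 2},  # no "query" key: goes into the NotFound bucket
]
grouped = grouped_row_in_jsonlist(rows)
print(sorted(grouped.keys()))  # ['NotFound:query', 'a']
```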
xlin/multiprocess_mapping.py → xlin/multiprocess_util.py RENAMED
@@ -9,9 +9,9 @@ from pathlib import Path
  from tqdm import tqdm
  from loguru import logger

- from xlin.jsonl import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
- from xlin.read_as_dataframe import read_as_dataframe
- from xlin.util import ls
+ from xlin.jsonlist_util import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
+ from xlin.dataframe_util import read_as_dataframe
+ from xlin.file_util import ls


  def element_mapping(
xlin/statistic.py CHANGED
@@ -1,10 +1,26 @@
  import sys
  from typing import List, Optional
- from collections import defaultdict
+ from collections import Counter, defaultdict

  import pandas as pd


+ def sortedCounter(obj, by="key", reverse=False, return_list=False):
+     c = Counter(obj)
+     c_list = [(k, c[k]) for k in c]
+     if by == "key":
+         c_list = sorted(c_list, key=lambda x: x[0], reverse=reverse)
+     elif by in ["value", "count"]:
+         c_list = sorted(c_list, key=lambda x: x[1], reverse=reverse)
+     else:
+         raise Exception(f"unsupported by: {by}")
+     c = Counter()
+     for k, v in c_list:
+         c[k] = v
+     if return_list:
+         return c, c_list
+     return c
+

  def bucket_count(length: List[int], step=50, skip_zero_count=False):
      grouped_count = []
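`sortedCounter`, now housed in `statistic.py` next to the other counting helpers, re-sorts a `Counter` by key or by count, relying on `Counter` preserving insertion order (guaranteed since Python 3.7). For example:

```python
from xlin import sortedCounter

c = sortedCounter("abracadabra", by="count", reverse=True)
print(c)  # Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})

c, pairs = sortedCounter("abracadabra", by="key", return_list=True)
print(pairs)  # [('a', 5), ('b', 2), ('c', 1), ('d', 1), ('r', 2)]
```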
xlin/text_util.py ADDED
@@ -0,0 +1,24 @@
+ def text_is_all_chinese(test: str):
+     for ch in test:
+         if '\u4e00' <= ch <= '\u9fff':
+             continue
+         return False
+     return True
+
+
+ def text_contains_chinese(test: str):
+     for ch in test:
+         if '\u4e00' <= ch <= '\u9fff':
+             return True
+     return False
+
+ def pretty_limited_text(text: str, limited_length: int = 300, language="zh"):
+     text = str(text).strip()
+     if len(text) > limited_length:
+         # if language == "zh":
+         #     tail = f"...(共{len(text)}字)"
+         # else:
+         #     tail = f"...({len(text)} words in total)"
+         # return text[: limited_length - len(tail)] + tail
+         return text[: limited_length // 2] + text[-limited_length // 2 :]
+     return text
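Note how `pretty_limited_text` truncates: it keeps the head and tail halves of the string, so with the default limit of 300 the result is the first 150 plus the last 150 characters (the commented-out branch that appended a length tail is carried over from the old `util.py`):

```python
from xlin import pretty_limited_text

print(len(pretty_limited_text("x" * 1000)))  # 300
print(pretty_limited_text("abcdefghij", limited_length=4))  # 'abij'
```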
xlin/timing.py → xlin/timing_util.py RENAMED
@@ -41,3 +41,17 @@ class Timer:
      def __exit__(self, *args):
          self.end = time.perf_counter()
          self.interval = self.end - self.start
+
+ if __name__ == "__main__":
+     with Timer() as t:
+         time.sleep(1)
+     print(t.interval)
+     with Benchmark("Test Benchmark") as b:
+         time.sleep(1)
+     print(b.time)
+     @timing
+     def test_function(x, y):
+         time.sleep(1)
+         return x + y
+     result = test_function(1, 2)
+     print(f"Result of test_function: {result}")
xlin-0.1.39.dist-info/METADATA ADDED
@@ -0,0 +1,282 @@
+ Metadata-Version: 2.1
+ Name: xlin
+ Version: 0.1.39
+ Summary: toolbox for LinXueyuan
+ License: MIT
+ Author: LinXueyuanStdio
+ Author-email: 23211526+LinXueyuanStdio@users.noreply.github.com
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 2
+ Classifier: Programming Language :: Python :: 2.7
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.4
+ Classifier: Programming Language :: Python :: 3.5
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: loguru
+ Requires-Dist: pandas
+ Requires-Dist: pyexcel
+ Requires-Dist: pyexcel-xls
+ Requires-Dist: pyexcel-xlsx
+ Requires-Dist: pyyaml
+ Requires-Dist: tqdm
+ Requires-Dist: xlsxwriter
+ Description-Content-Type: text/markdown
+
+ # xlin
+
+ A collection of Python utility code offering a rich set of helper functions covering file operations, data processing, multiprocessing, and more, aimed at improving development efficiency.
+
+ ## Installation
+
+ ```bash
+ pip install xlin --upgrade
+ ```
+
+ ## Usage
+
+ ```python
+ from xlin import *
+ ```
+
+ ### File operations: `ls`, `rm` and `cp`
+ - `ls`: list all files under a file or directory.
+ - `rm`: delete a file or directory.
+ - `cp`: copy a file or directory.
+
+ ```python
+ from xlin import ls, rm, cp
+
+ dir_path = "./data"
+ dir_path = "/mnt/data.json"
+ dir_path = "./data,/mnt/data.json"
+ dir_path = ["./data", "/mnt/data.json", "./data,/mnt/data.json"]
+ def filter_func(path: Path) -> bool:
+     return path.name.endswith('.json')
+
+ filepaths: list[Path] = ls(dir_path, filter=filter_func)
+ rm(dir_path)
+ cp(dir_path, "./backup_data")  # keeps the directory structure based on the longest common prefix
+ ```
+
+ ### Readers
+
+ - `read_as_json_list`: read a JSON file as a list.
+ - `read_as_dataframe`: read a file as a DataFrame. If given a directory, reads every file under it as a table and concatenates them.
+ - `read_as_dataframe_dict`: read a file into a dict, with headers as keys and column data as values.
+ - `load_text`: load a text file.
+ - `load_yaml`: load a YAML file.
+ - `load_json`: load a JSON file.
+ - `load_json_list`: load a JSON list (JSONL) file.
+
+
+ > The `read_as_**` functions accept a directory or a file and support many formats, including Excel, CSV, JSON, Parquet, etc.
+ >
+ > The `load_**` functions are mainly for loading a single file, supporting text, YAML and JSON formats.
+
+ ```python
+ from xlin import *
+ import pandas as pd
+
+ dir_path = "./data"
+ dir_path = "./data,data.xlsx,data.csv,data.json,data.jsonl,data.parquet,data.feather,data.pkl,data.h5,data.txt,data.tsv,data.xml,data.html,data.db"
+ dir_path = "./data,/mnt/data.json"
+ dir_path = ["./data", "/mnt/data.json", "./data,/mnt/data.json"]
+ df_single = read_as_dataframe(dir_path)
+ jsonlist = read_as_json_list(dir_path)
+ df_dict = read_as_dataframe_dict(dir_path) # xlsx or dirs
+ for sheet_name, df in df_dict.items():
+     print(f"Sheet: {sheet_name}")
+     print(df)
+
+ text = load_text("example.txt")
+ yaml_data = load_yaml("example.yaml")
+ json_data = load_json("example.json")
+ json_list_data = load_json_list("example.jsonl")
+ ```
+
+ ### Writers
+
+ ```python
+ save_json(data, 'output.json')
+ save_json_list(jsonlist, 'output.jsonl')
+ save_df(df, 'output.xlsx')
+ save_df_dict(df_dict, 'output.xlsx')  # save the dict returned by read_as_dataframe_dict as an Excel file
+ save_df_from_jsonlist(jsonlist, 'output_from_jsonlist.xlsx')
+ append_to_json_list(data, 'output.jsonl')
+ ```
+
+ ### Parallel processing: `xmap`
+ Efficiently process a JSON list, with multiprocessing/multithreading support.
+
+ ```python
+ from xlin import xmap
+
+ jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+
+ def work_func(item):
+     item["text"] = item["text"].upper()
+     return item
+
+ results = xmap(jsonlist, work_func, output_path="output.jsonl", batch_size=2)
+ print(results)
+ ```
+
+ ### Merging multiple files: `merge_json_list`, `merge_multiple_df_dict`
+ Merge multiple JSONL files.
+
+ ```python
+ from xlin import merge_json_list
+
+ filenames = ['example1.jsonl', 'example2.jsonl']
+ output_filename = 'merged.jsonl'
+ merge_json_list(filenames, output_filename)
+ ```
+
+ Merge multiple dicts returned by `read_as_dataframe_dict`.
+
+ ```python
+ from xlin import read_as_dataframe_dict, merge_multiple_df_dict
+
+ df_dict1 = read_as_dataframe_dict('example1.xlsx')
+ df_dict2 = read_as_dataframe_dict('example2.xlsx')
+ merged_df_dict = merge_multiple_df_dict([df_dict1, df_dict2])
+ for sheet_name, df in merged_df_dict.items():
+     print(f"Sheet: {sheet_name}")
+     print(df)
+ ```
+
+ ### Batch operations on JSON files
+ - Apply changes to JSON lists: `apply_changes_to_paths`, `apply_changes_to_jsonlist`
+
+ ```python
+ from xlin import *
+
+ paths = [Path('example1.jsonl'), Path('example2.jsonl')]
+ jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+
+ def change_func(row):
+     if row["id"] == 1:
+         row["text"] = "New Hello"
+         return "updated", row
+     return "unchanged", row
+
+ changes = {"update_text": change_func}
+
+ # 1. apply the changes to file paths
+ apply_changes_to_paths(paths, changes, save=True)
+ # 2. apply the changes to a JSON list
+ new_jsonlist, updated, deleted = apply_changes_to_jsonlist(jsonlist, changes)
+ print(new_jsonlist)
+ ```
+
+ ### Generators
+ - A generator that yields JSON rows from multiple files: `generator_from_paths`
+
+ ```python
+ from xlin import generator_from_paths
+ from pathlib import Path
+
+ paths = [Path('example1.jsonl'), Path('example2.jsonl')]
+
+ for path, row in generator_from_paths(paths):
+     print(f"Path: {path}, Row: {row}")
+ ```
+
+ ### Data conversion
+ - Conversion between DataFrame and JSON list: `dataframe_to_json_list` and `jsonlist_to_dataframe`
+
+ ```python
+ from xlin import dataframe_to_json_list, jsonlist_to_dataframe
+ import pandas as pd
+
+ data = {'col1': [1, 2], 'col2': [3, 4]}
+ df = pd.DataFrame(data)
+
+ json_list = dataframe_to_json_list(df)
+ print(json_list)
+
+ new_df = jsonlist_to_dataframe(json_list)
+ print(new_df)
+ ```
+
+ ### Grouping
+ - Group a DataFrame: `grouped_col_list`, `grouped_col` and `grouped_row`
+
+ ```python
+ from xlin import grouped_col_list, grouped_col, grouped_row
+ import pandas as pd
+
+ data = {'query': ['a', 'a', 'b'], 'output': [1, 2, 3]}
+ df = pd.DataFrame(data)
+
+ grouped_col_list_result = grouped_col_list(df)
+ print(grouped_col_list_result)
+
+ grouped_col_result = grouped_col(df)
+ print(grouped_col_result)
+
+ grouped_row_result = grouped_row(df)
+ print(grouped_row_result)
+ ```
+
+ - Group a JSON list: `grouped_row_in_jsonlist`
+
+ ```python
+ from xlin import grouped_row_in_jsonlist
+
+ jsonlist = [{"query": "a", "output": 1}, {"query": "a", "output": 2}, {"query": "b", "output": 3}]
+ grouped_row_in_jsonlist_result = grouped_row_in_jsonlist(jsonlist)
+ print(grouped_row_in_jsonlist_result)
+ ```
+
+ ### Utilities
+
+ - `random_timestamp` and `random_timestamp_str`: generate a random timestamp and a formatted random time string.
+
+ ```python
+ from xlin import random_timestamp, random_timestamp_str
+
+ timestamp = random_timestamp()
+ print(timestamp)
+
+ timestamp_str = random_timestamp_str()
+ print(timestamp_str)
+ ```
+
+
+ - `df_dict_summary`: summarize the dict returned by `read_as_dataframe_dict`, returning a DataFrame with basic information about each sheet.
+
+ ```python
+ from xlin import read_as_dataframe_dict, df_dict_summary
+
+ df_dict = read_as_dataframe_dict('example.xlsx')
+ summary = df_dict_summary(df_dict)
+ print(summary)
+ ```
+
+ - `text_is_all_chinese` and `text_contains_chinese`: check whether a text is entirely Chinese, or contains any Chinese.
+
+ ```python
+ from xlin import text_is_all_chinese, text_contains_chinese
+
+ text1 = "你好"
+ text2 = "Hello 你好"
+
+ print(text_is_all_chinese(text1))  # True
+ print(text_is_all_chinese(text2))  # False
+ print(text_contains_chinese(text2))  # True
+ ```
+
+ ## License
+
+ This project is licensed under the MIT License; see the [LICENSE](LICENSE) file for details.
+
+ ## Author
+
+ - LinXueyuanStdio <23211526+LinXueyuanStdio@users.noreply.github.com>
xlin-0.1.39.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+ xlin/__init__.py,sha256=CIhMAGhFgqwC6w16MzKcwo2mDjmaRUAcrlZFR3Am--I,321
+ xlin/dataframe_util.py,sha256=Z8k3_XLMP5B13IMov2dQJhe-7dPh_YUJGokibWSx8II,10460
+ xlin/datetime_util.py,sha256=jzdF-58PTb_ofBy6F-LBDnEmsTQ9jvoCgqKZJmyDtqE,687
+ xlin/file_util.py,sha256=mYTABNywdYoSfh1RLJcH7l1FzgKTFWN2-JZMFzv-ehw,7270
+ xlin/image_util.py,sha256=j1QlVXS-aikTDFDINbTmxjZi6CokPDTVlQ6_ABctMWQ,8109
+ xlin/jsonlist_util.py,sha256=dLgrgrSTvg_1plVRCEnilajPM_s3vYdVx2bCTqrZAN8,11316
+ xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
+ xlin/multiprocess_util.py,sha256=-tskCWQlBBCOPycXLj9Y2MugYg-tHF_QYYWW7c1ixOk,17300
+ xlin/statistic.py,sha256=ioJJjL4qwHiwNPeBFBB67keoAIbB-uZM51zkDYviar0,17037
+ xlin/text_util.py,sha256=ejFD8-j8tLCbPlCPFg0Tu3MEMPEpF7R5_IpXXjl6qzA,735
+ xlin/timing_util.py,sha256=nNVKtSXel-Cc8SF_BqPRNkyNDOjGqOMxTol-L1vpON4,1340
+ xlin/xlsx_util.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
+ xlin/yaml_util.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
+ xlin-0.1.39.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+ xlin-0.1.39.dist-info/METADATA,sha256=d9F6S7LBxur58Jm817Kww54kVOBeWTRvc4R0AgOWYfw,7992
+ xlin-0.1.39.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+ xlin-0.1.39.dist-info/RECORD,,
xlin/ischinese.py DELETED
@@ -1,13 +0,0 @@
- def text_is_all_chinese(test: str):
-     for ch in test:
-         if '\u4e00' <= ch <= '\u9fff':
-             continue
-         return False
-     return True
-
-
- def text_contains_chinese(test: str):
-     for ch in test:
-         if '\u4e00' <= ch <= '\u9fff':
-             return True
-     return False
xlin-0.1.38.dist-info/METADATA DELETED
@@ -1,33 +0,0 @@
- Metadata-Version: 2.1
- Name: xlin
- Version: 0.1.38
- Summary: toolbox for LinXueyuan
- License: MIT
- Author: LinXueyuanStdio
- Author-email: 23211526+LinXueyuanStdio@users.noreply.github.com
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 2
- Classifier: Programming Language :: Python :: 2.7
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.4
- Classifier: Programming Language :: Python :: 3.5
- Classifier: Programming Language :: Python :: 3.6
- Classifier: Programming Language :: Python :: 3.7
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: loguru
- Requires-Dist: pandas
- Requires-Dist: pyexcel
- Requires-Dist: pyexcel-xls
- Requires-Dist: pyexcel-xlsx
- Requires-Dist: pyyaml
- Requires-Dist: tqdm
- Requires-Dist: xlsxwriter
- Description-Content-Type: text/markdown
-
- # xlin
- Personal Python utility code
-
xlin-0.1.38.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
- xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
- xlin/jsonl.py,sha256=QLIipsORyMC5OlTW5yntNnXS1aZ4so984yT_c0elM80,10854
- xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
- xlin/multiprocess_mapping.py,sha256=XZJLsYRHyNooeBFngnSZ6l_YhbK0xjbN1_nK8GonmkE,17290
- xlin/read_as_dataframe.py,sha256=ufTH1z-ewdE4X33trXQDWgvsjCE18hzRxSFEvoH7Eaw,9173
- xlin/statistic.py,sha256=nwFSN8BWfTQRimI-zfp6RwfA-I9aFDbemtV2cyh6Hq8,16533
- xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
- xlin/util.py,sha256=HEDJv09tNmvHCgQdP4uhMkDM8fQgcuYa0MuMXZmyZns,10977
- xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
- xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
- xlin-0.1.38.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
- xlin-0.1.38.dist-info/METADATA,sha256=1yJfCyje0O72bLoSxXYr2NMqThAz5pNrAhD-x9DOrJw,1098
- xlin-0.1.38.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
- xlin-0.1.38.dist-info/RECORD,,