PyPI - xlin - Versions diffs - 0.1.38__py2.py3-none-any.whl → 0.2.2__py2.py3-none-any.whl - Mend

xlin 0.1.38__py2.py3-none-any.whl → 0.2.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

xlin/__init__.py +10 -8
xlin/{read_as_dataframe.py → dataframe_util.py} +92 -3
xlin/datetime_util.py +175 -0
xlin/{util.py → file_util.py} +0 -108
xlin/image_util.py +248 -0
xlin/{jsonl.py → jsonlist_util.py} +15 -2
xlin/{multiprocess_mapping.py → multiprocess_util.py} +3 -3
xlin/statistic.py +17 -1
xlin/text_util.py +24 -0
xlin/{timing.py → timing_util.py} +14 -0
xlin-0.2.2.dist-info/METADATA +282 -0
xlin-0.2.2.dist-info/RECORD +17 -0
xlin/ischinese.py +0 -13
xlin-0.1.38.dist-info/METADATA +0 -33
xlin-0.1.38.dist-info/RECORD +0 -15
/xlin/{xls2xlsx.py → xlsx_util.py} +0 -0
/xlin/{yaml.py → yaml_util.py} +0 -0
{xlin-0.1.38.dist-info → xlin-0.2.2.dist-info}/LICENSE +0 -0
{xlin-0.1.38.dist-info → xlin-0.2.2.dist-info}/WHEEL +0 -0

xlin/__init__.py CHANGED Viewed

@@ -1,10 +1,12 @@
-from .ischinese import *
-from .jsonl import *
+from .dataframe_util import *
+from .datetime_util import *
+from .file_util import *
+from .image_util import *
+from .jsonlist_util import *
 from .metric import *
-from .multiprocess_mapping import *
-from .read_as_dataframe import *
+from .multiprocess_util import *
 from .statistic import *
-from .timing import *
-from .util import *
-from .xls2xlsx import *
-from .yaml import *
+from .text_util import *
+from .timing_util import *
+from .xlsx_util import *
+from .yaml_util import *

xlin/{read_as_dataframe.py → dataframe_util.py} RENAMED Viewed

@@ -1,3 +1,4 @@
+import asyncio
 from collections import defaultdict
 import os
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -7,9 +8,9 @@ from loguru import logger
 import pandas as pd
 import pyexcel
-from xlin.util import ls
-from xlin.jsonl import dataframe_to_json_list, load_json, load_json_list, save_json_list
-from xlin.xls2xlsx import is_xslx
+from xlin.file_util import ls
+from xlin.jsonlist_util import dataframe_to_json_list, load_json, load_json_list, save_json_list
+from xlin.xlsx_util import is_xslx
 def read_as_dataframe(
@@ -269,3 +270,91 @@ def split_dataframe(
         df_i.to_excel(filepath, index=False)
         df_list.append(df_i)
     return df_list
+def append_column(df: pd.DataFrame, query_column: str, output_column: str, transform):
+    """
+    Run an async batch transform over one column and store the results in another.
+    Args:
+        df (pd.DataFrame): Table to read from; it is modified in place.
+        query_column (str): Column whose values are passed to `transform` as a list.
+        output_column (str): Column that receives the stringified results.
+        transform: Callable that takes the list of query values and returns an
+            awaitable producing one result per value.
+    Returns:
+        pd.DataFrame: The same `df` object, with `output_column` set.
+    """
+    query = df[query_column].tolist()
+    # asyncio.get_event_loop() raises RuntimeError when the thread has no current
+    # loop (worker threads, after asyncio.run(), newer Python versions), and a
+    # previously used loop may already be closed. Fall back to a fresh loop then.
+    try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:
+        loop = None
+    if loop is None or loop.is_closed():
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+    result = loop.run_until_complete(transform(query))
+    df[output_column] = [str(r) for r in result]
+    return df
+def grouped_col_list(df: pd.DataFrame, key_col="query", value_col="output"):
+    """
+    Group the values of one column by the values of another.
+    Args:
+        df (pd.DataFrame): Source table.
+        key_col: Column whose values become the dictionary keys.
+        value_col: Column whose values are collected under each key, in row order.
+    Returns:
+        defaultdict(list): Maps each key to the list of its values. It is empty
+        (after a logged warning) when `key_col` is not a column of `df`.
+    Raises:
+        KeyError: If `value_col` is not a column of `df`.
+    """
+    grouped = defaultdict(list)
+    if key_col not in df.columns:
+        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+        return grouped
+    # Walk the two columns directly: iterrows() builds a Series per row, which
+    # is slow and upcasts values when the columns have different dtypes.
+    for key, value in zip(df[key_col], df[value_col]):
+        grouped[key].append(value)
+    return grouped
+def grouped_col(df: pd.DataFrame, key_col="query", value_col="output"):
+    """
+    Map each value of `key_col` to the matching value of `value_col`.
+    When a key occurs in several rows, the value of the last such row wins.
+    Args:
+        df (pd.DataFrame): Source table.
+        key_col: Column whose values become the dictionary keys.
+        value_col: Column whose values become the dictionary values.
+    Returns:
+        dict: Key-to-value mapping. It is empty (after a logged warning) when
+        `key_col` is not a column of `df`.
+    Raises:
+        KeyError: If `value_col` is not a column of `df`.
+    """
+    if key_col not in df.columns:
+        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+        return {}
+    # dict(zip(...)) keeps the last value per key, like the row-by-row loop it
+    # replaces, without building a Series per row or upcasting mixed dtypes.
+    return dict(zip(df[key_col], df[value_col]))
+def grouped_row(df: pd.DataFrame, key_col="query"):
+    """
+    Group whole rows by the value they hold in `key_col`.
+    Args:
+        df (pd.DataFrame): Source table.
+        key_col: Column whose values become the dictionary keys.
+    Returns:
+        defaultdict(list): Maps each key to the list of row Series that carry
+        it, in row order. It is empty (after a logged warning) when `key_col`
+        is not a column of `df`.
+    """
+    rows_by_key = defaultdict(list)
+    if key_col in df.columns:
+        for _, record in df.iterrows():
+            rows_by_key[record[key_col]].append(record)
+    else:
+        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
+    return rows_by_key
+def select_sub_df(
+    df: pd.DataFrame,
+    start_date: str,
+    end_date: str,
+    lookback_window: int = 0,
+    lookforward_window: int = 0,
+    include_end_date: bool = False,
+) -> pd.DataFrame:
+    """
+    Select the rows of a date-indexed DataFrame that fall inside a date range.
+    The range is half-open, [start, end): a row stamped exactly at the end
+    boundary is excluded unless `include_end_date` extends the range by one day.
+    Args:
+        df (pd.DataFrame): DataFrame whose index holds dates (a DatetimeIndex, or
+            values `pd.to_datetime` can parse). The caller's object is not modified.
+        start_date (str): First date of the range, e.g. 'YYYY-MM-DD'.
+        end_date (str): End of the range, e.g. 'YYYY-MM-DD'.
+        lookback_window (int): Extra days to include before `start_date`. Defaults to 0.
+        lookforward_window (int): Extra days to include after `end_date`. Defaults to 0.
+        include_end_date (bool): Whether the whole end day belongs to the range.
+            Defaults to False.
+    Returns:
+        pd.DataFrame: The rows inside the range, ordered by index.
+    """
+    if not isinstance(df.index, pd.DatetimeIndex):
+        # Convert on a copy so the caller's index is left untouched.
+        df = df.copy()
+        df.index = pd.to_datetime(df.index)
+    if not df.index.is_monotonic_increasing:
+        df = df.sort_index()
+    # Build the bounds in the index's timezone so they compare cleanly with it.
+    tz = df.index.tz
+    start = pd.Timestamp(start_date, tz=tz)
+    end = pd.Timestamp(end_date, tz=tz)
+    if lookback_window > 0:
+        start = start - pd.Timedelta(days=lookback_window)
+    if lookforward_window > 0:
+        end = end + pd.Timedelta(days=lookforward_window)
+    if include_end_date:
+        end = end + pd.Timedelta(days=1)
+    # A label slice such as df[start:end] includes both endpoints, so it would
+    # return the row stamped at midnight of an excluded end date. A boolean mask
+    # keeps the upper bound exclusive and never raises for out-of-range dates.
+    in_range = (df.index >= start) & (df.index < end)
+    return df[in_range]

xlin/datetime_util.py ADDED Viewed

@@ -0,0 +1,175 @@
+from typing import Literal, Optional, Union
+import datetime
+import random
+import pandas as pd
+# Take a single snapshot of the import time so both strings describe the same
+# instant (two separate now() calls could straddle midnight).
+_import_time = datetime.datetime.now()
+date_str = _import_time.strftime("%Y%m%d")
+datetime_str = _import_time.strftime("%Y%m%d_%Hh%Mm%Ss")
+def random_timestamp(start_timestamp: Optional[float]=None, end_timestamp: Optional[float]=None):
+    """
+    Draw a random POSIX timestamp between two bounds.
+    Args:
+        start_timestamp: Lower bound in seconds; defaults to the timestamp of
+            the naive datetime 2024-01-01 00:00.
+        end_timestamp: Upper bound in seconds; defaults to the current time.
+    Returns:
+        float: A value drawn with `random.uniform` between the two bounds.
+    """
+    lower = datetime.datetime(2024, 1, 1).timestamp() if start_timestamp is None else start_timestamp
+    upper = datetime.datetime.now().timestamp() if end_timestamp is None else end_timestamp
+    return random.uniform(lower, upper)
+def random_datetime(
+    start_datetime: Optional[datetime.datetime] = None,
+    end_datetime: Optional[datetime.datetime] = None,
+) -> datetime.datetime:
+    """
+    Return a random datetime between two bounds.
+    The bounds default to 2024-01-01 and the current time. The result is built
+    with `datetime.datetime.fromtimestamp` from a uniformly drawn timestamp.
+    """
+    lower = datetime.datetime(2024, 1, 1) if start_datetime is None else start_datetime
+    upper = datetime.datetime.now() if end_datetime is None else end_datetime
+    picked = random.uniform(lower.timestamp(), upper.timestamp())
+    return datetime.datetime.fromtimestamp(picked)
+# Holiday calendars for the US and China, created lazily on first use and then reused.
+us_holidays = None # US(categories=US.supported_categories)
+cn_holidays = None # CN(categories=CN.supported_categories)
+def format_datetime_with_holiday(
+    dt: Union[datetime.datetime, str, pd.Series, float],
+    language: Literal["zh", "en"] = "zh",
+    with_time: bool = True,
+    with_weekday: bool = True,
+    with_holiday: bool = True,
+) -> Union[str, pd.Series]:
+    """
+    Format a point in time as a date with optional time, weekday and holiday tags.
+    Example output for "zh" (holiday names come from the `holidays` package):
+    2024年01月01日 10:00:00 星期一 [假期: 🇨🇳 元旦, 🇺🇸 New Year's Day]
+    Args:
+        dt: Value to format: a datetime, a string `pd.to_datetime` can parse, a
+            numeric POSIX timestamp, or a pandas Series of such values.
+        language: "zh" or "en"; selects the date format, weekday names and holiday label.
+        with_time: Whether to append the time of day.
+        with_weekday: Whether to append the weekday name.
+        with_holiday: Whether to append Chinese / US holiday names.
+    Returns:
+        The formatted string, or a Series of formatted strings for Series input.
+    Raises:
+        ValueError: If a value has an unsupported type.
+        KeyError: If `language` is not "zh" or "en".
+        ImportError: If `with_holiday` is set and the `holidays` package is missing.
+    """
+    language_dict = {
+        "zh": {
+            "weekday": ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"],
+            "holiday": "假期",
+            "date_format": "%Y年%m月%d日",
+            "time_format": "%H:%M:%S",
+        },
+        "en": {
+            "weekday": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
+            "holiday": "Holiday",
+            "date_format": "%Y-%m-%d",
+            "time_format": "%H:%M:%S",
+        },
+    }
+    labels = language_dict[language]
+    def _format_one(d: Union[datetime.datetime, str, float]) -> str:
+        if isinstance(d, str):
+            d = pd.to_datetime(d)
+        elif isinstance(d, (int, float)) and not isinstance(d, bool):
+            # Numbers are POSIX timestamps; whole-second ints are accepted as well as floats.
+            d = datetime.datetime.fromtimestamp(d)
+        if not isinstance(d, datetime.datetime):
+            raise ValueError("输入必须是 datetime, timestamp, str 或 pandas.Series 类型。")
+        formatted = d.strftime(labels["date_format"])
+        if with_time:
+            formatted += " " + d.strftime(labels["time_format"])
+        if with_weekday:
+            formatted += " " + labels["weekday"][d.weekday()]
+        if not with_holiday:
+            return formatted
+        global us_holidays, cn_holidays
+        # Test for None rather than truthiness so that a calendar that already
+        # exists (but may still be empty) is not rebuilt on every call.
+        if us_holidays is None or cn_holidays is None:
+            try:
+                from holidays.countries import US, CN
+            except ImportError as err:
+                raise ImportError("请安装 'holidays' 库以支持节假日查询。可以使用 'pip install holidays' 安装。") from err
+            us_holidays = US(categories=US.supported_categories)
+            cn_holidays = CN(categories=CN.supported_categories)
+        tags = []
+        if d in cn_holidays:
+            tags.append(f"🇨🇳 {cn_holidays[d]}")
+        if d in us_holidays:
+            tags.append(f"🇺🇸 {us_holidays[d]}")
+        if tags:
+            formatted += f" [{labels['holiday']}: " + ", ".join(tags) + "]"
+        return formatted
+    if isinstance(dt, pd.Series):
+        return dt.apply(_format_one)
+    return _format_one(dt)
+def format_timedelta(
+    delta: datetime.timedelta,
+    language: Literal["zh", "en"] = "zh",
+) -> str:
+    """
+    Format a timedelta as a compact string, rounded to whole seconds.
+    Units whose value is zero are omitted; a negative interval is formatted by
+    its absolute value.
+    Args:
+        delta: Interval to format.
+        language: "zh" or "en"; selects the unit labels.
+    Returns:
+        The formatted string, e.g. "1天3小时5分" or "45秒"; a zero interval
+        gives "0" followed by the seconds label.
+    """
+    language_dict = {
+        "zh": {
+            "days": "天",
+            "hours": "小时",
+            "minutes": "分",
+            "seconds": "秒",
+        },
+        "en": {
+            "days": "days",
+            "hours": "hours",
+            "minutes": "minutes",
+            "seconds": "seconds",
+        },
+    }
+    labels = language_dict[language]
+    delta = abs(delta)
+    total_seconds = int(delta.total_seconds() + 0.5)  # round half up to whole seconds
+    # Split the rounded total from the largest unit down. Taking hours from the
+    # full total would count the days a second time (1 day 3 h -> "27 hours").
+    days, remainder = divmod(total_seconds, 86400)
+    hours, remainder = divmod(remainder, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    amounts = (("days", days), ("hours", hours), ("minutes", minutes), ("seconds", seconds))
+    parts = [f"{amount}{labels[unit]}" for unit, amount in amounts if amount > 0]
+    return "".join(parts) if parts else f"0{labels['seconds']}"

xlin/{util.py → file_util.py} RENAMED Viewed

@@ -3,30 +3,11 @@ from collections import defaultdict
 from pathlib import Path
 import os
 import asyncio
-import datetime
 import shutil
-import random
 import pandas as pd
 from loguru import logger
-date_str = datetime.datetime.now().strftime("%Y%m%d")
-datetime_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")
-def random_timestamp(start_timestamp=None, end_timestamp=None):
-    if start_timestamp is None:
-        start_timestamp = datetime.datetime(2024, 1, 1).timestamp()
-    if end_timestamp is None:
-        end_timestamp = datetime.datetime.now().timestamp()
-    return random.uniform(start_timestamp, end_timestamp)
-def random_timestamp_str(start_timestamp=None, end_timestamp=None, format="%Y年%m月%d日%H时%M分"):
-    return datetime.datetime.fromtimestamp(random_timestamp(start_timestamp, end_timestamp)).strftime(format)
 def auto_retry_to_get_data(retry_times, request, data_key="data", *args, **kwargs):
     if retry_times == 0:
         return {}
@@ -40,13 +21,6 @@ def auto_retry_to_get_data(retry_times, request, data_key="data", *args, **kwarg
     return auto_retry_to_get_data(retry_times - 1, request, data_key, *args, **kwargs)
-def append_column(df: pd.DataFrame, query_column: str, output_column: str, transform):
-    query = df[query_column].tolist()
-    loop = asyncio.get_event_loop()
-    result = loop.run_until_complete(transform(query))
-    df[output_column] = [str(r) for r in result]
-    return df
 def request_wrapper(request_num=10):
     def request_wrapper_body(func):
@@ -202,47 +176,6 @@ def clean_empty_folder(dir_path):
                 clean_empty_folder(path)
-def grouped_col_list(df: pd.DataFrame, key_col="query", value_col="output"):
-    grouped = defaultdict(list)
-    if key_col not in df.columns:
-        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
-        return grouped
-    for i, row in df.iterrows():
-        grouped[row[key_col]].append(row[value_col])
-    return grouped
-def grouped_col(df: pd.DataFrame, key_col="query", value_col="output"):
-    grouped = {}
-    if key_col not in df.columns:
-        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
-        return grouped
-    for i, row in df.iterrows():
-        grouped[row[key_col]] = row[value_col]
-    return grouped
-def grouped_row(df: pd.DataFrame, key_col="query"):
-    grouped = defaultdict(list)
-    if key_col not in df.columns:
-        logger.warning(f"`{key_col}` not in columns: {list(df.columns)}")
-        return grouped
-    for i, row in df.iterrows():
-        grouped[row[key_col]].append(row)
-    return grouped
-def grouped_row_in_jsonlist(jsonlist: List[Dict[str, Any]], key_col="query"):
-    grouped = defaultdict(list)
-    for i, row in enumerate(jsonlist):
-        if key_col not in row:
-            logger.warning(f"`{key_col}` not in row: {row}")
-            notfound_key = f"NotFound:{key_col}"
-            grouped[notfound_key].append(row)
-            continue
-        grouped[row[key_col]].append(row)
-    return grouped
 def submit_file(path: Union[str, Path], target_dir: Union[str, Path]):
     p = Path(path).absolute()
@@ -267,44 +200,3 @@ def submit_file(path: Union[str, Path], target_dir: Union[str, Path]):
     logger.info("现在目标文件夹下的文件有：\n" + "\n".join(filenames))
-def pretty_limited_text(text: str, limited_length: int = 300, language="zh"):
-    text = str(text).strip()
-    if len(text) > limited_length:
-        # if language == "zh":
-        #     tail = f"...(共{len(text)}字)"
-        # else:
-        #     tail = f"...({len(text)} words in total)"
-        # return text[: limited_length - len(tail)] + tail
-        return text[: limited_length // 2] + text[-limited_length // 2 :]
-    return text
-def bucket_count(length):
-    grouped_count = []
-    j = 0
-    for i in range(0, max(length), 50):
-        grouped_count.append(0)
-        while length[j] < i:
-            grouped_count[i // 50] += 1
-            j += 1
-    for i, j in enumerate(grouped_count):
-        if i == 0 or j == 0:
-            continue
-        print(f"[{(i-1)*50}, {i*50})  {j}   {sum(grouped_count[:i+1])/len(length)*100:.2f}%")
-def sortedCounter(obj, by="key", reverse=False, return_list=False):
-    c = Counter(obj)
-    c_list = [(k, c[k]) for k in c]
-    if by == "key":
-        c_list = sorted(c_list, key=lambda x: x[0], reverse=reverse)
-    elif by in ["value", "count"]:
-        c_list = sorted(c_list, key=lambda x: x[1], reverse=reverse)
-    else:
-        raise Exception(f"unsupported by: {by}")
-    c = Counter()
-    for k, v in c_list:
-        c[k] = v
-    if return_list:
-        return c, c_list
-    return c

xlin/image_util.py ADDED Viewed

@@ -0,0 +1,248 @@
+import base64
+from io import BytesIO
+from loguru import logger
+from PIL import Image, ImageDraw, ImageFont
+import uuid
+import os
+import requests
+def read_image_http_url(image_url: str, timeout: float = 30.0) -> Image.Image:
+    """
+    Download an image over HTTP(S) and open it with Pillow.
+    Args:
+        image_url: URL of the image.
+        timeout: Seconds to wait for the server before giving up.
+    Returns:
+        The opened image.
+    Raises:
+        requests.RequestException: If the request fails, times out, or the
+            server answers with an error status.
+    """
+    # Without a timeout, requests can wait indefinitely on an unresponsive server.
+    response = requests.get(image_url, timeout=timeout)
+    # Fail on 4xx/5xx instead of handing an error page to the image decoder.
+    response.raise_for_status()
+    return Image.open(BytesIO(response.content))
+def image_to_base64(image: Image.Image) -> str:
+    """
+    Encode an image as a PNG data URI ("data:image/png;base64,...").
+    """
+    png_buffer = BytesIO()
+    image.save(png_buffer, format="PNG")
+    payload = base64.b64encode(png_buffer.getvalue()).decode()
+    return "data:image/png;base64," + payload
+def base64_to_image(base64_str: str) -> Image.Image:
+    """
+    Decode a Base64 string, with or without a data URI header, into an image.
+    Args:
+        base64_str: Bare Base64 data, or a data URI such as
+            "data:image/png;base64,<payload>".
+    Returns:
+        The opened image.
+    """
+    if base64_str.startswith("data:"):
+        # Everything up to the first comma is the data URI header. Dropping it
+        # handles any media type, not only png/jpeg/gif/webp.
+        base64_str = base64_str.partition(",")[2]
+    image_data = base64.b64decode(base64_str)
+    return Image.open(BytesIO(image_data))
+def generate_short_uuid(length=8):
+    """
+    Return a short random identifier made of URL-safe Base64 characters.
+    The identifier is the first `length` characters of a Base64-encoded UUID4
+    with the '=' padding removed, so it is never longer than 22 characters.
+    """
+    token = base64.urlsafe_b64encode(uuid.uuid4().bytes).decode("ascii").rstrip("=")
+    return token[:length]
+def scale_to_fit(image: Image.Image, target_size: tuple[int, int]=(512, 512)) -> Image.Image:
+    """
+    Shrink an image to fit inside `target_size` while keeping its aspect ratio.
+    Args:
+        image: Image to scale.
+        target_size: Bounding box as (width, height).
+    Returns:
+        The resized image, or `image` itself when it already fits (images are
+        never enlarged).
+    """
+    original_width, original_height = image.size
+    target_width, target_height = target_size
+    # The smaller ratio is the one that makes both sides fit.
+    scale_ratio = min(target_width / original_width, target_height / original_height)
+    if scale_ratio >= 1:
+        return image
+    # Keep both sides at least one pixel so that an extreme aspect ratio cannot
+    # round one of them down to an empty size.
+    new_width = max(1, round(original_width * scale_ratio))
+    new_height = max(1, round(original_height * scale_ratio))
+    return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+def add_scale_bar(
+    image: Image.Image,
+    spacing=64,
+    color=(0, 0, 0),
+    font_size=12,
+    left_margin=50,
+    top_margin=50,
+    tick_length=8,
+    tick_width=2,
+    text_offset=2,
+    origin_size: tuple[int, int] = None,
+):
+    """
+    Add a ruler along the top and left of an image.
+    The image is pasted onto a larger canvas; ticks and labels are drawn in the
+    added margins so they do not overlap the picture.
+    Args:
+        image: Image to annotate.
+        spacing: Distance between ticks, in pixels of `image`.
+        color: RGB colour of the ticks and labels.
+        font_size: Label font size.
+        left_margin: Width of the margin added on the left, in pixels.
+        top_margin: Height of the margin added on the top, in pixels.
+        tick_length: Length of each tick, in pixels.
+        tick_width: Line width of each tick, in pixels.
+        text_offset: Gap between a tick and its label, in pixels.
+        origin_size: (width, height) that the labels should refer to, e.g. the
+            size of the image before it was scaled down. Defaults to the size
+            of `image`.
+    Returns:
+        PIL.Image.Image: A new image of size (width + left_margin, height + top_margin).
+    Example:
+    ```
+    img = Image.open("/Pictures/example.png")
+    out = add_scale_bar(
+        img,
+        spacing=100,
+        color=(0, 0, 0),
+        font_size=12,
+        left_margin=50,
+        top_margin=50,
+        tick_length=8,
+        text_offset=4,
+        origin_size=(img.width, img.height)  # optional
+    )
+    out
+    ```
+    """
+    # Use the first font file that exists on this machine (Windows, macOS, then
+    # common Linux locations); fall back to Pillow's built-in font otherwise.
+    font_candidates = (
+        "C:/Windows/Fonts/arial.ttf",
+        "/System/Library/Fonts/Supplemental/Arial.ttf",
+        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+        "/usr/share/fonts/truetype/freefont/FreeMono.ttf",
+        "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf",
+        "/usr/share/fonts/truetype/noto/NotoSansMono-Regular.ttf",
+        "/usr/share/fonts/truetype/ubuntu/Ubuntu-C.ttf",
+    )
+    font = None
+    for font_path in font_candidates:
+        if os.path.exists(font_path):
+            try:
+                font = ImageFont.truetype(font_path, font_size)
+            except Exception:
+                # Font loading is best-effort, but unlike a bare `except:` this
+                # lets KeyboardInterrupt and SystemExit propagate.
+                font = None
+            break
+    if font is None:
+        font = ImageFont.load_default()
+    w, h = image.size
+    new_w, new_h = w + left_margin, h + top_margin
+    # White background. RGBA gets an explicit opaque four-component white; other
+    # modes keep the single-value fill.
+    mode = image.mode
+    if mode == "RGB":
+        bg = (255, 255, 255)
+    elif mode == "RGBA":
+        bg = (255, 255, 255, 255)
+    else:
+        bg = (255,)
+    canvas = Image.new(mode, (new_w, new_h), bg)
+    canvas.paste(image, (left_margin, top_margin))
+    draw = ImageDraw.Draw(canvas)
+    def text_dimensions(txt):
+        # Width and height of the rendered label.
+        bbox = draw.textbbox((0, 0), txt, font=font)
+        return bbox[2] - bbox[0], bbox[3] - bbox[1]
+    origin_width, origin_height = origin_size if origin_size else (w, h)
+    # Top ruler.
+    x_ticks = range(0, w + 1, spacing)
+    last_x = len(x_ticks) - 1
+    for i, x in enumerate(x_ticks):
+        is_last = i == last_x
+        # The last tick is pinned to the right edge and labelled with the full width.
+        px = new_w - tick_width if is_last else left_margin + x
+        draw.line([(px, top_margin), (px, top_margin - tick_length)], width=tick_width, fill=color)
+        # Map the tick position to the coordinate system of the original image.
+        origin_x = origin_width if is_last else x * origin_width // w
+        txt = str(origin_x)
+        tw, th = text_dimensions(txt)
+        tx = px - tw / 2
+        if is_last:
+            # Shift the last label left so it stays inside the canvas.
+            tx = tx - tw / 2
+        ty = top_margin - tick_length - th - text_offset
+        draw.text((tx, ty), txt, fill=color, font=font)
+    # Left ruler.
+    y_ticks = range(0, h + 1, spacing)
+    last_y = len(y_ticks) - 1
+    for i, y in enumerate(y_ticks):
+        is_last = i == last_y
+        # The last tick is pinned to the bottom edge and labelled with the full height.
+        py = new_h - tick_width if is_last else top_margin + y
+        draw.line([(left_margin, py), (left_margin - tick_length, py)], width=tick_width, fill=color)
+        origin_y = origin_height if is_last else y * origin_height // h
+        txt = str(origin_y)
+        tw, th = text_dimensions(txt)
+        tx = left_margin - tick_length - tw - text_offset
+        ty = py - th / 2
+        if is_last:
+            # Shift the last label up so it stays inside the canvas.
+            ty = ty - th / 3 * 2
+        draw.text((tx, ty), txt, fill=color, font=font)
+    return canvas
+def scale_to_fit_and_add_scale_bar(image: Image.Image, debug=False) -> Image.Image:
+    """
+    Shrink an image to fit 512x512 and add rulers labelled in original pixels.
+    The image is scaled down first (to reduce image tokens); the rulers are
+    then drawn with `origin_size` set to the pre-scaling size, so the labels
+    refer to coordinates of the original image.
+    Args:
+        image: Image to process.
+        debug: When True, log the image size before and after each step.
+    Returns:
+        The scaled image with rulers added.
+    """
+    origin_width, origin_height = image.size
+    target_width, target_height = 512, 512
+    if debug:
+        logger.debug(f"原图尺寸: {origin_width}x{origin_height}, 目标尺寸: {target_width}x{target_height}")
+    scaled = scale_to_fit(image, target_size=(target_width, target_height))
+    if debug:
+        logger.debug(f"缩放后图片尺寸: {scaled.size[0]}x{scaled.size[1]}")
+    annotated = add_scale_bar(scaled, origin_size=(origin_width, origin_height))
+    if debug:
+        logger.debug(f"添加比例尺后图片尺寸: {annotated.size[0]}x{annotated.size[1]}")
+    return annotated

xlin/{jsonl.py → jsonlist_util.py} RENAMED Viewed

@@ -1,3 +1,4 @@
+from collections import defaultdict
 import json
 from typing import *
@@ -6,8 +7,8 @@ from loguru import logger
 import pandas as pd
 import pyexcel
-from xlin.util import ls
-from xlin.xls2xlsx import is_xslx
+from xlin.file_util import ls
+from xlin.xlsx_util import is_xslx
 def dataframe_to_json_list(df: pd.DataFrame):
@@ -337,3 +338,15 @@ def generator_from_jsonl(path):
     jsonlist = load_json_list(path)
     for line in jsonlist:
         yield line
+def grouped_row_in_jsonlist(jsonlist: List[Dict[str, Any]], key_col="query"):
+    """
+    Group a list of dict rows by the value stored under `key_col`.
+    Rows that lack `key_col` are logged with a warning and collected under the
+    key "NotFound:<key_col>".
+    Returns:
+        defaultdict(list): Maps each key to its rows, in input order.
+    """
+    buckets = defaultdict(list)
+    for record in jsonlist:
+        if key_col in record:
+            bucket_key = record[key_col]
+        else:
+            logger.warning(f"`{key_col}` not in row: {record}")
+            bucket_key = f"NotFound:{key_col}"
+        buckets[bucket_key].append(record)
+    return buckets

xlin/{multiprocess_mapping.py → multiprocess_util.py} RENAMED Viewed

@@ -9,9 +9,9 @@ from pathlib import Path
 from tqdm import tqdm
 from loguru import logger
-from xlin.jsonl import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
-from xlin.read_as_dataframe import read_as_dataframe
-from xlin.util import ls
+from xlin.jsonlist_util import append_to_json_list, dataframe_to_json_list, load_json_list, row_to_json, save_json_list, load_json, save_json
+from xlin.dataframe_util import read_as_dataframe
+from xlin.file_util import ls
 def element_mapping(

xlin/statistic.py CHANGED Viewed

@@ -1,10 +1,26 @@
 import sys
 from typing import List, Optional
-from collections import defaultdict
+from collections import Counter, defaultdict
 import pandas as pd
+def sortedCounter(obj, by="key", reverse=False, return_list=False):
+    """
+    Count the items of `obj` and return the counts in sorted order.
+    Args:
+        obj: Anything `collections.Counter` accepts.
+        by: "key" sorts by item; "value" or "count" sorts by frequency.
+        reverse: Sort in descending order when True.
+        return_list: When True, also return the sorted (item, count) pairs.
+    Returns:
+        A Counter whose insertion order follows the sort, or the tuple
+        (Counter, pairs) when `return_list` is True.
+    Raises:
+        Exception: If `by` is not a supported value.
+    """
+    counts = Counter(obj)
+    if by == "key":
+        field = 0
+    elif by in ["value", "count"]:
+        field = 1
+    else:
+        raise Exception(f"unsupported by: {by}")
+    pairs = sorted(counts.items(), key=lambda pair: pair[field], reverse=reverse)
+    ordered = Counter()
+    for item, count in pairs:
+        ordered[item] = count
+    return (ordered, pairs) if return_list else ordered
 def bucket_count(length: List[int], step=50, skip_zero_count=False):
     grouped_count = []

xlin/text_util.py ADDED Viewed

@@ -0,0 +1,24 @@
+def text_is_all_chinese(test: str):
+    """Return True when every character lies in U+4E00..U+9FFF (also True for an empty string)."""
+    return all('\u4e00' <= ch <= '\u9fff' for ch in test)
+def text_contains_chinese(test: str):
+    """Return True when at least one character lies in U+4E00..U+9FFF."""
+    return any('\u4e00' <= ch <= '\u9fff' for ch in test)
+def pretty_limited_text(text: str, limited_length: int = 300, language="zh"):
+    """
+    Shorten a long text by keeping only its beginning and its end.
+    Args:
+        text: Value to shorten; it is converted with `str()` and stripped first.
+        limited_length: Maximum length; longer texts are cut to their first and
+            last halves of this length, joined without any marker in between.
+        language: Currently unused.
+    Returns:
+        The stripped text, shortened when it exceeds `limited_length`.
+    """
+    cleaned = str(text).strip()
+    if len(cleaned) <= limited_length:
+        return cleaned
+    head = cleaned[: limited_length // 2]
+    tail = cleaned[-limited_length // 2 :]
+    return head + tail

xlin/{timing.py → timing_util.py} RENAMED Viewed

@@ -41,3 +41,17 @@ class Timer:
     def __exit__(self, *args):
         self.end = time.perf_counter()
         self.interval = self.end - self.start
+# Manual smoke test: run this module directly to exercise Timer, Benchmark and @timing.
+if __name__ == "__main__":
+    # Timer stores the elapsed time of the with-block in `interval`.
+    with Timer() as t:
+        time.sleep(1)
+    print(t.interval)
+    # NOTE(review): Benchmark is defined outside this hunk; `b.time` is presumably its elapsed time -- confirm.
+    with Benchmark("Test Benchmark") as b:
+        time.sleep(1)
+    print(b.time)
+    # NOTE(review): `timing` is defined outside this hunk; presumably a decorator that reports the call duration -- confirm.
+    @timing
+    def test_function(x, y):
+        time.sleep(1)
+        return x + y
+    result = test_function(1, 2)
+    print(f"Result of test_function: {result}")

xlin-0.2.2.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,282 @@
+Metadata-Version: 2.1
+Name: xlin
+Version: 0.2.2
+Summary: toolbox for LinXueyuan
+License: MIT
+Author: LinXueyuanStdio
+Author-email: 23211526+LinXueyuanStdio@users.noreply.github.com
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: loguru
+Requires-Dist: pandas
+Requires-Dist: pyexcel
+Requires-Dist: pyexcel-xls
+Requires-Dist: pyexcel-xlsx
+Requires-Dist: pyyaml
+Requires-Dist: tqdm
+Requires-Dist: xlsxwriter
+Description-Content-Type: text/markdown
+# xlin
+Python 工具代码集合，提供了丰富的工具函数，涵盖文件操作、数据处理、多进程处理等多个方面，旨在提高开发效率。
+## 安装
+```bash
+pip install xlin --upgrade
+```
+## 使用方法
+```python
+from xlin import *
+```
+### 文件操作类：`ls`，`rm` 和 `cp`
+- `ls`: 列出文件或文件夹下的所有文件。
+- `rm`: 删除文件或文件夹。
+- `cp`: 复制文件或文件夹。
+```python
+from pathlib import Path
+from xlin import ls, rm, cp
+dir_path = "./data"
+dir_path = "/mnt/data.json"
+dir_path = "./data,/mnt/data.json"
+dir_path = ["./data", "/mnt/data.json", "./data,/mnt/data.json"]
+def filter_func(path: Path) -> bool:
+    return path.name.endswith('.json')
+filepaths: list[Path] = ls(dir_path, filter=filter_func)
+rm(dir_path)
+cp(dir_path, "./backup_data")  # 会根据最大公共前缀保持文件夹结构
+```
+### 读取类
+- `read_as_json_list`：读取 JSON 文件为列表。
+- `read_as_dataframe`：读取文件为表格。如果是文件夹，则读取文件夹下的所有文件为表格并拼接。
+- `read_as_dataframe_dict`：读取文件为字典，键为表头，值为列数据。
+- `load_text`：加载文本文件。
+- `load_yaml`：加载 YAML 文件。
+- `load_json`：加载 JSON 文件。
+- `load_json_list`：加载 JSON 列表文件。
+> `read_as_**` 函数支持文件夹或者文件，支持多种文件格式，包括 Excel、CSV、JSON、Parquet 等。
+>
+> `load_**` 函数主要用于加载单个文件，支持文本、YAML 和 JSON 格式。
+```python
+from xlin import *
+import pandas as pd
+dir_path = "./data"
+dir_path = "./data,data.xlsx,data.csv,data.json,data.jsonl,data.parquet,data.feather,data.pkl,data.h5,data.txt,data.tsv,data.xml,data.html,data.db"
+dir_path = "./data,/mnt/data.json"
+dir_path = ["./data", "/mnt/data.json", "./data,/mnt/data.json"]
+df_single = read_as_dataframe(dir_path)
+jsonlist = read_as_json_list(dir_path)
+df_dict = read_as_dataframe_dict(dir_path)  # xlsx or dirs
+for sheet_name, df in df_dict.items():
+    print(f"Sheet: {sheet_name}")
+    print(df)
+text = load_text("example.txt")
+yaml_data = load_yaml("example.yaml")
+json_data = load_json("example.json")
+json_list_data = load_json_list("example.jsonl")
+```
+### 保存类
+```python
+save_json(data, 'output.json')
+save_json_list(jsonlist, 'output.jsonl')
+save_df(df, 'output.xlsx')
+save_df_dict(df_dict, 'output.xlsx')  # 将 read_as_dataframe_dict 返回的字典保存为 Excel 文件。
+save_df_from_jsonlist(jsonlist, 'output_from_jsonlist.xlsx')
+append_to_json_list(data, 'output.jsonl')
+```
+### 并行处理类：`xmap`
+高效处理 JSON 列表，支持多进程/多线程。
+```python
+from xlin import xmap
+jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+def work_func(item):
+    item["text"] = item["text"].upper()
+    return item
+results = xmap(jsonlist, work_func, output_path="output.jsonl", batch_size=2)
+print(results)
+```
+### 合并多个文件：`merge_json_list`，`merge_multiple_df_dict`
+合并多个 JSONL 文件。
+```python
+from xlin import merge_json_list
+filenames = ['example1.jsonl', 'example2.jsonl']
+output_filename = 'merged.jsonl'
+merge_json_list(filenames, output_filename)
+```
+合并多个 `read_as_dataframe_dict` 返回的字典。
+```python
+from xlin import read_as_dataframe_dict, merge_multiple_df_dict
+df_dict1 = read_as_dataframe_dict('example1.xlsx')
+df_dict2 = read_as_dataframe_dict('example2.xlsx')
+merged_df_dict = merge_multiple_df_dict([df_dict1, df_dict2])
+for sheet_name, df in merged_df_dict.items():
+    print(f"Sheet: {sheet_name}")
+    print(df)
+```
+### 对 json 文件批量操作
+- 对 JSON 列表应用更改：`apply_changes_to_paths`，`apply_changes_to_jsonlist`
+```python
+from xlin import *
+paths = [Path('example1.jsonl'), Path('example2.jsonl')]
+jsonlist = [{"id": 1, "text": "Hello"}, {"id": 2, "text": "World"}]
+def change_func(row):
+    if row["id"] == 1:
+        row["text"] = "New Hello"
+        return "updated", row
+    return "unchanged", row
+changes = {"update_text": change_func}
+# 1. 对文件路径应用更改
+apply_changes_to_paths(paths, changes, save=True)
+# 2. 对 JSON 列表应用更改
+new_jsonlist, updated, deleted = apply_changes_to_jsonlist(jsonlist, changes)
+print(new_jsonlist)
+```
+### 生成器
+- 从多个文件中生成 JSON 列表的生成器：`generator_from_paths`
+```python
+from xlin import generator_from_paths
+from pathlib import Path
+paths = [Path('example1.jsonl'), Path('example2.jsonl')]
+for path, row in generator_from_paths(paths):
+    print(f"Path: {path}, Row: {row}")
+```
+### 数据转换
+- DataFrame 和 JSON 列表之间的转换：`dataframe_to_json_list` 和 `jsonlist_to_dataframe`
+```python
+from xlin import dataframe_to_json_list, jsonlist_to_dataframe
+import pandas as pd
+data = {'col1': [1, 2], 'col2': [3, 4]}
+df = pd.DataFrame(data)
+json_list = dataframe_to_json_list(df)
+print(json_list)
+new_df = jsonlist_to_dataframe(json_list)
+print(new_df)
+```
+### 分组
+- 对 DataFrame 进行分组：`grouped_col_list`、`grouped_col` 和 `grouped_row`
+```python
+from xlin import grouped_col_list, grouped_col, grouped_row
+import pandas as pd
+data = {'query': ['a', 'a', 'b'], 'output': [1, 2, 3]}
+df = pd.DataFrame(data)
+grouped_col_list_result = grouped_col_list(df)
+print(grouped_col_list_result)
+grouped_col_result = grouped_col(df)
+print(grouped_col_result)
+grouped_row_result = grouped_row(df)
+print(grouped_row_result)
+```
+- 对 JSON 列表进行分组：`grouped_row_in_jsonlist`
+```python
+from xlin import grouped_row_in_jsonlist
+jsonlist = [{"query": "a", "output": 1}, {"query": "a", "output": 2}, {"query": "b", "output": 3}]
+grouped_row_in_jsonlist_result = grouped_row_in_jsonlist(jsonlist)
+print(grouped_row_in_jsonlist_result)
+```
+### 工具类
+- `random_timestamp` 和 `random_datetime`：生成随机时间戳和随机的 `datetime` 对象。
+```python
+from xlin import random_timestamp, random_datetime
+timestamp = random_timestamp()
+print(timestamp)
+dt = random_datetime()
+print(dt)
+```
+- `df_dict_summary`: 对 `read_as_dataframe_dict` 返回的字典进行总结，返回一个 DataFrame 包含每个表的基本信息。
+```python
+from xlin import read_as_dataframe_dict, df_dict_summary
+df_dict = read_as_dataframe_dict('example.xlsx')
+summary = df_dict_summary(df_dict)
+print(summary)
+```
+- `text_is_all_chinese` 和 `text_contains_chinese`：判断文本是否全为中文或是否包含中文。
+```python
+from xlin import text_is_all_chinese, text_contains_chinese
+text1 = "你好"
+text2 = "Hello 你好"
+print(text_is_all_chinese(text1))  # True
+print(text_is_all_chinese(text2))  # False
+print(text_contains_chinese(text2))  # True
+```
+## 许可证
+本项目采用 MIT 许可证，详情请参阅 [LICENSE](LICENSE) 文件。
+## 作者
+- LinXueyuanStdio <23211526+LinXueyuanStdio@users.noreply.github.com>

xlin-0.2.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,17 @@
+xlin/__init__.py,sha256=CIhMAGhFgqwC6w16MzKcwo2mDjmaRUAcrlZFR3Am--I,321
+xlin/dataframe_util.py,sha256=zWpkGN-C9V9qVAVH8K4ElkPVu9pq4MjDbxwjJKSOO2o,12151
+xlin/datetime_util.py,sha256=MHi827LBuAOX6SSMb31staNBjmtnNOXwg7JDk73_pLU,6212
+xlin/file_util.py,sha256=mYTABNywdYoSfh1RLJcH7l1FzgKTFWN2-JZMFzv-ehw,7270
+xlin/image_util.py,sha256=hSNQ5suCrxFXpQwP-wfUT1ig3SfEdC6msuVp2k7J7b8,8438
+xlin/jsonlist_util.py,sha256=dLgrgrSTvg_1plVRCEnilajPM_s3vYdVx2bCTqrZAN8,11316
+xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
+xlin/multiprocess_util.py,sha256=-tskCWQlBBCOPycXLj9Y2MugYg-tHF_QYYWW7c1ixOk,17300
+xlin/statistic.py,sha256=ioJJjL4qwHiwNPeBFBB67keoAIbB-uZM51zkDYviar0,17037
+xlin/text_util.py,sha256=ejFD8-j8tLCbPlCPFg0Tu3MEMPEpF7R5_IpXXjl6qzA,735
+xlin/timing_util.py,sha256=nNVKtSXel-Cc8SF_BqPRNkyNDOjGqOMxTol-L1vpON4,1340
+xlin/xlsx_util.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
+xlin/yaml_util.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
+xlin-0.2.2.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
+xlin-0.2.2.dist-info/METADATA,sha256=gEhtB67hCGkiiEtGPSW9PbKXh2B2lEOoU1JWegGY1U8,7991
+xlin-0.2.2.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
+xlin-0.2.2.dist-info/RECORD,,

xlin/ischinese.py DELETED Viewed

@@ -1,13 +0,0 @@
-def text_is_all_chinese(test: str):
-    for ch in test:
-        if '\u4e00' <= ch <= '\u9fff':
-            continue
-        return False
-    return True
-def text_contains_chinese(test: str):
-    for ch in test:
-        if '\u4e00' <= ch <= '\u9fff':
-            return True
-    return False

xlin-0.1.38.dist-info/METADATA DELETED Viewed

@@ -1,33 +0,0 @@
-Metadata-Version: 2.1
-Name: xlin
-Version: 0.1.38
-Summary: toolbox for LinXueyuan
-License: MIT
-Author: LinXueyuanStdio
-Author-email: 23211526+LinXueyuanStdio@users.noreply.github.com
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 2
-Classifier: Programming Language :: Python :: 2.7
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.4
-Classifier: Programming Language :: Python :: 3.5
-Classifier: Programming Language :: Python :: 3.6
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: loguru
-Requires-Dist: pandas
-Requires-Dist: pyexcel
-Requires-Dist: pyexcel-xls
-Requires-Dist: pyexcel-xlsx
-Requires-Dist: pyyaml
-Requires-Dist: tqdm
-Requires-Dist: xlsxwriter
-Description-Content-Type: text/markdown
-# xlin
-个人 python 工具代码

xlin-0.1.38.dist-info/RECORD DELETED Viewed

@@ -1,15 +0,0 @@
-xlin/__init__.py,sha256=MWWCNPgJFS_oV2US52ULa4yg4Ku61qjn40NVKqcp9-c,248
-xlin/ischinese.py,sha256=Ia9IMQ6q-UHkdLwqS70L1fTnfSPbluFrv_I1UqsKquo,293
-xlin/jsonl.py,sha256=QLIipsORyMC5OlTW5yntNnXS1aZ4so984yT_c0elM80,10854
-xlin/metric.py,sha256=N7wJ35y-C-IaBr1I1CJ_37lTG7gA69zmn9Xg6xSwKoI,1690
-xlin/multiprocess_mapping.py,sha256=XZJLsYRHyNooeBFngnSZ6l_YhbK0xjbN1_nK8GonmkE,17290
-xlin/read_as_dataframe.py,sha256=ufTH1z-ewdE4X33trXQDWgvsjCE18hzRxSFEvoH7Eaw,9173
-xlin/statistic.py,sha256=nwFSN8BWfTQRimI-zfp6RwfA-I9aFDbemtV2cyh6Hq8,16533
-xlin/timing.py,sha256=XMT8dMcMolOMohDvAZOIM_BAiPMREhGQKnO1kc5s6PU,998
-xlin/util.py,sha256=HEDJv09tNmvHCgQdP4uhMkDM8fQgcuYa0MuMXZmyZns,10977
-xlin/xls2xlsx.py,sha256=uSmXcDvIhi5Sq0LGidMXy0wErNBXdjaoa6EftYVjTXs,947
-xlin/yaml.py,sha256=kICi7G3Td5q2MaSXXt85qNTWoHMgjzt7pvn7r3C4dME,183
-xlin-0.1.38.dist-info/LICENSE,sha256=60ys6rRtc1dZOP8UjSUr9fAqhZudT3WpKe5WbMCralM,1066
-xlin-0.1.38.dist-info/METADATA,sha256=1yJfCyje0O72bLoSxXYr2NMqThAz5pNrAhD-x9DOrJw,1098
-xlin-0.1.38.dist-info/WHEEL,sha256=IrRNNNJ-uuL1ggO5qMvT1GGhQVdQU54d6ZpYqEZfEWo,92
-xlin-0.1.38.dist-info/RECORD,,

/xlin/{xls2xlsx.py → xlsx_util.py} RENAMED Viewed

File without changes

/xlin/{yaml.py → yaml_util.py} RENAMED Viewed

File without changes

{xlin-0.1.38.dist-info → xlin-0.2.2.dist-info}/LICENSE RENAMED Viewed

File without changes

{xlin-0.1.38.dist-info → xlin-0.2.2.dist-info}/WHEEL RENAMED Viewed

File without changes

xlin 0.1.38__py2.py3-none-any.whl → 0.2.2__py2.py3-none-any.whl

xlin 0.1.38__py2.py3-none-any.whl → 0.2.2__py2.py3-none-any.whl