PyPI - bedrockx - Versions diffs - 0.1.0__tar.gz - Mend

bedrockx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

bedrockx-0.1.0/PKG-INFO +18 -0
bedrockx-0.1.0/README.md +7 -0
bedrockx-0.1.0/pyproject.toml +16 -0
bedrockx-0.1.0/setup.cfg +4 -0
bedrockx-0.1.0/src/bedrockx/__init__.py +7 -0
bedrockx-0.1.0/src/bedrockx/file/__init__.py +1 -0
bedrockx-0.1.0/src/bedrockx/file/utils.py +249 -0
bedrockx-0.1.0/src/bedrockx/process/__init__.py +2 -0
bedrockx-0.1.0/src/bedrockx/process/data_process.py +73 -0
bedrockx-0.1.0/src/bedrockx/process/multi_thread_process.py +74 -0
bedrockx-0.1.0/src/bedrockx/utils/__init__.py +2 -0
bedrockx-0.1.0/src/bedrockx/utils/log_manage.py +79 -0
bedrockx-0.1.0/src/bedrockx/utils/utils.py +26 -0
bedrockx-0.1.0/src/bedrockx.egg-info/PKG-INFO +18 -0
bedrockx-0.1.0/src/bedrockx.egg-info/SOURCES.txt +16 -0
bedrockx-0.1.0/src/bedrockx.egg-info/dependency_links.txt +1 -0
bedrockx-0.1.0/src/bedrockx.egg-info/requires.txt +4 -0
bedrockx-0.1.0/src/bedrockx.egg-info/top_level.txt +1 -0

bedrockx-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,18 @@
+Metadata-Version: 2.4
+Name: bedrockx
+Version: 0.1.0
+Summary: Add your description here
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: loguru>=0.7.3
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: openpyxl>=3.1.5
+Requires-Dist: pandas>=2.3.3
+# Introduction
+工作中经常用到的工具
+使用 `pip install bedrockx`  即可安装

bedrockx-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,7 @@
+# Introduction
+工作中经常用到的工具
+使用 `pip install bedrockx`  即可安装

bedrockx-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,16 @@
+[project]
+name = "bedrockx"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "loguru>=0.7.3",
+    "tqdm>=4.67.1",
+    "openpyxl>=3.1.5",
+    "pandas>=2.3.3",
+]
+[project.optional-dependencies]
+[tool.uv.workspace]

bedrockx-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

bedrockx-0.1.0/src/bedrockx/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""
+The bedrockx library provides tools to help you deal with data in Python.
+"""
+from .file import read_file, save_file, add_suffix_file, return_to_jsonl
+from .process import BaseMultiThreading, filter_data, remove_columns, drop_duplicates
+from .utils import singleton, LoggerManager, base_logger

bedrockx-0.1.0/src/bedrockx/file/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .utils import read_file, save_file, return_to_jsonl, add_suffix_file

bedrockx-0.1.0/src/bedrockx/file/utils.py ADDED Viewed

@@ -0,0 +1,249 @@
+# -*- encoding: utf-8 -*-
+# @Time    :   2025/10/11 19:57:41
+# @File    :   utils.py
+# @Author  :   ciaoyizhen
+# @Contact :   yizhen.ciao@gmail.com
+# @Function:   读取文件和保存文件
+import json
+import inspect
+from pathlib import Path
+from typing import List, Dict, Literal
+from tqdm import tqdm
+from functools import wraps
+from ..utils.log_manage import base_logger
+def read_file(file_name: str|Path, *, output_type="list", file_type=None, main_key_column=None, encoding="utf-8", disable_tqdm=False, **kwargs)-> List|Dict:
+    """读取文件，根据传参来判断读取的方式
+    最终返回完整的一个list
+    Args:
+        file_name (str|Path): 文件路径
+        output_type (Literal["list", "dict", "set"]): 返回类型,当该值为dict的时候,需要指定output_type
+        file_type (str): 文件类型,请使用`json`,`jsonl`,`xlsx`,`csv`,`txt`
+        encoding (str): 文件编码方式
+        key_columns (list): 需要取的列名
+        main_key_column (str): 当返回为dict时,这个为key,value为其他的值,类型为dict
+        output_type (Literal["list", "dict"]): 返回类型,当该值为dict的时候
+        disable_tqdm (bool): 是否关闭进度条
+        kwargs: 其他参数
+            - sheet_name (str): 读取xlsx时，可以指定读取哪个sheet_name
+    Returns:
+        list|dict|set: 根据output_type返回List|Dict|set
+    """
+    if isinstance(file_name, str):
+        file_name = Path(file_name)
+    if file_type is None:
+        file_type = file_name.suffix.lstrip(".")
+    match output_type:
+        case "list":
+            return_data = []
+        case "dict":
+            return_data = {}
+        case "set":
+            return_data = set()
+        case _:
+            raise RuntimeError(f"output_type 传入了一个不可预知的参数:{output_type=}\n目前仅允许`list`, `dict`, `set`")
+    match file_type:
+        case "jsonl":
+            with file_name.open("r", encoding=encoding) as f:
+                for line in tqdm(f.readlines(), disable=disable_tqdm):
+                    if line := line.strip():
+                        line = json.loads(line)
+                        if isinstance(return_data, list):
+                            return_data.append(line)
+                        elif isinstance(return_data, dict):
+                            if main_key_column not in line:
+                                raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                            value = line[main_key_column]
+                            return_data[value] = line
+                        elif isinstance(return_data, set):
+                            if main_key_column not in line:
+                                raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                            return_data.add(line[main_key_column])
+                return return_data
+        case "json":
+            with file_name.open("r", encoding=encoding) as f:
+                data = json.load(f)
+                assert isinstance(data, list), "理论上，这里应该是list[dict]结构，但是不是,请报告 https://github.com/ciaoyizhen/caoyizhen_basetool 让我知道!!!"
+                if isinstance(return_data, dict):
+                    for row in tqdm(data, disable=disable_tqdm):
+                        if main_key_column not in row:
+                            raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                        value = row[main_key_column]
+                        return_data[value] = row
+                    return return_data
+                elif isinstance(return_data, set):
+                    for row in tqdm(data, disable=disable_tqdm):
+                        if main_key_column not in row:
+                            raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                        return_data.add(row[main_key_column])
+                    return return_data
+                elif isinstance(return_data, list):
+                    return data
+        case "xlsx":
+            # 这里导包, 可以让不用pandas时不安装包
+            import pandas as pd
+            data = pd.read_excel(file_name, **kwargs)
+            if isinstance(return_data, list):
+                for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
+                    row = row.to_dict()
+                    return_data.append(row)
+            elif isinstance(return_data, dict):
+                for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
+                    row = row.to_dict()
+                    if main_key_column not in row:
+                        raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                    value = row[main_key_column]
+                    return_data[value] = row
+            elif isinstance(return_data, set):
+                for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
+                    row = row.to_dict()
+                    if main_key_column not in row:
+                        raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                    return_data.add(row[main_key_column])
+            return return_data
+        case "csv":
+            import pandas as pd
+            if encoding == "utf-8":  # 解决读取csv的编码问题
+                data = pd.read_csv(file_name, **kwargs)
+            else:
+                data = pd.read_csv(file_name, encoding=encoding, **kwargs)
+            if isinstance(return_data, list):
+                for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
+                    row = row.to_dict()
+                    return_data.append(row)
+            elif isinstance(return_data, dict):
+                for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
+                    row = row.to_dict()
+                    if main_key_column not in row:
+                        raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                    value = row[main_key_column]
+                    return_data[value] = row
+            elif isinstance(return_data, set):
+                for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
+                    row = row.to_dict()
+                    if main_key_column not in row:
+                        raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
+                    return_data.add(row[main_key_column])
+                return return_data
+            return return_data
+        case _:
+            raise RuntimeError(f"无法识别后缀{file_type=}是什么格式的文件,请传入file_type来控制或修改后缀名")
+def save_file(file_name: str|Path, data: list, file_type=None, *, encoding="utf-8", ensure_ascii=False, json_indent=4, pd_index=False,**kwargs):
+    if isinstance(file_name, str):
+        file_name = Path(file_name)
+    file_name.parent.mkdir(exist_ok=True, parents=True)
+    if file_type is None:
+        file_type = file_name.suffix.lstrip(".")
+    match file_type:
+        case "jsonl":
+            with file_name.open("w", encoding=encoding) as f:
+                for item in data:
+                    f.write(json.dumps(item, ensure_ascii=ensure_ascii) + "\n")
+            base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
+        case "json":
+            with file_name.open("w", encoding=encoding) as f:
+                json.dump(data, f, ensure_ascii=ensure_ascii, indent=json_indent)
+            base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
+        case "xlsx":
+            import pandas as pd
+            data = pd.DataFrame(data)
+            data.to_excel(file_name, **kwargs, index=pd_index)
+            base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
+        case "csv":
+            import pandas as pd
+            data = pd.DataFrame(data)
+            data.to_csv(file_name, **kwargs, index=pd_index)
+            base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
+        case _:
+            raise RuntimeError(f"保存文件识别,无法识别{file_type=},该保存成什么格式")
+def return_to_jsonl(file_path, encoding="utf-8", ensure_ascii=False):
+    """
+    兼容同步和异步函数的写入装饰器
+    """
+    def decorator(func):
+        def write_to_file(result):
+            if result is None:
+                return # 允许返回None时不写入
+            error_msg = f"被装饰器的函数需要有返回，并且必须是str或dict"
+            if isinstance(result, dict):
+                content = json.dumps(result, ensure_ascii=ensure_ascii)
+            elif isinstance(result, str):
+                content = result
+            else:
+                raise RuntimeError(error_msg)
+            # 确保父目录存在
+            Path(file_path).parent.mkdir(parents=True, exist_ok=True)
+            with open(file_path, "a", encoding=encoding) as f:
+                f.write(content + "\n")
+        if inspect.iscoroutinefunction(func):
+            @wraps(func)
+            async def wrapper(*args, **kwargs):
+                single_result = await func(*args, **kwargs)
+                write_to_file(single_result)
+                return single_result
+            return wrapper
+        else:
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                single_result = func(*args, **kwargs)
+                write_to_file(single_result)
+                return single_result
+            return wrapper
+    return decorator
+def add_suffix_file(file_path: str|Path, suffix: str, *, sep="_")-> Path:
+    """为文件添加真实后缀
+    example:
+    >>> file = "data.jsonl"
+    >>> print(add_suffix_file(file, "response"))
+    >>> Path("data_response.jsonl")
+    Args:
+        file_path (str|Path): _description_
+        suffix (str): _description_
+        sep (str): 分隔符
+    Returns:
+        Path: 路径
+    """
+    if isinstance(file_path, str):
+        file_path = Path(file_path)
+    new_name = f"{file_path.stem}{sep}{suffix}{file_path.suffix}"
+    return Path(new_name)

bedrockx-0.1.0/src/bedrockx/process/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .multi_thread_process import BaseMultiThreading
2	+ from .data_process import filter_data, drop_duplicates, remove_columns

bedrockx-0.1.0/src/bedrockx/process/data_process.py ADDED Viewed

@@ -0,0 +1,73 @@
+from ..utils.log_manage import base_logger
+from tqdm import tqdm
+def filter_data(data:list[dict], filter_set:set, main_key_column:str)-> list[dict]:
+    """将data中的main_key_columns字段根据filter_set的数据进行过滤
+    Args:
+        data (list): 待过滤的数据
+        filter_set (_type_): 需要过滤的数据
+        main_key_column (_type_): 待过滤数据的key
+    Returns:
+        list[dict]: 过滤后的数据
+    """
+    new_data = []
+    for item in data:
+        if main_key_column not in item:
+            raise RuntimeError(f"data中没有字段{main_key_column=}")
+        sub_item = item[main_key_column]
+        if sub_item in filter_set:
+            continue
+        new_data.append(item)
+    base_logger.info(f"原始数据大小:{len(data)}, 过滤后大小:{len(new_data)}")
+    return new_data
+def drop_duplicates(data: list[dict], main_key_column:str)-> list[dict]:
+    """去除data中 main_key_columns字段重复的数据
+    Args:
+        data (list): 由dict存储的数据
+        main_key_column (str): 需要去重的key
+    Returns:
+        list[dict]: 去重后的数据
+    """
+    temp_set = set()
+    new_data = []
+    for item in tqdm(data, desc="去重中"):
+        if main_key_column not in item:
+            base_logger.warning(f"不存在对应的key:{main_key_column=}\n{item=}\n已跳过")
+            continue
+        else:
+            key = item[main_key_column]
+            if key not in temp_set:
+                new_data.append(item)
+                temp_set.add(key)
+    return new_data
+def remove_columns(data: list[dict], key_list: list|str)-> list[dict]:
+    """删除data中对应key_list对应的数据
+    Args:
+        data (list): 由dict存储的数据
+        key_list (list|str): 需要删除的key
+    Returns:
+        list[dict]: 删除后的数据
+    """
+    if isinstance(key_list, str):
+        key_list = [key_list]
+    new_data = []
+    for item in tqdm(data, desc="删除对应列中"):
+        for k in key_list:
+            if k in item:
+                del item[k]
+        new_data.append(item)
+    return new_data

bedrockx-0.1.0/src/bedrockx/process/multi_thread_process.py ADDED Viewed

@@ -0,0 +1,74 @@
+# -*- encoding: utf-8 -*-
+# @Time    :   2025/10/11 22:16:26
+# @File    :   multi_thread_process.py
+# @Author  :   ciaoyizhen
+# @Contact :   yizhen.ciao@gmail.com
+# @Function:   多线程的消费者生产者进程处理
+import json
+from pathlib import Path
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from ..utils import base_logger
+from ..file import save_file, read_file
+from tqdm import tqdm
+class BaseMultiThreading():
+    """
+    基类, 实现多线程的消费者生产者的处理, 实现边处理边存储
+    """
+    def __init__(self, max_workers:int, save_path: str|Path=None, *, file_type:str|Path=None, **kwargs):
+        """_summary_
+        Args:
+            max_workers (int): 并发数
+            single_file_size (int): 临时存储时，单个文件的大小
+            save_path (str|Path): 最终完整保存的文件
+            file_type (str|Path): 文件存储类型
+        """
+        self.max_workers = max_workers
+        self.save_path = Path(save_path)
+        self.file_type = file_type
+        if self.file_type is None:
+            self.file_type = self.save_path.suffix.lstrip(".")
+        if self.file_type not in {"json", "jsonl", "xlsx", "csv"}:
+            raise RuntimeError(f"传入的file_type不符合要求或你的文件后缀不符合要求")
+        self.post_init(**kwargs)
+    def post_init(self, **kwargs):
+        pass
+    def single_data_process(self, item:dict)->dict:
+        """
+        这个函数实现单个数据怎么处理，输入是一个数据，进行处理，返回一个数据
+        需要用户自定义实现
+        """
+        raise NotImplementedError(f"未实现函数 single_data_process, 该函数需要解决每个数据要怎么")
+    def __call__(self, data:list):
+        with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="线程处理数据") as exec, \
+            tqdm(total=len(data), desc=f"{self.max_workers}并发处理中") as p_bar, \
+            open(self.save_path, "w", encoding="utf-8") as f:
+            try:
+                futures_list = []
+                for item in data:
+                    future = exec.submit(self.single_data_process, item)
+                    future.add_done_callback(lambda x: p_bar.update(1))
+                    futures_list.append(future)
+                for future in as_completed(futures_list):
+                    result = future.result()
+                    result = json.dumps(result, ensure_ascii=False)
+                    f.write(result + "\n")
+                    f.flush()
+            except KeyboardInterrupt:
+                exit()
+                exec.shutdown(cancel_futures=True)
+            except Exception:
+                import traceback
+                base_logger.error(traceback.format_exc())

bedrockx-0.1.0/src/bedrockx/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .utils import singleton
2	+ from .log_manage import LoggerManager, base_logger

bedrockx-0.1.0/src/bedrockx/utils/log_manage.py ADDED Viewed

@@ -0,0 +1,79 @@
+# -*- encoding: utf-8 -*-
+# @Time    :   2025/10/12 12:58:27
+# @File    :   log_manage.py
+# @Author  :   ciaoyizhen
+# @Contact :   yizhen.ciao@gmail.com
+# @Function:   日志的封装
+import os
+from loguru import logger
+from tqdm import tqdm
+class LoggerManager:
+    """简洁生产级 Loguru 封装类
+    用法：
+        log = LoggerManager("logs/app.log")
+        log.info("启动成功")
+        log.error("错误信息")
+    """
+    def __init__(
+        self,
+        log_path: str|None = None,
+        level: str = "INFO",
+        rotation: str = "10 MB",
+        retention: str = "7 days",
+        compression: str = "zip",
+        enqueue: bool = True,
+        console: bool = True,
+        *,
+        file_format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{file}:{function}:{line}</cyan> - <level>{message}</level>",
+        console_format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{file}:{function}:{line}</cyan> - <level>{message}</level>"
+    ):
+        logger.remove()  # 清空默认配置
+        # 文件日志
+        if log_path is not None:
+            os.makedirs(os.path.dirname(log_path), exist_ok=True)
+            logger.add(
+                log_path,
+                rotation=rotation,
+                retention=retention,
+                compression=compression,
+                enqueue=enqueue,  # ✅ 支持多进程
+                level=level,
+                encoding="utf-8",
+                format=file_format,
+            )
+        # 控制台日志（可选）
+        if console:
+            logger.add(
+                sink=lambda msg: tqdm.write(msg, end=""),
+                level=level,
+                format=console_format,
+            )
+        self._logger = logger
+    # ↓↓↓ 对 loguru 常用方法的封装 ↓↓↓
+    def debug(self, msg, *args, **kwargs):
+        self._logger.opt(depth=1).debug(msg, *args, **kwargs)
+    def info(self, msg, *args, **kwargs):
+        self._logger.opt(depth=1).info(msg, *args, **kwargs)
+    def warning(self, msg, *args, **kwargs):
+        self._logger.opt(depth=1).warning(msg, *args, **kwargs)
+    def error(self, msg, *args, **kwargs):
+        self._logger.opt(depth=1).error(msg, *args, **kwargs)
+    def critical(self, msg, *args, **kwargs):
+        self._logger.opt(depth=1).critical(msg, *args, **kwargs)
+    def exception(self, msg, *args, **kwargs):
+        self._logger.opt(depth=1).exception(msg, *args, **kwargs)
+base_logger = LoggerManager()

bedrockx-0.1.0/src/bedrockx/utils/utils.py ADDED Viewed

@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+# @Time    :   2025/10/12 12:48:14
+# @File    :   utils.py
+# @Author  :   ciaoyizhen
+# @Contact :   yizhen.ciao@gmail.com
+# @Function:   通用工具
+from functools import wraps
+from threading import Lock
+def singleton(cls):
+    """线程安全的单例模式装饰器"""
+    instances = {}
+    lock = Lock()
+    @wraps(cls)
+    def get_instance(*args, **kwargs):
+        # 双重检查锁，防止多线程竞争
+        if cls not in instances:
+            with lock:
+                if cls not in instances:
+                    instances[cls] = cls(*args, **kwargs)
+        return instances[cls]
+    return get_instance

bedrockx-0.1.0/src/bedrockx.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,18 @@
+Metadata-Version: 2.4
+Name: bedrockx
+Version: 0.1.0
+Summary: Add your description here
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: loguru>=0.7.3
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: openpyxl>=3.1.5
+Requires-Dist: pandas>=2.3.3
+# Introduction
+工作中经常用到的工具
+使用 `pip install bedrockx`  即可安装

bedrockx-0.1.0/src/bedrockx.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,16 @@
+README.md
+pyproject.toml
+src/bedrockx/__init__.py
+src/bedrockx.egg-info/PKG-INFO
+src/bedrockx.egg-info/SOURCES.txt
+src/bedrockx.egg-info/dependency_links.txt
+src/bedrockx.egg-info/requires.txt
+src/bedrockx.egg-info/top_level.txt
+src/bedrockx/file/__init__.py
+src/bedrockx/file/utils.py
+src/bedrockx/process/__init__.py
+src/bedrockx/process/data_process.py
+src/bedrockx/process/multi_thread_process.py
+src/bedrockx/utils/__init__.py
+src/bedrockx/utils/log_manage.py
+src/bedrockx/utils/utils.py

bedrockx-0.1.0/src/bedrockx.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

bedrockx-0.1.0/src/bedrockx.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,4 @@
+loguru>=0.7.3
+tqdm>=4.67.1
+openpyxl>=3.1.5
+pandas>=2.3.3

bedrockx-0.1.0/src/bedrockx.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ bedrockx