bedrockx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: bedrockx
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: loguru>=0.7.3
8
+ Requires-Dist: tqdm>=4.67.1
9
+ Requires-Dist: openpyxl>=3.1.5
10
+ Requires-Dist: pandas>=2.3.3
11
+
12
+ # Introduction
13
+
14
+ 工作中经常用到的工具
15
+
16
+
17
+ 使用 `pip install bedrockx` 即可安装
18
+
@@ -0,0 +1,7 @@
1
+ # Introduction
2
+
3
+ 工作中经常用到的工具
4
+
5
+
6
+ 使用 `pip install bedrockx` 即可安装
7
+
@@ -0,0 +1,16 @@
1
+ [project]
2
+ name = "bedrockx"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "loguru>=0.7.3",
9
+ "tqdm>=4.67.1",
10
+ "openpyxl>=3.1.5",
11
+ "pandas>=2.3.3",
12
+ ]
13
+
14
+ [project.optional-dependencies]
15
+
16
+ [tool.uv.workspace]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,7 @@
1
+ """
2
+ The caoyizhen_basetool library provides tools to help you deal with data in Python.
3
+ """
4
+
5
+ from .file import read_file, save_file, add_suffix_file, return_to_jsonl
6
+ from .process import BaseMultiThreading, filter_data, remove_columns, drop_duplicates
7
+ from .utils import singleton, LoggerManager, base_logger
@@ -0,0 +1 @@
1
+ from .utils import read_file, save_file, return_to_jsonl, add_suffix_file
@@ -0,0 +1,249 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # @Time : 2025/10/11 19:57:41
3
+ # @File : utils.py
4
+ # @Author : ciaoyizhen
5
+ # @Contact : yizhen.ciao@gmail.com
6
+ # @Function: 读取文件和保存文件
7
+ import json
8
+ import inspect
9
+ from pathlib import Path
10
+ from typing import List, Dict, Literal
11
+ from tqdm import tqdm
12
+ from functools import wraps
13
+ from ..utils.log_manage import base_logger
14
+
15
+
16
+ def read_file(file_name: str|Path, *, output_type="list", file_type=None, main_key_column=None, encoding="utf-8", disable_tqdm=False, **kwargs)-> List|Dict:
17
+ """读取文件,根据传参来判断读取的方式
18
+ 最终返回完整的一个list
19
+
20
+ Args:
21
+ file_name (str|Path): 文件路径
22
+
23
+ output_type (Literal["list", "dict", "set"]): 返回类型,当该值为dict的时候,需要指定output_type
24
+ file_type (str): 文件类型,请使用`json`,`jsonl`,`xlsx`,`csv`,`txt`
25
+ encoding (str): 文件编码方式
26
+ key_columns (list): 需要取的列名
27
+ main_key_column (str): 当返回为dict时,这个为key,value为其他的值,类型为dict
28
+ output_type (Literal["list", "dict"]): 返回类型,当该值为dict的时候
29
+ disable_tqdm (bool): 是否关闭进度条
30
+
31
+ kwargs: 其他参数
32
+ - sheet_name (str): 读取xlsx时,可以指定读取哪个sheet_name
33
+
34
+ Returns:
35
+ list|dict|set: 根据output_type返回List|Dict|set
36
+ """
37
+
38
+ if isinstance(file_name, str):
39
+ file_name = Path(file_name)
40
+
41
+ if file_type is None:
42
+ file_type = file_name.suffix.lstrip(".")
43
+
44
+ match output_type:
45
+ case "list":
46
+ return_data = []
47
+ case "dict":
48
+ return_data = {}
49
+ case "set":
50
+ return_data = set()
51
+ case _:
52
+ raise RuntimeError(f"output_type 传入了一个不可预知的参数:{output_type=}\n目前仅允许`list`, `dict`, `set`")
53
+
54
+
55
+
56
+ match file_type:
57
+ case "jsonl":
58
+ with file_name.open("r", encoding=encoding) as f:
59
+ for line in tqdm(f.readlines(), disable=disable_tqdm):
60
+ if line := line.strip():
61
+ line = json.loads(line)
62
+ if isinstance(return_data, list):
63
+ return_data.append(line)
64
+ elif isinstance(return_data, dict):
65
+ if main_key_column not in line:
66
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
67
+
68
+ value = line[main_key_column]
69
+ return_data[value] = line
70
+ elif isinstance(return_data, set):
71
+ if main_key_column not in line:
72
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
73
+ return_data.add(line[main_key_column])
74
+ return return_data
75
+
76
+ case "json":
77
+ with file_name.open("r", encoding=encoding) as f:
78
+ data = json.load(f)
79
+ assert isinstance(data, list), "理论上,这里应该是list[dict]结构,但是不是,请报告 https://github.com/ciaoyizhen/caoyizhen_basetool 让我知道!!!"
80
+ if isinstance(return_data, dict):
81
+ for row in tqdm(data, disable=disable_tqdm):
82
+ if main_key_column not in row:
83
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
84
+ value = row[main_key_column]
85
+ return_data[value] = row
86
+
87
+ return return_data
88
+ elif isinstance(return_data, set):
89
+ for row in tqdm(data, disable=disable_tqdm):
90
+ if main_key_column not in row:
91
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
92
+ return_data.add(row[main_key_column])
93
+ return return_data
94
+
95
+ elif isinstance(return_data, list):
96
+ return data
97
+
98
+ case "xlsx":
99
+ # 这里导包, 可以让不用pandas时不安装包
100
+ import pandas as pd
101
+ data = pd.read_excel(file_name, **kwargs)
102
+
103
+ if isinstance(return_data, list):
104
+ for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
105
+ row = row.to_dict()
106
+ return_data.append(row)
107
+ elif isinstance(return_data, dict):
108
+ for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
109
+ row = row.to_dict()
110
+ if main_key_column not in row:
111
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
112
+ value = row[main_key_column]
113
+ return_data[value] = row
114
+ elif isinstance(return_data, set):
115
+ for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
116
+ row = row.to_dict()
117
+ if main_key_column not in row:
118
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
119
+ return_data.add(row[main_key_column])
120
+ return return_data
121
+
122
+
123
+ case "csv":
124
+ import pandas as pd
125
+ if encoding == "utf-8": # 解决读取csv的编码问题
126
+ data = pd.read_csv(file_name, **kwargs)
127
+ else:
128
+ data = pd.read_csv(file_name, encoding=encoding, **kwargs)
129
+
130
+ if isinstance(return_data, list):
131
+ for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
132
+ row = row.to_dict()
133
+ return_data.append(row)
134
+ elif isinstance(return_data, dict):
135
+ for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
136
+ row = row.to_dict()
137
+
138
+ if main_key_column not in row:
139
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
140
+ value = row[main_key_column]
141
+ return_data[value] = row
142
+ elif isinstance(return_data, set):
143
+ for _, row in tqdm(data.iterrows(), total=data.shape[0], disable=disable_tqdm):
144
+ row = row.to_dict()
145
+ if main_key_column not in row:
146
+ raise RuntimeError(f"对象没有{main_key_column=}\n原始数据:{row}")
147
+ return_data.add(row[main_key_column])
148
+ return return_data
149
+ return return_data
150
+
151
+ case _:
152
+ raise RuntimeError(f"无法识别后缀{file_type=}是什么格式的文件,请传入file_type来控制或修改后缀名")
153
+
154
+
155
+ def save_file(file_name: str|Path, data: list, file_type=None, *, encoding="utf-8", ensure_ascii=False, json_indent=4, pd_index=False,**kwargs):
156
+ if isinstance(file_name, str):
157
+ file_name = Path(file_name)
158
+
159
+ file_name.parent.mkdir(exist_ok=True, parents=True)
160
+ if file_type is None:
161
+ file_type = file_name.suffix.lstrip(".")
162
+
163
+ match file_type:
164
+ case "jsonl":
165
+ with file_name.open("w", encoding=encoding) as f:
166
+ for item in data:
167
+ f.write(json.dumps(item, ensure_ascii=ensure_ascii) + "\n")
168
+ base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
169
+ case "json":
170
+ with file_name.open("w", encoding=encoding) as f:
171
+ json.dump(data, f, ensure_ascii=ensure_ascii, indent=json_indent)
172
+ base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
173
+ case "xlsx":
174
+ import pandas as pd
175
+ data = pd.DataFrame(data)
176
+ data.to_excel(file_name, **kwargs, index=pd_index)
177
+ base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
178
+ case "csv":
179
+ import pandas as pd
180
+ data = pd.DataFrame(data)
181
+ data.to_csv(file_name, **kwargs, index=pd_index)
182
+ base_logger.info(f"文件保存至 {file_name.resolve(strict=True)} ")
183
+ case _:
184
+ raise RuntimeError(f"保存文件识别,无法识别{file_type=},该保存成什么格式")
185
+
186
def return_to_jsonl(file_path, encoding="utf-8", ensure_ascii=False):
    """Build a decorator that appends each return value to ``file_path``.

    Works for both plain and ``async def`` functions. A dict result is
    serialised with ``json.dumps``; a str result is written unchanged; a None
    result is not written. Any other result type raises RuntimeError. The
    wrapped function's return value is passed through to the caller.
    """
    def _append_line(value):
        # None means "nothing to record" for this call.
        if value is None:
            return

        if isinstance(value, dict):
            text = json.dumps(value, ensure_ascii=ensure_ascii)
        elif isinstance(value, str):
            text = value
        else:
            raise RuntimeError("被装饰器的函数需要有返回,并且必须是str或dict")

        # Make sure the target directory exists before appending.
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

        with open(file_path, "a", encoding=encoding) as out:
            out.write(text + "\n")

    def decorator(func):
        if not inspect.iscoroutinefunction(func):
            @wraps(func)
            def sync_wrapper(*args, **kwargs):
                outcome = func(*args, **kwargs)
                _append_line(outcome)
                return outcome
            return sync_wrapper

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            outcome = await func(*args, **kwargs)
            _append_line(outcome)
            return outcome
        return async_wrapper

    return decorator
224
+
225
+
226
+
227
+
228
+
229
+ def add_suffix_file(file_path: str|Path, suffix: str, *, sep="_")-> Path:
230
+ """为文件添加真实后缀
231
+ example:
232
+ >>> file = "data.jsonl"
233
+ >>> print(add_suffix_file(file, "response"))
234
+ >>> Path("data_response.jsonl")
235
+
236
+ Args:
237
+ file_path (str|Path): _description_
238
+ suffix (str): _description_
239
+ sep (str): 分隔符
240
+
241
+ Returns:
242
+ Path: 路径
243
+ """
244
+ if isinstance(file_path, str):
245
+ file_path = Path(file_path)
246
+
247
+ new_name = f"{file_path.stem}{sep}{suffix}{file_path.suffix}"
248
+
249
+ return Path(new_name)
@@ -0,0 +1,2 @@
1
+ from .multi_thread_process import BaseMultiThreading
2
+ from .data_process import filter_data, drop_duplicates, remove_columns
@@ -0,0 +1,73 @@
1
+
2
+ from ..utils.log_manage import base_logger
3
+ from tqdm import tqdm
4
+
5
def filter_data(data:list[dict], filter_set:set, main_key_column:str)-> list[dict]:
    """Drop every item whose ``main_key_column`` value is in ``filter_set``.

    Args:
        data (list[dict]): Items to filter.
        filter_set (set): Values that cause an item to be removed.
        main_key_column (str): Key whose value is tested against ``filter_set``.

    Returns:
        list[dict]: The items that were kept, in their original order.

    Raises:
        RuntimeError: An item does not contain ``main_key_column``.
    """
    kept = []
    for record in data:
        if main_key_column in record:
            if record[main_key_column] not in filter_set:
                kept.append(record)
        else:
            raise RuntimeError(f"data中没有字段{main_key_column=}")

    # Report how many items survived the filter.
    base_logger.info(f"原始数据大小:{len(data)}, 过滤后大小:{len(kept)}")
    return kept
26
+
27
+
28
def drop_duplicates(data: list[dict], main_key_column:str)-> list[dict]:
    """Keep only the first item for each distinct ``main_key_column`` value.

    Items that do not contain ``main_key_column`` are skipped with a warning.

    Args:
        data (list[dict]): Items stored as dicts.
        main_key_column (str): Key whose value decides whether two items are
            duplicates.

    Returns:
        list[dict]: The de-duplicated items, in first-seen order.
    """
    seen = set()
    unique = []
    for item in tqdm(data, desc="去重中"):
        if main_key_column in item:
            key = item[main_key_column]
            if key in seen:
                continue
            unique.append(item)
            seen.add(key)
        else:
            base_logger.warning(f"不存在对应的key:{main_key_column=}\n{item=}\n已跳过")
    return unique
50
+
51
+
52
def remove_columns(data: list[dict], key_list: list|str)-> list[dict]:
    """Remove the given keys from every item in ``data``.

    The dicts are modified in place; the returned list holds the same dict
    objects as ``data``.

    Args:
        data (list[dict]): Items stored as dicts.
        key_list (list|str): Key, or list of keys, to delete. Keys that an
            item does not have are ignored.

    Returns:
        list[dict]: The items after the keys have been removed.
    """
    keys = [key_list] if isinstance(key_list, str) else key_list

    cleaned = []
    for record in tqdm(data, desc="删除对应列中"):
        for key in keys:
            # pop with a default is a no-op when the key is absent.
            record.pop(key, None)
        cleaned.append(record)
    return cleaned
@@ -0,0 +1,74 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # @Time : 2025/10/11 22:16:26
3
+ # @File : multi_thread_process.py
4
+ # @Author : ciaoyizhen
5
+ # @Contact : yizhen.ciao@gmail.com
6
+ # @Function: 多线程的消费者生产者进程处理
7
+ import json
8
+ from pathlib import Path
9
+ from concurrent.futures import as_completed, ThreadPoolExecutor
10
+ from ..utils import base_logger
11
+ from ..file import save_file, read_file
12
+ from tqdm import tqdm
13
+
14
+
15
+
16
+ class BaseMultiThreading():
17
+ """
18
+ 基类, 实现多线程的消费者生产者的处理, 实现边处理边存储
19
+ """
20
+ def __init__(self, max_workers:int, save_path: str|Path=None, *, file_type:str|Path=None, **kwargs):
21
+ """_summary_
22
+
23
+ Args:
24
+ max_workers (int): 并发数
25
+ single_file_size (int): 临时存储时,单个文件的大小
26
+ save_path (str|Path): 最终完整保存的文件
27
+ file_type (str|Path): 文件存储类型
28
+ """
29
+ self.max_workers = max_workers
30
+ self.save_path = Path(save_path)
31
+ self.file_type = file_type
32
+
33
+ if self.file_type is None:
34
+ self.file_type = self.save_path.suffix.lstrip(".")
35
+
36
+ if self.file_type not in {"json", "jsonl", "xlsx", "csv"}:
37
+ raise RuntimeError(f"传入的file_type不符合要求或你的文件后缀不符合要求")
38
+
39
+ self.post_init(**kwargs)
40
+
41
+ def post_init(self, **kwargs):
42
+ pass
43
+
44
+ def single_data_process(self, item:dict)->dict:
45
+ """
46
+ 这个函数实现单个数据怎么处理,输入是一个数据,进行处理,返回一个数据
47
+ 需要用户自定义实现
48
+ """
49
+ raise NotImplementedError(f"未实现函数 single_data_process, 该函数需要解决每个数据要怎么")
50
+
51
+
52
+ def __call__(self, data:list):
53
+ with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="线程处理数据") as exec, \
54
+ tqdm(total=len(data), desc=f"{self.max_workers}并发处理中") as p_bar, \
55
+ open(self.save_path, "w", encoding="utf-8") as f:
56
+ try:
57
+ futures_list = []
58
+ for item in data:
59
+ future = exec.submit(self.single_data_process, item)
60
+ future.add_done_callback(lambda x: p_bar.update(1))
61
+ futures_list.append(future)
62
+
63
+ for future in as_completed(futures_list):
64
+ result = future.result()
65
+ result = json.dumps(result, ensure_ascii=False)
66
+ f.write(result + "\n")
67
+ f.flush()
68
+ except KeyboardInterrupt:
69
+ exit()
70
+ exec.shutdown(cancel_futures=True)
71
+ except Exception:
72
+ import traceback
73
+ base_logger.error(traceback.format_exc())
74
+
@@ -0,0 +1,2 @@
1
+ from .utils import singleton
2
+ from .log_manage import LoggerManager, base_logger
@@ -0,0 +1,79 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # @Time : 2025/10/12 12:58:27
3
+ # @File : log_manage.py
4
+ # @Author : ciaoyizhen
5
+ # @Contact : yizhen.ciao@gmail.com
6
+ # @Function: 日志的封装
7
+
8
+ import os
9
+ from loguru import logger
10
+ from tqdm import tqdm
11
+
12
class LoggerManager:
    """Small wrapper that configures the shared loguru ``logger``.

    Usage:
        log = LoggerManager("logs/app.log")
        log.info("started")
        log.error("something failed")
    """

    # Default layout shared by the file sink and the console sink.
    _DEFAULT_FORMAT = "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{file}:{function}:{line}</cyan> - <level>{message}</level>"

    def __init__(
        self,
        log_path: str|None = None,
        level: str = "INFO",
        rotation: str = "10 MB",
        retention: str = "7 days",
        compression: str = "zip",
        enqueue: bool = True,
        console: bool = True,
        *,
        file_format=_DEFAULT_FORMAT,
        console_format=_DEFAULT_FORMAT
    ):
        """Reset the loguru logger and attach the requested sinks.

        Args:
            log_path (str|None): File to log to; no file sink when None.
            level (str): Minimum level for both sinks.
            rotation (str): Passed to ``logger.add`` for the file sink.
            retention (str): Passed to ``logger.add`` for the file sink.
            compression (str): Passed to ``logger.add`` for the file sink.
            enqueue (bool): Passed to ``logger.add`` for the file sink.
            console (bool): Whether to add a console sink that writes through
                ``tqdm.write``.
            file_format (str): Record format for the file sink.
            console_format (str): Record format for the console sink.
        """
        # Drop every handler already registered, including loguru's default.
        logger.remove()

        # File sink
        if log_path is not None:
            log_dir = os.path.dirname(log_path)
            # A bare file name has no directory part; makedirs("") would raise.
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)
            logger.add(
                log_path,
                rotation=rotation,
                retention=retention,
                compression=compression,
                enqueue=enqueue,  # queue-based writes, intended for multi-process use
                level=level,
                encoding="utf-8",
                format=file_format,
            )

        # Console sink (optional); tqdm.write keeps progress bars intact.
        if console:
            logger.add(
                sink=lambda msg: tqdm.write(msg, end=""),
                level=level,
                format=console_format,
            )
        self._logger = logger

    # Thin wrappers over the common loguru methods. opt(depth=1) makes the
    # record point at the caller of the wrapper instead of this class.
    def debug(self, msg, *args, **kwargs):
        self._logger.opt(depth=1).debug(msg, *args, **kwargs)

    def info(self, msg, *args, **kwargs):
        self._logger.opt(depth=1).info(msg, *args, **kwargs)

    def warning(self, msg, *args, **kwargs):
        self._logger.opt(depth=1).warning(msg, *args, **kwargs)

    def error(self, msg, *args, **kwargs):
        self._logger.opt(depth=1).error(msg, *args, **kwargs)

    def critical(self, msg, *args, **kwargs):
        self._logger.opt(depth=1).critical(msg, *args, **kwargs)

    def exception(self, msg, *args, **kwargs):
        self._logger.opt(depth=1).exception(msg, *args, **kwargs)
78
+
79
+ base_logger = LoggerManager()
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # @Time : 2025/10/12 12:48:14
3
+ # @File : utils.py
4
+ # @Author : ciaoyizhen
5
+ # @Contact : yizhen.ciao@gmail.com
6
+ # @Function: 通用工具
7
+ from functools import wraps
8
+ from threading import Lock
9
+
10
def singleton(cls):
    """Class decorator that creates one instance and returns it on every call.

    The first call constructs ``cls`` with the given arguments under a lock;
    later calls return that same object and ignore their arguments.
    """
    unset = object()
    guard = Lock()
    cached = unset

    @wraps(cls)
    def get_instance(*args, **kwargs):
        nonlocal cached
        # Check once without the lock, then again while holding it, so only
        # one thread ever runs the constructor.
        if cached is unset:
            with guard:
                if cached is unset:
                    cached = cls(*args, **kwargs)
        return cached

    return get_instance
25
+
26
+
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: bedrockx
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: loguru>=0.7.3
8
+ Requires-Dist: tqdm>=4.67.1
9
+ Requires-Dist: openpyxl>=3.1.5
10
+ Requires-Dist: pandas>=2.3.3
11
+
12
+ # Introduction
13
+
14
+ 工作中经常用到的工具
15
+
16
+
17
+ 使用 `pip install bedrockx` 即可安装
18
+
@@ -0,0 +1,16 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/bedrockx/__init__.py
4
+ src/bedrockx.egg-info/PKG-INFO
5
+ src/bedrockx.egg-info/SOURCES.txt
6
+ src/bedrockx.egg-info/dependency_links.txt
7
+ src/bedrockx.egg-info/requires.txt
8
+ src/bedrockx.egg-info/top_level.txt
9
+ src/bedrockx/file/__init__.py
10
+ src/bedrockx/file/utils.py
11
+ src/bedrockx/process/__init__.py
12
+ src/bedrockx/process/data_process.py
13
+ src/bedrockx/process/multi_thread_process.py
14
+ src/bedrockx/utils/__init__.py
15
+ src/bedrockx/utils/log_manage.py
16
+ src/bedrockx/utils/utils.py
@@ -0,0 +1,4 @@
1
+ loguru>=0.7.3
2
+ tqdm>=4.67.1
3
+ openpyxl>=3.1.5
4
+ pandas>=2.3.3
@@ -0,0 +1 @@
1
+ bedrockx