coze-coding-utils 0.2.1__py3-none-any.whl → 0.2.2a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coze_coding_utils/__init__.py +1 -1
- coze_coding_utils/error/__init__.py +31 -0
- coze_coding_utils/error/classifier.py +320 -0
- coze_coding_utils/error/codes.py +356 -0
- coze_coding_utils/error/exceptions.py +439 -0
- coze_coding_utils/error/patterns.py +939 -0
- coze_coding_utils/error/test_classifier.py +0 -0
- coze_coding_utils/file/__init__.py +0 -0
- coze_coding_utils/file/file.py +327 -0
- coze_coding_utils/helper/__init__.py +0 -0
- coze_coding_utils/helper/agent_helper.py +599 -0
- coze_coding_utils/helper/graph_helper.py +231 -0
- coze_coding_utils/log/__init__.py +0 -0
- coze_coding_utils/log/common.py +8 -0
- coze_coding_utils/log/config.py +10 -0
- coze_coding_utils/log/err_trace.py +88 -0
- coze_coding_utils/log/loop_trace.py +72 -0
- coze_coding_utils/log/node_log.py +487 -0
- coze_coding_utils/log/parser.py +255 -0
- coze_coding_utils/log/write_log.py +183 -0
- coze_coding_utils/messages/__init__.py +0 -0
- coze_coding_utils/messages/client.py +48 -0
- coze_coding_utils/messages/server.py +173 -0
- coze_coding_utils/openai/__init__.py +5 -0
- coze_coding_utils/openai/converter/__init__.py +6 -0
- coze_coding_utils/openai/converter/request_converter.py +165 -0
- coze_coding_utils/openai/converter/response_converter.py +467 -0
- coze_coding_utils/openai/handler.py +298 -0
- coze_coding_utils/openai/types/__init__.py +37 -0
- coze_coding_utils/openai/types/request.py +24 -0
- coze_coding_utils/openai/types/response.py +178 -0
- {coze_coding_utils-0.2.1.dist-info → coze_coding_utils-0.2.2a1.dist-info}/METADATA +2 -2
- coze_coding_utils-0.2.2a1.dist-info/RECORD +37 -0
- coze_coding_utils-0.2.1.dist-info/RECORD +0 -7
- {coze_coding_utils-0.2.1.dist-info → coze_coding_utils-0.2.2a1.dist-info}/WHEEL +0 -0
- {coze_coding_utils-0.2.1.dist-info → coze_coding_utils-0.2.2a1.dist-info}/licenses/LICENSE +0 -0
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
import chardet
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from typing import Literal,Callable, Any, Optional,Union
|
|
6
|
+
from pydantic import BaseModel, Field, field_validator,PrivateAttr
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from pptx import Presentation
|
|
9
|
+
|
|
10
|
+
# Maximum file size in bytes enforced on remote downloads (10 MiB).
# NOTE(review): error messages in FileOps mention "5MB" — confirm which
# limit is actually intended.
MAX_FILE_SIZE = 10 * 1024 * 1024
|
|
11
|
+
|
|
12
|
+
class File(BaseModel):
    """
    Generic file object supporting automatic type inference and cache-path
    management.

    ``url`` may be a remote http/https URL or a local filesystem path;
    ``is_remote`` distinguishes the two cases.
    """
    # Remote URL (http/https) or local filesystem path.
    url: str = Field(..., description="文件URL(http/https)或本地路径")
    # Coarse file category; see infer_file_category() for how it is derived.
    file_type: Literal['image', 'video', 'audio', 'document', 'default'] = Field(
        default="default",
        description="文件类型"
    )
    # Local cache path, populated by set_cache_path(); excluded from the
    # serialized model because it is a PrivateAttr.
    _local_path: Optional[str] = PrivateAttr(default=None)

    def set_cache_path(self, path: str) -> None:
        """Record the local cache path for this file."""
        self._local_path = path

    def get_cache_path(self) -> Optional[str]:
        """Return the cached local path, or None if none has been set.

        NOTE(review): despite the original comment ("if the file actually
        exists"), existence is NOT verified here — confirm callers check it.
        """
        return self._local_path

    @property
    def is_remote(self) -> bool:
        """True when ``url`` is a network URL (http/https), False for local paths."""
        return self.url.startswith(('http://', 'https://'))
|
|
35
|
+
|
|
36
|
+
# Reverse lookup: lower-case extension (without dot) -> category.  Built once
# at import time so each call is a single O(1) dict lookup instead of
# rebuilding the mapping and scanning every category on every invocation.
_EXTENSION_CATEGORIES: dict[str, str] = {
    ext: category
    for category, extensions in {
        'image': {
            'apng', 'avif', 'bmp', 'gif', 'heic', 'ico', 'jpg', 'jpeg', 'png', 'svg', 'tiff', 'webp'
        },
        'video': {
            'mp4', 'avi', 'mov', 'mkv', 'flv', 'wmv', 'webm', 'm4v', '3gp'
        },
        'audio': {
            'mp3', 'wav', 'flac', 'aac', 'ogg', 'wma', 'm4a'
        },
        'document': {
            'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
            'txt', 'md', 'csv', 'json', 'xml', 'html', 'htm'
        },
    }.items()
    for ext in extensions
}


def infer_file_category(path_or_url: str) -> tuple[str, str]:
    """
    Infer a file's category from the suffix of a path or URL.

    Steps:
    1. Parse the URL to strip query parameters (?id=...) and keep the path.
    2. Take the last path component and split off its extension.
    3. Look the extension up in the category table; unknown -> 'default'.

    Returns:
        - category: one of 'image', 'video', 'audio', 'document', 'default'
        - suffix with leading dot in its original case (e.g. '.pdf', '.JPG'),
          or "" when the name has no suffix.
    """
    # urlparse handles both local paths (treated as the path component) and
    # network URLs, dropping scheme/host/query in one pass.
    parsed = urlparse(path_or_url)

    # e.g. /a/b/test.jpg -> test.jpg -> ('test', '.jpg')
    filename = os.path.basename(parsed.path)
    _, ext_with_dot = os.path.splitext(filename)

    # No suffix at all: nothing to classify.
    if not ext_with_dot:
        return 'default', ""

    # Normalize for the lookup only; the returned suffix keeps original case.
    ext = ext_with_dot.lstrip('.').lower()

    return _EXTENSION_CATEGORIES.get(ext, 'default'), ext_with_dot
|
|
91
|
+
|
|
92
|
+
class FileOps:
    """
    Stateless helpers for fetching, persisting and text-extracting ``File``
    objects.  All methods are static; remote reads are capped at
    MAX_FILE_SIZE bytes.
    """

    # Directory used by save_to_local() for downloaded files.
    DOWNLOAD_DIR = "/tmp"

    @staticmethod
    def read_content(file_obj: File, max_length=10000) -> str:
        """Placeholder for bounded text reading; currently always returns ""."""
        return ""

    @staticmethod
    def get_local_path(file_obj: File) -> str:
        """Return a local path for the file.

        NOTE(review): returns the URL verbatim even for remote files —
        confirm callers expect this (save_to_local performs a real download).
        """
        return file_obj.url

    @staticmethod
    def _get_bytes_stream(file_obj: File) -> tuple[bytes, str]:
        """
        Return ``(content_bytes, extension_with_dot)`` for the file.

        Remote files are streamed with a MAX_FILE_SIZE cap, checked both
        against the declared Content-Length and while streaming (the header
        can be missing or wrong).  Raises Exception on oversize content,
        RuntimeError on network failure, FileNotFoundError for a missing
        local file.
        """
        _, ext = infer_file_category(file_obj.url)

        if not file_obj.is_remote:
            if not os.path.exists(file_obj.url):
                raise FileNotFoundError(f"本地文件不存在: {file_obj.url}")

            # NOTE(review): local files are deliberately NOT size-limited
            # (a previous check was disabled) — confirm this is intended.
            with open(file_obj.url, 'rb') as f:
                return f.read(), ext

        # Error messages previously hardcoded "5MB" while MAX_FILE_SIZE is
        # 10 MiB; derive the figure from the constant so they cannot drift.
        limit_mb = MAX_FILE_SIZE // (1024 * 1024)
        try:
            # stream=True fetches only the headers here; the body is pulled
            # below in bounded 8 KB chunks.
            with requests.get(file_obj.url, stream=True, timeout=60) as resp:
                resp.raise_for_status()

                # Fast rejection when the server declares an oversize body.
                content_length = resp.headers.get('Content-Length')
                if content_length and int(content_length) > MAX_FILE_SIZE:
                    raise Exception(
                        f"文件大小 ({int(content_length)} bytes) 超过限制 {limit_mb}MB,已终止下载。"
                    )

                # Content-Length may be absent or lie, so re-check the limit
                # while streaming.
                downloaded_content = BytesIO()
                current_size = 0
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        current_size += len(chunk)
                        if current_size > MAX_FILE_SIZE:
                            raise Exception(f"检测到文件超过 {limit_mb}MB,已中断。")
                        downloaded_content.write(chunk)

                return downloaded_content.getvalue(), ext

        except requests.RequestException as e:
            raise RuntimeError(f"网络请求失败: {e}")

    @staticmethod
    def save_to_local(file_obj: File, filename: str) -> str:
        """
        Persist the file's content under DOWNLOAD_DIR/filename and return
        the local path.  A local file is returned as-is after an existence
        check; a remote file is streamed to disk.

        Raises FileNotFoundError for a missing local file and RuntimeError
        (wrapping the original error) for any download failure.
        """
        if not file_obj.is_remote:
            if os.path.exists(file_obj.url):
                return file_obj.url

            raise FileNotFoundError(f"Local file not found: {file_obj.url}")

        try:
            os.makedirs(FileOps.DOWNLOAD_DIR, exist_ok=True)

            # Simple filename strategy; a URL hash would avoid re-downloading
            # the same resource under different names.
            local_path = os.path.join(FileOps.DOWNLOAD_DIR, filename)

            # Browser-like UA: some servers reject the default requests agent.
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
            with requests.get(file_obj.url, headers=headers, stream=True, timeout=120) as r:
                r.raise_for_status()
                with open(local_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            return local_path
        except Exception as e:
            raise RuntimeError(f"Download failed for {file_obj.url}: {str(e)}")

    @staticmethod
    def read_bytes(file_obj: File) -> bytes:
        """
        Return the file's raw binary data.
        Use cases: uploading to OSS, saving locally, feeding image libraries.
        """
        content, _ = FileOps._get_bytes_stream(file_obj)
        return content

    @staticmethod
    def extract_text(file_obj: File) -> str:
        """
        Extract text content (RAG, HTML parsing, document analysis).

        Office/PDF formats are routed to _parse_document_bytes; everything
        else is decoded as text.  All errors are returned as a
        "[FileOps Error] ..." string rather than raised.
        """
        try:
            content, ext = FileOps._get_bytes_stream(file_obj)

            if ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']:
                return FileOps._parse_document_bytes(file_obj, content, ext)

            # chardet.detect always returns an 'encoding' key, but its value
            # may be None for undetectable content (the old
            # `'encoding' in charset` check was always true and then crashed
            # on decode(None)); fall back to utf-8 in that case.
            charset = chardet.detect(content)
            encoding = charset.get('encoding') or 'utf-8'
            return content.decode(encoding)

        except Exception as e:
            return f"[FileOps Error] Failed to read content: {str(e)}"

    @staticmethod
    def _parse_document_bytes(file_obj: File, content: bytes, ext: str) -> str:
        """
        Parse an office/PDF document from raw bytes and return its text.

        Parser libraries are imported lazily; a missing library or a parse
        failure is reported as a bracketed string rather than raised.
        """
        stream = BytesIO(content)
        text_result = ""

        try:
            if ext == '.pdf':
                import pypdf
                reader = pypdf.PdfReader(stream)
                for page in reader.pages:
                    text_result += page.extract_text() + "\n"
            elif ext in ['.docx', '.doc']:
                text_result = read_docx(stream)
            elif ext in ['.xlsx', '.xls', '.csv']:
                import pandas as pd
                if ext == '.csv':
                    df = pd.read_csv(stream)
                else:
                    df = pd.read_excel(stream)
                text_result = df.to_string()
            elif ext in ['.ppt', '.pptx']:
                text_result = read_ppt(stream)
            else:
                text_result = f"[暂不支持解析该文档格式: {ext}]"
        except ImportError as e:
            text_result = f"[解析库缺失] {e}"
        except Exception as e:
            text_result = f"[解析失败] {e}"

        return text_result
|
|
245
|
+
|
|
246
|
+
def read_docx(cont_stream) -> str:
    """
    Extract text from a .docx stream in document order via docx2python.

    Traverses the nested-list document body: plain strings become
    paragraphs, and one extra level of nesting is treated as a table row
    whose cells are joined with newlines.  Sections are joined with blank
    lines.
    """
    from docx2python import docx2python

    document = docx2python(cont_stream)
    fragments = []

    def _add_text(value: str) -> None:
        # Keep only non-empty text, stripped of surrounding whitespace.
        stripped = value.strip()
        if stripped:
            fragments.append(stripped)

    def _add_row(cells) -> None:
        # A nested list at this depth is a table row: join its non-empty cells.
        joined = "\n".join(str(cell).strip() for cell in cells if str(cell).strip())
        if joined:
            fragments.append(joined)

    for section in document.body:
        if not isinstance(section, list):
            continue
        for entry in section:
            if isinstance(entry, str):
                _add_text(entry)
            elif isinstance(entry, list):
                for leaf in entry:
                    if isinstance(leaf, str):
                        _add_text(leaf)
                    elif isinstance(leaf, list):
                        _add_row(leaf)

    # Release the underlying document resources.
    document.close()

    return "\n\n".join(fragments)
|
|
278
|
+
|
|
279
|
+
def read_ppt(file_input: Union[str, bytes, BytesIO]) -> str:
    """
    Extract slide text, table contents and speaker notes from a PPT/PPTX.

    Accepts a path, raw bytes, or a stream.  Each slide is rendered as a
    page header followed by its text frames, any tables, and its notes;
    slides are joined with blank lines.  Parse errors are returned as a
    bracketed string rather than raised.
    """
    if not Presentation:
        return "[Error] 未安装 python-pptx 库,无法解析 PPT 文件"

    # Normalize every accepted input form into a binary stream.
    if isinstance(file_input, str):
        with open(file_input, 'rb') as f:
            ppt_stream = BytesIO(f.read())
    elif isinstance(file_input, bytes):
        ppt_stream = BytesIO(file_input)
    else:
        ppt_stream = file_input

    try:
        deck = Presentation(ppt_stream)
        slides_text = []

        for index, slide in enumerate(deck.slides):
            parts = [f"=== 第 {index+1} 页 ==="]

            for shape in slide.shapes:
                # Plain text frames (shapes without .text fall through).
                shape_text = getattr(shape, "text", "")
                if shape_text.strip():
                    parts.append(shape_text.strip())

                # Tables must be walked cell-by-cell: shape.text does not
                # expose their contents.
                if shape.has_table:
                    rendered_rows = []
                    for row in shape.table.rows:
                        cells = [cell.text_frame.text.strip() for cell in row.cells if cell.text_frame.text.strip()]
                        if cells:
                            rendered_rows.append(" | ".join(cells))
                    if rendered_rows:
                        parts.append("[表格]\n" + "\n".join(rendered_rows))

            # Speaker notes often carry the substantive content.
            if slide.has_notes_slide:
                note_text = slide.notes_slide.notes_text_frame.text.strip()
                if note_text:
                    parts.append(f"[备注]: {note_text}")

            slides_text.append("\n".join(parts))

        return "\n\n".join(slides_text)

    except Exception as e:
        return f"[PPT解析失败] {str(e)}"
|
|
File without changes
|