coze-coding-utils 0.2.1__py3-none-any.whl → 0.2.2a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. coze_coding_utils/__init__.py +1 -1
  2. coze_coding_utils/error/__init__.py +31 -0
  3. coze_coding_utils/error/classifier.py +320 -0
  4. coze_coding_utils/error/codes.py +356 -0
  5. coze_coding_utils/error/exceptions.py +439 -0
  6. coze_coding_utils/error/patterns.py +939 -0
  7. coze_coding_utils/error/test_classifier.py +0 -0
  8. coze_coding_utils/file/__init__.py +0 -0
  9. coze_coding_utils/file/file.py +327 -0
  10. coze_coding_utils/helper/__init__.py +0 -0
  11. coze_coding_utils/helper/agent_helper.py +599 -0
  12. coze_coding_utils/helper/graph_helper.py +231 -0
  13. coze_coding_utils/log/__init__.py +0 -0
  14. coze_coding_utils/log/common.py +8 -0
  15. coze_coding_utils/log/config.py +10 -0
  16. coze_coding_utils/log/err_trace.py +88 -0
  17. coze_coding_utils/log/loop_trace.py +72 -0
  18. coze_coding_utils/log/node_log.py +487 -0
  19. coze_coding_utils/log/parser.py +255 -0
  20. coze_coding_utils/log/write_log.py +183 -0
  21. coze_coding_utils/messages/__init__.py +0 -0
  22. coze_coding_utils/messages/client.py +48 -0
  23. coze_coding_utils/messages/server.py +173 -0
  24. coze_coding_utils/openai/__init__.py +5 -0
  25. coze_coding_utils/openai/converter/__init__.py +6 -0
  26. coze_coding_utils/openai/converter/request_converter.py +165 -0
  27. coze_coding_utils/openai/converter/response_converter.py +467 -0
  28. coze_coding_utils/openai/handler.py +298 -0
  29. coze_coding_utils/openai/types/__init__.py +37 -0
  30. coze_coding_utils/openai/types/request.py +24 -0
  31. coze_coding_utils/openai/types/response.py +178 -0
  32. {coze_coding_utils-0.2.1.dist-info → coze_coding_utils-0.2.2a1.dist-info}/METADATA +2 -2
  33. coze_coding_utils-0.2.2a1.dist-info/RECORD +37 -0
  34. coze_coding_utils-0.2.1.dist-info/RECORD +0 -7
  35. {coze_coding_utils-0.2.1.dist-info → coze_coding_utils-0.2.2a1.dist-info}/WHEEL +0 -0
  36. {coze_coding_utils-0.2.1.dist-info → coze_coding_utils-0.2.2a1.dist-info}/licenses/LICENSE +0 -0
File without changes
File without changes
@@ -0,0 +1,327 @@
1
+ import os
2
+ import requests
3
+ import chardet
4
+ from io import BytesIO
5
+ from typing import Literal,Callable, Any, Optional,Union
6
+ from pydantic import BaseModel, Field, field_validator,PrivateAttr
7
+ from urllib.parse import urlparse
8
+ from pptx import Presentation
9
+
10
# Upper bound, in bytes, for a remote download: 10 * 1024 * 1024 = 10 MiB.
# FileOps._get_bytes_stream compares both the Content-Length header and the
# running streamed byte count against this value.
+ MAX_FILE_SIZE = 10 * 1024 * 1024
11
+
12
class File(BaseModel):
    """Generic file reference backed by an http(s) URL or a local path.

    ``file_type`` is an ordinary field that defaults to ``"default"``; this
    class does not derive it from ``url``.
    """

    url: str = Field(..., description="文件URL(http/https)或本地路径")
    file_type: Literal['image', 'video', 'audio', 'document', 'default'] = Field(
        default="default",
        description="文件类型"
    )
    # Cache location recorded through set_cache_path(); a pydantic private
    # attribute, so it is not part of the model's fields.
    _local_path: Optional[str] = PrivateAttr(default=None)

    def set_cache_path(self, path: str) -> None:
        """Remember ``path`` as the cache location of this file."""
        self._local_path = path

    def get_cache_path(self) -> Optional[str]:
        """Return the path stored by ``set_cache_path``, or ``None`` if unset.

        The stored value is returned as-is; no check is made that the path
        exists on disk.
        """
        return self._local_path

    @property
    def is_remote(self) -> bool:
        """Return ``True`` when ``url`` starts with an http or https scheme.

        URI schemes are case-insensitive, so ``HTTPS://host/x`` counts as
        remote just like ``https://host/x``.
        """
        return self.url.lower().startswith(('http://', 'https://'))
35
+
36
# Built once at import time: bare lower-case extension -> category name.
_EXTENSION_TO_CATEGORY = {
    extension: category
    for category, extensions in {
        'image': (
            'apng', 'avif', 'bmp', 'gif', 'heic', 'ico', 'jpg', 'jpeg',
            'png', 'svg', 'tiff', 'webp',
        ),
        'video': (
            'mp4', 'avi', 'mov', 'mkv', 'flv', 'wmv', 'webm', 'm4v', '3gp',
        ),
        'audio': (
            'mp3', 'wav', 'flac', 'aac', 'ogg', 'wma', 'm4a',
        ),
        'document': (
            'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
            'txt', 'md', 'csv', 'json', 'xml', 'html', 'htm',
        ),
    }.items()
    for extension in extensions
}


def infer_file_category(path_or_url: str) -> tuple[str, str]:
    """Classify a path or URL by the extension of its last path component.

    The input is run through ``urlparse`` so that a query string or fragment
    (``?id=...``, ``#...``) is dropped before the file name is inspected.

    Args:
        path_or_url: A local path or a URL.

    Returns:
        A ``(category, extension)`` pair. ``category`` is one of ``image``,
        ``video``, ``audio``, ``document`` or ``default``. ``extension`` is
        lower-cased and keeps its leading dot (for example ``".pdf"``), or is
        ``""`` when the name has no extension.

    >>> infer_file_category("https://example.com/a/Test.JPG?id=1")
    ('image', '.jpg')
    >>> infer_file_category("/tmp/noext")
    ('default', '')
    """
    filename = os.path.basename(urlparse(path_or_url).path)
    _, ext_with_dot = os.path.splitext(filename)

    # No extension at all: nothing to look up.
    if not ext_with_dot:
        return 'default', ""

    # Normalise case so callers can compare against literals such as ".pdf"
    # regardless of how the file name was written.
    ext_with_dot = ext_with_dot.lower()
    category = _EXTENSION_TO_CATEGORY.get(ext_with_dot[1:], 'default')
    return category, ext_with_dot
91
+
92
class FileOps:
    """Static helpers for fetching, saving and text-extracting ``File`` objects.

    Attributes:
        DOWNLOAD_DIR: Directory that ``save_to_local`` writes downloads into.
    """

    DOWNLOAD_DIR = "/tmp"

    @staticmethod
    def read_content(file_obj: File, max_length=10000) -> str:
        """Placeholder: ignores its arguments and always returns ``""``."""
        return ""

    @staticmethod
    def get_local_path(file_obj: File) -> str:
        """Return ``file_obj.url`` unchanged; nothing is downloaded or checked."""
        return file_obj.url

    @staticmethod
    def _get_bytes_stream(file_obj: File) -> tuple[bytes, str]:
        """Load the whole file into memory and return it with its extension.

        Remote files are streamed and rejected once they exceed
        ``MAX_FILE_SIZE``, both by the declared ``Content-Length`` and by the
        number of bytes actually received. Local files are read without a
        size limit.

        Returns:
            ``(content, ext)`` where ``ext`` is what ``infer_file_category``
            reports for ``file_obj.url``.

        Raises:
            FileNotFoundError: The local path does not exist.
            RuntimeError: The HTTP request failed.
            Exception: The remote file is larger than ``MAX_FILE_SIZE``.
        """
        _, ext = infer_file_category(file_obj.url)

        if not file_obj.is_remote:
            if not os.path.exists(file_obj.url):
                raise FileNotFoundError(f"本地文件不存在: {file_obj.url}")
            # NOTE(review): an earlier size check for local files was disabled
            # in the original code, so no limit is enforced on this path.
            with open(file_obj.url, 'rb') as f:
                return f.read(), ext

        # Report the real limit in messages instead of a hard-coded figure.
        limit_mb = MAX_FILE_SIZE // (1024 * 1024)
        try:
            # stream=True fetches only the headers up front; the body is
            # pulled chunk by chunk below.
            with requests.get(file_obj.url, stream=True, timeout=60) as resp:
                resp.raise_for_status()

                declared = resp.headers.get('Content-Length')
                try:
                    declared_size = int(declared) if declared else None
                except ValueError:
                    # Malformed header: fall back to the streamed check.
                    declared_size = None
                if declared_size is not None and declared_size > MAX_FILE_SIZE:
                    raise Exception(
                        f"文件大小 ({declared_size} bytes) 超过限制 {limit_mb}MB,已终止下载。"
                    )

                # The header may be missing or wrong, so also count what
                # actually arrives.
                buffer = BytesIO()
                received = 0
                for chunk in resp.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    received += len(chunk)
                    if received > MAX_FILE_SIZE:
                        raise Exception(f"检测到文件超过 {limit_mb}MB,已中断。")
                    buffer.write(chunk)

                return buffer.getvalue(), ext
        except requests.RequestException as e:
            raise RuntimeError(f"网络请求失败: {e}") from e

    @staticmethod
    def save_to_local(file_obj: File, filename: str) -> str:
        """Download a remote file into ``DOWNLOAD_DIR`` and return its path.

        For a local ``file_obj`` nothing is copied: the existing path is
        returned as-is.

        Args:
            file_obj: The file to materialise locally.
            filename: Name of the target file inside ``DOWNLOAD_DIR``.

        Raises:
            FileNotFoundError: ``file_obj`` is local and does not exist.
            RuntimeError: Anything went wrong while downloading or writing.
        """
        if not file_obj.is_remote:
            if os.path.exists(file_obj.url):
                return file_obj.url
            raise FileNotFoundError(f"Local file not found: {file_obj.url}")

        local_path = None
        file_opened = False
        try:
            os.makedirs(FileOps.DOWNLOAD_DIR, exist_ok=True)

            # NOTE(review): ``filename`` is joined unchecked; an absolute path
            # or ``..`` components would land outside DOWNLOAD_DIR. Validate
            # it at the call site if it can come from untrusted input.
            local_path = os.path.join(FileOps.DOWNLOAD_DIR, filename)

            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
            with requests.get(file_obj.url, headers=headers, stream=True, timeout=120) as r:
                r.raise_for_status()
                with open(local_path, 'wb') as f:
                    file_opened = True
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            return local_path
        except Exception as e:
            if file_opened:
                # Do not leave a truncated download behind; cleanup is
                # best-effort and must not mask the original error.
                try:
                    os.remove(local_path)
                except OSError:
                    pass
            raise RuntimeError(f"Download failed for {file_obj.url}: {str(e)}") from e

    @staticmethod
    def read_bytes(file_obj: File) -> bytes:
        """Return the raw bytes of the file (see ``_get_bytes_stream``)."""
        content, _ = FileOps._get_bytes_stream(file_obj)
        return content

    @staticmethod
    def extract_text(file_obj: File) -> str:
        """Return the textual content of the file.

        Office and PDF extensions are routed to ``_parse_document_bytes``.
        Everything else is decoded as text using the encoding guessed by
        ``chardet``, falling back to UTF-8.

        Any failure is reported as a ``"[FileOps Error] ..."`` string rather
        than raised.
        """
        try:
            content, ext = FileOps._get_bytes_stream(file_obj)
            ext = ext.lower()

            if ext in ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'):
                return FileOps._parse_document_bytes(file_obj, content, ext)

            # chardet always returns an "encoding" key, but its value is None
            # when nothing could be detected (e.g. empty input).
            encoding = chardet.detect(content).get('encoding') or 'utf-8'
            try:
                return content.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                # Wrong guess or unknown codec name: degrade to lossy UTF-8
                # instead of failing the whole extraction.
                return content.decode('utf-8', errors='replace')

        except Exception as e:
            return f"[FileOps Error] Failed to read content: {str(e)}"

    @staticmethod
    def _parse_document_bytes(file_obj: File, content: bytes, ext: str) -> str:
        """Extract text from document bytes according to ``ext``.

        Args:
            file_obj: Unused here; kept for interface compatibility.
            content: Raw bytes of the document.
            ext: Extension with leading dot; matched case-insensitively.

        Returns:
            The extracted text, or a bracketed message when the format is
            unsupported, a parsing library is missing, or parsing failed.
        """
        stream = BytesIO(content)
        kind = ext.lower()

        try:
            if kind == '.pdf':
                import pypdf
                reader = pypdf.PdfReader(stream)
                # One line break after every page; a page yielding no text
                # contributes just the line break.
                return "".join(
                    f"{page.extract_text() or ''}\n" for page in reader.pages
                )
            if kind in ('.docx', '.doc'):
                return read_docx(stream)
            if kind in ('.xlsx', '.xls', '.csv'):
                import pandas as pd
                df = pd.read_csv(stream) if kind == '.csv' else pd.read_excel(stream)
                return df.to_string()
            if kind in ('.ppt', '.pptx'):
                return read_ppt(stream)
            return f"[暂不支持解析该文档格式: {ext}]"
        except ImportError as e:
            return f"[解析库缺失] {e}"
        except Exception as e:
            return f"[解析失败] {e}"
245
+
246
def read_docx(cont_stream) -> str:
    """Extract the text of a .docx document in document order.

    The document is opened with ``docx2python`` and its ``body`` is walked as
    nested lists. Non-blank strings found at the second and third nesting
    levels are collected; a list at the third level has its non-blank
    elements joined with line breaks.

    Args:
        cont_stream: Whatever ``docx2python`` accepts as its input; callers
            in this file pass a ``BytesIO``.

    Returns:
        The collected pieces joined by blank lines.
    """
    from docx2python import docx2python

    doc_result = docx2python(cont_stream)
    try:
        all_parts = []
        for section in doc_result.body:
            if not isinstance(section, list):
                continue
            for item in section:
                if isinstance(item, str):
                    if item.strip():
                        all_parts.append(item.strip())
                    continue
                if not isinstance(item, list):
                    continue
                # Possibly a table or other multi-level content.
                for sub_item in item:
                    if isinstance(sub_item, str):
                        if sub_item.strip():
                            all_parts.append(sub_item.strip())
                    elif isinstance(sub_item, list):
                        cells = (str(cell).strip() for cell in sub_item)
                        row_text = "\n".join(cell for cell in cells if cell)
                        if row_text:
                            all_parts.append(row_text)
        return "\n\n".join(all_parts)
    finally:
        # Release the document even when traversal raises.
        doc_result.close()
278
+
279
def read_ppt(file_input: Union[str, bytes, BytesIO]) -> str:
    """Extract slide text, table text and speaker notes from a presentation.

    Args:
        file_input: A file path, the raw bytes of the file, or a binary
            stream positioned at its start.

    Returns:
        One section per slide, separated by blank lines. Each section starts
        with a ``=== 第 N 页 ===`` header followed by shape texts, ``[表格]``
        blocks and a ``[备注]`` line where present. Parsing errors are
        returned as a ``"[PPT解析失败] ..."`` string instead of being raised.
    """
    if not Presentation:
        return "[Error] 未安装 python-pptx 库,无法解析 PPT 文件"

    # Normalise every accepted input kind to a binary stream.
    if isinstance(file_input, str):
        with open(file_input, 'rb') as f:
            ppt_stream = BytesIO(f.read())
    elif isinstance(file_input, bytes):
        ppt_stream = BytesIO(file_input)
    else:
        ppt_stream = file_input

    try:
        prs = Presentation(ppt_stream)
        full_text = []

        for page_number, slide in enumerate(prs.slides, start=1):
            page_content = [f"=== 第 {page_number} 页 ==="]

            for shape in slide.shapes:
                # Plain text carried by the shape, if it has any.
                shape_text = (getattr(shape, "text", "") or "").strip()
                if shape_text:
                    page_content.append(shape_text)

                # Table cells are not reachable through shape.text. Not every
                # shape type exposes ``has_table``, so a missing attribute is
                # treated as "no table" rather than failing the whole parse.
                if getattr(shape, "has_table", False):
                    table_texts = []
                    for row in shape.table.rows:
                        cell_texts = (cell.text_frame.text.strip() for cell in row.cells)
                        row_cells = [text for text in cell_texts if text]
                        if row_cells:
                            table_texts.append(" | ".join(row_cells))
                    if table_texts:
                        page_content.append("[表格]\n" + "\n".join(table_texts))

            # Speaker notes; the notes slide may lack a notes text frame.
            if slide.has_notes_slide:
                notes_frame = slide.notes_slide.notes_text_frame
                notes = notes_frame.text.strip() if notes_frame is not None else ""
                if notes:
                    page_content.append(f"[备注]: {notes}")

            full_text.append("\n".join(page_content))

        return "\n\n".join(full_text)

    except Exception as e:
        return f"[PPT解析失败] {str(e)}"
File without changes