auto-coder 0.1.346__py3-none-any.whl → 0.1.348__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
This version of auto-coder might be problematic.
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/METADATA +1 -1
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/RECORD +24 -21
- autocoder/auto_coder_runner.py +5 -4
- autocoder/common/auto_coder_lang.py +8 -0
- autocoder/common/v2/agent/agentic_edit.py +68 -22
- autocoder/common/v2/agent/agentic_edit_tools/__init__.py +2 -0
- autocoder/common/v2/agent/agentic_edit_tools/list_package_info_tool_resolver.py +42 -0
- autocoder/common/v2/agent/agentic_edit_types.py +4 -0
- autocoder/plugins/__init__.py +20 -0
- autocoder/rag/cache/byzer_storage_cache.py +44 -74
- autocoder/rag/cache/failed_files_utils.py +39 -0
- autocoder/rag/cache/file_monitor_cache.py +3 -1
- autocoder/rag/cache/local_byzer_storage_cache.py +45 -73
- autocoder/rag/cache/local_duckdb_storage_cache.py +43 -13
- autocoder/rag/cache/simple_cache.py +40 -12
- autocoder/rag/document_retriever.py +17 -8
- autocoder/rag/loaders/image_loader.py +551 -0
- autocoder/rag/long_context_rag.py +2 -0
- autocoder/rag/qa_conversation_strategy.py +26 -23
- autocoder/version.py +1 -1
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/top_level.txt +0 -0
autocoder/rag/loaders/image_loader.py
ADDED

@@ -0,0 +1,551 @@
import os
import traceback
import re
from PIL import Image

try:
    from paddleocr import PaddleOCR
except ImportError:
    PaddleOCR = None

try:
    import paddlex as paddlex_module
except ImportError:
    paddlex_module = None

import byzerllm
from byzerllm.utils.client import code_utils
from autocoder.utils.llms import get_single_llm
from loguru import logger
from typing import List, Tuple, Optional
from pydantic import BaseModel


class ReplaceInFileTool(BaseModel):
    path: str
    diff: str


class ImageLoader:
    """
    A class for loading and processing images, extracting text and tables from them,
    and converting the content to markdown format.
    """

    @staticmethod
    def parse_diff(diff_content: str) -> List[Tuple[str, str]]:
        """
        Parses the diff content into a list of (search_block, replace_block) tuples.
        """
        blocks = []
        lines = diff_content.splitlines(keepends=True)
        i = 0
        n = len(lines)

        while i < n:
            line = lines[i]
            if line.strip() == "<<<<<<< SEARCH":
                i += 1
                search_lines = []
                # Accumulate search block
                while i < n and lines[i].strip() != "=======":
                    search_lines.append(lines[i])
                    i += 1
                if i >= n:
                    logger.warning("Unterminated SEARCH block found in diff content.")
                    break
                i += 1  # skip '======='
                replace_lines = []
                # Accumulate replace block
                while i < n and lines[i].strip() != ">>>>>>> REPLACE":
                    replace_lines.append(lines[i])
                    i += 1
                if i >= n:
                    logger.warning("Unterminated REPLACE block found in diff content.")
                    break
                i += 1  # skip '>>>>>>> REPLACE'

                search_block = ''.join(search_lines)
                replace_block = ''.join(replace_lines)
                blocks.append((search_block, replace_block))
            else:
                i += 1

        if not blocks and diff_content.strip():
            logger.warning(f"Could not parse any SEARCH/REPLACE blocks from diff: {diff_content}")
        return blocks

    @staticmethod
    def paddleocr_extract_text(
        file_path,
        lang='ch',
        use_angle_cls=True,
        page_num=10,
        slice_params=None,
        det_model_dir=None,
        rec_model_dir=None,
        **kwargs
    ):
        """
        使用 PaddleOCR 识别文本,支持图片、PDF、超大图像滑动窗口

        Args:
            file_path: 图片或PDF路径
            lang: 语言,默认中文
            use_angle_cls: 是否启用方向分类
            page_num: 识别PDF时的最大页数
            slice_params: 超大图像滑动窗口参数 dict
            det_model_dir: 自定义检测模型路径
            rec_model_dir: 自定义识别模型路径
            kwargs: 其他paddleocr参数
        Returns:
            识别出的纯文本字符串
        """
        if PaddleOCR is None:
            print("paddleocr not installed")
            return ""

        # 初始化 OCR
        try:
            ocr = PaddleOCR(
                use_angle_cls=use_angle_cls,
                lang=lang,
                page_num=page_num,
                det_model_dir=det_model_dir,
                rec_model_dir=rec_model_dir,
                **kwargs
            )
        except Exception:
            traceback.print_exc()
            return ""

        try:
            ext = os.path.splitext(file_path)[1].lower()

            # 处理PDF
            if ext == ".pdf":
                result = ocr.ocr(file_path, cls=True)  # result is list of pages, each page is list of lines
                lines = []
                if result and isinstance(result, list):
                    for page in result:
                        if page and isinstance(page, list):
                            for line_info in page:  # line_info is [points, (text, confidence)]
                                try:
                                    # Check structure: [points, (text, confidence)]
                                    if isinstance(line_info, (list, tuple)) and len(line_info) == 2 and \
                                       isinstance(line_info[1], (list, tuple)) and len(line_info[1]) >= 1:
                                        txt = line_info[1][0]
                                        if isinstance(txt, str):
                                            lines.append(txt)
                                        else:
                                            logger.warning(f"Extracted text is not a string in PDF: {txt} (type: {type(txt)}). Skipping.")
                                    else:
                                        logger.warning(f"Unexpected line_info structure in PDF: {line_info}. Skipping.")
                                except Exception as e:
                                    logger.warning(f"Error processing line_info in PDF: {line_info}. Error: {e}")
                return "\n".join(lines)

            # 处理图片
            else:  # Image processing
                if slice_params is not None:
                    result = ocr.ocr(file_path, cls=True, slice=slice_params)
                else:
                    result = ocr.ocr(file_path, cls=True)  # result is [[[points, (text, confidence)], ...]] for single image

                lines = []
                # Standardize handling: PaddleOCR often returns a list containing one item for single images.
                # result = [page_result] where page_result = [[line1_info], [line2_info], ...]
                if result and isinstance(result, list):
                    # Heuristic: Treat 'result' as the list of pages directly.
                    # This handles both single image wrapped in list and multi-page PDFs consistently.
                    page_list = result

                    for page in page_list:
                        if page and isinstance(page, list):
                            for line_info in page:  # line_info is [points, (text, confidence)]
                                try:
                                    # Check structure: [points, (text, confidence)]
                                    if isinstance(line_info, (list, tuple)) and len(line_info) == 2 and \
                                       isinstance(line_info[1], (list, tuple)) and len(line_info[1]) >= 1:
                                        txt = line_info[1][0]
                                        if isinstance(txt, str):
                                            lines.append(txt)
                                        else:
                                            # Handle potential nested lists in text: join them? Or log?
                                            if isinstance(txt, list):
                                                processed_txt = " ".join(map(str, txt))
                                                logger.warning(f"Extracted text is a list in Image: {txt}. Joined as: '{processed_txt}'.")
                                                lines.append(processed_txt)  # Attempt to join if it's a list of strings/convertibles
                                            else:
                                                logger.warning(f"Extracted text is not a string in Image: {txt} (type: {type(txt)}). Skipping.")
                                    else:
                                        logger.warning(f"Unexpected line_info structure in Image: {line_info}. Skipping.")
                                except Exception as e:
                                    logger.warning(f"Error processing line_info in Image: {line_info}. Error: {e}")
                return "\n".join(lines)
        except Exception:
            traceback.print_exc()
            return ""

    @staticmethod
    def paddlex_table_extract_markdown(image_path):
        """
        使用 PaddleX 表格识别pipeline,抽取表格并转换为markdown格式

        Args:
            image_path: 图片路径
        Returns:
            markdown格式的表格字符串
        """
        if paddlex_module is None:
            print("paddlex not installed")
            return ""

        try:
            # 创建 pipeline
            pipeline = paddlex_module.create_pipeline(pipeline='table_recognition')
            # 预测
            outputs = pipeline.predict([image_path])
            if not outputs:
                return ""

            md_results = []
            for res in outputs:
                # 获取HTML表格
                html = None
                try:
                    html = res.to_html() if hasattr(res, "to_html") else None
                except Exception:
                    html = None

                # 如果没有to_html方法,尝试res.print()内容中提取,或跳过
                if html is None:
                    try:
                        from io import StringIO
                        import sys
                        buffer = StringIO()
                        sys_stdout = sys.stdout
                        sys.stdout = buffer
                        res.print()
                        sys.stdout = sys_stdout
                        html = buffer.getvalue()
                    except Exception:
                        html = ""

                # 转markdown
                md = ImageLoader.html_table_to_markdown(html)
                md_results.append(md)

            return "\n\n".join(md_results)
        except Exception:
            traceback.print_exc()
            return ""

    @staticmethod
    def html_table_to_markdown(html):
        """
        简单将HTML table转换为markdown table
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            print("BeautifulSoup4 not installed, cannot convert HTML to markdown")
            return ""

        try:
            soup = BeautifulSoup(html, "html.parser")
            table = soup.find("table")
            if table is None:
                return ""

            rows = []
            for tr in table.find_all("tr"):
                cells = tr.find_all(["td", "th"])
                row = [cell.get_text(strip=True) for cell in cells]
                rows.append(row)

            if not rows:
                return ""

            # 生成markdown
            md_lines = []
            header = rows[0]
            md_lines.append("| " + " | ".join(header) + " |")
            md_lines.append("|" + "|".join(["---"] * len(header)) + "|")

            for row in rows[1:]:
                md_lines.append("| " + " | ".join(row) + " |")

            return "\n".join(md_lines)
        except Exception:
            traceback.print_exc()
            return ""

    @staticmethod
    def format_table_in_content(content: str, llm=None) -> str:
        """Format table content from OCR results into markdown format.

        Args:
            content: The OCR text content that may contain tables
            llm: The language model to use for formatting

        Returns:
            Formatted content with tables converted to markdown
        """

        @byzerllm.prompt()
        def _format_table(content: str)->str:
            '''
            # 表格格式化任务

            你是一个专业的OCR后处理专家,擅长将OCR识别出的表格数据转换为规范的Markdown表格。

            ## 输入内容分析

            OCR识别的表格通常会有以下特点:
            1. 每个单元格可能被识别为单独的一行
            2. 表格的行列结构可能不明显
            3. 可能包含非表格的文本内容
            4. 可能存在多个表格

            ## 你的任务

            1. 识别内容中的表格数据
            2. 将表格数据转换为标准Markdown格式
            3. 保留非表格的文本内容
            4. 使用replace_in_file工具格式输出结果

            ## 输出格式

            必须使用以下格式输出结果:

            ```
            <replace_in_file>
            <path>content</path>
            <diff>
            <<<<<<< SEARCH
            [原始表格文本,精确匹配]
            =======
            [转换后的Markdown表格]
            >>>>>>> REPLACE
            </diff>
            </replace_in_file>
            ```

            ## 示例

            原始OCR文本:
            ```
            下面是库存情况:
            产品名称
            价格
            库存
            苹果手机
            8999 352
            华为平板
            4599
            128
            小米电视
            3299
            89
            可以看到在,整体库存和价格是健康的。
            ```

            转换后的输出:
            ```
            <replace_in_file>
            <path>content</path>
            <diff>
            <<<<<<< SEARCH
            产品名称
            价格
            库存
            苹果手机
            8999 352
            华为平板
            4599
            128
            小米电视
            3299
            89
            =======
            | 产品名称 | 价格 | 库存 |
            |---------|------|------|
            | 苹果手机 | 8999 | 352 |
            | 华为平板 | 4599 | 128 |
            | 小米电视 | 3299 | 89 |
            >>>>>>> REPLACE
            </diff>
            </replace_in_file>
            ```

            ## 处理规则

            1. 表格识别:
            - 分析行列结构,识别表头和数据行
            - 如果一行中有多个值,可能是一行表格数据
            - 连续的短行可能是表格的单元格

            2. Markdown格式:
            - 表头行使用`|`分隔各列
            - 在表头下方添加分隔行`|---|---|---|`
            - 对齐各列数据
            - 保持原始数据的完整性

            3. 多表格处理:
            - 为每个表格创建单独的replace_in_file块
            - 保持表格在原文中的相对位置

            4. 非表格内容:
            - 保留原始格式
            - 不要修改非表格文本

            ## 处理以下内容

            {{content}}
            '''

        # Run the prompt with the provided content
        tool_response = _format_table.with_llm(llm).run(content)

        # Parse the tool response to extract replace_in_file tool calls
        def extract_replace_in_file_tools(response):
            tools = []
            # Pattern to match replace_in_file tool blocks
            pattern = r'<replace_in_file>\s*<path>(.*?)</path>\s*<diff>(.*?)</diff>\s*</replace_in_file>'
            matches = re.finditer(pattern, response, re.DOTALL)

            for match in matches:
                path = match.group(1).strip()
                diff = match.group(2).strip()
                tools.append(ReplaceInFileTool(path=path, diff=diff))

            return tools

        # Extract tools from the response
        tools = extract_replace_in_file_tools(tool_response)

        # Process each tool to apply the replacements
        formatted_content = content
        for tool in tools:
            # For in-memory content replacement (not actual file modification)
            if tool.path == "content":
                # Parse the diff to get search/replace blocks
                blocks = ImageLoader.parse_diff(tool.diff)
                # Apply each replacement to the content
                for search_block, replace_block in blocks:
                    formatted_content = formatted_content.replace(search_block, replace_block)

        return formatted_content

    @staticmethod
    def extract_text_from_image(
        image_path: str,
        llm,
        engine: str = "vl",
        product_mode: str = "lite",
        paddle_kwargs: dict = None
    ) -> str:
        """
        识别图片或PDF中的所有文本内容,包括表格(以markdown table格式)

        Args:
            image_path: 图片或PDF路径
            llm: LLM对象或字符串(模型名)
            engine: 选择识别引擎
            - "vl": 视觉语言模型
            - "paddle": PaddleOCR
            - "paddle_table": PaddleX表格识别
            product_mode: get_single_llm的参数
            paddle_kwargs: dict,传递给PaddleOCR的参数
        Returns:
            markdown内容字符串
        """
        if isinstance(llm, str):
            llm = get_single_llm(llm, product_mode=product_mode)

        markdown_content = ""

        if engine == "vl":
            try:
                vl_model = llm.get_sub_client("vl_model") if llm.get_sub_client("vl_model") else llm

                @byzerllm.prompt()
                def analyze_image(image_path):
                    """
                    {{ image }}
                    你是一名图像理解专家,请识别这张图片中的所有内容,优先识别文字和表格。
                    对于普通文字,输出为段落文本。
                    对于表格截图,转换成markdown table格式输出。
                    请根据内容顺序,整合成一份markdown文档。
                    只返回markdown内容,不要添加额外解释。
                    """
                    image = byzerllm.Image.load_image_from_path(image_path)
                    return {"image": image}

                result = analyze_image.with_llm(vl_model).run(image_path)
                md_blocks = code_utils.extract_code(result, language="markdown")
                if md_blocks:
                    markdown_content = md_blocks[-1][1]
                else:
                    markdown_content = result.strip()
                if not markdown_content:
                    raise ValueError("Empty markdown from vl_model")
                return markdown_content

            except Exception:
                traceback.print_exc()
                return ""

        elif engine == "paddle":
            if paddle_kwargs is None:
                paddle_kwargs = {}

            markdown_content = ImageLoader.paddleocr_extract_text(image_path, **paddle_kwargs)
            return markdown_content

        elif engine == "paddle_table":
            markdown_content = ImageLoader.paddlex_table_extract_markdown(image_path)
            return markdown_content

        else:
            print(f"Unknown engine type: {engine}. Supported engines are 'vl', 'paddle', and 'paddle_table'.")
            return ""

    @staticmethod
    def image_to_markdown(
        image_path: str,
        llm,
        engine: str = "vl",
        product_mode: str = "lite",
        paddle_kwargs: dict = None
    ) -> str:
        """
        识别图片或PDF内容,生成markdown文件

        Args:
            image_path: 文件路径
            llm: LLM对象或字符串
            engine: 'vl'、'paddle'或'paddle_table'
            product_mode: LLM参数
            paddle_kwargs: dict,传递给PaddleOCR参数
        Returns:
            markdown内容字符串
        """
        md_content = ImageLoader.extract_text_from_image(
            image_path,
            llm,
            engine=engine,
            product_mode=product_mode,
            paddle_kwargs=paddle_kwargs
        )

        md_path = os.path.splitext(image_path)[0] + ".md"
        try:
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(md_content)
        except Exception:
            traceback.print_exc()

        return md_content
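
For orientation, a minimal sketch (not part of the package) of the SEARCH/REPLACE diff format that ImageLoader.parse_diff consumes; format_table_in_content passes it the <diff> payload extracted from the LLM's replace_in_file response, and the diff text below is made up for illustration:

    from autocoder.rag.loaders.image_loader import ImageLoader

    # Illustrative only: one SEARCH/REPLACE block and what parse_diff returns.
    diff_text = (
        "<<<<<<< SEARCH\n"
        "old table text\n"
        "=======\n"
        "| col1 | col2 |\n"
        "|---|---|\n"
        ">>>>>>> REPLACE\n"
    )

    blocks = ImageLoader.parse_diff(diff_text)
    # -> [("old table text\n", "| col1 | col2 |\n|---|---|\n")]
    for search_block, replace_block in blocks:
        print(repr(search_block), "->", repr(replace_block))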
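
Similarly, a small illustrative sketch of html_table_to_markdown, the helper that paddlex_table_extract_markdown uses to turn a recognized HTML table into a markdown table; it needs beautifulsoup4 installed, and the HTML string below is made up for the example:

    from autocoder.rag.loaders.image_loader import ImageLoader

    # Hypothetical HTML such as a table-recognition pipeline might emit.
    html = (
        "<table>"
        "<tr><th>name</th><th>qty</th></tr>"
        "<tr><td>apple</td><td>3</td></tr>"
        "</table>"
    )

    print(ImageLoader.html_table_to_markdown(html))
    # | name | qty |
    # |---|---|
    # | apple | 3 |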
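
A hedged usage sketch of the new loader's entry points; the file name and model name are placeholders, engine="paddle" requires paddleocr to be installed, and engine="vl" expects a vision-capable model to be configured:

    from autocoder.rag.loaders.image_loader import ImageLoader

    # Placeholder inputs -- adjust to your environment.
    image_path = "scanned_invoice.png"
    model_name = "v3_chat"  # resolved via get_single_llm(model_name, product_mode=...)

    # engine="paddle" runs PaddleOCR locally; the model name is still resolved first.
    text = ImageLoader.extract_text_from_image(
        image_path,
        model_name,
        engine="paddle",
        paddle_kwargs={"lang": "ch", "use_angle_cls": True},
    )

    # image_to_markdown does the same extraction and additionally writes
    # scanned_invoice.md next to the input file, returning the markdown string.
    md = ImageLoader.image_to_markdown(image_path, model_name, engine="vl")
    print(text, md, sep="\n\n")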
autocoder/rag/qa_conversation_strategy.py
CHANGED

@@ -92,18 +92,6 @@ class MultiRoundStrategy(QAConversationStrategy):
 {% endfor %}
 </documents>

-====
-
-{% if extra_docs %}
-AUTO EXTENSION DOCS
-
-The following extension documents are loaded dynamically to enhance your understanding or provide special instructions, rules, or context.
-
-{% for key, value in extra_docs.items() %}
-### {{ key }}
-{{ value }}
-{% endfor %}
-
 ====
 {% endif %}

@@ -124,6 +112,19 @@ class MultiRoundStrategy(QAConversationStrategy):
 - Format your answer with Markdown for readability.
 - Always use the language used by the user in their question.

+{% if extra_docs %}
+====
+
+RULES PROVIDED BY USER
+
+The following rules are provided by the user, and you must follow them strictly.
+
+{% for key, value in extra_docs.items() %}
+### {{ key }}
+{{ value }}
+{% endfor %}
+{% endif %}
+
 """

 import os
@@ -213,17 +214,6 @@ class SingleRoundStrategy(QAConversationStrategy):
 {% endfor %}
 </documents>

-====
-{% if extra_docs %}
-AUTO EXTENSION DOCS
-
-The following extension documents are loaded dynamically to enhance your understanding or provide special instructions, rules, or context.
-
-{% for key, value in extra_docs.items() %}
-### {{ key }}
-{{ value }}
-{% endfor %}
-
 ====
 {% endif %}

@@ -252,6 +242,19 @@ class SingleRoundStrategy(QAConversationStrategy):
 - Format your answer with Markdown for readability.
 - Always use the language used by the user in their question.

+{% if extra_docs %}
+====
+
+RULES PROVIDED BY USER
+
+The following rules are provided by the user, and you must follow them strictly.
+
+{% for key, value in extra_docs.items() %}
+### {{ key }}
+{{ value }}
+{% endfor %}
+{% endif %}
+
 """
 import os
 extra_docs = {}
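
The hunks above drop the AUTO EXTENSION DOCS block from the documents section and re-add extra_docs as a RULES PROVIDED BY USER block near the end of both prompt templates. A minimal stand-alone Jinja2 sketch (not package code) of how such a conditional block renders:

    from jinja2 import Template

    # Simplified stand-in for the new block added to both strategies' prompts.
    block = Template(
        "{% if extra_docs %}"
        "RULES PROVIDED BY USER\n"
        "The following rules are provided by the user, and you must follow them strictly.\n"
        "{% for key, value in extra_docs.items() %}### {{ key }}\n{{ value }}\n{% endfor %}"
        "{% endif %}"
    )

    print(block.render(extra_docs={"style.md": "Answer with bullet points."}))
    print(repr(block.render(extra_docs={})))  # empty dict -> '' (section omitted)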
autocoder/version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.1.346"
+__version__ = "0.1.348"
{auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/LICENSE
File without changes

{auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/WHEEL
File without changes

{auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/entry_points.txt
File without changes

{auto_coder-0.1.346.dist-info → auto_coder-0.1.348.dist-info}/top_level.txt
File without changes