auto-coder 0.1.286__py3-none-any.whl → 0.1.288__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic. Click here for more details.

@@ -329,8 +329,7 @@ class AutoCoderArgs(pydantic.BaseModel):
329
329
  rag_build_name: Optional[str] = None
330
330
  disable_auto_window: bool = False
331
331
  filter_batch_size: Optional[int] = 5
332
- disable_segment_reorder: bool = False
333
- rag_doc_filter_relevance: int = 5
332
+ disable_segment_reorder: bool = False
334
333
  tokenizer_path: Optional[str] = None
335
334
  skip_confirm: Optional[bool] = False
336
335
  silence: Optional[bool] = False
@@ -11,6 +11,9 @@ from spire.doc import Document
11
11
  from spire.doc import ImageType
12
12
  from PIL import Image
13
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ import json
15
+ from byzerllm.utils.client import code_utils
16
+
14
17
  class ImageInfo(pydantic.BaseModel):
15
18
  """
16
19
  图片信息
@@ -35,7 +38,10 @@ class Anything2Img:
35
38
  keep_conversion: bool = False,
36
39
  ):
37
40
  self.llm = llm
38
- self.vl_model = llm.get_sub_client("vl_model")
41
+ if llm.get_sub_client("vl_model"):
42
+ self.vl_model = llm.get_sub_client("vl_model")
43
+ else:
44
+ self.vl_model = self.llm
39
45
  self.args = args
40
46
  self.output_dir = args.output
41
47
  os.makedirs(self.output_dir, exist_ok=True)
@@ -45,28 +51,59 @@ class Anything2Img:
45
51
  def analyze_image(self, image_path: str) -> str:
46
52
  """
47
53
  {{ image }}
48
- 图片中一般包含文字,图片,图表。分析图片,返回该图片包含的文本内容以及图片位置信息。
49
- 请遵循以下格式返回:
54
+ 图片中一般可能包含文字,图片,图表三种元素,给出每种元素的bounding box坐标。
55
+ bouding box 使用 (xmin, ymin, xmax, ymax) 来表示,其中xmin, ymin: 表示矩形左上角的坐标
56
+ xmax, ymax: 表示矩形右下角的坐标
50
57
 
58
+ 最后按如下格式返回:
51
59
  ```json
52
60
  {
53
- "text": "页面的文本内容",
54
- "images": [
61
+ "objects": [
62
+ {
63
+ "type": "image",
64
+ "bounding_box": [xmin, ymin, xmax, ymax],
65
+ "text": "图片描述"
66
+ },
55
67
  {
56
- "coordinates": [x1, y1, x2, y2],
57
- "text": "对图片的描述"
68
+ "type": "text",
69
+ "bounding_box": [xmin, ymin, xmax, ymax],
70
+ "text": "文本内容"
58
71
  }
59
- ],
60
- "width": 页面宽度,
61
- "height": 页面高度
72
+ ,
73
+ {
74
+ "type": "table",
75
+ "bounding_box": [xmin, ymin, xmax, ymax],
76
+ "text": "表格的markdown格式"
77
+ }
78
+ ...
79
+ ]
62
80
  }
63
81
  ```
82
+ """
83
+ image = byzerllm.Image.load_image_from_path(image_path)
84
+ return {"image": image}
85
+
86
+ @byzerllm.prompt()
87
+ def detect_objects(self, image_path: str) -> str:
88
+ """
89
+ {{ image }}
90
+ 请分析这张图片,识别图片中图片,并给出每个图片的bounding box坐标。
91
+ bouding box 使用 (xmin, ymin, xmax, ymax) 来表示,其中xmin, ymin: 表示矩形左上角的坐标
92
+ xmax, ymax: 表示矩形右下角的坐标
64
93
 
65
- 注意:
66
- 1. 其中x1,y1是左上角坐标,x2,y2是右下角坐标,使用绝对坐标,也就是图片的像素坐标。
67
- 2. 文本内容应保持原有的段落格式
68
- 3. width和height是页面宽度,高度,要求整数类型
69
- 4. 格局图片中文本和图片的位置关系,在文本中使用 <image_placeholder> 来表示图片。
94
+ 最后按如下格式返回:
95
+ ```json
96
+ {
97
+ "objects": [
98
+ {
99
+ "bounding_box": [xmin, ymin, xmax, ymax],
100
+ "text": "图片描述"
101
+ },
102
+ ...
103
+ ]
104
+ }
105
+ ```
106
+
70
107
  """
71
108
  image = byzerllm.Image.load_image_from_path(image_path)
72
109
  return {"image": image}
@@ -141,12 +178,12 @@ class Anything2Img:
141
178
  else:
142
179
  image_paths = self.convert(file_path)[0:size]
143
180
 
144
- pages: List[Page] = []
181
+ pages_results = []
145
182
  # 使用线程池并行分析图片
146
183
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
147
184
  futures = {
148
185
  executor.submit(
149
- self.analyze_image.with_llm(self.vl_model).with_return_type(Page).run,
186
+ self.analyze_image.with_llm(self.vl_model).run,
150
187
  image_path
151
188
  ): image_path for image_path in image_paths
152
189
  }
@@ -155,7 +192,11 @@ class Anything2Img:
155
192
  image_path = futures[future]
156
193
  try:
157
194
  result = future.result()
158
- pages.append(result)
195
+ # 解析JSON结果
196
+ result_json = code_utils.extract_code(result)[-1][1]
197
+ result_dict = json.loads(result_json)
198
+ # 存储结果和对应的图像路径
199
+ pages_results.append((result_dict, image_path))
159
200
  logger.info(f"Analyzed {image_path}")
160
201
  except Exception as e:
161
202
  logger.error(f"Failed to analyze {image_path}: {str(e)}")
@@ -163,34 +204,63 @@ class Anything2Img:
163
204
  # 生成Markdown内容
164
205
  markdown_content = []
165
206
 
166
- # 遍历每个页面和对应的图片路径
167
- for page, image_path in zip(pages, image_paths):
168
- # 处理页面中的每个图片
169
- for img in page.images:
170
- # 打开原始图片
171
- original_image = Image.open(image_path)
172
-
173
- # 获得坐标
174
- x1 = img.coordinates[0]
175
- y1 = img.coordinates[1]
176
- x2 = img.coordinates[2]
177
- y2 = img.coordinates[3]
178
-
179
- # 截取图片
180
- cropped_image = original_image.crop((x1, y1, x2, y2))
207
+ # 遍历每个页面的分析结果
208
+ for page_result, image_path in pages_results:
209
+ page_markdown = []
210
+
211
+ # 按照对象类型分别处理文本、图片和表格
212
+ text_objects = []
213
+ image_objects = []
214
+ table_objects = []
215
+
216
+ for obj in page_result.get("objects", []):
217
+ obj_type = obj.get("type", "")
218
+ if obj_type == "text":
219
+ text_objects.append(obj)
220
+ elif obj_type == "image":
221
+ image_objects.append(obj)
222
+ elif obj_type == "table":
223
+ table_objects.append(obj)
224
+
225
+ # 按照垂直位置排序所有对象
226
+ all_objects = text_objects + image_objects + table_objects
227
+ all_objects.sort(key=lambda x: x.get("bounding_box", [0, 0, 0, 0])[1]) # 按y坐标排序
228
+
229
+ # 处理所有对象并生成markdown
230
+ for obj in all_objects:
231
+ obj_type = obj.get("type", "")
232
+ bbox = obj.get("bounding_box", [0, 0, 0, 0])
233
+ content = obj.get("text", "")
181
234
 
182
- # 保存截取后的图片
183
- cropped_image_path = os.path.join(images_dir, f"cropped_{os.path.basename(image_path)}")
184
- cropped_image.save(cropped_image_path)
235
+ if obj_type == "text":
236
+ # 直接添加文本内容
237
+ page_markdown.append(content)
185
238
 
186
- # 将图片路径转换为Markdown格式
187
- image_markdown = f"![{img.text}]({cropped_image_path})"
239
+ elif obj_type == "image":
240
+ # 处理图片
241
+ original_image = Image.open(image_path)
242
+
243
+ # 提取图片区域
244
+ x1, y1, x2, y2 = [int(coord) for coord in bbox]
245
+ cropped_image = original_image.crop((x1, y1, x2, y2))
246
+
247
+ # 生成唯一文件名
248
+ image_filename = f"img_{os.path.basename(image_path)}_{x1}_{y1}_{x2}_{y2}.png"
249
+ cropped_image_path = os.path.join(images_dir, image_filename)
250
+ cropped_image.save(cropped_image_path)
251
+
252
+ # 添加图片的markdown
253
+ image_markdown = f"![{content}]({cropped_image_path})"
254
+ page_markdown.append(image_markdown)
188
255
 
189
- # 替换文本中的<image_placeholder>为实际的图片Markdown
190
- page.text = page.text.replace("<image_placeholder>", image_markdown, 1)
256
+ elif obj_type == "table":
257
+ # 对表格内容进行处理,它已经是markdown格式
258
+ page_markdown.append(content)
191
259
 
192
- # 将处理后的页面文本添加到Markdown内容中
193
- markdown_content.append(page.text)
260
+ # 将页面内容合并为字符串
261
+ page_content = "\n\n".join(page_markdown)
262
+ markdown_content.append(page_content)
194
263
 
195
264
  # 将所有页面内容合并为一个Markdown文档
196
- return '\n\n'.join(markdown_content)
265
+ return '\n\n---\n\n'.join(markdown_content)
266
+
@@ -172,6 +172,20 @@ MESSAGES = {
172
172
  "file_snippet_procesed": "{{ file_path }} processed with tokens: {{ tokens }} => {{ snippet_tokens }}. Current total tokens: {{ total_tokens }}",
173
173
  "tool_ask_user": "Your Reply: ",
174
174
  "tool_ask_user_accept":"Your Response received",
175
+ "auto_web_analyzing": "Analyzing web automation task...",
176
+ "auto_web_analyzed": "Web automation task analysis completed",
177
+ "executing_web_action": "Executing action: {{action}} - {{description}}",
178
+ "executing_step": "Executing step {{step}}: {{description}}",
179
+ "operation_cancelled": "Operation cancelled",
180
+ "element_not_found": "Element not found: {{element}}",
181
+ "analyzing_results": "Analyzing execution results...",
182
+ "next_steps_determined": "Next steps determined",
183
+ "max_iterations_reached": "Max iterations reached ({max_iterations})",
184
+ "action_verification_failed": "Action verification failed: {{action}} - {{reason}}",
185
+ "action_succeeded": "Action succeeded: {{action}}",
186
+ "replanned_actions": "Replanned {{count}} actions",
187
+ "web_automation_ask_user": "Your answer: " # 新增消息
188
+
175
189
 
176
190
  },
177
191
  "zh": {
@@ -342,6 +356,19 @@ MESSAGES = {
342
356
  "file_snippet_procesed": "文件 {{ file_path }} 处理后token数: {{ tokens }} => {{ snippet_tokens }} 当前总token数: {{ total_tokens }}",
343
357
  "tool_ask_user": "您的回复: ",
344
358
  "tool_ask_user_accept":"收到您的回复",
359
+ "auto_web_analyzing": "正在分析网页自动化任务...",
360
+ "auto_web_analyzed": "网页自动化任务分析完成",
361
+ "executing_web_action": "执行操作: {{action}} - {{description}}",
362
+ "executing_step": "执行步骤 {{step}}: {{description}}",
363
+ "operation_cancelled": "操作已取消",
364
+ "element_not_found": "未找到元素: {{element}}",
365
+ "analyzing_results": "分析执行结果...",
366
+ "next_steps_determined": "已确定下一步操作",
367
+ "max_iterations_reached": "已达到最大迭代次数 {{max_iterations}}",
368
+ "action_verification_failed": "操作验证失败: {{action}} - {{reason}}",
369
+ "action_succeeded": "操作成功: {{action}}",
370
+ "replanned_actions": "已重新规划 {{count}} 个操作",
371
+ "web_automation_ask_user": "您的回答: " # 新增消息
345
372
  }}
346
373
 
347
374