computer-use-ootb-internal 0.0.108__py3-none-any.whl → 0.0.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- computer_use_ootb_internal/app_teachmode.py +8 -14
- computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
- computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
- computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
- computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
- computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
- computer_use_ootb_internal/run_teachmode_ootb_args.py +23 -11
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/METADATA +1 -1
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/RECORD +11 -16
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL +0 -0
- {computer_use_ootb_internal-0.0.108.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/entry_points.txt +0 -0
computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py
DELETED
@@ -1,676 +0,0 @@
|
|
1
|
-
import cv2
|
2
|
-
import numpy as np
|
3
|
-
import datetime
|
4
|
-
import requests
|
5
|
-
import importlib
|
6
|
-
# import textdistance
|
7
|
-
import glob
|
8
|
-
from PIL import Image
|
9
|
-
import ctypes
|
10
|
-
|
11
|
-
from .text_detection import text_detection
|
12
|
-
from .icon_detection.icon_detection import detect_icons, IconDetector
|
13
|
-
from .utils import *
|
14
|
-
|
15
|
-
import os
|
16
|
-
|
17
|
-
from importlib import resources
|
18
|
-
import pathlib
|
19
|
-
import computer_use_ootb_internal
|
20
|
-
|
21
|
-
class GUIParser():
    """Build a nested description of the GUI shown in a screenshot.

    Depending on ``mode`` the description is derived from UI Automation
    (UIA) control data, from OCR text, from template-matched icons, or from
    a combination of those.  Every result is a dict that maps a window name
    to a list of panel dicts; a panel carries at least ``name``,
    ``rectangle`` (``[left, top, right, bottom]``) and ``elements``.
    """

    name = "gui_parser"
    description = """
    This tool can extract the information of screenshot.
    Invoke command: gui_parser(query, visual[i])
    :param query -> str, specific command. visual[i] -> image, the latest screenshot.
    """

    # Vertical distance (in pixels) beyond which the next element opens a new row.
    _ROW_GAP = 15

    def __init__(self, cache_folder=".cache/", user_id=None, trace_id=None, scaleFactor="1.0x"):
        """Create the cache folders and resolve the icon database location.

        Args:
            cache_folder: Directory for cached data; an ``ocr`` sub-folder is
                created inside it.
            user_id: Path component used to locate per-user icon templates.
            trace_id: Path component used to locate per-trace icon templates.
            scaleFactor: Display scale string (for example ``"1.0x"``) that is
                forwarded to the icon detector.
        """
        cache_path = pathlib.Path(cache_folder)
        # parents=True lets a nested cache location be created in one go.
        cache_path.mkdir(parents=True, exist_ok=True)
        (cache_path / "ocr").mkdir(exist_ok=True)

        super().__init__()
        self.cache_folder = str(cache_path)
        print(f"Cache folder: {self.cache_folder}")

        self.task_id = get_current_time()
        self.parsers = {}
        self.temperature = 0
        self.gui_parser = None
        self.icon_template_threshold = 0.7
        # get_panel_uia() reads this attribute; it used to be created only by
        # the "uia" branch of __call__, so teach mode and direct calls to
        # get_panel_uia() failed with AttributeError on a fresh instance.
        self.exclude_class_name_list = []
        try:
            ootb_path = os.getenv("OOTB_PATH")
            if ootb_path:
                self.ootb_database_path = pathlib.Path(ootb_path) / "ootbdatabase"
            else:
                # Fall back to the packaged resources when the env var is unset.
                with resources.path(__package__, "ootbdatabase") as db_path:
                    self.ootb_database_path = db_path
        except Exception as e:
            print(f"Error: {e}. OOTB_PATH is not set correctly.")
        self.user_id = user_id
        self.trace_id = trace_id
        self.scaleFactor = scaleFactor
        self.width = None
        self.height = None

    def __call__(self, uia_data=None, screenshot_path=None, query=None, mode="uia", ocr_mode="googleocr"):
        """Parse the screenshot (and optionally UIA data) into a GUI description.

        Args:
            uia_data: Mapping of window name to a list of UIA control dicts;
                used by the ``"uia"`` and ``"teach"`` modes.
            screenshot_path: Path of the screenshot image.
            query: In ``"teach"`` mode, a list whose image-path entries name
                the icon templates to look for.
            mode: ``"teach"`` (icons + OCR merged into UIA), ``"uia"`` (UIA
                only), ``"icon_ocr"`` (icon detection + OCR); any other value
                selects OCR only.
            ocr_mode: Accepted for interface compatibility; not read here.

        Returns:
            The parsed GUI dict, which is also stored in ``self.parsed_gui``.

        Raises:
            FileNotFoundError: If the screenshot does not exist.
        """
        screenshot_path = pathlib.Path(screenshot_path)
        if not screenshot_path.exists():
            raise FileNotFoundError(f"Screenshot not found: {screenshot_path}")

        # Only the size is needed, so close the image file straight away.
        with Image.open(screenshot_path) as image:
            self.width, self.height = image.size

        if mode == "teach":
            icon_items = self._detect_teach_icons(query, screenshot_path)

            # `ocr` used to be read here without ever being assigned
            # (NameError).  Run OCR the same way the other screenshot-based
            # branches do.  NOTE(review): confirm OCR is wanted in teach mode.
            ocr = text_detection(screenshot_path)
            ocr_elements = []
            if ocr and "texts" in ocr:
                ocr_elements = [
                    {
                        "name": text_item["content"],
                        "rectangle": text_item["bbox"],
                        "is_text": True,
                    }
                    for text_item in ocr["texts"]
                ]

            # Icons first, then text, grouped into rows under a single panel.
            all_elements = list(icon_items) + ocr_elements
            ocr_icon_parsed_gui = {
                "Screen": [{
                    "name": "Main Content",
                    "rectangle": [0, 0, self.width, self.height],
                    "elements": self.merge_elements({"combined_elements": all_elements}),
                }]
            }

            uia_parsed_gui = self.get_panel_uia(uia_data)
            self.postprocess_uia(uia_parsed_gui)

            self.parsed_gui = self.merge_uia_and_teach_results(uia_parsed_gui, ocr_icon_parsed_gui)
            return self.parsed_gui

        if mode == "uia":
            # No control class is excluded at the moment; earlier revisions
            # kept per-application exclude lists (browser / office) here.
            self.exclude_class_name_list = []
            self.parsed_gui = self.get_panel_uia(uia_data)
            self.postprocess_uia(self.parsed_gui)
            return self.parsed_gui

        # "icon_ocr" and the OCR-only fallback share everything but detection.
        ocr = text_detection(screenshot_path)
        icon_items = []
        if mode == "icon_ocr":
            icon_template_folder = os.path.join(self.ootb_database_path, "ootbdatabase/teach_mode")
            icon_detector = IconDetector(icon_template_folder)
            # NOTE(review): postprocess_icon() currently ignores icon_items,
            # so the detection result does not reach the output.
            icon_items = [
                icon_detector(
                    screenshot_path=screenshot_path,
                    mode="full_detect",
                    threshold=0.6,
                    scale_factor=self.scaleFactor,
                )
            ]

        self.parsed_gui = self.postprocess_icon("screen", icon_items, screenshot_path)
        for panel_item in self.parsed_gui["screen"]:
            text_rows = {"editing_control": self.get_text(panel_item, ocr, screenshot_path)}
            # postprocess_icon() initialises "elements" to None, so the old
            # `+=` raised TypeError; start from an empty list instead.
            panel_item["elements"] = (panel_item["elements"] or []) + self.merge_elements(text_rows)

        return self.parsed_gui

    def _detect_teach_icons(self, query, screenshot_path):
        """Return the icons found for the image paths listed in ``query``.

        An empty list is returned when ``query`` is not a list with more than
        one entry, names no image path, or nothing is detected.
        """
        if not (query and isinstance(query, list) and len(query) > 1):
            return []

        specific_icon_names = [item for item in query if self.is_image_path(item)]
        if not specific_icon_names:
            return []

        icon_template_folder = self.ootb_database_path / self.user_id / self.trace_id / "icons"
        icon_template_folder.mkdir(parents=True, exist_ok=True)

        print(f"icon_template_folder: {icon_template_folder}")
        icon_detector = IconDetector(str(icon_template_folder))

        detected_items = icon_detector(
            screenshot_path=screenshot_path,
            mode="teach",
            threshold=self.icon_template_threshold,  # TODO: setting on init
            scale_factor=self.scaleFactor,
            specific_icon_names=specific_icon_names,
        )
        if not detected_items:
            return []

        print("detected_icons:", detected_items)
        return detected_items

    def if_browser(self, window_name):
        """Return True when ``window_name`` contains a known browser name."""
        return any(browser in window_name for browser in ("Edge", "Chrome", "Firefox", "Safari", "Opera"))

    def if_adobe(self, window_name):
        """Return True when ``window_name`` contains a known Adobe product name."""
        return any(
            adobe in window_name
            for adobe in ("Adobe Premiere Pro", "Adobe After Effect", "Adobe Acrobat")
        )

    def if_office(self, window_name):
        """Return True when ``window_name`` contains a known Office product name."""
        return any(office in window_name for office in ("Excel", "Word", "PowerPoint", "Outlook"))

    @staticmethod
    def is_image_path(text):
        """Return True when ``text`` ends with a common image file extension."""
        image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
        return text.endswith(image_extensions)

    @staticmethod
    def get_text(panel_item, ocr, screenshot_path):
        """Group the OCR texts that lie inside a panel into left-to-right rows.

        Args:
            panel_item: Panel dict with a ``rectangle`` entry.
            ocr: OCR result with a ``texts`` list of ``{"content", "bbox"}``
                dicts; a falsy value is treated as "no text".
            screenshot_path: Unused; kept for interface compatibility.

        Returns:
            A list of rows, each a list of ``{"name", "rectangle"}`` dicts
            sorted by their left edge.  Empty when no text is in the panel.
        """
        panel_rect = panel_item["rectangle"]
        texts = ocr.get("texts", []) if ocr else []

        # Keep only the texts whose box lies completely inside the panel.
        panel_texts = [
            item for item in texts
            if item["bbox"][0] >= panel_rect[0]
            and item["bbox"][1] >= panel_rect[1]
            and item["bbox"][2] <= panel_rect[2]
            and item["bbox"][3] <= panel_rect[3]
        ]
        if not panel_texts:
            return []

        sorted_panel_texts = sorted(panel_texts, key=lambda x: (x["bbox"][1], x["bbox"][0]))

        # A new row starts whenever the top edge jumps by more than the row
        # gap relative to the previous text.
        editing_controls = []
        current_row = []
        previous_y = sorted_panel_texts[0]["bbox"][1]
        for item in sorted_panel_texts:
            y1 = item["bbox"][1]
            if abs(y1 - previous_y) > GUIParser._ROW_GAP and current_row:
                editing_controls.append(current_row)
                current_row = []
            current_row.append({"name": item["content"], "rectangle": item["bbox"]})
            previous_y = y1
        if current_row:
            editing_controls.append(current_row)

        return [sorted(row, key=lambda x: x["rectangle"][0]) for row in editing_controls]

    def postprocess_icon(self, window_name, icon_items, screenshot_path):
        """Return a single full-screen "Main Content" panel under ``window_name``.

        The panel's ``elements`` entry is ``None``; ``icon_items`` is accepted
        but not used.
        """
        with Image.open(screenshot_path) as image:
            width, height = image.size
        return {
            window_name: [
                {
                    "name": "Main Content",
                    "rectangle": [0, 0, width, height],
                    "elements": None,
                }
            ]
        }

    @staticmethod
    def merge_elements(panel_item):
        """Merge all element lists of ``panel_item`` into rows.

        Args:
            panel_item: Dict whose list values hold element dicts (each with a
                ``rectangle``) or rows of such dicts, as produced by
                :meth:`get_text`.

        Returns:
            A list of rows ordered top to bottom; each row is ordered left to
            right.  Elements share a row while their vertical centres stay
            within the row gap of the previous element.
        """
        if not panel_item or not any(panel_item.values()):
            return []

        all_elements = []
        for value in panel_item.values():
            if not isinstance(value, list):
                continue
            for entry in value:
                # get_text() hands over rows (lists of elements); flatten them
                # so that every entry below is an element dict.
                if isinstance(entry, list):
                    all_elements.extend(entry)
                else:
                    all_elements.append(entry)
        if not all_elements:
            return []

        def y_center(element):
            return (element["rectangle"][1] + element["rectangle"][3]) / 2

        merged_control = []
        current_row = []
        previous_y = None
        for element in sorted(all_elements, key=y_center):
            center = y_center(element)
            if previous_y is not None and abs(center - previous_y) > GUIParser._ROW_GAP and current_row:
                merged_control.append(current_row)
                current_row = []
            current_row.append(element)
            previous_y = center
        if current_row:
            merged_control.append(current_row)

        for row in merged_control:
            row.sort(key=lambda x: x["rectangle"][0])
        # Rows are finally ordered by the vertical centre of their first element.
        merged_control.sort(key=lambda row: y_center(row[0]))
        return merged_control

    def postprocess_uia(self, metadata):
        """Fix TitleBar rectangles and sort every panel's elements in place.

        For panels of class ``TitleBar`` that have elements, the rectangle is
        replaced by the bounding box of those elements.  Every panel's
        ``elements`` is replaced by ``sort_elements_by_xy(elements)``.

        Returns:
            The same ``metadata`` object.
        """
        for window_data in metadata.values():
            for element_data in window_data:
                elements = element_data.get("elements", [])

                if element_data["class_name"] == "TitleBar" and len(elements) > 0:
                    rects = [e.get("rectangle", [0, 0, 0, 0]) for e in elements]
                    element_data["rectangle"] = [
                        min(r[0] for r in rects),
                        min(r[1] for r in rects),
                        max(r[2] for r in rects),
                        max(r[3] for r in rects),
                    ]

                element_data["elements"] = sort_elements_by_xy(elements)

        return metadata

    @staticmethod
    def _overlap_area(rect1, rect2):
        """Return the intersection area of two ``[l, t, r, b]`` rectangles."""
        x1 = max(rect1[0], rect2[0])
        y1 = max(rect1[1], rect2[1])
        x2 = min(rect1[2], rect2[2])
        y2 = min(rect1[3], rect2[3])
        if x1 >= x2 or y1 >= y2:
            return 0
        return (x2 - x1) * (y2 - y1)

    @staticmethod
    def _is_significant_overlap(ocr_element, uia_element):
        """Return True when more than half of the OCR box lies in the UIA box."""
        ocr_rect = ocr_element["rectangle"]
        ocr_area = (ocr_rect[2] - ocr_rect[0]) * (ocr_rect[3] - ocr_rect[1])
        if ocr_area <= 0:
            return False
        return GUIParser._overlap_area(ocr_rect, uia_element["rectangle"]) > 0.5 * ocr_area

    def merge_uia_and_teach_results(self, uia_result, teach_result):
        """Add the teach-mode elements to the UIA result as an ``"OCR"`` window.

        Text elements (``is_text`` truthy) are dropped when more than half of
        their area is covered by some UIA element; all other elements are
        always kept.

        Args:
            uia_result: Parsed UIA dict whose panel ``elements`` are rows.
            teach_result: Dict with a ``"Screen"`` list of panels whose
                ``elements`` are rows.

        Returns:
            A shallow copy of ``uia_result`` with one extra ``"OCR"`` entry
            holding a single full-screen panel with one row of kept elements.
        """
        merged_result = uia_result.copy()

        # Flatten the UIA elements once instead of re-walking every window
        # for each text element.
        uia_elements = [
            uia_element
            for panels in merged_result.values()
            for panel in panels
            for elements_list in (panel.get("elements") or [])
            for uia_element in elements_list
        ]

        all_ocr_elements = []
        for screen in teach_result.get("Screen", []):
            for elements_row in screen.get("elements", []):
                for element in elements_row:
                    # .get(): detected icons need not carry "is_text"; the old
                    # element["is_text"] raised KeyError for them.
                    if element.get("is_text") and any(
                        self._is_significant_overlap(element, uia_element)
                        for uia_element in uia_elements
                    ):
                        continue
                    all_ocr_elements.append(element)

        merged_result["OCR"] = [{
            "name": "Main Content",
            "rectangle": [0, 0, self.width, self.height],
            "elements": [all_ocr_elements],
        }]
        return merged_result

    @staticmethod
    def _control_name(child_texts):
        """Derive a display name from a control's ``texts`` property."""
        if not child_texts:
            return ""
        if isinstance(child_texts[0], list):
            pieces = []
            for item in child_texts:
                if isinstance(item, list) and item and isinstance(item[0], str):
                    pieces.append("".join(item))
                elif isinstance(item, str):
                    pieces.append(item)
            return "".join(pieces)
        return child_texts[0]

    @staticmethod
    def _default_panel_name(friendly_class_name, software_name):
        """Return the fallback name for an unnamed panel ("" if none applies)."""
        if friendly_class_name == "TitleBar":
            return "Title Bar"
        if friendly_class_name == "Document":
            return "Main Content"
        if friendly_class_name == "Pane":
            if software_name == "Taskbar":
                return "Taskbar"
            if "Edge" in software_name or "Chrome" in software_name or "Firefox" in software_name:
                return "Browser Content"
            return "Navigation Bar" if software_name in ["web", "web video"] else "Main Content"
        return ""

    def _collect_controls(self, control_info, dialog_components, depth, software_name, is_browser):
        """Recursively append the usable descendants of ``control_info``.

        Accepted controls are appended to ``dialog_components["elements"]``
        and ``dialog_components["count"]`` is advanced for each of them.
        """
        for child_control in control_info.get("children", []) or []:
            child_properties = child_control.get("properties", {})
            child_friendly_class_name = child_properties.get("friendly_class_name", "")

            # Excluded control classes are skipped together with their subtree.
            if child_friendly_class_name in self.exclude_class_name_list:
                continue

            child_properties_name = self._control_name(child_properties.get("texts", []))
            # Unnamed edit and combo boxes get a generic name.
            if child_friendly_class_name in ["Edit", "ComboBox"] and not child_properties_name:
                child_properties_name = "Search Bar"

            rectangle = child_properties.get("rectangle", [0, 0, 0, 0])
            automation_id = child_properties.get("automation_id", "")

            # NOTE: after the exclusion check above the last clause of the
            # first group is always true; it is kept to document the intent.
            should_process = (
                # Taskbar controls.
                (software_name == "Taskbar" and child_friendly_class_name in ["Button", "Static", "Image"]) or
                # Browser controls.
                (is_browser and child_friendly_class_name in ["Pane", "Button", "Edit", "Tab", "TabItem", "Document", "Group"]) or
                # Regular controls: leaves, identified controls, non-excluded classes.
                (not child_control.get("children", []) or
                 automation_id or
                 child_friendly_class_name not in self.exclude_class_name_list)
            ) and (
                len(rectangle) == 4 and
                not all(element == 0 for element in rectangle) and
                # An empty name is acceptable when an automation_id exists.
                (child_properties_name not in ["", '"'] or automation_id)
            )

            if should_process:
                left, top, right, bottom = rectangle
                dialog_rect = dialog_components.get("rectangle", [0, 0, 0, 0])
                if len(dialog_rect) != 4:
                    dialog_rect = [0, 0, 0, 0]
                left_bound, top_bound, right_bound, bottom_bound = dialog_rect

                if software_name not in ["Taskbar"] and not is_browser:
                    # Clip the control to its panel; taskbar and browser
                    # coordinates are kept as reported.
                    clipped = [
                        max(left, left_bound),
                        max(top, top_bound),
                        min(right, right_bound),
                        min(bottom, bottom_bound),
                    ]
                else:
                    clipped = [left, top, right, bottom]
                # As before, the clipped rectangle is written back to the input.
                child_properties["rectangle"] = clipped

                # Only rectangles with positive width and height are kept.
                if clipped[0] < clipped[2] and clipped[1] < clipped[3]:
                    element_info = {
                        "name": child_properties_name.replace("\u200b", ""),
                        "rectangle": clipped,
                        "class_name": child_friendly_class_name,
                        "type": ["Click", "rightClick"],
                        "depth": f"{depth}-{dialog_components.get('count', 1)}",
                    }
                    if automation_id:
                        element_info["automation_id"] = automation_id

                    dialog_components.setdefault("elements", []).append(element_info)
                    dialog_components["count"] = dialog_components.get("count", 1) + 1

            # Descend regardless of whether this control itself was kept.
            self._collect_controls(child_control, dialog_components, depth, software_name, is_browser)

    def get_panel_uia(self, control_info_list):
        """Convert raw UIA control trees into per-window panel descriptions.

        Args:
            control_info_list: Mapping of window (software) name to a list of
                control dicts, each with ``properties`` and ``children``.

        Returns:
            A dict mapping each window that has controls to its panels.  A
            panel has ``name``, ``rectangle``, ``class_name``, ``depth``,
            ``elements`` (flat list of accepted descendants), ``count`` and,
            when available, ``automation_id``.
        """
        # Localised panel titles and their English names.
        text2panel_name = {
            "新建会话属性": "New session properties",
            "会话管理器": "Session manager",
            "任务栏": "Taskbar",
        }
        panel_types = ["Dialog", "Pane", "GroupBox", "TitleBar", "Menu", "Document", "ListBox", "AppBar", "Tab", "Group"]

        dialog_components = {}
        for software_name, controls in control_info_list.items():
            if not controls:
                continue

            dialog_components[software_name] = []
            # The browser test depends on the window only, so do it once here.
            is_browser = self.if_browser(software_name)

            for control_info in controls:
                control_properties = control_info.get("properties", {})
                friendly_class_name = control_properties.get("friendly_class_name", "")

                # Only container-like controls that have children become panels.
                if not (friendly_class_name in panel_types and control_info.get("children")):
                    continue

                control_texts = control_properties.get("texts", [""])
                control_name = control_texts[0] if control_texts else ""
                automation_id = control_properties.get("automation_id", "")

                if not control_name:
                    # Prefer the automation_id, then a class-based default.
                    control_name = automation_id or self._default_panel_name(friendly_class_name, software_name)
                control_name = text2panel_name.get(control_name, control_name)

                panel_info = {
                    "name": control_name,
                    "rectangle": control_properties.get("rectangle", [0, 0, 0, 0]),
                    "class_name": friendly_class_name,
                    "depth": "1",
                    "elements": [],
                    "count": 1,
                }
                if automation_id:
                    panel_info["automation_id"] = automation_id

                dialog_components[software_name].append(panel_info)
                self._collect_controls(control_info, panel_info, "1", software_name, is_browser)

        return dialog_components
|
589
|
-
|
590
|
-
import matplotlib.pyplot as plt
|
591
|
-
import cv2
|
592
|
-
import matplotlib.font_manager as fm
|
593
|
-
|
594
|
-
def show_img_with_box_points(img, box, ax, color="red"):
    """Draw the outline of ``box`` on the matplotlib axes ``ax``.

    Args:
        img: Unused; kept for interface compatibility.
        box: ``[left, top, right, bottom]`` rectangle.
        ax: Axes object that receives the rectangle patch.
        color: Edge colour of the outline.
    """
    left, top = box[0], box[1]
    width = box[2] - box[0]
    height = box[3] - box[1]
    # Fully transparent face: only the thin outline is visible.
    outline = plt.Rectangle(
        (left, top), width, height, edgecolor=color, facecolor=(0, 0, 0, 0), lw=0.4
    )
    ax.add_patch(outline)
|
598
|
-
|
599
|
-
def extract_second_underscore_content(filename):
    """Return the second ``_``-separated field of names that contain ``@``.

    Names without an ``@`` or without any underscore are returned unchanged.
    """
    if "@" not in filename:
        return filename
    pieces = filename.split("_")
    return pieces[1] if len(pieces) > 1 else filename
|
604
|
-
|
605
|
-
def show_guiParserd(gui, img_path):
    """Display the screenshot with every parsed panel and element outlined.

    Panels are drawn in green with their name (when shorter than 100
    characters).  Elements are drawn in blue, or in green with a
    "Current Step Icon" caption when flagged as ``current_step_icon``.

    Args:
        gui: Parsed GUI dict mapping a window name to a list of panels whose
            ``elements`` are rows of element dicts.
        img_path: Path of the screenshot to draw on.
    """
    canvas = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    plt.figure(dpi=500)
    plt.axis('off')
    plt.imshow(canvas)

    # Elements of all panels are collected first and drawn afterwards.
    item_cache = []
    for windows_name, panel_list in gui.items():
        print("windows_name: ", windows_name)
        print("panel_list: ", panel_list)
        for panel in panel_list:
            show_img_with_box_points(canvas, panel["rectangle"], plt.gca(), color="green")
            if len(panel["name"]) < 100:
                plt.text(panel["rectangle"][0], panel["rectangle"][1], panel["name"],
                         fontsize=4, color='green')
            for row in panel['elements']:
                item_cache.extend(row)

    for item in item_cache:
        # One malformed element must not abort the whole drawing.
        try:
            if "rectangle" not in item:
                continue
            # Very long names are treated as noise and skipped.
            if len(item["name"]) > 200:
                continue
            box = item["rectangle"]
            is_current = "current_step_icon" in item.keys() and item["current_step_icon"]
            if is_current:
                show_img_with_box_points(canvas, box, plt.gca(), color="green")
                plt.text(box[0], box[3]+25, "Current Step Icon", fontsize=3, color='green')
            else:
                show_img_with_box_points(canvas, box, plt.gca(), color="blue")
            plt.text(box[0], box[1], extract_second_underscore_content(item["name"]),
                     fontsize=2, color='red')
        except Exception as e:
            print(e)

    plt.show()
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
def get_screen_info():
    """Return the display scale factor as a string such as ``"1.5x"``.

    The value comes from ``shcore.GetScaleFactorForDevice(0)`` divided by
    100.  When that call is unavailable or fails, the error is printed and
    ``"1.0x"`` is returned.
    """
    try:
        ratio = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
        return str(ratio) + "x"
    except Exception as e:
        print(e)
        return "1.0x"
|
655
|
-
|
656
|
-
def parse_gui(user_id, trace_id, screenshot_path, user_scaleFactor="auto", uia_data=None, query=None, mode="uia", ocr_mode="googleocr"):
    """Create a :class:`GUIParser` and parse one screenshot with it.

    Args:
        user_id: Forwarded to the parser (locates per-user icon templates).
        trace_id: Forwarded to the parser (locates per-trace icon templates).
        screenshot_path: Path of the screenshot to parse.
        user_scaleFactor: Scale string such as ``"1.0x"``; ``"auto"`` queries
            the system through :func:`get_screen_info`.
        uia_data: UI Automation data forwarded to the parser.
        query: Query forwarded to the parser.
        mode: Parsing mode forwarded to the parser.
        ocr_mode: OCR mode forwarded to the parser.

    Returns:
        The parsed GUI dict produced by the parser.
    """
    # TODO: check mult-screens
    scale = get_screen_info() if user_scaleFactor == "auto" else user_scaleFactor
    print("user_scaleFactor: ", scale)
    parser = GUIParser(user_id=user_id, trace_id=trace_id, scaleFactor=scale)
    return parser(
        uia_data=uia_data,
        screenshot_path=screenshot_path,
        query=query,
        mode=mode,
        ocr_mode=ocr_mode,
    )
|
663
|
-
|
664
|
-
import json
|
665
|
-
import sys
|
666
|
-
if __name__ == "__main__":
    # Manual smoke test: convert a dumped UIA tree into the panel format and
    # write the result next to the input file.
    screenshot_path = r'D:\develop\computer_use_ootb_internal-main\.cache\20241222_041910\screenshot-0.png'
    # Use a context manager so the input file is closed after loading
    # (json.load(open(...)) left the handle open).
    with open("test_uia_data.json", "r") as f:
        meta_data = json.load(f)
    gui_parser = GUIParser(user_id="test", trace_id="test", scaleFactor="1.0x")
    uia_gui = gui_parser.get_panel_uia(meta_data)
    uia_gui = gui_parser.postprocess_uia(uia_gui)
    with open("test_uia_gui.json", "w") as f:
        json.dump(uia_gui, f, indent=4)
|
675
|
-
|
676
|
-
|