computer-use-ootb-internal 0.0.107__py3-none-any.whl → 0.0.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- computer_use_ootb_internal/app_teachmode.py +8 -14
- computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
- computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
- computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
- computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
- computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
- computer_use_ootb_internal/run_teachmode_ootb_args.py +26 -11
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/METADATA +1 -1
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/RECORD +11 -16
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
- computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL +0 -0
- {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/entry_points.txt +0 -0
@@ -1,253 +0,0 @@
|
|
1
|
-
import cv2
|
2
|
-
import numpy as np
|
3
|
-
import os
|
4
|
-
import re
|
5
|
-
import glob
|
6
|
-
from PIL import Image
|
7
|
-
# import ctypes
|
8
|
-
|
9
|
-
|
10
|
-
class IconDetector:
    """Locate known icons in a screenshot with OpenCV template matching.

    Icon templates are read from ``icon_folder``. Its immediate
    sub-directories must be named after the display scale the icons were
    captured at (``1.0x``, ``1.5x``, ...); every image file found below such
    a directory is treated as one template.
    """

    # File extensions that are treated as icon templates.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff")

    def __init__(self, icon_folder="./template"):
        self.icon_folder = icon_folder
        print("ICON FOLDER:", self.icon_folder)

    def __call__(
        self,
        screenshot_path,
        mode="teach",
        threshold=0.70,
        scale_factor="1.0x",
        specific_icon_names:list=None,
    ):
        """Detect icons in the screenshot stored at ``screenshot_path``.

        Args:
            screenshot_path: Path of the screenshot image to search.
            mode: Accepted for interface compatibility; not read here.
            threshold: Minimum ``TM_CCOEFF_NORMED`` score for a match.
            scale_factor: Display scale of the screenshot, e.g. ``"1.5x"``.
            specific_icon_names: Optional list of template file names. When
                given, only those templates are loaded and their matches are
                flagged as the current-step icon.

        Returns:
            A list of dicts with the keys ``name``, ``rectangle``
            (``[x1, y1, x2, y2]`` as plain ints), ``is_text`` (always
            ``False``) and ``current_step_icon``.

        Raises:
            FileNotFoundError: If the screenshot cannot be read.
        """
        image = cv2.imread(screenshot_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Cannot read screenshot: {screenshot_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_height, image_width = image.shape[:2]

        templates = self.load_icon_templates(
            self.icon_folder,
            scale_factor,
            specific_icon_names,
        )

        all_boxes, all_scores, labels = [], [], []
        for template in templates:
            icon_name = template["name"].replace(".png", "")
            icon_template = template["template"]
            icon_height, icon_width = icon_template.shape[:2]

            # matchTemplate rejects a template larger than the searched image;
            # skip it so one oversized icon does not abort the whole detection.
            if icon_height > image_height or icon_width > image_width:
                continue

            # Templates requested by name are marked so callers can tell the
            # current-step icon apart from ordinary detections.
            if template.get("current_step_icon", False):
                label = f"{icon_name} (click here for reproducing the action)"
            else:
                label = icon_name

            result = cv2.matchTemplate(image, icon_template, cv2.TM_CCOEFF_NORMED)
            rows, cols = np.where(result >= threshold)
            for pt_x, pt_y in zip(cols, rows):
                all_boxes.append([pt_x, pt_y, pt_x + icon_width, pt_y + icon_height])
                all_scores.append(result[pt_y, pt_x])
                labels.append(label)

        nms_boxes, pick = self.non_max_suppression(all_boxes, 0.5, all_scores)

        # Represent every coordinate as a plain Python int.
        return [
            {
                "name": labels[index],
                "rectangle": [int(value) for value in box],
                "is_text": False,
                "current_step_icon": "click here for reproducing the action" in labels[index],
            }
            for index, box in zip(pick, nms_boxes)
        ]

    def non_max_suppression(self, boxes, overlap_thresh, scores):
        """Drop boxes that overlap a higher-scoring box too much.

        Args:
            boxes: Sequence of ``[x1, y1, x2, y2]`` boxes.
            overlap_thresh: A box is removed when the intersection with a
                kept box, divided by the removed box's own area, exceeds
                this value.
            scores: One score per box; higher scores are kept first.

        Returns:
            ``(kept_boxes, pick)`` where ``pick`` holds the indices of the
            kept boxes in descending score order and ``kept_boxes`` is the
            matching integer array. Both are empty lists for empty input.
        """
        boxes = np.array(boxes)
        if len(boxes) == 0:
            return [], []

        if boxes.dtype.kind == "i":
            boxes = boxes.astype("float")

        pick = []
        x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
        area = (x2 - x1 + 1) * (y2 - y1 + 1)
        idxs = np.argsort(scores)

        while len(idxs) > 0:
            # The best remaining box sits at the end of the ascending order.
            last = len(idxs) - 1
            i = idxs[last]
            pick.append(i)

            rest = idxs[:last]
            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            w = np.maximum(0, xx2 - xx1 + 1)
            h = np.maximum(0, yy2 - yy1 + 1)

            overlap = (w * h) / area[rest]
            idxs = np.delete(
                idxs, np.concatenate(([last], np.where(overlap > overlap_thresh)[0]))
            )

        return boxes[pick].astype("int"), pick

    def load_icon_templates(self, icon_folder:str, scale_factor:str="1.0x", specific_icon_names:list=None):
        """Collect the icon templates and resize them to ``scale_factor``.

        Only the immediate sub-directories of ``icon_folder`` are interpreted
        as scale folders; anything nested below them is searched for image
        files but never parsed as a scale.

        Args:
            icon_folder: Root folder holding one sub-directory per scale.
            scale_factor: Target scale, e.g. ``"1.5x"`` or ``"1.5"``.
            specific_icon_names: Optional list of file names. When given,
                only those files are loaded and they are flagged with
                ``current_step_icon=True``.

        Returns:
            A list of dicts with the keys ``name``, ``path``,
            ``scale_factor``, ``current_step_icon`` and ``template``.
            Templates that could not be loaded are reported and left out.

        Raises:
            ValueError: If an immediate sub-directory is not named like a
                scale factor.
        """
        icons = []

        # os.walk yields nothing for a missing folder, hence the default.
        _, scale_dirs, _ = next(os.walk(icon_folder), (icon_folder, [], []))
        for scale_dir in scale_dirs:
            scale_path = os.path.join(icon_folder, scale_dir)
            scale_match = re.match(r'(\d+\.?\d*)x', scale_dir)
            if not scale_match:
                raise ValueError(f"Invalid scale factor: {scale_path}")
            template_scale_factor = float(scale_match.group(1))

            for root, _, files in os.walk(scale_path):
                for filename in files:
                    if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
                        continue
                    requested = bool(specific_icon_names) and filename in specific_icon_names
                    if specific_icon_names and not requested:
                        # Only the explicitly requested icons are loaded.
                        continue

                    # Strip a leading numeric prefix and tag the entry as an icon.
                    name = re.sub(r"^\d+_", "", os.path.splitext(filename)[0]) + "|icon"
                    icons.append(
                        {
                            "name": name,
                            "path": os.path.join(root, filename),
                            "scale_factor": template_scale_factor,
                            "current_step_icon": requested,
                        }
                    )

        # Rescale every template from its capture scale to the requested one.
        loaded = []
        for icon in icons:
            try:
                scale_factor_value = float(scale_factor[:-1]) if scale_factor.endswith('x') else float(scale_factor)
                factor = scale_factor_value / float(icon["scale_factor"])
                print("factor: ", factor)
                icon["template"] = self.resize_image(icon["path"], factor)
            except Exception as e:
                # Best effort: report the bad template and keep going. It is
                # left out of the result because __call__ needs "template".
                print(f"Error processing template {icon['name']}: {str(e)}")
                continue
            loaded.append(icon)

        # Show every template that was loaded.
        for icon in loaded:
            print("icon: ", icon["name"], icon["path"], icon["scale_factor"], icon["current_step_icon"])

        print("Icons Template loaded: ", len(loaded))
        return loaded

    @staticmethod
    def resize_image(input_path, factor):
        """Load the image at ``input_path`` scaled by ``factor``.

        Returns:
            The resized image as a NumPy array after an OpenCV
            ``COLOR_BGR2RGB`` conversion.
        """
        with Image.open(input_path) as img:
            new_size = (int(img.width * factor), int(img.height * factor))
            resized_img = img.resize(new_size, Image.LANCZOS)
            resized_img = np.array(resized_img)
        # NOTE(review): Pillow normally decodes to RGB, so this BGR<->RGB swap
        # appears to leave the template channel-reversed relative to the
        # screenshot prepared in __call__ -- confirm against the stored
        # template files before changing it.
        resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
        return resized_img
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
import matplotlib.pyplot as plt
|
205
|
-
|
206
|
-
|
207
|
-
def draw_detected_icons(image_path, detections):
    """Display the image with a labelled box drawn around every detection.

    Args:
        image_path: Path of the image the detections belong to.
        detections: Iterable of dicts with a ``name`` and a ``rectangle``
            given as ``[x1, y1, x2, y2]``.
    """
    # Read the image and switch it to RGB channel order for Matplotlib.
    canvas = cv2.cvtColor(cv2.imread(image_path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
    box_color = (255, 0, 0)

    # Annotate each detected icon.
    for item in detections:
        label = item["name"]
        left, top, right, bottom = item["rectangle"]
        # Outline the detection.
        cv2.rectangle(canvas, (left, top), (right, bottom), box_color, 2)
        # Put the label just above the outline.
        cv2.putText(
            canvas, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2
        )

    # Show the annotated image with Matplotlib.
    plt.imshow(canvas)
    plt.axis("off")
    plt.show()

    # Save the image (if needed):
    # cv2.imwrite("output_with_icons.png", image)
|
230
|
-
|
231
|
-
|
232
|
-
def detect_icons(icon_folder, image_path, threshold=0.75, scale_factor="1.0x", specific_icon_names=None):
    """Run a freshly built ``IconDetector`` on one image in teach mode.

    Args:
        icon_folder: Folder holding the icon templates.
        image_path: Path of the screenshot to search.
        threshold: Minimum match score passed to the detector.
        scale_factor: Display scale of the screenshot, e.g. ``"1.0x"``.
        specific_icon_names: Optional list of template file names to
            restrict the detection to.

    Returns:
        Whatever the detector returns for the image.
    """
    detector_options = {
        "screenshot_path": image_path,
        "threshold": threshold,
        "scale_factor": scale_factor,
        "mode": "teach",  # always run the detector in teach mode
        "specific_icon_names": specific_icon_names,
    }
    return IconDetector(icon_folder)(**detector_options)
|
246
|
-
|
247
|
-
|
248
|
-
def get_screen_resize_factor():
    """Return the screen scale factor as a string; currently fixed at ``"1.0x"``.

    The Windows lookup below is kept for reference but is disabled:

        scaleFactor = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
        scaleFactor = str(scaleFactor) + "x"
        return scaleFactor
    """
    fixed_scale = "1.0x"
    return fixed_scale
|
253
|
-
|
@@ -1,170 +0,0 @@
|
|
1
|
-
from ....llm_utils.llm_utils import extract_data
|
2
|
-
from ....llm_utils.run_llm import run_llm
|
3
|
-
|
4
|
-
|
5
|
-
def recognize_panel(ocr, highlight_ocr, panel_metadata, screen_resolution, software):
    """Return the name of a panel via a ``PanelRecognitionLLM`` for ``software``.

    Args:
        ocr: OCR result for the screen.
        highlight_ocr: OCR result restricted to highlighted text.
        panel_metadata: Metadata dict describing the panel.
        screen_resolution: Screen resolution passed on to the recognizer.
        software: Software identifier used to build the recognizer.
    """
    recognizer = PanelRecognitionLLM(software=software)
    return recognizer(ocr, highlight_ocr, panel_metadata, screen_resolution)
|
11
|
-
|
12
|
-
|
13
|
-
class PanelRecognitionLLM:
    """Name a UI panel from its metadata and the OCR text found inside it."""

    # Product names used in the prompt, keyed by the ``software`` identifier.
    _SOFTWARE_DISPLAY_NAMES = {
        "premiere": "Adobe Premiere",
        "after_effects": "Adobe After Effects",
        "capcut": "CapCut",
    }

    def __init__(self, llm="gpt-4o-mini", software="premiere"):
        self.llm = llm
        self.software = software

    # TODO: Maybe consider supporting more general inputs
    def __call__(
        self,
        ocr: dict,
        highlight_ocr: dict,
        raw_item: dict,
        screen_resolution: str,
    ):
        """Return the panel name for ``raw_item``.

        Dialogs are named after their first text entry; every other panel,
        and a Dialog without any text, is named by the LLM.

        Args:
            ocr: OCR result with a ``"texts"`` list.
            highlight_ocr: OCR result for highlighted text, same layout.
            raw_item: Panel metadata; ``raw_item["properties"]`` is read for
                ``friendly_class_name``, ``texts`` and ``rectangle``.
            screen_resolution: Screen resolution inserted into the prompt.

        Returns:
            The panel name with surrounding quotes and spaces removed.
        """
        properties = raw_item["properties"]
        panel_name = None
        if properties["friendly_class_name"] == "Dialog":
            try:
                panel_name = properties["texts"][0]
            except (KeyError, IndexError):
                # A dialog without any text cannot name itself; ask the LLM.
                panel_name = None
        if panel_name is None:
            panel_name = self.recognize_panel_llm(
                ocr, highlight_ocr, raw_item, screen_resolution
            )
        return panel_name.strip("\"' ")

    def recognize_panel_llm(self, ocr, highlight_ocr, raw_item, screen_resolution):
        """Ask the configured LLM to infer the panel name from the OCR text.

        The candidate names and tips of ``self.software`` are used when that
        software has registered candidates; otherwise the Premiere data is
        used, which is what this method always used before.
        """
        panel_rectangle = raw_item["properties"]["rectangle"]
        ocr_in_panel = self.get_ocr_in_panel(ocr, panel_rectangle)
        highlight_ocr_in_panel = self.get_ocr_in_panel(highlight_ocr, panel_rectangle)

        software = self.software if self.get_panel_name_candidates(self.software) else "premiere"
        software_name = self._SOFTWARE_DISPLAY_NAMES.get(software, "Adobe Premiere")
        panel_name_candidates = self.get_panel_name_candidates(software)
        panel_knowledge = self.get_panel_knowledge(software)

        prompt = f"""These are the texts detected in a panel of {software_name}.

OCR:
{ocr_in_panel}
, where the following ocrs are highlighted:
{highlight_ocr_in_panel}

Panel coordinates:
{raw_item['properties']['rectangle']}

Screen Resolution:
{screen_resolution}

Possible Panel:
{panel_name_candidates}

Could you infer the name of the panel from the OCR results?

Tips for panel:
{panel_knowledge}
Output format:
# Generate a brief Reason Here
```json
"Name of the panel"
```
"""
        response = run_llm(prompt, self.llm)
        panel_name = extract_data(response, "json")
        return panel_name

    def get_ocr_in_panel(self, ocr_result, panel):
        """Select the OCR entries whose bounding box lies inside the panel.

        Parameters:
        - ocr_result: dict, OCR results; ``ocr_result["texts"]`` is a list of
          entries that each carry a ``"bbox"`` of [x_min, y_min, x_max, y_max].
        - panel: list, coordinates of the panel in the format
          [x_min, y_min, x_max, y_max].

        Returns:
        - list of the OCR entries that are completely within the panel
          (boxes touching the panel border count as inside).
        """
        panel_x_min, panel_y_min, panel_x_max, panel_y_max = panel

        def inside(bbox):
            x_min, y_min, x_max, y_max = bbox
            return (
                x_min >= panel_x_min
                and y_min >= panel_y_min
                and x_max <= panel_x_max
                and y_max <= panel_y_max
            )

        return [text_info for text_info in ocr_result["texts"] if inside(text_info["bbox"])]

    def get_panel_knowledge(self, name):
        """Return the panel-recognition tips for ``name`` ("" when unknown)."""
        panel_knowledge = {
            "premiere": """Navigation Bar: The bar at the top of the screen with different panel names, e.g., Learn, Effects.
Tools: If there is No Text in it, and panel cordinates indicates is a vertical strip. This is the Tools Panel.
Timeline: A line of timecode numbers (more than three) on the top of the panel, xx:xx:xx:xx, that represent the time in the video
Program Monitor: A title that says "Program: [Name of the video]", also has two timecodes
Audio Meters: A column of numbers representing decibel (dB) levels
Source Monitor: A title that says "Source: [Name of the source]", also has two timecodes
Reference Monitor: A title that says "Reference: [Name of the referenced item]", also has two timecodes""",
            "capcut": """
""",
            "after_effects": """
""",
        }
        return panel_knowledge.get(name, "")

    @staticmethod
    def get_panel_name_candidates(name):
        """Return the known panel names for ``name`` ([] when unknown)."""
        panel_name_candidates = {
            "premiere": [
                "Navigation Bar",
                "Audio Track Mixer",
                "Capture",
                "Edit To Tape",
                "Effect Controls",
                "Essential Graphics",
                "Essential Sound",
                "Events",
                "History",
                "Info",
                "Learn",
                "Libraries",
                "Lumetri Color",
                "Lumetri Scopes",
                "Markers",
                "Media Browser",
                "Metadata",
                "Production",
                "Program Monitor",
                "Projects",
                "Reference Monitor",
                "Source Monitor",
                "Timeline",
                "Tools",
                # "other panel",
                "Text",
            ],
            "after_effects": [
                "Composition",
                "Timeline",
                "Preview",
                "Effects & Presets",
                "Character",
                "Paragraph",
            ],
        }

        return panel_name_candidates.get(name, [])
|
computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py
DELETED
File without changes
|
{computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|