computer-use-ootb-internal 0.0.107__py3-none-any.whl → 0.0.109__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (16) hide show
  1. computer_use_ootb_internal/app_teachmode.py +8 -14
  2. computer_use_ootb_internal/app_teachmode_gradio.py +1 -1
  3. computer_use_ootb_internal/computer_use_demo/animation/click_animation.py +1 -1
  4. computer_use_ootb_internal/computer_use_demo/executor/teachmode_executor.py +70 -72
  5. computer_use_ootb_internal/computer_use_demo/tools/base.py +4 -2
  6. computer_use_ootb_internal/computer_use_demo/tools/computer.py +55 -43
  7. computer_use_ootb_internal/run_teachmode_ootb_args.py +26 -11
  8. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/METADATA +1 -1
  9. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/RECORD +11 -16
  10. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/gui_parser.py +0 -676
  11. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/icon_detection/icon_detection.py +0 -253
  12. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/panel_recognition/llm_panel_recognize.py +0 -170
  13. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/test_capture.py +0 -8
  14. computer_use_ootb_internal/computer_use_demo/gui_agent/gui_parser/simple_parser/uia_parser.py +0 -0
  15. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/WHEEL +0 -0
  16. {computer_use_ootb_internal-0.0.107.dist-info → computer_use_ootb_internal-0.0.109.dist-info}/entry_points.txt +0 -0
@@ -1,253 +0,0 @@
1
- import cv2
2
- import numpy as np
3
- import os
4
- import re
5
- import glob
6
- from PIL import Image
7
- # import ctypes
8
-
9
-
10
class IconDetector:
    """Locate known icons in a screenshot using OpenCV template matching.

    Templates are read from ``icon_folder``. Each immediate sub-directory of
    that folder must be named after the scale at which its templates were
    captured (``1.0x``, ``1.5x``, ...); image files anywhere below a scale
    directory are used as templates.
    """

    def __init__(self, icon_folder="./template"):
        """Remember the template root folder (and echo it to stdout)."""
        self.icon_folder = icon_folder
        print("ICON FOLDER:", self.icon_folder)

    def __call__(
        self,
        screenshot_path,
        mode="teach",
        threshold=0.70,
        scale_factor="1.0x",
        specific_icon_names: list = None,
    ):
        """Detect icons in the screenshot stored at ``screenshot_path``.

        Args:
            screenshot_path: Path of the screenshot image to search.
            mode: Accepted for interface compatibility; not read by this
                method.
            threshold: Minimum normalized correlation score for a match.
            scale_factor: Scale of the screenshot, e.g. ``"1.0x"``; templates
                are resized to this scale before matching.
            specific_icon_names: Optional list of template file names. When
                given, only these templates are loaded and their matches are
                flagged as the current-step icon.

        Returns:
            A list of dicts with keys ``name``, ``rectangle``
            (``[x1, y1, x2, y2]`` as plain ints), ``is_text`` (always
            ``False``) and ``current_step_icon``.

        Raises:
            OSError: If the screenshot cannot be read.
        """
        image = cv2.imread(screenshot_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f"Could not read screenshot: {screenshot_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_height, image_width = image.shape[:2]

        templates = self.load_icon_templates(
            self.icon_folder,
            scale_factor,
            specific_icon_names,
        )

        all_boxes, all_scores, labels = [], [], []
        for template in templates:
            icon_template = template.get("template")
            if icon_template is None:
                # Template image could not be prepared; nothing to match.
                continue
            icon_height, icon_width = icon_template.shape[:2]
            if icon_height > image_height or icon_width > image_width:
                # matchTemplate needs a template no larger than the image.
                continue

            icon_name = template["name"].replace(".png", "")
            if template.get("current_step_icon", False):
                # Templates requested by name are marked for the caller.
                label = f"{icon_name} (click here for reproducing the action)"
            else:
                label = icon_name

            result = cv2.matchTemplate(image, icon_template, cv2.TM_CCOEFF_NORMED)
            rows, cols = np.where(result >= threshold)
            for pt_x, pt_y in zip(cols, rows):
                all_boxes.append([pt_x, pt_y, pt_x + icon_width, pt_y + icon_height])
                all_scores.append(result[pt_y, pt_x])
                labels.append(label)

        nms_boxes, pick = self.non_max_suppression(all_boxes, 0.5, all_scores)
        labels = [labels[i] for i in pick]

        # Emit plain Python ints so the result is serializable.
        return [
            {
                "name": labels[ix],
                "rectangle": [int(coord) for coord in box[:4]],
                "is_text": False,
                "current_step_icon": "click here for reproducing the action" in labels[ix],
            }
            for ix, box in enumerate(nms_boxes)
        ]

    def non_max_suppression(self, boxes, overlap_thresh, scores):
        """Greedily keep the best-scoring boxes and drop overlapping ones.

        Args:
            boxes: Sequence of ``[x1, y1, x2, y2]`` boxes.
            overlap_thresh: A box is suppressed when the intersection with a
                kept box covers more than this fraction of its own area.
            scores: One score per box; higher scores are kept first.

        Returns:
            ``(kept_boxes, pick)`` where ``kept_boxes`` is an int array of the
            surviving boxes and ``pick`` lists their indices into ``boxes``.
            Both are empty lists when ``boxes`` is empty.
        """
        boxes = np.array(boxes)
        if len(boxes) == 0:
            return [], []

        if boxes.dtype.kind == "i":
            boxes = boxes.astype("float")

        pick = []
        x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
        area = (x2 - x1 + 1) * (y2 - y1 + 1)
        # Ascending sort: the best remaining candidate is always last.
        idxs = np.argsort(scores)

        while len(idxs) > 0:
            last = len(idxs) - 1
            i = idxs[last]
            pick.append(int(i))
            rest = idxs[:last]

            # Intersection of the kept box with every remaining candidate.
            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            w = np.maximum(0, xx2 - xx1 + 1)
            h = np.maximum(0, yy2 - yy1 + 1)

            overlap = (w * h) / area[rest]
            idxs = np.delete(
                idxs, np.concatenate(([last], np.where(overlap > overlap_thresh)[0]))
            )

        return boxes[pick].astype("int"), pick

    def load_icon_templates(self, icon_folder: str, scale_factor: str = "1.0x", specific_icon_names: list = None):
        """Load icon templates and resize them to the requested scale.

        Args:
            icon_folder: Root folder whose immediate sub-directories are named
                after a scale (``<number>x``).
            scale_factor: Target scale, e.g. ``"1.0x"`` or ``"1.0"``.
            specific_icon_names: Optional list of file names. When non-empty,
                only matching files are loaded and they are flagged with
                ``current_step_icon=True``.

        Returns:
            A list of dicts with keys ``name``, ``path``, ``scale_factor``,
            ``current_step_icon`` and ``template`` (the resized image array).
            Icons whose image could not be prepared are omitted.

        Raises:
            ValueError: If an immediate sub-directory of ``icon_folder`` is
                not named after a scale.
        """
        image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')
        only_specific = bool(specific_icon_names)

        # Only the first level of icon_folder holds scale directories; deeper
        # folders are ordinary template containers and must not be parsed as
        # scales.
        _, scale_dirs, _ = next(os.walk(icon_folder), (icon_folder, [], []))

        icons = []
        for scale_dir in scale_dirs:
            scale_match = re.match(r'(\d+\.?\d*)x', scale_dir)
            if not scale_match:
                raise ValueError(f"Invalid scale factor: {os.path.join(icon_folder, scale_dir)}")
            template_scale_factor = float(scale_match.group(1))

            for root, _subdirs, files in os.walk(os.path.join(icon_folder, scale_dir)):
                for filename in files:
                    if not filename.lower().endswith(image_extensions):
                        continue
                    is_current_step = only_specific and filename in specific_icon_names
                    if only_specific and not is_current_step:
                        continue
                    # Drop a leading "<digits>_" ordering prefix from the name.
                    name = re.sub(r"^\d+_", "", os.path.splitext(filename)[0]) + "|icon"
                    icons.append(
                        {
                            "name": name,
                            "path": os.path.join(root, filename),
                            "scale_factor": template_scale_factor,
                            "current_step_icon": is_current_step,
                        }
                    )

        # Resize every template to the requested scale; skip the ones that
        # fail so callers never receive an entry without a "template".
        loaded = []
        for icon in icons:
            try:
                scale_factor_value = float(scale_factor[:-1]) if scale_factor.endswith('x') else float(scale_factor)
                factor = scale_factor_value / float(icon["scale_factor"])
                print("factor: ", factor)
                icon["template"] = self.resize_image(icon["path"], factor)
            except Exception as e:
                print(f"Error processing template {icon['name']}: {str(e)}")
                continue
            loaded.append(icon)

        for icon in loaded:
            print("icon: ", icon["name"], icon["path"], icon["scale_factor"], icon["current_step_icon"])

        print("Icons Template loaded: ", len(loaded))
        return loaded

    @staticmethod
    def resize_image(input_path, factor):
        """Open an image, scale both dimensions by ``factor``, return an array.

        Args:
            input_path: Path of the template image.
            factor: Multiplier applied to width and height.

        Returns:
            The resized image as a NumPy array after a channel swap.
        """
        with Image.open(input_path) as img:
            new_size = (int(img.width * factor), int(img.height * factor))
            resized_img = np.array(img.resize(new_size, Image.LANCZOS))
        # NOTE(review): PIL normally yields RGB(A) data, so this BGR->RGB call
        # swaps the red and blue channels (and presumably drops alpha). The
        # original marks it as important, so it is kept as-is; confirm it
        # matches the channel order of the screenshot used for matching.
        return cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
201
-
202
-
203
-
204
- import matplotlib.pyplot as plt
205
-
206
-
207
def draw_detected_icons(image_path, detections):
    """Draw labelled boxes for ``detections`` on an image and display it.

    Args:
        image_path: Path of the image to annotate.
        detections: Iterable of dicts with a ``name`` string and a
            ``rectangle`` given as ``[x1, y1, x2, y2]``.

    Raises:
        OSError: If the image cannot be read.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    for detection in detections:
        name = detection["name"]
        x1, y1, x2, y2 = detection["rectangle"]
        # Outline the detection.
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
        # Put the label just above the box, but keep it on the canvas when
        # the box touches the top edge.
        label_y = max(y1 - 10, 10)
        cv2.putText(
            image, name, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2
        )

    # Show the annotated image with Matplotlib.
    plt.imshow(image)
    plt.axis("off")
    plt.show()
227
-
228
- # 保存图像(如果需要)
229
- # cv2.imwrite("output_with_icons.png", image)
230
-
231
-
232
def detect_icons(icon_folder, image_path, threshold=0.75, scale_factor="1.0x", specific_icon_names=None):
    """Run an ``IconDetector`` built on ``icon_folder`` against one image.

    Args:
        icon_folder: Folder handed to the ``IconDetector`` constructor.
        image_path: Path of the screenshot to search.
        threshold: Match threshold forwarded to the detector.
        scale_factor: Screenshot scale forwarded to the detector.
        specific_icon_names: Optional template file names forwarded to the
            detector.

    Returns:
        Whatever the detector call returns.
    """
    call_kwargs = {
        "screenshot_path": image_path,
        "threshold": threshold,
        "scale_factor": scale_factor,
        "mode": "teach",  # the detector is always invoked in teach mode here
        "specific_icon_names": specific_icon_names,
    }
    return IconDetector(icon_folder)(**call_kwargs)
246
-
247
-
248
def get_screen_resize_factor():
    """Return the screen scale factor as a string; currently always ``"1.0x"``.

    The ctypes-based query below is disabled, so the value is hard-coded.
    """
    # Disabled implementation, kept for reference:
    #   scaleFactor = ctypes.windll.shcore.GetScaleFactorForDevice(0) / 100
    #   scaleFactor = str(scaleFactor) + "x"
    #   return scaleFactor
    fixed_factor = "1.0x"
    return fixed_factor
253
-
@@ -1,170 +0,0 @@
1
- from ....llm_utils.llm_utils import extract_data
2
- from ....llm_utils.run_llm import run_llm
3
-
4
-
5
def recognize_panel(ocr, highlight_ocr, panel_metadata, screen_resolution, software):
    """Name a panel by delegating to a ``PanelRecognitionLLM`` for ``software``.

    The remaining arguments are passed to the recognizer call unchanged, in
    the order ``ocr, highlight_ocr, panel_metadata, screen_resolution``.
    """
    recognizer = PanelRecognitionLLM(software=software)
    return recognizer(ocr, highlight_ocr, panel_metadata, screen_resolution)
11
-
12
-
13
class PanelRecognitionLLM:
    """Infer the name of an application panel from OCR results via an LLM.

    ``llm`` is the model name passed to ``run_llm`` and ``software`` selects
    the candidate panel names and recognition tips put into the prompt.
    """

    # Product names used in the first sentence of the prompt; unknown
    # software keys fall back to the key itself.
    _SOFTWARE_DISPLAY_NAMES = {
        "premiere": "Adobe Premiere",
        "after_effects": "Adobe After Effects",
        "capcut": "CapCut",
    }

    def __init__(self, llm="gpt-4o-mini", software="premiere"):
        self.llm = llm
        self.software = software

    # TODO: Maybe consider supporting more general inputs
    def __call__(
        self,
        ocr: dict,
        highlight_ocr: dict,
        raw_item: dict,
        screen_resolution: str,
    ):
        """Return the panel name for ``raw_item``.

        Dialogs are named after their first text entry; every other item is
        named by the LLM. Surrounding quotes and spaces are stripped.

        Args:
            ocr: OCR result with a ``"texts"`` list of entries carrying a
                ``"bbox"``.
            highlight_ocr: Same structure, for the highlighted texts.
            raw_item: Item whose ``"properties"`` provide
                ``friendly_class_name``, ``texts`` and ``rectangle``.
            screen_resolution: Resolution text inserted into the prompt.
        """
        properties = raw_item["properties"]
        if properties["friendly_class_name"] == "Dialog":
            panel_name = properties["texts"][0]
        else:
            panel_name = self.recognize_panel_llm(
                ocr, highlight_ocr, raw_item, screen_resolution
            )
        return panel_name.strip("\"' ")

    def recognize_panel_llm(self, ocr, highlight_ocr, raw_item, screen_resolution):
        """Ask the configured LLM for the panel name and return its answer.

        The return value is whatever ``extract_data(response, "json")``
        yields for the model response.
        """
        rectangle = raw_item["properties"]["rectangle"]
        prompt = self._build_prompt(
            self.get_ocr_in_panel(ocr, rectangle),
            self.get_ocr_in_panel(highlight_ocr, rectangle),
            rectangle,
            screen_resolution,
        )
        # Honour the model chosen at construction time.
        response = run_llm(prompt, self.llm)
        return extract_data(response, "json")

    def _build_prompt(self, ocr_in_panel, highlight_ocr_in_panel, rectangle, screen_resolution):
        """Assemble the recognition prompt for the configured software."""
        product = self._SOFTWARE_DISPLAY_NAMES.get(self.software, self.software)
        lines = [
            f"These are the texts detected in a panel of {product}.",
            "",
            "OCR:",
            f"{ocr_in_panel}",
            ", where the following ocrs are highlighted:",
            f"{highlight_ocr_in_panel}",
            "",
            "Panel coordinates:",
            f"{rectangle}",
            "",
            "Screen Resolution:",
            f"{screen_resolution}",
            "",
            "Possible Panel:",
            f"{self.get_panel_name_candidates(self.software)}",
            "",
            "Could you infer the name of the panel from the OCR results?",
            "",
            "Tips for panel:",
            self.get_panel_knowledge(self.software),
            "Output format:",
            "# Generate a brief Reason Here",
            "```json",
            '"Name of the panel"',
            "```",
            "",
        ]
        return "\n".join(lines)

    def get_ocr_in_panel(self, ocr_result, panel):
        """Select the OCR entries whose bounding box lies inside ``panel``.

        Parameters:
        - ocr_result: dict with a "texts" list; each entry has a "bbox" in
          the format [x_min, y_min, x_max, y_max].
        - panel: coordinates of the panel in the same format.

        Returns:
        - list of the entries from ocr_result["texts"] that are completely
          contained in the panel (edges included), in their original order.
        """
        panel_x_min, panel_y_min, panel_x_max, panel_y_max = panel
        return [
            text_info
            for text_info in ocr_result["texts"]
            if text_info["bbox"][0] >= panel_x_min
            and text_info["bbox"][1] >= panel_y_min
            and text_info["bbox"][2] <= panel_x_max
            and text_info["bbox"][3] <= panel_y_max
        ]

    def get_panel_knowledge(self, name):
        """Return the recognition tips for ``name`` ("" for unknown names)."""
        panel_knowledge = {
            "premiere": "\n".join(
                [
                    "Navigation Bar: The bar at the top of the screen with different panel names, e.g., Learn, Effects.",
                    "Tools: If there is No Text in it, and panel cordinates indicates is a vertical strip. This is the Tools Panel.",
                    "Timeline: A line of timecode numbers (more than three) on the top of the panel, xx:xx:xx:xx, that represent the time in the video",
                    'Program Monitor: A title that says "Program: [Name of the video]", also has two timecodes',
                    "Audio Meters: A column of numbers representing decibel (dB) levels",
                    'Source Monitor: A title that says "Source: [Name of the source]", also has two timecodes',
                    'Reference Monitor: A title that says "Reference: [Name of the referenced item]", also has two timecodes',
                ]
            ),
            # No tips have been written for these products yet.
            "capcut": "\n",
            "after_effects": "\n",
        }
        return panel_knowledge.get(name, "")

    @staticmethod
    def get_panel_name_candidates(name):
        """Return the known panel names for ``name`` ([] for unknown names)."""
        panel_name_candidates = {
            "premiere": [
                "Navigation Bar",
                "Audio Track Mixer",
                "Capture",
                "Edit To Tape",
                "Effect Controls",
                "Essential Graphics",
                "Essential Sound",
                "Events",
                "History",
                "Info",
                "Learn",
                "Libraries",
                "Lumetri Color",
                "Lumetri Scopes",
                "Markers",
                "Media Browser",
                "Metadata",
                "Production",
                "Program Monitor",
                "Projects",
                "Reference Monitor",
                "Source Monitor",
                "Timeline",
                "Tools",
                # "other panel",
                "Text",
            ],
            "after_effects": [
                "Composition",
                "Timeline",
                "Preview",
                "Effects & Presets",
                "Character",
                "Paragraph",
            ],
        }

        return panel_name_candidates.get(name, [])
@@ -1,8 +0,0 @@
1
- from PIL import ImageGrab
2
-
3
# Grab a 1920x1080 region whose top-left corner sits at (2560, 366) in the
# combined multi-screen coordinate space, and store it as an RGB PNG in the
# current working directory.
bbox = (2560, 366, 2560 + 1920, 366 + 1080)

screenshot = ImageGrab.grab(bbox=bbox, all_screens=True).convert('RGB')
screenshot.save("screenshot.png")