vision-arwaky 2.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. vision_arwaky-2.0.5/PKG-INFO +16 -0
  2. vision_arwaky-2.0.5/README.md +11 -0
  3. vision_arwaky-2.0.5/pyproject.toml +32 -0
  4. vision_arwaky-2.0.5/setup.cfg +4 -0
  5. vision_arwaky-2.0.5/src/__init__.py +1 -0
  6. vision_arwaky-2.0.5/src/agent/__init__.py +3 -0
  7. vision_arwaky-2.0.5/src/agent/vision_agent_orchestrator.py +282 -0
  8. vision_arwaky-2.0.5/src/capabilities/__init__.py +15 -0
  9. vision_arwaky-2.0.5/src/capabilities/image_processing_processor.py +119 -0
  10. vision_arwaky-2.0.5/src/capabilities/object_tracking_tracker.py +98 -0
  11. vision_arwaky-2.0.5/src/capabilities/video_analysis_analyzer.py +112 -0
  12. vision_arwaky-2.0.5/src/capabilities/video_processing_processor.py +79 -0
  13. vision_arwaky-2.0.5/src/capabilities/video_timeline_generator.py +81 -0
  14. vision_arwaky-2.0.5/src/capabilities/visual_memory_store.py +103 -0
  15. vision_arwaky-2.0.5/src/cli_entry.py +66 -0
  16. vision_arwaky-2.0.5/src/contract/__init__.py +27 -0
  17. vision_arwaky-2.0.5/src/contract/ffmpeg_video_port.py +36 -0
  18. vision_arwaky-2.0.5/src/contract/image_processing_protocol.py +27 -0
  19. vision_arwaky-2.0.5/src/contract/llm_vision_port.py +22 -0
  20. vision_arwaky-2.0.5/src/contract/object_tracking_protocol.py +17 -0
  21. vision_arwaky-2.0.5/src/contract/opencv_image_port.py +82 -0
  22. vision_arwaky-2.0.5/src/contract/registry_service_aggregate.py +101 -0
  23. vision_arwaky-2.0.5/src/contract/system_utils_port.py +32 -0
  24. vision_arwaky-2.0.5/src/contract/tesseract_ocr_port.py +11 -0
  25. vision_arwaky-2.0.5/src/contract/video_analysis_protocol.py +17 -0
  26. vision_arwaky-2.0.5/src/contract/video_processing_protocol.py +39 -0
  27. vision_arwaky-2.0.5/src/contract/video_timeline_protocol.py +11 -0
  28. vision_arwaky-2.0.5/src/contract/visual_memory_protocol.py +17 -0
  29. vision_arwaky-2.0.5/src/infrastructure/__init__.py +13 -0
  30. vision_arwaky-2.0.5/src/infrastructure/ffmpeg_video_adapter.py +73 -0
  31. vision_arwaky-2.0.5/src/infrastructure/llm_vision_adapter.py +258 -0
  32. vision_arwaky-2.0.5/src/infrastructure/opencv_image_adapter.py +93 -0
  33. vision_arwaky-2.0.5/src/infrastructure/system_utils_util.py +37 -0
  34. vision_arwaky-2.0.5/src/infrastructure/tesseract_ocr_adapter.py +27 -0
  35. vision_arwaky-2.0.5/src/mcp_entry.py +15 -0
  36. vision_arwaky-2.0.5/src/py.typed +1 -0
  37. vision_arwaky-2.0.5/src/surfaces/__init__.py +56 -0
  38. vision_arwaky-2.0.5/src/surfaces/cli_commands_handler.py +125 -0
  39. vision_arwaky-2.0.5/src/surfaces/cli_handler.py +83 -0
  40. vision_arwaky-2.0.5/src/surfaces/mcp_handler.py +47 -0
  41. vision_arwaky-2.0.5/src/surfaces/mcp_tools_handler.py +236 -0
  42. vision_arwaky-2.0.5/src/taxonomy/__init__.py +47 -0
  43. vision_arwaky-2.0.5/src/taxonomy/vision_models_vo.py +157 -0
  44. vision_arwaky-2.0.5/tests/test_vision.py +195 -0
  45. vision_arwaky-2.0.5/vision_arwaky.egg-info/PKG-INFO +16 -0
  46. vision_arwaky-2.0.5/vision_arwaky.egg-info/SOURCES.txt +48 -0
  47. vision_arwaky-2.0.5/vision_arwaky.egg-info/dependency_links.txt +1 -0
  48. vision_arwaky-2.0.5/vision_arwaky.egg-info/entry_points.txt +3 -0
  49. vision_arwaky-2.0.5/vision_arwaky.egg-info/requires.txt +10 -0
  50. vision_arwaky-2.0.5/vision_arwaky.egg-info/top_level.txt +1 -0
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: vision-arwaky
3
+ Version: 2.0.5
4
+ Summary: MCP server for unified image and video processing
5
+ Author-email: rakaarwaky <arwaky90@gmail.com>
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: mcp[cli]
8
+ Requires-Dist: fastmcp
9
+ Requires-Dist: pydantic
10
+ Requires-Dist: opencv-contrib-python-headless
11
+ Requires-Dist: pillow
12
+ Requires-Dist: numpy
13
+ Requires-Dist: pytesseract
14
+ Requires-Dist: requests
15
+ Requires-Dist: pyyaml
16
+ Requires-Dist: llama-cpp-python
@@ -0,0 +1,11 @@
1
+ # Vision Arwaky
2
+
3
+ The unified computer vision server based on the AES 5 Domains Architecture.
4
+
5
+ ## Architecture
6
+ The server is structured into modular domains (Max Depth 5):
7
+ - `src/taxonomy/`: Data models (DNA)
8
+ - `src/capabilities/`: Business logic slice endpoints
9
+ - `src/infrastructure/`: Technology adapters (OpenCV, FFmpeg, Tesseract)
10
+ - `src/surfaces/`: MCP Interface for the agent
11
+ - `src/mcp_entry.py`, `src/cli_entry.py`: Bootstrap wiring and initialization (MCP server and CLI entry points)
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "vision-arwaky"
7
+ version = "2.0.5"
8
+ description = "MCP server for unified image and video processing"
9
+ requires-python = ">=3.12"
10
+ authors = [
11
+ {name = "rakaarwaky", email = "arwaky90@gmail.com"}
12
+ ]
13
+ dependencies = [
14
+ "mcp[cli]",
15
+ "fastmcp",
16
+ "pydantic",
17
+ "opencv-contrib-python-headless",
18
+ "pillow",
19
+ "numpy",
20
+ "pytesseract",
21
+ "requests",
22
+ "pyyaml",
23
+ "llama-cpp-python",
24
+ ]
25
+
26
+ [project.scripts]
27
+ vision-arwaky = "src.mcp_entry:main"
28
+ vision-cli = "src.cli_entry:cli"
29
+
30
+ [tool.setuptools.packages.find]
31
+ where = ["."]
32
+ include = ["src*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ # Vision MCP Source Package
@@ -0,0 +1,3 @@
1
+ from .vision_agent_orchestrator import VisionAgentOrchestrator
2
+
3
+ __all__ = ["VisionAgentOrchestrator"]
@@ -0,0 +1,282 @@
1
+ """Vision Agent Orchestrator — dynamic Application Orchestrator and service locator."""
2
+
3
+ import importlib
4
+ import asyncio
5
+ import json
6
+ from typing import Any, Dict
7
+
8
+ from src.taxonomy import (
9
+ FilePath,
10
+ LanguageCode,
11
+ TimeSegment,
12
+ BoundingBox,
13
+ CommandName,
14
+ CommandOutput,
15
+ MemoryLabel,
16
+ DistanceThreshold,
17
+ SceneThreshold,
18
+ MinArea,
19
+ AnalysisPrompt,
20
+ IntervalSeconds,
21
+ MaxFrames,
22
+ )
23
+ from src.contract import (
24
+ RegistryServiceAggregate,
25
+ SystemUtilsPort,
26
+ OpenCVImagePort,
27
+ TesseractOCRPort,
28
+ FFmpegVideoPort,
29
+ LLMVisionPort,
30
+ ImageProcessingProtocol,
31
+ VideoProcessingProtocol,
32
+ VideoAnalysisProtocol,
33
+ ObjectTrackingProtocol,
34
+ VisualMemoryProtocol,
35
+ VideoTimelineProtocol,
36
+ )
37
+
38
+
39
class VisionAgentOrchestrator(RegistryServiceAggregate):
    """Orchestrator and locator for Vision capabilities.

    Every ``get_*`` factory imports its concrete class lazily by dotted module
    path and returns a fresh instance, so importing this module does not import
    the infrastructure or capability modules. ``execute_in_process`` is the
    single entry point that routes a command name plus raw keyword arguments to
    the image, video, or memory handlers and always returns a ``CommandOutput``.
    """

    @staticmethod
    def _instantiate(module_path: str, class_name: str, **ports: Any) -> Any:
        """Import ``module_path`` and construct ``class_name`` with ``ports``.

        Args:
            module_path: Dotted path of the module that defines the class.
            class_name: Name of the class attribute inside that module.
            **ports: Keyword arguments forwarded to the class constructor.

        Returns:
            A new instance of the requested class.

        Raises:
            ImportError: If the module cannot be imported.
            AttributeError: If the module does not define ``class_name``.
        """
        module = importlib.import_module(module_path)
        return getattr(module, class_name)(**ports)

    # ------------------------------------------------------------------
    # Infrastructure adapters
    # ------------------------------------------------------------------

    @staticmethod
    def get_utils() -> SystemUtilsPort:
        """Instantiate concrete Utils adapter dynamically."""
        return VisionAgentOrchestrator._instantiate(
            "src.infrastructure.system_utils_util", "SystemUtilsUtil"
        )

    @staticmethod
    def get_opencv() -> OpenCVImagePort:
        """Instantiate concrete OpenCV adapter dynamically."""
        return VisionAgentOrchestrator._instantiate(
            "src.infrastructure.opencv_image_adapter", "OpenCVImageAdapter"
        )

    @staticmethod
    def get_tesseract() -> TesseractOCRPort:
        """Instantiate concrete Tesseract adapter dynamically."""
        return VisionAgentOrchestrator._instantiate(
            "src.infrastructure.tesseract_ocr_adapter", "TesseractOCRAdapter"
        )

    @staticmethod
    def get_ffmpeg() -> FFmpegVideoPort:
        """Instantiate concrete FFmpeg adapter dynamically."""
        return VisionAgentOrchestrator._instantiate(
            "src.infrastructure.ffmpeg_video_adapter", "FFmpegVideoAdapter"
        )

    @staticmethod
    def get_llm() -> LLMVisionPort:
        """Instantiate concrete LLM adapter dynamically."""
        return VisionAgentOrchestrator._instantiate(
            "src.infrastructure.llm_vision_adapter", "LLMVisionAdapter"
        )

    # ------------------------------------------------------------------
    # Capabilities (each call builds new adapters and injects them)
    # ------------------------------------------------------------------

    @staticmethod
    def get_image_processing() -> ImageProcessingProtocol:
        """Instantiate concrete ImageProcessingProcessor dynamically with injected ports."""
        return VisionAgentOrchestrator._instantiate(
            "src.capabilities.image_processing_processor",
            "ImageProcessingProcessor",
            opencv_port=VisionAgentOrchestrator.get_opencv(),
            tesseract_port=VisionAgentOrchestrator.get_tesseract(),
            llm_port=VisionAgentOrchestrator.get_llm(),
        )

    @staticmethod
    def get_video_processing() -> VideoProcessingProtocol:
        """Instantiate concrete VideoProcessingProcessor dynamically with injected ports."""
        return VisionAgentOrchestrator._instantiate(
            "src.capabilities.video_processing_processor",
            "VideoProcessingProcessor",
            opencv_port=VisionAgentOrchestrator.get_opencv(),
            ffmpeg_port=VisionAgentOrchestrator.get_ffmpeg(),
        )

    @staticmethod
    def get_video_analysis() -> VideoAnalysisProtocol:
        """Instantiate concrete VideoAnalysisAnalyzer dynamically with injected ports."""
        return VisionAgentOrchestrator._instantiate(
            "src.capabilities.video_analysis_analyzer",
            "VideoAnalysisAnalyzer",
            opencv_port=VisionAgentOrchestrator.get_opencv(),
        )

    @staticmethod
    def get_object_tracking() -> ObjectTrackingProtocol:
        """Instantiate concrete ObjectTrackingTracker dynamically with injected ports."""
        return VisionAgentOrchestrator._instantiate(
            "src.capabilities.object_tracking_tracker",
            "ObjectTrackingTracker",
            opencv_port=VisionAgentOrchestrator.get_opencv(),
        )

    @staticmethod
    def get_visual_memory() -> VisualMemoryProtocol:
        """Instantiate concrete VisualMemoryStore dynamically with injected ports."""
        return VisionAgentOrchestrator._instantiate(
            "src.capabilities.visual_memory_store",
            "VisualMemoryStore",
            opencv_port=VisionAgentOrchestrator.get_opencv(),
            utils_port=VisionAgentOrchestrator.get_utils(),
        )

    @staticmethod
    def get_video_timeline() -> VideoTimelineProtocol:
        """Instantiate concrete VideoTimelineGenerator dynamically with injected ports."""
        return VisionAgentOrchestrator._instantiate(
            "src.capabilities.video_timeline_generator",
            "VideoTimelineGenerator",
            opencv_port=VisionAgentOrchestrator.get_opencv(),
            video_cap=VisionAgentOrchestrator.get_video_processing(),
            analysis_cap=VisionAgentOrchestrator.get_video_analysis(),
        )

    # ------------------------------------------------------------------
    # Image commands
    # ------------------------------------------------------------------

    @staticmethod
    def _execute_image_cmd(command: str, kwargs: Dict[str, Any]) -> str | None:
        """Run an image command and return its output text.

        Handles ``analyze``, ``ocr``, ``elements`` and ``compare``. Returns
        ``None`` for any other command name so the caller can try the next
        router. ``kwargs`` is only read once the command has matched.
        """
        if command == "analyze":
            img = FilePath(value=kwargs["image"])
            prompt = AnalysisPrompt(value=kwargs.get("prompt"))
            analysis = VisionAgentOrchestrator.get_image_processing().analyze_screenshot(img, prompt)
            return json.dumps(analysis.model_dump(), indent=2)

        if command == "ocr":
            img = FilePath(value=kwargs["image"])
            # A missing or empty language falls back to English.
            lang = LanguageCode(value=kwargs.get("lang") or "eng")
            return VisionAgentOrchestrator.get_image_processing().extract_text(img, lang).value

        if command == "elements":
            img = FilePath(value=kwargs["image"])
            found = VisionAgentOrchestrator.get_image_processing().find_elements(img)
            return json.dumps([e.model_dump() for e in found], indent=2)

        if command == "compare":
            img1 = FilePath(value=kwargs["image1"])
            img2 = FilePath(value=kwargs["image2"])
            comparison = VisionAgentOrchestrator.get_image_processing().compare_screenshots(img1, img2)
            return json.dumps(comparison, indent=2)

        return None

    # ------------------------------------------------------------------
    # Video commands
    # ------------------------------------------------------------------

    @staticmethod
    def _cmd_video_info(kwargs: Dict[str, Any]) -> str:
        """Return the video's info model as indented JSON."""
        vid = FilePath(value=kwargs["video"])
        return json.dumps(VisionAgentOrchestrator.get_video_processing().get_info(vid).model_dump(), indent=2)

    @staticmethod
    def _cmd_extract_frames(kwargs: Dict[str, Any]) -> str:
        """Extract frames every ``interval`` seconds and return their paths as JSON."""
        interval = IntervalSeconds(value=float(kwargs["interval"]))
        # extract_frames is a coroutine; drive it to completion synchronously.
        res = asyncio.run(
            VisionAgentOrchestrator.get_video_processing().extract_frames(
                FilePath(value=kwargs["video"]), interval
            )
        )
        return json.dumps([r.value for r in res], indent=2)

    @staticmethod
    def _cmd_convert(kwargs: Dict[str, Any]) -> str:
        """Convert ``input_path`` to ``output_path`` and report success as JSON."""
        inp = FilePath(value=kwargs["input_path"])
        out = FilePath(value=kwargs["output_path"])
        res = asyncio.run(VisionAgentOrchestrator.get_video_processing().convert_format(inp, out))
        return json.dumps({"success": res})

    @staticmethod
    def _cmd_check_corruption(kwargs: Dict[str, Any]) -> str:
        """Report whether the video is corrupted as JSON."""
        res = VisionAgentOrchestrator.get_video_processing().check_corruption(FilePath(value=kwargs["video"]))
        return json.dumps({"corrupted": res})

    @staticmethod
    def _cmd_create_gif(kwargs: Dict[str, Any]) -> str:
        """Create a GIF from a video segment and report success as JSON."""
        vid = FilePath(value=kwargs["video"])
        out = FilePath(value=kwargs["output_path"])
        # Any falsy start/duration (None, "", 0) is passed on as "not set".
        start = float(kwargs["start"]) if kwargs["start"] else None
        duration = float(kwargs["duration"]) if kwargs["duration"] else None
        segment = TimeSegment(start=start, duration=duration)
        res = asyncio.run(VisionAgentOrchestrator.get_video_processing().create_gif(vid, out, segment))
        return json.dumps({"success": res})

    @staticmethod
    def _cmd_detect_scenes(kwargs: Dict[str, Any]) -> str:
        """Detect scenes using ``threshold`` and return them as JSON."""
        vid = FilePath(value=kwargs["video"])
        threshold = SceneThreshold(value=float(kwargs["threshold"]))
        scenes = VisionAgentOrchestrator.get_video_analysis().detect_scenes(vid, threshold)
        return json.dumps([s.model_dump() for s in scenes], indent=2)

    @staticmethod
    def _cmd_detect_motion(kwargs: Dict[str, Any]) -> str:
        """Detect motion using ``min_area`` and return the results as JSON."""
        vid = FilePath(value=kwargs["video"])
        min_area = MinArea(value=int(kwargs["min_area"]))
        motions = VisionAgentOrchestrator.get_video_analysis().detect_motion(vid, min_area)
        return json.dumps([m.model_dump() for m in motions], indent=2)

    @staticmethod
    def _cmd_track(kwargs: Dict[str, Any]) -> str:
        """Track an object from an initial ``bbox`` and return the boxes as JSON.

        Raises:
            ValueError: If ``bbox`` is not four comma-separated integers.
        """
        vid = FilePath(value=kwargs["video"])
        parts = kwargs["bbox"].split(",")
        if len(parts) != 4:
            # Fail with an actionable message instead of a bare unpacking error.
            raise ValueError(
                f"bbox must be four comma-separated integers 'x,y,w,h', got {kwargs['bbox']!r}"
            )
        x, y, w, h = (int(v) for v in parts)
        bbox = BoundingBox(x=x, y=y, width=w, height=h)
        max_frames = MaxFrames(value=int(kwargs["max_frames"]))
        tracked = VisionAgentOrchestrator.get_object_tracking().track_object(vid, bbox, max_frames)
        return json.dumps([b.model_dump() for b in tracked], indent=2)

    @staticmethod
    def _cmd_timeline(kwargs: Dict[str, Any]) -> str:
        """Generate a timeline for the video and return it as JSON."""
        vid = FilePath(value=kwargs["video"])
        # The interval is parsed as an integer (whole seconds) before being
        # wrapped, unlike extract-frames which accepts fractional values.
        interval = IntervalSeconds(value=float(int(kwargs["interval"])))
        timeline = asyncio.run(VisionAgentOrchestrator.get_video_timeline().generate_timeline(vid, interval))
        return json.dumps(timeline.model_dump(), indent=2)

    @staticmethod
    def _execute_video_cmd(command: str, kwargs: Dict[str, Any]) -> str | None:
        """Run a video command, or return ``None`` if ``command`` is not one."""
        handlers = {
            "video-info": VisionAgentOrchestrator._cmd_video_info,
            "extract-frames": VisionAgentOrchestrator._cmd_extract_frames,
            "convert": VisionAgentOrchestrator._cmd_convert,
            "check-corruption": VisionAgentOrchestrator._cmd_check_corruption,
            "create-gif": VisionAgentOrchestrator._cmd_create_gif,
            "detect-scenes": VisionAgentOrchestrator._cmd_detect_scenes,
            "detect-motion": VisionAgentOrchestrator._cmd_detect_motion,
            "track": VisionAgentOrchestrator._cmd_track,
            "timeline": VisionAgentOrchestrator._cmd_timeline,
        }
        handler = handlers.get(command)
        return handler(kwargs) if handler is not None else None

    # ------------------------------------------------------------------
    # Visual memory commands
    # ------------------------------------------------------------------

    @staticmethod
    def _execute_memory_cmd(command: str, kwargs: Dict[str, Any]) -> str | None:
        """Run a visual-memory command, or return ``None`` if ``command`` is not one.

        Handles ``memory-store``, ``memory-search`` and ``memory-list``.
        """
        if command == "memory-store":
            img = FilePath(value=kwargs["image"])
            label = MemoryLabel(value=kwargs["label"])
            stored = VisionAgentOrchestrator.get_visual_memory().remember_image(img, label)
            return json.dumps(stored.model_dump(), indent=2)

        if command == "memory-search":
            query = FilePath(value=kwargs["query"])
            max_distance = DistanceThreshold(value=int(kwargs["max_distance"]))
            res = VisionAgentOrchestrator.get_visual_memory().find_similar_images(query, max_distance)
            return json.dumps([r.model_dump() for r in res], indent=2)

        if command == "memory-list":
            import os

            index_file = os.path.join(os.path.expanduser("~/.vision-memory"), "index.json")
            try:
                # Explicit UTF-8 so the result does not depend on the platform
                # locale; opening directly avoids an exists-then-open race.
                with open(index_file, encoding="utf-8") as f:
                    data = json.load(f)
            except FileNotFoundError:
                return json.dumps({})
            return json.dumps(data, indent=2)

        return None

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    @classmethod
    def execute_in_process(cls, command: CommandName, kwargs: dict) -> CommandOutput:
        """Route and execute any command in-process across domains.

        The image, video and memory routers are tried in that order; the first
        one that returns a non-``None`` result wins.

        Args:
            command: Command name value object; a falsy value is treated as
                the empty command name.
            kwargs: Raw arguments for the command.

        Returns:
            The command's output. Any exception raised while executing is
            reported as ``{"error": <message>}`` JSON instead of propagating,
            and an unmatched command yields an ``Unknown command`` error.
        """
        try:
            cmd_val = command.value if command else ""
            routers = (
                cls._execute_image_cmd,
                cls._execute_video_cmd,
                cls._execute_memory_cmd,
            )
            for route in routers:
                result = route(cmd_val, kwargs)
                if result is not None:
                    return CommandOutput(value=result)
        except Exception as e:
            return CommandOutput(value=json.dumps({"error": str(e)}))

        return CommandOutput(value=json.dumps({"error": f"Unknown command: {cmd_val}"}))
@@ -0,0 +1,15 @@
1
+ from .image_processing_processor import ImageProcessingProcessor
2
+ from .video_processing_processor import VideoProcessingProcessor
3
+ from .video_analysis_analyzer import VideoAnalysisAnalyzer
4
+ from .object_tracking_tracker import ObjectTrackingTracker
5
+ from .visual_memory_store import VisualMemoryStore
6
+ from .video_timeline_generator import VideoTimelineGenerator
7
+
8
+ __all__ = [
9
+ "ImageProcessingProcessor",
10
+ "VideoProcessingProcessor",
11
+ "VideoAnalysisAnalyzer",
12
+ "ObjectTrackingTracker",
13
+ "VisualMemoryStore",
14
+ "VideoTimelineGenerator",
15
+ ]
@@ -0,0 +1,119 @@
1
+ from typing import Dict, Any, List
2
+ from src.contract import ImageProcessingProtocol
3
+ from src.contract import OpenCVImagePort
4
+ from src.contract import TesseractOCRPort
5
+ from src.contract import LLMVisionPort
6
+ from src.taxonomy import BoundingBox, Detection, VisionAnalysis, FilePath, LanguageCode, AnalysisPrompt, OcrText
7
+
8
+
9
class ImageProcessingProcessor(ImageProcessingProtocol):
    """Image capability: screenshot analysis, OCR, element detection, comparison."""

    def __init__(
        self,
        opencv_port: OpenCVImagePort,
        tesseract_port: TesseractOCRPort,
        llm_port: LLMVisionPort,
    ):
        self._opencv = opencv_port
        self._tesseract = tesseract_port
        self._llm = llm_port

    def analyze_screenshot(self, image_path: FilePath, prompt: AnalysisPrompt) -> VisionAnalysis:
        """Describe a screenshot, via the LLM when a prompt is given.

        Without a usable prompt the result is built from English OCR text plus
        detected elements. With a prompt the LLM port is asked first; if that
        raises, the same OCR + element result is returned with the failure
        message recorded in ``error``.
        """
        prompt_text = prompt.value if prompt else None

        if not prompt_text:
            # No prompt: plain OCR + element detection.
            ocr_text = self.extract_text(image_path, LanguageCode(value="eng")).value
            ui_elements = self.find_elements(image_path)
            return VisionAnalysis(source="opencv", text=ocr_text, elements=ui_elements)

        try:
            answer = self._llm.analyze_image(image_path.value, prompt_text)
            return VisionAnalysis(
                source="llm",
                text=answer,
                model=self._llm.model or "unknown",
            )
        except Exception as exc:
            # LLM path failed: degrade to the OpenCV result and keep the reason.
            return VisionAnalysis(
                source="opencv",
                text=self.extract_text(image_path, LanguageCode(value="eng")).value,
                elements=self.find_elements(image_path),
                error=str(exc),
            )

    def extract_text(self, image_path: FilePath, lang: LanguageCode) -> OcrText:
        """Run OCR on the image through the Tesseract port."""
        return OcrText(value=self._tesseract.extract_text(image_path, lang))

    def _boxes_larger_than(self, contours, min_area):
        """Yield a BoundingBox for every contour whose area exceeds ``min_area``."""
        for contour in contours:
            if self._opencv.get_contour_area(contour) > min_area:
                x, y, w, h = self._opencv.get_bounding_box(contour)
                yield BoundingBox(x=x, y=y, width=w, height=h)

    def find_elements(self, image_path: FilePath) -> List[Detection]:
        """Detect candidate UI elements from edge contours.

        Raises:
            ValueError: If the image cannot be loaded.
        """
        image = self._opencv.read_image(image_path)
        if image is None:
            raise ValueError(f"Failed to load image: {image_path.value}")

        edges = self._opencv.detect_edges(self._opencv.to_grayscale(image), 50, 150)
        contours = self._opencv.find_contours(edges)

        # Contours with area <= 100 are treated as noise.
        return [
            Detection(label="ui_element", confidence=1.0, bbox=box)
            for box in self._boxes_larger_than(contours, 100)
        ]

    def compare_screenshots(self, image_path1: FilePath, image_path2: FilePath) -> Dict[str, Any]:
        """Diff two screenshots and report the changed regions.

        The second image is resized to the first one's dimensions when their
        shapes differ. Returns a dict with ``identical``, ``phash_diff`` and
        ``differences`` (a list of dumped bounding boxes).

        Raises:
            ValueError: If either image cannot be loaded.
        """
        first = self._opencv.read_image(image_path1)
        second = self._opencv.read_image(image_path2)
        if first is None or second is None:
            raise ValueError("Failed to load one or both images")

        cv = self._opencv.cv2
        if first.shape != second.shape:
            second = cv.resize(second, (first.shape[1], first.shape[0]))

        gray_diff = self._opencv.to_grayscale(self._opencv.abs_diff(first, second))
        _, mask = cv.threshold(gray_diff, 30, 255, cv.THRESH_BINARY)
        contours = self._opencv.find_contours(mask)

        # Changed regions with area <= 50 are ignored.
        differences = [box.model_dump() for box in self._boxes_larger_than(contours, 50)]

        hash1 = self._opencv.compute_phash(first)
        hash2 = self._opencv.compute_phash(second)

        return {
            "identical": len(differences) == 0 and hash1 == hash2,
            "phash_diff": hash1 != hash2,
            "differences": differences,
        }
@@ -0,0 +1,98 @@
1
+ """Object tracking using OpenCV tracking algorithms."""
2
+
3
+ import cv2
4
+ from typing import List
5
+ from src.contract import ObjectTrackingProtocol
6
+ from src.contract import OpenCVImagePort
7
+ from src.taxonomy.vision_models_vo import BoundingBox, FilePath, MaxFrames
8
+
9
+
10
class ObjectTrackingTracker(ObjectTrackingProtocol):
    """Track objects through video frames using OpenCV trackers."""

    def __init__(self, opencv_port: OpenCVImagePort):
        self._opencv = opencv_port

    def _create_tracker(self):
        """Build the first available OpenCV tracker, or return ``None``.

        CSRT is tried before KCF. For each algorithm the factory is looked up
        on ``cv2`` first and then on ``cv2.legacy``. Lookups go through
        ``getattr`` because not every OpenCV build exposes these factories.
        """
        for factory_name in ("TrackerCSRT_create", "TrackerKCF_create"):
            try:
                for namespace in (cv2, getattr(cv2, "legacy", None)):
                    if namespace is None:
                        continue
                    factory = getattr(namespace, factory_name, None)
                    if factory is not None:
                        return factory()
            except Exception:
                # Best effort: a factory that exists but fails to construct
                # moves us on to the next algorithm rather than aborting.
                continue

        return None

    def track_object(
        self,
        video_path: FilePath,
        initial_box: BoundingBox,
        max_frames: MaxFrames,
    ) -> List[BoundingBox]:
        """Track an object starting from an initial bounding box.

        Args:
            video_path: Video to read frames from.
            initial_box: Object location in the first frame.
            max_frames: Upper bound on frames processed after the first one;
                a falsy value falls back to 300.

        Returns:
            ``initial_box`` followed by one box per successfully tracked frame.
            The list is empty if the video cannot be opened or read, no tracker
            is available, or tracker initialisation reports failure. Tracking
            stops at the first frame where the object is lost.
        """
        cap = self._opencv.get_video_capture(video_path.value)
        if not cap.isOpened():
            return []

        # Release the capture on every exit path, including exceptions raised
        # by frame reads, tracker calls, or BoundingBox construction.
        try:
            ret, frame = cap.read()
            if not ret:
                return []

            tracker = self._create_tracker()
            if tracker is None:
                return []

            bbox_tuple = (initial_box.x, initial_box.y, initial_box.width, initial_box.height)
            ok = tracker.init(frame, bbox_tuple)

            # OpenCV 4.x init returns None on success, not True
            if ok is False:
                return []

            boxes: List[BoundingBox] = [initial_box]
            max_frames_val = max_frames.value if max_frames else 300
            frame_count = 0

            while frame_count < max_frames_val:
                ret, frame = cap.read()
                if not ret:
                    break

                ok, bbox = tracker.update(frame)
                if not ok:
                    # Lost tracking — stop
                    break

                x, y, w, h = [int(v) for v in bbox]
                boxes.append(BoundingBox(x=x, y=y, width=w, height=h))
                frame_count += 1

            return boxes
        finally:
            cap.release()