PyPI - vision-arwaky - Versions diffs - 2.0.5__tar.gz - Mend

vision-arwaky 2.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

vision_arwaky-2.0.5/PKG-INFO ADDED Viewed

@@ -0,0 +1,16 @@
+Metadata-Version: 2.4
+Name: vision-arwaky
+Version: 2.0.5
+Summary: MCP server for unified image and video processing
+Author-email: rakaarwaky <arwaky90@gmail.com>
+Requires-Python: >=3.12
+Requires-Dist: mcp[cli]
+Requires-Dist: fastmcp
+Requires-Dist: pydantic
+Requires-Dist: opencv-contrib-python-headless
+Requires-Dist: pillow
+Requires-Dist: numpy
+Requires-Dist: pytesseract
+Requires-Dist: requests
+Requires-Dist: pyyaml
+Requires-Dist: llama-cpp-python

vision_arwaky-2.0.5/README.md ADDED Viewed

@@ -0,0 +1,11 @@
+# Vision Arwaky
+The unified computer vision server based on the AES 5 Domains Architecture.
+## Architecture
+The server is structured into modular domains (Max Depth 5):
+- `src/taxonomy/`: Data models (DNA)
+- `src/capabilities/`: Business logic slice endpoints
+- `src/infrastructure/`: Technology adapters (OpenCV, FFmpeg, Tesseract)
+- `src/surfaces/`: MCP Interface for the agent
+- `src/main.py`: Bootstrap wiring and initialization

vision_arwaky-2.0.5/pyproject.toml ADDED Viewed

@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "vision-arwaky"
+version = "2.0.5"
+description = "MCP server for unified image and video processing"
+requires-python = ">=3.12"
+authors = [
+    {name = "rakaarwaky", email = "arwaky90@gmail.com"}
+]
+dependencies = [
+    "mcp[cli]",
+    "fastmcp",
+    "pydantic",
+    "opencv-contrib-python-headless",
+    "pillow",
+    "numpy",
+    "pytesseract",
+    "requests",
+    "pyyaml",
+    "llama-cpp-python",
+]
+[project.scripts]
+vision-arwaky = "src.mcp_entry:main"
+vision-cli = "src.cli_entry:cli"
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["src*"]

vision_arwaky-2.0.5/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

vision_arwaky-2.0.5/src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # Vision MCP Source Package

vision_arwaky-2.0.5/src/agent/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .vision_agent_orchestrator import VisionAgentOrchestrator
+__all__ = ["VisionAgentOrchestrator"]

vision_arwaky-2.0.5/src/agent/vision_agent_orchestrator.py ADDED Viewed

@@ -0,0 +1,282 @@
+"""Vision Agent Orchestrator — dynamic Application Orchestrator and service locator."""
+import importlib
+import asyncio
+import json
+from typing import Any, Dict
+from src.taxonomy import (
+    FilePath,
+    LanguageCode,
+    TimeSegment,
+    BoundingBox,
+    CommandName,
+    CommandOutput,
+    MemoryLabel,
+    DistanceThreshold,
+    SceneThreshold,
+    MinArea,
+    AnalysisPrompt,
+    IntervalSeconds,
+    MaxFrames,
+)
+from src.contract import (
+    RegistryServiceAggregate,
+    SystemUtilsPort,
+    OpenCVImagePort,
+    TesseractOCRPort,
+    FFmpegVideoPort,
+    LLMVisionPort,
+    ImageProcessingProtocol,
+    VideoProcessingProtocol,
+    VideoAnalysisProtocol,
+    ObjectTrackingProtocol,
+    VisualMemoryProtocol,
+    VideoTimelineProtocol,
+)
+class VisionAgentOrchestrator(RegistryServiceAggregate):
+    """Orchestrator and locator for Vision capabilities."""
+    @staticmethod
+    def get_utils() -> SystemUtilsPort:
+        """Instantiate concrete Utils adapter dynamically."""
+        module = importlib.import_module("src.infrastructure.system_utils_util")
+        cls = getattr(module, "SystemUtilsUtil")
+        return cls()
+    @staticmethod
+    def get_opencv() -> OpenCVImagePort:
+        """Instantiate concrete OpenCV adapter dynamically."""
+        module = importlib.import_module("src.infrastructure.opencv_image_adapter")
+        cls = getattr(module, "OpenCVImageAdapter")
+        return cls()
+    @staticmethod
+    def get_tesseract() -> TesseractOCRPort:
+        """Instantiate concrete Tesseract adapter dynamically."""
+        module = importlib.import_module("src.infrastructure.tesseract_ocr_adapter")
+        cls = getattr(module, "TesseractOCRAdapter")
+        return cls()
+    @staticmethod
+    def get_ffmpeg() -> FFmpegVideoPort:
+        """Instantiate concrete FFmpeg adapter dynamically."""
+        module = importlib.import_module("src.infrastructure.ffmpeg_video_adapter")
+        cls = getattr(module, "FFmpegVideoAdapter")
+        return cls()
+    @staticmethod
+    def get_llm() -> LLMVisionPort:
+        """Instantiate concrete LLM adapter dynamically."""
+        module = importlib.import_module("src.infrastructure.llm_vision_adapter")
+        cls = getattr(module, "LLMVisionAdapter")
+        return cls()
+    @staticmethod
+    def get_image_processing() -> ImageProcessingProtocol:
+        """Instantiate concrete ImageProcessingProcessor dynamically with injected ports."""
+        cap_mod = importlib.import_module("src.capabilities.image_processing_processor")
+        cap_cls = getattr(cap_mod, "ImageProcessingProcessor")
+        return cap_cls(
+            opencv_port=VisionAgentOrchestrator.get_opencv(),
+            tesseract_port=VisionAgentOrchestrator.get_tesseract(),
+            llm_port=VisionAgentOrchestrator.get_llm(),
+        )
+    @staticmethod
+    def get_video_processing() -> VideoProcessingProtocol:
+        """Instantiate concrete VideoProcessingProcessor dynamically with injected ports."""
+        cap_mod = importlib.import_module("src.capabilities.video_processing_processor")
+        cap_cls = getattr(cap_mod, "VideoProcessingProcessor")
+        return cap_cls(
+            opencv_port=VisionAgentOrchestrator.get_opencv(),
+            ffmpeg_port=VisionAgentOrchestrator.get_ffmpeg(),
+        )
+    @staticmethod
+    def get_video_analysis() -> VideoAnalysisProtocol:
+        """Instantiate concrete VideoAnalysisAnalyzer dynamically with injected ports."""
+        cap_mod = importlib.import_module("src.capabilities.video_analysis_analyzer")
+        cap_cls = getattr(cap_mod, "VideoAnalysisAnalyzer")
+        return cap_cls(
+            opencv_port=VisionAgentOrchestrator.get_opencv(),
+        )
+    @staticmethod
+    def get_object_tracking() -> ObjectTrackingProtocol:
+        """Instantiate concrete ObjectTrackingTracker dynamically with injected ports."""
+        cap_mod = importlib.import_module("src.capabilities.object_tracking_tracker")
+        cap_cls = getattr(cap_mod, "ObjectTrackingTracker")
+        return cap_cls(
+            opencv_port=VisionAgentOrchestrator.get_opencv(),
+        )
+    @staticmethod
+    def get_visual_memory() -> VisualMemoryProtocol:
+        """Instantiate concrete VisualMemoryStore dynamically with injected ports."""
+        cap_mod = importlib.import_module("src.capabilities.visual_memory_store")
+        cap_cls = getattr(cap_mod, "VisualMemoryStore")
+        return cap_cls(
+            opencv_port=VisionAgentOrchestrator.get_opencv(),
+            utils_port=VisionAgentOrchestrator.get_utils(),
+        )
+    @staticmethod
+    def get_video_timeline() -> VideoTimelineProtocol:
+        """Instantiate concrete VideoTimelineGenerator dynamically with injected ports."""
+        cap_mod = importlib.import_module("src.capabilities.video_timeline_generator")
+        cap_cls = getattr(cap_mod, "VideoTimelineGenerator")
+        return cap_cls(
+            opencv_port=VisionAgentOrchestrator.get_opencv(),
+            video_cap=VisionAgentOrchestrator.get_video_processing(),
+            analysis_cap=VisionAgentOrchestrator.get_video_analysis(),
+        )
+    @staticmethod
+    def _execute_image_cmd(command: str, kwargs: Dict[str, Any]) -> str | None:
+        if command == "analyze":
+            img = FilePath(value=kwargs["image"])
+            prompt_val = kwargs.get("prompt")
+            prompt = AnalysisPrompt(value=prompt_val)
+            return json.dumps(VisionAgentOrchestrator.get_image_processing().analyze_screenshot(img, prompt).model_dump(), indent=2)
+        elif command == "ocr":
+            img = FilePath(value=kwargs["image"])
+            lang_val = kwargs.get("lang") or "eng"
+            lang = LanguageCode(value=lang_val)
+            return VisionAgentOrchestrator.get_image_processing().extract_text(img, lang).value
+        elif command == "elements":
+            img = FilePath(value=kwargs["image"])
+            return json.dumps([e.model_dump() for e in VisionAgentOrchestrator.get_image_processing().find_elements(img)], indent=2)
+        elif command == "compare":
+            img1 = FilePath(value=kwargs["image1"])
+            img2 = FilePath(value=kwargs["image2"])
+            return json.dumps(VisionAgentOrchestrator.get_image_processing().compare_screenshots(img1, img2), indent=2)
+        return None
+    @staticmethod
+    def _cmd_video_info(kwargs: Dict[str, Any]) -> str:
+        vid = FilePath(value=kwargs["video"])
+        return json.dumps(VisionAgentOrchestrator.get_video_processing().get_info(vid).model_dump(), indent=2)
+    @staticmethod
+    def _cmd_extract_frames(kwargs: Dict[str, Any]) -> str:
+        interval_val = float(kwargs["interval"])
+        interval = IntervalSeconds(value=interval_val)
+        res = asyncio.run(VisionAgentOrchestrator.get_video_processing().extract_frames(FilePath(value=kwargs["video"]), interval))
+        return json.dumps([r.value for r in res], indent=2)
+    @staticmethod
+    def _cmd_convert(kwargs: Dict[str, Any]) -> str:
+        inp = FilePath(value=kwargs["input_path"])
+        out = FilePath(value=kwargs["output_path"])
+        res = asyncio.run(VisionAgentOrchestrator.get_video_processing().convert_format(inp, out))
+        return json.dumps({"success": res})
+    @staticmethod
+    def _cmd_check_corruption(kwargs: Dict[str, Any]) -> str:
+        res = VisionAgentOrchestrator.get_video_processing().check_corruption(FilePath(value=kwargs["video"]))
+        return json.dumps({"corrupted": res})
+    @staticmethod
+    def _cmd_create_gif(kwargs: Dict[str, Any]) -> str:
+        vid = FilePath(value=kwargs["video"])
+        out = FilePath(value=kwargs["output_path"])
+        start = float(kwargs["start"]) if kwargs["start"] else None
+        duration = float(kwargs["duration"]) if kwargs["duration"] else None
+        segment = TimeSegment(start=start, duration=duration)
+        res = asyncio.run(VisionAgentOrchestrator.get_video_processing().create_gif(vid, out, segment))
+        return json.dumps({"success": res})
+    @staticmethod
+    def _cmd_detect_scenes(kwargs: Dict[str, Any]) -> str:
+        vid = FilePath(value=kwargs["video"])
+        thresh_val = float(kwargs["threshold"])
+        threshold = SceneThreshold(value=thresh_val)
+        return json.dumps([s.model_dump() for s in VisionAgentOrchestrator.get_video_analysis().detect_scenes(vid, threshold)], indent=2)
+    @staticmethod
+    def _cmd_detect_motion(kwargs: Dict[str, Any]) -> str:
+        vid = FilePath(value=kwargs["video"])
+        min_area_val = int(kwargs["min_area"])
+        min_area = MinArea(value=min_area_val)
+        return json.dumps([m.model_dump() for m in VisionAgentOrchestrator.get_video_analysis().detect_motion(vid, min_area)], indent=2)
+    @staticmethod
+    def _cmd_track(kwargs: Dict[str, Any]) -> str:
+        vid = FilePath(value=kwargs["video"])
+        x, y, w, h = [int(v) for v in kwargs["bbox"].split(",")]
+        bbox = BoundingBox(x=x, y=y, width=w, height=h)
+        max_frames_val = int(kwargs["max_frames"])
+        max_frames = MaxFrames(value=max_frames_val)
+        return json.dumps([b.model_dump() for b in VisionAgentOrchestrator.get_object_tracking().track_object(vid, bbox, max_frames)], indent=2)
+    @staticmethod
+    def _cmd_timeline(kwargs: Dict[str, Any]) -> str:
+        vid = FilePath(value=kwargs["video"])
+        interval_val = int(kwargs["interval"])
+        interval = IntervalSeconds(value=float(interval_val))
+        return json.dumps(asyncio.run(VisionAgentOrchestrator.get_video_timeline().generate_timeline(vid, interval)).model_dump(), indent=2)
+    @staticmethod
+    def _execute_video_cmd(command: str, kwargs: Dict[str, Any]) -> str | None:
+        handlers = {
+            "video-info": VisionAgentOrchestrator._cmd_video_info,
+            "extract-frames": VisionAgentOrchestrator._cmd_extract_frames,
+            "convert": VisionAgentOrchestrator._cmd_convert,
+            "check-corruption": VisionAgentOrchestrator._cmd_check_corruption,
+            "create-gif": VisionAgentOrchestrator._cmd_create_gif,
+            "detect-scenes": VisionAgentOrchestrator._cmd_detect_scenes,
+            "detect-motion": VisionAgentOrchestrator._cmd_detect_motion,
+            "track": VisionAgentOrchestrator._cmd_track,
+            "timeline": VisionAgentOrchestrator._cmd_timeline,
+        }
+        if command in handlers:
+            return handlers[command](kwargs)
+        return None
+    @staticmethod
+    def _execute_memory_cmd(command: str, kwargs: Dict[str, Any]) -> str | None:
+        if command == "memory-store":
+            img = FilePath(value=kwargs["image"])
+            label = MemoryLabel(value=kwargs["label"])
+            return json.dumps(VisionAgentOrchestrator.get_visual_memory().remember_image(img, label).model_dump(), indent=2)
+        elif command == "memory-search":
+            query = FilePath(value=kwargs["query"])
+            max_dist_val = int(kwargs["max_distance"])
+            max_distance = DistanceThreshold(value=max_dist_val)
+            res = VisionAgentOrchestrator.get_visual_memory().find_similar_images(query, max_distance)
+            return json.dumps([r.model_dump() for r in res], indent=2)
+        elif command == "memory-list":
+            import os
+            memory_dir = os.path.expanduser("~/.vision-memory")
+            index_file = os.path.join(memory_dir, "index.json")
+            if os.path.exists(index_file):
+                with open(index_file) as f:
+                    data = json.load(f)
+                return json.dumps(data, indent=2)
+            return json.dumps({})
+        return None
+    @classmethod
+    def execute_in_process(cls, command: CommandName, kwargs: dict) -> CommandOutput:
+        """Route and execute any command in-process across domains."""
+        try:
+            cmd_val = command.value if command else ""
+            img_res = cls._execute_image_cmd(cmd_val, kwargs)
+            if img_res is not None:
+                return CommandOutput(value=img_res)
+            vid_res = cls._execute_video_cmd(cmd_val, kwargs)
+            if vid_res is not None:
+                return CommandOutput(value=vid_res)
+            mem_res = cls._execute_memory_cmd(cmd_val, kwargs)
+            if mem_res is not None:
+                return CommandOutput(value=mem_res)
+        except Exception as e:
+            return CommandOutput(value=json.dumps({"error": str(e)}))
+        return CommandOutput(value=json.dumps({"error": f"Unknown command: {command.value if command else ''}"}))

vision_arwaky-2.0.5/src/capabilities/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+from .image_processing_processor import ImageProcessingProcessor
+from .video_processing_processor import VideoProcessingProcessor
+from .video_analysis_analyzer import VideoAnalysisAnalyzer
+from .object_tracking_tracker import ObjectTrackingTracker
+from .visual_memory_store import VisualMemoryStore
+from .video_timeline_generator import VideoTimelineGenerator
+__all__ = [
+    "ImageProcessingProcessor",
+    "VideoProcessingProcessor",
+    "VideoAnalysisAnalyzer",
+    "ObjectTrackingTracker",
+    "VisualMemoryStore",
+    "VideoTimelineGenerator",
+]

vision_arwaky-2.0.5/src/capabilities/image_processing_processor.py ADDED Viewed

@@ -0,0 +1,119 @@
+from typing import Dict, Any, List
+from src.contract import ImageProcessingProtocol
+from src.contract import OpenCVImagePort
+from src.contract import TesseractOCRPort
+from src.contract import LLMVisionPort
+from src.taxonomy import BoundingBox, Detection, VisionAnalysis, FilePath, LanguageCode, AnalysisPrompt, OcrText
+class ImageProcessingProcessor(ImageProcessingProtocol):
+    """Image processing capability executing screenshot analysis and comparisons."""
+    def __init__(
+        self,
+        opencv_port: OpenCVImagePort,
+        tesseract_port: TesseractOCRPort,
+        llm_port: LLMVisionPort,
+    ):
+        self._opencv = opencv_port
+        self._tesseract = tesseract_port
+        self._llm = llm_port
+    def analyze_screenshot(self, image_path: FilePath, prompt: AnalysisPrompt) -> VisionAnalysis:
+        """Analyze screenshot for UI elements and text.
+        If prompt is provided and a local VLM is available, use LLM for
+        open-ended visual analysis. Otherwise fallback to OCR + element detection.
+        """
+        p_val = prompt.value if prompt else None
+        if p_val:
+            try:
+                analysis = self._llm.analyze_image(image_path.value, p_val)
+                return VisionAnalysis(
+                    source="llm",
+                    text=analysis,
+                    model=self._llm.model or "unknown",
+                )
+            except Exception as e:
+                # Fallback to OpenCV if LLM fails
+                return VisionAnalysis(
+                    source="opencv",
+                    text=self.extract_text(image_path, LanguageCode(value="eng")).value,
+                    elements=self.find_elements(image_path),
+                    error=str(e),
+                )
+        # Default: OCR + element detection
+        text = self.extract_text(image_path, LanguageCode(value="eng")).value
+        elements = self.find_elements(image_path)
+        return VisionAnalysis(
+            source="opencv",
+            text=text,
+            elements=elements,
+        )
+    def extract_text(self, image_path: FilePath, lang: LanguageCode) -> OcrText:
+        """Extract text from image using OCR."""
+        text_str = self._tesseract.extract_text(image_path, lang)
+        return OcrText(value=text_str)
+    def find_elements(self, image_path: FilePath) -> List[Detection]:
+        """Find UI elements (buttons, input fields, etc)."""
+        image = self._opencv.read_image(image_path)
+        if image is None:
+            raise ValueError(f"Failed to load image: {image_path.value}")
+        gray = self._opencv.to_grayscale(image)
+        edges = self._opencv.detect_edges(gray, 50, 150)
+        contours = self._opencv.find_contours(edges)
+        detections = []
+        for cnt in contours:
+            area = self._opencv.get_contour_area(cnt)
+            if area > 100:  # Filter out noise
+                x, y, w, h = self._opencv.get_bounding_box(cnt)
+                detections.append(
+                    Detection(
+                        label="ui_element",
+                        confidence=1.0,
+                        bbox=BoundingBox(x=x, y=y, width=w, height=h),
+                    )
+                )
+        return detections
+    def compare_screenshots(self, image_path1: FilePath, image_path2: FilePath) -> Dict[str, Any]:
+        """Compare two screenshots and find differences."""
+        img1 = self._opencv.read_image(image_path1)
+        img2 = self._opencv.read_image(image_path2)
+        if img1 is None or img2 is None:
+            raise ValueError("Failed to load one or both images")
+        if img1.shape != img2.shape:
+            img2 = self._opencv.cv2.resize(img2, (img1.shape[1], img1.shape[0]))
+        diff = self._opencv.abs_diff(img1, img2)
+        gray_diff = self._opencv.to_grayscale(diff)
+        _, thresh = self._opencv.cv2.threshold(
+            gray_diff, 30, 255, self._opencv.cv2.THRESH_BINARY
+        )
+        contours = self._opencv.find_contours(thresh)
+        differences = []
+        for cnt in contours:
+            area = self._opencv.get_contour_area(cnt)
+            if area > 50:
+                x, y, w, h = self._opencv.get_bounding_box(cnt)
+                differences.append(
+                    BoundingBox(x=x, y=y, width=w, height=h).model_dump()
+                )
+        hash1 = self._opencv.compute_phash(img1)
+        hash2 = self._opencv.compute_phash(img2)
+        return {
+            "identical": len(differences) == 0 and hash1 == hash2,
+            "phash_diff": hash1 != hash2,
+            "differences": differences,
+        }

vision_arwaky-2.0.5/src/capabilities/object_tracking_tracker.py ADDED Viewed

@@ -0,0 +1,98 @@
+"""Object tracking using OpenCV tracking algorithms."""
+import cv2
+from typing import List
+from src.contract import ObjectTrackingProtocol
+from src.contract import OpenCVImagePort
+from src.taxonomy.vision_models_vo import BoundingBox, FilePath, MaxFrames
+class ObjectTrackingTracker(ObjectTrackingProtocol):
+    """Track objects through video frames using OpenCV trackers."""
+    def __init__(self, opencv_port: OpenCVImagePort) -> None:
+        """Store the OpenCV port; ``track_object`` uses it to open the video capture."""
+        self._opencv = opencv_port
+    def _create_tracker(self):
+        """Helper to dynamically construct the OpenCV tracker to avoid complexity and mypy issues."""
+        try:
+            csrt_creator = getattr(cv2, "TrackerCSRT_create", None)
+            if csrt_creator is not None:
+                return csrt_creator()
+            legacy = getattr(cv2, "legacy", None)
+            if legacy is not None:
+                legacy_csrt_creator = getattr(legacy, "TrackerCSRT_create", None)
+                if legacy_csrt_creator is not None:
+                    return legacy_csrt_creator()
+        except Exception as e:
+            _err = str(e)
+        try:
+            kcf_creator = getattr(cv2, "TrackerKCF_create", None)
+            if kcf_creator is not None:
+                return kcf_creator()
+            legacy = getattr(cv2, "legacy", None)
+            if legacy is not None:
+                legacy_kcf_creator = getattr(legacy, "TrackerKCF_create", None)
+                if legacy_kcf_creator is not None:
+                    return legacy_kcf_creator()
+        except Exception as e:
+            _err = str(e)
+        return None
+    def track_object(
+        self,
+        video_path: FilePath,
+        initial_box: BoundingBox,
+        max_frames: MaxFrames,
+    ) -> List[BoundingBox]:
+        """Track an object starting from an initial bounding box."""
+        cap = self._opencv.get_video_capture(video_path.value)
+        if not cap.isOpened():
+            return []
+        # Read first frame
+        ret, frame = cap.read()
+        if not ret:
+            cap.release()
+            return []
+        # Initialize tracker dynamically to avoid static mypy type ignores
+        tracker = self._create_tracker()
+        if tracker is None:
+            cap.release()
+            return []
+        bbox_tuple = (initial_box.x, initial_box.y, initial_box.width, initial_box.height)
+        ok = tracker.init(frame, bbox_tuple)
+        # OpenCV 4.x init returns None on success, not True
+        if ok is False:
+            cap.release()
+            return []
+        boxes: List[BoundingBox] = [initial_box]
+        frame_count = 0
+        max_frames_val = max_frames.value if max_frames else 300
+        while frame_count < max_frames_val:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            ok, bbox = tracker.update(frame)
+            if ok:
+                x, y, w, h = [int(v) for v in bbox]
+                boxes.append(BoundingBox(x=x, y=y, width=w, height=h))
+            else:
+                # Lost tracking — stop
+                break
+            frame_count += 1
+        cap.release()
+        return boxes