PyPI - screenforge - Versions diffs - 0.4.0__py3-none-any.whl - Mend

screenforge 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

cli/__init__.py +0 -0
cli/_version.py +1 -0
cli/dispatch.py +266 -0
cli/doctor.py +487 -0
cli/modes/__init__.py +0 -0
cli/modes/action.py +262 -0
cli/modes/default.py +248 -0
cli/modes/demo.py +162 -0
cli/modes/dry_run.py +237 -0
cli/modes/init.py +133 -0
cli/modes/plan.py +148 -0
cli/modes/workflow.py +354 -0
cli/parser.py +305 -0
cli/reporter.py +207 -0
cli/session.py +146 -0
cli/shared.py +427 -0
cli/shorthand.py +90 -0
cli/tool_protocol_handlers.py +446 -0
common/__init__.py +0 -0
common/adapters/__init__.py +21 -0
common/adapters/android_adapter.py +273 -0
common/adapters/base_adapter.py +24 -0
common/adapters/ios_adapter.py +278 -0
common/adapters/web_adapter.py +271 -0
common/ai.py +277 -0
common/ai_autonomous.py +273 -0
common/ai_heal.py +222 -0
common/cache/__init__.py +15 -0
common/cache/cache_hash.py +57 -0
common/cache/cache_manager.py +300 -0
common/cache/cache_stats.py +133 -0
common/cache/cache_storage.py +79 -0
common/cache/embedding_loader.py +150 -0
common/capabilities.py +121 -0
common/case_memory.py +327 -0
common/error_codes.py +61 -0
common/exceptions.py +18 -0
common/executor.py +1504 -0
common/failure_diagnosis.py +138 -0
common/history_manager.py +75 -0
common/logs.py +168 -0
common/mcp_server.py +467 -0
common/preflight.py +496 -0
common/progress.py +37 -0
common/run_reporter.py +415 -0
common/run_resume.py +149 -0
common/runtime_modes.py +35 -0
common/tool_protocol.py +196 -0
common/visual_fallback.py +71 -0
common/workflow_schema.py +150 -0
config/__init__.py +0 -0
config/config.py +167 -0
config/env_loader.py +76 -0
screenforge-0.4.0.dist-info/METADATA +43 -0
screenforge-0.4.0.dist-info/RECORD +64 -0
screenforge-0.4.0.dist-info/WHEEL +5 -0
screenforge-0.4.0.dist-info/entry_points.txt +2 -0
screenforge-0.4.0.dist-info/licenses/LICENSE +21 -0
screenforge-0.4.0.dist-info/top_level.txt +4 -0
utils/__init__.py +0 -0
utils/screenshot_annotator.py +60 -0
utils/utils_ios.py +195 -0
utils/utils_web.py +304 -0
utils/utils_xml.py +218 -0

common/adapters/web_adapter.py ADDED Viewed

@@ -0,0 +1,271 @@
+import json
+import os
+import subprocess
+import sys
+import time
+import config.config as config
+from common.logs import log
+from .base_adapter import BasePlatformAdapter
+# Absolute path of the JSON file that records the persistent browser's CDP URL
+# and PID (see _write_session). Resolved against the working directory at
+# import time.
+_SESSION_FILE = os.path.abspath(os.path.join("report", "web_session.json"))
+# Port handed to Chromium via --remote-debugging-port and used to build the
+# CDP URL the adapter connects to.
+_CDP_PORT = 9333
+def _read_session() -> dict | None:
+    if not os.path.exists(_SESSION_FILE):
+        return None
+    try:
+        with open(_SESSION_FILE, "r") as f:
+            return json.load(f)
+    except Exception:
+        return None
+def _write_session(cdp_url: str, pid: int) -> None:
+    os.makedirs(os.path.dirname(_SESSION_FILE), exist_ok=True)
+    with open(_SESSION_FILE, "w") as f:
+        json.dump({"cdp_url": cdp_url, "pid": pid}, f)
+def _clear_session() -> None:
+    if os.path.exists(_SESSION_FILE):
+        os.remove(_SESSION_FILE)
+def _is_process_alive(pid: int) -> bool:
+    try:
+        os.kill(pid, 0)
+    except (OSError, ProcessLookupError):
+        return False
+    # os.kill(pid, 0) succeeds for a ZOMBIE (killed but not yet reaped by its
+    # parent — common here because Chromium's parent is Playwright's node driver,
+    # which only reaps on its own exit). A zombie is dead, not alive. On POSIX,
+    # check the process state and treat Z/defunct as not-alive so the reaper and
+    # the reconnect path don't mistake a corpse for a live browser.
+    if sys.platform != "win32":
+        try:
+            state = subprocess.run(
+                ["ps", "-o", "state=", "-p", str(pid)],
+                capture_output=True, text=True, timeout=3,
+            ).stdout.strip()
+            if state.startswith("Z"):
+                return False
+        except Exception:
+            pass  # ps unavailable → fall back to the os.kill result
+    return True
+def stop_persistent_browser() -> bool:
+    """Terminate the detached persistent Chromium recorded in the session file.
+    The web adapter launches Chromium with --remote-debugging-port and keeps it
+    running across CLI calls (teardown only disconnects). Nothing else ever kills
+    it, so repeated runs leak browsers holding port 9333. This is the explicit
+    reaper, wired to `--web-stop`. Returns True if a live process was signalled.
+    """
+    import signal
+    session = _read_session()
+    if not session:
+        log.info("ℹ️ [System] No persistent web browser session on record")
+        return False
+    pid = session.get("pid", 0)
+    if not pid or not _is_process_alive(pid):
+        log.info("ℹ️ [System] Recorded browser is not running; clearing stale session")
+        _clear_session()
+        return False
+    try:
+        if sys.platform == "win32":
+            import subprocess as _sp
+            _sp.run(["taskkill", "/F", "/T", "/PID", str(pid)], capture_output=True)
+        else:
+            # Chromium ignores SIGTERM while a CDP client is attached, so escalate
+            # to SIGKILL if it's still up after a short grace period. (A leftover
+            # zombie — parent not yet reaped — reads as not-alive via _is_process_alive.)
+            os.kill(pid, signal.SIGTERM)
+            for _ in range(10):
+                time.sleep(0.2)
+                if not _is_process_alive(pid):
+                    break
+            else:
+                try:
+                    os.kill(pid, signal.SIGKILL)
+                except ProcessLookupError:
+                    pass
+        log.info(f"✅ [System] Stopped persistent Chromium (pid {pid})")
+        _clear_session()
+        return True
+    except Exception as e:
+        log.error(f"❌ [Error] Failed to stop persistent browser (pid {pid}): {e}")
+        return False
+class WebPlaywrightAdapter(BasePlatformAdapter):
+    def __init__(self):
+        super().__init__()
+        self.playwright = None
+        self.browser = None
+        self.context = None
+        self.driver = None
+        self._chromium_process = None
+        self.state_file = os.path.abspath(os.path.join("report", "browser_state.json"))
+        self.viewport_size = {"width": 1920, "height": 1080}
+    def setup(self):
+        log.info("⏱️ [System] Initializing Web (Playwright) browser...")
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError:
+            log.error("❌ [Error] playwright not installed. Run: pip install playwright && playwright install")
+            raise
+        self.playwright = sync_playwright().start()
+        if self._try_reconnect():
+            return
+        self._launch_persistent_browser()
+    def _try_reconnect(self) -> bool:
+        session = _read_session()
+        if not session:
+            return False
+        cdp_url = session.get("cdp_url", "")
+        pid = session.get("pid", 0)
+        if not cdp_url or not _is_process_alive(pid):
+            log.info("⚠️ [System] Persistent browser no longer exists, launching new one")
+            _clear_session()
+            return False
+        try:
+            self.browser = self.playwright.chromium.connect_over_cdp(cdp_url)
+            log.info("✅ [System] Reconnected to persistent browser session")
+            if self.browser.contexts:
+                self.context = self.browser.contexts[0]
+                if self.context.pages:
+                    self.driver = self.context.pages[0]
+                    self.driver.set_default_timeout(config.DEFAULT_TIMEOUT * 1000)
+                    log.info(f"✅ [System] Reusing existing page: {self.driver.url}")
+                    return True
+            self._create_context_and_page()
+            return True
+        except Exception as e:
+            log.info(f"⚠️ [System] Reconnect failed ({e}), launching new browser")
+            _clear_session()
+            return False
+    def _launch_persistent_browser(self):
+        chromium_path = self.playwright.chromium.executable_path
+        if not chromium_path or not os.path.exists(chromium_path):
+            log.error("❌ [Error] Playwright Chromium not found. Run: playwright install chromium")
+            raise RuntimeError("Playwright Chromium not found")
+        cdp_url = f"http://127.0.0.1:{_CDP_PORT}"
+        user_data_dir = os.path.abspath(os.path.join("report", "chromium_profile"))
+        os.makedirs(user_data_dir, exist_ok=True)
+        log.info(f"🚀 [System] Launching persistent Chromium (CDP: {cdp_url})...")
+        self._chromium_process = subprocess.Popen(
+            [
+                chromium_path,
+                f"--remote-debugging-port={_CDP_PORT}",
+                f"--user-data-dir={user_data_dir}",
+                "--no-first-run",
+                "--no-default-browser-check",
+                f"--window-size={self.viewport_size['width']},{self.viewport_size['height']}",
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        for _ in range(30):
+            try:
+                import urllib.request
+                urllib.request.urlopen(f"{cdp_url}/json/version", timeout=1)
+                break
+            except Exception:
+                time.sleep(0.5)
+        else:
+            raise RuntimeError(f"Chromium CDP port {_CDP_PORT} startup timed out")
+        _write_session(cdp_url, self._chromium_process.pid)
+        self.browser = self.playwright.chromium.connect_over_cdp(cdp_url)
+        log.info("✅ [System] Persistent Chromium launched and connected")
+        if self.browser.contexts and self.browser.contexts[0].pages:
+            self.context = self.browser.contexts[0]
+            self.driver = self.context.pages[0]
+            self.driver.set_default_timeout(config.DEFAULT_TIMEOUT * 1000)
+        else:
+            self._create_context_and_page()
+    def _create_context_and_page(self):
+        # NOTE: no video recording here. The adapter attaches to Chromium over CDP
+        # (connect_over_cdp) for cross-call session reuse, and Playwright cannot
+        # record video for a CDP-attached browser — `page.video` yields an object
+        # but no file is ever written. Recording only works for a browser
+        # Playwright launches itself, which this design does not do. Web video is
+        # therefore unsupported; see stop_record_and_get_path().
+        if self.browser.contexts:
+            self.context = self.browser.contexts[0]
+        else:
+            self.context = self.browser.new_context(viewport=self.viewport_size)
+        if os.path.exists(self.state_file):
+            try:
+                import json as _json
+                with open(self.state_file, "r") as f:
+                    state = _json.load(f)
+                for cookie in state.get("cookies", []):
+                    self.context.add_cookies([cookie])
+                log.info(f"✅ [System] Restored browser state from: {self.state_file}")
+            except Exception as e:
+                log.warning(f"⚠️ [Warning] Failed to restore browser state: {e}")
+        self.driver = self.context.new_page()
+        self.driver.set_default_timeout(config.DEFAULT_TIMEOUT * 1000)
+    def teardown(self):
+        log.info("⏱️ [System] Saving state and disconnecting (browser keeps running)...")
+        try:
+            if self.context:
+                try:
+                    os.makedirs(os.path.dirname(self.state_file), exist_ok=True)
+                    self.context.storage_state(path=self.state_file)
+                    log.info(f"✅ [System] Browser state saved to: {self.state_file}")
+                except Exception as e:
+                    log.warning(f"⚠️ [Warning] Failed to save browser state: {e}")
+        finally:
+            if self.playwright:
+                try:
+                    self.playwright.stop()
+                except Exception:
+                    pass
+    def start_record(self, video_name: str):
+        # Web video recording is unsupported (CDP-attached browser; see
+        # _create_context_and_page). No-op kept for the adapter interface.
+        log.info("ℹ️ [System] Web video recording is not supported (CDP session)")
+    def stop_record_and_get_path(self, video_name: str) -> str:
+        # Unsupported on web — return "" cleanly. (This also avoids the old
+        # AttributeError: stop used to call self.driver.video.path() which is
+        # None when no record_video_dir was set.)
+        return ""
+    def take_screenshot(self) -> bytes:
+        if self.driver:
+            return self.driver.screenshot()
+        return b""

common/ai.py ADDED Viewed

@@ -0,0 +1,277 @@
+import json
+import time
+from openai import OpenAI
+import config.config as config
+from common.cache import CacheManager
+from common.logs import log
+from common.progress import ai_status
+class AIBrain:
+    def __init__(self):
+        # 实例化文本专属客户端
+        self.text_client = OpenAI(
+            api_key=config.OPENAI_API_KEY, base_url=config.OPENAI_BASE_URL
+        )
+        # 实例化视觉专属客户端
+        self.vision_client = OpenAI(
+            api_key=config.VISION_API_KEY, base_url=config.VISION_BASE_URL
+        )
+        self.cache_manager = CacheManager(
+            cache_dir=config.CACHE_DIR,
+            enabled=config.CACHE_ENABLED,
+            ttl_days=config.CACHE_TTL_DAYS,
+            max_size_mb=config.CACHE_MAX_SIZE_MB,
+        )
+    def _verify_locator_in_ui(self, decision: dict, ui_dict: dict) -> bool:
+        """
+        校验缓存中推荐的动作，其元素是否真实存在于当前的 UI 树中
+        """
+        loc_type = decision.get("locator_type")
+        loc_val = decision.get("locator_value")
+        # 如果动作不需要特定元素 (比如 answer 或某些全局操作)，直接放行
+        if (
+            not loc_type
+            or not loc_val
+            or loc_type not in ["text", "description", "resourceId", "id"]
+        ):
+            return True
+        elements = ui_dict.get("ui_elements", [])
+        for el in elements:
+            if loc_type == "text" and el.get("text") == loc_val:
+                return True
+            if loc_type == "description" and el.get("desc") == loc_val:
+                return True
+            if loc_type in ["resourceId", "id"] and el.get("id") == loc_val:
+                return True
+        return False
+    def get_action(
+        self,
+        instruction: str,
+        ui_json: str,
+        platform: str = "android",
+        screenshot_base64: str = None,
+        chat_history: list = None,
+        skip_cache: bool = False
+    ) -> dict:
+        """
+        向大模型发送指令并返回结构化动作 JSON。
+        """
+        try:
+            ui_dict = json.loads(ui_json)
+        except json.JSONDecodeError:
+            ui_dict = {}
+        # ==========================================
+        # 1. 缓存读取与物理校验阶段
+        # ==========================================
+        if not skip_cache:
+            cached_l1 = self.cache_manager.get(instruction, ui_dict, platform)
+            if cached_l1 is not None:
+                log.info("[Cache] L1 exact hit (page-level action cache)")
+                return cached_l1
+            if hasattr(self.cache_manager, "get_chat_simple"):
+                cached_l2 = self.cache_manager.get_chat_simple(instruction, platform)
+                if cached_l2 is not None:
+                    if self._verify_locator_in_ui(cached_l2, ui_dict):
+                        log.info("[Cache] L2 semantic hit (global semantic cache)")
+                        return cached_l2
+                    else:
+                        log.warning("[Cache] Semantic hit discarded — target element not present on current page")
+            log.info("[Cache Miss] No cache hit, calling LLM API...")
+        else:
+            log.info("[System] Cache bypassed, forcing LLM re-evaluation...")
+        # ==========================================
+        # 2. 处理上下文历史 (触发大模型 Prompt Caching)
+        # ==========================================
+        history_prompt = ""
+        if chat_history:
+            # 仅提取历史意图和动作，丢弃庞大的历史 UI 树以防止 Context 爆炸
+            history_str = "\n".join(
+                [
+                    f"- 历史步骤{i + 1}: {step.get('action_description')}"
+                    for i, step in enumerate(chat_history)
+                ]
+            )
+            history_prompt = (f"\n\n【前置对话上下文】(请结合上下文理解当前指令):\n{history_str}")
+        # ==========================================
+        # 3. 提示词与 Payload 组装
+        # ==========================================
+        vision_prompt = ""
+        if screenshot_base64:
+            vision_prompt = """
+                你同时收到了一张真实屏幕截图。请优先结合视觉画面判断页面结构和元素状态！如果 XML 树找不到或者混乱，以视觉为准。
+            """
+        system_prompt = f"""
+        # Role: {platform} 自动化测试策略生成专家
+        ## Profile
+        - language: 中文
+        - description: 资深自动化测试专家，专门分析UI元素树和视觉画面，将自然语言测试指令转化为可执行的自动化操作策略
+        - background: 拥有10年以上UI自动化测试经验，精通各种测试框架和元素定位技术，擅长处理动态UI和复杂交互场景
+        - personality: 严谨、细致、逻辑性强，注重测试的准确性和可重复性
+        - expertise: UI元素分析、测试策略制定、定位器选择优化、跨平台测试适配
+        - target_audience: 测试工程师、开发人员、质量保证团队
+        ## Skills
+        1. 元素分析技能
+        - UI结构解析: 能够深度解析XML/JSON格式的UI元素树，理解页面层级结构
+        - 视觉辅助判断: 结合屏幕截图验证元素状态和布局，解决元素树不准确的问题
+        - 动态元素识别: 识别并处理动态生成的CSS、resourceId等不稳定定位器
+        - 元素属性评估: 分析元素的text、description、resourceId等关键属性
+        2. 测试策略制定技能
+        - 指令解析: 准确理解用户的自然语言测试指令，转化为具体操作步骤
+        - 定位器选择: 根据优先级规则选择最稳定可靠的元素定位方式
+        - 操作映射: 将测试需求映射到具体的自动化操作类型
+        - 异常处理: 预判可能出现的测试异常并提供应对策略
+        ## Rules (核心原则)
+        1. 基本原则：
+        - 视觉优先原则: 当XML元素树与视觉画面不一致时，以视觉画面为准进行判断
+        - 稳定性优先: 选择定位器时，稳定性比简洁性更重要，避免使用动态生成的定位器
+        - 准确性保证: 确保生成的测试策略能够准确执行用户的测试意图
+        - 完整性要求: 输出必须包含所有必要的操作参数，确保测试可执行
+        2. 行为准则：
+        - 严格遵循定位器优先级: css > resourceId > text > description
+        - 动态检测机制: 自动检测并规避动态生成的css和resourceId
+        - 上下文感知: 结合页面整体结构理解元素关系和状态
+        - 验证机制: 对选择的定位器进行逻辑验证，确保唯一性和可访问性
+        ## 📋 执行协议 (Protocol)
+        {vision_prompt}
+        ### 允许的 action 类型:
+        - "click": 点击元素
+        - "long_click": 长按元素
+        - "hover": 悬停元素 (针对 Web 端，触发下拉菜单显示等交互)
+        - "input": 在输入框中输入内容 (必须在 extra_value 字段提供输入内容)
+        - "swipe": 滑动屏幕以寻找不在视口内的元素。必须在 extra_value 填入 "up", "down", "left" 或 "right"。此时 locator_type 填 "global"。
+        - "press": 模拟键盘或物理系统按键。必须在 extra_value 填入按键名 (如 "Enter", "Back", "Home")。此时 locator_type 填 "global"。
+        - "scroll_into_view": (仅 Web) 将指定元素滚动到视口内 (元素级，优于盲目 swipe)。
+        - "select": (仅 Web) 在原生 <select> 下拉框中选择选项。extra_value 填选项的可见文本或 value。
+        - "upload": (仅 Web) 给文件 <input> 设置文件。extra_value 填文件路径。
+        - "double_click": (仅 Web) 双击元素。
+        - "right_click": (仅 Web) 右键点击元素 (触发上下文菜单)。
+        - "drag": (仅 Web) 将源元素拖拽到目标。locator 定位源元素，extra_value 填目标 (css 选择器或可见文本)。
+        - "wait_for": 显式等待元素出现或消失 (替代死等)。extra_value 填 "visible"(默认) 或 "hidden"。用于等待异步加载完成。
+        - "assert_exist": 校验某个元素是否在页面上出现
+        - "assert_not_exist": 校验某个元素已消失/不存在 (如加载动画消失、弹窗关闭)
+        - "assert_text_equals": 校验某个元素的文本是否与期望值【完全相等】
+        - "assert_text_contains": 校验某个元素的文本【包含】指定子串 (动态文本首选，比完全相等更稳健)。在 extra_value 填子串。
+        - "assert_value": 校验输入框/表单字段的当前值等于期望值。在 extra_value 填期望值。
+        - "assert_url": (仅 Web) 校验当前页面 URL 包含指定子串。locator_type/value 填 "global"，extra_value 填 URL 子串 (如 "/dashboard")。
+        - "not_found": 如果在提供的 UI 树中完全找不到符合用户意图的元素，且必须通过视觉验证，请务必返回此 action！
+        ### 定位器选择铁律
+        1. 优先级顺序：css > resourceId > text > description
+        2. 【🚨 降级原则】当发现 css 或 resourceId 是动态生成的（包含随机hash、时间戳），请严格降级并优先选择 "text" 或 "description"！
+        ### 强制输出格式
+        必须输出纯 JSON 对象，不要包含任何 markdown 格式，包含顶级 key "result"，内部结构如下:
+        {{"result": {{"action": "...", "locator_type": "...", "locator_value": "...", "extra_value": "..."}}}}
+        ## Workflows
+        - 目标: 将用户的测试指令转化为可执行的自动化测试策略
+        - 步骤 1: 接收并分析UI元素树（JSON格式），同时检查是否有视觉辅助截图
+        - 步骤 2: 解析用户的自然语言测试指令，明确测试意图和期望结果
+        - 步骤 3: 结合元素树和视觉画面，确定目标元素及其状态
+        - 步骤 4: 根据定位器优先级规则，选择最稳定可靠的定位器
+        - 步骤 5: 将测试指令映射到具体的操作类型，并准备必要参数
+        - 步骤 6: 按照指定格式输出JSON格式的测试策略
+        - 预期结果: 生成一个完整、准确、可执行的自动化测试操作策略
+        ## Initialization
+        作为自动化测试策略生成专家，你必须遵守上述Rules，严格按照【执行协议】输出结果。
+        """
+        user_prompt = f"用户指令: {instruction}{history_prompt}\n当前屏幕 UI 树:\n{ui_json}"
+        user_message_content = [{"type": "text", "text": user_prompt}]
+        if screenshot_base64:
+            user_message_content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{screenshot_base64}"},
+                }
+            )
+        # ==========================================
+        # 3. 动态智能路由
+        # ==========================================
+        if screenshot_base64:
+            active_client = self.vision_client
+            active_model = config.VISION_MODEL_NAME
+            log.info(f"[AI] Using multimodal vision model: {active_model}")
+        else:
+            active_client = self.text_client
+            active_model = config.MODEL_NAME
+        start_time = time.time()
+        decision = self._call_llm(active_client, active_model, system_prompt, user_message_content)
+        llm_latency = time.time() - start_time
+        log.info(f"[AI] LLM response received in {llm_latency:.2f}s")
+        # ==========================================
+        # 4. 缓存全量回写阶段
+        # ==========================================
+        if decision:
+            # 1. 只要成功，必然写入强绑定的页面缓存 (L1)
+            self.cache_manager.set(instruction, ui_dict, decision, platform, llm_latency=llm_latency)
+            # 2. 放开 L2 写入
+            if hasattr(self.cache_manager, "set_chat_simple"):
+                self.cache_manager.set_chat_simple(instruction, decision, platform, llm_latency=llm_latency)
+        return decision
+    def _call_llm(
+        self,
+        client: OpenAI,
+        model_name: str,
+        system_prompt: str,
+        user_message_content: list,
+    ) -> dict:
+        """
+        封装底层的 LLM 网络调用
+        """
+        try:
+            with ai_status(f"Thinking ({model_name})..."):
+                response = client.chat.completions.create(
+                    model=model_name,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_message_content},
+                    ],
+                    temperature=0.1,
+                )
+            result_text = response.choices[0].message.content.strip()
+            if "```json" in result_text:
+                result_text = result_text.split("```json")[1].split("```")[0].strip()
+            elif "```" in result_text:
+                result_text = result_text.replace("```", "").strip()
+            parsed_json = json.loads(result_text)
+            return parsed_json.get("result", {})
+        except Exception as e:
+            log.error(f"[Error] Model ({model_name}) request or parse failed: {e}")
+            return {}