visionclaw 0.1.187-beta.8 → 0.1.187-dev.refactor-computer-use-direct-coordinates.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/loop.js +1 -1
- package/dist/agent/loop.js.map +1 -1
- package/dist/agent/providers/client-factory.d.ts +1 -1
- package/dist/agent/providers/client-factory.js +1 -1
- package/dist/agent/runtime-surface.d.ts +1 -1
- package/dist/agent/runtime-surface.d.ts.map +1 -1
- package/dist/agent/runtime-surface.js +35 -18
- package/dist/agent/runtime-surface.js.map +1 -1
- package/dist/agent/system-prompt.d.ts.map +1 -1
- package/dist/agent/system-prompt.js +1 -3
- package/dist/agent/system-prompt.js.map +1 -1
- package/dist/builtin-skills/macos-automation/SKILL.md +13 -10
- package/dist/onboarding/generate-wallpaper.d.ts +3 -8
- package/dist/onboarding/generate-wallpaper.d.ts.map +1 -1
- package/dist/onboarding/generate-wallpaper.js +3 -123
- package/dist/onboarding/generate-wallpaper.js.map +1 -1
- package/dist/tools/computer-use.d.ts +56 -6
- package/dist/tools/computer-use.d.ts.map +1 -1
- package/dist/tools/computer-use.js +129 -286
- package/dist/tools/computer-use.js.map +1 -1
- package/dist-agent/bundle.cjs +208 -574
- package/package.json +1 -1
- package/dist/agent/applied-credential-signature.d.ts +0 -53
- package/dist/agent/applied-credential-signature.d.ts.map +0 -1
- package/dist/agent/applied-credential-signature.js +0 -137
- package/dist/agent/applied-credential-signature.js.map +0 -1
- package/dist/agent/tunnel-credential-handler.d.ts +0 -90
- package/dist/agent/tunnel-credential-handler.d.ts.map +0 -1
- package/dist/agent/tunnel-credential-handler.js +0 -162
- package/dist/agent/tunnel-credential-handler.js.map +0 -1
- package/dist/billing/payg-handler.d.ts +0 -29
- package/dist/billing/payg-handler.d.ts.map +0 -1
- package/dist/billing/payg-handler.js +0 -92
- package/dist/billing/payg-handler.js.map +0 -1
- package/dist/billing/payment-handler.d.ts +0 -24
- package/dist/billing/payment-handler.d.ts.map +0 -1
- package/dist/billing/payment-handler.js +0 -101
- package/dist/billing/payment-handler.js.map +0 -1
- package/dist/builtin-skills/catalog/phone-adb-automation/SKILL.md +0 -412
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_input.sh +0 -132
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_launch.sh +0 -166
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_screenshot.sh +0 -87
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_security_kbd.py +0 -174
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_setup.sh +0 -274
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_swipe.sh +0 -111
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_tap.sh +0 -87
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_ui_parse.py +0 -176
- package/dist/builtin-skills/catalog/phone-adb-automation/phone_wake_unlock.sh +0 -67
- package/dist/builtin-skills/transcribe-audio/SKILL.md +0 -122
- package/dist/data-processing/convert-demo-cli.d.ts +0 -7
- package/dist/data-processing/convert-demo-cli.d.ts.map +0 -1
- package/dist/data-processing/convert-demo-cli.js +0 -30
- package/dist/data-processing/convert-demo-cli.js.map +0 -1
- package/dist/data-processing/convert-demo.d.ts +0 -26
- package/dist/data-processing/convert-demo.d.ts.map +0 -1
- package/dist/data-processing/convert-demo.js +0 -233
- package/dist/data-processing/convert-demo.js.map +0 -1
- package/dist/obs/rdp/icons/icons/app_windows.svg +0 -4
- package/dist/obs/rdp/icons/icons/clip_get.svg +0 -4
- package/dist/obs/rdp/icons/icons/clip_send.svg +0 -4
- package/dist/obs/rdp/icons/icons/clip_shared.svg +0 -4
- package/dist/obs/rdp/icons/icons/clipboard.svg +0 -4
- package/dist/obs/rdp/icons/icons/clipboard_shared.svg +0 -4
- package/dist/obs/rdp/icons/icons/control.svg +0 -4
- package/dist/obs/rdp/icons/icons/desktop.svg +0 -4
- package/dist/obs/rdp/icons/icons/display.svg +0 -4
- package/dist/obs/rdp/icons/icons/launchpad.svg +0 -4
- package/dist/obs/rdp/icons/icons/mission_control.svg +0 -4
- package/dist/obs/rdp/icons/icons/screenshot.svg +0 -4
- package/dist/obs/rdp/icons/icons/zoom_actual.svg +0 -4
- package/dist/obs/rdp/icons/icons/zoom_fit.svg +0 -4
- package/dist/obs/rdp/icons/icons/zoom_in.svg +0 -4
- package/dist/obs/rdp/icons/icons/zoom_out.svg +0 -4
- package/dist/obs/tunnel-telemetry.d.ts +0 -46
- package/dist/obs/tunnel-telemetry.d.ts.map +0 -1
- package/dist/obs/tunnel-telemetry.js +0 -70
- package/dist/obs/tunnel-telemetry.js.map +0 -1
- package/dist/service/gbox-tun.d.ts +0 -14
- package/dist/service/gbox-tun.d.ts.map +0 -1
- package/dist/service/gbox-tun.js +0 -315
- package/dist/service/gbox-tun.js.map +0 -1
- package/dist/tools/coordinate-resolver.d.ts +0 -30
- package/dist/tools/coordinate-resolver.d.ts.map +0 -1
- package/dist/tools/coordinate-resolver.js +0 -104
- package/dist/tools/coordinate-resolver.js.map +0 -1
- package/dist/utils/wechat-monitor.d.ts +0 -21
- package/dist/utils/wechat-monitor.d.ts.map +0 -1
- package/dist/utils/wechat-monitor.js +0 -88
- package/dist/utils/wechat-monitor.js.map +0 -1
|
@@ -1,111 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# phone_swipe.sh - Swipe on Android phone with presets and coordinate scaling
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bash phone_swipe.sh <preset|x1 y1 x2 y2> [options]
|
|
6
|
-
#
|
|
7
|
-
# Presets (auto-calculated from device resolution):
|
|
8
|
-
# up - Scroll up (swipe from bottom to top, reveals content below)
|
|
9
|
-
# down - Scroll down (swipe from top to bottom, reveals content above)
|
|
10
|
-
# left - Swipe left (next page)
|
|
11
|
-
# right - Swipe right (previous page)
|
|
12
|
-
#
|
|
13
|
-
# Options:
|
|
14
|
-
# -d DEVICE_ID Target specific device
|
|
15
|
-
# -S SCALE Input coords from scaled screenshot (for custom x1 y1 x2 y2)
|
|
16
|
-
# -t DURATION Swipe duration in ms (default: 300)
|
|
17
|
-
# -q Quiet mode
|
|
18
|
-
#
|
|
19
|
-
# Examples:
|
|
20
|
-
# bash phone_swipe.sh up # Scroll up
|
|
21
|
-
# bash phone_swipe.sh down # Scroll down
|
|
22
|
-
# bash phone_swipe.sh left # Swipe left
|
|
23
|
-
# bash phone_swipe.sh 100 2000 100 800 # Custom swipe (native coords)
|
|
24
|
-
# bash phone_swipe.sh 50 1000 50 400 -S 50 # Custom swipe (scaled coords)
|
|
25
|
-
# bash phone_swipe.sh up -t 500 # Slow scroll up
|
|
26
|
-
|
|
27
|
-
set -euo pipefail
|
|
28
|
-
|
|
29
|
-
DEVICE=""
|
|
30
|
-
SCALE=""
|
|
31
|
-
DURATION=300
|
|
32
|
-
QUIET=false
|
|
33
|
-
|
|
34
|
-
# Collect all args, separate flags from positional
|
|
35
|
-
POSITIONAL=()
|
|
36
|
-
ARGV=("$@")
|
|
37
|
-
i=0
|
|
38
|
-
while [ $i -lt ${#ARGV[@]} ]; do
|
|
39
|
-
case "${ARGV[$i]}" in
|
|
40
|
-
-d) DEVICE="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
|
|
41
|
-
-S) SCALE="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
|
|
42
|
-
-t) DURATION="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
|
|
43
|
-
-q) QUIET=true; i=$((i+1)) ;;
|
|
44
|
-
-*) echo "Unknown option: ${ARGV[$i]}" >&2; exit 1 ;;
|
|
45
|
-
*) POSITIONAL+=("${ARGV[$i]}"); i=$((i+1)) ;;
|
|
46
|
-
esac
|
|
47
|
-
done
|
|
48
|
-
|
|
49
|
-
ADB_CMD="adb"
|
|
50
|
-
if [ -n "$DEVICE" ]; then
|
|
51
|
-
ADB_CMD="adb -s $DEVICE"
|
|
52
|
-
fi
|
|
53
|
-
|
|
54
|
-
info() { $QUIET || echo "[phone_swipe] $*" >&2; }
|
|
55
|
-
|
|
56
|
-
if [ ${#POSITIONAL[@]} -eq 0 ]; then
|
|
57
|
-
echo "Usage: $0 <up|down|left|right|x1 y1 x2 y2> [-d device] [-S scale%] [-t ms] [-q]" >&2
|
|
58
|
-
exit 1
|
|
59
|
-
fi
|
|
60
|
-
|
|
61
|
-
# Get device resolution for presets
|
|
62
|
-
get_resolution() {
|
|
63
|
-
local size
|
|
64
|
-
size=$($ADB_CMD shell wm size | grep "Physical" | awk '{print $3}')
|
|
65
|
-
echo "$size"
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
PRESET="${POSITIONAL[0]}"
|
|
69
|
-
|
|
70
|
-
case "$PRESET" in
|
|
71
|
-
up|down|left|right)
|
|
72
|
-
SIZE=$(get_resolution)
|
|
73
|
-
W=$(echo "$SIZE" | cut -d'x' -f1)
|
|
74
|
-
H=$(echo "$SIZE" | cut -d'x' -f2)
|
|
75
|
-
CX=$((W / 2))
|
|
76
|
-
CY=$((H / 2))
|
|
77
|
-
|
|
78
|
-
case "$PRESET" in
|
|
79
|
-
up) X1=$CX; Y1=$((H * 3 / 4)); X2=$CX; Y2=$((H / 4)) ;;
|
|
80
|
-
down) X1=$CX; Y1=$((H / 4)); X2=$CX; Y2=$((H * 3 / 4)) ;;
|
|
81
|
-
left) X1=$((W * 4 / 5)); Y1=$CY; X2=$((W / 5)); Y2=$CY ;;
|
|
82
|
-
right) X1=$((W / 5)); Y1=$CY; X2=$((W * 4 / 5)); Y2=$CY ;;
|
|
83
|
-
esac
|
|
84
|
-
|
|
85
|
-
info "Preset '$PRESET' on ${W}x${H}: ($X1,$Y1) -> ($X2,$Y2)"
|
|
86
|
-
;;
|
|
87
|
-
*)
|
|
88
|
-
if [ ${#POSITIONAL[@]} -lt 4 ]; then
|
|
89
|
-
echo "ERROR: Custom swipe needs 4 coordinates: x1 y1 x2 y2" >&2
|
|
90
|
-
exit 1
|
|
91
|
-
fi
|
|
92
|
-
X1="${POSITIONAL[0]}"
|
|
93
|
-
Y1="${POSITIONAL[1]}"
|
|
94
|
-
X2="${POSITIONAL[2]}"
|
|
95
|
-
Y2="${POSITIONAL[3]}"
|
|
96
|
-
|
|
97
|
-
if [ -n "$SCALE" ] && [ "$SCALE" -gt 0 ]; then
|
|
98
|
-
X1=$((X1 * 100 / SCALE))
|
|
99
|
-
Y1=$((Y1 * 100 / SCALE))
|
|
100
|
-
X2=$((X2 * 100 / SCALE))
|
|
101
|
-
Y2=$((Y2 * 100 / SCALE))
|
|
102
|
-
info "Scaled coords at ${SCALE}% -> Native: ($X1,$Y1) -> ($X2,$Y2)"
|
|
103
|
-
else
|
|
104
|
-
info "Swipe: ($X1,$Y1) -> ($X2,$Y2)"
|
|
105
|
-
fi
|
|
106
|
-
;;
|
|
107
|
-
esac
|
|
108
|
-
|
|
109
|
-
$ADB_CMD shell input swipe "$X1" "$Y1" "$X2" "$Y2" "$DURATION"
|
|
110
|
-
info "Done."
|
|
111
|
-
echo "swiped $X1 $Y1 $X2 $Y2"
|
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# phone_tap.sh - Tap on Android phone with optional coordinate scaling
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bash phone_tap.sh <x> <y> [options]
|
|
6
|
-
#
|
|
7
|
-
# The x,y coordinates are in the coordinate space you specify:
|
|
8
|
-
# - By default, treated as native device coordinates (no scaling)
|
|
9
|
-
# - With -S, treated as scaled coordinates and auto-converted to native
|
|
10
|
-
#
|
|
11
|
-
# Options:
|
|
12
|
-
# -d DEVICE_ID Target specific device
|
|
13
|
-
# -S SCALE Input coords are from a screenshot scaled to this % (e.g. 50)
|
|
14
|
-
# Will multiply by 100/SCALE to get native coords
|
|
15
|
-
# -l Long press (500ms hold)
|
|
16
|
-
# -t DURATION Hold duration in ms (default: tap=50, long=500)
|
|
17
|
-
# -q Quiet mode
|
|
18
|
-
#
|
|
19
|
-
# Examples:
|
|
20
|
-
# bash phone_tap.sh 610 1328 # Tap at native coords
|
|
21
|
-
# bash phone_tap.sh 305 664 -S 50 # Tap at 50%-scaled coords -> auto-converts to 610,1328
|
|
22
|
-
# bash phone_tap.sh 305 664 -S 50 -l # Long press at scaled coords
|
|
23
|
-
|
|
24
|
-
set -euo pipefail
|
|
25
|
-
|
|
26
|
-
DEVICE=""
|
|
27
|
-
SCALE=""
|
|
28
|
-
LONG=false
|
|
29
|
-
DURATION=""
|
|
30
|
-
QUIET=false
|
|
31
|
-
|
|
32
|
-
# Parse options after positional args
|
|
33
|
-
ARGS=()
|
|
34
|
-
while [ $# -gt 0 ]; do
|
|
35
|
-
case "$1" in
|
|
36
|
-
-d) DEVICE="$2"; shift 2 ;;
|
|
37
|
-
-S) SCALE="$2"; shift 2 ;;
|
|
38
|
-
-l) LONG=true; shift ;;
|
|
39
|
-
-t) DURATION="$2"; shift 2 ;;
|
|
40
|
-
-q) QUIET=true; shift ;;
|
|
41
|
-
-*) echo "Unknown option: $1" >&2; exit 1 ;;
|
|
42
|
-
*) ARGS+=("$1"); shift ;;
|
|
43
|
-
esac
|
|
44
|
-
done
|
|
45
|
-
|
|
46
|
-
if [ ${#ARGS[@]} -lt 2 ]; then
|
|
47
|
-
echo "Usage: $0 <x> <y> [-d device] [-S scale%] [-l] [-t duration_ms] [-q]" >&2
|
|
48
|
-
exit 1
|
|
49
|
-
fi
|
|
50
|
-
|
|
51
|
-
X="${ARGS[0]}"
|
|
52
|
-
Y="${ARGS[1]}"
|
|
53
|
-
|
|
54
|
-
ADB_CMD="adb"
|
|
55
|
-
if [ -n "$DEVICE" ]; then
|
|
56
|
-
ADB_CMD="adb -s $DEVICE"
|
|
57
|
-
fi
|
|
58
|
-
|
|
59
|
-
info() { $QUIET || echo "[phone_tap] $*" >&2; }
|
|
60
|
-
|
|
61
|
-
# Scale coordinates if needed
|
|
62
|
-
if [ -n "$SCALE" ] && [ "$SCALE" -gt 0 ]; then
|
|
63
|
-
NATIVE_X=$((X * 100 / SCALE))
|
|
64
|
-
NATIVE_Y=$((Y * 100 / SCALE))
|
|
65
|
-
info "Scaled ($X, $Y) at ${SCALE}% -> Native ($NATIVE_X, $NATIVE_Y)"
|
|
66
|
-
X=$NATIVE_X
|
|
67
|
-
Y=$NATIVE_Y
|
|
68
|
-
fi
|
|
69
|
-
|
|
70
|
-
# Determine duration
|
|
71
|
-
if [ -n "$DURATION" ]; then
|
|
72
|
-
DUR=$DURATION
|
|
73
|
-
elif $LONG; then
|
|
74
|
-
DUR=500
|
|
75
|
-
else
|
|
76
|
-
DUR=50
|
|
77
|
-
fi
|
|
78
|
-
|
|
79
|
-
if $LONG || [ "$DUR" -gt 100 ]; then
|
|
80
|
-
info "Long press at ($X, $Y) for ${DUR}ms"
|
|
81
|
-
$ADB_CMD shell input swipe "$X" "$Y" "$X" "$Y" "$DUR"
|
|
82
|
-
else
|
|
83
|
-
info "Tap at ($X, $Y)"
|
|
84
|
-
$ADB_CMD shell input tap "$X" "$Y"
|
|
85
|
-
fi
|
|
86
|
-
|
|
87
|
-
echo "tapped $X $Y"
|
|
@@ -1,176 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
phone_ui_parse.py - Dump and parse Android UI hierarchy via ADB uiautomator.
|
|
4
|
-
|
|
5
|
-
Usage:
|
|
6
|
-
python3 phone_ui_parse.py [options]
|
|
7
|
-
|
|
8
|
-
Options:
|
|
9
|
-
-d DEVICE_ID Target specific device
|
|
10
|
-
-f XML_FILE Parse existing XML file instead of dumping from device
|
|
11
|
-
-c Show only clickable elements
|
|
12
|
-
-s SEARCH Filter by text/description containing SEARCH (case-insensitive)
|
|
13
|
-
-j Output as JSON
|
|
14
|
-
--bounds-only Show only elements with bounds
|
|
15
|
-
-q Quiet (no info messages)
|
|
16
|
-
|
|
17
|
-
Output:
|
|
18
|
-
Each line: "text" desc="description" center=(x,y) bounds=[x1,y1,x2,y2] clickable=true/false class=ClassName
|
|
19
|
-
|
|
20
|
-
Examples:
|
|
21
|
-
python3 phone_ui_parse.py # Dump & parse all elements
|
|
22
|
-
python3 phone_ui_parse.py -c # Only clickable elements
|
|
23
|
-
python3 phone_ui_parse.py -s "send" # Elements matching "send"
|
|
24
|
-
python3 phone_ui_parse.py -c -s "confirm" -j # Clickable "confirm" as JSON
|
|
25
|
-
python3 phone_ui_parse.py -f /tmp/ui.xml # Parse local XML file
|
|
26
|
-
"""
|
|
27
|
-
|
|
28
|
-
import argparse
|
|
29
|
-
import json
|
|
30
|
-
import os
|
|
31
|
-
import re
|
|
32
|
-
import subprocess
|
|
33
|
-
import sys
|
|
34
|
-
import tempfile
|
|
35
|
-
import xml.etree.ElementTree as ET
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def run_adb(device, *args):
|
|
39
|
-
cmd = ["adb"]
|
|
40
|
-
if device:
|
|
41
|
-
cmd += ["-s", device]
|
|
42
|
-
cmd += list(args)
|
|
43
|
-
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
44
|
-
return result.stdout, result.stderr, result.returncode
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def dump_ui(device=None):
|
|
48
|
-
"""Dump UI hierarchy from device and return local XML path."""
|
|
49
|
-
remote_path = "/sdcard/ui_dump.xml"
|
|
50
|
-
local_path = os.path.join(tempfile.gettempdir(), f"phone_ui_{os.getpid()}.xml")
|
|
51
|
-
|
|
52
|
-
_, stderr, rc = run_adb(device, "shell", "uiautomator", "dump", remote_path)
|
|
53
|
-
if rc != 0:
|
|
54
|
-
print(f"ERROR: uiautomator dump failed: {stderr}", file=sys.stderr)
|
|
55
|
-
sys.exit(1)
|
|
56
|
-
|
|
57
|
-
_, stderr, rc = run_adb(device, "pull", remote_path, local_path)
|
|
58
|
-
if rc != 0:
|
|
59
|
-
print(f"ERROR: Failed to pull UI dump: {stderr}", file=sys.stderr)
|
|
60
|
-
sys.exit(1)
|
|
61
|
-
|
|
62
|
-
run_adb(device, "shell", "rm", "-f", remote_path)
|
|
63
|
-
return local_path
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def parse_ui_xml(xml_path, clickable_only=False, search=None):
|
|
67
|
-
"""Parse UI XML and return list of element dicts."""
|
|
68
|
-
tree = ET.parse(xml_path)
|
|
69
|
-
root = tree.getroot()
|
|
70
|
-
elements = []
|
|
71
|
-
|
|
72
|
-
for node in root.iter():
|
|
73
|
-
text = node.get("text", "")
|
|
74
|
-
desc = node.get("content-desc", "")
|
|
75
|
-
bounds = node.get("bounds", "")
|
|
76
|
-
clickable = node.get("clickable", "false")
|
|
77
|
-
class_name = node.get("class", "")
|
|
78
|
-
resource_id = node.get("resource-id", "")
|
|
79
|
-
enabled = node.get("enabled", "true")
|
|
80
|
-
focused = node.get("focused", "false")
|
|
81
|
-
selected = node.get("selected", "false")
|
|
82
|
-
checked = node.get("checked", "false")
|
|
83
|
-
|
|
84
|
-
if not bounds:
|
|
85
|
-
continue
|
|
86
|
-
|
|
87
|
-
if clickable_only and clickable != "true":
|
|
88
|
-
continue
|
|
89
|
-
|
|
90
|
-
if search:
|
|
91
|
-
combined = f"{text} {desc} {resource_id}".lower()
|
|
92
|
-
if search.lower() not in combined:
|
|
93
|
-
continue
|
|
94
|
-
|
|
95
|
-
# Parse bounds: [x1,y1][x2,y2]
|
|
96
|
-
m = re.findall(r"\d+", bounds)
|
|
97
|
-
if len(m) != 4:
|
|
98
|
-
continue
|
|
99
|
-
|
|
100
|
-
x1, y1, x2, y2 = int(m[0]), int(m[1]), int(m[2]), int(m[3])
|
|
101
|
-
cx = (x1 + x2) // 2
|
|
102
|
-
cy = (y1 + y2) // 2
|
|
103
|
-
|
|
104
|
-
elements.append({
|
|
105
|
-
"text": text,
|
|
106
|
-
"content_desc": desc,
|
|
107
|
-
"resource_id": resource_id,
|
|
108
|
-
"class": class_name,
|
|
109
|
-
"bounds": [x1, y1, x2, y2],
|
|
110
|
-
"center": [cx, cy],
|
|
111
|
-
"clickable": clickable == "true",
|
|
112
|
-
"enabled": enabled == "true",
|
|
113
|
-
"focused": focused == "true",
|
|
114
|
-
"selected": selected == "true",
|
|
115
|
-
"checked": checked == "true",
|
|
116
|
-
})
|
|
117
|
-
|
|
118
|
-
return elements
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def main():
|
|
122
|
-
parser = argparse.ArgumentParser(description="Parse Android UI hierarchy")
|
|
123
|
-
parser.add_argument("-d", "--device", help="Target device ID")
|
|
124
|
-
parser.add_argument("-f", "--file", help="Parse existing XML file")
|
|
125
|
-
parser.add_argument("-c", "--clickable", action="store_true", help="Clickable only")
|
|
126
|
-
parser.add_argument("-s", "--search", help="Filter by text/desc")
|
|
127
|
-
parser.add_argument("-j", "--json", action="store_true", help="JSON output")
|
|
128
|
-
parser.add_argument("--bounds-only", action="store_true", help="Elements with bounds only")
|
|
129
|
-
parser.add_argument("-q", "--quiet", action="store_true", help="Quiet mode")
|
|
130
|
-
args = parser.parse_args()
|
|
131
|
-
|
|
132
|
-
if args.file:
|
|
133
|
-
xml_path = args.file
|
|
134
|
-
cleanup = False
|
|
135
|
-
else:
|
|
136
|
-
if not args.quiet:
|
|
137
|
-
print("[phone_ui_parse] Dumping UI hierarchy...", file=sys.stderr)
|
|
138
|
-
xml_path = dump_ui(args.device)
|
|
139
|
-
cleanup = True
|
|
140
|
-
|
|
141
|
-
elements = parse_ui_xml(xml_path, args.clickable, args.search)
|
|
142
|
-
|
|
143
|
-
if cleanup:
|
|
144
|
-
os.unlink(xml_path)
|
|
145
|
-
|
|
146
|
-
if not args.quiet:
|
|
147
|
-
print(f"[phone_ui_parse] Found {len(elements)} elements", file=sys.stderr)
|
|
148
|
-
|
|
149
|
-
if args.json:
|
|
150
|
-
print(json.dumps(elements, ensure_ascii=False, indent=2))
|
|
151
|
-
else:
|
|
152
|
-
for el in elements:
|
|
153
|
-
text = el["text"]
|
|
154
|
-
desc = el["content_desc"]
|
|
155
|
-
cx, cy = el["center"]
|
|
156
|
-
click = el["clickable"]
|
|
157
|
-
cls = el["class"].split(".")[-1] if el["class"] else ""
|
|
158
|
-
rid = el["resource_id"].split("/")[-1] if el["resource_id"] else ""
|
|
159
|
-
|
|
160
|
-
parts = []
|
|
161
|
-
if text:
|
|
162
|
-
parts.append(f'"{text}"')
|
|
163
|
-
if desc:
|
|
164
|
-
parts.append(f'desc="{desc}"')
|
|
165
|
-
if rid:
|
|
166
|
-
parts.append(f"id={rid}")
|
|
167
|
-
parts.append(f"center=({cx},{cy})")
|
|
168
|
-
parts.append(f"clickable={click}")
|
|
169
|
-
if cls:
|
|
170
|
-
parts.append(f"class={cls}")
|
|
171
|
-
|
|
172
|
-
print(" ".join(parts))
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
if __name__ == "__main__":
|
|
176
|
-
main()
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# phone_wake_unlock.sh - Wake up and unlock Android phone screen
|
|
3
|
-
#
|
|
4
|
-
# Usage:
|
|
5
|
-
# bash phone_wake_unlock.sh [options]
|
|
6
|
-
#
|
|
7
|
-
# Options:
|
|
8
|
-
# -d DEVICE_ID Target specific device
|
|
9
|
-
# -q Quiet mode
|
|
10
|
-
#
|
|
11
|
-
# The script checks if the screen is on, wakes it if needed,
|
|
12
|
-
# and performs a swipe-up gesture to unlock (works for swipe-to-unlock screens).
|
|
13
|
-
# For PIN/pattern locks, additional input is needed after this script.
|
|
14
|
-
|
|
15
|
-
set -euo pipefail
|
|
16
|
-
|
|
17
|
-
DEVICE=""
|
|
18
|
-
QUIET=false
|
|
19
|
-
|
|
20
|
-
while getopts "d:q" opt; do
|
|
21
|
-
case $opt in
|
|
22
|
-
d) DEVICE="$OPTARG" ;;
|
|
23
|
-
q) QUIET=true ;;
|
|
24
|
-
*) echo "Usage: $0 [-d device] [-q]" >&2; exit 1 ;;
|
|
25
|
-
esac
|
|
26
|
-
done
|
|
27
|
-
|
|
28
|
-
ADB_CMD="adb"
|
|
29
|
-
if [ -n "$DEVICE" ]; then
|
|
30
|
-
ADB_CMD="adb -s $DEVICE"
|
|
31
|
-
fi
|
|
32
|
-
|
|
33
|
-
info() { $QUIET || echo "[phone_wake] $*" >&2; }
|
|
34
|
-
|
|
35
|
-
# Verify device connection
|
|
36
|
-
if ! $ADB_CMD get-state >/dev/null 2>&1; then
|
|
37
|
-
echo "ERROR: No device connected or device unauthorized" >&2
|
|
38
|
-
exit 1
|
|
39
|
-
fi
|
|
40
|
-
|
|
41
|
-
# Check screen state
|
|
42
|
-
WAKE_STATE=$($ADB_CMD shell dumpsys power | grep "mWakefulness=" | head -1 | sed 's/.*mWakefulness=//')
|
|
43
|
-
|
|
44
|
-
if [ "$WAKE_STATE" = "Awake" ]; then
|
|
45
|
-
info "Screen is already awake"
|
|
46
|
-
else
|
|
47
|
-
info "Screen is asleep, waking up..."
|
|
48
|
-
$ADB_CMD shell input keyevent KEYCODE_WAKEUP
|
|
49
|
-
sleep 0.5
|
|
50
|
-
fi
|
|
51
|
-
|
|
52
|
-
# Get screen resolution for swipe coordinates
|
|
53
|
-
SIZE=$($ADB_CMD shell wm size | grep "Physical" | awk '{print $3}')
|
|
54
|
-
WIDTH=$(echo "$SIZE" | cut -d'x' -f1)
|
|
55
|
-
HEIGHT=$(echo "$SIZE" | cut -d'x' -f2)
|
|
56
|
-
|
|
57
|
-
# Swipe up from bottom quarter to middle to unlock
|
|
58
|
-
START_X=$((WIDTH / 2))
|
|
59
|
-
START_Y=$((HEIGHT * 3 / 4))
|
|
60
|
-
END_Y=$((HEIGHT / 3))
|
|
61
|
-
|
|
62
|
-
info "Swiping to unlock (${START_X}, ${START_Y}) -> (${START_X}, ${END_Y})..."
|
|
63
|
-
$ADB_CMD shell input swipe "$START_X" "$START_Y" "$START_X" "$END_Y" 300
|
|
64
|
-
sleep 0.5
|
|
65
|
-
|
|
66
|
-
info "Done. Phone should be unlocked (if no PIN/pattern required)."
|
|
67
|
-
echo "unlocked"
|
|
@@ -1,122 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
description: Use this skill to transcribe audio files (voice messages, recordings) to text, extract audio from Bilibili videos, or handle Telegram voice messages. Supports OGG, MP3, WAV, M4A, and other common audio formats. Uses whisper.cpp with auto language detection -- supports Chinese, English, and all other languages.
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
# Transcribe Audio & Media Extraction
|
|
6
|
-
|
|
7
|
-
Transcribe audio files to text using whisper.cpp (local, free, private, native macOS binary).
|
|
8
|
-
|
|
9
|
-
## Prerequisites
|
|
10
|
-
|
|
11
|
-
whisper-cpp, ffmpeg, and the whisper model are checked during `visionclaw setup`. If they're missing, run setup again or install manually:
|
|
12
|
-
|
|
13
|
-
```bash
|
|
14
|
-
brew install whisper-cpp ffmpeg
|
|
15
|
-
mkdir -p ~/.local/share/whisper-cpp && curl -L -o ~/.local/share/whisper-cpp/ggml-large-v3-turbo.bin 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin?download=true'
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
Binary is `whisper-cli` (not `whisper`). If not on PATH: `/opt/homebrew/bin/whisper-cli`
|
|
19
|
-
|
|
20
|
-
## Steps
|
|
21
|
-
|
|
22
|
-
1. **Download the audio file** if it's a URL:
|
|
23
|
-
```bash
|
|
24
|
-
curl -s -o /tmp/audio_input.ogg "THE_AUDIO_URL"
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
2. **Convert to 16kHz WAV** (required by whisper.cpp):
|
|
28
|
-
```bash
|
|
29
|
-
ffmpeg -y -i /tmp/audio_input.ogg -ar 16000 -ac 1 /tmp/audio_input.wav 2>/dev/null
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
3. **Transcribe**:
|
|
33
|
-
```bash
|
|
34
|
-
whisper-cli -m ~/.local/share/whisper-cpp/ggml-large-v3-turbo.bin -f /tmp/audio_input.wav --no-timestamps -l auto 2>/dev/null
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
4. **Use the transcription** as needed -- respond to the user, take action, etc.
|
|
38
|
-
|
|
39
|
-
> **CRITICAL**: Always use `-l auto` for language detection. The default is `-l en` which BREAKS Chinese transcription.
|
|
40
|
-
|
|
41
|
-
## Notes
|
|
42
|
-
|
|
43
|
-
- Uses `large-v3-turbo` (~1.5GB) -- near-large accuracy at half the size of full large-v3.
|
|
44
|
-
- whisper.cpp is optimized for Apple Silicon -- runs efficiently on any M1/M2/M3/M4 MacBook.
|
|
45
|
-
- Supports: OGG, MP3, WAV, M4A, FLAC, WEBM, and most common audio formats (via ffmpeg conversion).
|
|
46
|
-
- Runs entirely locally -- no API calls, no costs, fully private.
|
|
47
|
-
- If whisper-cli is not found, verify with `which whisper-cli`. The Homebrew package is `whisper-cpp` but the binary is `whisper-cli`.
|
|
48
|
-
|
|
49
|
-
## Telegram Voice Messages
|
|
50
|
-
|
|
51
|
-
Telegram voice messages arrive as OGG files:
|
|
52
|
-
|
|
53
|
-
```bash
|
|
54
|
-
curl -s -o /tmp/voice.ogg "{file_url}"
|
|
55
|
-
ffmpeg -y -i /tmp/voice.ogg -ar 16000 -ac 1 /tmp/voice.wav 2>/dev/null
|
|
56
|
-
whisper-cli -m ~/.local/share/whisper-cpp/ggml-large-v3-turbo.bin -f /tmp/voice.wav --no-timestamps -l auto 2>/dev/null
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
## Bilibili Video Audio Extraction
|
|
60
|
-
|
|
61
|
-
### Method: Browser Network Interception
|
|
62
|
-
|
|
63
|
-
When the standard yt-dlp approach fails (Bilibili returns HTTP 412), use Playwright to intercept the .m4s audio stream URLs directly from the browser's network requests.
|
|
64
|
-
|
|
65
|
-
```python
|
|
66
|
-
# 1. Navigate to Bilibili video in Playwright browser
|
|
67
|
-
await page.goto(f'https://www.bilibili.com/video/{bv_id}')
|
|
68
|
-
|
|
69
|
-
# 2. Wait for video to load, then check network requests
|
|
70
|
-
# The audio stream URL contains "30216" in the filename (30216 = audio codec ID)
|
|
71
|
-
# Example: [CID]-1-30216.m4s
|
|
72
|
-
|
|
73
|
-
# 3. Download the audio .m4s with Referer header
|
|
74
|
-
curl -s -L \
|
|
75
|
-
-H "Referer: https://www.bilibili.com/video/[BV_ID]" \
|
|
76
|
-
-H "User-Agent: Mozilla/5.0 ..." \
|
|
77
|
-
-o /tmp/audio.m4a \
|
|
78
|
-
"{audio_url}"
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
### Finding Bilibili Video for a WeChat 视频号 Video
|
|
82
|
-
|
|
83
|
-
1. Note the creator name (shown in WeChat video player) and video title keywords
|
|
84
|
-
2. Search Bilibili: `https://search.bilibili.com/all?keyword={creator}+{keywords}`
|
|
85
|
-
3. Extract BV IDs from search results (look for `BV[A-Za-z0-9]+` patterns)
|
|
86
|
-
4. Get video metadata from initial page state:
|
|
87
|
-
```javascript
|
|
88
|
-
window.__INITIAL_STATE__.videoData.cid // CID for subtitle API
|
|
89
|
-
```
|
|
90
|
-
5. Check for subtitles via API: `GET https://api.bilibili.com/x/player/v2?bvid={bvid}&cid={cid}`
|
|
91
|
-
- If `subtitle.subtitles` is empty → use audio extraction + Whisper
|
|
92
|
-
|
|
93
|
-
### M4S File ID Numbers (Bilibili)
|
|
94
|
-
- `30032` — Video stream
|
|
95
|
-
- `30216` — Audio stream (64kbps AAC)
|
|
96
|
-
- `30280` — Higher quality audio
|
|
97
|
-
|
|
98
|
-
### yt-dlp (when it works)
|
|
99
|
-
```bash
|
|
100
|
-
yt-dlp --cookies-from-browser chrome \
|
|
101
|
-
--write-auto-subs --sub-lang zh-CN \
|
|
102
|
-
--skip-download -o /tmp/subtitle \
|
|
103
|
-
https://www.bilibili.com/video/{BV_ID}
|
|
104
|
-
```
|
|
105
|
-
Often fails with HTTP 412 on Bilibili even with cookies. Prefer the network interception method.
|
|
106
|
-
|
|
107
|
-
## Generating Voice Replies (TTS)
|
|
108
|
-
|
|
109
|
-
To send a voice message back to the user, use macOS text-to-speech:
|
|
110
|
-
|
|
111
|
-
```bash
|
|
112
|
-
# For Chinese text, use the Tingting voice:
|
|
113
|
-
say -v "Tingting (Chinese (China mainland))" -o /tmp/reply.aiff "你好!这是语音回复"
|
|
114
|
-
# For English text:
|
|
115
|
-
say -o /tmp/reply.aiff "Your reply text here"
|
|
116
|
-
# Convert to OGG:
|
|
117
|
-
ffmpeg -y -i /tmp/reply.aiff /tmp/reply_voice.ogg 2>/dev/null
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
Then send via notify_user with `file_path: "/tmp/reply_voice.ogg"`.
|
|
121
|
-
|
|
122
|
-
Most users prefer TEXT messages. Use voice only when specifically requested.
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"convert-demo-cli.d.ts","sourceRoot":"","sources":["../../src/data-processing/convert-demo-cli.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CA0BlE"}
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* CLI wrapper for convert-demo.
|
|
3
|
-
*
|
|
4
|
-
* Usage: visionclaw convert-demo <output-dir> [-o demo-output-dir]
|
|
5
|
-
*/
|
|
6
|
-
import { convertDemo } from "./convert-demo.js";
|
|
7
|
-
export async function runConvertDemo(args) {
|
|
8
|
-
let outputDir = "";
|
|
9
|
-
let demoOutputDir;
|
|
10
|
-
for (let i = 0; i < args.length; i++) {
|
|
11
|
-
const arg = args[i];
|
|
12
|
-
if ((arg === "-o" || arg === "--output-dir") && i + 1 < args.length) {
|
|
13
|
-
demoOutputDir = args[++i];
|
|
14
|
-
}
|
|
15
|
-
else if (!arg.startsWith("-") && !outputDir) {
|
|
16
|
-
outputDir = arg;
|
|
17
|
-
}
|
|
18
|
-
}
|
|
19
|
-
if (!outputDir) {
|
|
20
|
-
console.error("Usage: visionclaw convert-demo <output-dir> [-o demo-output-dir]\n" +
|
|
21
|
-
"\n" +
|
|
22
|
-
" <output-dir> Path to pipeline output (contains restructured-*.jsonl + workspace/)\n" +
|
|
23
|
-
" -o <dir> Where to write demo-*.jsonl (defaults to output-dir)");
|
|
24
|
-
process.exit(1);
|
|
25
|
-
}
|
|
26
|
-
console.log(`Converting to demo format: ${outputDir}`);
|
|
27
|
-
await convertDemo({ outputDir, demoOutputDir });
|
|
28
|
-
console.log("Done.");
|
|
29
|
-
}
|
|
30
|
-
//# sourceMappingURL=convert-demo-cli.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"convert-demo-cli.js","sourceRoot":"","sources":["../../src/data-processing/convert-demo-cli.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAc;IACjD,IAAI,SAAS,GAAG,EAAE,CAAC;IACnB,IAAI,aAAiC,CAAC;IAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,IAAI,CAAC,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YACpE,aAAa,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,CAAC;aAAM,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YAC9C,SAAS,GAAG,GAAG,CAAC;QAClB,CAAC;IACH,CAAC;IAED,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CACX,oEAAoE;YACpE,IAAI;YACJ,yFAAyF;YACzF,uEAAuE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,8BAA8B,SAAS,EAAE,CAAC,CAAC;IACvD,MAAM,WAAW,CAAC,EAAE,SAAS,EAAE,aAAa,EAAE,CAAC,CAAC;IAChD,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AACvB,CAAC"}
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Convert restructured JSONL records to the OpenClaw demo.json format.
|
|
3
|
-
*
|
|
4
|
-
* The demo format uses an OpenAI chat-completion-style messages array:
|
|
5
|
-
* system → user → (assistant → tool)* → assistant
|
|
6
|
-
*
|
|
7
|
-
* Usage:
|
|
8
|
-
* npx tsx src/data-processing/convert-demo.ts <output-dir> [-o <demo-output-dir>]
|
|
9
|
-
*
|
|
10
|
-
* Reads:
|
|
11
|
-
* <output-dir>/restructured-*.jsonl
|
|
12
|
-
* <output-dir>/workspace/system-prompt.md
|
|
13
|
-
*
|
|
14
|
-
* Writes:
|
|
15
|
-
* <demo-output-dir>/demo-clean.jsonl (or specified output dir)
|
|
16
|
-
* <demo-output-dir>/demo-interrupted.jsonl
|
|
17
|
-
* <demo-output-dir>/demo-heartbeat.jsonl
|
|
18
|
-
*/
|
|
19
|
-
export interface ConvertDemoOptions {
|
|
20
|
-
/** Path to output dir (containing restructured-*.jsonl and workspace/) */
|
|
21
|
-
outputDir: string;
|
|
22
|
-
/** Where to write demo JSONL files (defaults to outputDir) */
|
|
23
|
-
demoOutputDir?: string;
|
|
24
|
-
}
|
|
25
|
-
export declare function convertDemo(opts: ConvertDemoOptions): Promise<void>;
|
|
26
|
-
//# sourceMappingURL=convert-demo.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"convert-demo.d.ts","sourceRoot":"","sources":["../../src/data-processing/convert-demo.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AA2PH,MAAM,WAAW,kBAAkB;IACjC,0EAA0E;IAC1E,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,aAAa,CAAC,EAAE,MAAM,CAAC;CACxB;AAED,wBAAsB,WAAW,CAAC,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CAiCzE"}
|