visionclaw 0.1.187-beta.8 → 0.1.187-dev.refactor-computer-use-direct-coordinates.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/dist/agent/loop.js +1 -1
  2. package/dist/agent/loop.js.map +1 -1
  3. package/dist/agent/providers/client-factory.d.ts +1 -1
  4. package/dist/agent/providers/client-factory.js +1 -1
  5. package/dist/agent/runtime-surface.d.ts +1 -1
  6. package/dist/agent/runtime-surface.d.ts.map +1 -1
  7. package/dist/agent/runtime-surface.js +35 -18
  8. package/dist/agent/runtime-surface.js.map +1 -1
  9. package/dist/agent/system-prompt.d.ts.map +1 -1
  10. package/dist/agent/system-prompt.js +1 -3
  11. package/dist/agent/system-prompt.js.map +1 -1
  12. package/dist/builtin-skills/macos-automation/SKILL.md +13 -10
  13. package/dist/onboarding/generate-wallpaper.d.ts +3 -8
  14. package/dist/onboarding/generate-wallpaper.d.ts.map +1 -1
  15. package/dist/onboarding/generate-wallpaper.js +3 -123
  16. package/dist/onboarding/generate-wallpaper.js.map +1 -1
  17. package/dist/tools/computer-use.d.ts +56 -6
  18. package/dist/tools/computer-use.d.ts.map +1 -1
  19. package/dist/tools/computer-use.js +129 -286
  20. package/dist/tools/computer-use.js.map +1 -1
  21. package/dist-agent/bundle.cjs +208 -574
  22. package/package.json +1 -1
  23. package/dist/agent/applied-credential-signature.d.ts +0 -53
  24. package/dist/agent/applied-credential-signature.d.ts.map +0 -1
  25. package/dist/agent/applied-credential-signature.js +0 -137
  26. package/dist/agent/applied-credential-signature.js.map +0 -1
  27. package/dist/agent/tunnel-credential-handler.d.ts +0 -90
  28. package/dist/agent/tunnel-credential-handler.d.ts.map +0 -1
  29. package/dist/agent/tunnel-credential-handler.js +0 -162
  30. package/dist/agent/tunnel-credential-handler.js.map +0 -1
  31. package/dist/billing/payg-handler.d.ts +0 -29
  32. package/dist/billing/payg-handler.d.ts.map +0 -1
  33. package/dist/billing/payg-handler.js +0 -92
  34. package/dist/billing/payg-handler.js.map +0 -1
  35. package/dist/billing/payment-handler.d.ts +0 -24
  36. package/dist/billing/payment-handler.d.ts.map +0 -1
  37. package/dist/billing/payment-handler.js +0 -101
  38. package/dist/billing/payment-handler.js.map +0 -1
  39. package/dist/builtin-skills/catalog/phone-adb-automation/SKILL.md +0 -412
  40. package/dist/builtin-skills/catalog/phone-adb-automation/phone_input.sh +0 -132
  41. package/dist/builtin-skills/catalog/phone-adb-automation/phone_launch.sh +0 -166
  42. package/dist/builtin-skills/catalog/phone-adb-automation/phone_screenshot.sh +0 -87
  43. package/dist/builtin-skills/catalog/phone-adb-automation/phone_security_kbd.py +0 -174
  44. package/dist/builtin-skills/catalog/phone-adb-automation/phone_setup.sh +0 -274
  45. package/dist/builtin-skills/catalog/phone-adb-automation/phone_swipe.sh +0 -111
  46. package/dist/builtin-skills/catalog/phone-adb-automation/phone_tap.sh +0 -87
  47. package/dist/builtin-skills/catalog/phone-adb-automation/phone_ui_parse.py +0 -176
  48. package/dist/builtin-skills/catalog/phone-adb-automation/phone_wake_unlock.sh +0 -67
  49. package/dist/builtin-skills/transcribe-audio/SKILL.md +0 -122
  50. package/dist/data-processing/convert-demo-cli.d.ts +0 -7
  51. package/dist/data-processing/convert-demo-cli.d.ts.map +0 -1
  52. package/dist/data-processing/convert-demo-cli.js +0 -30
  53. package/dist/data-processing/convert-demo-cli.js.map +0 -1
  54. package/dist/data-processing/convert-demo.d.ts +0 -26
  55. package/dist/data-processing/convert-demo.d.ts.map +0 -1
  56. package/dist/data-processing/convert-demo.js +0 -233
  57. package/dist/data-processing/convert-demo.js.map +0 -1
  58. package/dist/obs/rdp/icons/icons/app_windows.svg +0 -4
  59. package/dist/obs/rdp/icons/icons/clip_get.svg +0 -4
  60. package/dist/obs/rdp/icons/icons/clip_send.svg +0 -4
  61. package/dist/obs/rdp/icons/icons/clip_shared.svg +0 -4
  62. package/dist/obs/rdp/icons/icons/clipboard.svg +0 -4
  63. package/dist/obs/rdp/icons/icons/clipboard_shared.svg +0 -4
  64. package/dist/obs/rdp/icons/icons/control.svg +0 -4
  65. package/dist/obs/rdp/icons/icons/desktop.svg +0 -4
  66. package/dist/obs/rdp/icons/icons/display.svg +0 -4
  67. package/dist/obs/rdp/icons/icons/launchpad.svg +0 -4
  68. package/dist/obs/rdp/icons/icons/mission_control.svg +0 -4
  69. package/dist/obs/rdp/icons/icons/screenshot.svg +0 -4
  70. package/dist/obs/rdp/icons/icons/zoom_actual.svg +0 -4
  71. package/dist/obs/rdp/icons/icons/zoom_fit.svg +0 -4
  72. package/dist/obs/rdp/icons/icons/zoom_in.svg +0 -4
  73. package/dist/obs/rdp/icons/icons/zoom_out.svg +0 -4
  74. package/dist/obs/tunnel-telemetry.d.ts +0 -46
  75. package/dist/obs/tunnel-telemetry.d.ts.map +0 -1
  76. package/dist/obs/tunnel-telemetry.js +0 -70
  77. package/dist/obs/tunnel-telemetry.js.map +0 -1
  78. package/dist/service/gbox-tun.d.ts +0 -14
  79. package/dist/service/gbox-tun.d.ts.map +0 -1
  80. package/dist/service/gbox-tun.js +0 -315
  81. package/dist/service/gbox-tun.js.map +0 -1
  82. package/dist/tools/coordinate-resolver.d.ts +0 -30
  83. package/dist/tools/coordinate-resolver.d.ts.map +0 -1
  84. package/dist/tools/coordinate-resolver.js +0 -104
  85. package/dist/tools/coordinate-resolver.js.map +0 -1
  86. package/dist/utils/wechat-monitor.d.ts +0 -21
  87. package/dist/utils/wechat-monitor.d.ts.map +0 -1
  88. package/dist/utils/wechat-monitor.js +0 -88
  89. package/dist/utils/wechat-monitor.js.map +0 -1
@@ -1,111 +0,0 @@
1
- #!/usr/bin/env bash
2
- # phone_swipe.sh - Swipe on Android phone with presets and coordinate scaling
3
- #
4
- # Usage:
5
- # bash phone_swipe.sh <preset|x1 y1 x2 y2> [options]
6
- #
7
- # Presets (auto-calculated from device resolution):
8
- # up - Scroll up (swipe from bottom to top, reveals content below)
9
- # down - Scroll down (swipe from top to bottom, reveals content above)
10
- # left - Swipe left (next page)
11
- # right - Swipe right (previous page)
12
- #
13
- # Options:
14
- # -d DEVICE_ID Target specific device
15
- # -S SCALE Input coords from scaled screenshot (for custom x1 y1 x2 y2)
16
- # -t DURATION Swipe duration in ms (default: 300)
17
- # -q Quiet mode
18
- #
19
- # Examples:
20
- # bash phone_swipe.sh up # Scroll up
21
- # bash phone_swipe.sh down # Scroll down
22
- # bash phone_swipe.sh left # Swipe left
23
- # bash phone_swipe.sh 100 2000 100 800 # Custom swipe (native coords)
24
- # bash phone_swipe.sh 50 1000 50 400 -S 50 # Custom swipe (scaled coords)
25
- # bash phone_swipe.sh up -t 500 # Slow scroll up
26
-
27
- set -euo pipefail
28
-
29
- DEVICE=""
30
- SCALE=""
31
- DURATION=300
32
- QUIET=false
33
-
34
- # Collect all args, separate flags from positional
35
- POSITIONAL=()
36
- ARGV=("$@")
37
- i=0
38
- while [ $i -lt ${#ARGV[@]} ]; do
39
- case "${ARGV[$i]}" in
40
- -d) DEVICE="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
41
- -S) SCALE="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
42
- -t) DURATION="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
43
- -q) QUIET=true; i=$((i+1)) ;;
44
- -*) echo "Unknown option: ${ARGV[$i]}" >&2; exit 1 ;;
45
- *) POSITIONAL+=("${ARGV[$i]}"); i=$((i+1)) ;;
46
- esac
47
- done
48
-
49
- ADB_CMD="adb"
50
- if [ -n "$DEVICE" ]; then
51
- ADB_CMD="adb -s $DEVICE"
52
- fi
53
-
54
- info() { $QUIET || echo "[phone_swipe] $*" >&2; }
55
-
56
- if [ ${#POSITIONAL[@]} -eq 0 ]; then
57
- echo "Usage: $0 <up|down|left|right|x1 y1 x2 y2> [-d device] [-S scale%] [-t ms] [-q]" >&2
58
- exit 1
59
- fi
60
-
61
- # Get device resolution for presets
62
- get_resolution() {
63
- local size
64
- size=$($ADB_CMD shell wm size | grep "Physical" | awk '{print $3}')
65
- echo "$size"
66
- }
67
-
68
- PRESET="${POSITIONAL[0]}"
69
-
70
- case "$PRESET" in
71
- up|down|left|right)
72
- SIZE=$(get_resolution)
73
- W=$(echo "$SIZE" | cut -d'x' -f1)
74
- H=$(echo "$SIZE" | cut -d'x' -f2)
75
- CX=$((W / 2))
76
- CY=$((H / 2))
77
-
78
- case "$PRESET" in
79
- up) X1=$CX; Y1=$((H * 3 / 4)); X2=$CX; Y2=$((H / 4)) ;;
80
- down) X1=$CX; Y1=$((H / 4)); X2=$CX; Y2=$((H * 3 / 4)) ;;
81
- left) X1=$((W * 4 / 5)); Y1=$CY; X2=$((W / 5)); Y2=$CY ;;
82
- right) X1=$((W / 5)); Y1=$CY; X2=$((W * 4 / 5)); Y2=$CY ;;
83
- esac
84
-
85
- info "Preset '$PRESET' on ${W}x${H}: ($X1,$Y1) -> ($X2,$Y2)"
86
- ;;
87
- *)
88
- if [ ${#POSITIONAL[@]} -lt 4 ]; then
89
- echo "ERROR: Custom swipe needs 4 coordinates: x1 y1 x2 y2" >&2
90
- exit 1
91
- fi
92
- X1="${POSITIONAL[0]}"
93
- Y1="${POSITIONAL[1]}"
94
- X2="${POSITIONAL[2]}"
95
- Y2="${POSITIONAL[3]}"
96
-
97
- if [ -n "$SCALE" ] && [ "$SCALE" -gt 0 ]; then
98
- X1=$((X1 * 100 / SCALE))
99
- Y1=$((Y1 * 100 / SCALE))
100
- X2=$((X2 * 100 / SCALE))
101
- Y2=$((Y2 * 100 / SCALE))
102
- info "Scaled coords at ${SCALE}% -> Native: ($X1,$Y1) -> ($X2,$Y2)"
103
- else
104
- info "Swipe: ($X1,$Y1) -> ($X2,$Y2)"
105
- fi
106
- ;;
107
- esac
108
-
109
- $ADB_CMD shell input swipe "$X1" "$Y1" "$X2" "$Y2" "$DURATION"
110
- info "Done."
111
- echo "swiped $X1 $Y1 $X2 $Y2"
@@ -1,87 +0,0 @@
1
- #!/usr/bin/env bash
2
- # phone_tap.sh - Tap on Android phone with optional coordinate scaling
3
- #
4
- # Usage:
5
- # bash phone_tap.sh <x> <y> [options]
6
- #
7
- # The x,y coordinates are in the coordinate space you specify:
8
- # - By default, treated as native device coordinates (no scaling)
9
- # - With -S, treated as scaled coordinates and auto-converted to native
10
- #
11
- # Options:
12
- # -d DEVICE_ID Target specific device
13
- # -S SCALE Input coords are from a screenshot scaled to this % (e.g. 50)
14
- # Will multiply by 100/SCALE to get native coords
15
- # -l Long press (500ms hold)
16
- # -t DURATION Hold duration in ms (default: tap=50, long=500)
17
- # -q Quiet mode
18
- #
19
- # Examples:
20
- # bash phone_tap.sh 610 1328 # Tap at native coords
21
- # bash phone_tap.sh 305 664 -S 50 # Tap at 50%-scaled coords -> auto-converts to 610,1328
22
- # bash phone_tap.sh 305 664 -S 50 -l # Long press at scaled coords
23
-
24
- set -euo pipefail
25
-
26
- DEVICE=""
27
- SCALE=""
28
- LONG=false
29
- DURATION=""
30
- QUIET=false
31
-
32
- # Parse options; flags may appear before, between, or after the positional args
33
- ARGS=()
34
- while [ $# -gt 0 ]; do
35
- case "$1" in
36
- -d) DEVICE="$2"; shift 2 ;;
37
- -S) SCALE="$2"; shift 2 ;;
38
- -l) LONG=true; shift ;;
39
- -t) DURATION="$2"; shift 2 ;;
40
- -q) QUIET=true; shift ;;
41
- -*) echo "Unknown option: $1" >&2; exit 1 ;;
42
- *) ARGS+=("$1"); shift ;;
43
- esac
44
- done
45
-
46
- if [ ${#ARGS[@]} -lt 2 ]; then
47
- echo "Usage: $0 <x> <y> [-d device] [-S scale%] [-l] [-t duration_ms] [-q]" >&2
48
- exit 1
49
- fi
50
-
51
- X="${ARGS[0]}"
52
- Y="${ARGS[1]}"
53
-
54
- ADB_CMD="adb"
55
- if [ -n "$DEVICE" ]; then
56
- ADB_CMD="adb -s $DEVICE"
57
- fi
58
-
59
- info() { $QUIET || echo "[phone_tap] $*" >&2; }
60
-
61
- # Scale coordinates if needed
62
- if [ -n "$SCALE" ] && [ "$SCALE" -gt 0 ]; then
63
- NATIVE_X=$((X * 100 / SCALE))
64
- NATIVE_Y=$((Y * 100 / SCALE))
65
- info "Scaled ($X, $Y) at ${SCALE}% -> Native ($NATIVE_X, $NATIVE_Y)"
66
- X=$NATIVE_X
67
- Y=$NATIVE_Y
68
- fi
69
-
70
- # Determine duration
71
- if [ -n "$DURATION" ]; then
72
- DUR=$DURATION
73
- elif $LONG; then
74
- DUR=500
75
- else
76
- DUR=50
77
- fi
78
-
79
- if $LONG || [ "$DUR" -gt 100 ]; then
80
- info "Long press at ($X, $Y) for ${DUR}ms"
81
- $ADB_CMD shell input swipe "$X" "$Y" "$X" "$Y" "$DUR"
82
- else
83
- info "Tap at ($X, $Y)"
84
- $ADB_CMD shell input tap "$X" "$Y"
85
- fi
86
-
87
- echo "tapped $X $Y"
@@ -1,176 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- phone_ui_parse.py - Dump and parse Android UI hierarchy via ADB uiautomator.
4
-
5
- Usage:
6
- python3 phone_ui_parse.py [options]
7
-
8
- Options:
9
- -d DEVICE_ID Target specific device
10
- -f XML_FILE Parse existing XML file instead of dumping from device
11
- -c Show only clickable elements
12
- -s SEARCH Filter by text/description containing SEARCH (case-insensitive)
13
- -j Output as JSON
14
- --bounds-only Show only elements with bounds (currently has no effect: elements without bounds are always skipped)
15
- -q Quiet (no info messages)
16
-
17
- Output:
18
- Each line: "text" desc="description" id=resource_id center=(x,y) clickable=True/False class=ClassName (text, desc, id and class are omitted when empty; full bounds are only included in JSON output)
19
-
20
- Examples:
21
- python3 phone_ui_parse.py # Dump & parse all elements
22
- python3 phone_ui_parse.py -c # Only clickable elements
23
- python3 phone_ui_parse.py -s "send" # Elements matching "send"
24
- python3 phone_ui_parse.py -c -s "confirm" -j # Clickable "confirm" as JSON
25
- python3 phone_ui_parse.py -f /tmp/ui.xml # Parse local XML file
26
- """
27
-
28
- import argparse
29
- import json
30
- import os
31
- import re
32
- import subprocess
33
- import sys
34
- import tempfile
35
- import xml.etree.ElementTree as ET
36
-
37
-
38
- def run_adb(device, *args):
39
- cmd = ["adb"]
40
- if device:
41
- cmd += ["-s", device]
42
- cmd += list(args)
43
- result = subprocess.run(cmd, capture_output=True, text=True)
44
- return result.stdout, result.stderr, result.returncode
45
-
46
-
47
- def dump_ui(device=None):
48
- """Dump UI hierarchy from device and return local XML path."""
49
- remote_path = "/sdcard/ui_dump.xml"
50
- local_path = os.path.join(tempfile.gettempdir(), f"phone_ui_{os.getpid()}.xml")
51
-
52
- _, stderr, rc = run_adb(device, "shell", "uiautomator", "dump", remote_path)
53
- if rc != 0:
54
- print(f"ERROR: uiautomator dump failed: {stderr}", file=sys.stderr)
55
- sys.exit(1)
56
-
57
- _, stderr, rc = run_adb(device, "pull", remote_path, local_path)
58
- if rc != 0:
59
- print(f"ERROR: Failed to pull UI dump: {stderr}", file=sys.stderr)
60
- sys.exit(1)
61
-
62
- run_adb(device, "shell", "rm", "-f", remote_path)
63
- return local_path
64
-
65
-
66
- def parse_ui_xml(xml_path, clickable_only=False, search=None):
67
- """Parse UI XML and return list of element dicts."""
68
- tree = ET.parse(xml_path)
69
- root = tree.getroot()
70
- elements = []
71
-
72
- for node in root.iter():
73
- text = node.get("text", "")
74
- desc = node.get("content-desc", "")
75
- bounds = node.get("bounds", "")
76
- clickable = node.get("clickable", "false")
77
- class_name = node.get("class", "")
78
- resource_id = node.get("resource-id", "")
79
- enabled = node.get("enabled", "true")
80
- focused = node.get("focused", "false")
81
- selected = node.get("selected", "false")
82
- checked = node.get("checked", "false")
83
-
84
- if not bounds:
85
- continue
86
-
87
- if clickable_only and clickable != "true":
88
- continue
89
-
90
- if search:
91
- combined = f"{text} {desc} {resource_id}".lower()
92
- if search.lower() not in combined:
93
- continue
94
-
95
- # Parse bounds: [x1,y1][x2,y2]
96
- m = re.findall(r"\d+", bounds)
97
- if len(m) != 4:
98
- continue
99
-
100
- x1, y1, x2, y2 = int(m[0]), int(m[1]), int(m[2]), int(m[3])
101
- cx = (x1 + x2) // 2
102
- cy = (y1 + y2) // 2
103
-
104
- elements.append({
105
- "text": text,
106
- "content_desc": desc,
107
- "resource_id": resource_id,
108
- "class": class_name,
109
- "bounds": [x1, y1, x2, y2],
110
- "center": [cx, cy],
111
- "clickable": clickable == "true",
112
- "enabled": enabled == "true",
113
- "focused": focused == "true",
114
- "selected": selected == "true",
115
- "checked": checked == "true",
116
- })
117
-
118
- return elements
119
-
120
-
121
- def main():
122
- parser = argparse.ArgumentParser(description="Parse Android UI hierarchy")
123
- parser.add_argument("-d", "--device", help="Target device ID")
124
- parser.add_argument("-f", "--file", help="Parse existing XML file")
125
- parser.add_argument("-c", "--clickable", action="store_true", help="Clickable only")
126
- parser.add_argument("-s", "--search", help="Filter by text/desc")
127
- parser.add_argument("-j", "--json", action="store_true", help="JSON output")
128
- parser.add_argument("--bounds-only", action="store_true", help="Elements with bounds only")
129
- parser.add_argument("-q", "--quiet", action="store_true", help="Quiet mode")
130
- args = parser.parse_args()
131
-
132
- if args.file:
133
- xml_path = args.file
134
- cleanup = False
135
- else:
136
- if not args.quiet:
137
- print("[phone_ui_parse] Dumping UI hierarchy...", file=sys.stderr)
138
- xml_path = dump_ui(args.device)
139
- cleanup = True
140
-
141
- elements = parse_ui_xml(xml_path, args.clickable, args.search)
142
-
143
- if cleanup:
144
- os.unlink(xml_path)
145
-
146
- if not args.quiet:
147
- print(f"[phone_ui_parse] Found {len(elements)} elements", file=sys.stderr)
148
-
149
- if args.json:
150
- print(json.dumps(elements, ensure_ascii=False, indent=2))
151
- else:
152
- for el in elements:
153
- text = el["text"]
154
- desc = el["content_desc"]
155
- cx, cy = el["center"]
156
- click = el["clickable"]
157
- cls = el["class"].split(".")[-1] if el["class"] else ""
158
- rid = el["resource_id"].split("/")[-1] if el["resource_id"] else ""
159
-
160
- parts = []
161
- if text:
162
- parts.append(f'"{text}"')
163
- if desc:
164
- parts.append(f'desc="{desc}"')
165
- if rid:
166
- parts.append(f"id={rid}")
167
- parts.append(f"center=({cx},{cy})")
168
- parts.append(f"clickable={click}")
169
- if cls:
170
- parts.append(f"class={cls}")
171
-
172
- print(" ".join(parts))
173
-
174
-
175
- if __name__ == "__main__":
176
- main()
@@ -1,67 +0,0 @@
1
- #!/usr/bin/env bash
2
- # phone_wake_unlock.sh - Wake up and unlock Android phone screen
3
- #
4
- # Usage:
5
- # bash phone_wake_unlock.sh [options]
6
- #
7
- # Options:
8
- # -d DEVICE_ID Target specific device
9
- # -q Quiet mode
10
- #
11
- # The script checks if the screen is on, wakes it if needed,
12
- # and performs a swipe-up gesture to unlock (works for swipe-to-unlock screens).
13
- # For PIN/pattern locks, additional input is needed after this script.
14
-
15
- set -euo pipefail
16
-
17
- DEVICE=""
18
- QUIET=false
19
-
20
- while getopts "d:q" opt; do
21
- case $opt in
22
- d) DEVICE="$OPTARG" ;;
23
- q) QUIET=true ;;
24
- *) echo "Usage: $0 [-d device] [-q]" >&2; exit 1 ;;
25
- esac
26
- done
27
-
28
- ADB_CMD="adb"
29
- if [ -n "$DEVICE" ]; then
30
- ADB_CMD="adb -s $DEVICE"
31
- fi
32
-
33
- info() { $QUIET || echo "[phone_wake] $*" >&2; }
34
-
35
- # Verify device connection
36
- if ! $ADB_CMD get-state >/dev/null 2>&1; then
37
- echo "ERROR: No device connected or device unauthorized" >&2
38
- exit 1
39
- fi
40
-
41
- # Check screen state
42
- WAKE_STATE=$($ADB_CMD shell dumpsys power | grep "mWakefulness=" | head -1 | sed 's/.*mWakefulness=//')
43
-
44
- if [ "$WAKE_STATE" = "Awake" ]; then
45
- info "Screen is already awake"
46
- else
47
- info "Screen is asleep, waking up..."
48
- $ADB_CMD shell input keyevent KEYCODE_WAKEUP
49
- sleep 0.5
50
- fi
51
-
52
- # Get screen resolution for swipe coordinates
53
- SIZE=$($ADB_CMD shell wm size | grep "Physical" | awk '{print $3}')
54
- WIDTH=$(echo "$SIZE" | cut -d'x' -f1)
55
- HEIGHT=$(echo "$SIZE" | cut -d'x' -f2)
56
-
57
- # Swipe up from three-quarters of the way down the screen to one-third of the way down to unlock
58
- START_X=$((WIDTH / 2))
59
- START_Y=$((HEIGHT * 3 / 4))
60
- END_Y=$((HEIGHT / 3))
61
-
62
- info "Swiping to unlock (${START_X}, ${START_Y}) -> (${START_X}, ${END_Y})..."
63
- $ADB_CMD shell input swipe "$START_X" "$START_Y" "$START_X" "$END_Y" 300
64
- sleep 0.5
65
-
66
- info "Done. Phone should be unlocked (if no PIN/pattern required)."
67
- echo "unlocked"
@@ -1,122 +0,0 @@
1
- ---
2
- description: Use this skill to transcribe audio files (voice messages, recordings) to text, extract audio from Bilibili videos, or handle Telegram voice messages. Supports OGG, MP3, WAV, M4A, and other common audio formats. Uses whisper.cpp with auto language detection -- supports Chinese, English, and all other languages.
3
- ---
4
-
5
- # Transcribe Audio & Media Extraction
6
-
7
- Transcribe audio files to text using whisper.cpp (local, free, private, native macOS binary).
8
-
9
- ## Prerequisites
10
-
11
- whisper-cpp, ffmpeg, and the whisper model are checked during `visionclaw setup`. If they're missing, run setup again or install manually:
12
-
13
- ```bash
14
- brew install whisper-cpp ffmpeg
15
- mkdir -p ~/.local/share/whisper-cpp && curl -L -o ~/.local/share/whisper-cpp/ggml-large-v3-turbo.bin 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin?download=true'
16
- ```
17
-
18
- The binary is `whisper-cli` (not `whisper`). If it is not on PATH, invoke it by its full path: `/opt/homebrew/bin/whisper-cli`
19
-
20
- ## Steps
21
-
22
- 1. **Download the audio file** if it's a URL:
23
- ```bash
24
- curl -s -o /tmp/audio_input.ogg "THE_AUDIO_URL"
25
- ```
26
-
27
- 2. **Convert to 16kHz WAV** (required by whisper.cpp):
28
- ```bash
29
- ffmpeg -y -i /tmp/audio_input.ogg -ar 16000 -ac 1 /tmp/audio_input.wav 2>/dev/null
30
- ```
31
-
32
- 3. **Transcribe**:
33
- ```bash
34
- whisper-cli -m ~/.local/share/whisper-cpp/ggml-large-v3-turbo.bin -f /tmp/audio_input.wav --no-timestamps -l auto 2>/dev/null
35
- ```
36
-
37
- 4. **Use the transcription** as needed -- respond to the user, take action, etc.
38
-
39
- > **CRITICAL**: Always use `-l auto` for language detection. The default is `-l en` which BREAKS Chinese transcription.
40
-
41
- ## Notes
42
-
43
- - Uses `large-v3-turbo` (~1.5GB) -- near-large accuracy at half the size of full large-v3.
44
- - whisper.cpp is optimized for Apple Silicon -- runs efficiently on any M1/M2/M3/M4 MacBook.
45
- - Supports: OGG, MP3, WAV, M4A, FLAC, WEBM, and most common audio formats (via ffmpeg conversion).
46
- - Runs entirely locally -- no API calls, no costs, fully private.
47
- - If whisper-cli is not found, verify with `which whisper-cli`. The Homebrew package is `whisper-cpp` but the binary is `whisper-cli`.
48
-
49
- ## Telegram Voice Messages
50
-
51
- Telegram voice messages arrive as OGG files:
52
-
53
- ```bash
54
- curl -s -o /tmp/voice.ogg "{file_url}"
55
- ffmpeg -y -i /tmp/voice.ogg -ar 16000 -ac 1 /tmp/voice.wav 2>/dev/null
56
- whisper-cli -m ~/.local/share/whisper-cpp/ggml-large-v3-turbo.bin -f /tmp/voice.wav --no-timestamps -l auto 2>/dev/null
57
- ```
58
-
59
- ## Bilibili Video Audio Extraction
60
-
61
- ### Method: Browser Network Interception
62
-
63
- When the standard yt-dlp approach fails (Bilibili returns HTTP 412), use Playwright to intercept the .m4s audio stream URLs directly from the browser's network requests.
64
-
65
- ```python
66
- # 1. Navigate to Bilibili video in Playwright browser
67
- await page.goto(f'https://www.bilibili.com/video/{bv_id}')
68
-
69
- # 2. Wait for video to load, then check network requests
70
- # The audio stream URL contains "30216" in the filename (30216 = audio codec ID)
71
- # Example: [CID]-1-30216.m4s
72
-
73
- # 3. Download the audio .m4s with Referer header
74
- curl -s -L \
75
- -H "Referer: https://www.bilibili.com/video/[BV_ID]" \
76
- -H "User-Agent: Mozilla/5.0 ..." \
77
- -o /tmp/audio.m4a \
78
- "{audio_url}"
79
- ```
80
-
81
- ### Finding Bilibili Video for a WeChat 视频号 Video
82
-
83
- 1. Note the creator name (shown in WeChat video player) and video title keywords
84
- 2. Search Bilibili: `https://search.bilibili.com/all?keyword={creator}+{keywords}`
85
- 3. Extract BV IDs from search results (look for `BV[A-Za-z0-9]+` patterns)
86
- 4. Get video metadata from initial page state:
87
- ```javascript
88
- window.__INITIAL_STATE__.videoData.cid // CID for subtitle API
89
- ```
90
- 5. Check for subtitles via API: `GET https://api.bilibili.com/x/player/v2?bvid={bvid}&cid={cid}`
91
- - If `subtitle.subtitles` is empty → use audio extraction + Whisper
92
-
93
- ### M4S File ID Numbers (Bilibili)
94
- - `30032` — Video stream
95
- - `30216` — Audio stream (64kbps AAC)
96
- - `30280` — Higher quality audio
97
-
98
- ### yt-dlp (when it works)
99
- ```bash
100
- yt-dlp --cookies-from-browser chrome \
101
- --write-auto-subs --sub-lang zh-CN \
102
- --skip-download -o /tmp/subtitle \
103
- https://www.bilibili.com/video/{BV_ID}
104
- ```
105
- This approach often fails with HTTP 412 on Bilibili, even with cookies. Prefer the network interception method.
106
-
107
- ## Generating Voice Replies (TTS)
108
-
109
- To send a voice message back to the user, use macOS text-to-speech:
110
-
111
- ```bash
112
- # For Chinese text, use the Tingting voice:
113
- say -v "Tingting (Chinese (China mainland))" -o /tmp/reply.aiff "你好!这是语音回复"
114
- # For English text:
115
- say -o /tmp/reply.aiff "Your reply text here"
116
- # Convert to OGG:
117
- ffmpeg -y -i /tmp/reply.aiff /tmp/reply_voice.ogg 2>/dev/null
118
- ```
119
-
120
- Then send via notify_user with `file_path: "/tmp/reply_voice.ogg"`.
121
-
122
- Most users prefer TEXT messages. Use voice only when specifically requested.
@@ -1,7 +0,0 @@
1
- /**
2
- * CLI wrapper for convert-demo.
3
- *
4
- * Usage: visionclaw convert-demo <output-dir> [-o demo-output-dir]
5
- */
6
- export declare function runConvertDemo(args: string[]): Promise<void>;
7
- //# sourceMappingURL=convert-demo-cli.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"convert-demo-cli.d.ts","sourceRoot":"","sources":["../../src/data-processing/convert-demo-cli.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,wBAAsB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC,CA0BlE"}
@@ -1,30 +0,0 @@
1
- /**
2
- * CLI wrapper for convert-demo.
3
- *
4
- * Usage: visionclaw convert-demo <output-dir> [-o demo-output-dir]
5
- */
6
- import { convertDemo } from "./convert-demo.js";
7
- export async function runConvertDemo(args) {
8
- let outputDir = "";
9
- let demoOutputDir;
10
- for (let i = 0; i < args.length; i++) {
11
- const arg = args[i];
12
- if ((arg === "-o" || arg === "--output-dir") && i + 1 < args.length) {
13
- demoOutputDir = args[++i];
14
- }
15
- else if (!arg.startsWith("-") && !outputDir) {
16
- outputDir = arg;
17
- }
18
- }
19
- if (!outputDir) {
20
- console.error("Usage: visionclaw convert-demo <output-dir> [-o demo-output-dir]\n" +
21
- "\n" +
22
- " <output-dir> Path to pipeline output (contains restructured-*.jsonl + workspace/)\n" +
23
- " -o <dir> Where to write demo-*.jsonl (defaults to output-dir)");
24
- process.exit(1);
25
- }
26
- console.log(`Converting to demo format: ${outputDir}`);
27
- await convertDemo({ outputDir, demoOutputDir });
28
- console.log("Done.");
29
- }
30
- //# sourceMappingURL=convert-demo-cli.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"convert-demo-cli.js","sourceRoot":"","sources":["../../src/data-processing/convert-demo-cli.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAEhD,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,IAAc;IACjD,IAAI,SAAS,GAAG,EAAE,CAAC;IACnB,IAAI,aAAiC,CAAC;IAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACpB,IAAI,CAAC,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;YACpE,aAAa,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,CAAC;aAAM,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YAC9C,SAAS,GAAG,GAAG,CAAC;QAClB,CAAC;IACH,CAAC;IAED,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CACX,oEAAoE;YACpE,IAAI;YACJ,yFAAyF;YACzF,uEAAuE,CACxE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,8BAA8B,SAAS,EAAE,CAAC,CAAC;IACvD,MAAM,WAAW,CAAC,EAAE,SAAS,EAAE,aAAa,EAAE,CAAC,CAAC;IAChD,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AACvB,CAAC"}
@@ -1,26 +0,0 @@
1
- /**
2
- * Convert restructured JSONL records to the OpenClaw demo.json format.
3
- *
4
- * The demo format uses an OpenAI chat-completion-style messages array:
5
- * system → user → (assistant → tool)* → assistant
6
- *
7
- * Usage:
8
- * npx tsx src/data-processing/convert-demo.ts <output-dir> [-o <demo-output-dir>]
9
- *
10
- * Reads:
11
- * <output-dir>/restructured-*.jsonl
12
- * <output-dir>/workspace/system-prompt.md
13
- *
14
- * Writes:
15
- * <demo-output-dir>/demo-clean.jsonl (or specified output dir)
16
- * <demo-output-dir>/demo-interrupted.jsonl
17
- * <demo-output-dir>/demo-heartbeat.jsonl
18
- */
19
- export interface ConvertDemoOptions {
20
- /** Path to output dir (containing restructured-*.jsonl and workspace/) */
21
- outputDir: string;
22
- /** Where to write demo JSONL files (defaults to outputDir) */
23
- demoOutputDir?: string;
24
- }
25
- export declare function convertDemo(opts: ConvertDemoOptions): Promise<void>;
26
- //# sourceMappingURL=convert-demo.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"convert-demo.d.ts","sourceRoot":"","sources":["../../src/data-processing/convert-demo.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AA2PH,MAAM,WAAW,kBAAkB;IACjC,0EAA0E;IAC1E,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,aAAa,CAAC,EAAE,MAAM,CAAC;CACxB;AAED,wBAAsB,WAAW,CAAC,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CAiCzE"}