@1presence/speech 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -2
- package/public/index.html +69 -26
- package/public/styles.css +53 -0
- package/server.mjs +13 -2
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@1presence/speech",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Free speech-to-text for Mac — dictate into any app using your browser's built-in voice recognition, no subscription required",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
7
|
-
"1presence-speech": "server.mjs"
|
|
7
|
+
"1presence-speech": "server.mjs",
|
|
8
|
+
"speech": "server.mjs"
|
|
8
9
|
},
|
|
9
10
|
"files": [
|
|
10
11
|
"server.mjs",
|
package/public/index.html
CHANGED
|
@@ -9,6 +9,11 @@
|
|
|
9
9
|
<body>
|
|
10
10
|
<main class="shell">
|
|
11
11
|
<section class="panel">
|
|
12
|
+
<div class="browser-banner" id="browserBanner" role="alert" hidden>
|
|
13
|
+
<strong>Speech recognition unavailable</strong>
|
|
14
|
+
This browser does not support the Web Speech API. Open this page in Chrome or Edge to dictate.
|
|
15
|
+
</div>
|
|
16
|
+
|
|
12
17
|
<div class="topbar">
|
|
13
18
|
<div>
|
|
14
19
|
<h1>Speech Terminal</h1>
|
|
@@ -40,34 +45,47 @@
|
|
|
40
45
|
<span>Live word mode: send words as they are recognized</span>
|
|
41
46
|
</label>
|
|
42
47
|
|
|
48
|
+
<section class="setup setup-compact" aria-labelledby="setup-title">
|
|
49
|
+
<h2 id="setup-title">macOS permissions</h2>
|
|
50
|
+
<p class="setup-lede">If you started Speech Terminal with <code>npx @1presence/speech</code>, do this once before dictating into other apps.</p>
|
|
51
|
+
<ol class="setup-quick">
|
|
52
|
+
<li>Allow <strong>microphone</strong> access when Chrome or Edge asks. The page tries to start listening automatically; if the browser blocks auto-start, click <strong>Start</strong>.</li>
|
|
53
|
+
<li>Click <strong>Test in 3s</strong>, focus any text field in another app within three seconds, and wait for the test text. If macOS asks for <strong>Accessibility</strong>, allow it.</li>
|
|
54
|
+
<li>If nothing types, open <strong>System Settings -> Privacy & Security -> Accessibility</strong> and enable the app that ran the command: <strong>Terminal</strong>, <strong>iTerm</strong>, <strong>Warp</strong>, or whichever terminal you used. Not Chrome.</li>
|
|
55
|
+
<li>If that app is missing, click <strong>+</strong> and add it from <code>/Applications</code>. If it is already enabled but keystrokes are still blocked, toggle it off and on.</li>
|
|
56
|
+
<li>Fully quit and reopen that terminal app, run <code>npx @1presence/speech</code> again, refresh this page, and retry <strong>Test in 3s</strong>.</li>
|
|
57
|
+
</ol>
|
|
58
|
+
</section>
|
|
59
|
+
|
|
43
60
|
<textarea id="transcript" spellcheck="true" placeholder="Your dictated text will appear here."></textarea>
|
|
44
61
|
|
|
45
62
|
<div class="hint" id="hint">
|
|
46
|
-
Web Speech API works best in Chrome or Edge. The default destination types into whichever app has focus after macOS Accessibility permission is granted.
|
|
63
|
+
Web Speech API works best in Chrome or Edge. The default destination types into whichever app has focus after macOS Accessibility permission is granted to the terminal app that ran <code>npx @1presence/speech</code>.
|
|
47
64
|
</div>
|
|
48
65
|
|
|
49
|
-
<section class="setup" aria-labelledby="
|
|
50
|
-
<
|
|
66
|
+
<section class="setup" aria-labelledby="usage-title">
|
|
67
|
+
<h3 id="usage-title">Using Speech Terminal</h3>
|
|
51
68
|
<ol>
|
|
52
|
-
<li>Start this bridge from Terminal, iTerm, Warp, or the terminal app you normally use: <code>cd speech && npm start</code>.</li>
|
|
53
|
-
<li>Open this page in Chrome or Edge at <code>http://127.0.0.1:8787</code>.</li>
|
|
54
|
-
<li>When the browser asks for microphone access, allow it. The page tries to start listening automatically; if Chrome blocks auto-start, click <strong>Start</strong>.</li>
|
|
55
69
|
<li>Leave <strong>Destination</strong> set to <strong>Focused app typed keystrokes</strong> for the broadest app support.</li>
|
|
56
|
-
<li>Click <strong>Test in 3s</strong>, focus any app text input within three seconds, and wait for the test text. If macOS asks for Accessibility access, allow it.</li>
|
|
57
|
-
<li>Open <strong>System Settings -> Privacy & Security -> Accessibility</strong>.</li>
|
|
58
|
-
<li>Enable the app that is running <code>npm start</code>: Terminal, iTerm, Warp, Visual Studio Code, Cursor, or whichever app launched the server. If it is missing, click <strong>+</strong> and add it from <code>/Applications</code>.</li>
|
|
59
|
-
<li>If that app is already enabled but keystrokes are still blocked, toggle it off and on, or remove it and add it again.</li>
|
|
60
|
-
<li>For iTerm, also check <strong>System Settings -> Privacy & Security -> Automation</strong> and allow the app running <code>npm start</code> to control iTerm.</li>
|
|
61
|
-
<li>Fully quit and reopen the app running <code>npm start</code>, then restart <code>npm start</code> after granting permission.</li>
|
|
62
70
|
<li>For manual mode, dictate here, edit the transcript, focus the target app, then click <strong>Send to app</strong>.</li>
|
|
63
|
-
<li>For
|
|
71
|
+
<li>For hands-free mode, focus the target app, then speak. Speech is sent automatically without returning to this page.</li>
|
|
64
72
|
<li>Short pauses create a new line in the transcript. Longer pauses add a full stop, start the next phrase with a capital letter, and show a blank line.</li>
|
|
65
73
|
<li><strong>Live word mode</strong> is enabled by default. It sends words earlier and optimistically while you speak. Turn it off to send only after the browser marks speech as settled.</li>
|
|
66
74
|
</ol>
|
|
67
75
|
|
|
76
|
+
<h3 id="developer-setup-title">Developer setup</h3>
|
|
77
|
+
<p class="setup-lede">If you are running from a local clone instead of the one-liner:</p>
|
|
78
|
+
<ol>
|
|
79
|
+
<li>Start the bridge from Terminal, iTerm, Warp, or the terminal app you normally use: <code>cd speech && npm start</code>.</li>
|
|
80
|
+
<li>Open this page in Chrome or Edge at <code>http://127.0.0.1:8787</code>.</li>
|
|
81
|
+
<li>Follow the macOS permissions steps above. Enable whichever app launched the server: Terminal, iTerm, Warp, Visual Studio Code, Cursor, or similar.</li>
|
|
82
|
+
<li>For iTerm, also check <strong>System Settings -> Privacy & Security -> Automation</strong> and allow the app running the server to control iTerm.</li>
|
|
83
|
+
<li>After granting permission, fully quit and reopen that app, restart the server, refresh this page, and try <strong>Test in 3s</strong> again.</li>
|
|
84
|
+
</ol>
|
|
85
|
+
|
|
68
86
|
<div class="callout">
|
|
69
87
|
<strong>If you see error 1002:</strong>
|
|
70
|
-
macOS has not allowed the server app to control keystrokes yet. Grant Accessibility permission to the app running this server, not to Chrome. Then fully quit and reopen that app, restart <code>npm start</code>
|
|
88
|
+
macOS has not allowed the server app to control keystrokes yet. Grant Accessibility permission to the terminal app running this server, not to Chrome. Then fully quit and reopen that app, restart the server (<code>npx @1presence/speech</code> or <code>npm start</code>), refresh this page, and try <strong>Test in 3s</strong> again.
|
|
71
89
|
</div>
|
|
72
90
|
|
|
73
91
|
<div class="callout">
|
|
@@ -87,6 +105,7 @@
|
|
|
87
105
|
|
|
88
106
|
<script>
|
|
89
107
|
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
|
|
108
|
+
const browserBannerEl = document.getElementById("browserBanner");
|
|
90
109
|
const statusEl = document.getElementById("status");
|
|
91
110
|
const transcriptEl = document.getElementById("transcript");
|
|
92
111
|
const startButton = document.getElementById("start");
|
|
@@ -103,6 +122,7 @@
|
|
|
103
122
|
let finalText = "";
|
|
104
123
|
let listening = false;
|
|
105
124
|
let sendQueue = Promise.resolve();
|
|
125
|
+
let streamNeedsSpace = false;
|
|
106
126
|
const resultState = new Map();
|
|
107
127
|
let hasFinalSpeech = false;
|
|
108
128
|
let lastFinalAt = 0;
|
|
@@ -132,19 +152,22 @@
|
|
|
132
152
|
}
|
|
133
153
|
|
|
134
154
|
async function sendToTerminal(text, pressEnter) {
|
|
155
|
+
const payload = formatTextForAppSend(text);
|
|
156
|
+
if (!payload) return;
|
|
157
|
+
|
|
135
158
|
const response = await fetch("/paste", {
|
|
136
159
|
method: "POST",
|
|
137
160
|
headers: { "content-type": "application/json" },
|
|
138
161
|
body: JSON.stringify({
|
|
139
|
-
text,
|
|
162
|
+
text: payload,
|
|
140
163
|
pressEnter,
|
|
141
164
|
destination: destinationEl.value,
|
|
142
165
|
}),
|
|
143
166
|
});
|
|
144
|
-
const
|
|
167
|
+
const result = await response.json();
|
|
145
168
|
|
|
146
|
-
if (!
|
|
147
|
-
throw new Error(
|
|
169
|
+
if (!result.ok) {
|
|
170
|
+
throw new Error(result.error || "Paste failed.");
|
|
148
171
|
}
|
|
149
172
|
}
|
|
150
173
|
|
|
@@ -167,6 +190,15 @@
|
|
|
167
190
|
return text.replace(/\s+/g, " ").trim();
|
|
168
191
|
}
|
|
169
192
|
|
|
193
|
+
// Display keeps paragraph breaks; app/terminal delivery must be one line so
|
|
194
|
+
// paste guards (iTerm) do not treat newlines as a multi-line paste.
|
|
195
|
+
// Do not trim — live chunks often start with a leading space from getStreamSeparator.
|
|
196
|
+
function formatTextForAppSend(text) {
|
|
197
|
+
return String(text || "")
|
|
198
|
+
.replace(/\n+/g, " ")
|
|
199
|
+
.replace(/[^\S\n]+/g, " ");
|
|
200
|
+
}
|
|
201
|
+
|
|
170
202
|
function capitalizeFirstWord(text) {
|
|
171
203
|
return text.replace(/[A-Za-z]/, (letter) => letter.toUpperCase());
|
|
172
204
|
}
|
|
@@ -246,10 +278,17 @@
|
|
|
246
278
|
state.displaySeparatorApplied = true;
|
|
247
279
|
}
|
|
248
280
|
|
|
249
|
-
function
|
|
250
|
-
if (!
|
|
251
|
-
if (
|
|
252
|
-
|
|
281
|
+
function getStreamSeparator(state) {
|
|
282
|
+
if (!streamNeedsSpace) return "";
|
|
283
|
+
if (
|
|
284
|
+
hasFinalSpeech &&
|
|
285
|
+
state.pauseType === "long" &&
|
|
286
|
+
!state.appPrefixSent &&
|
|
287
|
+
!endsWithSentencePunctuation(finalText)
|
|
288
|
+
) {
|
|
289
|
+
return ". ";
|
|
290
|
+
}
|
|
291
|
+
return " ";
|
|
253
292
|
}
|
|
254
293
|
|
|
255
294
|
// Normalise a word for comparison: lowercase + strip leading/trailing
|
|
@@ -305,9 +344,10 @@
|
|
|
305
344
|
const delta = getSpeechDelta(state.sentText, normalizedCandidate, isFinal);
|
|
306
345
|
|
|
307
346
|
if (delta) {
|
|
308
|
-
const
|
|
309
|
-
state.appPrefixSent = true;
|
|
310
|
-
queueAutoSend(`${
|
|
347
|
+
const separator = getStreamSeparator(state);
|
|
348
|
+
if (state.pauseType === "long") state.appPrefixSent = true;
|
|
349
|
+
queueAutoSend(`${separator}${delta}`);
|
|
350
|
+
streamNeedsSpace = true;
|
|
311
351
|
}
|
|
312
352
|
|
|
313
353
|
state.sentText = normalizedCandidate;
|
|
@@ -390,6 +430,7 @@
|
|
|
390
430
|
finalText = transcriptEl.value ? transcriptEl.value.trim() : "";
|
|
391
431
|
hasFinalSpeech = Boolean(finalText);
|
|
392
432
|
lastFinalAt = hasFinalSpeech ? Date.now() : 0;
|
|
433
|
+
streamNeedsSpace = hasFinalSpeech;
|
|
393
434
|
try {
|
|
394
435
|
recognition.start();
|
|
395
436
|
} catch (error) {
|
|
@@ -411,6 +452,7 @@
|
|
|
411
452
|
resultState.clear();
|
|
412
453
|
hasFinalSpeech = false;
|
|
413
454
|
lastFinalAt = 0;
|
|
455
|
+
streamNeedsSpace = false;
|
|
414
456
|
transcriptEl.value = "";
|
|
415
457
|
transcriptEl.focus();
|
|
416
458
|
});
|
|
@@ -470,13 +512,14 @@
|
|
|
470
512
|
});
|
|
471
513
|
|
|
472
514
|
if (!SpeechRecognition) {
|
|
515
|
+
if (browserBannerEl) browserBannerEl.hidden = false;
|
|
473
516
|
setStatus("Unsupported", "error");
|
|
474
517
|
startButton.disabled = true;
|
|
475
518
|
}
|
|
476
519
|
|
|
477
520
|
setListening(false);
|
|
478
521
|
window.addEventListener("load", () => {
|
|
479
|
-
startRecognition();
|
|
522
|
+
if (SpeechRecognition) startRecognition();
|
|
480
523
|
});
|
|
481
524
|
</script>
|
|
482
525
|
</body>
|
package/public/styles.css
CHANGED
|
@@ -119,6 +119,27 @@ textarea {
|
|
|
119
119
|
box-shadow: var(--shadow-lg);
|
|
120
120
|
}
|
|
121
121
|
|
|
122
|
+
.browser-banner {
|
|
123
|
+
margin-bottom: var(--space-5);
|
|
124
|
+
border: 1px solid color-mix(in oklch, var(--red), var(--border) 45%);
|
|
125
|
+
border-radius: var(--radius-md);
|
|
126
|
+
padding: var(--space-4);
|
|
127
|
+
background: var(--red-soft);
|
|
128
|
+
color: var(--red);
|
|
129
|
+
font-family: var(--font-body);
|
|
130
|
+
font-size: 1rem;
|
|
131
|
+
line-height: 1.52;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
.browser-banner strong {
|
|
135
|
+
display: block;
|
|
136
|
+
margin-bottom: var(--space-1);
|
|
137
|
+
color: var(--red);
|
|
138
|
+
font-family: var(--font-ui);
|
|
139
|
+
font-size: 0.9375rem;
|
|
140
|
+
font-weight: 500;
|
|
141
|
+
}
|
|
142
|
+
|
|
122
143
|
.topbar {
|
|
123
144
|
display: flex;
|
|
124
145
|
align-items: flex-start;
|
|
@@ -323,6 +344,12 @@ textarea:focus {
|
|
|
323
344
|
padding-top: var(--space-6);
|
|
324
345
|
}
|
|
325
346
|
|
|
347
|
+
.setup-compact {
|
|
348
|
+
margin-top: var(--space-5);
|
|
349
|
+
border-top: none;
|
|
350
|
+
padding-top: 0;
|
|
351
|
+
}
|
|
352
|
+
|
|
326
353
|
.setup h2 {
|
|
327
354
|
margin: 0;
|
|
328
355
|
font-family: var(--font-display);
|
|
@@ -331,6 +358,32 @@ textarea:focus {
|
|
|
331
358
|
line-height: 1.1;
|
|
332
359
|
}
|
|
333
360
|
|
|
361
|
+
.setup h3 {
|
|
362
|
+
margin: var(--space-6) 0 0;
|
|
363
|
+
font-family: var(--font-display);
|
|
364
|
+
font-size: 1.35rem;
|
|
365
|
+
font-weight: 400;
|
|
366
|
+
line-height: 1.15;
|
|
367
|
+
color: var(--text-primary);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
.setup > h3:first-child {
|
|
371
|
+
margin-top: 0;
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
.setup-lede {
|
|
375
|
+
max-width: 76ch;
|
|
376
|
+
margin-top: var(--space-3);
|
|
377
|
+
color: var(--text-secondary);
|
|
378
|
+
font-family: var(--font-body);
|
|
379
|
+
font-size: 1.06rem;
|
|
380
|
+
line-height: 1.52;
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
.setup-quick {
|
|
384
|
+
margin-top: var(--space-4);
|
|
385
|
+
}
|
|
386
|
+
|
|
334
387
|
.setup ol {
|
|
335
388
|
max-width: 76ch;
|
|
336
389
|
margin: var(--space-4) 0 0;
|
package/server.mjs
CHANGED
|
@@ -74,11 +74,22 @@ end run
|
|
|
74
74
|
}
|
|
75
75
|
|
|
76
76
|
function typeIntoFocusedApp(text, pressEnter) {
|
|
77
|
+
// System Events often drops literal spaces inside keystroke strings from Node.
|
|
78
|
+
// Type each character and use keystroke space for space characters.
|
|
77
79
|
const script = `
|
|
78
80
|
on run argv
|
|
81
|
+
set inputText to item 1 of argv
|
|
82
|
+
set pressEnter to item 2 of argv
|
|
79
83
|
tell application "System Events"
|
|
80
|
-
|
|
81
|
-
|
|
84
|
+
repeat with i from 1 to count of characters of inputText
|
|
85
|
+
set ch to character i of inputText
|
|
86
|
+
if ch is " " then
|
|
87
|
+
keystroke space
|
|
88
|
+
else
|
|
89
|
+
keystroke ch
|
|
90
|
+
end if
|
|
91
|
+
end repeat
|
|
92
|
+
if pressEnter is "true" then
|
|
82
93
|
key code 36
|
|
83
94
|
end if
|
|
84
95
|
end tell
|