npm - bloby-bot - Versions diffs - 0.49.6 → 0.50.1 - Mend

bloby-bot 0.49.6 → 0.50.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/worker/index.ts +113 -0
package/workspace/skills/plaud/SKILL.md +325 -0
package/workspace/skills/plaud/skill.json +15 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "bloby-bot",
-  "version": "0.49.6",
+  "version": "0.50.1",
   "releaseNotes": [
     "1. Something great..",
     "2. ",

package/worker/index.ts CHANGED Viewed

@@ -1002,6 +1002,119 @@ app.post('/api/whisper/transcribe', express.json({ limit: '10mb' }), async (req,
   }
 });
+// POST /api/whisper/transcribe-file: transcribe an audio file already on disk under workspace/files/ by uploading it to OpenAI's transcription endpoint.
+// Body: { path, saveTranscriptNext?, language? }. `path` is interpreted relative to workspace/files/ (leading "/" and a leading "files/" prefix are tolerated).
+// Responds { transcript, transcriptPath? }; 400 = Whisper disabled / key missing / bad path, 404 = no such file, 502 = upstream API error, 500 = anything thrown.
+app.post('/api/whisper/transcribe-file', express.json({ limit: '1mb' }), async (req, res) => {
+  const whisperEnabled = getSetting('whisper_enabled');
+  const whisperKey = getSetting('whisper_key');
+  if (whisperEnabled !== 'true' || !whisperKey) { // the enabled flag is compared against the string 'true', not a boolean
+    res.status(400).json({ error: 'Whisper not enabled or API key missing' });
+    return;
+  }
+  const { path: relPath, saveTranscriptNext, language } = req.body as { // NOTE(review): cast only; fields are validated individually below
+    path?: string;
+    saveTranscriptNext?: boolean;
+    language?: string;
+  };
+  if (!relPath || typeof relPath !== 'string') {
+    res.status(400).json({ error: 'Missing path' });
+    return;
+  }
+  const normalized = relPath.replace(/^\/+/, '').replace(/^files\//, ''); // drop leading slashes, then one leading "files/" prefix
+  const absPath = path.resolve(paths.files, normalized); // collapses any ".." segments before the containment check
+  if (absPath !== paths.files && !absPath.startsWith(paths.files + path.sep)) { // NOTE(review): lexical check only; a symlink inside paths.files can still point outside it
+    res.status(400).json({ error: 'Path escapes workspace/files/' });
+    return;
+  }
+  if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) { // also rejects paths.files itself and any other directory
+    res.status(404).json({ error: 'File not found' });
+    return;
+  }
+  try {
+    const audioBuffer = fs.readFileSync(absPath); // NOTE(review): whole file is read synchronously into memory; no size cap is applied here
+    const filename = path.basename(absPath);
+    const ext = path.extname(filename).toLowerCase().slice(1); // extension without the dot, lower-cased, used as the lookup key below
+    const contentTypes: Record<string, string> = {
+      mp3: 'audio/mpeg',
+      m4a: 'audio/mp4',
+      mp4: 'audio/mp4',
+      wav: 'audio/wav',
+      webm: 'audio/webm',
+      ogg: 'audio/ogg',
+      opus: 'audio/ogg',
+      flac: 'audio/flac',
+    };
+    const contentType = contentTypes[ext] || 'application/octet-stream'; // unknown extensions fall back to a generic binary type
+    const boundary = '----WhisperBoundary' + Date.now(); // multipart boundary, varied per request by timestamp
+    const CRLF = '\r\n';
+    const parts: Buffer[] = []; // multipart/form-data body is assembled by hand as a list of buffers
+    parts.push(Buffer.from( // "file" part header. NOTE(review): filename is interpolated unescaped; a quote or CR/LF in the name would corrupt the header
+      `--${boundary}${CRLF}` +
+      `Content-Disposition: form-data; name="file"; filename="${filename}"${CRLF}` +
+      `Content-Type: ${contentType}${CRLF}${CRLF}`
+    ));
+    parts.push(audioBuffer);
+    parts.push(Buffer.from(CRLF));
+    parts.push(Buffer.from( // "model" part: always whisper-1
+      `--${boundary}${CRLF}` +
+      `Content-Disposition: form-data; name="model"${CRLF}${CRLF}` +
+      `whisper-1${CRLF}`
+    ));
+    if (language && typeof language === 'string') { // optional "language" part. NOTE(review): value comes from the request body and is inserted unescaped; CR/LF in it could inject extra parts
+      parts.push(Buffer.from(
+        `--${boundary}${CRLF}` +
+        `Content-Disposition: form-data; name="language"${CRLF}${CRLF}` +
+        `${language}${CRLF}`
+      ));
+    }
+    parts.push(Buffer.from(`--${boundary}--${CRLF}`)); // closing boundary
+    const body = Buffer.concat(parts);
+    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+      method: 'POST',
+      headers: {
+        'Authorization': `Bearer ${whisperKey}`,
+        'Content-Type': `multipart/form-data; boundary=${boundary}`,
+      },
+      body,
+    });
+    if (!response.ok) { // upstream failure: log the full body, return at most 500 chars of it to the caller
+      const errText = await response.text();
+      log.warn(`Whisper API error: ${response.status} ${errText}`);
+      res.status(502).json({ error: 'Whisper API error', detail: errText.slice(0, 500) });
+      return;
+    }
+    const result = await response.json() as { text: string }; // NOTE(review): response shape is asserted, not validated
+    const transcript = result.text;
+    let transcriptPath: string | undefined;
+    if (saveTranscriptNext) {
+      const txtAbs = absPath + '.txt'; // e.g. foo.mp3 -> foo.mp3.txt, in the same directory as the audio
+      fs.writeFileSync(txtAbs, transcript, 'utf8'); // overwrites any existing file at that path
+      transcriptPath = path.relative(paths.files, txtAbs).split(path.sep).join('/'); // reported relative to paths.files with forward slashes
+    }
+    res.json({ transcript, ...(transcriptPath ? { transcriptPath } : {}) }); // transcriptPath key is present only when the .txt was written
+  } catch (err: any) { // read, network, JSON-parse and write failures all end up here
+    log.warn(`Whisper transcribe-file failed: ${err.message}`);
+    res.status(500).json({ error: 'Transcription failed' });
+  }
+});
 // Serve stored files (audio, images, documents)
 app.use('/api/files', express.static(paths.files));

package/workspace/skills/plaud/SKILL.md ADDED Viewed

@@ -0,0 +1,325 @@
+# Plaud
+## What This Is
+A channel for getting **recordings off the user's Plaud Note device** and into your workspace as `(audio file, transcript)` pairs you can read and act on.
+Plaud is a tiny voice recorder (button on the case, magnet sticks to a phone). When the user records something — a meeting, a lecture, a thought on a walk — the device syncs to Plaud's cloud over Bluetooth/Wi-Fi. **You don't talk to the device.** You talk to Plaud's cloud, pull the audio, and transcribe it yourself.
+There is **no Plaud CLI, no Plaud webhook, no official Plaud API.** Plaud's mobile/web app uses an undocumented HTTP API. This skill uses the same one — same shape OpenPlaud uses (`https://github.com/openplaud/openplaud`).
+The user does not need Plaud's paid transcription subscription. We transcribe recordings ourselves through OpenAI's Whisper API, using the OpenAI key the user added during the Bloby wizard. No new key, no new subscription.
+---
+## What Bloby Gives You (already-built plumbing)
+| Thing | Where | How you use it |
+|---|---|---|
+| Whisper-on-disk endpoint | `POST http://localhost:7400/api/whisper/transcribe-file` | Send a path under `workspace/files/`, get a transcript back. Optional `saveTranscriptNext: true` writes `foo.mp3.txt` next to `foo.mp3`. Optional `language` is forwarded to Whisper as its `language` field. |
+| Settings k/v store | `GET/POST/PUT http://localhost:7400/api/settings` | Store/retrieve the Plaud JWT, region, workspace ID, last-sync cursor. |
+| Workspace files dir | `workspace/files/audio/plaud/` | Drop downloaded audio here. The supervisor serves it at `/api/files/audio/plaud/<name>`. |
+| Scheduling | `workspace/CRONS.json` or `workspace/PULSE.json` | Run sync periodically. See "Cadence" below. |
+Use `http://localhost:7400` from Bash. Auth is the same Bearer token you already have in your session for the worker; for skill-internal calls running inside the supervisor's own bloby session, `/api/settings` and `/api/whisper/transcribe-file` work the same way `/api/whisper/transcribe` does.
+---
+## Plaud's API in 60 seconds
+Three regions. Pick one when pairing. Token from one region won't work on another.
+| Region | Base URL |
+|---|---|
+| Global | `https://api.plaud.ai` |
+| EU | `https://api-euc1.plaud.ai` |
+| Asia-Pacific | `https://api-apse1.plaud.ai` |
+If the user doesn't know their region, start with Global. If `POST /auth/otp-send-code` returns `status: -302` with `data.domains.api`, redirect to that base instead — the user's account lives in a different region. Save whichever base actually succeeded.
+**User-Agent matters.** Plaud blocks some defaults. Always send a normal browser UA. Example:
+```
+User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36
+```
+---
+## Pairing (first time)
+Walk the human through it conversationally. They don't see any UI for this — just chat with you.
+### Step 1 — Ask for their Plaud email
+```
+Bloby: Which email do you use on plaud.ai? I'll have them send you a 6-digit code.
+Human: bruno@example.com
+```
+If the human mentions they signed up with **Google or Apple**, jump to the "Paste-token fallback" section instead. OTP only works for accounts that were created with an email+password identity on Plaud's side.
+### Step 2 — Send the OTP
+```bash
+curl -s -X POST 'https://api.plaud.ai/auth/otp-send-code' \
+  -H 'Content-Type: application/json' \
+  -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' \
+  -d '{"username":"<EMAIL>"}'
+```
+Expected `status: 0` and a `token` field. **Save the `token`** — you need it for verify. If you see `status: -302`, switch `apiBase` to `data.domains.api` and retry once.
+### Step 3 — Ask for the code
+```
+Bloby: Check your inbox — Plaud sent you a 6-digit code. What is it?
+```
+### Step 4 — Verify
+```bash
+curl -s -X POST '<apiBase>/auth/otp-login' \
+  -H 'Content-Type: application/json' \
+  -H 'User-Agent: Mozilla/5.0 ...' \
+  -d '{"code":"<6 DIGITS>","token":"<OTP TOKEN FROM STEP 2>"}'
+```
+Expected `access_token` (a long `eyJ...` JWT). **This is the long-lived token. Store it.**
+### Step 5 — Store the connection
+```bash
+# Each setting is saved separately. Replace <JWT>, <BASE>, and <EMAIL>.
+curl -s -X POST 'http://localhost:7400/api/settings' \
+  -H 'Content-Type: application/json' \
+  -d '{"key":"plaud_token","value":"<JWT>"}'
+curl -s -X POST 'http://localhost:7400/api/settings' \
+  -H 'Content-Type: application/json' \
+  -d '{"key":"plaud_api_base","value":"<BASE>"}'
+curl -s -X POST 'http://localhost:7400/api/settings' \
+  -H 'Content-Type: application/json' \
+  -d '{"key":"plaud_email","value":"<EMAIL>"}'
+```
+You can also save `plaud_workspace_id` once you discover it (see "Workspaces" below).
+### Step 6 — Smoke test
+```bash
+curl -s '<BASE>/device/list' \
+  -H 'Authorization: Bearer <JWT>' \
+  -H 'User-Agent: Mozilla/5.0 ...'
+```
+Should return a JSON object listing the user's Plaud devices (each has a `serial_number`). If you get `401`, the OTP didn't grant a usable token — start over. If `200`, tell the human: *"Paired. Your Plaud (serial ending ...XXXX) is connected. Want me to pull in everything you've recorded so far?"*
+---
+## Paste-token fallback (Google/Apple Plaud accounts)
+If OTP just won't work and the human signed up with Google or Apple, get the bearer manually:
+1. Open [web.plaud.ai](https://web.plaud.ai) in a browser and sign in with Google/Apple normally.
+2. Open DevTools (F12 or Cmd+Option+I) → Network tab → refresh.
+3. Click any request to `api.plaud.ai`, `api-euc1.plaud.ai`, or `api-apse1.plaud.ai`.
+4. Under **Request Headers**, find `Authorization`. Copy everything after `Bearer ` (the long `eyJ...`).
+5. Tell the bloby in chat. The bloby saves it via the same `plaud_token` setting key, plus the matching `plaud_api_base`.
+JWTs from this path expire too. The skill behaviour on 401 is the same (see "Re-auth" below).
+---
+## Syncing recordings
+The shape of a sync run:
+```
+GET /file/simple/web      → list recent recordings (paginated)
+for each new one:
+  GET /file/temp-url/<id>?is_opus=0   → get a short-lived S3 link (request the mp3, not opus)
+  curl -o workspace/files/audio/plaud/<id>.mp3  → download
+  POST /api/whisper/transcribe-file  → produces <id>.mp3.txt alongside
+```
+### List recordings
+```bash
+curl -s '<BASE>/file/simple/web?skip=0&limit=50&is_trash=0&sort_by=edit_time&is_desc=true' \
+  -H 'Authorization: Bearer <JWT>' \
+  -H 'User-Agent: Mozilla/5.0 ...'
+```
+The response has `data_file_list` — an array of recording objects. Fields you'll care about:
+| Field | Use |
+|---|---|
+| `id` | Plaud's file id. Use as the local filename. |
+| `filename` | Human label the user gave it (or auto-generated). Sanitise before using as a filename. |
+| `duration` | Seconds. |
+| `start_time` / `end_time` | When the recording happened. |
+| `version_ms` | Bumps if the user edits the recording. Track this to know when to re-download. |
+| `serial_number` | Which Plaud device. |
+| `is_trash` | Skip if 1. |
+Page with `skip=` (the API also accepts a huge `limit`, but page through 50-at-a-time politely).
+### Dedup
+You don't want to re-download what you already have. Two ways, pick one:
+- **Filesystem**: if `workspace/files/audio/plaud/<id>.mp3` exists, skip it.
+- **Cursor**: save the newest `version_ms` you've seen as `plaud_last_sync` setting. On next sync, skip anything `<=` that cursor. Faster — no `ls` needed.
+If `version_ms` changed on a recording you already downloaded, the user edited the filename or trimmed it. Re-fetch and overwrite.
+### Get the download URL
+```bash
+curl -s '<BASE>/file/temp-url/<FILE_ID>?is_opus=0' \
+  -H 'Authorization: Bearer <JWT>' \
+  -H 'User-Agent: Mozilla/5.0 ...'
+```
+`is_opus=0` returns the mp3 variant (`temp_url`). `is_opus=1` returns opus in `temp_url_opus`. **Use mp3** — Whisper handles it cleanly, opus needs ffmpeg.
+Response: `{ "temp_url": "https://<s3...>" }`. The URL expires in a few minutes. Download immediately.
+### Download
+```bash
+mkdir -p workspace/files/audio/plaud
+curl -s -o "workspace/files/audio/plaud/<FILE_ID>.mp3" '<TEMP URL>'
+```
+### Transcribe
+```bash
+curl -s -X POST 'http://localhost:7400/api/whisper/transcribe-file' \
+  -H 'Content-Type: application/json' \
+  -d '{"path":"audio/plaud/<FILE_ID>.mp3","saveTranscriptNext":true}'
+```
+Returns `{ "transcript": "...", "transcriptPath": "audio/plaud/<FILE_ID>.mp3.txt" }`. The `.txt` file is now sitting next to the audio. You can read it with `Read` like any other file.
+The user's `whisper_key` from the wizard is what powers this — you don't need to know or handle the OpenAI key.
+If transcription fails (e.g. file >25MB, Whisper API's own hard limit), leave the audio in place and skip the `.txt`. The human can ask you to split/compress later.
+### Pretty filenames (optional)
+Tell the human you can keep just the raw `<FILE_ID>.mp3` filenames, OR you can also add human-readable aliases next to them (the originals are never renamed). If they want pretty names:
+```bash
+# After successful transcribe, also write a symlink or copy with a nicer name:
+NICE="$(date -d "<start_time>" +%Y-%m-%d_%H%M)_<sanitised filename>"
+ln -s "<FILE_ID>.mp3" "workspace/files/audio/plaud/${NICE}.mp3"
+ln -s "<FILE_ID>.mp3.txt" "workspace/files/audio/plaud/${NICE}.txt"
+```
+(Sanitise `filename` by stripping `/\\:*?"<>|`.)
+Don't rename the originals — keep `<id>.mp3` as the canonical name so dedup keeps working.
+---
+## Cadence — CRON or PULSE?
+**There is no automatic cron set up by this skill.** You and your human decide together. Two reasonable patterns:
+### Pattern A — CRON every N minutes
+When the human wants near-real-time freshness ("any time I record something, you should know about it within 15 minutes"), add an entry to `workspace/CRONS.json`:
+```json
+{
+  "id": "plaud-sync",
+  "schedule": "*/15 * * * *",
+  "task": "Run a Plaud sync: list new recordings, download any new ones into workspace/files/audio/plaud/, and transcribe them via /api/whisper/transcribe-file. After, summarise to the human in chat IF there were new recordings — otherwise stay silent.",
+  "enabled": true,
+  "oneShot": false
+}
+```
+Tune `*/15` to taste. `*/5` for aggressive, `0 * * * *` (top of every hour) for quiet.
+### Pattern B — PULSE memo
+When the human prefers their bloby just *check* during normal pulse wake-ups, add one line to your `MYSELF.md` or `MEMORY.md`:
+```
+- Each pulse, briefly check Plaud for new recordings via the plaud skill. If there's something new, transcribe and decide whether to surface it. If nothing new, move on silently.
+```
+Pulse runs every 30 min by default. No CRON entry needed. Less aggressive than Pattern A, fits naturally with whatever else you're doing at pulse time.
+### Or: don't auto-sync at all
+Some humans only want manual control: *"Bloby, pull anything new from Plaud."* That's also fine — just keep the skill installed, no CRON, no pulse memo, you sync when asked.
+**Always check with the human first.** Default to Pattern B for new installs unless they tell you otherwise.
+---
+## Re-auth (401 handling)
+When any Plaud call returns 401:
+1. Tell the human in chat: *"Your Plaud connection expired. Want me to re-pair?"* Don't silently fail.
+2. If they say yes, re-run the OTP flow from Step 1. Overwrite the `plaud_token` setting.
+3. If they signed up with Google/Apple originally, prompt for the paste-token fallback instead.
+4. Don't keep retrying with the dead token — pause the sync until re-paired.
+---
+## Disconnect
+```bash
+curl -s -X POST 'http://localhost:7400/api/settings' \
+  -H 'Content-Type: application/json' \
+  -d '{"key":"plaud_token","value":""}'
+curl -s -X POST 'http://localhost:7400/api/settings' \
+  -H 'Content-Type: application/json' \
+  -d '{"key":"plaud_api_base","value":""}'
+```
+Recordings already on disk stay. The user can also disable the CRON entry (or remove it from `CRONS.json`).
+---
+## Workspaces (advanced)
+Plaud's "workspace" is their multi-account team feature. Personal accounts don't usually need to worry about this — the API responds correctly without a workspace token. If a human ever reports recordings missing that they can see in the Plaud app, it's likely a workspace-scoped recording.
+To resolve a workspace token: there's an undocumented `/workspace/...` endpoint that mints a workspace-scoped token. OpenPlaud's `src/lib/plaud/workspace.ts` is the reference if you ever need it. Don't bother unless the human hits this case.
+---
+## What This Skill Does NOT Do
+- **No Plaud transcription.** We transcribe ourselves with Whisper. Plaud's own AI subscription is bypassed entirely.
+- **No dashboard.** OpenPlaud has a slick UI for browsing recordings. We don't. The bloby's job is to *read* the transcripts and act on them — summaries, action items, emails — using the normal workspace tools. If the human wants a UI, build one into `workspace/client/` as a normal workspace app.
+- **No push from Plaud.** No webhooks exist. You only know about new recordings when you ask.
+- **No editing recordings.** The Plaud API technically supports `PATCH /file/<id>` to rename. We don't expose it here — keep canonical `<id>.mp3` names.
+- **No real-time streaming.** Plaud syncs to its cloud *after* the recording finishes. Expect a lag of seconds-to-minutes between "user stopped recording" and "file appears in `/file/simple/web`."
+---
+## Quick Reference
+| Action | curl |
+|---|---|
+| Send OTP | `POST <base>/auth/otp-send-code` body `{username}` |
+| Verify OTP | `POST <base>/auth/otp-login` body `{code, token}` |
+| List devices | `GET <base>/device/list` |
+| List recordings | `GET <base>/file/simple/web?skip=0&limit=50&is_trash=0&sort_by=edit_time&is_desc=true` |
+| Get download URL | `GET <base>/file/temp-url/<id>?is_opus=0` |
+| Transcribe local file | `POST http://localhost:7400/api/whisper/transcribe-file` body `{path, saveTranscriptNext?, language?}` |
+| Save setting | `POST http://localhost:7400/api/settings` body `{key, value}` |
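+All Plaud requests need a browser-style `User-Agent`. Every Plaud request except the two `/auth/otp-*` calls (which run before you have a token) also needs `Authorization: Bearer <JWT>`.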
+---
+## Credit
+Plaud API shape is the same one [OpenPlaud](https://github.com/openplaud/openplaud) uses — they did the reverse-engineering work. This skill reimplements just the parts a bloby needs.

package/workspace/skills/plaud/skill.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "name": "plaud",
+  "version": "1.0.0",
+  "type": "skill",
+  "bloby_human": "Bruno Bertapeli",
+  "bloby": "bloby-bruno",
+  "author": "newbot-official",
+  "description": "Plaud Note integration. Pairs the user's Plaud account via email OTP, polls Plaud's cloud for new recordings, downloads the audio into workspace/files/audio/plaud/, and transcribes it via the user's Whisper key. Cadence (CRON vs PULSE memo) is chosen by the human and their bloby together.",
+  "depends": [],
+  "env_keys": [],
+  "has_telemetry": false,
+  "size": "8KB",
+  "contains_binaries": false,
+  "tags": ["plaud", "transcription", "audio", "recorder", "meeting"]
+}