npm - @crafter/trx - Versions diffs - 0.1.0 - Mend

@crafter/trx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/bin/trx.ts +35 -0
package/biome.json +6 -0
package/package.json +42 -0
package/schemas/init.json +45 -0
package/schemas/transcribe.json +77 -0
package/skills/trx/SKILL.md +108 -0
package/skills/trx/references/whisper-fixes.md +88 -0
package/src/commands/doctor.ts +90 -0
package/src/commands/init.ts +171 -0
package/src/commands/schema.ts +24 -0
package/src/commands/transcribe.ts +131 -0
package/src/core/audio.ts +28 -0
package/src/core/download.ts +37 -0
package/src/core/pipeline.ts +71 -0
package/src/core/whisper.ts +67 -0
package/src/utils/config.ts +72 -0
package/src/utils/output.ts +49 -0
package/src/utils/spawn.ts +27 -0
package/src/validation/input.ts +189 -0
package/tests/e2e.test.ts +373 -0
package/tests/fixtures/silence.txt +2 -0
package/tests/fixtures/silence.wav +0 -0
package/tests/fixtures/silence.wav.srt +4 -0
package/tsconfig.json +19 -0

package/bin/trx.ts ADDED Viewed

@@ -0,0 +1,35 @@
+#!/usr/bin/env bun
+import { Command } from "commander";
+import { createDoctorCommand } from "../src/commands/doctor.ts";
+import { createInitCommand } from "../src/commands/init.ts";
+import { createSchemaCommand } from "../src/commands/schema.ts";
+import { createTranscribeCommand } from "../src/commands/transcribe.ts";
+// Root command. The global --output option is declared here and resolved in the preAction hook below.
+const program = new Command();
+program
+	.name("trx")
+	.description("Agent-first CLI for audio/video transcription via Whisper")
+	.version("0.1.0")
+	.option("-o, --output <format>", "output format (json, table, auto)", "auto")
+	.hook("preAction", (thisCommand) => {
+		// Resolve "auto" before any action runs: "table" when stdout is a terminal, "json" otherwise.
+		const opts = thisCommand.opts();
+		if (opts.output === "auto") {
+			opts.output = process.stdout.isTTY ? "table" : "json";
+		}
+	});
+program.addCommand(createInitCommand());
+program.addCommand(createTranscribeCommand());
+program.addCommand(createDoctorCommand());
+program.addCommand(createSchemaCommand());
+// Shorthand: when the first argument is neither a flag nor a known subcommand, treat it as the input of
+// `transcribe`, so `trx <input>` behaves like `trx transcribe <input>`.
+const args = process.argv.slice(2);
+const subcommands = ["init", "transcribe", "doctor", "schema", "help", "--help", "-h", "--version", "-V"];
+const firstArg = args[0];
+if (firstArg && !firstArg.startsWith("-") && !subcommands.includes(firstArg)) {
+	process.argv.splice(2, 0, "transcribe");
+}
+// The subcommand action handlers are async, so parse with parseAsync and await it; that way the process waits
+// for the action to settle and a rejection surfaces here instead of as an unhandled promise rejection.
+await program.parseAsync();

package/biome.json ADDED Viewed

@@ -0,0 +1,6 @@
+{
+	"$schema": "https://biomejs.dev/schemas/2.4.10/schema.json",
+	"linter": { "enabled": true, "rules": { "recommended": true } },
+	"formatter": { "enabled": true, "indentStyle": "tab", "lineWidth": 120 },
+	"assist": { "actions": { "source": { "organizeImports": { "level": "on" } } } }
+}

package/package.json ADDED Viewed

@@ -0,0 +1,42 @@
+{
+	"name": "@crafter/trx",
+	"version": "0.1.0",
+	"description": "Agent-first CLI for audio/video transcription via Whisper",
+	"module": "bin/trx.ts",
+	"type": "module",
+	"license": "MIT",
+	"homepage": "https://github.com/crafter-station/trx",
+	"repository": {
+		"type": "git",
+		"url": "https://github.com/crafter-station/trx"
+	},
+	"keywords": [
+		"transcription",
+		"whisper",
+		"stt",
+		"speech-to-text",
+		"cli",
+		"agent",
+		"subtitles",
+		"srt",
+		"audio",
+		"video",
+		"yt-dlp",
+		"ffmpeg"
+	],
+	"bin": {
+		"trx": "bin/trx.ts"
+	},
+	"scripts": {
+		"dev": "bun run bin/trx.ts",
+		"test": "bun test",
+		"check": "biome check --write ."
+	},
+	"dependencies": {
+		"@clack/prompts": "^1.1.0",
+		"commander": "^14.0.3"
+	},
+	"devDependencies": {
+		"@types/bun": "latest"
+	}
+}

package/schemas/init.json ADDED Viewed

@@ -0,0 +1,45 @@
+{
+	"command": "init",
+	"description": "Install dependencies (whisper-cli, yt-dlp, ffmpeg) and download Whisper model",
+	"flags": {
+		"--model": {
+			"type": "string",
+			"enum": ["tiny", "base", "small", "medium", "large"],
+			"default": "small",
+			"description": "Whisper model size to download"
+		},
+		"--language": {
+			"type": "string",
+			"default": "auto",
+			"description": "Default language for transcription (ISO 639-1 code or 'auto')"
+		},
+		"--output": {
+			"type": "string",
+			"enum": ["json", "table", "auto"],
+			"default": "auto",
+			"description": "Output format"
+		}
+	},
+	"dependencies": {
+		"whisper-cli": {
+			"install": "brew install whisper-cpp",
+			"purpose": "Local speech-to-text transcription engine"
+		},
+		"yt-dlp": {
+			"install": "brew install yt-dlp",
+			"purpose": "Download video/audio from URLs (YouTube, Twitter, etc.)"
+		},
+		"ffmpeg": {
+			"install": "brew install ffmpeg",
+			"purpose": "Audio cleaning, conversion, silence removal, noise reduction"
+		}
+	},
+	"output": {
+		"success": "boolean",
+		"model": "string",
+		"language": "string",
+		"modelPath": "string",
+		"config": "TrxConfig object"
+	},
+	"examples": ["trx init", "trx init --model small --language es", "trx init --model large --output json"]
+}

package/schemas/transcribe.json ADDED Viewed

@@ -0,0 +1,77 @@
+{
+	"command": "transcribe",
+	"description": "Transcribe audio/video from URL or local file using Whisper",
+	"arguments": {
+		"input": {
+			"type": "string",
+			"required": true,
+			"description": "URL (https://...) or local file path (.mp4, .m4a, .ogg, .wav, .webm, .mkv, .avi, .mov, .flac, .mp3)"
+		}
+	},
+	"flags": {
+		"--language": {
+			"type": "string",
+			"default": "auto",
+			"description": "ISO 639-1 language code or 'auto' for auto-detection"
+		},
+		"--model": {
+			"type": "string",
+			"enum": ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"],
+			"description": "Override whisper model size"
+		},
+		"--output": {
+			"type": "string",
+			"enum": ["json", "table", "auto"],
+			"default": "auto",
+			"description": "Output format. 'auto' uses 'table' for TTY, 'json' when piped"
+		},
+		"--fields": {
+			"type": "string",
+			"description": "Comma-separated fields to include: text, srt, metadata, files"
+		},
+		"--dry-run": {
+			"type": "boolean",
+			"default": false,
+			"description": "Validate input and show execution plan without transcribing"
+		},
+		"--json": {
+			"type": "string",
+			"description": "Raw JSON payload: {\"input\": \"...\", \"language\": \"...\", \"model\": \"...\"}"
+		},
+		"--output-dir": {
+			"type": "string",
+			"default": ".",
+			"description": "Directory for output files"
+		},
+		"--no-download": {
+			"type": "boolean",
+			"default": false,
+			"description": "Skip yt-dlp download step (input must be local file)"
+		},
+		"--no-clean": {
+			"type": "boolean",
+			"default": false,
+			"description": "Skip ffmpeg audio cleaning/normalization"
+		}
+	},
+	"output": {
+		"success": "boolean",
+		"input": "string",
+		"files": {
+			"wav": "string (path)",
+			"srt": "string (path)",
+			"txt": "string (path)"
+		},
+		"metadata": {
+			"language": "string",
+			"model": "string"
+		},
+		"text": "string (full transcript)"
+	},
+	"examples": [
+		"trx transcribe recording.mp4 --output json",
+		"trx transcribe https://youtube.com/watch?v=abc --language es --output json",
+		"trx transcribe video.mp4 --fields text --output json",
+		"trx transcribe video.mp4 --dry-run --output json"
+	]
+}

package/skills/trx/SKILL.md ADDED Viewed

@@ -0,0 +1,108 @@
+---
+name: trx
+description: |
+  Transcribe audio/video using trx CLI and post-process results with agent corrections.
+  Use when: (1) user wants to transcribe a video or audio file, (2) user shares a
+  YouTube/Twitter/Instagram URL for transcription, (3) user says "transcribe",
+  "subtitles", "srt", "transcript", (4) user wants to fix/clean up a whisper
+  transcription, (5) user asks to extract text from a video.
+metadata:
+  author: Railly Hugo
+  version: "0.1.0"
+---
+# trx -- Agent-First Transcription CLI
+Install the agent skill: `npx skills add crafter-station/trx -g`
+## Prerequisites
+Check setup: `trx doctor --output json`. If dependencies missing, run `trx init`.
+Install the CLI: `bun add -g @crafter/trx`
+## Workflow
+### 1. Dry-run first (always)
+```bash
+trx transcribe <input> --dry-run --output json
+```
+Validates input, checks dependencies, shows execution plan without running.
+### 2. Transcribe
+For URLs (YouTube, Twitter, Instagram, etc.):
+```bash
+trx transcribe "https://youtube.com/watch?v=..." --output json
+```
+For local files:
+```bash
+trx transcribe ./recording.mp4 --output json
+```
+Agent-optimized (text only, saves tokens):
+```bash
+trx transcribe <input> --fields text --output json
+```
+### 3. Post-process (fix whisper mistakes)
+After transcription, read the `.txt` output and apply corrections. Read [whisper-fixes.md](references/whisper-fixes.md) for common patterns.
+**Correction checklist:**
+1. **Punctuation**: Whisper drops periods at paragraph boundaries and misplaces commas. Fix sentence boundaries.
+2. **Accents** (Spanish): Whisper often drops diacritics. Restore: como -> como/cómo, esta -> esta/está, mas -> mas/más.
+3. **Technical terms**: Whisper misspells domain-specific words. Ask user for a glossary or infer from context.
+4. **Repeated phrases**: Whisper sometimes stutters on word boundaries. Remove exact consecutive duplicates.
+5. **Speaker attribution**: If user provides speaker names, insert `[Speaker Name]:` markers.
+6. **Filler words**: Remove "um", "uh", "este", "o sea" if user wants clean output.
+7. **Timestamp alignment**: If editing `.srt`, preserve the timestamp structure. Only modify text between timestamps.
+### 4. Schema introspection
+```bash
+trx schema transcribe
+trx schema init
+```
+## Commands
+| Command | Example |
+|---------|---------|
+| `init` | `trx init --model small` |
+| `transcribe` | `trx transcribe <url-or-file> --output json` |
+| `doctor` | `trx doctor --output json` |
+| `schema` | `trx schema transcribe` |
+## Shorthand
+`trx <input>` is equivalent to `trx transcribe <input>`.
+## Output format
+- `--output json`: Machine-readable (default when piped)
+- `--output table`: Human-readable with progress (default when TTY)
+- `--fields text`: Only return transcript text (saves tokens)
+- `--fields metadata`: Only return metadata (language, model)
+- `--dry-run`: Validate without executing
+## Flags reference
+| Flag | Description | Default |
+|------|-------------|---------|
+| `--language <code>` | ISO 639-1 language code | `auto` (from config) |
+| `--model <size>` | Override model: tiny, base, small, medium, large | from config |
+| `--output-dir <dir>` | Output directory | `.` (cwd) |
+| `--no-download` | Skip yt-dlp (local files only) | false |
+| `--no-clean` | Skip ffmpeg audio cleaning | false |
+| `--json <payload>` | Raw JSON input | - |
+## Edge cases
+- **yt-dlp extension mismatch**: yt-dlp sometimes outputs `.mp4.webm` instead of `.mp4`. The CLI handles this by scanning for the downloaded file by prefix.
+- **Large files (>1hr)**: Whisper processes in segments. Works but is slow on CPU. Consider `--model tiny` for speed.
+- **No GPU**: whisper-cli uses CPU by default. Acceptable for tiny/base/small models.
+- **Auto-detect language**: When `--language auto`, Whisper detects the language from the first 30 seconds. For multilingual content, specify the primary language.

package/skills/trx/references/whisper-fixes.md ADDED Viewed

@@ -0,0 +1,88 @@
+# Common Whisper Transcription Mistakes
+Reference for post-processing agent corrections. Grouped by language and category.
+## Spanish
+### Accent marks (most common)
+Whisper frequently drops diacritics. Restore based on grammatical context:
+- "como" -> "cómo" (when meaning "how/what")
+- "esta" -> "está" (when it's a verb, not demonstrative)
+- "mas" -> "más" (when meaning "more", not "but")
+- "si" -> "sí" (when meaning "yes", not "if")
+- "el" -> "él" (when it's a pronoun, not article)
+- "que" -> "qué" (in questions: "¿Qué haces?")
+- "cuando" -> "cuándo" (in questions)
+- "numero" -> "número"
+- "tambien" -> "también"
+- "informacion" -> "información"
+### Question/exclamation marks
+Whisper almost never generates opening marks:
+- Add "¿" at the start of questions
+- Add "¡" at the start of exclamations
+### Run-on sentences
+Whisper often produces long sentences without periods. Split when:
+- Topic changes
+- Speaker changes
+- Natural pause in the audio (check SRT timestamps for gaps > 1.5s)
+### Common confusions
+- "coma" vs "como" (comma vs how)
+- "haber" vs "a ver" (to have vs let's see)
+- "echo" vs "hecho" (thrown vs done/fact)
+- "hay" vs "ahí" vs "ay" (there is vs there vs interjection)
+## English
+### Homophones
+- "their" / "there" / "they're"
+- "your" / "you're"
+- "its" / "it's"
+- "to" / "too" / "two"
+- "then" / "than"
+### Capitalization
+Whisper inconsistently capitalizes:
+- Proper nouns (names, companies, places)
+- Sentence beginnings after periods
+- Acronyms (API, CLI, AI, ML)
+### Technical terms (common misspellings)
+- "typescript" -> "TypeScript"
+- "javascript" -> "JavaScript"
+- "react" -> "React" (when referring to the framework)
+- "next js" / "nextjs" -> "Next.js"
+- "node js" -> "Node.js"
+- "github" -> "GitHub"
+- "vercel" -> "Vercel"
+- "anthropic" -> "Anthropic"
+- "openai" -> "OpenAI"
+- "kubernetes" -> "Kubernetes"
+- "docker" -> "Docker"
+### Filler words (remove if user wants clean output)
+- "um", "uh", "like", "you know", "I mean", "basically", "actually", "right"
+## Both Languages
+### Repeated words
+Whisper sometimes outputs the same word/phrase twice at segment boundaries:
+```
+the the quick brown fox
+```
+Remove exact consecutive duplicates.
+### Numbers
+Whisper alternates between spelled-out and numeric forms inconsistently:
+- Prefer numeric for: dates, times, measurements, code references
+- Prefer spelled-out for: small numbers in natural speech (one, two, three)
+### Timestamps (SRT editing rules)
+When editing .srt files:
+1. Never modify timestamp lines (lines with `-->`)
+2. Never modify sequence numbers
+3. Only edit the text content between timestamps
+4. Keep the same number of subtitle blocks
+5. Preserve blank lines between blocks

package/src/commands/doctor.ts ADDED Viewed

@@ -0,0 +1,90 @@
+import { existsSync } from "node:fs";
+import { Command } from "commander";
+import { getConfigPath, getModelsDir, readConfig } from "../utils/config.ts";
+import { type OutputFormat, output, outputError } from "../utils/output.ts";
+import { spawn } from "../utils/spawn.ts";
+interface DepStatus { // Probe result for one external command-line dependency, as produced by checkBinary.
+	installed: boolean; // true when `which` located the binary
+	version: string | null; // first line of the tool's version output; null when not installed or the version call failed
+	path: string | null; // location printed by `which`; null when not installed
+}
+/**
+ * Probe an external binary: locate it with `which`, then ask it for its version.
+ *
+ * @param name - Executable name to look up.
+ * @returns Whether the binary was found, the first line of its version output (null when the version
+ *   command exits non-zero), and the path printed by `which` (null when not found).
+ */
+async function checkBinary(name: string): Promise<DepStatus> {
+	const which = await spawn(["which", name]);
+	if (which.exitCode !== 0) {
+		return { installed: false, version: null, path: null };
+	}
+	const binPath = which.stdout.trim();
+	// ffmpeg only understands the single-dash `-version`; `--version` is rejected as an unrecognized option,
+	// which previously made its version always come back as null.
+	const versionFlag = name === "ffmpeg" ? "-version" : "--version";
+	const ver = await spawn([name, versionFlag]);
+	// Only the first line is kept; tools such as ffmpeg print build details on the following lines.
+	const version = ver.exitCode === 0 ? (ver.stdout.split("\n")[0] ?? "").trim() : null;
+	return { installed: true, version, path: binPath };
+}
+/**
+ * Build the `doctor` subcommand, which reports whether whisper-cli, yt-dlp and ffmpeg are installed and
+ * whether a config and its model file exist.
+ *
+ * With the "json" output format the full report is passed to `output`; otherwise a checklist is printed to
+ * the console and `outputError` is called when any dependency is missing.
+ *
+ * @returns The configured commander `Command` for `doctor`.
+ */
+export function createDoctorCommand(): Command {
+	const doctor = new Command("doctor");
+	doctor.description("Check dependencies and configuration status");
+	doctor.action(async (_, cmd) => {
+		const format: OutputFormat = cmd.optsWithGlobals().output;
+		// The three probes are independent, so run them concurrently.
+		const [whisper, ytdlp, ffmpeg] = await Promise.all([
+			checkBinary("whisper-cli"),
+			checkBinary("yt-dlp"),
+			checkBinary("ffmpeg"),
+		]);
+		const dependencies = { "whisper-cli": whisper, "yt-dlp": ytdlp, ffmpeg };
+		const allInstalled = Object.values(dependencies).every((dep) => dep.installed);
+
+		const config = readConfig();
+		const configPath = getConfigPath();
+		const modelsDir = getModelsDir();
+		const modelExists = config ? existsSync(config.modelPath) : false;
+		// Model and language details are only reported when a config was found.
+		const configDetails = config
+			? {
+					modelSize: config.modelSize,
+					modelPath: config.modelPath,
+					modelExists,
+					language: config.language,
+				}
+			: {};
+		const report = {
+			healthy: allInstalled && !!config && modelExists,
+			dependencies,
+			config: { exists: !!config, path: configPath, modelsDir, ...configDetails },
+		};
+
+		if (format === "json") {
+			output(format, { json: report });
+			return;
+		}
+
+		console.log("\ntrx doctor\n");
+		for (const [name, dep] of Object.entries(dependencies)) {
+			const mark = dep.installed ? "\u2713" : "\u2717";
+			const versionSuffix = dep.version ? ` (${dep.version})` : "";
+			console.log(`  ${mark} ${name}${versionSuffix}`);
+		}
+		console.log();
+		if (!config) {
+			console.log('  Config: not found. Run "trx init" to set up.');
+		} else {
+			const modelMark = modelExists ? "\u2713" : "\u2717 (not downloaded)";
+			console.log(`  Config: ${configPath}`);
+			console.log(`  Model: ${config.modelSize} ${modelMark}`);
+			console.log(`  Language: ${config.language}`);
+		}
+		console.log();
+		if (!allInstalled) {
+			outputError('Missing dependencies. Run "trx init" to install.', "table");
+		}
+	});
+	return doctor;
+}

package/src/commands/init.ts ADDED Viewed

@@ -0,0 +1,171 @@
+import { existsSync } from "node:fs";
+import * as p from "@clack/prompts";
+import { Command } from "commander";
+import { defaultConfig, ensureTrxDir, getModelsDir, writeConfig } from "../utils/config.ts";
+import { type OutputFormat, output, outputError } from "../utils/output.ts";
+import { spawn, spawnOrThrow } from "../utils/spawn.ts";
+import { validateLanguage, validateModel } from "../validation/input.ts";
+const MODELS = [ // Options shown by the interactive model picker in `init`; each label carries an approximate size.
+	{ value: "tiny", label: "tiny (~75 MB)", hint: "fastest, lowest accuracy" },
+	{ value: "base", label: "base (~142 MB)", hint: "fast, decent accuracy" },
+	{ value: "small", label: "small (~466 MB)", hint: "balanced speed/accuracy (recommended)" }, // also the default of --model
+	{ value: "medium", label: "medium (~1.5 GB)", hint: "slow, high accuracy" },
+	{ value: "large", label: "large (~3 GB)", hint: "slowest, best accuracy" },
+];
+const HF_BASE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"; // Base URL; downloadModel appends `ggml-<size>.bin` to it.
+/**
+ * Make sure an external tool is available, offering a Homebrew install when running interactively.
+ *
+ * @param name - Executable looked up with `which`.
+ * @param brewPackage - Homebrew package passed to `brew install`.
+ * @param isTTY - Whether prompting is allowed; when false a missing tool is reported as absent without asking.
+ * @returns true when the tool was already present or the install succeeded; false when it is missing and the
+ *   install was not possible, was declined, or failed.
+ */
+async function checkAndInstallDep(name: string, brewPackage: string, isTTY: boolean): Promise<boolean> {
+	const { exitCode } = await spawn(["which", name]);
+	if (exitCode === 0) return true;
+	if (!isTTY) return false;
+
+	const installCommand = `brew install ${brewPackage}`;
+	const answer = await p.confirm({
+		message: `${name} is not installed. Install via Homebrew (${installCommand})?`,
+	});
+	// A cancelled prompt is treated the same as answering "no".
+	const declined = p.isCancel(answer) || !answer;
+	if (declined) {
+		p.log.warn(`Skipped ${name}. Install manually: ${installCommand}`);
+		return false;
+	}
+
+	try {
+		p.log.step(`Installing ${brewPackage}...`);
+		await spawnOrThrow(["brew", "install", brewPackage], installCommand);
+		p.log.success(`${name} installed`);
+	} catch (e) {
+		p.log.error(`Failed to install ${brewPackage}: ${(e as Error).message}`);
+		return false;
+	}
+	return true;
+}
+/**
+ * Ensure the ggml model file for `modelSize` exists in `modelsDir`, downloading it from Hugging Face if needed.
+ *
+ * @param modelSize - Model size name; the file is `ggml-<modelSize>.bin`.
+ * @param modelsDir - Directory that holds downloaded models.
+ * @param isTTY - Whether progress messages may be logged through the prompt UI.
+ * @returns Path of the model file inside `modelsDir`.
+ * @throws Error when the file is still missing after the download; errors raised by `spawnOrThrow`
+ *   (presumably on a non-zero exit of curl or mv) propagate unchanged.
+ */
+async function downloadModel(modelSize: string, modelsDir: string, isTTY: boolean): Promise<string> {
+	const modelFile = `ggml-${modelSize}.bin`;
+	const modelPath = `${modelsDir}/${modelFile}`;
+	if (existsSync(modelPath)) {
+		if (isTTY) p.log.success(`Model ${modelSize} already downloaded`);
+		return modelPath;
+	}
+	const url = `${HF_BASE}/${modelFile}`;
+	// Download next to the final location and move into place afterwards, so an interrupted or failed
+	// transfer never leaves a truncated file at modelPath that a later run would accept as the model.
+	const partialPath = `${modelPath}.part`;
+	if (isTTY) p.log.step(`Downloading ${modelFile} from Hugging Face...`);
+	// --fail makes curl exit non-zero on HTTP errors instead of saving the error page as the model.
+	await spawnOrThrow(
+		["curl", "-L", "--fail", "--progress-bar", "-o", partialPath, url],
+		`Download model ${modelSize}`,
+	);
+	await spawnOrThrow(["mv", partialPath, modelPath], `Move model ${modelSize} into place`);
+	if (!existsSync(modelPath)) {
+		throw new Error(`Model download completed but file not found: ${modelPath}`);
+	}
+	return modelPath;
+}
+/**
+ * Offer to install the trx agent skill by running `npx skills add crafter-station/trx -g`.
+ *
+ * @param isTTY - Whether prompting is allowed; when false nothing is asked or installed.
+ * @returns true only when the user accepted and the spawned process exited with code 0.
+ */
+async function installSkill(isTTY: boolean): Promise<boolean> {
+	if (!isTTY) return false;
+
+	const answer = await p.confirm({
+		message: "Install agent skill? (lets AI agents use trx with post-processing)",
+	});
+	const declined = p.isCancel(answer) || !answer;
+	if (declined) {
+		p.log.info("Skipped. Install later: npx skills add crafter-station/trx -g");
+		return false;
+	}
+
+	try {
+		// Share the terminal with the child process so its own output and prompts are visible.
+		const child = Bun.spawn(["npx", "skills", "add", "crafter-station/trx", "-g"], {
+			stdin: "inherit",
+			stdout: "inherit",
+			stderr: "inherit",
+		});
+		return (await child.exited) === 0;
+	} catch {
+		p.log.warn("npx skills not available. Install manually: npx skills add crafter-station/trx -g");
+		return false;
+	}
+}
+/**
+ * Build the `init` subcommand: check (and, interactively, offer to install) whisper-cli, yt-dlp and ffmpeg,
+ * download the selected Whisper model, write the config, and offer to install the agent skill.
+ *
+ * Prompts and progress messages are only shown when stdout is a TTY and the output format is not "json".
+ * The final result goes through `output`; any failure is reported through `outputError`.
+ *
+ * @returns The configured commander `Command` for `init`.
+ */
+export function createInitCommand(): Command {
+	return new Command("init")
+		.description("Install dependencies and download Whisper model")
+		.option("-m, --model <size>", "whisper model size", "small")
+		.option("-l, --language <code>", "default language", "auto")
+		.action(async (opts, cmd) => {
+			const format: OutputFormat = cmd.optsWithGlobals().output;
+			// isTTY is undefined on non-terminal streams; coerce so the helpers always receive a real boolean.
+			const isTTY = Boolean(process.stdout.isTTY) && format !== "json";
+			try {
+				const modelSize = validateModel(opts.model);
+				const language = validateLanguage(opts.language);
+				if (isTTY) {
+					p.intro("trx init");
+				}
+				ensureTrxDir();
+				if (isTTY) p.log.step("Checking dependencies...");
+				// Check one dependency at a time: each missing one may open a confirm prompt and run
+				// `brew install`, and concurrent prompts would compete for the same terminal.
+				const hasWhisper = await checkAndInstallDep("whisper-cli", "whisper-cpp", isTTY);
+				const hasYtdlp = await checkAndInstallDep("yt-dlp", "yt-dlp", isTTY);
+				const hasFfmpeg = await checkAndInstallDep("ffmpeg", "ffmpeg", isTTY);
+				if (!hasWhisper || !hasYtdlp || !hasFfmpeg) {
+					const missing = [!hasWhisper && "whisper-cli", !hasYtdlp && "yt-dlp", !hasFfmpeg && "ffmpeg"]
+						.filter(Boolean)
+						.join(", ");
+					outputError(`Missing dependencies: ${missing}`, format);
+					return;
+				}
+				let selectedModel = modelSize;
+				// --model has a default, so its value source is "default" (never empty) when the user did not
+				// pass the flag; only then ask interactively.
+				if (isTTY && cmd.getOptionValueSource("model") === "default") {
+					const choice = await p.select({
+						message: "Select Whisper model:",
+						options: MODELS,
+						initialValue: "small",
+					});
+					if (p.isCancel(choice)) {
+						p.cancel("Init cancelled");
+						process.exit(0);
+					}
+					selectedModel = validateModel(choice as string);
+				}
+				const modelsDir = getModelsDir();
+				const modelPath = await downloadModel(selectedModel, modelsDir, isTTY);
+				const config = defaultConfig(selectedModel, language);
+				// Record the path that was actually used for the download.
+				config.modelPath = modelPath;
+				writeConfig(config);
+				if (isTTY) p.log.step("Agent skill setup...");
+				const skillInstalled = await installSkill(isTTY);
+				if (isTTY) {
+					p.outro("trx is ready. Run: trx <url-or-file>");
+				}
+				output(format, {
+					json: {
+						success: true,
+						model: selectedModel,
+						language,
+						modelPath,
+						skillInstalled,
+						config,
+					},
+				});
+			} catch (e) {
+				outputError((e as Error).message, format);
+			}
+		});
+}