buttercut 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.claude/skills/analyze-video/SKILL.md +11 -78
- data/.claude/skills/analyze-video/agent_prompt.md +84 -0
- data/.claude/skills/backup-library/SKILL.md +1 -1
- data/.claude/skills/cut-planner/SKILL.md +74 -0
- data/.claude/skills/roughcut/SKILL.md +41 -47
- data/.claude/skills/roughcut/agent_prompt.md +153 -0
- data/.claude/skills/summarize-video/SKILL.md +31 -0
- data/.claude/skills/summarize-video/agent_prompt.md +39 -0
- data/.claude/skills/summarize-video/summary_skeleton.rb +78 -0
- data/.claude/skills/summarize-video/visual_script_extractor.rb +78 -0
- data/.claude/skills/transcribe-audio/SKILL.md +17 -71
- data/.claude/skills/transcribe-audio/agent_prompt.md +53 -0
- data/CLAUDE.md +48 -18
- data/lib/buttercut/version.rb +1 -1
- data/templates/library_template.yaml +1 -0
- data/templates/plan_template.md +53 -0
- data/templates/roughcut_template.yaml +3 -20
- data/templates/settings_template.yaml +3 -0
- metadata +11 -3
- data/.claude/skills/roughcut/agent_instructions.md +0 -109
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# Pre-create a summary skeleton file for the summarize-video skill,
|
|
3
|
+
# with header (filename, duration) filled in and four placeholder markers
|
|
4
|
+
# in the body for the sub-agent to replace via Edit.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
# ruby summary_skeleton.rb <visual_transcript.json> <summary_output.md>
|
|
8
|
+
|
|
9
|
+
require 'json'
|
|
10
|
+
|
|
11
|
+
# Builds the pre-filled markdown skeleton that the summarize-video sub-agent
# completes via Edit.
#
# The header (video filename and duration) is derived from the visual
# transcript JSON; the body holds four placeholder comments
# (FILL_OVERVIEW, FILL_KEY_VISUALS, FILL_DIALOGUE, FILL_BROLL).
class SummarySkeleton
  # Convenience entry point: build the skeleton and write it in one call.
  #
  # @param transcript_path [String] path to the visual transcript JSON
  # @param output_path [String] path of the markdown file to write
  # @raise [ArgumentError] if either path is nil or empty
  def self.create(transcript_path, output_path)
    new(transcript_path, output_path).create
  end

  # @param transcript_path [String] path to the visual transcript JSON
  # @param output_path [String] path of the markdown file to write
  # @raise [ArgumentError] if either path is nil or empty
  def initialize(transcript_path, output_path)
    raise ArgumentError, "transcript_path is required" if transcript_path.nil? || transcript_path.empty?
    raise ArgumentError, "output_path is required" if output_path.nil? || output_path.empty?

    @transcript_path = transcript_path
    @output_path = output_path
  end

  # Writes the skeleton to output_path and prints the path to stdout.
  #
  # The skeleton is rendered before File.write is called, so a bad transcript
  # raises without touching the output file.
  #
  # @raise [RuntimeError] if the transcript JSON has no "segments" key
  def create
    File.write(output_path, skeleton)
    puts "skeleton: #{output_path}"
  end

  private

  attr_reader :transcript_path, :output_path

  # Parsed transcript JSON, read from disk once and memoized.
  def data
    @data ||= JSON.parse(File.read(transcript_path))
  end

  # Basename of the transcript's "video_path"; empty string when the key is absent.
  def video_filename
    File.basename(data["video_path"].to_s)
  end

  # The transcript's "segments" value; a missing key is treated as a malformed transcript.
  def segments
    data["segments"] || raise("transcript JSON has no 'segments' key: #{transcript_path}")
  end

  # End time of the last segment, in seconds.
  #
  # An empty segments array yields 0.0 rather than raising NoMethodError on nil,
  # so a transcript with no segments still gets a skeleton.
  def total_duration
    final_segment = segments.last
    return 0.0 unless final_segment

    final_segment["end"].to_f
  end

  # Formats a number of seconds as zero-padded MM:SS (fractional seconds are truncated).
  def format_timestamp(seconds)
    total = seconds.to_i
    "%02d:%02d" % [total / 60, total % 60]
  end

  # Markdown skeleton: filled-in header followed by the four placeholder sections.
  def skeleton
    <<~MD
      # #{video_filename}
      **Duration:** #{format_timestamp(total_duration)}

      ## Overview
      <!-- FILL_OVERVIEW -->

      ## Key Visuals
      <!-- FILL_KEY_VISUALS -->

      ## Notable Dialogue
      <!-- FILL_DIALOGUE -->

      ## B-Roll
      <!-- FILL_BROLL -->
    MD
  end
end
|
|
73
|
+
|
|
74
|
+
# Command-line entry point: runs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  input_path, destination_path = ARGV
  # Both positional arguments are mandatory; print usage and exit non-zero otherwise.
  abort("usage: summary_skeleton.rb <visual_transcript.json> <summary_output.md>") if input_path.nil? || destination_path.nil?
  SummarySkeleton.create(input_path, destination_path)
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# Extract a human-readable script from a visual transcript JSON,
|
|
3
|
+
# interleaving [VISUAL] descriptions with timestamped dialogue.
|
|
4
|
+
# Prints to stdout for direct consumption by the summarize-video skill.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
# ruby visual_script_extractor.rb <visual_transcript.json>
|
|
8
|
+
|
|
9
|
+
require 'json'
|
|
10
|
+
|
|
11
|
+
# Renders a visual transcript JSON as a human-readable script on stdout:
# a two-line header (video filename, duration) followed by timestamped
# [VISUAL] descriptions interleaved with dialogue.
class VisualScriptExtractor
  # Convenience entry point: load the transcript and print the script.
  #
  # @param transcript_path [String] path to the visual transcript JSON
  # @raise [ArgumentError] if transcript_path is nil or empty
  def self.extract(transcript_path)
    new(transcript_path).extract
  end

  # @param transcript_path [String] path to the visual transcript JSON
  # @raise [ArgumentError] if transcript_path is nil or empty
  def initialize(transcript_path)
    raise ArgumentError, "transcript_path is required" if transcript_path.nil? || transcript_path.empty?

    @transcript_path = transcript_path
  end

  # Prints the header, a blank line, then the formatted script to stdout.
  #
  # @raise [RuntimeError] if the transcript JSON has no "segments" key
  def extract
    puts header
    puts
    puts format_script
  end

  private

  attr_reader :transcript_path

  # Parsed transcript JSON, read from disk once and memoized.
  def data
    @data ||= JSON.parse(File.read(transcript_path))
  end

  # The transcript's "segments" value; a missing key is treated as a malformed transcript.
  def segments
    data["segments"] || raise("transcript JSON has no 'segments' key: #{transcript_path}")
  end

  # Two comment-style lines naming the video and its duration.
  def header
    "# Video: #{video_filename}\n# Duration: #{format_timestamp(total_duration)}"
  end

  # Basename of the transcript's "video_path"; empty string when the key is absent.
  def video_filename
    File.basename(data["video_path"].to_s)
  end

  # End time of the last segment, in seconds.
  #
  # An empty segments array yields 0.0 rather than raising NoMethodError on nil,
  # so a transcript with no segments still prints a header.
  def total_duration
    final_segment = segments.last
    return 0.0 unless final_segment

    final_segment["end"].to_f
  end

  # All non-empty segments, separated by blank lines.
  def format_script
    segments.filter_map { |s| format_segment(s) }.join("\n\n")
  end

  # Formats one segment as up to two lines sharing the segment's start timestamp:
  # the visual description (if any) followed by the dialogue (if any).
  # Returns nil when the segment has neither, so filter_map drops it.
  def format_segment(segment)
    text = segment["text"].to_s.strip
    visual = segment["visual"].to_s.strip
    ts = format_timestamp(segment["start"].to_f)

    lines = []
    lines << "[#{ts}] [VISUAL] #{visual}" unless visual.empty?
    lines << "[#{ts}] #{text}" unless text.empty?

    lines.empty? ? nil : lines.join("\n")
  end

  # Formats a number of seconds as zero-padded MM:SS (fractional seconds are truncated).
  def format_timestamp(seconds)
    total = seconds.to_i
    "%02d:%02d" % [total / 60, total % 60]
  end
end
|
|
73
|
+
|
|
74
|
+
# Command-line entry point: runs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  input_path = ARGV.first
  # The transcript path is mandatory; print usage and exit non-zero otherwise.
  abort("usage: visual_script_extractor.rb <visual_transcript.json>") if input_path.nil?
  VisualScriptExtractor.extract(input_path)
end
|
|
@@ -3,88 +3,34 @@ name: transcribe-audio
|
|
|
3
3
|
description: Transcribes video audio using WhisperX, preserving original timestamps. Creates JSON transcript with word-level timing. Use when you need to generate audio transcripts for videos.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
# Skill: Transcribe Audio
|
|
6
|
+
# Skill: Transcribe Audio (parent brief)
|
|
7
7
|
|
|
8
|
-
Transcribes video audio using WhisperX and
|
|
8
|
+
Transcribes video audio using WhisperX and produces a clean JSON transcript with word-level timing.
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
- Videos need audio transcripts before visual analysis
|
|
10
|
+
`SKILL.md` is the parent's dispatch brief. The sub-agent's working prompt lives in `agent_prompt.md` — inline its contents when launching the Task agent. Don't pass `SKILL.md`.
|
|
12
11
|
|
|
13
|
-
##
|
|
12
|
+
## Parallelism
|
|
14
13
|
|
|
15
|
-
|
|
14
|
+
Launch at most **2 in parallel**. WhisperX is already multithreaded internally (~4 CPU threads via CTranslate2); 2 processes is the throughput-vs-RAM sweet spot on a 16GB Mac.
|
|
16
15
|
|
|
17
|
-
##
|
|
16
|
+
## Inputs to gather and pass inline
|
|
18
17
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
This skill runs as a sub-agent. Do NOT read `library.yaml` or `settings.yaml` — the parent has that context and passes everything inline in your prompt. Expect these inputs:
|
|
18
|
+
The parent reads `library.yaml` and `settings.yaml` and passes these values inline in each agent's prompt:
|
|
22
19
|
|
|
23
20
|
- `video_path` — absolute path to the video file
|
|
24
21
|
- `transcript_output_dir` — where to write the transcript JSON (e.g. `libraries/<library>/transcripts`)
|
|
25
|
-
- `language_code` — ISO 639-1 code
|
|
26
|
-
- `whisper_model` — model size from
|
|
27
|
-
- `transcript_refinement` — boolean
|
|
28
|
-
- `user_context` (
|
|
29
|
-
- `footage_summary` (
|
|
30
|
-
|
|
31
|
-
If any required input is missing from your prompt, stop and ask the parent rather than inferring it from the filesystem.
|
|
32
|
-
|
|
33
|
-
### 2. Run WhisperX
|
|
34
|
-
|
|
35
|
-
```bash
|
|
36
|
-
whisperx "<video_path>" \
|
|
37
|
-
--language <language_code> \
|
|
38
|
-
--model <whisper_model> \
|
|
39
|
-
--compute_type float32 \
|
|
40
|
-
--device cpu \
|
|
41
|
-
--output_format json \
|
|
42
|
-
--output_dir <transcript_output_dir>
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
### 3. Prepare Audio Transcript
|
|
46
|
-
|
|
47
|
-
After WhisperX completes, format the JSON using our prepare_audio_script:
|
|
48
|
-
|
|
49
|
-
```bash
|
|
50
|
-
ruby .claude/skills/transcribe-audio/prepare_audio_script.rb \
|
|
51
|
-
<transcript_output_dir>/<video_basename>.json \
|
|
52
|
-
<video_path>
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
This script:
|
|
56
|
-
- Adds video source path as metadata
|
|
57
|
-
- Removes unnecessary fields to reduce file size
|
|
58
|
-
- Prettifies JSON
|
|
59
|
-
|
|
60
|
-
### 4. (Optional) Refine the transcript
|
|
61
|
-
|
|
62
|
-
If the parent passed `transcript_refinement: true`, follow `.claude/skills/transcribe-audio/refine_instructions.md` using the `user_context` and `footage_summary` strings the parent supplied inline. Do NOT open `library.yaml`. If `transcript_refinement` is not set or is `false`, skip this step.
|
|
63
|
-
|
|
64
|
-
### 5. Return Success Response
|
|
65
|
-
|
|
66
|
-
After audio preparation completes, return this structured response to the parent agent:
|
|
67
|
-
|
|
68
|
-
```
|
|
69
|
-
✓ <video_basename.mov> transcribed successfully
|
|
70
|
-
Audio transcript: <transcript_output_dir>/<video_basename>.json
|
|
71
|
-
Video path: <video_path>
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
**DO NOT update library.yaml** - the parent agent will handle this to avoid race conditions when running multiple transcriptions in parallel.
|
|
75
|
-
|
|
76
|
-
## Running in Parallel
|
|
22
|
+
- `language_code` — ISO 639-1 code (e.g. `en`, `es`) — parent maps from library.yaml's `language` name
|
|
23
|
+
- `whisper_model` — model size from settings.yaml (e.g. `small`, `medium`, `turbo`)
|
|
24
|
+
- `transcript_refinement` — boolean from library.yaml. If `true`, also pass:
|
|
25
|
+
- `user_context` (may be empty string)
|
|
26
|
+
- `footage_summary` (may be empty string)
|
|
77
27
|
|
|
78
|
-
|
|
79
|
-
- Each agent handles ONE video file
|
|
80
|
-
- Multiple agents can run simultaneously
|
|
81
|
-
- Parent thread updates library.yaml sequentially after each agent completes
|
|
82
|
-
- No race conditions on shared YAML file
|
|
28
|
+
After the agent returns, update `library.yaml` with `transcript: <filename>.json`.
|
|
83
29
|
|
|
84
|
-
## Next
|
|
30
|
+
## Next step
|
|
85
31
|
|
|
86
|
-
|
|
32
|
+
Once all videos have audio transcripts, dispatch `analyze-video` for visual descriptions.
|
|
87
33
|
|
|
88
|
-
##
|
|
34
|
+
## Dependencies
|
|
89
35
|
|
|
90
|
-
|
|
36
|
+
WhisperX must be installed. Use the **setup** skill to verify.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Transcribe Audio (sub-agent prompt)
|
|
2
|
+
|
|
3
|
+
You are a sub-agent. Transcribe one video file using WhisperX and produce a clean JSON transcript with word-level timing.
|
|
4
|
+
|
|
5
|
+
**Critical:** Use WhisperX, NOT standard Whisper. WhisperX preserves the original video timeline including leading silence, ensuring transcripts match actual video timestamps. Run WhisperX directly on the video file — don't extract audio separately.
|
|
6
|
+
|
|
7
|
+
## Inputs (passed inline by the parent)
|
|
8
|
+
|
|
9
|
+
- `video_path` — absolute path to the video file
|
|
10
|
+
- `transcript_output_dir` — where to write the transcript JSON
|
|
11
|
+
- `language_code` — ISO 639-1 code (e.g. `en`, `es`)
|
|
12
|
+
- `whisper_model` — model size (e.g. `small`, `medium`, `turbo`)
|
|
13
|
+
- `transcript_refinement` — boolean; if `true`, also expect:
|
|
14
|
+
- `user_context` — string, may be empty
|
|
15
|
+
- `footage_summary` — string, may be empty
|
|
16
|
+
|
|
17
|
+
Do NOT read `library.yaml` or `settings.yaml`. If a required input is missing from your prompt, stop and ask the parent rather than inferring from the filesystem.
|
|
18
|
+
|
|
19
|
+
## 1. Run WhisperX
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
whisperx "<video_path>" \
|
|
23
|
+
--language <language_code> \
|
|
24
|
+
--model <whisper_model> \
|
|
25
|
+
--compute_type float32 \
|
|
26
|
+
--device cpu \
|
|
27
|
+
--output_format json \
|
|
28
|
+
--output_dir <transcript_output_dir>
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## 2. Prepare audio transcript
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
ruby .claude/skills/transcribe-audio/prepare_audio_script.rb \
|
|
35
|
+
<transcript_output_dir>/<video_basename>.json \
|
|
36
|
+
<video_path>
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
This script adds the video source path as metadata, removes unnecessary fields, and prettifies the JSON.
|
|
40
|
+
|
|
41
|
+
## 3. (Optional) Refine the transcript
|
|
42
|
+
|
|
43
|
+
If `transcript_refinement: true`, follow `.claude/skills/transcribe-audio/refine_instructions.md`, using the `user_context` and `footage_summary` strings the parent supplied inline. Do NOT open `library.yaml`. Skip if `transcript_refinement` is missing or `false`.
|
|
44
|
+
|
|
45
|
+
## 4. Return success response
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
✓ <video_basename.mov> transcribed successfully
|
|
49
|
+
Audio transcript: <transcript_output_dir>/<video_basename>.json
|
|
50
|
+
Video path: <video_path>
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Do NOT update library.yaml** — the parent handles all yaml I/O to avoid race conditions in parallel runs.
|
data/CLAUDE.md
CHANGED
|
@@ -23,15 +23,16 @@ You are an AI video editor assistant working with a software engineer. You gener
|
|
|
23
23
|
- If new: gather project information (library name, video file locations, language)
|
|
24
24
|
- Create directory structure and library.yaml from template
|
|
25
25
|
- Automatically start footage analysis after setup
|
|
26
|
-
2. **Transcribe** → Use `transcribe-audio` and `
|
|
26
|
+
2. **Transcribe** → Use `transcribe-audio`, `analyze-video`, and `summarize-video` skills to process videos
|
|
27
27
|
- First: `transcribe-audio` creates audio transcripts with WhisperX (word-level timing)
|
|
28
28
|
- Then: `analyze-video` adds visual descriptions by extracting and analyzing frames
|
|
29
|
-
-
|
|
30
|
-
-
|
|
31
|
-
3. **Edit** → Use `roughcut`
|
|
32
|
-
-
|
|
33
|
-
-
|
|
34
|
-
- **
|
|
29
|
+
- Then: `summarize-video` generates a short markdown summary from each visual transcript
|
|
30
|
+
- All videos must have audio transcripts, visual transcripts, AND summaries before proceeding to rough cut or sequence creation
|
|
31
|
+
3. **Edit** → Use `cut-planner` then `roughcut` to plan and build a timeline from transcripts
|
|
32
|
+
- `cut-planner` reads all summaries in the main thread, proposes 2–3 narrative options, iterates with the user, and writes an approved plan markdown file
|
|
33
|
+
- `roughcut` consumes that plan, spins up a sub-agent that reads the library directly, builds the YAML iteratively, reviews against format conventions, exports the XML, and returns conversational editorial notes the parent uses to dialogue with the user
|
|
34
|
+
- **Rough cuts**: 3–15+ min edits. **Sequences**: 30–60s clips. Same pair of skills, different target duration.
|
|
35
|
+
- **PREREQUISITE:** Check library.yaml to verify all videos have `visual_transcript` and `summary` populated
|
|
35
36
|
4. **Backup** → Use `backup-library` skill to create compressed archives of all libraries
|
|
36
37
|
- Creates timestamped ZIP backup of entire libraries directory
|
|
37
38
|
- Backups are stored in `/backups/` and excluded from git
|
|
@@ -121,9 +122,11 @@ Ask the user these questions for new libraries one at a time (never all at once)
|
|
|
121
122
|
mkdir -p libraries/[library-name]
|
|
122
123
|
mkdir -p libraries/[library-name]/transcripts
|
|
123
124
|
mkdir -p libraries/[library-name]/roughcuts
|
|
125
|
+
mkdir -p libraries/[library-name]/summaries
|
|
126
|
+
mkdir -p libraries/[library-name]/plans
|
|
124
127
|
```
|
|
125
128
|
|
|
126
|
-
Note: A single
|
|
129
|
+
Note: A single `tmp/` directory inside the buttercut project root is used for all temporary files. Create subdirectories as needed and delete after use.
|
|
127
130
|
|
|
128
131
|
### Create Library File
|
|
129
132
|
|
|
@@ -131,7 +134,7 @@ Duplicate `templates/library_template.yaml` to create `libraries/[library-name]/
|
|
|
131
134
|
|
|
132
135
|
For each video file:
|
|
133
136
|
1. Use `ffprobe` to get duration
|
|
134
|
-
2. Add entry to library.yaml with empty `transcript` and `
|
|
137
|
+
2. Add entry to library.yaml with empty `transcript`, `visual_transcript`, and `summary`
|
|
135
138
|
3. Empty fields mean "todo", valid filenames mean "done"
|
|
136
139
|
|
|
137
140
|
The `language` field stores the language code for all videos in this library.
|
|
@@ -144,16 +147,21 @@ After library setup completes, **automatically start analyzing all footage**:
|
|
|
144
147
|
|
|
145
148
|
1. Inform user: "Library setup complete. Found [N] videos ([total size]). Starting footage analysis..."
|
|
146
149
|
2. Read `libraries/settings.yaml` (for `whisper_model`) and the library's `library.yaml` (for `language`, `transcript_refinement`, `user_context`, `footage_summary`) ONCE in the parent thread. If any expected field is missing, run the appropriate migration first (see Critical Principles below).
|
|
147
|
-
3. Launch `transcribe-audio` agents
|
|
150
|
+
3. Launch `transcribe-audio` agents. Pass these values inline in each agent's prompt:
|
|
148
151
|
- `video_path`, `transcript_output_dir`, `language_code`, `whisper_model`
|
|
149
152
|
- `transcript_refinement` (boolean). If `true`, also pass the current `user_context` and `footage_summary` strings (empty strings are fine — refinement still catches nonsense-token and self-witness fixes).
|
|
150
153
|
4. As each agent completes, update library.yaml with `transcript` (filename only, not full path).
|
|
151
|
-
5. After all audio transcripts complete, launch `analyze-video` agents
|
|
154
|
+
5. After all audio transcripts complete, launch `analyze-video` agents. Pass inline: `video_path`, `audio_transcript_path`, `visual_transcript_path`.
|
|
152
155
|
6. As each agent completes, update library.yaml with `visual_transcript` (filename only, not full path).
|
|
153
|
-
7.
|
|
154
|
-
|
|
156
|
+
7. After all visual transcripts complete, summarize each video using the `summarize-video` skill on the **Haiku model**:
|
|
157
|
+
- For each video, first pre-create a skeleton file in the parent: `ruby .claude/skills/summarize-video/summary_skeleton.rb <visual_transcript_path> <summary_output_path>`
|
|
158
|
+
- Then launch the agent passing inline: `visual_transcript_path`, `summary_output_path` (e.g., `libraries/[library-name]/summaries/summary_[videoname].md`)
|
|
159
|
+
- The agent fills the four placeholders via Edit. The skeleton + Edit pattern is required: without it, Haiku frequently refuses Write and dumps markdown into its reply instead.
|
|
160
|
+
8. As each agent completes, update library.yaml with `summary` (filename only, not full path).
|
|
161
|
+
9. Analyze ALL videos before offering to create rough cuts.
|
|
162
|
+
10. **After all analysis completes, automatically create a backup** using the `backup-library` skill.
|
|
155
163
|
|
|
156
|
-
**Contract: sub-agents
|
|
164
|
+
**Contract: sub-agents receive `agent_prompt.md`, not `SKILL.md`.** For parallelizable skills (`transcribe-audio`, `analyze-video`, `summarize-video`), the parent reads `SKILL.md` for dispatch info (parallelism cap, required inputs) and inlines `agent_prompt.md` into the sub-agent's prompt. `SKILL.md` is parent-only.
|
|
157
165
|
|
|
158
166
|
**Note on refinement:** When `transcript_refinement: true`, each `transcribe-audio` agent reviews and corrects its transcript in place before returning, using the `user_context` and `footage_summary` the parent passed in. Empty context strings are fine — the agent still runs and catches nonsense-token and self-witness fixes. The parent still only writes `transcript: <filename>.json` to `library.yaml` after the agent completes.
|
|
159
167
|
|
|
@@ -181,7 +189,8 @@ When processing multiple videos, use parallel agents for maximum throughput:
|
|
|
181
189
|
- Run WhisperX or frame extraction.
|
|
182
190
|
- Prepare and clean transcript JSON.
|
|
183
191
|
- Return structured response with file paths.
|
|
184
|
-
|
|
192
|
+
|
|
193
|
+
Each skill's `agent_prompt.md` documents its own IO contract — including whether the sub-agent reads or writes `library.yaml`.
|
|
185
194
|
|
|
186
195
|
3. **Benefits:**
|
|
187
196
|
- Multiple videos process simultaneously
|
|
@@ -200,6 +209,7 @@ Known migration triggers (match each to a `scripts/NNN_migrate_*.rb` script via
|
|
|
200
209
|
- `editor` missing (added in 0.4.0)
|
|
201
210
|
- `transcript_refinement` missing (added in [Unreleased]; missing means "predates the feature, default to `false`" — NOT the template default of `true`)
|
|
202
211
|
- `footage_summary` missing OR old name `footage_description` present (renamed in [Unreleased])
|
|
212
|
+
- video entries with `summary` missing (added in [Unreleased]; missing means "todo", default to empty string)
|
|
203
213
|
- video entries with `transcript_path` / `visual_transcript_path` (renamed to `transcript` / `visual_transcript` in 0.3.0)
|
|
204
214
|
- video entries with `file_size_mb` (removed in 0.3.0)
|
|
205
215
|
|
|
@@ -209,7 +219,9 @@ A missing field is not the same as a field set to the template default — the t
|
|
|
209
219
|
|
|
210
220
|
**Use actual filenames.** Never use generic labels like "Video 1" or "Clip A" - always reference actual filenames like "DJI_20250423171212_0210_D.mov" for clear traceability.
|
|
211
221
|
|
|
212
|
-
**Visual transcripts are mandatory.** Before creating any rough cut or sequence, verify ALL videos have
|
|
222
|
+
**Visual transcripts and summaries are mandatory.** Before creating any rough cut or sequence, verify ALL videos have audio transcripts, visual transcripts, AND summaries. Check `library.yaml` — every video entry must have `visual_transcript` and `summary` with filenames (not empty, null, or ""). Transcripts are stored in `libraries/[library-name]/transcripts/`; summaries in `libraries/[library-name]/summaries/`. Visual descriptions and summaries are essential for shot selection and pacing decisions.
|
|
223
|
+
|
|
224
|
+
**Single-track timelines only.** ButterCut produces one sequential video track. Each clip's own audio plays during that clip — there is no second video track for cutaways layered over a continuing voiceover, and no separate audio track. When planning or pitching cuts, never propose "B-roll over VO," "story under meetup footage," picture-in-picture, or any structure that assumes a clip's audio continues while different visuals play on top. Cutaways are fine, but they're hard cuts: when you cut to the wide shot, you cut to that shot's audio too. Plan every cut as a strictly linear sequence of clips.
|
|
213
225
|
|
|
214
226
|
**Be curious and ask questions.** Occasionally ask users questions about their libraries and footage to better understand context, creative intent, and preferences. When you receive answers, add this information to the `user_context` key in the library.yaml file. This builds institutional knowledge that improves future rough cut and sequence decisions and helps maintain continuity across editing sessions.
|
|
215
227
|
|
|
@@ -219,7 +231,7 @@ A missing field is not the same as a field set to the template default — the t
|
|
|
219
231
|
- Flag areas needing human judgment rather than making assumptions
|
|
220
232
|
- When you have lots of videos to process (dozens or hundreds isn't out of the ordinary), create a reasonable task list with 5 tasks and then a final task that says to check the yaml processing file to see if you need to then generate more tasks. This way users can see progress and the agent doesn't get overwhelmed.
|
|
221
233
|
- Generally avoid writing one-off scripts, but if you do need to write one, write it in Ruby unless you have a very strong reason to write in another language.
|
|
222
|
-
-
|
|
234
|
+
- Parallelism caps live in each skill's `SKILL.md` (parent brief). Read it before dispatching.
|
|
223
235
|
- Whenever you export XML files, include a datetime timestamp in the filename so it's clear when they were generated.
|
|
224
236
|
|
|
225
237
|
## Programming Style
|
|
@@ -255,6 +267,24 @@ ButterCut is designed to be simple, automatic and geared toward working with non
|
|
|
255
267
|
|
|
256
268
|
The user should not need to understand video codecs, frame rates, or FCPXML structure - just provide file paths and get working XML. We should talk to the user from a video editing perspective, not a technical software engineer perspective.
|
|
257
269
|
|
|
270
|
+
### Vocabulary — talk like an editor, not a developer
|
|
271
|
+
|
|
272
|
+
The user is usually a video editor, not a programmer. They don't need to know what file the cut lives in, what tool transcribed their audio, or which skill or sub-agent is doing the work behind the scenes. Implementation details are for the codebase; user-facing chat stays in the language of video editing. When in doubt, drop the technical noun entirely and just say what's happening. Skills, code, and other internal artifacts should stay technical, but keep that vocabulary out of conversation with the user.
|
|
273
|
+
|
|
274
|
+
Editor vocabulary that's always fine: rough cut, sequence, scene, beat, timeline, B-roll, cutaway, shot, take, transcript, footage, library, clip, splice, Final Cut, Premiere, Resolve.
|
|
275
|
+
|
|
276
|
+
Don't say → say (one per category — generalize the pattern, don't treat as a lookup table):
|
|
277
|
+
|
|
278
|
+
- *File/format nouns:* "I'll update the YAML" / "regenerate the FCPXML" → "I'll update the cut" / "I'll re-export it for Final Cut"
|
|
279
|
+
- *Architecture nouns:* "I'll spin up a sub-agent" / "running the roughcut skill" / "the parent thread" → just speak in first person ("I'll build the cut")
|
|
280
|
+
- *Tools and models:* "WhisperX will transcribe" / "running ffmpeg" / "I used Haiku for the summary" → "I'll transcribe the audio" / "I'll analyze the visuals" (don't name models)
|
|
281
|
+
- *Internal field names:* "I'll update footage_summary" / "transcript_refinement is true" → "I'll note that about your footage" / "I'll proofread the transcripts"
|
|
282
|
+
- *Paths in casual chat:* `.fcpxml`, `.json`, `libraries/foo/transcripts/…` → name the artifact ("the Final Cut export", "the transcript") and only show the path at final delivery or when the user needs to grab the file
|
|
283
|
+
|
|
284
|
+
Two exceptions where technical detail IS appropriate:
|
|
285
|
+
1. The user explicitly asks ("where is it saved?", "what format?") — answer plainly.
|
|
286
|
+
2. Final delivery summary — naming the export file path is genuinely useful so they can find it.
|
|
287
|
+
|
|
258
288
|
## Development Commands
|
|
259
289
|
|
|
260
290
|
### Testing
|
|
@@ -275,4 +305,4 @@ bundle exec rspec spec/buttercut_spec.rb:10
|
|
|
275
305
|
|
|
276
306
|
## Claude Skills
|
|
277
307
|
|
|
278
|
-
When creating new Claude skills, aim to keep them
|
|
308
|
+
When creating new Claude skills, aim to keep them as brief as possible. Use active voice to help condense instructions. Use simple, plain language.
|
data/lib/buttercut/version.rb
CHANGED
|
@@ -20,3 +20,4 @@ videos:
|
|
|
20
20
|
duration: "00:05:32"
|
|
21
21
|
transcript: # filename only (stored in libraries/[library-name]/transcripts/)
|
|
22
22
|
visual_transcript: # filename only (visual_*.json with frame descriptions)
|
|
23
|
+
summary: # filename only (summary_*.md overview stored in libraries/[library-name]/summaries/)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
<!--
|
|
2
|
+
Cut Plan Template — written by `cut-planner`, consumed by `roughcut`.
|
|
3
|
+
|
|
4
|
+
Fill in every section. Delete this comment block before saving.
|
|
5
|
+
The plan is editorial direction; the build agent picks the exact clips
|
|
6
|
+
and timestamps inside each beat.
|
|
7
|
+
-->
|
|
8
|
+
|
|
9
|
+
# [Working title]
|
|
10
|
+
|
|
11
|
+
## Concept
|
|
12
|
+
<!-- 1–2 sentences on the angle, tone, or arc. What story is this cut telling? -->
|
|
13
|
+
|
|
14
|
+
## Format
|
|
15
|
+
<!-- Vlog, YouTube Short, long-form, documentary, talking-head, montage, etc.
|
|
16
|
+
Include any pacing or tonal cues that flow from the format. -->
|
|
17
|
+
|
|
18
|
+
## Target Duration
|
|
19
|
+
<!-- Approximate runtime, e.g. "4–6 min" or "45–60s sequence". -->
|
|
20
|
+
|
|
21
|
+
## Beats
|
|
22
|
+
<!--
|
|
23
|
+
Ordered list. Each beat is one editorial unit with intent and footage suggestions. Beats are direction, not paper-cut timestamps.
|
|
24
|
+
|
|
25
|
+
For an 8-minute vlog, you might aim for roughly 60 seconds per beat, with good footage for both A-roll and B-roll.
|
|
26
|
+
|
|
27
|
+
For other types of videos use your best editorial judgement, thinking about what is common in the genre you're working with. You can also talk to the user directly to determine what they want.
|
|
28
|
+
-->
|
|
29
|
+
|
|
30
|
+
### 1. [Beat name]
|
|
31
|
+
- **Intent:** what this beat does for the story (open, escalate, turn, payoff, etc.)
|
|
32
|
+
- **Approx. share:** rough fraction of runtime (e.g. "~30s", "~2 min", "~15% of total")
|
|
33
|
+
- **Footage suggestions:** filenames likely to feed this beat (e.g. `DJI_56738.mov`, `panasonic_1234.mov`). The build agent may swap in stronger moments from elsewhere.
|
|
34
|
+
|
|
35
|
+
### 2. [Beat name]
|
|
36
|
+
- **Intent:**
|
|
37
|
+
- **Approx. share:**
|
|
38
|
+
- **Footage suggestions:**
|
|
39
|
+
|
|
40
|
+
<!-- Add more beats as needed. A 6 minute video might have 4-6 beats. You'll need to use your judgement about the footage availability, target duration and cut you're making. -->
|
|
41
|
+
|
|
42
|
+
## Required Dialogue
|
|
43
|
+
<!--
|
|
44
|
+
Lines the user specifically wants in. Two flavors are both fine:
|
|
45
|
+
|
|
46
|
+
- **Exact quote:** "Here's how I learned to juggle." (`source_file_if_known.mov`)
|
|
47
|
+
- **Lossy reference:** "Include the bit about Kailey's uncle the magician teaching her to juggle before he died." (`file_1.mov, file_2.mov, file_5.mov`)
|
|
48
|
+
|
|
49
|
+
Leave this section empty if no specific lines are required.
|
|
50
|
+
-->
|
|
51
|
+
|
|
52
|
+
## Notes for the Build
|
|
53
|
+
<!-- Any constraints, things to avoid, or judgment calls the build agent should know — single-track timeline assumptions, must-not-include footage, tone preferences, etc. Include decisions or direction from the user. -->
|
|
@@ -1,25 +1,8 @@
|
|
|
1
1
|
# Rough Cut Template
|
|
2
2
|
# This template defines the structure for video rough cuts
|
|
3
3
|
|
|
4
|
-
#
|
|
5
|
-
description: "Brief
|
|
6
|
-
|
|
7
|
-
# Working notes for the agent during rough cut creation
|
|
8
|
-
notes: |
|
|
9
|
-
Working notes area for editorial decisions, narrative structure planning,
|
|
10
|
-
pacing considerations, and any issues or concerns identified during editing.
|
|
11
|
-
|
|
12
|
-
Consider:
|
|
13
|
-
- Story arc and key narrative beats
|
|
14
|
-
- Pacing and rhythm
|
|
15
|
-
- Transitions between segments
|
|
16
|
-
- B-roll placement opportunities
|
|
17
|
-
- Audio/dialogue clarity
|
|
18
|
-
|
|
19
|
-
# Coverage summary of available footage
|
|
20
|
-
footage_coverage: |
|
|
21
|
-
Overview of what footage is available and how it could be used.
|
|
22
|
-
Include notes about strongest segments, potential issues, and creative opportunities.
|
|
4
|
+
# One-line summary of what this cut is — useful when scanning a folder of cuts
|
|
5
|
+
description: "Brief one-line summary of this cut — what it is and roughly how long"
|
|
23
6
|
|
|
24
7
|
# The actual rough cut - ordered list of clips to use
|
|
25
8
|
clips:
|
|
@@ -39,4 +22,4 @@ clips:
|
|
|
39
22
|
# Rough cut metadata
|
|
40
23
|
metadata:
|
|
41
24
|
created_date: "" # Will be populated when rough cut is created
|
|
42
|
-
total_duration: "" # Calculated from all clip durations
|
|
25
|
+
total_duration: "" # Calculated from all clip durations
|
|
@@ -8,3 +8,6 @@ editor: fcpx
|
|
|
8
8
|
# turbo is nearly as accurate as large-v3 but significantly faster
|
|
9
9
|
# Recommended: `small` paired with transcript_refinement (set per-library in library.yaml)
|
|
10
10
|
whisper_model: small
|
|
11
|
+
|
|
12
|
+
# After exporting a roughcut, also drop a copy of the XML on the Desktop for easy import
|
|
13
|
+
save_to_desktop_after_export: true
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: buttercut
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Ford
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04
|
|
11
|
+
date: 2026-05-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|
|
@@ -66,18 +66,25 @@ files:
|
|
|
66
66
|
- ".claude/settings.json"
|
|
67
67
|
- ".claude/settings.local.json"
|
|
68
68
|
- ".claude/skills/analyze-video/SKILL.md"
|
|
69
|
+
- ".claude/skills/analyze-video/agent_prompt.md"
|
|
69
70
|
- ".claude/skills/analyze-video/prepare_visual_script.rb"
|
|
70
71
|
- ".claude/skills/backup-library/SKILL.md"
|
|
71
72
|
- ".claude/skills/backup-library/backup_libraries.rb"
|
|
73
|
+
- ".claude/skills/cut-planner/SKILL.md"
|
|
72
74
|
- ".claude/skills/release/SKILL.md"
|
|
73
75
|
- ".claude/skills/roughcut/SKILL.md"
|
|
74
|
-
- ".claude/skills/roughcut/
|
|
76
|
+
- ".claude/skills/roughcut/agent_prompt.md"
|
|
75
77
|
- ".claude/skills/roughcut/export_to_fcpxml.rb"
|
|
76
78
|
- ".claude/skills/setup/SKILL.md"
|
|
77
79
|
- ".claude/skills/setup/advanced-setup.md"
|
|
78
80
|
- ".claude/skills/setup/simple-setup.md"
|
|
79
81
|
- ".claude/skills/setup/verify_install.rb"
|
|
82
|
+
- ".claude/skills/summarize-video/SKILL.md"
|
|
83
|
+
- ".claude/skills/summarize-video/agent_prompt.md"
|
|
84
|
+
- ".claude/skills/summarize-video/summary_skeleton.rb"
|
|
85
|
+
- ".claude/skills/summarize-video/visual_script_extractor.rb"
|
|
80
86
|
- ".claude/skills/transcribe-audio/SKILL.md"
|
|
87
|
+
- ".claude/skills/transcribe-audio/agent_prompt.md"
|
|
81
88
|
- ".claude/skills/transcribe-audio/prepare_audio_script.rb"
|
|
82
89
|
- ".claude/skills/transcribe-audio/refine_instructions.md"
|
|
83
90
|
- ".claude/skills/update-buttercut/SKILL.md"
|
|
@@ -91,6 +98,7 @@ files:
|
|
|
91
98
|
- lib/buttercut/fcpx.rb
|
|
92
99
|
- lib/buttercut/version.rb
|
|
93
100
|
- templates/library_template.yaml
|
|
101
|
+
- templates/plan_template.md
|
|
94
102
|
- templates/roughcut_template.yaml
|
|
95
103
|
- templates/settings_template.yaml
|
|
96
104
|
homepage: https://github.com/andrewford/buttercut
|