buttercut 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env ruby
2
+ # Pre-create a summary skeleton file for the summarize-video skill,
3
+ # with header (filename, duration) filled in and four placeholder markers
4
+ # in the body for the sub-agent to replace via Edit.
5
+ #
6
+ # Usage:
7
+ # ruby summary_skeleton.rb <visual_transcript.json> <summary_output.md>
8
+
9
+ require 'json'
10
+
11
# Writes a markdown summary skeleton for one video: a header carrying the
# video filename and duration, followed by four placeholder comments
# (FILL_OVERVIEW, FILL_KEY_VISUALS, FILL_DIALOGUE, FILL_BROLL) for a later
# editing pass to replace.
class SummarySkeleton
  # Convenience entry point: builds an instance and writes the skeleton.
  #
  # @param transcript_path [String] path to the visual transcript JSON
  # @param output_path [String] path of the markdown file to write
  # @raise [ArgumentError] if either path is nil or empty
  # @raise [RuntimeError] if the transcript JSON has no "segments" value
  def self.create(transcript_path, output_path)
    new(transcript_path, output_path).create
  end

  # @param transcript_path [String] path to the visual transcript JSON
  # @param output_path [String] path of the markdown file to write
  # @raise [ArgumentError] if either path is nil or empty
  def initialize(transcript_path, output_path)
    raise ArgumentError, "transcript_path is required" if transcript_path.nil? || transcript_path.empty?
    raise ArgumentError, "output_path is required" if output_path.nil? || output_path.empty?

    @transcript_path = transcript_path
    @output_path = output_path
  end

  # Renders the skeleton, writes it to output_path, and prints the path.
  # The skeleton is rendered before File.write is invoked, so a bad
  # transcript raises without touching the output file.
  def create
    File.write(output_path, skeleton)
    puts "skeleton: #{output_path}"
  end

  private

  attr_reader :transcript_path, :output_path

  # Parsed transcript JSON, read from disk once and memoized.
  def data
    @data ||= JSON.parse(File.read(transcript_path))
  end

  # Basename of the transcript's "video_path"; empty string when absent.
  def video_filename
    File.basename(data["video_path"].to_s)
  end

  # The transcript's segment list; a missing (or null) key is an error.
  def segments
    data["segments"] || raise("transcript JSON has no 'segments' key: #{transcript_path}")
  end

  # End time of the last segment, in seconds. An empty segment list yields
  # 0.0 instead of crashing on nil, so a silent/empty video still gets a
  # skeleton.
  def total_duration
    final_segment = segments.last
    final_segment ? final_segment["end"].to_f : 0.0
  end

  # Formats seconds as zero-padded MM:SS. Minutes are not wrapped into
  # hours, so 3900 seconds renders as "65:00".
  def format_timestamp(seconds)
    total = seconds.to_i
    "%02d:%02d" % [total / 60, total % 60]
  end

  # Markdown body: filled-in header plus the four placeholder markers.
  def skeleton
    <<~MD
      # #{video_filename}
      **Duration:** #{format_timestamp(total_duration)}

      ## Overview
      <!-- FILL_OVERVIEW -->

      ## Key Visuals
      <!-- FILL_KEY_VISUALS -->

      ## Notable Dialogue
      <!-- FILL_DIALOGUE -->

      ## B-Roll
      <!-- FILL_BROLL -->
    MD
  end
end
73
+
74
# Command-line entry point: runs only when this file is executed directly.
# Both positional arguments are required; otherwise print usage and exit.
if $PROGRAM_NAME == __FILE__
  source_json, target_md = ARGV
  if source_json.nil? || target_md.nil?
    abort("usage: summary_skeleton.rb <visual_transcript.json> <summary_output.md>")
  end
  SummarySkeleton.create(source_json, target_md)
end
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env ruby
2
+ # Extract a human-readable script from a visual transcript JSON,
3
+ # interleaving [VISUAL] descriptions with timestamped dialogue.
4
+ # Prints to stdout for direct consumption by the summarize-video skill.
5
+ #
6
+ # Usage:
7
+ # ruby visual_script_extractor.rb <visual_transcript.json>
8
+
9
+ require 'json'
10
+
11
# Prints a human-readable script from a visual transcript JSON: a two-line
# header (video filename, duration) followed by each segment's [VISUAL]
# description and dialogue, both prefixed with the segment's start time.
class VisualScriptExtractor
  # Convenience entry point: builds an instance and prints the script.
  #
  # @param transcript_path [String] path to the visual transcript JSON
  # @raise [ArgumentError] if transcript_path is nil or empty
  # @raise [RuntimeError] if the transcript JSON has no "segments" value
  def self.extract(transcript_path)
    new(transcript_path).extract
  end

  # @param transcript_path [String] path to the visual transcript JSON
  # @raise [ArgumentError] if transcript_path is nil or empty
  def initialize(transcript_path)
    raise ArgumentError, "transcript_path is required" if transcript_path.nil? || transcript_path.empty?

    @transcript_path = transcript_path
  end

  # Writes the header, a blank line, then the formatted script to stdout.
  def extract
    puts header
    puts
    puts format_script
  end

  private

  attr_reader :transcript_path

  # Parsed transcript JSON, read from disk once and memoized.
  def data
    @data ||= JSON.parse(File.read(transcript_path))
  end

  # The transcript's segment list; a missing (or null) key is an error.
  def segments
    data["segments"] || raise("transcript JSON has no 'segments' key: #{transcript_path}")
  end

  # Two comment-style lines naming the video and its duration.
  def header
    "# Video: #{video_filename}\n# Duration: #{format_timestamp(total_duration)}"
  end

  # Basename of the transcript's "video_path"; empty string when absent.
  def video_filename
    File.basename(data["video_path"].to_s)
  end

  # End time of the last segment, in seconds. An empty segment list yields
  # 0.0 instead of crashing on nil, so the header can still be printed.
  def total_duration
    final_segment = segments.last
    final_segment ? final_segment["end"].to_f : 0.0
  end

  # Segments with neither visual nor text are dropped; the rest are
  # separated by a blank line.
  def format_script
    segments.filter_map { |s| format_segment(s) }.join("\n\n")
  end

  # Renders one segment as up to two lines (visual first, then dialogue),
  # or nil when both are blank.
  def format_segment(segment)
    text = segment["text"].to_s.strip
    visual = segment["visual"].to_s.strip
    ts = format_timestamp(segment["start"].to_f)

    lines = []
    lines << "[#{ts}] [VISUAL] #{visual}" unless visual.empty?
    lines << "[#{ts}] #{text}" unless text.empty?

    lines.empty? ? nil : lines.join("\n")
  end

  # Formats seconds as zero-padded MM:SS. Minutes are not wrapped into
  # hours, so 3900 seconds renders as "65:00".
  def format_timestamp(seconds)
    total = seconds.to_i
    "%02d:%02d" % [total / 60, total % 60]
  end
end
73
+
74
# Command-line entry point: runs only when this file is executed directly.
# The transcript path is required; otherwise print usage and exit.
if $PROGRAM_NAME == __FILE__
  source_json = ARGV.first
  abort("usage: visual_script_extractor.rb <visual_transcript.json>") if source_json.nil?
  VisualScriptExtractor.extract(source_json)
end
@@ -3,88 +3,34 @@ name: transcribe-audio
3
3
  description: Transcribes video audio using WhisperX, preserving original timestamps. Creates JSON transcript with word-level timing. Use when you need to generate audio transcripts for videos.
4
4
  ---
5
5
 
6
- # Skill: Transcribe Audio
6
+ # Skill: Transcribe Audio (parent brief)
7
7
 
8
- Transcribes video audio using WhisperX and creates clean JSON transcripts with word-level timing data.
8
+ Transcribes video audio using WhisperX and produces a clean JSON transcript with word-level timing.
9
9
 
10
- ## When to Use
11
- - Videos need audio transcripts before visual analysis
10
+ `SKILL.md` is the parent's dispatch brief. The sub-agent's working prompt lives in `agent_prompt.md` — inline its contents when launching the Task agent. Don't pass `SKILL.md`.
12
11
 
13
- ## Critical Requirements
12
+ ## Parallelism
14
13
 
15
- Use WhisperX, NOT standard Whisper. WhisperX preserves the original video timeline including leading silence, ensuring transcripts match actual video timestamps. Run WhisperX directly on video files. Don't extract audio separately - this ensures timestamp alignment.
14
+ Launch at most **2 in parallel**. WhisperX is already multithreaded internally (~4 CPU threads via CTranslate2); 2 processes is the throughput-vs-RAM sweet spot on a 16GB Mac.
16
15
 
17
- ## Workflow
16
+ ## Inputs to gather and pass inline
18
17
 
19
- ### 1. Inputs from the parent
20
-
21
- This skill runs as a sub-agent. Do NOT read `library.yaml` or `settings.yaml` — the parent has that context and passes everything inline in your prompt. Expect these inputs:
18
+ The parent reads `library.yaml` and `settings.yaml` and passes these values inline in each agent's prompt:
22
19
 
23
20
  - `video_path` — absolute path to the video file
24
21
  - `transcript_output_dir` — where to write the transcript JSON (e.g. `libraries/<library>/transcripts`)
25
- - `language_code` — ISO 639-1 code already mapped by the parent (e.g. `en`, `es`)
26
- - `whisper_model` — model size from the parent (e.g. `small`, `medium`, `turbo`)
27
- - `transcript_refinement` — boolean; if `true`, the parent will also pass `user_context` and `footage_summary` strings for Step 4
28
- - `user_context` (only when refinement is on) — may be empty string
29
- - `footage_summary` (only when refinement is on) — may be empty string
30
-
31
- If any required input is missing from your prompt, stop and ask the parent rather than inferring it from the filesystem.
32
-
33
- ### 2. Run WhisperX
34
-
35
- ```bash
36
- whisperx "<video_path>" \
37
- --language <language_code> \
38
- --model <whisper_model> \
39
- --compute_type float32 \
40
- --device cpu \
41
- --output_format json \
42
- --output_dir <transcript_output_dir>
43
- ```
44
-
45
- ### 3. Prepare Audio Transcript
46
-
47
- After WhisperX completes, format the JSON using our prepare_audio_script:
48
-
49
- ```bash
50
- ruby .claude/skills/transcribe-audio/prepare_audio_script.rb \
51
- <transcript_output_dir>/<video_basename>.json \
52
- <video_path>
53
- ```
54
-
55
- This script:
56
- - Adds video source path as metadata
57
- - Removes unnecessary fields to reduce file size
58
- - Prettifies JSON
59
-
60
- ### 4. (Optional) Refine the transcript
61
-
62
- If the parent passed `transcript_refinement: true`, follow `.claude/skills/transcribe-audio/refine_instructions.md` using the `user_context` and `footage_summary` strings the parent supplied inline. Do NOT open `library.yaml`. If `transcript_refinement` is not set or is `false`, skip this step.
63
-
64
- ### 5. Return Success Response
65
-
66
- After audio preparation completes, return this structured response to the parent agent:
67
-
68
- ```
69
- ✓ <video_basename.mov> transcribed successfully
70
- Audio transcript: <transcript_output_dir>/<video_basename>.json
71
- Video path: <video_path>
72
- ```
73
-
74
- **DO NOT update library.yaml** - the parent agent will handle this to avoid race conditions when running multiple transcriptions in parallel.
75
-
76
- ## Running in Parallel
22
+ - `language_code` — ISO 639-1 code (e.g. `en`, `es`) — parent maps from library.yaml's `language` name
23
+ - `whisper_model` — model size from settings.yaml (e.g. `small`, `medium`, `turbo`)
24
+ - `transcript_refinement` — boolean from library.yaml. If `true`, also pass:
25
+ - `user_context` (may be empty string)
26
+ - `footage_summary` (may be empty string)
77
27
 
78
- This skill is designed to run inside a Task agent for parallel execution:
79
- - Each agent handles ONE video file
80
- - Multiple agents can run simultaneously
81
- - Parent thread updates library.yaml sequentially after each agent completes
82
- - No race conditions on shared YAML file
28
+ After the agent returns, update `library.yaml` with `transcript: <filename>.json`.
83
29
 
84
- ## Next Step
30
+ ## Next step
85
31
 
86
- After audio transcription, use the **analyze-video** skill to add visual descriptions and create the visual transcript.
32
+ Once all videos have audio transcripts, dispatch `analyze-video` for visual descriptions.
87
33
 
88
- ## Installation
34
+ ## Dependencies
89
35
 
90
- Ensure WhisperX is installed. Use the **setup** skill to verify dependencies.
36
+ WhisperX must be installed. Use the **setup** skill to verify.
@@ -0,0 +1,53 @@
1
+ # Transcribe Audio (sub-agent prompt)
2
+
3
+ You are a sub-agent. Transcribe one video file using WhisperX and produce a clean JSON transcript with word-level timing.
4
+
5
+ **Critical:** Use WhisperX, NOT standard Whisper. WhisperX preserves the original video timeline including leading silence, ensuring transcripts match actual video timestamps. Run WhisperX directly on the video file — don't extract audio separately.
6
+
7
+ ## Inputs (passed inline by the parent)
8
+
9
+ - `video_path` — absolute path to the video file
10
+ - `transcript_output_dir` — where to write the transcript JSON
11
+ - `language_code` — ISO 639-1 code (e.g. `en`, `es`)
12
+ - `whisper_model` — model size (e.g. `small`, `medium`, `turbo`)
13
+ - `transcript_refinement` — boolean; if `true`, also expect:
14
+ - `user_context` — string, may be empty
15
+ - `footage_summary` — string, may be empty
16
+
17
+ Do NOT read `library.yaml` or `settings.yaml`. If a required input is missing from your prompt, stop and ask the parent rather than inferring from the filesystem.
18
+
19
+ ## 1. Run WhisperX
20
+
21
+ ```bash
22
+ whisperx "<video_path>" \
23
+ --language <language_code> \
24
+ --model <whisper_model> \
25
+ --compute_type float32 \
26
+ --device cpu \
27
+ --output_format json \
28
+ --output_dir <transcript_output_dir>
29
+ ```
30
+
31
+ ## 2. Prepare audio transcript
32
+
33
+ ```bash
34
+ ruby .claude/skills/transcribe-audio/prepare_audio_script.rb \
35
+ <transcript_output_dir>/<video_basename>.json \
36
+ <video_path>
37
+ ```
38
+
39
+ This script adds the video source path as metadata, removes unnecessary fields, and prettifies the JSON.
40
+
41
+ ## 3. (Optional) Refine the transcript
42
+
43
+ If `transcript_refinement: true`, follow `.claude/skills/transcribe-audio/refine_instructions.md`, using the `user_context` and `footage_summary` strings the parent supplied inline. Do NOT open `library.yaml`. Skip if `transcript_refinement` is missing or `false`.
44
+
45
+ ## 4. Return success response
46
+
47
+ ```
48
+ ✓ <video_basename.mov> transcribed successfully
49
+ Audio transcript: <transcript_output_dir>/<video_basename>.json
50
+ Video path: <video_path>
51
+ ```
52
+
53
+ **Do NOT update library.yaml** — the parent handles all yaml I/O to avoid race conditions in parallel runs.
data/CLAUDE.md CHANGED
@@ -23,15 +23,16 @@ You are an AI video editor assistant working with a software engineer. You gener
23
23
  - If new: gather project information (library name, video file locations, language)
24
24
  - Create directory structure and library.yaml from template
25
25
  - Automatically start footage analysis after setup
26
- 2. **Transcribe** → Use `transcribe-audio` and `analyze-video` skills to process videos
26
+ 2. **Transcribe** → Use `transcribe-audio`, `analyze-video`, and `summarize-video` skills to process videos
27
27
  - First: `transcribe-audio` creates audio transcripts with WhisperX (word-level timing)
28
28
  - Then: `analyze-video` adds visual descriptions by extracting and analyzing frames
29
- - All videos must have BOTH audio transcripts AND visual transcripts before proceeding to rough cut or sequence creation
30
- - Visual transcripts are essential for B-roll selection, shot composition, and editorial decisions
31
- 3. **Edit** → Use `roughcut` skill to create timeline scripts from transcripts
32
- - **Rough cuts**: Multi-minute edits for full videos (typically 3-15+ minutes)
33
- - **Sequences**: 30-60 second clips that user will build to be imported into a larger video (created using the same roughcut skill with shorter target duration)
34
- - **PREREQUISITE:** Check library.yaml to verify all videos have visual_transcript populated
29
+ - Then: `summarize-video` generates a short markdown summary from each visual transcript
30
+ - All videos must have audio transcripts, visual transcripts, AND summaries before proceeding to rough cut or sequence creation
31
+ 3. **Edit** → Use `cut-planner` then `roughcut` to plan and build a timeline from transcripts
32
+ - `cut-planner` reads all summaries in the main thread, proposes 2–3 narrative options, iterates with the user, and writes an approved plan markdown file
33
+ - `roughcut` consumes that plan, spins up a sub-agent that reads the library directly, builds the YAML iteratively, reviews against format conventions, exports the XML, and returns conversational editorial notes the parent uses to dialogue with the user
34
+ - **Rough cuts**: 3–15+ min edits. **Sequences**: 30–60s clips. Same pair of skills, different target duration.
35
+ - **PREREQUISITE:** Check library.yaml to verify all videos have `visual_transcript` and `summary` populated
35
36
  4. **Backup** → Use `backup-library` skill to create compressed archives of all libraries
36
37
  - Creates timestamped ZIP backup of entire libraries directory
37
38
  - Backups are stored in `/backups/` and excluded from git
@@ -121,9 +122,11 @@ Ask the user these questions for new libraries one at a time (never all at once)
121
122
  mkdir -p libraries/[library-name]
122
123
  mkdir -p libraries/[library-name]/transcripts
123
124
  mkdir -p libraries/[library-name]/roughcuts
125
+ mkdir -p libraries/[library-name]/summaries
126
+ mkdir -p libraries/[library-name]/plans
124
127
  ```
125
128
 
126
- Note: A single `/tmp/` directory at the root is used for all temporary files. Create subdirectories as needed and delete after use.
129
+ Note: A single `tmp/` directory inside the buttercut project root is used for all temporary files. Create subdirectories as needed and delete after use.
127
130
 
128
131
  ### Create Library File
129
132
 
@@ -131,7 +134,7 @@ Duplicate `templates/library_template.yaml` to create `libraries/[library-name]/
131
134
 
132
135
  For each video file:
133
136
  1. Use `ffprobe` to get duration
134
- 2. Add entry to library.yaml with empty `transcript` and `visual_transcript`
137
+ 2. Add entry to library.yaml with empty `transcript`, `visual_transcript`, and `summary`
135
138
  3. Empty fields mean "todo", valid filenames mean "done"
136
139
 
137
140
  The `language` field stores the language code for all videos in this library.
@@ -144,16 +147,21 @@ After library setup completes, **automatically start analyzing all footage**:
144
147
 
145
148
  1. Inform user: "Library setup complete. Found [N] videos ([total size]). Starting footage analysis..."
146
149
  2. Read `libraries/settings.yaml` (for `whisper_model`) and the library's `library.yaml` (for `language`, `transcript_refinement`, `user_context`, `footage_summary`) ONCE in the parent thread. If any expected field is missing, run the appropriate migration first (see Critical Principles below).
147
- 3. Launch `transcribe-audio` agents (can run in parallel for multiple videos). Pass these values inline in each agent's prompt — the sub-agent never reads `library.yaml` or `settings.yaml`:
150
+ 3. Launch `transcribe-audio` agents. Pass these values inline in each agent's prompt:
148
151
  - `video_path`, `transcript_output_dir`, `language_code`, `whisper_model`
149
152
  - `transcript_refinement` (boolean). If `true`, also pass the current `user_context` and `footage_summary` strings (empty strings are fine — refinement still catches nonsense-token and self-witness fixes).
150
153
  4. As each agent completes, update library.yaml with `transcript` (filename only, not full path).
151
- 5. After all audio transcripts complete, launch `analyze-video` agents (can run in parallel) following the same "parent passes context inline" contract. Pass inline: `video_path`, `audio_transcript_path`, `visual_transcript_path`.
154
+ 5. After all audio transcripts complete, launch `analyze-video` agents. Pass inline: `video_path`, `audio_transcript_path`, `visual_transcript_path`.
152
155
  6. As each agent completes, update library.yaml with `visual_transcript` (filename only, not full path).
153
- 7. Analyze ALL videos before offering to create rough cuts.
154
- 8. **After all analysis completes, automatically create a backup** using the `backup-library` skill.
156
+ 7. After all visual transcripts complete, summarize each video using the `summarize-video` skill on the **Haiku model**:
157
+ - For each video, first pre-create a skeleton file in the parent: `ruby .claude/skills/summarize-video/summary_skeleton.rb <visual_transcript_path> <summary_output_path>`
158
+ - Then launch the agent passing inline: `visual_transcript_path`, `summary_output_path` (e.g., `libraries/[library-name]/summaries/summary_[videoname].md`)
159
+ - The agent fills the four placeholders via Edit. The skeleton + Edit pattern is required: without it, Haiku frequently refuses Write and dumps markdown into its reply instead.
160
+ 8. As each agent completes, update library.yaml with `summary` (filename only, not full path).
161
+ 9. Analyze ALL videos before offering to create rough cuts.
162
+ 10. **After all analysis completes, automatically create a backup** using the `backup-library` skill.
155
163
 
156
- **Contract: sub-agents don't read `library.yaml`.** The parent owns `library.yaml` (and `settings.yaml`) it reads once, passes values inline, and writes results once per agent completion. Sub-agents should not even know those files exist. This keeps the context boundary clean and avoids race conditions when many agents run in parallel.
164
+ **Contract: sub-agents receive `agent_prompt.md`, not `SKILL.md`.** For parallelizable skills (`transcribe-audio`, `analyze-video`, `summarize-video`), the parent reads `SKILL.md` for dispatch info (parallelism cap, required inputs) and inlines `agent_prompt.md` into the sub-agent's prompt. `SKILL.md` is parent-only.
157
165
 
158
166
  **Note on refinement:** When `transcript_refinement: true`, each `transcribe-audio` agent reviews and corrects its transcript in place before returning, using the `user_context` and `footage_summary` the parent passed in. Empty context strings are fine — the agent still runs and catches nonsense-token and self-witness fixes. The parent still only writes `transcript: <filename>.json` to `library.yaml` after the agent completes.
159
167
 
@@ -181,7 +189,8 @@ When processing multiple videos, use parallel agents for maximum throughput:
181
189
  - Run WhisperX or frame extraction.
182
190
  - Prepare and clean transcript JSON.
183
191
  - Return structured response with file paths.
184
- - DO NOT read `library.yaml` or `settings.yaml`, and DO NOT update `library.yaml` (parent handles all yaml I/O).
192
+
193
+ Each skill's `agent_prompt.md` documents its own IO contract — including whether the sub-agent reads or writes `library.yaml`.
185
194
 
186
195
  3. **Benefits:**
187
196
  - Multiple videos process simultaneously
@@ -200,6 +209,7 @@ Known migration triggers (match each to a `scripts/NNN_migrate_*.rb` script via
200
209
  - `editor` missing (added in 0.4.0)
201
210
  - `transcript_refinement` missing (added in [Unreleased]; missing means "predates the feature, default to `false`" — NOT the template default of `true`)
202
211
  - `footage_summary` missing OR old name `footage_description` present (renamed in [Unreleased])
212
+ - video entries with `summary` missing (added in [Unreleased]; missing means "todo", default to empty string)
203
213
  - video entries with `transcript_path` / `visual_transcript_path` (renamed to `transcript` / `visual_transcript` in 0.3.0)
204
214
  - video entries with `file_size_mb` (removed in 0.3.0)
205
215
 
@@ -209,7 +219,9 @@ A missing field is not the same as a field set to the template default — the t
209
219
 
210
220
  **Use actual filenames.** Never use generic labels like "Video 1" or "Clip A" - always reference actual filenames like "DJI_20250423171212_0210_D.mov" for clear traceability.
211
221
 
212
- **Visual transcripts are mandatory.** Before creating any rough cut or sequence, verify ALL videos have both audio and visual transcripts. Check `library.yaml` - every video entry must have a `visual_transcript` with a filename (not empty or null or ""). Transcripts are stored in `libraries/[library-name]/transcripts/`. Visual descriptions are essential for shot selection, pacing decisions, and B-roll placement.
222
+ **Visual transcripts and summaries are mandatory.** Before creating any rough cut or sequence, verify ALL videos have audio transcripts, visual transcripts, AND summaries. Check `library.yaml` — every video entry must have `visual_transcript` and `summary` with filenames (not empty, null, or ""). Transcripts are stored in `libraries/[library-name]/transcripts/`; summaries in `libraries/[library-name]/summaries/`. Visual descriptions and summaries are essential for shot selection and pacing decisions.
223
+
224
+ **Single-track timelines only.** ButterCut produces one sequential video track. Each clip's own audio plays during that clip — there is no second video track for cutaways layered over a continuing voiceover, and no separate audio track. When planning or pitching cuts, never propose "B-roll over VO," "story under meetup footage," picture-in-picture, or any structure that assumes a clip's audio continues while different visuals play on top. Cutaways are fine, but they're hard cuts: when you cut to the wide shot, you cut to that shot's audio too. Plan every cut as a strictly linear sequence of clips.
213
225
 
214
226
  **Be curious and ask questions.** Occasionally ask users questions about their libraries and footage to better understand context, creative intent, and preferences. When you receive answers, add this information to the `user_context` key in the library.yaml file. This builds institutional knowledge that improves future rough cut and sequence decisions and helps maintain continuity across editing sessions.
215
227
 
@@ -219,7 +231,7 @@ A missing field is not the same as a field set to the template default — the t
219
231
  - Flag areas needing human judgment rather than making assumptions
220
232
  - When you have lots of videos to process (dozens or hundreds isn't out of the ordinary), create a reasonable task list with 5 tasks and then a final task that says to check the yaml processing file to see if you need to then generate more tasks. This way users can see progress and the agent doesn't get overwhelmed.
221
233
  - Generally avoid writing one-off scripts, but if you do need to write one, write it in Ruby unless you have a very strong reason to write in another language.
222
- - Only run 4 parallel tasks at a time.
234
+ - Parallelism caps live in each skill's `SKILL.md` (parent brief). Read it before dispatching.
223
235
  - Whenever you export XML files, include a datetime timestamp in the filename so it's clear when they were generated.
224
236
 
225
237
  ## Programming Style
@@ -255,6 +267,24 @@ ButterCut is designed to be simple, automatic and geared toward working with non
255
267
 
256
268
  The user should not need to understand video codecs, frame rates, or FCPXML structure - just provide file paths and get working XML. We should talk to the user from a video editing perspective, not a technical software engineer perspective.
257
269
 
270
+ ### Vocabulary — talk like an editor, not a developer
271
+
272
+ The user is generally a video editor, not a programmer. They don't need to know what file the cut lives in, what tool transcribed their audio, or which skill or sub-agent is doing the work behind the scenes. Implementation details are for the codebase; user-facing chat stays in the language of video editing. When in doubt, drop the technical noun entirely and just say what's happening. Skills, code, and other internal files should stay technical, but keep that detail out of conversation with the user.
273
+
274
+ Editor vocabulary that's always fine: rough cut, sequence, scene, beat, timeline, B-roll, cutaway, shot, take, transcript, footage, library, clip, splice, Final Cut, Premiere, Resolve.
275
+
276
+ Don't say → say (one per category — generalize the pattern, don't treat as a lookup table):
277
+
278
+ - *File/format nouns:* "I'll update the YAML" / "regenerate the FCPXML" → "I'll update the cut" / "I'll re-export it for Final Cut"
279
+ - *Architecture nouns:* "I'll spin up a sub-agent" / "running the roughcut skill" / "the parent thread" → just speak in first person ("I'll build the cut")
280
+ - *Tools and models:* "WhisperX will transcribe" / "running ffmpeg" / "I used Haiku for the summary" → "I'll transcribe the audio" / "I'll analyze the visuals" (don't name models)
281
+ - *Internal field names:* "I'll update footage_summary" / "transcript_refinement is true" → "I'll note that about your footage" / "I'll proofread the transcripts"
282
+ - *Paths in casual chat:* `.fcpxml`, `.json`, `libraries/foo/transcripts/…` → name the artifact ("the Final Cut export", "the transcript") and only show the path at final delivery or when the user needs to grab the file
283
+
284
+ Two exceptions where technical detail IS appropriate:
285
+ 1. The user explicitly asks ("where is it saved?", "what format?") — answer plainly.
286
+ 2. Final delivery summary — naming the export file path is genuinely useful so they can find it.
287
+
258
288
  ## Development Commands
259
289
 
260
290
  ### Testing
@@ -275,4 +305,4 @@ bundle exec rspec spec/buttercut_spec.rb:10
275
305
 
276
306
  ## Claude Skills
277
307
 
278
- When creating new Claude skills, aim to keep them to 50 lines. Only very complicated skills (ie transcription and roughcuts) should be larger than that. If the skill is complicated and seems like it can't be explained in 50 lines, consider if they should be broken up across multiple skills or if the complexity can be contained inside a ruby script saved adjacent to the skill.
308
+ When creating new Claude skills, aim to keep them as brief as possible. Use active voice to help condense instructions. Use simple, plain language.
@@ -1,3 +1,3 @@
1
1
  class ButterCut
2
- VERSION = "0.5.0"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -20,3 +20,4 @@ videos:
20
20
  duration: "00:05:32"
21
21
  transcript: # filename only (stored in libraries/[library-name]/transcripts/)
22
22
  visual_transcript: # filename only (visual_*.json with frame descriptions)
23
+ summary: # filename only (summary_*.md overview stored in libraries/[library-name]/summaries/)
@@ -0,0 +1,53 @@
1
+ <!--
2
+ Cut Plan Template — written by `cut-planner`, consumed by `roughcut`.
3
+
4
+ Fill in every section. Delete this comment block before saving.
5
+ The plan is editorial direction; the build agent picks the exact clips
6
+ and timestamps inside each beat.
7
+ -->
8
+
9
+ # [Working title]
10
+
11
+ ## Concept
12
+ <!-- 1–2 sentences on the angle, tone, or arc. What story is this cut telling? -->
13
+
14
+ ## Format
15
+ <!-- Vlog, YouTube Short, long-form, documentary, talking-head, montage, etc.
16
+ Include any pacing or tonal cues that flow from the format. -->
17
+
18
+ ## Target Duration
19
+ <!-- Approximate runtime, e.g. "4–6 min" or "45–60s sequence". -->
20
+
21
+ ## Beats
22
+ <!--
23
+ Ordered list. Each beat is one editorial unit with intent and footage suggestions. Beats are direction, not paper-cut timestamps.
24
+
25
+ For an 8 minute vlog, you might aim for something like 60 seconds per beat, with both good footage for a-roll and b-roll.
26
+
27
+ For other types of videos use your best editorial judgement, thinking about what is common in the genre you're working with. You can also talk to the user directly to determine what they want.
28
+ -->
29
+
30
+ ### 1. [Beat name]
31
+ - **Intent:** what this beat does for the story (open, escalate, turn, payoff, etc.)
32
+ - **Approx. share:** rough fraction of runtime (e.g. "~30s", "~2 min", "~15% of total")
33
+ - **Footage suggestions:** filenames likely to feed this beat (e.g. `DJI_56738.mov`, `panasonic_1234.mov`). The build agent may swap in stronger moments from elsewhere.
34
+
35
+ ### 2. [Beat name]
36
+ - **Intent:**
37
+ - **Approx. share:**
38
+ - **Footage suggestions:**
39
+
40
+ <!-- Add more beats as needed. A 6 minute video might have 4-6 beats. You'll need to use your judgement about the footage availability, target duration and cut you're making. -->
41
+
42
+ ## Required Dialogue
43
+ <!--
44
+ Lines the user specifically wants in. Two flavors are both fine:
45
+
46
+ - **Exact quote:** "Here's how I learned to juggle." (`source_file_if_known.mov`)
47
+ - **Lossy reference:** "Include the bit about Kailey's uncle the magician teaching her to juggle before he died." (`file_1.mov, file_2.mov, file_5.mov`)
48
+
49
+ Leave this section empty if no specific lines are required.
50
+ -->
51
+
52
+ ## Notes for the Build
53
+ <!-- Any constraints, things to avoid, or judgment calls the build agent should know — single-track timeline assumptions, must-not-include footage, tone preferences, etc. Include decisions or direction from the user. -->
@@ -1,25 +1,8 @@
1
1
  # Rough Cut Template
2
2
  # This template defines the structure for video rough cuts
3
3
 
4
- # User-facing description of what this rough cut contains
5
- description: "Brief description of the rough cut - what story it tells, target duration, and editorial approach"
6
-
7
- # Working notes for the agent during rough cut creation
8
- notes: |
9
- Working notes area for editorial decisions, narrative structure planning,
10
- pacing considerations, and any issues or concerns identified during editing.
11
-
12
- Consider:
13
- - Story arc and key narrative beats
14
- - Pacing and rhythm
15
- - Transitions between segments
16
- - B-roll placement opportunities
17
- - Audio/dialogue clarity
18
-
19
- # Coverage summary of available footage
20
- footage_coverage: |
21
- Overview of what footage is available and how it could be used.
22
- Include notes about strongest segments, potential issues, and creative opportunities.
4
+ # One-line summary of what this cut is — useful when scanning a folder of cuts
5
+ description: "Brief one-line summary of this cut — what it is and roughly how long"
23
6
 
24
7
  # The actual rough cut - ordered list of clips to use
25
8
  clips:
@@ -39,4 +22,4 @@ clips:
39
22
  # Rough cut metadata
40
23
  metadata:
41
24
  created_date: "" # Will be populated when rough cut is created
42
- total_duration: "" # Calculated from all clip durations
25
+ total_duration: "" # Calculated from all clip durations
@@ -8,3 +8,6 @@ editor: fcpx
8
8
  # turbo is nearly as accurate as large-v3 but significantly faster
9
9
  # Recommended: `small` paired with transcript_refinement (set per-library in library.yaml)
10
10
  whisper_model: small
11
+
12
+ # After exporting a roughcut, also drop a copy of the XML on the Desktop for easy import
13
+ save_to_desktop_after_export: true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: buttercut
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Ford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-25 00:00:00.000000000 Z
11
+ date: 2026-05-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -66,18 +66,25 @@ files:
66
66
  - ".claude/settings.json"
67
67
  - ".claude/settings.local.json"
68
68
  - ".claude/skills/analyze-video/SKILL.md"
69
+ - ".claude/skills/analyze-video/agent_prompt.md"
69
70
  - ".claude/skills/analyze-video/prepare_visual_script.rb"
70
71
  - ".claude/skills/backup-library/SKILL.md"
71
72
  - ".claude/skills/backup-library/backup_libraries.rb"
73
+ - ".claude/skills/cut-planner/SKILL.md"
72
74
  - ".claude/skills/release/SKILL.md"
73
75
  - ".claude/skills/roughcut/SKILL.md"
74
- - ".claude/skills/roughcut/agent_instructions.md"
76
+ - ".claude/skills/roughcut/agent_prompt.md"
75
77
  - ".claude/skills/roughcut/export_to_fcpxml.rb"
76
78
  - ".claude/skills/setup/SKILL.md"
77
79
  - ".claude/skills/setup/advanced-setup.md"
78
80
  - ".claude/skills/setup/simple-setup.md"
79
81
  - ".claude/skills/setup/verify_install.rb"
82
+ - ".claude/skills/summarize-video/SKILL.md"
83
+ - ".claude/skills/summarize-video/agent_prompt.md"
84
+ - ".claude/skills/summarize-video/summary_skeleton.rb"
85
+ - ".claude/skills/summarize-video/visual_script_extractor.rb"
80
86
  - ".claude/skills/transcribe-audio/SKILL.md"
87
+ - ".claude/skills/transcribe-audio/agent_prompt.md"
81
88
  - ".claude/skills/transcribe-audio/prepare_audio_script.rb"
82
89
  - ".claude/skills/transcribe-audio/refine_instructions.md"
83
90
  - ".claude/skills/update-buttercut/SKILL.md"
@@ -91,6 +98,7 @@ files:
91
98
  - lib/buttercut/fcpx.rb
92
99
  - lib/buttercut/version.rb
93
100
  - templates/library_template.yaml
101
+ - templates/plan_template.md
94
102
  - templates/roughcut_template.yaml
95
103
  - templates/settings_template.yaml
96
104
  homepage: https://github.com/andrewford/buttercut