buttercut 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c4de766d8aca2ec00b99b20abd177310ade3b5145424677bf6b3e515487960b2
4
- data.tar.gz: 97429c1df91a51ef44a921a0d2f3e8140adfa2c6c0943abf643d0967cea2e4bc
3
+ metadata.gz: 78b845e8b54d03aee93f00bdbaa96f140d0f10c94910a117352b7401cf30bf63
4
+ data.tar.gz: 0eb609100a9e2f367b493d6aef9f45d08e784ad573c4d033733009be40ccc525
5
5
  SHA512:
6
- metadata.gz: a6c30d9d4038725ef7b63cc15d5de34a8fc85e775bb3907896dae7d1bbde9f87abe7da263dec07ce0a27c7ec6b6d5940db00dc1bf40b24763a49c5381a9961b4
7
- data.tar.gz: '029612e07d6fb9e806aecd10f5d89a95557d23e151a4fe53cb3ab393cd05b45ad89908881c982f6b7922e6227399306cbc51a4ef257404637b84f51a52c5e757'
6
+ metadata.gz: 33eb34693818323f40900bcea272781391c372fc5bf9adb71f11632da83aa3de5936538150c69a9d5407f5e5a6059342eed23eed5683e6b1fec36d9a55b37d2a
7
+ data.tar.gz: f022d15eb198cb32dde550d0120598cf5d8eb7f95145e21bfb70401df1c762bcd4474f4836966601aa46ce232e2bd364a25cc8d1187ad2c1e7edf02164ca1789
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ # Extract the plain-text script from a WhisperX-style transcript JSON.
3
+ #
4
+ # Usage:
5
+ # ruby .claude/scripts/script_extractor.rb <transcript.json> <output.txt>
6
+ #
7
+ # Output is one segment per paragraph (blank line between), trimmed, suitable
8
+ # for proofreading by a human or a sub-agent without the overhead of the full
9
+ # transcript JSON (word-level timing, scores, etc.).
10
+
11
+ require 'json'
12
+
13
+ class ScriptExtractor # Converts a transcript JSON file into a plain-text script: one paragraph per non-empty segment.
14
+ def self.extract(transcript_path, output_path) # Convenience entry point: builds an instance and runs the instance-level extract.
15
+ new(transcript_path, output_path).extract
16
+ end
17
+
18
+ def initialize(transcript_path, output_path) # Raises ArgumentError when either path is nil or empty; no file access happens here.
19
+ raise ArgumentError, "transcript_path is required" if transcript_path.nil? || transcript_path.empty?
20
+ raise ArgumentError, "output_path is required" if output_path.nil? || output_path.empty?
21
+ @transcript_path = transcript_path
22
+ @output_path = output_path
23
+ end
24
+
25
+ def extract # Writes the formatted script to output_path, then prints a one-line size summary.
26
+ write_output(format_script)
27
+ report
28
+ end
29
+
30
+ private
31
+
32
+ attr_reader :transcript_path, :output_path
33
+
34
+ def data # Parsed transcript JSON, memoized so the file is read and parsed at most once per instance.
35
+ @data ||= JSON.parse(File.read(transcript_path))
36
+ end
37
+
38
+ def segments # Value of the top-level "segments" key; raises RuntimeError when that key is missing or null.
39
+ data["segments"] or raise "transcript JSON has no 'segments' key: #{transcript_path}"
40
+ end
41
+
42
+ def format_script # Returns the script text: stripped segment texts joined by blank lines, ending in a single newline.
43
+ paragraphs = segments.map { |s| s["text"].to_s.strip }.reject(&:empty?) # to_s turns a missing "text" into "", which is then dropped.
44
+ paragraphs.join("\n\n") + "\n"
45
+ end
46
+
47
+ def write_output(text) # Writes text to output_path, replacing any existing file content.
48
+ File.write(output_path, text)
49
+ end
50
+
51
+ def report # Prints output and source sizes in KB (one decimal place) plus the segment count.
52
+ in_kb = (File.size(transcript_path) / 1024.0).round(1)
53
+ out_kb = (File.size(output_path) / 1024.0).round(1)
54
+ puts "Extracted script: #{output_path} (#{out_kb} KB from #{in_kb} KB source, #{segments.size} segments)" # NOTE: segments.size counts every segment in the JSON, including ones omitted from the output because their text was empty.
55
+ end
56
+ end
57
+
58
+ if __FILE__ == $PROGRAM_NAME # CLI entry point: runs only when this file is executed directly, not when it is required.
59
+ transcript_path, output_path = ARGV # Takes the first two command-line arguments; any extras are ignored.
60
+ abort("usage: script_extractor.rb <transcript.json> <output.txt>") unless transcript_path && output_path
61
+ abort("file not found: #{transcript_path}") unless File.file?(transcript_path)
62
+ if File.expand_path(output_path) == File.expand_path(transcript_path) # Compare absolute paths so the transcript is never overwritten by its own output.
63
+ abort("output path must differ from transcript path: #{transcript_path}")
64
+ end
65
+ ScriptExtractor.extract(transcript_path, output_path)
66
+ end
@@ -1,6 +1,9 @@
1
1
  {
2
2
  "permissions": {
3
3
  "allow": [
4
+ "Agent",
5
+ "Read(tmp/**)",
6
+ "Write(tmp/**)",
4
7
  "Bash(./.claude/skills/roughcut/combine_visual_transcripts.rb:*)",
5
8
  "Bash(./.claude/skills/roughcut/export_to_fcpxml.rb:*)",
6
9
  "Skill(backup-library)",
@@ -22,7 +25,9 @@
22
25
  "Bash(git worktree add:*)",
23
26
  "Bash(cat:*)",
24
27
  "Bash(python3:*)",
25
- "Bash(gh api:*)"
28
+ "Bash(gh api:*)",
29
+ "Bash(gh pr:*)",
30
+ "Bash(cp *)"
26
31
  ],
27
32
  "deny": [],
28
33
  "ask": []
@@ -13,16 +13,24 @@ Videos must have audio transcripts. Run **transcribe-audio** skill first if need
13
13
 
14
14
  ## Workflow
15
15
 
16
- ### 1. Copy & Clean Audio Transcript
16
+ ### 1. Inputs from the parent
17
+
18
+ This skill runs as a sub-agent. Do NOT read `library.yaml` or `settings.yaml` — the parent has that context and passes everything inline in your prompt. Expect these inputs:
19
+
20
+ - `video_path` — absolute path to the video file
21
+ - `audio_transcript_path` — absolute path to the prepared audio transcript JSON
22
+ - `visual_transcript_path` — absolute path to write the visual transcript JSON
23
+
24
+ ### 2. Copy & Clean Audio Transcript
17
25
 
18
26
  Don't read the audio transcript, just copy it and then prepare it by using the prepare_visual_script.rb file. This removes word-level timing data and prettifies the JSON for easier editing:
19
27
 
20
28
  ```bash
21
- cp libraries/[library]/transcripts/video.json libraries/[library]/transcripts/visual_video.json
22
- ruby .claude/skills/analyze-video/prepare_visual_script.rb libraries/[library]/transcripts/visual_video.json
29
+ cp <audio_transcript_path> <visual_transcript_path>
30
+ ruby .claude/skills/analyze-video/prepare_visual_script.rb <visual_transcript_path>
23
31
  ```
24
32
 
25
- ### 2. Extract Frames (Binary Search)
33
+ ### 3. Extract Frames (Binary Search)
26
34
 
27
35
  Create frame directory: `mkdir -p tmp/frames/[video_name]`
28
36
 
@@ -37,11 +45,11 @@ ffmpeg -ss 00:00:02 -i video.mov -vframes 1 -vf "scale=1280:-1" tmp/frames/[vide
37
45
  **Stop when:** The footage no longer seems to be changing or only has minor changes
38
46
  **Never sample** more frequently than once per 30 seconds
39
47
 
40
- ### 3. Add Visual Descriptions
48
+ ### 4. Add Visual Descriptions
41
49
 
42
50
  Read the visual video json file that you created earlier.
43
51
 
44
- **Read the JPG frames** from `tmp/frames/[video_name]/` using Read tool, then **Edit** `visual_video.json`:
52
+ **Read the JPG frames** from `tmp/frames/[video_name]/` using Read tool, then **Edit** the file at `<visual_transcript_path>`:
45
53
 
46
54
  Do these incrementally. You don't need to create a program or script to do this, just incrementally edit the json whenever you read new frames.
47
55
 
@@ -73,7 +81,7 @@ Do these incrementally. You don't need to create a program or script to do this,
73
81
  - First segment: detailed (subject, setting, shot type, lighting, camera style)
74
82
  - Continuing shots: brief if similar, otherwise can be up to 3 sentences if drastically different.
75
83
 
76
- ### 4. Cleanup & Return
84
+ ### 5. Cleanup & Return
77
85
 
78
86
  ```bash
79
87
  rm -rf tmp/frames/[video_name]
@@ -82,8 +90,8 @@ rm -rf tmp/frames/[video_name]
82
90
  Return structured response:
83
91
  ```
84
92
  ✓ [video_filename.mov] analyzed successfully
85
- Visual transcript: libraries/[library]/transcripts/visual_video.json
86
- Video path: /full/path/to/video_filename.mov
93
+ Visual transcript: <visual_transcript_path>
94
+ Video path: <video_path>
87
95
  ```
88
96
 
89
97
  **DO NOT update library.yaml** - parent agent handles this to avoid race conditions in parallel execution.
@@ -29,7 +29,7 @@ class LibraryBackup
29
29
 
30
30
  files = Dir.glob(File.join(@libraries_dir, '**', '*')).select { |f| File.file?(f) }
31
31
 
32
- Zip::File.open(backup_path, Zip::File::CREATE) do |zipfile|
32
+ Zip::File.open(backup_path, create: true) do |zipfile|
33
33
  files.each do |file|
34
34
  zipfile.add(file.sub("#{File.dirname(@libraries_dir)}/", ''), file)
35
35
  end
@@ -59,7 +59,17 @@ class ButterCut
59
59
  end
60
60
  ```
61
61
 
62
- ### 5. Gather Changelog Notes
62
+ ### 5. Update Gemfile.lock
63
+
64
+ Run `bundle install` so `Gemfile.lock` reflects the new version:
65
+
66
+ ```bash
67
+ bundle install
68
+ ```
69
+
70
+ Verify the version updated in `Gemfile.lock` before proceeding.
71
+
72
+ ### 6. Gather Changelog Notes
63
73
 
64
74
  Ask user for release notes. Prompt with:
65
75
  - What changed in this release?
@@ -67,7 +77,7 @@ Ask user for release notes. Prompt with:
67
77
  - Any bug fixes?
68
78
  - Any breaking changes?
69
79
 
70
- ### 6. Update or Create CHANGELOG.md
80
+ ### 7. Update or Create CHANGELOG.md
71
81
 
72
82
  If `CHANGELOG.md` exists, prepend new entry. Otherwise create it:
73
83
 
@@ -89,14 +99,14 @@ All notable changes to ButterCut will be documented in this file.
89
99
  - Improved W
90
100
  ```
91
101
 
92
- ### 7. Commit Version Bump
102
+ ### 8. Commit Version Bump
93
103
 
94
104
  ```bash
95
- git add lib/buttercut/version.rb CHANGELOG.md
105
+ git add lib/buttercut/version.rb Gemfile.lock CHANGELOG.md
96
106
  git commit -m "Bump version to 0.2.0"
97
107
  ```
98
108
 
99
- ### 8. Create and Push Git Tag
109
+ ### 9. Create and Push Git Tag
100
110
 
101
111
  ```bash
102
112
  git tag v0.2.0
@@ -104,7 +114,7 @@ git push origin main
104
114
  git push origin v0.2.0
105
115
  ```
106
116
 
107
- ### 9. Build Gem
117
+ ### 10. Build Gem
108
118
 
109
119
  ```bash
110
120
  gem build buttercut.gemspec
@@ -112,7 +122,7 @@ gem build buttercut.gemspec
112
122
 
113
123
  This creates `buttercut-0.2.0.gem` file.
114
124
 
115
- ### 10. Publish to RubyGems
125
+ ### 11. Publish to RubyGems
116
126
 
117
127
  **First time setup check:**
118
128
 
@@ -133,7 +143,7 @@ gem push buttercut-0.2.0.gem
133
143
 
134
144
  This makes the gem available for `gem install buttercut` worldwide.
135
145
 
136
- ### 11. Create GitHub Release
146
+ ### 12. Create GitHub Release
137
147
 
138
148
  **Using GitHub CLI:**
139
149
  ```bash
@@ -155,21 +165,21 @@ Guide user through manual release creation:
155
165
 
156
166
  Then wait for user confirmation that release is created before proceeding to cleanup.
157
167
 
158
- ### 12. Cleanup
168
+ ### 13. Cleanup
159
169
 
160
170
  ```bash
161
171
  # Remove local gem file (it's on RubyGems and GitHub now)
162
172
  rm buttercut-0.2.0.gem
163
173
  ```
164
174
 
165
- ### 13. Verify Release
175
+ ### 14. Verify Release
166
176
 
167
177
  Check that everything worked:
168
178
  - RubyGems page: https://rubygems.org/gems/buttercut
169
179
  - GitHub releases: https://github.com/andrewford/buttercut/releases
170
180
  - Git tags: `git tag -l`
171
181
 
172
- ### 14. Return Success Response
182
+ ### 15. Return Success Response
173
183
 
174
184
  Provide summary:
175
185
  ```
@@ -82,7 +82,7 @@ Each clip needs:
82
82
 
83
83
  ### 5. Export to Video Editor
84
84
 
85
- Check `library.yaml` for the `editor` field. If it's set, use that value. If it's not set or empty, ask the user for their editor choice (Final Cut Pro X, Adobe Premiere Pro, or DaVinci Resolve), then save their choice back to `library.yaml` (`fcpx`, `premiere`, or `resolve`).
85
+ Check `library.yaml` for the `editor` field. If it's set, use that value. If it's not set or empty, check `libraries/settings.yaml` for the default `editor` value and use that (also save it back to `library.yaml`). If neither has an editor set, ask the user for their editor choice (Final Cut Pro X, Adobe Premiere Pro, or DaVinci Resolve), then save their choice back to both `library.yaml` and `libraries/settings.yaml`.
86
86
 
87
87
  Export based on choice:
88
88
  ```bash
@@ -102,6 +102,31 @@ def main
102
102
  generator.save(output_path)
103
103
 
104
104
  puts "\n✓ Rough cut exported to: #{output_path}"
105
+
106
+ validate_fcpxml(output_path) if editor_symbol == :fcpx
107
+ end
108
+
109
+ def validate_fcpxml(xml_path) # Validates xml_path against FCPXMLv1_8.dtd with xmllint; prints a notice and returns when the DTD or xmllint is unavailable, exits with status 1 when validation fails.
110
+ dtd_path = File.expand_path('../../../dtd/FCPXMLv1_8.dtd', __dir__) # Resolved relative to this file's directory, not the current working directory.
111
+ unless File.exist?(dtd_path)
112
+ puts "⚠ DTD not found at #{dtd_path}; skipping validation."
113
+ return
114
+ end
115
+
116
+ unless system('command -v xmllint > /dev/null 2>&1') # Shell lookup of xmllint on PATH; a falsy result means it could not be found.
117
+ puts "⚠ xmllint not found; skipping validation."
118
+ return
119
+ end
120
+
121
+ # xmllint prints errors to stderr; --noout suppresses the doc dump on success. The trailing 2>&1 folds stderr into the captured output. NOTE(review): both paths are interpolated into a shell string, so a path containing a double quote, a dollar sign or a backtick would be interpreted by the shell; consider Open3.capture2e with an argument list.
122
+ output = `xmllint --noout --dtdvalid "#{dtd_path}" "#{xml_path}" 2>&1`
123
+ if $?.success?
124
+ puts "✓ FCPXML validates against FCPXMLv1_8.dtd"
125
+ else
126
+ warn "✗ FCPXML failed DTD validation:"
127
+ warn output
128
+ exit 1 # Terminates the whole process with a failing status; the skip paths above only print and return.
129
+ end
105
130
  end
106
131
 
107
132
  main
@@ -16,27 +16,30 @@ Use WhisperX, NOT standard Whisper. WhisperX preserves the original video timeli
16
16
 
17
17
  ## Workflow
18
18
 
19
- ### 1. Read Language from Library File
19
+ ### 1. Inputs from the parent
20
20
 
21
- Read the library's `library.yaml` to get the language code:
21
+ This skill runs as a sub-agent. Do NOT read `library.yaml` or `settings.yaml` — the parent has that context and passes everything inline in your prompt. Expect these inputs:
22
22
 
23
- ```yaml
24
- # Library metadata
25
- library_name: [library-name]
26
- language: en # Language code stored here
27
- ...
28
- ```
23
+ - `video_path` — absolute path to the video file
24
+ - `transcript_output_dir` — where to write the transcript JSON (e.g. `libraries/<library>/transcripts`)
25
+ - `language_code` — ISO 639-1 code already mapped by the parent (e.g. `en`, `es`)
26
+ - `whisper_model` — model size from the parent (e.g. `small`, `medium`, `turbo`)
27
+ - `transcript_refinement` — boolean; if `true`, the parent will also pass `user_context` and `footage_summary` strings for Step 4
28
+ - `user_context` (only when refinement is on) — may be empty string
29
+ - `footage_summary` (only when refinement is on) — may be empty string
30
+
31
+ If any required input is missing from your prompt, stop and ask the parent rather than inferring it from the filesystem.
29
32
 
30
33
  ### 2. Run WhisperX
31
34
 
32
35
  ```bash
33
- whisperx "/full/path/to/video.mov" \
34
- --language en \
35
- --model medium \
36
+ whisperx "<video_path>" \
37
+ --language <language_code> \
38
+ --model <whisper_model> \
36
39
  --compute_type float32 \
37
40
  --device cpu \
38
41
  --output_format json \
39
- --output_dir libraries/[library-name]/transcripts
42
+ --output_dir <transcript_output_dir>
40
43
  ```
41
44
 
42
45
  ### 3. Prepare Audio Transcript
@@ -45,8 +48,8 @@ After WhisperX completes, format the JSON using our prepare_audio_script:
45
48
 
46
49
  ```bash
47
50
  ruby .claude/skills/transcribe-audio/prepare_audio_script.rb \
48
- libraries/[library-name]/transcripts/video_name.json \
49
- /full/path/to/original/video_name.mov
51
+ <transcript_output_dir>/<video_basename>.json \
52
+ <video_path>
50
53
  ```
51
54
 
52
55
  This script:
@@ -54,14 +57,18 @@ This script:
54
57
  - Removes unnecessary fields to reduce file size
55
58
  - Prettifies JSON
56
59
 
57
- ### 4. Return Success Response
60
+ ### 4. (Optional) Refine the transcript
61
+
62
+ If the parent passed `transcript_refinement: true`, follow `.claude/skills/transcribe-audio/refine_instructions.md` using the `user_context` and `footage_summary` strings the parent supplied inline. Do NOT open `library.yaml`. If `transcript_refinement` is not set or is `false`, skip this step.
63
+
64
+ ### 5. Return Success Response
58
65
 
59
66
  After audio preparation completes, return this structured response to the parent agent:
60
67
 
61
68
  ```
62
- [video_filename.mov] transcribed successfully
63
- Audio transcript: libraries/[library-name]/transcripts/video_name.json
64
- Video path: /full/path/to/video_filename.mov
69
+ <video_basename.mov> transcribed successfully
70
+ Audio transcript: <transcript_output_dir>/<video_basename>.json
71
+ Video path: <video_path>
65
72
  ```
66
73
 
67
74
  **DO NOT update library.yaml** - the parent agent will handle this to avoid race conditions when running multiple transcriptions in parallel.
@@ -0,0 +1,114 @@
1
+ # Transcript refinement instructions
2
+
3
+ Companion file for `SKILL.md`. Invoked from SKILL.md Step 4 when the parent passed `transcript_refinement: true`. Reviews a WhisperX transcript and corrects misheard words in place, using the context strings the parent supplied.
4
+
5
+ ## Step 1 — Gather inputs from the parent
6
+
7
+ The parent has already supplied these inline in your prompt:
8
+
9
+ - `transcript_path` — absolute path to the prepared transcript JSON
10
+ - `user_context` — string, may be empty
11
+ - `footage_summary` — string, may be empty
12
+
13
+ Do NOT open `library.yaml` or search the filesystem for additional context — if the parent didn't pass it, treat it as unavailable. If the parent invoked refinement with only empty context strings, proceed anyway. Catch issues from just what the parent gave you and the transcript.
14
+
15
+ ## Step 2 — Extract a compact script view
16
+
17
+ Run the shared extractor to produce a plain-text view of the transcript (one segment per paragraph, no timing metadata). Pick a sibling `.txt` path next to the transcript and pass it explicitly:
18
+
19
+ ```bash
20
+ ruby .claude/scripts/script_extractor.rb <transcript_path> <transcript_path_with_.txt_extension>
21
+ ```
22
+
23
+ Read ONLY that `.txt` file for the analysis steps below. Do NOT `Read` the full transcript JSON yet — it's large and you don't need its word-level structure to identify corrections.
24
+
25
+ ## Step 3 — HARD RULE: preserve word count, never change timing
26
+
27
+ WhisperX produces word-level timing. The `segments[].words[]` array is 1:1 with the space-separated tokens in `segments[].text`. Splitting or merging tokens breaks this alignment and corrupts downstream timing used by roughcut.
28
+
29
+ Allowed:
30
+ - **1→1 token spelling fix** (same count, different characters). Transcript: `"The bike ended up in a second-floor apartment over near the Tenderlohn, which is where the cops met us."` Fix: `Tenderlohn` → `Tenderloin` — one mangled token replaced by the correct San Francisco neighborhood spelling, same single-token slot. Surrounding words are untouched.
31
+ - **N→N token phrase fix** (same count across a phrase). Transcript: `"We had been planning to ride out to Walnut Creak for the weekend before the whole thing happened."` Fix: `Walnut Creak` → `Walnut Creek` — two tokens stay two tokens; only one character-set changes, but the phrase is treated as the unit of edit for safety.
32
+
33
+ Disallowed:
34
+ - **1→2 token split**. Transcript: `"Her cousin grew up in Sanjose and still lives in the same house her parents bought in the sixties."` The correct spelling is "San Jose" (two tokens), but WhisperX fused it into a single token covering the speaker's fast delivery. Splitting that one timing slot into two requires guessing where "San" ends and "Jose" begins — don't do it. (See squashing technique below for the right move.)
35
+ - **2→1 token merge**. Transcript: `"We walked every single block of the neighborhood looking for the stolen bike that afternoon."` If you wanted to "normalize" `every single` into a single `everysingle` token, you'd drop one entry from the words array. Same corruption in reverse. Don't.
36
+
37
+ Never modify timing fields (`start`, `end`, `duration`, `word.start`, `word.end`) for any reason.
38
+
39
+ **Squashing technique**: when the correct term is naturally multi-word but the transcript has it as a single nonsense token, squash the correction into a single-token form to preserve word count. Downstream agents (analyze-video, roughcut) care about accurate word recognition, not cosmetic spacing — prefer squashing over skipping.
40
+
41
+ - Transcript: `"Her cousin grew up in Sanjose and still lives in the same house her parents bought in the sixties."` Fix: `Sanjose` → `SanJose` (squashed single-token form). Downstream agents will still recognize the city. NOT `San Jose` — that's a disallowed 1→2 split.
42
+ - Transcript: `"Our rental was a tiny cottage right on the edge of Tenderknob, close to a Burmese place we ended up at every single night."` The speaker meant "Tendernob" (the informal Tenderloin/Nob Hill border). Fix: `Tenderknob` → `Tendernob` (1→1 spelling fix, stays one token).
43
+ - Transcript: `"She went to a little Catholic school in the Mission called Saintvincent when she was a kid, and her sister went there too."` Fix: `Saintvincent` → `SaintVincent` (squashed; preserves the one-token slot).
44
+
45
+ If even squashing won't work (genuinely requires splitting or merging tokens), do NOT edit. Note it in your return summary instead. Example: `"Skipped: 'everysingle' in segment 12 should likely be 'every single' (two words), but a 1→2 split would corrupt timing."`
46
+
47
+ ## Step 4 — Identify corrections from the compact script
48
+
49
+ Scan the `.txt` view against the confidence rubric. Every candidate must also satisfy Step 3's word-count rule.
50
+
51
+ - **Context-named term match**: correct if the intended term appears in `user_context` or `footage_summary` and the transcript has a close mishearing. Example: `footage_summary` says "the couple got married at a small vineyard in Sonoma over Labor Day weekend." The transcript has `"We drove all the way up to Sanoma on Friday afternoon and the traffic was unbelievable."` "Sanoma" is a 1→1 mishearing of the context-named location — fix it.
52
+ - **Nonsense-token match**: correct if the transcript token is a non-word nonsense string with a clear real-world spelling implied by context. Example: transcript says `"His mother grew up in Pleasantton and worked at the little cafe downtown for twenty years."` "Pleasantton" isn't a real place — but "Pleasanton" is a real East Bay city and nothing else is phonetically close. 1→1 spelling fix.
53
+ - **Self-witness rule**: correct if the proposed correct form appears elsewhere in the SAME transcript AND the suspect token is phonetically close. Example: an early segment says `"Andrew and Gordon ended up getting dinner at a Thai place in Pacific Heights that night after everything calmed down."` A later segment says `"Pacific Heights has been Andrew's favorite neighborhood since he first moved to the city back in 2015."` If a third segment has `"We drove through Pasific Hites on the way to the station."`, fix it — the correct form is witnessed twice elsewhere in the same transcript.
54
+ - **Do NOT correct based on general world knowledge alone**. Example: transcript says `"Andrew dropped by a little market on Fillmore for snacks before we started the ride."` Even if you happen to know of a specific famous store on Fillmore, don't invent it — the generic phrasing might be exactly what was said. Require either a context naming or a self-witness. If neither exists, leave it.
55
+
56
+ Collect every authorized correction as an `old → new` pair before moving to Step 5.
57
+
58
+ ## Step 5 — Apply each correction to the full JSON
59
+
60
+ Now (and only now) you need to touch the transcript JSON. For each correction, you must update three places so they stay consistent:
61
+
62
+ 1. `segments[].text` — the sentence-level text
63
+ 2. `segments[].words[].word` — the word-level array inside the owning segment
64
+ 3. `word_segments[].word` — the top-level flat word array
65
+
66
+ Read the JSON in targeted slices, not as a whole — use `Grep` to locate each occurrence and its surrounding lines, then `Edit` with a unique anchor.
67
+
68
+ ### 5a — Update `segments[].text` with phrase context
69
+
70
+ Every correction must include at least one adjacent word of surrounding context. Never `Edit` on a bare word — even nonsense tokens — because Edit does substring matching, not word-boundary matching. Bare-word replacements silently corrupt legitimate substrings. For example, if you try to fix a misheard `"car"` by running `Edit replace_all=true old="car" new="far"`, you'll also rewrite every occurrence of `"carrot"` into `"farrot"`, every `"scared"` into `"sfared"`, and so on across the whole transcript. Always anchor the edit with at least one adjacent word.
71
+
72
+ Correct form:
73
+
74
+ - `Edit replace_all=true old="second-floor apartment over near the Tenderlohn" new="second-floor apartment over near the Tenderloin"` — 1→1 spelling fix in generous phrase context.
75
+ - `Edit replace_all=true old="ride out to Walnut Creak for the weekend" new="ride out to Walnut Creek for the weekend"` — 2→2 phrase fix.
76
+ - `Edit replace_all=true old="cousin grew up in Sanjose and still lives" new="cousin grew up in SanJose and still lives"` — squashed 1→1 fix.
77
+
78
+ **Case rule**: preserve the transcript's existing case. The goal is accurate word recognition for downstream agents, not proper-noun capitalization. If the transcript has "tundraloin" (lowercase), replace with "tenderloin" (lowercase) — don't upgrade to "Tenderloin". If the transcript has "Tundraloin" at a sentence start, replace with "Tenderloin" there. Match case-for-case; don't normalize. Exception: the squashing technique (Step 3) may introduce an internal capital to mark a word boundary (e.g. `Sanjose` → `SanJose`); the first letter's case still follows this rule.
79
+
80
+ ### 5b — Update the two word-level arrays, anchored by `start`
81
+
82
+ Both `segments[].words[].word` and top-level `word_segments[].word` have their own entry for each token. These arrays aren't consumed downstream yet, but they're how we'll cut a single word or phrase out of a segment later, so keeping them consistent with the corrected `segments[].text` is load-bearing — don't leave them stale.
83
+
84
+ Anchor each word-array edit on the adjacent `start` timestamp so it's unique (the token alone may appear in many slots). Only the `word` field changes; timing fields (`start`, `end`, `score`, etc.) must stay untouched.
85
+
86
+ The transcript JSON is pretty-printed (`JSON.pretty_generate`), so each key sits on its own line. `Edit` does literal substring matching — your `old_string` must include the newline and indentation between `"word": "..."` and `"start": ...`. Use the exact whitespace from the file (open it with `Read` or `Grep -A` first to copy the indentation verbatim).
87
+
88
+ - Two-line anchor form (copy the real indentation from the file):
89
+ ```
90
+ Edit old='"word": "Sanjose",
91
+ "start": 10.534' new='"word": "SanJose",
92
+ "start": 10.534'
93
+ ```
94
+ Updates one entry; repeat for the other array.
95
+ - For an N→N phrase fix, update each token's word entry the same way, anchored by its own `start`.
96
+ - For the squashing case (e.g. `Sanjose` → `SanJose`), the word count is unchanged, so there's still exactly one word entry to update per array.
97
+
98
+ ## Step 6 — Clean up the extracted script file
99
+
100
+ Delete the `.txt` file created in Step 2. It's scaffolding, not a deliverable.
101
+
102
+ ```bash
103
+ rm <transcript_path with .json replaced by .txt>
104
+ ```
105
+
106
+ ## Step 7 — Return summary to the parent
107
+
108
+ Append a refinement line to your SKILL.md Step 5 response. Format:
109
+
110
+ - If corrections made: list them as `old → new` pairs, one per line.
111
+ - If no corrections needed: `"Refinement: no corrections needed"`.
112
+ - If some candidates were skipped for word-count reasons: `"Refinement: skipped N corrections that would have changed word count"` followed by the list.
113
+
114
+ The parent writes only `transcript: <filename>.json` to library.yaml — no new field needed.
data/CLAUDE.md CHANGED
@@ -40,6 +40,33 @@ You are an AI video editor assistant working with a software engineer. You gener
40
40
 
41
41
  Libraries are the primary abstraction in ButterCut - each library represents a video series or project and is self-contained under `/libraries/[library-name]/`. A library is conceptually similar to a Final Cut Pro library, but uses a simple file structure (YAML, JSON transcripts) optimized for AI analysis rather than FCP's proprietary format.
42
42
 
43
+ ### Initialize Settings
44
+
45
+ Before any library setup, check if `libraries/settings.yaml` exists. If not, copy from template:
46
+
47
+ ```bash
48
+ cp templates/settings_template.yaml libraries/settings.yaml
49
+ ```
50
+
51
+ If no previous `settings.yaml` was present, use the AskUserQuestion tool to have the user confirm or change their defaults (`editor` and `whisper_model`).
52
+
53
+ Editor Options:
54
+ - Final Cut Pro X
55
+ - Adobe Premiere Pro
56
+ - DaVinci Resolve
57
+
58
+ Model Options:
59
+ - Small (recommended — pairs well with per-library transcript_refinement)
60
+ - Medium
61
+ - Turbo (Large)
62
+
63
+ Save these options into libraries/settings.yaml.
64
+
65
+ Note: `transcript_refinement` is a **per-library** setting (not global). Ask about it during library setup (see "Gather Project Information" below), not during initial settings setup.
66
+
67
+
68
+ When creating a new library, read `libraries/settings.yaml` and use the `editor` value to pre-populate the library's `editor` field.
69
+
43
70
  ### Check for Existing Library
44
71
 
45
72
  **ALWAYS** check if a library already exists before starting setup:
@@ -63,9 +90,9 @@ ls libraries/[library-name]/library.yaml
63
90
 
64
91
  ### Gather Project Information
65
92
 
66
- Ask the user these questions for new libraries:
93
+ Ask the user these questions for new libraries one at a time (never all at once):
67
94
 
68
- 1. **What is the library name for this project?**
95
+ 1. **What do you want to call this project library?**
69
96
  - Examples: "bike-locking-video-series", "raiders-2025-highlights", "yo-yo-techniques"
70
97
  - Normalize the name:
71
98
  - Replace spaces with dashes
@@ -73,15 +100,20 @@ Ask the user these questions for new libraries:
73
100
  - Remove special characters (keep alphanumeric and dashes)
74
101
 
75
102
  2. **Where are the video files located?**
76
- - Accept either a directory path (recursively find all video files inside folder) or individual file paths
103
+ - Ask: "Where are your video files? You can drag folders or individual files directly into the chat."
77
104
  - Verify all files exist before proceeding
78
105
  - Inform user of what was found: "Found 5 video files totaling 2.3GB"
79
106
 
80
107
  3. **What language is spoken in these videos?**
81
- - Common options: `en` (English), `es` (Spanish), `fr` (French), `de` (German), `ja` (Japanese)
82
- - Or `auto` for auto-detect (adds ~30 seconds per video during transcription)
83
- - This applies to all videos in this library
84
- - Save to library.yaml for use during transcription
108
+ - Ask using AskUserQuestion with options: "English", "Spanish" and a free-text fallback for other languages
109
+ - Save the language name (e.g., "English") to library.yaml
110
+ - Map to language code (e.g., `en`, `es`, `fr`) behind the scenes when needed for transcription
111
+
112
+ 4. **Can I proofread the transcripts after they're generated?**
113
+ - Ask using AskUserQuestion with this exact question: "Can I proofread the transcripts after they're generated? I'll use the video's context to fix mistakes."
114
+ - Options: "Yes - Recommended (Use Claude to refine video understanding)" and "No"
115
+ - Save the boolean to `transcript_refinement` in library.yaml (true for Yes, false for No)
116
+ - Default to `true` if the user skips
85
117
 
86
118
  ### Create Directory Structure
87
119
 
@@ -111,13 +143,19 @@ Progressively update the `footage_summary` field after each video is transcribed
111
143
  After library setup completes, **automatically start analyzing all footage**:
112
144
 
113
145
  1. Inform user: "Library setup complete. Found [N] videos ([total size]). Starting footage analysis..."
114
- 2. Read library.yaml to get language code and find videos needing transcription
115
- 3. Launch `transcribe-audio` agents (can run in parallel for multiple videos)
116
- 4. As each agent completes, update library.yaml with `transcript` (filename only, not full path)
117
- 5. After all audio transcripts complete, launch `analyze-video` agents (can run in parallel)
118
- 6. As each agent completes, update library.yaml with `visual_transcript` (filename only, not full path)
119
- 7. Analyze ALL videos before offering to create rough cuts
120
- 8. **After all analysis completes, automatically create a backup** using the `backup-library` skill
146
+ 2. Read `libraries/settings.yaml` (for `whisper_model`) and the library's `library.yaml` (for `language`, `transcript_refinement`, `user_context`, `footage_summary`) ONCE in the parent thread. If any expected field is missing, run the appropriate migration first (see Critical Principles below).
147
+ 3. Launch `transcribe-audio` agents (can run in parallel for multiple videos). Pass these values inline in each agent's prompt — the sub-agent never reads `library.yaml` or `settings.yaml`:
148
+ - `video_path`, `transcript_output_dir`, `language_code`, `whisper_model`
149
+ - `transcript_refinement` (boolean). If `true`, also pass the current `user_context` and `footage_summary` strings (empty strings are fine — refinement still catches nonsense-token and self-witness fixes).
150
+ 4. As each agent completes, update library.yaml with `transcript` (filename only, not full path).
151
+ 5. After all audio transcripts complete, launch `analyze-video` agents (can run in parallel) following the same "parent passes context inline" contract. Pass inline: `video_path`, `audio_transcript_path`, `visual_transcript_path`.
152
+ 6. As each agent completes, update library.yaml with `visual_transcript` (filename only, not full path).
153
+ 7. Analyze ALL videos before offering to create rough cuts.
154
+ 8. **After all analysis completes, automatically create a backup** using the `backup-library` skill.
155
+
156
+ **Contract: sub-agents don't read `library.yaml`.** The parent owns `library.yaml` (and `settings.yaml`) — it reads once, passes values inline, and writes results once per agent completion. Sub-agents should not even know those files exist. This keeps the context boundary clean and avoids race conditions when many agents run in parallel.
157
+
158
+ **Note on refinement:** When `transcript_refinement: true`, each `transcribe-audio` agent reviews and corrects its transcript in place before returning, using the `user_context` and `footage_summary` the parent passed in. Empty context strings are fine — the agent still runs and catches nonsense-token and self-witness fixes. The parent still only writes `transcript: <filename>.json` to `library.yaml` after the agent completes.
121
159
 
122
160
  **Terminology:**
123
161
  - User-facing: Call it "footage analysis" or "analyzing footage"
@@ -133,18 +171,17 @@ After library setup completes, **automatically start analyzing all footage**:
133
171
  When processing multiple videos, use parallel agents for maximum throughput:
134
172
 
135
173
  1. **Parent agent responsibilities:**
136
- - Read library.yaml for language code
137
- - Read library.yaml to find videos needing work
138
- - Launch Task agents with transcribe-audio or analyze-video skills
139
- - Update library.yaml sequentially as agents complete
140
- - Handle errors and retries
174
+ - Read `library.yaml` and `settings.yaml` once to gather: videos needing work, `language_code`, `whisper_model`, `transcript_refinement`, `user_context`, `footage_summary`.
175
+ - Launch Task agents with transcribe-audio or analyze-video skills, passing all needed values **inline in the prompt**.
176
+ - Update library.yaml sequentially as agents complete.
177
+ - Handle errors and retries.
141
178
 
142
179
  2. **Child agent (transcribe-audio/analyze-video) responsibilities:**
143
- - Process ONE video file
144
- - Run WhisperX or frame extraction
145
- - Prepare and clean transcript JSON
146
- - Return structured response with file paths
147
- - DO NOT update library.yaml (parent handles this)
180
+ - Process ONE video file using only the inputs passed inline by the parent.
181
+ - Run WhisperX or frame extraction.
182
+ - Prepare and clean transcript JSON.
183
+ - Return structured response with file paths.
184
+ - DO NOT read `library.yaml` or `settings.yaml`, and DO NOT update `library.yaml` (parent handles all yaml I/O).
148
185
 
149
186
  3. **Benefits:**
150
187
  - Multiple videos process simultaneously
@@ -156,7 +193,19 @@ When processing multiple videos, use parallel agents for maximum throughput:
156
193
 
157
194
  Each library has a `library.yaml` file that serves as your persistent memory and the SOURCE OF TRUTH. This file contains all library metadata, footage descriptions, transcription status, and key learnings. Always read this file when working on a library and you need guidance for how/where to save files.
158
195
 
159
- **If library structure seems wrong, check CHANGELOG.md.** The library.yaml format has evolved over versions. If you encounter unexpected field names (like `transcript_path` instead of `transcript`), read CHANGELOG.md to understand breaking changes and available migration scripts.
196
+ **Migrate legacy library.yaml files before doing anything else.** Every time you read a library.yaml, check it against the canonical field list in `templates/library_template.yaml`. If any expected field is missing, or any field appears under an old name, the library predates a feature and MUST be migrated before you do any further work on it — no rough cuts, sequences, transcription, exports, or anything else until the schema is current. The migrations are fast, idempotent, and safe; don't ask the user for permission and don't describe them as optional "tidying." Just run them.
197
+
198
+ Known migration triggers (match each to a `scripts/NNN_migrate_*.rb` script via CHANGELOG.md):
199
+
200
+ - `editor` missing (added in 0.4.0)
201
+ - `transcript_refinement` missing (added in [Unreleased]; missing means "predates the feature, default to `false`" — NOT the template default of `true`)
202
+ - `footage_summary` missing OR old name `footage_description` present (renamed in [Unreleased])
203
+ - video entries with `transcript_path` / `visual_transcript_path` (renamed to `transcript` / `visual_transcript` in 0.3.0)
204
+ - video entries with `file_size_mb` (removed in 0.3.0)
205
+
206
+ A missing field is not the same as a field set to the template default — the template default only applies to freshly created libraries. If you see a schema issue not on this list, still check CHANGELOG.md; the list may be behind. After running migrations, re-read the library.yaml and continue with whatever the user asked for.
207
+
208
+ **Keep main-thread context minimal.** The main thread orchestrates; sub-agents do the heavy work and return concise summaries. Don't read full transcript JSON, visual transcript JSON, or extracted frames into the main thread as part of routine workflow — across a large library this bloats context fast. Trust sub-agent return messages when updating library.yaml. Direct user requests ("show me transcript X") are fine; the rule is about automatic workflow behavior.
160
209
 
161
210
  **Use actual filenames.** Never use generic labels like "Video 1" or "Clip A" - always reference actual filenames like "DJI_20250423171212_0210_D.mov" for clear traceability.
162
211
 
@@ -171,6 +220,17 @@ Each library has a `library.yaml` file that serves as your persistent memory and
171
220
  - When you have lots of videos to process (dozens or hundreds isn't out of the ordinary), create a reasonable task list with 5 tasks and then a final task that says to check the yaml processing file to see if you need to then generate more tasks. This way users can see progress and the agent doesn't get overwhelmed.
172
221
  - Generally avoid writing one-off scripts, but if you do need to write one, write it in Ruby unless you have a very strong reason to write in another language.
173
222
  - Only run 4 parallel tasks at a time.
223
+ - Whenever you export XML files, include a datetime timestamp in the filename so it's clear when they were generated.
224
+
225
+ ## Programming Style
226
+
227
+ When you add a Ruby script under `.claude/scripts/` or similar, follow these conventions:
228
+
229
+ - **One class per script; file name matches the class name.** `ScriptExtractor` lives in `script_extractor.rb`.
230
+ - **Single high-level entry point.** Expose a class method (`Klass.extract`, `Klass.run`, etc.) that calls `new(...).extract` internally — callers shouldn't need to know about instantiation.
231
+ - **Break the work into small private methods with clear names** (`load_transcript`, `format_script`, `write_output`, `report`). The public entry point should read like a short outline of the workflow.
232
+ - **Required arguments are required.** Don't silently default `nil`/missing args — raise `ArgumentError` in `initialize` if a required value is missing or empty. No hidden fallback paths.
233
+ - **Keep CLI arg parsing out of the class.** Use a bottom-of-file `if __FILE__ == $PROGRAM_NAME` block to parse `ARGV`, validate file paths, print a usage line, and delegate to the class.
174
234
 
175
235
  ## Project Structure
176
236
 
@@ -182,21 +242,23 @@ Each library has a `library.yaml` file that serves as your persistent memory and
182
242
  - `spec/` - RSpec test suite
183
243
  - `templates/` - Library and project templates
184
244
  - `libraries/` - Working directory for user's video projects (gitignored)
245
+ - `libraries/settings.yaml` - User settings (editor, whisper_model) — created from template on first library setup
185
246
  - `backups/` - Compressed library backups (transcriptions, roughcuts, etc) (gitignored)
186
247
 
187
248
  ## Design Philosophy
188
249
 
189
- ButterCut is designed to be simple and automatic:
250
+ ButterCut is designed to be simple, automatic, and geared toward working with non-technical people using ButterCut via a client, Claude Cowork, or Claude Code.
251
+
190
252
  - **Input**: Array of full file paths to video files
191
- - **Output**: Working FCPXML ready to import into Final Cut Pro
253
+ - **Output**: Working XML file ready to import into the non-technical user's video editor (Final Cut, Premiere, Resolve)
192
254
  - **Automatic Metadata Extraction**: Uses FFmpeg internally to extract video properties (duration, resolution, frame rate, audio rate, etc.)
193
- - **No Manual Configuration Required**: Library handles all the complexity of FCPXML generation
194
255
 
195
- The user should not need to understand video codecs, frame rates, or FCPXML structure - just provide file paths and get working XML.
256
+ The user should not need to understand video codecs, frame rates, or FCPXML structure - just provide file paths and get working XML. We should talk to the user from a video-editing perspective, not a software-engineering perspective.
196
257
 
197
258
  ## Development Commands
198
259
 
199
260
  ### Testing
261
+ RSpec tests for the XML generation library. This doesn't include agent or end-to-end testing.
200
262
  ```bash
201
263
  # Install dependencies
202
264
  bundle install
@@ -211,17 +273,6 @@ bundle exec rspec spec/buttercut_spec.rb
211
273
  bundle exec rspec spec/buttercut_spec.rb:10
212
274
  ```
213
275
 
214
- ### DTD Validation
215
-
216
- macOS has a built-in XML lint tool - allowing you to validate a FCPXML document against its DTD file.
217
-
218
- ```bash
219
- xmllint --dtdvalid "dtd/FCPXMLv1_8.dtd" "/path/to/your/file.fcpxml"
220
- ```
221
-
222
- This will check if the generated FCPXML conforms to the FCPXML 1.8 specification.
223
- - Whenever you export xml files, always include a datetime timestamp so it's clear when they were generated
224
-
225
276
  ## Claude Skills
226
277
 
227
278
  When creating new Claude skills, aim to keep them to 50 lines. Only very complicated skills (e.g., transcription and roughcuts) should be larger than that. If a skill is complicated and seems like it can't be explained in 50 lines, consider whether it should be broken up across multiple skills or whether the complexity can be contained inside a Ruby script saved adjacent to the skill.
data/README.md CHANGED
@@ -124,4 +124,8 @@ MIT
124
124
 
125
125
  ## Contributing
126
126
 
127
- Bug reports and pull requests welcome.
127
+ Bug reports and pull requests are welcome. That said, please follow the guidelines below.
128
+
129
+ **Guidelines:**
130
+ - Write the body of your pull request or GitHub issue yourself. Don't use an agent (Claude Code, etc) to generate it.
131
+ - Keep pull requests small and limited to a single feature or bugfix at a time. It's a lot easier to write code now, but I feel like reviewing code is just as hard as before.
@@ -1,3 +1,3 @@
1
1
  class ButterCut
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -4,6 +4,7 @@ created_date: [YYYY-MM-DD]
4
4
  last_updated: [YYYY-MM-DD]
5
5
  language: english
6
6
  editor: # preferred video editor: fcpx, premiere, or resolve
7
+ transcript_refinement: true # on by default; set false to skip the AI transcript review step
7
8
  user_context: ""
8
9
  # Whenever you ask the user questions about the library, save a summarized version here.
9
10
  # e.g., The man wearing the dark blue long sleeve shirt is "Andrew". The small brown dog is "Sammy". This footage was shot over one evening.
@@ -0,0 +1,10 @@
1
+ # ButterCut User Settings
2
+ # Copy this file to libraries/settings.yaml to configure defaults
3
+
4
+ # Preferred video editor: fcpx, premiere, or resolve
5
+ editor: fcpx
6
+
7
+ # WhisperX model size: tiny, base, small, medium, or turbo
8
+ # turbo is nearly as accurate as large-v3 but significantly faster
9
+ # Recommended: `small` paired with transcript_refinement (set per-library in library.yaml)
10
+ whisper_model: small
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: buttercut
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Ford
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-02-24 00:00:00.000000000 Z
11
+ date: 2026-04-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -62,6 +62,7 @@ extensions: []
62
62
  extra_rdoc_files: []
63
63
  files:
64
64
  - ".claude/commands/worktree.md"
65
+ - ".claude/scripts/script_extractor.rb"
65
66
  - ".claude/settings.json"
66
67
  - ".claude/settings.local.json"
67
68
  - ".claude/skills/analyze-video/SKILL.md"
@@ -78,6 +79,7 @@ files:
78
79
  - ".claude/skills/setup/verify_install.rb"
79
80
  - ".claude/skills/transcribe-audio/SKILL.md"
80
81
  - ".claude/skills/transcribe-audio/prepare_audio_script.rb"
82
+ - ".claude/skills/transcribe-audio/refine_instructions.md"
81
83
  - ".claude/skills/update-buttercut/SKILL.md"
82
84
  - CLAUDE.md
83
85
  - LICENSE
@@ -90,6 +92,7 @@ files:
90
92
  - lib/buttercut/version.rb
91
93
  - templates/library_template.yaml
92
94
  - templates/roughcut_template.yaml
95
+ - templates/settings_template.yaml
93
96
  homepage: https://github.com/andrewford/buttercut
94
97
  licenses:
95
98
  - MIT