youtube-rb 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,632 @@
1
require 'fileutils'
require 'json'
require 'open3'
require 'securerandom'

require 'faraday'
4
+
5
+ module YoutubeRb
6
+ class Downloader
7
+ class DownloadError < StandardError; end
8
+
9
+ attr_reader :url, :options, :video_info
10
+
11
# Set up a downloader for a single video URL.
#
# @param url [String] address of the video to work with
# @param options [Options, Hash] an Options instance, or keyword-style
#   settings that are splatted into Options.new
def initialize(url, options = Options.new)
  @url = url
  @options = options
  @options = Options.new(**options) unless options.is_a?(Options)
  @extractor = Extractor.new(url, @options.to_h)

  # State that is filled in lazily by later calls.
  @ytdlp_wrapper = nil
  @video_info = nil
  @cached_video_path = nil # full video kept on disk while cutting several segments

  # Which backends were already attempted; the error handlers read these
  # before falling back to the other backend.
  @tried_ytdlp = false
  @tried_ruby = false
end
21
+
22
# Download the full video into the configured output directory.
#
# The yt-dlp backend is used when #should_use_ytdlp? says so; otherwise the
# pure Ruby backend runs.
#
# @return the value returned by the selected backend method
def download
  ensure_output_directory
  return download_with_ytdlp if should_use_ytdlp?

  download_with_ruby
end
33
+
34
# Download a single time range of the video through yt-dlp.
#
# @param start_time [Numeric] where the segment begins
# @param end_time [Numeric] where the segment ends; must be greater than start_time
# @param output_file [String, nil] forwarded unchanged to the yt-dlp backend
# @return the value returned by #download_segment_with_ytdlp
# @raise [ArgumentError] when the range is empty/inverted or its length is
#   outside the configured min/max segment duration
# @raise [DownloadError] when yt-dlp is not available
def download_segment(start_time, end_time, output_file = nil)
  raise ArgumentError, "Start time must be less than end time" if start_time >= end_time

  span = end_time - start_time
  unless valid_segment_duration?(span)
    bounds = "#{@options.min_segment_duration} and #{@options.max_segment_duration}"
    raise ArgumentError, "Segment duration must be between #{bounds} seconds, got: #{span}"
  end

  ensure_output_directory

  # Segment downloads are only implemented on top of yt-dlp.
  raise DownloadError, "yt-dlp is required for segment downloads. Please install yt-dlp." unless ytdlp_available?

  download_segment_with_ytdlp(start_time, end_time, output_file)
end
52
+
53
# Download several time ranges of the same video in one batch.
#
# Every definition is validated before any work starts, so a malformed entry
# aborts the whole batch up front.
#
# @param segments [Array<Hash>] segment definitions, e.g.
#   [{start: 0, end: 30, output_file: 'seg1.mp4'}, ...]
# @return [Array<String>] paths of the segment files that were written
# @raise [ArgumentError] when the list or any entry is invalid
# @raise [DownloadError] when yt-dlp is not available
def download_segments(segments)
  raise ArgumentError, "segments must be an Array" unless segments.is_a?(Array)
  raise ArgumentError, "segments array cannot be empty" if segments.empty?

  segments.each.with_index do |segment, position|
    well_formed = segment.is_a?(Hash) && segment[:start] && segment[:end]
    raise ArgumentError, "Segment #{position} must be a Hash with :start and :end keys" unless well_formed

    from = segment[:start]
    to = segment[:end]
    raise ArgumentError, "Segment #{position}: start time must be less than end time" if from >= to

    span = to - from
    next if valid_segment_duration?(span)

    bounds = "#{@options.min_segment_duration} and #{@options.max_segment_duration}"
    raise ArgumentError, "Segment #{position}: duration must be between #{bounds} seconds, got: #{span}"
  end

  ensure_output_directory

  # Batch segment downloads are only implemented on top of yt-dlp.
  unless ytdlp_available?
    raise DownloadError, "yt-dlp is required for batch segment downloads. Please install yt-dlp."
  end

  download_segments_with_ytdlp(segments)
end
83
+
84
# Fetch subtitles without downloading the video itself.
#
# Video info is always re-extracted here (any previously stored info is
# replaced).
#
# @param langs [Array<String>, nil] languages to fetch; falls back to the
#   subtitle_langs option when nil
# @return the value returned by #download_subtitles
def download_subtitles_only(langs = nil)
  ensure_output_directory
  @video_info = @extractor.extract_info

  download_subtitles(langs || @options.subtitle_langs)
end
92
+
93
# Video information, extracted on first use and reused afterwards.
#
# @return the object produced by the extractor's extract_info
def info
  return @video_info if @video_info

  @video_info = @extractor.extract_info
end
97
+
98
+ private
99
+
100
# Decide whether the yt-dlp backend should handle a full download.
#
# - use_ytdlp == false: never use yt-dlp.
# - otherwise (explicitly requested or left unset): use yt-dlp exactly when
#   it is available.
#
# @return [Boolean] true/false; when the option is unset the raw result of
#   #ytdlp_available? is returned
def should_use_ytdlp?
  preference = @options.use_ytdlp
  return false if preference == false

  available = ytdlp_available?
  preference && available ? true : available
end
116
+
117
# Whether the yt-dlp backend can be used, probed once per downloader.
#
# `||=` would re-run the probe on every call whenever the answer is false
# (a falsy value is never kept by `||=`), so the cache is keyed on whether
# the instance variable has been assigned at all.
#
# @return the value reported by YtdlpWrapper.available?
def ytdlp_available?
  return @ytdlp_available if defined?(@ytdlp_available)

  @ytdlp_available = YtdlpWrapper.available?
end
120
+
121
# The YtdlpWrapper for this downloader, built from the current options on
# first use and reused afterwards.
#
# @return [YtdlpWrapper]
def ytdlp_wrapper
  return @ytdlp_wrapper if @ytdlp_wrapper

  @ytdlp_wrapper = YtdlpWrapper.new(@options)
end
124
+
125
# Download the full video through the yt-dlp wrapper.
#
# Marks yt-dlp as attempted so the error handlers do not fall back to it
# again, and routes wrapper failures through #handle_ytdlp_error.
#
# @return the wrapper's download result, or the value of #handle_ytdlp_error
#   when the wrapper raised YtdlpWrapper::YtdlpError
def download_with_ytdlp
  log "Using yt-dlp backend for download"
  @tried_ytdlp = true

  ytdlp_wrapper.download(@url).tap do |saved_path|
    log "Downloaded successfully with yt-dlp: #{saved_path}"
  end
rescue YtdlpWrapper::YtdlpError => e
  handle_ytdlp_error(e)
end
137
+
138
# Download the video (or just its audio) with the pure Ruby backend, then
# write whichever sidecar files the options ask for.
#
# Marks the Ruby backend as attempted; any StandardError raised while
# working is handed to #handle_ruby_error.
#
# @return [String] the generated output path, or the value of
#   #handle_ruby_error when something failed
def download_with_ruby
  log "Using pure Ruby backend for download"
  @tried_ruby = true

  begin
    @video_info = @extractor.extract_info
    destination = generate_output_path(@video_info)

    @options.extract_audio ? download_audio(destination) : download_video(destination)

    wants_subtitles = @options.write_subtitles || @options.write_auto_sub
    download_subtitles if wants_subtitles
    download_metadata if @options.write_info_json
    download_thumbnail if @options.write_thumbnail
    download_description if @options.write_description

    destination
  rescue => e
    handle_ruby_error(e)
  end
end
163
+
164
# Download one time range directly through the yt-dlp wrapper.
#
# @param start_time [Numeric] segment start
# @param end_time [Numeric] segment end
# @param output_file [String, nil] requested destination, passed to the wrapper
# @return the path reported by the wrapper's download_segment
def download_segment_with_ytdlp(start_time, end_time, output_file)
  log "Using yt-dlp backend for segment download"

  saved_path = ytdlp_wrapper.download_segment(@url, start_time, end_time, output_file)
  log "Downloaded segment successfully with yt-dlp: #{saved_path}"
  saved_path
end
171
+
172
# Produce every requested segment from one full download.
#
# The full video is fetched (or reused from the cache) once, then each
# segment is cut locally with #extract_segment. Segment subtitles are written
# when subtitle options are enabled. The cached full video is removed
# afterwards, even on failure, unless the cache_full_video option is set.
#
# @param segments [Array<Hash>] validated definitions with :start, :end and
#   optional :output_file
# @return [Array<String>] output paths in the order the segments were given
def download_segments_with_ytdlp(segments)
  log "Using yt-dlp backend for batch segment download (optimized: 1 download + local segmentation)"

  produced = []

  begin
    source_video = get_full_video_for_segmentation_with_ytdlp

    segments.each.with_index(1) do |segment, number|
      from, to = segment.values_at(:start, :end)
      target = segment[:output_file] || generate_segment_output_path(@video_info, from, to)

      log "Extracting segment #{number}/#{segments.size}: #{from}-#{to}s"
      extract_segment(source_video, target, from, to)

      download_subtitles_for_segment(from, to) if @options.write_subtitles || @options.write_auto_sub

      produced << target
    end
  ensure
    cleanup_video_cache unless @options.cache_full_video
  end

  produced
end
206
+
207
+
208
# React to a yt-dlp failure: fall back to the Ruby path when allowed,
# otherwise surface the failure as a DownloadError.
#
# Fallback happens only when the ytdlp_fallback option is set and the pure
# Ruby backend has not been tried yet.
#
# @param error [Exception] the failure reported by yt-dlp
# @param fallback [#call, nil] alternative to run instead of #download_with_ruby
# @return the fallback's result
# @raise [DownloadError] when no fallback is permitted
def handle_ytdlp_error(error, fallback: nil)
  log "yt-dlp error: #{error.message}"

  may_fall_back = @options.ytdlp_fallback && !@tried_ruby
  raise DownloadError, "yt-dlp failed: #{error.message}" unless may_fall_back

  log "Falling back to pure Ruby implementation"
  fallback ? fallback.call : download_with_ruby
end
224
+
225
# React to a pure Ruby backend failure: retry with yt-dlp when allowed,
# otherwise surface the failure as a DownloadError.
#
# yt-dlp is used as a fallback only when all of these hold:
# - the ytdlp_fallback option is set,
# - yt-dlp is available and has not been tried yet,
# - the error message mentions '403' or the error is an
#   Extractor::ExtractionError.
#
# @param error [Exception] the failure from the Ruby backend
# @return the result of #download_with_ytdlp when falling back
# @raise [DownloadError] when no fallback applies
def handle_ruby_error(error)
  log "Pure Ruby error: #{error.message}"

  fallback_allowed = @options.ytdlp_fallback && ytdlp_available? && !@tried_ytdlp
  if fallback_allowed && (error.message.include?('403') || error.is_a?(Extractor::ExtractionError))
    log "Falling back to yt-dlp"
    return download_with_ytdlp
  end

  raise DownloadError, "Download failed: #{error.message}"
end
242
+
243
# Print a prefixed progress message to stdout, but only in verbose mode.
#
# @param message [String] text to print
# @return [nil]
def log(message)
  return unless @options.verbose

  puts "[YoutubeRb] #{message}"
end
246
+
247
# Save the video stream to disk. The pure Ruby backend has a single
# transport, so this simply delegates to the HTTP downloader.
#
# @param output_file [String] destination path
# @return the value returned by #download_with_http
def download_video(output_file)
  download_with_http(output_file)
end
251
+
252
# Download the video to a temporary file and extract its audio track with
# FFmpeg into "<output_file without extension>.<audio_format>".
#
# The temporary video is removed in an `ensure`, so it no longer leaks when
# the download or the extraction raises. Only a trailing extension of the
# file name itself is stripped; a dot inside a directory name is left alone.
#
# @param output_file [String] planned output path; its extension is replaced
#   by the audio_format option
# @return the value returned by #extract_audio
# @raise whatever #download_with_http or #extract_audio raise
def download_audio(output_file)
  base_output = output_file.sub(%r{\.[^./\\]+\z}, '')
  audio_path = "#{base_output}.#{@options.audio_format}"

  temp_video = generate_temp_path
  begin
    download_with_http(temp_video)
    extract_audio(temp_video, audio_path)
  ensure
    File.delete(temp_video) if File.exist?(temp_video)
  end
end
261
+
262
# Stream the best available format of the current video to a file.
#
# Chunks are written as they arrive through the request's on_data hook. If
# the request fails (non-success status, network error, or any other
# exception) the partially written file is deleted instead of being left
# behind looking like a finished download.
#
# @param output_file [String] destination path (opened in binary mode)
# @return [nil]
# @raise [DownloadError] when no format/URL is known, the server answers with
#   a non-success status, or a Faraday error occurs
def download_with_http(output_file)
  chosen = @video_info.best_format
  raise DownloadError, "No suitable format found" unless chosen

  url = chosen[:url]
  raise DownloadError, "No URL found in format" unless url

  verbose = @options.respond_to?(:verbose) && @options.verbose
  puts "Downloading from: #{url[0..80]}..." if verbose

  downloaded = 0
  finished = false
  begin
    File.open(output_file, 'wb') do |file|
      response = http_client.get(url) do |req|
        req.options.on_data = proc do |chunk, overall_received_bytes|
          file.write(chunk)
          downloaded = overall_received_bytes
        end
      end

      raise DownloadError, "HTTP download failed with status #{response.status}" unless response.success?
    end
    finished = true
  ensure
    # Anything written before a failure is an incomplete or error payload.
    File.delete(output_file) if !finished && File.exist?(output_file)
  end

  puts "Downloaded #{(downloaded / 1024.0 / 1024.0).round(2)} MB" if verbose
rescue Faraday::Error => e
  raise DownloadError, "Network error during download: #{e.message}"
end
290
+
291
# Download every available subtitle track for the requested languages.
#
# Languages without a track are skipped. Nothing happens when the video has
# no subtitles at all.
#
# @param langs [Array<String>, nil] languages to fetch; defaults to the
#   subtitle_langs option
# @return [Array<String>, nil] the language list, or nil when the video has
#   no subtitles
def download_subtitles(langs = nil)
  langs ||= @options.subtitle_langs
  return if @video_info.subtitles.empty?

  langs.each do |lang|
    tracks = @video_info.get_subtitle(lang)
    tracks.each { |track| download_subtitle_file(track, lang) } if tracks
  end
end
304
+
305
# Write subtitles trimmed to one segment's time range, for each language in
# the subtitle_langs option that has a track.
#
# NOTE(review): the output path depends only on language and time range, so
# several tracks of one language end up writing to the same file.
#
# @param start_time [Numeric] segment start
# @param end_time [Numeric] segment end
# @return [Array<String>, nil] the language list, or nil when the video has
#   no subtitles
def download_subtitles_for_segment(start_time, end_time)
  langs = @options.subtitle_langs
  return if @video_info.subtitles.empty?

  langs.each do |lang|
    tracks = @video_info.get_subtitle(lang)
    next unless tracks

    tracks.each do |track|
      target = generate_subtitle_segment_path(lang, start_time, end_time)
      download_and_trim_subtitle(track, target, start_time, end_time)
    end
  end
end
319
+
320
# Fetch one subtitle track and store it next to the video.
#
# Subtitles are best-effort: a non-success HTTP status or any StandardError
# is reported with `warn` and does not abort the download. When the track's
# extension differs from the subtitle_format option the file is passed to
# #convert_subtitle_format.
#
# @param subtitle [Hash] track description; :url and :ext are read
# @param lang [String] language code used in the file name
def download_subtitle_file(subtitle, lang)
  source_ext = subtitle[:ext]
  output_file = generate_subtitle_path(lang, source_ext)

  begin
    response = http_client.get(subtitle[:url])

    if response.success?
      File.write(output_file, response.body)

      wanted = @options.subtitle_format
      convert_subtitle_format(output_file, wanted) if wanted != source_ext
    else
      warn "Failed to download subtitle: HTTP #{response.status}"
    end
  rescue => e
    warn "Failed to download subtitle for #{lang}: #{e.message}"
  end
end
341
+
342
# Fetch one subtitle track, cut it down to a time range and write the result.
#
# A non-success HTTP response is reported with `warn` and skipped, matching
# #download_subtitle_file, so an error page is never saved as a subtitle
# file.
#
# @param subtitle [Hash] track description; :url and :ext are read
# @param output_file [String] destination path
# @param start_time [Numeric] segment start
# @param end_time [Numeric] segment end
# @return [Integer, nil] bytes written, or nil when the download failed
def download_and_trim_subtitle(subtitle, output_file, start_time, end_time)
  response = http_client.get(subtitle[:url])

  unless response.success?
    warn "Failed to download subtitle: HTTP #{response.status}"
    return
  end

  trimmed_content = trim_subtitle_content(response.body, start_time, end_time, subtitle[:ext])
  File.write(output_file, trimmed_content)
end
351
+
352
# Cut a time range out of a local video file with FFmpeg (stream copy).
#
# The command is passed to Open3 as an argument vector, not a joined string,
# so no shell is involved: paths containing spaces, quotes or shell
# metacharacters (which can come from video titles) reach ffmpeg verbatim.
# `-y` is placed before the inputs, where global options conventionally go.
#
# @param input_file [String] source video path
# @param output_file [String] destination path (overwritten if present)
# @param start_time [Numeric] segment start, in seconds
# @param end_time [Numeric] segment end, in seconds
# @return [nil]
# @raise [DownloadError] when ffmpeg is missing or exits unsuccessfully
def extract_segment(input_file, output_file, start_time, end_time)
  unless ffmpeg_available?
    raise DownloadError, "FFmpeg is required for segment extraction. Please install ffmpeg."
  end

  duration = end_time - start_time

  cmd = [
    'ffmpeg',
    '-y',
    '-i', input_file.to_s,
    '-ss', start_time.to_s,
    '-t', duration.to_s,
    '-c', 'copy',
    '-avoid_negative_ts', '1',
    output_file.to_s
  ]

  _stdout, stderr, status = Open3.capture3(*cmd)

  raise DownloadError, "Segment extraction failed: #{stderr}" unless status.success?
end
376
+
377
# Extract the audio track of a local video file with FFmpeg.
#
# The codec comes from #audio_codec_for_format and the bitrate from the
# audio_quality option (in kbit/s). The command is passed to Open3 as an
# argument vector, so no shell is involved and paths with spaces, quotes or
# shell metacharacters are handled safely.
#
# @param input_file [String] source video path
# @param output_file [String] destination audio path (overwritten if present)
# @return [nil]
# @raise [DownloadError] when ffmpeg is missing or exits unsuccessfully
def extract_audio(input_file, output_file)
  unless ffmpeg_available?
    raise DownloadError, "FFmpeg is required for audio extraction. Please install ffmpeg."
  end

  cmd = [
    'ffmpeg',
    '-y',
    '-i', input_file.to_s,
    '-vn',
    '-acodec', audio_codec_for_format(@options.audio_format),
    '-ab', "#{@options.audio_quality}k",
    output_file.to_s
  ]

  _stdout, stderr, status = Open3.capture3(*cmd)

  raise DownloadError, "Audio extraction failed: #{stderr}" unless status.success?
end
398
+
399
# Restrict subtitle text to a time range when the format is understood.
#
# Only 'vtt' and 'srt' are trimmed; any other format is returned untouched.
#
# @param content [String] raw subtitle file content
# @param start_time [Numeric] range start
# @param end_time [Numeric] range end
# @param format [String] subtitle format identifier
# @return [String] trimmed (or original) content
def trim_subtitle_content(content, start_time, end_time, format)
  return content unless %w[vtt srt].include?(format)

  trim_vtt_or_srt(content, start_time, end_time)
end
407
+
408
# Keep only the cues of a VTT/SRT document that overlap a time range and
# shift their timestamps so the range start becomes 00:00:00.000.
#
# A cue is kept when it overlaps [start_time, end_time]; its timestamps are
# clamped to the range. Kept cues are collected line by line and emitted
# when their terminating blank line is seen. Because String#split drops
# trailing empty strings, the last cue of a document never sees such a blank
# line, so a pending cue is also flushed once the input is exhausted —
# previously the final cue was silently lost.
#
# @param content [String] subtitle document
# @param start_time [Numeric] range start, in seconds
# @param end_time [Numeric] range end, in seconds
# @return [String] the trimmed document
def trim_vtt_or_srt(content, start_time, end_time)
  timing = /(\d{2}:\d{2}:\d{2}[.,]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[.,]\d{3})/
  result = []
  current_block = []
  in_cue = false
  pending_cue = false # current_block holds the timing line of a kept cue

  content.split("\n").each do |line|
    if (match_data = line.match(timing))
      cue_start = parse_subtitle_time(match_data[1])
      cue_end = parse_subtitle_time(match_data[2])

      if cue_end >= start_time && cue_start <= end_time
        # Re-base the cue on the segment start and clamp it to the segment.
        adjusted_start = [cue_start - start_time, 0].max
        adjusted_end = [cue_end - start_time, end_time - start_time].min

        current_block << "#{format_subtitle_time(adjusted_start)} --> #{format_subtitle_time(adjusted_end)}"
        in_cue = true
        pending_cue = true
      else
        in_cue = false
        pending_cue = false
        current_block = []
      end
    elsif in_cue
      current_block << line
      if line.strip.empty? && current_block.size > 1
        result.concat(current_block)
        current_block = []
        pending_cue = false
      end
    elsif line.start_with?('WEBVTT', 'Kind:', 'Language:')
      result << line
    end
  end

  # The final cue has no trailing blank line left after split; emit it too.
  result.concat(current_block) if pending_cue

  result.join("\n")
end
444
+
445
# Convert a subtitle timestamp into seconds.
#
# Accepts "HH:MM:SS.mmm" as well as the comma form "HH:MM:SS,mmm".
#
# @param time_str [String] timestamp text
# @return [Float] offset in seconds
def parse_subtitle_time(time_str)
  hour_part, minute_part, second_part = time_str.tr(',', '.').split(':')
  whole_seconds, millis = second_part.split('.')

  hour_part.to_i * 3600 + minute_part.to_i * 60 + whole_seconds.to_i + millis.to_i / 1000.0
end
456
+
457
# Render a number of seconds as an "HH:MM:SS.mmm" timestamp.
#
# The value is first rounded to whole milliseconds and then split with
# integer arithmetic. Truncating the fractional part of a Float instead
# loses a millisecond for values such as 4.1 (stored as 4.0999…), which used
# to come out as "00:00:04.099".
#
# @param seconds [Numeric] non-negative offset in seconds
# @return [String] formatted timestamp
def format_subtitle_time(seconds)
  total_millis = (seconds * 1000).round
  total_seconds, millis = total_millis.divmod(1000)
  total_minutes, secs = total_seconds.divmod(60)
  hours, minutes = total_minutes.divmod(60)

  format("%02d:%02d:%02d.%03d", hours, minutes, secs, millis)
end
465
+
466
# Write the video information as pretty-printed JSON next to the video.
#
# @return [Integer] number of bytes written
def download_metadata
  payload = JSON.pretty_generate(@video_info.to_h)
  File.write(generate_metadata_path, payload)
end
470
+
471
# Download the video's thumbnail image, if it has one.
#
# The response status is checked so an HTTP error page is not saved as an
# image; failures are reported with `warn` and skipped, in line with the
# other best-effort sidecar downloads. The body is written in binary mode
# because image data must not go through text-mode translation.
#
# @return [Integer, nil] bytes written, or nil when there is no thumbnail or
#   the download failed
def download_thumbnail
  return unless @video_info.thumbnail

  output_file = generate_thumbnail_path
  response = http_client.get(@video_info.thumbnail)

  unless response.success?
    warn "Failed to download thumbnail: HTTP #{response.status}"
    return
  end

  File.binwrite(output_file, response.body)
end
478
+
479
# Save the video description to a text file, when the video has one.
#
# @return [Integer, nil] bytes written, or nil when there is no description
def download_description
  text = @video_info.description
  return unless text

  File.write(generate_description_path, text)
end
485
+
486
# Give a subtitle file the extension of the requested format.
#
# This only renames the file; the content is not converted.
# TODO: Add proper conversion logic for different formats
#
# @param input_file [String] existing subtitle file
# @param target_format [String] desired extension, without the dot
# @return [nil] when the file already has that extension; otherwise the
#   result of FileUtils.mv
def convert_subtitle_format(input_file, target_format)
  wanted_ext = ".#{target_format}"
  return if File.extname(input_file) == wanted_ext

  FileUtils.mv(input_file, input_file.sub(/\.[^.]+$/, wanted_ext))
end
496
+
497
# Name of the FFmpeg audio encoder to use for an output format.
#
# Unknown formats map to 'copy' (the audio stream is copied unchanged).
#
# @param format [String] audio format / extension
# @return [String] value for ffmpeg's -acodec option
def audio_codec_for_format(format)
  encoders = {
    'mp3' => 'libmp3lame',
    'aac' => 'aac',
    'm4a' => 'aac',
    'opus' => 'libopus',
    'vorbis' => 'libvorbis',
    'ogg' => 'libvorbis',
    'flac' => 'flac',
    'wav' => 'pcm_s16le'
  }

  encoders.fetch(format, 'copy')
end
515
+
516
# Build the output path for a video by expanding the output template.
#
# Supported placeholders: %(title)s, %(id)s, %(ext)s and %(uploader)s. All
# of them are expanded in one pass with a block replacement, so
# - placeholder-looking text inside a value (e.g. a title that contains
#   "%(id)s") is not expanded a second time, and
# - backslash sequences inside a value are inserted literally instead of
#   being read as back-references.
#
# @param video_info [#title, #id, #ext, #uploader] details of the video
# @return [String] path inside the configured output directory
def generate_output_path(video_info)
  extension = @options.extract_audio ? @options.audio_format : (video_info.ext || 'mp4')

  values = {
    'title' => sanitize_filename(video_info.title),
    'id' => video_info.id.to_s,
    'ext' => extension.to_s,
    'uploader' => sanitize_filename(video_info.uploader || 'unknown')
  }

  filename = @options.output_template.gsub(/%\((title|id|ext|uploader)\)s/) do
    values[Regexp.last_match(1)]
  end

  File.join(@options.output_path, filename)
end
528
+
529
# Default file path for a segment cut from a video:
# "<title>-<id>-segment-<start>-<end>.<ext>" inside the output directory
# ('mp4' when the video info has no extension).
#
# @param video_info [#title, #id, #ext] details of the source video
# @param start_time [Numeric] segment start
# @param end_time [Numeric] segment end
# @return [String]
def generate_segment_output_path(video_info, start_time, end_time)
  title = sanitize_filename(video_info.title)
  extension = video_info.ext || 'mp4'

  File.join(@options.output_path, "#{title}-#{video_info.id}-segment-#{start_time}-#{end_time}.#{extension}")
end
533
+
534
# File path for a full-length subtitle track:
# "<title>-<id>.<lang>.<ext>" inside the output directory.
#
# @param lang [String] language code
# @param ext [String] subtitle file extension, without the dot
# @return [String]
def generate_subtitle_path(lang, ext)
  stem = "#{sanitize_filename(@video_info.title)}-#{@video_info.id}"

  File.join(@options.output_path, "#{stem}.#{lang}.#{ext}")
end
538
+
539
# File path for the subtitles of one segment:
# "<title>-<id>-segment-<start>-<end>.<lang>.<subtitle_format>" inside the
# output directory.
#
# @param lang [String] language code
# @param start_time [Numeric] segment start
# @param end_time [Numeric] segment end
# @return [String]
def generate_subtitle_segment_path(lang, start_time, end_time)
  stem = "#{sanitize_filename(@video_info.title)}-#{@video_info.id}-segment-#{start_time}-#{end_time}"

  File.join(@options.output_path, "#{stem}.#{lang}.#{@options.subtitle_format}")
end
543
+
544
# File path for the metadata dump: "<title>-<id>.info.json" inside the
# output directory.
#
# @return [String]
def generate_metadata_path
  stem = "#{sanitize_filename(@video_info.title)}-#{@video_info.id}"

  File.join(@options.output_path, "#{stem}.info.json")
end
548
+
549
# File path for the thumbnail: "<title>-<id><ext>" inside the output
# directory.
#
# The extension is taken from the path part of the thumbnail URL only; the
# query string and fragment are cut off first, because dots or slashes in
# them would otherwise be mistaken for the extension (e.g. "…/hq.webp?v=1.5"
# used to yield ".5"). Falls back to ".jpg" when the URL has no extension.
#
# @return [String]
def generate_thumbnail_path
  url_path = @video_info.thumbnail.to_s.split(/[?#]/, 2).first.to_s
  ext = File.extname(url_path)
  ext = '.jpg' if ext.empty?

  filename = "#{sanitize_filename(@video_info.title)}-#{@video_info.id}#{ext}"
  File.join(@options.output_path, filename)
end
554
+
555
# File path for the description text: "<title>-<id>.description" inside the
# output directory.
#
# @return [String]
def generate_description_path
  stem = "#{sanitize_filename(@video_info.title)}-#{@video_info.id}"

  File.join(@options.output_path, "#{stem}.description")
end
559
+
560
# Path of a hidden temporary video file inside the output directory.
#
# The name combines the current Unix time with 16 random hex digits. The
# previous `rand(1000)` suffix gave only 1000 possibilities per second, so
# two downloads started in the same second could pick the same file.
#
# @return [String]
def generate_temp_path
  File.join(@options.output_path, ".temp_#{Time.now.to_i}_#{SecureRandom.hex(8)}.mp4")
end
563
+
564
# Turn arbitrary text (typically a video title) into a safe file name part.
#
# - the input is converted with to_s, so nil and non-String values work
# - invalid byte sequences are replaced, so the regexps below cannot raise
# - runs of whitespace collapse to one space
# - characters that are illegal in file names on common platforms
#   (/ \ : * ? " < > |) and remaining control characters become '_'
# - surrounding whitespace is removed
#
# The 'video' fallback is applied after cleaning, so a whitespace-only title
# no longer produces an empty file name.
#
# @param filename [#to_s, nil] raw name
# @return [String] non-empty sanitized name
def sanitize_filename(filename)
  cleaned = filename.to_s
                    .scrub('_')
                    .gsub(/\s+/, ' ')
                    .gsub(/[\/\\:*?"<>|\x00-\x1f\x7f]/, '_')
                    .strip

  cleaned.empty? ? 'video' : cleaned
end
572
+
573
# Create the configured output directory (including parents) if it does not
# exist yet.
#
# @return [nil] when the directory is already there; otherwise the result of
#   FileUtils.mkdir_p
def ensure_output_directory
  target_dir = @options.output_path
  return if Dir.exist?(target_dir)

  FileUtils.mkdir_p(target_dir)
end
576
+
577
# Whether a segment length lies within the configured limits (both bounds
# inclusive).
#
# @param duration [Numeric] segment length
# @return [Boolean]
def valid_segment_duration?(duration)
  duration.between?(@options.min_segment_duration, @options.max_segment_duration)
end
580
+
581
# Path of a local copy of the full video to cut segments from.
#
# A previously downloaded copy is reused while its file still exists.
# Otherwise video info is extracted (if not known yet, since segment naming
# needs it), a fresh cache path is chosen and remembered, and the video is
# downloaded there through the yt-dlp wrapper.
#
# @return [String] path of the cached full video
def get_full_video_for_segmentation_with_ytdlp
  existing = @cached_video_path
  if existing && File.exist?(existing)
    log "Using cached video: #{existing}"
    return existing
  end

  @video_info ||= @extractor.extract_info

  @cached_video_path = generate_cache_path
  log "Downloading full video via yt-dlp for segmentation: #{@cached_video_path}"
  ytdlp_wrapper.download(@url, @cached_video_path)

  @cached_video_path
end
598
+
599
# Delete the cached full video, if one is recorded and still on disk, and
# forget its path.
#
# @return [nil]
def cleanup_video_cache
  stale = @cached_video_path
  return unless stale && File.exist?(stale)

  log "Cleaning up cached video: #{stale}"
  File.delete(stale)
  @cached_video_path = nil
end
606
+
607
# Path of a hidden cache file for the full video inside the output
# directory.
#
# The name combines the current Unix time with 16 random hex digits. The
# previous `rand(10000)` suffix could collide when several downloaders start
# within the same second and share an output directory.
#
# @return [String]
def generate_cache_path
  File.join(@options.output_path, ".cache_#{Time.now.to_i}_#{SecureRandom.hex(8)}.mp4")
end
610
+
611
# Whether an `ffmpeg` executable can be found on the PATH.
#
# The PATH is searched directly instead of shelling out to `which`, so no
# shell is spawned and the result is always true or false. Platforms without
# a `which` command work too: every extension listed in PATHEXT (set on
# Windows) is tried in addition to the bare name.
#
# @return [Boolean]
def ffmpeg_available?
  suffixes = [''] + ENV.fetch('PATHEXT', '').split(';')

  ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).any? do |dir|
    next false if dir.empty?

    suffixes.any? do |suffix|
      candidate = File.join(dir, "ffmpeg#{suffix}")
      File.file?(candidate) && File.executable?(candidate)
    end
  end
end
614
+
615
# Shared Faraday connection used for all HTTP downloads, built on first use.
#
# Configuration: retry middleware (count taken from the retries option, 0.5s
# interval, backoff factor 2), the default adapter, a 600s read timeout and
# 30s open timeout, browser-like request headers, an optional Referer, and a
# "Range: bytes=0-" header when the continue_download option is set.
#
# @return [Faraday::Connection]
def http_client
  return @http_client if @http_client

  fallback_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  @http_client = Faraday.new do |conn|
    conn.request :retry, max: @options.retries, interval: 0.5, backoff_factor: 2
    conn.adapter Faraday.default_adapter

    # Generous read timeout: large video files are streamed over this client.
    conn.options.timeout = 600
    conn.options.open_timeout = 30

    conn.headers['User-Agent'] = @options.user_agent || fallback_agent
    conn.headers['Accept'] = '*/*'
    conn.headers['Accept-Language'] = 'en-US,en;q=0.9'
    conn.headers['Referer'] = @options.referer if @options.referer

    # Range header is sent on every request made through this client.
    conn.headers['Range'] = 'bytes=0-' if @options.continue_download
  end
end
631
+ end
632
+ end