youtube-rb 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,425 @@
1
+ require 'json'
2
+ require 'faraday'
3
+ require 'faraday/retry'
4
+ require 'nokogiri'
5
+ require 'cgi'
6
+ require 'base64'
7
+ require 'date'
8
+
9
module YoutubeRb
  # Extracts metadata, stream formats and caption tracks for a single YouTube
  # video by downloading its page and reading the `ytInitialPlayerResponse`
  # JSON object embedded in it.
  #
  # All failures surface as {ExtractionError}.
  class Extractor
    # Raised for every failure while fetching or parsing a page.
    class ExtractionError < StandardError; end

    # User-Agent sent when the options do not provide one.
    DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'.freeze

    # Matches URLs whose *host* is youtube.com / youtu.be (or a subdomain),
    # with or without a scheme. Anchored so that look-alike hosts
    # ("evilyoutube.com") or a YouTube URL buried in another site's query
    # string are not treated as YouTube pages.
    YOUTUBE_HOST_PATTERN =
      %r{\A(?:(?:https?:)?//)?(?:[A-Za-z0-9-]+\.)*(?:youtube\.com|youtu\.be)(?:[/:?#]|\z)}.freeze

    # Patterns that capture the 11-character video ID, tried in order.
    VIDEO_ID_PATTERNS = [
      # `v` must be a whole query parameter name, not the tail of another one.
      %r{youtube\.com/watch\?(?:[^#]*&)?v=([a-zA-Z0-9_-]{11})},
      %r{youtu\.be/([a-zA-Z0-9_-]{11})},
      %r{youtube\.com/(?:embed|v|shorts|live)/([a-zA-Z0-9_-]{11})}
    ].freeze

    # Patterns locating the opening brace of the player response object,
    # tried in order against the raw page.
    PLAYER_RESPONSE_PATTERNS = [
      /ytInitialPlayerResponse\s*=\s*(\{)/,
      /var\s+ytInitialPlayerResponse\s*=\s*(\{)/
    ].freeze

    # MIME type fragment => file extension.
    EXTENSIONS_BY_MIME = {
      'video/mp4' => 'mp4',
      'video/webm' => 'webm',
      'audio/mp4' => 'm4a',
      'audio/webm' => 'webm'
    }.freeze

    # Extension reported for MIME types missing from EXTENSIONS_BY_MIME.
    FALLBACK_EXTENSION = 'mp4'.freeze

    attr_reader :url, :options

    # @param url [String] page URL of the video
    # @param options [Hash, #user_agent, #referer, #cookies_file] request
    #   settings; either a Hash with symbol keys or an object exposing the
    #   same names as reader methods
    def initialize(url, options = {})
      @url = url
      @options = options
      @http_client = build_http_client
    end

    # Fetches the page and wraps the extracted data in a VideoInfo.
    #
    # @return [VideoInfo]
    # @raise [ExtractionError] when the URL is not a YouTube URL or the page
    #   cannot be fetched or parsed
    def extract_info
      info = extract_youtube_info
      raise ExtractionError, "Failed to extract video information from #{@url}" unless info

      VideoInfo.new(info)
    end

    # @return the `formats` of a freshly extracted VideoInfo
    # @raise [ExtractionError] see {#extract_info}
    def extract_formats
      extract_info.formats
    end

    # @return the `subtitles` of a freshly extracted VideoInfo
    # @raise [ExtractionError] see {#extract_info}
    def extract_subtitles
      extract_info.subtitles
    end

    private

    # Downloads the page and parses it.
    #
    # @return [Hash, nil] the info hash, or nil when the URL is not a YouTube URL
    # @raise [ExtractionError] on HTTP, network or parsing failures
    def extract_youtube_info
      return nil unless youtube_url?

      response = @http_client.get(@url)
      raise ExtractionError, "Failed to fetch page: HTTP #{response.status}" unless response.success?

      parse_youtube_page(normalize_body(response.body))
    rescue ExtractionError
      # Already carries a precise message; do not bury it inside a generic
      # "Extraction failed" wrapper with backtrace noise.
      raise
    rescue Faraday::Error => e
      raise ExtractionError, "Network error: #{e.message}"
    rescue StandardError => e
      raise ExtractionError, "Extraction failed: #{e.message}\n#{Array(e.backtrace).first(3).join("\n")}"
    end

    # Returns the body as valid UTF-8, dropping bytes that are not valid in
    # that encoding. Works on a copy so the response object is left untouched.
    #
    # @param body [String, nil]
    # @return [String]
    def normalize_body(body)
      body.to_s.dup.force_encoding(Encoding::UTF_8).scrub('')
    end

    # Builds the info hash from the page HTML.
    #
    # @param html [String]
    # @return [Hash] string-keyed video information
    # @raise [ExtractionError] when the video ID or player response is missing
    def parse_youtube_page(html)
      video_id = extract_video_id(@url)
      raise ExtractionError, "Could not extract video ID from URL" unless video_id

      player_response = extract_player_response(html)
      raise ExtractionError, "Could not extract player response from page" unless player_response

      video_details = player_response['videoDetails'] || {}
      streaming_data = player_response['streamingData'] || {}
      microformat = player_response.dig('microformat', 'playerMicroformatRenderer') || {}

      if streaming_data['formats'] || streaming_data['adaptiveFormats']
        streaming_data = decrypt_streaming_data(streaming_data, html)
      end

      title = video_details['title'] || microformat['title']

      {
        'id' => video_id,
        'title' => title,
        'fulltitle' => title,
        'description' => video_details['shortDescription'] || microformat['description'],
        'duration' => video_details['lengthSeconds']&.to_i,
        'view_count' => video_details['viewCount']&.to_i,
        'uploader' => video_details['author'] || microformat['ownerChannelName'],
        'uploader_id' => video_details['channelId'] || microformat['externalChannelId'],
        'upload_date' => parse_upload_date(microformat['uploadDate']),
        'thumbnail' => extract_thumbnail(video_details, microformat),
        'formats' => parse_formats(streaming_data),
        'subtitles' => parse_captions(player_response['captions']),
        'webpage_url' => @url,
        'ext' => 'mp4'
      }
    end

    # Locates and parses the `ytInitialPlayerResponse` object.
    #
    # The raw page is searched first (plain assignment, then `var`
    # declaration); only if neither yields parseable JSON is the page parsed
    # with Nokogiri and each <script> element searched individually.
    #
    # @param html [String]
    # @return [Hash, nil] parsed player response, or nil when none is found
    def extract_player_response(html)
      PLAYER_RESPONSE_PATTERNS.each do |pattern|
        parsed = parse_player_response(html, pattern)
        return parsed if parsed
      end

      Nokogiri::HTML(html).css('script').each do |script|
        content = script.content
        next unless content.include?('ytInitialPlayerResponse')

        parsed = parse_player_response(content, PLAYER_RESPONSE_PATTERNS.first)
        return parsed if parsed
      end

      nil
    end

    # Parses the JSON object that starts at the first match of +pattern+.
    #
    # @param text [String]
    # @param pattern [Regexp] must capture the opening brace as group 1
    # @return [Hash, nil] nil when there is no match or the JSON is invalid
    def parse_player_response(text, pattern)
      match = text.match(pattern)
      return nil unless match

      JSON.parse(extract_balanced_json_from_position(text, match.begin(1)))
    rescue JSON::ParserError
      nil
    end

    # Returns the substring starting at +start_pos+ and ending at the brace
    # that balances the braces seen so far. Braces inside double-quoted
    # strings are ignored and a backslash skips the character after it.
    #
    # Runs in a single pass and slices once, instead of appending one
    # character at a time.
    #
    # @param str [String]
    # @param start_pos [Integer] character index of the opening brace
    # @return [String] the balanced text, or everything from +start_pos+ to
    #   the end when the braces never balance
    def extract_balanced_json_from_position(str, start_pos)
      tail = str[start_pos..-1].to_s
      depth = 0
      in_string = false
      escape_next = false

      tail.each_char.with_index do |char, offset|
        if escape_next
          escape_next = false
          next
        end

        case char
        when '\\'
          escape_next = true
        when '"'
          in_string = !in_string
        when '{'
          depth += 1 unless in_string
        when '}'
          depth -= 1 unless in_string
          return tail[0..offset] if depth.zero?
        end
      end

      tail
    end

    # Placeholder for signature decryption: currently returns the streaming
    # data unchanged.
    #
    # @param streaming_data [Hash]
    # @param html [String] unused for now
    # @return [Hash]
    def decrypt_streaming_data(streaming_data, html)
      streaming_data
    end

    # Picks the last listed thumbnail from the video details, then from the
    # microformat, and finally falls back to the conventional
    # maxresdefault URL for the video ID.
    #
    # @return [String]
    def extract_thumbnail(video_details, microformat)
      video_details.dig('thumbnail', 'thumbnails', -1, 'url') ||
        microformat.dig('thumbnail', 'thumbnails', -1, 'url') ||
        "https://i.ytimg.com/vi/#{extract_video_id(@url)}/maxresdefault.jpg"
    end

    # Converts a parseable date string (e.g. "2005-04-23") to "YYYYMMDD".
    #
    # @param date_str [String, nil]
    # @return [String, nil] nil when the value is missing or unparseable
    def parse_upload_date(date_str)
      return nil unless date_str

      Date.parse(date_str).strftime('%Y%m%d')
    rescue StandardError
      # Best effort: an odd date must not abort the whole extraction.
      nil
    end

    # Collects every format that resolves to a URL: first the regular
    # (`formats`) entries, then the adaptive (`adaptiveFormats`) ones.
    #
    # @param streaming_data [Hash]
    # @return [Array<Hash>]
    def parse_formats(streaming_data)
      %w[formats adaptiveFormats]
        .flat_map { |key| streaming_data[key] || [] }
        .map { |format| parse_format(format) }
        .compact
    end

    # Normalises one raw format entry.
    #
    # @param format_data [Hash]
    # @return [Hash, nil] nil when no URL can be determined
    def parse_format(format_data)
      url = format_data['url']
      unless url
        cipher = format_data['signatureCipher'] || format_data['cipher']
        url = decode_cipher(cipher) if cipher
      end
      return nil unless url

      mime_type = format_data['mimeType']
      bitrate = format_data['bitrate']

      {
        'format_id' => format_data['itag']&.to_s,
        'url' => url,
        'ext' => extract_extension(mime_type),
        'width' => format_data['width'],
        'height' => format_data['height'],
        'fps' => format_data['fps'],
        'quality' => format_data['quality'],
        'qualityLabel' => format_data['qualityLabel'],
        'tbr' => bitrate ? (bitrate / 1000.0).round : nil,
        'filesize' => format_data['contentLength']&.to_i,
        'vcodec' => extract_video_codec(mime_type),
        'acodec' => extract_audio_codec(mime_type),
        'format_note' => format_data['qualityLabel'] || format_data['quality']
      }
    end

    # Returns the `url` parameter of a cipher string ("s=signature&url=URL").
    #
    # NOTE(review): the signature is not decrypted or applied, so the
    # returned URL presumably is not directly downloadable - confirm.
    #
    # @param cipher_string [String]
    # @return [String, nil]
    def decode_cipher(cipher_string)
      CGI.parse(cipher_string)['url']&.first
    end

    # Builds the subtitles hash from the `captionTracks` list. A later track
    # with the same language code replaces an earlier one.
    #
    # @param captions_data [Hash, nil]
    # @return [Hash{String => Array<Hash>}] language code => single-entry list
    def parse_captions(captions_data)
      renderer = captions_data && captions_data['playerCaptionsTracklistRenderer']
      return {} unless renderer

      (renderer['captionTracks'] || []).each_with_object({}) do |track, subtitles|
        lang = track['languageCode']
        url = track['baseUrl']
        next unless lang && url

        subtitles[lang] = [{
          'ext' => 'vtt',
          'url' => url,
          'name' => caption_track_name(track) || lang
        }]
      end
    end

    # Display name of a caption track; the raw value may be a plain value or
    # a Hash carrying `simpleText` / `runs`.
    def caption_track_name(track)
      name = track['name']
      return name unless name.is_a?(Hash)

      name['simpleText'] || name.dig('runs', 0, 'text')
    end

    # Extracts the 11-character video ID from watch, youtu.be, embed, /v/,
    # shorts and live URLs.
    #
    # @param url [String, nil]
    # @return [String, nil]
    def extract_video_id(url)
      text = url.to_s

      VIDEO_ID_PATTERNS.each do |pattern|
        match = text.match(pattern)
        return match[1] if match
      end

      nil
    end

    # Maps a MIME type to a file extension.
    #
    # @param mime_type [String, nil]
    # @return [String, nil] nil only when +mime_type+ is nil; unknown types
    #   map to "mp4"
    def extract_extension(mime_type)
      return nil unless mime_type

      _fragment, extension = EXTENSIONS_BY_MIME.find { |fragment, _| mime_type.include?(fragment) }
      extension || FALLBACK_EXTENSION
    end

    # First codec listed for a video MIME type.
    #
    # @param mime_type [String, nil]
    # @return [String] "none" for non-video types, "unknown" when the type
    #   lists no codecs
    def extract_video_codec(mime_type)
      return 'none' unless mime_type && mime_type.include?('video')

      codec_names(mime_type).first || 'unknown'
    end

    # Audio codec for a MIME type.
    #
    # For audio types this is the last listed codec. For video types it is
    # the last codec only when more than one is listed (muxed formats such as
    # `video/mp4; codecs="avc1.42001E, mp4a.40.2"`), so that combined formats
    # are no longer reported as having no audio.
    #
    # @param mime_type [String, nil]
    # @return [String] "none" when there is no audio, "unknown" when an audio
    #   type lists no codecs
    def extract_audio_codec(mime_type)
      return 'none' unless mime_type

      names = codec_names(mime_type)
      if mime_type.include?('audio')
        names.last || 'unknown'
      elsif mime_type.include?('video') && names.length > 1
        names.last
      else
        'none'
      end
    end

    # Codec names from the `codecs="..."` attribute of a MIME type.
    #
    # @return [Array<String>] stripped, non-empty names; empty when absent
    def codec_names(mime_type)
      match = mime_type.match(/codecs="([^"]+)"/)
      return [] unless match

      match[1].split(',').map(&:strip).reject(&:empty?)
    end

    # Whether @url points at a YouTube host (see YOUTUBE_HOST_PATTERN).
    def youtube_url?
      YOUTUBE_HOST_PATTERN.match?(@url.to_s)
    end

    # Reads a setting from @options, which may be a Hash with symbol keys or
    # an object exposing the setting as a reader method.
    #
    # @param key [Symbol]
    # @return [Object, nil]
    def option_value(key)
      if @options.is_a?(Hash)
        @options[key]
      elsif @options.respond_to?(key)
        @options.public_send(key)
      end
    end

    # Builds the Faraday connection: retry middleware, timeouts, User-Agent
    # and, when configured, Referer and Cookie headers.
    def build_http_client
      Faraday.new do |f|
        f.request :retry, max: 3, interval: 1, backoff_factor: 2
        f.adapter Faraday.default_adapter
        f.options.timeout = 60
        f.options.open_timeout = 30

        f.headers['User-Agent'] = option_value(:user_agent) || DEFAULT_USER_AGENT

        referer = option_value(:referer)
        f.headers['Referer'] = referer if referer

        # The file content is sent verbatim as the Cookie header.
        # NOTE(review): presumably expects a ready-made "name=value; ..."
        # string rather than a Netscape cookie jar - confirm.
        cookies_file = option_value(:cookies_file)
        if cookies_file && File.exist?(cookies_file)
          f.headers['Cookie'] = File.read(cookies_file).strip
        end
      end
    end
  end
end
@@ -0,0 +1,186 @@
1
module YoutubeRb
  # Mutable set of download and extraction settings.
  #
  # Every setting is exposed through a reader and a writer. Settings that are
  # not passed to the constructor take the value listed in DEFAULTS; unknown
  # keys passed to the constructor are ignored.
  class Options
    # Accepted values for +segment_mode+.
    SEGMENT_MODES = %i[fast precise].freeze

    # Setting name => default value. The key order is the order used by #to_h.
    DEFAULTS = {
      # Backend selection
      use_ytdlp: false,
      ytdlp_fallback: true,
      verbose: false,

      # Segment cutting options
      # :fast (default) - Fast stream copy mode, cuts at keyframes (10x faster, ±2-5s accuracy)
      # :precise - Precise mode with re-encoding, exact timestamps (slow, frame-perfect)
      segment_mode: :fast,
      # Segment duration limits (in seconds)
      min_segment_duration: 10,
      max_segment_duration: 60,
      # Cache full video for multiple segment extractions (Pure Ruby backend only)
      cache_full_video: false,

      # Download and output basics
      format: 'best',
      quality: 'best',
      output_path: './downloads',
      output_template: '%(title)s-%(id)s.%(ext)s',

      # Video selection
      playlist_start: nil,
      playlist_end: nil,
      playlist_items: nil,
      max_downloads: nil,
      min_filesize: nil,
      max_filesize: nil,

      # Download tuning
      rate_limit: nil,
      retries: 10,
      buffer_size: 1024,

      # Subtitle options
      write_subtitles: false,
      write_auto_sub: false,
      subtitle_format: 'srt',
      subtitle_langs: ['en'],

      # Authentication options
      username: nil,
      password: nil,
      netrc: nil,
      video_password: nil,
      cookies_file: nil,

      # Network options
      user_agent: nil,
      referer: nil,

      # Audio extraction
      extract_audio: false,
      audio_format: 'mp3',
      audio_quality: '192',

      # Time range options for video segments
      time_range: nil,
      start_time: nil,
      end_time: nil,

      # Filesystem options
      no_overwrites: false,
      continue_download: true,
      no_part: false,
      write_description: false,
      write_info_json: false,
      write_thumbnail: false,

      # Video selection filters
      match_title: nil,
      reject_title: nil,
      date: nil,
      datebefore: nil,
      dateafter: nil,
      min_views: nil,
      max_views: nil,
      no_playlist: false,
      yes_playlist: false
    }.freeze

    attr_accessor(*DEFAULTS.keys)

    # @param options [Hash{Symbol => Object}] settings overriding DEFAULTS;
    #   a key that is present is used as given, even when its value is nil
    # @raise [ArgumentError] when +segment_mode+ is not :fast or :precise,
    #   when a segment duration is not numeric, when +min_segment_duration+
    #   is below 1, or when +max_segment_duration+ is below the minimum
    def initialize(**options)
      DEFAULTS.each do |name, default|
        value = options.fetch(name) { fresh_default(default) }
        instance_variable_set("@#{name}", value)
      end

      validate_segment_options!
    end

    # @return [Hash{Symbol => Object}] current value of every setting, keyed
    #   and ordered like DEFAULTS
    def to_h
      DEFAULTS.each_key.with_object({}) do |name, hash|
        hash[name] = instance_variable_get("@#{name}")
      end
    end

    # Copies every non-nil value whose key names a writable setting onto this
    # object. Mutates and returns the receiver; unknown keys are skipped.
    #
    # @param other_options [Hash, nil]
    # @return [Options] self
    def merge(other_options)
      return self unless other_options

      other_options.each do |key, value|
        next if value.nil?

        setter = "#{key}="
        public_send(setter, value) if respond_to?(setter)
      end
      self
    end

    private

    # Hands out an independent copy of a default so that mutating one
    # instance's value (e.g. appending to +subtitle_langs+) cannot leak into
    # DEFAULTS or other instances.
    def fresh_default(value)
      case value
      when Array then value.map { |item| fresh_default(item) }
      when String then value.dup
      else value
      end
    end

    # Checks the segment settings, raising ArgumentError with a descriptive
    # message for every rejected value (including nil and other non-numeric
    # durations, which would otherwise fail inside the comparison).
    def validate_segment_options!
      unless SEGMENT_MODES.include?(@segment_mode)
        raise ArgumentError, "segment_mode must be :fast or :precise, got: #{@segment_mode.inspect}"
      end

      unless @min_segment_duration.is_a?(Numeric)
        raise ArgumentError, "min_segment_duration must be a number, got: #{@min_segment_duration.inspect}"
      end
      if @min_segment_duration < 1
        raise ArgumentError, "min_segment_duration must be at least 1 second, got: #{@min_segment_duration}"
      end

      unless @max_segment_duration.is_a?(Numeric)
        raise ArgumentError, "max_segment_duration must be a number, got: #{@max_segment_duration.inspect}"
      end
      if @max_segment_duration < @min_segment_duration
        raise ArgumentError, "max_segment_duration (#{@max_segment_duration}) must be >= min_segment_duration (#{@min_segment_duration})"
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the library.
module YoutubeRb
  # Version string of the library.
  VERSION = '0.2.0'
end