youtube-transcript-rb 0.1.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/.rubocop_todo.yml +166 -0
- data/README.md +42 -42
- data/lib/youtube-transcript-rb.rb +4 -0
- data/lib/youtube_rb/formatters.rb +263 -0
- data/lib/youtube_rb/transcript/api.rb +144 -0
- data/lib/youtube_rb/transcript/errors.rb +215 -0
- data/lib/youtube_rb/transcript/settings.rb +26 -0
- data/lib/youtube_rb/transcript/transcript.rb +237 -0
- data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
- data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +220 -0
- data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
- data/lib/youtube_rb/transcript.rb +33 -0
- data/lib/youtube_rb/version.rb +5 -0
- data/sig/youtube_rb/transcript.rbs +4 -0
- data/spec/api_spec.rb +27 -27
- data/spec/errors_spec.rb +41 -41
- data/spec/formatters_spec.rb +45 -46
- data/spec/integration_spec.rb +39 -48
- data/spec/settings_spec.rb +16 -16
- data/spec/spec_helper.rb +52 -52
- data/spec/transcript_list_fetcher_spec.rb +38 -33
- data/spec/transcript_list_spec.rb +16 -19
- data/spec/transcript_parser_spec.rb +3 -3
- data/spec/transcript_spec.rb +23 -24
- metadata +17 -13
- data/lib/youtube/transcript/rb/api.rb +0 -150
- data/lib/youtube/transcript/rb/errors.rb +0 -217
- data/lib/youtube/transcript/rb/formatters.rb +0 -269
- data/lib/youtube/transcript/rb/settings.rb +0 -28
- data/lib/youtube/transcript/rb/transcript.rb +0 -239
- data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
- data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
- data/lib/youtube/transcript/rb/version.rb +0 -9
- data/lib/youtube/transcript/rb.rb +0 -37
- data/sig/youtube/transcript/rb.rbs +0 -8
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module YoutubeRb
  module Transcript
    # Represents the transcripts that are available for one YouTube video.
    #
    # The class is Enumerable: iteration yields every manually created
    # transcript first, followed by every auto-generated one. The +find_*+
    # methods select a transcript by language code.
    class TranscriptList
      include Enumerable

      # @return [String] the video ID this TranscriptList is for
      attr_reader :video_id

      # Build a TranscriptList from captions JSON data.
      #
      # Each entry of +captionTracks+ becomes a TranscriptMetadata, stored under
      # its language code in either the generated or the manually created hash
      # (a later track with the same kind and language code replaces an earlier
      # one). Each entry of +translationLanguages+ becomes a TranslationLanguage.
      #
      # @param http_client [Faraday::Connection] the HTTP client for fetching transcripts
      # @param video_id [String] the YouTube video ID
      # @param captions_json [Hash] the captions JSON parsed from YouTube
      # @return [TranscriptList] the created TranscriptList
      def self.build(http_client:, video_id:, captions_json:)
        translation_languages = (captions_json["translationLanguages"] || []).map do |tl|
          TranslationLanguage.new(
            language: tl.dig("languageName", "runs", 0, "text") || "",
            language_code: tl["languageCode"]
          )
        end

        manually_created_transcripts = {}
        generated_transcripts = {}

        (captions_json["captionTracks"] || []).each do |caption|
          # Tracks whose "kind" is "asr" are the auto-generated ones.
          is_generated = caption.fetch("kind", "") == "asr"
          target_dict = is_generated ? generated_transcripts : manually_created_transcripts
          language_code = caption["languageCode"]

          target_dict[language_code] = TranscriptMetadata.new(
            http_client: http_client,
            video_id: video_id,
            # Drop the "&fmt=srv3" query parameter from the track URL.
            url: caption["baseUrl"].to_s.gsub("&fmt=srv3", ""),
            language: caption.dig("name", "runs", 0, "text") || "",
            language_code: language_code,
            is_generated: is_generated,
            # Only translatable tracks are offered the translation languages.
            translation_languages: caption.fetch("isTranslatable", false) ? translation_languages : []
          )
        end

        new(
          video_id: video_id,
          manually_created_transcripts: manually_created_transcripts,
          generated_transcripts: generated_transcripts,
          translation_languages: translation_languages
        )
      end

      # @param video_id [String] the YouTube video ID
      # @param manually_created_transcripts [Hash<String, TranscriptMetadata>] manually created transcripts
      #   by language code
      # @param generated_transcripts [Hash<String, TranscriptMetadata>] auto-generated transcripts
      #   by language code
      # @param translation_languages [Array<TranslationLanguage>] available translation languages
      def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
        @video_id = video_id
        @manually_created_transcripts = manually_created_transcripts
        @generated_transcripts = generated_transcripts
        @translation_languages = translation_languages
      end

      # Iterate over all transcripts (manually created first, then generated).
      #
      # @yield [TranscriptMetadata] each available transcript
      # @return [Enumerator] if no block given
      def each(&block)
        return to_enum(:each) unless block_given?

        @manually_created_transcripts.each_value(&block)
        @generated_transcripts.each_value(&block)
      end

      # Find a transcript for the given language codes.
      # For each code, a manually created transcript is preferred over a
      # generated one; codes are tried in the order given.
      #
      # @param language_codes [Array<String>, String] language codes in descending priority
      # @return [TranscriptMetadata] the found transcript
      # @raise [NoTranscriptFound] if no transcript matches the requested languages
      def find_transcript(language_codes)
        find_transcript_in(
          language_codes,
          [@manually_created_transcripts, @generated_transcripts]
        )
      end

      # Find an automatically generated transcript for the given language codes.
      #
      # @param language_codes [Array<String>, String] language codes in descending priority
      # @return [TranscriptMetadata] the found transcript
      # @raise [NoTranscriptFound] if no generated transcript matches
      def find_generated_transcript(language_codes)
        find_transcript_in(language_codes, [@generated_transcripts])
      end

      # Find a manually created transcript for the given language codes.
      #
      # @param language_codes [Array<String>, String] language codes in descending priority
      # @return [TranscriptMetadata] the found transcript
      # @raise [NoTranscriptFound] if no manually created transcript matches
      def find_manually_created_transcript(language_codes)
        find_transcript_in(language_codes, [@manually_created_transcripts])
      end

      # String representation of the transcript list.
      #
      # @return [String] human-readable description of available transcripts
      def to_s
        <<~DESC
          For this video (#{@video_id}) transcripts are available in the following languages:

          (MANUALLY CREATED)
          #{format_language_list(@manually_created_transcripts.values)}

          (GENERATED)
          #{format_language_list(@generated_transcripts.values)}

          (TRANSLATION LANGUAGES)
          #{format_translation_languages}
        DESC
      end

      private

      # Find a transcript in the given dictionaries.
      #
      # The outer loop walks the language codes, the inner loop the
      # dictionaries, so language priority wins over dictionary order.
      #
      # @param language_codes [Array<String>, String] language codes to search for
      # @param transcript_dicts [Array<Hash>] transcript dictionaries to search
      # @return [TranscriptMetadata] the found transcript
      # @raise [NoTranscriptFound] if no transcript matches
      def find_transcript_in(language_codes, transcript_dicts)
        # Array() leaves an Array untouched and wraps a lone String, so a
        # single code such as "en" is accepted instead of failing on String#each.
        codes = Array(language_codes)

        codes.each do |language_code|
          transcript_dicts.each do |dict|
            return dict[language_code] if dict.key?(language_code)
          end
        end

        raise NoTranscriptFound.new(@video_id, codes, self)
      end

      # Format a list of transcripts for display.
      #
      # @param transcripts [Array<TranscriptMetadata>] transcripts to format
      # @return [String] formatted list or "None"
      def format_language_list(transcripts)
        return "None" if transcripts.empty?

        transcripts.map { |t| " - #{t}" }.join("\n")
      end

      # Format translation languages for display.
      #
      # @return [String] formatted list or "None"
      def format_translation_languages
        return "None" if @translation_languages.empty?

        @translation_languages.map do |tl|
          " - #{tl.language_code} (\"#{tl.language}\")"
        end.join("\n")
      end
    end
  end
end
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "cgi"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
module YoutubeRb
  module Transcript
    # Playability status values returned by YouTube
    module PlayabilityStatus
      OK = "OK"
      ERROR = "ERROR"
      LOGIN_REQUIRED = "LOGIN_REQUIRED"
    end

    # Reason messages for playability failures
    module PlayabilityFailedReason
      BOT_DETECTED = "Sign in to confirm you're not a bot"
      AGE_RESTRICTED = "This video may be inappropriate for some users."
      VIDEO_UNAVAILABLE = "This video is unavailable"
    end

    # Fetches transcript lists from YouTube videos.
    # This class handles all the HTTP communication with YouTube,
    # including consent cookie handling and error detection.
    #
    # NOTE(review): WATCH_URL, INNERTUBE_API_URL, INNERTUBE_CONTEXT and the error
    # classes raised here are not defined in this file — presumably they come from
    # the gem's settings and errors files; confirm when moving code around.
    class TranscriptListFetcher
      # Substring that identifies the cookie-consent form in a fetched page.
      CONSENT_FORM_MARKER = 'action="https://consent.youtube.com/s"'
      private_constant :CONSENT_FORM_MARKER

      # @param http_client [Faraday::Connection] the HTTP client to use
      # @param proxy_config [Object, nil] optional proxy configuration; when it responds to
      #   +retries_when_blocked+ that value bounds the number of attempts after a block
      def initialize(http_client:, proxy_config: nil)
        @http_client = http_client
        @proxy_config = proxy_config
        # Set by #create_consent_cookie once a consent form has been seen.
        @consent_value = nil
      end

      # Fetch the transcript list for a video.
      #
      # @param video_id [String] the YouTube video ID
      # @return [TranscriptList] the list of available transcripts
      # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
      def fetch(video_id)
        TranscriptList.build(
          http_client: @http_client,
          video_id: video_id,
          captions_json: fetch_captions_json(video_id)
        )
      end

      private

      # Fetch captions JSON with retry support.
      #
      # When the request is blocked, the whole sequence (watch page, API key,
      # Innertube call) is repeated while +try_number + 1+ is below the retry
      # limit; otherwise the RequestBlocked error is re-raised unchanged.
      #
      # @param video_id [String] the YouTube video ID
      # @param try_number [Integer] current retry attempt
      # @return [Hash] the captions JSON
      def fetch_captions_json(video_id, try_number: 0)
        html = fetch_video_html(video_id)
        api_key = extract_innertube_api_key(html, video_id)
        innertube_data = fetch_innertube_data(video_id, api_key)
        extract_captions_json(innertube_data, video_id)
      rescue RequestBlocked
        raise unless try_number + 1 < blocked_retry_limit

        fetch_captions_json(video_id, try_number: try_number + 1)
      end

      # Upper bound used by #fetch_captions_json when a request is blocked.
      #
      # @return [Integer] +retries_when_blocked+ of the proxy configuration, or 0 when there is
      #   no proxy configuration or it does not expose that method
      def blocked_retry_limit
        # nil does not respond to :retries_when_blocked, so no separate nil check is needed.
        return 0 unless @proxy_config.respond_to?(:retries_when_blocked)

        @proxy_config.retries_when_blocked
      end

      # Extract the INNERTUBE_API_KEY from the video page HTML.
      #
      # @param html [String] the HTML content
      # @param video_id [String] the video ID (for error messages)
      # @return [String] the API key
      # @raise [IpBlocked] if a CAPTCHA is detected
      # @raise [YouTubeDataUnparsable] if the key cannot be found
      def extract_innertube_api_key(html, video_id)
        api_key = html[/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/, 1]
        return api_key if api_key

        # No key on the page: a reCAPTCHA widget means the request was blocked.
        raise IpBlocked, video_id if html.include?('class="g-recaptcha"')

        raise YouTubeDataUnparsable, video_id
      end

      # Extract captions JSON from innertube data.
      #
      # @param innertube_data [Hash] the innertube API response
      # @param video_id [String] the video ID
      # @return [Hash] the captions JSON
      # @raise [TranscriptsDisabled] if no captions are available
      def extract_captions_json(innertube_data, video_id)
        assert_playability(innertube_data["playabilityStatus"], video_id)

        captions_json = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
        raise TranscriptsDisabled, video_id if captions_json.nil? || !captions_json.key?("captionTracks")

        captions_json
      end

      # Assert that the video is playable.
      #
      # Returns silently when there is no playability data, no status, or the
      # status is OK. Otherwise raises the most specific error that matches the
      # status/reason pair, falling back to VideoUnplayable.
      #
      # @param playability_status_data [Hash, nil] the playability status from API
      # @param video_id [String] the video ID
      # @raise [RequestBlocked] for LOGIN_REQUIRED with the bot-detection reason
      # @raise [AgeRestricted] for LOGIN_REQUIRED with the age-restriction reason
      # @raise [InvalidVideoId] for an unavailable video whose "ID" is actually an http(s) URL
      # @raise [VideoUnavailable] for an unavailable video
      # @raise [VideoUnplayable] for every other non-OK status
      def assert_playability(playability_status_data, video_id)
        return if playability_status_data.nil?

        status = playability_status_data["status"]
        return if status.nil? || status == PlayabilityStatus::OK

        reason = playability_status_data["reason"]

        case status
        when PlayabilityStatus::LOGIN_REQUIRED
          raise RequestBlocked, video_id if reason == PlayabilityFailedReason::BOT_DETECTED
          raise AgeRestricted, video_id if reason == PlayabilityFailedReason::AGE_RESTRICTED
        when PlayabilityStatus::ERROR
          if reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
            # A full URL passed where an ID was expected gets a more helpful error.
            raise InvalidVideoId, video_id if video_id.start_with?("http://", "https://")

            raise VideoUnavailable, video_id
          end
        end

        # Extract subreasons for more detailed error messages
        subreasons = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
        subreason_texts = subreasons.map { |run| run["text"] || "" }

        raise VideoUnplayable.new(video_id, reason, subreason_texts)
      end

      # Create a consent cookie from the HTML.
      #
      # Faraday has no built-in cookie jar, so the value is kept on the instance
      # and sent as a Cookie header by #fetch_html.
      #
      # @param html [String] the HTML content
      # @param video_id [String] the video ID
      # @return [String] the stored consent value
      # @raise [FailedToCreateConsentCookie] if the cookie cannot be created
      def create_consent_cookie(html, video_id)
        consent_token = html[/name="v" value="(.*?)"/, 1]
        raise FailedToCreateConsentCookie, video_id if consent_token.nil?

        @consent_value = "YES+#{consent_token}"
      end

      # Fetch the video HTML page.
      #
      # If the consent form is served, a consent cookie is created and the page
      # is requested once more; a second consent form is treated as a failure.
      #
      # @param video_id [String] the video ID
      # @return [String] the HTML content
      # @raise [FailedToCreateConsentCookie] if the consent form is still served after retrying
      def fetch_video_html(video_id)
        html = fetch_html(video_id)
        return html unless consent_required?(html)

        create_consent_cookie(html, video_id)
        html = fetch_html(video_id)
        raise FailedToCreateConsentCookie, video_id if consent_required?(html)

        html
      end

      # @param html [String] the HTML content
      # @return [Boolean] whether the page contains the consent form
      def consent_required?(html)
        html.include?(CONSENT_FORM_MARKER)
      end

      # Fetch raw HTML from YouTube.
      #
      # @param video_id [String] the video ID
      # @return [String] the HTML content (unescaped)
      def fetch_html(video_id)
        url = format(WATCH_URL, video_id: video_id)

        response = @http_client.get(url) do |req|
          req.headers["Accept-Language"] = "en-US"
          # Add consent cookie if we have one
          req.headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value
        end

        raise_http_errors(response, video_id)
        CGI.unescapeHTML(response.body)
      end

      # Fetch data from the Innertube API.
      #
      # @param video_id [String] the video ID
      # @param api_key [String] the API key
      # @return [Hash] the API response
      def fetch_innertube_data(video_id, api_key)
        url = format(INNERTUBE_API_URL, api_key: api_key)

        response = @http_client.post(url) do |req|
          req.headers["Content-Type"] = "application/json"
          req.body = JSON.generate(
            "context" => INNERTUBE_CONTEXT,
            "videoId" => video_id
          )
        end

        raise_http_errors(response, video_id)
        JSON.parse(response.body)
      end

      # Raise appropriate errors for HTTP responses.
      #
      # @param response [Faraday::Response] the HTTP response
      # @param video_id [String] the video ID
      # @raise [IpBlocked] for 429 responses
      # @raise [YouTubeRequestFailed] for other error responses
      def raise_http_errors(response, video_id)
        case response.status
        when 429
          raise IpBlocked, video_id
        when 400..599
          raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{response.status}"))
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "nokogiri"
|
|
4
|
+
require "cgi"
|
|
5
|
+
|
|
6
|
+
module YoutubeRb
  module Transcript
    # Parses XML transcript data from YouTube
    class TranscriptParser
      # HTML formatting tags to preserve when preserve_formatting is enabled
      FORMATTING_TAGS = %w[
        strong
        em
        b
        i
        mark
        small
        del
        ins
        sub
        sup
      ].freeze

      # Matches every HTML tag.
      ALL_TAGS_REGEX = /<[^>]*>/i

      # Matches every HTML tag except opening/closing FORMATTING_TAGS: the
      # negative lookahead rejects a tag whose name is one of the formatting tags.
      NON_FORMATTING_TAGS_REGEX = %r{</?(?!/?(?:#{FORMATTING_TAGS.join("|")})\b)[^>]*>}i.freeze

      private_constant :ALL_TAGS_REGEX, :NON_FORMATTING_TAGS_REGEX

      # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
      def initialize(preserve_formatting: false)
        @preserve_formatting = preserve_formatting
        @html_regex = build_html_regex
      end

      # Parse XML transcript data into TranscriptSnippet objects.
      #
      # Every +text+ element with non-empty content yields one snippet; its
      # +start+ and +dur+ attributes are converted with +to_f+, and a missing
      # +dur+ counts as 0.0.
      #
      # @param raw_data [String] the raw XML data from YouTube
      # @return [Array<TranscriptSnippet>] parsed transcript snippets
      def parse(raw_data)
        Nokogiri::XML(raw_data).xpath("//text").filter_map do |element|
          text_content = element.text
          # A bare `next` yields nil, which filter_map drops from the result.
          next if text_content.nil? || text_content.empty?

          TranscriptSnippet.new(
            # Unescape HTML entities and remove unwanted HTML tags
            text: process_text(text_content),
            start: element["start"].to_f,
            duration: (element["dur"] || "0.0").to_f
          )
        end
      end

      private

      # Select the regex used for removing HTML tags.
      #
      # @return [Regexp] NON_FORMATTING_TAGS_REGEX when formatting is preserved, ALL_TAGS_REGEX otherwise
      def build_html_regex
        @preserve_formatting ? NON_FORMATTING_TAGS_REGEX : ALL_TAGS_REGEX
      end

      # Process text by unescaping HTML entities and removing unwanted tags.
      # Unescaping runs first, so tags that arrive entity-encoded are stripped as well.
      #
      # @param text [String] the raw text
      # @return [String] processed text
      def process_text(text)
        CGI.unescapeHTML(text).gsub(@html_regex, "")
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "transcript/settings"
|
|
4
|
+
require_relative "transcript/errors"
|
|
5
|
+
require_relative "transcript/transcript_parser"
|
|
6
|
+
require_relative "transcript/transcript"
|
|
7
|
+
require_relative "transcript/transcript_list"
|
|
8
|
+
require_relative "transcript/transcript_list_fetcher"
|
|
9
|
+
require_relative "transcript/api"
|
|
10
|
+
|
|
11
|
+
module YoutubeRb
  # Module-level convenience entry points. Each call creates a fresh
  # YouTubeTranscriptApi with default arguments and delegates to it.
  module Transcript
    class << self
      # Convenience method to fetch a transcript.
      #
      # @param video_id [String] YouTube video ID
      # @param languages [Array<String>] Language codes in order of preference
      # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
      # @return [FetchedTranscript] The fetched transcript
      def fetch(video_id, languages: ["en"], preserve_formatting: false)
        YouTubeTranscriptApi.new.fetch(video_id, languages: languages, preserve_formatting: preserve_formatting)
      end

      # Convenience method to list available transcripts.
      #
      # @param video_id [String] YouTube video ID
      # @return [TranscriptList] List of available transcripts
      def list(video_id)
        YouTubeTranscriptApi.new.list(video_id)
      end
    end
  end
end
|
data/spec/api_spec.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
require "spec_helper"
|
|
4
4
|
require "webmock/rspec"
|
|
5
5
|
|
|
6
|
-
RSpec.describe
|
|
6
|
+
RSpec.describe YoutubeRb::Transcript::YouTubeTranscriptApi do
|
|
7
7
|
let(:api) { described_class.new }
|
|
8
8
|
let(:video_id) { "dQw4w9WgXcQ" }
|
|
9
9
|
let(:api_key) { "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" }
|
|
@@ -84,7 +84,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
84
84
|
|
|
85
85
|
it "creates a TranscriptListFetcher" do
|
|
86
86
|
api = described_class.new
|
|
87
|
-
expect(api.instance_variable_get(:@fetcher)).to be_a(
|
|
87
|
+
expect(api.instance_variable_get(:@fetcher)).to be_a(YoutubeRb::Transcript::TranscriptListFetcher)
|
|
88
88
|
end
|
|
89
89
|
end
|
|
90
90
|
|
|
@@ -102,7 +102,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
102
102
|
|
|
103
103
|
it "returns a FetchedTranscript" do
|
|
104
104
|
result = api.fetch(video_id)
|
|
105
|
-
expect(result).to be_a(
|
|
105
|
+
expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
|
|
106
106
|
end
|
|
107
107
|
|
|
108
108
|
it "fetches the transcript with correct video_id" do
|
|
@@ -126,19 +126,19 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
126
126
|
stub_request(:get, "https://www.youtube.com/api/timedtext?v=#{video_id}&lang=es")
|
|
127
127
|
.to_return(status: 200, body: sample_transcript_xml)
|
|
128
128
|
|
|
129
|
-
result = api.fetch(video_id, languages: [
|
|
129
|
+
result = api.fetch(video_id, languages: %w[es en])
|
|
130
130
|
expect(result.language_code).to eq("es")
|
|
131
131
|
end
|
|
132
132
|
|
|
133
133
|
it "falls back to next language if first not available" do
|
|
134
|
-
result = api.fetch(video_id, languages: [
|
|
134
|
+
result = api.fetch(video_id, languages: %w[ja en])
|
|
135
135
|
expect(result.language_code).to eq("en")
|
|
136
136
|
end
|
|
137
137
|
|
|
138
138
|
it "raises NoTranscriptFound when no language matches" do
|
|
139
|
-
expect
|
|
140
|
-
api.fetch(video_id, languages: [
|
|
141
|
-
|
|
139
|
+
expect do
|
|
140
|
+
api.fetch(video_id, languages: %w[ja ko zh])
|
|
141
|
+
end.to raise_error(YoutubeRb::Transcript::NoTranscriptFound)
|
|
142
142
|
end
|
|
143
143
|
|
|
144
144
|
context "with preserve_formatting option" do
|
|
@@ -182,7 +182,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
182
182
|
|
|
183
183
|
it "returns a TranscriptList" do
|
|
184
184
|
result = api.list(video_id)
|
|
185
|
-
expect(result).to be_a(
|
|
185
|
+
expect(result).to be_a(YoutubeRb::Transcript::TranscriptList)
|
|
186
186
|
end
|
|
187
187
|
|
|
188
188
|
it "returns a list with the correct video_id" do
|
|
@@ -213,7 +213,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
213
213
|
end
|
|
214
214
|
|
|
215
215
|
it "raises VideoUnavailable error" do
|
|
216
|
-
expect { api.list(video_id) }.to raise_error(
|
|
216
|
+
expect { api.list(video_id) }.to raise_error(YoutubeRb::Transcript::VideoUnavailable)
|
|
217
217
|
end
|
|
218
218
|
end
|
|
219
219
|
|
|
@@ -227,13 +227,13 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
227
227
|
end
|
|
228
228
|
|
|
229
229
|
it "raises TranscriptsDisabled error" do
|
|
230
|
-
expect { api.list(video_id) }.to raise_error(
|
|
230
|
+
expect { api.list(video_id) }.to raise_error(YoutubeRb::Transcript::TranscriptsDisabled)
|
|
231
231
|
end
|
|
232
232
|
end
|
|
233
233
|
end
|
|
234
234
|
|
|
235
235
|
describe "#fetch_all" do
|
|
236
|
-
let(:video_ids) { [
|
|
236
|
+
let(:video_ids) { %w[video1 video2 video3] }
|
|
237
237
|
|
|
238
238
|
before do
|
|
239
239
|
video_ids.each do |vid|
|
|
@@ -266,20 +266,20 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
266
266
|
it "returns a hash of transcripts" do
|
|
267
267
|
results = api.fetch_all(video_ids)
|
|
268
268
|
expect(results).to be_a(Hash)
|
|
269
|
-
expect(results.keys).to
|
|
269
|
+
expect(results.keys).to match_array(video_ids)
|
|
270
270
|
end
|
|
271
271
|
|
|
272
272
|
it "fetches all video transcripts" do
|
|
273
273
|
results = api.fetch_all(video_ids)
|
|
274
274
|
results.each do |vid, transcript|
|
|
275
|
-
expect(transcript).to be_a(
|
|
275
|
+
expect(transcript).to be_a(YoutubeRb::Transcript::FetchedTranscript)
|
|
276
276
|
expect(transcript.video_id).to eq(vid)
|
|
277
277
|
end
|
|
278
278
|
end
|
|
279
279
|
|
|
280
280
|
it "respects language preference" do
|
|
281
281
|
results = api.fetch_all(video_ids, languages: ["en"])
|
|
282
|
-
results.
|
|
282
|
+
results.each_value do |transcript|
|
|
283
283
|
expect(transcript.language_code).to eq("en")
|
|
284
284
|
end
|
|
285
285
|
end
|
|
@@ -292,7 +292,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
292
292
|
expect(yielded.length).to eq(3)
|
|
293
293
|
yielded.each do |vid, klass|
|
|
294
294
|
expect(video_ids).to include(vid)
|
|
295
|
-
expect(klass).to eq(
|
|
295
|
+
expect(klass).to eq(YoutubeRb::Transcript::FetchedTranscript)
|
|
296
296
|
end
|
|
297
297
|
end
|
|
298
298
|
|
|
@@ -316,7 +316,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
316
316
|
end
|
|
317
317
|
|
|
318
318
|
it "raises error by default" do
|
|
319
|
-
expect { api.fetch_all(failing_video_ids) }.to raise_error(
|
|
319
|
+
expect { api.fetch_all(failing_video_ids) }.to raise_error(YoutubeRb::Transcript::VideoUnavailable)
|
|
320
320
|
end
|
|
321
321
|
|
|
322
322
|
it "continues on error when configured" do
|
|
@@ -332,7 +332,7 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
332
332
|
end
|
|
333
333
|
expect(errors.length).to eq(1)
|
|
334
334
|
expect(errors.first[0]).to eq("fail_video")
|
|
335
|
-
expect(errors.first[1]).to be_a(
|
|
335
|
+
expect(errors.first[1]).to be_a(YoutubeRb::Transcript::VideoUnavailable)
|
|
336
336
|
end
|
|
337
337
|
end
|
|
338
338
|
|
|
@@ -356,27 +356,27 @@ RSpec.describe Youtube::Transcript::Rb::YouTubeTranscriptApi do
|
|
|
356
356
|
.to_return(status: 200, body: sample_transcript_xml)
|
|
357
357
|
end
|
|
358
358
|
|
|
359
|
-
describe "
|
|
359
|
+
describe "YoutubeRb::Transcript.fetch" do
|
|
360
360
|
it "fetches a transcript" do
|
|
361
|
-
result =
|
|
362
|
-
expect(result).to be_a(
|
|
361
|
+
result = YoutubeRb::Transcript.fetch(video_id)
|
|
362
|
+
expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
|
|
363
363
|
end
|
|
364
364
|
|
|
365
365
|
it "accepts language option" do
|
|
366
|
-
result =
|
|
366
|
+
result = YoutubeRb::Transcript.fetch(video_id, languages: ["en"])
|
|
367
367
|
expect(result.language_code).to eq("en")
|
|
368
368
|
end
|
|
369
369
|
|
|
370
370
|
it "accepts preserve_formatting option" do
|
|
371
|
-
result =
|
|
372
|
-
expect(result).to be_a(
|
|
371
|
+
result = YoutubeRb::Transcript.fetch(video_id, preserve_formatting: false)
|
|
372
|
+
expect(result).to be_a(YoutubeRb::Transcript::FetchedTranscript)
|
|
373
373
|
end
|
|
374
374
|
end
|
|
375
375
|
|
|
376
|
-
describe "
|
|
376
|
+
describe "YoutubeRb::Transcript.list" do
|
|
377
377
|
it "lists available transcripts" do
|
|
378
|
-
result =
|
|
379
|
-
expect(result).to be_a(
|
|
378
|
+
result = YoutubeRb::Transcript.list(video_id)
|
|
379
|
+
expect(result).to be_a(YoutubeRb::Transcript::TranscriptList)
|
|
380
380
|
end
|
|
381
381
|
end
|
|
382
382
|
end
|