youtube-transcript-rb 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -42
- data/lib/youtube_rb/transcript/api.rb +148 -0
- data/lib/youtube_rb/transcript/errors.rb +215 -0
- data/lib/youtube_rb/transcript/formatters.rb +267 -0
- data/lib/youtube_rb/transcript/settings.rb +26 -0
- data/lib/youtube_rb/transcript/transcript.rb +237 -0
- data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
- data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
- data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
- data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
- data/lib/youtube_rb/transcript.rb +35 -0
- data/sig/youtube_rb/transcript.rbs +6 -0
- data/spec/api_spec.rb +20 -20
- data/spec/errors_spec.rb +39 -39
- data/spec/formatters_spec.rb +36 -36
- data/spec/integration_spec.rb +32 -32
- data/spec/settings_spec.rb +16 -16
- data/spec/spec_helper.rb +1 -1
- data/spec/transcript_list_fetcher_spec.rb +27 -27
- data/spec/transcript_list_spec.rb +6 -6
- data/spec/transcript_parser_spec.rb +3 -3
- data/spec/transcript_spec.rb +16 -16
- metadata +12 -12
- data/lib/youtube/transcript/rb/api.rb +0 -150
- data/lib/youtube/transcript/rb/errors.rb +0 -217
- data/lib/youtube/transcript/rb/formatters.rb +0 -269
- data/lib/youtube/transcript/rb/settings.rb +0 -28
- data/lib/youtube/transcript/rb/transcript.rb +0 -239
- data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
- data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
- data/lib/youtube/transcript/rb.rb +0 -37
- data/sig/youtube/transcript/rb.rbs +0 -8
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Youtube
|
|
4
|
-
module Transcript
|
|
5
|
-
module Rb
|
|
6
|
-
# Represents the transcripts available for one YouTube video.
#
# The class is Enumerable: iteration yields the manually created transcripts
# first and the generated ones afterwards. The lookup helpers pick a transcript
# by language code, trying the requested codes in the order given.
class TranscriptList
  include Enumerable

  # @return [String] the video ID this TranscriptList is for
  attr_reader :video_id

  # Build a TranscriptList from captions JSON data.
  #
  # Caption tracks whose "kind" is "asr" are stored as generated transcripts;
  # every other track is stored as manually created. Within each group a later
  # track with the same language code replaces an earlier one.
  #
  # @param http_client [Faraday::Connection] the HTTP client handed to every Transcript
  # @param video_id [String] the YouTube video ID
  # @param captions_json [Hash] the captions JSON parsed from YouTube
  # @return [TranscriptList] the created TranscriptList
  def self.build(http_client:, video_id:, captions_json:)
    translation_languages = (captions_json["translationLanguages"] || []).map do |tl|
      TranslationLanguage.new(
        language: tl.dig("languageName", "runs", 0, "text") || "",
        language_code: tl["languageCode"]
      )
    end

    manually_created_transcripts = {}
    generated_transcripts = {}

    (captions_json["captionTracks"] || []).each do |caption|
      is_generated = caption.fetch("kind", "") == "asr"
      target = is_generated ? generated_transcripts : manually_created_transcripts
      language_code = caption["languageCode"]

      target[language_code] = Transcript.new(
        http_client: http_client,
        video_id: video_id,
        # The "&fmt=srv3" query parameter is dropped from the track URL.
        url: caption["baseUrl"].to_s.gsub("&fmt=srv3", ""),
        language: caption.dig("name", "runs", 0, "text") || "",
        language_code: language_code,
        is_generated: is_generated,
        # Only translatable tracks advertise the translation languages.
        translation_languages: caption.fetch("isTranslatable", false) ? translation_languages : []
      )
    end

    new(
      video_id: video_id,
      manually_created_transcripts: manually_created_transcripts,
      generated_transcripts: generated_transcripts,
      translation_languages: translation_languages
    )
  end

  # @param video_id [String] the YouTube video ID
  # @param manually_created_transcripts [Hash<String, Transcript>] manually created transcripts by language code
  # @param generated_transcripts [Hash<String, Transcript>] auto-generated transcripts by language code
  # @param translation_languages [Array<TranslationLanguage>] available translation languages
  def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
    @video_id = video_id
    @manually_created_transcripts = manually_created_transcripts
    @generated_transcripts = generated_transcripts
    @translation_languages = translation_languages
  end

  # Iterate over all transcripts (manually created first, then generated).
  #
  # @yield [Transcript] each available transcript
  # @return [Enumerator] if no block is given
  # @return [TranscriptList] self when a block is given, so calls can be chained
  def each(&block)
    return to_enum(:each) unless block_given?

    @manually_created_transcripts.each_value(&block)
    @generated_transcripts.each_value(&block)
    self
  end

  # Find a transcript for the given language codes.
  # For each code, a manually created transcript is preferred over a generated one.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches the requested languages
  def find_transcript(language_codes)
    find_transcript_in(language_codes, [@manually_created_transcripts, @generated_transcripts])
  end

  # Find an automatically generated transcript for the given language codes.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no generated transcript matches
  def find_generated_transcript(language_codes)
    find_transcript_in(language_codes, [@generated_transcripts])
  end

  # Find a manually created transcript for the given language codes.
  #
  # @param language_codes [Array<String>, String] language codes in descending priority
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no manually created transcript matches
  def find_manually_created_transcript(language_codes)
    find_transcript_in(language_codes, [@manually_created_transcripts])
  end

  # String representation of the transcript list.
  #
  # @return [String] human-readable description of available transcripts
  def to_s
    <<~DESC
      For this video (#{@video_id}) transcripts are available in the following languages:

      (MANUALLY CREATED)
      #{format_language_list(@manually_created_transcripts.values)}

      (GENERATED)
      #{format_language_list(@generated_transcripts.values)}

      (TRANSLATION LANGUAGES)
      #{format_translation_languages}
    DESC
  end

  private

  # Return the first transcript that matches, walking the language codes in
  # order and, for each code, the dictionaries in order.
  #
  # @param language_codes [Array<String>, String] language codes to search for
  # @param transcript_dicts [Array<Hash>] transcript dictionaries to search
  # @return [Transcript] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches
  def find_transcript_in(language_codes, transcript_dicts)
    # Array() lets callers pass a single code ("en") as well as a list.
    codes = Array(language_codes)

    codes.each do |code|
      dict = transcript_dicts.find { |candidates| candidates.key?(code) }
      return dict[code] if dict
    end

    raise NoTranscriptFound.new(@video_id, codes, self)
  end

  # Format a list of transcripts for display.
  #
  # @param transcripts [Array<Transcript>] transcripts to format
  # @return [String] one " - <transcript>" line per entry, or "None" when empty
  def format_language_list(transcripts)
    return "None" if transcripts.empty?

    transcripts.map { |t| " - #{t}" }.join("\n")
  end

  # Format translation languages for display.
  #
  # @return [String] one line per translation language, or "None" when empty
  def format_translation_languages
    return "None" if @translation_languages.empty?

    @translation_languages.map { |tl| " - #{tl.language_code} (\"#{tl.language}\")" }.join("\n")
  end
end
|
|
168
|
-
end
|
|
169
|
-
end
|
|
170
|
-
end
|
|
@@ -1,225 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "cgi"
|
|
4
|
-
require "json"
|
|
5
|
-
|
|
6
|
-
module Youtube
|
|
7
|
-
module Transcript
|
|
8
|
-
module Rb
|
|
9
|
-
# Values of the "status" field in YouTube's playability status data.
module PlayabilityStatus
  # No playability problem is reported.
  OK = "OK"
  # Playback failed with an error.
  ERROR = "ERROR"
  # A login is required before playback.
  LOGIN_REQUIRED = "LOGIN_REQUIRED"
end
|
|
15
|
-
|
|
16
|
-
# Texts of the "reason" field that are matched when playability fails.
module PlayabilityFailedReason
  # Reason text treated as a blocked request.
  BOT_DETECTED = "Sign in to confirm you're not a bot"
  # Reason text treated as an age restriction.
  AGE_RESTRICTED = "This video may be inappropriate for some users."
  # Reason text treated as an unavailable video.
  VIDEO_UNAVAILABLE = "This video is unavailable"
end
|
|
22
|
-
|
|
23
|
-
# Fetches transcript lists from YouTube videos.
# This class handles all the HTTP communication with YouTube,
# including consent cookie handling and error detection.
class TranscriptListFetcher
  # Pattern that captures the INNERTUBE_API_KEY value from the watch page.
  INNERTUBE_API_KEY_PATTERN = /"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/
  # Marker of the consent form on the watch page.
  CONSENT_FORM_MARKER = 'action="https://consent.youtube.com/s"'
  # Marker of a CAPTCHA page.
  RECAPTCHA_MARKER = 'class="g-recaptcha"'
  private_constant :INNERTUBE_API_KEY_PATTERN, :CONSENT_FORM_MARKER, :RECAPTCHA_MARKER

  # @param http_client [Faraday::Connection] the HTTP client to use
  # @param proxy_config [Object, nil] optional proxy configuration; when it
  #   responds to +retries_when_blocked+ that value bounds the retry attempts
  def initialize(http_client:, proxy_config: nil)
    @http_client = http_client
    @proxy_config = proxy_config
    # Set by create_consent_cookie once a consent page has been seen.
    @consent_value = nil
  end

  # Fetch the transcript list for a video.
  #
  # @param video_id [String] the YouTube video ID
  # @return [TranscriptList] the list of available transcripts
  # @raise [CouldNotRetrieveTranscript] if transcripts cannot be retrieved
  def fetch(video_id)
    TranscriptList.build(
      http_client: @http_client,
      video_id: video_id,
      captions_json: fetch_captions_json(video_id)
    )
  end

  private

  # Fetch captions JSON, retrying while the request is reported as blocked.
  #
  # The whole sequence (watch page, API key, innertube call) is repeated on
  # RequestBlocked for as long as the next attempt number stays below the
  # configured retry limit; otherwise the error is re-raised.
  #
  # @param video_id [String] the YouTube video ID
  # @param try_number [Integer] number of attempts already made
  # @return [Hash] the captions JSON
  # @raise [RequestBlocked] when the attempts are exhausted
  def fetch_captions_json(video_id, try_number: 0)
    html = fetch_video_html(video_id)
    api_key = extract_innertube_api_key(html, video_id)
    innertube_data = fetch_innertube_data(video_id, api_key)
    extract_captions_json(innertube_data, video_id)
  rescue RequestBlocked
    try_number += 1
    raise unless try_number < blocked_retry_limit

    retry
  end

  # Number of attempts allowed when requests are blocked.
  #
  # @return [Integer] the proxy configuration's +retries_when_blocked+, or 0
  #   when there is no proxy configuration, it lacks that method, or it returns nil
  def blocked_retry_limit
    # nil.respond_to?(:retries_when_blocked) is false, so no separate nil check is needed.
    return 0 unless @proxy_config.respond_to?(:retries_when_blocked)

    @proxy_config.retries_when_blocked.to_i
  end

  # Extract the INNERTUBE_API_KEY from the video page HTML.
  #
  # @param html [String] the HTML content
  # @param video_id [String] the video ID (for error messages)
  # @return [String] the API key
  # @raise [IpBlocked] if a CAPTCHA is detected
  # @raise [YouTubeDataUnparsable] if the key cannot be found
  def extract_innertube_api_key(html, video_id)
    api_key = html[INNERTUBE_API_KEY_PATTERN, 1]
    return api_key if api_key

    raise IpBlocked, video_id if html.include?(RECAPTCHA_MARKER)

    raise YouTubeDataUnparsable, video_id
  end

  # Extract captions JSON from innertube data.
  #
  # @param innertube_data [Hash] the innertube API response
  # @param video_id [String] the video ID
  # @return [Hash] the captions JSON
  # @raise [TranscriptsDisabled] if no caption tracks are present
  def extract_captions_json(innertube_data, video_id)
    assert_playability(innertube_data["playabilityStatus"], video_id)

    captions_json = innertube_data.dig("captions", "playerCaptionsTracklistRenderer")
    raise TranscriptsDisabled, video_id unless captions_json&.key?("captionTracks")

    captions_json
  end

  # Assert that the video is playable.
  #
  # Returns silently when there is no status data, no status, or the status is OK.
  #
  # @param playability_status_data [Hash, nil] the playability status from API
  # @param video_id [String] the video ID
  # @raise [RequestBlocked] for LOGIN_REQUIRED with the bot-detection reason
  # @raise [AgeRestricted] for LOGIN_REQUIRED with the age-restriction reason
  # @raise [InvalidVideoId] for an unavailable video whose "ID" is actually a URL
  # @raise [VideoUnavailable] for an unavailable video
  # @raise [VideoUnplayable] for every other non-OK status
  def assert_playability(playability_status_data, video_id)
    return if playability_status_data.nil?

    status = playability_status_data["status"]
    return if status.nil? || status == PlayabilityStatus::OK

    reason = playability_status_data["reason"]

    if status == PlayabilityStatus::LOGIN_REQUIRED
      raise RequestBlocked, video_id if bot_detected?(reason)
      raise AgeRestricted, video_id if reason == PlayabilityFailedReason::AGE_RESTRICTED
    end

    if status == PlayabilityStatus::ERROR && reason == PlayabilityFailedReason::VIDEO_UNAVAILABLE
      # A URL passed in place of an ID is reported as an invalid ID instead.
      raise InvalidVideoId, video_id if video_id.start_with?("http://", "https://")

      raise VideoUnavailable, video_id
    end

    # Collect subreasons for more detailed error messages.
    subreasons = playability_status_data.dig("errorScreen", "playerErrorMessageRenderer", "subreason", "runs") || []
    raise VideoUnplayable.new(video_id, reason, subreasons.map { |run| run["text"] || "" })
  end

  # Whether the reason text is the bot-detection message.
  #
  # NOTE(review): the reason may arrive with a typographic apostrophe (U+2019)
  # rather than the ASCII one; both sides are normalized so either form
  # matches. Confirm against live responses.
  #
  # @param reason [String, nil] the reason text from the playability status
  # @return [Boolean]
  def bot_detected?(reason)
    reason.to_s.tr("\u2019", "'") == PlayabilityFailedReason::BOT_DETECTED.tr("\u2019", "'")
  end

  # Remember the consent value found in the consent page.
  #
  # Faraday has no built-in cookie jar, so the value is kept on the instance
  # and sent as a Cookie header by fetch_html.
  #
  # @param html [String] the HTML content
  # @param video_id [String] the video ID
  # @raise [FailedToCreateConsentCookie] if the value cannot be found
  def create_consent_cookie(html, video_id)
    match = html.match(/name="v" value="(.*?)"/)
    raise FailedToCreateConsentCookie, video_id if match.nil?

    @consent_value = "YES+#{match[1]}"
  end

  # Fetch the video HTML page, answering the consent form once if it is shown.
  #
  # @param video_id [String] the video ID
  # @return [String] the HTML content
  # @raise [FailedToCreateConsentCookie] if the consent form is still shown
  #   after the consent cookie has been sent
  def fetch_video_html(video_id)
    html = fetch_html(video_id)
    return html unless html.include?(CONSENT_FORM_MARKER)

    create_consent_cookie(html, video_id)
    html = fetch_html(video_id)
    raise FailedToCreateConsentCookie, video_id if html.include?(CONSENT_FORM_MARKER)

    html
  end

  # Fetch raw HTML from YouTube.
  #
  # @param video_id [String] the video ID
  # @return [String] the HTML content (unescaped)
  def fetch_html(video_id)
    url = format(WATCH_URL, video_id: video_id)

    response = @http_client.get(url) do |req|
      req.headers["Accept-Language"] = "en-US"
      # Add the consent cookie if we have one.
      req.headers["Cookie"] = "CONSENT=#{@consent_value}" if @consent_value
    end

    raise_http_errors(response, video_id)
    # to_s guards against a nil body, which CGI.unescapeHTML cannot handle.
    CGI.unescapeHTML(response.body.to_s)
  end

  # Fetch data from the Innertube API.
  #
  # @param video_id [String] the video ID
  # @param api_key [String] the API key
  # @return [Hash] the parsed API response
  def fetch_innertube_data(video_id, api_key)
    url = format(INNERTUBE_API_URL, api_key: api_key)

    response = @http_client.post(url) do |req|
      req.headers["Content-Type"] = "application/json"
      req.body = JSON.generate({ "context" => INNERTUBE_CONTEXT, "videoId" => video_id })
    end

    raise_http_errors(response, video_id)
    JSON.parse(response.body)
  end

  # Raise appropriate errors for HTTP responses.
  #
  # @param response [Faraday::Response] the HTTP response
  # @param video_id [String] the video ID
  # @raise [IpBlocked] for 429 responses
  # @raise [YouTubeRequestFailed] for other 4xx/5xx responses
  def raise_http_errors(response, video_id)
    case response.status
    when 429
      raise IpBlocked, video_id
    when 400..599
      raise YouTubeRequestFailed.new(video_id, StandardError.new("HTTP #{response.status}"))
    end
  end
end
|
|
223
|
-
end
|
|
224
|
-
end
|
|
225
|
-
end
|
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "nokogiri"
|
|
4
|
-
require "cgi"
|
|
5
|
-
|
|
6
|
-
module Youtube
|
|
7
|
-
module Transcript
|
|
8
|
-
module Rb
|
|
9
|
-
# Turns the XML transcript data returned by YouTube into TranscriptSnippet objects.
class TranscriptParser
  # HTML formatting tags that are kept when preserve_formatting is enabled.
  FORMATTING_TAGS = %w[strong em b i mark small del ins sub sup].freeze

  # @param preserve_formatting [Boolean] whether to keep the HTML formatting tags
  def initialize(preserve_formatting: false)
    @preserve_formatting = preserve_formatting
    @html_regex = build_html_regex
  end

  # Parse XML transcript data into TranscriptSnippet objects.
  #
  # Every <text> element with non-empty text yields one snippet. The start
  # comes from the "start" attribute and the duration from "dur", which
  # defaults to 0.0 when the attribute is absent.
  #
  # @param raw_data [String] the raw XML data from YouTube
  # @return [Array<TranscriptSnippet>] parsed transcript snippets
  def parse(raw_data)
    Nokogiri::XML(raw_data).xpath("//text").each_with_object([]) do |element, snippets|
      raw_text = element.text
      next if raw_text.nil? || raw_text.empty?

      snippets << TranscriptSnippet.new(
        text: process_text(raw_text),
        start: element["start"].to_f,
        duration: (element["dur"] || "0.0").to_f
      )
    end
  end

  private

  # Build the regex that matches the tags to strip: every tag by default, or
  # every tag except the formatting tags when preserve_formatting is enabled.
  #
  # @return [Regexp]
  def build_html_regex
    pattern =
      if @preserve_formatting
        "</?(?!/?(?:#{FORMATTING_TAGS.join("|")})\\b)[^>]*>"
      else
        "<[^>]*>"
      end

    Regexp.new(pattern, Regexp::IGNORECASE)
  end

  # Unescape HTML entities, then strip the unwanted tags.
  #
  # @param text [String] the raw text
  # @return [String] processed text
  def process_text(text)
    CGI.unescapeHTML(text).gsub(@html_regex, "")
  end
end
|
|
81
|
-
end
|
|
82
|
-
end
|
|
83
|
-
end
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative "rb/version"
|
|
4
|
-
require_relative "rb/settings"
|
|
5
|
-
require_relative "rb/errors"
|
|
6
|
-
require_relative "rb/transcript_parser"
|
|
7
|
-
require_relative "rb/transcript"
|
|
8
|
-
require_relative "rb/transcript_list"
|
|
9
|
-
require_relative "rb/transcript_list_fetcher"
|
|
10
|
-
require_relative "rb/api"
|
|
11
|
-
require_relative "rb/formatters"
|
|
12
|
-
|
|
13
|
-
module Youtube
|
|
14
|
-
module Transcript
|
|
15
|
-
module Rb
|
|
16
|
-
class << self
  # Convenience method to fetch a transcript through a new YouTubeTranscriptApi.
  #
  # @param video_id [String] YouTube video ID
  # @param languages [Array<String>] Language codes in order of preference
  # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
  # @return [FetchedTranscript] The fetched transcript
  def fetch(video_id, languages: ["en"], preserve_formatting: false)
    YouTubeTranscriptApi.new.fetch(
      video_id,
      languages: languages,
      preserve_formatting: preserve_formatting
    )
  end

  # Convenience method to list available transcripts through a new YouTubeTranscriptApi.
  #
  # @param video_id [String] YouTube video ID
  # @return [TranscriptList] List of available transcripts
  def list(video_id)
    YouTubeTranscriptApi.new.list(video_id)
  end
end
|
|
35
|
-
end
|
|
36
|
-
end
|
|
37
|
-
end
|