youtube-transcript-rb 0.1.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/.rubocop_todo.yml +166 -0
- data/README.md +42 -42
- data/lib/youtube-transcript-rb.rb +4 -0
- data/lib/youtube_rb/formatters.rb +263 -0
- data/lib/youtube_rb/transcript/api.rb +144 -0
- data/lib/youtube_rb/transcript/errors.rb +215 -0
- data/lib/youtube_rb/transcript/settings.rb +26 -0
- data/lib/youtube_rb/transcript/transcript.rb +237 -0
- data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
- data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +220 -0
- data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
- data/lib/youtube_rb/transcript.rb +33 -0
- data/lib/youtube_rb/version.rb +5 -0
- data/sig/youtube_rb/transcript.rbs +4 -0
- data/spec/api_spec.rb +27 -27
- data/spec/errors_spec.rb +41 -41
- data/spec/formatters_spec.rb +45 -46
- data/spec/integration_spec.rb +39 -48
- data/spec/settings_spec.rb +16 -16
- data/spec/spec_helper.rb +52 -52
- data/spec/transcript_list_fetcher_spec.rb +38 -33
- data/spec/transcript_list_spec.rb +16 -19
- data/spec/transcript_parser_spec.rb +3 -3
- data/spec/transcript_spec.rb +23 -24
- metadata +17 -13
- data/lib/youtube/transcript/rb/api.rb +0 -150
- data/lib/youtube/transcript/rb/errors.rb +0 -217
- data/lib/youtube/transcript/rb/formatters.rb +0 -269
- data/lib/youtube/transcript/rb/settings.rb +0 -28
- data/lib/youtube/transcript/rb/transcript.rb +0 -239
- data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
- data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
- data/lib/youtube/transcript/rb/version.rb +0 -9
- data/lib/youtube/transcript/rb.rb +0 -37
- data/sig/youtube/transcript/rb.rbs +0 -8
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "nokogiri"
|
|
4
|
-
require "cgi"
|
|
5
|
-
|
|
6
|
-
module Youtube
  module Transcript
    module Rb
      # Parses XML transcript data from YouTube into TranscriptSnippet objects.
      #
      # The text of every <text> element is HTML-unescaped and then stripped of
      # markup. With preserve_formatting disabled (the default) every tag is
      # removed; with it enabled, the tags named in FORMATTING_TAGS are kept
      # and all other tags are removed.
      class TranscriptParser
        # HTML formatting tags to preserve when preserve_formatting is enabled
        FORMATTING_TAGS = %w[
          strong
          em
          b
          i
          mark
          small
          del
          ins
          sub
          sup
        ].freeze

        # Matches any HTML tag. Compiled once rather than per parser instance.
        ALL_TAGS_REGEX = Regexp.new("<[^>]*>", Regexp::IGNORECASE).freeze

        # Matches any opening or closing tag whose name is NOT one of
        # FORMATTING_TAGS. The negative lookahead rejects a formatting tag
        # name (followed by a word boundary) directly after "<" or "</".
        NON_FORMATTING_TAGS_REGEX = Regexp.new(
          "</?(?!/?(?:#{FORMATTING_TAGS.join("|")})\\b)[^>]*>",
          Regexp::IGNORECASE
        ).freeze

        private_constant :ALL_TAGS_REGEX, :NON_FORMATTING_TAGS_REGEX

        # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
        def initialize(preserve_formatting: false)
          @preserve_formatting = preserve_formatting
          @html_regex = build_html_regex
        end

        # Parse XML transcript data into TranscriptSnippet objects.
        #
        # Elements whose text is nil or empty are skipped. A missing "dur"
        # attribute yields a duration of 0.0; "start" is converted with #to_f.
        #
        # @param raw_data [String] the raw XML data from YouTube
        # @return [Array<TranscriptSnippet>] parsed transcript snippets
        def parse(raw_data)
          Nokogiri::XML(raw_data).xpath("//text").filter_map do |element|
            raw_text = element.text
            # `next` yields nil, which filter_map drops from the result.
            next if raw_text.nil? || raw_text.empty?

            TranscriptSnippet.new(
              text: process_text(raw_text),
              start: element["start"].to_f,
              duration: (element["dur"] || "0.0").to_f
            )
          end
        end

        private

        # Select the regex used for removing HTML tags.
        # @return [Regexp] pattern for all tags, or for all non-formatting tags
        #   when preserve_formatting is enabled
        def build_html_regex
          @preserve_formatting ? NON_FORMATTING_TAGS_REGEX : ALL_TAGS_REGEX
        end

        # Process text by unescaping HTML entities and removing unwanted tags.
        #
        # Unescaping happens first, so entity-encoded markup (e.g. "&lt;i&gt;")
        # becomes a real tag and is then subject to the tag filter.
        #
        # @param text [String] the raw text
        # @return [String] processed text
        def process_text(text)
          CGI.unescapeHTML(text).gsub(@html_regex, "")
        end
      end
    end
  end
end
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require_relative "rb/version"
|
|
4
|
-
require_relative "rb/settings"
|
|
5
|
-
require_relative "rb/errors"
|
|
6
|
-
require_relative "rb/transcript_parser"
|
|
7
|
-
require_relative "rb/transcript"
|
|
8
|
-
require_relative "rb/transcript_list"
|
|
9
|
-
require_relative "rb/transcript_list_fetcher"
|
|
10
|
-
require_relative "rb/api"
|
|
11
|
-
require_relative "rb/formatters"
|
|
12
|
-
|
|
13
|
-
module Youtube
  module Transcript
    # Gem namespace exposing one-call shortcuts that delegate to a freshly
    # constructed YouTubeTranscriptApi.
    module Rb
      # Fetch a transcript through a new YouTubeTranscriptApi instance.
      #
      # @param video_id [String] YouTube video ID
      # @param languages [Array<String>] Language codes in order of preference
      # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
      # @return [FetchedTranscript] The fetched transcript
      def self.fetch(video_id, languages: ["en"], preserve_formatting: false)
        YouTubeTranscriptApi.new.fetch(
          video_id,
          languages: languages,
          preserve_formatting: preserve_formatting
        )
      end

      # List the available transcripts through a new YouTubeTranscriptApi
      # instance.
      #
      # @param video_id [String] YouTube video ID
      # @return [TranscriptList] List of available transcripts
      def self.list(video_id)
        YouTubeTranscriptApi.new.list(video_id)
      end
    end
  end
end
|