youtube-transcript-rb 0.1.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/.rubocop_todo.yml +166 -0
  4. data/README.md +42 -42
  5. data/lib/youtube-transcript-rb.rb +4 -0
  6. data/lib/youtube_rb/formatters.rb +263 -0
  7. data/lib/youtube_rb/transcript/api.rb +144 -0
  8. data/lib/youtube_rb/transcript/errors.rb +215 -0
  9. data/lib/youtube_rb/transcript/settings.rb +26 -0
  10. data/lib/youtube_rb/transcript/transcript.rb +237 -0
  11. data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
  12. data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +220 -0
  13. data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
  14. data/lib/youtube_rb/transcript.rb +33 -0
  15. data/lib/youtube_rb/version.rb +5 -0
  16. data/sig/youtube_rb/transcript.rbs +4 -0
  17. data/spec/api_spec.rb +27 -27
  18. data/spec/errors_spec.rb +41 -41
  19. data/spec/formatters_spec.rb +45 -46
  20. data/spec/integration_spec.rb +39 -48
  21. data/spec/settings_spec.rb +16 -16
  22. data/spec/spec_helper.rb +52 -52
  23. data/spec/transcript_list_fetcher_spec.rb +38 -33
  24. data/spec/transcript_list_spec.rb +16 -19
  25. data/spec/transcript_parser_spec.rb +3 -3
  26. data/spec/transcript_spec.rb +23 -24
  27. metadata +17 -13
  28. data/lib/youtube/transcript/rb/api.rb +0 -150
  29. data/lib/youtube/transcript/rb/errors.rb +0 -217
  30. data/lib/youtube/transcript/rb/formatters.rb +0 -269
  31. data/lib/youtube/transcript/rb/settings.rb +0 -28
  32. data/lib/youtube/transcript/rb/transcript.rb +0 -239
  33. data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
  34. data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
  35. data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
  36. data/lib/youtube/transcript/rb/version.rb +0 -9
  37. data/lib/youtube/transcript/rb.rb +0 -37
  38. data/sig/youtube/transcript/rb.rbs +0 -8
@@ -1,83 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "nokogiri"
4
- require "cgi"
5
-
6
- module Youtube
7
- module Transcript
8
- module Rb
9
- # Parses XML transcript data from YouTube
10
- class TranscriptParser
11
- # HTML formatting tags to preserve when preserve_formatting is enabled
12
- FORMATTING_TAGS = %w[
13
- strong
14
- em
15
- b
16
- i
17
- mark
18
- small
19
- del
20
- ins
21
- sub
22
- sup
23
- ].freeze
24
-
25
- # @param preserve_formatting [Boolean] whether to preserve HTML formatting tags
26
- def initialize(preserve_formatting: false)
27
- @preserve_formatting = preserve_formatting
28
- @html_regex = build_html_regex
29
- end
30
-
31
- # Parse XML transcript data into TranscriptSnippet objects
32
- # @param raw_data [String] the raw XML data from YouTube
33
- # @return [Array<TranscriptSnippet>] parsed transcript snippets
34
- def parse(raw_data)
35
- doc = Nokogiri::XML(raw_data)
36
- snippets = []
37
-
38
- doc.xpath("//text").each do |element|
39
- text_content = element.text
40
- next if text_content.nil? || text_content.empty?
41
-
42
- # Unescape HTML entities and remove unwanted HTML tags
43
- text = process_text(text_content)
44
-
45
- snippets << TranscriptSnippet.new(
46
- text: text,
47
- start: element["start"].to_f,
48
- duration: (element["dur"] || "0.0").to_f
49
- )
50
- end
51
-
52
- snippets
53
- end
54
-
55
- private
56
-
57
- # Build regex for removing HTML tags
58
- # @return [Regexp]
59
- def build_html_regex
60
- if @preserve_formatting
61
- # Remove all tags except formatting tags
62
- formats_pattern = FORMATTING_TAGS.join("|")
63
- # Match tags that are NOT the formatting tags
64
- Regexp.new("</?(?!/?(?:#{formats_pattern})\\b)[^>]*>", Regexp::IGNORECASE)
65
- else
66
- # Remove all HTML tags
67
- Regexp.new("<[^>]*>", Regexp::IGNORECASE)
68
- end
69
- end
70
-
71
- # Process text by unescaping HTML entities and removing unwanted tags
72
- # @param text [String] the raw text
73
- # @return [String] processed text
74
- def process_text(text)
75
- # Unescape HTML entities
76
- unescaped = CGI.unescapeHTML(text)
77
- # Remove unwanted HTML tags
78
- unescaped.gsub(@html_regex, "")
79
- end
80
- end
81
- end
82
- end
83
- end
@@ -1,9 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Youtube
4
- module Transcript
5
- module Rb
6
- VERSION = "0.1.0"
7
- end
8
- end
9
- end
@@ -1,37 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative "rb/version"
4
- require_relative "rb/settings"
5
- require_relative "rb/errors"
6
- require_relative "rb/transcript_parser"
7
- require_relative "rb/transcript"
8
- require_relative "rb/transcript_list"
9
- require_relative "rb/transcript_list_fetcher"
10
- require_relative "rb/api"
11
- require_relative "rb/formatters"
12
-
13
- module Youtube
14
- module Transcript
15
- module Rb
16
- class << self
17
- # Convenience method to fetch a transcript
18
- # @param video_id [String] YouTube video ID
19
- # @param languages [Array<String>] Language codes in order of preference
20
- # @param preserve_formatting [Boolean] Whether to preserve HTML formatting
21
- # @return [FetchedTranscript] The fetched transcript
22
- def fetch(video_id, languages: ["en"], preserve_formatting: false)
23
- api = YouTubeTranscriptApi.new
24
- api.fetch(video_id, languages: languages, preserve_formatting: preserve_formatting)
25
- end
26
-
27
- # Convenience method to list available transcripts
28
- # @param video_id [String] YouTube video ID
29
- # @return [TranscriptList] List of available transcripts
30
- def list(video_id)
31
- api = YouTubeTranscriptApi.new
32
- api.list(video_id)
33
- end
34
- end
35
- end
36
- end
37
- end
@@ -1,8 +0,0 @@
1
- module Youtube
2
- module Transcript
3
- module Rb
4
- VERSION: String
5
- # See the writing guide of rbs: https://github.com/ruby/rbs#guides
6
- end
7
- end
8
- end