youtube_transcript2020 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/youtube_transcript2020.rb +83 -34
- metadata +22 -2
- metadata.gz.sig +0 -0
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
         | 
| 4 | 
            +
              data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
         | 
| 7 | 
            +
              data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
         | 
    
        checksums.yaml.gz.sig
    CHANGED
    
    | Binary file | 
    
        data.tar.gz.sig
    CHANGED
    
    | Binary file | 
| @@ -4,6 +4,7 @@ | |
| 4 4 |  | 
| 5 5 | 
             
            require 'yawc'
         | 
| 6 6 | 
             
            require 'subunit'
         | 
| 7 | 
            +
            require 'youtube_id'
         | 
| 7 8 | 
             
            require 'simple-config'
         | 
| 8 9 |  | 
| 9 10 |  | 
| @@ -11,20 +12,20 @@ class YoutubeTranscript2020 | |
| 11 12 |  | 
| 12 13 | 
             
              attr_reader :to_a, :author, :id, :title
         | 
| 13 14 |  | 
| 14 | 
            -
              def initialize(id=nil)
         | 
| 15 | 
            +
              def initialize(id=nil, debug: false)  
         | 
| 15 16 |  | 
| 16 17 | 
             
                return unless id
         | 
| 18 | 
            +
                
         | 
| 19 | 
            +
                @debug = debug
         | 
| 17 20 |  | 
| 18 | 
            -
                @id = if id[/https | 
| 19 | 
            -
                  id | 
| 20 | 
            -
                elsif id[/https:\/\/youtu\.be\//]
         | 
| 21 | 
            -
                  id[/(?<=^https:\/\/youtu\.be\/).*/]
         | 
| 21 | 
            +
                @id = if id[/https?:\/\//] then
         | 
| 22 | 
            +
                  YoutubeID.from(id)
         | 
| 22 23 | 
             
                else
         | 
| 23 24 | 
             
                  id
         | 
| 24 25 | 
             
                end
         | 
| 25 26 |  | 
| 26 27 | 
             
                s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
         | 
| 27 | 
            -
                @s = parse s
         | 
| 28 | 
            +
                @s = parse(s) unless s.empty?
         | 
| 28 29 |  | 
| 29 30 | 
             
                fetch_info(@id)
         | 
| 30 31 |  | 
| @@ -52,14 +53,22 @@ class YoutubeTranscript2020 | |
| 52 53 |  | 
| 53 54 | 
             
                s = RXFHelper.read(obj).first
         | 
| 54 55 |  | 
| 55 | 
            -
                 | 
| 56 | 
            +
                if s =~ /------+/ then
         | 
| 57 | 
            +
                  header, body = s.split(/-----+/,2)
         | 
| 56 58 |  | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
                
         | 
| 59 | 
            +
                  h = SimpleConfig.new(header).to_h
         | 
| 60 | 
            +
                  @id, @author, @title = h[:id], h[:author], h[:title]
         | 
| 61 | 
            +
                  @s = body
         | 
| 62 | 
            +
                else
         | 
| 63 | 
            +
                  body = obj
         | 
| 64 | 
            +
                  raw_transcript = true
         | 
| 65 | 
            +
                end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                puts 'body: ' + body[0..400] if @debug
         | 
| 61 68 | 
             
                a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }    
         | 
| 62 | 
            -
                @a = a[0].zip(a[1]) | 
| 69 | 
            +
                @a = a[0].zip(a[1])
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                @s = join_sentences(@a) if raw_transcript
         | 
| 63 72 |  | 
| 64 73 | 
             
              end
         | 
| 65 74 |  | 
| @@ -87,7 +96,7 @@ class YoutubeTranscript2020 | |
| 87 96 | 
             
              <body>
         | 
| 88 97 | 
             
            <div style="width: 1080px; background: white">
         | 
| 89 98 | 
             
            <div style="float:left; width: 580px; background: white">
         | 
| 90 | 
            -
             | 
| 99 | 
            +
            #{@html_embed}
         | 
| 91 100 | 
             
            <h1>#{@title}</h1>
         | 
| 92 101 | 
             
            </div>
         | 
| 93 102 | 
             
            <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
         | 
| @@ -121,35 +130,51 @@ EOF | |
| 121 130 |  | 
| 122 131 | 
             
              def fetch_info(id)
         | 
| 123 132 |  | 
| 124 | 
            -
                url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format= | 
| 133 | 
            +
                url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
         | 
| 125 134 | 
             
                s = Net::HTTP.get(URI(url))
         | 
| 126 135 |  | 
| 127 | 
            -
                 | 
| 128 | 
            -
                 | 
| 129 | 
            -
                @ | 
| 136 | 
            +
                e = Rexle.new(s).root
         | 
| 137 | 
            +
                
         | 
| 138 | 
            +
                @title = e.text('title')
         | 
| 139 | 
            +
                @author = e.text('author_name')
         | 
| 140 | 
            +
                @html_embed = e.text('html').unescape
         | 
| 130 141 |  | 
| 131 142 | 
             
              end
         | 
| 132 | 
            -
             | 
| 133 | 
            -
              def  | 
| 134 | 
            -
             | 
| 135 | 
            -
                 | 
| 136 | 
            -
             | 
| 137 | 
            -
             | 
| 138 | 
            -
                  timestamp = Subunit.new(units={minutes:60, hours:60}, \
         | 
| 139 | 
            -
                    seconds: x.attributes[:start].to_f).to_s(verbose: false)
         | 
| 140 | 
            -
                  [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
         | 
| 143 | 
            +
              
         | 
| 144 | 
            +
              def join_sentences(a)
         | 
| 145 | 
            +
                
         | 
| 146 | 
            +
                if @debug then
         | 
| 147 | 
            +
                  puts 'inside join_sentence'
         | 
| 148 | 
            +
                  puts 'a: ' + a.take(3).inspect
         | 
| 141 149 | 
             
                end
         | 
| 142 | 
            -
             | 
| 143 | 
            -
                @to_a = a
         | 
| 144 | 
            -
             | 
| 150 | 
            +
                
         | 
| 145 151 | 
             
                a2 = []
         | 
| 146 152 |  | 
| 147 153 | 
             
                # the following cleans up sentences that start with And, Or, But, So etc.
         | 
| 148 154 |  | 
| 149 | 
            -
                a.each do |time,  | 
| 155 | 
            +
                a.each do |time, raws|
         | 
| 150 156 |  | 
| 151 | 
            -
                  if  | 
| 152 | 
            -
             | 
| 157 | 
            +
                  puts 'raws: ' + raws.inspect if @debug
         | 
| 158 | 
            +
                  
         | 
| 159 | 
            +
                  s = raws.sub(/^\W+/,'')
         | 
| 160 | 
            +
                  
         | 
| 161 | 
            +
                  if s[/^[a-z|0-9]|I\b|I'/]then
         | 
| 162 | 
            +
                    
         | 
| 163 | 
            +
                    if a2.any? then
         | 
| 164 | 
            +
                      
         | 
| 165 | 
            +
                      # only join two parts together if there was no full stop in 
         | 
| 166 | 
            +
                      # the previous line
         | 
| 167 | 
            +
                      
         | 
| 168 | 
            +
                      if a2[-1][-1] != /\.$/ then
         | 
| 169 | 
            +
                        a2[-1][-1] = a2[-1][-1].chomp + ' ' + s            
         | 
| 170 | 
            +
                      else
         | 
| 171 | 
            +
                        a2 << [time, s]
         | 
| 172 | 
            +
                      end
         | 
| 173 | 
            +
                      
         | 
| 174 | 
            +
                    else          
         | 
| 175 | 
            +
                      a2 << [time, s.capitalize]
         | 
| 176 | 
            +
                    end
         | 
| 177 | 
            +
                    
         | 
| 153 178 | 
             
                  elsif s[/^And,? /]
         | 
| 154 179 | 
             
                    a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
         | 
| 155 180 | 
             
                  elsif  s[/^Or,? /]
         | 
| @@ -160,15 +185,39 @@ EOF | |
| 160 185 | 
             
                    a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
         | 
| 161 186 | 
             
                  elsif s[/^So,? /]
         | 
| 162 187 | 
             
                    a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
         | 
| 188 | 
            +
                  elsif s[/^\[Music|Applause\]/i]
         | 
| 189 | 
            +
                    # ignore it
         | 
| 163 190 | 
             
                  else
         | 
| 164 | 
            -
                     | 
| 191 | 
            +
                    
         | 
| 192 | 
            +
                    if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
         | 
| 193 | 
            +
                        a2[-1][-1] = a2[-1][-1].chomp + ' ' + s            
         | 
| 194 | 
            +
                    else
         | 
| 195 | 
            +
                        a2 << [time, s]
         | 
| 196 | 
            +
                    end
         | 
| 197 | 
            +
                    
         | 
| 165 198 | 
             
                  end
         | 
| 166 199 |  | 
| 167 200 | 
             
                end
         | 
| 168 201 |  | 
| 169 202 | 
             
                # formats the paragraph with the timestamp appearing above
         | 
| 170 203 | 
             
                @a = a2
         | 
| 171 | 
            -
                a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
         | 
| 204 | 
            +
                a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")    
         | 
| 205 | 
            +
                
         | 
| 206 | 
            +
              end
         | 
| 207 | 
            +
             | 
| 208 | 
            +
              def parse(s)
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                doc = Rexle.new(s)
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                a = doc.root.elements.each.map do |x| 
         | 
| 213 | 
            +
                  timestamp = Subunit.new(units={minutes:60, hours:60}, \
         | 
| 214 | 
            +
                    seconds: x.attributes[:start].to_f).to_s(verbose: false)
         | 
| 215 | 
            +
                  [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
         | 
| 216 | 
            +
                end
         | 
| 217 | 
            +
             | 
| 218 | 
            +
                @to_a = a
         | 
| 219 | 
            +
             | 
| 220 | 
            +
                join_sentences(a)
         | 
| 172 221 |  | 
| 173 222 | 
             
              end
         | 
| 174 223 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: youtube_transcript2020
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.3.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - James Robertson
         | 
| @@ -35,7 +35,7 @@ cert_chain: | |
| 35 35 | 
             
              42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
         | 
| 36 36 | 
             
              6ZSQYo0XuSVg3by/5kp1TrrS
         | 
| 37 37 | 
             
              -----END CERTIFICATE-----
         | 
| 38 | 
            -
            date: 2020-07- | 
| 38 | 
            +
            date: 2020-07-29 00:00:00.000000000 Z
         | 
| 39 39 | 
             
            dependencies:
         | 
| 40 40 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 41 41 | 
             
              name: yawc
         | 
| @@ -97,6 +97,26 @@ dependencies: | |
| 97 97 | 
             
                - - ">="
         | 
| 98 98 | 
             
                  - !ruby/object:Gem::Version
         | 
| 99 99 | 
             
                    version: 0.7.1
         | 
| 100 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 101 | 
            +
              name: youtube_id
         | 
| 102 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 103 | 
            +
                requirements:
         | 
| 104 | 
            +
                - - ">="
         | 
| 105 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 106 | 
            +
                    version: 0.1.0
         | 
| 107 | 
            +
                - - "~>"
         | 
| 108 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 109 | 
            +
                    version: '0.1'
         | 
| 110 | 
            +
              type: :runtime
         | 
| 111 | 
            +
              prerelease: false
         | 
| 112 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 113 | 
            +
                requirements:
         | 
| 114 | 
            +
                - - ">="
         | 
| 115 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 116 | 
            +
                    version: 0.1.0
         | 
| 117 | 
            +
                - - "~>"
         | 
| 118 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 119 | 
            +
                    version: '0.1'
         | 
| 100 120 | 
             
            description: 
         | 
| 101 121 | 
             
            email: james@jamesrobertson.eu
         | 
| 102 122 | 
             
            executables: []
         | 
    
        metadata.gz.sig
    CHANGED
    
    | Binary file |