youtube_transcript2020 0.3.0 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
4
- data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
3
+ metadata.gz: 10d150f7396c9e0e18db51381fc5f262eb83595b4f3d5aad313192d86041071c
4
+ data.tar.gz: 607ae0291272ff40d08db5398f465d99872747e300500c214602b754c43811bb
5
5
  SHA512:
6
- metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
7
- data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
6
+ metadata.gz: 244d0242415308c65b07d9bfb09f6776fe3f67817dadbeb5189b4bb5756a4733668fde8b139f636c361598cb6e623ffbdfbad40c3675fd78b74df17701514183
7
+ data.tar.gz: '07884cdc2b52ad2ac5cf86814f8ae88aa864109d24584e2b256b65b0b8c8168ac7d08f71ce5f2343751b4bbcfe6d81960d4b81b50d4c49109d5b0855198761b1'
checksums.yaml.gz.sig CHANGED
Binary file
@@ -3,34 +3,36 @@
3
3
  # file: youtube_transcript2020.rb
4
4
 
5
5
  require 'yawc'
6
+ require 'json'
6
7
  require 'subunit'
7
8
  require 'youtube_id'
8
9
  require 'simple-config'
9
10
 
11
+ # https://github.com/jdepoix/youtube-transcript-api
10
12
 
11
13
  class YoutubeTranscript2020
12
14
 
13
15
  attr_reader :to_a, :author, :id, :title
14
16
 
15
- def initialize(id=nil, debug: false)
17
+ def initialize(id=nil, debug: false)
16
18
 
17
19
  return unless id
18
-
20
+
19
21
  @debug = debug
20
22
 
21
- @id = if id[/https?:\/\//] then
22
- YoutubeID.from(id)
23
- else
24
- id
25
- end
23
+ @id = id[/https?:\/\//] ? YoutubeID.from(id) : id
26
24
 
27
- s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
28
- @s = parse(s) unless s.empty?
25
+ # Fetching the transcript from the following statement no longer works.
26
+ # Instead, copy and paste the transcript from the YouTube video page into
27
+ # a text file and import it.
28
+ #
29
+ #s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
30
+ #@s = parse(s) unless s.empty?
29
31
 
30
32
  fetch_info(@id)
31
33
 
32
34
  end
33
-
35
+
34
36
  def to_a()
35
37
  @a
36
38
  end
@@ -42,7 +44,7 @@ class YoutubeTranscript2020
42
44
  h = {id: @id, title: @title, author: @author}
43
45
  SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
44
46
  end
45
-
47
+
46
48
  def to_text()
47
49
  @a.map(&:last).join("\n")
48
50
  end
@@ -51,21 +53,33 @@ class YoutubeTranscript2020
51
53
  #
52
54
  def import(obj)
53
55
 
54
- s = RXFHelper.read(obj).first
56
+ s = RXFReader.read(obj).first
55
57
 
56
58
  if s =~ /------+/ then
59
+
57
60
  header, body = s.split(/-----+/,2)
58
61
 
59
62
  h = SimpleConfig.new(header).to_h
60
63
  @id, @author, @title = h[:id], h[:author], h[:title]
61
64
  @s = body
65
+
66
+ elsif File.extname(obj) == '.json'
67
+
68
+ r = JSON.parse(s)
69
+ @a = r.map {|x| [x['start'], x['text']]}
70
+ @s = join_sentences(@a)
71
+
72
+ return
73
+
62
74
  else
75
+
63
76
  body = obj
64
77
  raw_transcript = true
78
+
65
79
  end
66
80
 
67
81
  puts 'body: ' + body[0..400] if @debug
68
- a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
82
+ a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
69
83
  @a = a[0].zip(a[1])
70
84
 
71
85
  @s = join_sentences(@a) if raw_transcript
@@ -79,13 +93,21 @@ class YoutubeTranscript2020
79
93
  url = 'https://www.youtube.com/embed/' + @id
80
94
 
81
95
  links = @a.map do |timestamp, s|
82
-
83
- seconds = Subunit.new(units={minutes:60, hours:60},
96
+
97
+ seconds = Subunit.new(units={minutes:60, hours:60},
84
98
  timestamp.split(':').map(&:to_i)).to_i
85
99
  "<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
86
100
  % [url, seconds, timestamp, s]
87
101
  end
88
102
 
103
+ puts '@html_embed: ' + @html_embed.inspect if @debug
104
+ doc = Rexle.new(@html_embed.to_s)
105
+ puts 'before attributes'
106
+ doc.root.attributes[:name] = 'video'
107
+ embed = doc.xml(declaration: false)
108
+ puts 'embed: ' + embed.inspect if @debug
109
+ #embed = @html_embed
110
+
89
111
  <<EOF
90
112
  <!DOCTYPE html>
91
113
  <html lang="en">
@@ -96,7 +118,7 @@ class YoutubeTranscript2020
96
118
  <body>
97
119
  <div style="width: 1080px; background: white">
98
120
  <div style="float:left; width: 580px; background: white">
99
- #{@html_embed}
121
+ #{embed}
100
122
  <h1>#{@title}</h1>
101
123
  </div>
102
124
  <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
@@ -112,9 +134,9 @@ EOF
112
134
  # Outputs plain text containing the headings including timestamps
113
135
  # note: This can be helpful for copying and pasting directly into a YouTube comment
114
136
  #
115
- def to_headings()
116
-
117
- @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
137
+ def to_headings()
138
+
139
+ @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
118
140
 
119
141
  end
120
142
 
@@ -129,52 +151,53 @@ EOF
129
151
  private
130
152
 
131
153
  def fetch_info(id)
132
-
133
- url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
154
+
155
+ url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
134
156
  s = Net::HTTP.get(URI(url))
135
-
157
+
136
158
  e = Rexle.new(s).root
137
-
159
+
138
160
  @title = e.text('title')
139
161
  @author = e.text('author_name')
140
162
  @html_embed = e.text('html').unescape
141
-
163
+ puts '@html_embed: ' + @html_embed.inspect if @debug
164
+
142
165
  end
143
-
166
+
144
167
  def join_sentences(a)
145
-
168
+
146
169
  if @debug then
147
170
  puts 'inside join_sentence'
148
171
  puts 'a: ' + a.take(3).inspect
149
172
  end
150
-
173
+
151
174
  a2 = []
152
175
 
153
176
  # the following cleans up sentences that start with And, Or, But, So etc.
154
177
 
155
- a.each do |time, raws|
178
+ (0..a.length - 1).each do |n|
179
+
180
+ time, s = a[n]
181
+
182
+ puts 's: ' + s.inspect if @debug
183
+
184
+ if s[/^[a-z|0-9]|I\b|I'/] then
156
185
 
157
- puts 'raws: ' + raws.inspect if @debug
158
-
159
- s = raws.sub(/^\W+/,'')
160
-
161
- if s[/^[a-z|0-9]|I\b|I'/]then
162
-
163
186
  if a2.any? then
164
-
165
- # only join two parts together if there was no full stop in
187
+
188
+ # only join two parts together if there was no full stop in
166
189
  # the previous line
167
-
190
+
168
191
  if a2[-1][-1] != /\.$/ then
169
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
192
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
170
193
  else
171
194
  a2 << [time, s]
172
195
  end
173
-
174
- else
196
+
197
+ else
175
198
  a2 << [time, s.capitalize]
176
199
  end
177
-
200
+
178
201
  elsif s[/^And,? /]
179
202
  a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
180
203
  elsif s[/^Or,? /]
@@ -184,32 +207,59 @@ EOF
184
207
  elsif s[/^"/]
185
208
  a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
186
209
  elsif s[/^So,? /]
187
- a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
188
- elsif s[/^\[Music|Applause\]/i]
210
+
211
+ puts 'so? a2[-1]' + a2[-1].inspect if @debug
212
+
213
+ if a2.empty? then
214
+ a2 << [time, s.sub(/^So,? /,'').capitalize]
215
+ else
216
+ a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
217
+ end
218
+ elsif s[/^\[(?:Music|Applause)\]/i]
219
+
189
220
  # ignore it
221
+ puts 'ignoring action commentary' if @debug
222
+ a2 << [time, '.']
223
+
224
+ # To promote the next sentence to a new timestamp we
225
+ # capitalize the 1st letter
226
+ a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
190
227
  else
191
-
228
+
192
229
  if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
193
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
230
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
194
231
  else
195
232
  a2 << [time, s]
196
233
  end
197
-
234
+
198
235
  end
199
236
 
200
237
  end
201
238
 
239
+ # Remove those modified entries which were labelled [Music] etc
240
+ a2.reject! {|time, s| s.length < 2}
241
+
202
242
  # formats the paragraph with the timestamp appearing above
203
243
  @a = a2
204
- a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
205
-
244
+
245
+ a2.map do |rawtime, s|
246
+
247
+ time = if rawtime.is_a? Float then
248
+ Subunit.seconds(rawtime).strfunit("%sc")
249
+ else
250
+ time
251
+ end
252
+
253
+ "\n%s\n\n%s" % [time, s]
254
+ end.join("\n")
255
+
206
256
  end
207
257
 
208
258
  def parse(s)
209
259
 
210
260
  doc = Rexle.new(s)
211
261
 
212
- a = doc.root.elements.each.map do |x|
262
+ a = doc.root.elements.each.map do |x|
213
263
  timestamp = Subunit.new(units={minutes:60, hours:60}, \
214
264
  seconds: x.attributes[:start].to_f).to_s(verbose: false)
215
265
  [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube_transcript2020
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,72 +11,72 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjAwNzI3MjI1MTUyWhcN
15
- MjEwNzI3MjI1MTUyWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxfRw+
17
- xg525jF+UNFVTtUrON2issNxWgDTq1efjPq9yMzqYrIDZREFE/3fgYbtAqA1Ut94
18
- 2h8mAKnAg1CC4plPA8o15f+h30TPRaxZXFmYUMxTkaLHL4Lvzd1D7eXqRYf9SFQM
19
- EvoYbncj9QwR57WcVF/MTdwbyyiZo3CGzwmWNb9OCIZtvs8m/UOzAmbfF3lIKz9k
20
- +ZK03KqYhyjuAiVhF39LdWUc1AWqu5i+JpFE+Lzfqv1uAjjgshmUkHOXkpWOorHc
21
- uxL0+xZXWgTwpa1QCw3cQY1LW45QjZt4ckA9lOub1LvUTDCvZocNS+dlIUMdW0mP
22
- jFII/nX/KWxW+NOmkWBpdGbXmY5QTppwx88r+VRpTdhepVcNiiHhMsYQsLI/fzVo
23
- kWTib/aBnAoahtlbaldC+e03GPsLPmpTl4ZjOFqUuAyq47h42NYt6kPY/y7Gj8To
24
- fx4pNgddR/r/WABaNao8Q+tzIxgQwCf1rijvfJP+u04GCmIeFm8oQ1x0XkUCAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQU1nkRML1E
26
- Q0PgH/jEHBOQSUTi4MYwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
15
+ MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
17
+ FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
18
+ HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
19
+ WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
20
+ rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
21
+ oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
22
+ HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
23
+ gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
24
+ PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
26
+ rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAWjyRzOnO0k/P2YHBsie5hNyJq6q7zb9bto2WYF1L
29
- N0/cvumuBsJMDUuPlD9RFvzncZbu//hbnZbK6cxiptm9HUN+m7zNi8XUcDHQw4Ba
30
- 17ZyHWKM2pkf+PJb4waQVeqyUXjbM9r6L8cVa1gkalU6ZpqEtBmkEzJCDZVf0Fll
31
- KrPYWAW5cC7EWeDm1yxusOqzxnkBcXMnKYNJm8KU4YfVpgPXJy9bTLWhm482BlJm
32
- v6wUZwYOM9B7x3dWbbsQXSuKmFqoxiNRWaA41qUS6eVjXpd4Gn/diSzntaX/Whew
33
- dCXyioQY49CVGJg8LpX/zSYUk9dns+fCSeUUfKjv2K8WuzVkS/uMA8DxSeYBfxf5
34
- ON+xcGIy3Nk7FHwY+CuIIa4WCJYB+1bVFeyCaRlCpwHK8DGUxP5PzCb44USGTI2V
35
- 42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
36
- 6ZSQYo0XuSVg3by/5kp1TrrS
28
+ BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
29
+ r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
30
+ WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
31
+ ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
32
+ rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
33
+ 041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
34
+ T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
35
+ Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
36
+ bYe6x24BppPTKnvGv7iKJQHe
37
37
  -----END CERTIFICATE-----
38
- date: 2020-07-29 00:00:00.000000000 Z
38
+ date: 2022-03-22 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: yawc
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: 0.2.0
47
44
  - - "~>"
48
45
  - !ruby/object:Gem::Version
49
- version: '0.2'
46
+ version: '0.3'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 0.3.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 0.2.0
57
54
  - - "~>"
58
55
  - !ruby/object:Gem::Version
59
- version: '0.2'
56
+ version: '0.3'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 0.3.0
60
60
  - !ruby/object:Gem::Dependency
61
61
  name: subunit
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.5'
66
+ version: '0.8'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.5.2
69
+ version: 0.8.7
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.5'
76
+ version: '0.8'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.5.2
79
+ version: 0.8.7
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: simple-config
82
82
  requirement: !ruby/object:Gem::Requirement
@@ -86,7 +86,7 @@ dependencies:
86
86
  version: '0.7'
87
87
  - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.7.1
89
+ version: 0.7.2
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
@@ -96,29 +96,29 @@ dependencies:
96
96
  version: '0.7'
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
- version: 0.7.1
99
+ version: 0.7.2
100
100
  - !ruby/object:Gem::Dependency
101
101
  name: youtube_id
102
102
  requirement: !ruby/object:Gem::Requirement
103
103
  requirements:
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 0.1.0
107
104
  - - "~>"
108
105
  - !ruby/object:Gem::Version
109
106
  version: '0.1'
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 0.1.0
110
110
  type: :runtime
111
111
  prerelease: false
112
112
  version_requirements: !ruby/object:Gem::Requirement
113
113
  requirements:
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- version: 0.1.0
117
114
  - - "~>"
118
115
  - !ruby/object:Gem::Version
119
116
  version: '0.1'
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 0.1.0
120
120
  description:
121
- email: james@jamesrobertson.eu
121
+ email: digital.robertson@gmail.com
122
122
  executables: []
123
123
  extensions: []
124
124
  extra_rdoc_files: []
@@ -143,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
143
143
  - !ruby/object:Gem::Version
144
144
  version: '0'
145
145
  requirements: []
146
- rubygems_version: 3.0.3
146
+ rubygems_version: 3.2.22
147
147
  signing_key:
148
148
  specification_version: 4
149
149
  summary: Makes it easier to digest a Youtube video by reading the transcript.
metadata.gz.sig CHANGED
Binary file