youtube_transcript2020 0.3.0 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
4
- data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
3
+ metadata.gz: 10d150f7396c9e0e18db51381fc5f262eb83595b4f3d5aad313192d86041071c
4
+ data.tar.gz: 607ae0291272ff40d08db5398f465d99872747e300500c214602b754c43811bb
5
5
  SHA512:
6
- metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
7
- data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
6
+ metadata.gz: 244d0242415308c65b07d9bfb09f6776fe3f67817dadbeb5189b4bb5756a4733668fde8b139f636c361598cb6e623ffbdfbad40c3675fd78b74df17701514183
7
+ data.tar.gz: '07884cdc2b52ad2ac5cf86814f8ae88aa864109d24584e2b256b65b0b8c8168ac7d08f71ce5f2343751b4bbcfe6d81960d4b81b50d4c49109d5b0855198761b1'
checksums.yaml.gz.sig CHANGED
Binary file
@@ -3,34 +3,36 @@
3
3
  # file: youtube_transcript2020.rb
4
4
 
5
5
  require 'yawc'
6
+ require 'json'
6
7
  require 'subunit'
7
8
  require 'youtube_id'
8
9
  require 'simple-config'
9
10
 
11
+ # https://github.com/jdepoix/youtube-transcript-api
10
12
 
11
13
  class YoutubeTranscript2020
12
14
 
13
15
  attr_reader :to_a, :author, :id, :title
14
16
 
15
- def initialize(id=nil, debug: false)
17
+ def initialize(id=nil, debug: false)
16
18
 
17
19
  return unless id
18
-
20
+
19
21
  @debug = debug
20
22
 
21
- @id = if id[/https?:\/\//] then
22
- YoutubeID.from(id)
23
- else
24
- id
25
- end
23
+ @id = id[/https?:\/\//] ? YoutubeID.from(id) : id
26
24
 
27
- s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
28
- @s = parse(s) unless s.empty?
25
+ # Fetching the transcript with the following statement no longer works.
26
+ # Instead, copy and paste the transcript from the YouTube video page into
27
+ # a text file and import it.
28
+ #
29
+ #s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
30
+ #@s = parse(s) unless s.empty?
29
31
 
30
32
  fetch_info(@id)
31
33
 
32
34
  end
33
-
35
+
34
36
  def to_a()
35
37
  @a
36
38
  end
@@ -42,7 +44,7 @@ class YoutubeTranscript2020
42
44
  h = {id: @id, title: @title, author: @author}
43
45
  SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
44
46
  end
45
-
47
+
46
48
  def to_text()
47
49
  @a.map(&:last).join("\n")
48
50
  end
@@ -51,21 +53,33 @@ class YoutubeTranscript2020
51
53
  #
52
54
  def import(obj)
53
55
 
54
- s = RXFHelper.read(obj).first
56
+ s = RXFReader.read(obj).first
55
57
 
56
58
  if s =~ /------+/ then
59
+
57
60
  header, body = s.split(/-----+/,2)
58
61
 
59
62
  h = SimpleConfig.new(header).to_h
60
63
  @id, @author, @title = h[:id], h[:author], h[:title]
61
64
  @s = body
65
+
66
+ elsif File.extname(obj) == '.json'
67
+
68
+ r = JSON.parse(s)
69
+ @a = r.map {|x| [x['start'], x['text']]}
70
+ @s = join_sentences(@a)
71
+
72
+ return
73
+
62
74
  else
75
+
63
76
  body = obj
64
77
  raw_transcript = true
78
+
65
79
  end
66
80
 
67
81
  puts 'body: ' + body[0..400] if @debug
68
- a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
82
+ a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
69
83
  @a = a[0].zip(a[1])
70
84
 
71
85
  @s = join_sentences(@a) if raw_transcript
@@ -79,13 +93,21 @@ class YoutubeTranscript2020
79
93
  url = 'https://www.youtube.com/embed/' + @id
80
94
 
81
95
  links = @a.map do |timestamp, s|
82
-
83
- seconds = Subunit.new(units={minutes:60, hours:60},
96
+
97
+ seconds = Subunit.new(units={minutes:60, hours:60},
84
98
  timestamp.split(':').map(&:to_i)).to_i
85
99
  "<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
86
100
  % [url, seconds, timestamp, s]
87
101
  end
88
102
 
103
+ puts '@html_embed: ' + @html_embed.inspect if @debug
104
+ doc = Rexle.new(@html_embed.to_s)
105
+ puts 'before attributes'
106
+ doc.root.attributes[:name] = 'video'
107
+ embed = doc.xml(declaration: false)
108
+ puts 'embed: ' + embed.inspect if @debug
109
+ #embed = @html_embed
110
+
89
111
  <<EOF
90
112
  <!DOCTYPE html>
91
113
  <html lang="en">
@@ -96,7 +118,7 @@ class YoutubeTranscript2020
96
118
  <body>
97
119
  <div style="width: 1080px; background: white">
98
120
  <div style="float:left; width: 580px; background: white">
99
- #{@html_embed}
121
+ #{embed}
100
122
  <h1>#{@title}</h1>
101
123
  </div>
102
124
  <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
@@ -112,9 +134,9 @@ EOF
112
134
  # Outputs plain text containing the headings including timestamps
113
135
  # note: This can be helpful for copying and pasting directly into a YouTube comment
114
136
  #
115
- def to_headings()
116
-
117
- @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
137
+ def to_headings()
138
+
139
+ @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
118
140
 
119
141
  end
120
142
 
@@ -129,52 +151,53 @@ EOF
129
151
  private
130
152
 
131
153
  def fetch_info(id)
132
-
133
- url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
154
+
155
+ url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
134
156
  s = Net::HTTP.get(URI(url))
135
-
157
+
136
158
  e = Rexle.new(s).root
137
-
159
+
138
160
  @title = e.text('title')
139
161
  @author = e.text('author_name')
140
162
  @html_embed = e.text('html').unescape
141
-
163
+ puts '@html_embed: ' + @html_embed.inspect if @debug
164
+
142
165
  end
143
-
166
+
144
167
  def join_sentences(a)
145
-
168
+
146
169
  if @debug then
147
170
  puts 'inside join_sentence'
148
171
  puts 'a: ' + a.take(3).inspect
149
172
  end
150
-
173
+
151
174
  a2 = []
152
175
 
153
176
  # the following cleans up sentences that start with And, Or, But, So etc.
154
177
 
155
- a.each do |time, raws|
178
+ (0..a.length - 1).each do |n|
179
+
180
+ time, s = a[n]
181
+
182
+ puts 's: ' + s.inspect if @debug
183
+
184
+ if s[/^[a-z|0-9]|I\b|I'/] then
156
185
 
157
- puts 'raws: ' + raws.inspect if @debug
158
-
159
- s = raws.sub(/^\W+/,'')
160
-
161
- if s[/^[a-z|0-9]|I\b|I'/]then
162
-
163
186
  if a2.any? then
164
-
165
- # only join two parts together if there was no full stop in
187
+
188
+ # only join two parts together if there was no full stop in
166
189
  # the previous line
167
-
190
+
168
191
  if a2[-1][-1] != /\.$/ then
169
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
192
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
170
193
  else
171
194
  a2 << [time, s]
172
195
  end
173
-
174
- else
196
+
197
+ else
175
198
  a2 << [time, s.capitalize]
176
199
  end
177
-
200
+
178
201
  elsif s[/^And,? /]
179
202
  a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
180
203
  elsif s[/^Or,? /]
@@ -184,32 +207,59 @@ EOF
184
207
  elsif s[/^"/]
185
208
  a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
186
209
  elsif s[/^So,? /]
187
- a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
188
- elsif s[/^\[Music|Applause\]/i]
210
+
211
+ puts 'so? a2[-1]' + a2[-1].inspect if @debug
212
+
213
+ if a2.empty? then
214
+ a2 << [time, s.sub(/^So,? /,'').capitalize]
215
+ else
216
+ a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
217
+ end
218
+ elsif s[/^\[(?:Music|Applause)\]/i]
219
+
189
220
  # ignore it
221
+ puts 'ignoring action commentary' if @debug
222
+ a2 << [time, '.']
223
+
224
+ # To promote the next sentence to a new timestamp we
225
+ # capitalize the 1st letter
226
+ a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
190
227
  else
191
-
228
+
192
229
  if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
193
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
230
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
194
231
  else
195
232
  a2 << [time, s]
196
233
  end
197
-
234
+
198
235
  end
199
236
 
200
237
  end
201
238
 
239
+ # Remove those modified entries which were labelled [Music] etc
240
+ a2.reject! {|time, s| s.length < 2}
241
+
202
242
  # formats the paragraph with the timestamp appearing above
203
243
  @a = a2
204
- a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
205
-
244
+
245
+ a2.map do |rawtime, s|
246
+
247
+ time = if rawtime.is_a? Float then
248
+ Subunit.seconds(rawtime).strfunit("%sc")
249
+ else
250
+ time
251
+ end
252
+
253
+ "\n%s\n\n%s" % [time, s]
254
+ end.join("\n")
255
+
206
256
  end
207
257
 
208
258
  def parse(s)
209
259
 
210
260
  doc = Rexle.new(s)
211
261
 
212
- a = doc.root.elements.each.map do |x|
262
+ a = doc.root.elements.each.map do |x|
213
263
  timestamp = Subunit.new(units={minutes:60, hours:60}, \
214
264
  seconds: x.attributes[:start].to_f).to_s(verbose: false)
215
265
  [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube_transcript2020
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,72 +11,72 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjAwNzI3MjI1MTUyWhcN
15
- MjEwNzI3MjI1MTUyWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxfRw+
17
- xg525jF+UNFVTtUrON2issNxWgDTq1efjPq9yMzqYrIDZREFE/3fgYbtAqA1Ut94
18
- 2h8mAKnAg1CC4plPA8o15f+h30TPRaxZXFmYUMxTkaLHL4Lvzd1D7eXqRYf9SFQM
19
- EvoYbncj9QwR57WcVF/MTdwbyyiZo3CGzwmWNb9OCIZtvs8m/UOzAmbfF3lIKz9k
20
- +ZK03KqYhyjuAiVhF39LdWUc1AWqu5i+JpFE+Lzfqv1uAjjgshmUkHOXkpWOorHc
21
- uxL0+xZXWgTwpa1QCw3cQY1LW45QjZt4ckA9lOub1LvUTDCvZocNS+dlIUMdW0mP
22
- jFII/nX/KWxW+NOmkWBpdGbXmY5QTppwx88r+VRpTdhepVcNiiHhMsYQsLI/fzVo
23
- kWTib/aBnAoahtlbaldC+e03GPsLPmpTl4ZjOFqUuAyq47h42NYt6kPY/y7Gj8To
24
- fx4pNgddR/r/WABaNao8Q+tzIxgQwCf1rijvfJP+u04GCmIeFm8oQ1x0XkUCAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQU1nkRML1E
26
- Q0PgH/jEHBOQSUTi4MYwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
15
+ MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
17
+ FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
18
+ HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
19
+ WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
20
+ rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
21
+ oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
22
+ HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
23
+ gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
24
+ PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
26
+ rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAWjyRzOnO0k/P2YHBsie5hNyJq6q7zb9bto2WYF1L
29
- N0/cvumuBsJMDUuPlD9RFvzncZbu//hbnZbK6cxiptm9HUN+m7zNi8XUcDHQw4Ba
30
- 17ZyHWKM2pkf+PJb4waQVeqyUXjbM9r6L8cVa1gkalU6ZpqEtBmkEzJCDZVf0Fll
31
- KrPYWAW5cC7EWeDm1yxusOqzxnkBcXMnKYNJm8KU4YfVpgPXJy9bTLWhm482BlJm
32
- v6wUZwYOM9B7x3dWbbsQXSuKmFqoxiNRWaA41qUS6eVjXpd4Gn/diSzntaX/Whew
33
- dCXyioQY49CVGJg8LpX/zSYUk9dns+fCSeUUfKjv2K8WuzVkS/uMA8DxSeYBfxf5
34
- ON+xcGIy3Nk7FHwY+CuIIa4WCJYB+1bVFeyCaRlCpwHK8DGUxP5PzCb44USGTI2V
35
- 42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
36
- 6ZSQYo0XuSVg3by/5kp1TrrS
28
+ BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
29
+ r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
30
+ WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
31
+ ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
32
+ rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
33
+ 041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
34
+ T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
35
+ Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
36
+ bYe6x24BppPTKnvGv7iKJQHe
37
37
  -----END CERTIFICATE-----
38
- date: 2020-07-29 00:00:00.000000000 Z
38
+ date: 2022-03-22 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: yawc
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: 0.2.0
47
44
  - - "~>"
48
45
  - !ruby/object:Gem::Version
49
- version: '0.2'
46
+ version: '0.3'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 0.3.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 0.2.0
57
54
  - - "~>"
58
55
  - !ruby/object:Gem::Version
59
- version: '0.2'
56
+ version: '0.3'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 0.3.0
60
60
  - !ruby/object:Gem::Dependency
61
61
  name: subunit
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.5'
66
+ version: '0.8'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.5.2
69
+ version: 0.8.7
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.5'
76
+ version: '0.8'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.5.2
79
+ version: 0.8.7
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: simple-config
82
82
  requirement: !ruby/object:Gem::Requirement
@@ -86,7 +86,7 @@ dependencies:
86
86
  version: '0.7'
87
87
  - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.7.1
89
+ version: 0.7.2
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
@@ -96,29 +96,29 @@ dependencies:
96
96
  version: '0.7'
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
- version: 0.7.1
99
+ version: 0.7.2
100
100
  - !ruby/object:Gem::Dependency
101
101
  name: youtube_id
102
102
  requirement: !ruby/object:Gem::Requirement
103
103
  requirements:
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 0.1.0
107
104
  - - "~>"
108
105
  - !ruby/object:Gem::Version
109
106
  version: '0.1'
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 0.1.0
110
110
  type: :runtime
111
111
  prerelease: false
112
112
  version_requirements: !ruby/object:Gem::Requirement
113
113
  requirements:
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- version: 0.1.0
117
114
  - - "~>"
118
115
  - !ruby/object:Gem::Version
119
116
  version: '0.1'
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 0.1.0
120
120
  description:
121
- email: james@jamesrobertson.eu
121
+ email: digital.robertson@gmail.com
122
122
  executables: []
123
123
  extensions: []
124
124
  extra_rdoc_files: []
@@ -143,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
143
143
  - !ruby/object:Gem::Version
144
144
  version: '0'
145
145
  requirements: []
146
- rubygems_version: 3.0.3
146
+ rubygems_version: 3.2.22
147
147
  signing_key:
148
148
  specification_version: 4
149
149
  summary: Makes it easier to digest a Youtube video by reading the transcript.
metadata.gz.sig CHANGED
Binary file