youtube_transcript2020 0.2.0 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9405f15c45cd185c0ad06d54fb2e7198a39cdef14896051a3b03ea3d14b497fb
4
- data.tar.gz: 643f78333b91e3c5ff5ef960d8339a496618bcf3b5d81f41d74d17ffb760d533
3
+ metadata.gz: 9462fe85e21ee061f717e564efb7b66bec2e62b1c454228d6fb8f31633f7363d
4
+ data.tar.gz: bb04a03be0cb61058682ce4d9c1159666e41feb2a794fe65d07fbae418412056
5
5
  SHA512:
6
- metadata.gz: 59915336a37bb3c3bdd7d84715c0e254db0f1b20b720c66f5191b6dba38ec072e3abb4e117f5876bcaf89eaa4dcfb7871ab279f36becd1523ba1d56daf230cae
7
- data.tar.gz: 53851b499c5a2024303f3da0b1da1da54a0eaf096db197c5ac97035e1ccb125ef581d551fa154c39892decfd5041fe667a2364293db8c1d1f8057ab24757b1ee
6
+ metadata.gz: d8d4f57bb617e07f647727eaf56c81f1a7efc5eaccc71c323b54e887a270691f01f1d58cc64fe505a40ad6bd5ebd2cea3ea5fc87b7950abe773697acb5e7cc4b
7
+ data.tar.gz: 9a9a4e72f99f70aca736171490bba54d563aa722576ce6d91435177ed9f0fb327080d84ead963474ab8e46cd1c2dfc4709d2ab93c20853c2a376b091399ed6ad
checksums.yaml.gz.sig CHANGED
Binary file
@@ -4,6 +4,7 @@
4
4
 
5
5
  require 'yawc'
6
6
  require 'subunit'
7
+ require 'youtube_id'
7
8
  require 'simple-config'
8
9
 
9
10
 
@@ -11,25 +12,25 @@ class YoutubeTranscript2020
11
12
 
12
13
  attr_reader :to_a, :author, :id, :title
13
14
 
14
- def initialize(id=nil)
15
+ def initialize(id=nil, debug: false)
15
16
 
16
17
  return unless id
17
18
 
18
- @id = if id[/https:\/\/www\.youtube\.com\/watch\?v=/] then
19
- id[/(?<=^https:\/\/www\.youtube\.com\/watch\?v=).*/]
20
- elsif id[/https:\/\/youtu\.be\//]
21
- id[/(?<=^https:\/\/youtu\.be\/).*/]
22
- else
23
- id
24
- end
19
+ @debug = debug
25
20
 
26
- s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
27
- @s = parse s
21
+ @id = id[/https?:\/\//] ? YoutubeID.from(id) : id
22
+
23
+ # Fetching the transcript from the following statement no longer works.
24
+ # Instead, copy and paste the transcript from the YouTube video page into
25
+ # a text file and import it.
26
+ #
27
+ #s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
28
+ #@s = parse(s) unless s.empty?
28
29
 
29
30
  fetch_info(@id)
30
31
 
31
32
  end
32
-
33
+
33
34
  def to_a()
34
35
  @a
35
36
  end
@@ -41,7 +42,7 @@ class YoutubeTranscript2020
41
42
  h = {id: @id, title: @title, author: @author}
42
43
  SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
43
44
  end
44
-
45
+
45
46
  def to_text()
46
47
  @a.map(&:last).join("\n")
47
48
  end
@@ -50,16 +51,24 @@ class YoutubeTranscript2020
50
51
  #
51
52
  def import(obj)
52
53
 
53
- s = RXFHelper.read(obj).first
54
+ s = RXFReader.read(obj).first
54
55
 
55
- header, body = s.split(/-----+/,2)
56
+ if s =~ /------+/ then
57
+ header, body = s.split(/-----+/,2)
56
58
 
57
- h = SimpleConfig.new(header).to_h
58
- @id, @author, @title = h[:id], h[:author], h[:title]
59
- @s = body
60
-
61
- a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
62
- @a = a[0].zip(a[1])
59
+ h = SimpleConfig.new(header).to_h
60
+ @id, @author, @title = h[:id], h[:author], h[:title]
61
+ @s = body
62
+ else
63
+ body = obj
64
+ raw_transcript = true
65
+ end
66
+
67
+ puts 'body: ' + body[0..400] if @debug
68
+ a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
69
+ @a = a[0].zip(a[1])
70
+
71
+ @s = join_sentences(@a) if raw_transcript
63
72
 
64
73
  end
65
74
 
@@ -70,13 +79,21 @@ class YoutubeTranscript2020
70
79
  url = 'https://www.youtube.com/embed/' + @id
71
80
 
72
81
  links = @a.map do |timestamp, s|
73
-
74
- seconds = Subunit.new(units={minutes:60, hours:60},
82
+
83
+ seconds = Subunit.new(units={minutes:60, hours:60},
75
84
  timestamp.split(':').map(&:to_i)).to_i
76
85
  "<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
77
86
  % [url, seconds, timestamp, s]
78
87
  end
79
88
 
89
+ puts '@html_embed: ' + @html_embed.inspect if @debug
90
+ doc = Rexle.new(@html_embed.to_s)
91
+ puts 'before attributes'
92
+ doc.root.attributes[:name] = 'video'
93
+ embed = doc.xml(declaration: false)
94
+ puts 'embed: ' + embed.inspect if @debug
95
+ #embed = @html_embed
96
+
80
97
  <<EOF
81
98
  <!DOCTYPE html>
82
99
  <html lang="en">
@@ -87,7 +104,7 @@ class YoutubeTranscript2020
87
104
  <body>
88
105
  <div style="width: 1080px; background: white">
89
106
  <div style="float:left; width: 580px; background: white">
90
- <iframe width="560" height="315" src="#{url}&autoplay=1" name="video" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
107
+ #{embed}
91
108
  <h1>#{@title}</h1>
92
109
  </div>
93
110
  <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
@@ -103,9 +120,9 @@ EOF
103
120
  # Outputs plain text containing the headings including timestamps
104
121
  # note: This can be helpful for copyng and pasting directly into a YouTube comment
105
122
  #
106
- def to_headings()
107
-
108
- @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
123
+ def to_headings()
124
+
125
+ @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
109
126
 
110
127
  end
111
128
 
@@ -120,36 +137,53 @@ EOF
120
137
  private
121
138
 
122
139
  def fetch_info(id)
123
-
124
- url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=json"
140
+
141
+ url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
125
142
  s = Net::HTTP.get(URI(url))
126
-
127
- h = JSON.parse(s, symbolize_names: true)
128
- @title = h[:title]
129
- @author = h[:author_name]
130
-
131
- end
132
143
 
133
- def parse(s)
144
+ e = Rexle.new(s).root
134
145
 
135
- doc = Rexle.new(s)
146
+ @title = e.text('title')
147
+ @author = e.text('author_name')
148
+ @html_embed = e.text('html').unescape
149
+ puts '@html_embed: ' + @html_embed.inspect if @debug
136
150
 
137
- a = doc.root.elements.each.map do |x|
138
- timestamp = Subunit.new(units={minutes:60, hours:60}, \
139
- seconds: x.attributes[:start].to_f).to_s(verbose: false)
140
- [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
141
- end
151
+ end
142
152
 
143
- @to_a = a
153
+ def join_sentences(a)
154
+
155
+ if @debug then
156
+ puts 'inside join_sentence'
157
+ puts 'a: ' + a.take(3).inspect
158
+ end
144
159
 
145
160
  a2 = []
146
161
 
147
162
  # the following cleans up sentences that start with And, Or, But, So etc.
148
163
 
149
- a.each do |time, s|
164
+ (0..a.length - 1).each do |n|
165
+
166
+ time, s = a[n]
167
+
168
+ puts 's: ' + s.inspect if @debug
169
+
170
+ if s[/^[a-z|0-9]|I\b|I'/] then
171
+
172
+ if a2.any? then
173
+
174
+ # only join two parts together if there was no full stop in
175
+ # the previous line
176
+
177
+ if a2[-1][-1] != /\.$/ then
178
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
179
+ else
180
+ a2 << [time, s]
181
+ end
182
+
183
+ else
184
+ a2 << [time, s.capitalize]
185
+ end
150
186
 
151
- if s[/^[a-z|0-9]/]then
152
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
153
187
  elsif s[/^And,? /]
154
188
  a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
155
189
  elsif s[/^Or,? /]
@@ -160,16 +194,50 @@ EOF
160
194
  a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
161
195
  elsif s[/^So,? /]
162
196
  a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
197
+ elsif s[/^\[(?:Music|Applause)\]/i]
198
+
199
+ # ignore it
200
+ puts 'ignoring action commentary' if @debug
201
+ a2 << [time, '.']
202
+
203
+ # To promote the next sentence to a new timestamp we
204
+ # capitalize the 1st letter
205
+ a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
163
206
  else
164
- a2 << [time, s]
207
+
208
+ if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
209
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
210
+ else
211
+ a2 << [time, s]
212
+ end
213
+
165
214
  end
166
215
 
167
216
  end
168
217
 
218
+ # Remove those modified entries which were labelled [Music] etc
219
+ a2.reject! {|time, s| s.length < 2}
220
+
169
221
  # formats the paragraph with the timestamp appearing above
170
222
  @a = a2
171
223
  a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
172
224
 
173
225
  end
174
226
 
227
+ def parse(s)
228
+
229
+ doc = Rexle.new(s)
230
+
231
+ a = doc.root.elements.each.map do |x|
232
+ timestamp = Subunit.new(units={minutes:60, hours:60}, \
233
+ seconds: x.attributes[:start].to_f).to_s(verbose: false)
234
+ [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
235
+ end
236
+
237
+ @to_a = a
238
+
239
+ join_sentences(a)
240
+
241
+ end
242
+
175
243
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube_transcript2020
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,72 +11,72 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjAwNzI3MjI1MTUyWhcN
15
- MjEwNzI3MjI1MTUyWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxfRw+
17
- xg525jF+UNFVTtUrON2issNxWgDTq1efjPq9yMzqYrIDZREFE/3fgYbtAqA1Ut94
18
- 2h8mAKnAg1CC4plPA8o15f+h30TPRaxZXFmYUMxTkaLHL4Lvzd1D7eXqRYf9SFQM
19
- EvoYbncj9QwR57WcVF/MTdwbyyiZo3CGzwmWNb9OCIZtvs8m/UOzAmbfF3lIKz9k
20
- +ZK03KqYhyjuAiVhF39LdWUc1AWqu5i+JpFE+Lzfqv1uAjjgshmUkHOXkpWOorHc
21
- uxL0+xZXWgTwpa1QCw3cQY1LW45QjZt4ckA9lOub1LvUTDCvZocNS+dlIUMdW0mP
22
- jFII/nX/KWxW+NOmkWBpdGbXmY5QTppwx88r+VRpTdhepVcNiiHhMsYQsLI/fzVo
23
- kWTib/aBnAoahtlbaldC+e03GPsLPmpTl4ZjOFqUuAyq47h42NYt6kPY/y7Gj8To
24
- fx4pNgddR/r/WABaNao8Q+tzIxgQwCf1rijvfJP+u04GCmIeFm8oQ1x0XkUCAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQU1nkRML1E
26
- Q0PgH/jEHBOQSUTi4MYwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
15
+ MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
17
+ FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
18
+ HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
19
+ WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
20
+ rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
21
+ oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
22
+ HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
23
+ gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
24
+ PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
26
+ rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAWjyRzOnO0k/P2YHBsie5hNyJq6q7zb9bto2WYF1L
29
- N0/cvumuBsJMDUuPlD9RFvzncZbu//hbnZbK6cxiptm9HUN+m7zNi8XUcDHQw4Ba
30
- 17ZyHWKM2pkf+PJb4waQVeqyUXjbM9r6L8cVa1gkalU6ZpqEtBmkEzJCDZVf0Fll
31
- KrPYWAW5cC7EWeDm1yxusOqzxnkBcXMnKYNJm8KU4YfVpgPXJy9bTLWhm482BlJm
32
- v6wUZwYOM9B7x3dWbbsQXSuKmFqoxiNRWaA41qUS6eVjXpd4Gn/diSzntaX/Whew
33
- dCXyioQY49CVGJg8LpX/zSYUk9dns+fCSeUUfKjv2K8WuzVkS/uMA8DxSeYBfxf5
34
- ON+xcGIy3Nk7FHwY+CuIIa4WCJYB+1bVFeyCaRlCpwHK8DGUxP5PzCb44USGTI2V
35
- 42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
36
- 6ZSQYo0XuSVg3by/5kp1TrrS
28
+ BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
29
+ r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
30
+ WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
31
+ ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
32
+ rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
33
+ 041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
34
+ T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
35
+ Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
36
+ bYe6x24BppPTKnvGv7iKJQHe
37
37
  -----END CERTIFICATE-----
38
- date: 2020-07-28 00:00:00.000000000 Z
38
+ date: 2022-03-21 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: yawc
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: 0.2.0
47
44
  - - "~>"
48
45
  - !ruby/object:Gem::Version
49
- version: '0.2'
46
+ version: '0.3'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 0.3.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 0.2.0
57
54
  - - "~>"
58
55
  - !ruby/object:Gem::Version
59
- version: '0.2'
56
+ version: '0.3'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 0.3.0
60
60
  - !ruby/object:Gem::Dependency
61
61
  name: subunit
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.5'
66
+ version: '0.8'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.5.2
69
+ version: 0.8.5
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.5'
76
+ version: '0.8'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.5.2
79
+ version: 0.8.5
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: simple-config
82
82
  requirement: !ruby/object:Gem::Requirement
@@ -86,7 +86,7 @@ dependencies:
86
86
  version: '0.7'
87
87
  - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.7.1
89
+ version: 0.7.2
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
@@ -96,9 +96,29 @@ dependencies:
96
96
  version: '0.7'
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
- version: 0.7.1
99
+ version: 0.7.2
100
+ - !ruby/object:Gem::Dependency
101
+ name: youtube_id
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '0.1'
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 0.1.0
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '0.1'
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 0.1.0
100
120
  description:
101
- email: james@jamesrobertson.eu
121
+ email: digital.robertson@gmail.com
102
122
  executables: []
103
123
  extensions: []
104
124
  extra_rdoc_files: []
@@ -123,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
123
143
  - !ruby/object:Gem::Version
124
144
  version: '0'
125
145
  requirements: []
126
- rubygems_version: 3.0.3
146
+ rubygems_version: 3.2.22
127
147
  signing_key:
128
148
  specification_version: 4
129
149
  summary: Makes it easier to digest a Youtube video by reading the transcript.
metadata.gz.sig CHANGED
Binary file