youtube_transcript2020 0.2.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9405f15c45cd185c0ad06d54fb2e7198a39cdef14896051a3b03ea3d14b497fb
4
- data.tar.gz: 643f78333b91e3c5ff5ef960d8339a496618bcf3b5d81f41d74d17ffb760d533
3
+ metadata.gz: 9462fe85e21ee061f717e564efb7b66bec2e62b1c454228d6fb8f31633f7363d
4
+ data.tar.gz: bb04a03be0cb61058682ce4d9c1159666e41feb2a794fe65d07fbae418412056
5
5
  SHA512:
6
- metadata.gz: 59915336a37bb3c3bdd7d84715c0e254db0f1b20b720c66f5191b6dba38ec072e3abb4e117f5876bcaf89eaa4dcfb7871ab279f36becd1523ba1d56daf230cae
7
- data.tar.gz: 53851b499c5a2024303f3da0b1da1da54a0eaf096db197c5ac97035e1ccb125ef581d551fa154c39892decfd5041fe667a2364293db8c1d1f8057ab24757b1ee
6
+ metadata.gz: d8d4f57bb617e07f647727eaf56c81f1a7efc5eaccc71c323b54e887a270691f01f1d58cc64fe505a40ad6bd5ebd2cea3ea5fc87b7950abe773697acb5e7cc4b
7
+ data.tar.gz: 9a9a4e72f99f70aca736171490bba54d563aa722576ce6d91435177ed9f0fb327080d84ead963474ab8e46cd1c2dfc4709d2ab93c20853c2a376b091399ed6ad
checksums.yaml.gz.sig CHANGED
Binary file
@@ -4,6 +4,7 @@
4
4
 
5
5
  require 'yawc'
6
6
  require 'subunit'
7
+ require 'youtube_id'
7
8
  require 'simple-config'
8
9
 
9
10
 
@@ -11,25 +12,25 @@ class YoutubeTranscript2020
11
12
 
12
13
  attr_reader :to_a, :author, :id, :title
13
14
 
14
- def initialize(id=nil)
15
+ def initialize(id=nil, debug: false)
15
16
 
16
17
  return unless id
17
18
 
18
- @id = if id[/https:\/\/www\.youtube\.com\/watch\?v=/] then
19
- id[/(?<=^https:\/\/www\.youtube\.com\/watch\?v=).*/]
20
- elsif id[/https:\/\/youtu\.be\//]
21
- id[/(?<=^https:\/\/youtu\.be\/).*/]
22
- else
23
- id
24
- end
19
+ @debug = debug
25
20
 
26
- s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
27
- @s = parse s
21
+ @id = id[/https?:\/\//] ? YoutubeID.from(id) : id
22
+
23
+ # Fetching the transcript from the following statement no longer works.
24
+ # Instead, copy and paste the transcript from the YouTube video page into
25
+ # a text file and import it.
26
+ #
27
+ #s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
28
+ #@s = parse(s) unless s.empty?
28
29
 
29
30
  fetch_info(@id)
30
31
 
31
32
  end
32
-
33
+
33
34
  def to_a()
34
35
  @a
35
36
  end
@@ -41,7 +42,7 @@ class YoutubeTranscript2020
41
42
  h = {id: @id, title: @title, author: @author}
42
43
  SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
43
44
  end
44
-
45
+
45
46
  def to_text()
46
47
  @a.map(&:last).join("\n")
47
48
  end
@@ -50,16 +51,24 @@ class YoutubeTranscript2020
50
51
  #
51
52
  def import(obj)
52
53
 
53
- s = RXFHelper.read(obj).first
54
+ s = RXFReader.read(obj).first
54
55
 
55
- header, body = s.split(/-----+/,2)
56
+ if s =~ /------+/ then
57
+ header, body = s.split(/-----+/,2)
56
58
 
57
- h = SimpleConfig.new(header).to_h
58
- @id, @author, @title = h[:id], h[:author], h[:title]
59
- @s = body
60
-
61
- a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
62
- @a = a[0].zip(a[1])
59
+ h = SimpleConfig.new(header).to_h
60
+ @id, @author, @title = h[:id], h[:author], h[:title]
61
+ @s = body
62
+ else
63
+ body = obj
64
+ raw_transcript = true
65
+ end
66
+
67
+ puts 'body: ' + body[0..400] if @debug
68
+ a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
69
+ @a = a[0].zip(a[1])
70
+
71
+ @s = join_sentences(@a) if raw_transcript
63
72
 
64
73
  end
65
74
 
@@ -70,13 +79,21 @@ class YoutubeTranscript2020
70
79
  url = 'https://www.youtube.com/embed/' + @id
71
80
 
72
81
  links = @a.map do |timestamp, s|
73
-
74
- seconds = Subunit.new(units={minutes:60, hours:60},
82
+
83
+ seconds = Subunit.new(units={minutes:60, hours:60},
75
84
  timestamp.split(':').map(&:to_i)).to_i
76
85
  "<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
77
86
  % [url, seconds, timestamp, s]
78
87
  end
79
88
 
89
+ puts '@html_embed: ' + @html_embed.inspect if @debug
90
+ doc = Rexle.new(@html_embed.to_s)
91
+ puts 'before attributes'
92
+ doc.root.attributes[:name] = 'video'
93
+ embed = doc.xml(declaration: false)
94
+ puts 'embed: ' + embed.inspect if @debug
95
+ #embed = @html_embed
96
+
80
97
  <<EOF
81
98
  <!DOCTYPE html>
82
99
  <html lang="en">
@@ -87,7 +104,7 @@ class YoutubeTranscript2020
87
104
  <body>
88
105
  <div style="width: 1080px; background: white">
89
106
  <div style="float:left; width: 580px; background: white">
90
- <iframe width="560" height="315" src="#{url}&autoplay=1" name="video" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
107
+ #{embed}
91
108
  <h1>#{@title}</h1>
92
109
  </div>
93
110
  <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
@@ -103,9 +120,9 @@ EOF
103
120
  # Outputs plain text containing the headings including timestamps
104
121
  # note: This can be helpful for copying and pasting directly into a YouTube comment
105
122
  #
106
- def to_headings()
107
-
108
- @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
123
+ def to_headings()
124
+
125
+ @to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
109
126
 
110
127
  end
111
128
 
@@ -120,36 +137,53 @@ EOF
120
137
  private
121
138
 
122
139
  def fetch_info(id)
123
-
124
- url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=json"
140
+
141
+ url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
125
142
  s = Net::HTTP.get(URI(url))
126
-
127
- h = JSON.parse(s, symbolize_names: true)
128
- @title = h[:title]
129
- @author = h[:author_name]
130
-
131
- end
132
143
 
133
- def parse(s)
144
+ e = Rexle.new(s).root
134
145
 
135
- doc = Rexle.new(s)
146
+ @title = e.text('title')
147
+ @author = e.text('author_name')
148
+ @html_embed = e.text('html').unescape
149
+ puts '@html_embed: ' + @html_embed.inspect if @debug
136
150
 
137
- a = doc.root.elements.each.map do |x|
138
- timestamp = Subunit.new(units={minutes:60, hours:60}, \
139
- seconds: x.attributes[:start].to_f).to_s(verbose: false)
140
- [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
141
- end
151
+ end
142
152
 
143
- @to_a = a
153
+ def join_sentences(a)
154
+
155
+ if @debug then
156
+ puts 'inside join_sentence'
157
+ puts 'a: ' + a.take(3).inspect
158
+ end
144
159
 
145
160
  a2 = []
146
161
 
147
162
  # the following cleans up sentences that start with And, Or, But, So etc.
148
163
 
149
- a.each do |time, s|
164
+ (0..a.length - 1).each do |n|
165
+
166
+ time, s = a[n]
167
+
168
+ puts 's: ' + s.inspect if @debug
169
+
170
+ if s[/^[a-z|0-9]|I\b|I'/] then
171
+
172
+ if a2.any? then
173
+
174
+ # only join two parts together if there was no full stop in
175
+ # the previous line
176
+
177
+ if a2[-1][-1] != /\.$/ then
178
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
179
+ else
180
+ a2 << [time, s]
181
+ end
182
+
183
+ else
184
+ a2 << [time, s.capitalize]
185
+ end
150
186
 
151
- if s[/^[a-z|0-9]/]then
152
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
153
187
  elsif s[/^And,? /]
154
188
  a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
155
189
  elsif s[/^Or,? /]
@@ -160,16 +194,50 @@ EOF
160
194
  a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
161
195
  elsif s[/^So,? /]
162
196
  a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
197
+ elsif s[/^\[(?:Music|Applause)\]/i]
198
+
199
+ # ignore it
200
+ puts 'ignoring action commentary' if @debug
201
+ a2 << [time, '.']
202
+
203
+ # To promote the next sentence to a new timestamp we
204
+ # capitalize the 1st letter
205
+ a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
163
206
  else
164
- a2 << [time, s]
207
+
208
+ if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
209
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
210
+ else
211
+ a2 << [time, s]
212
+ end
213
+
165
214
  end
166
215
 
167
216
  end
168
217
 
218
+ # Remove those modified entries which were labelled [Music] etc
219
+ a2.reject! {|time, s| s.length < 2}
220
+
169
221
  # formats the paragraph with the timestamp appearing above
170
222
  @a = a2
171
223
  a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
172
224
 
173
225
  end
174
226
 
227
+ def parse(s)
228
+
229
+ doc = Rexle.new(s)
230
+
231
+ a = doc.root.elements.each.map do |x|
232
+ timestamp = Subunit.new(units={minutes:60, hours:60}, \
233
+ seconds: x.attributes[:start].to_f).to_s(verbose: false)
234
+ [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
235
+ end
236
+
237
+ @to_a = a
238
+
239
+ join_sentences(a)
240
+
241
+ end
242
+
175
243
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube_transcript2020
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -11,72 +11,72 @@ cert_chain:
11
11
  - |
12
12
  -----BEGIN CERTIFICATE-----
13
13
  MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
14
- YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjAwNzI3MjI1MTUyWhcN
15
- MjEwNzI3MjI1MTUyWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
- cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxfRw+
17
- xg525jF+UNFVTtUrON2issNxWgDTq1efjPq9yMzqYrIDZREFE/3fgYbtAqA1Ut94
18
- 2h8mAKnAg1CC4plPA8o15f+h30TPRaxZXFmYUMxTkaLHL4Lvzd1D7eXqRYf9SFQM
19
- EvoYbncj9QwR57WcVF/MTdwbyyiZo3CGzwmWNb9OCIZtvs8m/UOzAmbfF3lIKz9k
20
- +ZK03KqYhyjuAiVhF39LdWUc1AWqu5i+JpFE+Lzfqv1uAjjgshmUkHOXkpWOorHc
21
- uxL0+xZXWgTwpa1QCw3cQY1LW45QjZt4ckA9lOub1LvUTDCvZocNS+dlIUMdW0mP
22
- jFII/nX/KWxW+NOmkWBpdGbXmY5QTppwx88r+VRpTdhepVcNiiHhMsYQsLI/fzVo
23
- kWTib/aBnAoahtlbaldC+e03GPsLPmpTl4ZjOFqUuAyq47h42NYt6kPY/y7Gj8To
24
- fx4pNgddR/r/WABaNao8Q+tzIxgQwCf1rijvfJP+u04GCmIeFm8oQ1x0XkUCAwEA
25
- AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQU1nkRML1E
26
- Q0PgH/jEHBOQSUTi4MYwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
14
+ YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
15
+ MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
16
+ cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
17
+ FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
18
+ HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
19
+ WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
20
+ rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
21
+ oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
22
+ HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
23
+ gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
24
+ PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
25
+ AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
26
+ rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
27
27
  c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
28
- BgkqhkiG9w0BAQsFAAOCAYEAWjyRzOnO0k/P2YHBsie5hNyJq6q7zb9bto2WYF1L
29
- N0/cvumuBsJMDUuPlD9RFvzncZbu//hbnZbK6cxiptm9HUN+m7zNi8XUcDHQw4Ba
30
- 17ZyHWKM2pkf+PJb4waQVeqyUXjbM9r6L8cVa1gkalU6ZpqEtBmkEzJCDZVf0Fll
31
- KrPYWAW5cC7EWeDm1yxusOqzxnkBcXMnKYNJm8KU4YfVpgPXJy9bTLWhm482BlJm
32
- v6wUZwYOM9B7x3dWbbsQXSuKmFqoxiNRWaA41qUS6eVjXpd4Gn/diSzntaX/Whew
33
- dCXyioQY49CVGJg8LpX/zSYUk9dns+fCSeUUfKjv2K8WuzVkS/uMA8DxSeYBfxf5
34
- ON+xcGIy3Nk7FHwY+CuIIa4WCJYB+1bVFeyCaRlCpwHK8DGUxP5PzCb44USGTI2V
35
- 42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
36
- 6ZSQYo0XuSVg3by/5kp1TrrS
28
+ BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
29
+ r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
30
+ WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
31
+ ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
32
+ rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
33
+ 041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
34
+ T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
35
+ Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
36
+ bYe6x24BppPTKnvGv7iKJQHe
37
37
  -----END CERTIFICATE-----
38
- date: 2020-07-28 00:00:00.000000000 Z
38
+ date: 2022-03-21 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: yawc
42
42
  requirement: !ruby/object:Gem::Requirement
43
43
  requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: 0.2.0
47
44
  - - "~>"
48
45
  - !ruby/object:Gem::Version
49
- version: '0.2'
46
+ version: '0.3'
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 0.3.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: 0.2.0
57
54
  - - "~>"
58
55
  - !ruby/object:Gem::Version
59
- version: '0.2'
56
+ version: '0.3'
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: 0.3.0
60
60
  - !ruby/object:Gem::Dependency
61
61
  name: subunit
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.5'
66
+ version: '0.8'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.5.2
69
+ version: 0.8.5
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.5'
76
+ version: '0.8'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.5.2
79
+ version: 0.8.5
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: simple-config
82
82
  requirement: !ruby/object:Gem::Requirement
@@ -86,7 +86,7 @@ dependencies:
86
86
  version: '0.7'
87
87
  - - ">="
88
88
  - !ruby/object:Gem::Version
89
- version: 0.7.1
89
+ version: 0.7.2
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
@@ -96,9 +96,29 @@ dependencies:
96
96
  version: '0.7'
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
- version: 0.7.1
99
+ version: 0.7.2
100
+ - !ruby/object:Gem::Dependency
101
+ name: youtube_id
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '0.1'
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 0.1.0
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '0.1'
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 0.1.0
100
120
  description:
101
- email: james@jamesrobertson.eu
121
+ email: digital.robertson@gmail.com
102
122
  executables: []
103
123
  extensions: []
104
124
  extra_rdoc_files: []
@@ -123,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
123
143
  - !ruby/object:Gem::Version
124
144
  version: '0'
125
145
  requirements: []
126
- rubygems_version: 3.0.3
146
+ rubygems_version: 3.2.22
127
147
  signing_key:
128
148
  specification_version: 4
129
149
  summary: Makes it easier to digest a Youtube video by reading the transcript.
metadata.gz.sig CHANGED
Binary file