youtube_transcript2020 0.3.0 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/youtube_transcript2020.rb +99 -49
- data.tar.gz.sig +0 -0
- metadata +46 -46
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10d150f7396c9e0e18db51381fc5f262eb83595b4f3d5aad313192d86041071c
|
4
|
+
data.tar.gz: 607ae0291272ff40d08db5398f465d99872747e300500c214602b754c43811bb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 244d0242415308c65b07d9bfb09f6776fe3f67817dadbeb5189b4bb5756a4733668fde8b139f636c361598cb6e623ffbdfbad40c3675fd78b74df17701514183
|
7
|
+
data.tar.gz: '07884cdc2b52ad2ac5cf86814f8ae88aa864109d24584e2b256b65b0b8c8168ac7d08f71ce5f2343751b4bbcfe6d81960d4b81b50d4c49109d5b0855198761b1'
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
@@ -3,34 +3,36 @@
|
|
3
3
|
# file: youtube_transcript2020.rb
|
4
4
|
|
5
5
|
require 'yawc'
|
6
|
+
require 'json'
|
6
7
|
require 'subunit'
|
7
8
|
require 'youtube_id'
|
8
9
|
require 'simple-config'
|
9
10
|
|
11
|
+
# https://github.com/jdepoix/youtube-transcript-api
|
10
12
|
|
11
13
|
class YoutubeTranscript2020
|
12
14
|
|
13
15
|
attr_reader :to_a, :author, :id, :title
|
14
16
|
|
15
|
-
def initialize(id=nil, debug: false)
|
17
|
+
def initialize(id=nil, debug: false)
|
16
18
|
|
17
19
|
return unless id
|
18
|
-
|
20
|
+
|
19
21
|
@debug = debug
|
20
22
|
|
21
|
-
@id =
|
22
|
-
YoutubeID.from(id)
|
23
|
-
else
|
24
|
-
id
|
25
|
-
end
|
23
|
+
@id = id[/https?:\/\//] ? YoutubeID.from(id) : id
|
26
24
|
|
27
|
-
|
28
|
-
|
25
|
+
# Fetching the transcript from the following statement no longer works.
|
26
|
+
# Instead, copy and paste the transcript from the YouTube video page into
|
27
|
+
# a text file and import it.
|
28
|
+
#
|
29
|
+
#s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
|
30
|
+
#@s = parse(s) unless s.empty?
|
29
31
|
|
30
32
|
fetch_info(@id)
|
31
33
|
|
32
34
|
end
|
33
|
-
|
35
|
+
|
34
36
|
def to_a()
|
35
37
|
@a
|
36
38
|
end
|
@@ -42,7 +44,7 @@ class YoutubeTranscript2020
|
|
42
44
|
h = {id: @id, title: @title, author: @author}
|
43
45
|
SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
|
44
46
|
end
|
45
|
-
|
47
|
+
|
46
48
|
def to_text()
|
47
49
|
@a.map(&:last).join("\n")
|
48
50
|
end
|
@@ -51,21 +53,33 @@ class YoutubeTranscript2020
|
|
51
53
|
#
|
52
54
|
def import(obj)
|
53
55
|
|
54
|
-
s =
|
56
|
+
s = RXFReader.read(obj).first
|
55
57
|
|
56
58
|
if s =~ /------+/ then
|
59
|
+
|
57
60
|
header, body = s.split(/-----+/,2)
|
58
61
|
|
59
62
|
h = SimpleConfig.new(header).to_h
|
60
63
|
@id, @author, @title = h[:id], h[:author], h[:title]
|
61
64
|
@s = body
|
65
|
+
|
66
|
+
elsif File.extname(obj) == '.json'
|
67
|
+
|
68
|
+
r = JSON.parse(s)
|
69
|
+
@a = r.map {|x| [x['start'], x['text']]}
|
70
|
+
@s = join_sentences(@a)
|
71
|
+
|
72
|
+
return
|
73
|
+
|
62
74
|
else
|
75
|
+
|
63
76
|
body = obj
|
64
77
|
raw_transcript = true
|
78
|
+
|
65
79
|
end
|
66
80
|
|
67
81
|
puts 'body: ' + body[0..400] if @debug
|
68
|
-
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
82
|
+
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
69
83
|
@a = a[0].zip(a[1])
|
70
84
|
|
71
85
|
@s = join_sentences(@a) if raw_transcript
|
@@ -79,13 +93,21 @@ class YoutubeTranscript2020
|
|
79
93
|
url = 'https://www.youtube.com/embed/' + @id
|
80
94
|
|
81
95
|
links = @a.map do |timestamp, s|
|
82
|
-
|
83
|
-
seconds = Subunit.new(units={minutes:60, hours:60},
|
96
|
+
|
97
|
+
seconds = Subunit.new(units={minutes:60, hours:60},
|
84
98
|
timestamp.split(':').map(&:to_i)).to_i
|
85
99
|
"<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
|
86
100
|
% [url, seconds, timestamp, s]
|
87
101
|
end
|
88
102
|
|
103
|
+
puts '@html_embed: ' + @html_embed.inspect if @debug
|
104
|
+
doc = Rexle.new(@html_embed.to_s)
|
105
|
+
puts 'before attributes'
|
106
|
+
doc.root.attributes[:name] = 'video'
|
107
|
+
embed = doc.xml(declaration: false)
|
108
|
+
puts 'embed: ' + embed.inspect if @debug
|
109
|
+
#embed = @html_embed
|
110
|
+
|
89
111
|
<<EOF
|
90
112
|
<!DOCTYPE html>
|
91
113
|
<html lang="en">
|
@@ -96,7 +118,7 @@ class YoutubeTranscript2020
|
|
96
118
|
<body>
|
97
119
|
<div style="width: 1080px; background: white">
|
98
120
|
<div style="float:left; width: 580px; background: white">
|
99
|
-
#{
|
121
|
+
#{embed}
|
100
122
|
<h1>#{@title}</h1>
|
101
123
|
</div>
|
102
124
|
<div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
|
@@ -112,9 +134,9 @@ EOF
|
|
112
134
|
# Outputs plain text containing the headings including timestamps
|
113
135
|
# note: This can be helpful for copying and pasting directly into a YouTube comment
|
114
136
|
#
|
115
|
-
def to_headings()
|
116
|
-
|
117
|
-
@to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
|
137
|
+
def to_headings()
|
138
|
+
|
139
|
+
@to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
|
118
140
|
|
119
141
|
end
|
120
142
|
|
@@ -129,52 +151,53 @@ EOF
|
|
129
151
|
private
|
130
152
|
|
131
153
|
def fetch_info(id)
|
132
|
-
|
133
|
-
url = "
|
154
|
+
|
155
|
+
url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
|
134
156
|
s = Net::HTTP.get(URI(url))
|
135
|
-
|
157
|
+
|
136
158
|
e = Rexle.new(s).root
|
137
|
-
|
159
|
+
|
138
160
|
@title = e.text('title')
|
139
161
|
@author = e.text('author_name')
|
140
162
|
@html_embed = e.text('html').unescape
|
141
|
-
|
163
|
+
puts '@html_embed: ' + @html_embed.inspect if @debug
|
164
|
+
|
142
165
|
end
|
143
|
-
|
166
|
+
|
144
167
|
def join_sentences(a)
|
145
|
-
|
168
|
+
|
146
169
|
if @debug then
|
147
170
|
puts 'inside join_sentence'
|
148
171
|
puts 'a: ' + a.take(3).inspect
|
149
172
|
end
|
150
|
-
|
173
|
+
|
151
174
|
a2 = []
|
152
175
|
|
153
176
|
# the following cleans up sentences that start with And, Or, But, So etc.
|
154
177
|
|
155
|
-
a.each do |
|
178
|
+
(0..a.length - 1).each do |n|
|
179
|
+
|
180
|
+
time, s = a[n]
|
181
|
+
|
182
|
+
puts 's: ' + s.inspect if @debug
|
183
|
+
|
184
|
+
if s[/^[a-z|0-9]|I\b|I'/] then
|
156
185
|
|
157
|
-
puts 'raws: ' + raws.inspect if @debug
|
158
|
-
|
159
|
-
s = raws.sub(/^\W+/,'')
|
160
|
-
|
161
|
-
if s[/^[a-z|0-9]|I\b|I'/]then
|
162
|
-
|
163
186
|
if a2.any? then
|
164
|
-
|
165
|
-
# only join two parts together if there was no full stop in
|
187
|
+
|
188
|
+
# only join two parts together if there was no full stop in
|
166
189
|
# the previous line
|
167
|
-
|
190
|
+
|
168
191
|
if a2[-1][-1] != /\.$/ then
|
169
|
-
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
192
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
170
193
|
else
|
171
194
|
a2 << [time, s]
|
172
195
|
end
|
173
|
-
|
174
|
-
else
|
196
|
+
|
197
|
+
else
|
175
198
|
a2 << [time, s.capitalize]
|
176
199
|
end
|
177
|
-
|
200
|
+
|
178
201
|
elsif s[/^And,? /]
|
179
202
|
a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
|
180
203
|
elsif s[/^Or,? /]
|
@@ -184,32 +207,59 @@ EOF
|
|
184
207
|
elsif s[/^"/]
|
185
208
|
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
186
209
|
elsif s[/^So,? /]
|
187
|
-
|
188
|
-
|
210
|
+
|
211
|
+
puts 'so? a2[-1]' + a2[-1].inspect if @debug
|
212
|
+
|
213
|
+
if a2.empty? then
|
214
|
+
a2 << [time, s.sub(/^So,? /,'').capitalize]
|
215
|
+
else
|
216
|
+
a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
|
217
|
+
end
|
218
|
+
elsif s[/^\[(?:Music|Applause)\]/i]
|
219
|
+
|
189
220
|
# ignore it
|
221
|
+
puts 'ignoring action commentary' if @debug
|
222
|
+
a2 << [time, '.']
|
223
|
+
|
224
|
+
# To promote the next sentence to a new timestamp we
|
225
|
+
# capitalize the 1st letter
|
226
|
+
a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
|
190
227
|
else
|
191
|
-
|
228
|
+
|
192
229
|
if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
|
193
|
-
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
230
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
194
231
|
else
|
195
232
|
a2 << [time, s]
|
196
233
|
end
|
197
|
-
|
234
|
+
|
198
235
|
end
|
199
236
|
|
200
237
|
end
|
201
238
|
|
239
|
+
# Remove those modified entries which were labelled [Music] etc
|
240
|
+
a2.reject! {|time, s| s.length < 2}
|
241
|
+
|
202
242
|
# formats the paragraph with the timestamp appearing above
|
203
243
|
@a = a2
|
204
|
-
|
205
|
-
|
244
|
+
|
245
|
+
a2.map do |rawtime, s|
|
246
|
+
|
247
|
+
time = if rawtime.is_a? Float then
|
248
|
+
Subunit.seconds(rawtime).strfunit("%sc")
|
249
|
+
else
|
250
|
+
time
|
251
|
+
end
|
252
|
+
|
253
|
+
"\n%s\n\n%s" % [time, s]
|
254
|
+
end.join("\n")
|
255
|
+
|
206
256
|
end
|
207
257
|
|
208
258
|
def parse(s)
|
209
259
|
|
210
260
|
doc = Rexle.new(s)
|
211
261
|
|
212
|
-
a = doc.root.elements.each.map do |x|
|
262
|
+
a = doc.root.elements.each.map do |x|
|
213
263
|
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
214
264
|
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
215
265
|
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: youtube_transcript2020
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -11,72 +11,72 @@ cert_chain:
|
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
13
|
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
|
15
|
+
MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
|
17
|
+
FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
|
18
|
+
HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
|
19
|
+
WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
|
20
|
+
rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
|
21
|
+
oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
|
22
|
+
HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
|
23
|
+
gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
|
24
|
+
PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
|
26
|
+
rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
27
|
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
|
29
|
+
r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
|
30
|
+
WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
|
31
|
+
ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
|
32
|
+
rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
|
33
|
+
041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
|
34
|
+
T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
|
35
|
+
Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
|
36
|
+
bYe6x24BppPTKnvGv7iKJQHe
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date:
|
38
|
+
date: 2022-03-22 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: yawc
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- - ">="
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version: 0.2.0
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.3.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
54
|
- - "~>"
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 0.3.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: subunit
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.8'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.8.7
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.8'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.8.7
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: simple-config
|
82
82
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,7 +86,7 @@ dependencies:
|
|
86
86
|
version: '0.7'
|
87
87
|
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.7.
|
89
|
+
version: 0.7.2
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -96,29 +96,29 @@ dependencies:
|
|
96
96
|
version: '0.7'
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
|
-
version: 0.7.
|
99
|
+
version: 0.7.2
|
100
100
|
- !ruby/object:Gem::Dependency
|
101
101
|
name: youtube_id
|
102
102
|
requirement: !ruby/object:Gem::Requirement
|
103
103
|
requirements:
|
104
|
-
- - ">="
|
105
|
-
- !ruby/object:Gem::Version
|
106
|
-
version: 0.1.0
|
107
104
|
- - "~>"
|
108
105
|
- !ruby/object:Gem::Version
|
109
106
|
version: '0.1'
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 0.1.0
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
112
|
version_requirements: !ruby/object:Gem::Requirement
|
113
113
|
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: 0.1.0
|
117
114
|
- - "~>"
|
118
115
|
- !ruby/object:Gem::Version
|
119
116
|
version: '0.1'
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 0.1.0
|
120
120
|
description:
|
121
|
-
email:
|
121
|
+
email: digital.robertson@gmail.com
|
122
122
|
executables: []
|
123
123
|
extensions: []
|
124
124
|
extra_rdoc_files: []
|
@@ -143,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
143
143
|
- !ruby/object:Gem::Version
|
144
144
|
version: '0'
|
145
145
|
requirements: []
|
146
|
-
rubygems_version: 3.
|
146
|
+
rubygems_version: 3.2.22
|
147
147
|
signing_key:
|
148
148
|
specification_version: 4
|
149
149
|
summary: Makes it easier to digest a Youtube video by reading the transcript.
|
metadata.gz.sig
CHANGED
Binary file
|