youtube_transcript2020 0.2.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/youtube_transcript2020.rb +114 -46
- data.tar.gz.sig +0 -0
- metadata +60 -40
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9462fe85e21ee061f717e564efb7b66bec2e62b1c454228d6fb8f31633f7363d
|
4
|
+
data.tar.gz: bb04a03be0cb61058682ce4d9c1159666e41feb2a794fe65d07fbae418412056
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8d4f57bb617e07f647727eaf56c81f1a7efc5eaccc71c323b54e887a270691f01f1d58cc64fe505a40ad6bd5ebd2cea3ea5fc87b7950abe773697acb5e7cc4b
|
7
|
+
data.tar.gz: 9a9a4e72f99f70aca736171490bba54d563aa722576ce6d91435177ed9f0fb327080d84ead963474ab8e46cd1c2dfc4709d2ab93c20853c2a376b091399ed6ad
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
require 'yawc'
|
6
6
|
require 'subunit'
|
7
|
+
require 'youtube_id'
|
7
8
|
require 'simple-config'
|
8
9
|
|
9
10
|
|
@@ -11,25 +12,25 @@ class YoutubeTranscript2020
|
|
11
12
|
|
12
13
|
attr_reader :to_a, :author, :id, :title
|
13
14
|
|
14
|
-
def initialize(id=nil)
|
15
|
+
def initialize(id=nil, debug: false)
|
15
16
|
|
16
17
|
return unless id
|
17
18
|
|
18
|
-
@
|
19
|
-
id[/(?<=^https:\/\/www\.youtube\.com\/watch\?v=).*/]
|
20
|
-
elsif id[/https:\/\/youtu\.be\//]
|
21
|
-
id[/(?<=^https:\/\/youtu\.be\/).*/]
|
22
|
-
else
|
23
|
-
id
|
24
|
-
end
|
19
|
+
@debug = debug
|
25
20
|
|
26
|
-
|
27
|
-
|
21
|
+
@id = id[/https?:\/\//] ? YoutubeID.from(id) : id
|
22
|
+
|
23
|
+
# Fetching the transcript from the following statement no longer works.
|
24
|
+
# Instead, copy and paste the transcript from the YouTube video page into
|
25
|
+
# a text file and import it.
|
26
|
+
#
|
27
|
+
#s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
|
28
|
+
#@s = parse(s) unless s.empty?
|
28
29
|
|
29
30
|
fetch_info(@id)
|
30
31
|
|
31
32
|
end
|
32
|
-
|
33
|
+
|
33
34
|
def to_a()
|
34
35
|
@a
|
35
36
|
end
|
@@ -41,7 +42,7 @@ class YoutubeTranscript2020
|
|
41
42
|
h = {id: @id, title: @title, author: @author}
|
42
43
|
SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
|
43
44
|
end
|
44
|
-
|
45
|
+
|
45
46
|
def to_text()
|
46
47
|
@a.map(&:last).join("\n")
|
47
48
|
end
|
@@ -50,16 +51,24 @@ class YoutubeTranscript2020
|
|
50
51
|
#
|
51
52
|
def import(obj)
|
52
53
|
|
53
|
-
s =
|
54
|
+
s = RXFReader.read(obj).first
|
54
55
|
|
55
|
-
|
56
|
+
if s =~ /------+/ then
|
57
|
+
header, body = s.split(/-----+/,2)
|
56
58
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
59
|
+
h = SimpleConfig.new(header).to_h
|
60
|
+
@id, @author, @title = h[:id], h[:author], h[:title]
|
61
|
+
@s = body
|
62
|
+
else
|
63
|
+
body = obj
|
64
|
+
raw_transcript = true
|
65
|
+
end
|
66
|
+
|
67
|
+
puts 'body: ' + body[0..400] if @debug
|
68
|
+
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
69
|
+
@a = a[0].zip(a[1])
|
70
|
+
|
71
|
+
@s = join_sentences(@a) if raw_transcript
|
63
72
|
|
64
73
|
end
|
65
74
|
|
@@ -70,13 +79,21 @@ class YoutubeTranscript2020
|
|
70
79
|
url = 'https://www.youtube.com/embed/' + @id
|
71
80
|
|
72
81
|
links = @a.map do |timestamp, s|
|
73
|
-
|
74
|
-
seconds = Subunit.new(units={minutes:60, hours:60},
|
82
|
+
|
83
|
+
seconds = Subunit.new(units={minutes:60, hours:60},
|
75
84
|
timestamp.split(':').map(&:to_i)).to_i
|
76
85
|
"<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
|
77
86
|
% [url, seconds, timestamp, s]
|
78
87
|
end
|
79
88
|
|
89
|
+
puts '@html_embed: ' + @html_embed.inspect if @debug
|
90
|
+
doc = Rexle.new(@html_embed.to_s)
|
91
|
+
puts 'before attributes'
|
92
|
+
doc.root.attributes[:name] = 'video'
|
93
|
+
embed = doc.xml(declaration: false)
|
94
|
+
puts 'embed: ' + embed.inspect if @debug
|
95
|
+
#embed = @html_embed
|
96
|
+
|
80
97
|
<<EOF
|
81
98
|
<!DOCTYPE html>
|
82
99
|
<html lang="en">
|
@@ -87,7 +104,7 @@ class YoutubeTranscript2020
|
|
87
104
|
<body>
|
88
105
|
<div style="width: 1080px; background: white">
|
89
106
|
<div style="float:left; width: 580px; background: white">
|
90
|
-
|
107
|
+
#{embed}
|
91
108
|
<h1>#{@title}</h1>
|
92
109
|
</div>
|
93
110
|
<div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
|
@@ -103,9 +120,9 @@ EOF
|
|
103
120
|
# Outputs plain text containing the headings including timestamps
|
104
121
|
# note: This can be helpful for copying and pasting directly into a YouTube comment
|
105
122
|
#
|
106
|
-
def to_headings()
|
107
|
-
|
108
|
-
@to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
|
123
|
+
def to_headings()
|
124
|
+
|
125
|
+
@to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
|
109
126
|
|
110
127
|
end
|
111
128
|
|
@@ -120,36 +137,53 @@ EOF
|
|
120
137
|
private
|
121
138
|
|
122
139
|
def fetch_info(id)
|
123
|
-
|
124
|
-
url = "
|
140
|
+
|
141
|
+
url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
|
125
142
|
s = Net::HTTP.get(URI(url))
|
126
|
-
|
127
|
-
h = JSON.parse(s, symbolize_names: true)
|
128
|
-
@title = h[:title]
|
129
|
-
@author = h[:author_name]
|
130
|
-
|
131
|
-
end
|
132
143
|
|
133
|
-
|
144
|
+
e = Rexle.new(s).root
|
134
145
|
|
135
|
-
|
146
|
+
@title = e.text('title')
|
147
|
+
@author = e.text('author_name')
|
148
|
+
@html_embed = e.text('html').unescape
|
149
|
+
puts '@html_embed: ' + @html_embed.inspect if @debug
|
136
150
|
|
137
|
-
|
138
|
-
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
139
|
-
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
140
|
-
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
141
|
-
end
|
151
|
+
end
|
142
152
|
|
143
|
-
|
153
|
+
def join_sentences(a)
|
154
|
+
|
155
|
+
if @debug then
|
156
|
+
puts 'inside join_sentence'
|
157
|
+
puts 'a: ' + a.take(3).inspect
|
158
|
+
end
|
144
159
|
|
145
160
|
a2 = []
|
146
161
|
|
147
162
|
# the following cleans up sentences that start with And, Or, But, So etc.
|
148
163
|
|
149
|
-
a.each do |
|
164
|
+
(0..a.length - 1).each do |n|
|
165
|
+
|
166
|
+
time, s = a[n]
|
167
|
+
|
168
|
+
puts 's: ' + s.inspect if @debug
|
169
|
+
|
170
|
+
if s[/^[a-z|0-9]|I\b|I'/] then
|
171
|
+
|
172
|
+
if a2.any? then
|
173
|
+
|
174
|
+
# only join two parts together if there was no full stop in
|
175
|
+
# the previous line
|
176
|
+
|
177
|
+
if a2[-1][-1] != /\.$/ then
|
178
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
179
|
+
else
|
180
|
+
a2 << [time, s]
|
181
|
+
end
|
182
|
+
|
183
|
+
else
|
184
|
+
a2 << [time, s.capitalize]
|
185
|
+
end
|
150
186
|
|
151
|
-
if s[/^[a-z|0-9]/]then
|
152
|
-
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
153
187
|
elsif s[/^And,? /]
|
154
188
|
a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
|
155
189
|
elsif s[/^Or,? /]
|
@@ -160,16 +194,50 @@ EOF
|
|
160
194
|
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
161
195
|
elsif s[/^So,? /]
|
162
196
|
a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
|
197
|
+
elsif s[/^\[(?:Music|Applause)\]/i]
|
198
|
+
|
199
|
+
# ignore it
|
200
|
+
puts 'ignoring action commentary' if @debug
|
201
|
+
a2 << [time, '.']
|
202
|
+
|
203
|
+
# To promote the next sentence to a new timestamp we
|
204
|
+
# capitalize the 1st letter
|
205
|
+
a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
|
163
206
|
else
|
164
|
-
|
207
|
+
|
208
|
+
if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
|
209
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
210
|
+
else
|
211
|
+
a2 << [time, s]
|
212
|
+
end
|
213
|
+
|
165
214
|
end
|
166
215
|
|
167
216
|
end
|
168
217
|
|
218
|
+
# Remove those modified entries which were labelled [Music] etc
|
219
|
+
a2.reject! {|time, s| s.length < 2}
|
220
|
+
|
169
221
|
# formats the paragraph with the timestamp appearing above
|
170
222
|
@a = a2
|
171
223
|
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
172
224
|
|
173
225
|
end
|
174
226
|
|
227
|
+
def parse(s)
|
228
|
+
|
229
|
+
doc = Rexle.new(s)
|
230
|
+
|
231
|
+
a = doc.root.elements.each.map do |x|
|
232
|
+
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
233
|
+
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
234
|
+
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
235
|
+
end
|
236
|
+
|
237
|
+
@to_a = a
|
238
|
+
|
239
|
+
join_sentences(a)
|
240
|
+
|
241
|
+
end
|
242
|
+
|
175
243
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: youtube_transcript2020
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -11,72 +11,72 @@ cert_chain:
|
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
13
|
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
|
15
|
+
MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
|
17
|
+
FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
|
18
|
+
HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
|
19
|
+
WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
|
20
|
+
rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
|
21
|
+
oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
|
22
|
+
HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
|
23
|
+
gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
|
24
|
+
PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
|
26
|
+
rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
27
|
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
|
29
|
+
r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
|
30
|
+
WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
|
31
|
+
ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
|
32
|
+
rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
|
33
|
+
041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
|
34
|
+
T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
|
35
|
+
Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
|
36
|
+
bYe6x24BppPTKnvGv7iKJQHe
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date:
|
38
|
+
date: 2022-03-21 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: yawc
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- - ">="
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version: 0.2.0
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.3.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
54
|
- - "~>"
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 0.3.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: subunit
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.8'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.5
|
69
|
+
version: 0.8.5
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.8'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.5
|
79
|
+
version: 0.8.5
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: simple-config
|
82
82
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,7 +86,7 @@ dependencies:
|
|
86
86
|
version: '0.7'
|
87
87
|
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.7.
|
89
|
+
version: 0.7.2
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -96,9 +96,29 @@ dependencies:
|
|
96
96
|
version: '0.7'
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
|
-
version: 0.7.
|
99
|
+
version: 0.7.2
|
100
|
+
- !ruby/object:Gem::Dependency
|
101
|
+
name: youtube_id
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - "~>"
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0.1'
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 0.1.0
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - "~>"
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0.1'
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 0.1.0
|
100
120
|
description:
|
101
|
-
email:
|
121
|
+
email: digital.robertson@gmail.com
|
102
122
|
executables: []
|
103
123
|
extensions: []
|
104
124
|
extra_rdoc_files: []
|
@@ -123,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
123
143
|
- !ruby/object:Gem::Version
|
124
144
|
version: '0'
|
125
145
|
requirements: []
|
126
|
-
rubygems_version: 3.
|
146
|
+
rubygems_version: 3.2.22
|
127
147
|
signing_key:
|
128
148
|
specification_version: 4
|
129
149
|
summary: Makes it easier to digest a Youtube video by reading the transcript.
|
metadata.gz.sig
CHANGED
Binary file
|