youtube_transcript2020 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/youtube_transcript2020.rb +47 -43
- data.tar.gz.sig +0 -0
- metadata +46 -46
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9462fe85e21ee061f717e564efb7b66bec2e62b1c454228d6fb8f31633f7363d
|
4
|
+
data.tar.gz: bb04a03be0cb61058682ce4d9c1159666e41feb2a794fe65d07fbae418412056
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8d4f57bb617e07f647727eaf56c81f1a7efc5eaccc71c323b54e887a270691f01f1d58cc64fe505a40ad6bd5ebd2cea3ea5fc87b7950abe773697acb5e7cc4b
|
7
|
+
data.tar.gz: 9a9a4e72f99f70aca736171490bba54d563aa722576ce6d91435177ed9f0fb327080d84ead963474ab8e46cd1c2dfc4709d2ab93c20853c2a376b091399ed6ad
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
@@ -12,21 +12,25 @@ class YoutubeTranscript2020
|
|
12
12
|
|
13
13
|
attr_reader :to_a, :author, :id, :title
|
14
14
|
|
15
|
-
def initialize(id=nil, debug: false)
|
15
|
+
def initialize(id=nil, debug: false)
|
16
16
|
|
17
17
|
return unless id
|
18
|
-
|
18
|
+
|
19
19
|
@debug = debug
|
20
20
|
|
21
21
|
@id = id[/https?:\/\//] ? YoutubeID.from(id) : id
|
22
22
|
|
23
|
-
|
24
|
-
|
23
|
+
# Fetching the transcript from the following statement no longer works.
|
24
|
+
# Instead, copy and paste the transcript from the YouTube video page into
|
25
|
+
# a text file and import it.
|
26
|
+
#
|
27
|
+
#s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
|
28
|
+
#@s = parse(s) unless s.empty?
|
25
29
|
|
26
30
|
fetch_info(@id)
|
27
31
|
|
28
32
|
end
|
29
|
-
|
33
|
+
|
30
34
|
def to_a()
|
31
35
|
@a
|
32
36
|
end
|
@@ -38,7 +42,7 @@ class YoutubeTranscript2020
|
|
38
42
|
h = {id: @id, title: @title, author: @author}
|
39
43
|
SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
|
40
44
|
end
|
41
|
-
|
45
|
+
|
42
46
|
def to_text()
|
43
47
|
@a.map(&:last).join("\n")
|
44
48
|
end
|
@@ -47,7 +51,7 @@ class YoutubeTranscript2020
|
|
47
51
|
#
|
48
52
|
def import(obj)
|
49
53
|
|
50
|
-
s =
|
54
|
+
s = RXFReader.read(obj).first
|
51
55
|
|
52
56
|
if s =~ /------+/ then
|
53
57
|
header, body = s.split(/-----+/,2)
|
@@ -61,7 +65,7 @@ class YoutubeTranscript2020
|
|
61
65
|
end
|
62
66
|
|
63
67
|
puts 'body: ' + body[0..400] if @debug
|
64
|
-
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
68
|
+
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
65
69
|
@a = a[0].zip(a[1])
|
66
70
|
|
67
71
|
@s = join_sentences(@a) if raw_transcript
|
@@ -75,13 +79,13 @@ class YoutubeTranscript2020
|
|
75
79
|
url = 'https://www.youtube.com/embed/' + @id
|
76
80
|
|
77
81
|
links = @a.map do |timestamp, s|
|
78
|
-
|
79
|
-
seconds = Subunit.new(units={minutes:60, hours:60},
|
82
|
+
|
83
|
+
seconds = Subunit.new(units={minutes:60, hours:60},
|
80
84
|
timestamp.split(':').map(&:to_i)).to_i
|
81
85
|
"<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> " \
|
82
86
|
% [url, seconds, timestamp, s]
|
83
87
|
end
|
84
|
-
|
88
|
+
|
85
89
|
puts '@html_embed: ' + @html_embed.inspect if @debug
|
86
90
|
doc = Rexle.new(@html_embed.to_s)
|
87
91
|
puts 'before attributes'
|
@@ -116,9 +120,9 @@ EOF
|
|
116
120
|
# Outputs plain text containing the headings including timestamps
|
117
121
|
# note: This can be helpful for copying and pasting directly into a YouTube comment
|
118
122
|
#
|
119
|
-
def to_headings()
|
120
|
-
|
121
|
-
@to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
|
123
|
+
def to_headings()
|
124
|
+
|
125
|
+
@to_a.select {|timestamp, _| timestamp =~ / /}.map(&:first)
|
122
126
|
|
123
127
|
end
|
124
128
|
|
@@ -133,53 +137,53 @@ EOF
|
|
133
137
|
private
|
134
138
|
|
135
139
|
def fetch_info(id)
|
136
|
-
|
137
|
-
url = "
|
140
|
+
|
141
|
+
url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
|
138
142
|
s = Net::HTTP.get(URI(url))
|
139
|
-
|
143
|
+
|
140
144
|
e = Rexle.new(s).root
|
141
|
-
|
145
|
+
|
142
146
|
@title = e.text('title')
|
143
147
|
@author = e.text('author_name')
|
144
148
|
@html_embed = e.text('html').unescape
|
145
149
|
puts '@html_embed: ' + @html_embed.inspect if @debug
|
146
|
-
|
150
|
+
|
147
151
|
end
|
148
|
-
|
152
|
+
|
149
153
|
def join_sentences(a)
|
150
|
-
|
154
|
+
|
151
155
|
if @debug then
|
152
156
|
puts 'inside join_sentence'
|
153
157
|
puts 'a: ' + a.take(3).inspect
|
154
158
|
end
|
155
|
-
|
159
|
+
|
156
160
|
a2 = []
|
157
161
|
|
158
162
|
# the following cleans up sentences that start with And, Or, But, So etc.
|
159
163
|
|
160
164
|
(0..a.length - 1).each do |n|
|
161
|
-
|
165
|
+
|
162
166
|
time, s = a[n]
|
163
167
|
|
164
|
-
puts 's: ' + s.inspect if @debug
|
165
|
-
|
168
|
+
puts 's: ' + s.inspect if @debug
|
169
|
+
|
166
170
|
if s[/^[a-z|0-9]|I\b|I'/] then
|
167
|
-
|
171
|
+
|
168
172
|
if a2.any? then
|
169
|
-
|
170
|
-
# only join two parts together if there was no full stop in
|
173
|
+
|
174
|
+
# only join two parts together if there was no full stop in
|
171
175
|
# the previous line
|
172
|
-
|
176
|
+
|
173
177
|
if a2[-1][-1] != /\.$/ then
|
174
|
-
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
178
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
175
179
|
else
|
176
180
|
a2 << [time, s]
|
177
181
|
end
|
178
|
-
|
179
|
-
else
|
182
|
+
|
183
|
+
else
|
180
184
|
a2 << [time, s.capitalize]
|
181
185
|
end
|
182
|
-
|
186
|
+
|
183
187
|
elsif s[/^And,? /]
|
184
188
|
a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
|
185
189
|
elsif s[/^Or,? /]
|
@@ -191,40 +195,40 @@ EOF
|
|
191
195
|
elsif s[/^So,? /]
|
192
196
|
a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
|
193
197
|
elsif s[/^\[(?:Music|Applause)\]/i]
|
194
|
-
|
198
|
+
|
195
199
|
# ignore it
|
196
200
|
puts 'ignoring action commentary' if @debug
|
197
201
|
a2 << [time, '.']
|
198
|
-
|
199
|
-
# To promote the next sentence to a new timestamp we
|
202
|
+
|
203
|
+
# To promote the next sentence to a new timestamp we
|
200
204
|
# capitalize the 1st letter
|
201
205
|
a[n+1][-1] = a[n+1][-1].capitalize if a[n+1]
|
202
206
|
else
|
203
|
-
|
207
|
+
|
204
208
|
if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
|
205
|
-
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
209
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
206
210
|
else
|
207
211
|
a2 << [time, s]
|
208
212
|
end
|
209
|
-
|
213
|
+
|
210
214
|
end
|
211
215
|
|
212
216
|
end
|
213
|
-
|
217
|
+
|
214
218
|
# Remove those modified entries which were labelled [Music] etc
|
215
219
|
a2.reject! {|time, s| s.length < 2}
|
216
220
|
|
217
221
|
# formats the paragraph with the timestamp appearing above
|
218
222
|
@a = a2
|
219
|
-
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
220
|
-
|
223
|
+
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
224
|
+
|
221
225
|
end
|
222
226
|
|
223
227
|
def parse(s)
|
224
228
|
|
225
229
|
doc = Rexle.new(s)
|
226
230
|
|
227
|
-
a = doc.root.elements.each.map do |x|
|
231
|
+
a = doc.root.elements.each.map do |x|
|
228
232
|
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
229
233
|
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
230
234
|
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: youtube_transcript2020
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -11,72 +11,72 @@ cert_chain:
|
|
11
11
|
- |
|
12
12
|
-----BEGIN CERTIFICATE-----
|
13
13
|
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjIwMzIxMjAxOTMzWhcN
|
15
|
+
MjMwMzIxMjAxOTMzWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCesuFG
|
17
|
+
FfZDH7Xm8kEyH9B3OczqfuJW5yZOfANTnUsB864vtSWe6Vghp6JUI0bOcOQdMvIx
|
18
|
+
HO4wfaGwvKJtjWCXYdZo2QYXjf6caY007R10GVxzsBYh8Swym0SYf33ljxX+R9DS
|
19
|
+
WwdWIv9SU0T7quNEbeXa9dtZJqlFCspmni8MB284ZpqhP2bpvfhBT58dLEUnjcRP
|
20
|
+
rcnCBEueIWYkwoZ8K4/BlYrBfgWcm9hxfBimsID0CIDqD2mhOJo/NQSrJJNWTmOt
|
21
|
+
oBZg4K2Y/GCmpxS9wQCrM4pBlTjy/mfNWIxDa9xdrIEmQtSng+7X6wvWAiJmFG7Y
|
22
|
+
HYN+ARNOx6ODVGYa/GrLWTBr4EL6RJuOD6eqpxD0hjvTczS12RFIGZh9kKXVT7wy
|
23
|
+
gkF5vdtR8uyR8Eo8mJM39Nv7yzuj8cRhCAto6aWOx+srVP/woM96qSQ7Ro0/YaeM
|
24
|
+
PHHcgZfU4HGdkCJ5Y8gaO9AzioExf2uFfV/m4+pPcBRbNkymj0+qgT/UFyMCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQUTmoD5rx8
|
26
|
+
rZ1imkIWMgtbUzNAn4YwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
27
|
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAAnULhDB7LFrqhVw2ms+IyRTJcJSfFxFcTPG5/mEW
|
29
|
+
r8pyTmXvBOr9WIid7QNaUcHTMVlt03v/XCrEex+GajjDspH+rL0iw3poTyvQHeNt
|
30
|
+
WgMJiYJH1AZYTSIPnkdkoo6ok4jb9S4B6mgX7tGcBXMq0q3B2o8YZIwRPzajDvyf
|
31
|
+
ZgP+vWq4HfkE7/sLTPRoz+WF6c+0w6NAvCPh/LT9qQjwXhtKquprkPfR3+G9tyNO
|
32
|
+
rWGzBuj63YgqWsNTF0wZLXDMAGHsJvJa2plhhkMGU7/SMxxdG25A7THeTVMNH7kM
|
33
|
+
041VYN5fokzIIVKn38M4giKliDGEWvnFnEKEeb6Hrgser85Z+P7GjC642k1FHGvb
|
34
|
+
T8Jyb5XNJAWcNTk2AspDthbjYwOYAPP1KSLoCbhABW2Dqb6Y+pDOtoHoVbQtx7Ja
|
35
|
+
Eh31Azzsjb9JoMQLQliugChaXNzGUL7z5A4jmxeBd91yoD6odSGqLbGuUwjMfyd/
|
36
|
+
bYe6x24BppPTKnvGv7iKJQHe
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date:
|
38
|
+
date: 2022-03-21 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: yawc
|
42
42
|
requirement: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
|
-
- - ">="
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version: 0.2.0
|
47
44
|
- - "~>"
|
48
45
|
- !ruby/object:Gem::Version
|
49
|
-
version: '0.
|
46
|
+
version: '0.3'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.3.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
|
-
- - ">="
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
version: 0.2.0
|
57
54
|
- - "~>"
|
58
55
|
- !ruby/object:Gem::Version
|
59
|
-
version: '0.
|
56
|
+
version: '0.3'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 0.3.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: subunit
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.8'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.5
|
69
|
+
version: 0.8.5
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.8'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.5
|
79
|
+
version: 0.8.5
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: simple-config
|
82
82
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,7 +86,7 @@ dependencies:
|
|
86
86
|
version: '0.7'
|
87
87
|
- - ">="
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 0.7.
|
89
|
+
version: 0.7.2
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -96,29 +96,29 @@ dependencies:
|
|
96
96
|
version: '0.7'
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
|
-
version: 0.7.
|
99
|
+
version: 0.7.2
|
100
100
|
- !ruby/object:Gem::Dependency
|
101
101
|
name: youtube_id
|
102
102
|
requirement: !ruby/object:Gem::Requirement
|
103
103
|
requirements:
|
104
|
-
- - ">="
|
105
|
-
- !ruby/object:Gem::Version
|
106
|
-
version: 0.1.0
|
107
104
|
- - "~>"
|
108
105
|
- !ruby/object:Gem::Version
|
109
106
|
version: '0.1'
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 0.1.0
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
112
|
version_requirements: !ruby/object:Gem::Requirement
|
113
113
|
requirements:
|
114
|
-
- - ">="
|
115
|
-
- !ruby/object:Gem::Version
|
116
|
-
version: 0.1.0
|
117
114
|
- - "~>"
|
118
115
|
- !ruby/object:Gem::Version
|
119
116
|
version: '0.1'
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 0.1.0
|
120
120
|
description:
|
121
|
-
email:
|
121
|
+
email: digital.robertson@gmail.com
|
122
122
|
executables: []
|
123
123
|
extensions: []
|
124
124
|
extra_rdoc_files: []
|
@@ -143,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
143
143
|
- !ruby/object:Gem::Version
|
144
144
|
version: '0'
|
145
145
|
requirements: []
|
146
|
-
rubygems_version: 3.
|
146
|
+
rubygems_version: 3.2.22
|
147
147
|
signing_key:
|
148
148
|
specification_version: 4
|
149
149
|
summary: Makes it easier to digest a Youtube video by reading the transcript.
|
metadata.gz.sig
CHANGED
Binary file
|