youtube_transcript2020 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/youtube_transcript2020.rb +83 -34
- metadata +22 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
|
4
|
+
data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
|
7
|
+
data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
require 'yawc'
|
6
6
|
require 'subunit'
|
7
|
+
require 'youtube_id'
|
7
8
|
require 'simple-config'
|
8
9
|
|
9
10
|
|
@@ -11,20 +12,20 @@ class YoutubeTranscript2020
|
|
11
12
|
|
12
13
|
attr_reader :to_a, :author, :id, :title
|
13
14
|
|
14
|
-
def initialize(id=nil)
|
15
|
+
def initialize(id=nil, debug: false)
|
15
16
|
|
16
17
|
return unless id
|
18
|
+
|
19
|
+
@debug = debug
|
17
20
|
|
18
|
-
@id = if id[/https
|
19
|
-
id
|
20
|
-
elsif id[/https:\/\/youtu\.be\//]
|
21
|
-
id[/(?<=^https:\/\/youtu\.be\/).*/]
|
21
|
+
@id = if id[/https?:\/\//] then
|
22
|
+
YoutubeID.from(id)
|
22
23
|
else
|
23
24
|
id
|
24
25
|
end
|
25
26
|
|
26
27
|
s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
|
27
|
-
@s = parse s
|
28
|
+
@s = parse(s) unless s.empty?
|
28
29
|
|
29
30
|
fetch_info(@id)
|
30
31
|
|
@@ -52,14 +53,22 @@ class YoutubeTranscript2020
|
|
52
53
|
|
53
54
|
s = RXFHelper.read(obj).first
|
54
55
|
|
55
|
-
|
56
|
+
if s =~ /------+/ then
|
57
|
+
header, body = s.split(/-----+/,2)
|
56
58
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
59
|
+
h = SimpleConfig.new(header).to_h
|
60
|
+
@id, @author, @title = h[:id], h[:author], h[:title]
|
61
|
+
@s = body
|
62
|
+
else
|
63
|
+
body = obj
|
64
|
+
raw_transcript = true
|
65
|
+
end
|
66
|
+
|
67
|
+
puts 'body: ' + body[0..400] if @debug
|
61
68
|
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
62
|
-
@a = a[0].zip(a[1])
|
69
|
+
@a = a[0].zip(a[1])
|
70
|
+
|
71
|
+
@s = join_sentences(@a) if raw_transcript
|
63
72
|
|
64
73
|
end
|
65
74
|
|
@@ -87,7 +96,7 @@ class YoutubeTranscript2020
|
|
87
96
|
<body>
|
88
97
|
<div style="width: 1080px; background: white">
|
89
98
|
<div style="float:left; width: 580px; background: white">
|
90
|
-
|
99
|
+
#{@html_embed}
|
91
100
|
<h1>#{@title}</h1>
|
92
101
|
</div>
|
93
102
|
<div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
|
@@ -121,35 +130,51 @@ EOF
|
|
121
130
|
|
122
131
|
def fetch_info(id)
|
123
132
|
|
124
|
-
url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=
|
133
|
+
url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
|
125
134
|
s = Net::HTTP.get(URI(url))
|
126
135
|
|
127
|
-
|
128
|
-
|
129
|
-
@
|
136
|
+
e = Rexle.new(s).root
|
137
|
+
|
138
|
+
@title = e.text('title')
|
139
|
+
@author = e.text('author_name')
|
140
|
+
@html_embed = e.text('html').unescape
|
130
141
|
|
131
142
|
end
|
132
|
-
|
133
|
-
def
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
139
|
-
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
140
|
-
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
143
|
+
|
144
|
+
def join_sentences(a)
|
145
|
+
|
146
|
+
if @debug then
|
147
|
+
puts 'inside join_sentence'
|
148
|
+
puts 'a: ' + a.take(3).inspect
|
141
149
|
end
|
142
|
-
|
143
|
-
@to_a = a
|
144
|
-
|
150
|
+
|
145
151
|
a2 = []
|
146
152
|
|
147
153
|
# the following cleans up sentences that start with And, Or, But, So etc.
|
148
154
|
|
149
|
-
a.each do |time,
|
155
|
+
a.each do |time, raws|
|
150
156
|
|
151
|
-
if
|
152
|
-
|
157
|
+
puts 'raws: ' + raws.inspect if @debug
|
158
|
+
|
159
|
+
s = raws.sub(/^\W+/,'')
|
160
|
+
|
161
|
+
if s[/^[a-z|0-9]|I\b|I'/]then
|
162
|
+
|
163
|
+
if a2.any? then
|
164
|
+
|
165
|
+
# only join two parts together if there was no full stop in
|
166
|
+
# the previous line
|
167
|
+
|
168
|
+
if a2[-1][-1] != /\.$/ then
|
169
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
170
|
+
else
|
171
|
+
a2 << [time, s]
|
172
|
+
end
|
173
|
+
|
174
|
+
else
|
175
|
+
a2 << [time, s.capitalize]
|
176
|
+
end
|
177
|
+
|
153
178
|
elsif s[/^And,? /]
|
154
179
|
a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
|
155
180
|
elsif s[/^Or,? /]
|
@@ -160,15 +185,39 @@ EOF
|
|
160
185
|
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
161
186
|
elsif s[/^So,? /]
|
162
187
|
a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
|
188
|
+
elsif s[/^\[Music|Applause\]/i]
|
189
|
+
# ignore it
|
163
190
|
else
|
164
|
-
|
191
|
+
|
192
|
+
if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
|
193
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
194
|
+
else
|
195
|
+
a2 << [time, s]
|
196
|
+
end
|
197
|
+
|
165
198
|
end
|
166
199
|
|
167
200
|
end
|
168
201
|
|
169
202
|
# formats the paragraph with the timestamp appearing above
|
170
203
|
@a = a2
|
171
|
-
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
204
|
+
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
205
|
+
|
206
|
+
end
|
207
|
+
|
208
|
+
def parse(s)
|
209
|
+
|
210
|
+
doc = Rexle.new(s)
|
211
|
+
|
212
|
+
a = doc.root.elements.each.map do |x|
|
213
|
+
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
214
|
+
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
215
|
+
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
216
|
+
end
|
217
|
+
|
218
|
+
@to_a = a
|
219
|
+
|
220
|
+
join_sentences(a)
|
172
221
|
|
173
222
|
end
|
174
223
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: youtube_transcript2020
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
|
36
36
|
6ZSQYo0XuSVg3by/5kp1TrrS
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2020-07-
|
38
|
+
date: 2020-07-29 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: yawc
|
@@ -97,6 +97,26 @@ dependencies:
|
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: 0.7.1
|
100
|
+
- !ruby/object:Gem::Dependency
|
101
|
+
name: youtube_id
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: 0.1.0
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0.1'
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: 0.1.0
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0.1'
|
100
120
|
description:
|
101
121
|
email: james@jamesrobertson.eu
|
102
122
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|