youtube_transcript2020 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/lib/youtube_transcript2020.rb +83 -34
- metadata +22 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
|
4
|
+
data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
|
7
|
+
data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data.tar.gz.sig
CHANGED
Binary file
|
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
require 'yawc'
|
6
6
|
require 'subunit'
|
7
|
+
require 'youtube_id'
|
7
8
|
require 'simple-config'
|
8
9
|
|
9
10
|
|
@@ -11,20 +12,20 @@ class YoutubeTranscript2020
|
|
11
12
|
|
12
13
|
attr_reader :to_a, :author, :id, :title
|
13
14
|
|
14
|
-
def initialize(id=nil)
|
15
|
+
def initialize(id=nil, debug: false)
|
15
16
|
|
16
17
|
return unless id
|
18
|
+
|
19
|
+
@debug = debug
|
17
20
|
|
18
|
-
@id = if id[/https
|
19
|
-
id
|
20
|
-
elsif id[/https:\/\/youtu\.be\//]
|
21
|
-
id[/(?<=^https:\/\/youtu\.be\/).*/]
|
21
|
+
@id = if id[/https?:\/\//] then
|
22
|
+
YoutubeID.from(id)
|
22
23
|
else
|
23
24
|
id
|
24
25
|
end
|
25
26
|
|
26
27
|
s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
|
27
|
-
@s = parse s
|
28
|
+
@s = parse(s) unless s.empty?
|
28
29
|
|
29
30
|
fetch_info(@id)
|
30
31
|
|
@@ -52,14 +53,22 @@ class YoutubeTranscript2020
|
|
52
53
|
|
53
54
|
s = RXFHelper.read(obj).first
|
54
55
|
|
55
|
-
|
56
|
+
if s =~ /------+/ then
|
57
|
+
header, body = s.split(/-----+/,2)
|
56
58
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
59
|
+
h = SimpleConfig.new(header).to_h
|
60
|
+
@id, @author, @title = h[:id], h[:author], h[:title]
|
61
|
+
@s = body
|
62
|
+
else
|
63
|
+
body = obj
|
64
|
+
raw_transcript = true
|
65
|
+
end
|
66
|
+
|
67
|
+
puts 'body: ' + body[0..400] if @debug
|
61
68
|
a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
|
62
|
-
@a = a[0].zip(a[1])
|
69
|
+
@a = a[0].zip(a[1])
|
70
|
+
|
71
|
+
@s = join_sentences(@a) if raw_transcript
|
63
72
|
|
64
73
|
end
|
65
74
|
|
@@ -87,7 +96,7 @@ class YoutubeTranscript2020
|
|
87
96
|
<body>
|
88
97
|
<div style="width: 1080px; background: white">
|
89
98
|
<div style="float:left; width: 580px; background: white">
|
90
|
-
|
99
|
+
#{@html_embed}
|
91
100
|
<h1>#{@title}</h1>
|
92
101
|
</div>
|
93
102
|
<div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
|
@@ -121,35 +130,51 @@ EOF
|
|
121
130
|
|
122
131
|
def fetch_info(id)
|
123
132
|
|
124
|
-
url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=
|
133
|
+
url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
|
125
134
|
s = Net::HTTP.get(URI(url))
|
126
135
|
|
127
|
-
|
128
|
-
|
129
|
-
@
|
136
|
+
e = Rexle.new(s).root
|
137
|
+
|
138
|
+
@title = e.text('title')
|
139
|
+
@author = e.text('author_name')
|
140
|
+
@html_embed = e.text('html').unescape
|
130
141
|
|
131
142
|
end
|
132
|
-
|
133
|
-
def
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
139
|
-
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
140
|
-
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
143
|
+
|
144
|
+
def join_sentences(a)
|
145
|
+
|
146
|
+
if @debug then
|
147
|
+
puts 'inside join_sentence'
|
148
|
+
puts 'a: ' + a.take(3).inspect
|
141
149
|
end
|
142
|
-
|
143
|
-
@to_a = a
|
144
|
-
|
150
|
+
|
145
151
|
a2 = []
|
146
152
|
|
147
153
|
# the following cleans up sentences that start with And, Or, But, So etc.
|
148
154
|
|
149
|
-
a.each do |time,
|
155
|
+
a.each do |time, raws|
|
150
156
|
|
151
|
-
if
|
152
|
-
|
157
|
+
puts 'raws: ' + raws.inspect if @debug
|
158
|
+
|
159
|
+
s = raws.sub(/^\W+/,'')
|
160
|
+
|
161
|
+
if s[/^[a-z|0-9]|I\b|I'/]then
|
162
|
+
|
163
|
+
if a2.any? then
|
164
|
+
|
165
|
+
# only join two parts together if there was no full stop in
|
166
|
+
# the previous line
|
167
|
+
|
168
|
+
if a2[-1][-1] != /\.$/ then
|
169
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
170
|
+
else
|
171
|
+
a2 << [time, s]
|
172
|
+
end
|
173
|
+
|
174
|
+
else
|
175
|
+
a2 << [time, s.capitalize]
|
176
|
+
end
|
177
|
+
|
153
178
|
elsif s[/^And,? /]
|
154
179
|
a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
|
155
180
|
elsif s[/^Or,? /]
|
@@ -160,15 +185,39 @@ EOF
|
|
160
185
|
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
161
186
|
elsif s[/^So,? /]
|
162
187
|
a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
|
188
|
+
elsif s[/^\[Music|Applause\]/i]
|
189
|
+
# ignore it
|
163
190
|
else
|
164
|
-
|
191
|
+
|
192
|
+
if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
|
193
|
+
a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
|
194
|
+
else
|
195
|
+
a2 << [time, s]
|
196
|
+
end
|
197
|
+
|
165
198
|
end
|
166
199
|
|
167
200
|
end
|
168
201
|
|
169
202
|
# formats the paragraph with the timestamp appearing above
|
170
203
|
@a = a2
|
171
|
-
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
204
|
+
a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
|
205
|
+
|
206
|
+
end
|
207
|
+
|
208
|
+
def parse(s)
|
209
|
+
|
210
|
+
doc = Rexle.new(s)
|
211
|
+
|
212
|
+
a = doc.root.elements.each.map do |x|
|
213
|
+
timestamp = Subunit.new(units={minutes:60, hours:60}, \
|
214
|
+
seconds: x.attributes[:start].to_f).to_s(verbose: false)
|
215
|
+
[timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
|
216
|
+
end
|
217
|
+
|
218
|
+
@to_a = a
|
219
|
+
|
220
|
+
join_sentences(a)
|
172
221
|
|
173
222
|
end
|
174
223
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: youtube_transcript2020
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
|
36
36
|
6ZSQYo0XuSVg3by/5kp1TrrS
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2020-07-
|
38
|
+
date: 2020-07-29 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: yawc
|
@@ -97,6 +97,26 @@ dependencies:
|
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
99
|
version: 0.7.1
|
100
|
+
- !ruby/object:Gem::Dependency
|
101
|
+
name: youtube_id
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: 0.1.0
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0.1'
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: 0.1.0
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: '0.1'
|
100
120
|
description:
|
101
121
|
email: james@jamesrobertson.eu
|
102
122
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|