youtube_transcript2020 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9405f15c45cd185c0ad06d54fb2e7198a39cdef14896051a3b03ea3d14b497fb
4
- data.tar.gz: 643f78333b91e3c5ff5ef960d8339a496618bcf3b5d81f41d74d17ffb760d533
3
+ metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
4
+ data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
5
5
  SHA512:
6
- metadata.gz: 59915336a37bb3c3bdd7d84715c0e254db0f1b20b720c66f5191b6dba38ec072e3abb4e117f5876bcaf89eaa4dcfb7871ab279f36becd1523ba1d56daf230cae
7
- data.tar.gz: 53851b499c5a2024303f3da0b1da1da54a0eaf096db197c5ac97035e1ccb125ef581d551fa154c39892decfd5041fe667a2364293db8c1d1f8057ab24757b1ee
6
+ metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
7
+ data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
Binary file
data.tar.gz.sig CHANGED
Binary file
@@ -4,6 +4,7 @@
4
4
 
5
5
  require 'yawc'
6
6
  require 'subunit'
7
+ require 'youtube_id'
7
8
  require 'simple-config'
8
9
 
9
10
 
@@ -11,20 +12,20 @@ class YoutubeTranscript2020
11
12
 
12
13
  attr_reader :to_a, :author, :id, :title
13
14
 
14
- def initialize(id=nil)
15
+ def initialize(id=nil, debug: false)
15
16
 
16
17
  return unless id
18
+
19
+ @debug = debug
17
20
 
18
- @id = if id[/https:\/\/www\.youtube\.com\/watch\?v=/] then
19
- id[/(?<=^https:\/\/www\.youtube\.com\/watch\?v=).*/]
20
- elsif id[/https:\/\/youtu\.be\//]
21
- id[/(?<=^https:\/\/youtu\.be\/).*/]
21
+ @id = if id[/https?:\/\//] then
22
+ YoutubeID.from(id)
22
23
  else
23
24
  id
24
25
  end
25
26
 
26
27
  s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
27
- @s = parse s
28
+ @s = parse(s) unless s.empty?
28
29
 
29
30
  fetch_info(@id)
30
31
 
@@ -52,14 +53,22 @@ class YoutubeTranscript2020
52
53
 
53
54
  s = RXFHelper.read(obj).first
54
55
 
55
- header, body = s.split(/-----+/,2)
56
+ if s =~ /------+/ then
57
+ header, body = s.split(/-----+/,2)
56
58
 
57
- h = SimpleConfig.new(header).to_h
58
- @id, @author, @title = h[:id], h[:author], h[:title]
59
- @s = body
60
-
59
+ h = SimpleConfig.new(header).to_h
60
+ @id, @author, @title = h[:id], h[:author], h[:title]
61
+ @s = body
62
+ else
63
+ body = obj
64
+ raw_transcript = true
65
+ end
66
+
67
+ puts 'body: ' + body[0..400] if @debug
61
68
  a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
62
- @a = a[0].zip(a[1])
69
+ @a = a[0].zip(a[1])
70
+
71
+ @s = join_sentences(@a) if raw_transcript
63
72
 
64
73
  end
65
74
 
@@ -87,7 +96,7 @@ class YoutubeTranscript2020
87
96
  <body>
88
97
  <div style="width: 1080px; background: white">
89
98
  <div style="float:left; width: 580px; background: white">
90
- <iframe width="560" height="315" src="#{url}&autoplay=1" name="video" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
99
+ #{@html_embed}
91
100
  <h1>#{@title}</h1>
92
101
  </div>
93
102
  <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
@@ -121,35 +130,51 @@ EOF
121
130
 
122
131
  def fetch_info(id)
123
132
 
124
- url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=json"
133
+ url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
125
134
  s = Net::HTTP.get(URI(url))
126
135
 
127
- h = JSON.parse(s, symbolize_names: true)
128
- @title = h[:title]
129
- @author = h[:author_name]
136
+ e = Rexle.new(s).root
137
+
138
+ @title = e.text('title')
139
+ @author = e.text('author_name')
140
+ @html_embed = e.text('html').unescape
130
141
 
131
142
  end
132
-
133
- def parse(s)
134
-
135
- doc = Rexle.new(s)
136
-
137
- a = doc.root.elements.each.map do |x|
138
- timestamp = Subunit.new(units={minutes:60, hours:60}, \
139
- seconds: x.attributes[:start].to_f).to_s(verbose: false)
140
- [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
143
+
144
+ def join_sentences(a)
145
+
146
+ if @debug then
147
+ puts 'inside join_sentence'
148
+ puts 'a: ' + a.take(3).inspect
141
149
  end
142
-
143
- @to_a = a
144
-
150
+
145
151
  a2 = []
146
152
 
147
153
  # the following cleans up sentences that start with And, Or, But, So etc.
148
154
 
149
- a.each do |time, s|
155
+ a.each do |time, raws|
150
156
 
151
- if s[/^[a-z|0-9]/]then
152
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
157
+ puts 'raws: ' + raws.inspect if @debug
158
+
159
+ s = raws.sub(/^\W+/,'')
160
+
161
+ if s[/^[a-z|0-9]|I\b|I'/]then
162
+
163
+ if a2.any? then
164
+
165
+ # only join two parts together if there was no full stop in
166
+ # the previous line
167
+
168
+ if a2[-1][-1] != /\.$/ then
169
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
170
+ else
171
+ a2 << [time, s]
172
+ end
173
+
174
+ else
175
+ a2 << [time, s.capitalize]
176
+ end
177
+
153
178
  elsif s[/^And,? /]
154
179
  a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
155
180
  elsif s[/^Or,? /]
@@ -160,15 +185,39 @@ EOF
160
185
  a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
161
186
  elsif s[/^So,? /]
162
187
  a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
188
+ elsif s[/^\[Music|Applause\]/i]
189
+ # ignore it
163
190
  else
164
- a2 << [time, s]
191
+
192
+ if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
193
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
194
+ else
195
+ a2 << [time, s]
196
+ end
197
+
165
198
  end
166
199
 
167
200
  end
168
201
 
169
202
  # formats the paragraph with the timestamp appearing above
170
203
  @a = a2
171
- a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
204
+ a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
205
+
206
+ end
207
+
208
+ def parse(s)
209
+
210
+ doc = Rexle.new(s)
211
+
212
+ a = doc.root.elements.each.map do |x|
213
+ timestamp = Subunit.new(units={minutes:60, hours:60}, \
214
+ seconds: x.attributes[:start].to_f).to_s(verbose: false)
215
+ [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
216
+ end
217
+
218
+ @to_a = a
219
+
220
+ join_sentences(a)
172
221
 
173
222
  end
174
223
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube_transcript2020
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
36
36
  6ZSQYo0XuSVg3by/5kp1TrrS
37
37
  -----END CERTIFICATE-----
38
- date: 2020-07-28 00:00:00.000000000 Z
38
+ date: 2020-07-29 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: yawc
@@ -97,6 +97,26 @@ dependencies:
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
99
  version: 0.7.1
100
+ - !ruby/object:Gem::Dependency
101
+ name: youtube_id
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 0.1.0
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.1'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: 0.1.0
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '0.1'
100
120
  description:
101
121
  email: james@jamesrobertson.eu
102
122
  executables: []
metadata.gz.sig CHANGED
Binary file