youtube_transcript2020 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9405f15c45cd185c0ad06d54fb2e7198a39cdef14896051a3b03ea3d14b497fb
4
- data.tar.gz: 643f78333b91e3c5ff5ef960d8339a496618bcf3b5d81f41d74d17ffb760d533
3
+ metadata.gz: eaec0007026f2086f0ed0ed41e5c4d6de5c2e64aea17cf21dfab2a201b5228c5
4
+ data.tar.gz: 14953c7cf8156785e5413d17a6e02373935c368cd4f22be7ace93378517f8480
5
5
  SHA512:
6
- metadata.gz: 59915336a37bb3c3bdd7d84715c0e254db0f1b20b720c66f5191b6dba38ec072e3abb4e117f5876bcaf89eaa4dcfb7871ab279f36becd1523ba1d56daf230cae
7
- data.tar.gz: 53851b499c5a2024303f3da0b1da1da54a0eaf096db197c5ac97035e1ccb125ef581d551fa154c39892decfd5041fe667a2364293db8c1d1f8057ab24757b1ee
6
+ metadata.gz: 294f44e6db60fc35b8e9cdc0600d1d098a5d4eebe5cb09da0e9bc325b1ea489c5faf03ca9557fb7920672d82758313c162406b09b31aa617fcc3402282f8a61b
7
+ data.tar.gz: 143628d6cde724dd466d779f8a5796ab02facede423e34d01cf31ed1a52841f56cc09633fc3160c8d68d2502edf2989d0dc3962901fbcb86ac3124d788ff535a
Binary file
data.tar.gz.sig CHANGED
Binary file
@@ -4,6 +4,7 @@
4
4
 
5
5
  require 'yawc'
6
6
  require 'subunit'
7
+ require 'youtube_id'
7
8
  require 'simple-config'
8
9
 
9
10
 
@@ -11,20 +12,20 @@ class YoutubeTranscript2020
11
12
 
12
13
  attr_reader :to_a, :author, :id, :title
13
14
 
14
- def initialize(id=nil)
15
+ def initialize(id=nil, debug: false)
15
16
 
16
17
  return unless id
18
+
19
+ @debug = debug
17
20
 
18
- @id = if id[/https:\/\/www\.youtube\.com\/watch\?v=/] then
19
- id[/(?<=^https:\/\/www\.youtube\.com\/watch\?v=).*/]
20
- elsif id[/https:\/\/youtu\.be\//]
21
- id[/(?<=^https:\/\/youtu\.be\/).*/]
21
+ @id = if id[/https?:\/\//] then
22
+ YoutubeID.from(id)
22
23
  else
23
24
  id
24
25
  end
25
26
 
26
27
  s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
27
- @s = parse s
28
+ @s = parse(s) unless s.empty?
28
29
 
29
30
  fetch_info(@id)
30
31
 
@@ -52,14 +53,22 @@ class YoutubeTranscript2020
52
53
 
53
54
  s = RXFHelper.read(obj).first
54
55
 
55
- header, body = s.split(/-----+/,2)
56
+ if s =~ /------+/ then
57
+ header, body = s.split(/-----+/,2)
56
58
 
57
- h = SimpleConfig.new(header).to_h
58
- @id, @author, @title = h[:id], h[:author], h[:title]
59
- @s = body
60
-
59
+ h = SimpleConfig.new(header).to_h
60
+ @id, @author, @title = h[:id], h[:author], h[:title]
61
+ @s = body
62
+ else
63
+ body = obj
64
+ raw_transcript = true
65
+ end
66
+
67
+ puts 'body: ' + body[0..400] if @debug
61
68
  a = body.lines.map(&:chomp).partition {|x| x =~ /\d+:\d+/ }
62
- @a = a[0].zip(a[1])
69
+ @a = a[0].zip(a[1])
70
+
71
+ @s = join_sentences(@a) if raw_transcript
63
72
 
64
73
  end
65
74
 
@@ -87,7 +96,7 @@ class YoutubeTranscript2020
87
96
  <body>
88
97
  <div style="width: 1080px; background: white">
89
98
  <div style="float:left; width: 580px; background: white">
90
- <iframe width="560" height="315" src="#{url}&autoplay=1" name="video" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
99
+ #{@html_embed}
91
100
  <h1>#{@title}</h1>
92
101
  </div>
93
102
  <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
@@ -121,35 +130,51 @@ EOF
121
130
 
122
131
  def fetch_info(id)
123
132
 
124
- url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=json"
133
+ url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml"
125
134
  s = Net::HTTP.get(URI(url))
126
135
 
127
- h = JSON.parse(s, symbolize_names: true)
128
- @title = h[:title]
129
- @author = h[:author_name]
136
+ e = Rexle.new(s).root
137
+
138
+ @title = e.text('title')
139
+ @author = e.text('author_name')
140
+ @html_embed = e.text('html').unescape
130
141
 
131
142
  end
132
-
133
- def parse(s)
134
-
135
- doc = Rexle.new(s)
136
-
137
- a = doc.root.elements.each.map do |x|
138
- timestamp = Subunit.new(units={minutes:60, hours:60}, \
139
- seconds: x.attributes[:start].to_f).to_s(verbose: false)
140
- [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
143
+
144
+ def join_sentences(a)
145
+
146
+ if @debug then
147
+ puts 'inside join_sentence'
148
+ puts 'a: ' + a.take(3).inspect
141
149
  end
142
-
143
- @to_a = a
144
-
150
+
145
151
  a2 = []
146
152
 
147
153
  # the following cleans up sentences that start with And, Or, But, So etc.
148
154
 
149
- a.each do |time, s|
155
+ a.each do |time, raws|
150
156
 
151
- if s[/^[a-z|0-9]/]then
152
- a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
157
+ puts 'raws: ' + raws.inspect if @debug
158
+
159
+ s = raws.sub(/^\W+/,'')
160
+
161
+ if s[/^[a-z|0-9]|I\b|I'/]then
162
+
163
+ if a2.any? then
164
+
165
+ # only join two parts together if there was no full stop in
166
+ # the previous line
167
+
168
+ if a2[-1][-1] != /\.$/ then
169
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
170
+ else
171
+ a2 << [time, s]
172
+ end
173
+
174
+ else
175
+ a2 << [time, s.capitalize]
176
+ end
177
+
153
178
  elsif s[/^And,? /]
154
179
  a2[-1][-1] += ' ' + s.sub(/^And,? /,'').capitalize
155
180
  elsif s[/^Or,? /]
@@ -160,15 +185,39 @@ EOF
160
185
  a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
161
186
  elsif s[/^So,? /]
162
187
  a2[-1][-1] += ' ' + s.sub(/^So,? /,'').capitalize
188
+ elsif s[/^\[Music|Applause\]/i]
189
+ # ignore it
163
190
  else
164
- a2 << [time, s]
191
+
192
+ if a2.any? and not a2[-1][-1] =~ /\.\s*$/ then
193
+ a2[-1][-1] = a2[-1][-1].chomp + ' ' + s
194
+ else
195
+ a2 << [time, s]
196
+ end
197
+
165
198
  end
166
199
 
167
200
  end
168
201
 
169
202
  # formats the paragraph with the timestamp appearing above
170
203
  @a = a2
171
- a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
204
+ a2.map {|time, s| "\n%s\n\n%s" % [time, s]}.join("\n")
205
+
206
+ end
207
+
208
+ def parse(s)
209
+
210
+ doc = Rexle.new(s)
211
+
212
+ a = doc.root.elements.each.map do |x|
213
+ timestamp = Subunit.new(units={minutes:60, hours:60}, \
214
+ seconds: x.attributes[:start].to_f).to_s(verbose: false)
215
+ [timestamp, x.text.unescape.gsub("\n", ' ').gsub('&#39;',"'").gsub('&quot;','"')]
216
+ end
217
+
218
+ @to_a = a
219
+
220
+ join_sentences(a)
172
221
 
173
222
  end
174
223
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtube_transcript2020
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
36
36
  6ZSQYo0XuSVg3by/5kp1TrrS
37
37
  -----END CERTIFICATE-----
38
- date: 2020-07-28 00:00:00.000000000 Z
38
+ date: 2020-07-29 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: yawc
@@ -97,6 +97,26 @@ dependencies:
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
99
  version: 0.7.1
100
+ - !ruby/object:Gem::Dependency
101
+ name: youtube_id
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 0.1.0
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.1'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: 0.1.0
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '0.1'
100
120
  description:
101
121
  email: james@jamesrobertson.eu
102
122
  executables: []
metadata.gz.sig CHANGED
Binary file