youtube_transcript2020 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +3 -0
- data.tar.gz.sig +2 -0
- data/lib/youtube_transcript2020.rb +163 -0
- metadata +110 -0
- metadata.gz.sig +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8010045930895f8e21d94e9a66764270caa71c187592fb385d29c5b005c44d98
|
4
|
+
data.tar.gz: 370d33e8f02774874ab626b2d3b52c3cc7d0ae681db36c4c6dd0070a882faa59
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9f8a4090ea44bb1579b58048ae7d1586883ef9580898e569207b365df2d70e2c0e5fa3db30e65c0d23d4011f1583d779432fec331b955d6745c90b561b863e88
|
7
|
+
data.tar.gz: b0bd2bda2e4182e4d1b522720ba032e7c0313ead7f2a987f71b728cc80da34edb26c5b114ccc006f5196448af1e325708e9b7d32ddcd2511f3a2a549b5480e97
|
checksums.yaml.gz.sig
ADDED
@@ -0,0 +1,3 @@
|
|
1
|
+
1y������>�1ǽ�
|
2
|
+
ҋ���Z*f6��W��!F���7��u\= � �}�e5N��r�u�a����d������m�dƩ�>ʼn�2���^<3%>$N�|��*��۬�4�A��v���kLC�y��{�-Yܽ�J��v�}�\KDނ��AG`�Ug��G7�,y�\��Uag�I:.��=�W��0V��
|
3
|
+
��h�}"_��#��nf�L�o��L}�;��r�㑩�{�;���T��YC�rxj�D�r��kX5�K8.ɘS�t��Z9�[9��yCg@<2��������R�>�����@pk��ef鹂��֍ �� ��FP���+*��_�-J���q�u%�z��#�ݠb}�e�#rHI
|
data.tar.gz.sig
ADDED
@@ -0,0 +1,2 @@
|
|
1
|
+
g�ج����d�Y�"��)�
|
2
|
+
�X�" k�5��\�aO�S�K;v��nr�e�S�H�<��5-`�ȶ��R�z����WS����ʜ�}g*���?����y'��Λ:�>NK�P�����HI_�����23$�8Y����"�7,��Ng����1��Y�-<v�4�5��U$FA��!8�lK��!5�m<n���aa))W�ªH�D�4�k dQ��A�:+�(�=L �B]x5��>�F�Kk]{�0P����,���9�cf�V� ,�1�犄�������t}ִة�2��x�j������)Yv>�A�{�)&�P��������
|
@@ -0,0 +1,163 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# file: youtube_transcript2020.rb
|
4
|
+
|
5
|
+
require 'cgi'
require 'json'
require 'net/http'

require 'subunit'
require 'simple-config'
|
7
|
+
|
8
|
+
|
9
|
+
# Fetches the English transcript of a YouTube video and exposes it as an
# array of [timestamp, text] pairs, as plain text, or as an HTML page which
# embeds the video next to the transcript.
#
# A transcript previously saved with #to_s (and optionally edited by hand to
# add headings to the timestamp lines) can be loaded again with #import.
#
# NOTE(review): Rexle and RXFHelper are referenced here but are not required
# by this file; presumably they are loaded by one of the required gems --
# confirm.
#
class YoutubeTranscript2020

  # A line is treated as a timestamp line when it contains digits:digits
  TIMESTAMP = /\d+:\d+/

  # Captions which carry on the previous sentence and are appended verbatim
  CONTINUATION = /\A(?:[a-z0-9]|Or,? |")/

  # Leading conjunctions which are dropped before the caption is appended
  LEADING_CONJUNCTION = /\A(?:And|But|So),? /

  private_constant :TIMESTAMP, :CONTINUATION, :LEADING_CONJUNCTION

  attr_reader :author, :id, :title

  # id:: a YouTube video id, a https://www.youtube.com/watch?v= URL or a
  #      https://youtu.be/ URL. When omitted nothing is fetched and the
  #      object is expected to be populated through #import.
  #
  def initialize(id=nil)

    return unless id

    @id = extract_id(id)

    s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
    @s = parse s

    fetch_info(@id)

  end

  # returns the transcript as an array of [timestamp, text] pairs
  #
  def to_a()
    @a
  end

  # returns the transcript in plain text including timestamps
  #
  def to_s()

    h = {id: @id, title: @title, author: @author}
    SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s.to_s

  end

  # reads a plain text transcript which has been modified to include headings
  #
  # The text is split into a header and a body at the first run of 5 or more
  # dashes; the header supplies the id, author and title.
  #
  def import(obj)

    s = RXFHelper.read(obj).first

    header, body = s.split(/-----+/,2)

    h = SimpleConfig.new(header).to_h
    @id, @author, @title = h[:id], h[:author], h[:title]
    @s = body

    @a = pair_lines(body)

  end

  # Outputs HTML containing the embedded video and transcription
  #
  # The title, timestamps and transcript text originate from a remote
  # service (or an imported file), so they are HTML escaped before being
  # placed in the page.
  #
  def to_html()

    url = CGI.escapeHTML('https://www.youtube.com/embed/' + @id)
    title = CGI.escapeHTML(@title.to_s)

    # NOTE(review): the embedded player has always started at 67 seconds;
    # this looks like a leftover from testing -- confirm before changing it.
    initial_start = 67

    links = @a.map do |timestamp, s|

      # "0:00 Intro".split(':') => ["0", "00 Intro"]; to_i ignores the heading
      seconds = Subunit.new({minutes: 60, hours: 60},
                            timestamp.split(':').map(&:to_i)).to_i

      format("<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> ",
             url, seconds, CGI.escapeHTML(timestamp), CGI.escapeHTML(s.to_s))
    end

    <<~HTML
      <!DOCTYPE html>
      <html lang="en">
      <head>
      <title>#{title}</title>
      <meta charset="utf-8" />
      </head>
      <body>
      <div style="width: 1080px; background: white">
      <div style="float:left; width: 580px; background: white">
      <iframe width="560" height="315" src="#{url}?start=#{initial_start}&autoplay=1" name="video" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
      <h1>#{title}</h1>
      </div>
      <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
      <ul>#{links.join("\n")}</ul>
      </div>

      </div>
      </body>
      </html>
    HTML
  end

  # Returns the headings including timestamps, i.e. the timestamp lines
  # which contain a space (e.g. "0:00 Introduction"), as an array of strings
  # note: This can be helpful for copying and pasting directly into a YouTube comment
  #
  def to_headings()

    (@a || []).select {|timestamp, _| timestamp =~ / /}.map(&:first)

  end

  private

  # Reduces a watch URL or a youtu.be URL to the bare video id; anything
  # else is returned unchanged. Trailing parameters (&t=..., ?t=...) are
  # not part of the id.
  #
  def extract_id(id)

    id[%r{\Ahttps://www\.youtube\.com/watch\?v=([^&#\s]+)}, 1] ||
        id[%r{\Ahttps://youtu\.be/([^?&#\s]+)}, 1] || id

  end

  # Looks up the title and author of the video through YouTube's oEmbed
  # endpoint and stores them in @title and @author
  #
  def fetch_info(id)

    url = "http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=json"
    s = Net::HTTP.get(URI(url))

    h = JSON.parse(s, symbolize_names: true)
    @title = h[:title]
    @author = h[:author_name]

  end

  # Converts the timed text XML into [timestamp, text] pairs.
  #
  # Sets @to_a (one pair per caption) and @a (captions merged into
  # sentences) and returns the merged transcript as a String.
  #
  def parse(s)

    doc = Rexle.new(s)

    a = doc.root.elements.each.map do |x|

      timestamp = Subunit.new({minutes: 60, hours: 60},
            seconds: x.attributes[:start].to_f).to_s(verbose: false)

      # the caption text can still contain HTML entities after unescaping
      text = x.text.to_s.unescape.gsub("\n", ' ')
                 .gsub('&#39;', "'").gsub('&quot;', '"')

      [timestamp, text]
    end

    @to_a = a
    @a = merge_fragments(a)

    # formats the paragraph with the timestamp appearing above
    @a.map {|time, text| "\n%s\n\n%s" % [time, text]}.join("\n")

  end

  # Joins captions which continue the previous one, i.e. those starting in
  # lower case, with a digit, a double quote or "Or", and those starting
  # with And, But or So (the conjunction is dropped).
  #
  def merge_fragments(rows)

    rows.each_with_object([]) do |(time, text), merged|

      if merged.empty?
        # nothing to append to yet, so the first caption is kept as it is
        merged << [time, text]
      elsif text =~ CONTINUATION
        merged.last[1] = merged.last[1].chomp + ' ' + text
      elsif text =~ LEADING_CONJUNCTION
        merged.last[1] = merged.last[1] + ' ' +
            upcase_first(text.sub(LEADING_CONJUNCTION, ''))
      else
        merged << [time, text]
      end

    end

  end

  # Pairs each timestamp line of an imported transcript with the text
  # which follows it. Blank lines are skipped, and the line straight after
  # a timestamp is always treated as text, even if it mentions a time.
  # Lines appearing before the first timestamp are ignored.
  #
  def pair_lines(body)

    body.to_s.lines.map(&:chomp).each_with_object([]) do |line, rows|

      next if line.strip.empty?

      if line =~ TIMESTAMP && (rows.empty? || rows.last[1])
        rows << [line, nil]
      elsif rows.any?
        rows.last[1] = [rows.last[1], line].compact.join(' ')
      end

    end

  end

  # Upper-cases the first character only; String#capitalize would also
  # lower-case the remainder of the sentence.
  #
  def upcase_first(text)
    text.sub(/\A./) {|c| c.upcase }
  end

end
|
163
|
+
|
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: youtube_transcript2020
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- James Robertson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain:
|
11
|
+
- |
|
12
|
+
-----BEGIN CERTIFICATE-----
|
13
|
+
MIIEXjCCAsagAwIBAgIBATANBgkqhkiG9w0BAQsFADAsMSowKAYDVQQDDCFnZW1t
|
14
|
+
YXN0ZXIvREM9amFtZXNyb2JlcnRzb24vREM9ZXUwHhcNMjAwNzI3MjI1MTUyWhcN
|
15
|
+
MjEwNzI3MjI1MTUyWjAsMSowKAYDVQQDDCFnZW1tYXN0ZXIvREM9amFtZXNyb2Jl
|
16
|
+
cnRzb24vREM9ZXUwggGiMA0GCSqGSIb3DQEBAQUAA4IBjwAwggGKAoIBgQCxfRw+
|
17
|
+
xg525jF+UNFVTtUrON2issNxWgDTq1efjPq9yMzqYrIDZREFE/3fgYbtAqA1Ut94
|
18
|
+
2h8mAKnAg1CC4plPA8o15f+h30TPRaxZXFmYUMxTkaLHL4Lvzd1D7eXqRYf9SFQM
|
19
|
+
EvoYbncj9QwR57WcVF/MTdwbyyiZo3CGzwmWNb9OCIZtvs8m/UOzAmbfF3lIKz9k
|
20
|
+
+ZK03KqYhyjuAiVhF39LdWUc1AWqu5i+JpFE+Lzfqv1uAjjgshmUkHOXkpWOorHc
|
21
|
+
uxL0+xZXWgTwpa1QCw3cQY1LW45QjZt4ckA9lOub1LvUTDCvZocNS+dlIUMdW0mP
|
22
|
+
jFII/nX/KWxW+NOmkWBpdGbXmY5QTppwx88r+VRpTdhepVcNiiHhMsYQsLI/fzVo
|
23
|
+
kWTib/aBnAoahtlbaldC+e03GPsLPmpTl4ZjOFqUuAyq47h42NYt6kPY/y7Gj8To
|
24
|
+
fx4pNgddR/r/WABaNao8Q+tzIxgQwCf1rijvfJP+u04GCmIeFm8oQ1x0XkUCAwEA
|
25
|
+
AaOBijCBhzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIEsDAdBgNVHQ4EFgQU1nkRML1E
|
26
|
+
Q0PgH/jEHBOQSUTi4MYwJgYDVR0RBB8wHYEbZ2VtbWFzdGVyQGphbWVzcm9iZXJ0
|
27
|
+
c29uLmV1MCYGA1UdEgQfMB2BG2dlbW1hc3RlckBqYW1lc3JvYmVydHNvbi5ldTAN
|
28
|
+
BgkqhkiG9w0BAQsFAAOCAYEAWjyRzOnO0k/P2YHBsie5hNyJq6q7zb9bto2WYF1L
|
29
|
+
N0/cvumuBsJMDUuPlD9RFvzncZbu//hbnZbK6cxiptm9HUN+m7zNi8XUcDHQw4Ba
|
30
|
+
17ZyHWKM2pkf+PJb4waQVeqyUXjbM9r6L8cVa1gkalU6ZpqEtBmkEzJCDZVf0Fll
|
31
|
+
KrPYWAW5cC7EWeDm1yxusOqzxnkBcXMnKYNJm8KU4YfVpgPXJy9bTLWhm482BlJm
|
32
|
+
v6wUZwYOM9B7x3dWbbsQXSuKmFqoxiNRWaA41qUS6eVjXpd4Gn/diSzntaX/Whew
|
33
|
+
dCXyioQY49CVGJg8LpX/zSYUk9dns+fCSeUUfKjv2K8WuzVkS/uMA8DxSeYBfxf5
|
34
|
+
ON+xcGIy3Nk7FHwY+CuIIa4WCJYB+1bVFeyCaRlCpwHK8DGUxP5PzCb44USGTI2V
|
35
|
+
42/R+mfGUgXXd9e36R3+wmfHZSFR6p6I6XKToCKca7buvgP2XgO9I04lTYUr0KLi
|
36
|
+
6ZSQYo0XuSVg3by/5kp1TrrS
|
37
|
+
-----END CERTIFICATE-----
|
38
|
+
date: 2020-07-27 00:00:00.000000000 Z
|
39
|
+
dependencies:
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: subunit
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0.5'
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.5.2
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - "~>"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0.5'
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 0.5.2
|
60
|
+
- !ruby/object:Gem::Dependency
|
61
|
+
name: simple-config
|
62
|
+
requirement: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - "~>"
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0.7'
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.7.1
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0.7'
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 0.7.1
|
80
|
+
description:
|
81
|
+
email: james@jamesrobertson.eu
|
82
|
+
executables: []
|
83
|
+
extensions: []
|
84
|
+
extra_rdoc_files: []
|
85
|
+
files:
|
86
|
+
- lib/youtube_transcript2020.rb
|
87
|
+
homepage: https://github.com/jrobertson/youtube_transcript2020
|
88
|
+
licenses:
|
89
|
+
- MIT
|
90
|
+
metadata: {}
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
requirements: []
|
106
|
+
rubygems_version: 3.0.3
|
107
|
+
signing_key:
|
108
|
+
specification_version: 4
|
109
|
+
summary: Makes it easier to digest a Youtube video by reading the transcript.
|
110
|
+
test_files: []
|
metadata.gz.sig
ADDED
Binary file
|