spdeck-scrape 0.0.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/README.md +53 -0
- data/Rakefile +0 -0
- data/bin/spdeck-scrape +34 -0
- data/lib/spdeck-scrape.rb +29 -0
- data/lib/spdeck-scrape/spdeck-scraper-class.rb +174 -0
- data/spdeck-scrape-0.0.6.gem +0 -0
- data/spdeck-scrape.gemspec +20 -0
- data/spec/.rspec +0 -0
- data/spec/spec_helper.rb +0 -0
- metadata +70 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f6e658ef89f9252e69359bf19387b9067ff505fa
|
4
|
+
data.tar.gz: b788e687930c378e9b61e0857208d412d4afb07e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e536ce69458e7401cc94d311a72759f16270243c4bdf43b2618c13d9a93315bb5619053e8e45d4ae04f4e3a881e0a96cd7b64d40ab772957f271479290560c15
|
7
|
+
data.tar.gz: 7f482bce096a34c2c5e68eb9e50a444b3b7df43a773fb7d162475f0f93d57fa7a4d8b76fccf4ef4859ed35b19ac37ce78d9260b1c388715726afd2c10b88c912
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ref-ignore/*
|
data/README.md
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# spdeck-scrape: Simple SpeakerDeck Scraper
|
2
|
+
|
3
|
+
This is a simple gem designed to scrape data from SpeakerDeck.com. This is the first gem I have ever built! I wrote it to practice scraping websites and to learn how to build gems.
|
4
|
+
|
5
|
+
SpeakerDeck.com does not natively allow sorting presentations by number of views, so this gem lets you grab the views data and port it into a database or straight to barebones HTML, sorted in descending order.
|
6
|
+
|
7
|
+
This gem is still a work in progress!
|
8
|
+
|
9
|
+
### Installation
|
10
|
+
`gem install spdeck-scrape`
|
11
|
+
|
12
|
+
|
13
|
+
### Usage
|
14
|
+
spdeck-scrape can be used from the command line and also in Ruby scripts.
|
15
|
+
|
16
|
+
From the command line:
|
17
|
+
```bash
|
18
|
+
$ spdeck-scrape query [range] [-v | -c] [-html]
|
19
|
+
```
|
20
|
+
In a Ruby script:
|
21
|
+
|
22
|
+
- initialize a new `SpeakerdeckScraper` object specifying the desired query. The query is required; the optional range and display arguments default to `5` and `'-v'`.
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
scraper = SpeakerdeckScraper.new("rails")
|
26
|
+
# prepares a scraper that will grab the titles, authors, views, and links
|
27
|
+
```
|
28
|
+
- set the number of query results pages to pull
|
29
|
+
```ruby
|
30
|
+
scraper.query_results_scrape(10)
|
31
|
+
# pulls the first 10 pages
|
32
|
+
```
|
33
|
+
- initiate the scrape
|
34
|
+
```ruby
|
35
|
+
scraper.scrape_all
|
36
|
+
```
|
37
|
+
- extract the data to basic HTML
|
38
|
+
```ruby
|
39
|
+
scraper.html_gen
|
40
|
+
# will create a file called 'spd-rails.html' (named after the query) in the working directory with a table of the results sorted by views descending
|
41
|
+
```
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
### Classes
|
46
|
+
|
47
|
+
SpeakerdeckScraper
|
48
|
+
Presentations
|
49
|
+
SPDatabase
|
50
|
+
SPHTMLGen
|
51
|
+
|
52
|
+
### Methods
|
53
|
+
|
data/Rakefile
ADDED
File without changes
|
data/bin/spdeck-scrape
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'spdeck-scrape'

# Command-line entry point for the SpeakerDeck scraper.
#
# usage: $ spdeck-scrape my_query [my_range] [-v | -c] [-html]
#
# Arguments starting with "-" are treated as options and may appear in any
# position; the remaining arguments are the query and the (optional) number
# of result pages to scrape. This stays compatible with the documented
# "spdeck-scrape ruby 15 -v -html" form while no longer requiring a display
# option just to reach -html.
flags, positional = ARGV.partition { |arg| arg.start_with?('-') }
query = positional[0]

if query.nil?
  puts "\n\n------- spdeck-scrape: ERROR! --------"
  puts " Usage:"
  puts " Please specify a query, range, and display option (if desired):\n"
  puts " spdeck-scrape my_query an_integer [options]"
  puts " Options:"
  puts " -v # verbose display while running"
  puts " -c # concise display"
  puts " -html # include this tag to print data to an HTML file\n"
  puts " Example:"
  puts " spdeck-scrape ruby 15 -v -html\n\n"
else
  # Default to 5 result pages and the concise display, as before.
  range = positional[1] ? positional[1].to_i : 5
  display = flags.find { |flag| %w[-v -c].include?(flag) } || '-c'

  user = SpeakerdeckScraper.new(query, range, display)

  user.query_results_scrape(range)
  user.scrape_all

  if flags.include?('-html')
    user.html_gen
    # Multi-argument form bypasses the shell, so characters in the
    # user-supplied query cannot be interpreted as shell syntax.
    system('open', "spd-#{query}.html")
    puts
  end
end
|
33
|
+
|
34
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative './spdeck-scrape/spdeck-scraper-class.rb'
|
2
|
+
#require 'spdeck-scrape'
|
3
|
+
|
4
|
+
# this file is loaded and run when 'spdeck-scrape' is required in a script
|
5
|
+
|
6
|
+
# test code
|
7
|
+
|
8
|
+
# scraper = SpeakerdeckScraper.new(, "ruby")
|
9
|
+
# scraper.query_results_scrape(3)
|
10
|
+
# scraper.scrape_all
|
11
|
+
# File.open('spd-ruby-raw', 'w') do |file|
|
12
|
+
# file.write(scraper.presentations)
|
13
|
+
# end
|
14
|
+
|
15
|
+
|
16
|
+
# scraper.html_gen
|
17
|
+
|
18
|
+
# scraper2 = SpeakerdeckScraper.new("https://speakerdeck.com/", "json")
|
19
|
+
# scraper2.query_results_scrape(2)
|
20
|
+
# scraper2.scrape_all
|
21
|
+
# File.open('spd-json-raw', 'w') do |file|
|
22
|
+
# file.write(scraper.presentations)
|
23
|
+
# end
|
24
|
+
|
25
|
+
# scraper2.html_gen
|
26
|
+
|
27
|
+
# system("open spd-ruby.html spd-json.html")
|
28
|
+
|
29
|
+
# initialize a scraper with a website and a query
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'pry'
|
4
|
+
|
5
|
+
# Scrapes SpeakerDeck search results for a query, visits each presentation
# page to collect its details (title, date, author, category, view count) and
# can write the collected data to an HTML table sorted by views, descending.
#
# Typical flow: new -> query_results_scrape(range) -> scrape_all -> html_gen.
class SpeakerdeckScraper
  attr_reader :page_object, :presentations, :url
  attr_accessor :start_time, :end_time, :opts, :query, :display

  # Retained for backward compatibility; the scraper no longer uses it
  # (page 1 is requested like every other page).
  SD_QUERY_FIRST_PAGE = "https://speakerdeck.com/search?q=ruby"
  SD_DOMAIN = "https://speakerdeck.com"

  # Adjectives picked at random by the verbose display.
  GOOD_WORDS = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy", "really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius", "incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"].freeze

  # Characters that must be escaped before text is placed into HTML.
  # Hand-rolled so the class needs no library beyond what the file requires.
  HTML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;', "'" => '&#39;' }.freeze

  # query   - search term sent to SpeakerDeck (required)
  # range   - number of result pages; stored but not read by this class
  #           (query_results_scrape takes its own range argument)
  # display - '-v' for verbose progress output, '-c' for concise
  def initialize(query, range = 5, display = '-v')
    @url = "https://speakerdeck.com/"
    @query = query
    @page_object = ''
    @presentations = {}
    @start_time = Time.now
    @range = range
    @display = display
  end

  # Scrapes result pages 1..range into `presentations` (id => link).
  # Page 1 is always requested, even when range is below 1.
  # Scraping is best-effort: an error stops the page loop but is reported
  # instead of being hidden, and whatever was collected so far is kept.
  def query_results_scrape(range)
    puts "grabbing presentations"
    begin
      single_results_page_scrape(1)
      (2..range).each { |page| single_results_page_scrape(page) }
    rescue StandardError => e
      puts "error! prob nothing to worry about (#{e.class}: #{e.message})"
    end
    puts "\ncool! we got #{presentations.length} presentations"
  end

  # dumps one page of query results into the hash,
  # presentations = { 'pres id' => 'pres_link.html' }
  # not called explicitly, lives in query scrape wrapper
  def single_results_page_scrape(i)
    # URI#open (from open-uri) is used because Kernel#open no longer accepts
    # URLs on Ruby 3.0+.
    doc = Nokogiri::HTML(URI.parse(search_url(i)).open)
    doc.css('div.talk').each do |presentation|
      # the data-id attribute ensures a unique key in the hash
      pres_id = presentation.attr('data-id')

      href = presentation.css('h3.title a').attr('href')
      next unless href # skip entries without a title link instead of aborting the page

      if display == "-v"
        pres_title = presentation.css('h3.title').text.strip
        # NOTE(review): this reads the last 'h3.title a' of the parent node;
        # whether that is really the author depends on the page markup - confirm.
        author_name = presentation.parent.css('h3.title a').last.text
        verbose_display(pres_title, author_name)
      end
      concise_display if display == "-c"

      presentations[pres_id] = href.text
    end
  end

  #### display options ############
  # Prints one line per grabbed presentation.
  def verbose_display(pres_title, author_name)
    puts "grabbed a #{GOOD_WORDS.sample} presentation #{pres_title} by #{author_name}"
    sleep(0.02)
  end

  # Prints a single progress mark.
  def concise_display
    print "#"
    sleep(0.02)
  end
  #### display options end ##########

  # wrapper to run the single page scraper for all links; records end_time
  def scrape_all
    puts "reading presentation data"
    presentations.each do |id, link|
      pres_page_scrape(id, link)
    end
    self.end_time = Time.now
  end

  # grab data from one presentation page and replace presentations[id]
  # (until now the link string) with a hash of details
  # note: this is a time consuming process -- have to open each page (but
  # necessary because the views data isn't stored on the query pages)
  def pres_page_scrape(id, pres_link)
    pres_page = Nokogiri::HTML(URI.parse("#{SD_DOMAIN}#{pres_link}").open)

    presentations[id] = {
      :title => pres_title(pres_page),
      :link => pres_link,
      :date => pres_date(pres_page),
      :author => pres_author(pres_page),
      :author_link => pres_author_link(pres_page),
      :category => pres_category(pres_page),
      :views => pres_views(pres_page)
    }

    if display == '-c'
      concise_display
    else
      puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
    end
  end

  # View count as an Integer; all digit groups in the 'li.views' text are
  # joined, so "1,234 views" becomes 1234 (0 when there are no digits).
  def pres_views(pres_page)
    pres_page.css('li.views').text.scan(/\d+/).join.to_i
  end

  def pres_title(pres_page)
    pres_page.css('div#content header h1').text
  end

  def pres_author(pres_page)
    pres_page.css('div#content header h2 a').text
  end

  # href of the author link, or '' when the page has no such link.
  def pres_author_link(pres_page)
    href = pres_page.css('div#content header h2 a').attr('href')
    href ? href.text : ''
  end

  # Text of the first 'mark' in the talk details, or '' when it is missing.
  def pres_date(pres_page)
    mark = pres_page.css('div#talk-details mark').first
    mark ? mark.text.strip : ''
  end

  def pres_category(pres_page)
    pres_page.css('div#talk-details mark a').text
  end

  # Writes "spd-<query>.html" in the working directory: a table of the
  # scraped presentations sorted by views, descending. Expects scrape_all to
  # have run, so that every value in `presentations` is a details hash.
  def html_gen
    sorted_array = presentations.values.sort_by { |pres_hash| pres_hash[:views] }.reverse

    File.open("spd-#{query}.html", "w") do |file|
      file.write(html_header)
      sorted_array.each { |content_hash| file.write(html_row(content_hash)) }
      file.write(html_footer)
    end
  end

  private

  # URL of one search results page, with page and query form-encoded so that
  # spaces and reserved characters in the query produce a valid URL.
  def search_url(page)
    "#{url}search?#{URI.encode_www_form([['page', page], ['q', query]])}"
  end

  # Escapes text for use in HTML content and double-quoted attributes.
  def h(text)
    text.to_s.gsub(/[&<>"']/, HTML_ESCAPES)
  end

  # Opening markup, heading and table header row.
  def html_header
    # Fall back to "now" when scrape_all has not set end_time.
    elapsed = (end_time || Time.now) - start_time
    <<-HTML
<html>
  <head>
  </head>
  <body>
    <h1>speakerdeck presentations - #{h(query)}</h1>
    <h4>this site was generated in #{elapsed} seconds (last queried at #{start_time})</h4>
    <table class="tablesorter" border="1">
      <tr>
        <th>title</th>
        <th>date</th>
        <th>category</th>
        <th>author</th>
        <th>views</th>
      </tr>
    HTML
  end

  # One table row for a presentation details hash.
  def html_row(content_hash)
    link = h("#{SD_DOMAIN}#{content_hash[:link]}")
    author_link = h("#{SD_DOMAIN}#{content_hash[:author_link]}")
    category = content_hash[:category].to_s
    category_link = h("#{SD_DOMAIN}/c/#{category.downcase}")
    <<-HTML
      <tr>
        <td><a href="#{link}">#{h(content_hash[:title])}</a></td>
        <td>#{h(content_hash[:date])}</td>
        <td><a href="#{category_link}">#{h(category)}</a></td>
        <td><a href="#{author_link}">#{h(content_hash[:author])}</a></td>
        <td>#{h(content_hash[:views])}</td>
      </tr>
    HTML
  end

  # Closing markup.
  def html_footer
    <<-HTML
    </table>
  </body>
</html>
    HTML
  end

# class end
end
|
173
|
+
|
174
|
+
|
Binary file
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# in progress
# Gem specification for spdeck-scrape.
Gem::Specification.new do |s|
  s.name        = 'spdeck-scrape'
  s.executables << 'spdeck-scrape'
  s.version     = '0.0.61'
  s.date        = '2013-10-11'
  s.summary     = "Simple scraper for SpeakerDeck"
  s.description = "Generate data organized by viewcount for a database or webpages about presentations posted on SpeakerDeck (SpeakerDeck.com)"
  s.author      = "Joe O'Conor"
  s.email       = 'joe.oconor@gmail.com'
  # Package the tracked files, but not previously built .gem archives that
  # were committed to the repository.
  s.files       = `git ls-files`.split("\n").reject { |file| File.extname(file) == '.gem' }
  s.homepage    = 'http://rubygems.org/gems/spdeck-scrape'
  s.license     = 'MIT'
  s.require_path = 'lib'
  # Name and version requirement are separate arguments. The previous
  # `'nokogiri' >= '1.6.0'` was a String comparison that evaluated to true,
  # which registered a dependency named "true" instead of nokogiri.
  s.add_runtime_dependency 'nokogiri', '>= 1.6.0'
  # The library requires pry when it is loaded, so it must be installed
  # together with the gem.
  s.add_runtime_dependency 'pry'
  s.post_install_message = <<-JNO
  You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
  JNO
end
|
data/spec/.rspec
ADDED
File without changes
|
data/spec/spec_helper.rb
ADDED
File without changes
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spdeck-scrape
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.61
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Joe O'Conor
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: 'true'
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description: Generate data organized by viewcount for a database or webpages about
|
28
|
+
presentations posted on SpeakerDeck (SpeakerDeck.com)
|
29
|
+
email: joe.oconor@gmail.com
|
30
|
+
executables:
|
31
|
+
- spdeck-scrape
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- .gitignore
|
36
|
+
- README.md
|
37
|
+
- Rakefile
|
38
|
+
- bin/spdeck-scrape
|
39
|
+
- lib/spdeck-scrape.rb
|
40
|
+
- lib/spdeck-scrape/spdeck-scraper-class.rb
|
41
|
+
- spdeck-scrape-0.0.6.gem
|
42
|
+
- spdeck-scrape.gemspec
|
43
|
+
- spec/.rspec
|
44
|
+
- spec/spec_helper.rb
|
45
|
+
homepage: http://rubygems.org/gems/spdeck-scrape
|
46
|
+
licenses:
|
47
|
+
- MIT
|
48
|
+
metadata: {}
|
49
|
+
post_install_message: |2
|
50
|
+
You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
|
51
|
+
rdoc_options: []
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 2.0.6
|
67
|
+
signing_key:
|
68
|
+
specification_version: 4
|
69
|
+
summary: Simple scraper for SpeakerDeck
|
70
|
+
test_files: []
|