spdeck-scrape 0.0.61
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/README.md +53 -0
- data/Rakefile +0 -0
- data/bin/spdeck-scrape +34 -0
- data/lib/spdeck-scrape.rb +29 -0
- data/lib/spdeck-scrape/spdeck-scraper-class.rb +174 -0
- data/spdeck-scrape-0.0.6.gem +0 -0
- data/spdeck-scrape.gemspec +20 -0
- data/spec/.rspec +0 -0
- data/spec/spec_helper.rb +0 -0
- metadata +70 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f6e658ef89f9252e69359bf19387b9067ff505fa
|
4
|
+
data.tar.gz: b788e687930c378e9b61e0857208d412d4afb07e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e536ce69458e7401cc94d311a72759f16270243c4bdf43b2618c13d9a93315bb5619053e8e45d4ae04f4e3a881e0a96cd7b64d40ab772957f271479290560c15
|
7
|
+
data.tar.gz: 7f482bce096a34c2c5e68eb9e50a444b3b7df43a773fb7d162475f0f93d57fa7a4d8b76fccf4ef4859ed35b19ac37ce78d9260b1c388715726afd2c10b88c912
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ref-ignore/*
|
data/README.md
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
#spdeck-scrape: Simple SpeakerDeck Scraper
|
2
|
+
|
3
|
+
This is a simple gem designed to scrape data from SpeakerDeck.com. This is the first gem I have ever built! I wrote it to practice scraping websites and to learn how to build gems.
|
4
|
+
|
5
|
+
SpeakerDeck.com does not natively allow sorting presentations according to views, so this gem allows you to grab the views data and port it into a database or straight to barebones HTML, sorted in descending order.
|
6
|
+
|
7
|
+
This gem is still a work in progress!
|
8
|
+
|
9
|
+
###Installation
|
10
|
+
`gem install spdeck-scrape`
|
11
|
+
|
12
|
+
|
13
|
+
###Usage
|
14
|
+
spdeck-scrape can be used from the command line and also in Ruby scripts.
|
15
|
+
|
16
|
+
From the command line:
|
17
|
+
```bash
|
18
|
+
$ spdeck-scrape my_query my_range [-v | -c] [-html]
|
19
|
+
```
|
20
|
+
In a Ruby script:
|
21
|
+
|
22
|
+
- initialize a new `SpeakerdeckScraper` object specifying the desired query (required). You may also pass the number of results pages (defaults to 5) and the display mode (defaults to `'-v'`).
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
spd_ruby = SpeakerdeckScraper.new("ruby")
|
26
|
+
# grabs the titles, authors, views, and links
|
27
|
+
```
|
28
|
+
- set the number of query results pages to pull
|
29
|
+
```ruby
|
30
|
+
spd_ruby.query_results_scrape(10)
|
31
|
+
# pulls the first 10 pages
|
32
|
+
```
|
33
|
+
- initiate the scrape
|
34
|
+
```ruby
|
35
|
+
spd_ruby.scrape_all
|
36
|
+
```
|
37
|
+
- extract the data to basic HTML
|
38
|
+
```ruby
|
39
|
+
spd_ruby.html_gen
|
40
|
+
# will create a file called 'spd-ruby.html' (named after the query) in the working directory with a table of the results sorted by views descending
|
41
|
+
```
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
###Classes
|
46
|
+
|
47
|
+
SpeakerdeckScraper
|
48
|
+
Presentations
|
49
|
+
SPDatabase
|
50
|
+
SPHTMLGen
|
51
|
+
|
52
|
+
###Methods
|
53
|
+
|
data/Rakefile
ADDED
File without changes
|
data/bin/spdeck-scrape
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'spdeck-scrape'
|
4
|
+
|
5
|
+
# usage: $ spdeck-scrape my_query my_range [verbose | concise] -html[optional for html gen]
|
6
|
+
# Command-line entry point.
#
# With no arguments, prints the usage banner. Otherwise:
#   ARGV[0] - search query
#   ARGV[1] - number of results pages to scrape (defaults to 5)
#   ARGV[2] - display mode, '-v' or '-c' (defaults to '-c')
#   ARGV[3] - '-html' to write the results to spd-<query>.html and open it
if ARGV.empty?
  puts "\n\n------- spdeck-scrape: ERROR! --------"
  puts " Usage:"
  puts " Please specify a query, range, and display option (if desired):\n"
  puts " spdeck-scrape my_query an_integer [options]"
  puts " Options:
-v # verbose display while running"
  puts " -c # concise display"
  puts " -html # include this tag to print data to an HTML file (must also include a display option)\n"
  puts " Example:"
  puts " spdeck-scrape ruby 15 -v -html\n\n"
else
  query = ARGV[0]
  range = ARGV[1].nil? ? 5 : ARGV[1].to_i
  display = ARGV[2] || '-c'

  user = SpeakerdeckScraper.new(query, range, display)

  user.query_results_scrape(range)
  user.scrape_all
  if ARGV[3] == '-html'
    user.html_gen
    # Argument-list form: the file name is handed to `open` verbatim instead
    # of being interpolated into a shell command line, so queries containing
    # spaces or shell metacharacters are neither split nor executed.
    system('open', "spd-#{query}.html")
    puts
  end
end
|
33
|
+
|
34
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require_relative './spdeck-scrape/spdeck-scraper-class.rb'
|
2
|
+
#require 'spdeck-scrape'
|
3
|
+
|
4
|
+
# this file is loaded and run when 'spdeck-scrape' is required in a script
|
5
|
+
|
6
|
+
# test code
|
7
|
+
|
8
|
+
# scraper = SpeakerdeckScraper.new(, "ruby")
|
9
|
+
# scraper.query_results_scrape(3)
|
10
|
+
# scraper.scrape_all
|
11
|
+
# File.open('spd-ruby-raw', 'w') do |file|
|
12
|
+
# file.write(scraper.presentations)
|
13
|
+
# end
|
14
|
+
|
15
|
+
|
16
|
+
# scraper.html_gen
|
17
|
+
|
18
|
+
# scraper2 = SpeakerdeckScraper.new("https://speakerdeck.com/", "json")
|
19
|
+
# scraper2.query_results_scrape(2)
|
20
|
+
# scraper2.scrape_all
|
21
|
+
# File.open('spd-json-raw', 'w') do |file|
|
22
|
+
# file.write(scraper.presentations)
|
23
|
+
# end
|
24
|
+
|
25
|
+
# scraper2.html_gen
|
26
|
+
|
27
|
+
# system("open spd-ruby.html spd-json.html")
|
28
|
+
|
29
|
+
# initialize a scraper with a website and a query
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'erb'
require 'nokogiri'
require 'open-uri'
require 'pry'
|
4
|
+
|
5
|
+
# Scrapes SpeakerDeck search results for a query, then visits every
# presentation page to collect its details (the view count is not available
# on the search result pages), and can render the collected data as an HTML
# table sorted by views in descending order.
#
# Typical flow: new -> query_results_scrape -> scrape_all -> html_gen.
class SpeakerdeckScraper
  attr_reader :page_object, :presentations, :url
  attr_accessor :start_time, :end_time, :opts, :query, :display

  # Kept for backward compatibility; the scraper builds its search URLs from
  # +url+, the page number and +query+ instead of using this constant.
  SD_QUERY_FIRST_PAGE = "https://speakerdeck.com/search?q=ruby"
  SD_DOMAIN = "https://speakerdeck.com"

  # Adjectives picked at random for the verbose progress output.
  GOOD_WORDS = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy", "really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius", "incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"].freeze

  # query   - search term sent to SpeakerDeck
  # range   - number of results pages (stored; the page count actually used
  #           is the argument given to #query_results_scrape)
  # display - '-v' for verbose progress output, '-c' for concise output
  def initialize(query, range = 5, display = '-v')
    @url = "https://speakerdeck.com/"
    @query = query
    @page_object = ''
    @presentations = {}
    @start_time = Time.now
    @range = range
    @display = display
  end

  # Scrapes results pages 1..range (always at least page 1) and fills
  # +presentations+ with { presentation id => presentation link }.
  # Errors stop the page loop but are reported rather than raised, so
  # whatever was collected so far stays usable.
  def query_results_scrape(range)
    puts "grabbing presentations"
    begin
      # The first page is requested by page number like every other page;
      # passing a full URL here would produce a malformed request.
      last_page = [range, 1].max
      (1..last_page).each { |page| single_results_page_scrape(page) }
    rescue StandardError => e
      puts "error! prob nothing to worry about (#{e.class}: #{e.message})"
    end
    puts "\ncool! we got #{presentations.length} presentations"
  end

  # Scrapes results page number +i+ and records each talk's link under its
  # 'data-id' attribute, which keeps the hash keys unique.
  # Not called explicitly; used by #query_results_scrape.
  def single_results_page_scrape(i)
    # encode_www_form escapes spaces and reserved characters in the query.
    params = URI.encode_www_form(:page => i, :q => query)
    doc = fetch_document("#{url}search?#{params}")

    doc.css('div.talk').each do |presentation|
      pres_id = presentation.attr('data-id')
      pres_link = presentation.css('h3.title a').attr('href').text
      pres_title = presentation.css('h3.title').text.strip
      author_name = presentation.parent.css('h3.title a').last.text

      verbose_display(pres_title, author_name) if display == "-v"
      concise_display if display == "-c"

      presentations[pres_id] = pres_link
    end
  end

  #### display options ############

  # Prints one line per grabbed presentation with a random adjective.
  def verbose_display(pres_title, author_name)
    puts "grabbed a #{GOOD_WORDS.sample} presentation #{pres_title} by #{author_name}"
    sleep(0.02)
  end

  # Prints a single progress mark per grabbed presentation.
  def concise_display
    print "#"
    sleep(0.02)
  end

  #### display options end ##########

  # Visits every collected presentation link, replacing each link in
  # +presentations+ with a details hash, then records the finish time.
  def scrape_all
    puts "reading presentation data"
    presentations.each do |id, link|
      pres_page_scrape(id, link)
    end
    self.end_time = Time.now
  end

  # Opens one presentation page and stores its details under +id+.
  # This is the slow part of a run: one request per presentation.
  def pres_page_scrape(id, pres_link)
    pres_page = fetch_document("#{SD_DOMAIN}#{pres_link}")

    presentations[id] = {
      :title => pres_title(pres_page),
      :link => pres_link,
      :date => pres_date(pres_page),
      :author => pres_author(pres_page),
      :author_link => pres_author_link(pres_page),
      :category => pres_category(pres_page),
      :views => pres_views(pres_page)
    }

    if display == '-c'
      concise_display
    else
      puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
    end
  end

  # Returns the view count as an Integer; digit groups are joined so that
  # separators such as "1,234" are ignored. Returns 0 when no digits exist.
  def pres_views(pres_page)
    pres_page.css('li.views').text.scan(/\d+/).join.to_i
  end

  # Returns the text of the page's main heading.
  def pres_title(pres_page)
    pres_page.css('div#content header h1').text
  end

  # Returns the author's name as shown in the page header.
  def pres_author(pres_page)
    pres_page.css('div#content header h2 a').text
  end

  # Returns the href of the author link, or '' when the page has none.
  def pres_author_link(pres_page)
    anchor = pres_page.css('div#content header h2 a').first
    anchor ? anchor['href'].to_s : ''
  end

  # Returns the stripped text of the first <mark> in the talk details,
  # or '' when the page has none.
  def pres_date(pres_page)
    mark = pres_page.css('div#talk-details mark').first
    mark ? mark.text.strip : ''
  end

  # Returns the text of the category link in the talk details.
  def pres_category(pres_page)
    pres_page.css('div#talk-details mark a').text
  end

  # Writes "spd-<query>.html" in the working directory: a table of all
  # scraped presentations sorted by views, highest first. Expects
  # #scrape_all to have run so that every entry is a details hash.
  def html_gen
    sorted_array = presentations.values.sort_by do |pres_hash|
      pres_hash[:views]
    end.reverse

    File.open("spd-#{query}.html", "w") do |file|
      file.write(html_header)
      sorted_array.each { |content_hash| file.write(html_row(content_hash)) }
      file.write(html_footer)
    end
  end

  private

  # Downloads +target+ and parses it with Nokogiri. Uses URI.open where
  # open-uri provides it (Kernel#open no longer accepts URLs on Ruby 3+)
  # and falls back to Kernel#open on older Rubies.
  def fetch_document(target)
    io = URI.respond_to?(:open) ? URI.open(target) : open(target)
    Nokogiri::HTML(io)
  end

  # Escapes scraped or user-supplied text before it is placed in the HTML.
  def escape_html(text)
    ERB::Util.html_escape(text)
  end

  # Opening markup, page heading and table header row.
  def html_header
    # end_time is only set by #scrape_all; fall back to the current time.
    elapsed = (end_time || Time.now) - start_time
    <<-HTML
<html>
<head>
</head>
<body>
<h1>speakerdeck presentations - #{escape_html(query)}</h1>
<h4>this site was generated in #{elapsed} seconds (last queried at #{start_time})</h4>
<table class="tablesorter" border="1">
<tr>
<th>title</th>
<th>date</th>
<th>category</th>
<th>author</th>
<th>views</th>
</tr>
    HTML
  end

  # One table row for a presentation details hash.
  def html_row(content_hash)
    link = escape_html("#{SD_DOMAIN}#{content_hash[:link]}")
    author_link = escape_html("#{SD_DOMAIN}#{content_hash[:author_link]}")
    category = content_hash[:category].to_s
    category_link = escape_html("#{SD_DOMAIN}/c/#{category.downcase}")
    <<-HTML
<tr>
<td><a href="#{link}">#{escape_html(content_hash[:title])}</a></td>
<td>#{escape_html(content_hash[:date])}</td>
<td><a href="#{category_link}">#{escape_html(category)}</a></td>
<td><a href="#{author_link}">#{escape_html(content_hash[:author])}</a></td>
<td>#{escape_html(content_hash[:views])}</td>
</tr>
    HTML
  end

  # Closing markup.
  def html_footer
    <<-HTML
</table>
</body>
</html>
    HTML
  end

  # class end
end
|
173
|
+
|
174
|
+
|
Binary file
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# in progress
|
2
|
+
# Gem specification for spdeck-scrape: package metadata, the files to ship
# (everything tracked by git), the executable and the runtime dependencies.
Gem::Specification.new do |s|
  s.name = 'spdeck-scrape'
  s.executables << 'spdeck-scrape'
  s.version = '0.0.61'
  s.date = '2013-10-11'
  s.summary = "Simple scraper for SpeakerDeck"
  s.description = "Generate data organized by viewcount for a database or webpages about presentations posted on SpeakerDeck (SpeakerDeck.com)"
  s.author = "Joe O'Conor"
  s.email = 'joe.oconor@gmail.com'
  s.files = `git ls-files`.split("\n")
  s.homepage =
    'http://rubygems.org/gems/spdeck-scrape'
  s.license = 'MIT'
  s.require_path = 'lib'
  # The gem name and the version requirement are separate arguments.
  # Writing `'nokogiri' >= '1.6.0'` compares the two strings and passes the
  # boolean result as the dependency name.
  s.add_runtime_dependency 'nokogiri', '>= 1.6.0'
  # NOTE(review): the library also does `require 'pry'`, which is not
  # declared as a dependency here - confirm whether that require is needed.
  s.post_install_message = <<-JNO
    You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
  JNO
end
|
data/spec/.rspec
ADDED
File without changes
|
data/spec/spec_helper.rb
ADDED
File without changes
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spdeck-scrape
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.61
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Joe O'Conor
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-11 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: 'true'
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description: Generate data organized by viewcount for a database or webpages about
|
28
|
+
presentations posted on SpeakerDeck (SpeakerDeck.com)
|
29
|
+
email: joe.oconor@gmail.com
|
30
|
+
executables:
|
31
|
+
- spdeck-scrape
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- .gitignore
|
36
|
+
- README.md
|
37
|
+
- Rakefile
|
38
|
+
- bin/spdeck-scrape
|
39
|
+
- lib/spdeck-scrape.rb
|
40
|
+
- lib/spdeck-scrape/spdeck-scraper-class.rb
|
41
|
+
- spdeck-scrape-0.0.6.gem
|
42
|
+
- spdeck-scrape.gemspec
|
43
|
+
- spec/.rspec
|
44
|
+
- spec/spec_helper.rb
|
45
|
+
homepage: http://rubygems.org/gems/spdeck-scrape
|
46
|
+
licenses:
|
47
|
+
- MIT
|
48
|
+
metadata: {}
|
49
|
+
post_install_message: |2
|
50
|
+
You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
|
51
|
+
rdoc_options: []
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 2.0.6
|
67
|
+
signing_key:
|
68
|
+
specification_version: 4
|
69
|
+
summary: Simple scraper for SpeakerDeck
|
70
|
+
test_files: []
|