spdeck-scrape 0.0.61

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f6e658ef89f9252e69359bf19387b9067ff505fa
4
+ data.tar.gz: b788e687930c378e9b61e0857208d412d4afb07e
5
+ SHA512:
6
+ metadata.gz: e536ce69458e7401cc94d311a72759f16270243c4bdf43b2618c13d9a93315bb5619053e8e45d4ae04f4e3a881e0a96cd7b64d40ab772957f271479290560c15
7
+ data.tar.gz: 7f482bce096a34c2c5e68eb9e50a444b3b7df43a773fb7d162475f0f93d57fa7a4d8b76fccf4ef4859ed35b19ac37ce78d9260b1c388715726afd2c10b88c912
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ ref-ignore/*
data/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # spdeck-scrape: Simple SpeakerDeck Scraper
2
+
3
+ This is a simple gem designed to scrape data from SpeakerDeck.com. This is the first gem I have ever built! I wrote it to practice scraping websites and to learn how to build gems.
4
+
5
+ SpeakerDeck.com does not natively allow sorting presentations according to views, so this gem allows you to grab the views data and port it into a database or straight to barebones HTML, sorted in descending order.
6
+
7
+ This gem is still a work in progress!
8
+
9
+ ### Installation
10
+ `gem install spdeck-scrape`
11
+
12
+
13
+ ### Usage
14
+ spdeck-scrape can be used from the command line and also in Ruby scripts.
15
+
16
+ From the command line:
17
+ ```bash
18
+ $ spdeck-scrape [query | range | l or s display]
19
+ ```
20
+ In a Ruby script:
21
+
22
+ - initialize a new `SpeakerdeckScraper` object, specifying the desired query (e.g. `"rails"`).
23
+
24
+ ```ruby
25
+ spd_ruby = SpeakerdeckScraper.new("rails")
26
+ # grabs the titles, authors, views, and links
27
+ ```
28
+ - set the number of query results pages to pull
29
+ ```ruby
30
+ spd_ruby.query_results_scrape(10)
31
+ # pulls the first 10 pages
32
+ ```
33
+ - initiate the scrape
34
+ ```ruby
35
+ spd_ruby.scrape_all
36
+ ```
37
+ - extract the data to basic HTML
38
+ ```ruby
39
+ spd_ruby.html_gen
40
+ # will create a file called 'spd-rails.html' (spd-<query>.html) in the working directory with a table of the results sorted by views descending
41
+ ```
42
+
43
+
44
+
45
+ ### Classes
46
+
47
+ SpeakerdeckScraper
48
+ Presentations
49
+ SPDatabase
50
+ SPHTMLGen
51
+
52
+ ### Methods
53
+
data/Rakefile ADDED
File without changes
data/bin/spdeck-scrape ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'spdeck-scrape'
4
+
5
+ # usage: $ spdeck-scrape my_query my_range [verbose | concise] -html[optional for html gen]
6
+ if ARGV.empty?
7
+ puts "\n\n------- spdeck-scrape: ERROR! --------"
8
+ puts " Usage:"
9
+ puts " Please specify a query, range, and display option (if desired):\n"
10
+ puts " spdeck-scrape my_query an_integer [options]"
11
+ puts " Options:
12
+ -v # verbose display while running"
13
+ puts " -c # concise display"
14
+ puts " -html # include this tag to print data to an HTML file (must also include a display option)\n"
15
+ puts " Example:"
16
+ puts " spdeck-scrape ruby 15 -v -html\n\n"
17
+ else
18
+ query = ARGV[0]
19
+ ARGV[1].nil? ? range = 5 : range = ARGV[1].to_i
20
+ display = ARGV[2] || '-c'
21
+
22
+ user = SpeakerdeckScraper.new(query, range, display)
23
+
24
+ user.query_results_scrape(range)
25
+ user.scrape_all
26
+ if ARGV[3] == ("-html")
27
+ user.html_gen
28
+ system("open spd-#{query}.html")
29
+ puts
30
+ end
31
+
32
+ end
33
+
34
+
@@ -0,0 +1,29 @@
1
+ require_relative './spdeck-scrape/spdeck-scraper-class.rb'
2
+ #require 'spdeck-scrape'
3
+
4
+ # this file is loaded and run when 'spdeck-scrape' is required in a script
5
+
6
+ # test code
7
+
8
+ # scraper = SpeakerdeckScraper.new(, "ruby")
9
+ # scraper.query_results_scrape(3)
10
+ # scraper.scrape_all
11
+ # File.open('spd-ruby-raw', 'w') do |file|
12
+ # file.write(scraper.presentations)
13
+ # end
14
+
15
+
16
+ # scraper.html_gen
17
+
18
+ # scraper2 = SpeakerdeckScraper.new("https://speakerdeck.com/", "json")
19
+ # scraper2.query_results_scrape(2)
20
+ # scraper2.scrape_all
21
+ # File.open('spd-json-raw', 'w') do |file|
22
+ # file.write(scraper.presentations)
23
+ # end
24
+
25
+ # scraper2.html_gen
26
+
27
+ # system("open spd-ruby.html spd-json.html")
28
+
29
+ # initialize a scraper with a website and a query
@@ -0,0 +1,174 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'pry'
4
+
# Scrapes SpeakerDeck (speakerdeck.com) search results and individual
# presentation pages, collecting title/author/date/category/view-count
# data, and can dump the results to a basic HTML table sorted by views
# in descending order.
class SpeakerdeckScraper

  attr_reader :page_object, :presentations, :url
  attr_accessor :start_time, :end_time, :opts, :query, :display

  # Retained for backward compatibility: query_results_scrape now builds
  # every page URL (including page 1) from @query instead of using this
  # hard-coded "ruby" search.
  SD_QUERY_FIRST_PAGE = "https://speakerdeck.com/search?q=ruby"
  SD_DOMAIN = "https://speakerdeck.com"

  # query   - the search term to scrape results for
  # range   - how many pages of search results to walk (default 5)
  # display - '-v' for verbose progress output, '-c' for concise marks
  def initialize(query, range = 5, display = '-v')
    @url = "https://speakerdeck.com/"
    @query = query
    @page_object = ''
    # pres_id => pres_link while collecting; scrape_all later replaces
    # each link with a details hash.
    @presentations = {}
    @start_time = Time.now
    @range = range
    @display = display
  end

  # Scrapes search-result pages 1..range into @presentations
  # (pres_id => pres_link). Errors are reported but not raised, since
  # running off the end of the available result pages is expected.
  def query_results_scrape(range)
    puts "grabbing presentations"
    begin
      # FIX: page 1 must go through the same page-number URL as every
      # other page. Previously the SD_QUERY_FIRST_PAGE *URL* was passed
      # where a page number was expected, producing a nonsense request
      # (and always searching for "ruby" regardless of @query).
      (1..range).each { |page| single_results_page_scrape(page) }
    rescue StandardError => e
      # best-effort: report what happened instead of a bare rescue
      puts "error! prob nothing to worry about (#{e.class}: #{e.message})"
    end
    puts "\ncool! we got #{presentations.length} presentations"
  end

  # Dumps one query-results page into the hash,
  # presentations = { 'pres-id' => 'pres_link.html' }.
  # Not called explicitly -- lives in the query-scrape wrapper above.
  def single_results_page_scrape(i)
    # URI.open: Kernel#open no longer accepts URLs on Ruby >= 3.0
    doc = Nokogiri::HTML(URI.open("#{self.url}search?page=#{i}&q=#{query}"))
    doc.css('div.talk').each do |presentation|
      # data-id ensures a unique key in the hash
      pres_id = presentation.attr('data-id')

      pres_link = presentation.css('h3.title a').attr('href').text

      pres_title = presentation.css('h3.title').text.strip
      author_name = presentation.parent.css('h3.title a').last.text
      verbose_display(pres_title, author_name) if self.display == "-v"
      concise_display if self.display == "-c"

      self.presentations[pres_id] = pres_link
    end
  end

  #### display options ############

  # Prints one enthusiastic progress line per grabbed presentation.
  def verbose_display(pres_title, author_name)
    good_words = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy","really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius","incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"]
    puts "grabbed a #{good_words[rand(good_words.length)]} presentation #{pres_title} by #{author_name}"
    sleep(0.02)
  end

  # Prints a single '#' progress mark.
  def concise_display
    print "#"
    sleep(0.02)
  end
  #### display options end ##########

  # Wrapper to run the single-page scraper for every collected link.
  def scrape_all
    puts "reading presentation data"
    self.presentations.each do |id, link|
      pres_page_scrape(id, link)
    end
    self.end_time = Time.now
  end

  # Grabs data from one presentation page and replaces the bare link in
  # @presentations with a details hash.
  # NOTE: this is a time-consuming process -- each page must be opened
  # individually (necessary because the views data isn't stored on the
  # query pages).
  def pres_page_scrape(id, pres_link)
    pres_page = Nokogiri::HTML(URI.open("#{SD_DOMAIN}#{pres_link}"))

    presentations[id] = {
      :title => pres_title(pres_page),
      :link => pres_link,
      :date => pres_date(pres_page),
      :author => pres_author(pres_page),
      :author_link => pres_author_link(pres_page),
      :category => pres_category(pres_page),
      :views => pres_views(pres_page)
    }

    if self.display == '-c'
      concise_display
    else
      puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
    end
  end

  # View count as an Integer: digits only, so "1,234 views" => 1234.
  def pres_views(pres_page)
    pres_page.css('li.views').text.scan(/\d+/).join.to_i
  end

  def pres_title(pres_page)
    pres_page.css('div#content header h1').text
  end

  def pres_author(pres_page)
    pres_page.css('div#content header h2 a').text
  end

  def pres_author_link(pres_page)
    pres_page.css('div#content header h2 a').attr('href').text
  end

  # FIX: guard against pages with no <mark> node; returns nil instead of
  # raising NoMethodError on nil.first.text.
  def pres_date(pres_page)
    mark = pres_page.css('div#talk-details mark').first
    mark && mark.text.strip
  end

  def pres_category(pres_page)
    pres_page.css('div#talk-details mark a').text
  end

  # Writes spd-<query>.html: a table of all scraped presentations sorted
  # by view count, descending.
  def html_gen
    # take data and sort it by views descending
    sorted_array = self.presentations.values.sort_by do |pres_hash|
      pres_hash[:views]
    end.reverse

    File.open("spd-#{query}.html", "w") do |file|
      # FIX: the <h4> header tag was never closed
      file.write(<<-HTML
      <html>
      <head>
      </head>
      <body>
      <h1>speakerdeck presentations - #{query}</h1>
      <h4>this site was generated in #{self.end_time - self.start_time} seconds (last queried at #{self.start_time})</h4>
      <table class="tablesorter" border="1">
      <tr>
      <th>title</th>
      <th>date</th>
      <th>category</th>
      <th>author</th>
      <th>views</th>
      </tr>
      HTML
      )
      sorted_array.each do |content_hash|
        link = "#{SD_DOMAIN}#{content_hash[:link]}"
        author_link = "#{SD_DOMAIN}#{content_hash[:author_link]}"
        file.write(<<-HTML
        <tr>
        <td><a href=#{link}>#{content_hash[:title]}</a></td>
        <td>#{content_hash[:date]}</td>
        <td><a href="https://speakerdeck.com/c/#{content_hash[:category].downcase}">#{content_hash[:category]}</a></td>
        <td><a href=#{author_link}>#{content_hash[:author]}</a></td>
        <td>#{content_hash[:views]}</td>
        </tr>
        HTML
        )
      end
      file.write(<<-HTML
      </table>
      </body>
      </html>
      HTML
      )
    end
  end

# class end
end
173
+
174
+
Binary file
@@ -0,0 +1,20 @@
# gemspec for spdeck-scrape (work in progress)
Gem::Specification.new do |s|
  s.name        = 'spdeck-scrape'
  s.executables << 'spdeck-scrape'
  s.version     = '0.0.61'
  s.date        = '2013-10-11'
  s.summary     = "Simple scraper for SpeakerDeck"
  s.description = "Generate data organized by viewcount for a database or webpages about presentations posted on SpeakerDeck (SpeakerDeck.com)"
  s.author      = "Joe O'Conor"
  s.email       = 'joe.oconor@gmail.com'
  s.files       = `git ls-files`.split("\n")
  s.homepage    = 'http://rubygems.org/gems/spdeck-scrape'
  s.license     = 'MIT'
  s.require_path = 'lib'
  # FIX: was `s.add_runtime_dependency 'nokogiri' >= '1.6.0'`, which
  # compared the two strings ('nokogiri' >= '1.6.0' #=> true) and passed
  # the result as the dependency name -- the shipped gem metadata shows a
  # bogus runtime dependency literally named 'true' with no version
  # constraint. The requirement must be a separate argument.
  s.add_runtime_dependency 'nokogiri', '>= 1.6.0'
  s.post_install_message = <<-JNO
    You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
  JNO
end
data/spec/.rspec ADDED
File without changes
File without changes
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spdeck-scrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.61
5
+ platform: ruby
6
+ authors:
7
+ - Joe O'Conor
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: Generate data organized by viewcount for a database or webpages about
28
+ presentations posted on SpeakerDeck (SpeakerDeck.com)
29
+ email: joe.oconor@gmail.com
30
+ executables:
31
+ - spdeck-scrape
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - .gitignore
36
+ - README.md
37
+ - Rakefile
38
+ - bin/spdeck-scrape
39
+ - lib/spdeck-scrape.rb
40
+ - lib/spdeck-scrape/spdeck-scraper-class.rb
41
+ - spdeck-scrape-0.0.6.gem
42
+ - spdeck-scrape.gemspec
43
+ - spec/.rspec
44
+ - spec/spec_helper.rb
45
+ homepage: http://rubygems.org/gems/spdeck-scrape
46
+ licenses:
47
+ - MIT
48
+ metadata: {}
49
+ post_install_message: |2
50
+ You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.0.6
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Simple scraper for SpeakerDeck
70
+ test_files: []