spdeck-scrape 0.0.61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f6e658ef89f9252e69359bf19387b9067ff505fa
4
+ data.tar.gz: b788e687930c378e9b61e0857208d412d4afb07e
5
+ SHA512:
6
+ metadata.gz: e536ce69458e7401cc94d311a72759f16270243c4bdf43b2618c13d9a93315bb5619053e8e45d4ae04f4e3a881e0a96cd7b64d40ab772957f271479290560c15
7
+ data.tar.gz: 7f482bce096a34c2c5e68eb9e50a444b3b7df43a773fb7d162475f0f93d57fa7a4d8b76fccf4ef4859ed35b19ac37ce78d9260b1c388715726afd2c10b88c912
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ ref-ignore/*
data/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # spdeck-scrape: Simple SpeakerDeck Scraper
2
+
3
+ This is a simple gem designed to scrape data from SpeakerDeck.com. This is the first gem I have ever built! I wrote it to practice scraping websites and to learn how to build gems.
4
+
5
+ SpeakerDeck.com does not natively allow sorting presentations according to views, so this gem allows you to grab the views data and port it into a database or straight to barebones HTML, sorted in descending order.
6
+
7
+ This gem is still a work in progress!
8
+
9
+ ### Installation
10
+ `gem install spdeck-scrape`
11
+
12
+
13
+ ### Usage
14
+ spdeck-scrape can be used from the command line and also in Ruby scripts.
15
+
16
+ From the command line:
17
+ ```bash
18
+ $ spdeck-scrape [query | range | l or s display]
19
+ ```
20
+ In a Ruby script:
21
+
22
+ - initialize a new `SpeakerdeckScraper` object specifying the desired query. It defaults to Ruby.
23
+
24
+ ```ruby
25
+ spd_ruby = SpeakerdeckScraper.new("rails")
26
+ # grabs the titles, authors, views, and links
27
+ ```
28
+ - set the number of query results pages to pull
29
+ ```ruby
30
+ spd_ruby.query_results_scrape(10)
31
+ # pulls the first 10 pages
32
+ ```
33
+ - initiate the scrape
34
+ ```ruby
35
+ spd_ruby.scrape_all
36
+ ```
37
+ - extract the data to basic HTML
38
+ ```ruby
39
+ spd_ruby.html_gen
40
+ # will create a file called 'spd-ruby.html' in the working directory with a table of the results sorted by views descending
41
+ ```
42
+
43
+
44
+
45
+ ### Classes
46
+
47
+ SpeakerdeckScraper
48
+ Presentations
49
+ SPDatabase
50
+ SPHTMLGen
51
+
52
+ ### Methods
53
+
data/Rakefile ADDED
File without changes
data/bin/spdeck-scrape ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'spdeck-scrape'
4
+
5
+ # usage: $ spdeck-scrape my_query my_range [verbose | concise] -html[optional for html gen]
6
+ if ARGV.empty?
7
+ puts "\n\n------- spdeck-scrape: ERROR! --------"
8
+ puts " Usage:"
9
+ puts " Please specify a query, range, and display option (if desired):\n"
10
+ puts " spdeck-scrape my_query an_integer [options]"
11
+ puts " Options:
12
+ -v # verbose display while running"
13
+ puts " -c # concise display"
14
+ puts " -html # include this tag to print data to an HTML file (must also include a display option)\n"
15
+ puts " Example:"
16
+ puts " spdeck-scrape ruby 15 -v -html\n\n"
17
+ else
18
+ query = ARGV[0]
19
+ ARGV[1].nil? ? range = 5 : range = ARGV[1].to_i
20
+ display = ARGV[2] || '-c'
21
+
22
+ user = SpeakerdeckScraper.new(query, range, display)
23
+
24
+ user.query_results_scrape(range)
25
+ user.scrape_all
26
+ if ARGV[3] == ("-html")
27
+ user.html_gen
28
+ system("open spd-#{query}.html")
29
+ puts
30
+ end
31
+
32
+ end
33
+
34
+
@@ -0,0 +1,29 @@
1
+ require_relative './spdeck-scrape/spdeck-scraper-class.rb'
2
+ #require 'spdeck-scrape'
3
+
4
+ # this file is loaded and run when 'spdeck-scrape' is required in a script
5
+
6
+ # test code
7
+
8
+ # scraper = SpeakerdeckScraper.new(, "ruby")
9
+ # scraper.query_results_scrape(3)
10
+ # scraper.scrape_all
11
+ # File.open('spd-ruby-raw', 'w') do |file|
12
+ # file.write(scraper.presentations)
13
+ # end
14
+
15
+
16
+ # scraper.html_gen
17
+
18
+ # scraper2 = SpeakerdeckScraper.new("https://speakerdeck.com/", "json")
19
+ # scraper2.query_results_scrape(2)
20
+ # scraper2.scrape_all
21
+ # File.open('spd-json-raw', 'w') do |file|
22
+ # file.write(scraper.presentations)
23
+ # end
24
+
25
+ # scraper2.html_gen
26
+
27
+ # system("open spd-ruby.html spd-json.html")
28
+
29
+ # initialize a scraper with a website and a query
@@ -0,0 +1,174 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'pry'
4
+
5
+ class SpeakerdeckScraper
6
+
7
+ attr_reader :page_object, :presentations, :url
8
+ attr_accessor :start_time, :end_time, :opts, :query, :display
9
+
10
+ SD_QUERY_FIRST_PAGE = "https://speakerdeck.com/search?q=ruby"
11
+ SD_DOMAIN = "https://speakerdeck.com"
12
+
13
+ def initialize(query, range = 5, display = '-v')
14
+ @url = "https://speakerdeck.com/"
15
+ @query = query
16
+ @page_object = ''
17
+ @presentations = {}
18
+ @start_time = Time.now
19
+ @range = range
20
+ @display = display
21
+ end
22
+
23
+ def query_results_scrape(range)
24
+ puts "grabbing presentations"
25
+ begin
26
+ single_results_page_scrape(SD_QUERY_FIRST_PAGE)
27
+ (2..range).collect do |i|
28
+ single_results_page_scrape(i)
29
+ end
30
+ rescue
31
+ puts "error! prob nothing to worry about"
32
+ end
33
+ puts "\ncool! we got #{presentations.length} presentations"
34
+ end
35
+
36
+ # dumps the query results into a hash, presentations = { 'pres title' => 'pres_link.html' }
37
+ # not called explicitly, lives in query scrape wrapper
38
+ def single_results_page_scrape(i)
39
+ doc = Nokogiri::HTML(open "#{self.url}search?page=#{i}&q=#{query}")
40
+ doc.css('div.talk').each do |presentation|
41
+ # ensures a unique key in the hash
42
+ pres_id = presentation.attr('data-id')
43
+
44
+ pres_link = presentation.css('h3.title a').attr('href').text
45
+
46
+ pres_title = presentation.css('h3.title').text.strip
47
+ author_name = presentation.parent.css('h3.title a').last.text
48
+ verbose_display(pres_title, author_name) if self.display == "-v"
49
+ concise_display if self.display == "-c"
50
+
51
+ self.presentations[pres_id] = pres_link
52
+ end
53
+ end
54
+
55
+ #### display options ############
56
+ def verbose_display(pres_title, author_name)
57
+ good_words = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy","really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius","incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"]
58
+ puts "grabbed a #{good_words[rand(good_words.length)]} presentation #{pres_title} by #{author_name}"
59
+ sleep(0.02)
60
+ end
61
+
62
+ def concise_display
63
+ print "#"
64
+ sleep(0.02)
65
+ end
66
+ #### display options end ##########
67
+
68
+ # wrapper to run the single page scraper for all links
69
+ def scrape_all
70
+ puts "reading presentation data"
71
+ self.presentations.each do |id, link|
72
+ pres_page_scrape(id, link)
73
+ end
74
+ self.end_time = Time.now
75
+ end
76
+
77
+ # grab data from one page
78
+ # note: this is a time consuming process -- have to open each page (but necessary because the views data isn't stored on the query pages)
79
+ def pres_page_scrape(id, pres_link)
80
+ pres_page = Nokogiri::HTML(open("https://speakerdeck.com#{pres_link}"))
81
+
82
+ presentations[id] = {
83
+ :title => pres_title(pres_page),
84
+ :link => pres_link,
85
+ :date => pres_date(pres_page),
86
+ :author => pres_author(pres_page),
87
+ :author_link => pres_author_link(pres_page),
88
+ :category => pres_category(pres_page),
89
+ :views => pres_views(pres_page)
90
+ }
91
+
92
+ if self.display == '-c'
93
+ concise_display
94
+ else
95
+ puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
96
+ end
97
+ end
98
+
99
+ def pres_views(pres_page)
100
+ pres_page.css('li.views').text.scan(/\d+/).join.to_i
101
+ end
102
+
103
+ def pres_title(pres_page)
104
+ pres_page.css('div#content header h1').text
105
+
106
+ end
107
+
108
+ def pres_author(pres_page)
109
+ pres_page.css('div#content header h2 a').text
110
+ end
111
+
112
+ def pres_author_link(pres_page)
113
+ pres_page.css('div#content header h2 a').attr('href').text
114
+ end
115
+
116
+ def pres_date(pres_page)
117
+ pres_page.css('div#talk-details mark').first.text.strip
118
+ end
119
+
120
+ def pres_category(pres_page)
121
+ pres_page.css('div#talk-details mark a').text
122
+ end
123
+
124
+ def html_gen
125
+ # take data and sort it by views descending
126
+ sorted_array = self.presentations.values.sort_by do |pres_hash|
127
+ pres_hash[:views]
128
+ end.reverse
129
+
130
+ File.open("spd-#{query}.html", "w") do |file|
131
+ file.write( <<-HTML
132
+ <html>
133
+ <head>
134
+ </head>
135
+ <body>
136
+ <h1>speakerdeck presentations - #{query}</h1>
137
+ <h4>this site was generated in #{self.end_time - self.start_time} seconds (last queried at #{self.start_time})
138
+ <table class="tablesorter" border="1">
139
+ <tr>
140
+ <th>title</th>
141
+ <th>date</th>
142
+ <th>category</th>
143
+ <th>author</th>
144
+ <th>views</th>
145
+ </tr>
146
+ HTML
147
+ )
148
+ sorted_array.each do |content_hash|
149
+ link = "#{SD_DOMAIN}#{content_hash[:link]}"
150
+ author_link = "#{SD_DOMAIN}#{content_hash[:author_link]}"
151
+ file.write ( <<-HTML
152
+ <tr>
153
+ <td><a href=#{link}>#{content_hash[:title]}</a></td>
154
+ <td>#{content_hash[:date]}</td>
155
+ <td><a href="https://speakerdeck.com/c/#{content_hash[:category].downcase}">#{content_hash[:category]}</a></td>
156
+ <td><a href=#{author_link}>#{content_hash[:author]}</a></td>
157
+ <td>#{content_hash[:views]}</td>
158
+ </tr>
159
+ HTML
160
+ )
161
+ end
162
+ file.write(<<-HTML
163
+ </table>
164
+ </body>
165
+ </html>
166
+ HTML
167
+ )
168
+ end
169
+ end
170
+
171
+ # class end
172
+ end
173
+
174
+
Binary file
@@ -0,0 +1,20 @@
1
+ # in progress
2
+ Gem::Specification.new do |s|
3
+ s.name = 'spdeck-scrape'
4
+ s.executables << 'spdeck-scrape'
5
+ s.version = '0.0.61'
6
+ s.date = '2013-10-11'
7
+ s.summary = "Simple scraper for SpeakerDeck"
8
+ s.description = "Generate data organized by viewcount for a database or webpages about presentations posted on SpeakerDeck (SpeakerDeck.com)"
9
+ s.author = "Joe O'Conor"
10
+ s.email = 'joe.oconor@gmail.com'
11
+ s.files = `git ls-files`.split("\n")
12
+ s.homepage =
13
+ 'http://rubygems.org/gems/spdeck-scrape'
14
+ s.license = 'MIT'
15
+ s.require_path = 'lib'
16
+ s.add_runtime_dependency 'nokogiri' >= '1.6.0'
17
+ s.post_install_message = <<-JNO
18
+ You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
19
+ JNO
20
+ end
data/spec/.rspec ADDED
File without changes
File without changes
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spdeck-scrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.61
5
+ platform: ruby
6
+ authors:
7
+ - Joe O'Conor
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: Generate data organized by viewcount for a database or webpages about
28
+ presentations posted on SpeakerDeck (SpeakerDeck.com)
29
+ email: joe.oconor@gmail.com
30
+ executables:
31
+ - spdeck-scrape
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - .gitignore
36
+ - README.md
37
+ - Rakefile
38
+ - bin/spdeck-scrape
39
+ - lib/spdeck-scrape.rb
40
+ - lib/spdeck-scrape/spdeck-scraper-class.rb
41
+ - spdeck-scrape-0.0.6.gem
42
+ - spdeck-scrape.gemspec
43
+ - spec/.rspec
44
+ - spec/spec_helper.rb
45
+ homepage: http://rubygems.org/gems/spdeck-scrape
46
+ licenses:
47
+ - MIT
48
+ metadata: {}
49
+ post_install_message: |2
50
+ You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.0.6
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Simple scraper for SpeakerDeck
70
+ test_files: []