spdeck-scrape 0.0.61

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f6e658ef89f9252e69359bf19387b9067ff505fa
4
+ data.tar.gz: b788e687930c378e9b61e0857208d412d4afb07e
5
+ SHA512:
6
+ metadata.gz: e536ce69458e7401cc94d311a72759f16270243c4bdf43b2618c13d9a93315bb5619053e8e45d4ae04f4e3a881e0a96cd7b64d40ab772957f271479290560c15
7
+ data.tar.gz: 7f482bce096a34c2c5e68eb9e50a444b3b7df43a773fb7d162475f0f93d57fa7a4d8b76fccf4ef4859ed35b19ac37ce78d9260b1c388715726afd2c10b88c912
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ ref-ignore/*
data/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # spdeck-scrape: Simple SpeakerDeck Scraper
2
+
3
+ This is a simple gem designed to scrape data from SpeakerDeck.com. This is the first gem I have ever built! I wrote it to practice scraping websites and to learn how to build gems.
4
+
5
+ SpeakerDeck.com does not natively allow sorting presentations according to views, so this gem allows you to grab the views data and port it into a database or straight to barebones HTML, sorted in descending order.
6
+
7
+ This gem is still a work in progress!
8
+
9
+ ### Installation
10
+ `gem install spdeck-scrape`
11
+
12
+
13
+ ### Usage
14
+ spdeck-scrape can be used from the command line and also in Ruby scripts.
15
+
16
+ From the command line:
17
+ ```bash
18
+ $ spdeck-scrape [query | range | l or s display]
19
+ ```
20
+ In a Ruby script:
21
+
22
+ - initialize a new `SpeakerdeckScraper` object, specifying the desired query (e.g. `"rails"`).
23
+
24
+ ```ruby
25
+ spd_ruby = SpeakerdeckScraper.new("rails")
26
+ # grabs the titles, authors, views, and links
27
+ ```
28
+ - set the number of query results pages to pull
29
+ ```ruby
30
+ spd_ruby.query_results_scrape(10)
31
+ # pulls the first 10 pages
32
+ ```
33
+ - initiate the scrape
34
+ ```ruby
35
+ spd_ruby.scrape_all
36
+ ```
37
+ - extract the data to basic HTML
38
+ ```ruby
39
+ spd_ruby.html_gen
40
+ # will create a file called 'spd-rails.html' (spd-<query>.html) in the working directory with a table of the results sorted by views descending
41
+ ```
42
+
43
+
44
+
45
+ ### Classes
46
+
47
+ SpeakerdeckScraper
48
+ Presentations
49
+ SPDatabase
50
+ SPHTMLGen
51
+
52
+ ### Methods
53
+
data/Rakefile ADDED
File without changes
data/bin/spdeck-scrape ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'spdeck-scrape'
4
+
5
+ # usage: $ spdeck-scrape my_query my_range [verbose | concise] -html[optional for html gen]
6
+ if ARGV.empty?
7
+ puts "\n\n------- spdeck-scrape: ERROR! --------"
8
+ puts " Usage:"
9
+ puts " Please specify a query, range, and display option (if desired):\n"
10
+ puts " spdeck-scrape my_query an_integer [options]"
11
+ puts " Options:
12
+ -v # verbose display while running"
13
+ puts " -c # concise display"
14
+ puts " -html # include this tag to print data to an HTML file (must also include a display option)\n"
15
+ puts " Example:"
16
+ puts " spdeck-scrape ruby 15 -v -html\n\n"
17
+ else
18
+ query = ARGV[0]
19
+ ARGV[1].nil? ? range = 5 : range = ARGV[1].to_i
20
+ display = ARGV[2] || '-c'
21
+
22
+ user = SpeakerdeckScraper.new(query, range, display)
23
+
24
+ user.query_results_scrape(range)
25
+ user.scrape_all
26
+ if ARGV[3] == ("-html")
27
+ user.html_gen
28
+ system("open spd-#{query}.html")
29
+ puts
30
+ end
31
+
32
+ end
33
+
34
+
@@ -0,0 +1,29 @@
1
+ require_relative './spdeck-scrape/spdeck-scraper-class.rb'
2
+ #require 'spdeck-scrape'
3
+
4
+ # this file is loaded and run when 'spdeck-scrape' is required in a script
5
+
6
+ # test code
7
+
8
+ # scraper = SpeakerdeckScraper.new(, "ruby")
9
+ # scraper.query_results_scrape(3)
10
+ # scraper.scrape_all
11
+ # File.open('spd-ruby-raw', 'w') do |file|
12
+ # file.write(scraper.presentations)
13
+ # end
14
+
15
+
16
+ # scraper.html_gen
17
+
18
+ # scraper2 = SpeakerdeckScraper.new("https://speakerdeck.com/", "json")
19
+ # scraper2.query_results_scrape(2)
20
+ # scraper2.scrape_all
21
+ # File.open('spd-json-raw', 'w') do |file|
22
+ # file.write(scraper.presentations)
23
+ # end
24
+
25
+ # scraper2.html_gen
26
+
27
+ # system("open spd-ruby.html spd-json.html")
28
+
29
+ # initialize a scraper with a website and a query
@@ -0,0 +1,174 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'pry'
4
+
# Scrapes SpeakerDeck (speakerdeck.com) search results and individual
# presentation pages, collecting title/author/date/category/view-count
# data, and can dump the results to a basic HTML table sorted by views
# in descending order.
class SpeakerdeckScraper

  attr_reader :page_object, :presentations, :url
  attr_accessor :start_time, :end_time, :opts, :query, :display

  # Retained for backward compatibility: query_results_scrape now builds
  # every page URL (including page 1) from @query instead of using this
  # hard-coded "ruby" search.
  SD_QUERY_FIRST_PAGE = "https://speakerdeck.com/search?q=ruby"
  SD_DOMAIN = "https://speakerdeck.com"

  # query   - the search term to scrape results for
  # range   - how many pages of search results to walk (default 5)
  # display - '-v' for verbose progress output, '-c' for concise marks
  def initialize(query, range = 5, display = '-v')
    @url = "https://speakerdeck.com/"
    @query = query
    @page_object = ''
    # pres_id => pres_link while collecting; scrape_all later replaces
    # each link with a details hash.
    @presentations = {}
    @start_time = Time.now
    @range = range
    @display = display
  end

  # Scrapes search-result pages 1..range into @presentations
  # (pres_id => pres_link). Errors are reported but not raised, since
  # running off the end of the available result pages is expected.
  def query_results_scrape(range)
    puts "grabbing presentations"
    begin
      # FIX: page 1 must go through the same page-number URL as every
      # other page. Previously the SD_QUERY_FIRST_PAGE *URL* was passed
      # where a page number was expected, producing a nonsense request
      # (and always searching for "ruby" regardless of @query).
      (1..range).each { |page| single_results_page_scrape(page) }
    rescue StandardError => e
      # best-effort: report what happened instead of a bare rescue
      puts "error! prob nothing to worry about (#{e.class}: #{e.message})"
    end
    puts "\ncool! we got #{presentations.length} presentations"
  end

  # Dumps one query-results page into the hash,
  # presentations = { 'pres-id' => 'pres_link.html' }.
  # Not called explicitly -- lives in the query-scrape wrapper above.
  def single_results_page_scrape(i)
    # URI.open: Kernel#open no longer accepts URLs on Ruby >= 3.0
    doc = Nokogiri::HTML(URI.open("#{self.url}search?page=#{i}&q=#{query}"))
    doc.css('div.talk').each do |presentation|
      # data-id ensures a unique key in the hash
      pres_id = presentation.attr('data-id')

      pres_link = presentation.css('h3.title a').attr('href').text

      pres_title = presentation.css('h3.title').text.strip
      author_name = presentation.parent.css('h3.title a').last.text
      verbose_display(pres_title, author_name) if self.display == "-v"
      concise_display if self.display == "-c"

      self.presentations[pres_id] = pres_link
    end
  end

  #### display options ############

  # Prints one enthusiastic progress line per grabbed presentation.
  def verbose_display(pres_title, author_name)
    good_words = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy","really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius","incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"]
    puts "grabbed a #{good_words[rand(good_words.length)]} presentation #{pres_title} by #{author_name}"
    sleep(0.02)
  end

  # Prints a single '#' progress mark.
  def concise_display
    print "#"
    sleep(0.02)
  end
  #### display options end ##########

  # Wrapper to run the single-page scraper for every collected link.
  def scrape_all
    puts "reading presentation data"
    self.presentations.each do |id, link|
      pres_page_scrape(id, link)
    end
    self.end_time = Time.now
  end

  # Grabs data from one presentation page and replaces the bare link in
  # @presentations with a details hash.
  # NOTE: this is a time-consuming process -- each page must be opened
  # individually (necessary because the views data isn't stored on the
  # query pages).
  def pres_page_scrape(id, pres_link)
    pres_page = Nokogiri::HTML(URI.open("#{SD_DOMAIN}#{pres_link}"))

    presentations[id] = {
      :title => pres_title(pres_page),
      :link => pres_link,
      :date => pres_date(pres_page),
      :author => pres_author(pres_page),
      :author_link => pres_author_link(pres_page),
      :category => pres_category(pres_page),
      :views => pres_views(pres_page)
    }

    if self.display == '-c'
      concise_display
    else
      puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
    end
  end

  # View count as an Integer: digits only, so "1,234 views" => 1234.
  def pres_views(pres_page)
    pres_page.css('li.views').text.scan(/\d+/).join.to_i
  end

  def pres_title(pres_page)
    pres_page.css('div#content header h1').text
  end

  def pres_author(pres_page)
    pres_page.css('div#content header h2 a').text
  end

  def pres_author_link(pres_page)
    pres_page.css('div#content header h2 a').attr('href').text
  end

  # FIX: guard against pages with no <mark> node; returns nil instead of
  # raising NoMethodError on nil.first.text.
  def pres_date(pres_page)
    mark = pres_page.css('div#talk-details mark').first
    mark && mark.text.strip
  end

  def pres_category(pres_page)
    pres_page.css('div#talk-details mark a').text
  end

  # Writes spd-<query>.html: a table of all scraped presentations sorted
  # by view count, descending.
  def html_gen
    # take data and sort it by views descending
    sorted_array = self.presentations.values.sort_by do |pres_hash|
      pres_hash[:views]
    end.reverse

    File.open("spd-#{query}.html", "w") do |file|
      # FIX: the <h4> header tag was never closed
      file.write(<<-HTML
      <html>
      <head>
      </head>
      <body>
      <h1>speakerdeck presentations - #{query}</h1>
      <h4>this site was generated in #{self.end_time - self.start_time} seconds (last queried at #{self.start_time})</h4>
      <table class="tablesorter" border="1">
      <tr>
      <th>title</th>
      <th>date</th>
      <th>category</th>
      <th>author</th>
      <th>views</th>
      </tr>
      HTML
      )
      sorted_array.each do |content_hash|
        link = "#{SD_DOMAIN}#{content_hash[:link]}"
        author_link = "#{SD_DOMAIN}#{content_hash[:author_link]}"
        file.write(<<-HTML
        <tr>
        <td><a href=#{link}>#{content_hash[:title]}</a></td>
        <td>#{content_hash[:date]}</td>
        <td><a href="https://speakerdeck.com/c/#{content_hash[:category].downcase}">#{content_hash[:category]}</a></td>
        <td><a href=#{author_link}>#{content_hash[:author]}</a></td>
        <td>#{content_hash[:views]}</td>
        </tr>
        HTML
        )
      end
      file.write(<<-HTML
      </table>
      </body>
      </html>
      HTML
      )
    end
  end

# class end
end
173
+
174
+
Binary file
@@ -0,0 +1,20 @@
# gemspec for spdeck-scrape (work in progress)
Gem::Specification.new do |s|
  s.name        = 'spdeck-scrape'
  s.executables << 'spdeck-scrape'
  s.version     = '0.0.61'
  s.date        = '2013-10-11'
  s.summary     = "Simple scraper for SpeakerDeck"
  s.description = "Generate data organized by viewcount for a database or webpages about presentations posted on SpeakerDeck (SpeakerDeck.com)"
  s.author      = "Joe O'Conor"
  s.email       = 'joe.oconor@gmail.com'
  s.files       = `git ls-files`.split("\n")
  s.homepage    = 'http://rubygems.org/gems/spdeck-scrape'
  s.license     = 'MIT'
  s.require_path = 'lib'
  # FIX: was `s.add_runtime_dependency 'nokogiri' >= '1.6.0'`, which
  # compared the two strings ('nokogiri' >= '1.6.0' #=> true) and passed
  # the result as the dependency name -- the shipped gem metadata shows a
  # bogus runtime dependency literally named 'true' with no version
  # constraint. The requirement must be a separate argument.
  s.add_runtime_dependency 'nokogiri', '>= 1.6.0'
  s.post_install_message = <<-JNO
    You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
  JNO
end
data/spec/.rspec ADDED
File without changes
File without changes
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spdeck-scrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.61
5
+ platform: ruby
6
+ authors:
7
+ - Joe O'Conor
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: Generate data organized by viewcount for a database or webpages about
28
+ presentations posted on SpeakerDeck (SpeakerDeck.com)
29
+ email: joe.oconor@gmail.com
30
+ executables:
31
+ - spdeck-scrape
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - .gitignore
36
+ - README.md
37
+ - Rakefile
38
+ - bin/spdeck-scrape
39
+ - lib/spdeck-scrape.rb
40
+ - lib/spdeck-scrape/spdeck-scraper-class.rb
41
+ - spdeck-scrape-0.0.6.gem
42
+ - spdeck-scrape.gemspec
43
+ - spec/.rspec
44
+ - spec/spec_helper.rb
45
+ homepage: http://rubygems.org/gems/spdeck-scrape
46
+ licenses:
47
+ - MIT
48
+ metadata: {}
49
+ post_install_message: |2
50
+ You have installed spdeck-scrape! See the README at https://github.com/jnoconor/spdeck-scrape for more information.
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.0.6
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Simple scraper for SpeakerDeck
70
+ test_files: []