RubyGems - rubyscraper - Versions diffs - 0.1.0 → 0.2.0 - Mend

rubyscraper 0.1.0 → 0.2.0

Files changed (9) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6f4cfba8b1442632b6c54f30886d254ff25fbbd3
-  data.tar.gz: 4bc8431e8294d5900819d29735f69650e93448da
+  metadata.gz: e1b7f2d272df18fbf97f0da1c6d655077e86bf06
+  data.tar.gz: d7d09ff13581907abab68aafce6e15b0843bea58
 SHA512:
-  metadata.gz: 73ff93065f3079602dfcc58e35e08686ce4fff2d693b7f80c63e8cd37f011826156731b7d8a42a84472232c4a0c5b18dcb8efde1e7a7ede2ded37c431d22f95f
-  data.tar.gz: 04e30fe957b8d95a25cb0620539eca3b8956eb1c5638c8bd038aa6b461f4868555889b033201b350dc651f5faff7e2e38c2fac86ea98ce4bf13ac76de21834c5
+  metadata.gz: 1f06d2c6e6e91658a90d44fd3bc4ff55b8a2bf99116d016bf4d7afbe2cefc9b6f911f2b7a0bcfc31665aba3d4d5d962eb66e36be264eaf5a18434e39a461ea89
+  data.tar.gz: ad976858fee74b80497619622d76349a34ad586ace4aaa4657134d1fe7f38fabe82ec46d514693ebc7c7b75b3c704a58cb296deca4b3f516b6ab6a462c7e358d

data/Gemfile.lock CHANGED Viewed

@@ -16,10 +16,13 @@ GEM
       rack-test (>= 0.5.4)
       xpath (~> 2.0)
     cliver (0.3.2)
+    coderay (1.1.0)
+    diff-lcs (1.2.5)
     domain_name (0.5.24)
       unf (>= 0.0.5, < 1.0.0)
     http-cookie (1.0.2)
       domain_name (~> 0.5)
+    method_source (0.8.2)
     mime-types (2.4.3)
     mini_portile (0.6.2)
     multi_json (1.11.0)
@@ -31,6 +34,10 @@ GEM
       cliver (~> 0.3.1)
       multi_json (~> 1.0)
       websocket-driver (>= 0.2.0)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
     rack (1.6.0)
     rack-test (0.6.3)
       rack (>= 1.0)
@@ -39,6 +46,20 @@ GEM
       http-cookie (>= 1.0.2, < 2.0)
       mime-types (>= 1.16, < 3.0)
       netrc (~> 0.7)
+    rspec (3.2.0)
+      rspec-core (~> 3.2.0)
+      rspec-expectations (~> 3.2.0)
+      rspec-mocks (~> 3.2.0)
+    rspec-core (3.2.3)
+      rspec-support (~> 3.2.0)
+    rspec-expectations (3.2.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.2.0)
+    rspec-mocks (3.2.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.2.0)
+    rspec-support (3.2.2)
+    slop (3.6.0)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.1)
@@ -53,5 +74,7 @@ PLATFORMS
 DEPENDENCIES
   bundler (~> 1.9)
+  pry
   rake (~> 10.0)
+  rspec (~> 3.0)
   rubyscraper!

data/lib/assets/scrapes.json ADDED Viewed

@@ -0,0 +1,287 @@
+[
+  {
+    "name":"stackoverflow",
+    "base_url":"http://www.careers.stackoverflow.com",
+    "summary":{
+      "url":"/jobs?searchTerm=SEARCHTERM&sort=p",
+      "pagination_fmt":"&pg=",
+      "pagination_start":"1",
+      "pagination_scale":"1",
+      "params":[
+        {
+          "SEARCHTERM":[
+            "ruby",
+            "ruby+on+rails",
+            "javascript"
+          ]
+        }
+      ],
+      "loop":".listResults .-item",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"h3.-title a"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"h3.-title a",
+          "attr":"href"
+        },
+        {
+          "field":"posting_date",
+          "method":"first",
+          "path":"p._muted"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"company",
+          "method":"find",
+          "path":"a.employer"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"span.location"
+        },
+        {
+          "field":"description",
+          "method":"all",
+          "path":"div.description p",
+          "loop_collect":"text",
+          "join":"\n"
+        },
+        {
+          "field":"tags",
+          "method":"all",
+          "path":"div.tags a.post-tag",
+          "loop_collect":"text",
+          "join":", "
+        }
+      ]
+    }
+  },
+  {
+    "name":"rubynow",
+    "base_url":"http://jobs.rubynow.com/",
+    "summary":{
+      "url":"",
+      "no_pagination?":"true",
+      "pagination_fmt":"",
+      "pagination_start":"",
+      "pagination_scale":"",
+      "params":[
+      ],
+      "loop":"ul.jobs li",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"h3 a"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"h3 a",
+          "attr":"href"
+        },
+        {
+          "field":"posting_date",
+          "method":"find",
+          "path":"span.date"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"description",
+          "method":"all",
+          "path":"div#info p",
+          "loop_collect":"text",
+          "join":"\n"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"h2#headline a"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"h3#location"
+        }
+      ]
+    }
+  },
+  {
+    "name":"weworkremotely",
+    "base_url":"https://weworkremotely.com",
+    "summary":{
+      "url":"/categories/2/jobs",
+      "no_pagination?":"true",
+      "pagination_fmt":"",
+      "pagination_start":"",
+      "pagination_scale":"",
+      "params":[
+      ],
+      "loop":"section.jobs ul li",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"span.title"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"span.company"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"a",
+          "attr":"href"
+        },
+        {
+          "field":"posting_date",
+          "method":"find",
+          "path":"span.date"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"location",
+          "method":"find",
+          "path":"span.location"
+        },
+        {
+          "field":"description",
+          "method":"all",
+          "path":"div.listing-container div",
+          "loop_collect":"text",
+          "join":"\n"
+        }
+      ]
+    }
+  },
+  {
+    "name":"indeed",
+    "skip":"true",
+    "base_url":"http://www.indeed.com",
+    "summary":{
+      "url":"/jobs?q=SEARCHTERM&sr=directhire",
+      "pagination_fmt":"&start=",
+      "pagination_start":"0",
+      "pagination_scale":"10",
+      "params":[
+        {
+          "SEARCHTERM":[
+            "ruby",
+            "ruby+on+rails",
+            "junior+web+developer",
+            "ember.js",
+            "full+stack"
+          ]
+        }
+      ],
+      "loop":"div.row.result",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"h2.jobtitle a"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"h2.jobtitle a",
+          "attr":"href"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"span.company span"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"span.location span"
+        },
+        {
+          "field":"description",
+          "method":"find",
+          "path":"span.summary span"
+        },
+        {
+          "field":"posting_date",
+          "method":"find",
+          "path":"span.date"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+      ]
+    }
+  },
+  {
+    "name":"linkedin",
+    "skip":"true",
+    "base_url":"https://www.linkedin.com",
+    "summary":{
+      "url":"/vsearch/j?keywords=SEARCHTERM&openAdvancedForm=true&locationType=I&countryCode=us&rsid=754744171429892349899&orig=FCTD&openFacets=L,C,TP&f_TP=1&pt=jobs&pt=jobs",
+      "pagination_fmt":"&page_num=",
+      "pagination_start":"1",
+      "pagination_scale":"1",
+      "params":[
+        {
+          "SEARCHTERM":[
+            "Ruby",
+            "Ruby+On+Rails",
+            "javascript"
+          ]
+        }
+      ],
+      "loop":"ol.search-results li.result",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"a.title"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"a.title",
+          "attr":"href"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"div.description a"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"dl.demographic bdi"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"description",
+          "method":"find",
+          "path":"div.description-section div.rich-text"
+        }
+      ]
+    }
+  }
+]

data/lib/rubyscraper.rb CHANGED Viewed

@@ -5,76 +5,135 @@ require 'rubyscraper/version'
 class RubyScraper
   include Capybara::DSL
+  attr_reader :scrape_config, :pages, :jobs, :posted_jobs, :endpoint, :scraped_jobs
-  def initialize(endpoint)
+  def initialize(endpoint, pages=1)
     Capybara.register_driver :poltergeist do |app|
       Capybara::Poltergeist::Driver.new(app, js_errors: false)
     end
     Capybara.default_driver = :poltergeist
     @jobs = []
+    @scraped_jobs = 0
     @posted_jobs = 0
+    @pages = pages
     @endpoint = endpoint
-    @search_terms_file = File.expand_path('../assets/search-terms.txt', __FILE__)
-    @search_terms = []
-    File.foreach(@search_terms_file) { |x| @search_terms << x.strip }
+    @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__)
+    @scrape_config = JSON.parse(File.read(@scrape_file))
   end
-  def scrape
-    get_summaries
-    get_bodies
-    send_to_server
-    return @jobs.length, @posted_jobs
+  def scrape(single_site=nil)
+    if single_site
+      search_site = scrape_config.select { |site| site["name"] == single_site }
+      if search_site
+        get_data(search_site.first)
+      else
+        raise "Invalid single site name #{single_site}. Not in scrape file."
+      end
+    else
+      scrape_config.each do |site|
+        unless site["skip"] == "true"
+          get_data(site)
+        end
+      end
+    end
+    return scraped_jobs, posted_jobs
   end
-  def get_summaries
-    @search_terms.each do |term|
-      visit "http://careers.stackoverflow.com/jobs?searchTerm=#{term}&sort=p"
-      (1..2).to_a.each do |page|
-        visit "http://careers.stackoverflow.com/jobs?searchTerm=ruby&sort=p&pg=#{page}"
-        all(".listResults .-item").each do |listing|
-          position = listing.find("h3.-title a").text
-          url = listing.find("h3.-title a")["href"]
-          posting_date = listing.first("p._muted").text
+  def get_data(site)
+    get_summaries(site)
+    get_bodies(site)
+    send_to_server
+  end
-          @jobs << { position: position, url: url, posting_date: posting_date }
+  def get_summaries(site)
+    if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
+      site["summary"]["params"][0]["SEARCHTERM"].each do |term|
+        summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
+        pagination_start = site["summary"]["pagination_start"].to_i
+        pagination_end   = pagination_start + pages - 1
+        (pagination_start..pagination_end).to_a.each do |page|
+          visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
+          all(site["summary"]["loop"]).each do |listing|
+            job = pull_summary_data(site, listing)
+            job = modify_data(site, job)
+            jobs << job
+          end
+          puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
         end
       end
-      puts "Pulled #{term} job summaries."
+    else
+      summary_url = "#{site["base_url"]}#{site["summary"]["url"]}"
+      visit summary_url
+      all(site["summary"]["loop"]).each do |listing|
+        job = pull_summary_data(site, listing)
+        job = modify_data(site, job)
+        jobs << job
+      end
+      puts "Pulled #{site["name"]} job summaries."
     end
   end
-  def get_bodies
-    @jobs.each_with_index do |job, i|
-      puts "Job #{i+1} pulled."
-      sleep 1
-      visit "http://careers.stackoverflow.com#{job[:url]}"
-      if has_css?("a.employer")
-        job[:company] = find("a.employer").text
-      end
-      if has_css?("span.location")
-        job[:location] = find("span.location").text
-      end
-      #job[:description] = first("div.description p")
-      description = all("div.description p").map do |p|
-        p.text
+  def pull_summary_data(site, listing)
+    job = Hash.new
+    site["summary"]["fields"].each do |field|
+      if field["attr"]
+        if listing.has_css?(field["path"])
+          job[field["field"]] =
+            listing.send(field["method"].to_sym, field["path"])[field["attr"]]
+        end
+      else
+        if listing.has_css?(field["path"])
+          job[field["field"]] =
+            listing.send(field["method"].to_sym, field["path"]).text
+        end
       end
-      job[:description] = description.join("\n")
-      tags = all("div.tags a.post-tag").map do |tag|
-        tag.text
+    end; job
+  end
+  def modify_data(site, job)
+    job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/)
+    job
+  end
+  def get_bodies(site)
+    jobs.each_with_index do |job, i|
+      sleep 1
+      pull_job_data(site, job)
+      puts "Job #{i+1} pulled."
+    end
+  end
+  def pull_job_data(site, job)
+    visit job["url"]
+    site["sub_page"]["fields"].each do |field|
+      if field["method"] == "all"
+        if has_css?(field["path"])
+          values = all(field["path"]).map do |elem|
+            elem.send(field["loop_collect"])
+          end
+          job[field["field"]] = values.join(field["join"])
+        end
+      else
+        if has_css?(field["path"])
+          job[field["field"]] =
+            send(field["method"].to_sym,field["path"]).text
+        end
       end
-      job[:tags] = tags
     end
   end
   def send_to_server
-    @jobs.each_with_index do |job, i|
+    @scraped_jobs += jobs.length
+    jobs.each do |job|
       new_job = {
-        position: job[:position],
-        location: job[:location],
-        description: job[:description],
-        source: "http://careers.stackoverflow.com#{job[:url]}"
+        position: job["position"],
+        location: job["location"],
+        description: job["description"],
+        source: job["url"]
       }
-      RestClient.post(@endpoint, job: new_job){ |response, request, result, &block|
+      RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
         case response.code
         when 201
           @posted_jobs += 1
@@ -86,5 +145,6 @@ class RubyScraper
         end
       }
     end
+    @jobs = []
   end
 end

data/lib/rubyscraper/binary.rb CHANGED Viewed

@@ -6,9 +6,10 @@ class RubyScraper
       outstream.puts "StackOverflow Job Scraper"
       outstream.puts "---------------------------------------------"
       outstream.puts "Started scraping..."
-      endpoint = argv.first
+      endpoint = argv[0]
+      single_site = argv[1]
       outstream.puts "Sending post requests to #{endpoint}"
-      jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape
+      jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape(single_site)
       outstream.puts "Scraped #{jobs_scraped} jobs, succesfully posted #{jobs_saved} jobs."
       outstream.puts "---------------------------------------------"
       outstream.puts "Completed!"

data/lib/rubyscraper/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class RubyScraper
-  VERSION = "0.1.0"
+  VERSION = "0.2.0"
 end

data/rubyscraper.gemspec CHANGED Viewed

@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
   s.add_dependency "capybara"
   s.add_dependency "poltergeist"
   s.add_dependency "rest-client"
+  s.add_dependency "slop"
   s.add_development_dependency "bundler", "~> 1.9"
   s.add_development_dependency "rake", "~> 10.0"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyscraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Nathan Owsiany
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-23 00:00:00.000000000 Z
+date: 2015-04-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara
@@ -52,6 +52,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: slop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -122,7 +136,7 @@ files:
 - bin/console
 - bin/rubyscraper
 - bin/setup
-- lib/assets/search-terms.txt
+- lib/assets/scrapes.json
 - lib/rubyscraper.rb
 - lib/rubyscraper/binary.rb
 - lib/rubyscraper/version.rb

data/lib/assets/search-terms.txt DELETED Viewed

@@ -1,5 +0,0 @@
-ruby
-ruby+on+rails
-javascript
-junior
-full-stack