RubyGems - rubyscraper - Versions diffs - 0.1.0 → 0.2.0 - Mend

rubyscraper 0.1.0 → 0.2.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (9)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6f4cfba8b1442632b6c54f30886d254ff25fbbd3
-  data.tar.gz: 4bc8431e8294d5900819d29735f69650e93448da
+  metadata.gz: e1b7f2d272df18fbf97f0da1c6d655077e86bf06
+  data.tar.gz: d7d09ff13581907abab68aafce6e15b0843bea58
 SHA512:
-  metadata.gz: 73ff93065f3079602dfcc58e35e08686ce4fff2d693b7f80c63e8cd37f011826156731b7d8a42a84472232c4a0c5b18dcb8efde1e7a7ede2ded37c431d22f95f
-  data.tar.gz: 04e30fe957b8d95a25cb0620539eca3b8956eb1c5638c8bd038aa6b461f4868555889b033201b350dc651f5faff7e2e38c2fac86ea98ce4bf13ac76de21834c5
+  metadata.gz: 1f06d2c6e6e91658a90d44fd3bc4ff55b8a2bf99116d016bf4d7afbe2cefc9b6f911f2b7a0bcfc31665aba3d4d5d962eb66e36be264eaf5a18434e39a461ea89
+  data.tar.gz: ad976858fee74b80497619622d76349a34ad586ace4aaa4657134d1fe7f38fabe82ec46d514693ebc7c7b75b3c704a58cb296deca4b3f516b6ab6a462c7e358d

data/Gemfile.lock CHANGED Viewed

@@ -16,10 +16,13 @@ GEM
       rack-test (>= 0.5.4)
       xpath (~> 2.0)
     cliver (0.3.2)
+    coderay (1.1.0)
+    diff-lcs (1.2.5)
     domain_name (0.5.24)
       unf (>= 0.0.5, < 1.0.0)
     http-cookie (1.0.2)
       domain_name (~> 0.5)
+    method_source (0.8.2)
     mime-types (2.4.3)
     mini_portile (0.6.2)
     multi_json (1.11.0)
@@ -31,6 +34,10 @@ GEM
       cliver (~> 0.3.1)
       multi_json (~> 1.0)
       websocket-driver (>= 0.2.0)
+    pry (0.10.1)
+      coderay (~> 1.1.0)
+      method_source (~> 0.8.1)
+      slop (~> 3.4)
     rack (1.6.0)
     rack-test (0.6.3)
       rack (>= 1.0)
@@ -39,6 +46,20 @@ GEM
       http-cookie (>= 1.0.2, < 2.0)
       mime-types (>= 1.16, < 3.0)
       netrc (~> 0.7)
+    rspec (3.2.0)
+      rspec-core (~> 3.2.0)
+      rspec-expectations (~> 3.2.0)
+      rspec-mocks (~> 3.2.0)
+    rspec-core (3.2.3)
+      rspec-support (~> 3.2.0)
+    rspec-expectations (3.2.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.2.0)
+    rspec-mocks (3.2.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.2.0)
+    rspec-support (3.2.2)
+    slop (3.6.0)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.1)
@@ -53,5 +74,7 @@ PLATFORMS
 DEPENDENCIES
   bundler (~> 1.9)
+  pry
   rake (~> 10.0)
+  rspec (~> 3.0)
   rubyscraper!

data/lib/assets/scrapes.json ADDED Viewed

@@ -0,0 +1,287 @@
+[
+  {
+    "name":"stackoverflow",
+    "base_url":"http://www.careers.stackoverflow.com",
+    "summary":{
+      "url":"/jobs?searchTerm=SEARCHTERM&sort=p",
+      "pagination_fmt":"&pg=",
+      "pagination_start":"1",
+      "pagination_scale":"1",
+      "params":[
+        {
+          "SEARCHTERM":[
+            "ruby",
+            "ruby+on+rails",
+            "javascript"
+          ]
+        }
+      ],
+      "loop":".listResults .-item",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"h3.-title a"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"h3.-title a",
+          "attr":"href"
+        },
+        {
+          "field":"posting_date",
+          "method":"first",
+          "path":"p._muted"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"company",
+          "method":"find",
+          "path":"a.employer"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"span.location"
+        },
+        {
+          "field":"description",
+          "method":"all",
+          "path":"div.description p",
+          "loop_collect":"text",
+          "join":"\n"
+        },
+        {
+          "field":"tags",
+          "method":"all",
+          "path":"div.tags a.post-tag",
+          "loop_collect":"text",
+          "join":", "
+        }
+      ]
+    }
+  },
+  {
+    "name":"rubynow",
+    "base_url":"http://jobs.rubynow.com/",
+    "summary":{
+      "url":"",
+      "no_pagination?":"true",
+      "pagination_fmt":"",
+      "pagination_start":"",
+      "pagination_scale":"",
+      "params":[
+      ],
+      "loop":"ul.jobs li",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"h3 a"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"h3 a",
+          "attr":"href"
+        },
+        {
+          "field":"posting_date",
+          "method":"find",
+          "path":"span.date"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"description",
+          "method":"all",
+          "path":"div#info p",
+          "loop_collect":"text",
+          "join":"\n"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"h2#headline a"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"h3#location"
+        }
+      ]
+    }
+  },
+  {
+    "name":"weworkremotely",
+    "base_url":"https://weworkremotely.com",
+    "summary":{
+      "url":"/categories/2/jobs",
+      "no_pagination?":"true",
+      "pagination_fmt":"",
+      "pagination_start":"",
+      "pagination_scale":"",
+      "params":[
+      ],
+      "loop":"section.jobs ul li",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"span.title"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"span.company"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"a",
+          "attr":"href"
+        },
+        {
+          "field":"posting_date",
+          "method":"find",
+          "path":"span.date"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"location",
+          "method":"find",
+          "path":"span.location"
+        },
+        {
+          "field":"description",
+          "method":"all",
+          "path":"div.listing-container div",
+          "loop_collect":"text",
+          "join":"\n"
+        }
+      ]
+    }
+  },
+  {
+    "name":"indeed",
+    "skip":"true",
+    "base_url":"http://www.indeed.com",
+    "summary":{
+      "url":"/jobs?q=SEARCHTERM&sr=directhire",
+      "pagination_fmt":"&start=",
+      "pagination_start":"0",
+      "pagination_scale":"10",
+      "params":[
+        {
+          "SEARCHTERM":[
+            "ruby",
+            "ruby+on+rails",
+            "junior+web+developer",
+            "ember.js",
+            "full+stack"
+          ]
+        }
+      ],
+      "loop":"div.row.result",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"h2.jobtitle a"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"h2.jobtitle a",
+          "attr":"href"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"span.company span"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"span.location span"
+        },
+        {
+          "field":"description",
+          "method":"find",
+          "path":"span.summary span"
+        },
+        {
+          "field":"posting_date",
+          "method":"find",
+          "path":"span.date"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+      ]
+    }
+  },
+  {
+    "name":"linkedin",
+    "skip":"true",
+    "base_url":"https://www.linkedin.com",
+    "summary":{
+      "url":"/vsearch/j?keywords=SEARCHTERM&openAdvancedForm=true&locationType=I&countryCode=us&rsid=754744171429892349899&orig=FCTD&openFacets=L,C,TP&f_TP=1&pt=jobs&pt=jobs",
+      "pagination_fmt":"&page_num=",
+      "pagination_start":"1",
+      "pagination_scale":"1",
+      "params":[
+        {
+          "SEARCHTERM":[
+            "Ruby",
+            "Ruby+On+Rails",
+            "javascript"
+          ]
+        }
+      ],
+      "loop":"ol.search-results li.result",
+      "fields":[
+        {
+          "field":"position",
+          "method":"find",
+          "path":"a.title"
+        },
+        {
+          "field":"url",
+          "method":"find",
+          "path":"a.title",
+          "attr":"href"
+        },
+        {
+          "field":"company",
+          "method":"find",
+          "path":"div.description a"
+        },
+        {
+          "field":"location",
+          "method":"find",
+          "path":"dl.demographic bdi"
+        }
+      ]
+    },
+    "sub_page":{
+      "fields":[
+        {
+          "field":"description",
+          "method":"find",
+          "path":"div.description-section div.rich-text"
+        }
+      ]
+    }
+  }
+]

data/lib/rubyscraper.rb CHANGED Viewed

@@ -5,76 +5,135 @@ require 'rubyscraper/version'
 class RubyScraper
   include Capybara::DSL
+  attr_reader :scrape_config, :pages, :jobs, :posted_jobs, :endpoint, :scraped_jobs
-  def initialize(endpoint)
+  def initialize(endpoint, pages=1)
     Capybara.register_driver :poltergeist do |app|
       Capybara::Poltergeist::Driver.new(app, js_errors: false)
     end
     Capybara.default_driver = :poltergeist
     @jobs = []
+    @scraped_jobs = 0
     @posted_jobs = 0
+    @pages = pages
     @endpoint = endpoint
-    @search_terms_file = File.expand_path('../assets/search-terms.txt', __FILE__)
-    @search_terms = []
-    File.foreach(@search_terms_file) { |x| @search_terms << x.strip }
+    @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__)
+    @scrape_config = JSON.parse(File.read(@scrape_file))
   end
-  def scrape
-    get_summaries
-    get_bodies
-    send_to_server
-    return @jobs.length, @posted_jobs
+  def scrape(single_site=nil)
+    if single_site
+      search_site = scrape_config.select { |site| site["name"] == single_site }
+      if search_site
+        get_data(search_site.first)
+      else
+        raise "Invalid single site name #{single_site}. Not in scrape file."
+      end
+    else
+      scrape_config.each do |site|
+        unless site["skip"] == "true"
+          get_data(site)
+        end
+      end
+    end
+    return scraped_jobs, posted_jobs
   end
-  def get_summaries
-    @search_terms.each do |term|
-      visit "http://careers.stackoverflow.com/jobs?searchTerm=#{term}&sort=p"
-      (1..2).to_a.each do |page|
-        visit "http://careers.stackoverflow.com/jobs?searchTerm=ruby&sort=p&pg=#{page}"
-        all(".listResults .-item").each do |listing|
-          position = listing.find("h3.-title a").text
-          url = listing.find("h3.-title a")["href"]
-          posting_date = listing.first("p._muted").text
+  def get_data(site)
+    get_summaries(site)
+    get_bodies(site)
+    send_to_server
+  end
-          @jobs << { position: position, url: url, posting_date: posting_date }
+  def get_summaries(site)
+    if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
+      site["summary"]["params"][0]["SEARCHTERM"].each do |term|
+        summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
+        pagination_start = site["summary"]["pagination_start"].to_i
+        pagination_end   = pagination_start + pages - 1
+        (pagination_start..pagination_end).to_a.each do |page|
+          visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
+          all(site["summary"]["loop"]).each do |listing|
+            job = pull_summary_data(site, listing)
+            job = modify_data(site, job)
+            jobs << job
+          end
+          puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
         end
       end
-      puts "Pulled #{term} job summaries."
+    else
+      summary_url = "#{site["base_url"]}#{site["summary"]["url"]}"
+      visit summary_url
+      all(site["summary"]["loop"]).each do |listing|
+        job = pull_summary_data(site, listing)
+        job = modify_data(site, job)
+        jobs << job
+      end
+      puts "Pulled #{site["name"]} job summaries."
     end
   end
-  def get_bodies
-    @jobs.each_with_index do |job, i|
-      puts "Job #{i+1} pulled."
-      sleep 1
-      visit "http://careers.stackoverflow.com#{job[:url]}"
-      if has_css?("a.employer")
-        job[:company] = find("a.employer").text
-      end
-      if has_css?("span.location")
-        job[:location] = find("span.location").text
-      end
-      #job[:description] = first("div.description p")
-      description = all("div.description p").map do |p|
-        p.text
+  def pull_summary_data(site, listing)
+    job = Hash.new
+    site["summary"]["fields"].each do |field|
+      if field["attr"]
+        if listing.has_css?(field["path"])
+          job[field["field"]] =
+            listing.send(field["method"].to_sym, field["path"])[field["attr"]]
+        end
+      else
+        if listing.has_css?(field["path"])
+          job[field["field"]] =
+            listing.send(field["method"].to_sym, field["path"]).text
+        end
       end
-      job[:description] = description.join("\n")
-      tags = all("div.tags a.post-tag").map do |tag|
-        tag.text
+    end; job
+  end
+  def modify_data(site, job)
+    job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/)
+    job
+  end
+  def get_bodies(site)
+    jobs.each_with_index do |job, i|
+      sleep 1
+      pull_job_data(site, job)
+      puts "Job #{i+1} pulled."
+    end
+  end
+  def pull_job_data(site, job)
+    visit job["url"]
+    site["sub_page"]["fields"].each do |field|
+      if field["method"] == "all"
+        if has_css?(field["path"])
+          values = all(field["path"]).map do |elem|
+            elem.send(field["loop_collect"])
+          end
+          job[field["field"]] = values.join(field["join"])
+        end
+      else
+        if has_css?(field["path"])
+          job[field["field"]] =
+            send(field["method"].to_sym,field["path"]).text
+        end
       end
-      job[:tags] = tags
     end
   end
   def send_to_server
-    @jobs.each_with_index do |job, i|
+    @scraped_jobs += jobs.length
+    jobs.each do |job|
       new_job = {
-        position: job[:position],
-        location: job[:location],
-        description: job[:description],
-        source: "http://careers.stackoverflow.com#{job[:url]}"
+        position: job["position"],
+        location: job["location"],
+        description: job["description"],
+        source: job["url"]
       }
-      RestClient.post(@endpoint, job: new_job){ |response, request, result, &block|
+      RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
         case response.code
         when 201
           @posted_jobs += 1
@@ -86,5 +145,6 @@ class RubyScraper
         end
       }
     end
+    @jobs = []
   end
 end

data/lib/rubyscraper/binary.rb CHANGED Viewed

@@ -6,9 +6,10 @@ class RubyScraper
       outstream.puts "StackOverflow Job Scraper"
       outstream.puts "---------------------------------------------"
       outstream.puts "Started scraping..."
-      endpoint = argv.first
+      endpoint = argv[0]
+      single_site = argv[1]
       outstream.puts "Sending post requests to #{endpoint}"
-      jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape
+      jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape(single_site)
       outstream.puts "Scraped #{jobs_scraped} jobs, succesfully posted #{jobs_saved} jobs."
       outstream.puts "---------------------------------------------"
       outstream.puts "Completed!"

data/lib/rubyscraper/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class RubyScraper
-  VERSION = "0.1.0"
+  VERSION = "0.2.0"
 end

data/rubyscraper.gemspec CHANGED Viewed

@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
   s.add_dependency "capybara"
   s.add_dependency "poltergeist"
   s.add_dependency "rest-client"
+  s.add_dependency "slop"
   s.add_development_dependency "bundler", "~> 1.9"
   s.add_development_dependency "rake", "~> 10.0"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyscraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Nathan Owsiany
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-23 00:00:00.000000000 Z
+date: 2015-04-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: capybara
@@ -52,6 +52,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: slop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -122,7 +136,7 @@ files:
 - bin/console
 - bin/rubyscraper
 - bin/setup
-- lib/assets/search-terms.txt
+- lib/assets/scrapes.json
 - lib/rubyscraper.rb
 - lib/rubyscraper/binary.rb
 - lib/rubyscraper/version.rb

data/lib/assets/search-terms.txt DELETED Viewed

@@ -1,5 +0,0 @@
-ruby
-ruby+on+rails
-javascript
-junior
-full-stack