scrapin-a-livin 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.markdown +36 -0
- data/Rakefile +75 -0
- data/VERSION +1 -0
- data/lib/dice/dice_search.rb +122 -0
- data/lib/generic/listing.rb +25 -0
- data/lib/scrapin-a-livin.rb +43 -0
- data/lib/yahoo/hotjobs.rb +171 -0
- data/test/dice_parser_test.rb +172 -0
- data/test/queries/dice/queryAustin.html +2207 -0
- data/test/queries/dice/queryRaleigh.html +2272 -0
- data/test/queries/dice/querySanJose.html +2517 -0
- data/test/queries/hotjobs/queryAustin.html +737 -0
- data/test/queries/hotjobs/queryRaleigh.html +755 -0
- data/test/queries/hotjobs/querySanJose.html +753 -0
- data/test/scripts/diceDump.rb +71 -0
- data/test/scripts/hotjobsDump.rb +70 -0
- data/test/test_helper.rb +8 -0
- data/test/yahoo_parser_test.rb +168 -0
- metadata +110 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Kevin S Kirkup
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# scrapin-a-livin
|
2
|
+
|
3
|
+
This script helps you screen scrape the most common job sites and save the lists on your local computer.
|
4
|
+
|
5
|
+
You can then use other tools to display updates, send out resumes or search your LinkedIn account
|
6
|
+
to see who you may know at that company.
|
7
|
+
|
8
|
+
Please feel free to contribute. All suggestions are welcome.
|
9
|
+
Hopefully this will help you find the career you are looking for.
|
10
|
+
|
11
|
+
Thanks goes out to Igvita.com for posting this article
|
12
|
+
http://www.igvita.com/2007/02/04/ruby-screen-scraper-in-60-seconds/
|
13
|
+
|
14
|
+
## Quick links
|
15
|
+
|
16
|
+
* [Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
|
17
|
+
* [Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
|
18
|
+
|
19
|
+
## How To
|
20
|
+
|
21
|
+
A
|
22
|
+
|
23
|
+
## Note on Patches/Pull Requests
|
24
|
+
|
25
|
+
* Fork the project.
|
26
|
+
* Make your feature addition or bug fix.
|
27
|
+
* Add tests for it. This is important so I don't break it in a
|
28
|
+
future version unintentionally.
|
29
|
+
* Commit, do not mess with rakefile, version, or history.
|
30
|
+
(if you want to have your own version, that is fine but
|
31
|
+
bump version in a commit by itself I can ignore when I pull)
|
32
|
+
* Send me a pull request. Bonus points for topic branches.
|
33
|
+
|
34
|
+
## Copyright
|
35
|
+
|
36
|
+
Copyright (c) 2009 Kevin S Kirkup. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# Rakefile for the scrapin-a-livin gem.
# Defines gem packaging (Jeweler), unit tests, coverage (RCov) and RDoc tasks.
require 'rubygems'
require 'rake'

# Gem packaging via Jeweler; skipped with a hint message when jeweler
# is not installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "scrapin-a-livin"
    gem.summary = %Q{Scrape to find a job}
    gem.description = <<EOF
Are you in the technology industry?
The you have probably lost a job during your career.
This script helps you scrape the most common job sites for your search criteria,
and save the lists on you local computer.

You can then use other tools to display updates, send out resumes or search your LinkedIn account
to see who you may know at that company.

Please feel free to contribute make. All suggestions are welcome.
Hopefully this will help you find the career you are looking for.
EOF
    gem.email = "kevin.kirkup@gmail.com"
    gem.homepage = "http://github.com/angrytuna/scrapin-a-livin"
    gem.authors = ["Kevin S Kirkup"]
    gem.platform = Gem::Platform::RUBY
    gem.require_path = 'lib'

    # Runtime dependency: Hpricot does all of the HTML parsing.
    gem.add_dependency('hpricot', '>= 0.6')

    gem.add_development_dependency "thoughtbot-shoulda"

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Test task: runs every test/**/*_test.rb with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task; falls back to an aborting stub when RCov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Ensure gem dependencies are present before the tests run.
task :test => :check_dependencies

task :default => :test

# RDoc generation; the title embeds the contents of the VERSION file
# when it exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION')
    version = File.read('VERSION')
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scrapin-a-livin #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
# Class to screen scrape the dice website
|
7
|
+
#
|
8
|
+
# http://seeker.dice.com
|
9
|
+
# Class to screen scrape the dice website
#
# http://seeker.dice.com
class DiceSearch

  # Constants
  DEBUG = false                         # Enable verbose parse logging
  TITLE_CELL = 2                        # Cell index of the job title link
  COMPANY_CELL = 3                      # Cell index of the company link
  LOCATION_CELL = 4                     # Cell index of the job location
  DATE_CELL = 5                         # Cell index of the posting date
  CELL_COUNT = 6                        # Cell count of a job-listing row
  DICE_LINK = "http://seeker.dice.com"  # Base url for relative hrefs

  # Parse the provided query data
  #
  # @param query [String] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # Get the results table (class "summary")
    table = (doc/"//table[@class=summary]")

    # Get the rows
    rows = (table/"tr")

    # Retrieve the table rows that contain the job listings
    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # Only rows with exactly CELL_COUNT cells are job listings;
      # header and spacer rows have a different cell count.
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = DICE_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = DICE_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = cells[LOCATION_CELL].inner_html
        date = cells[DATE_CELL].inner_html

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url.
    # Fix: the Referer previously pointed at hotjobs.yahoo.com
    # (copy/paste from the hotjobs scraper); it now matches this site.
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://seeker.dice.com/jobsearch/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query dice for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search
  # @param num_entries [String] the number of entries to request
  # @return [String] the raw html returned by the search
  def self.query(location, keywords, days_back, num_entries)

    # The search URL
    # NOTE(review): METRO_AREA is hard-coded to fixed coordinates —
    # confirm whether it should be derived from +location+.
    url = "http://seeker.dice.com/jobsearch/servlet/JobSearch" +
          "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0" +
          "&Ntk=JobSearchRanking&op=300" +
          "&values=&FREE_TEXT=#{keywords}" +
          "&Ntx=mode+matchall&WHERE=#{location}" +
          "&WHEREList=#{location}" +
          "&RADIUS=80.4672" +
          "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525" +
          "&TRAVEL=0&TAXTERM=1001" +
          "&SORTSPEC=0" +
          "&FRMT=0" +
          "&DAYSBACK=#{days_back}" +
          "&NUM_PER_PAGE=#{num_entries}"

    # Read the data from the url.
    # Bug fix: this previously called open(@url, ...). @url is an unset
    # class-level instance variable (nil) inside a class method, so the
    # request always failed; use the local +url+ built above instead.
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://seeker.dice.com/jobsearch/").read
  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# A generic job listing
|
4
|
+
# A generic job listing
#
# Plain value object shared by all of the site-specific scrapers.
class JobListing

  # Job title as shown on the listing
  attr_reader :title
  # Link to the job posting
  attr_reader :link
  # Company name
  attr_reader :company
  # Link to the company page
  attr_reader :company_link
  # Job location
  attr_reader :location
  # Date the job was posted
  attr_reader :date
  # Date the job was reposted, or nil when it was never reposted
  attr_reader :repost

  # Initializer for the job listing
  #
  # Fix: attr_reader :repost was declared but @repost was never assigned,
  # so repost always returned nil even when the scraper had extracted a
  # repost date. The new trailing parameter defaults to nil, keeping all
  # existing six-argument callers working unchanged.
  def initialize(title, link, company, company_link, location, date, repost = nil)

    @title = title
    @link = link
    @company = company
    @company_link = company_link
    @location = location
    @date = date
    @repost = repost
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'generic/listing.rb'
|
3
|
+
require 'dice/dice_search.rb'
|
4
|
+
require 'yahoo/hotjobs.rb'
|
5
|
+
|
6
|
+
# Helper file to include the available libraries
|
7
|
+
|
8
|
+
# Main entry
|
9
|
+
if $0 == (__FILE__)

  # NOTE(review): everything in this guard is commented out, so running
  # this file directly is currently a no-op. The commented code below
  # sketches a planned command-line interface with -async / -extension
  # flags; AsyncInterface, Extension and ExtenionFile are not defined
  # anywhere in this file.

  # # Check the arguments
  # case ARGV.shift
  #
  # # We want to create an async interface
  # when /-async/
  #   $async = true
  #
  # # We want to create an extension interface
  # when /-extension/
  #   $extension = true
  # end
  #
  # # Go through the remaining command line arguments
  # ARGV.each do |file|
  #
  #   # Check to see if an async interface file was requested
  #   if $async
  #
  #     AsyncInterface.print_out(file)
  #
  #   # Check if this is an extension file
  #   elsif $extension
  #     # Parse the extension file
  #     Extension.new(file)
  #
  #     # Print out the data
  #     ExtenionFile.print_out(file)
  #   end
  #
  # end

end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
# Scrap the Yahoo Hotjobs website
|
7
|
+
#
|
8
|
+
# http://hotjobs.yahoo.com
|
9
|
+
# Scrap the Yahoo Hotjobs website
#
# http://hotjobs.yahoo.com
class HotjobsSearch

  # Constants
  DEBUG = false                             # Enable verbose parse logging
  TITLE_CELL = 0                            # Cell index of the job title link
  COMPANY_CELL = 1                          # Cell index of the company link
  LOCATION_CELL = 2                         # Cell index of the job location
  DATE_CELL = 3                             # Cell index of the posting date
  CELL_COUNT = 4                            # Cell count of a job-listing row
  HOTJOBS_LINK = "http://hotjobs.yahoo.com" # Base url for relative hrefs

  # Parse the provided query data
  #
  # @param query [String] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # Get the results table (id "results")
    table = (doc/"//table[@id=results]")

    # Iterate through each row
    rows = (table/"tr")

    # Retrieve the table rows that contain the job listings
    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # Only rows with exactly CELL_COUNT cells are job listings;
      # header and spacer rows have a different cell count.
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = HOTJOBS_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = HOTJOBS_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = get_location(cells[LOCATION_CELL])
        date, repost = get_dates(cells[DATE_CELL])

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing
        # NOTE(review): +repost+ is extracted above but discarded here —
        # JobListing.new is called without it; confirm whether it should
        # be carried onto the listing.
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://hotjobs.yahoo.com/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query yahoo for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search
  # @param num_entries [String] the number of entries to request
  # @return [String] the raw html returned by the search
  #
  # NOTE(review): +days_back+ and +num_entries+ are accepted but never
  # interpolated into the url below (updated_since is fixed to "month");
  # confirm whether the query string should use them.
  def self.query(location, keywords, days_back, num_entries)

    url = "http://hotjobs.yahoo.com/job-search?" +
      "src=advsearch&pageOp=search&ts=1259353986&" +
      "kw_search_type=kwany&kw=#{keywords}&kw_none=&" +
      "locations=#{location}&country=&locations=&locations=&" +
      "industry=&industry=&industry=&" +
      "updated_since=month&" +
      "exp_level=&experience_level=&" +
      "education=&salary[min]=&salary[type]=yearly&" +
      "commitment=FT&commitment=PT&jobtype=PERM&jobtype=CONT&" +
      "travel_amount=&company=&" +
      "source=&email_format=html&email_frequency=1&email_enabled=0&search_jobs=Search+Jobs"

    # Read the data from the url
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://hotjobs.yahoo.com/").read
  end

  # NOTE(review): `private` does NOT affect `def self.` singleton methods,
  # so the two helpers below are still publicly callable. If they are meant
  # to be private, use private_class_method — left unchanged here because
  # the test suite may call them directly.
  private
  # Method to get the available locations
  #
  # @param element the Hpricot td element holding the location cell
  # @return [String] the primary location text
  def self.get_location(element)

    location = ''

    puts "Element: #{element}" if DEBUG

    # check to see if the element has a span
    if (element/"span").size > 0

      # The first span is the primary location
      location << (element/"span")[0].inner_text

    # Only one location
    else
      location = element.inner_html
    end

    return location
  end

  # Method to get the Dates
  #
  # @param element the Hpricot td element holding the date cell
  # @return [Array(String, String)] the posting date and the repost date
  #   ('' when the listing was never reposted)
  def self.get_dates(element)

    date = ''
    repost = ''

    puts "Element: #{element}" if DEBUG

    spans = (element/"span")

    # Check to see if the element contains a span
    if spans.size > 0

      # The first span is the Reposted data
      repost = spans[0].inner_text

      # Remove the Reposted string
      repost.sub!(/Reposted /, "")

      # delete the span so it does not leak into the main date text below
      spans.remove
    end

    # Get the main date
    date = element.inner_text

    return date, repost

  end

end
|