scrapin-a-livin 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.markdown +36 -0
- data/Rakefile +75 -0
- data/VERSION +1 -0
- data/lib/dice/dice_search.rb +122 -0
- data/lib/generic/listing.rb +25 -0
- data/lib/scrapin-a-livin.rb +43 -0
- data/lib/yahoo/hotjobs.rb +171 -0
- data/test/dice_parser_test.rb +172 -0
- data/test/queries/dice/queryAustin.html +2207 -0
- data/test/queries/dice/queryRaleigh.html +2272 -0
- data/test/queries/dice/querySanJose.html +2517 -0
- data/test/queries/hotjobs/queryAustin.html +737 -0
- data/test/queries/hotjobs/queryRaleigh.html +755 -0
- data/test/queries/hotjobs/querySanJose.html +753 -0
- data/test/scripts/diceDump.rb +71 -0
- data/test/scripts/hotjobsDump.rb +70 -0
- data/test/test_helper.rb +8 -0
- data/test/yahoo_parser_test.rb +168 -0
- metadata +110 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Kevin S Kirkup
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# scrapin-a-livin
|
2
|
+
|
3
|
+
This script helps you screen scrape the most common job sites and save the lists on your local computer.
|
4
|
+
|
5
|
+
You can then use other tools to display updates, send out resumes or search your LinkedIn account
|
6
|
+
to see who you may know at that company.
|
7
|
+
|
8
|
+
Please feel free to contribute. All suggestions are welcome.
|
9
|
+
Hopefully this will help you find the career you are looking for.
|
10
|
+
|
11
|
+
Thanks goes out to Igvita.com for posting this article
|
12
|
+
http://www.igvita.com/2007/02/04/ruby-screen-scraper-in-60-seconds/
|
13
|
+
|
14
|
+
## Quick links
|
15
|
+
|
16
|
+
* [Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
|
17
|
+
* [Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
|
18
|
+
|
19
|
+
## How To
|
20
|
+
|
21
|
+
A
|
22
|
+
|
23
|
+
## Note on Patches/Pull Requests
|
24
|
+
|
25
|
+
* Fork the project.
|
26
|
+
* Make your feature addition or bug fix.
|
27
|
+
* Add tests for it. This is important so I don't break it in a
|
28
|
+
future version unintentionally.
|
29
|
+
* Commit, do not mess with rakefile, version, or history.
|
30
|
+
(if you want to have your own version, that is fine but
|
31
|
+
bump version in a commit by itself I can ignore when I pull)
|
32
|
+
* Send me a pull request. Bonus points for topic branches.
|
33
|
+
|
34
|
+
## Copyright
|
35
|
+
|
36
|
+
Copyright (c) 2009 Kevin S Kirkup. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# Rakefile for the scrapin-a-livin gem.
# Defines gem packaging (Jeweler), unit tests, coverage (RCov) and RDoc tasks.
require 'rubygems'
require 'rake'

# Gem packaging via Jeweler; skipped with a hint message when jeweler
# is not installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "scrapin-a-livin"
    gem.summary = %Q{Scrape to find a job}
    gem.description = <<EOF
Are you in the technology industry?
The you have probably lost a job during your career.
This script helps you scrape the most common job sites for your search criteria,
and save the lists on you local computer.

You can then use other tools to display updates, send out resumes or search your LinkedIn account
to see who you may know at that company.

Please feel free to contribute make. All suggestions are welcome.
Hopefully this will help you find the career you are looking for.
EOF
    gem.email = "kevin.kirkup@gmail.com"
    gem.homepage = "http://github.com/angrytuna/scrapin-a-livin"
    gem.authors = ["Kevin S Kirkup"]
    gem.platform = Gem::Platform::RUBY
    gem.require_path = 'lib'

    # Runtime dependency: Hpricot does all of the HTML parsing.
    gem.add_dependency('hpricot', '>= 0.6')

    gem.add_development_dependency "thoughtbot-shoulda"

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Test task: runs every test/**/*_test.rb with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task; falls back to an aborting stub when RCov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Ensure gem dependencies are present before the tests run.
task :test => :check_dependencies

task :default => :test

# RDoc generation; the title embeds the contents of the VERSION file
# when it exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION')
    version = File.read('VERSION')
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scrapin-a-livin #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
# Class to screen scrape the dice website
|
7
|
+
#
|
8
|
+
# http://seeker.dice.com
|
9
|
+
# Class to screen scrape the dice website
#
# http://seeker.dice.com
class DiceSearch

  # Constants
  DEBUG = false                         # Enable verbose parse logging
  TITLE_CELL = 2                        # Cell index of the job title link
  COMPANY_CELL = 3                      # Cell index of the company link
  LOCATION_CELL = 4                     # Cell index of the job location
  DATE_CELL = 5                         # Cell index of the posting date
  CELL_COUNT = 6                        # Cell count of a job-listing row
  DICE_LINK = "http://seeker.dice.com"  # Base url for relative hrefs

  # Parse the provided query data
  #
  # @param query [String] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # Get the results table (class "summary")
    table = (doc/"//table[@class=summary]")

    # Get the rows
    rows = (table/"tr")

    # Retrieve the table rows that contain the job listings
    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # Only rows with exactly CELL_COUNT cells are job listings;
      # header and spacer rows have a different cell count.
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = DICE_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = DICE_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = cells[LOCATION_CELL].inner_html
        date = cells[DATE_CELL].inner_html

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url.
    # Fix: the Referer previously pointed at hotjobs.yahoo.com
    # (copy/paste from the hotjobs scraper); it now matches this site.
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://seeker.dice.com/jobsearch/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query dice for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search
  # @param num_entries [String] the number of entries to request
  # @return [String] the raw html returned by the search
  def self.query(location, keywords, days_back, num_entries)

    # The search URL
    # NOTE(review): METRO_AREA is hard-coded to fixed coordinates —
    # confirm whether it should be derived from +location+.
    url = "http://seeker.dice.com/jobsearch/servlet/JobSearch" +
          "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0" +
          "&Ntk=JobSearchRanking&op=300" +
          "&values=&FREE_TEXT=#{keywords}" +
          "&Ntx=mode+matchall&WHERE=#{location}" +
          "&WHEREList=#{location}" +
          "&RADIUS=80.4672" +
          "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525" +
          "&TRAVEL=0&TAXTERM=1001" +
          "&SORTSPEC=0" +
          "&FRMT=0" +
          "&DAYSBACK=#{days_back}" +
          "&NUM_PER_PAGE=#{num_entries}"

    # Read the data from the url.
    # Bug fix: this previously called open(@url, ...). @url is an unset
    # class-level instance variable (nil) inside a class method, so the
    # request always failed; use the local +url+ built above instead.
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://seeker.dice.com/jobsearch/").read
  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# A generic job listing
|
4
|
+
# A generic job listing
#
# Plain value object shared by all of the site-specific scrapers.
class JobListing

  # Job title as shown on the listing
  attr_reader :title
  # Link to the job posting
  attr_reader :link
  # Company name
  attr_reader :company
  # Link to the company page
  attr_reader :company_link
  # Job location
  attr_reader :location
  # Date the job was posted
  attr_reader :date
  # Date the job was reposted, or nil when it was never reposted
  attr_reader :repost

  # Initializer for the job listing
  #
  # Fix: attr_reader :repost was declared but @repost was never assigned,
  # so repost always returned nil even when the scraper had extracted a
  # repost date. The new trailing parameter defaults to nil, keeping all
  # existing six-argument callers working unchanged.
  def initialize(title, link, company, company_link, location, date, repost = nil)

    @title = title
    @link = link
    @company = company
    @company_link = company_link
    @location = location
    @date = date
    @repost = repost
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'generic/listing.rb'
|
3
|
+
require 'dice/dice_search.rb'
|
4
|
+
require 'yahoo/hotjobs.rb'
|
5
|
+
|
6
|
+
# Helper file to include the available libraries
|
7
|
+
|
8
|
+
# Main entry
|
9
|
+
if $0 == (__FILE__)

  # NOTE(review): everything in this guard is commented out, so running
  # this file directly is currently a no-op. The commented code below
  # sketches a planned command-line interface with -async / -extension
  # flags; AsyncInterface, Extension and ExtenionFile are not defined
  # anywhere in this file.

  # # Check the arguments
  # case ARGV.shift
  #
  # # We want to create an async interface
  # when /-async/
  #   $async = true
  #
  # # We want to create an extension interface
  # when /-extension/
  #   $extension = true
  # end
  #
  # # Go through the remaining command line arguments
  # ARGV.each do |file|
  #
  #   # Check to see if an async interface file was requested
  #   if $async
  #
  #     AsyncInterface.print_out(file)
  #
  #   # Check if this is an extension file
  #   elsif $extension
  #     # Parse the extension file
  #     Extension.new(file)
  #
  #     # Print out the data
  #     ExtenionFile.print_out(file)
  #   end
  #
  # end

end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
# Scrap the Yahoo Hotjobs website
|
7
|
+
#
|
8
|
+
# http://hotjobs.yahoo.com
|
9
|
+
# Scrap the Yahoo Hotjobs website
#
# http://hotjobs.yahoo.com
class HotjobsSearch

  # Constants
  DEBUG = false                             # Enable verbose parse logging
  TITLE_CELL = 0                            # Cell index of the job title link
  COMPANY_CELL = 1                          # Cell index of the company link
  LOCATION_CELL = 2                         # Cell index of the job location
  DATE_CELL = 3                             # Cell index of the posting date
  CELL_COUNT = 4                            # Cell count of a job-listing row
  HOTJOBS_LINK = "http://hotjobs.yahoo.com" # Base url for relative hrefs

  # Parse the provided query data
  #
  # @param query [String] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # Get the results table (id "results")
    table = (doc/"//table[@id=results]")

    # Iterate through each row
    rows = (table/"tr")

    # Retrieve the table rows that contain the job listings
    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # Only rows with exactly CELL_COUNT cells are job listings;
      # header and spacer rows have a different cell count.
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = HOTJOBS_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = HOTJOBS_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = get_location(cells[LOCATION_CELL])
        date, repost = get_dates(cells[DATE_CELL])

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing
        # NOTE(review): +repost+ is extracted above but discarded here —
        # JobListing.new is called without it; confirm whether it should
        # be carried onto the listing.
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://hotjobs.yahoo.com/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query yahoo for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search
  # @param num_entries [String] the number of entries to request
  # @return [String] the raw html returned by the search
  #
  # NOTE(review): +days_back+ and +num_entries+ are accepted but never
  # interpolated into the url below (updated_since is fixed to "month");
  # confirm whether the query string should use them.
  def self.query(location, keywords, days_back, num_entries)

    url = "http://hotjobs.yahoo.com/job-search?" +
      "src=advsearch&pageOp=search&ts=1259353986&" +
      "kw_search_type=kwany&kw=#{keywords}&kw_none=&" +
      "locations=#{location}&country=&locations=&locations=&" +
      "industry=&industry=&industry=&" +
      "updated_since=month&" +
      "exp_level=&experience_level=&" +
      "education=&salary[min]=&salary[type]=yearly&" +
      "commitment=FT&commitment=PT&jobtype=PERM&jobtype=CONT&" +
      "travel_amount=&company=&" +
      "source=&email_format=html&email_frequency=1&email_enabled=0&search_jobs=Search+Jobs"

    # Read the data from the url
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://hotjobs.yahoo.com/").read
  end

  # NOTE(review): `private` does NOT affect `def self.` singleton methods,
  # so the two helpers below are still publicly callable. If they are meant
  # to be private, use private_class_method — left unchanged here because
  # the test suite may call them directly.
  private
  # Method to get the available locations
  #
  # @param element the Hpricot td element holding the location cell
  # @return [String] the primary location text
  def self.get_location(element)

    location = ''

    puts "Element: #{element}" if DEBUG

    # check to see if the element has a span
    if (element/"span").size > 0

      # The first span is the primary location
      location << (element/"span")[0].inner_text

    # Only one location
    else
      location = element.inner_html
    end

    return location
  end

  # Method to get the Dates
  #
  # @param element the Hpricot td element holding the date cell
  # @return [Array(String, String)] the posting date and the repost date
  #   ('' when the listing was never reposted)
  def self.get_dates(element)

    date = ''
    repost = ''

    puts "Element: #{element}" if DEBUG

    spans = (element/"span")

    # Check to see if the element contains a span
    if spans.size > 0

      # The first span is the Reposted data
      repost = spans[0].inner_text

      # Remove the Reposted string
      repost.sub!(/Reposted /, "")

      # delete the span so it does not leak into the main date text below
      spans.remove
    end

    # Get the main date
    date = element.inner_text

    return date, repost

  end

end
|