scrapin-a-livin 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.markdown +36 -0
- data/Rakefile +75 -0
- data/VERSION +1 -0
- data/lib/dice/dice_search.rb +122 -0
- data/lib/generic/listing.rb +25 -0
- data/lib/scrapin-a-livin.rb +43 -0
- data/lib/yahoo/hotjobs.rb +171 -0
- data/test/dice_parser_test.rb +172 -0
- data/test/queries/dice/queryAustin.html +2207 -0
- data/test/queries/dice/queryRaleigh.html +2272 -0
- data/test/queries/dice/querySanJose.html +2517 -0
- data/test/queries/hotjobs/queryAustin.html +737 -0
- data/test/queries/hotjobs/queryRaleigh.html +755 -0
- data/test/queries/hotjobs/querySanJose.html +753 -0
- data/test/scripts/diceDump.rb +71 -0
- data/test/scripts/hotjobsDump.rb +70 -0
- data/test/test_helper.rb +8 -0
- data/test/yahoo_parser_test.rb +168 -0
- metadata +110 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Kevin S Kirkup
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# scrapin-a-livin
|
2
|
+
|
3
|
+
This script helps you screen scrape the most common job sites and save the lists on your local computer.
|
4
|
+
|
5
|
+
You can then use other tools to display updates, send out resumes or search your LinkedIn account
|
6
|
+
to see who you may know at that company.
|
7
|
+
|
8
|
+
Please feel free to contribute. All suggestions are welcome.
|
9
|
+
Hopefully this will help you find the career you are looking for.
|
10
|
+
|
11
|
+
Thanks goes out to Igvita.com for posting this article
|
12
|
+
http://www.igvita.com/2007/02/04/ruby-screen-scraper-in-60-seconds/
|
13
|
+
|
14
|
+
## Quick links
|
15
|
+
|
16
|
+
* [Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
|
17
|
+
* [Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
|
18
|
+
|
19
|
+
## How To
|
20
|
+
|
21
|
+
A
|
22
|
+
|
23
|
+
## Note on Patches/Pull Requests
|
24
|
+
|
25
|
+
* Fork the project.
|
26
|
+
* Make your feature addition or bug fix.
|
27
|
+
* Add tests for it. This is important so I don't break it in a
|
28
|
+
future version unintentionally.
|
29
|
+
* Commit, do not mess with rakefile, version, or history.
|
30
|
+
(if you want to have your own version, that is fine but
|
31
|
+
bump version in a commit by itself I can ignore when I pull)
|
32
|
+
* Send me a pull request. Bonus points for topic branches.
|
33
|
+
|
34
|
+
## Copyright
|
35
|
+
|
36
|
+
Copyright (c) 2009 Kevin S Kirkup. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging tasks via Jeweler.  Degrades gracefully: if jeweler is not
# installed, only a warning is printed and the rest of the Rakefile still loads.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "scrapin-a-livin"
    gem.summary = %Q{Scrape to find a job}
    gem.description = <<EOF
Are you in the technology industry?
The you have probably lost a job during your career.
This script helps you scrape the most common job sites for your search criteria,
and save the lists on you local computer.

You can then use other tools to display updates, send out resumes or search your LinkedIn account
to see who you may know at that company.

Please feel free to contribute make. All suggestions are welcome.
Hopefully this will help you find the career you are looking for.
EOF
    gem.email = "kevin.kirkup@gmail.com"
    gem.homepage = "http://github.com/angrytuna/scrapin-a-livin"
    gem.authors = ["Kevin S Kirkup"]
    gem.platform = Gem::Platform::RUBY
    gem.require_path = 'lib'

    # Runtime dependency: hpricot does the HTML parsing in lib/.
    gem.add_dependency('hpricot', '>= 0.6')

    # Development-only dependency for the test suite.
    gem.add_development_dependency "thoughtbot-shoulda"

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Standard test task: runs every *_test.rb under test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task; falls back to an aborting stub when rcov is missing.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Make sure gem dependencies are present before testing (jeweler task).
task :test => :check_dependencies

task :default => :test

# RDoc generation; the title embeds the contents of the VERSION file.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION')
    # NOTE(review): File.read keeps the trailing newline from VERSION, so the
    # title may contain a "\n" — confirm this is intended.
    version = File.read('VERSION')
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scrapin-a-livin #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'rubygems'
require 'open-uri'
require 'hpricot'

# Class to screen scrape the dice website
#
# http://seeker.dice.com
class DiceSearch

  # Constants
  DEBUG = false                          # set true to dump each parsed row
  TITLE_CELL = 2                         # td index of the job-title cell
  COMPANY_CELL = 3                       # td index of the company cell
  LOCATION_CELL = 4                      # td index of the location cell
  DATE_CELL = 5                          # td index of the posting-date cell
  CELL_COUNT = 6                         # rows with exactly this many cells are listings
  DICE_LINK = "http://seeker.dice.com"   # base used to absolutize relative hrefs

  # Parse the provided query data
  #
  # @param query [String, #read] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # Get the results table
    table = (doc/"//table[@class=summary]")

    # Get the rows
    rows = (table/"tr")

    # Retrieve the table rows that contain the job listings
    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # Rows with exactly CELL_COUNT cells are job listings; header and
      # filler rows have a different cell count and are skipped.
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = DICE_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = DICE_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = cells[LOCATION_CELL].inner_html
        date = cells[DATE_CELL].inner_html

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url.
    # FIX: the Referer previously pointed at hotjobs.yahoo.com (copy/paste
    # from the hotjobs scraper); send the dice referer instead.
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://seeker.dice.com/jobsearch/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query dice for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search
  # @param num_entries [String] the number of entries to request
  # @return [String] the raw html of the results page
  def self.query(location, keywords, days_back, num_entries)

    # The search URL
    url = "http://seeker.dice.com/jobsearch/servlet/JobSearch" +
      "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0" +
      "&Ntk=JobSearchRanking&op=300" +
      "&values=&FREE_TEXT=#{keywords}" +
      "&Ntx=mode+matchall&WHERE=#{location}" +
      "&WHEREList=#{location}" +
      "&RADIUS=80.4672" +
      "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525" +
      "&TRAVEL=0&TAXTERM=1001" +
      "&SORTSPEC=0" +
      "&FRMT=0" +
      "&DAYSBACK=#{days_back}" +
      "&NUM_PER_PAGE=#{num_entries}"

    # Read the data from the url.
    # FIX: was open(@url, ...) — @url is an unset instance variable inside a
    # class method (always nil), so the request could never work; use the
    # local url built above.
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://seeker.dice.com/jobsearch/").read
  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby

# A generic job listing scraped from one of the supported job sites.
class JobListing

  attr_reader :title         # job title text
  attr_reader :link          # absolute link to the job posting
  attr_reader :company       # company name
  attr_reader :company_link  # absolute link to the company page
  attr_reader :location      # listing location
  attr_reader :date          # posting date
  attr_reader :repost        # repost date, nil when the listing was never reposted

  # Initializer for the job listing
  #
  # FIX: attr_reader :repost was declared but @repost was never assigned, so
  # #repost always returned nil.  Accept it as an optional trailing argument;
  # existing six-argument callers are unaffected.
  def initialize(title, link, company, company_link, location, date, repost = nil)

    @title = title
    @link = link
    @company = company
    @company_link = company_link
    @location = location
    @date = date
    @repost = repost
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Helper file that pulls in all of the available scraper libraries.
require 'generic/listing.rb'
require 'dice/dice_search.rb'
require 'yahoo/hotjobs.rb'

# Main entry point when executed directly (rather than required).
if __FILE__ == $0

  # The command-line interface is still a stubbed-out skeleton:
  #
  #   # Check the arguments
  #   case ARGV.shift
  #   when /-async/      # an async interface was requested
  #     $async = true
  #   when /-extension/  # an extension interface was requested
  #     $extension = true
  #   end
  #
  #   # Go through the remaining command line arguments
  #   ARGV.each do |file|
  #     if $async
  #       # An async interface file was requested
  #       AsyncInterface.print_out(file)
  #     elsif $extension
  #       # Parse the extension file, then print out the data
  #       Extension.new(file)
  #       ExtenionFile.print_out(file)
  #     end
  #   end

end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'rubygems'
require 'open-uri'
require 'hpricot'

# Scrape the Yahoo Hotjobs website
#
# http://hotjobs.yahoo.com
class HotjobsSearch

  # Constants
  DEBUG = false                             # set true to dump each parsed row
  TITLE_CELL = 0                            # td index of the job-title cell
  COMPANY_CELL = 1                          # td index of the company cell
  LOCATION_CELL = 2                         # td index of the location cell
  DATE_CELL = 3                             # td index of the date cell
  CELL_COUNT = 4                            # rows with exactly this many cells are listings
  HOTJOBS_LINK = "http://hotjobs.yahoo.com" # base used to absolutize relative hrefs

  # Parse the provided query data
  #
  # @param query [String, #read] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # Get the table
    table = (doc/"//table[@id=results]")

    # Iterate through each row
    rows = (table/"tr")

    # Retrieve the table rows that contain the job listings
    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # If this is a job listing
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = HOTJOBS_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = HOTJOBS_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = get_location(cells[LOCATION_CELL])
        date, repost = get_dates(cells[DATE_CELL])

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing.
        # NOTE(review): repost is parsed above but discarded here — confirm
        # whether it should be carried on the listing.
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://hotjobs.yahoo.com/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query yahoo for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search (currently unused; the
  #   query hard-codes updated_since=month)
  # @param num_entries [String] the number of entries to request (currently
  #   unused by the generated url)
  # @return [String] the raw html of the results page
  def self.query(location, keywords, days_back, num_entries)

    url = "http://hotjobs.yahoo.com/job-search?" +
      "src=advsearch&pageOp=search&ts=1259353986&" +
      "kw_search_type=kwany&kw=#{keywords}&kw_none=&" +
      "locations=#{location}&country=&locations=&locations=&" +
      "industry=&industry=&industry=&" +
      "updated_since=month&" +
      "exp_level=&experience_level=&" +
      "education=&salary[min]=&salary[type]=yearly&" +
      "commitment=FT&commitment=PT&jobtype=PERM&jobtype=CONT&" +
      "travel_amount=&company=&" +
      "source=&email_format=html&email_frequency=1&email_enabled=0&search_jobs=Search+Jobs"

    # Read the data from the url
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://hotjobs.yahoo.com/").read
  end

  # Method to get the available locations
  #
  # @param element [Hpricot element] the location td cell
  # @return [String] the primary location text
  def self.get_location(element)

    location = ''

    puts "Element: #{element}" if DEBUG

    # check to see if the element has a span
    if (element/"span").size > 0

      # The first span is the primary location
      location << (element/"span")[0].inner_text

    # Only one location
    else
      location = element.inner_html
    end

    return location
  end

  # Method to get the Dates
  #
  # @param element [Hpricot element] the date td cell
  # @return [Array(String, String)] the posting date and the repost date
  #   ('' when the listing was not reposted)
  def self.get_dates(element)

    date = ''
    repost = ''

    puts "Element: #{element}" if DEBUG

    spans = (element/"span")

    # Check to see if the element contains a span
    if spans.size > 0

      # The first span is the Reposted data
      repost = spans[0].inner_text

      # Remove the Reposted string
      repost.sub!(/Reposted /, "")

      # delete the span so element.inner_text below yields only the main date
      spans.remove
    end

    # Get the main date
    date = element.inner_text

    return date, repost

  end

  # FIX: a bare `private` has no effect on singleton (def self.) methods, so
  # the helpers above were actually public; hide them explicitly.
  private_class_method :get_location, :get_dates

end
|