scrapin-a-livin 0.1.0

@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2009 Kevin S Kirkup
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,36 @@
+ # scrapin-a-livin
+
+ This script helps you screen scrape the most common job sites and save the lists on your local computer.
+
+ You can then use other tools to display updates, send out resumes, or search your LinkedIn account
+ to see who you may know at each company.
+
+ Please feel free to contribute. All suggestions are welcome.
+ Hopefully this will help you find the career you are looking for.
+
+ Thanks go out to Igvita.com for posting this article:
+ http://www.igvita.com/2007/02/04/ruby-screen-scraper-in-60-seconds/
+
+ ## Quick links
+
+ * [Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
+ * [Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
+
+ ## How To
+
+ A
+
+ ## Note on Patches/Pull Requests
+
+ * Fork the project.
+ * Make your feature addition or bug fix.
+ * Add tests for it. This is important so I don't break it in a
+   future version unintentionally.
+ * Commit, but do not mess with the rakefile, version, or history.
+   (If you want to have your own version, that is fine, but
+   bump the version in a commit by itself so I can ignore it when I pull.)
+ * Send me a pull request. Bonus points for topic branches.
+
+ ## Copyright
+
+ Copyright (c) 2009 Kevin S Kirkup. See LICENSE for details.
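Since the "How To" section of the README is still a stub, here is a minimal sketch of what basic usage might look like, based on the classes shipped in this release. It assumes lib/ is on the load path and that you have network access; the search terms are only illustrative.

```ruby
# A sketch of basic usage (assumption: lib/ is on the load path; the gem
# does not document an entry point yet).
require 'rubygems'
require 'cgi'
require 'generic/listing'
require 'dice/dice_search'

# Keywords and location are interpolated directly into the query URL,
# so escape them first.
html = DiceSearch.query(CGI.escape("Atlanta, GA"), CGI.escape("ruby"), 7, 50)

# parse_listings returns an array of JobListing objects.
DiceSearch.parse_listings(html).each do |job|
  puts "#{job.date}  #{job.title} @ #{job.company} (#{job.location})"
end
```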
@@ -0,0 +1,75 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "scrapin-a-livin"
+     gem.summary = %Q{Scrape to find a job}
+     gem.description = <<-EOF
+       Are you in the technology industry?
+       Then you have probably lost a job during your career.
+       This script helps you scrape the most common job sites for your search criteria,
+       and save the lists on your local computer.
+
+       You can then use other tools to display updates, send out resumes or search your LinkedIn account
+       to see who you may know at that company.
+
+       Please feel free to contribute. All suggestions are welcome.
+       Hopefully this will help you find the career you are looking for.
+     EOF
+     gem.email = "kevin.kirkup@gmail.com"
+     gem.homepage = "http://github.com/angrytuna/scrapin-a-livin"
+     gem.authors = ["Kevin S Kirkup"]
+     gem.platform = Gem::Platform::RUBY
+     gem.require_path = 'lib'
+
+     gem.add_dependency('hpricot', '>= 0.6')
+
+     gem.add_development_dependency "thoughtbot-shoulda"
+
+     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+   end
+
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+ task :test => :check_dependencies
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION')
+     version = File.read('VERSION')
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "scrapin-a-livin #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.1.0
@@ -0,0 +1,122 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'open-uri'
+ require 'hpricot'
+
+ # Class to screen scrape the Dice website
+ #
+ # http://seeker.dice.com
+ class DiceSearch
+
+   # Constants
+   DEBUG = false
+   TITLE_CELL = 2
+   COMPANY_CELL = 3
+   LOCATION_CELL = 4
+   DATE_CELL = 5
+   CELL_COUNT = 6
+   DICE_LINK = "http://seeker.dice.com"
+
+   # Parse the provided query data
+   #
+   # @param query [String] the HTML web page data
+   # @return [Array<JobListing>] an array of job listings
+   def self.parse_listings(query)
+
+     # Create the listings
+     listings = Array.new
+
+     # Filter the data with Hpricot
+     doc = Hpricot(query)
+
+     # Get the table
+     table = (doc/"//table[@class=summary]")
+
+     # Get the rows
+     rows = (table/"tr")
+
+     # Retrieve the table rows that contain the job listings
+     rows.each { |row|
+
+       # Get the individual cells
+       cells = (row/"td")
+
+       # If this is a job listing
+       if cells.size == CELL_COUNT
+
+         # Get the fields
+         name = (cells[TITLE_CELL]/"a").inner_html
+         link = DICE_LINK + (cells[TITLE_CELL]/"a").attr("href")
+         company = (cells[COMPANY_CELL]/"a").inner_html
+         company_link = DICE_LINK + (cells[COMPANY_CELL]/"a").attr("href")
+         location = cells[LOCATION_CELL].inner_html
+         date = cells[DATE_CELL].inner_html
+
+         if DEBUG
+           puts "Row: count #{cells.size}"
+           puts "Name: #{name}"
+           puts "Link: #{link}"
+           puts "Company: #{company}"
+           puts "Company Link: #{company_link}"
+           puts "Location: #{location}"
+           puts "Date: #{date}"
+         end
+
+         # Create the job listing
+         listings << JobListing.new(name, link, company, company_link, location, date)
+
+       end
+
+     }
+
+     # Return the listings
+     return listings
+
+   end
+
+   # Retrieve the job listings
+   #
+   # @param url [String] the URL used to query the data
+   # @return [Array<JobListing>] an array of job listings
+   def self.get_listings(url)
+
+     # Read the data from the url
+     response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
+                          "From" => "email@addr.com",
+                          "Referer" => "http://seeker.dice.com/").read
+
+     # Parse the listings from the query
+     parse_listings(response)
+
+   end
+
+   # Query Dice for the HTML results of the search
+   #
+   # @param location [String] the location to search
+   # @param keywords [String] keywords to use for the search
+   # @param days_back [String] how long ago to search
+   # @param num_entries [String] the number of entries to request
+   def self.query(location, keywords, days_back, num_entries)
+
+     # The search URL
+     url = "http://seeker.dice.com/jobsearch/servlet/JobSearch" +
+           "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0" +
+           "&Ntk=JobSearchRanking&op=300" +
+           "&values=&FREE_TEXT=#{keywords}" +
+           "&Ntx=mode+matchall&WHERE=#{location}" +
+           "&WHEREList=#{location}" +
+           "&RADIUS=80.4672" +
+           "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525" +
+           "&TRAVEL=0&TAXTERM=1001" +
+           "&SORTSPEC=0" +
+           "&FRMT=0" +
+           "&DAYSBACK=#{days_back}" +
+           "&NUM_PER_PAGE=#{num_entries}"
+
+     # Read the data from the url
+     open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
+               "From" => "email@addr.com",
+               "Referer" => "http://seeker.dice.com/jobsearch/").read
+   end
+
+ end
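Because parse_listings takes the raw HTML as a plain string, the scraper can be exercised against a previously saved results page without hitting the live site. A sketch follows; the file name is hypothetical and lib/ is assumed to be on the load path.

```ruby
# A sketch, not part of the gem: parse a previously saved Dice results page.
# "dice_results.html" is a hypothetical file captured earlier with DiceSearch.query.
require 'rubygems'
require 'generic/listing'   # assumes lib/ is on the load path
require 'dice/dice_search'

listings = DiceSearch.parse_listings(File.read("dice_results.html"))

puts "Found #{listings.size} listings"
listings.first(5).each do |job|
  puts "#{job.title} - #{job.company} (#{job.location}), posted #{job.date}"
end
```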
@@ -0,0 +1,26 @@
+ #!/usr/bin/env ruby
+
+ # A generic job listing
+ class JobListing
+
+   attr_reader :title
+   attr_reader :link
+   attr_reader :company
+   attr_reader :company_link
+   attr_reader :location
+   attr_reader :date
+   attr_reader :repost
+
+   # Initializer for the job listing
+   def initialize(title, link, company, company_link, location, date, repost = nil)
+
+     @title = title
+     @link = link
+     @company = company
+     @company_link = company_link
+     @location = location
+     @date = date
+     @repost = repost
+   end
+
+ end
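The README talks about saving the lists on your local computer. Since JobListing only exposes readers, one simple way to persist results is to dump each listing's fields with Ruby's standard CSV library. A minimal sketch, assuming `listings` came from DiceSearch or HotjobsSearch and "jobs.csv" is an arbitrary output path:

```ruby
# Sketch: dump an array of JobListing objects to a local CSV file.
# `listings` is assumed to come from DiceSearch or HotjobsSearch;
# "jobs.csv" is an arbitrary output path.
require 'csv'

CSV.open("jobs.csv", "w") do |csv|
  csv << %w[title company location date repost link company_link]
  listings.each do |job|
    csv << [job.title, job.company, job.location, job.date,
            job.repost, job.link, job.company_link]
  end
end
```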
@@ -0,0 +1,43 @@
+ #!/usr/bin/env ruby
+ require 'generic/listing.rb'
+ require 'dice/dice_search.rb'
+ require 'yahoo/hotjobs.rb'
+
+ # Helper file to include the available libraries
+
+ # Main entry
+ if $0 == (__FILE__)
+
+   # # Check the arguments
+   # case ARGV.shift
+   #
+   # # We want to create an async interface
+   # when /-async/
+   #   $async = true
+   #
+   # # We want to create an extension interface
+   # when /-extension/
+   #   $extension = true
+   # end
+   #
+   # # Go through the remaining command line arguments
+   # ARGV.each do |file|
+   #
+   #   # Check to see if an async interface file was requested
+   #   if $async
+   #
+   #     AsyncInterface.print_out(file)
+   #
+   #   # Check if this is an extension file
+   #   elsif $extension
+   #     # Parse the extension file
+   #     Extension.new(file)
+   #
+   #     # Print out the data
+   #     ExtensionFile.print_out(file)
+   #   end
+   #
+   # end
+
+
+ end
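The main entry above is still a stub (the commented-out argument handling looks like scaffolding from another project), so the following is a purely hypothetical sketch of what a small command-line entry point inside that block could look like. The argument names, defaults, and output format are assumptions, not part of this release.

```ruby
# Hypothetical CLI sketch, not part of this release: search both supported
# sites for the location and keywords given on the command line.
require 'cgi'

location, keywords = ARGV
abort "Usage: #{$0} LOCATION KEYWORDS" unless location && keywords
location = CGI.escape(location)
keywords = CGI.escape(keywords)

listings = []
listings.concat DiceSearch.parse_listings(DiceSearch.query(location, keywords, 7, 50))
listings.concat HotjobsSearch.parse_listings(HotjobsSearch.query(location, keywords, 7, 50))

listings.each do |job|
  puts "#{job.date}  #{job.title} @ #{job.company} (#{job.location})"
end
```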
@@ -0,0 +1,171 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'open-uri'
+ require 'hpricot'
+
+ # Scrape the Yahoo! HotJobs website
+ #
+ # http://hotjobs.yahoo.com
+ class HotjobsSearch
+
+   # Constants
+   DEBUG = false
+   TITLE_CELL = 0
+   COMPANY_CELL = 1
+   LOCATION_CELL = 2
+   DATE_CELL = 3
+   CELL_COUNT = 4
+   HOTJOBS_LINK = "http://hotjobs.yahoo.com"
+
+   # Parse the provided query data
+   #
+   # @param query [String] the HTML web page data
+   # @return [Array<JobListing>] an array of job listings
+   def self.parse_listings(query)
+
+     # Create the listings
+     listings = Array.new
+
+     # Filter the data with Hpricot
+     doc = Hpricot(query)
+
+     # Get the table
+     table = (doc/"//table[@id=results]")
+
+     # Iterate through each row
+     rows = (table/"tr")
+
+     # Retrieve the table rows that contain the job listings
+     rows.each { |row|
+
+       # Get the individual cells
+       cells = (row/"td")
+
+       # If this is a job listing
+       if cells.size == CELL_COUNT
+
+         # Get the fields
+         name = (cells[TITLE_CELL]/"a").inner_html
+         link = HOTJOBS_LINK + (cells[TITLE_CELL]/"a").attr("href")
+         company = (cells[COMPANY_CELL]/"a").inner_html
+         company_link = HOTJOBS_LINK + (cells[COMPANY_CELL]/"a").attr("href")
+         location = get_location(cells[LOCATION_CELL])
+         date, repost = get_dates(cells[DATE_CELL])
+
+         if DEBUG
+           puts "Row: count #{cells.size}"
+           puts "Name: #{name}"
+           puts "Link: #{link}"
+           puts "Company: #{company}"
+           puts "Company Link: #{company_link}"
+           puts "Location: #{location}"
+           puts "Date: #{date}"
+         end
+
+         # Create the job listing
+         listings << JobListing.new(name, link, company, company_link, location, date, repost)
+
+       end
+
+     }
+
+     # Return the listings
+     return listings
+
+   end
+
+   # Retrieve the job listings
+   #
+   # @param url [String] the URL used to query the data
+   # @return [Array<JobListing>] an array of job listings
+   def self.get_listings(url)
+
+     # Read the data from the url
+     response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
+                          "From" => "email@addr.com",
+                          "Referer" => "http://hotjobs.yahoo.com/").read
+
+     # Parse the listings from the query
+     parse_listings(response)
+
+   end
+
+   # Query HotJobs for the HTML results of the search
+   #
+   # @param location [String] the location to search
+   # @param keywords [String] keywords to use for the search
+   # @param days_back [String] how long ago to search
+   # @param num_entries [String] the number of entries to request
+   def self.query(location, keywords, days_back, num_entries)
+
+     url = "http://hotjobs.yahoo.com/job-search?" +
+           "src=advsearch&pageOp=search&ts=1259353986&" +
+           "kw_search_type=kwany&kw=#{keywords}&kw_none=&" +
+           "locations=#{location}&country=&locations=&locations=&" +
+           "industry=&industry=&industry=&" +
+           "updated_since=month&" +
+           "exp_level=&experience_level=&" +
+           "education=&salary[min]=&salary[type]=yearly&" +
+           "commitment=FT&commitment=PT&jobtype=PERM&jobtype=CONT&" +
+           "travel_amount=&company=&" +
+           "source=&email_format=html&email_frequency=1&email_enabled=0&search_jobs=Search+Jobs"
+
+     # Read the data from the url
+     open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
+               "From" => "email@addr.com",
+               "Referer" => "http://hotjobs.yahoo.com/").read
+   end
+
+   # Helper methods for parsing individual cells
+   # Method to get the available locations
+   def self.get_location(element)
+
+     location = ''
+
+     puts "Element: #{element}" if DEBUG
+
+     # Check to see if the element has a span
+     if (element/"span").size > 0
+
+       # The first span is the primary location
+       location << (element/"span")[0].inner_text
+
+     # Only one location
+     else
+       location = element.inner_html
+     end
+
+     return location
+   end
+
+   # Method to get the dates
+   def self.get_dates(element)
+
+     date = ''
+     repost = ''
+
+     puts "Element: #{element}" if DEBUG
+
+     spans = (element/"span")
+
+     # Check to see if the element contains a span
+     if spans.size > 0
+
+       # The first span is the Reposted data
+       repost = spans[0].inner_text
+
+       # Remove the Reposted string
+       repost.sub!(/Reposted /, "")
+
+       # Delete the span
+       spans.remove
+     end
+
+     # Get the main date
+     date = element.inner_text
+
+     return date, repost
+
+   end
+   private_class_method :get_location, :get_dates
+ end
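Note that HotjobsSearch.query currently hard-codes updated_since=month and no per-page count, so the days_back and num_entries arguments are effectively unused. A short sketch of querying HotJobs and flagging reposted listings via the repost field follows; it assumes lib/ is on the load path and network access, and the search terms are illustrative.

```ruby
# Sketch (assumptions: lib/ on the load path, network access): query HotJobs
# and flag listings that were reposted.
require 'rubygems'
require 'cgi'
require 'generic/listing'
require 'yahoo/hotjobs'

html = HotjobsSearch.query(CGI.escape("Atlanta, GA"), CGI.escape("ruby"), 7, 50)

HotjobsSearch.parse_listings(html).each do |job|
  flag = job.repost.to_s.empty? ? "" : " (reposted #{job.repost})"
  puts "#{job.title} @ #{job.company}#{flag}"
end
```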