scrapin-a-livin 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Kevin S Kirkup
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,36 @@
1
+ # scrapin-a-livin
2
+
3
+ This script helps you screen scrape the most common job sites and save the lists on your local computer.
4
+
5
+ You can then use other tools to display updates, send out resumes or search your LinkedIn account
6
+ to see who you may know at that company.
7
+
8
+ Please feel free to contribute. All suggestions are welcome.
9
+ Hopefully this will help you find the career you are looking for.
10
+
11
+ Thanks goes out to Igvita.com for posting this article
12
+ http://www.igvita.com/2007/02/04/ruby-screen-scraper-in-60-seconds/
13
+
14
+ ## Quick links
15
+
16
+ * [Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
17
+ * [Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
18
+
19
+ ## How To
20
+
21
+ A
22
+
23
+ ## Note on Patches/Pull Requests
24
+
25
+ * Fork the project.
26
+ * Make your feature addition or bug fix.
27
+ * Add tests for it. This is important so I don't break it in a
28
+ future version unintentionally.
29
+ * Commit, do not mess with rakefile, version, or history.
30
+ (if you want to have your own version, that is fine but
31
+ bump version in a commit by itself I can ignore when I pull)
32
+ * Send me a pull request. Bonus points for topic branches.
33
+
34
+ ## Copyright
35
+
36
+ Copyright (c) 2009 Kevin S Kirkup. See LICENSE for details.
@@ -0,0 +1,75 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "scrapin-a-livin"
8
+ gem.summary = %Q{Scrape to find a job}
9
+ gem.description = <<EOF
10
+ Are you in the technology industry?
11
+ The you have probably lost a job during your career.
12
+ This script helps you scrape the most common job sites for your search criteria,
13
+ and save the lists on you local computer.
14
+
15
+ You can then use other tools to display updates, send out resumes or search your LinkedIn account
16
+ to see who you may know at that company.
17
+
18
+ Please feel free to contribute make. All suggestions are welcome.
19
+ Hopefully this will help you find the career you are looking for.
20
+ EOF
21
+ gem.email = "kevin.kirkup@gmail.com"
22
+ gem.homepage = "http://github.com/angrytuna/scrapin-a-livin"
23
+ gem.authors = ["Kevin S Kirkup"]
24
+ gem.platform = Gem::Platform::RUBY
25
+ gem.require_path = 'lib'
26
+
27
+ gem.add_dependency('hpricot', '>= 0.6')
28
+
29
+ gem.add_development_dependency "thoughtbot-shoulda"
30
+
31
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
32
+ end
33
+
34
+ Jeweler::GemcutterTasks.new
35
+ rescue LoadError
36
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
37
+ end
38
+
39
+ require 'rake/testtask'
40
+ Rake::TestTask.new(:test) do |test|
41
+ test.libs << 'lib' << 'test'
42
+ test.pattern = 'test/**/*_test.rb'
43
+ test.verbose = true
44
+ end
45
+
46
+ begin
47
+ require 'rcov/rcovtask'
48
+ Rcov::RcovTask.new do |test|
49
+ test.libs << 'test'
50
+ test.pattern = 'test/**/*_test.rb'
51
+ test.verbose = true
52
+ end
53
+ rescue LoadError
54
+ task :rcov do
55
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
56
+ end
57
+ end
58
+
59
+ task :test => :check_dependencies
60
+
61
+ task :default => :test
62
+
63
+ require 'rake/rdoctask'
64
+ Rake::RDocTask.new do |rdoc|
65
+ if File.exist?('VERSION')
66
+ version = File.read('VERSION')
67
+ else
68
+ version = ""
69
+ end
70
+
71
+ rdoc.rdoc_dir = 'rdoc'
72
+ rdoc.title = "scrapin-a-livin #{version}"
73
+ rdoc.rdoc_files.include('README*')
74
+ rdoc.rdoc_files.include('lib/**/*.rb')
75
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,122 @@
#!/usr/bin/env ruby
require 'rubygems'
require 'open-uri'
require 'hpricot'

# Class to screen scrape the dice website
#
# http://seeker.dice.com
class DiceSearch

  # Constants
  DEBUG = false                        # Set true to dump each parsed row to stdout
  TITLE_CELL = 2                       # Column indexes within a result table row
  COMPANY_CELL = 3
  LOCATION_CELL = 4
  DATE_CELL = 5
  CELL_COUNT = 6                       # A real listing row has exactly this many cells
  DICE_LINK = "http://seeker.dice.com" # Base used to absolutize relative hrefs

  # Parse the provided query data
  #
  # @param query [String] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)

    # Create the listings
    listings = Array.new

    # Filter the data with Hpricot
    doc = Hpricot(query)

    # The listings live in the table with class "summary"
    table = (doc/"//table[@class=summary]")
    rows = (table/"tr")

    rows.each { |row|

      # Get the individual cells
      cells = (row/"td")

      # Rows with any other cell count are headers/spacers, not listings
      if cells.size == CELL_COUNT

        # Get the fields
        name = (cells[TITLE_CELL]/"a").inner_html
        link = DICE_LINK + (cells[TITLE_CELL]/"a").attr("href")
        company = (cells[COMPANY_CELL]/"a").inner_html
        company_link = DICE_LINK + (cells[COMPANY_CELL]/"a").attr("href")
        location = cells[LOCATION_CELL].inner_html
        date = cells[DATE_CELL].inner_html

        if DEBUG
          puts "Row: count #{cells.size}"
          puts "Name: #{name}"
          puts "Link: #{link}"
          puts "Company: #{company}"
          puts "Company Link: #{company_link}"
          puts "Location: #{location}"
          puts "Date: #{date}"
        end

        # Create the job listing
        listings << JobListing.new(name, link, company, company_link, location, date)

      end

    }

    # Return the listings
    return listings

  end

  # Retrieve the job listings
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)

    # Read the data from the url.
    # BUGFIX: the Referer previously pointed at hotjobs.yahoo.com (copy/paste
    # from the Hotjobs scraper); use the Dice referer, matching self.query.
    response = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                    "From" => "email@addr.com",
                    "Referer" => "http://seeker.dice.com/jobsearch/").read

    # Parse the listings from the query
    parse_listings(response)

  end

  # Query dice for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search
  # @param num_entries [String] the number of entries to request
  def self.query(location, keywords, days_back, num_entries)

    # The search URL
    url = "http://seeker.dice.com/jobsearch/servlet/JobSearch" +
      "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0" +
      "&Ntk=JobSearchRanking&op=300" +
      "&values=&FREE_TEXT=#{keywords}" +
      "&Ntx=mode+matchall&WHERE=#{location}" +
      "&WHEREList=#{location}" +
      "&RADIUS=80.4672" +
      "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525" +
      "&TRAVEL=0&TAXTERM=1001" +
      "&SORTSPEC=0" +
      "&FRMT=0" +
      "&DAYSBACK=#{days_back}" +
      "&NUM_PER_PAGE=#{num_entries}"

    # BUGFIX: the original fetched `open(@url, ...)` -- @url is an undefined
    # instance variable (nil) inside this class method, so the request could
    # never use the search url built above. Fetch the local `url` instead.
    open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://seeker.dice.com/jobsearch/").read
  end

end
@@ -0,0 +1,25 @@
#!/usr/bin/env ruby

# A generic job listing scraped from one of the supported job sites.
class JobListing

  attr_reader :title
  attr_reader :link
  attr_reader :company
  attr_reader :company_link
  attr_reader :location
  attr_reader :date
  attr_reader :repost

  # Initializer for the job listing
  #
  # BUGFIX: @repost was never assigned even though attr_reader :repost is
  # declared, so the reader always returned nil (HotjobsSearch parses a
  # repost date but had no way to store it). The new trailing parameter
  # defaults to nil, so all existing six-argument call sites still work.
  #
  # @param title [String] the job title
  # @param link [String] absolute url of the listing
  # @param company [String] the hiring company's name
  # @param company_link [String] absolute url of the company page
  # @param location [String] where the job is located
  # @param date [String] the posting date
  # @param repost [String, nil] optional repost date
  def initialize(title, link, company, company_link, location, date, repost = nil)

    @title = title
    @link = link
    @company = company
    @company_link = company_link
    @location = location
    @date = date
    @repost = repost
  end

end
@@ -0,0 +1,43 @@
#!/usr/bin/env ruby
#
# Helper file that pulls in the available scraper libraries.
# (Dropping the redundant ".rb" suffix is equivalent for require.)
require 'generic/listing'
require 'dice/dice_search'
require 'yahoo/hotjobs'

# Main entry point -- command-line handling below is currently disabled.
if __FILE__ == $0

  # # Check the arguments
  # case ARGV.shift
  #
  # # An async interface was requested
  # when /-async/
  #   $async = true
  #
  # # An extension interface was requested
  # when /-extension/
  #   $extension = true
  # end
  #
  # # Process each remaining command-line argument
  # ARGV.each do |file|
  #
  #   if $async
  #     # Emit the async interface for this file
  #     AsyncInterface.print_out(file)
  #
  #   elsif $extension
  #     # Parse the extension file, then print it out
  #     Extension.new(file)
  #     ExtenionFile.print_out(file)
  #   end
  #
  # end

end
@@ -0,0 +1,171 @@
#!/usr/bin/env ruby
require 'rubygems'
require 'open-uri'
require 'hpricot'

# Screen scrape the Yahoo Hotjobs website
#
# http://hotjobs.yahoo.com
class HotjobsSearch

  # Constants
  DEBUG = false                             # Set true to dump each parsed row
  TITLE_CELL = 0                            # Column indexes within a result row
  COMPANY_CELL = 1
  LOCATION_CELL = 2
  DATE_CELL = 3
  CELL_COUNT = 4                            # A listing row has exactly this many cells
  HOTJOBS_LINK = "http://hotjobs.yahoo.com" # Base used to absolutize relative hrefs

  # Parse the provided query data
  #
  # @param query [String] the html web page data
  # @return [Array<JobListing>] an array of job listings
  def self.parse_listings(query)
    listings = []

    # The listings live in the table with id "results"
    doc = Hpricot(query)
    result_rows = ((doc/"//table[@id=results]")/"tr")

    result_rows.each do |row|
      columns = (row/"td")

      # Any other cell count means a header/spacer row, not a listing
      next unless columns.size == CELL_COUNT

      name = (columns[TITLE_CELL]/"a").inner_html
      link = HOTJOBS_LINK + (columns[TITLE_CELL]/"a").attr("href")
      company = (columns[COMPANY_CELL]/"a").inner_html
      company_link = HOTJOBS_LINK + (columns[COMPANY_CELL]/"a").attr("href")
      location = get_location(columns[LOCATION_CELL])
      date, repost = get_dates(columns[DATE_CELL])

      if DEBUG
        puts "Row: count #{columns.size}"
        puts "Name: #{name}"
        puts "Link: #{link}"
        puts "Company: #{company}"
        puts "Company Link: #{company_link}"
        puts "Location: #{location}"
        puts "Date: #{date}"
      end

      # NOTE(review): repost is parsed above but dropped here --
      # JobListing's initializer only takes six arguments.
      listings << JobListing.new(name, link, company, company_link, location, date)
    end

    listings
  end

  # Retrieve the job listings for a fully-built query url
  #
  # @param url [String] the url used to query the data
  # @return [Array<JobListing>] an array of job listings
  def self.get_listings(url)
    # Fetch the page, then hand it to the parser
    page = open(url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
                "From" => "email@addr.com",
                "Referer" => "http://hotjobs.yahoo.com/").read

    parse_listings(page)
  end

  # Query yahoo for html code for the query
  #
  # @param location [String] the location to search
  # @param keywords [String] keywords to use for the search
  # @param days_back [String] how long ago to search (NOTE(review): not
  #   interpolated -- the url hard-codes updated_since=month)
  # @param num_entries [String] the number of entries to request
  #   (NOTE(review): also not interpolated into the url)
  def self.query(location, keywords, days_back, num_entries)
    search_url = "http://hotjobs.yahoo.com/job-search?" +
      "src=advsearch&pageOp=search&ts=1259353986&" +
      "kw_search_type=kwany&kw=#{keywords}&kw_none=&" +
      "locations=#{location}&country=&locations=&locations=&" +
      "industry=&industry=&industry=&" +
      "updated_since=month&" +
      "exp_level=&experience_level=&" +
      "education=&salary[min]=&salary[type]=yearly&" +
      "commitment=FT&commitment=PT&jobtype=PERM&jobtype=CONT&" +
      "travel_amount=&company=&" +
      "source=&email_format=html&email_frequency=1&email_enabled=0&search_jobs=Search+Jobs"

    # Read the data from the url
    open(search_url, "User-Agent" => "Ruby/#{RUBY_VERSION}",
         "From" => "email@addr.com",
         "Referer" => "http://hotjobs.yahoo.com/").read
  end

  private
  # NOTE(review): `private` does not affect singleton (self.) methods, so the
  # helpers below remain externally callable -- same as the original.

  # Extract the primary location from a listing cell
  def self.get_location(element)
    puts "Element: #{element}" if DEBUG

    # Multi-location cells wrap locations in spans; the first span is the
    # primary location. Single-location cells carry the text directly.
    if (element/"span").size > 0
      location = ''
      location << (element/"span")[0].inner_text
    else
      location = element.inner_html
    end

    location
  end

  # Split a listing's date cell into posted date and repost date
  def self.get_dates(element)
    repost = ''

    puts "Element: #{element}" if DEBUG

    spans = (element/"span")

    # When present, a span holds the "Reposted <date>" annotation
    if spans.size > 0
      repost = spans[0].inner_text

      # Strip the "Reposted " prefix, leaving just the date
      repost.sub!(/Reposted /, "")

      # Drop the span so the remaining cell text is only the posted date
      spans.remove
    end

    date = element.inner_text

    return date, repost
  end

end