scrapin-a-livin 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'open-uri'
4
+ require 'hpricot'
5
+
6
# We want to be able to easily pull and update the set of listings
# from the Dice website. This script formats the request for us
# to make things a little easier.

# Number of listings requested per results page (sent as NUM_PER_PAGE).
ENTRIES = 5

# How far back the search reaches (sent as DAYSBACK).
DAYS_BACK = 30

# Free-text search terms (sent as FREE_TEXT). Frozen so that no code can
# modify the shared string in place.
SEARCH_KEYWORDS = "Software".freeze
12
+
13
# Percent-encode the characters that may not appear raw in a query-string
# value. '+' and '%' are passed through untouched so that a value the caller
# has already encoded (e.g. "San+Jose+CA") yields exactly the same URL as
# before.
#
# @param value [#to_s] the raw or pre-encoded query value
# @return [String] the value, safe to splice into a query string
def escape_query_value(value)
  value.to_s.gsub(/[^A-Za-z0-9*\-._+%]/) { |char| URI.encode_www_form_component(char) }
end

# Format the URL to retrieve the current listings from
# the Dice website
#
# http://seeker.dice.com
# @param where [String] the job location, plain ("San Jose, CA") or already
#   form-encoded ("San+Jose+CA")
# @param keywords [String] free-text search terms; defaults to SEARCH_KEYWORDS
# @param days_back [Integer] value sent as DAYSBACK; defaults to DAYS_BACK
# @param entries [Integer] value sent as NUM_PER_PAGE; defaults to ENTRIES
# @return [String] the url request for dice.com
def get_url(where, keywords = SEARCH_KEYWORDS, days_back = DAYS_BACK, entries = ENTRIES)
  location = escape_query_value(where)

  [
    "http://seeker.dice.com/jobsearch/servlet/JobSearch",
    "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0",
    "&Ntk=JobSearchRanking&op=300",
    "&values=&FREE_TEXT=#{escape_query_value(keywords)}",
    # The location is sent twice, as WHERE and as WHEREList
    "&Ntx=mode+matchall&WHERE=#{location}",
    "&WHEREList=#{location}",
    # NOTE(review): presumably 50 miles expressed in kilometres - confirm
    "&RADIUS=80.4672",
    # NOTE(review): METRO_AREA is a fixed coordinate pair that does not vary
    # with `where` - confirm the site ignores it for LOCATION_OPTION=2
    "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525",
    "&TRAVEL=0&TAXTERM=1001",
    "&SORTSPEC=0",
    "&FRMT=0",
    "&DAYSBACK=#{days_back}",
    "&NUM_PER_PAGE=#{entries}"
  ].join
end
34
+
35
# Fetch the page at +url+ and write its body to the file at +filepath+.
#
# The page is downloaded before the output file is opened, so a failed
# request leaves any existing file at +filepath+ untouched. Errors raised by
# the request or by the file write propagate to the caller.
#
# @param filepath [String] the path to the output file
# @param url [String] the url to use for the request
# @return [Integer] the number of bytes written
def write_query(filepath, url)
  headers = {
    "User-Agent" => "Ruby/#{RUBY_VERSION}",
    "From" => "email@addr.com",
    "Referer" => "http://seeker.dice.com/jobsearch/"
  }

  # Kernel#open no longer fetches URLs on Ruby 3.0+, so use URI.open where
  # open-uri provides it and fall back to Kernel#open on older rubies.
  # The block form closes the remote stream once the body has been read.
  response =
    if URI.respond_to?(:open)
      URI.open(url, headers) { |page| page.read }
    else
      open(url, headers) { |page| page.read }
    end

  # The block form closes the file even when the write raises
  File.open(filepath, "w") { |file| file.write(response) }
end
60
+
61
# Run the search once per location and save each results page to its own
# file in the current working directory. Kept as ordered pairs so the
# requests are issued in the same order as before: San Jose, Raleigh, Austin.
[
  ["San+Jose+CA", './querySanJose.html'],
  ["Raleigh+NC",  './queryRaleigh.html'],
  ["Austin+TX",   './queryAustin.html']
].each do |where, filepath|
  write_query(filepath, get_url(where))
end
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'open-uri'
4
+ require 'hpricot'
5
+
6
# We want to be able to easily pull and update the set of listings
# from the Yahoo HotJobs website. This script formats the request for us
# to make things a little easier.

# How far back the search reaches (sent as updated_since).
DAYS_BACK = "month".freeze

# Search keywords (sent as kw). Both strings are frozen so that no code can
# modify the shared values in place.
SEARCH_KEYWORDS = "Software".freeze
11
+
12
# Percent-encode the characters that may not appear raw in a query-string
# value. '+' and '%' are passed through untouched so that a value the caller
# has already encoded (e.g. "San+Jose+CA") yields exactly the same URL as
# before.
#
# @param value [#to_s] the raw or pre-encoded query value
# @return [String] the value, safe to splice into a query string
def escape_query_value(value)
  value.to_s.gsub(/[^A-Za-z0-9*\-._+%]/) { |char| URI.encode_www_form_component(char) }
end

# Format the URL to retrieve the current listings from
# the Yahoo Hotjobs website
#
# http://hotjobs.yahoo.com
# @param where [String] the job location, plain ("San Jose, CA") or already
#   form-encoded ("San+Jose+CA")
# @param keywords [String] search keywords; defaults to SEARCH_KEYWORDS
# @param days_back [String] value sent as updated_since; defaults to DAYS_BACK
# @return [String] the url request for hotjobs.yahoo.com
def get_url(where, keywords = SEARCH_KEYWORDS, days_back = DAYS_BACK)
  [
    "http://hotjobs.yahoo.com/job-search?",
    # NOTE(review): ts looks like a fixed timestamp captured from a browser
    # session - confirm the site does not require a fresh value
    "src=advsearch&pageOp=search&ts=1259353986&",
    "kw_search_type=kwany&kw=#{escape_query_value(keywords)}&kw_none=&",
    # Only the first of the three locations fields is filled in
    "locations=#{escape_query_value(where)}&country=&locations=&locations=&",
    "industry=&industry=&industry=&",
    "updated_since=#{days_back}&",
    "exp_level=&experience_level=&",
    "education=&salary[min]=&salary[type]=yearly&",
    "commitment=FT&commitment=PT&jobtype=PERM&jobtype=CONT&",
    "travel_amount=&company=&",
    "source=&email_format=html&email_frequency=1&email_enabled=0&search_jobs=Search+Jobs"
  ].join
end
32
+
33
# Fetch the page at +url+ and write its body to the file at +filepath+.
#
# The page is downloaded before the output file is opened, so a failed
# request leaves any existing file at +filepath+ untouched. Errors raised by
# the request or by the file write propagate to the caller.
#
# @param filepath [String] the path to the output file
# @param url [String] the url to use for the request
# @return [Integer] the number of bytes written
def write_query(filepath, url)
  headers = {
    "User-Agent" => "Ruby/#{RUBY_VERSION}",
    "From" => "email@addr.com",
    "Referer" => "http://hotjobs.yahoo.com/"
  }

  # Kernel#open no longer fetches URLs on Ruby 3.0+, so use URI.open where
  # open-uri provides it and fall back to Kernel#open on older rubies.
  # The block form closes the remote stream once the body has been read.
  response =
    if URI.respond_to?(:open)
      URI.open(url, headers) { |page| page.read }
    else
      open(url, headers) { |page| page.read }
    end

  # The block form closes the file even when the write raises
  File.open(filepath, "w") { |file| file.write(response) }
end
58
+
59
# Run the search once per location and save each results page to its own
# file in the current working directory. Kept as ordered pairs so the
# requests are issued in the same order as before: San Jose, Raleigh, Austin.
[
  ["San+Jose+CA", './querySanJose.html'],
  ["Raleigh+NC",  './queryRaleigh.html'],
  ["Austin+TX",   './queryAustin.html']
].each do |where, filepath|
  write_query(filepath, get_url(where))
end
@@ -0,0 +1,8 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ # Test Case helper file
5
+
6
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+ require 'scrapin-a-livin'
@@ -0,0 +1,168 @@
1
+ require 'test_helper'
2
+ require 'ftools'
3
+
4
# Test case for the Yahoo HotJobs search-results parser.
#
# Every test reads a saved results page from the queries/hotjobs directory
# next to this file, hands it to HotjobsSearch.parse_listings and compares
# the first five listings, field by field, with the expected values below.
class TC_HotjobsSearchParser < Test::Unit::TestCase

  RALEIGH_QUERY = File.dirname(__FILE__) + '/queries/hotjobs/queryRaleigh.html'
  AUSTIN_QUERY = File.dirname(__FILE__) + '/queries/hotjobs/queryAustin.html'
  SANJOSE_QUERY = File.dirname(__FILE__) + '/queries/hotjobs/querySanJose.html'

  # Listing attributes checked by every test, in the order the expected
  # values are written in each row of the tables below.
  LISTING_FIELDS = [:title, :link, :company, :company_link, :location, :date]

  # Listings parsed from the saved Raleigh results page
  def test_raleigh
    assert_listings(RALEIGH_QUERY, [
      ["Embedded Software Engineer",
       "http://hotjobs.yahoo.com/job-JNEJ91O9CZA?source=SRP",
       "Sirit Technology",
       "http://hotjobs.yahoo.com/careers-786030-Sirit_Technology",
       "Morrisville, NC",
       "Oct 28"],
      ["MES Java Software Engineer",
       "http://hotjobs.yahoo.com/job-JDZWR4DRM00?source=SRP",
       "Werum Software & Systems America",
       "http://hotjobs.yahoo.com/careers-616438-Werum_Software___Systems_America",
       "Cary, NC",
       "Sep 29"],
      ["Software Developer",
       "http://hotjobs.yahoo.com/job-JQGSVB1Y99F?source=SRP",
       "CTG",
       "http://hotjobs.yahoo.com/careers-601569-CTG",
       "Durham, NC",
       "Oct 30"],
      ["Software Engineer - Windows Installer Developer, MSI, WiX, WiM",
       "http://hotjobs.yahoo.com/job-J9WH2JP6WO7?source=SRP",
       "CyberCoders",
       "http://hotjobs.yahoo.com/careers-577525-CyberCoders",
       "Raleigh, NC",
       "Nov 27"],
      ["C# Developer, Software Developer, SAAS, ASP.Net, SQL Server, SQL, .Net, WCF, JavaScript",
       "http://hotjobs.yahoo.com/job-J5PKCPM1QM2?source=SRP",
       "CyberCoders",
       "http://hotjobs.yahoo.com/careers-577525-CyberCoders",
       "Raleigh, NC",
       "Nov 27"]
    ])
  end

  # Listings parsed from the saved Austin results page
  def test_austin
    assert_listings(AUSTIN_QUERY, [
      ["Software Program Manager",
       "http://hotjobs.yahoo.com/job-JSFNSIFFNCA?source=SRP",
       "Zebra Imaging",
       "http://hotjobs.yahoo.com/careers-627739-Zebra_Imaging",
       "Austin, TX",
       "Nov 19"],
      ["Client Project Manager - Software",
       "http://hotjobs.yahoo.com/job-JM1F32JSA7R?source=SRP",
       "Digital Cheetah Solutions, Inc.",
       "http://hotjobs.yahoo.com/careers-577804-Digital_Cheetah_Solutions__Inc_",
       "Austin, TX",
       "Nov 08"],
      ["Software Quality Assurance Lead",
       "http://hotjobs.yahoo.com/job-JL5SOECPQKF?source=SRP",
       "Zebra Imaging",
       "http://hotjobs.yahoo.com/careers-627739-Zebra_Imaging",
       "Austin, TX",
       "Oct 06"],
      ["Operating System Software Test Specialist",
       "http://hotjobs.yahoo.com/job-J3KBF8EADAO?source=SRP",
       "CTG",
       "http://hotjobs.yahoo.com/careers-601569-CTG",
       "Austin, TX",
       "Nov 12"],
      ["Software Engineer",
       "http://hotjobs.yahoo.com/job-JTXD1K5GAZ2?source=SRP",
       "Troux Technologies",
       "http://hotjobs.yahoo.com/careers-788124-Troux_Technologies",
       "Austin, TX",
       "Nov 05"]
    ])
  end

  # Listings parsed from the saved San Jose results page
  def test_sanjose
    assert_listings(SANJOSE_QUERY, [
      ["Software Engineer/Carrier CE",
       "http://hotjobs.yahoo.com/job-J2VW8Y7GMAH?source=SRP",
       "Atheros Communications, Inc.",
       "http://hotjobs.yahoo.com/careers-552561-Atheros_Communications__Inc_",
       "Santa Clara, CA",
       "Nov 12"],
      ["EMBEDDED SOFTWARE ENGINEER",
       "http://hotjobs.yahoo.com/job-JD285WTGJLP?source=SRP",
       "NVIDIA Corporation",
       "http://hotjobs.yahoo.com/careers-601859-NVIDIA_Corporation",
       "Santa Clara, CA",
       "Oct 23"],
      ["EMBEDDED SOFTWARE ENGINEER",
       "http://hotjobs.yahoo.com/job-J9LJP1J38EB?source=SRP",
       "NVIDIA Corporation",
       "http://hotjobs.yahoo.com/careers-601859-NVIDIA_Corporation",
       "Santa Clara, CA",
       "Oct 23"],
      ["Platform Bringup Software Engineer",
       "http://hotjobs.yahoo.com/job-J6YYFVVFK7P?source=SRP",
       "Aruba Networks",
       "http://hotjobs.yahoo.com/careers-562030-Aruba_Networks",
       "Sunnyvale, CA",
       "Nov 14"],
      ["Staff Software Development Engineer",
       "http://hotjobs.yahoo.com/job-JDG4ZD5RP59?source=SRP",
       "IDT",
       "http://hotjobs.yahoo.com/careers-577851-IDT",
       "San Jose, CA",
       "Nov 19"]
    ])
  end

  private

  # Parse the saved query at +path+ and compare its leading listings with
  # +expected+, one row per listing in page order.
  #
  # @param path [String] the path to the saved query page
  # @param expected [Array<Array<String>>] expected values per listing,
  #   ordered as in LISTING_FIELDS
  def assert_listings(path, expected)
    listings = HotjobsSearch.parse_listings(check_query(path))

    expected.each_with_index do |values, index|
      LISTING_FIELDS.zip(values).each do |field, value|
        # The message pinpoints which listing and attribute differed
        assert_equal(value, listings[index].send(field), "listing #{index} #{field}")
      end
    end
  end

  # Make sure the query file exists, then read it
  #
  # @param path [String] the path to the input file
  # @return [String] the data from the input file
  # @raise [ArgumentError] when the file is missing or not readable
  def check_query(path)
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version
    unless File.exist?(path) && File.readable?(path)
      raise ArgumentError, "Invalid Query: \"#{path}\""
    end

    # File.read closes the handle as soon as the contents are loaded
    File.read(path)
  end

end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapin-a-livin
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kevin S Kirkup
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-28 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0.6"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: thoughtbot-shoulda
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description: |
36
+ Are you in the technology industry?
37
+ Then you have probably lost a job during your career.
38
+ This script helps you scrape the most common job sites for your search criteria,
39
+ and save the lists on your local computer.
40
+
41
+ You can then use other tools to display updates, send out resumes or search your LinkedIn account
42
+ to see who you may know at that company.
43
+
44
+ Please feel free to contribute. All suggestions are welcome.
45
+ Hopefully this will help you find the career you are looking for.
46
+
47
+ email: kevin.kirkup@gmail.com
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ extra_rdoc_files:
53
+ - LICENSE
54
+ - README.markdown
55
+ files:
56
+ - .document
57
+ - .gitignore
58
+ - LICENSE
59
+ - README.markdown
60
+ - Rakefile
61
+ - VERSION
62
+ - lib/dice/dice_search.rb
63
+ - lib/generic/listing.rb
64
+ - lib/scrapin-a-livin.rb
65
+ - lib/yahoo/hotjobs.rb
66
+ - test/dice_parser_test.rb
67
+ - test/queries/dice/queryAustin.html
68
+ - test/queries/dice/queryRaleigh.html
69
+ - test/queries/dice/querySanJose.html
70
+ - test/queries/hotjobs/queryAustin.html
71
+ - test/queries/hotjobs/queryRaleigh.html
72
+ - test/queries/hotjobs/querySanJose.html
73
+ - test/scripts/diceDump.rb
74
+ - test/scripts/hotjobsDump.rb
75
+ - test/test_helper.rb
76
+ - test/yahoo_parser_test.rb
77
+ has_rdoc: true
78
+ homepage: http://github.com/angrytuna/scrapin-a-livin
79
+ licenses: []
80
+
81
+ post_install_message:
82
+ rdoc_options:
83
+ - --charset=UTF-8
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: "0"
91
+ version:
92
+ required_rubygems_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: "0"
97
+ version:
98
+ requirements: []
99
+
100
+ rubyforge_project:
101
+ rubygems_version: 1.3.5
102
+ signing_key:
103
+ specification_version: 3
104
+ summary: Scrape to find a job
105
+ test_files:
106
+ - test/scripts/diceDump.rb
107
+ - test/scripts/hotjobsDump.rb
108
+ - test/yahoo_parser_test.rb
109
+ - test/dice_parser_test.rb
110
+ - test/test_helper.rb