scrapin-a-livin 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +27 -8
- data/VERSION +1 -1
- data/lib/dice/dice.rb +122 -0
- data/lib/scrapin-a-livin.rb +1 -38
- metadata +2 -1
data/README.markdown
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
# scrapin-a-livin
|
2
2
|
|
3
3
|
This script helps you screen scrape the most common job sites and save the lists on your local computer.
|
4
|
-
|
5
|
-
You can then use other tools to display updates, send out resumes or search your LinkedIn account
|
6
|
-
to see who you may know at that company.
|
7
|
-
|
4
|
+
You can then use other tools to display updates, send out resumes or search your LinkedIn account to see who you may know at that company.
|
8
5
|
Please feel free to contribute. All suggestions are welcome.
|
9
6
|
Hopefully this will help you find the career you are looking for.
|
10
7
|
|
@@ -13,12 +10,34 @@ http://www.igvita.com/2007/02/04/ruby-screen-scraper-in-60-seconds/
|
|
13
10
|
|
14
11
|
## Quick links
|
15
12
|
|
16
|
-
*[Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
|
17
|
-
*[Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
|
13
|
+
* [Wiki](http://wiki.github.com/angrytuna/scrapin-a-livin)
|
14
|
+
* [Bugs](http://github.com/angrytuna/scrapin-a-livin/issues)
|
15
|
+
|
16
|
+
## Installing
|
17
|
+
|
18
|
+
# Install the gem
|
19
|
+
$ sudo gem install scrapin-a-livin
|
20
|
+
|
21
|
+
## Using
|
22
|
+
|
23
|
+
To use it, require the scrapin-a-livin library and then call the static methods for the site you want data from.
|
24
|
+
|
25
|
+
require 'scrapin-a-livin'
|
26
|
+
|
27
|
+
# Create a url for the request
|
28
|
+
url = HotjobsSearch.query("Raleigh+NC", "Software")
|
29
|
+
|
30
|
+
# Use the query to retrieve the job listings
|
31
|
+
listings = HotjobsSearch.get_listings(url)
|
18
32
|
|
19
|
-
|
33
|
+
# Now you can use the job listing information
|
34
|
+
listings.each { |job|
|
35
|
+
puts job.title
|
36
|
+
puts job.company
|
37
|
+
}
|
20
38
|
|
21
|
-
|
39
|
+
The current support for creating the html query for both Dice and Yahoo Hotjobs is quite limited.
|
40
|
+
If you would like to have a more advanced search, you can go to either site, create the search using the advanced search editor, and then copy the url from the resulting query. Then just substitute this string in the code above to retrieve the listings.
|
22
41
|
|
23
42
|
## Note on Patches/Pull Requests
|
24
43
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/lib/dice/dice.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
# Class to screen scrape the dice website
|
7
|
+
#
|
8
|
+
# http://seeker.dice.com
|
9
|
+
class DiceSearch

  # Constants

  # When true, parse_listings prints every extracted field to stdout
  DEBUG = false

  # Index of the cell, within a listing row, that holds each field
  TITLE_CELL = 2
  COMPANY_CELL = 3
  LOCATION_CELL = 4
  DATE_CELL = 5

  # Only rows with exactly this many cells are treated as job listings
  CELL_COUNT = 6

  # Prefix prepended to the href values read from the results table
  DICE_LINK = "http://seeker.dice.com"

  # Referer header sent with every request made by this class
  DICE_REFERER = "http://seeker.dice.com/jobsearch/"

  # Parse the provided query data
  #
  # Looks up the rows of every table with class "summary" and builds one
  # JobListing for each row that has exactly CELL_COUNT cells.
  #
  # @param query [String, #read] the html web page data
  # @return [Array<JobListing>] the job listings found, empty when none match
  def self.parse_listings(query)
    listings = []

    # Filter the data with Hpricot and get the rows of the summary table
    doc = Hpricot(query)
    table = (doc/"//table[@class=summary]")
    rows = (table/"tr")

    rows.each do |row|
      # Get the individual cells
      cells = (row/"td")

      # Rows with any other cell count are not job listings
      next unless cells.size == CELL_COUNT

      # The title and company cells each wrap their text in a link
      title_anchor = (cells[TITLE_CELL]/"a")
      company_anchor = (cells[COMPANY_CELL]/"a")

      # Get the fields; the hrefs are prefixed with DICE_LINK
      name = title_anchor.inner_html
      link = DICE_LINK + title_anchor.attr("href")
      company = company_anchor.inner_html
      company_link = DICE_LINK + company_anchor.attr("href")
      location = cells[LOCATION_CELL].inner_html
      date = cells[DATE_CELL].inner_html

      if DEBUG
        puts "Row: count #{cells.size}"
        puts "Name: #{name}"
        puts "Link: #{link}"
        puts "Company: #{company}"
        puts "Company Link: #{company_link}"
        puts "Location: #{location}"
        puts "Date: #{date}"
      end

      # Create the job listing
      listings << JobListing.new(name, link, company, company_link, location, date)
    end

    listings
  end

  # Retrieve the job listings
  #
  # Reads the page at the given url and hands the body to parse_listings.
  #
  # @param url [String, #read] the url used to query the data
  # @return [Array<JobListing>] the job listings parsed from the response
  def self.get_listings(url)
    # Read the data from the url
    response = open(url, request_headers).read

    # Parse the listings from the query
    parse_listings(response)
  end

  # Query dice for html code for the query
  #
  # Builds the Dice search url from the arguments, requests it and returns
  # the response body. The arguments are interpolated into the query string
  # verbatim (no escaping is applied), so pass values that are already
  # url-encoded, e.g. "Raleigh+NC".
  #
  # @param location [#to_s] the location to search
  # @param keywords [#to_s] keywords to use for the search
  # @param days_back [#to_s] how long ago to search
  # @param num_entries [#to_s] the number of entries to request
  # @return [String] the body read from the search url
  def self.query(location, keywords, days_back, num_entries)
    # The search URL
    url = "http://seeker.dice.com/jobsearch/servlet/JobSearch" +
      "?caller=0&LOCATION_OPTION=2&EXTRA_STUFF=1&N=0&Hf=0" +
      "&Ntk=JobSearchRanking&op=300" +
      "&values=&FREE_TEXT=#{keywords}" +
      "&Ntx=mode+matchall&WHERE=#{location}" +
      "&WHEREList=#{location}" +
      "&RADIUS=80.4672" +
      "&COUNTRY=1525&STAT_PROV=0&METRO_AREA=33.78715899%2C-84.39164034&AREA_CODES=&AC_COUNTRY=1525" +
      "&TRAVEL=0&TAXTERM=1001" +
      "&SORTSPEC=0" +
      "&FRMT=0" +
      "&DAYSBACK=#{days_back}" +
      "&NUM_PER_PAGE=#{num_entries}"

    # Read the data from the url that was just built (the local variable,
    # not an instance variable, which is never assigned in this class)
    open(url, request_headers).read
  end

  # Headers sent with every request made by this class
  #
  # NOTE(review): the requests rely on open-uri's Kernel#open handling of
  # urls; newer Ruby versions may require URI.open instead - confirm against
  # the Ruby versions this gem supports.
  #
  # @return [Hash{String => String}] the request headers
  def self.request_headers
    {
      "User-Agent" => "Ruby/#{RUBY_VERSION}",
      "From" => "email@addr.com",
      "Referer" => DICE_REFERER
    }
  end
  private_class_method :request_headers

end
|
data/lib/scrapin-a-livin.rb
CHANGED
@@ -1,43 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'generic/listing.rb'
|
3
|
-
require 'dice/
|
3
|
+
require 'dice/dice.rb'
|
4
4
|
require 'yahoo/hotjobs.rb'
|
5
5
|
|
6
6
|
# Helper file to include the available libraries
|
7
|
-
|
8
|
-
# Main entry
|
9
|
-
if $0 == (__FILE__)
|
10
|
-
|
11
|
-
# # Check the arguments
|
12
|
-
# case ARGV.shift
|
13
|
-
#
|
14
|
-
# # We want to create an async interface
|
15
|
-
# when /-async/
|
16
|
-
# $async = true
|
17
|
-
#
|
18
|
-
# # We want to create an extension interface
|
19
|
-
# when /-extension/
|
20
|
-
# $extension = true
|
21
|
-
# end
|
22
|
-
#
|
23
|
-
# # Go through the remaining command line arguments
|
24
|
-
# ARGV.each do |file|
|
25
|
-
#
|
26
|
-
# # Check to see if an async interface file was requested
|
27
|
-
# if $async
|
28
|
-
#
|
29
|
-
# AsyncInterface.print_out(file)
|
30
|
-
#
|
31
|
-
# # Check if this is an extension file
|
32
|
-
# elsif $extension
|
33
|
-
# # Parse the extension file
|
34
|
-
# Extension.new(file)
|
35
|
-
#
|
36
|
-
# # Print out the data
|
37
|
-
# ExtenionFile.print_out(file)
|
38
|
-
# end
|
39
|
-
#
|
40
|
-
# end
|
41
|
-
|
42
|
-
|
43
|
-
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapin-a-livin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S Kirkup
|
@@ -59,6 +59,7 @@ files:
|
|
59
59
|
- README.markdown
|
60
60
|
- Rakefile
|
61
61
|
- VERSION
|
62
|
+
- lib/dice/dice.rb
|
62
63
|
- lib/dice/dice_search.rb
|
63
64
|
- lib/generic/listing.rb
|
64
65
|
- lib/scrapin-a-livin.rb
|