tsjobcrawler 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/COPYING +674 -0
- data/Gemfile +7 -0
- data/README.md +28 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/clearancejobscom/clearance_jobs_com_crawler.rb +86 -0
- data/lib/clearancejobscom/clearance_jobs_com_parser.rb +137 -0
- data/lib/clearedjobsnet/cleared_jobs_net_crawler.rb +141 -0
- data/lib/clearedjobsnet/cleared_jobs_net_parser.rb +93 -0
- data/lib/clearedjobsnet/get_all_cleared_jobs.rb +77 -0
- data/lib/clearedjobsnet/terms/clearance_levels.json +15 -0
- data/lib/clearedjobsnet/terms/company_names.json +17 -0
- data/lib/clearedjobsnet/terms/country_names.json +202 -0
- data/lib/clearedjobsnet/terms/search_terms.json +27 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb +93 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb +115 -0
- data/lib/tsjobcrawler.rb +52 -0
- data/lib/util/failure_handler.rb +22 -0
- data/tsjobcrawler.gemspec +27 -0
- metadata +162 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
This is a crawler for job listings that require security clearance.
|
2
|
+
|
3
|
+
To run:
|
4
|
+
|
5
|
+
t = TSJobCrawler.new("search term" (or nil), request_manager, cm_hash or nil)
|
6
|
+
|
7
|
+
t.crawl_jobs
|
8
|
+
|
9
|
+
|
10
|
+
For example:
|
11
|
+
|
12
|
+
Headless.ly do
|
13
|
+
|
14
|
+
r = RequestManager.new(nil, [0, 0], 1)
|
15
|
+
|
16
|
+
t = TSJobCrawler.new(nil, r, nil)
|
17
|
+
|
18
|
+
t.crawl_jobs
|
19
|
+
|
20
|
+
File.write("test.json", t.gen_json)
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
If you input nil for the search term, it downloads as many job listings as
possible. Unless you have a lot of RAM, you should run it through Harvester
when downloading everything, since that lets you take advantage of
incremental result reporting.
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "tsjobcrawler"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'cgi'
|
5
|
+
require 'json'
|
6
|
+
require 'requestmanager'
|
7
|
+
require 'headless'
|
8
|
+
require 'harvesterreporter'
|
9
|
+
|
10
|
+
load 'clearancejobscom/clearance_jobs_com_parser.rb'
|
11
|
+
load 'util/failure_handler.rb'
|
12
|
+
|
13
|
+
# Crawls job listings on clearancejobs.com, optionally narrowed by a
# search term, reporting parsed listings through a HarvesterReporter.
class ClearanceJobsComCrawler
  include FailureHandler

  # search_term - keyword to search for, or nil to crawl every listing
  # requests    - RequestManager used to fetch pages (may be nil)
  # cm_hash     - crawler-manager settings passed to HarvesterReporter
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @base_url = set_base_url

    # Reporter handles crawler-manager output
    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Walk every results page, parsing the listings found on each.
  def crawl
    (1..get_page_count).each do |page_num|
      links = collect_links_on_page(get_next_page(page_num))
      parse_listings(links)
    end
  end

  # Base search URL; keyword-escaped when a search term was supplied.
  def set_base_url
    @base_url = if @search_term == nil
      "https://www.clearancejobs.com/jobs?"
    else
      "https://www.clearancejobs.com/jobs?keywords="+CGI.escape(@search_term)+"&zip_text="
    end
  end

  # URL of the numbered results page (25 results per page).
  def get_next_page_url(page_num)
    "#{@base_url}PAGE=#{page_num}&limit=25"
  end

  # Fetch a URL through the retrying helper.
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Number of result pages, computed from the "X of N" result counter.
  def get_page_count
    doc = Nokogiri::HTML.parse(get_next_page(1))
    total = doc.css("#viewing").text.split(" of ")[1].gsub(",", "").to_i
    (total/25.0).ceil
  end

  # HTML of the numbered results page.
  def get_next_page(page_num)
    get_page(get_next_page_url(page_num))
  end

  # All listing-detail links on a results page.
  def collect_links_on_page(page)
    doc = Nokogiri::HTML.parse(page)
    doc.css(".cj-search-result-item-title").css("a").map { |a| a['href'] }
  end

  # Parse every listing link, keeping successful parses, and report them.
  def parse_listings(listings)
    found = listings.each_with_object([]) do |listing, acc|
      parsed = ClearanceJobsComParser.new(listing, get_page(listing), @requests).parse
      acc << parsed if parsed
    end

    @reporter.report_results(found, listings.first)
  end

  # JSON for every listing collected so far.
  def gen_json
    @reporter.gen_json
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
load 'util/failure_handler.rb'
|
6
|
+
|
7
|
+
# Parses a single clearancejobs.com job listing page into a Hash of fields.
class ClearanceJobsComParser
  include FailureHandler

  # url      - listing URL (included in output and used for retries)
  # page     - raw HTML of the listing page
  # requests - RequestManager used when re-fetching on parse failure
  def initialize(url, page, requests=nil)
    @url = url
    @requests = requests
    @i = 0                               # retry counter for failed parses
    @html = page                         # raw HTML string, emitted as-is
    @page = Nokogiri::HTML.parse(page)   # parsed document the field methods read
  end

  # Parse the profile into a Hash of listing fields. On any parse error the
  # page is re-fetched and parsing retried up to 10 times; returns nil when
  # the retries are exhausted.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        required_travel: required_travel,
        salary: salary,
        salary_notes: salary_notes,
        job_category: job_category,
        group_id: group_id,
        # Bug fix: required_experience appeared twice in this hash; the
        # duplicate literal key has been removed.
        required_experience: required_experience,
        employment_status: employment_status,
        required_clearance: required_clearance,
        work_environment: work_environment,
        posting_date: posting_date,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        # Bug fix: the retried page was previously parsed straight into
        # @html, so @page (which all field methods read) was never
        # refreshed and the html: field became a Nokogiri document
        # instead of a string. Keep @html raw and re-parse into @page.
        @html = get_retry(@url, @requests, @i)
        @page = Nokogiri::HTML.parse(@html)
        parse
      end
    end
  end

  # Get the company name
  def company_name
    @page.css("h2").text
  end

  # Get the job location (digits stripped from the hiring-org header)
  def location
    raw_location = @page.css("div").select{|e| e['itemprop'] == "hiringOrganization"}[0].css("h3").text
    raw_location.gsub(/(\d)/, "").strip if raw_location
  end

  # Get the job title
  def job_title
    @page.css("h1").text
  end

  # Get the job description as HTML
  def job_description
    @page.css("div.margin-bottom-20").select{|e| e['itemprop'] == "description"}[0].to_html
  end

  # Get the job description with <br> variants converted to newlines and
  # the remaining HTML stripped
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Get if there is travel required
  def required_travel
    get_element_value("Travel:")
  end

  # Get the salary
  def salary
    get_element_value("Compensation:")
  end

  # Get notes about the salary
  def salary_notes
    salary_info = get_element_value("Compensation Comments:")
    # strip removes both ends; the previous lstrip.strip chain was redundant
    salary_info.strip if salary_info
  end

  # Get the job category
  def job_category
    get_element_value("Job Category:")
  end

  # Get the group ID
  def group_id
    get_element_value("Group ID")
  end

  # Get the # of years of experience required
  def required_experience
    get_element_value("Minimum Experience Required")
  end

  # Get the employment status for the position
  def employment_status
    get_element_value("Status:")
  end

  # Get the clearance level
  def required_clearance
    get_element_value("Minimum Clearance")
  end

  # Get the work environment
  def work_environment
    get_element_value("Workplace:")
  end

  # Get the date of the posting from its <meta> content attribute
  def posting_date
    element = get_element("Post Date:")
    DateTime.parse(element[0].css("meta")[0]['content'])
  end

  # Value (the <strong> text) of the first job-data element containing phrase
  def get_element_value(phrase)
    element = get_element(phrase)[0]
    element.css("strong").text if element
  end

  # All job-data elements whose text includes the given phrase
  def get_element(phrase)
    @page.css(".cj-job-data").select{|d| d.text.include?(phrase) }
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'cgi'
|
4
|
+
require 'json'
|
5
|
+
require 'requestmanager'
|
6
|
+
require 'harvesterreporter'
|
7
|
+
|
8
|
+
load 'clearedjobsnet/cleared_jobs_net_parser.rb'
|
9
|
+
load 'util/failure_handler.rb'
|
10
|
+
|
11
|
+
# Crawls job listings on clearedjobs.net. Supports four crawl modes:
# "all" (every listing), a keyword search, a filtered search, and a
# single company's listing page.
class ClearedJobsNetCrawler
  include FailureHandler

  # crawl_type  - "all" for a full crawl, otherwise the search term /
  #               company id (its meaning depends on filter_name)
  # filter_name - nil (plain search), "company_page", or a filter field name
  # requests    - RequestManager used to fetch pages (may be nil)
  # cm_hash     - crawler-manager settings passed to HarvesterReporter
  def initialize(crawl_type, filter_name=nil, requests=nil, cm_hash=nil)
    @base_url = "https://clearedjobs.net/"
    @output = Array.new
    @requests = requests

    # Handle crawler manager info
    @reporter = HarvesterReporter.new(cm_hash)

    # Get all items
    if crawl_type == "all"
      @crawl_type = crawl_type

    # Get a company page
    elsif filter_name == "company_page"
      @crawl_type = "company_page"
      @search_term = crawl_type

    # Add a filter
    elsif filter_name
      @crawl_type = "filter"
      @filter = filter_name
      @search_term = crawl_type

    # Query search
    else
      @crawl_type = "search"
      @search_term = crawl_type
    end
  end

  # Crawls the listings for the query: walks every results page and
  # collects/parses the links found on each.
  def crawl_listings
    pages_to_crawl = get_num_pages_per_query.to_i
    base_url = get_base_query_url

    # Loop through pages and collect links for each
    (1..pages_to_crawl).each do |page_num|
      next_page_html = goto_next_page(base_url, page_num)
      collect_page_links(next_page_html)
    end
  end

  # Extract the URL and posting date from each result row, parse every
  # listing found, and return the {url:, posting_date:} row hashes.
  def collect_page_links(page_html)
    html = Nokogiri::HTML.parse(page_html)
    result_rows = html.css("table.search_res").css("tbody").css("tr")

    # Parse URL and date from each row
    parsed_result_rows = result_rows.map do |r|
      link = (@base_url+r.css("a")[0]['href']).split("/keywords")[0]
      date = Date.parse(r.css("div").select{|e| e.text.include?("Posted - ")}.first.text.gsub("Posted - ", ""))
      {url: link, posting_date: date}
    end

    parse_all_listings(parsed_result_rows)
    return parsed_result_rows
  end

  # Parse all of the listings and report the batch to the reporter.
  def parse_all_listings(listing_links)
    found_listings = Array.new
    listing_links.each do |listing|
      parser = ClearedJobsNetParser.new(get_page(listing[:url]), listing, @requests)
      found_listings.push(parser.parse_job)
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Number of result pages for the query, read from the pager on the
  # last page; "1" when there is no pager at all.
  def get_num_pages_per_query
    # Go to the last page
    html = Nokogiri::HTML.parse(run_initial_query)
    if !html.css("div.navbar_bottom").css("a").empty?
      last_page_link = @base_url+html.css("div.navbar_bottom").css("a").last['href']
      last_page_html = Nokogiri::HTML.parse(get_page(last_page_link.gsub("//", "/")))

      # Parse the page numbers in last page
      return last_page_html.css("div.navbar_bottom").css("strong").text
    else # Just one page of results
      return "1"
    end
  end

  # Fetch the numbered results page (25 listings per page).
  def goto_next_page(base_query_url, num)
    start_index = (num-1)*25

    # Set the URL for the next page appropriately
    if start_index == 0 || num == 0
      next_page_url = base_query_url
    else
      next_page_url = base_query_url+"/start/"+start_index.to_s
    end

    return get_page(next_page_url)
  end

  # Fetch a page through the retrying helper.
  # Bug fix: the retry index i was previously accepted but ignored
  # (get_retry was always called with 0); it is now forwarded. The
  # default of 0 preserves behavior for all existing callers.
  def get_page(url, i=0)
    get_retry(url, @requests, i)
  end

  # Get the initial results page
  def run_initial_query
    url = get_base_query_url
    return get_page(url)
  end

  # Get the base query url depending on the crawl type
  def get_base_query_url
    if @crawl_type == "all"
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/+/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS/sort/time"
    elsif @crawl_type == "search"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/"+encoded_term+"/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS"
    elsif @crawl_type == "filter"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/keywords/+/"+@filter+"[]/"+encoded_term+"/zip/+/zip_radius/20"
    elsif @crawl_type == "company_page"
      return @base_url+"view-employer/employer_id_seo/"+@search_term
    end
  end

  # Return JSON for all results collected so far
  def gen_json
    return @reporter.gen_json
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'pry'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
load 'util/failure_handler.rb'
|
6
|
+
|
7
|
+
# Parses a single clearedjobs.net job listing page into a Hash of fields.
class ClearedJobsNetParser
  include FailureHandler

  # html         - raw HTML of the listing page
  # details_hash - {url:, posting_date:} collected from the results page
  # requests     - RequestManager used when re-fetching on parse failure
  def initialize(html, details_hash, requests=nil)
    @html = Nokogiri::HTML.parse(html)
    @requests = requests
    @url = details_hash[:url]
    @i = 0   # retry counter for failed parses
    @posting_date = details_hash[:posting_date]
  end

  # Parses the job into a Hash of listing fields. On any error the page
  # is re-fetched and parsing retried up to 10 times; returns nil once
  # the retries are exhausted.
  def parse_job
    {
      url: @url,
      html: @html.to_html,
      posting_date: @posting_date,
      company_name: company_name,
      company_listing_link: company_listing_link,
      required_clearance: required_clearance,
      location: location,
      country: country,
      salary: salary,
      job_number: job_number,
      job_title: job_title,
      job_description: job_description,
      job_description_plaintext: job_description_plaintext
    }
  rescue
    @i += 1
    if @i < 10
      @html = Nokogiri::HTML.parse(get_retry(@url, @requests, @i))
      parse_job
    end
  end

  # Company name (first details row, left cell)
  def company_name
    detail_cells(0, ".left2").text
  end

  # Link to the company's listings page
  def company_listing_link
    detail_cells(0, ".left2").css("a")[0]['href']
  end

  # Clearance level required for the position
  def required_clearance
    detail_cells(0, ".clearAll")[1].text
  end

  # Work location (second details row, left cell)
  def location
    detail_cells(1, ".left2").text
  end

  # Country of work (second details row, right cell)
  def country
    detail_cells(1, ".right2").text
  end

  # Salary, whitespace-trimmed (third details row, left cell)
  def salary
    detail_cells(2, ".left2").text.strip
  end

  # Job number, whitespace-trimmed (third details row, right cell)
  def job_number
    detail_cells(2, ".right2").text.strip
  end

  # Job title
  def job_title
    @html.css("#view_employer").text
  end

  # Job description as HTML
  def job_description
    @html.css(".view-job-right").to_html
  end

  # Job description with <br> variants turned into newlines and the
  # remaining HTML stripped
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Cells matching selector inside the nth row of the job details table
  def detail_cells(row_index, selector)
    @html.css("div.view_job_table").css("div.row")[row_index].css(selector)
  end
end
|