tsjobcrawler 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/COPYING +674 -0
- data/Gemfile +7 -0
- data/README.md +28 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/clearancejobscom/clearance_jobs_com_crawler.rb +86 -0
- data/lib/clearancejobscom/clearance_jobs_com_parser.rb +137 -0
- data/lib/clearedjobsnet/cleared_jobs_net_crawler.rb +141 -0
- data/lib/clearedjobsnet/cleared_jobs_net_parser.rb +93 -0
- data/lib/clearedjobsnet/get_all_cleared_jobs.rb +77 -0
- data/lib/clearedjobsnet/terms/clearance_levels.json +15 -0
- data/lib/clearedjobsnet/terms/company_names.json +17 -0
- data/lib/clearedjobsnet/terms/country_names.json +202 -0
- data/lib/clearedjobsnet/terms/search_terms.json +27 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb +93 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb +115 -0
- data/lib/tsjobcrawler.rb +52 -0
- data/lib/util/failure_handler.rb +22 -0
- data/tsjobcrawler.gemspec +27 -0
- metadata +162 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
This is a crawler for job listings that require security clearance.
|
2
|
+
|
3
|
+
To run-
|
4
|
+
|
5
|
+
t = TSJobCrawler.new("search term" (or nil), request_manager, cm_hash or nil)
|
6
|
+
|
7
|
+
t.crawl_jobs
|
8
|
+
|
9
|
+
|
10
|
+
For example-
|
11
|
+
|
12
|
+
Headless.ly do
|
13
|
+
|
14
|
+
r = RequestManager.new(nil, [0, 0], 1)
|
15
|
+
|
16
|
+
t = TSJobCrawler.new(nil, r, nil)
|
17
|
+
|
18
|
+
t.crawl_jobs
|
19
|
+
|
20
|
+
File.write("test.json", t.gen_json)
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
If you pass nil as the search term, the crawler downloads as many job listings
as possible. Unless you have a lot of RAM, you should run it through Harvester
when downloading everything, so that you can take advantage of incremental
result reporting.
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "tsjobcrawler"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'cgi'
|
5
|
+
require 'json'
|
6
|
+
require 'requestmanager'
|
7
|
+
require 'headless'
|
8
|
+
require 'harvesterreporter'
|
9
|
+
|
10
|
+
load 'clearancejobscom/clearance_jobs_com_parser.rb'
|
11
|
+
load 'util/failure_handler.rb'
|
12
|
+
|
13
|
+
# Crawls ClearanceJobs.com job listings, either for a keyword search or
# for the full listing set when no search term is given.
class ClearanceJobsComCrawler
  include FailureHandler

  # search_term - keyword query String, or nil to crawl all listings
  # requests    - optional request manager used to fetch pages
  # cm_hash     - optional crawler-manager hash passed to the reporter
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @base_url = set_base_url

    # Handle crawler manager info
    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Walk every result page, collecting and parsing each listing.
  def crawl
    1.upto(get_page_count) do |page_num|
      page = get_next_page(page_num)
      parse_listings(collect_links_on_page(page))
    end
  end

  # Build (and memoize in @base_url) the base search URL.
  def set_base_url
    @base_url =
      if @search_term.nil?
        "https://www.clearancejobs.com/jobs?"
      else
        "https://www.clearancejobs.com/jobs?keywords=" + CGI.escape(@search_term) + "&zip_text="
      end
  end

  # URL for the given results page (25 listings per page).
  def get_next_page_url(page_num)
    "#{@base_url}PAGE=#{page_num}&limit=25"
  end

  # Fetch a URL, retrying on failure.
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Total number of result pages, derived from the "x of y" counter
  # shown on the first results page.
  def get_page_count
    first_page = Nokogiri::HTML.parse(get_next_page(1))
    total = first_page.css("#viewing").text.split(" of ")[1].gsub(",", "").to_i
    (total / 25.0).ceil
  end

  # Fetch the HTML for the given page number.
  def get_next_page(page_num)
    get_page(get_next_page_url(page_num))
  end

  # Extract all listing links from a results page.
  def collect_links_on_page(page)
    doc = Nokogiri::HTML.parse(page)
    doc.css(".cj-search-result-item-title").css("a").map { |anchor| anchor['href'] }
  end

  # Parse each listing on the page and report the successful results.
  def parse_listings(listings)
    results = []
    listings.each do |link|
      parsed = ClearanceJobsComParser.new(link, get_page(link), @requests).parse
      results << parsed if parsed
    end

    @reporter.report_results(results, listings.first)
  end

  # JSON for all results collected so far.
  def gen_json
    @reporter.gen_json
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
load 'util/failure_handler.rb'
|
6
|
+
|
7
|
+
# Parses a single ClearanceJobs.com job listing page into a Hash of fields.
class ClearanceJobsComParser
  include FailureHandler

  # url      - String URL of the listing (echoed into the parsed output)
  # page     - raw HTML String of the listing page
  # requests - optional request manager used when retrying failed fetches
  def initialize(url, page, requests=nil)
    @url = url
    @requests = requests
    @i = 0          # retry counter for failed parses
    @html = page    # raw HTML string, preserved under the :html key
    @page = Nokogiri::HTML.parse(page)
  end

  # Parse the listing. On any error the page is re-fetched and parsing is
  # retried, up to 10 attempts; returns nil if every attempt fails.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        required_travel: required_travel,
        salary: salary,
        salary_notes: salary_notes,
        job_category: job_category,
        group_id: group_id,
        required_experience: required_experience,
        employment_status: employment_status,
        required_clearance: required_clearance,
        work_environment: work_environment,
        posting_date: posting_date,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        # Refresh BOTH the raw HTML and the parsed document before
        # retrying. Previously only @html was overwritten -- and with a
        # parsed Nokogiri document rather than the raw string -- so the
        # retry re-parsed the stale @page and corrupted the :html field.
        @html = get_retry(@url, @requests, @i)
        @page = Nokogiri::HTML.parse(@html)
        parse
      end
    end
  end

  # Get the company name
  def company_name
    @page.css("h2").text
  end

  # Get the job location (digits stripped from the raw header text)
  def location
    raw_location = @page.css("div").select{|e| e['itemprop'] == "hiringOrganization"}[0].css("h3").text
    raw_location.gsub(/(\d)/, "").strip if raw_location
  end

  # Get the job title
  def job_title
    @page.css("h1").text
  end

  # Get the job description as HTML
  def job_description
    @page.css("div.margin-bottom-20").select{|e| e['itemprop'] == "description"}[0].to_html
  end

  # Get the job description as plain text (<br> variants become newlines)
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Get if there is travel required
  def required_travel
    get_element_value("Travel:")
  end

  # Get the salary
  def salary
    get_element_value("Compensation:")
  end

  # Get notes about the salary
  def salary_notes
    salary_info = get_element_value("Compensation Comments:")
    salary_info.lstrip.strip if salary_info
  end

  # Get the job category
  def job_category
    get_element_value("Job Category:")
  end

  # Get the group ID
  def group_id
    get_element_value("Group ID")
  end

  # Get the # of years of experience required
  def required_experience
    get_element_value("Minimum Experience Required")
  end

  # Get the employment status for the position
  def employment_status
    get_element_value("Status:")
  end

  # Get the clearance level
  def required_clearance
    get_element_value("Minimum Clearance")
  end

  # Get the work environment
  def work_environment
    get_element_value("Workplace:")
  end

  # Get the date of the posting from the embedded <meta> content
  def posting_date
    element = get_element("Post Date:")
    DateTime.parse(element[0].css("meta")[0]['content'])
  end

  # Get the <strong> value for the data element containing the phrase
  def get_element_value(phrase)
    element = get_element(phrase)[0]
    element.css("strong").text if element
  end

  # Get the data elements whose text includes the phrase specified
  def get_element(phrase)
    @page.css(".cj-job-data").select{|d| d.text.include?(phrase) }
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'cgi'
|
4
|
+
require 'json'
|
5
|
+
require 'requestmanager'
|
6
|
+
require 'harvesterreporter'
|
7
|
+
|
8
|
+
load 'clearedjobsnet/cleared_jobs_net_parser.rb'
|
9
|
+
load 'util/failure_handler.rb'
|
10
|
+
|
11
|
+
# Crawls ClearedJobs.net listings. Supports four modes:
#   "all"        - every listing on the site
#   company page - all listings for one employer
#   filter       - listings matching a named filter field
#   search       - keyword search
class ClearedJobsNetCrawler
  include FailureHandler

  # crawl_type  - "all", or the search term / company id / filter value
  # filter_name - nil, "company_page", or the filter field name
  # requests    - optional request manager used to fetch pages
  # cm_hash     - optional crawler-manager hash passed to the reporter
  def initialize(crawl_type, filter_name=nil, requests=nil, cm_hash=nil)
    @base_url = "https://clearedjobs.net/"
    @output = Array.new
    @requests = requests

    # Handle crawler manager info
    @reporter = HarvesterReporter.new(cm_hash)

    # Get all items
    if crawl_type == "all"
      @crawl_type = crawl_type

    # Get a company page
    elsif filter_name == "company_page"
      @crawl_type = "company_page"
      @search_term = crawl_type

    # Add a filter
    elsif filter_name
      @crawl_type = "filter"
      @filter = filter_name
      @search_term = crawl_type

    # Query search
    else
      @crawl_type = "search"
      @search_term = crawl_type
    end
  end

  # Crawls the listings for the query
  def crawl_listings
    pages_to_crawl = get_num_pages_per_query.to_i
    base_url = get_base_query_url

    # Loop through pages and collect links for each
    (1..pages_to_crawl).each do |page_num|
      next_page_html = goto_next_page(base_url, page_num)
      collect_page_links(next_page_html)
    end
  end

  # Collect all the result links on the page, parse each listing, and
  # return the [{url:, posting_date:}, ...] rows found.
  def collect_page_links(page_html)
    html = Nokogiri::HTML.parse(page_html)
    result_rows = html.css("table.search_res").css("tbody").css("tr")

    # Parse URL and date from each row
    parsed_result_rows = result_rows.map do |r|
      link = (@base_url+r.css("a")[0]['href']).split("/keywords")[0]
      date = Date.parse(r.css("div").select{|e| e.text.include?("Posted - ")}.first.text.gsub("Posted - ", ""))
      {url: link, posting_date: date}
    end

    parse_all_listings(parsed_result_rows)
    return parsed_result_rows
  end

  # Parse all of the listings and report the results
  def parse_all_listings(listing_links)
    found_listings = Array.new
    listing_links.each do |listing|
      parser = ClearedJobsNetParser.new(get_page(listing[:url]), listing, @requests)
      found_listings.push(parser.parse_job)
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Gets the number of pages for the query (as a String)
  def get_num_pages_per_query
    # Go to the last page
    html = Nokogiri::HTML.parse(run_initial_query)
    if !html.css("div.navbar_bottom").css("a").empty?
      last_page_link = @base_url+html.css("div.navbar_bottom").css("a").last['href']
      # Collapse doubled slashes in the path only. The previous
      # gsub("//", "/") also mangled the "https://" scheme separator,
      # producing invalid "https:/..." URLs; the lookbehind skips the
      # "//" that follows the scheme's colon.
      last_page_html = Nokogiri::HTML.parse(get_page(last_page_link.gsub(%r{(?<!:)//}, "/")))

      # Parse the page numbers in last page
      return last_page_html.css("div.navbar_bottom").css("strong").text
    else # Just one page of results
      return "1"
    end
  end

  # Goes to the next page (25 results per page; page 1 is the base URL)
  def goto_next_page(base_query_url, num)
    start_index = (num-1)*25

    # Set the URL for the next page appropriately
    if start_index == 0 || num == 0
      next_page_url = base_query_url
    else
      next_page_url = base_query_url+"/start/"+start_index.to_s
    end

    return get_page(next_page_url)
  end

  # Open the page, retrying on failure (the i parameter is kept for
  # interface compatibility and is unused)
  def get_page(url, i=0)
    get_retry(url, @requests, 0)
  end

  # Get the initial results page
  def run_initial_query
    url = get_base_query_url
    return get_page(url)
  end

  # Get the base query url depending on type
  def get_base_query_url
    if @crawl_type == "all"
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/+/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS/sort/time"
    elsif @crawl_type == "search"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/"+encoded_term+"/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS"
    elsif @crawl_type == "filter"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/keywords/+/"+@filter+"[]/"+encoded_term+"/zip/+/zip_radius/20"
    elsif @crawl_type == "company_page"
      return @base_url+"view-employer/employer_id_seo/"+@search_term
    end
  end

  # Return JSON
  def gen_json
    return @reporter.gen_json
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'pry'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
load 'util/failure_handler.rb'
|
6
|
+
|
7
|
+
# Parses a single ClearedJobs.net job listing page into a Hash of fields.
class ClearedJobsNetParser
  include FailureHandler

  # html         - raw HTML String of the listing page
  # details_hash - {url:, posting_date:} collected from the results page
  # requests     - optional request manager used when retrying fetches
  def initialize(html, details_hash, requests=nil)
    @html = Nokogiri::HTML.parse(html)
    @requests = requests
    @url = details_hash[:url]
    @i = 0  # retry counter for failed parses
    @posting_date = details_hash[:posting_date]
  end

  # Parses the job. On any error the page is re-fetched and parsing is
  # retried, up to 10 attempts; returns nil if every attempt fails.
  def parse_job
    begin
      return {
        url: @url,
        html: @html.to_html,
        posting_date: @posting_date,
        company_name: company_name,
        company_listing_link: company_listing_link,
        required_clearance: required_clearance,
        location: location,
        country: country,
        salary: salary,
        job_number: job_number,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext
      }
    rescue
      @i += 1
      if @i < 10
        @html = Nokogiri::HTML.parse(get_retry(@url, @requests, @i))
        parse_job
      end
    end
  end

  # Gets the company name
  def company_name
    @html.css("div.view_job_table").css("div.row")[0].css(".left2").text
  end

  # Get the link to the company page
  def company_listing_link
    @html.css("div.view_job_table").css("div.row")[0].css(".left2").css("a")[0]['href']
  end

  # Gets the clearance level required
  def required_clearance
    @html.css("div.view_job_table").css("div.row")[0].css(".clearAll")[1].text
  end

  # Get the location of work
  def location
    @html.css("div.view_job_table").css("div.row")[1].css(".left2").text
  end

  # Get the country of work
  def country
    @html.css("div.view_job_table").css("div.row")[1].css(".right2").text
  end

  # Get the salary (strip removes whitespace from both ends; the old
  # trailing .lstrip was redundant)
  def salary
    @html.css("div.view_job_table").css("div.row")[2].css(".left2").text.strip
  end

  # Get the job number
  def job_number
    @html.css("div.view_job_table").css("div.row")[2].css(".right2").text.strip
  end

  # Get the job title
  def job_title
    @html.css("#view_employer").text
  end

  # Get the job description as HTML
  def job_description
    @html.css(".view-job-right").to_html
  end

  # Get the job description as plain text (<br> variants become newlines)
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end
end
|