tsjobcrawler 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Runtime dependencies come from the gemspec; only test tooling is added here.
source 'https://rubygems.org'
gemspec

group :test do
  gem "simplecov"
  gem "codeclimate-test-reporter", "~> 1.0.0"
end
data/README.md ADDED
@@ -0,0 +1,28 @@
1
+ This is a crawler for job listings that require security clearance.
2
+
3
+ To run:
4
+
5
+ t = TSJobCrawler.new("search term" (or nil), request_manager, cm_hash or nil)
6
+
7
+ t.crawl_jobs
8
+
9
+
10
+ For example:
11
+
12
+ Headless.ly do
13
+
14
+ r = RequestManager.new(nil, [0, 0], 1)
15
+
16
+ t = TSJobCrawler.new(nil, r, nil)
17
+
18
+ t.crawl_jobs
19
+
20
+ File.write("test.json", t.gen_json)
21
+
22
+ end
23
+
24
+
25
+ If you input nil for the search term, it downloads as many job listings as
26
+ possible. Unless you have a lot of RAM, you should run it through Harvester
27
+ when downloading as many listings as possible, since Harvester lets you take
28
+ advantage of incremental result reporting.
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Interactive console for experimenting with the gem: loads the bundle and
# the tsjobcrawler library, then drops into IRB.

require "bundler/setup"
require "tsjobcrawler"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
#!/bin/bash
# Installs the gem's dependencies; run once after cloning the repository.
# -e: exit on error, -u: error on unset vars, -o pipefail: fail broken pipes.
set -euo pipefail
IFS=$'\n\t'

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,86 @@
1
+ require 'pry'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'cgi'
5
+ require 'json'
6
+ require 'requestmanager'
7
+ require 'headless'
8
+ require 'harvesterreporter'
9
+
10
+ load 'clearancejobscom/clearance_jobs_com_parser.rb'
11
+ load 'util/failure_handler.rb'
12
+
13
# Crawler for security-clearance job listings on clearancejobs.com.
# Walks every page of search results, extracts the listing links on each,
# parses every listing, and forwards results through a HarvesterReporter.
class ClearanceJobsComCrawler
  include FailureHandler

  # search_term - keyword query, or nil to crawl all listings
  # requests    - request manager handed to get_retry (may be nil)
  # cm_hash     - crawler-manager settings forwarded to HarvesterReporter
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @base_url = set_base_url

    # Handle crawler manager info
    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Run the crawler over every result page.
  def crawl
    total_pages = get_page_count

    (1..total_pages).each do |num|
      parse_listings(collect_links_on_page(get_next_page(num)))
    end
  end

  # Choose the base search URL: keyword-filtered when a term was given,
  # otherwise the unfiltered listing index.
  def set_base_url
    @base_url =
      if @search_term.nil?
        "https://www.clearancejobs.com/jobs?"
      else
        "https://www.clearancejobs.com/jobs?keywords=" + CGI.escape(@search_term) + "&zip_text="
      end
  end

  # Build the URL for the given results page (25 listings per page).
  def get_next_page_url(page_num)
    "#{@base_url}PAGE=#{page_num}&limit=25"
  end

  # Fetch a page with retry-on-failure semantics.
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Derive the total page count from the "x of y" result counter on page 1.
  def get_page_count
    doc = Nokogiri::HTML.parse(get_next_page(1))
    total_results = doc.css("#viewing").text.split(" of ")[1].gsub(",", "").to_i
    (total_results / 25.0).ceil
  end

  # Fetch the HTML for the given page number.
  def get_next_page(page_num)
    get_page(get_next_page_url(page_num))
  end

  # Pull every listing link out of a results page.
  def collect_links_on_page(page)
    doc = Nokogiri::HTML.parse(page)
    doc.css(".cj-search-result-item-title").css("a").map { |anchor| anchor['href'] }
  end

  # Parse each listing and report the ones that parsed successfully.
  def parse_listings(listings)
    parsed = listings.each_with_object([]) do |listing_url, acc|
      result = ClearanceJobsComParser.new(listing_url, get_page(listing_url), @requests).parse
      acc << result if result
    end

    @reporter.report_results(parsed, listings.first)
  end

  # JSON for all results collected so far.
  def gen_json
    @reporter.gen_json
  end
end
@@ -0,0 +1,137 @@
1
+ require 'pry'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+
5
+ load 'util/failure_handler.rb'
6
+
7
# Parses a single clearancejobs.com job listing page into a hash of fields.
class ClearanceJobsComParser
  include FailureHandler

  # url      - listing URL (kept for retries and echoed in the output hash)
  # page     - raw HTML of the listing page
  # requests - request manager handed to get_retry (may be nil)
  def initialize(url, page, requests=nil)
    @url = url
    @requests = requests
    @i = 0 # retry counter used when parsing fails
    @html = page
    @page = Nokogiri::HTML.parse(page)
  end

  # Parse the profile into a field hash. On any error the page is re-fetched
  # and parsing retried up to 10 times; returns nil if all retries fail.
  # BUGFIX: required_experience was listed twice in this hash; the duplicate
  # key (which silently overwrote the first and triggered a Ruby duplicate-key
  # warning) has been removed. Output is unchanged.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        required_travel: required_travel,
        salary: salary,
        salary_notes: salary_notes,
        job_category: job_category,
        group_id: group_id,
        required_experience: required_experience,
        employment_status: employment_status,
        required_clearance: required_clearance,
        work_environment: work_environment,
        posting_date: posting_date,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        @html = Nokogiri::HTML.parse(get_retry(@url, @requests, @i))
        parse
      end
    end
  end

  # Get the company name
  def company_name
    @page.css("h2").text
  end

  # Get the job location (digits stripped from the hiringOrganization header)
  def location
    raw_location = @page.css("div").select{|e| e['itemprop'] == "hiringOrganization"}[0].css("h3").text
    raw_location.gsub(/(\d)/, "").strip if raw_location
  end

  # Get the job title
  def job_title
    @page.css("h1").text
  end

  # Get the job description as HTML
  def job_description
    @page.css("div.margin-bottom-20").select{|e| e['itemprop'] == "description"}[0].to_html
  end

  # Get the job description as plain text (<br> variants become newlines)
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Get if there is travel required
  def required_travel
    get_element_value("Travel:")
  end

  # Get the salary
  def salary
    get_element_value("Compensation:")
  end

  # Get notes about the salary
  def salary_notes
    salary_info = get_element_value("Compensation Comments:")
    salary_info.lstrip.strip if salary_info
  end

  # Get the job category
  def job_category
    get_element_value("Job Category:")
  end

  # Get the group ID
  def group_id
    get_element_value("Group ID")
  end

  # Get the # of years of experience required
  def required_experience
    get_element_value("Minimum Experience Required")
  end

  # Get the employment status for the position
  def employment_status
    get_element_value("Status:")
  end

  # Get the clearance level
  def required_clearance
    get_element_value("Minimum Clearance")
  end

  # Get the work environment
  def work_environment
    get_element_value("Workplace:")
  end

  # Get the date of the posting from the embedded <meta> tag's content attr
  def posting_date
    element = get_element("Post Date:")
    DateTime.parse(element[0].css("meta")[0]['content'])
  end

  # Get the <strong> text of the first data element containing the phrase
  def get_element_value(phrase)
    element = get_element(phrase)[0]
    element.css("strong").text if element
  end

  # Get the job-data elements whose text includes the phrase
  def get_element(phrase)
    @page.css(".cj-job-data").select{|d| d.text.include?(phrase) }
  end
end
@@ -0,0 +1,141 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'cgi'
4
+ require 'json'
5
+ require 'requestmanager'
6
+ require 'harvesterreporter'
7
+
8
+ load 'clearedjobsnet/cleared_jobs_net_parser.rb'
9
+ load 'util/failure_handler.rb'
10
+
11
# Crawler for clearedjobs.net. Supports four modes: crawl everything ("all"),
# plain keyword search, filtered search, or a single company page.
class ClearedJobsNetCrawler
  include FailureHandler

  # crawl_type  - "all" for a full crawl; otherwise the search term/company id
  # filter_name - nil (plain search), "company_page", or a filter field name
  # requests    - request manager handed to get_retry (may be nil)
  # cm_hash     - crawler-manager settings forwarded to HarvesterReporter
  def initialize(crawl_type, filter_name=nil, requests=nil, cm_hash=nil)
    @base_url = "https://clearedjobs.net/"
    @output = Array.new
    @requests = requests

    # Handle crawler manager info
    @reporter = HarvesterReporter.new(cm_hash)

    # Get all items
    if crawl_type == "all"
      @crawl_type = crawl_type

    # Get a company page
    elsif filter_name == "company_page"
      @crawl_type = "company_page"
      @search_term = crawl_type

    # Add a filter
    elsif filter_name
      @crawl_type = "filter"
      @filter = filter_name
      @search_term = crawl_type

    # Query search
    else
      @crawl_type = "search"
      @search_term = crawl_type
    end
  end

  # Crawls the listings for the query, page by page.
  def crawl_listings
    pages_to_crawl = get_num_pages_per_query.to_i
    base_url = get_base_query_url

    # Loop through pages and collect links for each
    (1..pages_to_crawl).each do |page_num|
      next_page_html = goto_next_page(base_url, page_num)
      collect_page_links(next_page_html)
    end
  end

  # Collect the listing URL and posting date from every result row, then
  # parse the listings. Returns the array of {url:, posting_date:} hashes.
  def collect_page_links(page_html)
    html = Nokogiri::HTML.parse(page_html)
    result_rows = html.css("table.search_res").css("tbody").css("tr")

    # Parse URL and date from each row
    parsed_result_rows = result_rows.map do |r|
      link = (@base_url+r.css("a")[0]['href']).split("/keywords")[0]
      date = Date.parse(r.css("div").select{|e| e.text.include?("Posted - ")}.first.text.gsub("Posted - ", ""))
      {url: link, posting_date: date}
    end

    parse_all_listings(parsed_result_rows)
    return parsed_result_rows
  end

  # Parse all of the listings and report the results
  def parse_all_listings(listing_links)
    found_listings = Array.new
    listing_links.each do |listing|
      parser = ClearedJobsNetParser.new(get_page(listing[:url]), listing, @requests)
      found_listings.push(parser.parse_job)
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Gets the number of pages for the query by following the "last page" link
  # in the bottom pagination bar.
  def get_num_pages_per_query
    # Go to the last page
    html = Nokogiri::HTML.parse(run_initial_query)
    if !html.css("div.navbar_bottom").css("a").empty?
      last_page_link = @base_url+html.css("div.navbar_bottom").css("a").last['href']

      # BUGFIX: collapse only the doubled path slash introduced by
      # base_url + href; the previous bare gsub("//", "/") also corrupted the
      # scheme separator, turning "https://..." into "https:/...".
      last_page_html = Nokogiri::HTML.parse(get_page(last_page_link.gsub(%r{(?<!:)//}, "/")))

      # Parse the page numbers in last page
      return last_page_html.css("div.navbar_bottom").css("strong").text
    else # Just one page of results
      return "1"
    end
  end

  # Goes to the next page (25 results per page)
  def goto_next_page(base_query_url, num)
    start_index = (num-1)*25

    # Set the URL for the next page appropriately
    if start_index == 0 || num == 0
      next_page_url = base_query_url
    else
      next_page_url = base_query_url+"/start/"+start_index.to_s
    end

    return get_page(next_page_url)
  end

  # Open the page. BUGFIX: the retry attempt count i is now forwarded to
  # get_retry instead of a hard-coded 0 (the default, 0, is unchanged, so
  # existing one-argument callers behave exactly as before).
  def get_page(url, i=0)
    get_retry(url, @requests, i)
  end

  # Get the initial results page
  def run_initial_query
    url = get_base_query_url
    return get_page(url)
  end

  # Get the base query url depending on crawl type
  def get_base_query_url
    if @crawl_type == "all"
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/+/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS/sort/time"
    elsif @crawl_type == "search"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/"+encoded_term+"/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS"
    elsif @crawl_type == "filter"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/keywords/+/"+@filter+"[]/"+encoded_term+"/zip/+/zip_radius/20"
    elsif @crawl_type == "company_page"
      return @base_url+"view-employer/employer_id_seo/"+@search_term
    end
  end

  # Return JSON for all reported results
  def gen_json
    return @reporter.gen_json
  end
end
@@ -0,0 +1,93 @@
1
+ require 'nokogiri'
2
+ require 'pry'
3
+ require 'open-uri'
4
+
5
+ load 'util/failure_handler.rb'
6
+
7
# Parses a single clearedjobs.net job listing page into a hash of fields.
class ClearedJobsNetParser
  include FailureHandler

  # html         - raw HTML of the listing page
  # details_hash - {url:, posting_date:} collected from the search results
  # requests     - request manager handed to get_retry (may be nil)
  def initialize(html, details_hash, requests=nil)
    @html = Nokogiri::HTML.parse(html)
    @requests = requests
    @url = details_hash[:url]
    @i = 0 # retry counter used when parsing fails
    @posting_date = details_hash[:posting_date]
  end

  # Parse the job into a field hash. On any error the page is re-fetched and
  # parsing retried up to 10 times; returns nil if all retries fail.
  def parse_job
    {
      url: @url,
      html: @html.to_html,
      posting_date: @posting_date,
      company_name: company_name,
      company_listing_link: company_listing_link,
      required_clearance: required_clearance,
      location: location,
      country: country,
      salary: salary,
      job_number: job_number,
      job_title: job_title,
      job_description: job_description,
      job_description_plaintext: job_description_plaintext
    }
  rescue
    @i += 1
    if @i < 10
      @html = Nokogiri::HTML.parse(get_retry(@url, @requests, @i))
      parse_job
    end
  end

  # Company name from the first details row
  def company_name
    detail_row(0).css(".left2").text
  end

  # Link to the company's employer page
  def company_listing_link
    detail_row(0).css(".left2").css("a")[0]['href']
  end

  # Clearance level required for the position
  def required_clearance
    detail_row(0).css(".clearAll")[1].text
  end

  # City/state of the position
  def location
    detail_row(1).css(".left2").text
  end

  # Country of the position
  def country
    detail_row(1).css(".right2").text
  end

  # Salary text with surrounding whitespace removed
  def salary
    detail_row(2).css(".left2").text.strip
  end

  # Employer's job number with surrounding whitespace removed
  def job_number
    detail_row(2).css(".right2").text.strip
  end

  # Title of the job posting
  def job_title
    @html.css("#view_employer").text
  end

  # Job description as HTML
  def job_description
    @html.css(".view-job-right").to_html
  end

  # Job description with <br> variants converted to newlines, tags stripped
  def job_description_plaintext
    normalized = job_description.gsub('<br />', "\n").gsub('<br>', "\n").gsub('<br/>', "\n")
    Nokogiri::HTML.parse(normalized).text
  end

  private

  # The nth row of the job details table
  def detail_row(index)
    @html.css("div.view_job_table").css("div.row")[index]
  end
end