tsjobcrawler 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
3
+
4
+ group :test do
5
+ gem "simplecov"
6
+ gem "codeclimate-test-reporter", "~> 1.0.0"
7
+ end
data/README.md ADDED
@@ -0,0 +1,28 @@
1
+ This is a crawler for job listings that require security clearance.
2
+
3
+ To run:
4
+
5
+ t = TSJobCrawler.new("search term" (or nil), request_manager, cm_hash or nil)
6
+
7
+ t.crawl_jobs
8
+
9
+
10
+ For example:
11
+
12
+ Headless.ly do
13
+
14
+ r = RequestManager.new(nil, [0, 0], 1)
15
+
16
+ t = TSJobCrawler.new(nil, r, nil)
17
+
18
+ t.crawl_jobs
19
+
20
+ File.write("test.json", t.gen_json)
21
+
22
+ end
23
+
24
+
25
+ If you input nil for the search term, it downloads as many job listings as
26
+ possible. Unless you have a lot of RAM, you should run it through Harvester if
27
+ you want to download as many listings as possible as then you can take
28
+ advantage of incremental result reporting.
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "tsjobcrawler"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,86 @@
1
+ require 'pry'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+ require 'cgi'
5
+ require 'json'
6
+ require 'requestmanager'
7
+ require 'headless'
8
+ require 'harvesterreporter'
9
+
10
+ load 'clearancejobscom/clearance_jobs_com_parser.rb'
11
+ load 'util/failure_handler.rb'
12
+
13
class ClearanceJobsComCrawler
  include FailureHandler

  # Crawls clearancejobs.com search results and reports parsed listings.
  #
  # search_term - String keyword query, or nil to crawl every listing.
  # requests    - optional RequestManager used for rate-limited fetching.
  # cm_hash     - optional crawler-manager hash forwarded to HarvesterReporter.
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @base_url = set_base_url

    # Incremental result reporting goes through the shared reporter.
    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Walk every result page and parse the listings linked from each one.
  def crawl
    total_pages = get_page_count

    (1..total_pages).each do |num|
      links = collect_links_on_page(get_next_page(num))
      parse_listings(links)
    end
  end

  # Build (and memoize in @base_url) the search URL for this crawl.
  # A nil search term means "all jobs"; otherwise the term is CGI-escaped
  # into the keywords parameter.
  def set_base_url
    @base_url =
      if @search_term.nil?
        "https://www.clearancejobs.com/jobs?"
      else
        "https://www.clearancejobs.com/jobs?keywords="+CGI.escape(@search_term)+"&zip_text="
      end
  end

  # URL of the result page at the given (1-based) page number.
  def get_next_page_url(page_num)
    @base_url+"PAGE="+page_num.to_s+"&limit=25"
  end

  # Fetch a page, delegating retries to FailureHandler#get_retry.
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Total number of result pages, derived from the "x of y" counter shown
  # on the first results page (25 results per page).
  def get_page_count
    doc = Nokogiri::HTML.parse(get_next_page(1))
    result_total = doc.css("#viewing").text.split(" of ")[1].gsub(",", "").to_i
    (result_total/25.0).ceil
  end

  # Fetch the HTML for the given result-page number.
  def get_next_page(page_num)
    get_page(get_next_page_url(page_num))
  end

  # Extract the listing-detail hrefs from a result page.
  def collect_links_on_page(page)
    doc = Nokogiri::HTML.parse(page)
    doc.css(".cj-search-result-item-title").css("a").map{|anchor| anchor['href']}
  end

  # Fetch and parse each listing URL, pushing successful parses to the reporter.
  def parse_listings(listings)
    found_listings = Array.new
    listings.each do |listing_url|
      parsed = ClearanceJobsComParser.new(listing_url, get_page(listing_url), @requests).parse
      found_listings.push(parsed) if parsed
    end

    @reporter.report_results(found_listings, listings.first)
  end

  # JSON for everything reported so far.
  def gen_json
    @reporter.gen_json
  end
end
@@ -0,0 +1,137 @@
1
+ require 'pry'
2
+ require 'open-uri'
3
+ require 'nokogiri'
4
+
5
+ load 'util/failure_handler.rb'
6
+
7
class ClearanceJobsComParser
  include FailureHandler

  # Parses a single clearancejobs.com listing page into a field hash.
  #
  # url      - listing URL (kept in the output and used for retry fetches).
  # page     - raw HTML of the listing page.
  # requests - optional RequestManager passed to FailureHandler#get_retry.
  def initialize(url, page, requests=nil)
    @url = url
    @requests = requests
    @i = 0                             # retry counter
    @html = page                       # raw HTML, kept for the output hash
    @page = Nokogiri::HTML.parse(page)
  end

  # Parse the listing into a hash of fields.
  #
  # On any extraction error the page is re-fetched and parsing retried, up
  # to 10 attempts; returns nil if every attempt fails.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        required_travel: required_travel,
        salary: salary,
        salary_notes: salary_notes,
        job_category: job_category,
        group_id: group_id,
        # Fix: required_experience was listed twice in the original hash;
        # the duplicate key is removed (resulting hash is unchanged).
        required_experience: required_experience,
        employment_status: employment_status,
        required_clearance: required_clearance,
        work_environment: work_environment,
        posting_date: posting_date,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        # Fix: refresh both the raw HTML and the parsed document. The
        # original stored the *parsed* document in @html (corrupting the
        # html: field) and never updated @page, so every retry re-parsed
        # the same stale page.
        @html = get_retry(@url, @requests, @i)
        @page = Nokogiri::HTML.parse(@html)
        parse
      end
    end
  end

  # Get the company name
  def company_name
    @page.css("h2").text
  end

  # Get the job location (digits stripped from the hiring-org heading)
  def location
    raw_location = @page.css("div").select{|e| e['itemprop'] == "hiringOrganization"}[0].css("h3").text
    raw_location.gsub(/(\d)/, "").strip if raw_location
  end

  # Get the job title
  def job_title
    @page.css("h1").text
  end

  # Get the job description as HTML
  def job_description
    @page.css("div.margin-bottom-20").select{|e| e['itemprop'] == "description"}[0].to_html
  end

  # Get the job description with <br> tags converted to newlines and the
  # remaining markup stripped
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Get if there is travel required
  def required_travel
    get_element_value("Travel:")
  end

  # Get the salary
  def salary
    get_element_value("Compensation:")
  end

  # Get notes about the salary
  def salary_notes
    salary_info = get_element_value("Compensation Comments:")
    # strip removes leading and trailing whitespace; the original's
    # lstrip.strip chain was redundant.
    salary_info.strip if salary_info
  end

  # Get the job category
  def job_category
    get_element_value("Job Category:")
  end

  # Get the group ID
  def group_id
    get_element_value("Group ID")
  end

  # Get the # of years of experience required
  def required_experience
    get_element_value("Minimum Experience Required")
  end

  # Get the employment status for the position
  def employment_status
    get_element_value("Status:")
  end

  # Get the clearance level
  def required_clearance
    get_element_value("Minimum Clearance")
  end

  # Get the work environment
  def work_environment
    get_element_value("Workplace:")
  end

  # Get the date of the posting (from the meta tag inside the element)
  def posting_date
    element = get_element("Post Date:")
    DateTime.parse(element[0].css("meta")[0]['content'])
  end

  # Get the <strong> text of the first job-data element matching the phrase
  def get_element_value(phrase)
    element = get_element(phrase)[0]
    element.css("strong").text if element
  end

  # Get the job-data elements whose text includes the phrase
  def get_element(phrase)
    @page.css(".cj-job-data").select{|d| d.text.include?(phrase) }
  end
end
@@ -0,0 +1,141 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'cgi'
4
+ require 'json'
5
+ require 'requestmanager'
6
+ require 'harvesterreporter'
7
+
8
+ load 'clearedjobsnet/cleared_jobs_net_parser.rb'
9
+ load 'util/failure_handler.rb'
10
+
11
class ClearedJobsNetCrawler
  include FailureHandler

  # Crawls clearedjobs.net listings.
  #
  # crawl_type  - "all" to crawl everything; otherwise treated as the search
  #               term / employer id (see filter_name).
  # filter_name - nil for a plain keyword search, "company_page" to crawl a
  #               company page (crawl_type is then the employer id), or any
  #               other value to use as an advanced-search filter name.
  # requests    - optional RequestManager for rate-limited fetching.
  # cm_hash     - optional crawler-manager hash forwarded to HarvesterReporter.
  def initialize(crawl_type, filter_name=nil, requests=nil, cm_hash=nil)
    @base_url = "https://clearedjobs.net/"
    @output = Array.new
    @requests = requests

    # Handle crawler manager info
    @reporter = HarvesterReporter.new(cm_hash)

    # Get all items
    if crawl_type == "all"
      @crawl_type = crawl_type

    # Get a company page
    elsif filter_name == "company_page"
      @crawl_type = "company_page"
      @search_term = crawl_type

    # Add a filter
    elsif filter_name
      @crawl_type = "filter"
      @filter = filter_name
      @search_term = crawl_type

    # Query search
    else
      @crawl_type = "search"
      @search_term = crawl_type
    end
  end

  # Crawls the listings for the query, one results page at a time.
  def crawl_listings
    pages_to_crawl = get_num_pages_per_query.to_i
    base_url = get_base_query_url

    # Loop through pages and collect links for each
    (1..pages_to_crawl).each do |page_num|
      next_page_html = goto_next_page(base_url, page_num)
      collect_page_links(next_page_html)
    end
  end

  # Collect the listing URL and posting date from each result row, parse
  # the listings, and return the row hashes ({url:, posting_date:}).
  def collect_page_links(page_html)
    html = Nokogiri::HTML.parse(page_html)
    result_rows = html.css("table.search_res").css("tbody").css("tr")

    # Parse URL and date from each row
    parsed_result_rows = result_rows.map do |r|
      link = (@base_url+r.css("a")[0]['href']).split("/keywords")[0]
      date = Date.parse(r.css("div").select{|e| e.text.include?("Posted - ")}.first.text.gsub("Posted - ", ""))
      {url: link, posting_date: date}
    end

    parse_all_listings(parsed_result_rows)
    return parsed_result_rows
  end

  # Fetch and parse every listing, then report the batch to the reporter.
  def parse_all_listings(listing_links)
    found_listings = Array.new
    listing_links.each do |listing|
      parser = ClearedJobsNetParser.new(get_page(listing[:url]), listing, @requests)
      found_listings.push(parser.parse_job)
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Gets the number of result pages by following the bottom nav bar to the
  # last page and reading its highlighted page number. Returns a String.
  def get_num_pages_per_query
    # Go to the last page
    html = Nokogiri::HTML.parse(run_initial_query)
    if !html.css("div.navbar_bottom").css("a").empty?
      last_page_link = @base_url+html.css("div.navbar_bottom").css("a").last['href']
      # NOTE(review): gsub("//", "/") also collapses the "//" in "https://";
      # confirm the resulting URL is actually fetchable before changing this.
      last_page_html = Nokogiri::HTML.parse(get_page(last_page_link.gsub("//", "/")))

      # Parse the page numbers in last page
      return last_page_html.css("div.navbar_bottom").css("strong").text
    else # Just one page of results
      return "1"
    end
  end

  # Fetch results page `num` (1-based); pages after the first are addressed
  # with a /start/<offset> suffix, 25 results per page.
  def goto_next_page(base_query_url, num)
    start_index = (num-1)*25

    # Set the URL for the next page appropriately
    if start_index == 0 || num == 0
      next_page_url = base_query_url
    else
      next_page_url = base_query_url+"/start/"+start_index.to_s
    end

    return get_page(next_page_url)
  end

  # Open the page, retrying via FailureHandler#get_retry.
  # Fix: forward the caller-supplied retry counter `i` instead of always
  # passing a literal 0 (the parameter was previously ignored). Existing
  # one-argument callers are unaffected by the default.
  def get_page(url, i=0)
    get_retry(url, @requests, i)
  end

  # Get the initial results page
  def run_initial_query
    url = get_base_query_url
    return get_page(url)
  end

  # Get the base query url depending on the configured crawl type
  def get_base_query_url
    if @crawl_type == "all"
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/+/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS/sort/time"
    elsif @crawl_type == "search"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/zip_radius/20/keywords/"+encoded_term+"/city_state_zip/+/security_clearance/+/submit/SEARCH+JOBS"
    elsif @crawl_type == "filter"
      encoded_term = CGI.escape(@search_term)
      return @base_url+"search/action/advanced_search/keywords/+/"+@filter+"[]/"+encoded_term+"/zip/+/zip_radius/20"
    elsif @crawl_type == "company_page"
      return @base_url+"view-employer/employer_id_seo/"+@search_term
    end
  end

  # Return JSON for all reported results
  def gen_json
    return @reporter.gen_json
  end
end
@@ -0,0 +1,93 @@
1
+ require 'nokogiri'
2
+ require 'pry'
3
+ require 'open-uri'
4
+
5
+ load 'util/failure_handler.rb'
6
+
7
class ClearedJobsNetParser
  include FailureHandler

  # Parses one clearedjobs.net listing page.
  #
  # html         - raw HTML of the listing page.
  # details_hash - hash carrying :url and :posting_date from the search row.
  # requests     - optional RequestManager used when retrying fetches.
  def initialize(html, details_hash, requests=nil)
    @html = Nokogiri::HTML.parse(html)
    @requests = requests
    @url = details_hash[:url]
    @i = 0  # retry counter
    @posting_date = details_hash[:posting_date]
  end

  # Parse the job into a field hash. On failure the page is re-fetched and
  # parsing retried, up to 10 attempts; nil is returned if all attempts fail.
  def parse_job
    {
      url: @url,
      html: @html.to_html,
      posting_date: @posting_date,
      company_name: company_name,
      company_listing_link: company_listing_link,
      required_clearance: required_clearance,
      location: location,
      country: country,
      salary: salary,
      job_number: job_number,
      job_title: job_title,
      job_description: job_description,
      job_description_plaintext: job_description_plaintext
    }
  rescue
    @i += 1
    if @i < 10
      @html = Nokogiri::HTML.parse(get_retry(@url, @requests, @i))
      parse_job
    end
  end

  # Company name from the first details row
  def company_name
    job_table_row(0).css(".left2").text
  end

  # Link to the company's listing page
  def company_listing_link
    job_table_row(0).css(".left2").css("a")[0]['href']
  end

  # Clearance level required for the position
  def required_clearance
    job_table_row(0).css(".clearAll")[1].text
  end

  # Location of work
  def location
    job_table_row(1).css(".left2").text
  end

  # Country of work
  def country
    job_table_row(1).css(".right2").text
  end

  # Salary, whitespace-trimmed (strip covers the original strip.lstrip)
  def salary
    job_table_row(2).css(".left2").text.strip
  end

  # Job number, whitespace-trimmed
  def job_number
    job_table_row(2).css(".right2").text.strip
  end

  # Job title
  def job_title
    @html.css("#view_employer").text
  end

  # Job description as HTML
  def job_description
    @html.css(".view-job-right").to_html
  end

  # Job description with <br> variants converted to newlines and all other
  # markup stripped
  def job_description_plaintext
    with_breaks = job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")
    Nokogiri::HTML.parse(with_breaks).text
  end

  private

  # The nth row of the job-details table (extracted from the repeated
  # @html.css("div.view_job_table").css("div.row")[n] chain).
  def job_table_row(n)
    @html.css("div.view_job_table").css("div.row")[n]
  end
end