tsjobcrawler 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,77 @@
1
+ require 'json'
2
+ require 'headless'
3
+ require 'requestmanager'
4
+ load 'clearedjobsnet/cleared_jobs_net_crawler.rb'
5
+
6
# Crawls ClearedJobs.net from several angles (blank search, clearance level,
# country, company page, and a search-term list) and accumulates the unique
# listings seen across every pass.
class GetAllClearedJobs
  # requests - RequestManager instance used by the underlying crawler
  # cm_hash  - reporting/config hash forwarded to each ClearedJobsNetCrawler
  def initialize(requests, cm_hash)
    @output = Array.new
    @requests = requests
    # BUG FIX: the original line was the bare expression `@cm_hash` (no
    # assignment), so the constructor argument was discarded and every
    # crawler spawned in start_crawler received nil.
    @cm_hash = cm_hash
  end

  # Crawl through many options, broadest pass first.
  def crawl
    get_first_1000
    get_by_clearance
    get_by_country
    get_by_company
    get_by_searchterm
  end

  # Get the most recent jobs from a blank ("all") search.
  def get_first_1000
    start_crawler("all")
  end

  # Crawl once per security-clearance level listed in the terms file.
  def get_by_clearance
    clearance_levels = JSON.parse(File.read("clearedjobsnet/terms/clearance_levels.json"))
    crawl_each(clearance_levels, "security_clearance")
  end

  # Crawl once per country listed in the terms file.
  def get_by_country
    country_names = JSON.parse(File.read("clearedjobsnet/terms/country_names.json"))
    crawl_each(country_names, "country")
  end

  # Crawl the company pages listed in the terms file.
  def get_by_company
    company_names = JSON.parse(File.read("clearedjobsnet/terms/company_names.json"))
    crawl_each(company_names, "company_page")
  end

  # Crawl the plain search-term list (no filter category).
  def get_by_searchterm
    search_terms = JSON.parse(File.read("clearedjobsnet/terms/search_terms.json"))
    crawl_each(search_terms)
  end

  # Run one crawl per term; filter_name selects the site-side filter
  # category (nil = plain keyword search).
  def crawl_each(term_list, filter_name=nil)
    term_list.each do |term|
      start_crawler(term, filter_name)
    end
  end

  # Start a single crawler run and fold its results into the output.
  def start_crawler(search_term, filter=nil)
    c = ClearedJobsNetCrawler.new(search_term, filter, @requests, @cm_hash)
    c.crawl_listings
    save_listings(c.gen_json)
  end

  # Save listings in output, deduplicating via Array#| set union.
  # listings - JSON string (array of listing hashes)
  def save_listings(listings)
    @output = @output | JSON.parse(listings)
  end

  # Generates the accumulated output as pretty-printed JSON.
  def gen_json
    return JSON.pretty_generate(@output)
  end
end
77
+
@@ -0,0 +1,15 @@
1
+ [
2
+ "DoJ",
3
+ "IRS",
4
+ "Secret",
5
+ "Top Secret",
6
+ "Top Secret / SCI",
7
+ "Top Secret / SCI + Poly",
8
+ "Top Secret / SCI + CI Poly",
9
+ "Top Secret / SCI + Full Scope Poly",
10
+ "DHS",
11
+ "DoE",
12
+ "Public Trust",
13
+ "Confidential",
14
+ "Security Clearance Required"
15
+ ]
@@ -0,0 +1,17 @@
1
+ [
2
+ "sos-international-llc-0701",
3
+ "leidos-0062",
4
+ "caci-0108",
5
+ "northrop-grumman-0388",
6
+ "booz-allen-hamilton-0816",
7
+ "raytheon--0833",
8
+ "mantech-international-155811",
9
+ "csra-0194",
10
+ "engility-0380",
11
+ "hewlett-packard-enterprise-company-0212",
12
+ "pae-1023",
13
+ "jacobs-0868",
14
+ "los-alamos-national-laboratory-200019",
15
+ "amazon-web-services-0834",
16
+ "deloitte-0086"
17
+ ]
@@ -0,0 +1,202 @@
1
+ [
2
+ "Canada",
3
+ "United States",
4
+ "United Kingdom",
5
+ "Australia",
6
+ "Brazil",
7
+ "Afghanistan",
8
+ "Albania",
9
+ "American Samoa",
10
+ "Andorra",
11
+ "Angola",
12
+ "Anguilla",
13
+ "Antigua and Barbuda",
14
+ "Argentina",
15
+ "Armenia",
16
+ "Aruba",
17
+ "Austria",
18
+ "Azerbaijan Republic",
19
+ "Bahamas",
20
+ "Bahrain",
21
+ "Bangladesh",
22
+ "Barbados",
23
+ "Belarus",
24
+ "Belgium",
25
+ "Belize",
26
+ "Benin",
27
+ "Bermuda",
28
+ "Bhutan",
29
+ "Bolivia",
30
+ "Bosnia and Herzegovina",
31
+ "Botswana",
32
+ "British Virgin Islands",
33
+ "Brunei Darussalam",
34
+ "Bulgaria",
35
+ "Burkina Faso",
36
+ "Burma",
37
+ "Burundi",
38
+ "Cambodia",
39
+ "Cameroon",
40
+ "Cape Verde Islands",
41
+ "Cayman Islands",
42
+ "Central African Republic",
43
+ "Chile",
44
+ "Colombia",
45
+ "Comoros",
46
+ "Cook Islands",
47
+ "Costa Rica",
48
+ "Cote d Ivoire (Ivory Coast)",
49
+ "Croatia, Republic of",
50
+ "Cuba",
51
+ "Cyprus",
52
+ "Czech Republic",
53
+ "Denmark",
54
+ "Djibouti",
55
+ "Dominica",
56
+ "Dominican Republic",
57
+ "Ecuador",
58
+ "El Salvador",
59
+ "Equatorial Guinea",
60
+ "Eritrea",
61
+ "Estonia",
62
+ "Falkland Islands (Islas Malvinas)",
63
+ "Fiji",
64
+ "Finland",
65
+ "France",
66
+ "French Guiana",
67
+ "French Polynesia",
68
+ "Gabon Republic",
69
+ "Gambia",
70
+ "Georgia",
71
+ "Germany",
72
+ "Ghana",
73
+ "Gibraltar",
74
+ "Greece",
75
+ "Greenland",
76
+ "Grenada",
77
+ "Guadeloupe",
78
+ "Guam",
79
+ "Guatemala",
80
+ "Guernsey",
81
+ "Guinea",
82
+ "Guinea-Bissau",
83
+ "Guyana",
84
+ "Haiti",
85
+ "Honduras",
86
+ "Hong Kong",
87
+ "Hungary",
88
+ "Iceland",
89
+ "Indonesia",
90
+ "Iraq",
91
+ "Ireland",
92
+ "Israel",
93
+ "Italy",
94
+ "Jamaica",
95
+ "Jan Mayen",
96
+ "Japan",
97
+ "Jersey",
98
+ "Jordan",
99
+ "Kazakhstan",
100
+ "Kiribati",
101
+ "Korea, South",
102
+ "Kosovo",
103
+ "Kuwait",
104
+ "Kyrgyzstan",
105
+ "Laos",
106
+ "Latvia",
107
+ "Liechtenstein",
108
+ "Lithuania",
109
+ "Luxembourg",
110
+ "Macau",
111
+ "Macedonia",
112
+ "Madagascar",
113
+ "Malawi",
114
+ "Malaysia",
115
+ "Maldives",
116
+ "Malta",
117
+ "Marshall Islands",
118
+ "Martinique",
119
+ "Mauritania",
120
+ "Mauritius",
121
+ "Mayotte",
122
+ "Mexico",
123
+ "Moldova",
124
+ "Monaco",
125
+ "Mongolia",
126
+ "Montserrat",
127
+ "Mozambique",
128
+ "Namibia",
129
+ "Nauru",
130
+ "Nepal",
131
+ "Netherlands",
132
+ "Netherlands Antilles",
133
+ "New Caledonia",
134
+ "New Zealand",
135
+ "Nicaragua",
136
+ "Niger",
137
+ "Niue",
138
+ "Norway",
139
+ "Palau",
140
+ "Panama",
141
+ "Papua New Guinea",
142
+ "Paraguay",
143
+ "Peru",
144
+ "Philippines",
145
+ "Poland",
146
+ "Portugal",
147
+ "Puerto Rico",
148
+ "Qatar",
149
+ "Romania",
150
+ "Rwanda",
151
+ "Saint Helena",
152
+ "Saint Kitts-Nevis",
153
+ "Saint Lucia",
154
+ "Saint Pierre and Miquelon",
155
+ "Saint Vincent and the Grenadines",
156
+ "San Marino",
157
+ "Saudi Arabia",
158
+ "Senegal",
159
+ "Seychelles",
160
+ "Sierra Leone",
161
+ "Singapore",
162
+ "Slovakia",
163
+ "Slovenia",
164
+ "Solomon Islands",
165
+ "South Africa",
166
+ "Spain",
167
+ "Sri Lanka",
168
+ "Sudan",
169
+ "Suriname",
170
+ "Svalbard",
171
+ "Swaziland",
172
+ "Sweden",
173
+ "Switzerland",
174
+ "Tahiti",
175
+ "Taiwan",
176
+ "Tajikistan",
177
+ "Tanzania",
178
+ "Thailand",
179
+ "Togo",
180
+ "Tonga",
181
+ "Trinidad and Tobago",
182
+ "Tunisia",
183
+ "Turkey",
184
+ "Turkmenistan",
185
+ "Turks and Caicos Islands",
186
+ "Tuvalu",
187
+ "Uganda",
188
+ "Ukraine",
189
+ "United Arab Emirates",
190
+ "Uruguay",
191
+ "Uzbekistan",
192
+ "Vanuatu",
193
+ "Vatican City State",
194
+ "Venezuela",
195
+ "Virgin Islands (U.S.)",
196
+ "Wallis and Futuna",
197
+ "Western Sahara",
198
+ "Western Samoa",
199
+ "Zimbabwe",
200
+ "Yugoslavia",
201
+ "Zambia"
202
+ ]
@@ -0,0 +1,27 @@
1
+ [
2
+ "COMINT",
3
+ "OSINT",
4
+ "HUMINT",
5
+ "SIGINT",
6
+ "MASINT",
7
+ "GEOINT",
8
+ "IMINT",
9
+ "intelligence",
10
+ "analyst",
11
+ "target",
12
+ "linguist",
13
+ "research",
14
+ "software",
15
+ "network",
16
+ "supply",
17
+ "engineer",
18
+ "finance",
19
+ "cyber",
20
+ "geographic",
21
+ "manager",
22
+ "project",
23
+ "program",
24
+ "hardware",
25
+ "technician",
26
+ "administrative"
27
+ ]
@@ -0,0 +1,93 @@
1
+ require 'json'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'cgi'
5
+ require 'pry'
6
+ require 'requestmanager'
7
+ require 'headless'
8
+ require 'harvesterreporter'
9
+
10
+ load 'securityclearedjobscom/security_cleared_jobs_com_parser.rb'
11
+ load 'util/failure_handler.rb'
12
+
13
# Crawls securityclearedjobs.com (UK) search results for a keyword,
# parsing every listing on every results page and reporting them
# through a HarvesterReporter.
class SecurityClearedJobsComCrawler
  include FailureHandler

  # search_term - keyword to search for (nil = all UK jobs)
  # requests    - RequestManager used to fetch pages (with retry support)
  # cm_hash     - config hash forwarded to the HarvesterReporter
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @site_url = "https://www.securityclearedjobs.com"
    @query_base_url = set_base_url

    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Set the base url for the query; keyword is CGI-escaped when present.
  def set_base_url
    if @search_term == nil
      return @site_url+"/searchjobs/?countrycode=GB"
    else
      return @site_url+"/searchjobs/?countrycode=GB&Keywords="+CGI.escape(@search_term)
    end
  end

  # Get the page via the shared retry helper (FailureHandler#get_retry).
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Get the total pagecount from the paginator on the first results page.
  def get_total_pagecount
    initial_page = Nokogiri::HTML.parse(load_next_page(1))
    navbar = initial_page.css(".paginator__item").last
    last_page_link = navbar.css("a")[0]['href'] if navbar

    # BUG FIX: when the results fit on one page the paginator bar is absent
    # (navbar nil), and the original method fell through returning nil; crawl
    # then iterated (1..nil.to_i) == (1..0) and silently skipped every
    # listing. Default to a single page in that case.
    return 1 unless last_page_link

    page_count = last_page_link.split("&Page=")[1].to_i
    # A link without a &Page= number also means a single page.
    page_count == 0 ? 1 : page_count
  end

  # Load results page page_num and return its raw html.
  def load_next_page(page_num)
    next_page_url = @query_base_url + "&Page="+page_num.to_s
    return get_page(next_page_url)
  end

  # Extract the absolute listing urls from a results page.
  def save_result_links(page)
    html = Nokogiri::HTML.parse(page)
    return html.css(".lister__header").css("a").map{|e| @site_url+e['href']}
  end

  # Parse all the listings on a single results page and report them.
  def parse_listings(page)
    listing_links = save_result_links(page)
    found_listings = Array.new

    listing_links.each do |listing|
      parser = SecurityClearedJobsComParser.new(listing, get_page(listing), @requests)
      parsed_listing = parser.parse
      # parse returns nil after repeated failures; skip those.
      found_listings.push(parsed_listing) if parsed_listing
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Crawls all of the listings across every results page.
  def crawl
    total_pagecount = get_total_pagecount

    # Load each page
    (1..total_pagecount.to_i).each do |page_num|
      next_page = load_next_page(page_num)
      parse_listings(next_page)
    end
  end

  # Output the reporter's accumulated results as JSON.
  def gen_json
    return @reporter.gen_json
  end
end
93
+
@@ -0,0 +1,115 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'pry'
4
+
5
+ load 'util/failure_handler.rb'
6
+
7
# Parses a single securityclearedjobs.com listing page into a hash of
# job fields, retrying the fetch (up to 10 times) if parsing raises.
class SecurityClearedJobsComParser
  include FailureHandler

  # url      - listing url (kept in the output hash)
  # html     - raw html string of the listing page
  # requests - RequestManager used when a re-fetch is needed
  def initialize(url, html, requests=nil)
    @url = url
    @requests = requests
    @html = html
    @page = Nokogiri::HTML.parse(@html)
    # BUG FIX: retry counter must start at 0 — it was never initialized,
    # so the first rescue raised NoMethodError on nil + 1.
    @i = 0
  end

  # Parse the job listing; returns a field hash, or nil after 10 failures.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        salary: salary,
        posting_date: posting_date,
        closing_date: closing_date,
        job_number: job_number,
        contact_person: contact_person,
        employment_status: employment_status,
        required_clearance: required_clearance,
        job_category: job_category,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        # BUG FIX: the original assigned the *parsed document* to @html
        # (so the html: field would have held a Nokogiri object) and never
        # refreshed @page, re-parsing the same stale DOM. Keep @html as the
        # raw string and rebuild @page from it.
        @html = get_retry(@url, @requests, @i)
        @page = Nokogiri::HTML.parse(@html)
        parse
      end
    end
  end

  # Get the name of the hiring company.
  def company_name
    @page.css("div.cf[itemprop='hiringOrganization']")[0].css("span[itemprop='name']").text
  end

  # Get the job location.
  def location
    get_element("Location")
  end

  # Get the salary text.
  def salary
    get_element("Salary")
  end

  # Get the posting date as a Date.
  def posting_date
    Date.parse(get_element("Posted"))
  end

  # Get the closing date as a Date.
  def closing_date
    Date.parse(get_element("Closes"))
  end

  # Get the site's job reference number.
  def job_number
    get_element("Ref")
  end

  # Get the contact person.
  def contact_person
    get_element("Contact")
  end

  # Get the employment status (job type).
  def employment_status
    get_element("Job Type")
  end

  # Gets the clearance level(s) required, split into an array.
  def required_clearance
    get_element("Clearance Level").split(", ")
  end

  # Gets the sector(s) of the job, split into an array.
  def job_category
    get_element("Sector").split(", ")
  end

  # Get the job title.
  def job_title
    @page.css("h1[itemprop='title']").text
  end

  # Get the job description as html.
  def job_description
    @page.css("div[itemprop='description']").to_html
  end

  # Get the job description with <br> variants converted to newlines
  # and all other markup stripped.
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Find the "div.cf" field block whose text contains field_name and
  # return its <dd> value with whitespace collapsed; nil when absent.
  def get_element(field_name)
    element = @page.css("div.cf").select{|d| d.text.include?(field_name)}
    element[0].css("dd").text.strip.lstrip.gsub(/\s+/, " ") if !element.empty?
  end
end