tsjobcrawler 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ require 'json'
2
+ require 'headless'
3
+ require 'requestmanager'
4
+ load 'clearedjobsnet/cleared_jobs_net_crawler.rb'
5
+
6
+ # Get as many jobs as possible
7
# Crawls ClearedJobs.net via several search strategies -- a blank search,
# then per-clearance-level, per-country, per-company-page and per-keyword
# searches -- accumulating the unique listings found across all of them.
class GetAllClearedJobs
  # requests - request manager client handed to each ClearedJobsNetCrawler
  # cm_hash  - reporting/config hash handed to each ClearedJobsNetCrawler
  def initialize(requests, cm_hash)
    @output = Array.new
    @requests = requests
    # BUG FIX: the original had a bare `@cm_hash` expression here (no
    # assignment), so the argument was silently discarded and every
    # crawler received nil.
    @cm_hash = cm_hash
  end

  # Run every crawl strategy in sequence.
  def crawl
    get_first_1000
    get_by_clearance
    get_by_country
    get_by_company
    get_by_searchterm
  end

  # Get the most recent jobs from a blank search ("all").
  def get_first_1000
    start_crawler("all")
  end

  # Crawl once for each security clearance level in the terms file.
  def get_by_clearance
    clearance_levels = JSON.parse(File.read("clearedjobsnet/terms/clearance_levels.json"))
    crawl_each(clearance_levels, "security_clearance")
  end

  # Crawl once for each country in the terms file.
  def get_by_country
    country_names = JSON.parse(File.read("clearedjobsnet/terms/country_names.json"))
    crawl_each(country_names, "country")
  end

  # Crawl each company page slug listed in the terms file.
  def get_by_company
    company_names = JSON.parse(File.read("clearedjobsnet/terms/company_names.json"))
    crawl_each(company_names, "company_page")
  end

  # Crawl once per keyword in the search terms file (no filter applied).
  def get_by_searchterm
    search_terms = JSON.parse(File.read("clearedjobsnet/terms/search_terms.json"))
    crawl_each(search_terms)
  end

  # Start one crawler run per term in term_list, all sharing filter_name.
  def crawl_each(term_list, filter_name=nil)
    term_list.each do |term|
      start_crawler(term, filter_name)
    end
  end

  # Run a single ClearedJobsNetCrawler for search_term and merge its results.
  def start_crawler(search_term, filter=nil)
    c = ClearedJobsNetCrawler.new(search_term, filter, @requests, @cm_hash)
    c.crawl_listings
    save_listings(c.gen_json)
  end

  # Merge listings (a JSON array string) into @output; Array#| set union
  # drops listings already collected by an earlier strategy.
  def save_listings(listings)
    @output = @output | JSON.parse(listings)
  end

  # Generates the accumulated unique listings as pretty-printed JSON.
  def gen_json
    return JSON.pretty_generate(@output)
  end
end
77
+
@@ -0,0 +1,15 @@
1
+ [
2
+ "DoJ",
3
+ "IRS",
4
+ "Secret",
5
+ "Top Secret",
6
+ "Top Secret / SCI",
7
+ "Top Secret / SCI + Poly",
8
+ "Top Secret / SCI + CI Poly",
9
+ "Top Secret / SCI + Full Scope Poly",
10
+ "DHS",
11
+ "DoE",
12
+ "Public Trust",
13
+ "Confidential",
14
+ "Security Clearance Required"
15
+ ]
@@ -0,0 +1,17 @@
1
+ [
2
+ "sos-international-llc-0701",
3
+ "leidos-0062",
4
+ "caci-0108",
5
+ "northrop-grumman-0388",
6
+ "booz-allen-hamilton-0816",
7
+ "raytheon--0833",
8
+ "mantech-international-155811",
9
+ "csra-0194",
10
+ "engility-0380",
11
+ "hewlett-packard-enterprise-company-0212",
12
+ "pae-1023",
13
+ "jacobs-0868",
14
+ "los-alamos-national-laboratory-200019",
15
+ "amazon-web-services-0834",
16
+ "deloitte-0086"
17
+ ]
@@ -0,0 +1,202 @@
1
+ [
2
+ "Canada",
3
+ "United States",
4
+ "United Kingdom",
5
+ "Australia",
6
+ "Brazil",
7
+ "Afghanistan",
8
+ "Albania",
9
+ "American Samoa",
10
+ "Andorra",
11
+ "Angola",
12
+ "Anguilla",
13
+ "Antigua and Barbuda",
14
+ "Argentina",
15
+ "Armenia",
16
+ "Aruba",
17
+ "Austria",
18
+ "Azerbaijan Republic",
19
+ "Bahamas",
20
+ "Bahrain",
21
+ "Bangladesh",
22
+ "Barbados",
23
+ "Belarus",
24
+ "Belgium",
25
+ "Belize",
26
+ "Benin",
27
+ "Bermuda",
28
+ "Bhutan",
29
+ "Bolivia",
30
+ "Bosnia and Herzegovina",
31
+ "Botswana",
32
+ "British Virgin Islands",
33
+ "Brunei Darussalam",
34
+ "Bulgaria",
35
+ "Burkina Faso",
36
+ "Burma",
37
+ "Burundi",
38
+ "Cambodia",
39
+ "Cameroon",
40
+ "Cape Verde Islands",
41
+ "Cayman Islands",
42
+ "Central African Republic",
43
+ "Chile",
44
+ "Colombia",
45
+ "Comoros",
46
+ "Cook Islands",
47
+ "Costa Rica",
48
+ "Cote d Ivoire (Ivory Coast)",
49
+ "Croatia, Republic of",
50
+ "Cuba",
51
+ "Cyprus",
52
+ "Czech Republic",
53
+ "Denmark",
54
+ "Djibouti",
55
+ "Dominica",
56
+ "Dominican Republic",
57
+ "Ecuador",
58
+ "El Salvador",
59
+ "Equatorial Guinea",
60
+ "Eritrea",
61
+ "Estonia",
62
+ "Falkland Islands (Islas Malvinas)",
63
+ "Fiji",
64
+ "Finland",
65
+ "France",
66
+ "French Guiana",
67
+ "French Polynesia",
68
+ "Gabon Republic",
69
+ "Gambia",
70
+ "Georgia",
71
+ "Germany",
72
+ "Ghana",
73
+ "Gibraltar",
74
+ "Greece",
75
+ "Greenland",
76
+ "Grenada",
77
+ "Guadeloupe",
78
+ "Guam",
79
+ "Guatemala",
80
+ "Guernsey",
81
+ "Guinea",
82
+ "Guinea-Bissau",
83
+ "Guyana",
84
+ "Haiti",
85
+ "Honduras",
86
+ "Hong Kong",
87
+ "Hungary",
88
+ "Iceland",
89
+ "Indonesia",
90
+ "Iraq",
91
+ "Ireland",
92
+ "Israel",
93
+ "Italy",
94
+ "Jamaica",
95
+ "Jan Mayen",
96
+ "Japan",
97
+ "Jersey",
98
+ "Jordan",
99
+ "Kazakhstan",
100
+ "Kiribati",
101
+ "Korea, South",
102
+ "Kosovo",
103
+ "Kuwait",
104
+ "Kyrgyzstan",
105
+ "Laos",
106
+ "Latvia",
107
+ "Liechtenstein",
108
+ "Lithuania",
109
+ "Luxembourg",
110
+ "Macau",
111
+ "Macedonia",
112
+ "Madagascar",
113
+ "Malawi",
114
+ "Malaysia",
115
+ "Maldives",
116
+ "Malta",
117
+ "Marshall Islands",
118
+ "Martinique",
119
+ "Mauritania",
120
+ "Mauritius",
121
+ "Mayotte",
122
+ "Mexico",
123
+ "Moldova",
124
+ "Monaco",
125
+ "Mongolia",
126
+ "Montserrat",
127
+ "Mozambique",
128
+ "Namibia",
129
+ "Nauru",
130
+ "Nepal",
131
+ "Netherlands",
132
+ "Netherlands Antilles",
133
+ "New Caledonia",
134
+ "New Zealand",
135
+ "Nicaragua",
136
+ "Niger",
137
+ "Niue",
138
+ "Norway",
139
+ "Palau",
140
+ "Panama",
141
+ "Papua New Guinea",
142
+ "Paraguay",
143
+ "Peru",
144
+ "Philippines",
145
+ "Poland",
146
+ "Portugal",
147
+ "Puerto Rico",
148
+ "Qatar",
149
+ "Romania",
150
+ "Rwanda",
151
+ "Saint Helena",
152
+ "Saint Kitts-Nevis",
153
+ "Saint Lucia",
154
+ "Saint Pierre and Miquelon",
155
+ "Saint Vincent and the Grenadines",
156
+ "San Marino",
157
+ "Saudi Arabia",
158
+ "Senegal",
159
+ "Seychelles",
160
+ "Sierra Leone",
161
+ "Singapore",
162
+ "Slovakia",
163
+ "Slovenia",
164
+ "Solomon Islands",
165
+ "South Africa",
166
+ "Spain",
167
+ "Sri Lanka",
168
+ "Sudan",
169
+ "Suriname",
170
+ "Svalbard",
171
+ "Swaziland",
172
+ "Sweden",
173
+ "Switzerland",
174
+ "Tahiti",
175
+ "Taiwan",
176
+ "Tajikistan",
177
+ "Tanzania",
178
+ "Thailand",
179
+ "Togo",
180
+ "Tonga",
181
+ "Trinidad and Tobago",
182
+ "Tunisia",
183
+ "Turkey",
184
+ "Turkmenistan",
185
+ "Turks and Caicos Islands",
186
+ "Tuvalu",
187
+ "Uganda",
188
+ "Ukraine",
189
+ "United Arab Emirates",
190
+ "Uruguay",
191
+ "Uzbekistan",
192
+ "Vanuatu",
193
+ "Vatican City State",
194
+ "Venezuela",
195
+ "Virgin Islands (U.S.)",
196
+ "Wallis and Futuna",
197
+ "Western Sahara",
198
+ "Western Samoa",
199
+ "Zimbabwe",
200
+ "Yugoslavia",
201
+ "Zambia"
202
+ ]
@@ -0,0 +1,27 @@
1
+ [
2
+ "COMINT",
3
+ "OSINT",
4
+ "HUMINT",
5
+ "SIGINT",
6
+ "MASINT",
7
+ "GEOINT",
8
+ "IMINT",
9
+ "intelligence",
10
+ "analyst",
11
+ "target",
12
+ "linguist",
13
+ "research",
14
+ "software",
15
+ "network",
16
+ "supply",
17
+ "engineer",
18
+ "finance",
19
+ "cyber",
20
+ "geographic",
21
+ "manager",
22
+ "project",
23
+ "program",
24
+ "hardware",
25
+ "technician",
26
+ "administrative"
27
+ ]
@@ -0,0 +1,93 @@
1
+ require 'json'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'cgi'
5
+ require 'pry'
6
+ require 'requestmanager'
7
+ require 'headless'
8
+ require 'harvesterreporter'
9
+
10
+ load 'securityclearedjobscom/security_cleared_jobs_com_parser.rb'
11
+ load 'util/failure_handler.rb'
12
+
13
# Crawls securityclearedjobs.com (UK listings) for an optional keyword,
# walking every results page and parsing each listing it links to.
class SecurityClearedJobsComCrawler
  include FailureHandler

  # search_term - keyword to search for; nil crawls all UK listings
  # requests    - request manager client used for all page fetches
  # cm_hash     - reporting hash forwarded to HarvesterReporter
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @site_url = "https://www.securityclearedjobs.com"
    @query_base_url = set_base_url

    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Set the base url for the query, URL-escaping the keyword when present.
  def set_base_url
    if @search_term == nil
      return @site_url+"/searchjobs/?countrycode=GB"
    else
      return @site_url+"/searchjobs/?countrycode=GB&Keywords="+CGI.escape(@search_term)
    end
  end

  # Fetch a page through the shared retry helper.
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Get the total page count by reading the last paginator link's &Page=
  # parameter. Returns 1 when there is no paginator or the link carries no
  # page number (a single page of results).
  # BUG FIX: the original fell through and returned nil when the paginator
  # was missing entirely, which made #crawl iterate (1..0) and silently
  # skip a legitimate single page of results.
  def get_total_pagecount
    initial_page = Nokogiri::HTML.parse(load_next_page(1))
    navbar = initial_page.css(".paginator__item").last
    last_page_link = navbar.css("a")[0]['href'] if navbar

    return 1 unless last_page_link

    page_count = last_page_link.split("&Page=")[1].to_i
    page_count == 0 ? 1 : page_count
  end

  # Load results page number page_num.
  def load_next_page(page_num)
    next_page_url = @query_base_url + "&Page="+page_num.to_s
    return get_page(next_page_url)
  end

  # Collect the absolute URLs of all listing links on a results page.
  def save_result_links(page)
    html = Nokogiri::HTML.parse(page)
    return html.css(".lister__header").css("a").map{|e| @site_url+e['href']}
  end

  # Parse all the listings on a single results page and hand the parsed
  # records to the reporter (nil parses are dropped).
  def parse_listings(page)
    listing_links = save_result_links(page)
    found_listings = Array.new

    listing_links.each do |listing|
      parser = SecurityClearedJobsComParser.new(listing, get_page(listing), @requests)
      parsed_listing = parser.parse
      found_listings.push(parsed_listing) if parsed_listing
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Crawls every results page for the query.
  def crawl
    total_pagecount = get_total_pagecount

    # Load and parse each page in order
    (1..total_pagecount.to_i).each do |page_num|
      next_page = load_next_page(page_num)
      parse_listings(next_page)
    end
  end

  # Output accumulated results as JSON (delegated to the reporter).
  def gen_json
    return @reporter.gen_json
  end
end
93
+
@@ -0,0 +1,115 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+ require 'pry'
4
+
5
+ load 'util/failure_handler.rb'
6
+
7
# Parses a single securityclearedjobs.com listing page into a Hash of job
# fields, re-fetching and retrying a bounded number of times on failure.
class SecurityClearedJobsComParser
  include FailureHandler

  # url      - listing URL (kept for output and for re-fetching on retry)
  # html     - raw HTML string of the listing page
  # requests - request manager client used for retry fetches
  def initialize(url, html, requests=nil)
    @url = url
    @requests = requests
    @html = html
    @page = Nokogiri::HTML.parse(@html)
    # BUG FIX: the retry counter was never initialized, so the first
    # `@i += 1` in #parse's rescue raised NoMethodError on nil instead
    # of retrying.
    @i = 0
  end

  # Parse the job listing into a Hash; on any error, re-fetch the page and
  # retry up to 10 times. Returns nil once retries are exhausted.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        salary: salary,
        posting_date: posting_date,
        closing_date: closing_date,
        job_number: job_number,
        contact_person: contact_person,
        employment_status: employment_status,
        required_clearance: required_clearance,
        job_category: job_category,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        # BUG FIX: the original stored the parsed Nokogiri document into
        # @html (corrupting the html: output field) and never refreshed
        # @page, so every retry re-parsed the same stale document. Keep
        # @html as the raw fetched string and rebuild @page from it.
        @html = get_retry(@url, @requests, @i)
        @page = Nokogiri::HTML.parse(@html)
        parse
      end
    end
  end

  # Get the name of the hiring company.
  def company_name
    @page.css("div.cf[itemprop='hiringOrganization']")[0].css("span[itemprop='name']").text
  end

  # Get the location.
  def location
    get_element("Location")
  end

  # Get the salary.
  def salary
    get_element("Salary")
  end

  # Get the posting date.
  def posting_date
    Date.parse(get_element("Posted"))
  end

  # Get the date the listing closes.
  def closing_date
    Date.parse(get_element("Closes"))
  end

  # Get the site's job reference number.
  def job_number
    get_element("Ref")
  end

  # Get the contact person.
  def contact_person
    get_element("Contact")
  end

  # Get the employment status (e.g. the site's "Job Type" field).
  def employment_status
    get_element("Job Type")
  end

  # Gets the clearance levels required, split into an array.
  def required_clearance
    get_element("Clearance Level").split(", ")
  end

  # Gets the sectors of the job, split into an array.
  def job_category
    get_element("Sector").split(", ")
  end

  # Get the job title.
  def job_title
    @page.css("h1[itemprop='title']").text
  end

  # Get the job description as HTML.
  def job_description
    @page.css("div[itemprop='description']").to_html
  end

  # Get the job description without HTML, preserving <br> line breaks.
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Get the <dd> value for the labeled field, with whitespace collapsed.
  # Returns nil when no div.cf contains the label.
  def get_element(field_name)
    element = @page.css("div.cf").select{|d| d.text.include?(field_name)}
    # .strip already trims both ends; the original's extra .lstrip was a no-op
    element[0].css("dd").text.strip.gsub(/\s+/, " ") if !element.empty?
  end
end