tsjobcrawler 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/COPYING +674 -0
- data/Gemfile +7 -0
- data/README.md +28 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/clearancejobscom/clearance_jobs_com_crawler.rb +86 -0
- data/lib/clearancejobscom/clearance_jobs_com_parser.rb +137 -0
- data/lib/clearedjobsnet/cleared_jobs_net_crawler.rb +141 -0
- data/lib/clearedjobsnet/cleared_jobs_net_parser.rb +93 -0
- data/lib/clearedjobsnet/get_all_cleared_jobs.rb +77 -0
- data/lib/clearedjobsnet/terms/clearance_levels.json +15 -0
- data/lib/clearedjobsnet/terms/company_names.json +17 -0
- data/lib/clearedjobsnet/terms/country_names.json +202 -0
- data/lib/clearedjobsnet/terms/search_terms.json +27 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb +93 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb +115 -0
- data/lib/tsjobcrawler.rb +52 -0
- data/lib/util/failure_handler.rb +22 -0
- data/tsjobcrawler.gemspec +27 -0
- metadata +162 -0
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'headless'
|
3
|
+
require 'requestmanager'
|
4
|
+
load 'clearedjobsnet/cleared_jobs_net_crawler.rb'
|
5
|
+
|
6
|
+
# Collects as many ClearedJobsNet listings as possible by running the
# crawler through several different strategies (blank search, clearance
# level, country, company page, search term) and keeping only the unique
# results across all of them.
class GetAllClearedJobs
  # requests - shared request manager handed to each ClearedJobsNetCrawler
  # cm_hash  - config hash forwarded to each ClearedJobsNetCrawler
  def initialize(requests, cm_hash)
    @output = Array.new
    @requests = requests
    # BUG FIX: was a bare `@cm_hash` expression, which silently discarded
    # the argument and left @cm_hash nil for every crawler run.
    @cm_hash = cm_hash
  end

  # Run every crawl strategy in sequence
  def crawl
    get_first_1000
    get_by_clearance
    get_by_country
    get_by_company
    get_by_searchterm
  end

  # Get the most recent jobs from a blank ("all") search
  def get_first_1000
    start_crawler("all")
  end

  # Crawl once per security clearance level
  def get_by_clearance
    clearance_levels = JSON.parse(File.read(terms_path("clearance_levels.json")))
    crawl_each(clearance_levels, "security_clearance")
  end

  # Crawl once per country
  def get_by_country
    country_names = JSON.parse(File.read(terms_path("country_names.json")))
    crawl_each(country_names, "country")
  end

  # Crawl each known company page
  def get_by_company
    company_names = JSON.parse(File.read(terms_path("company_names.json")))
    crawl_each(company_names, "company_page")
  end

  # Crawl once per generic search term (no filter applied)
  def get_by_searchterm
    search_terms = JSON.parse(File.read(terms_path("search_terms.json")))
    crawl_each(search_terms)
  end

  # Run the crawler for every term in term_list, optionally under a filter
  def crawl_each(term_list, filter_name=nil)
    term_list.each do |term|
      start_crawler(term, filter_name)
    end
  end

  # Run one crawler pass and fold its results into the output
  def start_crawler(search_term, filter=nil)
    c = ClearedJobsNetCrawler.new(search_term, filter, @requests, @cm_hash)
    c.crawl_listings
    save_listings(c.gen_json)
  end

  # Merge a JSON array of listings into @output, keeping only unique entries
  # (Array#| preserves order and drops duplicates).
  def save_listings(listings)
    @output = @output | JSON.parse(listings)
  end

  # Generates output JSON for everything collected so far
  def gen_json
    return JSON.pretty_generate(@output)
  end

  private

  # Resolve a terms file relative to this source file so crawling does not
  # depend on the process working directory (File.read ignores $LOAD_PATH).
  def terms_path(filename)
    File.expand_path(File.join("terms", filename), __dir__)
  end
end
|
77
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
[
|
2
|
+
"DoJ",
|
3
|
+
"IRS",
|
4
|
+
"Secret",
|
5
|
+
"Top Secret",
|
6
|
+
"Top Secret / SCI",
|
7
|
+
"Top Secret / SCI + Poly",
|
8
|
+
"Top Secret / SCI + CI Poly",
|
9
|
+
"Top Secret / SCI + Full Scope Poly",
|
10
|
+
"DHS",
|
11
|
+
"DoE",
|
12
|
+
"Public Trust",
|
13
|
+
"Confidential",
|
14
|
+
"Security Clearance Required"
|
15
|
+
]
|
@@ -0,0 +1,17 @@
|
|
1
|
+
[
|
2
|
+
"sos-international-llc-0701",
|
3
|
+
"leidos-0062",
|
4
|
+
"caci-0108",
|
5
|
+
"northrop-grumman-0388",
|
6
|
+
"booz-allen-hamilton-0816",
|
7
|
+
"raytheon--0833",
|
8
|
+
"mantech-international-155811",
|
9
|
+
"csra-0194",
|
10
|
+
"engility-0380",
|
11
|
+
"hewlett-packard-enterprise-company-0212",
|
12
|
+
"pae-1023",
|
13
|
+
"jacobs-0868",
|
14
|
+
"los-alamos-national-laboratory-200019",
|
15
|
+
"amazon-web-services-0834",
|
16
|
+
"deloitte-0086"
|
17
|
+
]
|
@@ -0,0 +1,202 @@
|
|
1
|
+
[
|
2
|
+
"Canada",
|
3
|
+
"United States",
|
4
|
+
"United Kingdom",
|
5
|
+
"Australia",
|
6
|
+
"Brazil",
|
7
|
+
"Afghanistan",
|
8
|
+
"Albania",
|
9
|
+
"American Samoa",
|
10
|
+
"Andorra",
|
11
|
+
"Angola",
|
12
|
+
"Anguilla",
|
13
|
+
"Antigua and Barbuda",
|
14
|
+
"Argentina",
|
15
|
+
"Armenia",
|
16
|
+
"Aruba",
|
17
|
+
"Austria",
|
18
|
+
"Azerbaijan Republic",
|
19
|
+
"Bahamas",
|
20
|
+
"Bahrain",
|
21
|
+
"Bangladesh",
|
22
|
+
"Barbados",
|
23
|
+
"Belarus",
|
24
|
+
"Belgium",
|
25
|
+
"Belize",
|
26
|
+
"Benin",
|
27
|
+
"Bermuda",
|
28
|
+
"Bhutan",
|
29
|
+
"Bolivia",
|
30
|
+
"Bosnia and Herzegovina",
|
31
|
+
"Botswana",
|
32
|
+
"British Virgin Islands",
|
33
|
+
"Brunei Darussalam",
|
34
|
+
"Bulgaria",
|
35
|
+
"Burkina Faso",
|
36
|
+
"Burma",
|
37
|
+
"Burundi",
|
38
|
+
"Cambodia",
|
39
|
+
"Cameroon",
|
40
|
+
"Cape Verde Islands",
|
41
|
+
"Cayman Islands",
|
42
|
+
"Central African Republic",
|
43
|
+
"Chile",
|
44
|
+
"Colombia",
|
45
|
+
"Comoros",
|
46
|
+
"Cook Islands",
|
47
|
+
"Costa Rica",
|
48
|
+
"Cote d Ivoire (Ivory Coast)",
|
49
|
+
"Croatia, Republic of",
|
50
|
+
"Cuba",
|
51
|
+
"Cyprus",
|
52
|
+
"Czech Republic",
|
53
|
+
"Denmark",
|
54
|
+
"Djibouti",
|
55
|
+
"Dominica",
|
56
|
+
"Dominican Republic",
|
57
|
+
"Ecuador",
|
58
|
+
"El Salvador",
|
59
|
+
"Equatorial Guinea",
|
60
|
+
"Eritrea",
|
61
|
+
"Estonia",
|
62
|
+
"Falkland Islands (Islas Malvinas)",
|
63
|
+
"Fiji",
|
64
|
+
"Finland",
|
65
|
+
"France",
|
66
|
+
"French Guiana",
|
67
|
+
"French Polynesia",
|
68
|
+
"Gabon Republic",
|
69
|
+
"Gambia",
|
70
|
+
"Georgia",
|
71
|
+
"Germany",
|
72
|
+
"Ghana",
|
73
|
+
"Gibraltar",
|
74
|
+
"Greece",
|
75
|
+
"Greenland",
|
76
|
+
"Grenada",
|
77
|
+
"Guadeloupe",
|
78
|
+
"Guam",
|
79
|
+
"Guatemala",
|
80
|
+
"Guernsey",
|
81
|
+
"Guinea",
|
82
|
+
"Guinea-Bissau",
|
83
|
+
"Guyana",
|
84
|
+
"Haiti",
|
85
|
+
"Honduras",
|
86
|
+
"Hong Kong",
|
87
|
+
"Hungary",
|
88
|
+
"Iceland",
|
89
|
+
"Indonesia",
|
90
|
+
"Iraq",
|
91
|
+
"Ireland",
|
92
|
+
"Israel",
|
93
|
+
"Italy",
|
94
|
+
"Jamaica",
|
95
|
+
"Jan Mayen",
|
96
|
+
"Japan",
|
97
|
+
"Jersey",
|
98
|
+
"Jordan",
|
99
|
+
"Kazakhstan",
|
100
|
+
"Kiribati",
|
101
|
+
"Korea, South",
|
102
|
+
"Kosovo",
|
103
|
+
"Kuwait",
|
104
|
+
"Kyrgyzstan",
|
105
|
+
"Laos",
|
106
|
+
"Latvia",
|
107
|
+
"Liechtenstein",
|
108
|
+
"Lithuania",
|
109
|
+
"Luxembourg",
|
110
|
+
"Macau",
|
111
|
+
"Macedonia",
|
112
|
+
"Madagascar",
|
113
|
+
"Malawi",
|
114
|
+
"Malaysia",
|
115
|
+
"Maldives",
|
116
|
+
"Malta",
|
117
|
+
"Marshall Islands",
|
118
|
+
"Martinique",
|
119
|
+
"Mauritania",
|
120
|
+
"Mauritius",
|
121
|
+
"Mayotte",
|
122
|
+
"Mexico",
|
123
|
+
"Moldova",
|
124
|
+
"Monaco",
|
125
|
+
"Mongolia",
|
126
|
+
"Montserrat",
|
127
|
+
"Mozambique",
|
128
|
+
"Namibia",
|
129
|
+
"Nauru",
|
130
|
+
"Nepal",
|
131
|
+
"Netherlands",
|
132
|
+
"Netherlands Antilles",
|
133
|
+
"New Caledonia",
|
134
|
+
"New Zealand",
|
135
|
+
"Nicaragua",
|
136
|
+
"Niger",
|
137
|
+
"Niue",
|
138
|
+
"Norway",
|
139
|
+
"Palau",
|
140
|
+
"Panama",
|
141
|
+
"Papua New Guinea",
|
142
|
+
"Paraguay",
|
143
|
+
"Peru",
|
144
|
+
"Philippines",
|
145
|
+
"Poland",
|
146
|
+
"Portugal",
|
147
|
+
"Puerto Rico",
|
148
|
+
"Qatar",
|
149
|
+
"Romania",
|
150
|
+
"Rwanda",
|
151
|
+
"Saint Helena",
|
152
|
+
"Saint Kitts-Nevis",
|
153
|
+
"Saint Lucia",
|
154
|
+
"Saint Pierre and Miquelon",
|
155
|
+
"Saint Vincent and the Grenadines",
|
156
|
+
"San Marino",
|
157
|
+
"Saudi Arabia",
|
158
|
+
"Senegal",
|
159
|
+
"Seychelles",
|
160
|
+
"Sierra Leone",
|
161
|
+
"Singapore",
|
162
|
+
"Slovakia",
|
163
|
+
"Slovenia",
|
164
|
+
"Solomon Islands",
|
165
|
+
"South Africa",
|
166
|
+
"Spain",
|
167
|
+
"Sri Lanka",
|
168
|
+
"Sudan",
|
169
|
+
"Suriname",
|
170
|
+
"Svalbard",
|
171
|
+
"Swaziland",
|
172
|
+
"Sweden",
|
173
|
+
"Switzerland",
|
174
|
+
"Tahiti",
|
175
|
+
"Taiwan",
|
176
|
+
"Tajikistan",
|
177
|
+
"Tanzania",
|
178
|
+
"Thailand",
|
179
|
+
"Togo",
|
180
|
+
"Tonga",
|
181
|
+
"Trinidad and Tobago",
|
182
|
+
"Tunisia",
|
183
|
+
"Turkey",
|
184
|
+
"Turkmenistan",
|
185
|
+
"Turks and Caicos Islands",
|
186
|
+
"Tuvalu",
|
187
|
+
"Uganda",
|
188
|
+
"Ukraine",
|
189
|
+
"United Arab Emirates",
|
190
|
+
"Uruguay",
|
191
|
+
"Uzbekistan",
|
192
|
+
"Vanuatu",
|
193
|
+
"Vatican City State",
|
194
|
+
"Venezuela",
|
195
|
+
"Virgin Islands (U.S.)",
|
196
|
+
"Wallis and Futuna",
|
197
|
+
"Western Sahara",
|
198
|
+
"Western Samoa",
|
199
|
+
"Zimbabwe",
|
200
|
+
"Yugoslavia",
|
201
|
+
"Zambia"
|
202
|
+
]
|
@@ -0,0 +1,27 @@
|
|
1
|
+
[
|
2
|
+
"COMINT",
|
3
|
+
"OSINT",
|
4
|
+
"HUMINT",
|
5
|
+
"SIGINT",
|
6
|
+
"MASINT",
|
7
|
+
"GEOINT",
|
8
|
+
"IMINT",
|
9
|
+
"intelligence",
|
10
|
+
"analyst",
|
11
|
+
"target",
|
12
|
+
"linguist",
|
13
|
+
"research",
|
14
|
+
"software",
|
15
|
+
"network",
|
16
|
+
"supply",
|
17
|
+
"engineer",
|
18
|
+
"finance",
|
19
|
+
"cyber",
|
20
|
+
"geographic",
|
21
|
+
"manager",
|
22
|
+
"project",
|
23
|
+
"program",
|
24
|
+
"hardware",
|
25
|
+
"technician",
|
26
|
+
"administrative"
|
27
|
+
]
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'cgi'
|
5
|
+
require 'pry'
|
6
|
+
require 'requestmanager'
|
7
|
+
require 'headless'
|
8
|
+
require 'harvesterreporter'
|
9
|
+
|
10
|
+
load 'securityclearedjobscom/security_cleared_jobs_com_parser.rb'
|
11
|
+
load 'util/failure_handler.rb'
|
12
|
+
|
13
|
+
# Crawls securityclearedjobs.com (GB listings) for an optional keyword,
# walking every results page and parsing each listing it links to.
class SecurityClearedJobsComCrawler
  include FailureHandler

  # search_term - keyword to search for (nil crawls all GB listings)
  # requests    - optional shared request manager used by get_retry
  # cm_hash     - config hash forwarded to HarvesterReporter
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @site_url = "https://www.securityclearedjobs.com"
    @query_base_url = set_base_url

    @reporter = HarvesterReporter.new(cm_hash)
  end

  # Set the base url for the query; always restricted to countrycode=GB
  def set_base_url
    if @search_term == nil
      return @site_url+"/searchjobs/?countrycode=GB"
    else
      return @site_url+"/searchjobs/?countrycode=GB&Keywords="+CGI.escape(@search_term)
    end
  end

  # Fetch a page with retry handling (get_retry comes from FailureHandler)
  def get_page(url)
    get_retry(url, @requests, 0)
  end

  # Get the total number of result pages.
  # Always returns an Integer >= 1.
  def get_total_pagecount
    initial_page = Nokogiri::HTML.parse(load_next_page(1))
    navbar = initial_page.css(".paginator__item").last
    last_page_anchor = navbar && navbar.css("a")[0]

    # BUG FIX: the original returned nil when the paginator (or its link)
    # was missing, which made #crawl iterate (1..0) and skip the only page;
    # it also raised NoMethodError when the paginator item had no <a>.
    return 1 unless last_page_anchor

    page_count = last_page_anchor['href'].split("&Page=")[1].to_i
    page_count == 0 ? 1 : page_count
  end

  # Load the results page with the given (1-based) page number
  def load_next_page(page_num)
    next_page_url = @query_base_url + "&Page="+page_num.to_s
    return get_page(next_page_url)
  end

  # Extract the absolute listing URLs from a results page
  def save_result_links(page)
    html = Nokogiri::HTML.parse(page)
    return html.css(".lister__header").css("a").map{|e| @site_url+e['href']}
  end

  # Parse every listing linked from a single results page and report them
  def parse_listings(page)
    listing_links = save_result_links(page)
    found_listings = Array.new

    listing_links.each do |listing|
      parser = SecurityClearedJobsComParser.new(listing, get_page(listing), @requests)
      parsed_listing = parser.parse
      found_listings.push(parsed_listing) if parsed_listing
    end

    @reporter.report_results(found_listings, listing_links.first)
  end

  # Crawls all of the listings across every results page
  def crawl
    total_pagecount = get_total_pagecount

    # Load each page
    (1..total_pagecount.to_i).each do |page_num|
      next_page = load_next_page(page_num)
      parse_listings(next_page)
    end
  end

  # JSON for all listings reported so far
  def gen_json
    return @reporter.gen_json
  end
end
|
93
|
+
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'pry'
|
4
|
+
|
5
|
+
load 'util/failure_handler.rb'
|
6
|
+
|
7
|
+
# Parses a single securityclearedjobs.com job listing page into a Hash,
# refetching the page (up to 10 times) if parsing fails.
class SecurityClearedJobsComParser
  include FailureHandler

  # url      - canonical URL of the listing (kept in the output)
  # html     - raw HTML String of the listing page
  # requests - optional request manager used when refetching on failure
  def initialize(url, html, requests=nil)
    @url = url
    @requests = requests
    @html = html
    @page = Nokogiri::HTML.parse(@html)
    # BUG FIX: @i was never initialized, so the rescue branch's `@i += 1`
    # raised NoMethodError (nil + 1) and the retry logic never ran.
    @i = 0
  end

  # Parse the job listing into a Hash of fields, or nil after 10 failures.
  def parse
    begin
      return {
        url: @url,
        company_name: company_name,
        location: location,
        salary: salary,
        posting_date: posting_date,
        closing_date: closing_date,
        job_number: job_number,
        contact_person: contact_person,
        employment_status: employment_status,
        required_clearance: required_clearance,
        job_category: job_category,
        job_title: job_title,
        job_description: job_description,
        job_description_plaintext: job_description_plaintext,
        html: @html
      }
    rescue
      @i += 1
      if @i < 10
        # BUG FIX: the original assigned a parsed Nokogiri document to @html
        # (changing the `html:` field's type) and never refreshed @page, so
        # each retry re-parsed the same broken DOM. Keep @html as the raw
        # string and rebuild @page from it.
        @html = get_retry(@url, @requests, @i)
        @page = Nokogiri::HTML.parse(@html)
        parse
      end
    end
  end

  # Get the name of the hiring company
  def company_name
    @page.css("div.cf[itemprop='hiringOrganization']")[0].css("span[itemprop='name']").text
  end

  # Get the location
  def location
    get_element("Location")
  end

  # Get the salary
  def salary
    get_element("Salary")
  end

  # Get the posting date as a Date
  def posting_date
    Date.parse(get_element("Posted"))
  end

  # Get the closing date as a Date
  def closing_date
    Date.parse(get_element("Closes"))
  end

  # Get the job reference number
  def job_number
    get_element("Ref")
  end

  # Get the contact person
  def contact_person
    get_element("Contact")
  end

  # Get the employment status (full time, contract, ...)
  def employment_status
    get_element("Job Type")
  end

  # Gets the clearance levels required, as an Array
  def required_clearance
    get_element("Clearance Level").split(", ")
  end

  # Gets the sectors of the job, as an Array
  def job_category
    get_element("Sector").split(", ")
  end

  # Get the job title
  def job_title
    @page.css("h1[itemprop='title']").text
  end

  # Get the job description as HTML
  def job_description
    @page.css("div[itemprop='description']").to_html
  end

  # Get the job description with <br> variants converted to newlines and
  # all other markup stripped
  def job_description_plaintext
    Nokogiri::HTML.parse(job_description.gsub('<br />',"\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
  end

  # Find the labeled metadata field (a div.cf whose text contains
  # field_name) and return its <dd> text with whitespace collapsed;
  # nil when the field is absent.
  def get_element(field_name)
    element = @page.css("div.cf").select{|d| d.text.include?(field_name)}
    # strip already removes leading whitespace; the original's extra
    # .lstrip was redundant
    element[0].css("dd").text.strip.gsub(/\s+/, " ") if !element.empty?
  end
end
|