tsjobcrawler 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/COPYING +674 -0
- data/Gemfile +7 -0
- data/README.md +28 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/clearancejobscom/clearance_jobs_com_crawler.rb +86 -0
- data/lib/clearancejobscom/clearance_jobs_com_parser.rb +137 -0
- data/lib/clearedjobsnet/cleared_jobs_net_crawler.rb +141 -0
- data/lib/clearedjobsnet/cleared_jobs_net_parser.rb +93 -0
- data/lib/clearedjobsnet/get_all_cleared_jobs.rb +77 -0
- data/lib/clearedjobsnet/terms/clearance_levels.json +15 -0
- data/lib/clearedjobsnet/terms/company_names.json +17 -0
- data/lib/clearedjobsnet/terms/country_names.json +202 -0
- data/lib/clearedjobsnet/terms/search_terms.json +27 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb +93 -0
- data/lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb +115 -0
- data/lib/tsjobcrawler.rb +52 -0
- data/lib/util/failure_handler.rb +22 -0
- data/tsjobcrawler.gemspec +27 -0
- metadata +162 -0
data/lib/clearedjobsnet/get_all_cleared_jobs.rb
@@ -0,0 +1,77 @@
+require 'json'
+require 'headless'
+require 'requestmanager'
+load 'clearedjobsnet/cleared_jobs_net_crawler.rb'
+
+# Get as many jobs as possible
+class GetAllClearedJobs
+  def initialize(requests, cm_hash)
+    @output = Array.new
+    @requests = requests
+    @cm_hash = cm_hash # was a bare `@cm_hash`, a no-op that left the hash unset
+  end
+
+  # Crawl through many options
+  def crawl
+    get_first_1000
+    get_by_clearance
+    get_by_country
+    get_by_company
+    get_by_searchterm
+  end
+
+  # Get the most recent jobs from a blank search
+  def get_first_1000
+    start_crawler("all")
+  end
+
+  # Crawl by security clearance
+  def get_by_clearance
+    clearance_levels = JSON.parse(File.read("clearedjobsnet/terms/clearance_levels.json"))
+    crawl_each(clearance_levels, "security_clearance")
+  end
+
+  # Crawl each country
+  def get_by_country
+    country_names = JSON.parse(File.read("clearedjobsnet/terms/country_names.json"))
+    crawl_each(country_names, "country")
+  end
+
+  # Crawl company pages
+  def get_by_company
+    company_names = JSON.parse(File.read("clearedjobsnet/terms/company_names.json"))
+    crawl_each(company_names, "company_page")
+  end
+
+  # Crawl the search term list
+  def get_by_searchterm
+    search_terms = JSON.parse(File.read("clearedjobsnet/terms/search_terms.json"))
+    crawl_each(search_terms)
+  end
+
+  # Crawl each item
+  def crawl_each(term_list, filter_name=nil)
+    term_list.each do |term|
+      start_crawler(term, filter_name)
+    end
+  end
+
+  # Start the crawler
+  def start_crawler(search_term, filter=nil)
+    c = ClearedJobsNetCrawler.new(search_term, filter, @requests, @cm_hash)
+    c.crawl_listings
+    save_listings(c.gen_json)
+  end
+
+  # Save unique listings in output
+  def save_listings(listings)
+    @output = @output | JSON.parse(listings)
+  end
+
+  # Generate output JSON
+  def gen_json
+    return JSON.pretty_generate(@output)
+  end
+end
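For context, a minimal driver for this class might look like the sketch below. It is not part of the package: the `RequestManager.new` arguments (proxy list, wait-time range, browser count) are assumptions about the requestmanager gem's constructor, and `cm_hash` is passed as `nil` on the assumption that reporting is optional.

# Hypothetical driver sketch, not shipped with the gem
require 'json'
require 'headless'
require 'requestmanager'
load 'clearedjobsnet/get_all_cleared_jobs.rb'

# Run the browser inside a virtual framebuffer, as the requires suggest
headless = Headless.new
headless.start

# Assumed constructor: (proxy list, wait-time range, number of browsers)
requests = RequestManager.new(nil, [0, 0], 1)

crawler = GetAllClearedJobs.new(requests, nil) # nil cm_hash: no reporting hash
crawler.crawl
File.write("cleared_jobs.json", crawler.gen_json)

headless.destroy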
data/lib/clearedjobsnet/terms/clearance_levels.json
@@ -0,0 +1,15 @@
+[
+  "DoJ",
+  "IRS",
+  "Secret",
+  "Top Secret",
+  "Top Secret / SCI",
+  "Top Secret / SCI + Poly",
+  "Top Secret / SCI + CI Poly",
+  "Top Secret / SCI + Full Scope Poly",
+  "DHS",
+  "DoE",
+  "Public Trust",
+  "Confidential",
+  "Security Clearance Required"
+]
data/lib/clearedjobsnet/terms/company_names.json
@@ -0,0 +1,17 @@
+[
+  "sos-international-llc-0701",
+  "leidos-0062",
+  "caci-0108",
+  "northrop-grumman-0388",
+  "booz-allen-hamilton-0816",
+  "raytheon--0833",
+  "mantech-international-155811",
+  "csra-0194",
+  "engility-0380",
+  "hewlett-packard-enterprise-company-0212",
+  "pae-1023",
+  "jacobs-0868",
+  "los-alamos-national-laboratory-200019",
+  "amazon-web-services-0834",
+  "deloitte-0086"
+]
data/lib/clearedjobsnet/terms/country_names.json
@@ -0,0 +1,202 @@
+[
+  "Canada",
+  "United States",
+  "United Kingdom",
+  "Australia",
+  "Brazil",
+  "Afghanistan",
+  "Albania",
+  "American Samoa",
+  "Andorra",
+  "Angola",
+  "Anguilla",
+  "Antigua and Barbuda",
+  "Argentina",
+  "Armenia",
+  "Aruba",
+  "Austria",
+  "Azerbaijan Republic",
+  "Bahamas",
+  "Bahrain",
+  "Bangladesh",
+  "Barbados",
+  "Belarus",
+  "Belgium",
+  "Belize",
+  "Benin",
+  "Bermuda",
+  "Bhutan",
+  "Bolivia",
+  "Bosnia and Herzegovina",
+  "Botswana",
+  "British Virgin Islands",
+  "Brunei Darussalam",
+  "Bulgaria",
+  "Burkina Faso",
+  "Burma",
+  "Burundi",
+  "Cambodia",
+  "Cameroon",
+  "Cape Verde Islands",
+  "Cayman Islands",
+  "Central African Republic",
+  "Chile",
+  "Colombia",
+  "Comoros",
+  "Cook Islands",
+  "Costa Rica",
+  "Cote d Ivoire (Ivory Coast)",
+  "Croatia, Republic of",
+  "Cuba",
+  "Cyprus",
+  "Czech Republic",
+  "Denmark",
+  "Djibouti",
+  "Dominica",
+  "Dominican Republic",
+  "Ecuador",
+  "El Salvador",
+  "Equatorial Guinea",
+  "Eritrea",
+  "Estonia",
+  "Falkland Islands (Islas Malvinas)",
+  "Fiji",
+  "Finland",
+  "France",
+  "French Guiana",
+  "French Polynesia",
+  "Gabon Republic",
+  "Gambia",
+  "Georgia",
+  "Germany",
+  "Ghana",
+  "Gibraltar",
+  "Greece",
+  "Greenland",
+  "Grenada",
+  "Guadeloupe",
+  "Guam",
+  "Guatemala",
+  "Guernsey",
+  "Guinea",
+  "Guinea-Bissau",
+  "Guyana",
+  "Haiti",
+  "Honduras",
+  "Hong Kong",
+  "Hungary",
+  "Iceland",
+  "Indonesia",
+  "Iraq",
+  "Ireland",
+  "Israel",
+  "Italy",
+  "Jamaica",
+  "Jan Mayen",
+  "Japan",
+  "Jersey",
+  "Jordan",
+  "Kazakhstan",
+  "Kiribati",
+  "Korea, South",
+  "Kosovo",
+  "Kuwait",
+  "Kyrgyzstan",
+  "Laos",
+  "Latvia",
+  "Liechtenstein",
+  "Lithuania",
+  "Luxembourg",
+  "Macau",
+  "Macedonia",
+  "Madagascar",
+  "Malawi",
+  "Malaysia",
+  "Maldives",
+  "Malta",
+  "Marshall Islands",
+  "Martinique",
+  "Mauritania",
+  "Mauritius",
+  "Mayotte",
+  "Mexico",
+  "Moldova",
+  "Monaco",
+  "Mongolia",
+  "Montserrat",
+  "Mozambique",
+  "Namibia",
+  "Nauru",
+  "Nepal",
+  "Netherlands",
+  "Netherlands Antilles",
+  "New Caledonia",
+  "New Zealand",
+  "Nicaragua",
+  "Niger",
+  "Niue",
+  "Norway",
+  "Palau",
+  "Panama",
+  "Papua New Guinea",
+  "Paraguay",
+  "Peru",
+  "Philippines",
+  "Poland",
+  "Portugal",
+  "Puerto Rico",
+  "Qatar",
+  "Romania",
+  "Rwanda",
+  "Saint Helena",
+  "Saint Kitts-Nevis",
+  "Saint Lucia",
+  "Saint Pierre and Miquelon",
+  "Saint Vincent and the Grenadines",
+  "San Marino",
+  "Saudi Arabia",
+  "Senegal",
+  "Seychelles",
+  "Sierra Leone",
+  "Singapore",
+  "Slovakia",
+  "Slovenia",
+  "Solomon Islands",
+  "South Africa",
+  "Spain",
+  "Sri Lanka",
+  "Sudan",
+  "Suriname",
+  "Svalbard",
+  "Swaziland",
+  "Sweden",
+  "Switzerland",
+  "Tahiti",
+  "Taiwan",
+  "Tajikistan",
+  "Tanzania",
+  "Thailand",
+  "Togo",
+  "Tonga",
+  "Trinidad and Tobago",
+  "Tunisia",
+  "Turkey",
+  "Turkmenistan",
+  "Turks and Caicos Islands",
+  "Tuvalu",
+  "Uganda",
+  "Ukraine",
+  "United Arab Emirates",
+  "Uruguay",
+  "Uzbekistan",
+  "Vanuatu",
+  "Vatican City State",
+  "Venezuela",
+  "Virgin Islands (U.S.)",
+  "Wallis and Futuna",
+  "Western Sahara",
+  "Western Samoa",
+  "Zimbabwe",
+  "Yugoslavia",
+  "Zambia"
+]
data/lib/clearedjobsnet/terms/search_terms.json
@@ -0,0 +1,27 @@
+[
+  "COMINT",
+  "OSINT",
+  "HUMINT",
+  "SIGINT",
+  "MASINT",
+  "GEOINT",
+  "IMINT",
+  "intelligence",
+  "analyst",
+  "target",
+  "linguist",
+  "research",
+  "software",
+  "network",
+  "supply",
+  "engineer",
+  "finance",
+  "cyber",
+  "geographic",
+  "manager",
+  "project",
+  "program",
+  "hardware",
+  "technician",
+  "administrative"
+]
data/lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb
@@ -0,0 +1,93 @@
+require 'json'
+require 'nokogiri'
+require 'open-uri'
+require 'cgi'
+require 'pry'
+require 'requestmanager'
+require 'headless'
+require 'harvesterreporter'
+
+load 'securityclearedjobscom/security_cleared_jobs_com_parser.rb'
+load 'util/failure_handler.rb'
+
+class SecurityClearedJobsComCrawler
+  include FailureHandler
+
+  def initialize(search_term, requests=nil, cm_hash=nil)
+    @search_term = search_term
+    @requests = requests
+    @site_url = "https://www.securityclearedjobs.com"
+    @query_base_url = set_base_url
+
+    @reporter = HarvesterReporter.new(cm_hash)
+  end
+
+  # Set the base url for the query
+  def set_base_url
+    if @search_term == nil
+      return @site_url+"/searchjobs/?countrycode=GB"
+    else
+      return @site_url+"/searchjobs/?countrycode=GB&Keywords="+CGI.escape(@search_term)
+    end
+  end
+
+  # Get the page
+  def get_page(url)
+    get_retry(url, @requests, 0)
+  end
+
+  # Get the total pagecount
+  def get_total_pagecount
+    initial_page = Nokogiri::HTML.parse(load_next_page(1))
+    navbar = initial_page.css(".paginator__item").last
+    last_page_link = navbar.css("a")[0]['href'] if navbar
+
+    # Handle the case of there being just one page
+    if last_page_link
+      page_count = last_page_link.split("&Page=")[1].to_i
+      page_count == 0 ? (return 1) : (return page_count)
+    end
+    return 1 # no pagination links at all; originally fell through to nil, which skipped the crawl
+  end
+
+  # Load the next page
+  def load_next_page(page_num)
+    next_page_url = @query_base_url + "&Page="+page_num.to_s
+    return get_page(next_page_url)
+  end
+
+  # Save the result links on a page
+  def save_result_links(page)
+    html = Nokogiri::HTML.parse(page)
+    return html.css(".lister__header").css("a").map{|e| @site_url+e['href']}
+  end
+
+  # Parse all the listings on a single page
+  def parse_listings(page)
+    listing_links = save_result_links(page)
+    found_listings = Array.new
+
+    listing_links.each do |listing|
+      parser = SecurityClearedJobsComParser.new(listing, get_page(listing), @requests)
+      parsed_listing = parser.parse
+      found_listings.push(parsed_listing) if parsed_listing
+    end
+
+    @reporter.report_results(found_listings, listing_links.first)
+  end
+
+  # Crawl all of the listings
+  def crawl
+    total_pagecount = get_total_pagecount
+
+    # Load each page
+    (1..total_pagecount.to_i).each do |page_num|
+      next_page = load_next_page(page_num)
+      parse_listings(next_page)
+    end
+  end
+
+  # Output JSON
+  def gen_json
+    return @reporter.gen_json
+  end
+end
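A hypothetical standalone run of this crawler might look like the following; it reuses a `requests` object like the one sketched earlier, and the "analyst" keyword comes from the gem's own search_terms.json.

# Hypothetical sketch, not shipped with the gem
c = SecurityClearedJobsComCrawler.new("analyst", requests)
c.crawl          # pages through securityclearedjobs.com search results
puts c.gen_json  # JSON accumulated by the HarvesterReporter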
data/lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb
@@ -0,0 +1,115 @@
+require 'date' # needed for Date.parse in posting_date/closing_date
+require 'nokogiri'
+require 'open-uri'
+require 'pry'
+
+load 'util/failure_handler.rb'
+
+class SecurityClearedJobsComParser
+  include FailureHandler
+
+  def initialize(url, html, requests=nil)
+    @url = url
+    @requests = requests
+    @html = html
+    @page = Nokogiri::HTML.parse(@html)
+    @i = 0 # retry counter; originally never initialized, so the rescue below raised on nil
+  end
+
+  # Parse the job listing
+  def parse
+    begin
+      return {
+        url: @url,
+        company_name: company_name,
+        location: location,
+        salary: salary,
+        posting_date: posting_date,
+        closing_date: closing_date,
+        job_number: job_number,
+        contact_person: contact_person,
+        employment_status: employment_status,
+        required_clearance: required_clearance,
+        job_category: job_category,
+        job_title: job_title,
+        job_description: job_description,
+        job_description_plaintext: job_description_plaintext,
+        html: @html
+      }
+    rescue
+      @i += 1
+      if @i < 10
+        # Refetch and reparse; originally only @html was replaced (with an
+        # already-parsed document), so @page stayed stale across retries
+        @html = get_retry(@url, @requests, @i)
+        @page = Nokogiri::HTML.parse(@html)
+        parse
+      end
+    end
+  end
+
+  # Get the name of the company
+  def company_name
+    @page.css("div.cf[itemprop='hiringOrganization']")[0].css("span[itemprop='name']").text
+  end
+
+  # Get the location
+  def location
+    get_element("Location")
+  end
+
+  # Get the salary
+  def salary
+    get_element("Salary")
+  end
+
+  # Get the posting date
+  def posting_date
+    Date.parse(get_element("Posted"))
+  end
+
+  # Get the date it closes
+  def closing_date
+    Date.parse(get_element("Closes"))
+  end
+
+  # Get the job number
+  def job_number
+    get_element("Ref")
+  end
+
+  # Get the contact person
+  def contact_person
+    get_element("Contact")
+  end
+
+  # Get the employment status
+  def employment_status
+    get_element("Job Type")
+  end
+
+  # Get the clearance level required
+  def required_clearance
+    get_element("Clearance Level").split(", ")
+  end
+
+  # Get the sector of the job
+  def job_category
+    get_element("Sector").split(", ")
+  end
+
+  # Get the job title
+  def job_title
+    @page.css("h1[itemprop='title']").text
+  end
+
+  # Get the job description
+  def job_description
+    @page.css("div[itemprop='description']").to_html
+  end
+
+  # Get the job description without html
+  def job_description_plaintext
+    Nokogiri::HTML.parse(job_description.gsub('<br />', "\n").gsub('<br>', "\n").gsub('<br/>', "\n")).text
+  end
+
+  # Get the element for the field
+  def get_element(field_name)
+    element = @page.css("div.cf").select{|d| d.text.include?(field_name)}
+    element[0].css("dd").text.strip.gsub(/\s+/, " ") if !element.empty?
+  end
+end
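The parser can also be exercised on a single listing page without the crawler. This sketch assumes a valid listing URL (the path below is a placeholder, not a real listing) and omits `requests`, so the retry path would have nothing to fetch with:

# Hypothetical sketch, not shipped with the gem
require 'open-uri'
load 'securityclearedjobscom/security_cleared_jobs_com_parser.rb'

url = "https://www.securityclearedjobs.com/job/..." # placeholder
html = URI.open(url).read
record = SecurityClearedJobsComParser.new(url, html).parse
puts record[:job_title] if record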