tsjobcrawler 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,52 @@
1
+ require 'json'
2
+ load 'securityclearedjobscom/security_cleared_jobs_com_crawler.rb'
3
+ load 'clearancejobscom/clearance_jobs_com_crawler.rb'
4
+ load 'clearedjobsnet/cleared_jobs_net_crawler.rb'
5
+ load 'clearedjobsnet/get_all_cleared_jobs.rb'
6
+
7
# Crawls all the jobs that require clearance.
#
# Drives one crawler per supported listing site and, when no cm_hash was
# supplied, accumulates each crawler's parsed JSON output so it can be
# emitted by #gen_json.
class TSJobCrawler
  # @param search_term [String, nil] term handed to the site crawlers; when
  #   nil, #cleared_jobs_net uses GetAllClearedJobs instead of a term search
  # @param requests [Object, nil] passed through unchanged to every crawler
  # @param cm_hash [Object, nil] passed through unchanged to every crawler;
  #   when it is non-nil this class does not collect any results itself
  def initialize(search_term, requests = nil, cm_hash = nil)
    @search_term = search_term
    @requests = requests
    @cm_hash = cm_hash
    @output = []
  end

  # Crawl all of the listing sites, in a fixed order.
  #
  # @return [Array, nil] whatever #cleared_jobs_net returns
  def crawl_jobs
    security_cleared_jobs_com
    clearance_jobs_com
    cleared_jobs_net
  end

  # Crawl the site handled by SecurityClearedJobsComCrawler.
  #
  # @return [Array, nil] the accumulated output, or nil when cm_hash is set
  def security_cleared_jobs_com
    crawler = SecurityClearedJobsComCrawler.new(@search_term, @requests, @cm_hash)
    crawler.crawl
    collect_results(crawler)
  end

  # Crawl the site handled by ClearanceJobsComCrawler.
  #
  # @return [Array, nil] the accumulated output, or nil when cm_hash is set
  def clearance_jobs_com
    crawler = ClearanceJobsComCrawler.new(@search_term, @requests, @cm_hash)
    crawler.crawl
    collect_results(crawler)
  end

  # Crawl the cleared-jobs site: everything when there is no search term,
  # otherwise only the listings matching the term.
  #
  # @return [Array, nil] the accumulated output, or nil when cm_hash is set
  def cleared_jobs_net
    if @search_term.nil?
      crawler = GetAllClearedJobs.new(@requests, @cm_hash)
      crawler.crawl
    else # Scrape by search term
      crawler = ClearedJobsNetCrawler.new(@search_term, nil, @requests, @cm_hash)
      crawler.crawl_listings
    end
    collect_results(crawler)
  end

  # Generate output
  #
  # @return [String] pretty-printed JSON of everything collected so far
  def gen_json
    JSON.pretty_generate(@output)
  end

  private

  # Append a finished crawler's parsed JSON output to the collected results.
  # Nothing is collected when a cm_hash was supplied.
  #
  # @param crawler [#gen_json] a crawler that has already run
  # @return [Array, nil] the new accumulated output, or nil when skipped
  def collect_results(crawler)
    return unless @cm_hash.nil?

    @output += JSON.parse(crawler.gen_json)
  end
end
52
+
@@ -0,0 +1,22 @@
1
+ require 'requestmanager'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
# Page fetching with bounded retries and a randomized, growing delay
# between attempts.
module FailureHandler
  # Number of retries allowed after the first failed attempt.
  MAX_RETRIES = 10

  # Fetch +url+, retrying whenever the attempt raises a StandardError.
  #
  # Before retry number k the method sleeps k * rand(1..10) seconds.
  #
  # @param url [String] address (or local path) to fetch
  # @param requests [#get_page, nil] when given, the page is fetched with
  #   requests.get_page(url); otherwise the url is read directly
  # @param i [Integer] retries already performed; callers normally omit it
  # @return [Object, nil] the fetched content, or nil once MAX_RETRIES
  #   retries have failed
  def get_retry(url, requests, i = 0)
    puts "crawling " + url
    begin
      return requests.get_page(url) if requests

      read_url_directly(url)
    rescue StandardError
      # Best-effort: give up quietly once the retry budget is spent.
      return nil unless i < MAX_RETRIES

      i += 1
      sleep(i * rand(1..10))
      get_retry(url, requests, i)
    end
  end

  private

  # Read +url+ without a request manager and return its body as a String.
  #
  # URLs are read through open-uri's URI#read, which returns the body for
  # any response size. The previous File.read(open(url)) only worked when
  # open-uri handed back a Tempfile, failed on the StringIO it returns for
  # small responses, and relied on Kernel#open, which no longer opens URLs
  # on Ruby 3.0+ and runs a subprocess for strings starting with "|".
  # Anything that is not scheme://... is treated as a local file path.
  def read_url_directly(url)
    # Square brackets are not valid in a URI, so percent-encode them.
    escaped = url.gsub('[', '%5B').gsub(']', '%5D')
    if escaped =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
      URI.parse(escaped).read
    else
      File.read(escaped)
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
# Put this gem's lib/ directory on the load path (once) while the
# specification is being evaluated.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

# Gem specification for tsjobcrawler.
Gem::Specification.new do |gem|
  # Identity and contact details.
  gem.name     = 'tsjobcrawler'
  gem.version  = '0.1'
  gem.authors  = ['M. C. McGrath']
  gem.email    = ['shidash@shidash.com']

  gem.summary     = 'Crawls job listing websites for jobs requiring security clearance.'
  gem.description = 'Crawls job listing websites for jobs requiring security clearance.'
  gem.homepage    = 'https://github.com/TransparencyToolkit/TSJobCrawler'

  # Package every git-tracked file except those under test/, spec/ or
  # features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path =~ %r{^(test|spec|features)/} }

  # Executables are the base names of whatever is packaged under exe/.
  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  gem.require_paths = ['lib']

  # Development-only tooling, with the version constraints it is pinned to.
  { 'bundler' => '~> 1.10', 'rake' => '~> 10.0' }.each do |dep_name, constraint|
    gem.add_development_dependency dep_name, constraint
  end

  # Runtime dependencies, declared without version constraints.
  %w[nokogiri requestmanager harvesterreporter pry headless].each do |dep_name|
    gem.add_runtime_dependency dep_name
  end
end
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tsjobcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-03-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: requestmanager
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: harvesterreporter
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pry
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: headless
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Crawls job listing websites for jobs requiring security clearance.
112
+ email:
113
+ - shidash@shidash.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".rspec"
119
+ - COPYING
120
+ - Gemfile
121
+ - README.md
122
+ - bin/console
123
+ - bin/setup
124
+ - lib/clearancejobscom/clearance_jobs_com_crawler.rb
125
+ - lib/clearancejobscom/clearance_jobs_com_parser.rb
126
+ - lib/clearedjobsnet/cleared_jobs_net_crawler.rb
127
+ - lib/clearedjobsnet/cleared_jobs_net_parser.rb
128
+ - lib/clearedjobsnet/get_all_cleared_jobs.rb
129
+ - lib/clearedjobsnet/terms/clearance_levels.json
130
+ - lib/clearedjobsnet/terms/company_names.json
131
+ - lib/clearedjobsnet/terms/country_names.json
132
+ - lib/clearedjobsnet/terms/search_terms.json
133
+ - lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb
134
+ - lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb
135
+ - lib/tsjobcrawler.rb
136
+ - lib/util/failure_handler.rb
137
+ - tsjobcrawler.gemspec
138
+ homepage: https://github.com/TransparencyToolkit/TSJobCrawler
139
+ licenses: []
140
+ metadata: {}
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubyforge_project:
157
+ rubygems_version: 2.4.8
158
+ signing_key:
159
+ specification_version: 4
160
+ summary: Crawls job listing websites for jobs requiring security clearance.
161
+ test_files: []
162
+ has_rdoc: