tsjobcrawler 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ require 'json'
2
+ load 'securityclearedjobscom/security_cleared_jobs_com_crawler.rb'
3
+ load 'clearancejobscom/clearance_jobs_com_crawler.rb'
4
+ load 'clearedjobsnet/cleared_jobs_net_crawler.rb'
5
+ load 'clearedjobsnet/get_all_cleared_jobs.rb'
6
+
7
# Runs the crawlers for every supported clearance-job listing site and
# gathers their results.
#
# When no +cm_hash+ is supplied, each crawler's JSON output is parsed and
# appended to an in-memory list that #gen_json serializes. When a +cm_hash+
# is supplied nothing is collected here.
# NOTE(review): presumably the crawlers then report results on their own
# through cm_hash - confirm against the crawler classes.
class TSJobCrawler
  # @param search_term [String, nil] term handed to each crawler; when nil,
  #   #cleared_jobs_net uses GetAllClearedJobs instead of a term search
  # @param requests [Object, nil] passed through unchanged to every crawler
  # @param cm_hash [Object, nil] passed through unchanged to every crawler;
  #   any non-nil value disables local result collection
  def initialize(search_term, requests=nil, cm_hash=nil)
    @search_term = search_term
    @requests = requests
    @cm_hash = cm_hash
    @output = []
  end

  # Crawl all of the listing sites, one after another.
  def crawl_jobs
    security_cleared_jobs_com
    clearance_jobs_com
    cleared_jobs_net
  end

  # Crawl securityclearedjobs.com for the search term.
  def security_cleared_jobs_com
    crawler = SecurityClearedJobsComCrawler.new(@search_term, @requests, @cm_hash)
    crawler.crawl
    collect_output(crawler)
  end

  # Crawl clearancejobs.com for the search term.
  def clearance_jobs_com
    crawler = ClearanceJobsComCrawler.new(@search_term, @requests, @cm_hash)
    crawler.crawl
    collect_output(crawler)
  end

  # Crawl clearedjobs.net: everything when there is no search term,
  # otherwise only the listings matching the term.
  def cleared_jobs_net
    if @search_term.nil?
      crawler = GetAllClearedJobs.new(@requests, @cm_hash)
      crawler.crawl
    else
      # Scrape by search term; the second constructor argument is left nil.
      crawler = ClearedJobsNetCrawler.new(@search_term, nil, @requests, @cm_hash)
      crawler.crawl_listings
    end
    collect_output(crawler)
  end

  # Generate output.
  #
  # @return [String] pretty-printed JSON array of everything collected so far
  def gen_json
    JSON.pretty_generate(@output)
  end

  private

  # Append a finished crawler's parsed JSON results to @output.
  # Skipped (and gen_json is not called) whenever a cm_hash was supplied.
  def collect_output(crawler)
    @output += JSON.parse(crawler.gen_json) if @cm_hash.nil?
  end
end
52
+
@@ -0,0 +1,22 @@
1
+ require 'requestmanager'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
# Mixin that fetches a page and retries with a growing, randomized delay
# whenever the fetch raises.
module FailureHandler
  # Number of retries allowed after a failed attempt before giving up.
  MAX_RETRIES = 10

  # Fetch +url+, retrying on failure.
  #
  # @param url [String] address to fetch
  # @param requests [#get_page, nil] object whose get_page(url) performs the
  #   fetch; when nil/false the URL is downloaded directly with open-uri
  # @param i [Integer] number of retries already used (callers normally
  #   leave this at its default)
  # @return [Object, nil] the value of requests.get_page, or the response
  #   body as a String for direct downloads; nil once all retries have failed
  def get_retry(url, requests, i=0)
    retries_used = i
    loop do
      puts "crawling "+url
      begin
        return requests.get_page(url) if requests

        # Square brackets are not accepted in a URI, so percent-encode them.
        escaped = url.gsub("[", "%5B").gsub("]", "%5D")
        # Read the body straight from the handle open-uri returns. That handle
        # is a StringIO for small responses, which File.read cannot open as a
        # path; the block form also closes the handle when done.
        return URI.parse(escaped).open(&:read)
      rescue StandardError
        # Best effort: give up quietly with nil once the retry budget is spent.
        return nil if retries_used >= MAX_RETRIES

        retries_used += 1
        # Wait longer after every failure, with random jitter.
        sleep(retries_used * rand(1..10))
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
# Make the gem's own lib/ directory loadable while the spec is evaluated.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

# Package definition for the tsjobcrawler gem.
Gem::Specification.new do |gem|
  tagline = 'Crawls job listing websites for jobs requiring security clearance.'

  gem.name     = 'tsjobcrawler'
  gem.version  = '0.1'
  gem.authors  = ['M. C. McGrath']
  gem.email    = ['shidash@shidash.com']

  gem.summary     = tagline
  gem.description = tagline
  gem.homepage    = 'https://github.com/TransparencyToolkit/TSJobCrawler'

  # Ship every git-tracked file except those under test/, spec/ or features/.
  tracked = `git ls-files -z`.split("\x0")
  gem.files = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  gem.bindir = 'exe'
  # Anything tracked under exe/ is exposed as an executable, by base name.
  gem.executables = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ['lib']

  # Development-only tooling, pinned with pessimistic version constraints.
  { 'bundler' => '~> 1.10', 'rake' => '~> 10.0' }.each do |dep_name, constraint|
    gem.add_development_dependency dep_name, constraint
  end

  # Runtime dependencies, declared without version constraints.
  %w[nokogiri requestmanager harvesterreporter pry headless].each do |dep_name|
    gem.add_runtime_dependency dep_name
  end
end
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tsjobcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-03-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: requestmanager
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: harvesterreporter
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pry
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: headless
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Crawls job listing websites for jobs requiring security clearance.
112
+ email:
113
+ - shidash@shidash.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".rspec"
119
+ - COPYING
120
+ - Gemfile
121
+ - README.md
122
+ - bin/console
123
+ - bin/setup
124
+ - lib/clearancejobscom/clearance_jobs_com_crawler.rb
125
+ - lib/clearancejobscom/clearance_jobs_com_parser.rb
126
+ - lib/clearedjobsnet/cleared_jobs_net_crawler.rb
127
+ - lib/clearedjobsnet/cleared_jobs_net_parser.rb
128
+ - lib/clearedjobsnet/get_all_cleared_jobs.rb
129
+ - lib/clearedjobsnet/terms/clearance_levels.json
130
+ - lib/clearedjobsnet/terms/company_names.json
131
+ - lib/clearedjobsnet/terms/country_names.json
132
+ - lib/clearedjobsnet/terms/search_terms.json
133
+ - lib/securityclearedjobscom/security_cleared_jobs_com_crawler.rb
134
+ - lib/securityclearedjobscom/security_cleared_jobs_com_parser.rb
135
+ - lib/tsjobcrawler.rb
136
+ - lib/util/failure_handler.rb
137
+ - tsjobcrawler.gemspec
138
+ homepage: https://github.com/TransparencyToolkit/TSJobCrawler
139
+ licenses: []
140
+ metadata: {}
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubyforge_project:
157
+ rubygems_version: 2.4.8
158
+ signing_key:
159
+ specification_version: 4
160
+ summary: Crawls job listing websites for jobs requiring security clearance.
161
+ test_files: []
162
+ has_rdoc: