indeed_scraper2022 0.1.3 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6bcc9484e0c35971209ee1419b9ff7d07d912831eedf889d10789c2ab4bf0863
4
- data.tar.gz: c3be84be0f499ed92794a8de0c3109cc99d3ba73f6aef74f9426a12b0a250e10
3
+ metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
4
+ data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
5
5
  SHA512:
6
- metadata.gz: 9e6d64b754bbd242f6d038f385602ed4dc5fe2d8112da12709a673a494419a159c9afe2ef4fa6bc95cccd160f9bceb22b03783108ffec213ed96442372092f81
7
- data.tar.gz: eae4c65e2cd7784aff9c3db394fa29e44344e61a5275a31790a1fc21c57df34204585aa5e9e3db8af28a251cf2366675d4d9027b9a6578456dd9f64744842caa
6
+ metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
7
+ data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,23 +2,31 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'yaml'
7
8
 
8
9
  # Given the nature of changes to jobsearch websites,
9
10
  # don't rely upon this gem working in the near future.
10
11
 
11
12
 
13
+
12
14
  class IndeedScraper2022Err < Exception
13
15
  end
14
16
 
15
17
  class IndeedScraper2022
16
18
 
17
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
19
+ attr_reader :browser
20
+
21
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
22
+ headless: true, cookies: nil, debug: false)
18
23
 
19
24
  @debug = debug
20
25
  @url_base, @q, @location = url, q, location
21
- @results = search
26
+ @headless, @cookies = headless, cookies
27
+
28
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
29
+ @browser = fw.browser
22
30
 
23
31
  end
24
32
 
@@ -28,6 +36,101 @@ class IndeedScraper2022
28
36
  @results
29
37
  end
30
38
 
39
+ def search(q: @q, location: @location, start: nil)
40
+
41
+ url = @url_base
42
+ url += 'start=' + start if start
43
+
44
+ @browser.goto(url)
45
+ #@browser.network.wait_for_idle
46
+ puts 'sleeping for 4 seconds' if @debug
47
+ sleep 4
48
+
49
+ if q.length > 1 then
50
+
51
+ input = @browser.at_xpath("//input[@name='q']")
52
+
53
+ # select any existing text and overwrite it
54
+ input.focus.type(:home); sleep 0.2
55
+ input.focus.type(:shift, :end); sleep 0.2
56
+ input.focus.type(q); sleep 0.2
57
+ end
58
+
59
+ if location.length > 1 then
60
+
61
+ input2 = @browser.at_xpath("//input[@name='l']")
62
+
63
+ # select any existing text and overwrite it
64
+ input2.focus.type(:home); sleep 0.2
65
+ input2.focus.type(:shift, :end); sleep 0.2
66
+ input2.focus.type(location); sleep 0.2
67
+
68
+ end
69
+
70
+ button = @browser.at_xpath("//button[@type='submit']")
71
+ button.click
72
+ #@browser.network.wait_for_idle
73
+ puts 'sleeping for 2 seconds' if @debug
74
+ sleep 2
75
+
76
+ doc2 = Nokogiri::XML(@browser.body)
77
+
78
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
79
+ puts 'a2: ' + a2.length.inspect if @debug
80
+
81
+ @a2 = a2.map {|x| Rexle.new x.to_s }
82
+
83
+ @results = @a2.map do |doc|
84
+
85
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
86
+ "_container']/div[@class='slider_list']/div[@class='sl" \
87
+ "ider_item']/div[@class='job_seen_beacon']")
88
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
89
+ "dy/tr/td[@class='resultContent']")
90
+
91
+ # job title (e.g. Software Developer)
92
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
93
+ "class='jobTitle-color-purple']/span")&.text
94
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
95
+
96
+ salary = td.element("div[@class='metadataContainer']/" \
97
+ "div[@class='salary-snippet-container']/div[@class='sa" \
98
+ "lary-snippet']/span")&.text
99
+
100
+ puts 'salary: ' + salary.inspect if @debug
101
+ div1 = td.element("div[@class='companyInfo']")
102
+
103
+ # company name (e.g. Coda Octopus Products Ltd)
104
+ company_name = div1.element("span[@class='companyName']")&.text
105
+
106
+ # company location (e.g. Edinburgh)
107
+ location = div1.element("div[@class='companyLocation']")&.text
108
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
109
+
110
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
111
+ "v[@class='result-footer']")
112
+
113
+ # job (e.g. Our products are primarily written in C#, using...)
114
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
115
+
116
+ # visually (e.g. Posted 14 days ago)
117
+ dateposted = div3.element("span[@class='date']")&.texts
118
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
119
+
120
+ {
121
+ link: @url_base.sub(/\/[^\/]+$/,'') \
122
+ + doc.root.attributes[:href].gsub(/&amp;/,'&'),
123
+ title: jobtitle,
124
+ salary: salary,
125
+ company: company_name,
126
+ location: location,
127
+ jobsnippet: jobsnippet,
128
+ date: date
129
+ }
130
+
131
+ end
132
+ end
133
+
31
134
  def page(n)
32
135
 
33
136
  if n < 1 or n > @results.length then
@@ -64,16 +167,18 @@ class IndeedScraper2022
64
167
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
65
168
  "s-mr--xs']")[1]
66
169
  clink = div3.element('//a')
67
- company = cname ? cname.text : clink.text
170
+ company = cname.text ? cname.text : clink.text
68
171
  companylink = clink.attributes[:href] if clink
69
172
 
173
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
174
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
70
175
  div5 = div3.xpath("div/div")
71
176
  location, worklocation = div5.map(&:text).compact
72
177
 
73
178
  # icl (e.g. Full-time, Permanent)
74
179
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
75
180
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
76
- jobtype = jobtype.texts.join if jobtype
181
+ jobtype = jobtype&.texts.join if jobtype
77
182
 
78
183
  # jobsearch (e.g. Urgently needed)
79
184
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
@@ -81,98 +186,87 @@ class IndeedScraper2022
81
186
  "h-DesktopTag-text']")&.text
82
187
 
83
188
  # jobsearch (e.g. 10 days ago)
84
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
85
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
189
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
190
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
191
+ d = Date.today - days.to_i
192
+ datepost = d.strftime("%Y-%m-%d")
193
+
86
194
 
87
195
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
88
196
  "ass='jobsearch-jobDescriptionText']").xml
89
197
 
90
198
  {
91
199
  title: jobtitle,
200
+ type: type,
92
201
  company: company,
93
202
  companylink: companylink,
94
203
  location: location,
204
+ salary: salary,
95
205
  worklocation: worklocation,
96
206
  note: jobnote1,
97
- date: (Date.today - datepost.to_i).to_s,
207
+ date: datepost,
98
208
  desc: jobdesc
99
209
  }
100
210
 
101
211
  end
102
212
 
103
- def search(q='', location='')
104
213
 
105
- a = Mechanize.new
106
-
107
- page = a.get(@url_base)
108
- form = page.forms.first
109
- form.fields[0].value = @q
110
- form.fields[1].value = @location
111
- pg = form.submit
214
+ end
112
215
 
113
- doc2 = Nokogiri::XML(pg.body)
216
+ class IS22Plus < IndeedScraper2022
114
217
 
115
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
116
- puts 'a2: ' + a2.length.inspect if @debug
218
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
219
+ super(q: q, location: location, headless: headless, cookies: cookies,
220
+ debug: debug)
221
+ end
117
222
 
118
- @a2 = a2.map {|x| Rexle.new x.to_s }
223
+ def archive(filepath='/tmp/indeed')
119
224
 
120
- @a2.map do |doc|
225
+ return unless @results
121
226
 
122
- div = doc.element("a[@class='desktop']/div[@class='slider" \
123
- "_container']/div[@class='slider_list']/div[@class='sl" \
124
- "ider_item']/div[@class='job_seen_beacon']")
125
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
126
- "dy/tr/td[@class='resultContent']")
227
+ FileUtils.mkdir_p filepath
127
228
 
128
- # job title (e.g. Software Developer)
129
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
130
- "class='jobTitle-color-purple']/span")&.text
131
- puts 'jobtitle: ' + jobtitle.inspect if @debug
229
+ idxfile = File.join(filepath, 'index.yml')
132
230
 
133
- salary = td.element("div[@class='metadataContainer']/" \
134
- "div[@class='salary-snippet-container']/div[@class='sa" \
135
- "lary-snippet']/span")&.text
231
+ index = if File.exists? idxfile then
232
+ YAML.load(File.read(idxfile))
233
+ else
234
+ {}
235
+ end
136
236
 
137
- puts 'salary: ' + salary.inspect if @debug
138
- div1 = td.element("div[@class='companyInfo']")
237
+ @results.each.with_index do |item, i|
139
238
 
140
- # company name (e.g. Coda Octopus Products Ltd)
141
- company_name = div1.element("span[@class='companyName']")&.text
239
+ puts 'saving ' + item[:title] if @debug
240
+ puts 'link: ' + item[:link].inspect
241
+ links = RXFReader.reveal(item[:link])
242
+ puts 'links: ' + links.inspect
142
243
 
143
- # company location (e.g. Edinburgh)
144
- location = div1.element("div[@class='companyLocation']")&.text
145
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
244
+ url = links.last
245
+ id = url[/(?<=\?jk=)[^&]+/]
146
246
 
147
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
148
- "v[@class='result-footer']")
247
+ if index[id.to_sym] then
248
+ next
249
+ else
149
250
 
150
- # job (e.g. Our products are primarily written in C#, using...)
151
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
251
+ File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
152
252
 
153
- # visually (e.g. Posted 14 days ago)
154
- dateposted = div3.element("span[@class='date']").texts
155
- date = (Date.today - dateposted.first.to_i).to_s
253
+ h = {
254
+ link: url[/^[^&]+/],
255
+ title: item[:title].to_s,
256
+ salary: item[:salary].to_s,
257
+ company: item[:company].to_s.strip,
258
+ location: item[:location].to_s,
259
+ jobsnippet: item[:jobsnippet],
260
+ date: item[:date]
261
+ }
156
262
 
157
- {
158
- link: @url_base.sub(/\/[^\/]+$/,'') \
159
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
160
- title: jobtitle,
161
- salary: salary,
162
- company: company_name,
163
- location: location,
164
- jobsnippet: jobsnippet,
165
- date: date
166
- }
263
+ index[id.to_sym] = h
264
+ end
167
265
 
168
266
  end
169
- end
170
- end
171
267
 
172
- class IS22Plus < IndeedScraper2022
268
+ File.write idxfile, index.to_yaml
173
269
 
174
- def initialize(q: '', location: '', debug: false)
175
- super(q: q, location: location, debug: debug)
176
270
  end
177
271
 
178
272
  def list()
@@ -183,4 +277,5 @@ class IS22Plus < IndeedScraper2022
183
277
 
184
278
  end
185
279
 
280
+
186
281
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-22 00:00:00.000000000 Z
38
+ date: 2022-04-01 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -58,25 +58,25 @@ dependencies:
58
58
  - !ruby/object:Gem::Version
59
59
  version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file