indeed_scraper2022 0.1.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6bcc9484e0c35971209ee1419b9ff7d07d912831eedf889d10789c2ab4bf0863
4
- data.tar.gz: c3be84be0f499ed92794a8de0c3109cc99d3ba73f6aef74f9426a12b0a250e10
3
+ metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
4
+ data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
5
5
  SHA512:
6
- metadata.gz: 9e6d64b754bbd242f6d038f385602ed4dc5fe2d8112da12709a673a494419a159c9afe2ef4fa6bc95cccd160f9bceb22b03783108ffec213ed96442372092f81
7
- data.tar.gz: eae4c65e2cd7784aff9c3db394fa29e44344e61a5275a31790a1fc21c57df34204585aa5e9e3db8af28a251cf2366675d4d9027b9a6578456dd9f64744842caa
6
+ metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
7
+ data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,23 +2,31 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'yaml'
7
8
 
8
9
  # Given the nature of changes to jobsearch websites,
9
10
  # don't rely upon this gem working in the near future.
10
11
 
11
12
 
13
+
12
14
  class IndeedScraper2022Err < Exception
13
15
  end
14
16
 
15
17
  class IndeedScraper2022
16
18
 
17
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
19
+ attr_reader :browser
20
+
21
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
22
+ headless: true, cookies: nil, debug: false)
18
23
 
19
24
  @debug = debug
20
25
  @url_base, @q, @location = url, q, location
21
- @results = search
26
+ @headless, @cookies = headless, cookies
27
+
28
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
29
+ @browser = fw.browser
22
30
 
23
31
  end
24
32
 
@@ -28,6 +36,101 @@ class IndeedScraper2022
28
36
  @results
29
37
  end
30
38
 
39
+ def search(q: @q, location: @location, start: nil)
40
+
41
+ url = @url_base
42
+ url += 'start=' + start if start
43
+
44
+ @browser.goto(url)
45
+ #@browser.network.wait_for_idle
46
+ puts 'sleeping for 4 seconds' if @debug
47
+ sleep 4
48
+
49
+ if q.length > 1 then
50
+
51
+ input = @browser.at_xpath("//input[@name='q']")
52
+
53
+ # select any existing text and overwrite it
54
+ input.focus.type(:home); sleep 0.2
55
+ input.focus.type(:shift, :end); sleep 0.2
56
+ input.focus.type(q); sleep 0.2
57
+ end
58
+
59
+ if location.length > 1 then
60
+
61
+ input2 = @browser.at_xpath("//input[@name='l']")
62
+
63
+ # select any existing text and overwrite it
64
+ input2.focus.type(:home); sleep 0.2
65
+ input2.focus.type(:shift, :end); sleep 0.2
66
+ input2.focus.type(location); sleep 0.2
67
+
68
+ end
69
+
70
+ button = @browser.at_xpath("//button[@type='submit']")
71
+ button.click
72
+ #@browser.network.wait_for_idle
73
+ puts 'sleeping for 2 seconds' if @debug
74
+ sleep 2
75
+
76
+ doc2 = Nokogiri::XML(@browser.body)
77
+
78
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
79
+ puts 'a2: ' + a2.length.inspect if @debug
80
+
81
+ @a2 = a2.map {|x| Rexle.new x.to_s }
82
+
83
+ @results = @a2.map do |doc|
84
+
85
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
86
+ "_container']/div[@class='slider_list']/div[@class='sl" \
87
+ "ider_item']/div[@class='job_seen_beacon']")
88
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
89
+ "dy/tr/td[@class='resultContent']")
90
+
91
+ # job title (e.g. Software Developer)
92
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
93
+ "class='jobTitle-color-purple']/span")&.text
94
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
95
+
96
+ salary = td.element("div[@class='metadataContainer']/" \
97
+ "div[@class='salary-snippet-container']/div[@class='sa" \
98
+ "lary-snippet']/span")&.text
99
+
100
+ puts 'salary: ' + salary.inspect if @debug
101
+ div1 = td.element("div[@class='companyInfo']")
102
+
103
+ # company name (e.g. Coda Octopus Products Ltd)
104
+ company_name = div1.element("span[@class='companyName']")&.text
105
+
106
+ # company location (e.g. Edinburgh)
107
+ location = div1.element("div[@class='companyLocation']")&.text
108
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
109
+
110
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
111
+ "v[@class='result-footer']")
112
+
113
+ # job (e.g. Our products are primarily written in C#, using...)
114
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
115
+
116
+ # visually (e.g. Posted 14 days ago)
117
+ dateposted = div3.element("span[@class='date']")&.texts
118
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
119
+
120
+ {
121
+ link: @url_base.sub(/\/[^\/]+$/,'') \
122
+ + doc.root.attributes[:href].gsub(/&amp;/,'&'),
123
+ title: jobtitle,
124
+ salary: salary,
125
+ company: company_name,
126
+ location: location,
127
+ jobsnippet: jobsnippet,
128
+ date: date
129
+ }
130
+
131
+ end
132
+ end
133
+
31
134
  def page(n)
32
135
 
33
136
  if n < 1 or n > @results.length then
@@ -64,16 +167,18 @@ class IndeedScraper2022
64
167
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
65
168
  "s-mr--xs']")[1]
66
169
  clink = div3.element('//a')
67
- company = cname ? cname.text : clink.text
170
+ company = cname.text ? cname.text : clink.text
68
171
  companylink = clink.attributes[:href] if clink
69
172
 
173
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
174
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
70
175
  div5 = div3.xpath("div/div")
71
176
  location, worklocation = div5.map(&:text).compact
72
177
 
73
178
  # icl (e.g. Full-time, Permanent)
74
179
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
75
180
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
76
- jobtype = jobtype.texts.join if jobtype
181
+ jobtype = jobtype&.texts.join if jobtype
77
182
 
78
183
  # jobsearch (e.g. Urgently needed)
79
184
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
@@ -81,98 +186,87 @@ class IndeedScraper2022
81
186
  "h-DesktopTag-text']")&.text
82
187
 
83
188
  # jobsearch (e.g. 10 days ago)
84
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
85
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
189
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
190
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
191
+ d = Date.today - days.to_i
192
+ datepost = d.strftime("%Y-%m-%d")
193
+
86
194
 
87
195
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
88
196
  "ass='jobsearch-jobDescriptionText']").xml
89
197
 
90
198
  {
91
199
  title: jobtitle,
200
+ type: type,
92
201
  company: company,
93
202
  companylink: companylink,
94
203
  location: location,
204
+ salary: salary,
95
205
  worklocation: worklocation,
96
206
  note: jobnote1,
97
- date: (Date.today - datepost.to_i).to_s,
207
+ date: datepost,
98
208
  desc: jobdesc
99
209
  }
100
210
 
101
211
  end
102
212
 
103
- def search(q='', location='')
104
213
 
105
- a = Mechanize.new
106
-
107
- page = a.get(@url_base)
108
- form = page.forms.first
109
- form.fields[0].value = @q
110
- form.fields[1].value = @location
111
- pg = form.submit
214
+ end
112
215
 
113
- doc2 = Nokogiri::XML(pg.body)
216
+ class IS22Plus < IndeedScraper2022
114
217
 
115
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
116
- puts 'a2: ' + a2.length.inspect if @debug
218
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
219
+ super(q: q, location: location, headless: headless, cookies: cookies,
220
+ debug: debug)
221
+ end
117
222
 
118
- @a2 = a2.map {|x| Rexle.new x.to_s }
223
+ def archive(filepath='/tmp/indeed')
119
224
 
120
- @a2.map do |doc|
225
+ return unless @results
121
226
 
122
- div = doc.element("a[@class='desktop']/div[@class='slider" \
123
- "_container']/div[@class='slider_list']/div[@class='sl" \
124
- "ider_item']/div[@class='job_seen_beacon']")
125
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
126
- "dy/tr/td[@class='resultContent']")
227
+ FileUtils.mkdir_p filepath
127
228
 
128
- # job title (e.g. Software Developer)
129
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
130
- "class='jobTitle-color-purple']/span")&.text
131
- puts 'jobtitle: ' + jobtitle.inspect if @debug
229
+ idxfile = File.join(filepath, 'index.yml')
132
230
 
133
- salary = td.element("div[@class='metadataContainer']/" \
134
- "div[@class='salary-snippet-container']/div[@class='sa" \
135
- "lary-snippet']/span")&.text
231
+ index = if File.exists? idxfile then
232
+ YAML.load(File.read(idxfile))
233
+ else
234
+ {}
235
+ end
136
236
 
137
- puts 'salary: ' + salary.inspect if @debug
138
- div1 = td.element("div[@class='companyInfo']")
237
+ @results.each.with_index do |item, i|
139
238
 
140
- # company name (e.g. Coda Octopus Products Ltd)
141
- company_name = div1.element("span[@class='companyName']")&.text
239
+ puts 'saving ' + item[:title] if @debug
240
+ puts 'link: ' + item[:link].inspect
241
+ links = RXFReader.reveal(item[:link])
242
+ puts 'links: ' + links.inspect
142
243
 
143
- # company location (e.g. Edinburgh)
144
- location = div1.element("div[@class='companyLocation']")&.text
145
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
244
+ url = links.last
245
+ id = url[/(?<=\?jk=)[^&]+/]
146
246
 
147
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
148
- "v[@class='result-footer']")
247
+ if index[id.to_sym] then
248
+ next
249
+ else
149
250
 
150
- # job (e.g. Our products are primarily written in C#, using...)
151
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
251
+ File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
152
252
 
153
- # visually (e.g. Posted 14 days ago)
154
- dateposted = div3.element("span[@class='date']").texts
155
- date = (Date.today - dateposted.first.to_i).to_s
253
+ h = {
254
+ link: url[/^[^&]+/],
255
+ title: item[:title].to_s,
256
+ salary: item[:salary].to_s,
257
+ company: item[:company].to_s.strip,
258
+ location: item[:location].to_s,
259
+ jobsnippet: item[:jobsnippet],
260
+ date: item[:date]
261
+ }
156
262
 
157
- {
158
- link: @url_base.sub(/\/[^\/]+$/,'') \
159
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
160
- title: jobtitle,
161
- salary: salary,
162
- company: company_name,
163
- location: location,
164
- jobsnippet: jobsnippet,
165
- date: date
166
- }
263
+ index[id.to_sym] = h
264
+ end
167
265
 
168
266
  end
169
- end
170
- end
171
267
 
172
- class IS22Plus < IndeedScraper2022
268
+ File.write idxfile, index.to_yaml
173
269
 
174
- def initialize(q: '', location: '', debug: false)
175
- super(q: q, location: location, debug: debug)
176
270
  end
177
271
 
178
272
  def list()
@@ -183,4 +277,5 @@ class IS22Plus < IndeedScraper2022
183
277
 
184
278
  end
185
279
 
280
+
186
281
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-22 00:00:00.000000000 Z
38
+ date: 2022-04-01 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -58,25 +58,25 @@ dependencies:
58
58
  - !ruby/object:Gem::Version
59
59
  version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file