indeed_scraper2022 0.1.2 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 19d85a0b62308f9e6c53eda22a9b26fb559e62d9528597fde6d89bb117f0deb0
4
- data.tar.gz: 43aea0e9efb6d6ed45c1efe4e480d8422939d20f42cd5b8514f8733b6f3ebd03
3
+ metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
4
+ data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
5
5
  SHA512:
6
- metadata.gz: bf0592ffdbaf9dbba2e84be8dcbdbf18c1d8fcb25e0e33eacbbb786decb0f8d18240371149ef9850bbde890e5b64893660f2ad2e0d149cb5e40e900802d82bb4
7
- data.tar.gz: 60e00efcef2f86e6e64c8f02d45e943ab5cdc1090b3d361f3d6e305b4389694a6061618da693dee1d81b11dbe729f02786c06e71562b10d581b88503189f676a
6
+ metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
7
+ data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,20 +2,29 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
 
8
8
  # Given the nature of changes to jobsearch websites,
9
9
  # don't rely upon this gem working in the near future.
10
10
 
11
11
 
12
+ class IndeedScraper2022Err < Exception
13
+ end
14
+
12
15
  class IndeedScraper2022
13
16
 
14
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
17
+ attr_reader :browser
18
+
19
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
20
+ headless: true, cookies: nil, debug: false)
15
21
 
16
22
  @debug = debug
17
23
  @url_base, @q, @location = url, q, location
18
- @results = search
24
+ @headless, @cookies = headless, cookies
25
+
26
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
27
+ @browser = fw.browser
19
28
 
20
29
  end
21
30
 
@@ -25,7 +34,107 @@ class IndeedScraper2022
25
34
  @results
26
35
  end
27
36
 
37
+ def search(q: @q, location: @location, start: nil)
38
+
39
+ url = @url_base
40
+ url += 'start=' + start if start
41
+
42
+ @browser.goto(url)
43
+ #@browser.network.wait_for_idle
44
+ puts 'sleeping for 4 seconds' if @debug
45
+ sleep 4
46
+
47
+ if q.length > 1 then
48
+
49
+ input = @browser.at_xpath("//input[@name='q']")
50
+
51
+ # select any existing text and overwrite it
52
+ input.focus.type(:home); sleep 0.2
53
+ input.focus.type(:shift, :end); sleep 0.2
54
+ input.focus.type(q); sleep 0.2
55
+ end
56
+
57
+ if location.length > 1 then
58
+
59
+ input2 = @browser.at_xpath("//input[@name='l']")
60
+
61
+ # select any existing text and overwrite it
62
+ input2.focus.type(:home); sleep 0.2
63
+ input2.focus.type(:shift, :end); sleep 0.2
64
+ input2.focus.type(location); sleep 0.2
65
+
66
+ end
67
+
68
+ button = @browser.at_xpath("//button[@type='submit']")
69
+ button.click
70
+ #@browser.network.wait_for_idle
71
+ puts 'sleeping for 2 seconds' if @debug
72
+ sleep 2
73
+
74
+ doc2 = Nokogiri::XML(@browser.body)
75
+
76
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
77
+ puts 'a2: ' + a2.length.inspect if @debug
78
+
79
+ @a2 = a2.map {|x| Rexle.new x.to_s }
80
+
81
+ @results = @a2.map do |doc|
82
+
83
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
84
+ "_container']/div[@class='slider_list']/div[@class='sl" \
85
+ "ider_item']/div[@class='job_seen_beacon']")
86
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
87
+ "dy/tr/td[@class='resultContent']")
88
+
89
+ # job title (e.g. Software Developer)
90
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
91
+ "class='jobTitle-color-purple']/span")&.text
92
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
93
+
94
+ salary = td.element("div[@class='metadataContainer']/" \
95
+ "div[@class='salary-snippet-container']/div[@class='sa" \
96
+ "lary-snippet']/span")&.text
97
+
98
+ puts 'salary: ' + salary.inspect if @debug
99
+ div1 = td.element("div[@class='companyInfo']")
100
+
101
+ # company name (e.g. Coda Octopus Products Ltd)
102
+ company_name = div1.element("span[@class='companyName']")&.text
103
+
104
+ # company location (e.g. Edinburgh)
105
+ location = div1.element("div[@class='companyLocation']")&.text
106
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
107
+
108
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
109
+ "v[@class='result-footer']")
110
+
111
+ # job (e.g. Our products are primarily written in C#, using...)
112
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
113
+
114
+ # visually (e.g. Posted 14 days ago)
115
+ dateposted = div3.element("span[@class='date']")&.texts
116
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
117
+
118
+ {
119
+ link: @url_base.sub(/\/[^\/]+$/,'') \
120
+ + doc.root.attributes[:href].gsub(/&amp;/,'&'),
121
+ title: jobtitle,
122
+ salary: salary,
123
+ company: company_name,
124
+ location: location,
125
+ jobsnippet: jobsnippet,
126
+ date: date
127
+ }
128
+
129
+ end
130
+ end
131
+
28
132
  def page(n)
133
+
134
+ if n < 1 or n > @results.length then
135
+ raise IndeedScraper2022Err, 'Invalid page no.'
136
+ end
137
+
29
138
  url = @results[n-1][:link]
30
139
  fetchjob(url)
31
140
  end
@@ -56,16 +165,18 @@ class IndeedScraper2022
56
165
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
57
166
  "s-mr--xs']")[1]
58
167
  clink = div3.element('//a')
59
- company = cname ? cname.text : clink.text
168
+ company = cname.text ? cname.text : clink.text
60
169
  companylink = clink.attributes[:href] if clink
61
170
 
171
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
172
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
62
173
  div5 = div3.xpath("div/div")
63
174
  location, worklocation = div5.map(&:text).compact
64
175
 
65
176
  # icl (e.g. Full-time, Permanent)
66
177
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
67
178
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
68
- jobtype = jobtype.texts.join if jobtype
179
+ jobtype = jobtype&.texts.join if jobtype
69
180
 
70
181
  # jobsearch (e.g. Urgently needed)
71
182
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
@@ -73,106 +184,57 @@ class IndeedScraper2022
73
184
  "h-DesktopTag-text']")&.text
74
185
 
75
186
  # jobsearch (e.g. 10 days ago)
76
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
77
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
187
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
188
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
189
+ d = Date.today - days.to_i
190
+ datepost = d.strftime("%Y-%m-%d")
191
+
78
192
 
79
193
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
80
194
  "ass='jobsearch-jobDescriptionText']").xml
81
195
 
82
196
  {
83
197
  title: jobtitle,
198
+ type: type,
84
199
  company: company,
85
200
  companylink: companylink,
86
201
  location: location,
202
+ salary: salary,
87
203
  worklocation: worklocation,
88
204
  note: jobnote1,
89
- date: (Date.today - datepost.to_i).to_s,
205
+ date: datepost,
90
206
  desc: jobdesc
91
207
  }
92
208
 
93
209
  end
94
210
 
95
- def search(q='', location='')
96
-
97
- a = Mechanize.new
98
-
99
- page = a.get(@url_base)
100
- form = page.forms.first
101
- form.fields[0].value = @q
102
- form.fields[1].value = @location
103
- pg = form.submit
104
-
105
- doc2 = Nokogiri::XML(pg.body)
106
-
107
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
108
- puts 'a2: ' + a2.length.inspect if @debug
109
-
110
- @a2 = a2.map {|x| Rexle.new x.to_s }
111
-
112
- @a2.map do |doc|
113
-
114
- div = doc.element("a[@class='desktop']/div[@class='slider" \
115
- "_container']/div[@class='slider_list']/div[@class='sl" \
116
- "ider_item']/div[@class='job_seen_beacon']")
117
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
118
- "dy/tr/td[@class='resultContent']")
119
-
120
- # job title (e.g. Software Developer)
121
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
122
- "class='jobTitle-color-purple']/span")&.text
123
- puts 'jobtitle: ' + jobtitle.inspect if @debug
124
-
125
- salary = td.element("div[@class='metadataContainer']/" \
126
- "div[@class='salary-snippet-container']/div[@class='sa" \
127
- "lary-snippet']/span")&.text
128
211
 
129
- puts 'salary: ' + salary.inspect if @debug
130
- div1 = td.element("div[@class='companyInfo']")
131
-
132
- # company name (e.g. Coda Octopus Products Ltd)
133
- company_name = div1.element("span[@class='companyName']")&.text
134
-
135
- # company location (e.g. Edinburgh)
136
- location = div1.element("div[@class='companyLocation']")&.text
137
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
212
+ end
138
213
 
139
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
140
- "v[@class='result-footer']")
214
+ class IS22Plus < IndeedScraper2022
141
215
 
142
- # job (e.g. Our products are primarily written in C#, using...)
143
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
216
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
217
+ super(q: q, location: location, headless: headless, cookies: cookies,
218
+ debug: debug)
219
+ end
144
220
 
145
- # visually (e.g. Posted 14 days ago)
146
- dateposted = div3.element("span[@class='date']").texts
147
- date = (Date.today - dateposted.first.to_i).to_s
221
+ def archive()
148
222
 
149
- {
150
- link: @url_base.sub(/\/[^\/]+$/,'') \
151
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
152
- title: jobtitle,
153
- salary: salary,
154
- company: company_name,
155
- location: location,
156
- jobsnippet: jobsnippet,
157
- date: date
158
- }
223
+ return unless @results
159
224
 
225
+ 1.upto(@results.length).each do |n|
226
+ page(n)
160
227
  end
161
- end
162
- end
163
-
164
- class IS22Plus < IndeedScraper2022
165
228
 
166
- def initialize(q: '', location: '', debug: false)
167
- super(q: q, location: location, debug: debug)
168
229
  end
169
230
 
170
231
  def list()
171
232
 
172
233
  @results.map.with_index do |x,i|
173
- "%2d. %s" % [i,x[:title]]
234
+ "%2d. %s" % [i+1,x[:title]]
174
235
  end.join("\n")
175
236
 
176
237
  end
177
238
 
239
+
178
240
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-01-25 00:00:00.000000000 Z
38
+ date: 2022-03-30 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -43,42 +43,42 @@ dependencies:
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '0.5'
46
+ version: '0.7'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 0.5.5
49
+ version: 0.7.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
- version: '0.5'
56
+ version: '0.7'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 0.5.5
59
+ version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
- email: digital.robertson@Gmail.com
81
+ email: digital.robertson@gmail.com
82
82
  executables: []
83
83
  extensions: []
84
84
  extra_rdoc_files: []
@@ -103,8 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
103
103
  - !ruby/object:Gem::Version
104
104
  version: '0'
105
105
  requirements: []
106
- rubyforge_project:
107
- rubygems_version: 2.7.10
106
+ rubygems_version: 3.2.22
108
107
  signing_key:
109
108
  specification_version: 4
110
109
  summary: Attempts to scrape the indeed.com jobsearch results (1 page).
metadata.gz.sig CHANGED
Binary file