indeed_scraper2022 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 19d85a0b62308f9e6c53eda22a9b26fb559e62d9528597fde6d89bb117f0deb0
4
- data.tar.gz: 43aea0e9efb6d6ed45c1efe4e480d8422939d20f42cd5b8514f8733b6f3ebd03
3
+ metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
4
+ data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
5
5
  SHA512:
6
- metadata.gz: bf0592ffdbaf9dbba2e84be8dcbdbf18c1d8fcb25e0e33eacbbb786decb0f8d18240371149ef9850bbde890e5b64893660f2ad2e0d149cb5e40e900802d82bb4
7
- data.tar.gz: 60e00efcef2f86e6e64c8f02d45e943ab5cdc1090b3d361f3d6e305b4389694a6061618da693dee1d81b11dbe729f02786c06e71562b10d581b88503189f676a
6
+ metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
7
+ data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,20 +2,29 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
 
8
8
  # Given the nature of changes to jobsearch websites,
9
9
  # don't rely upon this gem working in the near future.
10
10
 
11
11
 
12
+ class IndeedScraper2022Err < Exception
13
+ end
14
+
12
15
  class IndeedScraper2022
13
16
 
14
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
17
+ attr_reader :browser
18
+
19
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
20
+ headless: true, cookies: nil, debug: false)
15
21
 
16
22
  @debug = debug
17
23
  @url_base, @q, @location = url, q, location
18
- @results = search
24
+ @headless, @cookies = headless, cookies
25
+
26
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
27
+ @browser = fw.browser
19
28
 
20
29
  end
21
30
 
@@ -25,7 +34,107 @@ class IndeedScraper2022
25
34
  @results
26
35
  end
27
36
 
37
+ def search(q: @q, location: @location, start: nil)
38
+
39
+ url = @url_base
40
+ url += 'start=' + start if start
41
+
42
+ @browser.goto(url)
43
+ #@browser.network.wait_for_idle
44
+ puts 'sleeping for 4 seconds' if @debug
45
+ sleep 4
46
+
47
+ if q.length > 1 then
48
+
49
+ input = @browser.at_xpath("//input[@name='q']")
50
+
51
+ # select any existing text and overwrite it
52
+ input.focus.type(:home); sleep 0.2
53
+ input.focus.type(:shift, :end); sleep 0.2
54
+ input.focus.type(q); sleep 0.2
55
+ end
56
+
57
+ if location.length > 1 then
58
+
59
+ input2 = @browser.at_xpath("//input[@name='l']")
60
+
61
+ # select any existing text and overwrite it
62
+ input2.focus.type(:home); sleep 0.2
63
+ input2.focus.type(:shift, :end); sleep 0.2
64
+ input2.focus.type(location); sleep 0.2
65
+
66
+ end
67
+
68
+ button = @browser.at_xpath("//button[@type='submit']")
69
+ button.click
70
+ #@browser.network.wait_for_idle
71
+ puts 'sleeping for 2 seconds' if @debug
72
+ sleep 2
73
+
74
+ doc2 = Nokogiri::XML(@browser.body)
75
+
76
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
77
+ puts 'a2: ' + a2.length.inspect if @debug
78
+
79
+ @a2 = a2.map {|x| Rexle.new x.to_s }
80
+
81
+ @results = @a2.map do |doc|
82
+
83
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
84
+ "_container']/div[@class='slider_list']/div[@class='sl" \
85
+ "ider_item']/div[@class='job_seen_beacon']")
86
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
87
+ "dy/tr/td[@class='resultContent']")
88
+
89
+ # job title (e.g. Software Developer)
90
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
91
+ "class='jobTitle-color-purple']/span")&.text
92
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
93
+
94
+ salary = td.element("div[@class='metadataContainer']/" \
95
+ "div[@class='salary-snippet-container']/div[@class='sa" \
96
+ "lary-snippet']/span")&.text
97
+
98
+ puts 'salary: ' + salary.inspect if @debug
99
+ div1 = td.element("div[@class='companyInfo']")
100
+
101
+ # company name (e.g. Coda Octopus Products Ltd)
102
+ company_name = div1.element("span[@class='companyName']")&.text
103
+
104
+ # company location (e.g. Edinburgh)
105
+ location = div1.element("div[@class='companyLocation']")&.text
106
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
107
+
108
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
109
+ "v[@class='result-footer']")
110
+
111
+ # job (e.g. Our products are primarily written in C#, using...)
112
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
113
+
114
+ # visually (e.g. Posted 14 days ago)
115
+ dateposted = div3.element("span[@class='date']")&.texts
116
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
117
+
118
+ {
119
+ link: @url_base.sub(/\/[^\/]+$/,'') \
120
+ + doc.root.attributes[:href].gsub(/&amp;/,'&'),
121
+ title: jobtitle,
122
+ salary: salary,
123
+ company: company_name,
124
+ location: location,
125
+ jobsnippet: jobsnippet,
126
+ date: date
127
+ }
128
+
129
+ end
130
+ end
131
+
28
132
  def page(n)
133
+
134
+ if n < 1 or n > @results.length then
135
+ raise IndeedScraper2022Err, 'Invalid page no.'
136
+ end
137
+
29
138
  url = @results[n-1][:link]
30
139
  fetchjob(url)
31
140
  end
@@ -56,16 +165,18 @@ class IndeedScraper2022
56
165
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
57
166
  "s-mr--xs']")[1]
58
167
  clink = div3.element('//a')
59
- company = cname ? cname.text : clink.text
168
+ company = cname.text ? cname.text : clink.text
60
169
  companylink = clink.attributes[:href] if clink
61
170
 
171
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
172
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
62
173
  div5 = div3.xpath("div/div")
63
174
  location, worklocation = div5.map(&:text).compact
64
175
 
65
176
  # icl (e.g. Full-time, Permanent)
66
177
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
67
178
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
68
- jobtype = jobtype.texts.join if jobtype
179
+ jobtype = jobtype&.texts.join if jobtype
69
180
 
70
181
  # jobsearch (e.g. Urgently needed)
71
182
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
@@ -73,106 +184,57 @@ class IndeedScraper2022
73
184
  "h-DesktopTag-text']")&.text
74
185
 
75
186
  # jobsearch (e.g. 10 days ago)
76
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
77
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
187
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
188
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
189
+ d = Date.today - days.to_i
190
+ datepost = d.strftime("%Y-%m-%d")
191
+
78
192
 
79
193
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
80
194
  "ass='jobsearch-jobDescriptionText']").xml
81
195
 
82
196
  {
83
197
  title: jobtitle,
198
+ type: type,
84
199
  company: company,
85
200
  companylink: companylink,
86
201
  location: location,
202
+ salary: salary,
87
203
  worklocation: worklocation,
88
204
  note: jobnote1,
89
- date: (Date.today - datepost.to_i).to_s,
205
+ date: datepost,
90
206
  desc: jobdesc
91
207
  }
92
208
 
93
209
  end
94
210
 
95
- def search(q='', location='')
96
-
97
- a = Mechanize.new
98
-
99
- page = a.get(@url_base)
100
- form = page.forms.first
101
- form.fields[0].value = @q
102
- form.fields[1].value = @location
103
- pg = form.submit
104
-
105
- doc2 = Nokogiri::XML(pg.body)
106
-
107
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
108
- puts 'a2: ' + a2.length.inspect if @debug
109
-
110
- @a2 = a2.map {|x| Rexle.new x.to_s }
111
-
112
- @a2.map do |doc|
113
-
114
- div = doc.element("a[@class='desktop']/div[@class='slider" \
115
- "_container']/div[@class='slider_list']/div[@class='sl" \
116
- "ider_item']/div[@class='job_seen_beacon']")
117
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
118
- "dy/tr/td[@class='resultContent']")
119
-
120
- # job title (e.g. Software Developer)
121
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
122
- "class='jobTitle-color-purple']/span")&.text
123
- puts 'jobtitle: ' + jobtitle.inspect if @debug
124
-
125
- salary = td.element("div[@class='metadataContainer']/" \
126
- "div[@class='salary-snippet-container']/div[@class='sa" \
127
- "lary-snippet']/span")&.text
128
211
 
129
- puts 'salary: ' + salary.inspect if @debug
130
- div1 = td.element("div[@class='companyInfo']")
131
-
132
- # company name (e.g. Coda Octopus Products Ltd)
133
- company_name = div1.element("span[@class='companyName']")&.text
134
-
135
- # company location (e.g. Edinburgh)
136
- location = div1.element("div[@class='companyLocation']")&.text
137
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
212
+ end
138
213
 
139
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
140
- "v[@class='result-footer']")
214
+ class IS22Plus < IndeedScraper2022
141
215
 
142
- # job (e.g. Our products are primarily written in C#, using...)
143
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
216
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
217
+ super(q: q, location: location, headless: headless, cookies: cookies,
218
+ debug: debug)
219
+ end
144
220
 
145
- # visually (e.g. Posted 14 days ago)
146
- dateposted = div3.element("span[@class='date']").texts
147
- date = (Date.today - dateposted.first.to_i).to_s
221
+ def archive()
148
222
 
149
- {
150
- link: @url_base.sub(/\/[^\/]+$/,'') \
151
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
152
- title: jobtitle,
153
- salary: salary,
154
- company: company_name,
155
- location: location,
156
- jobsnippet: jobsnippet,
157
- date: date
158
- }
223
+ return unless @results
159
224
 
225
+ 1.upto(@results.length).each do |n|
226
+ page(n)
160
227
  end
161
- end
162
- end
163
-
164
- class IS22Plus < IndeedScraper2022
165
228
 
166
- def initialize(q: '', location: '', debug: false)
167
- super(q: q, location: location, debug: debug)
168
229
  end
169
230
 
170
231
  def list()
171
232
 
172
233
  @results.map.with_index do |x,i|
173
- "%2d. %s" % [i,x[:title]]
234
+ "%2d. %s" % [i+1,x[:title]]
174
235
  end.join("\n")
175
236
 
176
237
  end
177
238
 
239
+
178
240
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-01-25 00:00:00.000000000 Z
38
+ date: 2022-03-30 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -43,42 +43,42 @@ dependencies:
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '0.5'
46
+ version: '0.7'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 0.5.5
49
+ version: 0.7.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
- version: '0.5'
56
+ version: '0.7'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 0.5.5
59
+ version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
- email: digital.robertson@Gmail.com
81
+ email: digital.robertson@gmail.com
82
82
  executables: []
83
83
  extensions: []
84
84
  extra_rdoc_files: []
@@ -103,8 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
103
103
  - !ruby/object:Gem::Version
104
104
  version: '0'
105
105
  requirements: []
106
- rubyforge_project:
107
- rubygems_version: 2.7.10
106
+ rubygems_version: 3.2.22
108
107
  signing_key:
109
108
  specification_version: 4
110
109
  summary: Attempts to scrape the indeed.com jobsearch results (1 page).
metadata.gz.sig CHANGED
Binary file