indeed_scraper2022 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6bcc9484e0c35971209ee1419b9ff7d07d912831eedf889d10789c2ab4bf0863
4
- data.tar.gz: c3be84be0f499ed92794a8de0c3109cc99d3ba73f6aef74f9426a12b0a250e10
3
+ metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
4
+ data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
5
5
  SHA512:
6
- metadata.gz: 9e6d64b754bbd242f6d038f385602ed4dc5fe2d8112da12709a673a494419a159c9afe2ef4fa6bc95cccd160f9bceb22b03783108ffec213ed96442372092f81
7
- data.tar.gz: eae4c65e2cd7784aff9c3db394fa29e44344e61a5275a31790a1fc21c57df34204585aa5e9e3db8af28a251cf2366675d4d9027b9a6578456dd9f64744842caa
6
+ metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
7
+ data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,7 +2,7 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
 
8
8
  # Given the nature of changes to jobsearch websites,
@@ -14,11 +14,13 @@ end
14
14
 
15
15
  class IndeedScraper2022
16
16
 
17
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
17
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
18
+ headless: true, cookies: nil, debug: false)
18
19
 
19
20
  @debug = debug
20
21
  @url_base, @q, @location = url, q, location
21
- @results = search
22
+ @headless, @cookies = headless, cookies
23
+ @results = search(q: @q, location: @location)
22
24
 
23
25
  end
24
26
 
@@ -28,6 +30,87 @@ class IndeedScraper2022
28
30
  @results
29
31
  end
30
32
 
33
+ def search(q: @q, location: @location, start: nil)
34
+
35
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
36
+
37
+ url = @url_base
38
+ url += 'start=' + start if start
39
+
40
+ browser = fw.browser
41
+ browser.goto(url)
42
+
43
+ if q.length > 1 then
44
+ input = browser.at_xpath("//input[@name='q']")
45
+ input.focus.type(q)
46
+ end
47
+
48
+ if location.length > 1 then
49
+ input2 = browser.at_xpath("//input[@name='l']")
50
+ input2.focus.type(location)
51
+ end
52
+
53
+ button = browser.at_xpath("//button[@type='submit']")
54
+ button.click
55
+
56
+ doc2 = Nokogiri::XML(browser.body)
57
+
58
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
59
+ puts 'a2: ' + a2.length.inspect if @debug
60
+
61
+ @a2 = a2.map {|x| Rexle.new x.to_s }
62
+
63
+ @a2.map do |doc|
64
+
65
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
66
+ "_container']/div[@class='slider_list']/div[@class='sl" \
67
+ "ider_item']/div[@class='job_seen_beacon']")
68
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
69
+ "dy/tr/td[@class='resultContent']")
70
+
71
+ # job title (e.g. Software Developer)
72
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
73
+ "class='jobTitle-color-purple']/span")&.text
74
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
75
+
76
+ salary = td.element("div[@class='metadataContainer']/" \
77
+ "div[@class='salary-snippet-container']/div[@class='sa" \
78
+ "lary-snippet']/span")&.text
79
+
80
+ puts 'salary: ' + salary.inspect if @debug
81
+ div1 = td.element("div[@class='companyInfo']")
82
+
83
+ # company name (e.g. Coda Octopus Products Ltd)
84
+ company_name = div1.element("span[@class='companyName']")&.text
85
+
86
+ # company location (e.g. Edinburgh)
87
+ location = div1.element("div[@class='companyLocation']")&.text
88
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
89
+
90
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
91
+ "v[@class='result-footer']")
92
+
93
+ # job (e.g. Our products are primarily written in C#, using...)
94
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
95
+
96
+ # visually (e.g. Posted 14 days ago)
97
+ dateposted = div3.element("span[@class='date']")&.texts
98
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
99
+
100
+ {
101
+ link: @url_base.sub(/\/[^\/]+$/,'') \
102
+ + doc.root.attributes[:href].gsub(/&/,'&'),
103
+ title: jobtitle,
104
+ salary: salary,
105
+ company: company_name,
106
+ location: location,
107
+ jobsnippet: jobsnippet,
108
+ date: date
109
+ }
110
+
111
+ end
112
+ end
113
+
31
114
  def page(n)
32
115
 
33
116
  if n < 1 or n > @results.length then
@@ -64,16 +147,18 @@ class IndeedScraper2022
64
147
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
65
148
  "s-mr--xs']")[1]
66
149
  clink = div3.element('//a')
67
- company = cname ? cname.text : clink.text
150
+ company = cname.text ? cname.text : clink.text
68
151
  companylink = clink.attributes[:href] if clink
69
152
 
153
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
154
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
70
155
  div5 = div3.xpath("div/div")
71
156
  location, worklocation = div5.map(&:text).compact
72
157
 
73
158
  # icl (e.g. Full-time, Permanent)
74
159
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
75
160
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
76
- jobtype = jobtype.texts.join if jobtype
161
+ jobtype = jobtype&.texts.join if jobtype
77
162
 
78
163
  # jobsearch (e.g. Urgently needed)
79
164
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
@@ -81,98 +166,46 @@ class IndeedScraper2022
81
166
  "h-DesktopTag-text']")&.text
82
167
 
83
168
  # jobsearch (e.g. 10 days ago)
84
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
85
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
169
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
170
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
171
+ d = Date.today - days.to_i
172
+ datepost = d.strftime("%Y-%m-%d")
173
+
86
174
 
87
175
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
88
176
  "ass='jobsearch-jobDescriptionText']").xml
89
177
 
90
178
  {
91
179
  title: jobtitle,
180
+ type: type,
92
181
  company: company,
93
182
  companylink: companylink,
94
183
  location: location,
184
+ salary: salary,
95
185
  worklocation: worklocation,
96
186
  note: jobnote1,
97
- date: (Date.today - datepost.to_i).to_s,
187
+ date: datepost,
98
188
  desc: jobdesc
99
189
  }
100
190
 
101
191
  end
102
192
 
103
- def search(q='', location='')
104
-
105
- a = Mechanize.new
106
-
107
- page = a.get(@url_base)
108
- form = page.forms.first
109
- form.fields[0].value = @q
110
- form.fields[1].value = @location
111
- pg = form.submit
112
-
113
- doc2 = Nokogiri::XML(pg.body)
114
-
115
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
116
- puts 'a2: ' + a2.length.inspect if @debug
117
-
118
- @a2 = a2.map {|x| Rexle.new x.to_s }
119
-
120
- @a2.map do |doc|
121
-
122
- div = doc.element("a[@class='desktop']/div[@class='slider" \
123
- "_container']/div[@class='slider_list']/div[@class='sl" \
124
- "ider_item']/div[@class='job_seen_beacon']")
125
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
126
- "dy/tr/td[@class='resultContent']")
127
-
128
- # job title (e.g. Software Developer)
129
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
130
- "class='jobTitle-color-purple']/span")&.text
131
- puts 'jobtitle: ' + jobtitle.inspect if @debug
132
-
133
- salary = td.element("div[@class='metadataContainer']/" \
134
- "div[@class='salary-snippet-container']/div[@class='sa" \
135
- "lary-snippet']/span")&.text
136
-
137
- puts 'salary: ' + salary.inspect if @debug
138
- div1 = td.element("div[@class='companyInfo']")
139
193
 
140
- # company name (e.g. Coda Octopus Products Ltd)
141
- company_name = div1.element("span[@class='companyName']")&.text
142
-
143
- # company location (e.g. Edinburgh)
144
- location = div1.element("div[@class='companyLocation']")&.text
145
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
146
-
147
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
148
- "v[@class='result-footer']")
194
+ end
149
195
 
150
- # job (e.g. Our products are primarily written in C#, using...)
151
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
196
+ class IS22Plus < IndeedScraper2022
152
197
 
153
- # visually (e.g. Posted 14 days ago)
154
- dateposted = div3.element("span[@class='date']").texts
155
- date = (Date.today - dateposted.first.to_i).to_s
198
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
199
+ super(q: q, location: location, headless: headless, cookies: cookies,
200
+ debug: debug)
201
+ end
156
202
 
157
- {
158
- link: @url_base.sub(/\/[^\/]+$/,'') \
159
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
160
- title: jobtitle,
161
- salary: salary,
162
- company: company_name,
163
- location: location,
164
- jobsnippet: jobsnippet,
165
- date: date
166
- }
203
+ def archive()
167
204
 
205
+ 1.upto(15).each do |n|
206
+ page(n)
168
207
  end
169
- end
170
- end
171
208
 
172
- class IS22Plus < IndeedScraper2022
173
-
174
- def initialize(q: '', location: '', debug: false)
175
- super(q: q, location: location, debug: debug)
176
209
  end
177
210
 
178
211
  def list()
@@ -183,4 +216,5 @@ class IS22Plus < IndeedScraper2022
183
216
 
184
217
  end
185
218
 
219
+
186
220
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-22 00:00:00.000000000 Z
38
+ date: 2022-03-30 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -58,25 +58,25 @@ dependencies:
58
58
  - !ruby/object:Gem::Version
59
59
  version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file