indeed_scraper2022 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6bcc9484e0c35971209ee1419b9ff7d07d912831eedf889d10789c2ab4bf0863
4
- data.tar.gz: c3be84be0f499ed92794a8de0c3109cc99d3ba73f6aef74f9426a12b0a250e10
3
+ metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
4
+ data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
5
5
  SHA512:
6
- metadata.gz: 9e6d64b754bbd242f6d038f385602ed4dc5fe2d8112da12709a673a494419a159c9afe2ef4fa6bc95cccd160f9bceb22b03783108ffec213ed96442372092f81
7
- data.tar.gz: eae4c65e2cd7784aff9c3db394fa29e44344e61a5275a31790a1fc21c57df34204585aa5e9e3db8af28a251cf2366675d4d9027b9a6578456dd9f64744842caa
6
+ metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
7
+ data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,7 +2,7 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
 
8
8
  # Given the nature of changes to jobsearch websites,
@@ -14,11 +14,13 @@ end
14
14
 
15
15
  class IndeedScraper2022
16
16
 
17
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
17
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
18
+ headless: true, cookies: nil, debug: false)
18
19
 
19
20
  @debug = debug
20
21
  @url_base, @q, @location = url, q, location
21
- @results = search
22
+ @headless, @cookies = headless, cookies
23
+ @results = search(q: @q, location: @location)
22
24
 
23
25
  end
24
26
 
@@ -28,6 +30,87 @@ class IndeedScraper2022
28
30
  @results
29
31
  end
30
32
 
33
+ def search(q: @q, location: @location, start: nil)
34
+
35
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
36
+
37
+ url = @url_base
38
+ url += 'start=' + start if start
39
+
40
+ browser = fw.browser
41
+ browser.goto(url)
42
+
43
+ if q.length > 1 then
44
+ input = browser.at_xpath("//input[@name='q']")
45
+ input.focus.type(q)
46
+ end
47
+
48
+ if location.length > 1 then
49
+ input2 = browser.at_xpath("//input[@name='l']")
50
+ input2.focus.type(location)
51
+ end
52
+
53
+ button = browser.at_xpath("//button[@type='submit']")
54
+ button.click
55
+
56
+ doc2 = Nokogiri::XML(browser.body)
57
+
58
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
59
+ puts 'a2: ' + a2.length.inspect if @debug
60
+
61
+ @a2 = a2.map {|x| Rexle.new x.to_s }
62
+
63
+ @a2.map do |doc|
64
+
65
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
66
+ "_container']/div[@class='slider_list']/div[@class='sl" \
67
+ "ider_item']/div[@class='job_seen_beacon']")
68
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
69
+ "dy/tr/td[@class='resultContent']")
70
+
71
+ # job title (e.g. Software Developer)
72
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
73
+ "class='jobTitle-color-purple']/span")&.text
74
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
75
+
76
+ salary = td.element("div[@class='metadataContainer']/" \
77
+ "div[@class='salary-snippet-container']/div[@class='sa" \
78
+ "lary-snippet']/span")&.text
79
+
80
+ puts 'salary: ' + salary.inspect if @debug
81
+ div1 = td.element("div[@class='companyInfo']")
82
+
83
+ # company name (e.g. Coda Octopus Products Ltd)
84
+ company_name = div1.element("span[@class='companyName']")&.text
85
+
86
+ # company location (e.g. Edinburgh)
87
+ location = div1.element("div[@class='companyLocation']")&.text
88
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
89
+
90
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
91
+ "v[@class='result-footer']")
92
+
93
+ # job (e.g. Our products are primarily written in C#, using...)
94
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
95
+
96
+ # visually (e.g. Posted 14 days ago)
97
+ dateposted = div3.element("span[@class='date']")&.texts
98
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
99
+
100
+ {
101
+ link: @url_base.sub(/\/[^\/]+$/,'') \
102
+ + doc.root.attributes[:href].gsub(/&/,'&'),
103
+ title: jobtitle,
104
+ salary: salary,
105
+ company: company_name,
106
+ location: location,
107
+ jobsnippet: jobsnippet,
108
+ date: date
109
+ }
110
+
111
+ end
112
+ end
113
+
31
114
  def page(n)
32
115
 
33
116
  if n < 1 or n > @results.length then
@@ -64,16 +147,18 @@ class IndeedScraper2022
64
147
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
65
148
  "s-mr--xs']")[1]
66
149
  clink = div3.element('//a')
67
- company = cname ? cname.text : clink.text
150
+ company = cname.text ? cname.text : clink.text
68
151
  companylink = clink.attributes[:href] if clink
69
152
 
153
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
154
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
70
155
  div5 = div3.xpath("div/div")
71
156
  location, worklocation = div5.map(&:text).compact
72
157
 
73
158
  # icl (e.g. Full-time, Permanent)
74
159
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
75
160
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
76
- jobtype = jobtype.texts.join if jobtype
161
+ jobtype = jobtype&.texts.join if jobtype
77
162
 
78
163
  # jobsearch (e.g. Urgently needed)
79
164
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
@@ -81,98 +166,46 @@ class IndeedScraper2022
81
166
  "h-DesktopTag-text']")&.text
82
167
 
83
168
  # jobsearch (e.g. 10 days ago)
84
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
85
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
169
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
170
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
171
+ d = Date.today - days.to_i
172
+ datepost = d.strftime("%Y-%m-%d")
173
+
86
174
 
87
175
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
88
176
  "ass='jobsearch-jobDescriptionText']").xml
89
177
 
90
178
  {
91
179
  title: jobtitle,
180
+ type: type,
92
181
  company: company,
93
182
  companylink: companylink,
94
183
  location: location,
184
+ salary: salary,
95
185
  worklocation: worklocation,
96
186
  note: jobnote1,
97
- date: (Date.today - datepost.to_i).to_s,
187
+ date: datepost,
98
188
  desc: jobdesc
99
189
  }
100
190
 
101
191
  end
102
192
 
103
- def search(q='', location='')
104
-
105
- a = Mechanize.new
106
-
107
- page = a.get(@url_base)
108
- form = page.forms.first
109
- form.fields[0].value = @q
110
- form.fields[1].value = @location
111
- pg = form.submit
112
-
113
- doc2 = Nokogiri::XML(pg.body)
114
-
115
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
116
- puts 'a2: ' + a2.length.inspect if @debug
117
-
118
- @a2 = a2.map {|x| Rexle.new x.to_s }
119
-
120
- @a2.map do |doc|
121
-
122
- div = doc.element("a[@class='desktop']/div[@class='slider" \
123
- "_container']/div[@class='slider_list']/div[@class='sl" \
124
- "ider_item']/div[@class='job_seen_beacon']")
125
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
126
- "dy/tr/td[@class='resultContent']")
127
-
128
- # job title (e.g. Software Developer)
129
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
130
- "class='jobTitle-color-purple']/span")&.text
131
- puts 'jobtitle: ' + jobtitle.inspect if @debug
132
-
133
- salary = td.element("div[@class='metadataContainer']/" \
134
- "div[@class='salary-snippet-container']/div[@class='sa" \
135
- "lary-snippet']/span")&.text
136
-
137
- puts 'salary: ' + salary.inspect if @debug
138
- div1 = td.element("div[@class='companyInfo']")
139
193
 
140
- # company name (e.g. Coda Octopus Products Ltd)
141
- company_name = div1.element("span[@class='companyName']")&.text
142
-
143
- # company location (e.g. Edinburgh)
144
- location = div1.element("div[@class='companyLocation']")&.text
145
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
146
-
147
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
148
- "v[@class='result-footer']")
194
+ end
149
195
 
150
- # job (e.g. Our products are primarily written in C#, using...)
151
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
196
+ class IS22Plus < IndeedScraper2022
152
197
 
153
- # visually (e.g. Posted 14 days ago)
154
- dateposted = div3.element("span[@class='date']").texts
155
- date = (Date.today - dateposted.first.to_i).to_s
198
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
199
+ super(q: q, location: location, headless: headless, cookies: cookies,
200
+ debug: debug)
201
+ end
156
202
 
157
- {
158
- link: @url_base.sub(/\/[^\/]+$/,'') \
159
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
160
- title: jobtitle,
161
- salary: salary,
162
- company: company_name,
163
- location: location,
164
- jobsnippet: jobsnippet,
165
- date: date
166
- }
203
+ def archive()
167
204
 
205
+ 1.upto(15).each do |n|
206
+ page(n)
168
207
  end
169
- end
170
- end
171
208
 
172
- class IS22Plus < IndeedScraper2022
173
-
174
- def initialize(q: '', location: '', debug: false)
175
- super(q: q, location: location, debug: debug)
176
209
  end
177
210
 
178
211
  def list()
@@ -183,4 +216,5 @@ class IS22Plus < IndeedScraper2022
183
216
 
184
217
  end
185
218
 
219
+
186
220
  end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-22 00:00:00.000000000 Z
38
+ date: 2022-03-30 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -58,25 +58,25 @@ dependencies:
58
58
  - !ruby/object:Gem::Version
59
59
  version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file