indeed_scraper2022 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +136 -74
- data.tar.gz.sig +0 -0
- metadata +13 -14
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
|
4
|
+
data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
|
7
|
+
data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -2,20 +2,29 @@
|
|
2
2
|
|
3
3
|
# file: indeed_scraper2022.rb
|
4
4
|
|
5
|
-
require 'mechanize'
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
|
8
8
|
# Given the nature of changes to jobsearch websites,
|
9
9
|
# don't rely upon this gem working in the near future.
|
10
10
|
|
11
11
|
|
12
|
+
class IndeedScraper2022Err < Exception
|
13
|
+
end
|
14
|
+
|
12
15
|
class IndeedScraper2022
|
13
16
|
|
14
|
-
|
17
|
+
attr_reader :browser
|
18
|
+
|
19
|
+
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
|
20
|
+
headless: true, cookies: nil, debug: false)
|
15
21
|
|
16
22
|
@debug = debug
|
17
23
|
@url_base, @q, @location = url, q, location
|
18
|
-
@
|
24
|
+
@headless, @cookies = headless, cookies
|
25
|
+
|
26
|
+
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
27
|
+
@browser = fw.browser
|
19
28
|
|
20
29
|
end
|
21
30
|
|
@@ -25,7 +34,107 @@ class IndeedScraper2022
|
|
25
34
|
@results
|
26
35
|
end
|
27
36
|
|
37
|
+
def search(q: @q, location: @location, start: nil)
|
38
|
+
|
39
|
+
url = @url_base
|
40
|
+
url += 'start=' + start if start
|
41
|
+
|
42
|
+
@browser.goto(url)
|
43
|
+
#@browser.network.wait_for_idle
|
44
|
+
puts 'sleeping for 4 seconds' if @debug
|
45
|
+
sleep 4
|
46
|
+
|
47
|
+
if q.length > 1 then
|
48
|
+
|
49
|
+
input = @browser.at_xpath("//input[@name='q']")
|
50
|
+
|
51
|
+
# select any existing text and overwrite it
|
52
|
+
input.focus.type(:home); sleep 0.2
|
53
|
+
input.focus.type(:shift, :end); sleep 0.2
|
54
|
+
input.focus.type(q); sleep 0.2
|
55
|
+
end
|
56
|
+
|
57
|
+
if location.length > 1 then
|
58
|
+
|
59
|
+
input2 = @browser.at_xpath("//input[@name='l']")
|
60
|
+
|
61
|
+
# select any existing text and overwrite it
|
62
|
+
input2.focus.type(:home); sleep 0.2
|
63
|
+
input2.focus.type(:shift, :end); sleep 0.2
|
64
|
+
input2.focus.type(location); sleep 0.2
|
65
|
+
|
66
|
+
end
|
67
|
+
|
68
|
+
button = @browser.at_xpath("//button[@type='submit']")
|
69
|
+
button.click
|
70
|
+
#@browser.network.wait_for_idle
|
71
|
+
puts 'sleeping for 2 seconds' if @debug
|
72
|
+
sleep 2
|
73
|
+
|
74
|
+
doc2 = Nokogiri::XML(@browser.body)
|
75
|
+
|
76
|
+
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
77
|
+
puts 'a2: ' + a2.length.inspect if @debug
|
78
|
+
|
79
|
+
@a2 = a2.map {|x| Rexle.new x.to_s }
|
80
|
+
|
81
|
+
@results = @a2.map do |doc|
|
82
|
+
|
83
|
+
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
84
|
+
"_container']/div[@class='slider_list']/div[@class='sl" \
|
85
|
+
"ider_item']/div[@class='job_seen_beacon']")
|
86
|
+
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
87
|
+
"dy/tr/td[@class='resultContent']")
|
88
|
+
|
89
|
+
# job title (e.g. Software Developer)
|
90
|
+
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
91
|
+
"class='jobTitle-color-purple']/span")&.text
|
92
|
+
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
93
|
+
|
94
|
+
salary = td.element("div[@class='metadataContainer']/" \
|
95
|
+
"div[@class='salary-snippet-container']/div[@class='sa" \
|
96
|
+
"lary-snippet']/span")&.text
|
97
|
+
|
98
|
+
puts 'salary: ' + salary.inspect if @debug
|
99
|
+
div1 = td.element("div[@class='companyInfo']")
|
100
|
+
|
101
|
+
# company name (e.g. Coda Octopus Products Ltd)
|
102
|
+
company_name = div1.element("span[@class='companyName']")&.text
|
103
|
+
|
104
|
+
# company location (e.g. Edinburgh)
|
105
|
+
location = div1.element("div[@class='companyLocation']")&.text
|
106
|
+
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
107
|
+
|
108
|
+
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
109
|
+
"v[@class='result-footer']")
|
110
|
+
|
111
|
+
# job (e.g. Our products are primarily written in C#, using...)
|
112
|
+
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
113
|
+
|
114
|
+
# visually (e.g. Posted 14 days ago)
|
115
|
+
dateposted = div3.element("span[@class='date']")&.texts
|
116
|
+
date = (Date.today - dateposted.first.to_i).to_s if dateposted
|
117
|
+
|
118
|
+
{
|
119
|
+
link: @url_base.sub(/\/[^\/]+$/,'') \
|
120
|
+
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
121
|
+
title: jobtitle,
|
122
|
+
salary: salary,
|
123
|
+
company: company_name,
|
124
|
+
location: location,
|
125
|
+
jobsnippet: jobsnippet,
|
126
|
+
date: date
|
127
|
+
}
|
128
|
+
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
28
132
|
def page(n)
|
133
|
+
|
134
|
+
if n < 1 or n > @results.length then
|
135
|
+
raise IndeedScraper2022Err, 'Invalid page no.'
|
136
|
+
end
|
137
|
+
|
29
138
|
url = @results[n-1][:link]
|
30
139
|
fetchjob(url)
|
31
140
|
end
|
@@ -56,16 +165,18 @@ class IndeedScraper2022
|
|
56
165
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
57
166
|
"s-mr--xs']")[1]
|
58
167
|
clink = div3.element('//a')
|
59
|
-
company = cname ? cname.text : clink.text
|
168
|
+
company = cname.text ? cname.text : clink.text
|
60
169
|
companylink = clink.attributes[:href] if clink
|
61
170
|
|
171
|
+
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
172
|
+
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
62
173
|
div5 = div3.xpath("div/div")
|
63
174
|
location, worklocation = div5.map(&:text).compact
|
64
175
|
|
65
176
|
# icl (e.g. Full-time, Permanent)
|
66
177
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
67
178
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
68
|
-
jobtype = jobtype
|
179
|
+
jobtype = jobtype&.texts.join if jobtype
|
69
180
|
|
70
181
|
# jobsearch (e.g. Urgently needed)
|
71
182
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
@@ -73,106 +184,57 @@ class IndeedScraper2022
|
|
73
184
|
"h-DesktopTag-text']")&.text
|
74
185
|
|
75
186
|
# jobsearch (e.g. 10 days ago)
|
76
|
-
|
77
|
-
"tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
|
187
|
+
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
188
|
+
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
189
|
+
d = Date.today - days.to_i
|
190
|
+
datepost = d.strftime("%Y-%m-%d")
|
191
|
+
|
78
192
|
|
79
193
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
80
194
|
"ass='jobsearch-jobDescriptionText']").xml
|
81
195
|
|
82
196
|
{
|
83
197
|
title: jobtitle,
|
198
|
+
type: type,
|
84
199
|
company: company,
|
85
200
|
companylink: companylink,
|
86
201
|
location: location,
|
202
|
+
salary: salary,
|
87
203
|
worklocation: worklocation,
|
88
204
|
note: jobnote1,
|
89
|
-
date:
|
205
|
+
date: datepost,
|
90
206
|
desc: jobdesc
|
91
207
|
}
|
92
208
|
|
93
209
|
end
|
94
210
|
|
95
|
-
def search(q='', location='')
|
96
|
-
|
97
|
-
a = Mechanize.new
|
98
|
-
|
99
|
-
page = a.get(@url_base)
|
100
|
-
form = page.forms.first
|
101
|
-
form.fields[0].value = @q
|
102
|
-
form.fields[1].value = @location
|
103
|
-
pg = form.submit
|
104
|
-
|
105
|
-
doc2 = Nokogiri::XML(pg.body)
|
106
|
-
|
107
|
-
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
108
|
-
puts 'a2: ' + a2.length.inspect if @debug
|
109
|
-
|
110
|
-
@a2 = a2.map {|x| Rexle.new x.to_s }
|
111
|
-
|
112
|
-
@a2.map do |doc|
|
113
|
-
|
114
|
-
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
115
|
-
"_container']/div[@class='slider_list']/div[@class='sl" \
|
116
|
-
"ider_item']/div[@class='job_seen_beacon']")
|
117
|
-
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
118
|
-
"dy/tr/td[@class='resultContent']")
|
119
|
-
|
120
|
-
# job title (e.g. Software Developer)
|
121
|
-
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
122
|
-
"class='jobTitle-color-purple']/span")&.text
|
123
|
-
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
124
|
-
|
125
|
-
salary = td.element("div[@class='metadataContainer']/" \
|
126
|
-
"div[@class='salary-snippet-container']/div[@class='sa" \
|
127
|
-
"lary-snippet']/span")&.text
|
128
211
|
|
129
|
-
|
130
|
-
div1 = td.element("div[@class='companyInfo']")
|
131
|
-
|
132
|
-
# company name (e.g. Coda Octopus Products Ltd)
|
133
|
-
company_name = div1.element("span[@class='companyName']")&.text
|
134
|
-
|
135
|
-
# company location (e.g. Edinburgh)
|
136
|
-
location = div1.element("div[@class='companyLocation']")&.text
|
137
|
-
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
212
|
+
end
|
138
213
|
|
139
|
-
|
140
|
-
"v[@class='result-footer']")
|
214
|
+
class IS22Plus < IndeedScraper2022
|
141
215
|
|
142
|
-
|
143
|
-
|
216
|
+
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
217
|
+
super(q: q, location: location, headless: headless, cookies: cookies,
|
218
|
+
debug: debug)
|
219
|
+
end
|
144
220
|
|
145
|
-
|
146
|
-
dateposted = div3.element("span[@class='date']").texts
|
147
|
-
date = (Date.today - dateposted.first.to_i).to_s
|
221
|
+
def archive()
|
148
222
|
|
149
|
-
|
150
|
-
link: @url_base.sub(/\/[^\/]+$/,'') \
|
151
|
-
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
152
|
-
title: jobtitle,
|
153
|
-
salary: salary,
|
154
|
-
company: company_name,
|
155
|
-
location: location,
|
156
|
-
jobsnippet: jobsnippet,
|
157
|
-
date: date
|
158
|
-
}
|
223
|
+
return unless @results
|
159
224
|
|
225
|
+
1.upto(@results.length).each do |n|
|
226
|
+
page(n)
|
160
227
|
end
|
161
|
-
end
|
162
|
-
end
|
163
|
-
|
164
|
-
class IS22Plus < IndeedScraper2022
|
165
228
|
|
166
|
-
def initialize(q: '', location: '', debug: false)
|
167
|
-
super(q: q, location: location, debug: debug)
|
168
229
|
end
|
169
230
|
|
170
231
|
def list()
|
171
232
|
|
172
233
|
@results.map.with_index do |x,i|
|
173
|
-
"%2d. %s" % [i,x[:title]]
|
234
|
+
"%2d. %s" % [i+1,x[:title]]
|
174
235
|
end.join("\n")
|
175
236
|
|
176
237
|
end
|
177
238
|
|
239
|
+
|
178
240
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-03-30 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -43,42 +43,42 @@ dependencies:
|
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.7'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: 0.
|
49
|
+
version: 0.7.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.7'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version: 0.
|
59
|
+
version: 0.7.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
|
-
name:
|
61
|
+
name: ferrumwizard
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '2
|
66
|
+
version: '0.2'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 2.
|
69
|
+
version: 0.2.2
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '2
|
76
|
+
version: '0.2'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 2.
|
79
|
+
version: 0.2.2
|
80
80
|
description:
|
81
|
-
email: digital.robertson@
|
81
|
+
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
83
83
|
extensions: []
|
84
84
|
extra_rdoc_files: []
|
@@ -103,8 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
103
103
|
- !ruby/object:Gem::Version
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
|
-
|
107
|
-
rubygems_version: 2.7.10
|
106
|
+
rubygems_version: 3.2.22
|
108
107
|
signing_key:
|
109
108
|
specification_version: 4
|
110
109
|
summary: Attempts to scrape the indeed.com jobsearch results (1 page).
|
metadata.gz.sig
CHANGED
Binary file
|