indeed_scraper2022 0.1.2 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +136 -74
- data.tar.gz.sig +0 -0
- metadata +13 -14
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
|
4
|
+
data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
|
7
|
+
data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -2,20 +2,29 @@
|
|
2
2
|
|
3
3
|
# file: indeed_scraper2022.rb
|
4
4
|
|
5
|
-
require '
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
|
8
8
|
# Given the nature of changes to jobsearch websites,
|
9
9
|
# don't rely upon this gem working in the near future.
|
10
10
|
|
11
11
|
|
12
|
+
class IndeedScraper2022Err < Exception
|
13
|
+
end
|
14
|
+
|
12
15
|
class IndeedScraper2022
|
13
16
|
|
14
|
-
|
17
|
+
attr_reader :browser
|
18
|
+
|
19
|
+
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
|
20
|
+
headless: true, cookies: nil, debug: false)
|
15
21
|
|
16
22
|
@debug = debug
|
17
23
|
@url_base, @q, @location = url, q, location
|
18
|
-
@
|
24
|
+
@headless, @cookies = headless, cookies
|
25
|
+
|
26
|
+
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
27
|
+
@browser = fw.browser
|
19
28
|
|
20
29
|
end
|
21
30
|
|
@@ -25,7 +34,107 @@ class IndeedScraper2022
|
|
25
34
|
@results
|
26
35
|
end
|
27
36
|
|
37
|
+
def search(q: @q, location: @location, start: nil)
|
38
|
+
|
39
|
+
url = @url_base
|
40
|
+
url += 'start=' + start if start
|
41
|
+
|
42
|
+
@browser.goto(url)
|
43
|
+
#@browser.network.wait_for_idle
|
44
|
+
puts 'sleeping for 4 seconds' if @debug
|
45
|
+
sleep 4
|
46
|
+
|
47
|
+
if q.length > 1 then
|
48
|
+
|
49
|
+
input = @browser.at_xpath("//input[@name='q']")
|
50
|
+
|
51
|
+
# select any existing text and overwrite it
|
52
|
+
input.focus.type(:home); sleep 0.2
|
53
|
+
input.focus.type(:shift, :end); sleep 0.2
|
54
|
+
input.focus.type(q); sleep 0.2
|
55
|
+
end
|
56
|
+
|
57
|
+
if location.length > 1 then
|
58
|
+
|
59
|
+
input2 = @browser.at_xpath("//input[@name='l']")
|
60
|
+
|
61
|
+
# select any existing text and overwrite it
|
62
|
+
input2.focus.type(:home); sleep 0.2
|
63
|
+
input2.focus.type(:shift, :end); sleep 0.2
|
64
|
+
input2.focus.type(location); sleep 0.2
|
65
|
+
|
66
|
+
end
|
67
|
+
|
68
|
+
button = @browser.at_xpath("//button[@type='submit']")
|
69
|
+
button.click
|
70
|
+
#@browser.network.wait_for_idle
|
71
|
+
puts 'sleeping for 2 seconds' if @debug
|
72
|
+
sleep 2
|
73
|
+
|
74
|
+
doc2 = Nokogiri::XML(@browser.body)
|
75
|
+
|
76
|
+
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
77
|
+
puts 'a2: ' + a2.length.inspect if @debug
|
78
|
+
|
79
|
+
@a2 = a2.map {|x| Rexle.new x.to_s }
|
80
|
+
|
81
|
+
@results = @a2.map do |doc|
|
82
|
+
|
83
|
+
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
84
|
+
"_container']/div[@class='slider_list']/div[@class='sl" \
|
85
|
+
"ider_item']/div[@class='job_seen_beacon']")
|
86
|
+
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
87
|
+
"dy/tr/td[@class='resultContent']")
|
88
|
+
|
89
|
+
# job title (e.g. Software Developer)
|
90
|
+
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
91
|
+
"class='jobTitle-color-purple']/span")&.text
|
92
|
+
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
93
|
+
|
94
|
+
salary = td.element("div[@class='metadataContainer']/" \
|
95
|
+
"div[@class='salary-snippet-container']/div[@class='sa" \
|
96
|
+
"lary-snippet']/span")&.text
|
97
|
+
|
98
|
+
puts 'salary: ' + salary.inspect if @debug
|
99
|
+
div1 = td.element("div[@class='companyInfo']")
|
100
|
+
|
101
|
+
# company name (e.g. Coda Octopus Products Ltd)
|
102
|
+
company_name = div1.element("span[@class='companyName']")&.text
|
103
|
+
|
104
|
+
# company location (e.g. Edinburgh)
|
105
|
+
location = div1.element("div[@class='companyLocation']")&.text
|
106
|
+
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
107
|
+
|
108
|
+
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
109
|
+
"v[@class='result-footer']")
|
110
|
+
|
111
|
+
# job (e.g. Our products are primarily written in C#, using...)
|
112
|
+
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
113
|
+
|
114
|
+
# visually (e.g. Posted 14 days ago)
|
115
|
+
dateposted = div3.element("span[@class='date']")&.texts
|
116
|
+
date = (Date.today - dateposted.first.to_i).to_s if dateposted
|
117
|
+
|
118
|
+
{
|
119
|
+
link: @url_base.sub(/\/[^\/]+$/,'') \
|
120
|
+
+ doc.root.attributes[:href].gsub(/&/,'&'),
|
121
|
+
title: jobtitle,
|
122
|
+
salary: salary,
|
123
|
+
company: company_name,
|
124
|
+
location: location,
|
125
|
+
jobsnippet: jobsnippet,
|
126
|
+
date: date
|
127
|
+
}
|
128
|
+
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
28
132
|
def page(n)
|
133
|
+
|
134
|
+
if n < 1 or n > @results.length then
|
135
|
+
raise IndeedScraper2022Err, 'Invalid page no.'
|
136
|
+
end
|
137
|
+
|
29
138
|
url = @results[n-1][:link]
|
30
139
|
fetchjob(url)
|
31
140
|
end
|
@@ -56,16 +165,18 @@ class IndeedScraper2022
|
|
56
165
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
57
166
|
"s-mr--xs']")[1]
|
58
167
|
clink = div3.element('//a')
|
59
|
-
company = cname ? cname.text : clink.text
|
168
|
+
company = cname.text ? cname.text : clink.text
|
60
169
|
companylink = clink.attributes[:href] if clink
|
61
170
|
|
171
|
+
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
172
|
+
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
62
173
|
div5 = div3.xpath("div/div")
|
63
174
|
location, worklocation = div5.map(&:text).compact
|
64
175
|
|
65
176
|
# icl (e.g. Full-time, Permanent)
|
66
177
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
67
178
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
68
|
-
jobtype = jobtype
|
179
|
+
jobtype = jobtype&.texts.join if jobtype
|
69
180
|
|
70
181
|
# jobsearch (e.g. Urgently needed)
|
71
182
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
@@ -73,106 +184,57 @@ class IndeedScraper2022
|
|
73
184
|
"h-DesktopTag-text']")&.text
|
74
185
|
|
75
186
|
# jobsearch (e.g. 10 days ago)
|
76
|
-
|
77
|
-
"tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
|
187
|
+
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
188
|
+
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
189
|
+
d = Date.today - days.to_i
|
190
|
+
datepost = d.strftime("%Y-%m-%d")
|
191
|
+
|
78
192
|
|
79
193
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
80
194
|
"ass='jobsearch-jobDescriptionText']").xml
|
81
195
|
|
82
196
|
{
|
83
197
|
title: jobtitle,
|
198
|
+
type: type,
|
84
199
|
company: company,
|
85
200
|
companylink: companylink,
|
86
201
|
location: location,
|
202
|
+
salary: salary,
|
87
203
|
worklocation: worklocation,
|
88
204
|
note: jobnote1,
|
89
|
-
date:
|
205
|
+
date: datepost,
|
90
206
|
desc: jobdesc
|
91
207
|
}
|
92
208
|
|
93
209
|
end
|
94
210
|
|
95
|
-
def search(q='', location='')
|
96
|
-
|
97
|
-
a = Mechanize.new
|
98
|
-
|
99
|
-
page = a.get(@url_base)
|
100
|
-
form = page.forms.first
|
101
|
-
form.fields[0].value = @q
|
102
|
-
form.fields[1].value = @location
|
103
|
-
pg = form.submit
|
104
|
-
|
105
|
-
doc2 = Nokogiri::XML(pg.body)
|
106
|
-
|
107
|
-
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
108
|
-
puts 'a2: ' + a2.length.inspect if @debug
|
109
|
-
|
110
|
-
@a2 = a2.map {|x| Rexle.new x.to_s }
|
111
|
-
|
112
|
-
@a2.map do |doc|
|
113
|
-
|
114
|
-
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
115
|
-
"_container']/div[@class='slider_list']/div[@class='sl" \
|
116
|
-
"ider_item']/div[@class='job_seen_beacon']")
|
117
|
-
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
118
|
-
"dy/tr/td[@class='resultContent']")
|
119
|
-
|
120
|
-
# job title (e.g. Software Developer)
|
121
|
-
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
122
|
-
"class='jobTitle-color-purple']/span")&.text
|
123
|
-
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
124
|
-
|
125
|
-
salary = td.element("div[@class='metadataContainer']/" \
|
126
|
-
"div[@class='salary-snippet-container']/div[@class='sa" \
|
127
|
-
"lary-snippet']/span")&.text
|
128
211
|
|
129
|
-
|
130
|
-
div1 = td.element("div[@class='companyInfo']")
|
131
|
-
|
132
|
-
# company name (e.g. Coda Octopus Products Ltd)
|
133
|
-
company_name = div1.element("span[@class='companyName']")&.text
|
134
|
-
|
135
|
-
# company location (e.g. Edinburgh)
|
136
|
-
location = div1.element("div[@class='companyLocation']")&.text
|
137
|
-
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
212
|
+
end
|
138
213
|
|
139
|
-
|
140
|
-
"v[@class='result-footer']")
|
214
|
+
class IS22Plus < IndeedScraper2022
|
141
215
|
|
142
|
-
|
143
|
-
|
216
|
+
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
217
|
+
super(q: q, location: location, headless: headless, cookies: cookies,
|
218
|
+
debug: debug)
|
219
|
+
end
|
144
220
|
|
145
|
-
|
146
|
-
dateposted = div3.element("span[@class='date']").texts
|
147
|
-
date = (Date.today - dateposted.first.to_i).to_s
|
221
|
+
def archive()
|
148
222
|
|
149
|
-
|
150
|
-
link: @url_base.sub(/\/[^\/]+$/,'') \
|
151
|
-
+ doc.root.attributes[:href].gsub(/&/,'&'),
|
152
|
-
title: jobtitle,
|
153
|
-
salary: salary,
|
154
|
-
company: company_name,
|
155
|
-
location: location,
|
156
|
-
jobsnippet: jobsnippet,
|
157
|
-
date: date
|
158
|
-
}
|
223
|
+
return unless @results
|
159
224
|
|
225
|
+
1.upto(@results.length).each do |n|
|
226
|
+
page(n)
|
160
227
|
end
|
161
|
-
end
|
162
|
-
end
|
163
|
-
|
164
|
-
class IS22Plus < IndeedScraper2022
|
165
228
|
|
166
|
-
def initialize(q: '', location: '', debug: false)
|
167
|
-
super(q: q, location: location, debug: debug)
|
168
229
|
end
|
169
230
|
|
170
231
|
def list()
|
171
232
|
|
172
233
|
@results.map.with_index do |x,i|
|
173
|
-
"%2d. %s" % [i,x[:title]]
|
234
|
+
"%2d. %s" % [i+1,x[:title]]
|
174
235
|
end.join("\n")
|
175
236
|
|
176
237
|
end
|
177
238
|
|
239
|
+
|
178
240
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-03-30 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -43,42 +43,42 @@ dependencies:
|
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.7'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: 0.
|
49
|
+
version: 0.7.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.7'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version: 0.
|
59
|
+
version: 0.7.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
|
-
name:
|
61
|
+
name: ferrumwizard
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '2
|
66
|
+
version: '0.2'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 2.
|
69
|
+
version: 0.2.2
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '2
|
76
|
+
version: '0.2'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 2.
|
79
|
+
version: 0.2.2
|
80
80
|
description:
|
81
|
-
email: digital.robertson@
|
81
|
+
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
83
83
|
extensions: []
|
84
84
|
extra_rdoc_files: []
|
@@ -103,8 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
103
103
|
- !ruby/object:Gem::Version
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
|
-
|
107
|
-
rubygems_version: 2.7.10
|
106
|
+
rubygems_version: 3.2.22
|
108
107
|
signing_key:
|
109
108
|
specification_version: 4
|
110
109
|
summary: Attempts to scrape the indeed.com jobsearch results (1 page).
|
metadata.gz.sig
CHANGED
Binary file
|