indeed_scraper2022 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +108 -74
- data.tar.gz.sig +0 -0
- metadata +7 -7
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
|
4
|
+
data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
|
7
|
+
data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# file: indeed_scraper2022.rb
|
4
4
|
|
5
|
-
require 'mechanize'
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
|
8
8
|
# Given the nature of changes to jobsearch websites,
|
@@ -14,11 +14,13 @@ end
|
|
14
14
|
|
15
15
|
class IndeedScraper2022
|
16
16
|
|
17
|
-
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
|
17
|
+
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
|
18
|
+
headless: true, cookies: nil, debug: false)
|
18
19
|
|
19
20
|
@debug = debug
|
20
21
|
@url_base, @q, @location = url, q, location
|
21
|
-
@
|
22
|
+
@headless, @cookies = headless, cookies
|
23
|
+
@results = search(q: @q, location: @location)
|
22
24
|
|
23
25
|
end
|
24
26
|
|
@@ -28,6 +30,87 @@ class IndeedScraper2022
|
|
28
30
|
@results
|
29
31
|
end
|
30
32
|
|
33
|
+
def search(q: @q, location: @location, start: nil)
|
34
|
+
|
35
|
+
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
36
|
+
|
37
|
+
url = @url_base
|
38
|
+
url += 'start=' + start if start
|
39
|
+
|
40
|
+
browser = fw.browser
|
41
|
+
browser.goto(url)
|
42
|
+
|
43
|
+
if q.length > 1 then
|
44
|
+
input = browser.at_xpath("//input[@name='q']")
|
45
|
+
input.focus.type(q)
|
46
|
+
end
|
47
|
+
|
48
|
+
if location.length > 1 then
|
49
|
+
input2 = browser.at_xpath("//input[@name='l']")
|
50
|
+
input2.focus.type(location)
|
51
|
+
end
|
52
|
+
|
53
|
+
button = browser.at_xpath("//button[@type='submit']")
|
54
|
+
button.click
|
55
|
+
|
56
|
+
doc2 = Nokogiri::XML(browser.body)
|
57
|
+
|
58
|
+
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
59
|
+
puts 'a2: ' + a2.length.inspect if @debug
|
60
|
+
|
61
|
+
@a2 = a2.map {|x| Rexle.new x.to_s }
|
62
|
+
|
63
|
+
@a2.map do |doc|
|
64
|
+
|
65
|
+
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
66
|
+
"_container']/div[@class='slider_list']/div[@class='sl" \
|
67
|
+
"ider_item']/div[@class='job_seen_beacon']")
|
68
|
+
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
69
|
+
"dy/tr/td[@class='resultContent']")
|
70
|
+
|
71
|
+
# job title (e.g. Software Developer)
|
72
|
+
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
73
|
+
"class='jobTitle-color-purple']/span")&.text
|
74
|
+
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
75
|
+
|
76
|
+
salary = td.element("div[@class='metadataContainer']/" \
|
77
|
+
"div[@class='salary-snippet-container']/div[@class='sa" \
|
78
|
+
"lary-snippet']/span")&.text
|
79
|
+
|
80
|
+
puts 'salary: ' + salary.inspect if @debug
|
81
|
+
div1 = td.element("div[@class='companyInfo']")
|
82
|
+
|
83
|
+
# company name (e.g. Coda Octopus Products Ltd)
|
84
|
+
company_name = div1.element("span[@class='companyName']")&.text
|
85
|
+
|
86
|
+
# company location (e.g. Edinburgh)
|
87
|
+
location = div1.element("div[@class='companyLocation']")&.text
|
88
|
+
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
89
|
+
|
90
|
+
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
91
|
+
"v[@class='result-footer']")
|
92
|
+
|
93
|
+
# job (e.g. Our products are primarily written in C#, using...)
|
94
|
+
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
95
|
+
|
96
|
+
# visually (e.g. Posted 14 days ago)
|
97
|
+
dateposted = div3.element("span[@class='date']")&.texts
|
98
|
+
date = (Date.today - dateposted.first.to_i).to_s if dateposted
|
99
|
+
|
100
|
+
{
|
101
|
+
link: @url_base.sub(/\/[^\/]+$/,'') \
|
102
|
+
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
103
|
+
title: jobtitle,
|
104
|
+
salary: salary,
|
105
|
+
company: company_name,
|
106
|
+
location: location,
|
107
|
+
jobsnippet: jobsnippet,
|
108
|
+
date: date
|
109
|
+
}
|
110
|
+
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
31
114
|
def page(n)
|
32
115
|
|
33
116
|
if n < 1 or n > @results.length then
|
@@ -64,16 +147,18 @@ class IndeedScraper2022
|
|
64
147
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
65
148
|
"s-mr--xs']")[1]
|
66
149
|
clink = div3.element('//a')
|
67
|
-
company = cname ? cname.text : clink.text
|
150
|
+
company = cname.text ? cname.text : clink.text
|
68
151
|
companylink = clink.attributes[:href] if clink
|
69
152
|
|
153
|
+
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
154
|
+
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
70
155
|
div5 = div3.xpath("div/div")
|
71
156
|
location, worklocation = div5.map(&:text).compact
|
72
157
|
|
73
158
|
# icl (e.g. Full-time, Permanent)
|
74
159
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
75
160
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
76
|
-
jobtype = jobtype
|
161
|
+
jobtype = jobtype&.texts.join if jobtype
|
77
162
|
|
78
163
|
# jobsearch (e.g. Urgently needed)
|
79
164
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
@@ -81,98 +166,46 @@ class IndeedScraper2022
|
|
81
166
|
"h-DesktopTag-text']")&.text
|
82
167
|
|
83
168
|
# jobsearch (e.g. 10 days ago)
|
84
|
-
|
85
|
-
"tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
|
169
|
+
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
170
|
+
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
171
|
+
d = Date.today - days.to_i
|
172
|
+
datepost = d.strftime("%Y-%m-%d")
|
173
|
+
|
86
174
|
|
87
175
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
88
176
|
"ass='jobsearch-jobDescriptionText']").xml
|
89
177
|
|
90
178
|
{
|
91
179
|
title: jobtitle,
|
180
|
+
type: type,
|
92
181
|
company: company,
|
93
182
|
companylink: companylink,
|
94
183
|
location: location,
|
184
|
+
salary: salary,
|
95
185
|
worklocation: worklocation,
|
96
186
|
note: jobnote1,
|
97
|
-
date:
|
187
|
+
date: datepost,
|
98
188
|
desc: jobdesc
|
99
189
|
}
|
100
190
|
|
101
191
|
end
|
102
192
|
|
103
|
-
def search(q='', location='')
|
104
|
-
|
105
|
-
a = Mechanize.new
|
106
|
-
|
107
|
-
page = a.get(@url_base)
|
108
|
-
form = page.forms.first
|
109
|
-
form.fields[0].value = @q
|
110
|
-
form.fields[1].value = @location
|
111
|
-
pg = form.submit
|
112
|
-
|
113
|
-
doc2 = Nokogiri::XML(pg.body)
|
114
|
-
|
115
|
-
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
116
|
-
puts 'a2: ' + a2.length.inspect if @debug
|
117
|
-
|
118
|
-
@a2 = a2.map {|x| Rexle.new x.to_s }
|
119
|
-
|
120
|
-
@a2.map do |doc|
|
121
|
-
|
122
|
-
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
123
|
-
"_container']/div[@class='slider_list']/div[@class='sl" \
|
124
|
-
"ider_item']/div[@class='job_seen_beacon']")
|
125
|
-
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
126
|
-
"dy/tr/td[@class='resultContent']")
|
127
|
-
|
128
|
-
# job title (e.g. Software Developer)
|
129
|
-
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
130
|
-
"class='jobTitle-color-purple']/span")&.text
|
131
|
-
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
132
|
-
|
133
|
-
salary = td.element("div[@class='metadataContainer']/" \
|
134
|
-
"div[@class='salary-snippet-container']/div[@class='sa" \
|
135
|
-
"lary-snippet']/span")&.text
|
136
|
-
|
137
|
-
puts 'salary: ' + salary.inspect if @debug
|
138
|
-
div1 = td.element("div[@class='companyInfo']")
|
139
193
|
|
140
|
-
|
141
|
-
company_name = div1.element("span[@class='companyName']")&.text
|
142
|
-
|
143
|
-
# company location (e.g. Edinburgh)
|
144
|
-
location = div1.element("div[@class='companyLocation']")&.text
|
145
|
-
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
146
|
-
|
147
|
-
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
148
|
-
"v[@class='result-footer']")
|
194
|
+
end
|
149
195
|
|
150
|
-
|
151
|
-
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
196
|
+
class IS22Plus < IndeedScraper2022
|
152
197
|
|
153
|
-
|
154
|
-
|
155
|
-
|
198
|
+
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
199
|
+
super(q: q, location: location, headless: headless, cookies: cookies,
|
200
|
+
debug: debug)
|
201
|
+
end
|
156
202
|
|
157
|
-
|
158
|
-
link: @url_base.sub(/\/[^\/]+$/,'') \
|
159
|
-
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
160
|
-
title: jobtitle,
|
161
|
-
salary: salary,
|
162
|
-
company: company_name,
|
163
|
-
location: location,
|
164
|
-
jobsnippet: jobsnippet,
|
165
|
-
date: date
|
166
|
-
}
|
203
|
+
def archive()
|
167
204
|
|
205
|
+
1.upto(15).each do |n|
|
206
|
+
page(n)
|
168
207
|
end
|
169
|
-
end
|
170
|
-
end
|
171
208
|
|
172
|
-
class IS22Plus < IndeedScraper2022
|
173
|
-
|
174
|
-
def initialize(q: '', location: '', debug: false)
|
175
|
-
super(q: q, location: location, debug: debug)
|
176
209
|
end
|
177
210
|
|
178
211
|
def list()
|
@@ -183,4 +216,5 @@ class IS22Plus < IndeedScraper2022
|
|
183
216
|
|
184
217
|
end
|
185
218
|
|
219
|
+
|
186
220
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-03-
|
38
|
+
date: 2022-03-30 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -58,25 +58,25 @@ dependencies:
|
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: 0.7.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
|
-
name: mechanize
|
61
|
+
name: ferrumwizard
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '2
|
66
|
+
version: '0.2'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 2.
|
69
|
+
version: 0.2.2
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '2
|
76
|
+
version: '0.2'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 2.
|
79
|
+
version: 0.2.2
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|