indeed_scraper2022 0.1.0 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +95 -12
- data.tar.gz.sig +0 -0
- metadata +9 -10
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bcc9484e0c35971209ee1419b9ff7d07d912831eedf889d10789c2ab4bf0863
|
4
|
+
data.tar.gz: c3be84be0f499ed92794a8de0c3109cc99d3ba73f6aef74f9426a12b0a250e10
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e6d64b754bbd242f6d038f385602ed4dc5fe2d8112da12709a673a494419a159c9afe2ef4fa6bc95cccd160f9bceb22b03783108ffec213ed96442372092f81
|
7
|
+
data.tar.gz: eae4c65e2cd7784aff9c3db394fa29e44344e61a5275a31790a1fc21c57df34204585aa5e9e3db8af28a251cf2366675d4d9027b9a6578456dd9f64744842caa
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -9,6 +9,9 @@ require 'nokorexi'
|
|
9
9
|
# don't rely upon this gem working in the near future.
|
10
10
|
|
11
11
|
|
12
|
+
class IndeedScraper2022Err < Exception
|
13
|
+
end
|
14
|
+
|
12
15
|
class IndeedScraper2022
|
13
16
|
|
14
17
|
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
|
@@ -25,17 +28,78 @@ class IndeedScraper2022
|
|
25
28
|
@results
|
26
29
|
end
|
27
30
|
|
28
|
-
def page()
|
29
|
-
end
|
31
|
+
def page(n)
|
30
32
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
33
|
+
if n < 1 or n > @results.length then
|
34
|
+
raise IndeedScraper2022Err, 'Invalid page no.'
|
35
|
+
end
|
36
|
+
|
37
|
+
url = @results[n-1][:link]
|
38
|
+
fetchjob(url)
|
35
39
|
end
|
36
40
|
|
37
41
|
private
|
38
42
|
|
43
|
+
def fetchjob(url)
|
44
|
+
|
45
|
+
doc = Nokorexi.new(url).to_doc
|
46
|
+
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
47
|
+
|
48
|
+
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
49
|
+
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
50
|
+
div2 = div1.element("div")
|
51
|
+
|
52
|
+
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
53
|
+
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
54
|
+
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
55
|
+
"er-title']")&.text
|
56
|
+
|
57
|
+
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
58
|
+
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
59
|
+
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
60
|
+
"ainer-subtitle']")
|
61
|
+
|
62
|
+
# icl (e.g. Lyles Sutherland)
|
63
|
+
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
64
|
+
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
65
|
+
"s-mr--xs']")[1]
|
66
|
+
clink = div3.element('//a')
|
67
|
+
company = cname ? cname.text : clink.text
|
68
|
+
companylink = clink.attributes[:href] if clink
|
69
|
+
|
70
|
+
div5 = div3.xpath("div/div")
|
71
|
+
location, worklocation = div5.map(&:text).compact
|
72
|
+
|
73
|
+
# icl (e.g. Full-time, Permanent)
|
74
|
+
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
75
|
+
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
76
|
+
jobtype = jobtype.texts.join if jobtype
|
77
|
+
|
78
|
+
# jobsearch (e.g. Urgently needed)
|
79
|
+
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
80
|
+
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
81
|
+
"h-DesktopTag-text']")&.text
|
82
|
+
|
83
|
+
# jobsearch (e.g. 10 days ago)
|
84
|
+
datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
|
85
|
+
"tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
|
86
|
+
|
87
|
+
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
88
|
+
"ass='jobsearch-jobDescriptionText']").xml
|
89
|
+
|
90
|
+
{
|
91
|
+
title: jobtitle,
|
92
|
+
company: company,
|
93
|
+
companylink: companylink,
|
94
|
+
location: location,
|
95
|
+
worklocation: worklocation,
|
96
|
+
note: jobnote1,
|
97
|
+
date: (Date.today - datepost.to_i).to_s,
|
98
|
+
desc: jobdesc
|
99
|
+
}
|
100
|
+
|
101
|
+
end
|
102
|
+
|
39
103
|
def search(q='', location='')
|
40
104
|
|
41
105
|
a = Mechanize.new
|
@@ -47,6 +111,7 @@ class IndeedScraper2022
|
|
47
111
|
pg = form.submit
|
48
112
|
|
49
113
|
doc2 = Nokogiri::XML(pg.body)
|
114
|
+
|
50
115
|
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
51
116
|
puts 'a2: ' + a2.length.inspect if @debug
|
52
117
|
|
@@ -62,34 +127,36 @@ class IndeedScraper2022
|
|
62
127
|
|
63
128
|
# job title (e.g. Software Developer)
|
64
129
|
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
65
|
-
"class='jobTitle-color-purple']/span")
|
130
|
+
"class='jobTitle-color-purple']/span")&.text
|
66
131
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
67
132
|
|
68
133
|
salary = td.element("div[@class='metadataContainer']/" \
|
69
134
|
"div[@class='salary-snippet-container']/div[@class='sa" \
|
70
|
-
"lary-snippet']/span")
|
71
|
-
|
135
|
+
"lary-snippet']/span")&.text
|
136
|
+
|
72
137
|
puts 'salary: ' + salary.inspect if @debug
|
73
138
|
div1 = td.element("div[@class='companyInfo']")
|
74
139
|
|
75
140
|
# company name (e.g. Coda Octopus Products Ltd)
|
76
|
-
company_name = div1.element("span[@class='companyName']")
|
141
|
+
company_name = div1.element("span[@class='companyName']")&.text
|
77
142
|
|
78
143
|
# company location (e.g. Edinburgh)
|
79
|
-
location = div1.element("div[@class='companyLocation']")
|
144
|
+
location = div1.element("div[@class='companyLocation']")&.text
|
80
145
|
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
81
146
|
|
82
147
|
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
83
148
|
"v[@class='result-footer']")
|
84
149
|
|
85
150
|
# job (e.g. Our products are primarily written in C#, using...)
|
86
|
-
jobsnippet = div3.
|
151
|
+
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
87
152
|
|
88
153
|
# visually (e.g. Posted 14 days ago)
|
89
154
|
dateposted = div3.element("span[@class='date']").texts
|
90
155
|
date = (Date.today - dateposted.first.to_i).to_s
|
91
156
|
|
92
157
|
{
|
158
|
+
link: @url_base.sub(/\/[^\/]+$/,'') \
|
159
|
+
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
93
160
|
title: jobtitle,
|
94
161
|
salary: salary,
|
95
162
|
company: company_name,
|
@@ -101,3 +168,19 @@ class IndeedScraper2022
|
|
101
168
|
end
|
102
169
|
end
|
103
170
|
end
|
171
|
+
|
172
|
+
class IS22Plus < IndeedScraper2022
|
173
|
+
|
174
|
+
def initialize(q: '', location: '', debug: false)
|
175
|
+
super(q: q, location: location, debug: debug)
|
176
|
+
end
|
177
|
+
|
178
|
+
def list()
|
179
|
+
|
180
|
+
@results.map.with_index do |x,i|
|
181
|
+
"%2d. %s" % [i+1,x[:title]]
|
182
|
+
end.join("\n")
|
183
|
+
|
184
|
+
end
|
185
|
+
|
186
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-03-22 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -43,20 +43,20 @@ dependencies:
|
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.7'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: 0.
|
49
|
+
version: 0.7.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.7'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version: 0.
|
59
|
+
version: 0.7.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: mechanize
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,7 +78,7 @@ dependencies:
|
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 2.8.4
|
80
80
|
description:
|
81
|
-
email:
|
81
|
+
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
83
83
|
extensions: []
|
84
84
|
extra_rdoc_files: []
|
@@ -96,15 +96,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
96
96
|
requirements:
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
|
-
version:
|
99
|
+
version: 2.3.0
|
100
100
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
101
|
requirements:
|
102
102
|
- - ">="
|
103
103
|
- !ruby/object:Gem::Version
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
|
-
|
107
|
-
rubygems_version: 2.7.10
|
106
|
+
rubygems_version: 3.2.22
|
108
107
|
signing_key:
|
109
108
|
specification_version: 4
|
110
109
|
summary: Attempts to scrape the indeed.com jobsearch results (1 page).
|
metadata.gz.sig
CHANGED
Binary file
|