indeed_scraper2022 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +95 -12
- data.tar.gz.sig +0 -0
- metadata +9 -10
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6bcc9484e0c35971209ee1419b9ff7d07d912831eedf889d10789c2ab4bf0863
|
4
|
+
data.tar.gz: c3be84be0f499ed92794a8de0c3109cc99d3ba73f6aef74f9426a12b0a250e10
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e6d64b754bbd242f6d038f385602ed4dc5fe2d8112da12709a673a494419a159c9afe2ef4fa6bc95cccd160f9bceb22b03783108ffec213ed96442372092f81
|
7
|
+
data.tar.gz: eae4c65e2cd7784aff9c3db394fa29e44344e61a5275a31790a1fc21c57df34204585aa5e9e3db8af28a251cf2366675d4d9027b9a6578456dd9f64744842caa
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -9,6 +9,9 @@ require 'nokorexi'
|
|
9
9
|
# don't rely upon this gem working in the near future.
|
10
10
|
|
11
11
|
|
12
|
+
class IndeedScraper2022Err < Exception
|
13
|
+
end
|
14
|
+
|
12
15
|
class IndeedScraper2022
|
13
16
|
|
14
17
|
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
|
@@ -25,17 +28,78 @@ class IndeedScraper2022
|
|
25
28
|
@results
|
26
29
|
end
|
27
30
|
|
28
|
-
def page()
|
29
|
-
end
|
31
|
+
def page(n)
|
30
32
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
33
|
+
if n < 1 or n > @results.length then
|
34
|
+
raise IndeedScraper2022Err, 'Invalid page no.'
|
35
|
+
end
|
36
|
+
|
37
|
+
url = @results[n-1][:link]
|
38
|
+
fetchjob(url)
|
35
39
|
end
|
36
40
|
|
37
41
|
private
|
38
42
|
|
43
|
+
def fetchjob(url)
|
44
|
+
|
45
|
+
doc = Nokorexi.new(url).to_doc
|
46
|
+
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
47
|
+
|
48
|
+
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
49
|
+
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
50
|
+
div2 = div1.element("div")
|
51
|
+
|
52
|
+
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
53
|
+
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
54
|
+
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
55
|
+
"er-title']")&.text
|
56
|
+
|
57
|
+
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
58
|
+
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
59
|
+
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
60
|
+
"ainer-subtitle']")
|
61
|
+
|
62
|
+
# icl (e.g. Lyles Sutherland)
|
63
|
+
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
64
|
+
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
65
|
+
"s-mr--xs']")[1]
|
66
|
+
clink = div3.element('//a')
|
67
|
+
company = cname ? cname.text : clink.text
|
68
|
+
companylink = clink.attributes[:href] if clink
|
69
|
+
|
70
|
+
div5 = div3.xpath("div/div")
|
71
|
+
location, worklocation = div5.map(&:text).compact
|
72
|
+
|
73
|
+
# icl (e.g. Full-time, Permanent)
|
74
|
+
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
75
|
+
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
76
|
+
jobtype = jobtype.texts.join if jobtype
|
77
|
+
|
78
|
+
# jobsearch (e.g. Urgently needed)
|
79
|
+
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
80
|
+
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
81
|
+
"h-DesktopTag-text']")&.text
|
82
|
+
|
83
|
+
# jobsearch (e.g. 10 days ago)
|
84
|
+
datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
|
85
|
+
"tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
|
86
|
+
|
87
|
+
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
88
|
+
"ass='jobsearch-jobDescriptionText']").xml
|
89
|
+
|
90
|
+
{
|
91
|
+
title: jobtitle,
|
92
|
+
company: company,
|
93
|
+
companylink: companylink,
|
94
|
+
location: location,
|
95
|
+
worklocation: worklocation,
|
96
|
+
note: jobnote1,
|
97
|
+
date: (Date.today - datepost.to_i).to_s,
|
98
|
+
desc: jobdesc
|
99
|
+
}
|
100
|
+
|
101
|
+
end
|
102
|
+
|
39
103
|
def search(q='', location='')
|
40
104
|
|
41
105
|
a = Mechanize.new
|
@@ -47,6 +111,7 @@ class IndeedScraper2022
|
|
47
111
|
pg = form.submit
|
48
112
|
|
49
113
|
doc2 = Nokogiri::XML(pg.body)
|
114
|
+
|
50
115
|
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
51
116
|
puts 'a2: ' + a2.length.inspect if @debug
|
52
117
|
|
@@ -62,34 +127,36 @@ class IndeedScraper2022
|
|
62
127
|
|
63
128
|
# job title (e.g. Software Developer)
|
64
129
|
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
65
|
-
"class='jobTitle-color-purple']/span")
|
130
|
+
"class='jobTitle-color-purple']/span")&.text
|
66
131
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
67
132
|
|
68
133
|
salary = td.element("div[@class='metadataContainer']/" \
|
69
134
|
"div[@class='salary-snippet-container']/div[@class='sa" \
|
70
|
-
"lary-snippet']/span")
|
71
|
-
|
135
|
+
"lary-snippet']/span")&.text
|
136
|
+
|
72
137
|
puts 'salary: ' + salary.inspect if @debug
|
73
138
|
div1 = td.element("div[@class='companyInfo']")
|
74
139
|
|
75
140
|
# company name (e.g. Coda Octopus Products Ltd)
|
76
|
-
company_name = div1.element("span[@class='companyName']")
|
141
|
+
company_name = div1.element("span[@class='companyName']")&.text
|
77
142
|
|
78
143
|
# company location (e.g. Edinburgh)
|
79
|
-
location = div1.element("div[@class='companyLocation']")
|
144
|
+
location = div1.element("div[@class='companyLocation']")&.text
|
80
145
|
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
81
146
|
|
82
147
|
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
83
148
|
"v[@class='result-footer']")
|
84
149
|
|
85
150
|
# job (e.g. Our products are primarily written in C#, using...)
|
86
|
-
jobsnippet = div3.
|
151
|
+
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
87
152
|
|
88
153
|
# visually (e.g. Posted 14 days ago)
|
89
154
|
dateposted = div3.element("span[@class='date']").texts
|
90
155
|
date = (Date.today - dateposted.first.to_i).to_s
|
91
156
|
|
92
157
|
{
|
158
|
+
link: @url_base.sub(/\/[^\/]+$/,'') \
|
159
|
+
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
93
160
|
title: jobtitle,
|
94
161
|
salary: salary,
|
95
162
|
company: company_name,
|
@@ -101,3 +168,19 @@ class IndeedScraper2022
|
|
101
168
|
end
|
102
169
|
end
|
103
170
|
end
|
171
|
+
|
172
|
+
class IS22Plus < IndeedScraper2022
|
173
|
+
|
174
|
+
def initialize(q: '', location: '', debug: false)
|
175
|
+
super(q: q, location: location, debug: debug)
|
176
|
+
end
|
177
|
+
|
178
|
+
def list()
|
179
|
+
|
180
|
+
@results.map.with_index do |x,i|
|
181
|
+
"%2d. %s" % [i+1,x[:title]]
|
182
|
+
end.join("\n")
|
183
|
+
|
184
|
+
end
|
185
|
+
|
186
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-03-22 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -43,20 +43,20 @@ dependencies:
|
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '0.
|
46
|
+
version: '0.7'
|
47
47
|
- - ">="
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version: 0.
|
49
|
+
version: 0.7.0
|
50
50
|
type: :runtime
|
51
51
|
prerelease: false
|
52
52
|
version_requirements: !ruby/object:Gem::Requirement
|
53
53
|
requirements:
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
|
-
version: '0.
|
56
|
+
version: '0.7'
|
57
57
|
- - ">="
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version: 0.
|
59
|
+
version: 0.7.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
61
|
name: mechanize
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,7 +78,7 @@ dependencies:
|
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 2.8.4
|
80
80
|
description:
|
81
|
-
email:
|
81
|
+
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
83
83
|
extensions: []
|
84
84
|
extra_rdoc_files: []
|
@@ -96,15 +96,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
96
96
|
requirements:
|
97
97
|
- - ">="
|
98
98
|
- !ruby/object:Gem::Version
|
99
|
-
version:
|
99
|
+
version: 2.3.0
|
100
100
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
101
|
requirements:
|
102
102
|
- - ">="
|
103
103
|
- !ruby/object:Gem::Version
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
|
-
|
107
|
-
rubygems_version: 2.7.10
|
106
|
+
rubygems_version: 3.2.22
|
108
107
|
signing_key:
|
109
108
|
specification_version: 4
|
110
109
|
summary: Attempts to scrape the indeed.com jobsearch results (1 page).
|
metadata.gz.sig
CHANGED
Binary file
|