indeed_scraper2022 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50a484cf1a272522091413129241620336f12ca94d795b7ab132dd6911802d1c
4
- data.tar.gz: 06dffee1253aa5076da9b6897bc48009687e8df86aa1a3629ee1d8a4432fdd13
3
+ metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
4
+ data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
5
5
  SHA512:
6
- metadata.gz: 7327fc5bf9668c4f292eabf673574bfd7ca9fbf180133896c559ac1b7415d4ee880365302d24550aba4b83ee2a709a77bdb059145310e4a5de21840fe11a5058
7
- data.tar.gz: a84b587275793166a7ac40d63607c8ea2f8dd340a3197ce782f901ca8c27de27b8eeef7f36fc9399aba286ddc399da109b2d14cd9db99b3c6a4a545e6ad9f21c
6
+ metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
7
+ data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,20 +2,25 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
 
8
8
  # Given the nature of changes to jobsearch websites,
9
9
  # don't rely upon this gem working in the near future.
10
10
 
11
11
 
12
+ class IndeedScraper2022Err < Exception
13
+ end
14
+
12
15
  class IndeedScraper2022
13
16
 
14
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
17
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
18
+ headless: true, cookies: nil, debug: false)
15
19
 
16
20
  @debug = debug
17
21
  @url_base, @q, @location = url, q, location
18
- @results = search
22
+ @headless, @cookies = headless, cookies
23
+ @results = search(q: @q, location: @location)
19
24
 
20
25
  end
21
26
 
@@ -25,7 +30,93 @@ class IndeedScraper2022
25
30
  @results
26
31
  end
27
32
 
33
+ def search(q: @q, location: @location, start: nil)
34
+
35
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
36
+
37
+ url = @url_base
38
+ url += 'start=' + start if start
39
+
40
+ browser = fw.browser
41
+ browser.goto(url)
42
+
43
+ if q.length > 1 then
44
+ input = browser.at_xpath("//input[@name='q']")
45
+ input.focus.type(q)
46
+ end
47
+
48
+ if location.length > 1 then
49
+ input2 = browser.at_xpath("//input[@name='l']")
50
+ input2.focus.type(location)
51
+ end
52
+
53
+ button = browser.at_xpath("//button[@type='submit']")
54
+ button.click
55
+
56
+ doc2 = Nokogiri::XML(browser.body)
57
+
58
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
59
+ puts 'a2: ' + a2.length.inspect if @debug
60
+
61
+ @a2 = a2.map {|x| Rexle.new x.to_s }
62
+
63
+ @a2.map do |doc|
64
+
65
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
66
+ "_container']/div[@class='slider_list']/div[@class='sl" \
67
+ "ider_item']/div[@class='job_seen_beacon']")
68
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
69
+ "dy/tr/td[@class='resultContent']")
70
+
71
+ # job title (e.g. Software Developer)
72
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
73
+ "class='jobTitle-color-purple']/span")&.text
74
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
75
+
76
+ salary = td.element("div[@class='metadataContainer']/" \
77
+ "div[@class='salary-snippet-container']/div[@class='sa" \
78
+ "lary-snippet']/span")&.text
79
+
80
+ puts 'salary: ' + salary.inspect if @debug
81
+ div1 = td.element("div[@class='companyInfo']")
82
+
83
+ # company name (e.g. Coda Octopus Products Ltd)
84
+ company_name = div1.element("span[@class='companyName']")&.text
85
+
86
+ # company location (e.g. Edinburgh)
87
+ location = div1.element("div[@class='companyLocation']")&.text
88
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
89
+
90
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
91
+ "v[@class='result-footer']")
92
+
93
+ # job (e.g. Our products are primarily written in C#, using...)
94
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
95
+
96
+ # visually (e.g. Posted 14 days ago)
97
+ dateposted = div3.element("span[@class='date']")&.texts
98
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
99
+
100
+ {
101
+ link: @url_base.sub(/\/[^\/]+$/,'') \
102
+ + doc.root.attributes[:href].gsub(/&amp;/,'&'),
103
+ title: jobtitle,
104
+ salary: salary,
105
+ company: company_name,
106
+ location: location,
107
+ jobsnippet: jobsnippet,
108
+ date: date
109
+ }
110
+
111
+ end
112
+ end
113
+
28
114
  def page(n)
115
+
116
+ if n < 1 or n > @results.length then
117
+ raise IndeedScraper2022Err, 'Invalid page no.'
118
+ end
119
+
29
120
  url = @results[n-1][:link]
30
121
  fetchjob(url)
31
122
  end
@@ -44,7 +135,7 @@ class IndeedScraper2022
44
135
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
45
136
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
46
137
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
47
- "er-title']").text
138
+ "er-title']")&.text
48
139
 
49
140
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
50
141
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
@@ -56,110 +147,74 @@ class IndeedScraper2022
56
147
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
57
148
  "s-mr--xs']")[1]
58
149
  clink = div3.element('//a')
59
- company = cname ? cname.text : clink.text
150
+ company = cname.text ? cname.text : clink.text
60
151
  companylink = clink.attributes[:href] if clink
61
152
 
153
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
154
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
62
155
  div5 = div3.xpath("div/div")
63
156
  location, worklocation = div5.map(&:text).compact
64
157
 
65
158
  # icl (e.g. Full-time, Permanent)
66
159
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
67
160
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
68
- jobtype = jobtype.texts.join if jobtype
161
+ jobtype = jobtype&.texts.join if jobtype
69
162
 
70
163
  # jobsearch (e.g. Urgently needed)
71
164
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
72
165
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
73
- "h-DesktopTag-text']")
74
- jobnote1 = jobnote1.text if jobnote1
166
+ "h-DesktopTag-text']")&.text
75
167
 
76
168
  # jobsearch (e.g. 10 days ago)
77
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
78
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div").text
169
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
170
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
171
+ d = Date.today - days.to_i
172
+ datepost = d.strftime("%Y-%m-%d")
173
+
79
174
 
80
175
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
81
- "ass='jobsearch-jobDescriptionText']")
176
+ "ass='jobsearch-jobDescriptionText']").xml
82
177
 
83
178
  {
84
179
  title: jobtitle,
180
+ type: type,
85
181
  company: company,
86
182
  companylink: companylink,
87
183
  location: location,
184
+ salary: salary,
88
185
  worklocation: worklocation,
89
186
  note: jobnote1,
90
- date: (Date.today - datepost.to_i).to_s,
187
+ date: datepost,
91
188
  desc: jobdesc
92
189
  }
93
190
 
94
191
  end
95
192
 
96
- def search(q='', location='')
97
-
98
- a = Mechanize.new
99
-
100
- page = a.get(@url_base)
101
- form = page.forms.first
102
- form.fields[0].value = @q
103
- form.fields[1].value = @location
104
- pg = form.submit
105
-
106
- doc2 = Nokogiri::XML(pg.body)
107
-
108
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
109
- puts 'a2: ' + a2.length.inspect if @debug
110
-
111
- @a2 = a2.map {|x| Rexle.new x.to_s }
112
-
113
- @a2.map do |doc|
114
-
115
- div = doc.element("a[@class='desktop']/div[@class='slider" \
116
- "_container']/div[@class='slider_list']/div[@class='sl" \
117
- "ider_item']/div[@class='job_seen_beacon']")
118
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
119
- "dy/tr/td[@class='resultContent']")
120
193
 
121
- # job title (e.g. Software Developer)
122
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
123
- "class='jobTitle-color-purple']/span").text
124
- puts 'jobtitle: ' + jobtitle.inspect if @debug
194
+ end
125
195
 
126
- salary = td.element("div[@class='metadataContainer']/" \
127
- "div[@class='salary-snippet-container']/div[@class='sa" \
128
- "lary-snippet']/span")
129
- salary = salary.text if salary
130
- puts 'salary: ' + salary.inspect if @debug
131
- div1 = td.element("div[@class='companyInfo']")
196
+ class IS22Plus < IndeedScraper2022
132
197
 
133
- # company name (e.g. Coda Octopus Products Ltd)
134
- company_name = div1.element("span[@class='companyName']").text
198
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
199
+ super(q: q, location: location, headless: headless, cookies: cookies,
200
+ debug: debug)
201
+ end
135
202
 
136
- # company location (e.g. Edinburgh)
137
- location = div1.element("div[@class='companyLocation']").text
138
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
203
+ def archive()
139
204
 
140
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
141
- "v[@class='result-footer']")
205
+ 1.upto(15).each do |n|
206
+ page(n)
207
+ end
142
208
 
143
- # job (e.g. Our products are primarily written in C#, using...)
144
- jobsnippet = div3.element("div[@class='job-snippet']/ul/li").text
209
+ end
145
210
 
146
- # visually (e.g. Posted 14 days ago)
147
- dateposted = div3.element("span[@class='date']").texts
148
- date = (Date.today - dateposted.first.to_i).to_s
211
+ def list()
149
212
 
150
- {
151
- link: @url_base.sub(/\/[^\/]+$/,'') \
152
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
153
- title: jobtitle,
154
- salary: salary,
155
- company: company_name,
156
- location: location,
157
- jobsnippet: jobsnippet,
158
- date: date
159
- }
213
+ @results.map.with_index do |x,i|
214
+ "%2d. %s" % [i+1,x[:title]]
215
+ end.join("\n")
160
216
 
161
- end
162
217
  end
163
- end
164
218
 
165
219
 
220
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-01-25 00:00:00.000000000 Z
38
+ date: 2022-03-30 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -43,42 +43,42 @@ dependencies:
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '0.5'
46
+ version: '0.7'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 0.5.5
49
+ version: 0.7.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
- version: '0.5'
56
+ version: '0.7'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 0.5.5
59
+ version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
- email: james@jamesrobertson.eu
81
+ email: digital.robertson@gmail.com
82
82
  executables: []
83
83
  extensions: []
84
84
  extra_rdoc_files: []
@@ -96,15 +96,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
96
96
  requirements:
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
- version: '0'
99
+ version: 2.3.0
100
100
  required_rubygems_version: !ruby/object:Gem::Requirement
101
101
  requirements:
102
102
  - - ">="
103
103
  - !ruby/object:Gem::Version
104
104
  version: '0'
105
105
  requirements: []
106
- rubyforge_project:
107
- rubygems_version: 2.7.10
106
+ rubygems_version: 3.2.22
108
107
  signing_key:
109
108
  specification_version: 4
110
109
  summary: Attempts to scrape the indeed.com jobsearch results (1 page).
metadata.gz.sig CHANGED
Binary file