indeed_scraper2022 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50a484cf1a272522091413129241620336f12ca94d795b7ab132dd6911802d1c
4
- data.tar.gz: 06dffee1253aa5076da9b6897bc48009687e8df86aa1a3629ee1d8a4432fdd13
3
+ metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
4
+ data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
5
5
  SHA512:
6
- metadata.gz: 7327fc5bf9668c4f292eabf673574bfd7ca9fbf180133896c559ac1b7415d4ee880365302d24550aba4b83ee2a709a77bdb059145310e4a5de21840fe11a5058
7
- data.tar.gz: a84b587275793166a7ac40d63607c8ea2f8dd340a3197ce782f901ca8c27de27b8eeef7f36fc9399aba286ddc399da109b2d14cd9db99b3c6a4a545e6ad9f21c
6
+ metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
7
+ data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
checksums.yaml.gz.sig CHANGED
Binary file
@@ -2,20 +2,25 @@
2
2
 
3
3
  # file: indeed_scraper2022.rb
4
4
 
5
- require 'mechanize'
5
+ require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
 
8
8
  # Given the nature of changes to jobsearch websites,
9
9
  # don't rely upon this gem working in the near future.
10
10
 
11
11
 
12
+ class IndeedScraper2022Err < Exception
13
+ end
14
+
12
15
  class IndeedScraper2022
13
16
 
14
- def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
17
+ def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
18
+ headless: true, cookies: nil, debug: false)
15
19
 
16
20
  @debug = debug
17
21
  @url_base, @q, @location = url, q, location
18
- @results = search
22
+ @headless, @cookies = headless, cookies
23
+ @results = search(q: @q, location: @location)
19
24
 
20
25
  end
21
26
 
@@ -25,7 +30,93 @@ class IndeedScraper2022
25
30
  @results
26
31
  end
27
32
 
33
+ def search(q: @q, location: @location, start: nil)
34
+
35
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
36
+
37
+ url = @url_base
38
+ url += 'start=' + start if start
39
+
40
+ browser = fw.browser
41
+ browser.goto(url)
42
+
43
+ if q.length > 1 then
44
+ input = browser.at_xpath("//input[@name='q']")
45
+ input.focus.type(q)
46
+ end
47
+
48
+ if location.length > 1 then
49
+ input2 = browser.at_xpath("//input[@name='l']")
50
+ input2.focus.type(location)
51
+ end
52
+
53
+ button = browser.at_xpath("//button[@type='submit']")
54
+ button.click
55
+
56
+ doc2 = Nokogiri::XML(browser.body)
57
+
58
+ a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
59
+ puts 'a2: ' + a2.length.inspect if @debug
60
+
61
+ @a2 = a2.map {|x| Rexle.new x.to_s }
62
+
63
+ @a2.map do |doc|
64
+
65
+ div = doc.element("a[@class='desktop']/div[@class='slider" \
66
+ "_container']/div[@class='slider_list']/div[@class='sl" \
67
+ "ider_item']/div[@class='job_seen_beacon']")
68
+ td = div.element("table[@class='jobCard_mainContent']/tbo" \
69
+ "dy/tr/td[@class='resultContent']")
70
+
71
+ # job title (e.g. Software Developer)
72
+ jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
73
+ "class='jobTitle-color-purple']/span")&.text
74
+ puts 'jobtitle: ' + jobtitle.inspect if @debug
75
+
76
+ salary = td.element("div[@class='metadataContainer']/" \
77
+ "div[@class='salary-snippet-container']/div[@class='sa" \
78
+ "lary-snippet']/span")&.text
79
+
80
+ puts 'salary: ' + salary.inspect if @debug
81
+ div1 = td.element("div[@class='companyInfo']")
82
+
83
+ # company name (e.g. Coda Octopus Products Ltd)
84
+ company_name = div1.element("span[@class='companyName']")&.text
85
+
86
+ # company location (e.g. Edinburgh)
87
+ location = div1.element("div[@class='companyLocation']")&.text
88
+ tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
89
+
90
+ div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
91
+ "v[@class='result-footer']")
92
+
93
+ # job (e.g. Our products are primarily written in C#, using...)
94
+ jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
95
+
96
+ # visually (e.g. Posted 14 days ago)
97
+ dateposted = div3.element("span[@class='date']")&.texts
98
+ date = (Date.today - dateposted.first.to_i).to_s if dateposted
99
+
100
+ {
101
+ link: @url_base.sub(/\/[^\/]+$/,'') \
102
+ + doc.root.attributes[:href].gsub(/&amp;/,'&'),
103
+ title: jobtitle,
104
+ salary: salary,
105
+ company: company_name,
106
+ location: location,
107
+ jobsnippet: jobsnippet,
108
+ date: date
109
+ }
110
+
111
+ end
112
+ end
113
+
28
114
  def page(n)
115
+
116
+ if n < 1 or n > @results.length then
117
+ raise IndeedScraper2022Err, 'Invalid page no.'
118
+ end
119
+
29
120
  url = @results[n-1][:link]
30
121
  fetchjob(url)
31
122
  end
@@ -44,7 +135,7 @@ class IndeedScraper2022
44
135
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
45
136
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
46
137
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
47
- "er-title']").text
138
+ "er-title']")&.text
48
139
 
49
140
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
50
141
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
@@ -56,110 +147,74 @@ class IndeedScraper2022
56
147
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
57
148
  "s-mr--xs']")[1]
58
149
  clink = div3.element('//a')
59
- company = cname ? cname.text : clink.text
150
+ company = cname.text ? cname.text : clink.text
60
151
  companylink = clink.attributes[:href] if clink
61
152
 
153
+ salary = div1.element("//span[@class='attribute_snippet']")&.text
154
+ type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
62
155
  div5 = div3.xpath("div/div")
63
156
  location, worklocation = div5.map(&:text).compact
64
157
 
65
158
  # icl (e.g. Full-time, Permanent)
66
159
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
67
160
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
68
- jobtype = jobtype.texts.join if jobtype
161
+ jobtype = jobtype&.texts.join if jobtype
69
162
 
70
163
  # jobsearch (e.g. Urgently needed)
71
164
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
72
165
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
73
- "h-DesktopTag-text']")
74
- jobnote1 = jobnote1.text if jobnote1
166
+ "h-DesktopTag-text']")&.text
75
167
 
76
168
  # jobsearch (e.g. 10 days ago)
77
- datepost = e0.element("//div[@class='jobsearch-JobTab-con" \
78
- "tent']/div[@class='jobsearch-JobMetadataFooter']/div").text
169
+ days = e0.element("//div[@class='jobsearch-JobTab-con" \
170
+ "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
171
+ d = Date.today - days.to_i
172
+ datepost = d.strftime("%Y-%m-%d")
173
+
79
174
 
80
175
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
81
- "ass='jobsearch-jobDescriptionText']")
176
+ "ass='jobsearch-jobDescriptionText']").xml
82
177
 
83
178
  {
84
179
  title: jobtitle,
180
+ type: type,
85
181
  company: company,
86
182
  companylink: companylink,
87
183
  location: location,
184
+ salary: salary,
88
185
  worklocation: worklocation,
89
186
  note: jobnote1,
90
- date: (Date.today - datepost.to_i).to_s,
187
+ date: datepost,
91
188
  desc: jobdesc
92
189
  }
93
190
 
94
191
  end
95
192
 
96
- def search(q='', location='')
97
-
98
- a = Mechanize.new
99
-
100
- page = a.get(@url_base)
101
- form = page.forms.first
102
- form.fields[0].value = @q
103
- form.fields[1].value = @location
104
- pg = form.submit
105
-
106
- doc2 = Nokogiri::XML(pg.body)
107
-
108
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
109
- puts 'a2: ' + a2.length.inspect if @debug
110
-
111
- @a2 = a2.map {|x| Rexle.new x.to_s }
112
-
113
- @a2.map do |doc|
114
-
115
- div = doc.element("a[@class='desktop']/div[@class='slider" \
116
- "_container']/div[@class='slider_list']/div[@class='sl" \
117
- "ider_item']/div[@class='job_seen_beacon']")
118
- td = div.element("table[@class='jobCard_mainContent']/tbo" \
119
- "dy/tr/td[@class='resultContent']")
120
193
 
121
- # job title (e.g. Software Developer)
122
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
123
- "class='jobTitle-color-purple']/span").text
124
- puts 'jobtitle: ' + jobtitle.inspect if @debug
194
+ end
125
195
 
126
- salary = td.element("div[@class='metadataContainer']/" \
127
- "div[@class='salary-snippet-container']/div[@class='sa" \
128
- "lary-snippet']/span")
129
- salary = salary.text if salary
130
- puts 'salary: ' + salary.inspect if @debug
131
- div1 = td.element("div[@class='companyInfo']")
196
+ class IS22Plus < IndeedScraper2022
132
197
 
133
- # company name (e.g. Coda Octopus Products Ltd)
134
- company_name = div1.element("span[@class='companyName']").text
198
+ def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
199
+ super(q: q, location: location, headless: headless, cookies: cookies,
200
+ debug: debug)
201
+ end
135
202
 
136
- # company location (e.g. Edinburgh)
137
- location = div1.element("div[@class='companyLocation']").text
138
- tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
203
+ def archive()
139
204
 
140
- div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
141
- "v[@class='result-footer']")
205
+ 1.upto(15).each do |n|
206
+ page(n)
207
+ end
142
208
 
143
- # job (e.g. Our products are primarily written in C#, using...)
144
- jobsnippet = div3.element("div[@class='job-snippet']/ul/li").text
209
+ end
145
210
 
146
- # visually (e.g. Posted 14 days ago)
147
- dateposted = div3.element("span[@class='date']").texts
148
- date = (Date.today - dateposted.first.to_i).to_s
211
+ def list()
149
212
 
150
- {
151
- link: @url_base.sub(/\/[^\/]+$/,'') \
152
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
153
- title: jobtitle,
154
- salary: salary,
155
- company: company_name,
156
- location: location,
157
- jobsnippet: jobsnippet,
158
- date: date
159
- }
213
+ @results.map.with_index do |x,i|
214
+ "%2d. %s" % [i+1,x[:title]]
215
+ end.join("\n")
160
216
 
161
- end
162
217
  end
163
- end
164
218
 
165
219
 
220
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-01-25 00:00:00.000000000 Z
38
+ date: 2022-03-30 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -43,42 +43,42 @@ dependencies:
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '0.5'
46
+ version: '0.7'
47
47
  - - ">="
48
48
  - !ruby/object:Gem::Version
49
- version: 0.5.5
49
+ version: 0.7.0
50
50
  type: :runtime
51
51
  prerelease: false
52
52
  version_requirements: !ruby/object:Gem::Requirement
53
53
  requirements:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
- version: '0.5'
56
+ version: '0.7'
57
57
  - - ">="
58
58
  - !ruby/object:Gem::Version
59
- version: 0.5.5
59
+ version: 0.7.0
60
60
  - !ruby/object:Gem::Dependency
61
- name: mechanize
61
+ name: ferrumwizard
62
62
  requirement: !ruby/object:Gem::Requirement
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '2.8'
66
+ version: '0.2'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 2.8.4
69
+ version: 0.2.2
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '2.8'
76
+ version: '0.2'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 2.8.4
79
+ version: 0.2.2
80
80
  description:
81
- email: james@jamesrobertson.eu
81
+ email: digital.robertson@gmail.com
82
82
  executables: []
83
83
  extensions: []
84
84
  extra_rdoc_files: []
@@ -96,15 +96,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
96
96
  requirements:
97
97
  - - ">="
98
98
  - !ruby/object:Gem::Version
99
- version: '0'
99
+ version: 2.3.0
100
100
  required_rubygems_version: !ruby/object:Gem::Requirement
101
101
  requirements:
102
102
  - - ">="
103
103
  - !ruby/object:Gem::Version
104
104
  version: '0'
105
105
  requirements: []
106
- rubyforge_project:
107
- rubygems_version: 2.7.10
106
+ rubygems_version: 3.2.22
108
107
  signing_key:
109
108
  specification_version: 4
110
109
  summary: Attempts to scrape the indeed.com jobsearch results (1 page).
metadata.gz.sig CHANGED
Binary file