indeed_scraper2022 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
4
- data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
3
+ metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
4
+ data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
5
5
  SHA512:
6
- metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
7
- data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
6
+ metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
7
+ data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
checksums.yaml.gz.sig CHANGED
@@ -1,3 +1,2 @@
1
- t0��KhQ��,������=��=�|�S\�����(�6듾&�L��H�c ��_oTP��eO�֔�
2
- ��r�����"e4l:��HLы�B�"��p�"
3
-
1
+ }ә���T��*��-ԣ��xe�. �x�<>X � JX~���)���-zi���w��Lɜ!n��
2
+ ˾/�x��0V��"�����䭴B��J�� Vt��<���3�8LW���R���kK��������hÅ��y���{<��IJC�:D�>�F%�8��mXҘ��AwO�GL����ޚZ,!"l��k����&�/� �'����8V�]r#�qZ�Vm!�� ����
@@ -5,6 +5,7 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
@@ -44,9 +45,10 @@ class IndeedScraper2022
44
45
  end
45
46
 
46
47
  def search(q: @q, location: @location, start: nil)
47
-
48
+ puts 'inside search' if @debug
48
49
  url = @url_base
49
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
50
52
 
51
53
  @browser.goto(url)
52
54
  #@browser.network.wait_for_idle
@@ -81,28 +83,44 @@ class IndeedScraper2022
81
83
  sleep 2
82
84
 
83
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
84
87
 
85
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
86
89
  puts 'a2: ' + a2.length.inspect if @debug
87
90
 
88
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
89
92
 
90
93
  @results = @a2.map do |doc|
91
94
 
92
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
93
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
94
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
95
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
96
100
  "dy/tr/td[@class='resultContent']")
97
101
 
98
102
  # job title (e.g. Software Developer)
99
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
100
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
101
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
102
109
 
103
- salary = td.element("div[@class='metadataContainer']/" \
104
- "div[@class='salary-snippet-container']/div[@class='sa" \
105
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
106
124
 
107
125
  puts 'salary: ' + salary.inspect if @debug
108
126
  div1 = td.element("div[@class='companyInfo']")
@@ -120,7 +138,12 @@ class IndeedScraper2022
120
138
  "v[@class='result-footer']")
121
139
 
122
140
  # job (e.g. Our products are primarily written in C#, using...)
123
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
124
147
 
125
148
  # visually (e.g. Posted 14 days ago)
126
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -128,7 +151,7 @@ class IndeedScraper2022
128
151
 
129
152
  {
130
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
131
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ + href.gsub(/&amp;/,'&'),
132
155
  title: jobtitle,
133
156
  salary: salary,
134
157
  company: company_name,
@@ -239,7 +262,7 @@ class IS22Plus < IndeedScraper2022
239
262
 
240
263
  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
241
264
  super(q: q, location: location, headless: headless, cookies: cookies,
242
- debug: debug)
265
+ debug: true)
243
266
  end
244
267
 
245
268
  # note: The most efficient method to accumulate vacancy articles is to
@@ -265,7 +288,7 @@ class IS22Plus < IndeedScraper2022
265
288
 
266
289
  puts 'saving ' + item[:title] if @debug
267
290
  puts 'link: ' + item[:link].inspect
268
- links = RXFReader.reveal(item[:link])
291
+ links = URL.reveal(item[:link])
269
292
  puts 'links: ' + links.inspect if @debug
270
293
 
271
294
  url = links.last
@@ -326,8 +349,6 @@ class IS22Archive
326
349
 
327
350
  def initialize(filepath='/tmp/indeed', debug: false)
328
351
 
329
- @debug = debug
330
-
331
352
  FileUtils.mkdir_p filepath
332
353
  @idxfile = File.join(filepath, 'index.yml')
333
354
 
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-16 00:00:00.000000000 Z
38
+ date: 2022-05-12 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -77,6 +77,26 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: reveal_url22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
metadata.gz.sig CHANGED
Binary file