indeed_scraper2022 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
4
- data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
3
+ metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
4
+ data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
5
5
  SHA512:
6
- metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
7
- data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
6
+ metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
7
+ data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
checksums.yaml.gz.sig CHANGED
@@ -1,3 +1,2 @@
1
- t0��KhQ��,������=��=�|�S\�����(�6듾&�L��H�c ��_oTP��eO�֔�
2
- ��r�����"e4l:��HLы�B�"��p�"
3
-
1
+ }ә���T��*��-ԣ��xe�. �x�<>X � JX~���)���-zi���w��Lɜ!n��
2
+ ˾/�x��0V��"�����䭴B��J�� Vt��<���3�8LW���R���kK��������hÅ��y���{<��IJC�:D�>�F%�8��mXҘ��AwO�GL����ޚZ,!"l��k����&�/� �'����8V�]r#�qZ�Vm!�� ����
@@ -5,6 +5,7 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
@@ -44,9 +45,10 @@ class IndeedScraper2022
44
45
  end
45
46
 
46
47
  def search(q: @q, location: @location, start: nil)
47
-
48
+ puts 'inside search' if @debug
48
49
  url = @url_base
49
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
50
52
 
51
53
  @browser.goto(url)
52
54
  #@browser.network.wait_for_idle
@@ -81,28 +83,44 @@ class IndeedScraper2022
81
83
  sleep 2
82
84
 
83
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
84
87
 
85
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
86
89
  puts 'a2: ' + a2.length.inspect if @debug
87
90
 
88
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
89
92
 
90
93
  @results = @a2.map do |doc|
91
94
 
92
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
93
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
94
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
95
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
96
100
  "dy/tr/td[@class='resultContent']")
97
101
 
98
102
  # job title (e.g. Software Developer)
99
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
100
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
101
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
102
109
 
103
- salary = td.element("div[@class='metadataContainer']/" \
104
- "div[@class='salary-snippet-container']/div[@class='sa" \
105
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
106
124
 
107
125
  puts 'salary: ' + salary.inspect if @debug
108
126
  div1 = td.element("div[@class='companyInfo']")
@@ -120,7 +138,12 @@ class IndeedScraper2022
120
138
  "v[@class='result-footer']")
121
139
 
122
140
  # job (e.g. Our products are primarily written in C#, using...)
123
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
124
147
 
125
148
  # visually (e.g. Posted 14 days ago)
126
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -128,7 +151,7 @@ class IndeedScraper2022
128
151
 
129
152
  {
130
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
131
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ + href.gsub(/&amp;/,'&'),
132
155
  title: jobtitle,
133
156
  salary: salary,
134
157
  company: company_name,
@@ -239,7 +262,7 @@ class IS22Plus < IndeedScraper2022
239
262
 
240
263
  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
241
264
  super(q: q, location: location, headless: headless, cookies: cookies,
242
- debug: debug)
265
+ debug: true)
243
266
  end
244
267
 
245
268
  # note: The most efficient method to accumulate vacancy articles is to
@@ -265,7 +288,7 @@ class IS22Plus < IndeedScraper2022
265
288
 
266
289
  puts 'saving ' + item[:title] if @debug
267
290
  puts 'link: ' + item[:link].inspect
268
- links = RXFReader.reveal(item[:link])
291
+ links = URL.reveal(item[:link])
269
292
  puts 'links: ' + links.inspect if @debug
270
293
 
271
294
  url = links.last
@@ -326,8 +349,6 @@ class IS22Archive
326
349
 
327
350
  def initialize(filepath='/tmp/indeed', debug: false)
328
351
 
329
- @debug = debug
330
-
331
352
  FileUtils.mkdir_p filepath
332
353
  @idxfile = File.join(filepath, 'index.yml')
333
354
 
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-16 00:00:00.000000000 Z
38
+ date: 2022-05-12 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -77,6 +77,26 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: reveal_url22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
metadata.gz.sig CHANGED
Binary file