jobparser 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,14 +6,23 @@ module JobParser
6
6
  return special_case_result unless special_case_result.nil?
7
7
 
8
8
  salary = ""
9
- loop_over_elements do |name, elem|
10
- SALARY_REGEX.match(@plain_text) { |m|
11
- salary = m.to_s
9
+ SALARY_REGEX.match(@plain_text) { |m|
10
+ salary = m.to_s
11
+ }
12
+ final_salary = nil
13
+ if salary && !salary.empty?
14
+ p "got slaary"
15
+ p salary
16
+ SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
17
+ final_salary = [match[1].to_i, match[2].to_i]
18
+ }
19
+ else
20
+ SALARY_UP_TO_REGEX.match(@plain_text) { |m|
21
+ final_salary = [nil, m[3].gsub(",","").to_i]
12
22
  }
13
23
  end
14
- SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
15
- [match[1].to_i, match[2].to_i]
16
- } || nil
24
+
25
+ final_salary
17
26
  end
18
27
  end
19
28
  end
@@ -6,11 +6,12 @@ module JobParser
6
6
  return special_case_result unless special_case_result.nil?
7
7
 
8
8
  salary = ""
9
- loop_over_elements do |name, elem|
10
- SALARY_STRING_REGEX.match(@plain_text) { |m|
11
- salary = m.to_s
12
- }
9
+ SALARY_STRING_REGEX.match(@plain_text) { |m| salary = m.to_s }
10
+
11
+ if salary.empty?
12
+ SALARY_UP_TO_REGEX.match(@plain_text) { |m| salary = m.to_s }
13
13
  end
14
+
14
15
  Cleaner.strip_all_white_space(salary)
15
16
  end
16
17
  end
@@ -7,13 +7,15 @@ module JobParser
7
7
 
8
8
  title_scorer = Scorer.new
9
9
  page_title = @doc.at_css("title").content
10
- title_scorer.store(page_title, 20).and_score_now
10
+ title_scorer.store_and_score(page_title, 20)
11
11
 
12
12
  # first see if we find something with a matching id
13
13
  loop_over_elements do |name, elem|
14
- # check the ID of the elements for matches
15
14
  next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
15
+
16
16
  content = Cleaner.strip_all_white_space(elem.content)
17
+
18
+ # does the element have an id that means it might store the title?
17
19
  title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
18
20
 
19
21
  # or if a heading element matches the page title
@@ -4,13 +4,13 @@ module JobParser
4
4
  class ParseHtml
5
5
  ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
6
6
 
7
- attr_reader :doc
7
+ attr_reader :doc, :plain_text
8
8
 
9
9
  def initialize(html, from_url)
10
10
  @url = from_url
11
- @doc = Nokogiri::HTML(html)
11
+ @doc = strip_bad_elements(Nokogiri::HTML(html))
12
12
  @doc.css("br").each { |br| br.replace "\n" }
13
- @plain_text = strip_html
13
+ @plain_text = get_plain_text
14
14
  end
15
15
 
16
16
  def job
@@ -25,13 +25,18 @@ module JobParser
25
25
 
26
26
  private
27
27
 
28
- def strip_html
28
+ def strip_bad_elements(doc)
29
+ blacklist = ['script', 'style', 'button']
30
+ blacklist.each do |tag|
31
+ doc.xpath("//#{tag}").remove
32
+ end
33
+ doc
34
+ end
35
+
36
+ def get_plain_text
29
37
  doc = @doc.dup
30
38
  blacklist = ['title', 'script', 'style', 'button']
31
39
  nodelist = doc.search('//text()')
32
- blacklist.each do |tag|
33
- nodelist -= doc.search('//' + tag + '/text()')
34
- end
35
40
  nodelist.text
36
41
  end
37
42
 
@@ -3,6 +3,7 @@ require 'nokogiri'
3
3
  module JobParser
4
4
  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
5
5
  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
6
+ SALARY_UP_TO_REGEX = /(up to)(.+)£([\d,]*)/
6
7
  SALARY_TITLE_REGEX = /salary|\srate/i
7
8
  VACANCY_TITLE_REGEX = /vacancy|job title/i
8
9
  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
@@ -24,6 +24,11 @@ module JobParser
24
24
  def top_match
25
25
  @matches.select { |k, v| v.score > 0 }.max_by { |k, v| v.score }.first
26
26
  end
27
+
28
+ def store_and_score(str, worth)
29
+ store(str, worth).and_score_now
30
+ end
31
+
27
32
  end
28
33
 
29
34
  class Match
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-24 00:00:00.000000000 Z
12
+ date: 2013-07-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler