jobparser 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,14 +6,23 @@ module JobParser
6
6
  return special_case_result unless special_case_result.nil?
7
7
 
8
8
  salary = ""
9
- loop_over_elements do |name, elem|
10
- SALARY_REGEX.match(@plain_text) { |m|
11
- salary = m.to_s
9
+ SALARY_REGEX.match(@plain_text) { |m|
10
+ salary = m.to_s
11
+ }
12
+ final_salary = nil
13
+ if salary && !salary.empty?
14
+ p "got slaary"
15
+ p salary
16
+ SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
17
+ final_salary = [match[1].to_i, match[2].to_i]
18
+ }
19
+ else
20
+ SALARY_UP_TO_REGEX.match(@plain_text) { |m|
21
+ final_salary = [nil, m[3].gsub(",","").to_i]
12
22
  }
13
23
  end
14
- SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
15
- [match[1].to_i, match[2].to_i]
16
- } || nil
24
+
25
+ final_salary
17
26
  end
18
27
  end
19
28
  end
@@ -6,11 +6,12 @@ module JobParser
6
6
  return special_case_result unless special_case_result.nil?
7
7
 
8
8
  salary = ""
9
- loop_over_elements do |name, elem|
10
- SALARY_STRING_REGEX.match(@plain_text) { |m|
11
- salary = m.to_s
12
- }
9
+ SALARY_STRING_REGEX.match(@plain_text) { |m| salary = m.to_s }
10
+
11
+ if salary.empty?
12
+ SALARY_UP_TO_REGEX.match(@plain_text) { |m| salary = m.to_s }
13
13
  end
14
+
14
15
  Cleaner.strip_all_white_space(salary)
15
16
  end
16
17
  end
@@ -7,13 +7,15 @@ module JobParser
7
7
 
8
8
  title_scorer = Scorer.new
9
9
  page_title = @doc.at_css("title").content
10
- title_scorer.store(page_title, 20).and_score_now
10
+ title_scorer.store_and_score(page_title, 20)
11
11
 
12
12
  # first see if we find something with a matching id
13
13
  loop_over_elements do |name, elem|
14
- # check the ID of the elements for matches
15
14
  next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
15
+
16
16
  content = Cleaner.strip_all_white_space(elem.content)
17
+
18
+ # does the element have an id that means it might store the title?
17
19
  title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
18
20
 
19
21
  # or if a heading element matches the page title
@@ -4,13 +4,13 @@ module JobParser
4
4
  class ParseHtml
5
5
  ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
6
6
 
7
- attr_reader :doc
7
+ attr_reader :doc, :plain_text
8
8
 
9
9
  def initialize(html, from_url)
10
10
  @url = from_url
11
- @doc = Nokogiri::HTML(html)
11
+ @doc = strip_bad_elements(Nokogiri::HTML(html))
12
12
  @doc.css("br").each { |br| br.replace "\n" }
13
- @plain_text = strip_html
13
+ @plain_text = get_plain_text
14
14
  end
15
15
 
16
16
  def job
@@ -25,13 +25,18 @@ module JobParser
25
25
 
26
26
  private
27
27
 
28
- def strip_html
28
+ def strip_bad_elements(doc)
29
+ blacklist = ['script', 'style', 'button']
30
+ blacklist.each do |tag|
31
+ doc.xpath("//#{tag}").remove
32
+ end
33
+ doc
34
+ end
35
+
36
+ def get_plain_text
29
37
  doc = @doc.dup
30
38
  blacklist = ['title', 'script', 'style', 'button']
31
39
  nodelist = doc.search('//text()')
32
- blacklist.each do |tag|
33
- nodelist -= doc.search('//' + tag + '/text()')
34
- end
35
40
  nodelist.text
36
41
  end
37
42
 
@@ -3,6 +3,7 @@ require 'nokogiri'
3
3
  module JobParser
4
4
  SALARY_REGEX = /£[\d,]*(?:.+)£[\d,]*/
5
5
  SALARY_STRING_REGEX = /£[\d,]*.+£[\d,]*(\s.*$)?/
6
+ SALARY_UP_TO_REGEX = /(up to)(.+)£([\d,]*)/
6
7
  SALARY_TITLE_REGEX = /salary|\srate/i
7
8
  VACANCY_TITLE_REGEX = /vacancy|job title/i
8
9
  JOB_TITLE_ID_REGEX = /job(.?)title|title/i
@@ -24,6 +24,11 @@ module JobParser
24
24
  def top_match
25
25
  @matches.select { |k, v| v.score > 0 }.max_by { |k, v| v.score }.first
26
26
  end
27
+
28
+ def store_and_score(str, worth)
29
+ store(str, worth).and_score_now
30
+ end
31
+
27
32
  end
28
33
 
29
34
  class Match
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-24 00:00:00.000000000 Z
12
+ date: 2013-07-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler