jobparser 0.13.9 → 0.13.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,45 +1,78 @@
1
1
  module JobParser
2
2
  module Facets
3
3
  class Title < Facet
4
+ def initialize(*args)
5
+ super(*args)
6
+ @scorer = Scorer.new
7
+ end
8
+
4
9
  def parse
5
10
  special_case_result = use_special_case(:title)
6
11
  return special_case_result unless special_case_result.nil?
7
12
 
8
- title_scorer = Scorer.new
9
- page_title = @doc.at_css("title").content
10
- title_scorer.store_and_score(page_title, 10)
13
+ score_page_title
11
14
 
12
15
  # first see if we find something with a matching id
13
16
  loop_over_elements do |name, elem|
14
- next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
17
+ next if elem_not_suitable_as_title?(elem)
15
18
 
16
19
  content = Cleaner.strip_all_white_space(elem.content)
17
20
 
18
21
  # does the element have an id that means it might store the title?
19
- title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
22
+ elem_has_job_title_id_score(elem, content)
20
23
 
21
24
  # or if a heading element matches the page title
22
- if elem_is_heading?(name)
23
- title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
24
- end
25
+ elem_heading_matches_page_title(elem, content)
25
26
 
26
- title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
27
+ # if it has some common words that feature in job titles
28
+ elem_matches_job_title_words(content)
27
29
 
28
- VACANCY_TITLE_REGEX.match(content) {
29
- if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
30
- next_content = Cleaner.strip_all_white_space(elem.next_element.content)
31
- title_scorer.store(next_content, 30).if_block_true {
32
- ACCEPTED_ELEMENTS.include?(elem.next_element.name)
33
- }
34
- end
35
- }
30
+ # if it's the title, get the content of the next element
31
+ elem_is_vacancy_title(elem, content)
36
32
  end
37
33
 
38
- clean_title(title_scorer.top_match.strip.gsub(NBSP, ""))
34
+ clean_title(@scorer.top_match.strip.gsub(NBSP, ""))
39
35
  end
40
36
 
41
37
  private
42
38
 
39
+ def elem_is_vacancy_title(elem, content)
40
+ VACANCY_TITLE_REGEX.match(content) {
41
+ if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
42
+ next_content = Cleaner.strip_all_white_space(elem.next_element.content)
43
+ @scorer.store(next_content, 30).if_block_true {
44
+ ACCEPTED_ELEMENTS.include?(elem.next_element.name)
45
+ }
46
+ end
47
+ }
48
+ end
49
+
50
+ def elem_matches_job_title_words(content)
51
+ @scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
52
+ end
53
+
54
+ def elem_heading_matches_page_title(elem, content)
55
+ if elem_is_heading?(elem.name)
56
+ @scorer.store(content, 40).if_block_true { page_title.include?(content) }
57
+ end
58
+ end
59
+
60
+ def elem_has_job_title_id_score(elem, content)
61
+ @scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
62
+ end
63
+
64
+ def page_title
65
+ @doc.at_css("title").content
66
+ end
67
+
68
+ def score_page_title
69
+ @scorer.store_and_score(page_title, 10)
70
+ end
71
+
72
+ def elem_not_suitable_as_title?(elem)
73
+ elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
74
+ end
75
+
43
76
  def elem_is_heading?(name)
44
77
  %w{h1 h2 h3 h4 h5}.include?(name)
45
78
  end
@@ -12,18 +12,18 @@ module JobParser
12
12
 
13
13
  def job
14
14
  if JobParser.cache.valid_for_url?(@url)
15
- return JobParser.cache.fetch_result_for_url(@url)
15
+ JobParser.cache.fetch_result_for_url(@url)
16
+ else
17
+ { :url => @url,
18
+ :salary => job_salary,
19
+ :title => job_title,
20
+ :apply => apply_link,
21
+ :salary_string => job_salary_string,
22
+ :location => job_location,
23
+ :deadline => deadline,
24
+ :postcode => job_postcode
25
+ }
16
26
  end
17
-
18
- { :url => @url,
19
- :salary => job_salary,
20
- :title => job_title,
21
- :apply => apply_link,
22
- :salary_string => job_salary_string,
23
- :location => job_location,
24
- :deadline => deadline,
25
- :postcode => job_postcode
26
- }
27
27
  end
28
28
 
29
29
  private
@@ -96,11 +96,7 @@ module JobParser
96
96
 
97
97
  def get_content_at_prop(prop)
98
98
  elem = find_with_itemprop(prop)
99
- if elem
100
- Cleaner.strip_all_white_space(elem.content)
101
- else
102
- ""
103
- end
99
+ elem ? Cleaner.strip_all_white_space(elem.content) : ""
104
100
  end
105
101
 
106
102
  def is_content_at_prop?(prop)
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.13.9"
2
+ VERSION = "0.13.10"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -40,7 +40,7 @@ module JobParser
40
40
  ParseHtml.new(html, url)
41
41
  end
42
42
  rescue URI::InvalidURIError
43
- raise JobParser::Error::InvalidUrl, "The URI given was not valid"
43
+ raise JobParser::Error::InvalidUrl, "The URI given (\"#{url}\") was not valid"
44
44
  end
45
45
  end
46
46
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.9
4
+ version: 0.13.10
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-23 00:00:00.000000000 Z
12
+ date: 2013-08-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler