jobparser 0.13.9 → 0.13.10

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,45 +1,78 @@
1
1
  module JobParser
2
2
  module Facets
3
3
  class Title < Facet
4
+ def initialize(*args)
5
+ super(*args)
6
+ @scorer = Scorer.new
7
+ end
8
+
4
9
  def parse
5
10
  special_case_result = use_special_case(:title)
6
11
  return special_case_result unless special_case_result.nil?
7
12
 
8
- title_scorer = Scorer.new
9
- page_title = @doc.at_css("title").content
10
- title_scorer.store_and_score(page_title, 10)
13
+ score_page_title
11
14
 
12
15
  # first see if we find something with a matching id
13
16
  loop_over_elements do |name, elem|
14
- next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
17
+ next if elem_not_suitable_as_title?(elem)
15
18
 
16
19
  content = Cleaner.strip_all_white_space(elem.content)
17
20
 
18
21
  # does the element have an id that means it might store the title?
19
- title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
22
+ elem_has_job_title_id_score(elem, content)
20
23
 
21
24
  # or if a heading element matches the page title
22
- if elem_is_heading?(name)
23
- title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
24
- end
25
+ elem_heading_matches_page_title(elem, content)
25
26
 
26
- title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
27
+ # if it has some common words that feature in job titles
28
+ elem_matches_job_title_words(content)
27
29
 
28
- VACANCY_TITLE_REGEX.match(content) {
29
- if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
30
- next_content = Cleaner.strip_all_white_space(elem.next_element.content)
31
- title_scorer.store(next_content, 30).if_block_true {
32
- ACCEPTED_ELEMENTS.include?(elem.next_element.name)
33
- }
34
- end
35
- }
30
+ # if it's the title, get the content of the next element
31
+ elem_is_vacancy_title(elem, content)
36
32
  end
37
33
 
38
- clean_title(title_scorer.top_match.strip.gsub(NBSP, ""))
34
+ clean_title(@scorer.top_match.strip.gsub(NBSP, ""))
39
35
  end
40
36
 
41
37
  private
42
38
 
39
+ def elem_is_vacancy_title(elem, content)
40
+ VACANCY_TITLE_REGEX.match(content) {
41
+ if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
42
+ next_content = Cleaner.strip_all_white_space(elem.next_element.content)
43
+ @scorer.store(next_content, 30).if_block_true {
44
+ ACCEPTED_ELEMENTS.include?(elem.next_element.name)
45
+ }
46
+ end
47
+ }
48
+ end
49
+
50
+ def elem_matches_job_title_words(content)
51
+ @scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
52
+ end
53
+
54
+ def elem_heading_matches_page_title(elem, content)
55
+ if elem_is_heading?(elem.name)
56
+ @scorer.store(content, 40).if_block_true { page_title.include?(content) }
57
+ end
58
+ end
59
+
60
+ def elem_has_job_title_id_score(elem, content)
61
+ @scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
62
+ end
63
+
64
+ def page_title
65
+ @doc.at_css("title").content
66
+ end
67
+
68
+ def score_page_title
69
+ @scorer.store_and_score(page_title, 10)
70
+ end
71
+
72
+ def elem_not_suitable_as_title?(elem)
73
+ elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
74
+ end
75
+
43
76
  def elem_is_heading?(name)
44
77
  %w{h1 h2 h3 h4 h5}.include?(name)
45
78
  end
@@ -12,18 +12,18 @@ module JobParser
12
12
 
13
13
  def job
14
14
  if JobParser.cache.valid_for_url?(@url)
15
- return JobParser.cache.fetch_result_for_url(@url)
15
+ JobParser.cache.fetch_result_for_url(@url)
16
+ else
17
+ { :url => @url,
18
+ :salary => job_salary,
19
+ :title => job_title,
20
+ :apply => apply_link,
21
+ :salary_string => job_salary_string,
22
+ :location => job_location,
23
+ :deadline => deadline,
24
+ :postcode => job_postcode
25
+ }
16
26
  end
17
-
18
- { :url => @url,
19
- :salary => job_salary,
20
- :title => job_title,
21
- :apply => apply_link,
22
- :salary_string => job_salary_string,
23
- :location => job_location,
24
- :deadline => deadline,
25
- :postcode => job_postcode
26
- }
27
27
  end
28
28
 
29
29
  private
@@ -96,11 +96,7 @@ module JobParser
96
96
 
97
97
  def get_content_at_prop(prop)
98
98
  elem = find_with_itemprop(prop)
99
- if elem
100
- Cleaner.strip_all_white_space(elem.content)
101
- else
102
- ""
103
- end
99
+ elem ? Cleaner.strip_all_white_space(elem.content) : ""
104
100
  end
105
101
 
106
102
  def is_content_at_prop?(prop)
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.13.9"
2
+ VERSION = "0.13.10"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -40,7 +40,7 @@ module JobParser
40
40
  ParseHtml.new(html, url)
41
41
  end
42
42
  rescue URI::InvalidURIError
43
- raise JobParser::Error::InvalidUrl, "The URI given was not valid"
43
+ raise JobParser::Error::InvalidUrl, "The URI given (\"#{url}\") was not valid"
44
44
  end
45
45
  end
46
46
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.9
4
+ version: 0.13.10
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-23 00:00:00.000000000 Z
12
+ date: 2013-08-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler