jobparser 0.13.9 → 0.13.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jobparser/facets/title.rb +51 -18
- data/lib/jobparser/parser.rb +11 -11
- data/lib/jobparser/parseschema.rb +1 -5
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +1 -1
- metadata +2 -2
@@ -1,45 +1,78 @@
|
|
1
1
|
module JobParser
|
2
2
|
module Facets
|
3
3
|
class Title < Facet
|
4
|
+
def initialize(*args)
|
5
|
+
super(*args)
|
6
|
+
@scorer = Scorer.new
|
7
|
+
end
|
8
|
+
|
4
9
|
def parse
|
5
10
|
special_case_result = use_special_case(:title)
|
6
11
|
return special_case_result unless special_case_result.nil?
|
7
12
|
|
8
|
-
|
9
|
-
page_title = @doc.at_css("title").content
|
10
|
-
title_scorer.store_and_score(page_title, 10)
|
13
|
+
score_page_title
|
11
14
|
|
12
15
|
# first see if we find something with a matching id
|
13
16
|
loop_over_elements do |name, elem|
|
14
|
-
next if elem
|
17
|
+
next if elem_not_suitable_as_title?(elem)
|
15
18
|
|
16
19
|
content = Cleaner.strip_all_white_space(elem.content)
|
17
20
|
|
18
21
|
# does the element have an id that means it might store the title?
|
19
|
-
|
22
|
+
elem_has_job_title_id_score(elem, content)
|
20
23
|
|
21
24
|
# or if a heading element matches the page title
|
22
|
-
|
23
|
-
title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
|
24
|
-
end
|
25
|
+
elem_heading_matches_page_title(elem, content)
|
25
26
|
|
26
|
-
|
27
|
+
# if it has some common words that feature in job titles
|
28
|
+
elem_matches_job_title_words(content)
|
27
29
|
|
28
|
-
|
29
|
-
|
30
|
-
next_content = Cleaner.strip_all_white_space(elem.next_element.content)
|
31
|
-
title_scorer.store(next_content, 30).if_block_true {
|
32
|
-
ACCEPTED_ELEMENTS.include?(elem.next_element.name)
|
33
|
-
}
|
34
|
-
end
|
35
|
-
}
|
30
|
+
# if it's the title, get the content of the next element
|
31
|
+
elem_is_vacancy_title(elem, content)
|
36
32
|
end
|
37
33
|
|
38
|
-
clean_title(
|
34
|
+
clean_title(@scorer.top_match.strip.gsub(NBSP, ""))
|
39
35
|
end
|
40
36
|
|
41
37
|
private
|
42
38
|
|
39
|
+
def elem_is_vacancy_title(elem, content)
|
40
|
+
VACANCY_TITLE_REGEX.match(content) {
|
41
|
+
if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
|
42
|
+
next_content = Cleaner.strip_all_white_space(elem.next_element.content)
|
43
|
+
@scorer.store(next_content, 30).if_block_true {
|
44
|
+
ACCEPTED_ELEMENTS.include?(elem.next_element.name)
|
45
|
+
}
|
46
|
+
end
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
def elem_matches_job_title_words(content)
|
51
|
+
@scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
|
52
|
+
end
|
53
|
+
|
54
|
+
def elem_heading_matches_page_title(elem, content)
|
55
|
+
if elem_is_heading?(elem.name)
|
56
|
+
@scorer.store(content, 40).if_block_true { page_title.include?(content) }
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def elem_has_job_title_id_score(elem, content)
|
61
|
+
@scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
|
62
|
+
end
|
63
|
+
|
64
|
+
def page_title
|
65
|
+
@doc.at_css("title").content
|
66
|
+
end
|
67
|
+
|
68
|
+
def score_page_title
|
69
|
+
@scorer.store_and_score(page_title, 10)
|
70
|
+
end
|
71
|
+
|
72
|
+
def elem_not_suitable_as_title?(elem)
|
73
|
+
elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
|
74
|
+
end
|
75
|
+
|
43
76
|
def elem_is_heading?(name)
|
44
77
|
%w{h1 h2 h3 h4 h5}.include?(name)
|
45
78
|
end
|
data/lib/jobparser/parser.rb
CHANGED
@@ -12,18 +12,18 @@ module JobParser
|
|
12
12
|
|
13
13
|
def job
|
14
14
|
if JobParser.cache.valid_for_url?(@url)
|
15
|
-
|
15
|
+
JobParser.cache.fetch_result_for_url(@url)
|
16
|
+
else
|
17
|
+
{ :url => @url,
|
18
|
+
:salary => job_salary,
|
19
|
+
:title => job_title,
|
20
|
+
:apply => apply_link,
|
21
|
+
:salary_string => job_salary_string,
|
22
|
+
:location => job_location,
|
23
|
+
:deadline => deadline,
|
24
|
+
:postcode => job_postcode
|
25
|
+
}
|
16
26
|
end
|
17
|
-
|
18
|
-
{ :url => @url,
|
19
|
-
:salary => job_salary,
|
20
|
-
:title => job_title,
|
21
|
-
:apply => apply_link,
|
22
|
-
:salary_string => job_salary_string,
|
23
|
-
:location => job_location,
|
24
|
-
:deadline => deadline,
|
25
|
-
:postcode => job_postcode
|
26
|
-
}
|
27
27
|
end
|
28
28
|
|
29
29
|
private
|
@@ -96,11 +96,7 @@ module JobParser
|
|
96
96
|
|
97
97
|
def get_content_at_prop(prop)
|
98
98
|
elem = find_with_itemprop(prop)
|
99
|
-
|
100
|
-
Cleaner.strip_all_white_space(elem.content)
|
101
|
-
else
|
102
|
-
""
|
103
|
-
end
|
99
|
+
elem ? Cleaner.strip_all_white_space(elem.content) : ""
|
104
100
|
end
|
105
101
|
|
106
102
|
def is_content_at_prop?(prop)
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.9
|
4
|
+
version: 0.13.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|