jobparser 0.13.9 → 0.13.10
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/jobparser/facets/title.rb +51 -18
- data/lib/jobparser/parser.rb +11 -11
- data/lib/jobparser/parseschema.rb +1 -5
- data/lib/jobparser/version.rb +1 -1
- data/lib/jobparser.rb +1 -1
- metadata +2 -2
@@ -1,45 +1,78 @@
|
|
1
1
|
module JobParser
|
2
2
|
module Facets
|
3
3
|
class Title < Facet
|
4
|
+
def initialize(*args)
|
5
|
+
super(*args)
|
6
|
+
@scorer = Scorer.new
|
7
|
+
end
|
8
|
+
|
4
9
|
def parse
|
5
10
|
special_case_result = use_special_case(:title)
|
6
11
|
return special_case_result unless special_case_result.nil?
|
7
12
|
|
8
|
-
|
9
|
-
page_title = @doc.at_css("title").content
|
10
|
-
title_scorer.store_and_score(page_title, 10)
|
13
|
+
score_page_title
|
11
14
|
|
12
15
|
# first see if we find something with a matching id
|
13
16
|
loop_over_elements do |name, elem|
|
14
|
-
next if elem
|
17
|
+
next if elem_not_suitable_as_title?(elem)
|
15
18
|
|
16
19
|
content = Cleaner.strip_all_white_space(elem.content)
|
17
20
|
|
18
21
|
# does the element have an id that means it might store the title?
|
19
|
-
|
22
|
+
elem_has_job_title_id_score(elem, content)
|
20
23
|
|
21
24
|
# or if a heading element matches the page title
|
22
|
-
|
23
|
-
title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
|
24
|
-
end
|
25
|
+
elem_heading_matches_page_title(elem, content)
|
25
26
|
|
26
|
-
|
27
|
+
# if it has some common words that feature in job titles
|
28
|
+
elem_matches_job_title_words(content)
|
27
29
|
|
28
|
-
|
29
|
-
|
30
|
-
next_content = Cleaner.strip_all_white_space(elem.next_element.content)
|
31
|
-
title_scorer.store(next_content, 30).if_block_true {
|
32
|
-
ACCEPTED_ELEMENTS.include?(elem.next_element.name)
|
33
|
-
}
|
34
|
-
end
|
35
|
-
}
|
30
|
+
# if it's the title, get the content of the next element
|
31
|
+
elem_is_vacancy_title(elem, content)
|
36
32
|
end
|
37
33
|
|
38
|
-
clean_title(
|
34
|
+
clean_title(@scorer.top_match.strip.gsub(NBSP, ""))
|
39
35
|
end
|
40
36
|
|
41
37
|
private
|
42
38
|
|
39
|
+
def elem_is_vacancy_title(elem, content)
|
40
|
+
VACANCY_TITLE_REGEX.match(content) {
|
41
|
+
if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
|
42
|
+
next_content = Cleaner.strip_all_white_space(elem.next_element.content)
|
43
|
+
@scorer.store(next_content, 30).if_block_true {
|
44
|
+
ACCEPTED_ELEMENTS.include?(elem.next_element.name)
|
45
|
+
}
|
46
|
+
end
|
47
|
+
}
|
48
|
+
end
|
49
|
+
|
50
|
+
def elem_matches_job_title_words(content)
|
51
|
+
@scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
|
52
|
+
end
|
53
|
+
|
54
|
+
def elem_heading_matches_page_title(elem, content)
|
55
|
+
if elem_is_heading?(elem.name)
|
56
|
+
@scorer.store(content, 40).if_block_true { page_title.include?(content) }
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def elem_has_job_title_id_score(elem, content)
|
61
|
+
@scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
|
62
|
+
end
|
63
|
+
|
64
|
+
def page_title
|
65
|
+
@doc.at_css("title").content
|
66
|
+
end
|
67
|
+
|
68
|
+
def score_page_title
|
69
|
+
@scorer.store_and_score(page_title, 10)
|
70
|
+
end
|
71
|
+
|
72
|
+
def elem_not_suitable_as_title?(elem)
|
73
|
+
elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
|
74
|
+
end
|
75
|
+
|
43
76
|
def elem_is_heading?(name)
|
44
77
|
%w{h1 h2 h3 h4 h5}.include?(name)
|
45
78
|
end
|
data/lib/jobparser/parser.rb
CHANGED
@@ -12,18 +12,18 @@ module JobParser
|
|
12
12
|
|
13
13
|
def job
|
14
14
|
if JobParser.cache.valid_for_url?(@url)
|
15
|
-
|
15
|
+
JobParser.cache.fetch_result_for_url(@url)
|
16
|
+
else
|
17
|
+
{ :url => @url,
|
18
|
+
:salary => job_salary,
|
19
|
+
:title => job_title,
|
20
|
+
:apply => apply_link,
|
21
|
+
:salary_string => job_salary_string,
|
22
|
+
:location => job_location,
|
23
|
+
:deadline => deadline,
|
24
|
+
:postcode => job_postcode
|
25
|
+
}
|
16
26
|
end
|
17
|
-
|
18
|
-
{ :url => @url,
|
19
|
-
:salary => job_salary,
|
20
|
-
:title => job_title,
|
21
|
-
:apply => apply_link,
|
22
|
-
:salary_string => job_salary_string,
|
23
|
-
:location => job_location,
|
24
|
-
:deadline => deadline,
|
25
|
-
:postcode => job_postcode
|
26
|
-
}
|
27
27
|
end
|
28
28
|
|
29
29
|
private
|
@@ -96,11 +96,7 @@ module JobParser
|
|
96
96
|
|
97
97
|
def get_content_at_prop(prop)
|
98
98
|
elem = find_with_itemprop(prop)
|
99
|
-
|
100
|
-
Cleaner.strip_all_white_space(elem.content)
|
101
|
-
else
|
102
|
-
""
|
103
|
-
end
|
99
|
+
elem ? Cleaner.strip_all_white_space(elem.content) : ""
|
104
100
|
end
|
105
101
|
|
106
102
|
def is_content_at_prop?(prop)
|
data/lib/jobparser/version.rb
CHANGED
data/lib/jobparser.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jobparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.13.9
|
4
|
+
version: 0.13.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|