jobparser 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,30 +5,16 @@ require "jobparser/regex"
5
5
  module JobParser
6
6
  class Cleaner
7
7
 
8
- SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
9
- CLEAN_SALARY_REGEX = /,|\s/
10
-
11
- def initialize(ary, opts = {})
12
- @subject = ary
13
- @type = opts[:type]
14
- end
15
-
16
- def clean
17
- clean_array
8
# Removes newlines and non-breaking spaces from +str+, passes the result
# through Cleaner.clean_text, and trims leading/trailing whitespace.
# Ordinary spaces inside the string are left in place.
#
# str - the String to clean.
#
# Returns a new String.
def self.strip_all_white_space(str)
  # The newline must be written with double quotes: a single-quoted '\n'
  # is a literal backslash followed by "n", so line breaks were never removed.
  without_newlines = str.gsub("\n", "")
  Cleaner.clean_text(Cleaner.remove_nbsp(without_newlines)).strip
end
19
11
 
20
- def self.clean_salary(salary_str)
21
- SALARY_GROUP_REGEX.match(salary_str.gsub(CLEAN_SALARY_REGEX, "")) { |match|
22
- [match[1].to_i, match[2].to_i]
23
- }
24
- end
25
-
26
- def self.strip_string(str)
27
- str.gsub('/n', '').gsub(NBSP, '').strip
12
# Drops carriage returns and tabs from +str+ and turns every occurrence of
# NBSP into an ordinary space.
#
# str - the String to clean.
#
# Returns a new String; +str+ itself is not modified.
def self.clean_text(str)
  without_control_chars = str.gsub(/\r|\t/, "")
  without_control_chars.gsub(NBSP, " ")
end
29
15
 
30
- def self.clean_plain_text(str)
31
- str.gsub(/\r|\t/, "").gsub(NBSP, " ")
16
# Deletes every occurrence of NBSP from +str+.
#
# str - the String to clean.
#
# Returns a new String; +str+ itself is not modified.
def self.remove_nbsp(str)
  # Block form of gsub: each match is replaced by the empty string.
  str.gsub(NBSP) { "" }
end
33
19
 
34
20
  def self.make_link_absolute(url, href)
@@ -45,22 +31,5 @@ module JobParser
45
31
  end
46
32
  end
47
33
 
48
- private
49
-
50
- def clean_array
51
- @subject.select { |item|
52
- not_whitespace_or_empty(item)
53
- }.map { |item|
54
- clean_string(item) if item.is_a?(String)
55
- }.uniq
56
- end
57
-
58
- def clean_string(str)
59
- self.class.strip_string(str)
60
- end
61
-
62
- def not_whitespace_or_empty(item)
63
- /^\s+$/.match(item) == nil && !item.empty?
64
- end
65
34
  end
66
35
  end
@@ -0,0 +1,18 @@
1
module JobParser
  module Facets
    # Finds the URL a candidate should follow to apply for the job.
    class Apply < Facet
      # Looks through the document's anchors for one whose text matches
      # APPLY_LINK_REGEX.
      #
      # Returns the matching anchor's href (spaces encoded as %20) passed
      # through Cleaner.make_link_absolute together with the page URL. When
      # several anchors match, the last one yielded by the CSS query is used.
      # Returns the page URL itself when no anchor matches.
      def parse
        apply_anchor = elements["a"].select { |anchor|
          APPLY_LINK_REGEX.match(anchor.content)
        }.last
        return @url unless apply_anchor

        href = apply_anchor.attributes["href"].to_s.gsub(" ", "%20")
        Cleaner.make_link_absolute(@url, href)
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module JobParser
  module Facets
    # Base class for the individual extractors ("facets"). It stores the
    # parsed document, the page URL and the page's plain text, and gives
    # subclasses private helpers for special-case lookup and element
    # iteration.
    class Facet

      # Tag names whose elements are collected by #elements.
      ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}.freeze

      attr_reader :doc, :url, :plain_text

      # doc        - the parsed document; must respond to #css.
      # url        - the URL the document was loaded from.
      # plain_text - the text of the page as supplied by the caller.
      def initialize(doc, url, plain_text)
        @doc = doc
        @url = url
        @plain_text = plain_text
      end

      private

      # Runs the URL-specific handler registered under +name+, if any.
      #
      # name - the key identifying the facet (e.g. :title, :salary).
      #
      # Returns the handler's result, or nil when the URL has no special case
      # or the special case defines no handler for +name+. The second guard
      # matters: without it a site that only overrides some facets would
      # raise NoMethodError on nil for the others.
      def use_special_case(name)
        special_case = SpecialCases.case_for_url(@url)
        return nil unless special_case

        handler = special_case[name]
        handler.call(@doc) if handler
      end

      # Yields every collected element together with its tag name.
      #
      # Yields the tag name String and the element.
      def loop_over_elements
        elements.each do |name, elems|
          elems.each { |elem| yield name, elem }
        end
      end

      # Queries the document once per accepted tag name.
      #
      # Returns a Hash mapping each tag name in ACCEPTED_ELEMENTS to an Array
      # of the elements the document's #css returns for it.
      def elements
        ACCEPTED_ELEMENTS.each_with_object({}) do |name, found|
          found[name] = @doc.css(name).to_a
        end
      end

    end
  end
end
@@ -0,0 +1,15 @@
1
module JobParser
  module Facets
    # Extracts the job location from the page's plain text.
    class Location < Facet
      # Returns the special-case result when the URL has one. Otherwise
      # returns the first capture group of LOCATION_REGEX, matched against
      # the cleaned plain text and stripped of whitespace, or "" when the
      # regex does not match.
      def parse
        override = use_special_case(:location)
        return override unless override.nil?

        found = LOCATION_REGEX.match(Cleaner.clean_text(@plain_text))
        return "" unless found

        Cleaner.strip_all_white_space(found[1].to_s)
      end

    end
  end
end
@@ -0,0 +1,20 @@
1
module JobParser
  module Facets
    # Extracts the numeric salary range from the page's plain text.
    class Salary < Facet
      # Returns the special-case result when the URL has one. Otherwise
      # returns a two-element Array of Integers holding the two pound amounts
      # captured by SALARY_GROUP_REGEX, in the order they appear, or nil when
      # no salary range is found.
      def parse
        override = use_special_case(:salary)
        return override unless override.nil?

        # The match depends only on the plain text, so it is done once rather
        # than once per document element. The old per-element loop also made
        # the result depend on the document containing an accepted element.
        found = SALARY_REGEX.match(@plain_text)
        return nil unless found

        # Commas and whitespace are removed first so the captured digit runs
        # convert cleanly with to_i.
        SALARY_GROUP_REGEX.match(found.to_s.gsub(CLEAN_SALARY_REGEX, "")) do |bounds|
          [bounds[1].to_i, bounds[2].to_i]
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module JobParser
  module Facets
    # Extracts the salary as human-readable text from the page's plain text.
    class SalaryString < Facet
      # Returns the special-case result when the URL has one. Otherwise
      # returns the text matched by SALARY_STRING_REGEX, stripped of
      # whitespace, or "" when the regex does not match.
      def parse
        override = use_special_case(:salary_string)
        return override unless override.nil?

        # The match depends only on the plain text, so it is done once rather
        # than once per document element. nil.to_s yields "" when nothing
        # matched.
        found = SALARY_STRING_REGEX.match(@plain_text)
        Cleaner.strip_all_white_space(found.to_s)
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
module JobParser
  module Facets
    # Picks the most likely job title by scoring candidate strings taken
    # from the document.
    class Title < Facet
      # Returns the special-case result when the URL has one. Otherwise
      # returns the scorer's top match, stripped and with NBSP removed.
      def parse
        special_case_result = use_special_case(:title)
        return special_case_result unless special_case_result.nil?

        title_scorer = Scorer.new
        # A document without a <title> element used to raise NoMethodError
        # on nil here; an empty page title is used instead.
        title_node = @doc.at_css("title")
        page_title = title_node ? title_node.content : ""
        title_scorer.store(page_title, 20).and_score_now

        loop_over_elements do |name, elem|
          raw_content = elem.content
          # skip blank elements and anything longer than ten words
          next if raw_content.strip.empty? || raw_content.split(" ").length > 10
          content = Cleaner.strip_all_white_space(raw_content)

          # an element whose id looks like a job-title id
          title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)

          # or a heading element whose text appears in the page title
          if elem_is_heading?(name)
            title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
          end

          # or text containing words commonly used in job titles
          title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)

          # or the element following a label that matches VACANCY_TITLE_REGEX
          VACANCY_TITLE_REGEX.match(content) do
            sibling = elem.next_element
            next_content = sibling ? Cleaner.strip_all_white_space(sibling.content) : ""
            unless next_content.empty?
              title_scorer.store(next_content, 30).if_block_true {
                ACCEPTED_ELEMENTS.include?(sibling.name)
              }
            end
          end
        end

        title_scorer.top_match.strip.gsub(NBSP, "")
      end

      private

      # Returns true when +name+ is one of the heading tags h1 to h5.
      def elem_is_heading?(name)
        %w{h1 h2 h3 h4 h5}.include?(name)
      end
    end
  end
end
@@ -1,5 +1,4 @@
1
1
  # encoding: utf-8
2
- require 'jobparser/regex.rb'
3
2
  require "nokogiri"
4
3
  module JobParser
5
4
  class ParseHtml
@@ -26,15 +25,6 @@ module JobParser
26
25
 
27
26
  private
28
27
 
29
- def job_location
30
- special_case_result = use_special_case(:location)
31
- return special_case_result unless special_case_result.nil?
32
-
33
- LOCATION_REGEX.match(Cleaner.clean_plain_text(@plain_text)) { |m|
34
- Cleaner.strip_string(m[1].to_s)
35
- } || ""
36
- end
37
-
38
28
  def strip_html
39
29
  doc = @doc.dup
40
30
  blacklist = ['title', 'script', 'style', 'button']
@@ -45,119 +35,24 @@ module JobParser
45
35
  nodelist.text
46
36
  end
47
37
 
48
- def loop_over_elements(&block)
49
- elements.each do |name, elems|
50
- elems.each do |elem|
51
- yield name, elem
52
- end
53
- end
54
- end
55
-
56
- def clean_array(ary, type = nil)
57
- Cleaner.new(ary, :type => type).clean
58
- end
59
-
60
- def use_special_case(name)
61
- if special_case = SpecialCases.case_for_url(@url)
62
- special_case[name].call(@doc)
63
- end
38
# Delegates to Facets::Location, built from the current document, URL and
# plain text, and returns the result of its parse.
def job_location
  facet = Facets::Location.new(@doc, @url, @plain_text)
  facet.parse
end
65
41
 
66
42
  def job_salary_string
67
- special_case_result = use_special_case(:salary_string)
68
- return special_case_result unless special_case_result.nil?
69
-
70
- salary = ""
71
- loop_over_elements do |name, elem|
72
- SALARY_STRING_REGEX.match(@plain_text) { |m|
73
- salary = m.to_s
74
- }
75
- end
76
- Cleaner.strip_string(salary)
43
+ Facets::SalaryString.new(@doc, @url, @plain_text).parse
77
44
  end
78
45
 
79
46
  def job_salary
80
- special_case_result = use_special_case(:salary)
81
- return special_case_result unless special_case_result.nil?
82
-
83
- salary = ""
84
- loop_over_elements do |name, elem|
85
- SALARY_REGEX.match(@plain_text) { |m|
86
- salary = m.to_s
87
- }
88
- end
89
- salary.empty? ? nil : Cleaner.clean_salary(salary)
47
+ Facets::Salary.new(@doc, @url, @plain_text).parse
90
48
  end
91
49
 
92
50
  def job_title
93
- special_case_result = use_special_case(:title)
94
- return special_case_result unless special_case_result.nil?
95
-
96
- title_scorer = Scorer.new
97
- page_title = @doc.at_css("title").content
98
- title_scorer.store(page_title, 20).and_score_now
99
-
100
- # http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
101
- nbsp = Nokogiri::HTML("&nbsp;").text
102
-
103
- # first see if we find something with a matching id
104
- loop_over_elements do |name, elem|
105
- # check the ID of the elements for matches
106
- next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
107
- content = Cleaner.strip_string(elem.content)
108
- title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
109
-
110
- # or if a heading element matches the page title
111
- if elem_is_heading?(name)
112
- title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
113
- end
114
-
115
- title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
116
-
117
- VACANCY_TITLE_REGEX.match(content) {
118
- if elem.next_element && !Cleaner.strip_string(elem.next_element.content).empty?
119
- next_content = Cleaner.strip_string(elem.next_element.content)
120
- title_scorer.store(next_content, 30).if_block_true {
121
- ACCEPTED_ELEMENTS.include?(elem.next_element.name)
122
- }
123
- end
124
- }
125
- end
126
-
127
- title_scorer.top_match.strip.gsub(nbsp, "")
51
+ Facets::Title.new(@doc, @url, @pplain_text).parse
128
52
  end
129
53
 
130
54
  def apply_link
131
- link = nil
132
- anchor_elements.each do |anchor|
133
- APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
134
- end
135
- if link
136
- Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
137
- else
138
- @url
139
- end
140
- end
141
-
142
-
143
- def elem_is_heading?(name)
144
- %w{h1 h2 h3 h4 h5}.include?(name)
145
- end
146
-
147
- def heading_elements
148
- elements.select { |elem| elem_is_heading?(elem) }
149
- end
150
-
151
- def anchor_elements
152
- elements["a"]
153
- end
154
-
155
- def elements
156
- {}.tap do |response|
157
- ACCEPTED_ELEMENTS.each do |elem|
158
- response[elem] = doc.css(elem).to_a
159
- end
160
- end
55
+ Facets::Apply.new(@doc, @url, @plain_text).parse
161
56
  end
162
57
  end
163
58
  end
@@ -9,6 +9,8 @@ module JobParser
9
9
  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
10
10
  NBSP = Nokogiri::HTML("&nbsp;").text
11
11
  LOCATION_REGEX = /(?:location: )([\D]*)$/i
12
+ SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
13
+ CLEAN_SALARY_REGEX = /,|\s/
12
14
 
13
15
  # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
14
16
  # could scope this regex just to headers
@@ -21,7 +21,7 @@ module JobParser
21
21
  listings = doc.css(".jobViewSummary dl dt")
22
22
  listings.each do |dt|
23
23
  if dt.content == "Salary"
24
- salary = Cleaner.strip_string(dt.next_element.content)
24
+ salary = Cleaner.remove_nbsp(dt.next_element.content)
25
25
  break
26
26
  end
27
27
  end
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -4,6 +4,12 @@ require "jobparser/parseurl"
4
4
  require "jobparser/cleaner"
5
5
  require "jobparser/scorer"
6
6
  require "jobparser/specialcases"
7
+ require "jobparser/facets/facet"
8
+ require "jobparser/facets/salary"
9
+ require "jobparser/facets/salarystring"
10
+ require "jobparser/facets/location"
11
+ require "jobparser/facets/apply"
12
+ require "jobparser/facets/title"
7
13
 
8
14
  module JobParser
9
15
  # Your code goes here...
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-23 00:00:00.000000000 Z
12
+ date: 2013-07-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -99,6 +99,12 @@ extensions: []
99
99
  extra_rdoc_files: []
100
100
  files:
101
101
  - lib/jobparser/cleaner.rb
102
+ - lib/jobparser/facets/apply.rb
103
+ - lib/jobparser/facets/facet.rb
104
+ - lib/jobparser/facets/location.rb
105
+ - lib/jobparser/facets/salary.rb
106
+ - lib/jobparser/facets/salarystring.rb
107
+ - lib/jobparser/facets/title.rb
102
108
  - lib/jobparser/parsehtml.rb
103
109
  - lib/jobparser/parseurl.rb
104
110
  - lib/jobparser/regex.rb