jobparser 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,30 +5,16 @@ require "jobparser/regex"
5
5
  module JobParser
6
6
  class Cleaner
7
7
 
8
- SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
9
- CLEAN_SALARY_REGEX = /,|\s/
10
-
11
- def initialize(ary, opts = {})
12
- @subject = ary
13
- @type = opts[:type]
14
- end
15
-
16
- def clean
17
- clean_array
8
+ def self.strip_all_white_space(str)
9
+ Cleaner.clean_text(Cleaner.remove_nbsp(str.gsub("\n", ""))).strip # "\n" needs double quotes; '\n' is a backslash + n
18
10
  end
19
11
 
20
- def self.clean_salary(salary_str)
21
- SALARY_GROUP_REGEX.match(salary_str.gsub(CLEAN_SALARY_REGEX, "")) { |match|
22
- [match[1].to_i, match[2].to_i]
23
- }
24
- end
25
-
26
- def self.strip_string(str)
27
- str.gsub('/n', '').gsub(NBSP, '').strip
12
+ def self.clean_text(str)
13
+ str.gsub(/\r|\t/, "").gsub(NBSP, " ")
28
14
  end
29
15
 
30
- def self.clean_plain_text(str)
31
- str.gsub(/\r|\t/, "").gsub(NBSP, " ")
16
+ def self.remove_nbsp(str)
17
+ str.gsub(NBSP, "")
32
18
  end
33
19
 
34
20
  def self.make_link_absolute(url, href)
@@ -45,22 +31,5 @@ module JobParser
45
31
  end
46
32
  end
47
33
 
48
- private
49
-
50
- def clean_array
51
- @subject.select { |item|
52
- not_whitespace_or_empty(item)
53
- }.map { |item|
54
- clean_string(item) if item.is_a?(String)
55
- }.uniq
56
- end
57
-
58
- def clean_string(str)
59
- self.class.strip_string(str)
60
- end
61
-
62
- def not_whitespace_or_empty(item)
63
- /^\s+$/.match(item) == nil && !item.empty?
64
- end
65
34
  end
66
35
  end
@@ -0,0 +1,18 @@
1
+ module JobParser
2
+ module Facets
3
+ class Apply < Facet
4
+ def parse
5
+ link = nil
6
+ elements["a"].each do |anchor|
7
+ APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
8
+ end
9
+ if link
10
+ Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
11
+ else
12
+ @url
13
+ end
14
+ end
15
+
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,41 @@
1
+ module JobParser
2
+ module Facets
3
+ class Facet
4
+
5
+ ACCEPTED_ELEMENTS = %w{p a h1 h2 h3 h4 h5 span dl dd dt td}
6
+
7
+ attr_reader :doc, :url, :plain_text
8
+
9
+ def initialize(doc, url, plain_text)
10
+ @doc = doc
11
+ @url = url
12
+ @plain_text = plain_text
13
+ end
14
+
15
+ private
16
+
17
+ def use_special_case(name)
18
+ if special_case = SpecialCases.case_for_url(@url)
19
+ special_case[name].call(@doc)
20
+ end
21
+ end
22
+
23
+ def loop_over_elements(&block)
24
+ elements.each do |name, elems|
25
+ elems.each do |elem|
26
+ yield name, elem
27
+ end
28
+ end
29
+ end
30
+
31
+ def elements
32
+ {}.tap do |response|
33
+ ACCEPTED_ELEMENTS.each do |elem|
34
+ response[elem] = @doc.css(elem).to_a
35
+ end
36
+ end
37
+ end
38
+
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,15 @@
1
+ module JobParser
2
+ module Facets
3
+ class Location < Facet
4
+ def parse
5
+ special_case_result = use_special_case(:location)
6
+ return special_case_result unless special_case_result.nil?
7
+
8
+ LOCATION_REGEX.match(Cleaner.clean_text(@plain_text)) { |m|
9
+ Cleaner.strip_all_white_space(m[1].to_s)
10
+ } || ""
11
+ end
12
+
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,20 @@
1
+ module JobParser
2
+ module Facets
3
+ class Salary < Facet
4
+ def parse
5
+ special_case_result = use_special_case(:salary)
6
+ return special_case_result unless special_case_result.nil?
7
+
8
+ # SALARY_REGEX only reads @plain_text, which is the same for every element,
9
+ # so match it once instead of once per accepted element in the document.
10
+ salary = ""
11
+ SALARY_REGEX.match(@plain_text) { |m|
12
+ salary = m.to_s
13
+ }
14
+ SALARY_GROUP_REGEX.match(salary.gsub(CLEAN_SALARY_REGEX, "")) { |match|
15
+ [match[1].to_i, match[2].to_i]
16
+ }
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,18 @@
1
+ module JobParser
2
+ module Facets
3
+ class SalaryString < Facet
4
+ def parse
5
+ special_case_result = use_special_case(:salary_string)
6
+ return special_case_result unless special_case_result.nil?
7
+
8
+ # SALARY_STRING_REGEX only reads @plain_text, which is the same for every
9
+ # element, so match it once instead of once per accepted element.
10
+ salary = ""
11
+ SALARY_STRING_REGEX.match(@plain_text) { |m|
12
+ salary = m.to_s
13
+ }
14
+ Cleaner.strip_all_white_space(salary)
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,46 @@
1
+ module JobParser
2
+ module Facets
3
+ class Title < Facet
4
+ def parse
5
+ special_case_result = use_special_case(:title)
6
+ return special_case_result unless special_case_result.nil?
7
+
8
+ title_scorer = Scorer.new
9
+ page_title = @doc.at_css("title").content
10
+ title_scorer.store(page_title, 20).and_score_now
11
+
12
+ # first see if we find something with a matching id
13
+ loop_over_elements do |name, elem|
14
+ # check the ID of the elements for matches
15
+ next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
16
+ content = Cleaner.strip_all_white_space(elem.content)
17
+ title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
18
+
19
+ # or if a heading element matches the page title
20
+ if elem_is_heading?(name)
21
+ title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
22
+ end
23
+
24
+ title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
25
+
26
+ VACANCY_TITLE_REGEX.match(content) {
27
+ if elem.next_element && !Cleaner.strip_all_white_space(elem.next_element.content).empty?
28
+ next_content = Cleaner.strip_all_white_space(elem.next_element.content)
29
+ title_scorer.store(next_content, 30).if_block_true {
30
+ ACCEPTED_ELEMENTS.include?(elem.next_element.name)
31
+ }
32
+ end
33
+ }
34
+ end
35
+
36
+ title_scorer.top_match.strip.gsub(NBSP, "")
37
+ end
38
+
39
+ private
40
+
41
+ def elem_is_heading?(name)
42
+ %w{h1 h2 h3 h4 h5}.include?(name)
43
+ end
44
+ end
45
+ end
46
+ end
@@ -1,5 +1,4 @@
1
1
  # encoding: utf-8
2
- require 'jobparser/regex.rb'
3
2
  require "nokogiri"
4
3
  module JobParser
5
4
  class ParseHtml
@@ -26,15 +25,6 @@ module JobParser
26
25
 
27
26
  private
28
27
 
29
- def job_location
30
- special_case_result = use_special_case(:location)
31
- return special_case_result unless special_case_result.nil?
32
-
33
- LOCATION_REGEX.match(Cleaner.clean_plain_text(@plain_text)) { |m|
34
- Cleaner.strip_string(m[1].to_s)
35
- } || ""
36
- end
37
-
38
28
  def strip_html
39
29
  doc = @doc.dup
40
30
  blacklist = ['title', 'script', 'style', 'button']
@@ -45,119 +35,24 @@ module JobParser
45
35
  nodelist.text
46
36
  end
47
37
 
48
- def loop_over_elements(&block)
49
- elements.each do |name, elems|
50
- elems.each do |elem|
51
- yield name, elem
52
- end
53
- end
54
- end
55
-
56
- def clean_array(ary, type = nil)
57
- Cleaner.new(ary, :type => type).clean
58
- end
59
-
60
- def use_special_case(name)
61
- if special_case = SpecialCases.case_for_url(@url)
62
- special_case[name].call(@doc)
63
- end
38
+ def job_location
39
+ Facets::Location.new(@doc, @url, @plain_text).parse
64
40
  end
65
41
 
66
42
  def job_salary_string
67
- special_case_result = use_special_case(:salary_string)
68
- return special_case_result unless special_case_result.nil?
69
-
70
- salary = ""
71
- loop_over_elements do |name, elem|
72
- SALARY_STRING_REGEX.match(@plain_text) { |m|
73
- salary = m.to_s
74
- }
75
- end
76
- Cleaner.strip_string(salary)
43
+ Facets::SalaryString.new(@doc, @url, @plain_text).parse
77
44
  end
78
45
 
79
46
  def job_salary
80
- special_case_result = use_special_case(:salary)
81
- return special_case_result unless special_case_result.nil?
82
-
83
- salary = ""
84
- loop_over_elements do |name, elem|
85
- SALARY_REGEX.match(@plain_text) { |m|
86
- salary = m.to_s
87
- }
88
- end
89
- salary.empty? ? nil : Cleaner.clean_salary(salary)
47
+ Facets::Salary.new(@doc, @url, @plain_text).parse
90
48
  end
91
49
 
92
50
  def job_title
93
- special_case_result = use_special_case(:title)
94
- return special_case_result unless special_case_result.nil?
95
-
96
- title_scorer = Scorer.new
97
- page_title = @doc.at_css("title").content
98
- title_scorer.store(page_title, 20).and_score_now
99
-
100
- # http://stackoverflow.com/questions/4476047/how-to-make-nokogiri-not-to-convert-nbsp-to-space
101
- nbsp = Nokogiri::HTML("&nbsp;").text
102
-
103
- # first see if we find something with a matching id
104
- loop_over_elements do |name, elem|
105
- # check the ID of the elements for matches
106
- next if elem.content == "" || elem.content.split(" ").length > 10 || elem.content.strip.empty?
107
- content = Cleaner.strip_string(elem.content)
108
- title_scorer.store(content, 60).if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
109
-
110
- # or if a heading element matches the page title
111
- if elem_is_heading?(name)
112
- title_scorer.store(content, 40).if_block_true { page_title.include?(content) }
113
- end
114
-
115
- title_scorer.store(content, 20).if_regex_match(JOB_TITLE_WORDS, content)
116
-
117
- VACANCY_TITLE_REGEX.match(content) {
118
- if elem.next_element && !Cleaner.strip_string(elem.next_element.content).empty?
119
- next_content = Cleaner.strip_string(elem.next_element.content)
120
- title_scorer.store(next_content, 30).if_block_true {
121
- ACCEPTED_ELEMENTS.include?(elem.next_element.name)
122
- }
123
- end
124
- }
125
- end
126
-
127
- title_scorer.top_match.strip.gsub(nbsp, "")
51
+ Facets::Title.new(@doc, @url, @plain_text).parse
128
52
  end
129
53
 
130
54
  def apply_link
131
- link = nil
132
- anchor_elements.each do |anchor|
133
- APPLY_LINK_REGEX.match(anchor.content) { link = anchor }
134
- end
135
- if link
136
- Cleaner.make_link_absolute(@url, link.attributes["href"].to_s.gsub(" ", "%20"))
137
- else
138
- @url
139
- end
140
- end
141
-
142
-
143
- def elem_is_heading?(name)
144
- %w{h1 h2 h3 h4 h5}.include?(name)
145
- end
146
-
147
- def heading_elements
148
- elements.select { |elem| elem_is_heading?(elem) }
149
- end
150
-
151
- def anchor_elements
152
- elements["a"]
153
- end
154
-
155
- def elements
156
- {}.tap do |response|
157
- ACCEPTED_ELEMENTS.each do |elem|
158
- response[elem] = doc.css(elem).to_a
159
- end
160
- end
55
+ Facets::Apply.new(@doc, @url, @plain_text).parse
161
56
  end
162
57
  end
163
58
  end
@@ -9,6 +9,8 @@ module JobParser
9
9
  APPLY_LINK_REGEX = /^apply|submit an application|application form/i
10
10
  NBSP = Nokogiri::HTML("&nbsp;").text
11
11
  LOCATION_REGEX = /(?:location: )([\D]*)$/i
12
+ SALARY_GROUP_REGEX = /£([\d,]*)(?:.+)£([\d,]*)/
13
+ CLEAN_SALARY_REGEX = /,|\s/
12
14
 
13
15
  # words commonly used in job listings - not sure if this is a good way to go but I think it's worth a go
14
16
  # could scope this regex just to headers
@@ -21,7 +21,7 @@ module JobParser
21
21
  listings = doc.css(".jobViewSummary dl dt")
22
22
  listings.each do |dt|
23
23
  if dt.content == "Salary"
24
- salary = Cleaner.strip_string(dt.next_element.content)
24
+ salary = Cleaner.remove_nbsp(dt.next_element.content)
25
25
  break
26
26
  end
27
27
  end
@@ -1,3 +1,3 @@
1
1
  module JobParser
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/jobparser.rb CHANGED
@@ -4,6 +4,12 @@ require "jobparser/parseurl"
4
4
  require "jobparser/cleaner"
5
5
  require "jobparser/scorer"
6
6
  require "jobparser/specialcases"
7
+ require "jobparser/facets/facet"
8
+ require "jobparser/facets/salary"
9
+ require "jobparser/facets/salarystring"
10
+ require "jobparser/facets/location"
11
+ require "jobparser/facets/apply"
12
+ require "jobparser/facets/title"
7
13
 
8
14
  module JobParser
9
15
  # Your code goes here...
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jobparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-23 00:00:00.000000000 Z
12
+ date: 2013-07-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -99,6 +99,12 @@ extensions: []
99
99
  extra_rdoc_files: []
100
100
  files:
101
101
  - lib/jobparser/cleaner.rb
102
+ - lib/jobparser/facets/apply.rb
103
+ - lib/jobparser/facets/facet.rb
104
+ - lib/jobparser/facets/location.rb
105
+ - lib/jobparser/facets/salary.rb
106
+ - lib/jobparser/facets/salarystring.rb
107
+ - lib/jobparser/facets/title.rb
102
108
  - lib/jobparser/parsehtml.rb
103
109
  - lib/jobparser/parseurl.rb
104
110
  - lib/jobparser/regex.rb