indeedparser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2516696c6e9a799fbc03e3c1189b9b6273649b2c
4
+ data.tar.gz: 22034fac47e8b9eaab688a1427264cc727b09dbf
5
+ SHA512:
6
+ metadata.gz: 815a153b6f9af990cba3ecc5061b8576a2b459b9a08b86fbf0ddd3f67e11c42824c449453846c43b2a64e0d43bc0e79be0f396761fd081ac96464b4b792f8a49
7
+ data.tar.gz: 71307ef7ad393cae5a201ea628a44b39f3763a40709336b1ade3cd08ca32a4f01a9cea277658b38476262da445cdfd7a9d31a625b71ec69187186327c6a6ff07
data/lib/awards.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the awards listed on a parsed resume page.
class Awards
  include Utilities

  # Scans +html+ (anything responding to +xpath+) for divs carrying the
  # "award-section" class token and stores one hash per match.
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' award-section ')]")

    @award_list = sections.map do |section|
      {
        award_title: award_title(section),
        award_date: award_date(section),
        award_description: award_description(section)
      }
    end
  end

  # Array of award hashes built by the constructor
  def get_awards
    @award_list
  end

  # Text of the award_title paragraph inside the given award node
  def award_title(award)
    title_nodes = award.xpath(".//p[@class='award_title']")
    title_nodes.text
  end

  # Text of the award_date paragraph, returned unparsed
  def award_date(award)
    date_nodes = award.xpath(".//p[@class='award_date']")
    date_nodes.text
  end

  # Text of the award_description paragraph
  def award_description(award)
    description_nodes = award.xpath(".//p[@class='award_description']")
    description_nodes.text
  end
end
@@ -0,0 +1,44 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the certifications listed on a parsed resume page.
class Certifications
  include Utilities

  # Scans +html+ for divs carrying the "certification-section" class token
  # and stores one hash per match.
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' certification-section ')]")

    @certification_list = sections.map do |section|
      {
        cert_title: cert_title(section),
        cert_description: cert_description(section),
        cert_start_date: cert_start_date(section),
        cert_end_date: cert_end_date(section)
      }
    end
  end

  # Array of certification hashes built by the constructor
  def get_certifications
    @certification_list
  end

  # Text of the certification_title paragraph
  def cert_title(certification)
    title_nodes = certification.xpath(".//p[@class='certification_title']")
    title_nodes.text
  end

  # Text of the certification_description paragraph
  def cert_description(certification)
    description_nodes = certification.xpath(".//p[@class='certification_description']")
    description_nodes.text
  end

  # First value parse_dates yields for the certification_date text
  def cert_start_date(certification)
    valid_from, _valid_until = parse_dates(cert_date_text(certification))
    valid_from
  end

  # Second value parse_dates yields for the certification_date text
  def cert_end_date(certification)
    _valid_from, valid_until = parse_dates(cert_date_text(certification))
    valid_until
  end

  private

  # Raw text of the certification_date paragraph
  def cert_date_text(certification)
    certification.xpath(".//p[@class='certification_date']").text
  end
end
data/lib/degrees.rb ADDED
@@ -0,0 +1,50 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the education entries listed on a parsed resume page.
class Degrees
  include Utilities

  # Scans +html+ for divs whose itemtype is the schema.org
  # EducationalOrganization URL and stores one hash per match.
  def initialize(html)
    entries = html.xpath("//div[@itemtype='http://schema.org/EducationalOrganization']")

    @degree_list = entries.map do |entry|
      {
        school: school(entry),
        degree_title: degree_title(entry),
        school_location: school_location(entry),
        degree_start_date: degree_start_date(entry),
        degree_end_date: degree_end_date(entry)
      }
    end
  end

  # Array of degree hashes built by the constructor
  def get_degrees
    @degree_list
  end

  # Text of the span marked itemprop='name'
  def school(degree)
    name_nodes = degree.xpath(".//span[@itemprop='name']")
    name_nodes.text
  end

  # Text of the edu_title paragraph
  def degree_title(degree)
    title_nodes = degree.xpath(".//p[@class='edu_title']")
    title_nodes.text
  end

  # Text of the span marked itemprop='addressLocality'
  def school_location(degree)
    locality_nodes = degree.xpath(".//span[@itemprop='addressLocality']")
    locality_nodes.text
  end

  # First value parse_dates yields for the edu_dates text
  def degree_start_date(degree)
    began, _finished = parse_dates(degree_date_text(degree))
    began
  end

  # Second value parse_dates yields for the edu_dates text
  def degree_end_date(degree)
    _began, finished = parse_dates(degree_date_text(degree))
    finished
  end

  private

  # Raw text of the edu_dates paragraph
  def degree_date_text(degree)
    degree.xpath(".//p[@class='edu_dates']").text
  end
end
data/lib/groups.rb ADDED
@@ -0,0 +1,44 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the group memberships listed on a parsed resume page.
class Groups
  include Utilities

  # Scans +html+ for divs carrying the "group-section" class token and
  # stores one hash per match.
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' group-section ')]")

    @group_list = sections.map do |section|
      {
        group_title: group_title(section),
        group_description: group_description(section),
        group_start_date: group_start_date(section),
        group_end_date: group_end_date(section)
      }
    end
  end

  # Array of group hashes built by the constructor
  def get_groups
    @group_list
  end

  # Text of the group_title paragraph
  def group_title(group)
    title_nodes = group.xpath(".//p[@class='group_title']")
    title_nodes.text
  end

  # Text of the group_description paragraph
  def group_description(group)
    description_nodes = group.xpath(".//p[@class='group_description']")
    description_nodes.text
  end

  # First value parse_dates yields for the group_date text
  def group_start_date(group)
    joined, _left = parse_dates(group_date_text(group))
    joined
  end

  # Second value parse_dates yields for the group_date text
  def group_end_date(group)
    _joined, left = parse_dates(group_date_text(group))
    left
  end

  private

  # Raw text of the group_date paragraph
  def group_date_text(group)
    group.xpath(".//p[@class='group_date']").text
  end
end
@@ -0,0 +1,33 @@
1
+ require 'requestmanager'
2
+ require 'json'
3
+ load 'personal_info.rb'
4
+ load 'jobs.rb'
5
+
6
# Entry point of the gem: parses one resume page and emits one JSON record
# per job, each enriched with the profile-level data and crawler fields.
class IndeedParser
  # html           - raw HTML of the resume page
  # url            - address the page was fetched from
  # crawler_fields - hash of extra fields merged into every output row
  #                  (nil is treated as an empty hash)
  #
  # Parsing happens immediately, inside the constructor.
  def initialize(html, url, crawler_fields)
    @html = html
    @url = url
    @crawler_fields = crawler_fields
    parse
  end

  # Parse profile: fills @personal_info and @job_info from the stored HTML
  def parse
    @personal_info = PersonalInfo.new(@html, @url).get_personal_info
    @job_info = Jobs.new(@html).get_jobs
  end

  # Get output: a pretty-printed JSON array with one object per job.
  #
  # On key collisions personal info overrides job fields, and crawler fields
  # override both. The non-destructive merge keeps the stored job hashes
  # untouched, so calling this method does not alter the parser's state.
  def get_results_by_job
    extra_fields = @crawler_fields || {}

    rows = @job_info.map do |job|
      job.merge(@personal_info).merge(extra_fields)
    end

    JSON.pretty_generate(rows)
  end
end
33
+
data/lib/jobs.rb ADDED
@@ -0,0 +1,58 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the work-experience entries of a resume page.
class Jobs
  include Utilities

  # html - raw HTML string; it is parsed here with Nokogiri::HTML.
  #
  # Stores one hash per div carrying the "work-experience-section" class
  # token.
  def initialize(html)
    @html = Nokogiri::HTML(html)

    sections = @html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' work-experience-section ')]")

    @job_info = sections.map do |section|
      {
        job_title: job_title(section),
        company: company(section),
        company_location: company_location(section),
        job_description: job_description(section),
        start_date: start_date(section),
        end_date: end_date(section)
      }
    end
  end

  # Return job info: array of job hashes built by the constructor
  def get_jobs
    @job_info
  end

  # Get job title
  def job_title(job)
    job.xpath(".//p[@class='work_title title']").text
  end

  # Get company: text of the first span inside the work_company div.
  # Returns "" when the entry has no such span, matching what the other
  # text getters return for missing nodes, instead of raising on nil.
  def company(job)
    first_span = job.xpath(".//div[@class='work_company']//span").first
    first_span ? first_span.text : ""
  end

  # Get work location
  def company_location(job)
    job.xpath(".//div[@class='work_company']//div[@class='inline-block']//span").text
  end

  # Get job description
  def job_description(job)
    job.xpath(".//p[@class='work_description']").text
  end

  # Get start date: first value parse_dates yields for the work_dates text
  def start_date(job)
    parse_dates(job.xpath(".//p[@class='work_dates']").text)[0]
  end

  # Get end date: second value parse_dates yields for the work_dates text
  def end_date(job)
    parse_dates(job.xpath(".//p[@class='work_dates']").text)[1]
  end
end
data/lib/links.rb ADDED
@@ -0,0 +1,32 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the links a person lists on their resume page.
class Links
  include Utilities

  # Scans +html+ for divs carrying the "link-section" class token and
  # stores one hash per match.
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' link-section ')]")

    @link_list = sections.map do |section|
      {
        link_title: link_title(section),
        link_url: link_url(section)
      }
    end
  end

  # Return link info: array of link hashes built by the constructor
  def get_links
    @link_list
  end

  # Get title of link: text of the anchors inside the section
  def link_title(link)
    link.xpath(".//a").text
  end

  # Get link url: href of the first anchor in the section.
  # Returns nil when the section contains no anchor instead of raising.
  def link_url(link)
    anchor = link.xpath(".//a").first
    anchor ? anchor['href'] : nil
  end
end
@@ -0,0 +1,65 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the military service entries listed on a parsed resume page.
class MilitaryService
  include Utilities

  # Scans +html+ for divs carrying the "military-section" class token and
  # stores one hash per match.
  def initialize(html)
    entries = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' military-section ')]")

    @military_service = entries.map do |entry|
      {
        military_country: military_country(entry),
        military_branch: military_branch(entry),
        military_rank: military_rank(entry),
        military_description: military_description(entry),
        military_commendations: military_commendations(entry),
        military_start_date: military_start_date(entry),
        military_end_date: military_end_date(entry)
      }
    end
  end

  # Array of service hashes built by the constructor
  def get_military_service
    @military_service
  end

  # Country paragraph text with its span text removed
  def military_country(mil_item)
    text_without_span(mil_item, "military_country")
  end

  # Branch paragraph text with its span text removed
  def military_branch(mil_item)
    text_without_span(mil_item, "military_branch")
  end

  # Rank paragraph text with its span text removed
  def military_rank(mil_item)
    text_without_span(mil_item, "military_rank")
  end

  # Text of the military_description paragraph
  def military_description(mil_item)
    description_nodes = mil_item.xpath(".//p[@class='military_description']")
    description_nodes.text
  end

  # Text of the military_commendations paragraph
  def military_commendations(mil_item)
    commendation_nodes = mil_item.xpath(".//p[@class='military_commendations']")
    commendation_nodes.text
  end

  # First value parse_dates yields for the military_date text
  def military_start_date(mil_item)
    enlisted, _discharged = parse_dates(mil_item.xpath(".//p[@class='military_date']").text)
    enlisted
  end

  # Second value parse_dates yields for the military_date text
  def military_end_date(mil_item)
    _enlisted, discharged = parse_dates(mil_item.xpath(".//p[@class='military_date']").text)
    discharged
  end

  private

  # Reads the paragraph with the given class, deletes every occurrence of
  # the text found in its nested span(s), and trims surrounding whitespace.
  def text_without_span(mil_item, paragraph_class)
    paragraph_query = ".//p[@class='#{paragraph_class}']"
    span_text = mil_item.xpath("#{paragraph_query}//span").text
    mil_item.xpath(paragraph_query).text.gsub(span_text, "").strip
  end
end
@@ -0,0 +1,117 @@
1
+ require 'nokogiri'
2
+ load 'degrees.rb'
3
+ load 'military_service.rb'
4
+ load 'certifications.rb'
5
+ load 'rec_people.rb'
6
+ load 'links.rb'
7
+ load 'awards.rb'
8
+ load 'groups.rb'
9
+
10
# Gathers the profile-level data of a resume page: headline fields plus the
# lists produced by the per-section parser classes.
class PersonalInfo
  # html - raw HTML string (kept verbatim under :fulltext and parsed with
  #        Nokogiri::HTML for the queries)
  # url  - address of the page, stored under :url
  def initialize(html, url)
    @raw_html = html
    @html = Nokogiri::HTML(html)
    @url = url

    @personal_info = {
      name: name,
      url: @url,
      location: location,
      current_title: current_title,
      skills: skills,
      summary: summary,
      additional_info: additional_info,
      last_updated: last_updated,
      degrees: degrees,
      military_service: military_service,
      certifications: certifications,
      rec_people: rec_people,
      links: links,
      awards: awards,
      groups: groups,
      fulltext: @raw_html
    }
  end

  # Hash assembled by the constructor
  def get_personal_info
    @personal_info
  end

  # Certification list, delegated to Certifications
  def certifications
    Certifications.new(@html).get_certifications
  end

  # Recommended-resume list, delegated to RecPeople
  def rec_people
    RecPeople.new(@html).get_rec_people
  end

  # Link list, delegated to Links
  def links
    Links.new(@html).get_links
  end

  # Award list, delegated to Awards
  def awards
    Awards.new(@html).get_awards
  end

  # Group list, delegated to Groups
  def groups
    Groups.new(@html).get_groups
  end

  # Degree list, delegated to Degrees
  def degrees
    Degrees.new(@html).get_degrees
  end

  # Military service list, delegated to MilitaryService
  def military_service
    MilitaryService.new(@html).get_military_service
  end

  # Text of the h1 marked itemprop='name'
  def name
    text_at("//h1[@itemprop='name']")
  end

  # Text of the headline_location paragraph
  def location
    text_at("//p[@id='headline_location']")
  end

  # Text of the headline h2
  def current_title
    text_at("//h2[@id='headline']")
  end

  # Combined text of every skill-text span
  def skills
    text_at("//span[@class='skill-text']")
  end

  # Text of the res_summary paragraph
  def summary
    text_at("//p[@id='res_summary']")
  end

  # Text of the paragraphs inside the additionalinfo-section div
  def additional_info
    text_at("//div[@id='additionalinfo-section']//p")
  end

  # Text of the resume_actions_contacted div without the "Updated: " label
  def last_updated
    stamp = text_at("//div[@id='resume_actions_contacted']")
    stamp.gsub("Updated: ", "")
  end

  private

  # Text content of whatever the XPath query matches in the parsed page
  def text_at(query)
    @html.xpath(query).text
  end
end
data/lib/rec_people.rb ADDED
@@ -0,0 +1,32 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the recommended resumes shown on a parsed resume page.
class RecPeople
  include Utilities

  # Scans +html+ for elements matching the ".rec_resume" CSS selector and
  # stores one hash per match.
  def initialize(html)
    entries = html.css(".rec_resume")

    @rec_people_list = entries.map do |entry|
      {
        rec_person_name: rec_person_name(entry),
        rec_person_link: rec_person_link(entry)
      }
    end
  end

  # Return recommended people: array of hashes built by the constructor
  def get_rec_people
    @rec_people_list
  end

  # Get name of suggested person: text of the anchors inside the entry
  def rec_person_name(rec_person)
    rec_person.css("a").text
  end

  # Get link of suggested person: href of the first anchor in the entry.
  # Returns nil when the entry contains no anchor instead of raising.
  def rec_person_link(rec_person)
    anchor = rec_person.css("a").first
    anchor ? anchor['href'] : nil
  end
end
data/lib/utilities.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'date'
2
+
3
# Date helpers mixed into the section parser classes.
module Utilities
  # Parse dates: splits a "<start> to <end>" string and normalizes both
  # halves with date_normalize.
  #
  # dates - String such as "2010 to 2012" or a single date; nil is accepted
  #
  # Returns a two-element array [start, end]. Without the " to " separator
  # the whole string is the start and the end is nil. A nil input yields
  # [nil, nil] instead of raising.
  def parse_dates(dates)
    return [nil, nil] if dates.nil?

    start_date, end_date = dates.include?(" to ") ? dates.split(" to ") : [dates, nil]

    [date_normalize(start_date), date_normalize(end_date)]
  end

  # Converts a date string into a Date where possible.
  #
  # A bare four-digit year starting with 19 or 20 is treated as January 1st
  # of that year; everything else is handed to Date.parse unchanged.
  #
  # Returns a Date, or the original value untouched when it is not a String
  # or cannot be parsed (e.g. "Present", "").
  def date_normalize(date)
    return date unless date.is_a?(String)

    # \A and \z anchor the whole string; ^ and $ would also match a year
    # sitting on one line of multi-line text.
    candidate = date =~ /\A(19|20)\d{2}\z/ ? "#{date}-01-01" : date
    Date.parse(candidate)
  rescue StandardError
    # Deliberate best effort: unparseable text is passed through as-is.
    date
  end
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indeedparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Parses Indeed resumes
14
+ email: shidash@transparencytoolkit.org
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/awards.rb
20
+ - lib/certifications.rb
21
+ - lib/degrees.rb
22
+ - lib/groups.rb
23
+ - lib/indeed_parser.rb
24
+ - lib/jobs.rb
25
+ - lib/links.rb
26
+ - lib/military_service.rb
27
+ - lib/personal_info.rb
28
+ - lib/rec_people.rb
29
+ - lib/utilities.rb
30
+ homepage: https://github.com/TransparencyToolkit/indeedparser
31
+ licenses:
32
+ - GPL
33
+ metadata: {}
34
+ post_install_message:
35
+ rdoc_options: []
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubyforge_project:
50
+ rubygems_version: 2.4.8
51
+ signing_key:
52
+ specification_version: 4
53
+ summary: Parses Indeed resumes
54
+ test_files: []
55
+ has_rdoc: