indeedparser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2516696c6e9a799fbc03e3c1189b9b6273649b2c
4
+ data.tar.gz: 22034fac47e8b9eaab688a1427264cc727b09dbf
5
+ SHA512:
6
+ metadata.gz: 815a153b6f9af990cba3ecc5061b8576a2b459b9a08b86fbf0ddd3f67e11c42824c449453846c43b2a64e0d43bc0e79be0f396761fd081ac96464b4b792f8a49
7
+ data.tar.gz: 71307ef7ad393cae5a201ea628a44b39f3763a40709336b1ade3cd08ca32a4f01a9cea277658b38476262da445cdfd7a9d31a625b71ec69187186327c6a6ff07
data/lib/awards.rb ADDED
@@ -0,0 +1,38 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the awards listed on a resume page.
class Awards
  include Utilities

  # Builds one hash per award section found under +html+.
  #
  # html - object answering #xpath (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' award-section ')]")

    @award_list = sections.map do |section|
      {
        award_title: award_title(section),
        award_date: award_date(section),
        award_description: award_description(section)
      }
    end
  end

  # Array of award hashes built by the constructor
  def get_awards
    @award_list
  end

  # Text of the award's title paragraph
  def award_title(award)
    title_nodes = award.xpath(".//p[@class='award_title']")
    title_nodes.text
  end

  # Text of the award's date paragraph (returned as-is, not parsed)
  def award_date(award)
    date_nodes = award.xpath(".//p[@class='award_date']")
    date_nodes.text
  end

  # Text of the award's description paragraph
  def award_description(award)
    description_nodes = award.xpath(".//p[@class='award_description']")
    description_nodes.text
  end
end
data/lib/certifications.rb ADDED
@@ -0,0 +1,44 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the certifications listed on a resume page.
class Certifications
  include Utilities

  # Builds one hash per certification section found under +html+.
  #
  # html - object answering #xpath (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' certification-section ')]")

    @certification_list = sections.map do |section|
      {
        cert_title: cert_title(section),
        cert_description: cert_description(section),
        cert_start_date: cert_start_date(section),
        cert_end_date: cert_end_date(section)
      }
    end
  end

  # Array of certification hashes built by the constructor
  def get_certifications
    @certification_list
  end

  # Text of the certification's title paragraph
  def cert_title(certification)
    title_nodes = certification.xpath(".//p[@class='certification_title']")
    title_nodes.text
  end

  # Text of the certification's description paragraph
  def cert_description(certification)
    description_nodes = certification.xpath(".//p[@class='certification_description']")
    description_nodes.text
  end

  # First element of parse_dates applied to the certification's date text
  def cert_start_date(certification)
    date_text = certification.xpath(".//p[@class='certification_date']").text
    parse_dates(date_text).first
  end

  # Second element of parse_dates applied to the certification's date text
  def cert_end_date(certification)
    date_text = certification.xpath(".//p[@class='certification_date']").text
    _, finish = parse_dates(date_text)
    finish
  end
end
data/lib/degrees.rb ADDED
@@ -0,0 +1,50 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the education entries listed on a resume page.
class Degrees
  include Utilities

  # Builds one hash per schema.org EducationalOrganization block under +html+.
  #
  # html - object answering #xpath (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    entries = html.xpath("//div[@itemtype='http://schema.org/EducationalOrganization']")

    @degree_list = entries.map do |entry|
      {
        school: school(entry),
        degree_title: degree_title(entry),
        school_location: school_location(entry),
        degree_start_date: degree_start_date(entry),
        degree_end_date: degree_end_date(entry)
      }
    end
  end

  # Array of degree hashes built by the constructor
  def get_degrees
    @degree_list
  end

  # Text of the span(s) marked itemprop="name"
  def school(degree)
    name_nodes = degree.xpath(".//span[@itemprop='name']")
    name_nodes.text
  end

  # Text of the degree title paragraph
  def degree_title(degree)
    title_nodes = degree.xpath(".//p[@class='edu_title']")
    title_nodes.text
  end

  # Text of the span(s) marked itemprop="addressLocality"
  def school_location(degree)
    locality_nodes = degree.xpath(".//span[@itemprop='addressLocality']")
    locality_nodes.text
  end

  # First element of parse_dates applied to the entry's date text
  def degree_start_date(degree)
    date_text = degree.xpath(".//p[@class='edu_dates']").text
    parse_dates(date_text).first
  end

  # Second element of parse_dates applied to the entry's date text
  def degree_end_date(degree)
    date_text = degree.xpath(".//p[@class='edu_dates']").text
    _, finish = parse_dates(date_text)
    finish
  end
end
data/lib/groups.rb ADDED
@@ -0,0 +1,44 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the groups listed on a resume page.
class Groups
  include Utilities

  # Builds one hash per group section found under +html+.
  #
  # html - object answering #xpath (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' group-section ')]")

    @group_list = sections.map do |section|
      {
        group_title: group_title(section),
        group_description: group_description(section),
        group_start_date: group_start_date(section),
        group_end_date: group_end_date(section)
      }
    end
  end

  # Array of group hashes built by the constructor
  def get_groups
    @group_list
  end

  # Text of the group's title paragraph
  def group_title(group)
    title_nodes = group.xpath(".//p[@class='group_title']")
    title_nodes.text
  end

  # Text of the group's description paragraph
  def group_description(group)
    description_nodes = group.xpath(".//p[@class='group_description']")
    description_nodes.text
  end

  # First element of parse_dates applied to the group's date text
  def group_start_date(group)
    date_text = group.xpath(".//p[@class='group_date']").text
    parse_dates(date_text).first
  end

  # Second element of parse_dates applied to the group's date text
  def group_end_date(group)
    date_text = group.xpath(".//p[@class='group_date']").text
    _, finish = parse_dates(date_text)
    finish
  end
end
data/lib/indeed_parser.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'requestmanager'
2
+ require 'json'
3
+ load 'personal_info.rb'
4
+ load 'jobs.rb'
5
+
6
# Combines the profile-level data and the per-job data of one resume page
# into a list of flat records, one per job.
class IndeedParser
  # Stores the inputs and parses the page immediately.
  #
  # html           - HTML source of the resume page; handed to PersonalInfo and Jobs
  # url            - URL of the resume; handed to PersonalInfo
  # crawler_fields - Hash of extra fields merged into every output record
  #                  (nil is treated as an empty Hash)
  def initialize(html, url, crawler_fields)
    @html = html
    @url = url
    @crawler_fields = crawler_fields
    parse
  end

  # Parse profile: fills @personal_info (a Hash) and @job_info (an Array of Hashes)
  def parse
    @personal_info = PersonalInfo.new(@html, @url).get_personal_info
    @job_info = Jobs.new(@html).get_jobs
  end

  # Get output: a pretty-printed JSON array with one object per job.
  #
  # Each object is the job's fields overlaid with the personal info and then
  # the crawler fields, so on a key clash the later source wins.
  # Non-destructive merge is used so the stored job hashes are not modified
  # and repeated calls start from the same data.
  def get_results_by_job
    extra_fields = @crawler_fields || {}
    output = @job_info.map do |job|
      job.merge(@personal_info).merge(extra_fields)
    end

    JSON.pretty_generate(output)
  end
end
33
+
data/lib/jobs.rb ADDED
@@ -0,0 +1,58 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the work-experience entries listed on a resume page.
class Jobs
  include Utilities

  # Parses +html+ with Nokogiri and builds one hash per work-experience section.
  #
  # html - HTML source of the resume page (anything Nokogiri::HTML accepts)
  def initialize(html)
    @html = Nokogiri::HTML(html)

    sections = @html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' work-experience-section ')]")

    @job_info = sections.map do |section|
      {
        job_title: job_title(section),
        company: company(section),
        company_location: company_location(section),
        job_description: job_description(section),
        start_date: start_date(section),
        end_date: end_date(section)
      }
    end
  end

  # Return job info: Array of job hashes built by the constructor
  def get_jobs
    @job_info
  end

  # Get job title
  def job_title(job)
    job.xpath(".//p[@class='work_title title']").text
  end

  # Get company: text of the first span inside the work_company div.
  # Returns "" when the section has no such span, matching what the other
  # getters return for missing nodes (an empty node set's text is "").
  def company(job)
    first_span = job.xpath(".//div[@class='work_company']//span").first
    first_span ? first_span.text : ''
  end

  # Get work location
  def company_location(job)
    job.xpath(".//div[@class='work_company']//div[@class='inline-block']//span").text
  end

  # Get job description
  def job_description(job)
    job.xpath(".//p[@class='work_description']").text
  end

  # Get start date: first element of parse_dates applied to the work_dates text
  def start_date(job)
    parse_dates(job.xpath(".//p[@class='work_dates']").text)[0]
  end

  # Get end date: second element of parse_dates applied to the work_dates text
  def end_date(job)
    parse_dates(job.xpath(".//p[@class='work_dates']").text)[1]
  end
end
data/lib/links.rb ADDED
@@ -0,0 +1,32 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the links listed on a resume page.
class Links
  include Utilities

  # Builds one hash per link section found under +html+.
  #
  # html - object answering #xpath (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' link-section ')]")

    @link_list = sections.map do |section|
      {
        link_title: link_title(section),
        link_url: link_url(section)
      }
    end
  end

  # Return link info: Array of link hashes built by the constructor
  def get_links
    @link_list
  end

  # Get title of link: text of the anchor(s) inside the section
  def link_title(link)
    link.xpath(".//a").text
  end

  # Get link url: href of the first anchor inside the section.
  # Returns nil when the section contains no anchor instead of raising
  # NoMethodError on nil.
  def link_url(link)
    anchor = link.xpath(".//a").first
    anchor && anchor['href']
  end
end
data/lib/military_service.rb ADDED
@@ -0,0 +1,65 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the military-service entries listed on a resume page.
class MilitaryService
  include Utilities

  # Builds one hash per military section found under +html+.
  #
  # html - object answering #xpath (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    sections = html.xpath("//div[contains(concat(' ',normalize-space(@class),' '),' military-section ')]")

    @military_service = sections.map do |section|
      {
        military_country: military_country(section),
        military_branch: military_branch(section),
        military_rank: military_rank(section),
        military_description: military_description(section),
        military_commendations: military_commendations(section),
        military_start_date: military_start_date(section),
        military_end_date: military_end_date(section)
      }
    end
  end

  # Array of military-service hashes built by the constructor
  def get_military_service
    @military_service
  end

  # Country paragraph text with its nested span text removed
  def military_country(mil_item)
    text_without_span(mil_item, 'military_country')
  end

  # Branch paragraph text with its nested span text removed
  def military_branch(mil_item)
    text_without_span(mil_item, 'military_branch')
  end

  # Rank paragraph text with its nested span text removed
  def military_rank(mil_item)
    text_without_span(mil_item, 'military_rank')
  end

  # Text of the description paragraph
  def military_description(mil_item)
    description_nodes = mil_item.xpath(".//p[@class='military_description']")
    description_nodes.text
  end

  # Text of the commendations paragraph
  def military_commendations(mil_item)
    commendation_nodes = mil_item.xpath(".//p[@class='military_commendations']")
    commendation_nodes.text
  end

  # First element of parse_dates applied to the entry's date text
  def military_start_date(mil_item)
    date_text = mil_item.xpath(".//p[@class='military_date']").text
    parse_dates(date_text).first
  end

  # Second element of parse_dates applied to the entry's date text
  def military_end_date(mil_item)
    date_text = mil_item.xpath(".//p[@class='military_date']").text
    _, finish = parse_dates(date_text)
    finish
  end

  private

  # Text of the <p> with class +css_class+, with every occurrence of the text
  # of its nested <span> element(s) removed and surrounding whitespace trimmed.
  def text_without_span(mil_item, css_class)
    span_text = mil_item.xpath(".//p[@class='#{css_class}']//span").text
    paragraph_text = mil_item.xpath(".//p[@class='#{css_class}']").text
    paragraph_text.gsub(span_text, "").strip
  end
end
data/lib/personal_info.rb ADDED
@@ -0,0 +1,117 @@
1
+ require 'nokogiri'
2
+ load 'degrees.rb'
3
+ load 'military_service.rb'
4
+ load 'certifications.rb'
5
+ load 'rec_people.rb'
6
+ load 'links.rb'
7
+ load 'awards.rb'
8
+ load 'groups.rb'
9
+
10
# Gathers the profile-level fields of a resume page into a single hash.
class PersonalInfo
  # Parses the page and builds the personal-info hash.
  #
  # html - HTML source of the page; parsed with Nokogiri::HTML and also kept
  #        unchanged under :fulltext
  # url  - stored unchanged under :url
  def initialize(html, url)
    @raw_html = html
    @html = Nokogiri::HTML(html)
    @url = url

    @personal_info = {
      name: name,
      url: @url,
      location: location,
      current_title: current_title,
      skills: skills,
      summary: summary,
      additional_info: additional_info,
      last_updated: last_updated,
      degrees: degrees,
      military_service: military_service,
      certifications: certifications,
      rec_people: rec_people,
      links: links,
      awards: awards,
      groups: groups,
      fulltext: @raw_html
    }
  end

  # Hash built by the constructor
  def get_personal_info
    @personal_info
  end

  # Certification hashes extracted by Certifications
  def certifications
    Certifications.new(@html).get_certifications
  end

  # Recommended-resume hashes extracted by RecPeople
  def rec_people
    RecPeople.new(@html).get_rec_people
  end

  # Link hashes extracted by Links
  def links
    Links.new(@html).get_links
  end

  # Award hashes extracted by Awards
  def awards
    Awards.new(@html).get_awards
  end

  # Group hashes extracted by Groups
  def groups
    Groups.new(@html).get_groups
  end

  # Degree hashes extracted by Degrees
  def degrees
    Degrees.new(@html).get_degrees
  end

  # Military-service hashes extracted by MilitaryService
  def military_service
    MilitaryService.new(@html).get_military_service
  end

  # Text of the h1 marked itemprop="name"
  def name
    text_at("//h1[@itemprop='name']")
  end

  # Text of the headline_location paragraph
  def location
    text_at("//p[@id='headline_location']")
  end

  # Text of the headline h2
  def current_title
    text_at("//h2[@id='headline']")
  end

  # Concatenated text of every skill-text span
  def skills
    text_at("//span[@class='skill-text']")
  end

  # Text of the res_summary paragraph
  def summary
    text_at("//p[@id='res_summary']")
  end

  # Text of the paragraphs inside the additionalinfo-section div
  def additional_info
    text_at("//div[@id='additionalinfo-section']//p")
  end

  # Text of the resume_actions_contacted div with "Updated: " removed
  def last_updated
    raw_text = text_at("//div[@id='resume_actions_contacted']")
    raw_text.gsub("Updated: ", "")
  end

  private

  # Text of all nodes in the parsed document matching the XPath +query+
  def text_at(query)
    @html.xpath(query).text
  end
end
data/lib/rec_people.rb ADDED
@@ -0,0 +1,32 @@
1
+ require 'nokogiri'
2
+ load 'utilities.rb'
3
+
4
# Collects the recommended resumes (elements with class "rec_resume") on a page.
class RecPeople
  include Utilities

  # Builds one hash per ".rec_resume" element found under +html+.
  #
  # html - object answering #css (PersonalInfo passes a Nokogiri::HTML document)
  def initialize(html)
    entries = html.css(".rec_resume")

    @rec_people_list = entries.map do |entry|
      {
        rec_person_name: rec_person_name(entry),
        rec_person_link: rec_person_link(entry)
      }
    end
  end

  # Return recommended-people info: Array of hashes built by the constructor
  def get_rec_people
    @rec_people_list
  end

  # Get name of suggested person: text of the anchor(s) inside the entry
  def rec_person_name(rec_person)
    rec_person.css("a").text
  end

  # Get link of suggested person: href of the first anchor inside the entry.
  # Returns nil when the entry contains no anchor instead of raising
  # NoMethodError on nil.
  def rec_person_link(rec_person)
    anchor = rec_person.css("a").first
    anchor && anchor['href']
  end
end
data/lib/utilities.rb ADDED
@@ -0,0 +1,23 @@
1
+ require 'date'
2
+
3
# Date helpers mixed into the section parsers.
module Utilities
  # Parse dates: split a "<start> to <end>" string and normalize both parts.
  #
  # dates - String such as "2010 to 2012" or "2015". A two-element Array is
  #         destructured as [start, end]; nil yields [nil, nil].
  #
  # Returns a two-element Array [start, end]. Each element is whatever
  # date_normalize returns for that part; end is nil when +dates+ has no
  # " to " separator.
  def parse_dates(dates)
    start_date, end_date = dates

    # Only strings can carry the " to " separator; anything else keeps the
    # destructured values from above.
    if dates.is_a?(String) && dates.include?(' to ')
      start_date, end_date = dates.split(' to ')
    end

    [date_normalize(start_date), date_normalize(end_date)]
  end

  # Convert one date string into a Date on a best-effort basis.
  #
  # date - String to parse. Surrounding whitespace is ignored, and a bare
  #        four-digit year starting with 19 or 20 is read as January 1st of
  #        that year.
  #
  # Returns a Date, or +date+ itself (unchanged) when it is not a String or
  # cannot be parsed (e.g. "Present", "" or nil).
  def date_normalize(date)
    return date unless date.is_a?(String)

    text = date.strip
    # \A and \z anchor the whole string; ^ and $ would also accept a year
    # that is merely one line of a multi-line value.
    text += '-01-01' if text =~ /\A(19|20)\d{2}\z/
    Date.parse(text)
  rescue StandardError
    # Deliberate best-effort: hand back the caller's original value.
    date
  end
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indeedparser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Parses Indeed resumes
14
+ email: shidash@transparencytoolkit.org
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/awards.rb
20
+ - lib/certifications.rb
21
+ - lib/degrees.rb
22
+ - lib/groups.rb
23
+ - lib/indeed_parser.rb
24
+ - lib/jobs.rb
25
+ - lib/links.rb
26
+ - lib/military_service.rb
27
+ - lib/personal_info.rb
28
+ - lib/rec_people.rb
29
+ - lib/utilities.rb
30
+ homepage: https://github.com/TransparencyToolkit/indeedparser
31
+ licenses:
32
+ - GPL
33
+ metadata: {}
34
+ post_install_message:
35
+ rdoc_options: []
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ requirements: []
49
+ rubyforge_project:
50
+ rubygems_version: 2.4.8
51
+ signing_key:
52
+ specification_version: 4
53
+ summary: Parses Indeed resumes
54
+ test_files: []
55
+ has_rdoc: