linkedinparser 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jobs.rb +92 -0
- data/lib/linkedinparser.rb +63 -0
- data/lib/personal_info.rb +126 -0
- data/lib/picture.rb +32 -0
- data/lib/utilities.rb +6 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 55b734b221bd743748b734b022e649c57461fdc5
|
4
|
+
data.tar.gz: b22cab96393fd2b48b1e4ff7c623fe93f5a09c88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7715be94c914d84bd0226e6bf1a9df77bc0b3275134e648089e7f8e1a692995b7a56f96eb9d424d97e21b43c27af96c243155ba349ce6b99e41f90dca667acdb
|
7
|
+
data.tar.gz: 1e2aa43c38a5e83298689bd2d6be9e880c4371493d21a424aa0400f6b60e35a238d118f93d57a41be2f91c4f6f5deba92f2b497230f67ecf69b22ab81d97664c
|
data/lib/jobs.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
load 'utilities.rb'
|
3
|
+
|
4
|
+
require 'date'

# Extracts the work-experience entries from the HTML of a LinkedIn profile page.
#
# Relies on Nokogiri for HTML parsing and on Utilities#is_empty? (defined in
# utilities.rb, not shown here; presumably true for nil/empty results).
class Jobs
  include Utilities

  # profile - raw HTML of the profile page (String).
  # Parsing happens immediately; read the result with #get_jobs.
  def initialize(profile)
    @html = Nokogiri::HTML(profile)
    parse_jobs
  end

  # Get list of jobs (Array of Hashes, one per position, built by #parse_jobs)
  def get_jobs
    @positions_list
  end

  # Build @positions_list from the position nodes found in the page.
  def parse_jobs
    # Multiple html options: try the '#experience' layout first, then fall
    # back to the '#background-experience' layout (current + past positions).
    positions = @html.css('#experience').css('.position')
    if is_empty?(positions)
      background = @html.css('#background-experience')
      positions = background.css('.current-position') + background.css('.past-position')
    end

    # Get lists of positions
    @positions_list = positions.map do |position|
      {
        title: title(position),
        company: company(position),
        description: description(position),
        start_date: start_date(position),
        end_date: end_date(position),
        work_location: work_location(position),
        current: current(position)
      }
    end
  end

  # Check if it is a current position or not.
  # Returns the String "Yes" when the end date reads "Present", otherwise "No".
  def current(position)
    end_date(position) == "Present" ? "Yes" : "No"
  end

  # Get the job title
  def title(position)
    position.css('h4').text
  end

  # Get the company for the position
  def company(position)
    position.css('h5').text
  end

  # Get job description
  def description(position)
    position.css('.description').text
  end

  # Get the node(s) holding the date range (two markup variants are tried)
  def get_dates(position)
    dates = position.css('.meta').css('.date-range')
    dates = position.css('.experience-date-locale') if is_empty?(dates)
    dates
  end

  # Get start date: the text before the ' – ' separator, parsed to a Date.
  # Returns nil when the position has no date text.
  def start_date(position)
    date_parse(get_dates(position).text.split(' – ').first)
  end

  # Get end date: the text after the ' – ' separator, minus any trailing
  # "(...)" suffix. Returns the String "Present" for an ongoing position,
  # a Date otherwise, or nil when no end date text is found.
  def end_date(position)
    raw = get_dates(position).text.split(' – ').last
    return nil if raw.nil?

    finish = raw.split("(").first.to_s.strip
    return finish if finish == "Present"

    # Go through date_parse (like start_date does) so that a year-only end
    # date such as "2012" is handled instead of raising in Date.parse.
    date_parse(finish)
  end

  # Parse date. A bare four-digit year (19xx/20xx) is treated as January 1st
  # of that year. Returns nil for nil/blank input.
  # Raises whatever Date.parse raises for text it cannot interpret.
  def date_parse(date)
    text = date.to_s.strip
    return nil if text.empty?

    # \A/\z anchor the whole string (^/$ would only anchor a single line).
    text = "#{text}-01-01" if text =~ /\A(19|20)\d{2}\z/
    Date.parse(text)
  end

  # Get location for work
  def work_location(position)
    position.css('.experience-date-locale').css('.locality').text
  end
end
|
data/lib/linkedinparser.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'selenium-webdriver'
|
2
|
+
require 'pry'
|
3
|
+
require 'nokogiri'
|
4
|
+
load 'personal_info.rb'
|
5
|
+
load 'jobs.rb'
|
6
|
+
|
7
|
+
require 'json'

# Parses one LinkedIn profile page into personal details plus job history and
# renders the combined data as pretty-printed JSON.
class LinkedinParser
  # profile        - raw HTML of the profile page
  # profile_url    - URL the page was fetched from
  # crawler_fields - Hash of extra fields merged into every result
  #                  (nil is treated as an empty Hash)
  def initialize(profile, profile_url, crawler_fields)
    @profile = profile
    @profile_url = profile_url
    @crawler_fields = crawler_fields || {}
    parse
  end

  # Run the sub-parsers and keep their results for the results_* methods.
  def parse
    # Get details about the person
    @personal_info = PersonalInfo.new(@profile, @profile_url).get_personal_info

    # Get job info
    @job_info = Jobs.new(@profile).get_jobs
  end

  # Return results with new item for each job: a JSON array in which every
  # job is combined with the personal info and the crawler fields.
  def results_by_job
    # Non-mutating merge: the stored job hashes must stay untouched so that
    # results_by_person (or a second call) still sees the plain job data.
    output = @job_info.map do |job|
      job.merge(@personal_info).merge(@crawler_fields)
    end

    JSON.pretty_generate(output)
  end

  # Return results in nested JSON: the personal info with the job list under
  # the :jobs key, plus the crawler fields.
  def results_by_person
    # Build a new hash instead of writing :jobs into @personal_info, which
    # would leak the job list into every later results_by_job item.
    output = @personal_info.merge(jobs: @job_info).merge(@crawler_fields)
    JSON.pretty_generate(output)
  end

  # TODO: Fields to add to parser-
  # Organizations
  # Education
  # Projects
  # Related people
  # Languages
  # Certifications
  # Groups
end
|
52
|
+
|
53
|
+
# Test:
|
54
|
+
#profile = Selenium::WebDriver::Firefox::Profile.new
|
55
|
+
#profile['intl.accept_languages'] = 'en'
|
56
|
+
#profile["javascript.enabled"] = false
|
57
|
+
#driver = Selenium::WebDriver.for :firefox, profile: profile
|
58
|
+
#url = "https://www.linkedin.com/pub/christopher-mcclellan/5b/a09/ba9"
|
59
|
+
#url = "https://www.linkedin.com/pub/maryann-holmes/2b/770/3b2"
|
60
|
+
#driver.navigate.to url
|
61
|
+
|
62
|
+
#l = LinkedinParser.new(driver.page_source, url, {timestamp: Time.now})
|
63
|
+
#puts l.results_by_job
|
data/lib/personal_info.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
load 'picture.rb'
|
2
|
+
load 'utilities.rb'
|
3
|
+
|
4
|
+
# Extracts the person-level fields (name, location, skills, ...) from the HTML
# of a LinkedIn profile page and collects them into one Hash.
#
# Relies on Nokogiri for HTML parsing, on Picture for the profile image and on
# Utilities#is_empty? (defined in utilities.rb, not shown here; presumably
# true for nil/empty results).
class PersonalInfo
  include Utilities

  # profile     - raw HTML of the profile page (String)
  # profile_url - URL the page was fetched from
  def initialize(profile, profile_url)
    @profile = profile
    @html = Nokogiri::HTML(profile)
    @profile_url = profile_url

    # Parse attributes
    pic = Picture.new(@html)
    @personal_info = {
      profile_url: @profile_url,
      full_name: full_name,
      first_name: first_name,
      last_name: last_name,
      skills: skills,
      full_location: full_location,
      location: location,
      area: area,
      industry: industry,
      summary: summary,
      current_title: title,
      interests: interests,
      number_of_connections: number_of_connections,
      picture: pic.picture,
      pic_path: pic.pic_path,
      full_html: full_html
    }
  end

  # Return person hash
  def get_personal_info
    @personal_info
  end

  # Get the full name of the person
  def full_name
    @html.css(".profile-overview").css('h1').text
  end

  # Get first part of name ("" when the page has no name)
  def first_name
    name_parts.first.to_s.strip
  end

  # Get last part of name: everything after the first word.
  # Returns "" when the name has only one word (or is missing) rather than
  # repeating the first name.
  def last_name
    parts = name_parts
    parts.length > 1 ? parts.last.strip : ""
  end

  # Get list of skills (Array of Strings)
  def skills
    # Two formatting options for skills
    nodes = @html.css('#skills').css('.skill')
    nodes = @html.css('.skill-pill .endorse-item-name-text') if is_empty?(nodes)

    # Make list of skills
    nodes.map { |node| node.text }
  end

  # Get full location
  def full_location
    @html.css('.profile-overview').css('.locality').text
  end

  # Get town: the part of the location before the first comma
  # ("" when the page has no location)
  def location
    full_location.split(",").first.to_s.strip
  end

  # Get country/state: the part of the location after the last comma
  # ("" when the page has no location)
  def area
    full_location.split(",").last.to_s.strip
  end

  # Get the industry the person works in (2 different formats)
  def industry
    overview = @html.css('.profile-overview')
    node = overview.css('.descriptor')[1]
    # The explicit nil check keeps the fallback independent of how
    # is_empty? treats nil.
    node = overview.css('.industry') if node.nil? || is_empty?(node)
    node.text
  end

  # Get the summary field (2 different formats); "" when neither is present
  def summary
    node = @html.css('#summary').css('.description')
    node = @html.css('.summary').first if is_empty?(node)
    node ? node.text : ""
  end

  # Get the overall/current title (three markup variants are tried in order)
  def title
    node = @html.css('.title').css('.headline')
    node = @html.css('#headline').css('.title') if is_empty?(node)
    node = @html.css('.title') if is_empty?(node)
    node.text
  end

  # Get the number of connections: the element text with the word
  # "connections" removed ("" when the element is missing)
  def number_of_connections
    node = @html.css('.member-connections').first
    return "" unless node

    node.text.gsub("connections", "").strip
  end

  # Get list of interests (Array of Strings)
  def interests
    nodes = @html.css('#interests').css('.interest')
    nodes = @html.css('#background-interests').css('.interest-item') if is_empty?(nodes)

    nodes.map { |node| node.text }
  end

  # Save the full html of the page
  def full_html
    @profile
  end

  private

  # Split the full name into at most two parts: first word and the rest.
  def name_parts
    full_name.split(" ", 2)
  end
end
|
data/lib/picture.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Locates the profile picture in a parsed profile page and downloads it.
class Picture
  # html - parsed document; only #css is called on it (a Nokogiri document
  #        in the visible caller).
  def initialize(html)
    @html = html
  end

  # Get path to the picture url.
  # Returns the 'src' of the first <img> under '.profile-picture', or nil when
  # the page has no such image (pic_path relies on that nil).
  def picture
    img = @html.css('.profile-picture').css('img').first
    img && img['src']
  end

  # Download picture.
  # Fetches the picture into "pictures/" with wget unless a file of that name
  # is already there. Returns the local path, or nil when there is no picture.
  # Raises Errno::ENOENT when wget cannot be executed.
  def pic_path
    url = picture
    return nil if url.nil? || url.strip.empty?

    # Get path
    dir = "pictures/"
    full_path = dir + url.split("/").last.chomp.strip

    # Get file
    unless File.file?(full_path)
      # The URL comes from scraped HTML, so it must never reach a shell:
      # the multi-argument form of system runs wget directly, and "--" stops
      # a URL starting with "-" from being read as an option.
      ran = system('wget', '-P', dir, '--', url)
      raise Errno::ENOENT, 'wget' if ran.nil?
    end
    delete_duplicate_pics
    full_path
  end

  # Deletes duplicate pictures (files matching "*.jpg.*").
  # NOTE(review): this cleans "public/uploads/pictures/" while pic_path
  # downloads into "pictures/" — confirm which directory is intended.
  def delete_duplicate_pics
    Dir["public/uploads/pictures/*.jpg.*"].each do |duplicate|
      File.delete(duplicate)
    end
  end
end
|
data/lib/utilities.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedinparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -15,7 +15,12 @@ email: shidash@shidash.com
|
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
18
|
+
files:
|
19
|
+
- lib/jobs.rb
|
20
|
+
- lib/linkedinparser.rb
|
21
|
+
- lib/personal_info.rb
|
22
|
+
- lib/picture.rb
|
23
|
+
- lib/utilities.rb
|
19
24
|
homepage: https://github.com/TransparencyToolkit/linkedinparser
|
20
25
|
licenses:
|
21
26
|
- GPL
|