linkedinparser 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/jobs.rb +92 -0
- data/lib/linkedinparser.rb +63 -0
- data/lib/personal_info.rb +126 -0
- data/lib/picture.rb +32 -0
- data/lib/utilities.rb +6 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 55b734b221bd743748b734b022e649c57461fdc5
|
4
|
+
data.tar.gz: b22cab96393fd2b48b1e4ff7c623fe93f5a09c88
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7715be94c914d84bd0226e6bf1a9df77bc0b3275134e648089e7f8e1a692995b7a56f96eb9d424d97e21b43c27af96c243155ba349ce6b99e41f90dca667acdb
|
7
|
+
data.tar.gz: 1e2aa43c38a5e83298689bd2d6be9e880c4371493d21a424aa0400f6b60e35a238d118f93d57a41be2f91c4f6f5deba92f2b497230f67ecf69b22ab81d97664c
|
data/lib/jobs.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
load 'utilities.rb'
|
3
|
+
|
4
|
+
# Extracts the work-experience entries from the HTML of a LinkedIn profile.
#
# The HTML is parsed once in the constructor; the resulting list of position
# hashes is available through #get_jobs.
class Jobs
  # Provides is_empty? (defined in utilities.rb, which is not part of this file).
  include Utilities

  # profile - raw HTML of the profile page (handed to Nokogiri::HTML).
  def initialize(profile)
    @html = Nokogiri::HTML(profile)
    parse_jobs
  end

  # Get list of jobs
  #
  # Returns an Array of Hashes with the keys :title, :company, :description,
  # :start_date, :end_date, :work_location and :current.
  def get_jobs
    @positions_list
  end

  # Finds every position node in the page and stores one hash per position
  # in @positions_list.
  def parse_jobs
    # Multiple html options: try '#experience .position' first and fall back
    # to the '#background-experience' current/past position nodes.
    positions = @html.css('#experience').css('.position')
    if is_empty?(positions)
      experience = @html.css('#background-experience')
      positions = experience.css('.current-position') + experience.css('.past-position')
    end

    # Get lists of positions
    @positions_list = positions.map do |position|
      {
        title: title(position),
        company: company(position),
        description: description(position),
        start_date: start_date(position),
        end_date: end_date(position),
        work_location: work_location(position),
        current: current(position)
      }
    end
  end

  # Check if it is a current position or not
  #
  # Returns "Yes" when the end date reads "Present", otherwise "No".
  def current(position)
    end_date(position) == "Present" ? "Yes" : "No"
  end

  # Get the job title (text of the position's h4 elements)
  def title(position)
    position.css('h4').text
  end

  # Get the company for the position (text of the position's h5 elements)
  def company(position)
    position.css('h5').text
  end

  # Get job description
  def description(position)
    position.css('.description').text
  end

  # Get the nodes holding the date range; two html layouts are supported.
  def get_dates(position)
    dates = position.css('.meta').css('.date-range')
    dates = position.css('.experience-date-locale') if is_empty?(dates)
    dates
  end

  # Get start date
  #
  # Returns a Date, or nil when the position has no date text.
  def start_date(position)
    date_parse(date_range_parts(position).first)
  end

  # Get end date
  #
  # Returns the String "Present" for an ongoing position, a Date for a
  # finished one, or nil when the position has no date text.
  def end_date(position)
    # Keep only what precedes the first "(" (the original did the same,
    # presumably to drop a trailing duration such as "(2 years)").
    finish = date_range_parts(position).last.to_s.split("(").first.to_s.strip
    return finish if finish == "Present"

    # Go through date_parse (not Date.parse directly) so that year-only
    # end dates are handled the same way as year-only start dates.
    date_parse(finish)
  end

  # Parse date
  #
  # date - String such as "January 2010" or a bare year like "2010"; may be nil.
  #
  # Returns a Date, or nil for nil/blank input.
  # Raises whatever Date.parse raises for text it cannot interpret.
  def date_parse(date)
    date = date.to_s.strip
    return nil if date.empty?

    # A bare four-digit year is expanded to January 1st of that year
    # before being handed to Date.parse.
    date += "-01-01" if date =~ /\A(19|20)\d{2}\z/
    Date.parse(date)
  end

  # Get location for work
  def work_location(position)
    position.css('.experience-date-locale').css('.locality').text
  end

  private

  # Splits the date-range text of a position on ' – ' (en dash).
  # Returns an empty Array when there is no date text at all.
  def date_range_parts(position)
    get_dates(position).text.split(' – ')
  end
end
|
data/lib/linkedinparser.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'selenium-webdriver'
|
2
|
+
require 'pry'
|
3
|
+
require 'nokogiri'
|
4
|
+
load 'personal_info.rb'
|
5
|
+
load 'jobs.rb'
|
6
|
+
|
7
|
+
# Entry point of the gem: parses a LinkedIn profile page into personal
# details (PersonalInfo) and a list of positions (Jobs) and renders them as
# pretty-printed JSON.
class LinkedinParser
  # profile        - raw HTML of the profile page.
  # profile_url    - URL the page was fetched from; passed on to PersonalInfo.
  # crawler_fields - Hash of extra fields merged into every result
  #                  (e.g. {timestamp: Time.now}); nil is treated as {}.
  def initialize(profile, profile_url, crawler_fields)
    @profile = profile
    @profile_url = profile_url
    @crawler_fields = crawler_fields || {}
    parse
  end

  # Runs both sub-parsers and stores their results.
  def parse
    # Get details about the person
    @personal_info = PersonalInfo.new(@profile, @profile_url).get_personal_info

    # Get job info
    @job_info = Jobs.new(@profile).get_jobs
  end

  # Return results with new item for each job
  #
  # Every job hash is combined with the personal info and the crawler
  # fields. Non-mutating merge is used so the stored hashes stay untouched
  # and the two result methods can be called in any order, any number of
  # times.
  #
  # Returns a JSON String holding an array of flat objects.
  def results_by_job
    output = @job_info.map do |job|
      job.merge(@personal_info).merge(@crawler_fields)
    end

    JSON.pretty_generate(output)
  end

  # Return results in nested JSON
  #
  # Returns a JSON String holding one object: the personal info, a :jobs
  # array and the crawler fields. Built on a copy so @personal_info never
  # gains a :jobs key (which would make results_by_job self-referential).
  def results_by_person
    output = @personal_info.merge({ jobs: @job_info }).merge(@crawler_fields)
    JSON.pretty_generate(output)
  end

  # TODO: Fields to add to parser-
  # Organizations
  # Education
  # Projects
  # Related people
  # Languages
  # Certifications
  # Groups
end
|
52
|
+
|
53
|
+
# Test:
|
54
|
+
#profile = Selenium::WebDriver::Firefox::Profile.new
|
55
|
+
#profile['intl.accept_languages'] = 'en'
|
56
|
+
#profile["javascript.enabled"] = false
|
57
|
+
#driver = Selenium::WebDriver.for :firefox, profile: profile
|
58
|
+
#url = "https://www.linkedin.com/pub/christopher-mcclellan/5b/a09/ba9"
|
59
|
+
#url = "https://www.linkedin.com/pub/maryann-holmes/2b/770/3b2"
|
60
|
+
#driver.navigate.to url
|
61
|
+
|
62
|
+
#l = LinkedinParser.new(driver.page_source, url, {timestamp: Time.now})
|
63
|
+
#puts l.results_by_job
|
data/lib/personal_info.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
load 'picture.rb'
|
2
|
+
load 'utilities.rb'
|
3
|
+
|
4
|
+
# Extracts the person-level fields (name, location, skills, picture, ...)
# from the HTML of a LinkedIn profile.
#
# All fields are collected once in the constructor; the resulting hash is
# available through #get_personal_info. Text fields whose element is missing
# from the page come back as an empty String rather than raising.
class PersonalInfo
  # Provides is_empty? (defined in utilities.rb, which is not part of this file).
  include Utilities

  # profile     - raw HTML of the profile page.
  # profile_url - URL of the profile; stored under :profile_url.
  def initialize(profile, profile_url)
    @profile = profile
    @html = Nokogiri::HTML(profile)
    @profile_url = profile_url

    # Parse attributes
    photo = Picture.new(@html)
    @personal_info = {
      profile_url: @profile_url,
      full_name: full_name,
      first_name: first_name,
      last_name: last_name,
      skills: skills,
      full_location: full_location,
      location: location,
      area: area,
      industry: industry,
      summary: summary,
      current_title: title,
      interests: interests,
      number_of_connections: number_of_connections,
      picture: photo.picture,
      pic_path: photo.pic_path,
      full_html: full_html
    }
  end

  # Return person hash
  def get_personal_info
    @personal_info
  end

  # Get the full name of the person
  def full_name
    @html.css(".profile-overview").css('h1').text
  end

  # Get first part of name (everything before the first space)
  def first_name
    # to_s: splitting an empty name yields [], whose first element is nil.
    full_name.split(" ", 2).first.to_s.strip
  end

  # Get last part of name (everything after the first space; for a
  # single-word name this is the same word as first_name)
  def last_name
    full_name.split(" ", 2).last.to_s.strip
  end

  # Get list of skills
  #
  # Returns an Array of Strings.
  def skills
    # Two formatting options for skills
    nodes = @html.css('#skills').css('.skill')
    nodes = @html.css('.skill-pill .endorse-item-name-text') if is_empty?(nodes)

    # Make list of skills
    nodes.map { |skill| skill.text }
  end

  # Get full location
  def full_location
    @html.css('.profile-overview').css('.locality').text
  end

  # Get town (text before the first comma of the full location)
  def location
    full_location.split(",").first.to_s.strip
  end

  # Get country/state (text after the last comma of the full location)
  def area
    full_location.split(",").last.to_s.strip
  end

  # Get the industry the person works in (2 different formats)
  def industry
    node = @html.css('.profile-overview').css('.descriptor')[1]
    node = @html.css('.profile-overview').css('.industry') if is_empty?(node)
    node ? node.text : ""
  end

  # Get the summary field (2 different formats)
  def summary
    node = @html.css('#summary').css('.description')
    # .first is nil when the page has no '.summary' element at all.
    node = @html.css('.summary').first if is_empty?(node)
    node ? node.text : ""
  end

  # Get the overall/current title (3 selectors tried in order)
  def title
    node = @html.css('.title').css('.headline')
    node = @html.css('#headline').css('.title') if is_empty?(node)
    node = @html.css('.title') if is_empty?(node)
    node.text
  end

  # Get the number of connections
  #
  # Returns the text of the first '.member-connections' element with the
  # word "connections" removed, or "" when the element is missing.
  def number_of_connections
    node = @html.css('.member-connections').first
    return "" unless node

    node.text.gsub("connections", "").strip
  end

  # Get list of interests
  #
  # Returns an Array of Strings.
  def interests
    nodes = @html.css('#interests').css('.interest')
    nodes = @html.css('#background-interests').css('.interest-item') if is_empty?(nodes)

    nodes.map { |interest| interest.text }
  end

  # Save the full html of the page
  def full_html
    @profile
  end
end
|
data/lib/picture.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Locates the profile picture in a parsed profile page and downloads it.
class Picture
  # html - parsed document responding to #css (PersonalInfo passes the
  #        result of Nokogiri::HTML).
  def initialize(html)
    @html = html
  end

  # Get path to the picture url
  #
  # Returns the 'src' of the first img inside '.profile-picture', or nil
  # when the page has no such image.
  def picture
    img = @html.css('.profile-picture').css('img').first
    img && img['src']
  end

  # Download picture
  #
  # Fetches the picture with wget into "pictures/" (relative to the current
  # working directory) unless a file with that name is already there.
  #
  # Returns the local path as a String, or nil when there is no picture URL.
  # The result of the wget call is not checked, so the returned path is not
  # guaranteed to exist.
  def pic_path
    url = picture
    return nil if url.nil? || url.strip.empty?

    # Get path
    dir = "pictures/"
    full_path = dir + url.split("/").last.chomp.strip

    # Get file. The URL comes from scraped HTML, so it is passed to wget as
    # a separate argument (no shell involved) and after "--" so it can be
    # interpreted neither as shell syntax nor as a wget option.
    system('wget', '-P', dir, '--', url) unless File.file?(full_path)
    delete_duplicate_pics
    full_path
  end

  # Deletes duplicate pictures
  #
  # Removes every file matching "public/uploads/pictures/*.jpg.*".
  # NOTE(review): pic_path downloads into "pictures/" but this cleans up
  # under "public/uploads/pictures/" - presumably the process is expected to
  # run with a matching layout; confirm the two directories are meant to
  # differ.
  def delete_duplicate_pics
    Dir["public/uploads/pictures/*.jpg.*"].each do |path|
      File.delete(path)
    end
  end
end
|
data/lib/utilities.rb
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedinparser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -15,7 +15,12 @@ email: shidash@shidash.com
|
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
18
|
+
files:
|
19
|
+
- lib/jobs.rb
|
20
|
+
- lib/linkedinparser.rb
|
21
|
+
- lib/personal_info.rb
|
22
|
+
- lib/picture.rb
|
23
|
+
- lib/utilities.rb
|
19
24
|
homepage: https://github.com/TransparencyToolkit/linkedinparser
|
20
25
|
licenses:
|
21
26
|
- GPL
|