linkedinparser 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 33dd5468e0b1c0ed881f443d224a247eda5b37d4
4
- data.tar.gz: 6eb3e10f6954f0ce5d82a60ecd9fc3041d70f1db
3
+ metadata.gz: 55b734b221bd743748b734b022e649c57461fdc5
4
+ data.tar.gz: b22cab96393fd2b48b1e4ff7c623fe93f5a09c88
5
5
  SHA512:
6
- metadata.gz: 609148d3105b8215854a96efd502b0dd1ca6662a54a8f8160b32eeb3037ca319abc80e4fdb8243119f0b15198ffa1eb99c795aed6aef95ebd86a7806a4d734d6
7
- data.tar.gz: 4404b31657ce3b78ddaec10a43c005bf86fc1321e0c2cc68865dcd2b2ea1603e53459bc6f29fb6a0ba2b128a7bb01d751cdf913b6250c6b53e01aecbfe7c95fe
6
+ metadata.gz: 7715be94c914d84bd0226e6bf1a9df77bc0b3275134e648089e7f8e1a692995b7a56f96eb9d424d97e21b43c27af96c243155ba349ce6b99e41f90dca667acdb
7
+ data.tar.gz: 1e2aa43c38a5e83298689bd2d6be9e880c4371493d21a424aa0400f6b60e35a238d118f93d57a41be2f91c4f6f5deba92f2b497230f67ecf69b22ab81d97664c
data/lib/jobs.rb ADDED
@@ -0,0 +1,92 @@
1
+ # coding: utf-8
2
+ load 'utilities.rb'
3
+
4
# Extracts the work-experience entries from the HTML of a profile page.
#
# The HTML is parsed once on construction. Every position found becomes a
# hash with the keys :title, :company, :description, :start_date, :end_date,
# :work_location and :current.
class Jobs
  include Utilities

  # Text that separates the start and the end of a date range
  DATE_SEPARATOR = ' – '

  # A date that consists of nothing but a four-digit year, e.g. "2011"
  YEAR_ONLY = /\A(19|20)\d{2}\z/

  # profile - String with the HTML source of the profile page
  def initialize(profile)
    @html = Nokogiri::HTML(profile)
    parse_jobs
  end

  # Get list of jobs (Array of Hashes, one per position)
  def get_jobs
    @positions_list
  end

  # Find the position nodes and build the list of job hashes from them
  def parse_jobs
    # Multiple html options
    positions = @html.css('#experience').css('.position')
    if is_empty?(positions)
      background = @html.css('#background-experience')
      positions = background.css('.current-position') + background.css('.past-position')
    end

    # Get lists of positions
    @positions_list = positions.map do |position|
      {
        title: title(position),
        company: company(position),
        description: description(position),
        start_date: start_date(position),
        end_date: end_date(position),
        work_location: work_location(position),
        current: current(position)
      }
    end
  end

  # Check if it is a current position or not ("Yes" / "No")
  def current(position)
    end_date(position) == "Present" ? "Yes" : "No"
  end

  # Get the job title
  def title(position)
    position.css('h4').text
  end

  # Get the company for the position
  def company(position)
    position.css('h5').text
  end

  # Get job description
  def description(position)
    position.css('.description').text
  end

  # Get the nodes holding the date range (two possible page layouts)
  def get_dates(position)
    dates = position.css('.meta').css('.date-range')
    dates = position.css('.experience-date-locale') if is_empty?(dates)
    dates
  end

  # Get start date as a Date, or nil when the position shows no dates
  def start_date(position)
    date_parse(get_dates(position).text.split(DATE_SEPARATOR).first)
  end

  # Get end date.
  #
  # Returns the String "Present" for an ongoing position, a Date for a
  # finished one, and nil when no end date can be found.
  def end_date(position)
    range_end = get_dates(position).text.split(DATE_SEPARATOR).last
    # An empty date text splits into [], so there is nothing to parse
    return nil if range_end.nil?

    # Drop a trailing parenthesised part such as "(2 years)"
    finish = range_end.split("(").first.to_s.strip
    return finish if finish == "Present"

    # Same parser as the start date, so year-only end dates work too
    date_parse(finish)
  end

  # Parse date.
  #
  # date - String such as "June 2010" or "2010"; nil/blank gives nil.
  #
  # A bare year is expanded to January 1st of that year first, because
  # Date.parse does not read four digits on their own as a year.
  # Raises ArgumentError (from Date.parse) for text that is not a date.
  def date_parse(date)
    text = date.to_s.strip
    return nil if text.empty?

    text = "#{text}-01-01" if text =~ YEAR_ONLY
    Date.parse(text)
  end

  # Get location for work
  def work_location(position)
    position.css('.experience-date-locale').css('.locality').text
  end
end
@@ -0,0 +1,63 @@
1
+ require 'selenium-webdriver'
2
+ require 'pry'
3
+ require 'nokogiri'
4
+ load 'personal_info.rb'
5
+ load 'jobs.rb'
6
+
7
# Parses one profile page and renders the result as JSON, either as one
# record per job or as a single record for the person with nested jobs.
class LinkedinParser
  # profile        - String with the HTML source of the profile page
  # profile_url    - URL the page was fetched from
  # crawler_fields - Hash of extra fields merged into every result
  #                  (nil is treated as "no extra fields")
  def initialize(profile, profile_url, crawler_fields)
    @profile = profile
    @profile_url = profile_url
    @crawler_fields = crawler_fields || {}
    parse
  end

  # Run the sub-parsers and keep their results
  def parse
    # Get details about the person
    @personal_info = PersonalInfo.new(@profile, @profile_url).get_personal_info

    # Get job info
    @job_info = Jobs.new(@profile).get_jobs
  end

  # Return results with new item for each job.
  #
  # Each item is a copy of the job hash with the personal info and the
  # crawler fields merged in. Non-destructive merge keeps @job_info intact,
  # so this method and results_by_person can be called in any order and
  # any number of times.
  def results_by_job
    output = @job_info.map do |job|
      job.merge(@personal_info).merge(@crawler_fields)
    end

    JSON.pretty_generate(output)
  end

  # Return results in nested JSON.
  #
  # Builds a new hash instead of writing :jobs into @personal_info; writing
  # into it would make a later results_by_job nest every job inside itself.
  def results_by_person
    output = @personal_info.merge(jobs: @job_info).merge(@crawler_fields)
    JSON.pretty_generate(output)
  end

  # TODO: Fields to add to parser-
  # Organizations
  # Education
  # Projects
  # Related people
  # Languages
  # Certifications
  # Groups
end
52
+
53
+ # Test:
54
+ #profile = Selenium::WebDriver::Firefox::Profile.new
55
+ #profile['intl.accept_languages'] = 'en'
56
+ #profile["javascript.enabled"] = false
57
+ #driver = Selenium::WebDriver.for :firefox, profile: profile
58
+ #url = "https://www.linkedin.com/pub/christopher-mcclellan/5b/a09/ba9"
59
+ #url = "https://www.linkedin.com/pub/maryann-holmes/2b/770/3b2"
60
+ #driver.navigate.to url
61
+
62
+ #l = LinkedinParser.new(driver.page_source, url, {timestamp: Time.now})
63
+ #puts l.results_by_job
@@ -0,0 +1,126 @@
1
+ load 'picture.rb'
2
+ load 'utilities.rb'
3
+
4
# Extracts the personal details (name, location, skills, headline, ...)
# from the HTML of a profile page and collects them in one hash.
#
# Fields that are missing from the page come back as empty strings or empty
# arrays instead of raising, so an incomplete profile still parses.
class PersonalInfo
  include Utilities

  # profile     - String with the HTML source of the profile page
  # profile_url - URL the page was fetched from
  def initialize(profile, profile_url)
    @profile = profile
    @html = Nokogiri::HTML(profile)
    @profile_url = profile_url

    # Parse attributes
    pic = Picture.new(@html)
    @personal_info = {
      profile_url: @profile_url,
      full_name: full_name,
      first_name: first_name,
      last_name: last_name,
      skills: skills,
      full_location: full_location,
      location: location,
      area: area,
      industry: industry,
      summary: summary,
      current_title: title,
      interests: interests,
      number_of_connections: number_of_connections,
      picture: pic.picture,
      pic_path: pic.pic_path,
      full_html: full_html
    }
  end

  # Return person hash
  def get_personal_info
    @personal_info
  end

  # Get the full name of the person
  def full_name
    @html.css(".profile-overview").css('h1').text
  end

  # Get first part of name ("" when the page has no name)
  def first_name
    # split on an empty string gives [], hence the to_s before strip
    full_name.split(" ", 2).first.to_s.strip
  end

  # Get last part of name: everything after the first space, or the whole
  # name when it is a single word ("" when the page has no name)
  def last_name
    full_name.split(" ", 2).last.to_s.strip
  end

  # Get list of skills (Array of Strings)
  def skills
    # Two formatting options for skills
    nodes = @html.css('#skills').css('.skill')
    nodes = @html.css('.skill-pill .endorse-item-name-text') if is_empty?(nodes)

    # Make list of skills
    nodes.map(&:text)
  end

  # Get full location
  def full_location
    @html.css('.profile-overview').css('.locality').text
  end

  # Get town: the part before the first comma ("" when there is no location)
  def location
    full_location.split(",").first.to_s.strip
  end

  # Get country/state: the part after the last comma ("" when there is no
  # location)
  def area
    full_location.split(",").last.to_s.strip
  end

  # Get the industry the person works in (2 different formats)
  def industry
    # [1] is nil when there are fewer than two descriptors; is_empty?
    # treats nil as empty, so the second format is tried in that case
    node = @html.css('.profile-overview').css('.descriptor')[1]
    node = @html.css('.profile-overview').css('.industry') if is_empty?(node)
    node.text
  end

  # Get the summary field (2 different formats), "" when neither is present
  def summary
    node = @html.css('#summary').css('.description')
    node = @html.css('.summary').first if is_empty?(node)
    # first returns nil when the fallback selector matches nothing
    node ? node.text : ""
  end

  # Get the overall/current title
  def title
    node = @html.css('.title').css('.headline')
    node = @html.css('#headline').css('.title') if is_empty?(node)
    node = @html.css('.title') if is_empty?(node)
    node.text
  end

  # Get the number of connections as shown on the page, without the word
  # "connections" ("" when the page does not show it)
  def number_of_connections
    node = @html.css('.member-connections').first
    return "" if node.nil?

    node.text.gsub("connections", "").strip
  end

  # Get list of interests (Array of Strings)
  def interests
    nodes = @html.css('#interests').css('.interest')
    nodes = @html.css('#background-interests').css('.interest-item') if is_empty?(nodes)

    nodes.map(&:text)
  end

  # Save the full html of the page
  def full_html
    @profile
  end
end
data/lib/picture.rb ADDED
@@ -0,0 +1,32 @@
1
# Finds the profile picture in a parsed profile page and downloads it.
class Picture
  # Directory (relative to the working directory) pictures are saved in
  PICTURE_DIR = "pictures/"

  # html - parsed document that responds to #css (a Nokogiri document in
  #        the calling code)
  def initialize(html)
    @html = html
  end

  # Get path to the picture url.
  # Returns nil when the page contains no profile picture.
  def picture
    img = @html.css('.profile-picture').css('img').first
    img && img['src']
  end

  # Download picture.
  #
  # Returns the local path of the picture, or nil when the page has no
  # picture (or its src is empty). The file is only fetched if it is not
  # already present.
  def pic_path
    url = picture
    return nil if url.nil? || url.empty?

    # Get path
    full_path = PICTURE_DIR + url.split("/").last.chomp.strip

    # Get file. The command is given as an argument list so that the URL,
    # which comes from the scraped page, never passes through a shell.
    IO.popen(["wget", "-P", PICTURE_DIR, url], &:read) unless File.file?(full_path)
    delete_duplicate_pics
    full_path
  end

  # Deletes duplicate pictures
  #
  # NOTE(review): this cleans "public/uploads/pictures/" while downloads go
  # to PICTURE_DIR ("pictures/") - confirm which directory is intended.
  def delete_duplicate_pics
    Dir["public/uploads/pictures/*.jpg.*"].each do |path|
      File.delete(path)
    end
  end
end
data/lib/utilities.rb ADDED
@@ -0,0 +1,6 @@
1
# Helpers shared by the parser classes.
module Utilities
  # Check if item is nil or empty.
  #
  # item - nil, an object with a #text method (the parsers pass node and
  #        node-set objects), or any other object such as a String or Array.
  #
  # Objects with #text are judged by that text; anything else is judged by
  # its own #empty? when it has one. Returns true or false.
  def is_empty?(item)
    return true if item.nil?

    content = item.respond_to?(:text) ? item.text : item
    return true if content.nil?

    content.respond_to?(:empty?) && content.empty?
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedinparser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -15,7 +15,12 @@ email: shidash@shidash.com
15
15
  executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
18
- files: []
18
+ files:
19
+ - lib/jobs.rb
20
+ - lib/linkedinparser.rb
21
+ - lib/personal_info.rb
22
+ - lib/picture.rb
23
+ - lib/utilities.rb
19
24
  homepage: https://github.com/TransparencyToolkit/linkedinparser
20
25
  licenses:
21
26
  - GPL