linkedindata 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/get_related.rb +80 -0
- data/lib/linkedin.rb +66 -0
- data/lib/linkedindata.rb +36 -108
- data/lib/parse_profile.rb +50 -0
- metadata +5 -4
- data/lib/getrelated.rb +0 -55
- data/lib/parseprofile.rb +0 -79
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62911808bef43a12c8723a47135534fd7ff330fb
+  data.tar.gz: 4012d7ef04d34401d79ee1c3b4150e3a353358fc
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1912abe3d5349f5cbbcd4c06ab699926859fb78f75c96aef49f932351791d325a7101b7e585ee81fc0c87869d0b5440d3a7c9c77ecf926a5d280da50a7a1e023
+  data.tar.gz: a4de7d6888cd3ef25edf8037a7624a570890882744305694cc9a45f564b0094b08ae2d4bf0789569915e6ab9e9d56731b873f5bf714f042fb9af5fcfacdc2539
data/lib/get_related.rb
ADDED

@@ -0,0 +1,80 @@
+module GetRelated
+  # Get the list of names of related people
+  def getList(html)
+    namelist = Array.new
+
+    # Save each person's name and url
+    html.css("div.insights-browse-map").each do |d|
+      if d.css("h3").text == "People Also Viewed"
+        d.css("li").each do |l|
+          namelist.push({name: l.css("h4").text,
+                         url: l.css("a")[0]['href']})
+        end
+      end
+    end
+
+    return namelist
+  end
+
+
+  # Get all profiles within numhops of original(s)
+  def getRelatedProfiles
+    @numhops.times do |hop_count|
+      @output.select { |profile| profile[:degree] == hop_count }.each do |item|
+        downloadRelated(item, hop_count) if item[:related_people]
+      end
+    end
+  end
+
+  # Scrapes the related profiles for one result item
+  def downloadRelated(item, hop_count)
+    item[:related_people].each do |related_person|
+      # Check if it has been scraped already
+      if @output.select { |person| related_person[:name] == person[:name] }.empty?
+        scrape(related_person[:url], hop_count+1)
+      end
+    end
+  end
+
+
+  # Make list of profiles for score tracking
+  def fullProfileList(data)
+    profiles = Hash.new
+    data.each do |d|
+      profiles[d[:profile_url]] = 0
+    end
+    return profiles
+  end
+
+  # Adds points to a profile for showing up in related people
+  def addPointsToProfile(profile_scores, data_item, person)
+    if profile_scores[person[:url]]
+      # Score each appearance as 2 / (d * 2), except at degree 0 where it counts as 2
+      degree_divide = data_item[:degree] == 0 ? 1 : data_item[:degree]*2
+      profile_scores[person[:url]] += (2.0/degree_divide)
+    end
+    return profile_scores
+  end
+
+  # Add a score to each profile based on the # of times it appears in "people also viewed"
+  def relScore(data)
+    profile_scores = fullProfileList(data)
+
+    # Get degree and calculate score for each profile
+    data.each do |data_item|
+      if data_item[:related_people]
+        data_item[:related_people].each do |person|
+          profile_scores = addPointsToProfile(profile_scores, data_item, person)
+        end
+      end
+    end
+
+    # Merge scores back into dataset
+    data.each do |m|
+      m.merge!(score: profile_scores[m[:profile_url]])
+    end
+
+    return data
+  end
+end
+
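The scoring methods added here weight each appearance of a profile in another profile's "People Also Viewed" sidebar by the citing profile's degree: 2.0 points when the citing profile is degree 0, otherwise 2.0 / (d * 2). As a quick illustration (not part of the gem), the following standalone Ruby reproduces that arithmetic on two invented profiles:

# Standalone sketch of the math in addPointsToProfile/relScore.
# The profile URLs are made up; the weights mirror the diff above.
data = [
  { profile_url: "http://www.linkedin.com/pub/a", degree: 0,
    related_people: [{ name: "B", url: "http://www.linkedin.com/pub/b" }] },
  { profile_url: "http://www.linkedin.com/pub/b", degree: 1,
    related_people: [{ name: "A", url: "http://www.linkedin.com/pub/a" }] }
]

scores = Hash.new
data.each { |d| scores[d[:profile_url]] = 0 }

data.each do |item|
  item[:related_people].each do |person|
    next unless scores[person[:url]]          # only score profiles in the dataset
    divide = item[:degree] == 0 ? 1 : item[:degree] * 2
    scores[person[:url]] += (2.0 / divide)
  end
end

scores  # => {"http://www.linkedin.com/pub/a"=>1.0, "http://www.linkedin.com/pub/b"=>2.0}

B is cited by a degree-0 profile and earns 2.0; A is cited by a degree-1 profile and earns 1.0.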
data/lib/linkedin.rb
ADDED

@@ -0,0 +1,66 @@
+# Someone already made a nice gem for parsing public profiles:
+# https://github.com/yatish27/linkedin-scraper
+# This class reopens that to add extra things I need
+module Linkedin
+  class Profile
+    include ProxyManager
+    include GetRelated
+
+    def initialize(url, curhops, proxylist, usedproxies)
+      @linkedin_url = url
+      @curhops = curhops
+      @proxylist = proxylist
+      @usedproxies = usedproxies
+
+      # Add attributes to list
+      ATTRIBUTES.push(
+        "related_people",
+        "profile_url",
+        "timestamp",
+        "degree",
+        "pic_path")
+      @page = getPage(url) # Get pages with proxies
+    end
+
+
+    def self.get_profile(url, curhops, proxylist, usedproxies)
+      Linkedin::Profile.new(url, curhops, proxylist, usedproxies)
+    rescue => e
+      puts e
+    end
+
+    # Gets the "people also viewed" list from the profile sidebar
+    def related_people
+      @related_people ||= getList(Nokogiri::HTML(@page.body))
+    end
+
+    # Similar to linkedin_url
+    def profile_url
+      @profile_url ||= @linkedin_url
+    end
+
+    # Get the time the profile was scraped
+    def timestamp
+      @timestamp ||= Time.now
+    end
+
+    # Get the number of hops out where profile appears
+    def degree
+      @degree ||= @curhops
+    end
+
+    # Download the profile picture
+    def pic_path
+      if picture
+        # Get path
+        dir = "public/uploads/pictures/"
+        full_path = dir + picture.split("/").last.chomp.strip
+
+        # Get file
+        `wget -P #{dir} #{picture}` if !File.file?(full_path)
+        return full_path
+      end
+    end
+
+  end
+end
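A hypothetical usage sketch of the reopened class. It assumes ProxyManager#getPage (defined elsewhere in this gem, not shown in this diff) returns a Mechanize-style page whose #body is the profile HTML, and that the upstream linkedin-scraper gem supplies attribute readers such as #name; the URL and proxy file path are illustrative:

proxies = IO.readlines("proxies.txt")   # illustrative proxy list, one proxy per line
profile = Linkedin::Profile.get_profile(
  "http://www.linkedin.com/pub/some-person/1/2b3/4c5", 0, proxies, Hash.new)

if profile                               # get_profile returns nil when construction raised
  puts profile.name                      # reader inherited from linkedin-scraper
  puts profile.degree                    # => 0, the hop count passed in above
  puts profile.related_people.inspect    # parsed from the "People Also Viewed" sidebar
end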
data/lib/linkedindata.rb
CHANGED

@@ -1,81 +1,46 @@
-require 'mechanize'
 require 'linkedin-scraper'
+require 'generalscraper'
 require 'json'
 require 'nokogiri'
-require 'open-uri'
-load 'parseprofile.rb'
-require 'pry'
-require 'urlarchiver'
 require 'set'
 
+load 'parse_profile.rb'
+load 'get_related.rb'
+load 'linkedin.rb'
+
 class LinkedinData
-
-
+  include GetRelated
+  include ParseProfile
+  include Linkedin
+
+  def initialize(todegree, proxylist)
+    @proxylist = IO.readlines(proxylist)
+    @proxy_list_path = proxylist
+    @usedproxies = Hash.new
     @output = Array.new
     @startindex = 10
     @numhops = todegree
   end
 
   # Searches for profiles on Google
-  def search
-
-
-
-    gform.q = "site:linkedin.com/pub " + @input
-    page = agent.submit(gform, gform.buttons.first)
-    examine(page)
-  end
-
-  # Examines a search page
-  def examine(page)
-    # Separate getting profile links and going to next page
-    # Method for getting links to all result pages
-    # Different method for getting all profile links on page and scraping (split to new thread for this)
-    # Has own output set, merge into full one at end (make sure threadsafe)
-
-    # Have own input and output
-    page.links.each do |link|
-      if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
-        saveurl = link.href.split("?q=")
-
-        if saveurl[1]
-          url = saveurl[1].split("&")
-          begin
-            scrape(url[0], 0)
-          rescue
-          end
-        end
-      end
-
-      # Find the link to the next page and go to it
-      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-        url1 = link.href.split("&start=")
-        url2 = url1[1].split("&sa=N")
-
-        if url2[0].to_i == @startindex
-          sleep(rand(30..90))
-          @startindex += 10
-          agent = Mechanize.new
-          examine(agent.get("http://google.com" + link.href))
-        end
-      end
+  def search(search_terms)
+    g = GeneralScraper.new("site:linkedin.com/pub", search_terms, @proxy_list_path)
+    JSON.parse(g.getURLs).each do |profile|
+      scrape(profile, 0)
     end
   end
 
-  # Scrapes profile
+  # Scrapes and parses individual profile
  def scrape(url, curhops)
    # Download profile and rescue on error
    begin
      url.gsub!("https", "http")
-     profile = Linkedin::Profile.get_profile(url)
+     profile = Linkedin::Profile.get_profile(url, curhops, @proxylist, @usedproxies)
    rescue
    end
 
-    # Parse profile if returned
-    if profile
-      p = ParseProfile.new(profile, url, curhops)
-      @output.concat(p.parse)
-    end
+    # Parse profile if returned and add to output
+    @output.concat(parseResume(profile)) if profile
  end
 
  # Make sure all keys that occur occur in each item (even if nil)
@@ -101,59 +66,22 @@ class LinkedinData
     return datarr
   end
 
-  #
-  def
-
-
-
-    data.each do |d|
-      profiles[d["profile_url"]] = 0
-    end
-
-    # Get degree for each profile
-    data.each do |i|
-      if i["related_people"]
-        i["related_people"].each do |p|
-          if profiles[p["url"]]
-            # Calculate degree- (2/d*2) except when degree is 0
-            degree_divide = i["degree"] == 0 ? 1 : i["degree"]*2
-            profiles[p["url"]] += (2.0/degree_divide)
-          end
-        end
-      end
-    end
-
-    # Merge scores back into dataset
-    data.each do |m|
-      m.merge!(:score => profiles[m["profile_url"]])
-    end
-
-    return data
+  # Gets related profiles then adds relevance scores and any missing keys
+  def prepareResults
+    getRelatedProfiles
+    deleteDuplicatePics
+    return JSON.pretty_generate(relScore(showAllKeys(@output)))
   end
 
-  # Gets
-  def
-
-
-
-
-
-
-
-
-    o[:related_people].each do |i|
-      if @output.select { |obj| obj[:name] == i[:name]}.empty?
-        scrape(i[:url], o[:degree]+1)
-      end
-    end
-  end
-
-  end
-  end
-  end
-
-  formatted_json = JSON.pretty_generate(relScore(showAllKeys(@output)))
-  return formatted_json
+  # Gets one profile and the related profiles
+  def getSingleProfile(url)
+    scrape(url, 0)
+    return prepareResults
+  end
+
+  # Gets all profiles in search results and returns in JSON
+  def getByKeywords(search_term)
+    search(search_term)
+    return prepareResults
   end
 end
-
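Taken together, the rewrite leaves the class with two entry points, both returning pretty-printed JSON. A minimal usage sketch, assuming a proxies.txt file with one proxy per line (the path, search terms, and profile URL are illustrative):

require 'linkedindata'

ld = LinkedinData.new(2, "proxies.txt")  # follow "people also viewed" links out to 2 hops

# Scrape every Google result for site:linkedin.com/pub plus the search terms:
json = ld.getByKeywords("ruby developer boston")

# Or start from one known profile and fan out to its related profiles:
json = ld.getSingleProfile("http://www.linkedin.com/pub/some-person/1/2b3/4c5")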
data/lib/parse_profile.rb
ADDED

@@ -0,0 +1,50 @@
+module ParseProfile
+  # Parse profile into items by company
+  def parseResume(profile)
+    output = Array.new
+
+    # Parse profiles for current companies
+    profile.current_companies.each do |c|
+      output.push(addPersonFields(c, "Yes", profile))
+    end
+
+    # Parse past position/company info
+    profile.past_companies.each do |c|
+      output.push(addPersonFields(c, "No", profile))
+    end
+
+    return output
+  end
+
+  # Deletes duplicate pictures
+  def deleteDuplicatePics
+    pics = Dir["public/uploads/pictures/*.jpg.*"]
+    pics.each do |p|
+      File.delete(p)
+    end
+  end
+
+  # Merge person data with role data
+  def addPersonFields(c, status, profile)
+    c.merge!(
+      skills: profile.skills,
+      certifications: profile.certifications,
+      languages: profile.languages,
+      name: profile.name,
+      location: profile.location,
+      area: profile.country,
+      industry: profile.industry,
+      picture: profile.picture,
+      organizations: profile.organizations,
+      groups: profile.groups,
+      education: profile.education,
+      websites: profile.websites,
+      profile_url: profile.profile_url,
+      current: status,
+      timestamp: profile.timestamp,
+      related_people: profile.related_people,
+      degree: profile.degree,
+      pic_path: profile.pic_path)
+    return c
+  end
+end
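Note that parseResume emits one record per company entry rather than one per person: each element of profile.current_companies and profile.past_companies is a role Hash supplied by the upstream linkedin-scraper gem, and addPersonFields copies the person-level fields into every one of them. An illustrative record shape (all values, and the role keys :company and :title, are invented here):

{
  company: "Example Corp",   # role fields: assumed keys from linkedin-scraper
  title: "Engineer",
  name: "Jane Doe",          # person fields merged in by addPersonFields
  current: "Yes",            # "Yes" for a current company, "No" for a past one
  degree: 0,
  related_people: [{ name: "John Roe", url: "http://www.linkedin.com/pub/john-roe/2/3c4/5d6" }],
  pic_path: "public/uploads/pictures/jane-doe-photo.jpg"
}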
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedindata
 version: !ruby/object:Gem::Version
-  version: 0.0.17
+  version: 0.0.18
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-04-11 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes all LinkedIn profiles including terms you specify.
 email: shidash@shidash.com
@@ -16,9 +16,10 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/getrelated.rb
+- lib/get_related.rb
+- lib/linkedin.rb
 - lib/linkedindata.rb
-- lib/parseprofile.rb
+- lib/parse_profile.rb
 homepage: https://github.com/transparencytoolkit/linkedindata
 licenses:
 - GPL
data/lib/getrelated.rb
DELETED

@@ -1,55 +0,0 @@
-require 'json'
-require 'nokogiri'
-require 'open-uri'
-
-class GetRelated
-  def initialize(url)
-    @url = url
-    @relatedlist = Array.new
-  end
-
-  # Get the list of names of related people
-  def getList
-    html = Nokogiri::HTML(open(@url.gsub("http", "https")))
-
-    if html
-      namelist = Array.new
-
-      # Go through each person
-      html.css("div.insights-browse-map").each do |d|
-        if d.css("h3").text == "People Also Viewed"
-          d.css("li").each do |l|
-            temphash = Hash.new
-            temphash[:name] = l.css("h4").text
-            temphash[:url] = l.css("a")[0]['href']
-            namelist.push(temphash)
-          end
-        end
-      end
-
-      return namelist
-    end
-  end
-end
-
-# This is just an outline for the next version of getrelated
-
-# Add degree back as field (0 by default)
-# Loop through all profiles
-# Load n times (need to determine optimal num)
-# Save list of related people (for profile- make list and append if seen listed as related or in related list)
-# Save overall list of related people (with URLs and min degree)
-# Track min degrees out
-
-# Go through overall list of related people
-# Parse profile
-# Make sure degree is correct when saved
-# Maybe save in JSONs by degree
-
-
-# Info:
-# Profiles of related people
-# Degrees for all profiles
-# Related people list on each profile (complete)
-
-# Deduplicate
data/lib/parseprofile.rb
DELETED

@@ -1,79 +0,0 @@
-require 'json'
-load 'getrelated.rb'
-require 'pry'
-
-class ParseProfile
-  def initialize(profile, url, curhops)
-    @profile = profile
-    @url = url
-    @output = Array.new
-    @related_people
-    @curhops = curhops
-  end
-
-  # Parse profile
-  def parse
-    begin
-      g = GetRelated.new(@url)
-      @related_people = g.getList
-    rescue
-    end
-
-    # Parse profiles for current companies
-    @profile.current_companies.each do |c|
-      @output.push(parseCompany(c, "Yes"))
-    end
-
-    # Parse past position/company info
-    @profile.past_companies.each do |c|
-      @output.push(parseCompany(c, "No"))
-    end
-
-    # Clean up directories
-    pics = Dir["public/uploads/*.jpg.*"]
-    pics.each do |p|
-      File.delete(p)
-    end
-
-    return @output
-  end
-
-  # Merge person data with role data
-  def parseCompany(c, status)
-    c.merge!(
-      :skills => @profile.skills,
-      :certifications => @profile.certifications,
-      :languages => @profile.languages,
-      :name => @profile.first_name + " " + @profile.last_name,
-      :location => @profile.location,
-      :area => @profile.country,
-      :industry => @profile.industry,
-      :picture => @profile.picture,
-      :organizations => @profile.organizations,
-      :groups => @profile.groups,
-      :education => @profile.education,
-      :websites => @profile.websites,
-      :profile_url => @url,
-      :current => status,
-      :timestamp => Time.now,
-      :related_people => @related_people,
-      :degree => @curhops)
-    c.merge!(:pic_path => getPic)
-    return c
-  end
-
-  # Download pictures
-  def getPic
-    if @profile.picture
-      path = @profile.picture.split("/")
-      if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
-        begin
-          `wget -P public/uploads/pictures #{@profile.picture}`
-        rescue
-        end
-      end
-
-      return "public/uploads/pictures/" + path[path.length-1].chomp.strip
-    end
-  end
-end