linkedindata 0.0.17 → 0.0.18
- checksums.yaml +4 -4
- data/lib/get_related.rb +80 -0
- data/lib/linkedin.rb +66 -0
- data/lib/linkedindata.rb +36 -108
- data/lib/parse_profile.rb +50 -0
- metadata +5 -4
- data/lib/getrelated.rb +0 -55
- data/lib/parseprofile.rb +0 -79
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62911808bef43a12c8723a47135534fd7ff330fb
+  data.tar.gz: 4012d7ef04d34401d79ee1c3b4150e3a353358fc
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1912abe3d5349f5cbbcd4c06ab699926859fb78f75c96aef49f932351791d325a7101b7e585ee81fc0c87869d0b5440d3a7c9c77ecf926a5d280da50a7a1e023
+  data.tar.gz: a4de7d6888cd3ef25edf8037a7624a570890882744305694cc9a45f564b0094b08ae2d4bf0789569915e6ab9e9d56731b873f5bf714f042fb9af5fcfacdc2539
data/lib/get_related.rb
ADDED
@@ -0,0 +1,80 @@
+module GetRelated
+  # Get the list of names of related people
+  def getList(html)
+    namelist = Array.new
+
+    # Save each person's name and url
+    html.css("div.insights-browse-map").each do |d|
+      if d.css("h3").text == "People Also Viewed"
+        d.css("li").each do |l|
+          namelist.push({name: l.css("h4").text,
+                         url: l.css("a")[0]['href']})
+        end
+      end
+    end
+
+    return namelist
+  end
+
+
+  # Get all profiles within numhops of original(s)
+  def getRelatedProfiles
+    @numhops.times do |hop_count|
+      @output.select { |profile| profile[:degree] == hop_count }.each do |item|
+        downloadRelated(item, hop_count) if item[:related_people]
+      end
+    end
+  end
+
+  # Scrapes the related profiles for one result item
+  def downloadRelated(item, hop_count)
+    item[:related_people].each do |related_person|
+      # Check if it has been scraped already
+      if @output.select { |person| related_person[:name] == person[:name] }.empty?
+        scrape(related_person[:url], hop_count+1)
+      end
+    end
+  end
+
+
+  # Make list of profiles for score tracking
+  def fullProfileList(data)
+    profiles = Hash.new
+    data.each do |d|
+      profiles[d[:profile_url]] = 0
+    end
+    return profiles
+  end
+
+  # Adds points to a profile for showing up in related people
+  def addPointsToProfile(profile_scores, data_item, person)
+    if profile_scores[person[:url]]
+      # Calculate degree- (2/d*2) except when degree is 0
+      degree_divide = data_item[:degree] == 0 ? 1 : data_item[:degree]*2
+      profile_scores[person[:url]] += (2.0/degree_divide)
+    end
+    return profile_scores
+  end
+
+  # Add a score to each profile based on the # of times it appears in "people also viewed"
+  def relScore(data)
+    profile_scores = fullProfileList(data)
+
+    # Get degree and calculate score for each profile
+    data.each do |data_item|
+      if data_item[:related_people]
+        data_item[:related_people].each do |person|
+          profile_scores = addPointsToProfile(profile_scores, data_item, person)
+        end
+      end
+    end
+
+    # Merge scores back into dataset
+    data.each do |m|
+      m.merge!(score: profile_scores[m[:profile_url]])
+    end
+
+    return data
+  end
+end
+
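For a sense of how the scoring in relScore behaves, here is a minimal sketch (not part of the gem; the profile URLs and names are invented). An appearance in a degree-0 profile's sidebar is worth 2 points; an appearance in a degree-d sidebar is worth 2/(d*2):

load 'get_related.rb'

class ScoreDemo
  include GetRelated   # mix in relScore and its helpers
end

data = [
  {profile_url: "http://linkedin.com/pub/alice", degree: 0,
   related_people: [{name: "Bob", url: "http://linkedin.com/pub/bob"}]},
  {profile_url: "http://linkedin.com/pub/bob", degree: 1,
   related_people: [{name: "Alice", url: "http://linkedin.com/pub/alice"}]}
]

scored = ScoreDemo.new.relScore(data)
# Bob appears in Alice's degree-0 sidebar: divisor forced to 1, so Bob gets 2.0
# Alice appears in Bob's degree-1 sidebar: 2.0/(1*2), so Alice gets 1.0
scored.map { |r| [r[:profile_url], r[:score]] }
# => [["http://linkedin.com/pub/alice", 1.0], ["http://linkedin.com/pub/bob", 2.0]]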
data/lib/linkedin.rb
ADDED
@@ -0,0 +1,66 @@
+# Someone already made a nice gem for parsing public profiles:
+# https://github.com/yatish27/linkedin-scraper
+# This class reopens that to add extra things I need
+module Linkedin
+  class Profile
+    include ProxyManager
+    include GetRelated
+
+    def initialize(url, curhops, proxylist, usedproxies)
+      @linkedin_url = url
+      @curhops = curhops
+      @proxylist = proxylist
+      @usedproxies = usedproxies
+
+      # Add attributes to list
+      ATTRIBUTES.push(
+        "related_people",
+        "profile_url",
+        "timestamp",
+        "degree",
+        "pic_path")
+      @page = getPage(url) # Get pages with proxies
+    end
+
+
+    def self.get_profile(url, curhops, proxylist, usedproxies)
+      Linkedin::Profile.new(url, curhops, proxylist, usedproxies)
+    rescue => e
+      puts e
+    end
+
+    # Gets "people also viewed" list from profile sidebar
+    def related_people
+      @related_people ||= getList(Nokogiri::HTML(@page.body))
+    end
+
+    # Similar to linkedin_url
+    def profile_url
+      @profile_url ||= @linkedin_url
+    end
+
+    # Get the time the profile was scraped
+    def timestamp
+      @timestamp ||= Time.now
+    end
+
+    # Get the number of hops out where profile appears
+    def degree
+      @degree ||= @curhops
+    end
+
+    # Download the profile picture
+    def pic_path
+      if picture
+        # Get path
+        dir = "public/uploads/pictures/"
+        full_path = dir+picture.split("/").last.chomp.strip
+
+        # Get file
+        `wget -P #{dir} #{picture}` if !File.file?(full_path)
+        return full_path
+      end
+    end
+
+  end
+end
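Since the reopened class now takes the extra proxy arguments, construction looks roughly like this (a hedged sketch: proxies.txt is a made-up filename, the profile URL is invented, and getPage is assumed to come from the toolkit's ProxyManager module, which is not shown in this gem):

proxylist   = IO.readlines("proxies.txt")  # hypothetical proxy list, one per line
usedproxies = Hash.new

profile = Linkedin::Profile.get_profile(
  "http://www.linkedin.com/pub/example-person/1/2/345",  # invented URL
  0, proxylist, usedproxies)

profile.related_people  # sidebar names/urls via GetRelated#getList
profile.degree          # => 0, the curhops value passed in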
data/lib/linkedindata.rb
CHANGED
@@ -1,81 +1,46 @@
-require 'mechanize'
 require 'linkedin-scraper'
+require 'generalscraper'
 require 'json'
 require 'nokogiri'
-require 'open-uri'
-load 'parseprofile.rb'
-require 'pry'
-require 'urlarchiver'
 require 'set'
 
+load 'parse_profile.rb'
+load 'get_related.rb'
+load 'linkedin.rb'
+
 class LinkedinData
-
-
+  include GetRelated
+  include ParseProfile
+  include Linkedin
+
+  def initialize(todegree, proxylist)
+    @proxylist = IO.readlines(proxylist)
+    @proxy_list_path = proxylist
+    @usedproxies = Hash.new
     @output = Array.new
     @startindex = 10
     @numhops = todegree
   end
 
   # Searches for profiles on Google
-  def search
-
-
-
-    gform.q = "site:linkedin.com/pub " + @input
-    page = agent.submit(gform, gform.buttons.first)
-    examine(page)
-  end
-
-  # Examines a search page
-  def examine(page)
-    # Separate getting profile links and going to next page
-    # Method for getting links to all result pages
-    # Different method for getting all profile links on page and scraping (split to new thread for this)
-    # Has own output set, merge into full one at end (make sure threadsafe)
-
-    # Have own input and output
-    page.links.each do |link|
-      if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
-        saveurl = link.href.split("?q=")
-
-        if saveurl[1]
-          url = saveurl[1].split("&")
-          begin
-            scrape(url[0], 0)
-          rescue
-          end
-        end
-      end
-
-      # Find the link to the next page and go to it
-      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-        url1 = link.href.split("&start=")
-        url2 = url1[1].split("&sa=N")
-
-        if url2[0].to_i == @startindex
-          sleep(rand(30..90))
-          @startindex += 10
-          agent = Mechanize.new
-          examine(agent.get("http://google.com" + link.href))
-        end
-      end
+  def search(search_terms)
+    g = GeneralScraper.new("site:linkedin.com/pub", search_terms, @proxy_list_path)
+    JSON.parse(g.getURLs).each do |profile|
+      scrape(profile, 0)
     end
   end
 
-  # Scrapes profile
+  # Scrapes and parses individual profile
   def scrape(url, curhops)
     # Download profile and rescue on error
     begin
       url.gsub!("https", "http")
-      profile = Linkedin::Profile.get_profile(url)
+      profile = Linkedin::Profile.get_profile(url, curhops, @proxylist, @usedproxies)
     rescue
     end
 
-    # Parse profile if returned
-    if profile
-      p = ParseProfile.new(profile, url, curhops)
-      @output.concat(p.parse)
-    end
+    # Parse profile if returned and add to output
+    @output.concat(parseResume(profile)) if profile
   end
 
   # Make sure all keys that occur occur in each item (even if nil)
@@ -101,59 +66,22 @@ class LinkedinData
     return datarr
   end
 
-  #
-  def
-
-
-
-    data.each do |d|
-      profiles[d["profile_url"]] = 0
-    end
-
-    # Get degree for each profile
-    data.each do |i|
-      if i["related_people"]
-        i["related_people"].each do |p|
-          if profiles[p["url"]]
-            # Calculate degree- (2/d*2) except when degree is 0
-            degree_divide = i["degree"] == 0 ? 1 : i["degree"]*2
-            profiles[p["url"]] += (2.0/degree_divide)
-          end
-        end
-      end
-    end
-
-    # Merge scores back into dataset
-    data.each do |m|
-      m.merge!(:score => profiles[m["profile_url"]])
-    end
-
-    return data
+  # Gets related profiles then adds relevance scores and any missing keys
+  def prepareResults
+    getRelatedProfiles
+    deleteDuplicatePics
+    return JSON.pretty_generate(relScore(showAllKeys(@output)))
   end
 
-  # Gets
-  def
-
-
-
-
-
-
-
-    o[:related_people].each do |i|
-      if @output.select { |obj| obj[:name] == i[:name]}.empty?
-        scrape(i[:url], o[:degree]+1)
-      end
-    end
-  end
-
-  end
-  end
-  end
-
-  formatted_json = JSON.pretty_generate(relScore(showAllKeys(@output)))
-  return formatted_json
+  # Gets one profile and the related profiles
+  def getSingleProfile(url)
+    scrape(url, 0)
+    return prepareResults
+  end
+
+  # Gets all profiles in search results and returns in JSON
+  def getByKeywords(search_term)
+    search(search_term)
+    return prepareResults
   end
 end
-
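Put together, the new public entry points are getByKeywords and getSingleProfile. A usage sketch (the search term, proxy file, and profile URL are placeholders):

ld = LinkedinData.new(2, "proxies.txt")  # follow related profiles out to 2 hops

# Search Google for matching linkedin.com/pub profiles via GeneralScraper:
json = ld.getByKeywords("example search terms")

# Or start from one known profile and expand through "People Also Viewed":
json = ld.getSingleProfile("http://www.linkedin.com/pub/example-person/1/2/345")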
data/lib/parse_profile.rb
ADDED
@@ -0,0 +1,50 @@
+module ParseProfile
+  # Parse profile into items by company
+  def parseResume(profile)
+    output = Array.new
+
+    # Parse profiles for current companies
+    profile.current_companies.each do |c|
+      output.push(addPersonFields(c, "Yes", profile))
+    end
+
+    # Parse past position/company info
+    profile.past_companies.each do |c|
+      output.push(addPersonFields(c, "No", profile))
+    end
+
+    return output
+  end
+
+  # Deletes duplicate pictures
+  def deleteDuplicatePics
+    pics = Dir["public/uploads/pictures/*.jpg.*"]
+    pics.each do |p|
+      File.delete(p)
+    end
+  end
+
+  # Merge person data with role data
+  def addPersonFields(c, status, profile)
+    c.merge!(
+      skills: profile.skills,
+      certifications: profile.certifications,
+      languages: profile.languages,
+      name: profile.name,
+      location: profile.location,
+      area: profile.country,
+      industry: profile.industry,
+      picture: profile.picture,
+      organizations: profile.organizations,
+      groups: profile.groups,
+      education: profile.education,
+      websites: profile.websites,
+      profile_url: profile.profile_url,
+      current: status,
+      timestamp: profile.timestamp,
+      related_people: profile.related_people,
+      degree: profile.degree,
+      pic_path: profile.pic_path)
+    return c
+  end
+end
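Each element that parseResume returns is one company/role hash from linkedin-scraper with the person-level fields merged in. Roughly, with invented values:

{
  title: "Example Title", company: "Example Co",  # role fields from linkedin-scraper
  name: "Example Person",
  current: "Yes",          # "Yes" for current_companies, "No" for past_companies
  degree: 0,               # hops from the starting profile
  related_people: [{name: "Someone Else", url: "http://linkedin.com/pub/..."}],
  pic_path: "public/uploads/pictures/example.jpg",
  # ...plus skills, certifications, languages, location, area, industry,
  # picture, organizations, groups, education, websites, profile_url, timestamp
}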
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedindata
 version: !ruby/object:Gem::Version
-  version: 0.0.17
+  version: 0.0.18
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-04-11 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes all LinkedIn profiles including terms you specify.
 email: shidash@shidash.com
@@ -16,9 +16,10 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/getrelated.rb
+- lib/get_related.rb
+- lib/linkedin.rb
 - lib/linkedindata.rb
-- lib/parseprofile.rb
+- lib/parse_profile.rb
 homepage: https://github.com/transparencytoolkit/linkedindata
 licenses:
 - GPL
data/lib/getrelated.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
require 'json'
|
2
|
-
require 'nokogiri'
|
3
|
-
require 'open-uri'
|
4
|
-
|
5
|
-
class GetRelated
|
6
|
-
def initialize(url)
|
7
|
-
@url = url
|
8
|
-
@relatedlist = Array.new
|
9
|
-
end
|
10
|
-
|
11
|
-
# Get the list of names of related people
|
12
|
-
def getList
|
13
|
-
html = Nokogiri::HTML(open(@url.gsub("http", "https")))
|
14
|
-
|
15
|
-
if html
|
16
|
-
namelist = Array.new
|
17
|
-
|
18
|
-
# Go through each person
|
19
|
-
html.css("div.insights-browse-map").each do |d|
|
20
|
-
if d.css("h3").text == "People Also Viewed"
|
21
|
-
d.css("li").each do |l|
|
22
|
-
temphash = Hash.new
|
23
|
-
temphash[:name] = l.css("h4").text
|
24
|
-
temphash[:url] = l.css("a")[0]['href']
|
25
|
-
namelist.push(temphash)
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
return namelist
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
# This is just an outline for the next version of getrelated
|
36
|
-
|
37
|
-
# Add degree back as field (0 by default)
|
38
|
-
# Loop through all profiles
|
39
|
-
# Load n times (need to determine optimal num)
|
40
|
-
# Save list of related people (for profile- make list and append if seen listed as related or in related list)
|
41
|
-
# Save overall list of related people (with URLs and min degree)
|
42
|
-
# Track min degrees out
|
43
|
-
|
44
|
-
# Go through overall list of related people
|
45
|
-
# Parse profile
|
46
|
-
# Make sure degree is correct when saved
|
47
|
-
# Maybe save in JSONs by degree
|
48
|
-
|
49
|
-
|
50
|
-
# Info:
|
51
|
-
# Profiles of related people
|
52
|
-
# Degrees for all profiles
|
53
|
-
# Related people list on each profile (complete)
|
54
|
-
|
55
|
-
# Deduplicate
|
data/lib/parseprofile.rb
DELETED
@@ -1,79 +0,0 @@
|
|
1
|
-
require 'json'
|
2
|
-
load 'getrelated.rb'
|
3
|
-
require 'pry'
|
4
|
-
|
5
|
-
class ParseProfile
|
6
|
-
def initialize(profile, url, curhops)
|
7
|
-
@profile = profile
|
8
|
-
@url = url
|
9
|
-
@output = Array.new
|
10
|
-
@related_people
|
11
|
-
@curhops = curhops
|
12
|
-
end
|
13
|
-
|
14
|
-
# Parse profile
|
15
|
-
def parse
|
16
|
-
begin
|
17
|
-
g = GetRelated.new(@url)
|
18
|
-
@related_people = g.getList
|
19
|
-
rescue
|
20
|
-
end
|
21
|
-
|
22
|
-
# Parse profiles for current companies
|
23
|
-
@profile.current_companies.each do |c|
|
24
|
-
@output.push(parseCompany(c, "Yes"))
|
25
|
-
end
|
26
|
-
|
27
|
-
# Parse past position/company info
|
28
|
-
@profile.past_companies.each do |c|
|
29
|
-
@output.push(parseCompany(c, "No"))
|
30
|
-
end
|
31
|
-
|
32
|
-
# Clean up directories
|
33
|
-
pics = Dir["public/uploads/*.jpg.*"]
|
34
|
-
pics.each do |p|
|
35
|
-
File.delete(p)
|
36
|
-
end
|
37
|
-
|
38
|
-
return @output
|
39
|
-
end
|
40
|
-
|
41
|
-
# Merge person data with role data
|
42
|
-
def parseCompany(c, status)
|
43
|
-
c.merge!(
|
44
|
-
:skills => @profile.skills,
|
45
|
-
:certifications => @profile.certifications,
|
46
|
-
:languages => @profile.languages,
|
47
|
-
:name => @profile.first_name + " " + @profile.last_name,
|
48
|
-
:location => @profile.location,
|
49
|
-
:area => @profile.country,
|
50
|
-
:industry => @profile.industry,
|
51
|
-
:picture => @profile.picture,
|
52
|
-
:organizations => @profile.organizations,
|
53
|
-
:groups => @profile.groups,
|
54
|
-
:education => @profile.education,
|
55
|
-
:websites => @profile.websites,
|
56
|
-
:profile_url => @url,
|
57
|
-
:current => status,
|
58
|
-
:timestamp => Time.now,
|
59
|
-
:related_people => @related_people,
|
60
|
-
:degree => @curhops)
|
61
|
-
c.merge!(:pic_path => getPic)
|
62
|
-
return c
|
63
|
-
end
|
64
|
-
|
65
|
-
# Download pictures
|
66
|
-
def getPic
|
67
|
-
if @profile.picture
|
68
|
-
path = @profile.picture.split("/")
|
69
|
-
if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
70
|
-
begin
|
71
|
-
`wget -P public/uploads/pictures #{@profile.picture}`
|
72
|
-
rescue
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
return "public/uploads/pictures/" + path[path.length-1].chomp.strip
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|