linkedindata 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/getrelated.rb +50 -0
- data/lib/linkedindata.rb +85 -0
- data/lib/parseprofile.rb +76 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bc1d73941a6228073ef054491452a8ecc34f0d6
|
4
|
+
data.tar.gz: 35046bb711b902b5a8650b6995d291afc48c3702
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5286e02124965b5f02ecbad34699840071ca0938c52e9da7d796683f150e0f796fa5d12ddca85763e997c95fbc1b99e573e89cc21344dddfd5709beaaa3434c8
|
7
|
+
data.tar.gz: 0015c5ef88cb4c14e187412e0da448ef40194c24d12fbbddc71ea50682e5769e9fa76930925eb08e654d1db7de91d1e5a3be7027cd41003b2beba0019f6ff82b
|
data/lib/getrelated.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
class GetRelated
  # Scrapes the "People Also Viewed" sidebar of a LinkedIn public
  # profile page and extracts the names listed there.
  #
  # url - String URL of the LinkedIn profile to fetch.
  def initialize(url)
    @url = url
    @relatedlist = Array.new
  end

  # Get the list of names of related people.
  #
  # Returns an Array of String names (possibly empty), or nil if the
  # page could not be parsed into a document.
  def getList
    # URI.open instead of Kernel#open: Kernel#open on an attacker-influenced
    # string is a command-injection hazard (a URL beginning with "|" is
    # executed as a shell command) and its URL behavior is removed in
    # Ruby 3. open-uri is already required by this file.
    html = Nokogiri::HTML(URI.open(@url))

    if html
      namelist = Array.new

      # Each related person is an <li> inside the browse-map module;
      # the <h4> element holds the person's display name.
      html.css("div.insights-browse-map").each do |d|
        d.css("li").each do |l|
          namelist.push(l.css("h4").text)
        end
      end

      return namelist
    end
  end
end
|
29
|
+
|
30
|
+
# This is just an outline for the next version of getrelated
|
31
|
+
|
32
|
+
# Add degree back as field (0 by default)
|
33
|
+
# Loop through all profiles
|
34
|
+
# Load n times (need to determine optimal num)
|
35
|
+
# Save list of related people (for profile- make list and append if seen listed as related or in related list)
|
36
|
+
# Save overall list of related people (with URLs and min degree)
|
37
|
+
# Track min degrees out
|
38
|
+
|
39
|
+
# Go through overall list of related people
|
40
|
+
# Parse profile
|
41
|
+
# Make sure degree is correct when saved
|
42
|
+
# Maybe save in JSONs by degree
|
43
|
+
|
44
|
+
|
45
|
+
# Info:
|
46
|
+
# Profiles of related people
|
47
|
+
# Degrees for all profiles
|
48
|
+
# Related people list on each profile (complete)
|
49
|
+
|
50
|
+
# Deduplicate
|
data/lib/linkedindata.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'linkedin-scraper'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
load 'parseprofile.rb'
|
7
|
+
require 'pry'
|
8
|
+
require 'urlarchiver'
|
9
|
+
|
10
|
+
class LinkedinData
  # Scrapes LinkedIn profile data discovered through a Google
  # "site:linkedin.com/pub" search and accumulates it into @output.
  #
  # input    - String search terms appended to the site: query.
  # todegree - degree-of-separation limit intended for the related-profile
  #            crawl. The original code silently discarded this argument;
  #            it is now stored (unused by this version) so the planned
  #            crawl can consume it.
  def initialize(input, todegree)
    @input = input
    @todegree = todegree
    @output = Array.new
    @startindex = 10 # Google "start" offset of the next results page
  end

  # Searches for profiles on Google and hands the first results page
  # to #examine.
  def search
    agent = Mechanize.new
    agent.user_agent_alias = 'Linux Firefox'
    gform = agent.get("http://google.com").form("f")
    gform.q = "site:linkedin.com/pub " + @input
    page = agent.submit(gform, gform.buttons.first)
    examine(page)
  end

  # Examines a search results page: scrapes every LinkedIn profile link
  # on it, then recurses into the next results page (rate-limited).
  #
  # TODO(next version): separate link collection from paging; scrape each
  # page's profile links in its own thread with its own output set and a
  # thread-safe merge at the end.
  def examine(page)
    page.links.each do |link|
      # Profile links only: skip Google cache copies and the query echo.
      if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
        saveurl = link.href.split("?q=")

        if saveurl[1]
          url = saveurl[1].split("&")
          begin
            scrape(url[0])
          rescue StandardError
            # Best effort: one bad profile must not abort the crawl.
          end
        end
      end

      # Find the link to the next page and go to it. Only the link whose
      # start offset matches @startindex is followed, so each results
      # page is visited exactly once.
      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
        url1 = link.href.split("&start=")
        url2 = url1[1].split("&sa=N")

        if url2[0].to_i == @startindex
          sleep(rand(5..10)) # throttle between pages to avoid blocking
          @startindex += 10
          agent = Mechanize.new
          examine(agent.get("http://google.com" + link.href))
        end
      end
    end
  end

  # Scrapes a single profile URL and appends its parsed records to
  # @output. Download failures are swallowed: `profile` stays nil and
  # the guard below skips parsing.
  def scrape(url)
    begin
      url.gsub!("https", "http") # the scraper expects plain http URLs
      profile = Linkedin::Profile.get_profile(url)
    rescue StandardError
      # Deliberate best-effort download; leave `profile` nil.
    end

    if profile
      p = ParseProfile.new(profile, url)
      @output.concat(p.parse)
    end
  end

  # Gets all data (runs the search) and returns it as pretty-printed JSON.
  def getData
    search
    formatted_json = JSON.pretty_generate(@output)
    return formatted_json
  end
end
|
data/lib/parseprofile.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'json'
|
2
|
+
load 'getrelated.rb'
|
3
|
+
|
4
|
+
class ParseProfile
  # Flattens a scraped LinkedIn profile into one output record per
  # company/position, each merged with the person-level fields.
  #
  # profile - the scraped profile object (duck-typed: must respond to the
  #           accessors used in #parseCompany and #getPic).
  # url     - String URL of the profile being parsed.
  def initialize(profile, url)
    @profile = profile
    @url = url
    @output = Array.new
    # The original code had a bare `@related_people` expression here,
    # which is a no-op; initialize it explicitly for clarity. It is
    # populated by #parse before the merges that read it.
    @related_people = nil
  end

  # Parse profile: collect related people, emit one record per current
  # and past company, clean up stray picture downloads, and return the
  # accumulated records.
  #
  # Returns the Array of record Hashes.
  def parse
    begin
      g = GetRelated.new(@url)
      @related_people = g.getList
    rescue StandardError
      # Best effort: related-people scraping is optional.
    end

    # Parse profiles for current companies
    @profile.current_companies.each do |c|
      @output.push(parseCompany(c, "Yes"))
    end

    # Parse past position/company info
    @profile.past_companies.each do |c|
      @output.push(parseCompany(c, "No"))
    end

    # Clean up directories: remove duplicate wget downloads
    # (e.g. "photo.jpg.1") left in the uploads root.
    pics = Dir["public/uploads/*.jpg.*"]
    pics.each do |p|
      File.delete(p)
    end

    return @output
  end

  # Merge person data with role data.
  #
  # c      - Hash of company/position fields (mutated in place).
  # status - "Yes" for a current position, "No" for a past one.
  #
  # Returns the merged Hash.
  def parseCompany(c, status)
    c.merge!(
      :skills => @profile.skills,
      :certifications => @profile.certifications,
      :languages => @profile.languages,
      :name => @profile.first_name + " " + @profile.last_name,
      :location => @profile.location,
      :area => @profile.country,
      :industry => @profile.industry,
      :picture => @profile.picture,
      :organizations => @profile.organizations,
      :groups => @profile.groups,
      :education => @profile.education,
      :websites => @profile.websites,
      :profile_url => @url,
      :current => status,
      :timestamp => Time.now,
      :related_people => @related_people)
    c.merge!(:pic_path => getPic)
    return c
  end

  # Download the profile picture into public/uploads/pictures (if not
  # already cached) and return its local path. Returns nil when the
  # profile has no picture.
  def getPic
    if @profile.picture
      path = @profile.picture.split("/")
      filename = path[path.length - 1].chomp.strip

      if !File.file?("public/uploads/pictures/" + filename)
        begin
          # Argument-vector form of Kernel#system bypasses the shell, so
          # the scraped (untrusted) picture URL cannot inject commands.
          # The old backtick call interpolated it into a shell string.
          system("wget", "-P", "public/uploads/pictures", @profile.picture)
        rescue StandardError
          # Best effort: a failed download must not abort parsing.
        end
      end

      return "public/uploads/pictures/" + filename
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedindata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -15,7 +15,10 @@ email: shidash@shidash.com
|
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
18
|
+
files:
|
19
|
+
- lib/linkedindata.rb
|
20
|
+
- lib/parseprofile.rb
|
21
|
+
- lib/getrelated.rb
|
19
22
|
homepage: https://github.com/transparencytoolkit/linkedindata
|
20
23
|
licenses:
|
21
24
|
- GPL
|