linkedindata 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/getrelated.rb +50 -0
- data/lib/linkedindata.rb +85 -0
- data/lib/parseprofile.rb +76 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3bc1d73941a6228073ef054491452a8ecc34f0d6
|
4
|
+
data.tar.gz: 35046bb711b902b5a8650b6995d291afc48c3702
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5286e02124965b5f02ecbad34699840071ca0938c52e9da7d796683f150e0f796fa5d12ddca85763e997c95fbc1b99e573e89cc21344dddfd5709beaaa3434c8
|
7
|
+
data.tar.gz: 0015c5ef88cb4c14e187412e0da448ef40194c24d12fbbddc71ea50682e5769e9fa76930925eb08e654d1db7de91d1e5a3be7027cd41003b2beba0019f6ff82b
|
data/lib/getrelated.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
# Collects the names listed in the "related people" section of a profile page.
class GetRelated
  # @param url [String, URI] address of the profile page to download
  def initialize(url)
    @url = url
    # Kept from the original implementation; nothing in this class reads it yet.
    @relatedlist = []
  end

  # Get the list of names of related people
  #
  # Downloads the page and returns the text of every <h4> found inside each
  # <li> of every "div.insights-browse-map" element, in document order.
  #
  # @return [Array<String>] one entry per <li>; empty when the page has no such section
  # @raise [StandardError] URI parsing, network and HTTP errors propagate to the caller
  def getList
    fetch_document.css("div.insights-browse-map").flat_map do |section|
      section.css("li").map { |item| item.css("h4").text }
    end
  end

  private

  # Downloads @url and parses it into a Nokogiri document.
  #
  # The request goes through URI#open instead of Kernel#open so that a value
  # starting with "|" can never be run as a shell command, and so the fetch
  # still works on Ruby 3 where Kernel#open no longer accepts URLs. The block
  # form closes the response handle as soon as parsing is done.
  def fetch_document
    URI(@url).open { |io| Nokogiri::HTML(io) }
  end
end
|
29
|
+
|
30
|
+
# This is just an outline for the next version of getrelated
|
31
|
+
|
32
|
+
# Add degree back as field (0 by default)
|
33
|
+
# Loop through all profiles
|
34
|
+
# Load n times (need to determine optimal num)
|
35
|
+
# Save list of related people (for profile- make list and append if seen listed as related or in related list)
|
36
|
+
# Save overall list of related people (with URLs and min degree)
|
37
|
+
# Track min degrees out
|
38
|
+
|
39
|
+
# Go through overall list of related people
|
40
|
+
# Parse profile
|
41
|
+
# Make sure degree is correct when saved
|
42
|
+
# Maybe save in JSONs by degree
|
43
|
+
|
44
|
+
|
45
|
+
# Info:
|
46
|
+
# Profiles of related people
|
47
|
+
# Degrees for all profiles
|
48
|
+
# Related people list on each profile (complete)
|
49
|
+
|
50
|
+
# Deduplicate
|
data/lib/linkedindata.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'linkedin-scraper'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'open-uri'
|
6
|
+
load 'parseprofile.rb'
|
7
|
+
require 'pry'
|
8
|
+
require 'urlarchiver'
|
9
|
+
|
10
|
+
# Finds LinkedIn public profiles through a Google search and collects the
# parsed data of every profile it manages to download.
class LinkedinData
  # @param input [String] search terms appended to the "site:linkedin.com/pub" query
  # @param todegree [Object] accepted for interface compatibility; not used by this class
  def initialize(input, todegree)
    @input = input
    @output = []
    # Google paginates in steps of ten; this is the "start" value of the next unseen page.
    @startindex = 10
  end

  # Searches for profiles on Google
  #
  # Submits the query through Google's search form and hands the first
  # result page to #examine.
  def search
    agent = Mechanize.new
    agent.user_agent_alias = 'Linux Firefox'
    gform = agent.get("http://google.com").form("f")
    gform.q = "site:linkedin.com/pub #{@input}"
    page = agent.submit(gform, gform.buttons.first)
    examine(page)
  end

  # Examines a search page
  #
  # Scrapes every LinkedIn profile linked from the page and follows the link
  # to the next result page when one is present.
  #
  # @param page [#links] result page whose links respond to #href
  def examine(page)
    page.links.each do |link|
      href = link.href
      next unless href # links without an href have nothing to inspect

      profile_url = profile_url_from(href)
      scrape_safely(profile_url) if profile_url

      # Find the link to the next page and go to it
      follow_next_page(href)
    end
  end

  # Scrapes profile
  #
  # Downloads the profile over plain http and appends its parsed records to
  # the collected output. The argument itself is left untouched.
  #
  # @param url [String] profile address
  # @return [Array, nil] the accumulated output, or nil when no profile was returned
  def scrape(url)
    # Only the scheme is rewritten; "https" elsewhere in the URL is left alone.
    http_url = url.to_s.sub(/\Ahttps:/, "http:")

    profile = begin
      Linkedin::Profile.get_profile(http_url)
    rescue StandardError => e
      warn "LinkedinData: could not download #{http_url}: #{e.class}: #{e.message}"
      nil
    end

    # Parse profile if returned
    return unless profile

    @output.concat(ParseProfile.new(profile, http_url).parse)
  end

  # Gets all data and returns in JSON
  #
  # @return [String] pretty-printed JSON array of every collected record
  def getData
    search
    JSON.pretty_generate(@output)
  end

  private

  # Extracts the profile address from a Google result link.
  #
  # @return [String, nil] the text between "?q=" and the next "&", or nil when
  #   the link is not a LinkedIn result (cache links and the query's own
  #   links are skipped)
  def profile_url_from(href)
    return unless href.include?("linkedin.com")
    return if href.include?("webcache") || href.include?("site:linkedin.com/pub+")

    target = href.split("?q=")[1]
    target && target.split("&")[0]
  end

  # Runs #scrape without letting one bad profile stop the rest of the page.
  def scrape_safely(profile_url)
    scrape(profile_url)
  rescue StandardError => e
    warn "LinkedinData: skipping #{profile_url}: #{e.class}: #{e.message}"
  end

  # Follows a pagination link if it points at the next unseen result page.
  # Links to other pages are ignored, so each page is visited once.
  def follow_next_page(href)
    return unless href.include?("&sa=N") && href.include?("&start=")

    start = href.split("&start=")[1].to_s.split("&sa=N")[0].to_i
    return unless start == @startindex

    sleep(rand(5..10)) # random pause between result pages
    @startindex += 10
    examine(Mechanize.new.get("http://google.com" + href))
  end
end
|
data/lib/parseprofile.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'json'
|
2
|
+
load 'getrelated.rb'
|
3
|
+
|
4
|
+
# Turns one scraped profile into a list of records, one per company the
# person works or worked at, each carrying the person's profile data.
class ParseProfile
  # Directory profile pictures are downloaded into.
  PICTURE_DIR = "public/uploads/pictures"

  # @param profile [Object] scraped profile exposing current_companies,
  #   past_companies and the attribute readers used in #parseCompany
  # @param url [String] address the profile was downloaded from
  def initialize(profile, url)
    @profile = profile
    @url = url
    @output = []
    @related_people = nil
  end

  # Parse profile
  #
  # @return [Array<Hash>] one merged record per current and past company
  def parse
    @related_people = fetch_related_people

    # Parse profiles for current companies
    @profile.current_companies.each { |company| @output.push(parseCompany(company, "Yes")) }

    # Parse past position/company info
    @profile.past_companies.each { |company| @output.push(parseCompany(company, "No")) }

    # Clean up directories
    # NOTE(review): pictures are saved under public/uploads/pictures but this
    # glob looks in public/uploads - confirm which directory is intended.
    Dir["public/uploads/*.jpg.*"].each { |stray| File.delete(stray) }

    @output
  end

  # Merge person data with role data
  #
  # @param c [Hash] company/position record; it is updated in place
  # @param status [String] "Yes" for a current position, "No" for a past one
  # @return [Hash] the same hash, extended with the person's data
  def parseCompany(c, status)
    c.merge!(
      :skills => @profile.skills,
      :certifications => @profile.certifications,
      :languages => @profile.languages,
      # A missing first or last name no longer raises; the present part is used.
      :name => [@profile.first_name, @profile.last_name].compact.join(" "),
      :location => @profile.location,
      :area => @profile.country,
      :industry => @profile.industry,
      :picture => @profile.picture,
      :organizations => @profile.organizations,
      :groups => @profile.groups,
      :education => @profile.education,
      :websites => @profile.websites,
      :profile_url => @url,
      :current => status,
      :timestamp => Time.now,
      :related_people => @related_people
    )
    c.merge!(:pic_path => getPic)
    c
  end

  # Download pictures
  #
  # Downloads the profile picture unless a file with the same name is already
  # present in PICTURE_DIR.
  #
  # @return [String, nil] local path of the picture, or nil when the profile
  #   has no picture or its URL yields no file name
  def getPic
    picture = @profile.picture
    return unless picture

    filename = picture.split("/").last.to_s.strip
    return if filename.empty?

    local_path = File.join(PICTURE_DIR, filename)
    download_picture(picture) unless File.file?(local_path)
    local_path
  end

  private

  # Looks up the related-people names for this profile. The lookup is
  # best-effort: a failure is reported and the profile is still parsed.
  #
  # @return [Array<String>, nil] nil when the lookup failed
  def fetch_related_people
    GetRelated.new(@url).getList
  rescue StandardError => e
    warn "ParseProfile: related people unavailable for #{@url}: #{e.class}: #{e.message}"
    nil
  end

  # Fetches the picture with wget. The URL comes from scraped page content,
  # so it is passed as a separate argument (no shell involved) and after "--"
  # so it cannot be read as a wget option.
  def download_picture(picture_url)
    fetched = system("wget", "-P", PICTURE_DIR, "--", picture_url)
    warn "ParseProfile: could not download #{picture_url}" unless fetched
  rescue StandardError => e
    warn "ParseProfile: could not download #{picture_url}: #{e.class}: #{e.message}"
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedindata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -15,7 +15,10 @@ email: shidash@shidash.com
|
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
18
|
+
files:
|
19
|
+
- lib/linkedindata.rb
|
20
|
+
- lib/parseprofile.rb
|
21
|
+
- lib/getrelated.rb
|
19
22
|
homepage: https://github.com/transparencytoolkit/linkedindata
|
20
23
|
licenses:
|
21
24
|
- GPL
|