linkedindata 0.0.17 → 0.0.18

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: a5c91c293adffc48f543a68f568efbb3da7995ea
-   data.tar.gz: 61d5376780067945e8666e9aceb2485a58b4391b
+   metadata.gz: 62911808bef43a12c8723a47135534fd7ff330fb
+   data.tar.gz: 4012d7ef04d34401d79ee1c3b4150e3a353358fc
  SHA512:
-   metadata.gz: 03651bdc5fc45d1c4ca3d15818029f4f8a5b7e743996b9d58955d24354544fbb1f49e459d2b1ad22af3f117a3342bf8b792ae39eca9f84a9f77e35b67cdff303
-   data.tar.gz: a1fb3827faa3f640769cc87d1a84a8c9857e15f4c35151d3b6776ce557ed921e4bbae6a588791452e2b29f951d8525fd16709aa899dc4072bdf01a2a5c2ccdf6
+   metadata.gz: 1912abe3d5349f5cbbcd4c06ab699926859fb78f75c96aef49f932351791d325a7101b7e585ee81fc0c87869d0b5440d3a7c9c77ecf926a5d280da50a7a1e023
+   data.tar.gz: a4de7d6888cd3ef25edf8037a7624a570890882744305694cc9a45f564b0094b08ae2d4bf0789569915e6ab9e9d56731b873f5bf714f042fb9af5fcfacdc2539
data/lib/get_related.rb ADDED
@@ -0,0 +1,80 @@
+ module GetRelated
+   # Get the list of names of related people
+   def getList(html)
+     namelist = Array.new
+
+     # Save each person's name and url
+     html.css("div.insights-browse-map").each do |d|
+       if d.css("h3").text == "People Also Viewed"
+         d.css("li").each do |l|
+           namelist.push({name: l.css("h4").text,
+                          url: l.css("a")[0]['href']})
+         end
+       end
+     end
+
+     return namelist
+   end
+
+
+   # Get all profiles within numhops of original(s)
+   def getRelatedProfiles
+     @numhops.times do |hop_count|
+       @output.select { |profile| profile[:degree] == hop_count }.each do |item|
+         downloadRelated(item, hop_count) if item[:related_people]
+       end
+     end
+   end
+
+   # Scrapes the related profiles for one result item
+   def downloadRelated(item, hop_count)
+     item[:related_people].each do |related_person|
+       # Check if it has been scraped already
+       if @output.select { |person| related_person[:name] == person[:name] }.empty?
+         scrape(related_person[:url], hop_count+1)
+       end
+     end
+   end
+
+
+   # Make list of profiles for score tracking
+   def fullProfileList(data)
+     profiles = Hash.new
+     data.each do |d|
+       profiles[d[:profile_url]] = 0
+     end
+     return profiles
+   end
+
+   # Adds points to a profile for showing up in related people
+   def addPointsToProfile(profile_scores, data_item, person)
+     if profile_scores[person[:url]]
+       # Score contribution is 2/(degree*2); a degree of 0 counts as 1
+       degree_divide = data_item[:degree] == 0 ? 1 : data_item[:degree]*2
+       profile_scores[person[:url]] += (2.0/degree_divide)
+     end
+     return profile_scores
+   end
+
+   # Add a score to each profile based on the # of times it appears in "people also viewed"
+   def relScore(data)
+     profile_scores = fullProfileList(data)
+
+     # Get degree and calculate score for each profile
+     data.each do |data_item|
+       if data_item[:related_people]
+         data_item[:related_people].each do |person|
+           profile_scores = addPointsToProfile(profile_scores, data_item, person)
+         end
+       end
+     end
+
+     # Merge scores back into dataset
+     data.each do |m|
+       m.merge!(score: profile_scores[m[:profile_url]])
+     end
+
+     return data
+   end
+ end
+
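
A note on the scoring above: each time a profile shows up in another profile's "People Also Viewed" list, it earns 2.0/(degree*2) points, where degree is the hop count of the profile it appears on, and degree 0 is special-cased so a seed-profile appearance is worth 2.0, one hop 1.0, two hops 0.5, and so on. A minimal sketch of relScore in isolation, with fabricated URLs and assuming get_related.rb is loadable from the current directory:

# Minimal sketch; URLs are fabricated.
require_relative "get_related"
include GetRelated

data = [
  { profile_url: "http://linkedin.com/pub/a", degree: 0,
    related_people: [{ name: "B", url: "http://linkedin.com/pub/b" }] },
  { profile_url: "http://linkedin.com/pub/b", degree: 1,
    related_people: [{ name: "A", url: "http://linkedin.com/pub/a" }] }
]

relScore(data).each do |p|
  puts "#{p[:profile_url]} => #{p[:score]}"
end
# http://linkedin.com/pub/a => 1.0   (appears on a degree-1 profile)
# http://linkedin.com/pub/b => 2.0   (appears on a degree-0 profile)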
data/lib/linkedin.rb ADDED
@@ -0,0 +1,66 @@
+ # Someone already made a nice gem for parsing public profiles:
+ # https://github.com/yatish27/linkedin-scraper
+ # This class reopens that to add extra things I need
+ module Linkedin
+   class Profile
+     include ProxyManager
+     include GetRelated
+
+     def initialize(url, curhops, proxylist, usedproxies)
+       @linkedin_url = url
+       @curhops = curhops
+       @proxylist = proxylist
+       @usedproxies = usedproxies
+
+       # Add attributes to list
+       ATTRIBUTES.push(
+         "related_people",
+         "profile_url",
+         "timestamp",
+         "degree",
+         "pic_path")
+       @page = getPage(url) # Get page with proxies
+     end
+
+
+     def self.get_profile(url, curhops, proxylist, usedproxies)
+       Linkedin::Profile.new(url, curhops, proxylist, usedproxies)
+     rescue => e
+       puts e
+     end
+
+     # Gets "people also viewed" list from the profile sidebar
+     def related_people
+       @related_people ||= getList(Nokogiri::HTML(@page.body))
+     end
+
+     # Similar to linkedin_url
+     def profile_url
+       @profile_url ||= @linkedin_url
+     end
+
+     # Get the time the profile was scraped
+     def timestamp
+       @timestamp ||= Time.now
+     end
+
+     # Get the number of hops out where profile appears
+     def degree
+       @degree ||= @curhops
+     end
+
+     # Download the profile picture
+     def pic_path
+       if picture
+         # Get path
+         dir = "public/uploads/pictures/"
+         full_path = dir + picture.split("/").last.chomp.strip
+
+         # Get file
+         `wget -P #{dir} #{picture}` if !File.file?(full_path)
+         return full_path
+       end
+     end
+
+   end
+ end
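
A sketch of driving the reopened class directly, assuming the proxy list is an array of proxy strings (as IO.readlines produces in linkedindata.rb) and that the external ProxyManager mixin supplies getPage; the profile URL and proxies.txt filename are hypothetical:

# Hypothetical usage of the reopened class above.
proxies = IO.readlines("proxies.txt")   # assumed format: one proxy per line
profile = Linkedin::Profile.get_profile(
  "http://linkedin.com/pub/example/1/23/456", 0, proxies, Hash.new)

if profile
  p profile.related_people   # => [{name: "...", url: "..."}, ...]
  p profile.degree           # => 0 for a seed profile
end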
data/lib/linkedindata.rb CHANGED
@@ -1,81 +1,46 @@
- require 'mechanize'
  require 'linkedin-scraper'
+ require 'generalscraper'
  require 'json'
  require 'nokogiri'
- require 'open-uri'
- load 'parseprofile.rb'
- require 'pry'
- require 'urlarchiver'
  require 'set'

+ load 'parse_profile.rb'
+ load 'get_related.rb'
+ load 'linkedin.rb'
+
  class LinkedinData
-   def initialize(input, todegree)
-     @input = input
+   include GetRelated
+   include ParseProfile
+   include Linkedin
+
+   def initialize(todegree, proxylist)
+     @proxylist = IO.readlines(proxylist)
+     @proxy_list_path = proxylist
+     @usedproxies = Hash.new
      @output = Array.new
      @startindex = 10
      @numhops = todegree
    end

    # Searches for profiles on Google
-   def search
-     agent = Mechanize.new
-     agent.user_agent_alias = 'Linux Firefox'
-     gform = agent.get("http://google.com").form("f")
-     gform.q = "site:linkedin.com/pub " + @input
-     page = agent.submit(gform, gform.buttons.first)
-     examine(page)
-   end
-
-   # Examines a search page
-   def examine(page)
-     # Separate getting profile links and going to next page
-     # Method for getting links to all result pages
-     # Different method for getting all profile links on page and scraping (split to new thread for this)
-     # Has own output set, merge into full one at end (make sure threadsafe)
-
-     # Have own input and output
-     page.links.each do |link|
-       if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
-         saveurl = link.href.split("?q=")
-
-         if saveurl[1]
-           url = saveurl[1].split("&")
-           begin
-             scrape(url[0], 0)
-           rescue
-           end
-         end
-       end
-
-       # Find the link to the next page and go to it
-       if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-         url1 = link.href.split("&start=")
-         url2 = url1[1].split("&sa=N")
-
-         if url2[0].to_i == @startindex
-           sleep(rand(30..90))
-           @startindex += 10
-           agent = Mechanize.new
-           examine(agent.get("http://google.com" + link.href))
-         end
-       end
+   def search(search_terms)
+     g = GeneralScraper.new("site:linkedin.com/pub", search_terms, @proxy_list_path)
+     JSON.parse(g.getURLs).each do |profile|
+       scrape(profile, 0)
      end
    end

-   # Scrapes profile
+   # Scrapes and parses individual profile
    def scrape(url, curhops)
      # Download profile and rescue on error
      begin
        url.gsub!("https", "http")
-       profile = Linkedin::Profile.get_profile(url)
+       profile = Linkedin::Profile.get_profile(url, curhops, @proxylist, @usedproxies)
      rescue
      end

-     # Parse profile if returned
-     if profile
-       p = ParseProfile.new(profile, url, curhops)
-       @output.concat(p.parse)
-     end
+     # Parse profile if returned and add to output
+     @output.concat(parseResume(profile)) if profile
    end

    # Make sure all keys that occur occur in each item (even if nil)
@@ -101,59 +66,22 @@ class LinkedinData
      return datarr
    end

-   # Add a score to each profile based on the # of times it appears in "people also viewed"
-   def relScore(data)
-
-     # Make list of profiles
-     profiles = Hash.new
-     data.each do |d|
-       profiles[d["profile_url"]] = 0
-     end
-
-     # Get degree for each profile
-     data.each do |i|
-       if i["related_people"]
-         i["related_people"].each do |p|
-           if profiles[p["url"]]
-             # Calculate degree- (2/d*2) except when degree is 0
-             degree_divide = i["degree"] == 0 ? 1 : i["degree"]*2
-             profiles[p["url"]] += (2.0/degree_divide)
-           end
-         end
-       end
-     end
-
-     # Merge scores back into dataset
-     data.each do |m|
-       m.merge!(:score => profiles[m["profile_url"]])
-     end
-
-     return data
+   # Gets related profiles then adds relevance scores and any missing keys
+   def prepareResults
+     getRelatedProfiles
+     deleteDuplicatePics
+     return JSON.pretty_generate(relScore(showAllKeys(@output)))
    end

-   # Gets all data and returns in JSON
-   def getData
-     search
-
-     # Get related profiles
-     @numhops.times do
-       @output.each do |o|
-         if o[:degree] < @numhops
-
-           if o[:related_people]
-             o[:related_people].each do |i|
-               if @output.select { |obj| obj[:name] == i[:name]}.empty?
-                 scrape(i[:url], o[:degree]+1)
-               end
-             end
-           end
-
-         end
-       end
-     end
-
-     formatted_json = JSON.pretty_generate(relScore(showAllKeys(@output)))
-     return formatted_json
+   # Gets one profile and the related profiles
+   def getSingleProfile(url)
+     scrape(url, 0)
+     return prepareResults
+   end
+
+   # Gets all profiles in search results and returns in JSON
+   def getByKeywords(search_term)
+     search(search_term)
+     return prepareResults
    end
  end
-
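
With these changes the public entry points are getSingleProfile and getByKeywords rather than the old getData. A minimal usage sketch of the 0.0.18 API, assuming a proxy list file of one proxy per line (the filename and URL are hypothetical):

# Minimal sketch; values are hypothetical.
require 'linkedindata'

l = LinkedinData.new(2, "proxies.txt")   # follow "People Also Viewed" out to 2 hops
json = l.getByKeywords("transparency")   # scrape Google results for matching /pub profiles
# or, starting from one known profile:
# json = l.getSingleProfile("http://linkedin.com/pub/example/1/23/456")
puts json                                # pretty-printed JSON with relevance scores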
data/lib/parse_profile.rb ADDED
@@ -0,0 +1,50 @@
+ module ParseProfile
+   # Parse profile into items by company
+   def parseResume(profile)
+     output = Array.new
+
+     # Parse profiles for current companies
+     profile.current_companies.each do |c|
+       output.push(addPersonFields(c, "Yes", profile))
+     end
+
+     # Parse past position/company info
+     profile.past_companies.each do |c|
+       output.push(addPersonFields(c, "No", profile))
+     end
+
+     return output
+   end
+
+   # Deletes duplicate pictures
+   def deleteDuplicatePics
+     pics = Dir["public/uploads/pictures/*.jpg.*"]
+     pics.each do |p|
+       File.delete(p)
+     end
+   end
+
+   # Merge person data with role data
+   def addPersonFields(c, status, profile)
+     c.merge!(
+       skills: profile.skills,
+       certifications: profile.certifications,
+       languages: profile.languages,
+       name: profile.name,
+       location: profile.location,
+       area: profile.country,
+       industry: profile.industry,
+       picture: profile.picture,
+       organizations: profile.organizations,
+       groups: profile.groups,
+       education: profile.education,
+       websites: profile.websites,
+       profile_url: profile.profile_url,
+       current: status,
+       timestamp: profile.timestamp,
+       related_people: profile.related_people,
+       degree: profile.degree,
+       pic_path: profile.pic_path)
+     return c
+   end
+ end
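
Each element parseResume returns is one position: a company hash from the linkedin-scraper gem merged with the person-level fields above. A fabricated example of one finished item (company-level keys such as :title and :company come from linkedin-scraper and are assumptions here; :score is merged in later by relScore):

# Fabricated for illustration only; not real data.
{
  title: "Software Engineer",      # assumed linkedin-scraper company key
  company: "ExampleCorp",          # assumed linkedin-scraper company key
  name: "Jane Doe",
  current: "Yes",
  degree: 1,
  profile_url: "http://linkedin.com/pub/janedoe",
  related_people: [{ name: "John Doe", url: "http://linkedin.com/pub/johndoe" }],
  pic_path: "public/uploads/pictures/janedoe.jpg",
  score: 1.0
  # ...plus skills, certifications, languages, education, and the other merged fields
}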
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: linkedindata
  version: !ruby/object:Gem::Version
-   version: 0.0.17
+   version: 0.0.18
  platform: ruby
  authors:
  - M. C. McGrath
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-02-24 00:00:00.000000000 Z
+ date: 2015-04-11 00:00:00.000000000 Z
  dependencies: []
  description: Scrapes all LinkedIn profiles including terms you specify.
  email: shidash@shidash.com
@@ -16,9 +16,10 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
- - lib/getrelated.rb
+ - lib/get_related.rb
+ - lib/linkedin.rb
  - lib/linkedindata.rb
- - lib/parseprofile.rb
+ - lib/parse_profile.rb
  homepage: https://github.com/transparencytoolkit/linkedindata
  licenses:
  - GPL
data/lib/getrelated.rb DELETED
@@ -1,55 +0,0 @@
- require 'json'
- require 'nokogiri'
- require 'open-uri'
-
- class GetRelated
-   def initialize(url)
-     @url = url
-     @relatedlist = Array.new
-   end
-
-   # Get the list of names of related people
-   def getList
-     html = Nokogiri::HTML(open(@url.gsub("http", "https")))
-
-     if html
-       namelist = Array.new
-
-       # Go through each person
-       html.css("div.insights-browse-map").each do |d|
-         if d.css("h3").text == "People Also Viewed"
-           d.css("li").each do |l|
-             temphash = Hash.new
-             temphash[:name] = l.css("h4").text
-             temphash[:url] = l.css("a")[0]['href']
-             namelist.push(temphash)
-           end
-         end
-       end
-
-       return namelist
-     end
-   end
- end
-
- # This is just an outline for the next version of getrelated
-
- # Add degree back as field (0 by default)
- # Loop through all profiles
-   # Load n times (need to determine optimal num)
-   # Save list of related people (for profile- make list and append if seen listed as related or in related list)
-   # Save overall list of related people (with URLs and min degree)
-   # Track min degrees out
-
- # Go through overall list of related people
-   # Parse profile
-   # Make sure degree is correct when saved
-   # Maybe save in JSONs by degree
-
-
- # Info:
- # Profiles of related people
- # Degrees for all profiles
- # Related people list on each profile (complete)
-
- # Deduplicate
data/lib/parseprofile.rb DELETED
@@ -1,79 +0,0 @@
- require 'json'
- load 'getrelated.rb'
- require 'pry'
-
- class ParseProfile
-   def initialize(profile, url, curhops)
-     @profile = profile
-     @url = url
-     @output = Array.new
-     @related_people
-     @curhops = curhops
-   end
-
-   # Parse profile
-   def parse
-     begin
-       g = GetRelated.new(@url)
-       @related_people = g.getList
-     rescue
-     end
-
-     # Parse profiles for current companies
-     @profile.current_companies.each do |c|
-       @output.push(parseCompany(c, "Yes"))
-     end
-
-     # Parse past position/company info
-     @profile.past_companies.each do |c|
-       @output.push(parseCompany(c, "No"))
-     end
-
-     # Clean up directories
-     pics = Dir["public/uploads/*.jpg.*"]
-     pics.each do |p|
-       File.delete(p)
-     end
-
-     return @output
-   end
-
-   # Merge person data with role data
-   def parseCompany(c, status)
-     c.merge!(
-       :skills => @profile.skills,
-       :certifications => @profile.certifications,
-       :languages => @profile.languages,
-       :name => @profile.first_name + " " + @profile.last_name,
-       :location => @profile.location,
-       :area => @profile.country,
-       :industry => @profile.industry,
-       :picture => @profile.picture,
-       :organizations => @profile.organizations,
-       :groups => @profile.groups,
-       :education => @profile.education,
-       :websites => @profile.websites,
-       :profile_url => @url,
-       :current => status,
-       :timestamp => Time.now,
-       :related_people => @related_people,
-       :degree => @curhops)
-     c.merge!(:pic_path => getPic)
-     return c
-   end
-
-   # Download pictures
-   def getPic
-     if @profile.picture
-       path = @profile.picture.split("/")
-       if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
-         begin
-           `wget -P public/uploads/pictures #{@profile.picture}`
-         rescue
-         end
-       end
-
-       return "public/uploads/pictures/" + path[path.length-1].chomp.strip
-     end
-   end
- end