linkedindata 0.0.17 → 0.0.18

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: a5c91c293adffc48f543a68f568efbb3da7995ea
-   data.tar.gz: 61d5376780067945e8666e9aceb2485a58b4391b
+   metadata.gz: 62911808bef43a12c8723a47135534fd7ff330fb
+   data.tar.gz: 4012d7ef04d34401d79ee1c3b4150e3a353358fc
  SHA512:
-   metadata.gz: 03651bdc5fc45d1c4ca3d15818029f4f8a5b7e743996b9d58955d24354544fbb1f49e459d2b1ad22af3f117a3342bf8b792ae39eca9f84a9f77e35b67cdff303
-   data.tar.gz: a1fb3827faa3f640769cc87d1a84a8c9857e15f4c35151d3b6776ce557ed921e4bbae6a588791452e2b29f951d8525fd16709aa899dc4072bdf01a2a5c2ccdf6
+   metadata.gz: 1912abe3d5349f5cbbcd4c06ab699926859fb78f75c96aef49f932351791d325a7101b7e585ee81fc0c87869d0b5440d3a7c9c77ecf926a5d280da50a7a1e023
+   data.tar.gz: a4de7d6888cd3ef25edf8037a7624a570890882744305694cc9a45f564b0094b08ae2d4bf0789569915e6ab9e9d56731b873f5bf714f042fb9af5fcfacdc2539
data/lib/get_related.rb ADDED
@@ -0,0 +1,80 @@
+ module GetRelated
+   # Get the list of names of related people
+   def getList(html)
+     namelist = Array.new
+
+     # Save each person's name and url
+     html.css("div.insights-browse-map").each do |d|
+       if d.css("h3").text == "People Also Viewed"
+         d.css("li").each do |l|
+           namelist.push({name: l.css("h4").text,
+                          url: l.css("a")[0]['href']})
+         end
+       end
+     end
+
+     return namelist
+   end
+
+
+   # Get all profiles within numhops of original(s)
+   def getRelatedProfiles
+     @numhops.times do |hop_count|
+       @output.select { |profile| profile[:degree] == hop_count }.each do |item|
+         downloadRelated(item, hop_count) if item[:related_people]
+       end
+     end
+   end
+
+   # Scrapes the related profiles for one result item
+   def downloadRelated(item, hop_count)
+     item[:related_people].each do |related_person|
+       # Check if it has been scraped already
+       if @output.select { |person| related_person[:name] == person[:name] }.empty?
+         scrape(related_person[:url], hop_count+1)
+       end
+     end
+   end
+
+
+   # Make list of profiles for score tracking
+   def fullProfileList(data)
+     profiles = Hash.new
+     data.each do |d|
+       profiles[d[:profile_url]] = 0
+     end
+     return profiles
+   end
+
+   # Adds points to a profile for showing up in related people
+   def addPointsToProfile(profile_scores, data_item, person)
+     if profile_scores[person[:url]]
+       # Score 2/(degree*2) points, except a full 2 points when degree is 0
+       degree_divide = data_item[:degree] == 0 ? 1 : data_item[:degree]*2
+       profile_scores[person[:url]] += (2.0/degree_divide)
+     end
+     return profile_scores
+   end
+
+   # Add a score to each profile based on the # of times it appears in "people also viewed"
+   def relScore(data)
+     profile_scores = fullProfileList(data)
+
+     # Get degree and calculate score for each profile
+     data.each do |data_item|
+       if data_item[:related_people]
+         data_item[:related_people].each do |person|
+           profile_scores = addPointsToProfile(profile_scores, data_item, person)
+         end
+       end
+     end
+
+     # Merge scores back into dataset
+     data.each do |m|
+       m.merge!(score: profile_scores[m[:profile_url]])
+     end
+
+     return data
+   end
+ end
+
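The scoring above weights each "people also viewed" appearance by the degree of the profile that lists it: a listing on a degree-0 (seed) profile is worth 2.0 points, degree 1 is worth 2/2 = 1.0, degree 2 is worth 2/4 = 0.5, and so on. A minimal worked sketch with hypothetical data, assuming GetRelated is mixed into the caller:

    # Two hypothetical profiles that list each other as related
    data = [
      {profile_url: "http://linkedin.com/pub/a", degree: 0,
       related_people: [{name: "B", url: "http://linkedin.com/pub/b"}]},
      {profile_url: "http://linkedin.com/pub/b", degree: 1,
       related_people: [{name: "A", url: "http://linkedin.com/pub/a"}]}
    ]
    relScore(data)
    # A is listed by the degree-1 profile: scores 2.0/2 = 1.0 points
    # B is listed by the degree-0 profile: scores 2.0/1 = 2.0 points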
data/lib/linkedin.rb ADDED
@@ -0,0 +1,66 @@
+ # Someone already made a nice gem for parsing public profiles:
+ # https://github.com/yatish27/linkedin-scraper
+ # This class reopens that to add extra things I need
+ module Linkedin
+   class Profile
+     include ProxyManager
+     include GetRelated
+
+     def initialize(url, curhops, proxylist, usedproxies)
+       @linkedin_url = url
+       @curhops = curhops
+       @proxylist = proxylist
+       @usedproxies = usedproxies
+
+       # Add attributes to list
+       ATTRIBUTES.push(
+         "related_people",
+         "profile_url",
+         "timestamp",
+         "degree",
+         "pic_path")
+       @page = getPage(url) # Get pages with proxies
+     end
+
+
+     def self.get_profile(url, curhops, proxylist, usedproxies)
+       Linkedin::Profile.new(url, curhops, proxylist, usedproxies)
+     rescue => e
+       puts e
+     end
+
+     # Gets the "people also viewed" list from the profile sidebar
+     def related_people
+       @related_people ||= getList(Nokogiri::HTML(@page.body))
+     end
+
+     # Similar to linkedin_url
+     def profile_url
+       @profile_url ||= @linkedin_url
+     end
+
+     # Get the time the profile was scraped
+     def timestamp
+       @timestamp ||= Time.now
+     end
+
+     # Get the number of hops out where profile appears
+     def degree
+       @degree ||= @curhops
+     end
+
+     # Download the profile picture
+     def pic_path
+       if picture
+         # Get path
+         dir = "public/uploads/pictures/"
+         full_path = dir+picture.split("/").last.chomp.strip
+
+         # Get file
+         `wget -P #{dir} #{picture}` if !File.file?(full_path)
+         return full_path
+       end
+     end
+
+   end
+ end
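Because get_profile rescues any scraping error by printing it and returning nil, callers must handle a nil result (LinkedinData#scrape below guards with "if profile"). A minimal usage sketch; the proxy file and profile URL are hypothetical, and getPage is assumed to come from the ProxyManager mix-in, which is not part of this gem's diff:

    proxylist = IO.readlines("proxies.txt")  # one proxy per line (assumed format)
    profile = Linkedin::Profile.get_profile(
      "http://www.linkedin.com/pub/example-person/1/2a3/4b5", 0, proxylist, Hash.new)
    profile.related_people  # => [{name: "...", url: "..."}, ...]
    profile.degree          # => 0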
data/lib/linkedindata.rb CHANGED
@@ -1,81 +1,46 @@
- require 'mechanize'
  require 'linkedin-scraper'
+ require 'generalscraper'
  require 'json'
  require 'nokogiri'
- require 'open-uri'
- load 'parseprofile.rb'
- require 'pry'
- require 'urlarchiver'
  require 'set'

+ load 'parse_profile.rb'
+ load 'get_related.rb'
+ load 'linkedin.rb'
+
  class LinkedinData
-   def initialize(input, todegree)
-     @input = input
+   include GetRelated
+   include ParseProfile
+   include Linkedin
+
+   def initialize(todegree, proxylist)
+     @proxylist = IO.readlines(proxylist)
+     @proxy_list_path = proxylist
+     @usedproxies = Hash.new
      @output = Array.new
      @startindex = 10
      @numhops = todegree
    end

    # Searches for profiles on Google
-   def search
-     agent = Mechanize.new
-     agent.user_agent_alias = 'Linux Firefox'
-     gform = agent.get("http://google.com").form("f")
-     gform.q = "site:linkedin.com/pub " + @input
-     page = agent.submit(gform, gform.buttons.first)
-     examine(page)
-   end
-
-   # Examines a search page
-   def examine(page)
-     # Separate getting profile links and going to next page
-     # Method for getting links to all result pages
-     # Different method for getting all profile links on page and scraping (split to new thread for this)
-     # Has own output set, merge into full one at end (make sure threadsafe)
-
-     # Have own input and output
-     page.links.each do |link|
-       if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
-         saveurl = link.href.split("?q=")
-
-         if saveurl[1]
-           url = saveurl[1].split("&")
-           begin
-             scrape(url[0], 0)
-           rescue
-           end
-         end
-       end
-
-       # Find the link to the next page and go to it
-       if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-         url1 = link.href.split("&start=")
-         url2 = url1[1].split("&sa=N")
-
-         if url2[0].to_i == @startindex
-           sleep(rand(30..90))
-           @startindex += 10
-           agent = Mechanize.new
-           examine(agent.get("http://google.com" + link.href))
-         end
-       end
+   def search(search_terms)
+     g = GeneralScraper.new("site:linkedin.com/pub", search_terms, @proxy_list_path)
+     JSON.parse(g.getURLs).each do |profile|
+       scrape(profile, 0)
      end
    end

-   # Scrapes profile
+   # Scrapes and parses individual profile
    def scrape(url, curhops)
      # Download profile and rescue on error
      begin
        url.gsub!("https", "http")
-       profile = Linkedin::Profile.get_profile(url)
+       profile = Linkedin::Profile.get_profile(url, curhops, @proxylist, @usedproxies)
      rescue
      end

-     # Parse profile if returned
-     if profile
-       p = ParseProfile.new(profile, url, curhops)
-       @output.concat(p.parse)
-     end
+     # Parse profile if returned and add to output
+     @output.concat(parseResume(profile)) if profile
    end

    # Make sure all keys that occur occur in each item (even if nil)
@@ -101,59 +66,22 @@ class LinkedinData
      return datarr
    end

-   # Add a score to each profile based on the # of times it appears in "people also viewed"
-   def relScore(data)
-
-     # Make list of profiles
-     profiles = Hash.new
-     data.each do |d|
-       profiles[d["profile_url"]] = 0
-     end
-
-     # Get degree for each profile
-     data.each do |i|
-       if i["related_people"]
-         i["related_people"].each do |p|
-           if profiles[p["url"]]
-             # Calculate degree- (2/d*2) except when degree is 0
-             degree_divide = i["degree"] == 0 ? 1 : i["degree"]*2
-             profiles[p["url"]] += (2.0/degree_divide)
-           end
-         end
-       end
-     end
-
-     # Merge scores back into dataset
-     data.each do |m|
-       m.merge!(:score => profiles[m["profile_url"]])
-     end
-
-     return data
+   # Gets related profiles then adds relevance scores and any missing keys
+   def prepareResults
+     getRelatedProfiles
+     deleteDuplicatePics
+     return JSON.pretty_generate(relScore(showAllKeys(@output)))
    end

-   # Gets all data and returns in JSON
-   def getData
-     search
-
-     # Get related profiles
-     @numhops.times do
-       @output.each do |o|
-         if o[:degree] < @numhops
-
-           if o[:related_people]
-             o[:related_people].each do |i|
-               if @output.select { |obj| obj[:name] == i[:name]}.empty?
-                 scrape(i[:url], o[:degree]+1)
-               end
-             end
-           end
-
-         end
-       end
-     end
-
-     formatted_json = JSON.pretty_generate(relScore(showAllKeys(@output)))
-     return formatted_json
+   # Gets one profile and the related profiles
+   def getSingleProfile(url)
+     scrape(url, 0)
+     return prepareResults
+   end
+
+   # Gets all profiles in search results and returns in JSON
+   def getByKeywords(search_term)
+     search(search_term)
+     return prepareResults
    end
  end
-
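The public entry points are now getSingleProfile and getByKeywords, each ending in prepareResults, which walks the related-profile graph, cleans up duplicate pictures, and returns scored, key-normalized JSON. A usage sketch with placeholder arguments:

    l = LinkedinData.new(2, "proxies.txt")  # follow "people also viewed" links out to 2 hops
    json = l.getByKeywords("example search terms")
    # or start from one known profile:
    json = l.getSingleProfile("http://www.linkedin.com/pub/example-person/1/2a3/4b5")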
data/lib/parse_profile.rb ADDED
@@ -0,0 +1,50 @@
+ module ParseProfile
+   # Parse profile into items by company
+   def parseResume(profile)
+     output = Array.new
+
+     # Parse profiles for current companies
+     profile.current_companies.each do |c|
+       output.push(addPersonFields(c, "Yes", profile))
+     end
+
+     # Parse past position/company info
+     profile.past_companies.each do |c|
+       output.push(addPersonFields(c, "No", profile))
+     end
+
+     return output
+   end
+
+   # Deletes duplicate pictures
+   def deleteDuplicatePics
+     pics = Dir["public/uploads/pictures/*.jpg.*"]
+     pics.each do |p|
+       File.delete(p)
+     end
+   end
+
+   # Merge person data with role data
+   def addPersonFields(c, status, profile)
+     c.merge!(
+       skills: profile.skills,
+       certifications: profile.certifications,
+       languages: profile.languages,
+       name: profile.name,
+       location: profile.location,
+       area: profile.country,
+       industry: profile.industry,
+       picture: profile.picture,
+       organizations: profile.organizations,
+       groups: profile.groups,
+       education: profile.education,
+       websites: profile.websites,
+       profile_url: profile.profile_url,
+       current: status,
+       timestamp: profile.timestamp,
+       related_people: profile.related_people,
+       degree: profile.degree,
+       pic_path: profile.pic_path)
+     return c
+   end
+ end
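Each company hash from the linkedin-scraper gem becomes one flat record with the person-level fields merged in, so a profile with three positions yields three records. A sketch of one record's shape; the values and the role-level keys are illustrative, not taken from this diff:

    # {title: "Example Title", company: "Example Co",  # role keys assumed from the gem
    #  name: "Example Person", skills: [...], current: "Yes", degree: 0,
    #  profile_url: "http://www.linkedin.com/pub/example-person/1/2a3/4b5",
    #  pic_path: "public/uploads/pictures/photo.jpg"}  # nil when there is no picture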
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: linkedindata
  version: !ruby/object:Gem::Version
-   version: 0.0.17
+   version: 0.0.18
  platform: ruby
  authors:
  - M. C. McGrath
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-02-24 00:00:00.000000000 Z
+ date: 2015-04-11 00:00:00.000000000 Z
  dependencies: []
  description: Scrapes all LinkedIn profiles including terms you specify.
  email: shidash@shidash.com
@@ -16,9 +16,10 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
- - lib/getrelated.rb
+ - lib/get_related.rb
+ - lib/linkedin.rb
  - lib/linkedindata.rb
- - lib/parseprofile.rb
+ - lib/parse_profile.rb
  homepage: https://github.com/transparencytoolkit/linkedindata
  licenses:
  - GPL
data/lib/getrelated.rb DELETED
@@ -1,55 +0,0 @@
- require 'json'
- require 'nokogiri'
- require 'open-uri'
-
- class GetRelated
-   def initialize(url)
-     @url = url
-     @relatedlist = Array.new
-   end
-
-   # Get the list of names of related people
-   def getList
-     html = Nokogiri::HTML(open(@url.gsub("http", "https")))
-
-     if html
-       namelist = Array.new
-
-       # Go through each person
-       html.css("div.insights-browse-map").each do |d|
-         if d.css("h3").text == "People Also Viewed"
-           d.css("li").each do |l|
-             temphash = Hash.new
-             temphash[:name] = l.css("h4").text
-             temphash[:url] = l.css("a")[0]['href']
-             namelist.push(temphash)
-           end
-         end
-       end
-
-       return namelist
-     end
-   end
- end
-
- # This is just an outline for the next version of getrelated
-
- # Add degree back as field (0 by default)
- # Loop through all profiles
- # Load n times (need to determine optimal num)
- # Save list of related people (for profile- make list and append if seen listed as related or in related list)
- # Save overall list of related people (with URLs and min degree)
- # Track min degrees out
-
- # Go through overall list of related people
- # Parse profile
- # Make sure degree is correct when saved
- # Maybe save in JSONs by degree
-
-
- # Info:
- # Profiles of related people
- # Degrees for all profiles
- # Related people list on each profile (complete)
-
- # Deduplicate
data/lib/parseprofile.rb DELETED
@@ -1,79 +0,0 @@
- require 'json'
- load 'getrelated.rb'
- require 'pry'
-
- class ParseProfile
-   def initialize(profile, url, curhops)
-     @profile = profile
-     @url = url
-     @output = Array.new
-     @related_people
-     @curhops = curhops
-   end
-
-   # Parse profile
-   def parse
-     begin
-       g = GetRelated.new(@url)
-       @related_people = g.getList
-     rescue
-     end
-
-     # Parse profiles for current companies
-     @profile.current_companies.each do |c|
-       @output.push(parseCompany(c, "Yes"))
-     end
-
-     # Parse past position/company info
-     @profile.past_companies.each do |c|
-       @output.push(parseCompany(c, "No"))
-     end
-
-     # Clean up directories
-     pics = Dir["public/uploads/*.jpg.*"]
-     pics.each do |p|
-       File.delete(p)
-     end
-
-     return @output
-   end
-
-   # Merge person data with role data
-   def parseCompany(c, status)
-     c.merge!(
-       :skills => @profile.skills,
-       :certifications => @profile.certifications,
-       :languages => @profile.languages,
-       :name => @profile.first_name + " " + @profile.last_name,
-       :location => @profile.location,
-       :area => @profile.country,
-       :industry => @profile.industry,
-       :picture => @profile.picture,
-       :organizations => @profile.organizations,
-       :groups => @profile.groups,
-       :education => @profile.education,
-       :websites => @profile.websites,
-       :profile_url => @url,
-       :current => status,
-       :timestamp => Time.now,
-       :related_people => @related_people,
-       :degree => @curhops)
-     c.merge!(:pic_path => getPic)
-     return c
-   end
-
-   # Download pictures
-   def getPic
-     if @profile.picture
-       path = @profile.picture.split("/")
-       if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
-         begin
-           `wget -P public/uploads/pictures #{@profile.picture}`
-         rescue
-         end
-       end
-
-       return "public/uploads/pictures/" + path[path.length-1].chomp.strip
-     end
-   end
- end