linkedindata 0.0.12 → 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8f7c5ffe3de4948c6b28d505079581a10825ea91
4
- data.tar.gz: 05df543799dd12fbc1e6c25671f539d5644aefab
3
+ metadata.gz: 3bc1d73941a6228073ef054491452a8ecc34f0d6
4
+ data.tar.gz: 35046bb711b902b5a8650b6995d291afc48c3702
5
5
  SHA512:
6
- metadata.gz: a62b36e080463295eb988c37e8538f2f0181561f1fa48b7e61f0f8c13334990953c830fafd7db852fbaddace5f7c204ea441becc3f9c7995b372bb7feff90dc6
7
- data.tar.gz: 7d1c02373a972dbaf6851179b46fcee58c30e04a321cfa54143ef5b96b053b973b4b9123531a95722393d02953d2050ecbc45ccbec34f6dd7b46faa4108e77b0
6
+ metadata.gz: 5286e02124965b5f02ecbad34699840071ca0938c52e9da7d796683f150e0f796fa5d12ddca85763e997c95fbc1b99e573e89cc21344dddfd5709beaaa3434c8
7
+ data.tar.gz: 0015c5ef88cb4c14e187412e0da448ef40194c24d12fbbddc71ea50682e5769e9fa76930925eb08e654d1db7de91d1e5a3be7027cd41003b2beba0019f6ff82b
data/lib/getrelated.rb ADDED
@@ -0,0 +1,50 @@
1
+ require 'json'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
# Scrapes the "people also viewed" sidebar of a LinkedIn profile page.
class GetRelated
  # url - String URL of the profile page to fetch.
  def initialize(url)
    @url = url
    @relatedlist = Array.new
  end

  # Get the list of names of related people.
  #
  # Returns an Array of String names. Returns an empty Array (not nil,
  # as the previous version did) when the page parses to nothing or has
  # no related-people module, so callers can iterate unconditionally.
  def getList
    # URI.open: Kernel#open no longer delegates to open-uri on Ruby 3.0+,
    # and passing a URL to Kernel#open was unsafe on older Rubies anyway.
    html = Nokogiri::HTML(URI.open(@url))

    namelist = Array.new

    if html
      # Each "insights-browse-map" module holds one <li> per related
      # person, with the person's name in an <h4>.
      html.css("div.insights-browse-map").each do |d|
        d.css("li").each do |l|
          namelist.push(l.css("h4").text)
        end
      end
    end

    return namelist
  end
end
29
+
30
+ # This is just an outline for the next version of getrelated
31
+
32
+ # Add degree back as field (0 by default)
33
+ # Loop through all profiles
34
+ # Load n times (need to determine optimal num)
35
+ # Save list of related people (for profile- make list and append if seen listed as related or in related list)
36
+ # Save overall list of related people (with URLs and min degree)
37
+ # Track min degrees out
38
+
39
+ # Go through overall list of related people
40
+ # Parse profile
41
+ # Make sure degree is correct when saved
42
+ # Maybe save in JSONs by degree
43
+
44
+
45
+ # Info:
46
+ # Profiles of related people
47
+ # Degrees for all profiles
48
+ # Related people list on each profile (complete)
49
+
50
+ # Deduplicate
@@ -0,0 +1,85 @@
1
+ require 'mechanize'
2
+ require 'linkedin-scraper'
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'open-uri'
6
+ load 'parseprofile.rb'
7
+ require 'pry'
8
+ require 'urlarchiver'
9
+
10
# Finds LinkedIn profiles via a Google "site:linkedin.com/pub" search,
# scrapes each result, and aggregates the parsed records.
class LinkedinData
  # input    - String search terms used to find profiles.
  # todegree - Integer degree-of-separation limit. BUG FIX: the previous
  #            version accepted this argument but silently discarded it;
  #            it is now stored for the planned related-people crawl
  #            (see the outline in getrelated.rb).
  def initialize(input, todegree)
    @input = input
    @todegree = todegree
    @output = Array.new
    @startindex = 10
  end

  # Searches for profiles on Google and examines the first result page.
  def search
    agent = Mechanize.new
    agent.user_agent_alias = 'Linux Firefox'
    gform = agent.get("http://google.com").form("f")
    gform.q = "site:linkedin.com/pub " + @input
    page = agent.submit(gform, gform.buttons.first)
    examine(page)
  end

  # Examines one search-result page: scrapes every profile link on it,
  # then recurses into the next result page (tracked via @startindex).
  def examine(page)
    page.links.each do |link|
      # Google result links embed the real target URL after "?q=".
      if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
        saveurl = link.href.split("?q=")

        if saveurl[1]
          url = saveurl[1].split("&")
          begin
            scrape(url[0])
          rescue StandardError
            # Best-effort: skip profiles that fail to scrape. (Was a bare
            # rescue; StandardError keeps SignalException/SystemExit fatal.)
          end
        end
      end

      # Find the link to the next result page and follow it, but only the
      # one matching the next expected start index so each page is visited
      # exactly once.
      if (link.href.include? "&sa=N") && (link.href.include? "&start=")
        url1 = link.href.split("&start=")
        url2 = url1[1].split("&sa=N")

        if url2[0].to_i == @startindex
          sleep(rand(5..10)) # rate-limit so the crawl looks less bot-like
          @startindex += 10
          agent = Mechanize.new
          examine(agent.get("http://google.com" + link.href))
        end
      end
    end
  end

  # Scrapes a single profile URL and appends its parsed records to @output.
  def scrape(url)
    # Download the profile, leaving `profile` nil on failure.
    begin
      # sub (local reassignment), not gsub!: only the first "https" should
      # become "http", and the caller's string must not be mutated.
      url = url.sub("https", "http")
      profile = Linkedin::Profile.get_profile(url)
    rescue StandardError
      # Best-effort: a failed download simply yields no records.
    end

    # Parse the profile only if the download returned one.
    if profile
      p = ParseProfile.new(profile, url)
      @output.concat(p.parse)
    end
  end

  # Runs the search and returns all collected data as pretty-printed JSON.
  def getData
    search
    formatted_json = JSON.pretty_generate(@output)
    return formatted_json
  end
end
@@ -0,0 +1,76 @@
1
+ require 'json'
2
+ load 'getrelated.rb'
3
+
4
# Turns one scraped LinkedIn profile into a flat list of per-company
# records, each annotated with the person's details and related people.
class ParseProfile
  # profile - a Linkedin::Profile (linkedin-scraper) object.
  # url     - String URL the profile was scraped from.
  def initialize(profile, url)
    @profile = profile
    @url = url
    @output = Array.new
    # Filled in by #parse. (The previous version had a bare, no-op
    # `@related_people` statement here; an explicit nil is equivalent
    # and makes the intent clear.)
    @related_people = nil
  end

  # Parses the profile: one record per current company ("Yes") and one
  # per past company ("No"). Returns the Array of record Hashes.
  def parse
    begin
      g = GetRelated.new(@url)
      @related_people = g.getList
    rescue StandardError
      # Best-effort: @related_people stays nil if the lookup fails.
    end

    # Parse profiles for current companies
    @profile.current_companies.each do |c|
      @output.push(parseCompany(c, "Yes"))
    end

    # Parse past position/company info
    @profile.past_companies.each do |c|
      @output.push(parseCompany(c, "No"))
    end

    # Clean up stray in-progress downloads left in the uploads directory.
    pics = Dir["public/uploads/*.jpg.*"]
    pics.each do |p|
      File.delete(p)
    end

    return @output
  end

  # Merge person-level data into one company/role Hash `c`.
  # status is "Yes" for a current position, "No" for a past one.
  # Note: mutates and returns `c`.
  def parseCompany(c, status)
    c.merge!(
      :skills => @profile.skills,
      :certifications => @profile.certifications,
      :languages => @profile.languages,
      :name => @profile.first_name + " " + @profile.last_name,
      :location => @profile.location,
      :area => @profile.country,
      :industry => @profile.industry,
      :picture => @profile.picture,
      :organizations => @profile.organizations,
      :groups => @profile.groups,
      :education => @profile.education,
      :websites => @profile.websites,
      :profile_url => @url,
      :current => status,
      :timestamp => Time.now,
      :related_people => @related_people)
    c.merge!(:pic_path => getPic)
    return c
  end

  # Downloads the profile picture (if any) into public/uploads/pictures
  # and returns its local path, or nil when the profile has no picture.
  def getPic
    if @profile.picture
      path = @profile.picture.split("/")
      # Compute the target filename once (was duplicated in the original).
      filename = path[path.length-1].chomp.strip
      if !File.file?("public/uploads/pictures/" + filename)
        begin
          # SECURITY FIX: the argument-list form of system bypasses the
          # shell, so a crafted picture URL cannot inject shell commands
          # (the old backtick interpolation `wget ... #{...}` could).
          system("wget", "-P", "public/uploads/pictures", @profile.picture)
        rescue StandardError
          # Best-effort: a failed download still returns the expected path.
        end
      end

      return "public/uploads/pictures/" + filename
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedindata
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
@@ -15,7 +15,10 @@ email: shidash@shidash.com
15
15
  executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
18
- files: []
18
+ files:
19
+ - lib/linkedindata.rb
20
+ - lib/parseprofile.rb
21
+ - lib/getrelated.rb
19
22
  homepage: https://github.com/transparencytoolkit/linkedindata
20
23
  licenses:
21
24
  - GPL