linkedindata 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 8f7c5ffe3de4948c6b28d505079581a10825ea91
-   data.tar.gz: 05df543799dd12fbc1e6c25671f539d5644aefab
+   metadata.gz: 3bc1d73941a6228073ef054491452a8ecc34f0d6
+   data.tar.gz: 35046bb711b902b5a8650b6995d291afc48c3702
  SHA512:
-   metadata.gz: a62b36e080463295eb988c37e8538f2f0181561f1fa48b7e61f0f8c13334990953c830fafd7db852fbaddace5f7c204ea441becc3f9c7995b372bb7feff90dc6
-   data.tar.gz: 7d1c02373a972dbaf6851179b46fcee58c30e04a321cfa54143ef5b96b053b973b4b9123531a95722393d02953d2050ecbc45ccbec34f6dd7b46faa4108e77b0
+   metadata.gz: 5286e02124965b5f02ecbad34699840071ca0938c52e9da7d796683f150e0f796fa5d12ddca85763e997c95fbc1b99e573e89cc21344dddfd5709beaaa3434c8
+   data.tar.gz: 0015c5ef88cb4c14e187412e0da448ef40194c24d12fbbddc71ea50682e5769e9fa76930925eb08e654d1db7de91d1e5a3be7027cd41003b2beba0019f6ff82b
data/lib/getrelated.rb ADDED
@@ -0,0 +1,50 @@
+ require 'json'
+ require 'nokogiri'
+ require 'open-uri'
+
+ class GetRelated
+   def initialize(url)
+     @url = url
+     @relatedlist = Array.new
+   end
+
+   # Get the list of names of related people
+   def getList
+     html = Nokogiri::HTML(open(@url))
+
+     if html
+       namelist = Array.new
+
+       # Go through each person
+       html.css("div.insights-browse-map").each do |d|
+         d.css("li").each do |l|
+           namelist.push(l.css("h4").text)
+         end
+       end
+
+       return namelist
+     end
+   end
+ end
+
+ # This is just an outline for the next version of getrelated
+
+ # Add degree back as field (0 by default)
+ # Loop through all profiles
+ # Load n times (need to determine optimal num)
+ # Save list of related people (for profile- make list and append if seen listed as related or in related list)
+ # Save overall list of related people (with URLs and min degree)
+ # Track min degrees out
+
+ # Go through overall list of related people
+ # Parse profile
+ # Make sure degree is correct when saved
+ # Maybe save in JSONs by degree
+
+
+ # Info:
+ # Profiles of related people
+ # Degrees for all profiles
+ # Related people list on each profile (complete)
+
+ # Deduplicate
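
The comment outline that closes getrelated.rb above sketches a planned crawl that tracks each related person's minimum degree of separation and deduplicates the overall list. Below is a minimal sketch of that idea, assuming a hypothetical getList variant that returns profile URLs rather than the display names the released method collects; RelatedCrawler, crawl, and min_degree are illustrative names that do not exist in the gem.

require 'json'

class RelatedCrawler
  def initialize(seed_urls)
    @queue = seed_urls.map { |u| [u, 0] } # [url, degree]; seed profiles start at degree 0
    @min_degree = {}                      # url => minimum degree seen so far
  end

  # Breadth-first walk out to max_degree, deduplicating profiles and keeping
  # the minimum degree at which each one was reached.
  def crawl(max_degree)
    until @queue.empty?
      url, degree = @queue.shift
      next if degree > max_degree
      next if @min_degree.key?(url) && @min_degree[url] <= degree
      @min_degree[url] = degree

      related = GetRelated.new(url).getList rescue nil # assumes getList returns URLs
      Array(related).each { |r| @queue.push([r, degree + 1]) }
    end

    # "Maybe save in JSONs by degree": group the deduplicated URLs by degree.
    by_degree = @min_degree.group_by { |_url, d| d }
                           .transform_values { |pairs| pairs.map(&:first) }
    JSON.pretty_generate(by_degree)
  end
end
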
data/lib/linkedindata.rb ADDED
@@ -0,0 +1,85 @@
+ require 'mechanize'
+ require 'linkedin-scraper'
+ require 'json'
+ require 'nokogiri'
+ require 'open-uri'
+ load 'parseprofile.rb'
+ require 'pry'
+ require 'urlarchiver'
+
+ class LinkedinData
+   def initialize(input, todegree)
+     @input = input
+     @output = Array.new
+     @startindex = 10
+   end
+
+   # Searches for profiles on Google
+   def search
+     agent = Mechanize.new
+     agent.user_agent_alias = 'Linux Firefox'
+     gform = agent.get("http://google.com").form("f")
+     gform.q = "site:linkedin.com/pub " + @input
+     page = agent.submit(gform, gform.buttons.first)
+     examine(page)
+   end
+
+   # Examines a search page
+   def examine(page)
+     # Separate getting profile links and going to next page
+     # Method for getting links to all result pages
+     # Different method for getting all profile links on page and scraping (split to new thread for this)
+     # Has own output set, merge into full one at end (make sure threadsafe)
+
+     # Have own input and output
+     page.links.each do |link|
+       if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
+         saveurl = link.href.split("?q=")
+
+         if saveurl[1]
+           url = saveurl[1].split("&")
+           begin
+             scrape(url[0])
+           rescue
+           end
+         end
+       end
+
+       # Find the link to the next page and go to it
+       if (link.href.include? "&sa=N") && (link.href.include? "&start=")
+         url1 = link.href.split("&start=")
+         url2 = url1[1].split("&sa=N")
+
+         if url2[0].to_i == @startindex
+           sleep(rand(5..10))
+           @startindex += 10
+           agent = Mechanize.new
+           examine(agent.get("http://google.com" + link.href))
+         end
+       end
+     end
+   end
+
+   # Scrapes profile
+   def scrape(url)
+     # Download profile and rescue on error
+     begin
+       url.gsub!("https", "http")
+       profile = Linkedin::Profile.get_profile(url)
+     rescue
+     end
+
+     # Parse profile if returned
+     if profile
+       p = ParseProfile.new(profile, url)
+       @output.concat(p.parse)
+     end
+   end
+
+   # Gets all data and returns in JSON
+   def getData
+     search
+     formatted_json = JSON.pretty_generate(@output)
+     return formatted_json
+   end
+ end
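
A minimal usage sketch, based only on the code above rather than any shipped documentation: initialize takes a search string plus a todegree argument that this version accepts but does not yet use, and getData runs the Google search, scrapes each matching profile, and returns the accumulated position records as pretty-printed JSON. The search term below is illustrative.

require 'linkedindata'

scraper = LinkedinData.new("example name", 1) # search term and degree are illustrative
puts scraper.getData                          # JSON array of parsed position records
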
data/lib/parseprofile.rb ADDED
@@ -0,0 +1,76 @@
+ require 'json'
+ load 'getrelated.rb'
+
+ class ParseProfile
+   def initialize(profile, url)
+     @profile = profile
+     @url = url
+     @output = Array.new
+     @related_people
+   end
+
+   # Parse profile
+   def parse
+     begin
+       g = GetRelated.new(@url)
+       @related_people = g.getList
+     rescue
+     end
+
+     # Parse profiles for current companies
+     @profile.current_companies.each do |c|
+       @output.push(parseCompany(c, "Yes"))
+     end
+
+     # Parse past position/company info
+     @profile.past_companies.each do |c|
+       @output.push(parseCompany(c, "No"))
+     end
+
+     # Clean up directories
+     pics = Dir["public/uploads/*.jpg.*"]
+     pics.each do |p|
+       File.delete(p)
+     end
+
+     return @output
+   end
+
+   # Merge person data with role data
+   def parseCompany(c, status)
+     c.merge!(
+       :skills => @profile.skills,
+       :certifications => @profile.certifications,
+       :languages => @profile.languages,
+       :name => @profile.first_name + " " + @profile.last_name,
+       :location => @profile.location,
+       :area => @profile.country,
+       :industry => @profile.industry,
+       :picture => @profile.picture,
+       :organizations => @profile.organizations,
+       :groups => @profile.groups,
+       :education => @profile.education,
+       :websites => @profile.websites,
+       :profile_url => @url,
+       :current => status,
+       :timestamp => Time.now,
+       :related_people => @related_people)
+     c.merge!(:pic_path => getPic)
+     return c
+   end
+
+   # Download pictures
+   def getPic
+     if @profile.picture
+       path = @profile.picture.split("/")
+       if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
+         begin
+           `wget -P public/uploads/pictures #{@profile.picture}`
+         rescue
+         end
+       end
+
+       return "public/uploads/pictures/" + path[path.length-1].chomp.strip
+     end
+   end
+ end
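
getPic above downloads profile pictures by shelling out to wget through backticks. As a point of comparison only, not the gem's implementation, a rough pure-Ruby equivalent using open-uri (which getrelated.rb already requires) might look like the following; the helper name download_picture is hypothetical.

require 'open-uri'
require 'fileutils'

# Hypothetical stand-in for ParseProfile#getPic: fetch the picture with
# open-uri instead of wget, skip files that already exist, and return the
# local path either way, as the released method does.
def download_picture(picture_url, dir = "public/uploads/pictures")
  filename = File.basename(URI(picture_url).path).chomp.strip
  path = File.join(dir, filename)

  unless File.file?(path)
    FileUtils.mkdir_p(dir)
    begin
      File.binwrite(path, URI.open(picture_url).read) # URI.open needs Ruby 2.5+; older code used Kernel#open
    rescue StandardError
      # the released getPic also swallows download errors
    end
  end

  path
end
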
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: linkedindata
  version: !ruby/object:Gem::Version
-   version: 0.0.12
+   version: 0.0.13
  platform: ruby
  authors:
  - M. C. McGrath
@@ -15,7 +15,10 @@ email: shidash@shidash.com
  executables: []
  extensions: []
  extra_rdoc_files: []
- files: []
+ files:
+ - lib/linkedindata.rb
+ - lib/parseprofile.rb
+ - lib/getrelated.rb
  homepage: https://github.com/transparencytoolkit/linkedindata
  licenses:
  - GPL