linkedindata 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- metadata +2 -3
- data/lib/linkedindata.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8f7c5ffe3de4948c6b28d505079581a10825ea91
|
4
|
+
data.tar.gz: 05df543799dd12fbc1e6c25671f539d5644aefab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a62b36e080463295eb988c37e8538f2f0181561f1fa48b7e61f0f8c13334990953c830fafd7db852fbaddace5f7c204ea441becc3f9c7995b372bb7feff90dc6
|
7
|
+
data.tar.gz: 7d1c02373a972dbaf6851179b46fcee58c30e04a321cfa54143ef5b96b053b973b4b9123531a95722393d02953d2050ecbc45ccbec34f6dd7b46faa4108e77b0
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedindata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -15,8 +15,7 @@ email: shidash@shidash.com
|
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
19
|
-
- lib/linkedindata.rb
|
18
|
+
files: []
|
20
19
|
homepage: https://github.com/transparencytoolkit/linkedindata
|
21
20
|
licenses:
|
22
21
|
- GPL
|
data/lib/linkedindata.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require 'linkedin-scraper'
|
3
|
-
require 'json'
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
|
-
load 'parseprofile.rb'
|
7
|
-
require 'pry'
|
8
|
-
require 'urlarchiver'
|
9
|
-
|
10
|
-
class LinkedinData
|
11
|
-
def initialize(input, todegree)
|
12
|
-
@input = input
|
13
|
-
@output = Array.new
|
14
|
-
@startindex = 10
|
15
|
-
end
|
16
|
-
|
17
|
-
# Searches for profiles on Google
|
18
|
-
def search
|
19
|
-
agent = Mechanize.new
|
20
|
-
agent.user_agent_alias = 'Linux Firefox'
|
21
|
-
gform = agent.get("http://google.com").form("f")
|
22
|
-
gform.q = "site:linkedin.com/pub " + @input
|
23
|
-
page = agent.submit(gform, gform.buttons.first)
|
24
|
-
examine(page)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Examines a search page
|
28
|
-
def examine(page)
|
29
|
-
# Separate getting profile links and going to next page
|
30
|
-
# Method for getting links to all result pages
|
31
|
-
# Different method for getting all profile links on page and scraping (split to new thread for this)
|
32
|
-
# Has own output set, merge into full one at end (make sure threadsafe)
|
33
|
-
|
34
|
-
# Have own input and output
|
35
|
-
page.links.each do |link|
|
36
|
-
if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
|
37
|
-
saveurl = link.href.split("?q=")
|
38
|
-
|
39
|
-
if saveurl[1]
|
40
|
-
url = saveurl[1].split("&")
|
41
|
-
begin
|
42
|
-
scrape(url[0])
|
43
|
-
rescue
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
# Find the link to the next page and go to it
|
49
|
-
if (link.href.include? "&sa=N") && (link.href.include? "&start=")
|
50
|
-
url1 = link.href.split("&start=")
|
51
|
-
url2 = url1[1].split("&sa=N")
|
52
|
-
|
53
|
-
if url2[0].to_i == @startindex
|
54
|
-
sleep(rand(5..10))
|
55
|
-
@startindex += 10
|
56
|
-
agent = Mechanize.new
|
57
|
-
examine(agent.get("http://google.com" + link.href))
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# Scrapes profile
|
64
|
-
def scrape(url)
|
65
|
-
# Download profile and rescue on error
|
66
|
-
begin
|
67
|
-
url.gsub!("https", "http")
|
68
|
-
profile = Linkedin::Profile.get_profile(url)
|
69
|
-
rescue
|
70
|
-
end
|
71
|
-
|
72
|
-
# Parse profile if returned
|
73
|
-
if profile
|
74
|
-
p = ParseProfile.new(profile, url)
|
75
|
-
@output.concat(p.parse)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
# Gets all data and returns in JSON
|
80
|
-
def getData
|
81
|
-
search
|
82
|
-
formatted_json = JSON.pretty_generate(@output)
|
83
|
-
return formatted_json
|
84
|
-
end
|
85
|
-
end
|