linkedindata 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- metadata +2 -3
- data/lib/linkedindata.rb +0 -85
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8f7c5ffe3de4948c6b28d505079581a10825ea91
|
4
|
+
data.tar.gz: 05df543799dd12fbc1e6c25671f539d5644aefab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a62b36e080463295eb988c37e8538f2f0181561f1fa48b7e61f0f8c13334990953c830fafd7db852fbaddace5f7c204ea441becc3f9c7995b372bb7feff90dc6
|
7
|
+
data.tar.gz: 7d1c02373a972dbaf6851179b46fcee58c30e04a321cfa54143ef5b96b053b973b4b9123531a95722393d02953d2050ecbc45ccbec34f6dd7b46faa4108e77b0
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedindata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
@@ -15,8 +15,7 @@ email: shidash@shidash.com
|
|
15
15
|
executables: []
|
16
16
|
extensions: []
|
17
17
|
extra_rdoc_files: []
|
18
|
-
files:
|
19
|
-
- lib/linkedindata.rb
|
18
|
+
files: []
|
20
19
|
homepage: https://github.com/transparencytoolkit/linkedindata
|
21
20
|
licenses:
|
22
21
|
- GPL
|
data/lib/linkedindata.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require 'linkedin-scraper'
|
3
|
-
require 'json'
|
4
|
-
require 'nokogiri'
|
5
|
-
require 'open-uri'
|
6
|
-
load 'parseprofile.rb'
|
7
|
-
require 'pry'
|
8
|
-
require 'urlarchiver'
|
9
|
-
|
10
|
-
class LinkedinData
|
11
|
-
def initialize(input, todegree)
|
12
|
-
@input = input
|
13
|
-
@output = Array.new
|
14
|
-
@startindex = 10
|
15
|
-
end
|
16
|
-
|
17
|
-
# Searches for profiles on Google
|
18
|
-
def search
|
19
|
-
agent = Mechanize.new
|
20
|
-
agent.user_agent_alias = 'Linux Firefox'
|
21
|
-
gform = agent.get("http://google.com").form("f")
|
22
|
-
gform.q = "site:linkedin.com/pub " + @input
|
23
|
-
page = agent.submit(gform, gform.buttons.first)
|
24
|
-
examine(page)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Examines a search page
|
28
|
-
def examine(page)
|
29
|
-
# Separate getting profile links and going to next page
|
30
|
-
# Method for getting links to all result pages
|
31
|
-
# Different method for getting all profile links on page and scraping (split to new thread for this)
|
32
|
-
# Has own output set, merge into full one at end (make sure threadsafe)
|
33
|
-
|
34
|
-
# Have own input and output
|
35
|
-
page.links.each do |link|
|
36
|
-
if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
|
37
|
-
saveurl = link.href.split("?q=")
|
38
|
-
|
39
|
-
if saveurl[1]
|
40
|
-
url = saveurl[1].split("&")
|
41
|
-
begin
|
42
|
-
scrape(url[0])
|
43
|
-
rescue
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
# Find the link to the next page and go to it
|
49
|
-
if (link.href.include? "&sa=N") && (link.href.include? "&start=")
|
50
|
-
url1 = link.href.split("&start=")
|
51
|
-
url2 = url1[1].split("&sa=N")
|
52
|
-
|
53
|
-
if url2[0].to_i == @startindex
|
54
|
-
sleep(rand(5..10))
|
55
|
-
@startindex += 10
|
56
|
-
agent = Mechanize.new
|
57
|
-
examine(agent.get("http://google.com" + link.href))
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# Scrapes profile
|
64
|
-
def scrape(url)
|
65
|
-
# Download profile and rescue on error
|
66
|
-
begin
|
67
|
-
url.gsub!("https", "http")
|
68
|
-
profile = Linkedin::Profile.get_profile(url)
|
69
|
-
rescue
|
70
|
-
end
|
71
|
-
|
72
|
-
# Parse profile if returned
|
73
|
-
if profile
|
74
|
-
p = ParseProfile.new(profile, url)
|
75
|
-
@output.concat(p.parse)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
# Gets all data and returns in JSON
|
80
|
-
def getData
|
81
|
-
search
|
82
|
-
formatted_json = JSON.pretty_generate(@output)
|
83
|
-
return formatted_json
|
84
|
-
end
|
85
|
-
end
|