linkedindata 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/linkedindata.rb +19 -94
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 609dc4f607cbb1b2fd16088b1645f3c764bd8bf9
|
4
|
+
data.tar.gz: 7036885a0b9327e44fb46b1cd578651d7397c3e0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 097bef77b1016cad5d9c6a8dcb5c1ecdcb42f5ce1c4a160ddc9d37f19a0e0af954218910410ed27d945b4caa87da86987393a76c8c51040ac5e125dbd3d6acb0
|
7
|
+
data.tar.gz: 0412174d4340cb353927e76d7f0b715fecc6ed9e89b8508e994065685226f7fe7b745f3581e8a1de9269e1acaf14384383abb5f16a2032f4f1a42ae06937a253
|
data/lib/linkedindata.rb
CHANGED
@@ -3,21 +3,17 @@ require 'linkedin-scraper'
|
|
3
3
|
require 'json'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'open-uri'
|
6
|
+
load 'parseprofile.rb'
|
7
|
+
require 'pry'
|
6
8
|
|
7
9
|
class LinkedinData
|
8
10
|
def initialize(input, todegree)
|
9
11
|
@input = input
|
10
12
|
@output = Array.new
|
11
13
|
@startindex = 10
|
12
|
-
@degree = 0
|
13
|
-
if todegree == nil
|
14
|
-
@to_degree = 0
|
15
|
-
else
|
16
|
-
@to_degree = todegree
|
17
|
-
end
|
18
14
|
end
|
19
15
|
|
20
|
-
# Searches for
|
16
|
+
# Searches for profiles on Google
|
21
17
|
def search
|
22
18
|
agent = Mechanize.new
|
23
19
|
agent.user_agent_alias = 'Linux Firefox'
|
@@ -29,6 +25,12 @@ class LinkedinData
|
|
29
25
|
|
30
26
|
# Examines a search page
|
31
27
|
def examine(page)
|
28
|
+
# Separate getting profile links and going to next page
|
29
|
+
# Method for getting links to all result pages
|
30
|
+
# Different method for getting all profile links on page and scraping (split to new thread for this)
|
31
|
+
# Has own output set, merge into full one at end (make sure threadsafe)
|
32
|
+
|
33
|
+
# Have own input and output
|
32
34
|
page.links.each do |link|
|
33
35
|
if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
|
34
36
|
saveurl = link.href.split("?q=")
|
@@ -42,110 +44,34 @@ class LinkedinData
|
|
42
44
|
end
|
43
45
|
end
|
44
46
|
|
47
|
+
# Find the link to the next page and go to it
|
45
48
|
if (link.href.include? "&sa=N") && (link.href.include? "&start=")
|
46
49
|
url1 = link.href.split("&start=")
|
47
50
|
url2 = url1[1].split("&sa=N")
|
48
51
|
|
49
52
|
if url2[0].to_i == @startindex
|
50
|
-
sleep(rand(
|
53
|
+
sleep(rand(5..10))
|
51
54
|
@startindex += 10
|
52
55
|
agent = Mechanize.new
|
53
|
-
examine(agent.get("http://google.com" + link.href))
|
56
|
+
Thread.new{ examine(agent.get("http://google.com" + link.href))}
|
54
57
|
end
|
55
58
|
end
|
56
59
|
end
|
57
60
|
end
|
58
61
|
|
59
|
-
# Scrapes profile
|
62
|
+
# Scrapes profile
|
60
63
|
def scrape(url)
|
61
|
-
|
62
|
-
@output.each do |o|
|
63
|
-
if o[:profile_url] == url
|
64
|
-
flag = 1
|
65
|
-
if @degree < o[:degree]
|
66
|
-
o[:degree] = @degree
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
64
|
+
# Download profile and rescue on error
|
70
65
|
begin
|
71
|
-
|
72
|
-
|
66
|
+
url.gsub!("https", "http")
|
67
|
+
profile = Linkedin::Profile.get_profile(url)
|
73
68
|
rescue
|
74
69
|
end
|
75
70
|
|
71
|
+
# Parse profile if returned
|
76
72
|
if profile
|
77
|
-
profile
|
78
|
-
|
79
|
-
|
80
|
-
if profile.picture
|
81
|
-
path = profile.picture.split("/")
|
82
|
-
if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
83
|
-
begin
|
84
|
-
`wget -P public/uploads/pictures #{profile.picture}`
|
85
|
-
rescue
|
86
|
-
end
|
87
|
-
end
|
88
|
-
c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
89
|
-
end
|
90
|
-
|
91
|
-
@output.push(c)
|
92
|
-
end
|
93
|
-
|
94
|
-
profile.past_companies.each do |c|
|
95
|
-
c.merge!(:skills => profile.skills, :certifications => profile.certifications, :languages => profile.languages, :name => profile.first_name + " " + profile.last_name, :location => profile.location, :area => profile.country, :industry => profile.industry, :picture => profile.picture, :organizations => profile.organizations, :groups => profile.groups, :education => profile.education, :websites => profile.websites, :profile_url => url, :degree => @degree, :current => "No")
|
96
|
-
@output.push(c)
|
97
|
-
|
98
|
-
if profile.picture
|
99
|
-
path = profile.picture.split("/")
|
100
|
-
if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
101
|
-
begin
|
102
|
-
`wget -P public/uploads/pictures #{profile.picture}`
|
103
|
-
rescue
|
104
|
-
end
|
105
|
-
end
|
106
|
-
c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# Clean up directories
|
111
|
-
pics = Dir["public/uploads/*.jpg.*"]
|
112
|
-
pics.each do |p|
|
113
|
-
File.delete(p)
|
114
|
-
end
|
115
|
-
getRelated(url)
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
# Gets related profiles listed on side of the page
|
120
|
-
def getRelated(url)
|
121
|
-
if @degree < @to_degree
|
122
|
-
begin
|
123
|
-
html = Nokogiri::HTML(open(url))
|
124
|
-
rescue
|
125
|
-
end
|
126
|
-
|
127
|
-
if html
|
128
|
-
html.css("li.with-photo").each do |l|
|
129
|
-
plink = l.css("a")[0]['href'].split("?")
|
130
|
-
|
131
|
-
# Check to be sure not already saved
|
132
|
-
flag = 0
|
133
|
-
@output.each do |o|
|
134
|
-
if o[:profile_url] == plink[0]
|
135
|
-
flag = 1
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
if flag == 0
|
140
|
-
@degree += 1
|
141
|
-
begin
|
142
|
-
scrape(plink[0])
|
143
|
-
rescue
|
144
|
-
end
|
145
|
-
@degree -= 1
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
73
|
+
p = ParseProfile.new(profile, url)
|
74
|
+
@output.concat(p.parse)
|
149
75
|
end
|
150
76
|
end
|
151
77
|
|
@@ -155,4 +81,3 @@ class LinkedinData
|
|
155
81
|
return JSON.pretty_generate(@output)
|
156
82
|
end
|
157
83
|
end
|
158
|
-
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedindata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes all LinkedIn profiles including terms you specify.
|
14
14
|
email: shidash@shidash.com
|