linkedindata 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/linkedindata.rb +19 -94
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 609dc4f607cbb1b2fd16088b1645f3c764bd8bf9
|
4
|
+
data.tar.gz: 7036885a0b9327e44fb46b1cd578651d7397c3e0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 097bef77b1016cad5d9c6a8dcb5c1ecdcb42f5ce1c4a160ddc9d37f19a0e0af954218910410ed27d945b4caa87da86987393a76c8c51040ac5e125dbd3d6acb0
|
7
|
+
data.tar.gz: 0412174d4340cb353927e76d7f0b715fecc6ed9e89b8508e994065685226f7fe7b745f3581e8a1de9269e1acaf14384383abb5f16a2032f4f1a42ae06937a253
|
data/lib/linkedindata.rb
CHANGED
@@ -3,21 +3,17 @@ require 'linkedin-scraper'
|
|
3
3
|
require 'json'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'open-uri'
|
6
|
+
load 'parseprofile.rb'
|
7
|
+
require 'pry'
|
6
8
|
|
7
9
|
class LinkedinData
|
8
10
|
def initialize(input, todegree)
|
9
11
|
@input = input
|
10
12
|
@output = Array.new
|
11
13
|
@startindex = 10
|
12
|
-
@degree = 0
|
13
|
-
if todegree == nil
|
14
|
-
@to_degree = 0
|
15
|
-
else
|
16
|
-
@to_degree = todegree
|
17
|
-
end
|
18
14
|
end
|
19
15
|
|
20
|
-
# Searches for
|
16
|
+
# Searches for profiles on Google
|
21
17
|
def search
|
22
18
|
agent = Mechanize.new
|
23
19
|
agent.user_agent_alias = 'Linux Firefox'
|
@@ -29,6 +25,12 @@ class LinkedinData
|
|
29
25
|
|
30
26
|
# Examines a search page
|
31
27
|
def examine(page)
|
28
|
+
# Separate getting profile links and going to next page
|
29
|
+
# Method for getting links to all result pages
|
30
|
+
# Different method for getting all profile links on page and scraping (split to new thread for this)
|
31
|
+
# Has own output set, merge into full one at end (make sure threadsafe)
|
32
|
+
|
33
|
+
# Have own input and output
|
32
34
|
page.links.each do |link|
|
33
35
|
if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
|
34
36
|
saveurl = link.href.split("?q=")
|
@@ -42,110 +44,34 @@ class LinkedinData
|
|
42
44
|
end
|
43
45
|
end
|
44
46
|
|
47
|
+
# Find the link to the next page and go to it
|
45
48
|
if (link.href.include? "&sa=N") && (link.href.include? "&start=")
|
46
49
|
url1 = link.href.split("&start=")
|
47
50
|
url2 = url1[1].split("&sa=N")
|
48
51
|
|
49
52
|
if url2[0].to_i == @startindex
|
50
|
-
sleep(rand(
|
53
|
+
sleep(rand(5..10))
|
51
54
|
@startindex += 10
|
52
55
|
agent = Mechanize.new
|
53
|
-
examine(agent.get("http://google.com" + link.href))
|
56
|
+
Thread.new{ examine(agent.get("http://google.com" + link.href))}
|
54
57
|
end
|
55
58
|
end
|
56
59
|
end
|
57
60
|
end
|
58
61
|
|
59
|
-
# Scrapes profile
|
62
|
+
# Scrapes profile
|
60
63
|
def scrape(url)
|
61
|
-
|
62
|
-
@output.each do |o|
|
63
|
-
if o[:profile_url] == url
|
64
|
-
flag = 1
|
65
|
-
if @degree < o[:degree]
|
66
|
-
o[:degree] = @degree
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
64
|
+
# Download profile and rescue on error
|
70
65
|
begin
|
71
|
-
|
72
|
-
|
66
|
+
url.gsub!("https", "http")
|
67
|
+
profile = Linkedin::Profile.get_profile(url)
|
73
68
|
rescue
|
74
69
|
end
|
75
70
|
|
71
|
+
# Parse profile if returned
|
76
72
|
if profile
|
77
|
-
profile
|
78
|
-
|
79
|
-
|
80
|
-
if profile.picture
|
81
|
-
path = profile.picture.split("/")
|
82
|
-
if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
83
|
-
begin
|
84
|
-
`wget -P public/uploads/pictures #{profile.picture}`
|
85
|
-
rescue
|
86
|
-
end
|
87
|
-
end
|
88
|
-
c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
89
|
-
end
|
90
|
-
|
91
|
-
@output.push(c)
|
92
|
-
end
|
93
|
-
|
94
|
-
profile.past_companies.each do |c|
|
95
|
-
c.merge!(:skills => profile.skills, :certifications => profile.certifications, :languages => profile.languages, :name => profile.first_name + " " + profile.last_name, :location => profile.location, :area => profile.country, :industry => profile.industry, :picture => profile.picture, :organizations => profile.organizations, :groups => profile.groups, :education => profile.education, :websites => profile.websites, :profile_url => url, :degree => @degree, :current => "No")
|
96
|
-
@output.push(c)
|
97
|
-
|
98
|
-
if profile.picture
|
99
|
-
path = profile.picture.split("/")
|
100
|
-
if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
101
|
-
begin
|
102
|
-
`wget -P public/uploads/pictures #{profile.picture}`
|
103
|
-
rescue
|
104
|
-
end
|
105
|
-
end
|
106
|
-
c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# Clean up directories
|
111
|
-
pics = Dir["public/uploads/*.jpg.*"]
|
112
|
-
pics.each do |p|
|
113
|
-
File.delete(p)
|
114
|
-
end
|
115
|
-
getRelated(url)
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
# Gets related profiles listed on side of the page
|
120
|
-
def getRelated(url)
|
121
|
-
if @degree < @to_degree
|
122
|
-
begin
|
123
|
-
html = Nokogiri::HTML(open(url))
|
124
|
-
rescue
|
125
|
-
end
|
126
|
-
|
127
|
-
if html
|
128
|
-
html.css("li.with-photo").each do |l|
|
129
|
-
plink = l.css("a")[0]['href'].split("?")
|
130
|
-
|
131
|
-
# Check to be sure not already saved
|
132
|
-
flag = 0
|
133
|
-
@output.each do |o|
|
134
|
-
if o[:profile_url] == plink[0]
|
135
|
-
flag = 1
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
if flag == 0
|
140
|
-
@degree += 1
|
141
|
-
begin
|
142
|
-
scrape(plink[0])
|
143
|
-
rescue
|
144
|
-
end
|
145
|
-
@degree -= 1
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
73
|
+
p = ParseProfile.new(profile, url)
|
74
|
+
@output.concat(p.parse)
|
149
75
|
end
|
150
76
|
end
|
151
77
|
|
@@ -155,4 +81,3 @@ class LinkedinData
|
|
155
81
|
return JSON.pretty_generate(@output)
|
156
82
|
end
|
157
83
|
end
|
158
|
-
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedindata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.9
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes all LinkedIn profiles including terms you specify.
|
14
14
|
email: shidash@shidash.com
|