linkedindata 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/linkedindata.rb +19 -94
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fd21c1b7e2b7467b69f40632b6c1c1943bbf210d
4
- data.tar.gz: b1a2cbd4aaa75b3406848ef978c22c6966a106b3
3
+ metadata.gz: 609dc4f607cbb1b2fd16088b1645f3c764bd8bf9
4
+ data.tar.gz: 7036885a0b9327e44fb46b1cd578651d7397c3e0
5
5
  SHA512:
6
- metadata.gz: 4c234ba4ea5c7402e9ec5d5e80e15305e603d05bc516d9911f855f7baceb177f368d487af3b48cc2fcd4fe980048d2346f5abd5d9a8572775876db164e96b5d2
7
- data.tar.gz: 1566a9d2ffeeb889191443b3ef5fc8b3076b55209c023918401ec67e0aefb2ae7281eaa054f4a0f2ec5b045797c092030c8762b11dfd453279ebfb0d6cd173b2
6
+ metadata.gz: 097bef77b1016cad5d9c6a8dcb5c1ecdcb42f5ce1c4a160ddc9d37f19a0e0af954218910410ed27d945b4caa87da86987393a76c8c51040ac5e125dbd3d6acb0
7
+ data.tar.gz: 0412174d4340cb353927e76d7f0b715fecc6ed9e89b8508e994065685226f7fe7b745f3581e8a1de9269e1acaf14384383abb5f16a2032f4f1a42ae06937a253
data/lib/linkedindata.rb CHANGED
@@ -3,21 +3,17 @@ require 'linkedin-scraper'
3
3
  require 'json'
4
4
  require 'nokogiri'
5
5
  require 'open-uri'
6
+ load 'parseprofile.rb'
7
+ require 'pry'
6
8
 
7
9
  class LinkedinData
8
10
  def initialize(input, todegree)
9
11
  @input = input
10
12
  @output = Array.new
11
13
  @startindex = 10
12
- @degree = 0
13
- if todegree == nil
14
- @to_degree = 0
15
- else
16
- @to_degree = todegree
17
- end
18
14
  end
19
15
 
20
- # Searches for links on Google
16
+ # Searches for profiles on Google
21
17
  def search
22
18
  agent = Mechanize.new
23
19
  agent.user_agent_alias = 'Linux Firefox'
@@ -29,6 +25,12 @@ class LinkedinData
29
25
 
30
26
  # Examines a search page
31
27
  def examine(page)
28
+ # Separate getting profile links and going to next page
29
+ # Method for getting links to all result pages
30
+ # Different method for getting all profile links on page and scraping (split to new thread for this)
31
+ # Has own output set, merge into full one at end (make sure threadsafe)
32
+
33
+ # Have own input and output
32
34
  page.links.each do |link|
33
35
  if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
34
36
  saveurl = link.href.split("?q=")
@@ -42,110 +44,34 @@ class LinkedinData
42
44
  end
43
45
  end
44
46
 
47
+ # Find the link to the next page and go to it
45
48
  if (link.href.include? "&sa=N") && (link.href.include? "&start=")
46
49
  url1 = link.href.split("&start=")
47
50
  url2 = url1[1].split("&sa=N")
48
51
 
49
52
  if url2[0].to_i == @startindex
50
- sleep(rand(30..90))
53
+ sleep(rand(5..10))
51
54
  @startindex += 10
52
55
  agent = Mechanize.new
53
- examine(agent.get("http://google.com" + link.href))
56
+ Thread.new{ examine(agent.get("http://google.com" + link.href))}
54
57
  end
55
58
  end
56
59
  end
57
60
  end
58
61
 
59
- # Scrapes profile and makes JSON
62
+ # Scrapes profile
60
63
  def scrape(url)
61
- flag = 0
62
- @output.each do |o|
63
- if o[:profile_url] == url
64
- flag = 1
65
- if @degree < o[:degree]
66
- o[:degree] = @degree
67
- end
68
- end
69
- end
64
+ # Download profile and rescue on error
70
65
  begin
71
- url.gsub!("https", "http")
72
- profile = Linkedin::Profile.get_profile(url)
66
+ url.gsub!("https", "http")
67
+ profile = Linkedin::Profile.get_profile(url)
73
68
  rescue
74
69
  end
75
70
 
71
+ # Parse profile if returned
76
72
  if profile
77
- profile.current_companies.each do |c|
78
- c.merge!(:skills => profile.skills, :certifications => profile.certifications, :languages => profile.languages, :name => profile.first_name + " " + profile.last_name, :location => profile.location, :area => profile.country, :industry => profile.industry, :picture => profile.picture, :organizations => profile.organizations, :groups => profile.groups, :education => profile.education, :websites => profile.websites, :profile_url => url, :degree => @degree, :current => "Yes")
79
-
80
- if profile.picture
81
- path = profile.picture.split("/")
82
- if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
83
- begin
84
- `wget -P public/uploads/pictures #{profile.picture}`
85
- rescue
86
- end
87
- end
88
- c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
89
- end
90
-
91
- @output.push(c)
92
- end
93
-
94
- profile.past_companies.each do |c|
95
- c.merge!(:skills => profile.skills, :certifications => profile.certifications, :languages => profile.languages, :name => profile.first_name + " " + profile.last_name, :location => profile.location, :area => profile.country, :industry => profile.industry, :picture => profile.picture, :organizations => profile.organizations, :groups => profile.groups, :education => profile.education, :websites => profile.websites, :profile_url => url, :degree => @degree, :current => "No")
96
- @output.push(c)
97
-
98
- if profile.picture
99
- path = profile.picture.split("/")
100
- if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
101
- begin
102
- `wget -P public/uploads/pictures #{profile.picture}`
103
- rescue
104
- end
105
- end
106
- c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
107
- end
108
- end
109
-
110
- # Clean up directories
111
- pics = Dir["public/uploads/*.jpg.*"]
112
- pics.each do |p|
113
- File.delete(p)
114
- end
115
- getRelated(url)
116
- end
117
- end
118
-
119
- # Gets related profiles listed on side of the page
120
- def getRelated(url)
121
- if @degree < @to_degree
122
- begin
123
- html = Nokogiri::HTML(open(url))
124
- rescue
125
- end
126
-
127
- if html
128
- html.css("li.with-photo").each do |l|
129
- plink = l.css("a")[0]['href'].split("?")
130
-
131
- # Check to be sure not already saved
132
- flag = 0
133
- @output.each do |o|
134
- if o[:profile_url] == plink[0]
135
- flag = 1
136
- end
137
- end
138
-
139
- if flag == 0
140
- @degree += 1
141
- begin
142
- scrape(plink[0])
143
- rescue
144
- end
145
- @degree -= 1
146
- end
147
- end
148
- end
73
+ p = ParseProfile.new(profile, url)
74
+ @output.concat(p.parse)
149
75
  end
150
76
  end
151
77
 
@@ -155,4 +81,3 @@ class LinkedinData
155
81
  return JSON.pretty_generate(@output)
156
82
  end
157
83
  end
158
-
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedindata
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-07 00:00:00.000000000 Z
11
+ date: 2014-10-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes all LinkedIn profiles including terms you specify.
14
14
  email: shidash@shidash.com