linkedindata 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedindata.rb +19 -94
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fd21c1b7e2b7467b69f40632b6c1c1943bbf210d
4
- data.tar.gz: b1a2cbd4aaa75b3406848ef978c22c6966a106b3
3
+ metadata.gz: 609dc4f607cbb1b2fd16088b1645f3c764bd8bf9
4
+ data.tar.gz: 7036885a0b9327e44fb46b1cd578651d7397c3e0
5
5
  SHA512:
6
- metadata.gz: 4c234ba4ea5c7402e9ec5d5e80e15305e603d05bc516d9911f855f7baceb177f368d487af3b48cc2fcd4fe980048d2346f5abd5d9a8572775876db164e96b5d2
7
- data.tar.gz: 1566a9d2ffeeb889191443b3ef5fc8b3076b55209c023918401ec67e0aefb2ae7281eaa054f4a0f2ec5b045797c092030c8762b11dfd453279ebfb0d6cd173b2
6
+ metadata.gz: 097bef77b1016cad5d9c6a8dcb5c1ecdcb42f5ce1c4a160ddc9d37f19a0e0af954218910410ed27d945b4caa87da86987393a76c8c51040ac5e125dbd3d6acb0
7
+ data.tar.gz: 0412174d4340cb353927e76d7f0b715fecc6ed9e89b8508e994065685226f7fe7b745f3581e8a1de9269e1acaf14384383abb5f16a2032f4f1a42ae06937a253
data/lib/linkedindata.rb CHANGED
@@ -3,21 +3,17 @@ require 'linkedin-scraper'
3
3
  require 'json'
4
4
  require 'nokogiri'
5
5
  require 'open-uri'
6
+ load 'parseprofile.rb'
7
+ require 'pry'
6
8
 
7
9
  class LinkedinData
8
10
  def initialize(input, todegree)
9
11
  @input = input
10
12
  @output = Array.new
11
13
  @startindex = 10
12
- @degree = 0
13
- if todegree == nil
14
- @to_degree = 0
15
- else
16
- @to_degree = todegree
17
- end
18
14
  end
19
15
 
20
- # Searches for links on Google
16
+ # Searches for profiles on Google
21
17
  def search
22
18
  agent = Mechanize.new
23
19
  agent.user_agent_alias = 'Linux Firefox'
@@ -29,6 +25,12 @@ class LinkedinData
29
25
 
30
26
  # Examines a search page
31
27
  def examine(page)
28
+ # Separate getting profile links and going to next page
29
+ # Method for getting links to all result pages
30
+ # Different method for getting all profile links on page and scraping (split to new thread for this)
31
+ # Has own output set, merge into full one at end (make sure threadsafe)
32
+
33
+ # Have own input and output
32
34
  page.links.each do |link|
33
35
  if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
34
36
  saveurl = link.href.split("?q=")
@@ -42,110 +44,34 @@ class LinkedinData
42
44
  end
43
45
  end
44
46
 
47
+ # Find the link to the next page and go to it
45
48
  if (link.href.include? "&sa=N") && (link.href.include? "&start=")
46
49
  url1 = link.href.split("&start=")
47
50
  url2 = url1[1].split("&sa=N")
48
51
 
49
52
  if url2[0].to_i == @startindex
50
- sleep(rand(30..90))
53
+ sleep(rand(5..10))
51
54
  @startindex += 10
52
55
  agent = Mechanize.new
53
- examine(agent.get("http://google.com" + link.href))
56
+ Thread.new{ examine(agent.get("http://google.com" + link.href))}
54
57
  end
55
58
  end
56
59
  end
57
60
  end
58
61
 
59
- # Scrapes profile and makes JSON
62
+ # Scrapes profile
60
63
  def scrape(url)
61
- flag = 0
62
- @output.each do |o|
63
- if o[:profile_url] == url
64
- flag = 1
65
- if @degree < o[:degree]
66
- o[:degree] = @degree
67
- end
68
- end
69
- end
64
+ # Download profile and rescue on error
70
65
  begin
71
- url.gsub!("https", "http")
72
- profile = Linkedin::Profile.get_profile(url)
66
+ url.gsub!("https", "http")
67
+ profile = Linkedin::Profile.get_profile(url)
73
68
  rescue
74
69
  end
75
70
 
71
+ # Parse profile if returned
76
72
  if profile
77
- profile.current_companies.each do |c|
78
- c.merge!(:skills => profile.skills, :certifications => profile.certifications, :languages => profile.languages, :name => profile.first_name + " " + profile.last_name, :location => profile.location, :area => profile.country, :industry => profile.industry, :picture => profile.picture, :organizations => profile.organizations, :groups => profile.groups, :education => profile.education, :websites => profile.websites, :profile_url => url, :degree => @degree, :current => "Yes")
79
-
80
- if profile.picture
81
- path = profile.picture.split("/")
82
- if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
83
- begin
84
- `wget -P public/uploads/pictures #{profile.picture}`
85
- rescue
86
- end
87
- end
88
- c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
89
- end
90
-
91
- @output.push(c)
92
- end
93
-
94
- profile.past_companies.each do |c|
95
- c.merge!(:skills => profile.skills, :certifications => profile.certifications, :languages => profile.languages, :name => profile.first_name + " " + profile.last_name, :location => profile.location, :area => profile.country, :industry => profile.industry, :picture => profile.picture, :organizations => profile.organizations, :groups => profile.groups, :education => profile.education, :websites => profile.websites, :profile_url => url, :degree => @degree, :current => "No")
96
- @output.push(c)
97
-
98
- if profile.picture
99
- path = profile.picture.split("/")
100
- if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
101
- begin
102
- `wget -P public/uploads/pictures #{profile.picture}`
103
- rescue
104
- end
105
- end
106
- c.merge!(:pic_path => "public/uploads/pictures/" + path[path.length-1].chomp.strip)
107
- end
108
- end
109
-
110
- # Clean up directories
111
- pics = Dir["public/uploads/*.jpg.*"]
112
- pics.each do |p|
113
- File.delete(p)
114
- end
115
- getRelated(url)
116
- end
117
- end
118
-
119
- # Gets related profiles listed on side of the page
120
- def getRelated(url)
121
- if @degree < @to_degree
122
- begin
123
- html = Nokogiri::HTML(open(url))
124
- rescue
125
- end
126
-
127
- if html
128
- html.css("li.with-photo").each do |l|
129
- plink = l.css("a")[0]['href'].split("?")
130
-
131
- # Check to be sure not already saved
132
- flag = 0
133
- @output.each do |o|
134
- if o[:profile_url] == plink[0]
135
- flag = 1
136
- end
137
- end
138
-
139
- if flag == 0
140
- @degree += 1
141
- begin
142
- scrape(plink[0])
143
- rescue
144
- end
145
- @degree -= 1
146
- end
147
- end
148
- end
73
+ p = ParseProfile.new(profile, url)
74
+ @output.concat(p.parse)
149
75
  end
150
76
  end
151
77
 
@@ -155,4 +81,3 @@ class LinkedinData
155
81
  return JSON.pretty_generate(@output)
156
82
  end
157
83
  end
158
-
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedindata
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-07 00:00:00.000000000 Z
11
+ date: 2014-10-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes all LinkedIn profiles including terms you specify.
14
14
  email: shidash@shidash.com