linkedincrawler 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +24 -10
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b0d3e42277b83b79704ceaa5d7b07a0a1996b5d2
4
- data.tar.gz: c2189d0a40e02f8f0e6e085720cc60335cdb2cef
3
+ metadata.gz: 9cab546fac1482b262f2435257af20d645b7a2ca
4
+ data.tar.gz: 8a18d774dbaf1791b13a104b85bd96edd3400dc9
5
5
  SHA512:
6
- metadata.gz: db465540122c5ec210afacec2c3445827542596c2db78271a3eef97e18dbd9795e6198c8cf7d0632838c77cbf197fbf127709f3cbd3902aacd4f2cdb3c04a616
7
- data.tar.gz: f5e34bd6e608e8c675029c353c7379ac2b5af1433ac9047f1cf8708c0a0adfe72a145ec99333ab71d01f3150a95ca15667f7f69052d1147c02b8a6ca6a670ccc
6
+ metadata.gz: fb059768cdaf5204cfef18ca7fc8ec8670c0ed4bed5346fc984ab57a44eb6d7fa988e8e2b584920b1f500a20257196fcf45bedaafdf86197887e3ad232fe398c
7
+ data.tar.gz: e6b09adb74fec01250746172e372e82b5277badda83c8156d8c60049945d36bdafcc528e5bf69f2063c219cfc3508c771138d363a0226742ac45aa25a0eb00e7
data/lib/linkedincrawler.rb CHANGED
@@ -44,25 +44,39 @@ class LinkedinCrawler
44
44
  @retry_count < @retry_limit
45
45
  end
46
46
 
47
+ # Add the parsed profile to output, reset the retry count, and continue
48
+ def save_and_continue(parsed_profile)
49
+ @output += parsed_profile if parsed_profile != nil && !parsed_profile.empty?
50
+ @retry_count = 0
51
+ end
52
+
53
+ # Check if profile parsed successfully
54
+ def profile_parsing_failed?(parsed_profile)
55
+ return (parsed_profile == nil) || parsed_profile.empty? || parsed_profile.first["parsing_failed"]
56
+ end
57
+
47
58
  # Scrape each page
48
59
  def scrape(profile_url)
49
60
  # Get profile page
50
61
  profile_html = @requests.get_page(profile_url)
51
62
 
52
- # Parse profile and add to output
53
- begin
54
- l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
55
- @output += JSON.parse(l.results_by_job)
56
- @retry_count = 0
57
- rescue
58
- # If proxy doesn't work, try another a few times
59
- if check_right_page(profile_url)
63
+ # Parse profile
64
+ l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
65
+ parsed_profile = JSON.parse(l.results_by_job)
66
+
67
+ # Check if it failed or succeeded
68
+ if profile_parsing_failed?(parsed_profile)
69
+ # Handle something wrong- restart in case it is blocked and rescrape
70
+ if @retry_count < @retry_limit
60
71
  @requests.restart_browser
61
72
  @retry_count += 1
62
73
  scrape(profile_url)
63
- else
64
- @retry_count = 0
74
+ else # Just save it and move on
75
+ save_and_continue(parsed_profile)
65
76
  end
77
+
78
+ else # It succeeded!
79
+ save_and_continue(parsed_profile)
66
80
  end
67
81
  end
68
82
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedincrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.0.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-05 00:00:00.000000000 Z
11
+ date: 2015-12-06 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls public LinkedIn profiles via Google
14
14
  email: shidash@shidash.com