generalscraper 0.0.23 → 0.0.24

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 85b3d91e96159d5f3cd36961664721d9bd5e7313
4
- data.tar.gz: 7568e30d7343d9be690e48e0369f7cb3db194a81
3
+ metadata.gz: c14621c2ab26d98c1a4f8998a47de2454c6bbfad
4
+ data.tar.gz: d388fab99cffa9ec89e283428e85015c545b9c1e
5
5
  SHA512:
6
- metadata.gz: b88b2d814a08bc24b68ed337e4e973471a57c02d1fa9323156ccd7a93f5dab754dde734e48326378e203e5c11599ee1e0be789d45eb728016feb194b0949094b
7
- data.tar.gz: 5f32fd7d6da2aea69a4654a2e3e3662bcbce88412252a4b366048ffe14d02e0d33a7f57b653138b9de9eb1a7f2117cf38023072b8936a9cbd10e0e5ed7f001b7
6
+ metadata.gz: a6b5058cb399dc7e5deb8e9fc2b5ea6ba9097ad9339f32c470a2e58c95688ca261a05769c6dce3c1a85cd93b9a884bd458e8737f11a5c9343839048fcd116a7e
7
+ data.tar.gz: 0700a3c861858921cba2ff45867a7afc33bd2233a748e4a0e9e98a09aa1b061a453edd97a6ddaaf5f3e73ec5209fecea80771143e846d512d0f78be8e70ec866
@@ -30,6 +30,7 @@ class GeneralScraper
30
30
  def search
31
31
  check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
32
32
  "http://google.com", (@operators + " " + @searchterm))
33
+ report_status("Got search results for " + @operators.to_s + " " + @searchterm.to_s)
33
34
  end
34
35
 
35
36
  # Check that page with links loaded
@@ -45,13 +46,15 @@ class GeneralScraper
45
46
  check_results(@requests.get_updated_current_page)
46
47
 
47
48
  else # Restart and try again if CAPTCHA-solving not enabled
49
+ report_status("CAPTCHA Found. CAPTCHA solving not enabled. Trying to restart browser.")
48
50
  @requests.restart_browser
49
51
  check_results(@requests.get_page(requested_page), requested_page)
50
52
  end
51
53
  else # No CAPTCHA found :)
52
54
  begin
53
55
  navigate_save_results(page)
54
- rescue Exception
56
+ rescue => e
57
+ report_status("Error: " + e.to_s + " Retrying...")
55
58
  @requests.restart_browser
56
59
  check_results(@requests.get_page(requested_page), requested_page)
57
60
  end
@@ -66,8 +69,8 @@ class GeneralScraper
66
69
  return yield(html).inject(Array.new) do |link_arr, al|
67
70
  begin
68
71
  link_arr.push(al["href"])
69
- rescue
70
-
72
+ rescue => e
73
+ report_status("Error getting links: " + e.to_s)
71
74
  end
72
75
 
73
76
  link_arr
@@ -85,6 +88,7 @@ class GeneralScraper
85
88
  # Go to next page
86
89
  next_pages = get_links(page) {|html| html.css("#pnnext")}
87
90
  next_pages.each do |link|
91
+ report_status("Going to next page: google.com"+link)
88
92
  next_search_page("google.com"+link)
89
93
  end
90
94
  end
@@ -114,6 +118,7 @@ class GeneralScraper
114
118
  end
115
119
  end
116
120
 
121
+ report_status("Finished collecting data for " + @operators.to_s + " " + @searchterm.to_s)
117
122
  @requests.close_all_browsers
118
123
  end
119
124
 
@@ -135,6 +140,16 @@ class GeneralScraper
135
140
  Curl::PostField.content('results', JSON.pretty_generate([results])))
136
141
  end
137
142
 
143
+ # Report Harvester status message
144
+ def report_status(status_msg)
145
+ if @cm_url
146
+ curl_url = @cm_url+"/update_status"
147
+ c = Curl::Easy.http_post(curl_url,
148
+ Curl::PostField.content('selector_id', @selector_id),
149
+ Curl::PostField.content('status_message', status_msg))
150
+ end
151
+ end
152
+
138
153
  # Add page hash to output for bulk reporting
139
154
  def report_bulk(results)
140
155
  @output.push(results)
data/lib/parse_page.rb CHANGED
@@ -16,6 +16,7 @@ module ParsePage
16
16
  begin
17
17
  return getPDF(url, pagehash)
18
18
  rescue
19
+ report_status("PDF parsing failed for "+url.to_s)
19
20
  return nil
20
21
  end
21
22
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.23
4
+ version: 0.0.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-08 00:00:00.000000000 Z
11
+ date: 2016-10-30 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes Google
14
14
  email: shidash@shidash.com