generalscraper 0.0.23 → 0.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 85b3d91e96159d5f3cd36961664721d9bd5e7313
4
- data.tar.gz: 7568e30d7343d9be690e48e0369f7cb3db194a81
3
+ metadata.gz: c14621c2ab26d98c1a4f8998a47de2454c6bbfad
4
+ data.tar.gz: d388fab99cffa9ec89e283428e85015c545b9c1e
5
5
  SHA512:
6
- metadata.gz: b88b2d814a08bc24b68ed337e4e973471a57c02d1fa9323156ccd7a93f5dab754dde734e48326378e203e5c11599ee1e0be789d45eb728016feb194b0949094b
7
- data.tar.gz: 5f32fd7d6da2aea69a4654a2e3e3662bcbce88412252a4b366048ffe14d02e0d33a7f57b653138b9de9eb1a7f2117cf38023072b8936a9cbd10e0e5ed7f001b7
6
+ metadata.gz: a6b5058cb399dc7e5deb8e9fc2b5ea6ba9097ad9339f32c470a2e58c95688ca261a05769c6dce3c1a85cd93b9a884bd458e8737f11a5c9343839048fcd116a7e
7
+ data.tar.gz: 0700a3c861858921cba2ff45867a7afc33bd2233a748e4a0e9e98a09aa1b061a453edd97a6ddaaf5f3e73ec5209fecea80771143e846d512d0f78be8e70ec866
@@ -30,6 +30,7 @@ class GeneralScraper
30
30
  def search
31
31
  check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
32
32
  "http://google.com", (@operators + " " + @searchterm))
33
+ report_status("Got search results for " + @operators.to_s + " " + @searchterm.to_s)
33
34
  end
34
35
 
35
36
  # Check that page with links loaded
@@ -45,13 +46,15 @@ class GeneralScraper
45
46
  check_results(@requests.get_updated_current_page)
46
47
 
47
48
  else # Restart and try again if CAPTCHA-solving not enabled
49
+ report_status("CAPTCHA Found. CAPTCHA solving not enabled. Trying to restart browser.")
48
50
  @requests.restart_browser
49
51
  check_results(@requests.get_page(requested_page), requested_page)
50
52
  end
51
53
  else # No CAPTCHA found :)
52
54
  begin
53
55
  navigate_save_results(page)
54
- rescue Exception
56
+ rescue => e
57
+ report_status("Error: " + e.to_s + " Retrying...")
55
58
  @requests.restart_browser
56
59
  check_results(@requests.get_page(requested_page), requested_page)
57
60
  end
@@ -66,8 +69,8 @@ class GeneralScraper
66
69
  return yield(html).inject(Array.new) do |link_arr, al|
67
70
  begin
68
71
  link_arr.push(al["href"])
69
- rescue
70
-
72
+ rescue => e
73
+ report_status("Error getting links: " + e.to_s)
71
74
  end
72
75
 
73
76
  link_arr
@@ -85,6 +88,7 @@ class GeneralScraper
85
88
  # Go to next page
86
89
  next_pages = get_links(page) {|html| html.css("#pnnext")}
87
90
  next_pages.each do |link|
91
+ report_status("Going to next page: google.com"+link)
88
92
  next_search_page("google.com"+link)
89
93
  end
90
94
  end
@@ -114,6 +118,7 @@ class GeneralScraper
114
118
  end
115
119
  end
116
120
 
121
+ report_status("Finished collecting data for " + @operators.to_s + " " + @searchterm.to_s)
117
122
  @requests.close_all_browsers
118
123
  end
119
124
 
@@ -135,6 +140,16 @@ class GeneralScraper
135
140
  Curl::PostField.content('results', JSON.pretty_generate([results])))
136
141
  end
137
142
 
143
+ # Report Harvester status message
144
+ def report_status(status_msg)
145
+ if @cm_url
146
+ curl_url = @cm_url+"/update_status"
147
+ c = Curl::Easy.http_post(curl_url,
148
+ Curl::PostField.content('selector_id', @selector_id),
149
+ Curl::PostField.content('status_message', status_msg))
150
+ end
151
+ end
152
+
138
153
  # Add page hash to output for bulk reporting
139
154
  def report_bulk(results)
140
155
  @output.push(results)
data/lib/parse_page.rb CHANGED
@@ -16,6 +16,7 @@ module ParsePage
16
16
  begin
17
17
  return getPDF(url, pagehash)
18
18
  rescue
19
+ report_status("PDF parsing failed for "+url.to_s)
19
20
  return nil
20
21
  end
21
22
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.23
4
+ version: 0.0.24
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-08 00:00:00.000000000 Z
11
+ date: 2016-10-30 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes Google
14
14
  email: shidash@shidash.com