generalscraper 0.0.23 → 0.0.24
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +18 -3
- data/lib/parse_page.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c14621c2ab26d98c1a4f8998a47de2454c6bbfad
|
4
|
+
data.tar.gz: d388fab99cffa9ec89e283428e85015c545b9c1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6b5058cb399dc7e5deb8e9fc2b5ea6ba9097ad9339f32c470a2e58c95688ca261a05769c6dce3c1a85cd93b9a884bd458e8737f11a5c9343839048fcd116a7e
|
7
|
+
data.tar.gz: 0700a3c861858921cba2ff45867a7afc33bd2233a748e4a0e9e98a09aa1b061a453edd97a6ddaaf5f3e73ec5209fecea80771143e846d512d0f78be8e70ec866
|
data/lib/generalscraper.rb
CHANGED
@@ -30,6 +30,7 @@ class GeneralScraper
|
|
30
30
|
def search
|
31
31
|
check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
|
32
32
|
"http://google.com", (@operators + " " + @searchterm))
|
33
|
+
report_status("Got search results for " + @operators.to_s + " " + @searchterm.to_s)
|
33
34
|
end
|
34
35
|
|
35
36
|
# Check that page with links loaded
|
@@ -45,13 +46,15 @@ class GeneralScraper
|
|
45
46
|
check_results(@requests.get_updated_current_page)
|
46
47
|
|
47
48
|
else # Restart and try again if CAPTCHA-solving not enabled
|
49
|
+
report_status("CAPTCHA Found. CAPTCHA solving not enabled. Trying to restart browser.")
|
48
50
|
@requests.restart_browser
|
49
51
|
check_results(@requests.get_page(requested_page), requested_page)
|
50
52
|
end
|
51
53
|
else # No CAPTCHA found :)
|
52
54
|
begin
|
53
55
|
navigate_save_results(page)
|
54
|
-
rescue
|
56
|
+
rescue => e
|
57
|
+
report_status("Error: " + e.to_s + " Retrying...")
|
55
58
|
@requests.restart_browser
|
56
59
|
check_results(@requests.get_page(requested_page), requested_page)
|
57
60
|
end
|
@@ -66,8 +69,8 @@ class GeneralScraper
|
|
66
69
|
return yield(html).inject(Array.new) do |link_arr, al|
|
67
70
|
begin
|
68
71
|
link_arr.push(al["href"])
|
69
|
-
rescue
|
70
|
-
|
72
|
+
rescue => e
|
73
|
+
report_status("Error getting links: " + e.to_s)
|
71
74
|
end
|
72
75
|
|
73
76
|
link_arr
|
@@ -85,6 +88,7 @@ class GeneralScraper
|
|
85
88
|
# Go to next page
|
86
89
|
next_pages = get_links(page) {|html| html.css("#pnnext")}
|
87
90
|
next_pages.each do |link|
|
91
|
+
report_status("Going to next page: google.com"+link)
|
88
92
|
next_search_page("google.com"+link)
|
89
93
|
end
|
90
94
|
end
|
@@ -114,6 +118,7 @@ class GeneralScraper
|
|
114
118
|
end
|
115
119
|
end
|
116
120
|
|
121
|
+
report_status("Finished collecting data for " + @operators.to_s + " " + @searchterm.to_s)
|
117
122
|
@requests.close_all_browsers
|
118
123
|
end
|
119
124
|
|
@@ -135,6 +140,16 @@ class GeneralScraper
|
|
135
140
|
Curl::PostField.content('results', JSON.pretty_generate([results])))
|
136
141
|
end
|
137
142
|
|
143
|
+
# Report Harvester status message
|
144
|
+
def report_status(status_msg)
|
145
|
+
if @cm_url
|
146
|
+
curl_url = @cm_url+"/update_status"
|
147
|
+
c = Curl::Easy.http_post(curl_url,
|
148
|
+
Curl::PostField.content('selector_id', @selector_id),
|
149
|
+
Curl::PostField.content('status_message', status_msg))
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
138
153
|
# Add page hash to output for bulk reporting
|
139
154
|
def report_bulk(results)
|
140
155
|
@output.push(results)
|
data/lib/parse_page.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.23
|
4
|
+
version: 0.0.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|