generalscraper 0.0.23 → 0.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +18 -3
- data/lib/parse_page.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c14621c2ab26d98c1a4f8998a47de2454c6bbfad
|
4
|
+
data.tar.gz: d388fab99cffa9ec89e283428e85015c545b9c1e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6b5058cb399dc7e5deb8e9fc2b5ea6ba9097ad9339f32c470a2e58c95688ca261a05769c6dce3c1a85cd93b9a884bd458e8737f11a5c9343839048fcd116a7e
|
7
|
+
data.tar.gz: 0700a3c861858921cba2ff45867a7afc33bd2233a748e4a0e9e98a09aa1b061a453edd97a6ddaaf5f3e73ec5209fecea80771143e846d512d0f78be8e70ec866
|
data/lib/generalscraper.rb
CHANGED
@@ -30,6 +30,7 @@ class GeneralScraper
|
|
30
30
|
def search
|
31
31
|
check_results(@requests.get_page("http://google.com", @operators + " " + @searchterm),
|
32
32
|
"http://google.com", (@operators + " " + @searchterm))
|
33
|
+
report_status("Got search results for " + @operators.to_s + " " + @searchterm.to_s)
|
33
34
|
end
|
34
35
|
|
35
36
|
# Check that page with links loaded
|
@@ -45,13 +46,15 @@ class GeneralScraper
|
|
45
46
|
check_results(@requests.get_updated_current_page)
|
46
47
|
|
47
48
|
else # Restart and try again if CAPTCHA-solving not enabled
|
49
|
+
report_status("CAPTCHA Found. CAPTCHA solving not enabled. Trying to restart browser.")
|
48
50
|
@requests.restart_browser
|
49
51
|
check_results(@requests.get_page(requested_page), requested_page)
|
50
52
|
end
|
51
53
|
else # No CAPTCHA found :)
|
52
54
|
begin
|
53
55
|
navigate_save_results(page)
|
54
|
-
rescue
|
56
|
+
rescue => e
|
57
|
+
report_status("Error: " + e.to_s + " Retrying...")
|
55
58
|
@requests.restart_browser
|
56
59
|
check_results(@requests.get_page(requested_page), requested_page)
|
57
60
|
end
|
@@ -66,8 +69,8 @@ class GeneralScraper
|
|
66
69
|
return yield(html).inject(Array.new) do |link_arr, al|
|
67
70
|
begin
|
68
71
|
link_arr.push(al["href"])
|
69
|
-
rescue
|
70
|
-
|
72
|
+
rescue => e
|
73
|
+
report_status("Error getting links: " + e.to_s)
|
71
74
|
end
|
72
75
|
|
73
76
|
link_arr
|
@@ -85,6 +88,7 @@ class GeneralScraper
|
|
85
88
|
# Go to next page
|
86
89
|
next_pages = get_links(page) {|html| html.css("#pnnext")}
|
87
90
|
next_pages.each do |link|
|
91
|
+
report_status("Going to next page: google.com"+link)
|
88
92
|
next_search_page("google.com"+link)
|
89
93
|
end
|
90
94
|
end
|
@@ -114,6 +118,7 @@ class GeneralScraper
|
|
114
118
|
end
|
115
119
|
end
|
116
120
|
|
121
|
+
report_status("Finished collecting data for " + @operators.to_s + " " + @searchterm.to_s)
|
117
122
|
@requests.close_all_browsers
|
118
123
|
end
|
119
124
|
|
@@ -135,6 +140,16 @@ class GeneralScraper
|
|
135
140
|
Curl::PostField.content('results', JSON.pretty_generate([results])))
|
136
141
|
end
|
137
142
|
|
143
|
+
# Report Harvester status message
|
144
|
+
def report_status(status_msg)
|
145
|
+
if @cm_url
|
146
|
+
curl_url = @cm_url+"/update_status"
|
147
|
+
c = Curl::Easy.http_post(curl_url,
|
148
|
+
Curl::PostField.content('selector_id', @selector_id),
|
149
|
+
Curl::PostField.content('status_message', status_msg))
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
138
153
|
# Add page hash to output for bulk reporting
|
139
154
|
def report_bulk(results)
|
140
155
|
@output.push(results)
|
data/lib/parse_page.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.23
|
4
|
+
version: 0.0.24
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|