generalscraper 0.0.19 → 0.0.20
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +42 -7
- data/lib/parse_page.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae1747657ed5aed8d784aee72c2203ffa8d0c5b9
|
4
|
+
data.tar.gz: e9e0e82429b6085d0455b8425123aca817229839
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a97172a7739666794cc170973f5433b16fad53bb7ed4993e3e15d05910c650b13202bc5a053cbc3642f9d17b95a20fdc4c75306bbee9784cb209d51188f2d9e
|
7
|
+
data.tar.gz: 36ea20ed630476467fd7a4cecec499e4d6aa98e03b7a0335a1e4ac9329eecc88f008b1813e546f36c0d561fc5ee6f2de34dcd92509b45458c4adf26aa9dc3802
|
data/lib/generalscraper.rb
CHANGED
@@ -10,7 +10,7 @@ load 'captcha.rb'
|
|
10
10
|
class GeneralScraper
|
11
11
|
include ParsePage
|
12
12
|
|
13
|
-
def initialize(operators, searchterm, requests, solver_details)
|
13
|
+
def initialize(operators, searchterm, requests, solver_details, cm_hash)
|
14
14
|
@operators = operators
|
15
15
|
@searchterm = searchterm
|
16
16
|
@op_val = @operators.split(" ")[0].split(":")[1]
|
@@ -20,6 +20,10 @@ class GeneralScraper
|
|
20
20
|
@output = Array.new
|
21
21
|
@urllist = Array.new
|
22
22
|
@startindex = 10
|
23
|
+
|
24
|
+
# Handle crawler manager info
|
25
|
+
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
26
|
+
@selector_id = cm_hash[:selector_id] if cm_hash
|
23
27
|
end
|
24
28
|
|
25
29
|
# Searches for links on Google
|
@@ -44,11 +48,13 @@ class GeneralScraper
|
|
44
48
|
@requests.restart_browser
|
45
49
|
check_results(@requests.get_page(requested_page), requested_page)
|
46
50
|
end
|
47
|
-
elsif page.include?("403") && page.length < 100
|
48
|
-
@requests.restart_browser
|
49
|
-
check_results(@requests.get_page(requested_page), requested_page)
|
50
51
|
else # No CAPTCHA found :)
|
51
|
-
|
52
|
+
begin
|
53
|
+
navigate_save_results(page)
|
54
|
+
rescue Exception
|
55
|
+
@requests.restart_browser
|
56
|
+
check_results(@requests.get_page(requested_page), requested_page)
|
57
|
+
end
|
52
58
|
end
|
53
59
|
end
|
54
60
|
|
@@ -102,17 +108,46 @@ class GeneralScraper
|
|
102
108
|
def getData
|
103
109
|
search
|
104
110
|
@urllist.each do |url|
|
105
|
-
getPageData(url)
|
111
|
+
report_results(getPageData(url), url)
|
106
112
|
end
|
107
113
|
|
108
114
|
@requests.close_all_browsers
|
109
|
-
return JSON.pretty_generate(@output)
|
110
115
|
end
|
111
116
|
|
117
|
+
# Figure out how to report results
|
118
|
+
def report_results(results, link)
|
119
|
+
if @cm_url
|
120
|
+
report_incremental(results, link)
|
121
|
+
else
|
122
|
+
report_bulk(results)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# Report results back to Harvester incrementally
|
127
|
+
def report_incremental(results, link)
|
128
|
+
curl_url = @cm_url+"/relay_results"
|
129
|
+
c = Curl::Easy.http_post(curl_url,
|
130
|
+
Curl::PostField.content('selector_id', @selector_id),
|
131
|
+
Curl::PostField.content('status_message', "Collected " + link),
|
132
|
+
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
133
|
+
end
|
134
|
+
|
135
|
+
# Add page hash to output for bulk reporting
|
136
|
+
def report_bulk(results)
|
137
|
+
@output.push(results)
|
138
|
+
end
|
139
|
+
|
140
|
+
|
112
141
|
# Returns a list of search result URLs
|
113
142
|
def getURLs
|
114
143
|
search
|
115
144
|
@requests.close_all_browsers
|
116
145
|
return JSON.pretty_generate(@urllist)
|
117
146
|
end
|
147
|
+
|
148
|
+
# Get the JSON of all the data
|
149
|
+
def get_json_data
|
150
|
+
return JSON.pretty_generate(@output)
|
151
|
+
end
|
118
152
|
end
|
153
|
+
|
data/lib/parse_page.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.19
|
4
|
+
version: 0.0.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|