generalscraper 0.0.19 → 0.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +42 -7
- data/lib/parse_page.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae1747657ed5aed8d784aee72c2203ffa8d0c5b9
|
4
|
+
data.tar.gz: e9e0e82429b6085d0455b8425123aca817229839
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a97172a7739666794cc170973f5433b16fad53bb7ed4993e3e15d05910c650b13202bc5a053cbc3642f9d17b95a20fdc4c75306bbee9784cb209d51188f2d9e
|
7
|
+
data.tar.gz: 36ea20ed630476467fd7a4cecec499e4d6aa98e03b7a0335a1e4ac9329eecc88f008b1813e546f36c0d561fc5ee6f2de34dcd92509b45458c4adf26aa9dc3802
|
data/lib/generalscraper.rb
CHANGED
@@ -10,7 +10,7 @@ load 'captcha.rb'
|
|
10
10
|
class GeneralScraper
|
11
11
|
include ParsePage
|
12
12
|
|
13
|
-
def initialize(operators, searchterm, requests, solver_details)
|
13
|
+
def initialize(operators, searchterm, requests, solver_details, cm_hash)
|
14
14
|
@operators = operators
|
15
15
|
@searchterm = searchterm
|
16
16
|
@op_val = @operators.split(" ")[0].split(":")[1]
|
@@ -20,6 +20,10 @@ class GeneralScraper
|
|
20
20
|
@output = Array.new
|
21
21
|
@urllist = Array.new
|
22
22
|
@startindex = 10
|
23
|
+
|
24
|
+
# Handle crawler manager info
|
25
|
+
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
26
|
+
@selector_id = cm_hash[:selector_id] if cm_hash
|
23
27
|
end
|
24
28
|
|
25
29
|
# Searches for links on Google
|
@@ -44,11 +48,13 @@ class GeneralScraper
|
|
44
48
|
@requests.restart_browser
|
45
49
|
check_results(@requests.get_page(requested_page), requested_page)
|
46
50
|
end
|
47
|
-
elsif page.include?("403") && page.length < 100
|
48
|
-
@requests.restart_browser
|
49
|
-
check_results(@requests.get_page(requested_page), requested_page)
|
50
51
|
else # No CAPTCHA found :)
|
51
|
-
|
52
|
+
begin
|
53
|
+
navigate_save_results(page)
|
54
|
+
rescue Exception
|
55
|
+
@requests.restart_browser
|
56
|
+
check_results(@requests.get_page(requested_page), requested_page)
|
57
|
+
end
|
52
58
|
end
|
53
59
|
end
|
54
60
|
|
@@ -102,17 +108,46 @@ class GeneralScraper
|
|
102
108
|
def getData
|
103
109
|
search
|
104
110
|
@urllist.each do |url|
|
105
|
-
getPageData(url)
|
111
|
+
report_results(getPageData(url), url)
|
106
112
|
end
|
107
113
|
|
108
114
|
@requests.close_all_browsers
|
109
|
-
return JSON.pretty_generate(@output)
|
110
115
|
end
|
111
116
|
|
117
|
+
# Figure out how to report results
|
118
|
+
def report_results(results, link)
|
119
|
+
if @cm_url
|
120
|
+
report_incremental(results, link)
|
121
|
+
else
|
122
|
+
report_bulk(results)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# Report results back to Harvester incrementally
|
127
|
+
def report_incremental(results, link)
|
128
|
+
curl_url = @cm_url+"/relay_results"
|
129
|
+
c = Curl::Easy.http_post(curl_url,
|
130
|
+
Curl::PostField.content('selector_id', @selector_id),
|
131
|
+
Curl::PostField.content('status_message', "Collected " + link),
|
132
|
+
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
133
|
+
end
|
134
|
+
|
135
|
+
# Add page hash to output for bulk reporting
|
136
|
+
def report_bulk(results)
|
137
|
+
@output.push(results)
|
138
|
+
end
|
139
|
+
|
140
|
+
|
112
141
|
# Returns a list of search result URLs
|
113
142
|
def getURLs
|
114
143
|
search
|
115
144
|
@requests.close_all_browsers
|
116
145
|
return JSON.pretty_generate(@urllist)
|
117
146
|
end
|
147
|
+
|
148
|
+
# Get the JSON of all the data
|
149
|
+
def get_json_data
|
150
|
+
return JSON.pretty_generate(@output)
|
151
|
+
end
|
118
152
|
end
|
153
|
+
|
data/lib/parse_page.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: generalscraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.19
|
4
|
+
version: 0.0.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Scrapes Google
|
14
14
|
email: shidash@shidash.com
|