generalscraper 0.0.19 → 0.0.20

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9de0cd6461d23d2f27b9d7ced18698f503c481aa
4
- data.tar.gz: 3ed9dd6ecf0b84e8c31a3002a0e6e83689848208
3
+ metadata.gz: ae1747657ed5aed8d784aee72c2203ffa8d0c5b9
4
+ data.tar.gz: e9e0e82429b6085d0455b8425123aca817229839
5
5
  SHA512:
6
- metadata.gz: 678789d0b479ceba78cf9b464b21150e1d2469da95891043f0996f530f3a957c34ef02bc0c31458835a39170980190c1cac478fe44652d301b082e58ad1b103d
7
- data.tar.gz: 9196461845feecbafa876cd29cafde6b60ae1269a8eba3ada28fa5e7b778a95324d0176fd8488f944c3bbe7bdd8959e0db91a128cedcc004218a12f4aebf6cc0
6
+ metadata.gz: 9a97172a7739666794cc170973f5433b16fad53bb7ed4993e3e15d05910c650b13202bc5a053cbc3642f9d17b95a20fdc4c75306bbee9784cb209d51188f2d9e
7
+ data.tar.gz: 36ea20ed630476467fd7a4cecec499e4d6aa98e03b7a0335a1e4ac9329eecc88f008b1813e546f36c0d561fc5ee6f2de34dcd92509b45458c4adf26aa9dc3802
@@ -10,7 +10,7 @@ load 'captcha.rb'
10
10
  class GeneralScraper
11
11
  include ParsePage
12
12
 
13
- def initialize(operators, searchterm, requests, solver_details)
13
+ def initialize(operators, searchterm, requests, solver_details, cm_hash)
14
14
  @operators = operators
15
15
  @searchterm = searchterm
16
16
  @op_val = @operators.split(" ")[0].split(":")[1]
@@ -20,6 +20,10 @@ class GeneralScraper
20
20
  @output = Array.new
21
21
  @urllist = Array.new
22
22
  @startindex = 10
23
+
24
+ # Handle crawler manager info
25
+ @cm_url = cm_hash[:crawler_manager_url] if cm_hash
26
+ @selector_id = cm_hash[:selector_id] if cm_hash
23
27
  end
24
28
 
25
29
  # Searches for links on Google
@@ -44,11 +48,13 @@ class GeneralScraper
44
48
  @requests.restart_browser
45
49
  check_results(@requests.get_page(requested_page), requested_page)
46
50
  end
47
- elsif page.include?("403") && page.length < 100
48
- @requests.restart_browser
49
- check_results(@requests.get_page(requested_page), requested_page)
50
51
  else # No CAPTCHA found :)
51
- navigate_save_results(page)
52
+ begin
53
+ navigate_save_results(page)
54
+ rescue Exception
55
+ @requests.restart_browser
56
+ check_results(@requests.get_page(requested_page), requested_page)
57
+ end
52
58
  end
53
59
  end
54
60
 
@@ -102,17 +108,46 @@ class GeneralScraper
102
108
  def getData
103
109
  search
104
110
  @urllist.each do |url|
105
- getPageData(url)
111
+ report_results(getPageData(url), url)
106
112
  end
107
113
 
108
114
  @requests.close_all_browsers
109
- return JSON.pretty_generate(@output)
110
115
  end
111
116
 
117
+ # Figure out how to report results
118
+ def report_results(results, link)
119
+ if @cm_url
120
+ report_incremental(results, link)
121
+ else
122
+ report_bulk(results)
123
+ end
124
+ end
125
+
126
+ # Report results back to Harvester incrementally
127
+ def report_incremental(results, link)
128
+ curl_url = @cm_url+"/relay_results"
129
+ c = Curl::Easy.http_post(curl_url,
130
+ Curl::PostField.content('selector_id', @selector_id),
131
+ Curl::PostField.content('status_message', "Collected " + link),
132
+ Curl::PostField.content('results', JSON.pretty_generate(results)))
133
+ end
134
+
135
+ # Add page hash to output for bulk reporting
136
+ def report_bulk(results)
137
+ @output.push(results)
138
+ end
139
+
140
+
112
141
  # Returns a list of search result URLs
113
142
  def getURLs
114
143
  search
115
144
  @requests.close_all_browsers
116
145
  return JSON.pretty_generate(@urllist)
117
146
  end
147
+
148
+ # Get the JSON of all the data
149
+ def get_json_data
150
+ return JSON.pretty_generate(@output)
151
+ end
118
152
  end
153
+
data/lib/parse_page.rb CHANGED
@@ -8,7 +8,7 @@ module ParsePage
8
8
  html = Nokogiri::HTML(page)
9
9
  pagehash = getMetadata(url, html)
10
10
  pagehash = getContent(url, pagehash, html)
11
- @output.push(pagehash)
11
+ return pagehash
12
12
  rescue
13
13
  end
14
14
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: generalscraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-16 00:00:00.000000000 Z
11
+ date: 2016-10-08 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Scrapes Google
14
14
  email: shidash@shidash.com