generalscraper 0.0.19 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9de0cd6461d23d2f27b9d7ced18698f503c481aa
-  data.tar.gz: 3ed9dd6ecf0b84e8c31a3002a0e6e83689848208
+  metadata.gz: ae1747657ed5aed8d784aee72c2203ffa8d0c5b9
+  data.tar.gz: e9e0e82429b6085d0455b8425123aca817229839
 SHA512:
-  metadata.gz: 678789d0b479ceba78cf9b464b21150e1d2469da95891043f0996f530f3a957c34ef02bc0c31458835a39170980190c1cac478fe44652d301b082e58ad1b103d
-  data.tar.gz: 9196461845feecbafa876cd29cafde6b60ae1269a8eba3ada28fa5e7b778a95324d0176fd8488f944c3bbe7bdd8959e0db91a128cedcc004218a12f4aebf6cc0
+  metadata.gz: 9a97172a7739666794cc170973f5433b16fad53bb7ed4993e3e15d05910c650b13202bc5a053cbc3642f9d17b95a20fdc4c75306bbee9784cb209d51188f2d9e
+  data.tar.gz: 36ea20ed630476467fd7a4cecec499e4d6aa98e03b7a0335a1e4ac9329eecc88f008b1813e546f36c0d561fc5ee6f2de34dcd92509b45458c4adf26aa9dc3802
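Both digest sets cover the two archives packaged inside the published .gem. A minimal sketch for re-computing them locally, assuming the .gem has already been unpacked so that metadata.gz and data.tar.gz sit in the current directory:

  require 'digest'

  # Print SHA1 and SHA512 for each archive so they can be compared
  # against the values in checksums.yaml above.
  %w[metadata.gz data.tar.gz].each do |f|
    puts "#{f} SHA1:   #{Digest::SHA1.file(f).hexdigest}"
    puts "#{f} SHA512: #{Digest::SHA512.file(f).hexdigest}"
  end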
data/lib/generalscraper.rb CHANGED
@@ -10,7 +10,7 @@ load 'captcha.rb'
 class GeneralScraper
   include ParsePage
 
-  def initialize(operators, searchterm, requests, solver_details)
+  def initialize(operators, searchterm, requests, solver_details, cm_hash)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
@@ -20,6 +20,10 @@ class GeneralScraper
     @output = Array.new
     @urllist = Array.new
     @startindex = 10
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Searches for links on Google
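The new cm_hash parameter wires the scraper into a crawler manager: :crawler_manager_url and :selector_id are the only keys read from it, and both lookups are guarded by `if cm_hash`, so passing nil preserves the old behavior. A hedged usage sketch; the requests and solver_details objects come from the surrounding Harvester tooling, and the operator, search term, URL, and id values are illustrative:

  # Incremental reporting: each page's results are relayed to the crawler manager.
  cm_hash = {
    crawler_manager_url: "http://localhost:3000",  # assumed example URL
    selector_id: "1"                               # assumed example id
  }
  scraper = GeneralScraper.new("site:example.com", "transparency",
                               requests, solver_details, cm_hash)

  # Bulk behavior: with nil, @cm_url stays unset and results accumulate in @output.
  bulk_scraper = GeneralScraper.new("site:example.com", "transparency",
                                    requests, solver_details, nil)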
@@ -44,11 +48,13 @@ class GeneralScraper
         @requests.restart_browser
         check_results(@requests.get_page(requested_page), requested_page)
       end
-    elsif page.include?("403") && page.length < 100
-      @requests.restart_browser
-      check_results(@requests.get_page(requested_page), requested_page)
     else # No CAPTCHA found :)
-      navigate_save_results(page)
+      begin
+        navigate_save_results(page)
+      rescue Exception
+        @requests.restart_browser
+        check_results(@requests.get_page(requested_page), requested_page)
+      end
     end
   end
 
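The hunk above drops the dedicated 403 branch and instead wraps navigate_save_results in a begin/rescue: any error while saving results now restarts the browser and re-runs check_results on a freshly fetched page. A minimal standalone sketch of that retry pattern, using illustrative method names rather than the gem's own (and StandardError where the gem rescues Exception):

  # Retry-once-on-failure sketch; `requests` stands in for the gem's request handler.
  def process_with_retry(requests, url)
    page = requests.get_page(url)
    begin
      handle(page)               # placeholder for navigate_save_results
    rescue StandardError
      requests.restart_browser   # assume a restart clears the bad session
      handle(requests.get_page(url))
    end
  end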
@@ -102,17 +108,46 @@ class GeneralScraper
   def getData
     search
     @urllist.each do |url|
-      getPageData(url)
+      report_results(getPageData(url), url)
     end
 
     @requests.close_all_browsers
-    return JSON.pretty_generate(@output)
   end
 
+  # Figure out how to report results
+  def report_results(results, link)
+    if @cm_url
+      report_incremental(results, link)
+    else
+      report_bulk(results)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, link)
+    curl_url = @cm_url+"/relay_results"
+    c = Curl::Easy.http_post(curl_url,
+                             Curl::PostField.content('selector_id', @selector_id),
+                             Curl::PostField.content('status_message', "Collected " + link),
+                             Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
+  # Add page hash to output for bulk reporting
+  def report_bulk(results)
+    @output.push(results)
+  end
+
+
   # Returns a list of search result URLs
   def getURLs
     search
     @requests.close_all_browsers
     return JSON.pretty_generate(@urllist)
   end
+
+  # Get the JSON of all the data
+  def get_json_data
+    return JSON.pretty_generate(@output)
+  end
 end
+
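Taken together, these additions split result handling into two paths: when a crawler manager URL was supplied, each page hash is POSTed immediately to the manager's /relay_results endpoint via Curl::Easy; otherwise it is buffered in @output and retrieved afterwards with the new get_json_data. The crawler manager itself is not part of this diff, so the receiver below is purely hypothetical, useful only for inspecting locally what report_incremental sends:

  require 'sinatra'
  require 'json'

  # Hypothetical local stand-in for the crawler manager's /relay_results endpoint.
  post '/relay_results' do
    page = JSON.parse(params['results'])
    puts "#{params['selector_id']}: #{params['status_message']} (#{page.keys.length} fields)"
    status 200
  end

On the bulk path, note that getData no longer returns the pretty-printed JSON; callers that relied on its return value now call get_json_data after getData finishes.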
data/lib/parse_page.rb CHANGED
@@ -8,7 +8,7 @@ module ParsePage
       html = Nokogiri::HTML(page)
       pagehash = getMetadata(url, html)
       pagehash = getContent(url, pagehash, html)
-      @output.push(pagehash)
+      return pagehash
     rescue
     end
   end
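getPageData now hands the parsed page hash back to its caller instead of pushing it onto @output, which is what lets GeneralScraper#report_results choose between incremental and bulk reporting. Because the bare rescue swallows parse failures, getPageData returns nil for pages that fail to parse, so report_results can receive nil. A small illustrative guard for the caller shown in the generalscraper.rb diff above; the nil check is an assumption, not present in the gem:

  # Inside GeneralScraper#getData: skip reporting when parsing failed.
  @urllist.each do |url|
    results = getPageData(url)
    report_results(results, url) if results
  end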
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.19
+  version: 0.0.20
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-16 00:00:00.000000000 Z
+date: 2016-10-08 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com