generalscraper 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +4 -4
- data/lib/proxy_manager.rb +10 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee21102a93458476611426362719e7b3bc7edc9d
+  data.tar.gz: 0ae6818f49029a0b3cab8c8e9d9ff3ba7f23d26e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2203574a839f5c122621153bbd8666c5686bbfd3ee9020a74c26e6acdfa80aeec5263a45ea2d80bc092568b1633bc8f964429ba3fee6d5853b854a5a767a5ddd
+  data.tar.gz: 419c278558b0b17ddeadc3cd61c4891711fb20d0678706f8f50c16441165a0c9f8c0172f5c2160a07d8b690976162d8e0c2f93758c01891845c571aa49d1c1b2
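
These digests pin metadata.gz and data.tar.gz for the 0.0.7 release. As a rough, unofficial sketch of how such a digest could be checked against an unpacked gem, assuming the archive has already been extracted (inside a real .gem the file actually ships gzipped as checksums.yaml.gz, so these paths are an assumption):

  require 'digest'
  require 'yaml'

  # Hypothetical unpacked layout: checksums.yaml and data.tar.gz side by side.
  checksums = YAML.load_file('checksums.yaml')
  expected  = checksums['SHA512']['data.tar.gz']
  actual    = Digest::SHA512.file('data.tar.gz').hexdigest

  puts(actual == expected ? 'data.tar.gz verified' : 'checksum mismatch')
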
data/lib/generalscraper.rb
CHANGED
@@ -9,7 +9,7 @@ class GeneralScraper
   include ParsePage
   include ProxyManager
 
-  def initialize(operators, searchterm, proxylist)
+  def initialize(operators, searchterm, proxylist, use_proxy)
     @operators = operators
     @searchterm = searchterm
     @op_val = @operators.split(" ")[0].split(":")[1]
@@ -19,11 +19,12 @@ class GeneralScraper
     @output = Array.new
     @urllist = Array.new
     @startindex = 10
+    @use_proxy = use_proxy
   end
 
   # Searches for links on Google
   def search
-    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm))
+    categorizeLinks(getPage("http://google.com", @operators + " " + @searchterm, @use_proxy))
   end
 
   # Categorizes the links on results page into results and other search pages
@@ -60,7 +61,7 @@ class GeneralScraper
 
     if page_index_num.to_i == @startindex
       @startindex += 10
-      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0"))
+      categorizeLinks(getPage("http://google.com" + link.href + "&filter=0", @use_proxy))
     end
   end
 
@@ -80,4 +81,3 @@ class GeneralScraper
     return JSON.pretty_generate(@urllist)
   end
 end
-
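
The release threads a new use_proxy flag from the constructor into every getPage call. A minimal caller sketch, assuming the operator string follows the "site:..." pattern implied by the @op_val parsing above; the proxy list argument's file format is not shown in this diff, so "proxies.txt" is a hypothetical placeholder:

  require 'generalscraper'

  # use_proxy = false: each request sleeps 20 seconds instead of routing
  # through a random proxy (see data/lib/proxy_manager.rb below).
  scraper = GeneralScraper.new("site:example.com", "transparency", "proxies.txt", false)
  scraper.search
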
data/lib/proxy_manager.rb
CHANGED
@@ -4,10 +4,16 @@ require 'uri'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil, fail_count = 0)
+  def getPage(url, form_input = nil, fail_count = 0, use_proxy)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
-
+
+      # Set proxy if specified, otherwise delay to avoid blocks
+      if use_proxy
+        a.set_proxy(*getRandomProxy(url))
+      else
+        sleep(20)
+      end
     end
 
   # Slightly different based on filling in form or not
@@ -19,8 +25,8 @@ module ProxyManager
     else
      return agent.get(url)
     end
-  rescue # Only retry request
-    getPage(url, form_input, fail_count+=1) if fail_count <
+  rescue # Only retry request 10 times
+    getPage(url, form_input, fail_count+=1) if fail_count < 10
  end
 end
 
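
The first hunk makes proxy use a per-call choice: route through a random proxy, or throttle with a fixed 20-second sleep. A standalone sketch of that pattern with assumed names (getRandomProxy is defined elsewhere in ProxyManager and presumably returns arguments splat-able into Mechanize#set_proxy; build_agent is not the gem's API):

  require 'mechanize'

  # Hedged sketch: proxy defaults to a hypothetical [host, port] pair.
  def build_agent(use_proxy, proxy = ["127.0.0.1", 8080])
    Mechanize.new do |a|
      a.user_agent_alias = "Linux Firefox"
      if use_proxy
        a.set_proxy(*proxy)  # Mechanize#set_proxy(host, port, user = nil, pass = nil)
      else
        sleep(20)            # throttle unproxied requests to avoid blocks
      end
    end
  end

One caveat in the released code: the retry call getPage(url, form_input, fail_count+=1) passes three arguments to a method whose required use_proxy parameter follows two optional ones. Ruby binds required parameters before optional ones, so on a retry the incremented counter appears to land in use_proxy while fail_count falls back to its default of 0.
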
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-
+date: 2014-08-22 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com