generalscraper 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generalscraper.rb +14 -2
- data/lib/parse_page.rb +19 -12
- data/lib/proxy_manager.rb +33 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dbbd54bb1986056a1e3207eda365f04d4684e0f2
+  data.tar.gz: 274b1a30166371f8f587484020c20a86690e7161
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0f99388b754103326c19a436d41dac5740169beb36554c40f6731a6c0bb3c2d0232742d17fab442c41ee34f55e424dfe6c046d0a0b2c7156d53d5a370d1800ac
+  data.tar.gz: d7f7ed11b8d11c4e0a8b85e83a35d86b3729633d4f2335d8dc6e3d206bd9b04eaaf02070458b28f879b8fd5df9620da37c34770606b3d26a58faa385c78f45e9
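As an aside, the published digests can be re-checked locally with Ruby's standard library. A minimal sketch, assuming the extracted metadata.gz and data.tar.gz sit next to checksums.yaml in the current directory (the script name is made up):

# verify_checksums.rb -- hypothetical helper, not part of the gem
require 'digest'
require 'yaml'

checksums = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |file|
  actual   = Digest::SHA512.file(file).hexdigest
  expected = checksums['SHA512'][file]
  puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end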
data/lib/generalscraper.rb
CHANGED
@@ -29,14 +29,25 @@ class GeneralScraper
   # Categorizes the links on results page into results and other search pages
   def categorizeLinks(page)
     page.links.each do |link|
-      if
+      if isResultLink?(link)
         siteURLSave(link)
-      elsif
+      elsif isSearchPageLink?(link)
         nextSearchPage(link)
       end
     end
   end
 
+  # Determines if url is link to search result
+  def isResultLink?(link)
+    return (link.href.include? @op_val) && (!link.href.include? "webcache") && (!link.href.include? @operators.gsub(" ", "+"))
+  end
+
+  # Determines if URL is link to next search page
+  def isSearchPageLink?(link)
+    return (link.href.include? "&sa=N") && (link.href.include? "&start=")
+  end
+
+
   # Parse and save the URLs for search results
   def siteURLSave(link)
     site_url = link.href.split("?q=")[1]

@@ -53,6 +64,7 @@ class GeneralScraper
     end
   end
 
+
   # Gets all data and returns in JSON
   def getData
     search
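To see what the two new predicates separate, here is a standalone sketch that reproduces the same include? checks outside the class, with made-up values for @op_val and @operators (in the gem these come from the search configuration):

# classify_links.rb -- illustration only, URLs and values are invented
op_val    = "example.com"
operators = "site:example.com"

hrefs = [
  "/url?q=http://example.com/report.pdf&sa=U",                                # search result
  "/search?q=site:example.com&start=10&sa=N",                                 # next results page
  "http://webcache.googleusercontent.com/search?q=cache:example.com/report",  # cached copy
]

hrefs.each do |href|
  if href.include?(op_val) && !href.include?("webcache") && !href.include?(operators.gsub(" ", "+"))
    puts "#{href} -> result (siteURLSave)"
  elsif href.include?("&sa=N") && href.include?("&start=")
    puts "#{href} -> next search page (nextSearchPage)"
  else
    puts "#{href} -> ignored"
  end
end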
data/lib/parse_page.rb
CHANGED
@@ -4,26 +4,26 @@ module ParsePage
   # Get both page metadata and text
   def getPageData(url)
     begin
-
-      pagehash =
+      html = Nokogiri::HTML(getPage(url).body)
+      pagehash = getMetadata(url, html)
+      pagehash = getContent(url, pagehash, html)
       @output.push(pagehash)
     rescue
     end
   end
 
   # Get the page content by type of page
-  def getContent(url, pagehash)
+  def getContent(url, pagehash, html)
     if url.include? ".pdf"
       return getPDF(url, pagehash)
     else
-      return getHTMLText(url, pagehash)
+      return getHTMLText(url, pagehash, html)
     end
   end
 
   # Download the page text
-  def getHTMLText(url, pagehash)
-
-    pagehash[:text] = html.css("body").text.encode("UTF-8")
+  def getHTMLText(url, pagehash, html)
+    pagehash[:text] = fixEncode(html.css("body").text)
     return pagehash
   end
 

@@ -35,12 +35,12 @@ module ParsePage
     # OCR PDF and save fields
     u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
     pdfparse = JSON.parse(u.handleDoc)
-    pdfparse.each{|k, v| pagehash[k] = v
+    pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
     return pagehash
   end
 
   # Get the page metadata
-  def getMetadata(url)
+  def getMetadata(url, html)
     pagehash = Hash.new
 
     # Save URL and date retreived

@@ -50,14 +50,21 @@ module ParsePage
     pagehash[:date_retrieved] = Time.now
 
     # Get title and meta tag info
-
-    pagehash[:title] = html.css("title").text.encode("UTF-8")
+    pagehash[:title] = fixEncode(html.css("title").text)
     html.css("meta").each do |m|
       if m
-        pagehash[m['name']] = m['content']
+        pagehash[m['name']] = fixEncode(m['content'])
       end
     end
 
     return pagehash
   end
+
+  def fixEncode(str)
+    if str.is_a?(String)
+      return str.unpack('C*').pack('U*')
+    else
+      return str
+    end
+  end
 end
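The new fixEncode helper is worth a note: unpack('C*') reads the string as raw byte values and pack('U*') re-encodes each byte value as a UTF-8 codepoint, which effectively treats the input as Latin-1 and transcodes it to UTF-8. A small standalone illustration (the sample string is invented):

# "café" with é as the single Latin-1 byte 0xE9
latin1 = "caf\xE9".force_encoding("ASCII-8BIT")

utf8 = latin1.unpack('C*').pack('U*')
puts utf8           # => café
puts utf8.encoding  # => UTF-8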
data/lib/proxy_manager.rb
CHANGED
@@ -1,35 +1,53 @@
 require 'active_support/time'
 require 'mechanize'
+require 'uri'
 
 module ProxyManager
   # Get the page with a proxy
-  def getPage(url, form_input = nil)
+  def getPage(url, form_input = nil, fail_count = 0)
     agent = Mechanize.new do |a|
       a.user_agent_alias = "Linux Firefox"
-      a.set_proxy(getRandomProxy
+      a.set_proxy(*getRandomProxy(url))
     end
 
-
-
-
-
-
-
+    # Slightly different based on filling in form or not
+    begin
+      if form_input
+        gform = agent.get(url).form("f")
+        gform.q = form_input
+        return agent.submit(gform, gform.buttons.first)
+      else
+        return agent.get(url)
+      end
+    rescue # Only retry request 5 times
+      getPage(url, form_input, fail_count+=1) if fail_count < 5
     end
   end
 
   # Choose a random proxy
-  def getRandomProxy
+  def getRandomProxy(url)
     max = @proxylist.length
     chosen = @proxylist[Random.rand(max)]
 
-    # Only use proxy if it hasn't been used in last 20 seconds
-    if
-      @usedproxies[chosen] = Time.now
-      return chosen
+    # Only use proxy if it hasn't been used in last 20 seconds on same host
+    if isNotUsed?(chosen, url)
+      @usedproxies[chosen] = [Time.now, URI.parse(url).host]
+      return parseProxy(chosen)
     else
-      sleep(0.
-      getRandomProxy
+      sleep(0.005)
+      getRandomProxy(url)
     end
   end
+
+  # Splits up proxy into IP, port, user, password
+  def parseProxy(chosen)
+    proxy_info = chosen.split(":")
+    proxy_info[proxy_info.length-1] = proxy_info.last.strip
+    return proxy_info
+  end
+
+  # Checks if a proxy has been used on domain in the last 20 seconds
+  def isNotUsed?(chosen, url)
+    return !@usedproxies[chosen] || @usedproxies[chosen][0] <= Time.now-20 || @usedproxies[chosen][1] != URI.parse(url).host
+  end
 end
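For context, a quick sketch of the proxy-list entry format that parseProxy and the splatted set_proxy call appear to assume; the sample entry below is invented:

# Each @proxylist entry is assumed to look like "host:port" or
# "host:port:user:password", possibly with a trailing newline.
line = "203.0.113.10:8080:scraper:secret\n"

proxy_info = line.split(":")
proxy_info[proxy_info.length - 1] = proxy_info.last.strip
p proxy_info  # => ["203.0.113.10", "8080", "scraper", "secret"]

# These four values line up with Mechanize's set_proxy(addr, port, user, pass)
# arguments, which is presumably why getPage can splat the result into
# agent.set_proxy(*getRandomProxy(url)).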
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: generalscraper
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-
+date: 2014-04-10 00:00:00.000000000 Z
 dependencies: []
 description: Scrapes Google
 email: shidash@shidash.com