RubyGems - crawlfish - Versions diffs - 0.0.2 → 0.0.3 - Mend

crawlfish 0.0.2 → 0.0.3

Files changed (4)

@@ -43,7 +43,7 @@
       <file leaf-file-name="crawlfish.gemspec" pinned="false" current="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
           <provider selected="true" editor-type-id="text-editor">
-            <state line="10" column="21" selection-start="335" selection-end="335" vertical-scroll-proportion="0.0">
+            <state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
               <folding />
             </state>
           </provider>
@@ -70,7 +70,7 @@
       <file leaf-file-name="crawlfish.rb" pinned="false" current="false" current-in-tab="false">
         <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
           <provider selected="true" editor-type-id="text-editor">
-            <state line="61" column="3" selection-start="1587" selection-end="1587" vertical-scroll-proportion="0.0">
+            <state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
               <folding />
             </state>
           </provider>
@@ -322,23 +322,23 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/Gemfile">
+    <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
       <provider selected="true" editor-type-id="text-editor">
-        <state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
+        <state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
           <folding />
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
+    <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
       <provider selected="true" editor-type-id="text-editor">
-        <state line="10" column="21" selection-start="335" selection-end="335" vertical-scroll-proportion="0.0">
+        <state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
           <folding />
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
+    <entry file="file://$PROJECT_DIR$/Gemfile">
       <provider selected="true" editor-type-id="text-editor">
-        <state line="61" column="3" selection-start="1587" selection-end="1587" vertical-scroll-proportion="0.0">
+        <state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
           <folding />
         </state>
       </provider>

data/lib/crawlfish.rb CHANGED

@@ -2,6 +2,7 @@ require 'nokogiri'
 require 'open-uri'
 module Crawlfish
+  LOL = "giggle"
   class GoogleScraper
     attr_accessor :website, :keyword, :user_agent, :start, :i, :position
     def initialize(options)
@@ -59,4 +60,163 @@ module Crawlfish
       @current_page += 1
     end
   end
-end
+end
+=begin
+module Lol
+ def scrape
+    require 'gscraper'
+    set_keyword
+    set_domain
+    set_user_agent
+    reset_pager
+    query_keyword
+    # Until position is found OR ten pages are searched, loop through ten pages
+    until @position or @pager_position > 10
+      search_this_page
+      sleep 2.seconds
+      next_page
+    end
+    @position ||= -1 # -1 if not found
+    log_the_end_of_the_scrape
+    return {:position => @position, :measured_at => Time.now, :engine => "Google"}
+  end
+  def create_logger
+    @log = Logger.new("#{Rails.root}/log/scraping.log")
+  end
+  def log(log_message)
+    @log.debug log_message
+  end
+  def query_keyword
+    @query = GScraper::Search.query(:query => @keyword)
+  end
+  def set_keyword
+    @keyword = self.text
+  end
+  def set_domain
+    @domain = self.website.url
+  end
+  def set_user_agent
+    @user_agent = get_random_user_agent
+  end
+  def reset_pager
+    @pager_position = 1
+  end
+  def search_this_page
+    log "page: #{@pager_position}"
+    begin
+      @query.page(@pager_position).each do |result|
+        host = URI::parse(URI::extract(result.url.to_s).first).host
+        log "#{result.rank}. Checking host: '#{host}' against domain: '#{@domain}'"
+        if host == @domain
+          @position = result.rank
+        end
+      end
+    rescue
+      log "rescued."
+    end
+  end
+  def next_page
+    @pager_position += 1
+  end
+ end
+# =========================================================================================================================
+           module lol2
+def scrape
+82
+-    until position or start==100 # until position gets set or reaches 10th page
+83
+-      page = (start.to_i+10)/10
+84
+-      log.debug "page: #{page}... "
+85
+-      url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
+86
+-      log.debug "url: #{url}"
+87
+-      #url = Rails.root + "test/google-search/search#{page}.html"
+88
+-      doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
+89
+-      links = doc.xpath('//h3/a[contains(@class, "l")]')
+90
+-      #links = doc.search('//h3/a[@class="l"]')
+91
+-
+92
+-      #if links empty, try a more general scrape
+93
+-      if links.empty?
+94
+-        links = doc.xpath('//h3[@class = "r"]/a]')
+95
+-      end
+96	66
+97
+-      if links.empty?
+98
+-        if doc.at_xpath('//div/p').to_s =~ /did not match any documents/
+99
+-          # If no results returned,
+100
+-          log.debug "Page contains no results"
+101
+-          break
+102
+-        else
+103
+-          # No links, but also no "found no results" page. throw error
+104
+-          log.debug doc.to_s
+105
+-          log.debug "Raising: PageContainsNoLinks"
+106
+-          raise "PageContainsNoLinks"
+107
+-        end
+108
+-      end
+109	67
+110
+-      links.each do |link|
+111
+-
+112
+-        # Remove protocol prefix
+113
+-        to_remove = ["http://", "https://"]
+114
+-        reg = Regexp.new(to_remove.map{ |s| "(#{s})" }.join('|'))
+115
+-        link = link['href'].gsub(reg, '')
+116	68
+117
+-        # If link start with '/url?q=', remove it
+118
+-        if link[0..6] == '/url?q='
+119
+-           link = link[7..-1]
+120
+-        end
+121
+-
+122
+-        # If link starts with 'www.', remove it
+123
+-        if link[0..3] == 'www.'
+124
+-          link = link[4..-1]
+125
+-        end
+126
+-
+127
+-        log.debug "link #{i}. '#{link[0..domain.length-1]}' against '#{domain}'"
+ 	69
++  def scrape=end

data/lib/crawlfish/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Crawlfish
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: crawlfish
 version: !ruby/object:Gem::Version
-  hash: 27
+  hash: 25
   prerelease: false
   segments:
   - 0
   - 0
-  - 2
-  version: 0.0.2
+  - 3
+  version: 0.0.3
 platform: ruby
 authors:
 - Dan Neumann