RubyGems - rhack - Versions diffs - 1.2.1 → 1.2.7 - Mend

rhack 1.2.1 → 1.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

checksums.yaml +13 -5
data/README.md +21 -9
data/ext/curb/curb.c +977 -977
data/ext/curb/curb.h +52 -52
data/ext/curb/curb_config.h +270 -270
data/ext/curb/curb_easy.c +3437 -3434
data/ext/curb/curb_easy.h +94 -94
data/ext/curb/curb_errors.c +647 -647
data/ext/curb/curb_errors.h +129 -129
data/ext/curb/curb_macros.h +162 -162
data/ext/curb/curb_multi.c +704 -702
data/ext/curb/curb_multi.h +26 -26
data/ext/curb/curb_postfield.c +523 -523
data/ext/curb/curb_postfield.h +40 -40
data/ext/curb/curb_upload.c +80 -80
data/ext/curb/curb_upload.h +30 -30
data/ext/curb-original/curb.c +977 -977
data/ext/curb-original/curb.h +52 -52
data/ext/curb-original/curb_config.h +238 -238
data/ext/curb-original/curb_easy.c +3404 -3404
data/ext/curb-original/curb_easy.h +90 -90
data/ext/curb-original/curb_errors.c +647 -647
data/ext/curb-original/curb_errors.h +129 -129
data/ext/curb-original/curb_macros.h +159 -159
data/ext/curb-original/curb_multi.c +633 -633
data/ext/curb-original/curb_multi.h +26 -26
data/ext/curb-original/curb_postfield.c +523 -523
data/ext/curb-original/curb_postfield.h +40 -40
data/ext/curb-original/curb_upload.c +80 -80
data/ext/curb-original/curb_upload.h +30 -30
data/lib/rhack/clients/base.rb +61 -10
data/lib/rhack/clients/oauth.rb +4 -4
data/lib/rhack/curl/easy.rb +1 -0
data/lib/rhack/curl/global.rb +2 -0
data/lib/rhack/curl/response.rb +4 -2
data/lib/rhack/frame.rb +70 -32
data/lib/rhack/js/browser/env.js +697 -697
data/lib/rhack/js/browser/jquery.js +7180 -7180
data/lib/rhack/js/browser/xmlsax.js +1564 -1564
data/lib/rhack/js/browser/xmlw3cdom_1.js +1443 -1443
data/lib/rhack/js/browser/xmlw3cdom_2.js +2744 -2744
data/lib/rhack/page.rb +227 -68
data/lib/rhack/scout.rb +52 -26
data/lib/rhack/scout_squad.rb +10 -2
data/lib/rhack/version.rb +1 -1
data/rhack.gemspec +1 -1
metadata +17 -17

data/lib/rhack/page.rb CHANGED Viewed

@@ -33,89 +33,173 @@ module RHACK
     # for debug, just enable L#debug, don't write tons of chaotic log-lines
     __init__
     attr_writer :title
-    attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
+    attr_reader :body, :loc, :data, :doc, :js, :curl, :curl_res, :failed
+    alias :hash :data # DEPRECATED
+    alias :html :body # DEPRECATED
     # result of page processing been made in frame context
     attr_accessor :res
     # for johnson
     @@ignore = /google|_gat|tracker|adver/i
-    def initialize(obj='', loc=Hash.new(''), js=Johnson::Runtime.browser||Johnson::Runtime.new)
+      # Frame calls it with no args
+    def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
       loc = loc.parse:uri if !loc.is Hash
       @js = js
       if obj.is Curl::Easy or obj.kinda Scout
         c = obj.kinda(Scout) ? obj.http : obj
-        @html = ''
         # just (c, loc) would pass to #process opts variable that returns '' on any key
         process(c, loc.b || {})
       else
-        @html = obj
+        @body = obj
         @loc = loc
       end
     end
     def empty?
-      !(@hash.nil? ? @html : @hash).b
+      !@data && !@body.b
+    end
+    def size
+      if @data.nil?
+        (@body || '').size
+      elsif @data == false
+        0
+      else
+        @data.inspect.size
+      end
     end
     def inspect
-      if !@hash.nil?
-        "<##{self.class.name} (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
+      sz = size
+      if !@data.nil?
+        "<##{self.class.name} (#{@data == false ? 'failed to parse' : sz.bytes}) #{@json ? 'json' : 'url params'}>"
       else
-        "<##{self.class.name} #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
+        "<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
       end
     end
-    def html!(encoding='UTF-8')
-      @html.force_encoding(encoding)
+    def utf!
+      @body.utf!
     end
-    def url() @loc.href end
+    def url
+      @loc.href
+    end
     alias :href :url
+    # override this in a subclass
+    def failed?(*)
+      @curl_res.code != 200
+    end
+    # override this in a subclass
+    def retry?(*)
+      false
+    end
+    # override this in a subclass
+    # MUST return self if successful
+    # MAY return false otherwise
+    def parse(opts={})
+      if failed?
+        failed!
+        if opts[:json] or opts[:hash]
+          @data = false
+        end
+        return self
+      end
+      if opts[:json]
+        parse_json opts
+      elsif opts[:hash]
+        parse_hash opts
+      elsif opts[:xml]
+        parse_xml opts
+      else
+        parse_html opts
+      end
+      self
+    end
+  private
+    def failed!
+      @body = @curl_res.body
+      @failed = @curl_res.code
+    end
+    def log_failed(action)
+      L.debug "Failed #{action} from #{@curl.last_effective_url}, take a look at my @body for info; my object_id is #{object_id}"
+    end
+    def parse_xml(*)
+      @body = @curl_res.body.xml_to_utf
+      to_xml
+    rescue StandardError => e
+      L.warn "Exception raised during `to_xml': #{e.inspect}"
+      log_failed "to parse page as XML"
+      failed!
+    end
+    def parse_html(opts={})
+      @body = @curl_res.body.xml_to_utf
+      to_html
+      if opts[:eval]
+        load_scripts opts[:load_scripts]
+        eval_js
+      end
+    rescue StandardError => e
+      L.warn "Exception raised during `to_html': #{e.inspect}"
+      log_failed "to parse page as HTML"
+      failed!
+    end
+    def parse_json(*)
+      @json = true
+      begin
+        @data = @curl_res.body.from_json
+      rescue StandardError => e
+        L.warn "Exception raised during `from_json': #{e.inspect}"
+      end
+      if !@data or @data.is String
+        log_failed "to get JSON"
+        failed!
+        @data = false
+      end
+    end
+    def parse_hash(*)
+      if @curl_res.body.inline
+        @data = @curl_res.body.to_params
+      else
+        log_failed "to get url-params hash"
+        failed!
+        @data = false
+      end
+    end
+  public
     # We can then alternate #process in Page subclasses
     # Frame doesn't mind about value returned by #process
     def process(c, opts={})
       @loc = c.last_effective_url.parse:uri
+      @curl = c
       @curl_res = c.res
-      L.debug "#{@loc.fullpath} -> #{@curl_res}"
-      if @curl_res.code == 200
-        body = @curl_res.body
-        if opts[:json]
-          @json = true
-          @hash = begin; body.from_json
-          rescue StandardError
-            false
-          end
-          if !@hash or @hash.is String
-            L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
-            @html = body; to_doc
-            @hash = false
-          end
-        elsif opts[:hash]
-          if body.inline
-            @hash = body.to_params
-          else
-            @hash = false
-            L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
-            @html = body; to_doc
-          end
-        else
-          @html = body.xml_to_utf
-          to_doc
-          if opts[:eval]
-            load_scripts opts[:load_scripts]
-            eval_js
-          end
-        end
-      elsif !(opts[:json] or opts[:hash])
-        @html = @curl_res.body
-        @failed = @curl_res.code
+      if retry?
+        c.retry!
+        return # callback will not proceed
       end
-      self
+      L.debug "#{@loc.fullpath} -> #{@curl_res}"
+      parse(opts)
     end
     def eval_js(frame=nil)
       eval_string "document.location = window.location = #{@loc.to_json};
       document.URL = document.baseURI = document.documentURI = location.href;
@@ -149,14 +233,18 @@ module RHACK
       end
     end
-    def to_doc
-      @doc = @html.to_doc :forceutf
+    def to_html
+      @doc = @body.to_html
+    end
+    def to_xml
+      @doc = @body.to_xml
     end
     def title(full=true)
-      if @hash.nil? and !@failed and @html.b
+      if @data.nil? and !@failed and @body.b
         if full
-          to_doc unless defined? @doc
+          to_html unless defined? @doc
           if @doc.title.b
             @title = @doc.title
           else
@@ -257,14 +345,14 @@ module RHACK
       end
     end
-    def __at(xp) (@doc || to_doc).at xp end
+    def __at(xp) (@doc || to_html).at xp end
-    def __find(xp) (@doc || to_doc).find xp end
+    def __find(xp) (@doc || to_html).find xp end
   public
     def at(selector_or_node, options={})
-      if selector_or_node and preresult = selector_or_node.is_a?(XML::Node) ?
+      if selector_or_node and preresult = selector_or_node.is_a?(LibXML::XML::Node) ?
           selector_or_node : __at(selector_or_node)
         preresult = preprocess_search_result(preresult, options[:preprocess])
@@ -277,7 +365,7 @@ module RHACK
     alias :first :at
     def find(selector_or_nodes, options={}, &foreach)
-      preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ?
+      preresult = selector_or_nodes.is_a?(LibXML::XML::XPath::Object, Array) ?
         selector_or_nodes : __find(selector_or_nodes)
       if preresult.size > 0
@@ -349,7 +437,7 @@ module RHACK
       form = "[action=#{@loc.path.inspect}]" if form == :self
       if form.is String
              form_node = at form
-             raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
+             raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
       else form_node = form
       end
       hash = form_node.inputs_all.merge!(hash)
@@ -376,13 +464,13 @@ module RHACK
     end
-    # OLD #
+    ### DEPRECATED ###
     # TODO: make into same form as #get_src and #map
     def get_srcs(links='img')
       begin
         links = find(links).map {|e| e.src} if links.is String
-      rescue XML::Error
+      rescue LibXML::XML::Error
         links = [links]
       end
       links.map {|link| expand_link link}.uniq
@@ -392,7 +480,7 @@ module RHACK
     #def get_src(link='img')
     #  begin
     #    link = at(link) && at(link).src if link.is String
-    #  rescue XML::Error; nil
+    #  rescue LibXML::XML::Error; nil
     #  end
     #  expand_link link if link
     #end
@@ -400,7 +488,7 @@ module RHACK
     def get_links(links='a')
       begin
         links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
-      rescue XML::Error
+      rescue LibXML::XML::Error
         links = [links]
       end
       links.map {|link| expand_link link}.uniq
@@ -421,22 +509,93 @@ module RHACK
     end
   end
+  ### Pages with specific processing
+  class XmlPage < Page
+    # override this in a subclass
+    # MUST return self if successful
+    # MAY return false otherwise
+    def parse(opts={})
+      if failed?
+        failed!
+      else
+        parse_xml opts
+      end
+      self
+    end
+  end
+  class HtmlPage < Page
+    # override this in a subclass
+    # MUST return self if successful
+    # MAY return false otherwise
+    def parse(opts={})
+      if failed?
+        failed!
+      else
+        parse_html opts
+      end
+      self
+    end
+  end
+  class JsonPage < Page
+    # override this in a subclass
+    # MUST return self if successful
+    # MAY return false otherwise
+    def parse(opts={})
+      if failed?
+        failed!
+      else
+        parse_json opts
+      end
+      self
+    end
+  end
+  class HashPage < Page
+    # override this in a subclass
+    # MUST return self if successful
+    # MAY return false otherwise
+    def parse(opts={})
+      if failed?
+        failed!
+      else
+        parse_hash opts
+      end
+      self
+    end
+  end
+  ### DEPRECATED ### Use native inheritance and override #retry instead
   # using reprocessing of page in case of non-200 response:
   # page_class = ReloadablePage do
   #   @res and @res.code != 200
   # end
   def ReloadablePage(&reload_condition)
-    rp = Class.new Page
-    rp.send :define_method, :process do |curl, opts|
-      super(curl, opts || {})
-      if curl.instance_eval &reload_condition
-        curl.retry!
-        nil # in case of reload_condition.call super's callback will not proceed
-      else self
+    Class.new Page do
+      define_method :process do |curl, opts|
+        super(curl, opts || {})
+        if curl.instance_eval &reload_condition
+          curl.retry!
+          nil # in case of reload_condition.call super's callback will not proceed
+        else self
+        end
       end
     end
-    rp
   end
 end

data/lib/rhack/scout.rb CHANGED Viewed

@@ -46,9 +46,17 @@ module RHACK
       @timeout    	= opts[:timeout] || @@timeout || 60
       @post_proc	= @get_proc = @head_proc = @put_proc = @delete_proc = Proc::NULL
       update uri
       @retry = opts[:retry] || {}
       @retry = {@uri.host => @retry} if @retry.is Array
+    end
+    def setup_curl
+      if loaded?
+        Curl.carier.remove @http
+      end
+      @http = Curl::Easy(@webproxy ? @proxy : @root)
+      @http.base = self
       @http.cacert = @@cacert
     end
@@ -66,8 +74,7 @@ module RHACK
       if @http
         @http.url = @webproxy ? @proxy : @root
       else
-        @http = Curl::Easy(@webproxy ? @proxy : @root)
-        @http.base = self
+        setup_curl
       end
       if @proxy
         @http.proxy_url = @proxy*':' if !@webproxy
@@ -186,12 +193,23 @@ module RHACK
         cks.map2 {|k, v| Cookie(k, v)}
     end
-    def retry?(err)
-      # exc = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
-      exc = (@@retry.keys + @retry.keys).select_in @root
-      return false if !exc.b
-      # ['www.nomer.org', 'nomer.org'].every {|www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
-      exc.no? {|e| err[0].self_name.in((@@retry[e] || []) + @retry[e])}
+    def retry?(eclass)
+      # sites = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
+      sites = (@@retry.keys + @retry.keys).select_in @root
+      return false if sites.empty?
+      errname = eclass.self_name
+      # retry = ['www.nomer.org', 'nomer.org'].any? {|www| {'nomer.org' => ['TimeoutError']}[www].include? 'TimeoutError'}
+      sites.any? {|site|
+        (@@retry[site] || []).include? errname or
+        (@retry[site] || []).include? errname
+      }
+    end
+    def retry!(path=@__path, headers=@__headers, not_redir=@__not_redir, relvl=@__relvl, callback=@__callback)
+      # all external params including post_body are still set
+      setup_curl # @http reload here
+      # and now we can set @http.on_complete back again
+      load(path, headers, not_redir, relvl, &callback)
     end
     def loaded?
@@ -209,10 +227,24 @@ module RHACK
       end
     rescue RuntimeError => e
       e.message << ". Failed to load allready loaded? easy handler: Bad file descriptor" unless Curl::Err::CurlError === e
-      raise e
+      L.warn "#{e.inspect}: #{e.message}"
+      if loaded?
+        Curl.carier.remove @http
+      end
+      sleep 1
+      load!
+      #e.message << ". Failed to load allready loaded? easy handler: Bad file descriptor" unless Curl::Err::CurlError === e
+      #raise e
     end
     def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
+      # cache preprocessed data for one time for we can do #retry
+      @__path = path
+      @__headers = headers
+      @__not_redir = not_redir
+      @__relvl = relvl
+      @__callback = callback
       @http.path = path = fix(path)
       @http.headers = mkHeader(path).merge!(headers)
       @http.timeout = @timeout
@@ -233,24 +265,18 @@ module RHACK
         end
       }
       @http.on_failure {|c, e|
+        eclass = e[0]
         @error = e
-        if e[0] == Curl::Err::CurlOK
-          # в сорцах on_failure не вызывается по коду 0, это какой-то глюк
-          # в любом случае такой поворот не означает ошибки
-          L.warn "Got Curl::Err::CurlOK, response was: #{c.res}"
-          load!
+        c.outdate!
+        # we must clean @http.on_complete, otherwise
+        # it would run right after this function and with broken data
+        @http.on_complete &Proc::NULL
+        if retry? eclass
+          L.debug "#{eclass} -> reloading scout"
+          retry!
         else
-          c.outdate!
-          if retry? e
-            L.debug "#{e[0]} -> reloading scout"
-            #load uri, headers, not_redir, relvl, &callback
-            load! # all params including post_body are still set
-            # DO they include `on_complete'?
-          else
-            @http.on_complete &Proc::NULL
-            L.debug "#{e[0]} -> not reloading scout"
-            raise *e if @raise_err
-          end
+          L.debug "#{eclass} -> not reloading scout"
+          raise *e if @raise_err
         end
       } if !@http.on_failure

data/lib/rhack/scout_squad.rb CHANGED Viewed

@@ -15,12 +15,16 @@ module RHACK
       if args[0].is Scout
         s = args[0]
       else
-        if !args[0].is String
-          args.unshift ''
+        unless args[0].is String
           if (opts = args[-1]).is Hash and (opts[:cp] || opts[:ck]).is Hash
             L.warn "it's useless to setup cookies for untargeted squad!"
           end
         end
+        if !args[0]
+          args[0] = ''
+        elsif !args[0].is String
+          args.unshift ''
+        end
         if args[1] and args[1][0].is Array
           proxies = args[1]
           args[1] = proxies.shift
@@ -43,7 +47,9 @@ module RHACK
     end
     def wait_for_available
+      #L.debug {"Curl.carier_thread = #{Curl.carier_thread}; Thread.current = #{Thread.current}"}
       Curl.execute :unless_already
+      #L.debug {"Curl.carier_thread = #{Curl.carier_thread}; Thread.current = #{Thread.current}"}
       # Carier.requests освобождаются ещё до колбека,
       # но колбеки выполняются последовательно,
       # поэтому здесь мы можем усыплять тред,
@@ -59,6 +65,7 @@ module RHACK
       raise PickError if !b
       # to_a because Array#reject returns object of this class
       if scout = to_a.rand_by_available?
+        L.debug {"randomly picked an available scout##{scout.object_id}"}
         scout
       else
         wait_for_available
@@ -69,6 +76,7 @@ module RHACK
     def next
       raise PickError if !b
       if scout = to_a.find_available?
+        L.debug {"picked the next available scout##{scout.object_id}"}
         scout
       else
         wait_for_available

data/lib/rhack/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module RHACK
-  VERSION = '1.2.1'
+  VERSION = '1.2.7'
 end

data/rhack.gemspec CHANGED Viewed

@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency "activesupport"
   #spec.add_runtime_dependency "redis"
-  spec.add_runtime_dependency "rmtools", ">= 2.3.0"
+  spec.add_runtime_dependency "rmtools", ">= 2.3.6"
   spec.add_runtime_dependency "libxml-ruby"
   spec.extensions << 'ext/curb/extconf.rb'