arb_spider 0.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b7b99674de972409ead3555f8eb50c34e060dbbb
4
- data.tar.gz: 9fa3fea8aae4964f9bb0026a0e1ba60f14c38ba4
3
+ metadata.gz: 5de63f3a7f1ae54a77faa7ac8a19bd9cbfa3bf4e
4
+ data.tar.gz: 6b8ac5b5b5ceeebeaf0feaf3f824ddb063f40b88
5
5
  SHA512:
6
- metadata.gz: 82d4182084187698eacd6b2c1ecdcddd899b8e1e1e2df2253b9d47534618b4314b3eb009dcd9ef7c0977f1d4356b83e48c9c806e91145db2443f9c2becb3c561
7
- data.tar.gz: e52fbe5ee98deebd8f2b4b38d457c217243bc51b0bd6c29fc021c43fdb18f7ce9c69cb3ed87ac507f19bc8f6b18989e96b27c1c74cc739f357ba595ef8ab6bfc
6
+ metadata.gz: 11d14959c81b2a0e0c51755fed28f64b3851ee35b71763b8d72e697269bb04fe635ddcfa102bc976416589939fa00ad3ccb4d7b9a66dba0252e5a25b0021d6a1
7
+ data.tar.gz: 8ab34e4442577b03b16e536bdaac322c0908efd5fd7b67d63bae6bf88a61aa7d0310a5968d399dd4752454f1b7952437845b816401c7c649f988da67e2aab585
data/Gemfile CHANGED
@@ -1,4 +1,8 @@
1
1
source 'https://rubygems.org'

# HTTP and HTML libraries required by lib/arb_spider/spider.rb.
gem 'httpclient', '~> 2.7.1'
gem 'nokogiri', '~> 1.6.7'

# Specify your gem's dependencies in arb_spider.gemspec
gemspec
data/lib/arb_spider.rb CHANGED
@@ -1,5 +1,14 @@
1
- require "arb_spider/version"
1
+ require 'arb_spider/version'
2
+ require 'arb_spider/spider'
2
3
 
3
4
# Top-level namespace for the arb_spider gem.
#
# Holds a lazily created default spider on the module itself and exposes
# +parse+ as a shortcut that forwards to that spider.
module ArbSpider
  class << self
    # The spider used by ArbSpider.parse; created on first use when unset.
    attr_accessor :default_spider

    # Forwards to the default spider's +parse+, creating the spider first if
    # none has been assigned yet.
    #
    # @param args [Array] arguments passed through to Spider#parse
    # @param block [Proc] optional block passed through to Spider#parse
    # @return [Object] whatever Spider#parse returns
    def parse(*args, &block)
      self.default_spider ||= ::ArbSpider::Spider.new
      default_spider.parse(*args, &block)
    end
  end

  # Instance-level shortcut kept for classes that include ArbSpider. It
  # delegates to the module-level default spider, because the including
  # class itself has no +default_spider+ accessor.
  #
  # @param args [Array] arguments passed through to ArbSpider.parse
  # @param block [Proc] optional block passed through to ArbSpider.parse
  # @return [Object] whatever ArbSpider.parse returns
  def parse(*args, &block)
    ::ArbSpider.parse(*args, &block)
  end
end
@@ -0,0 +1,37 @@
1
module ArbSpider
  # A Hash proxy that bounds the number of entries written through +[]=+.
  #
  # Writes of new keys are recorded in a ring of +scale+ slots (+help_hash+
  # maps slot index => key). When a slot is reused, the key it previously
  # held is deleted from +core_hash+. Every other message is forwarded to
  # +core_hash+.
  #
  # The class inherits from BasicObject so that nearly every call reaches
  # +method_missing+; top-level constants and Kernel methods must therefore
  # be written with an explicit +::+ prefix inside this class.
  class ScaleHash < BasicObject
    attr_accessor :help_hash # slot index => key currently occupying that slot
    attr_accessor :core_hash # the wrapped Hash holding the actual entries
    attr_accessor :counter   # number of slot-consuming writes so far, minus one
    attr_accessor :scale     # number of slots in the ring

    # @param scale [Integer] number of slots in the ring
    # @param args [Array] forwarded to ::Hash.new (e.g. a default value)
    # @param block [Proc] forwarded to ::Hash.new (default proc)
    def initialize(scale = 10, *args, &block)
      self.counter = -1
      self.scale = scale
      self.help_hash = ::Hash.new
      self.core_hash = ::Hash.new(*args, &block)
    end

    # Handles hijacked calls itself and forwards everything else to
    # +core_hash+ (including private methods, as +send+ is used).
    #
    # @return [Object] the stored value for a hijacked +[]=+, otherwise the
    #   result of the forwarded call
    # @raise [::NoMethodError] when +core_hash+ does not know +method+
    def method_missing(method, *args, &block)
      return args[1] if hijack_method(method, *args, &block)

      # Check up front instead of rescuing, so a NoMethodError raised inside
      # a forwarded block or default proc is not relabelled as ours.
      unless core_hash.respond_to?(method, true)
        ::Kernel.raise ::NoMethodError, "undefined method '#{method}' for #{core_hash}"
      end
      core_hash.send(method, *args, &block)
    end

    # Intercepts +[]=+ so that writes go through the slot ring.
    #
    # Rewriting a key that already occupies a slot only updates its value;
    # it does not take a second slot. Otherwise the stale first slot would
    # later evict the key while it is still live.
    #
    # @return [Boolean] true when the call was handled here (hijacked),
    #   false when it should be forwarded to +core_hash+
    def hijack_method(method, *args, &block)
      # A malformed []= call is forwarded so Hash raises its own ArgumentError.
      return false unless method == :'[]=' && args.size == 2

      key, value = args
      tracked = core_hash.key?(key) && help_hash.value?(key)
      unless tracked
        self.counter += 1
        index = counter % scale
        core_hash.delete(help_hash[index]) if help_hash.key?(index)
        help_hash[index] = key
      end
      core_hash[key] = value
      true
    end
  end
end
@@ -0,0 +1,63 @@
1
+ require 'httpclient'
2
+ require 'nokogiri'
3
+ require 'arb_spider/scale_hash'
4
+
5
module ArbSpider
  # HTTP helper that wraps an HTTPClient instance and keeps a bounded cache
  # of fetched responses keyed by URL.
  #
  # The class inherits from BasicObject so that unknown messages reach
  # +method_missing+ and can be forwarded to the client or to the current
  # response. As a consequence Kernel methods and top-level constants are
  # not implicitly available here; they are written as +::Kernel.raise+,
  # +::Hash+, +::StandardError+ and so on.
  class Spider < BasicObject
    attr_reader :cached_docs # ScaleHash (20 slots) of url => response
    attr_reader :current_doc # most recent response obtained by this spider
    attr_reader :client      # underlying ::HTTPClient instance

    # Builds the client and the cache, then yields the new spider to the
    # optional block.
    def initialize
      @client = ::HTTPClient.new
      @cached_docs = ScaleHash.new 20

      # after initializing
      yield self if ::Kernel.block_given?
    end

    # Returns the response for +url+, fetching and caching it on a miss.
    # Without a URL the current response is used. The returned response
    # also becomes the current one.
    #
    # NOTE(review): +block+ is accepted but not used by the visible code;
    # confirm the intended use before relying on it.
    #
    # @param url [String, nil] address to load; nil means "current response"
    # @param args [Array] extra request arguments; a Hash among them may
    #   carry +:method+ to choose the client method (defaults to +:get+)
    # @return [Object] the response object
    # @raise [::StandardError] when no response is available
    def parse(url = nil, *args, &block)
      doc = url.nil? ? current_doc : (@cached_docs[url] || fetch_doc(url, *args))
      ::Kernel.raise ::StandardError, "fail to load doc of specific url #{url}" if doc.nil?
      @current_doc = doc
    end

    # Forwards unknown messages to the HTTP client first (used to support
    # user authorization through httpclient methods, eg post) and then to
    # the current response. An HTTP::Message returned by the client becomes
    # the current response.
    #
    # @raise [::NoMethodError] when neither target knows +method+
    def method_missing(method, *args, &block)
      if @client.respond_to?(method, true)
        result = @client.send(method, *args, &block)
        @current_doc = result if ::HTTP::Message === result
        return result
      end

      if @current_doc && @current_doc.respond_to?(method, true)
        return @current_doc.send(method, *args, &block)
      end

      ::Kernel.raise ::NoMethodError, "undefined method '#{method}' for #{inspect}"
    end

    # Short textual form "<ClassName:object_id>". The class is reached via
    # the singleton class because BasicObject has no +class+ method.
    def inspect
      singleton = class << self
        self
      end
      '<%s:%d>' % [singleton.superclass.name, __id__]
    end

    private

    # Performs the request for +url+ and stores the response in the cache.
    #
    # @param url [String] address to request
    # @param args [Array] extra request arguments (see #parse)
    # @return [Object] the response returned by the client
    def fetch_doc(url, *args)
      # Copy option hashes so removing :method never mutates caller data.
      request_args = args.map { |arg| ::Hash === arg ? arg.dup : arg }
      has_options = request_args.any? { |arg| ::Hash === arg }
      http_method = (has_options && parse_args(:method, request_args)) || :get # default to use get method
      cached_docs[url] = client.send(http_method, url, *request_args)
    end

    # Removes +key+ from the first Hash found in +source+ and returns its
    # value. Without +source+, the local variables bound to +block+ are
    # searched instead.
    #
    # @param key [Object] option name to extract
    # @param source [Array, nil] values to search for an options Hash
    # @return [Object, nil] the removed value, or nil when the key is absent
    # @raise [::ArgumentError] when neither source nor block is given, or no
    #   Hash can be found
    def parse_args(key, source = nil, &block)
      unless source || block
        ::Kernel.raise ::ArgumentError, 'please provide either arguments source or a block with which the local variables bind! (eg parse_args{}) '
      end

      candidates = source || block.binding.local_variables.map do |name|
        block.binding.local_variable_get(name)
      end
      args_hash = candidates.find { |item| ::Hash === item }
      if args_hash.nil?
        ::Kernel.raise ::ArgumentError, 'no arguments hash found in current hash, please provide the proper source '
      end
      args_hash.delete key
    end
  end
end
@@ -1,3 +1,3 @@
1
1
module ArbSpider
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = '1.1.2'.freeze
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arb_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - arybin
@@ -55,6 +55,8 @@ files:
55
55
  - bin/console
56
56
  - bin/setup
57
57
  - lib/arb_spider.rb
58
+ - lib/arb_spider/scale_hash.rb
59
+ - lib/arb_spider/spider.rb
58
60
  - lib/arb_spider/version.rb
59
61
  homepage: https://github.com/arybin-cn/arb_spider
60
62
  licenses: []