digger 0.1.1 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 70e98c345c7da377cc9f84f68cb6b7f702f7b42b
4
- data.tar.gz: 5f3416916d1b726810075e5ea744a39ac3dd4487
2
+ SHA256:
3
+ metadata.gz: 5c4a94163a25d4b53ad5b477040f69b8fccca026adc313f8f61759317c1bf198
4
+ data.tar.gz: 307b443277c16708103c172e5fb4ef4d833f7d2631a7f85779570a3cbeac8925
5
5
  SHA512:
6
- metadata.gz: 5750dde57359a775d378e5dc84c75062bb3494af58f97708e84692cd5e6c9840719c40d6f0602244b542cc7e14d275dedd116b57119174366b05b04249674b99
7
- data.tar.gz: 2272ee6dd594a818b69cd0e4cb5eaba1f71954683d421f3f8654b9de1bc18a03e1f70cdf99a457b9d29f9add2736858bb2ffe22d31fa97073693bed4d8f64d44
6
+ metadata.gz: 9da40123fd09615d0c69ca5104d1141b82981813ef8d175bc567a0f35e8f7dd868ce235bc0f265308e2133710f63c3ce8325d6b528ae060eccb77904c12e3139
7
+ data.tar.gz: c05be67df6db25345acfdc3615690c0467029dd40ffde262863923b6f0786696a16fd547277ec4d3764ed39b399174dd28e10ca7eb31ffb55f773e08b04f2986
data/digger.gemspec CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_development_dependency "bundler", "~> 1.7"
21
+ spec.add_development_dependency "bundler", "~> 2.0"
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
23
 
24
24
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
data/lib/digger/http.rb CHANGED
@@ -49,7 +49,7 @@ module Digger
49
49
  url = URI(url)
50
50
  pages = []
51
51
  get(url, referer) do |response, code, location, redirect_to, response_time|
52
- handle_compression response
52
+ handle_compression response if handle_compression?
53
53
  pages << Page.new(location, body: response.body,
54
54
  code: code,
55
55
  headers: response.to_hash,
@@ -70,6 +70,13 @@ module Digger
70
70
  [Page.new(url, error: e, referer: referer, depth: depth)]
71
71
  end
72
72
 
73
+ #
74
+ # Whether to accept compressed responses; enabling this may cause encoding errors
75
+ #
76
+ def handle_compression?
77
+ @opts[:handle_compression]
78
+ end
79
+
73
80
  #
74
81
  # The maximum number of redirects to follow
75
82
  #
@@ -185,7 +192,7 @@ module Digger
185
192
  opts['User-Agent'] = user_agent if user_agent
186
193
  opts['Referer'] = referer.to_s if referer
187
194
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
188
- opts['Accept-Encoding'] = 'gzip,deflate'
195
+ opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
189
196
 
190
197
  retries = 0
191
198
  begin
data/lib/digger/model.rb CHANGED
@@ -17,6 +17,15 @@ module Digger
17
17
  }
18
18
  end
19
19
 
20
+ def validate_presence(*keys)
21
+ keys_all = pattern_config.keys
22
+ raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
23
+ end
24
+
25
+ def validate_includeness(*keys)
26
+ raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
27
+ end
28
+
20
29
  # index page
21
30
  def index_config
22
31
  @@digger_config['index'][self.name]
@@ -39,14 +48,14 @@ module Digger
39
48
  result
40
49
  end
41
50
 
42
- def dig_url(url)
43
- client = Digger::HTTP.new
51
+ def dig_url(url, opts = {})
52
+ client = Digger::HTTP.new(opts)
44
53
  page = client.fetch_page(url)
45
54
  match_page(page)
46
55
  end
47
56
 
48
- def dig_urls(urls, cocurrence = 1)
49
- Index.batch(urls, cocurrence){|url| dig_url(url) }
57
+ def dig_urls(urls, cocurrence = 1, opts = {})
58
+ Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
50
59
  end
51
60
 
52
61
  def dig(cocurrence = 1)
data/lib/digger/page.rb CHANGED
@@ -95,7 +95,13 @@ module Digger
95
95
  end
96
96
  end
97
97
 
98
+ def json
99
+ @json ||= JSON.parse body
100
+ end
98
101
 
102
+ def jsonp
103
+ @jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
104
+ end
99
105
 
100
106
  #
101
107
  # Discard links; the next call to page.links will return an empty array
@@ -27,33 +27,36 @@ module Digger
27
27
  end
28
28
 
29
29
  MATCH_MAX = 3
30
-
31
- TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
32
30
 
33
- def regexp?
34
- TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
35
- end
31
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
32
+ TYPES_CSS = %w{css_one css_many}
33
+ TYPES_JSON = %w{json jsonp}
34
+
35
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
36
36
 
37
37
  def match_page(page, &callback)
38
38
  blk = callback || safe_block
39
- if regexp? # regular expression
40
- index = TYPES.index(type)
41
- blk ||= ->(text){text.strip}
39
+ if TYPES_REGEXP.include?(type) # regular expression
40
+ blk ||= ->(text){ text.strip }
42
41
  # content is String
43
42
  if type == 'match_many'
44
43
  match = page.body.gsub(value).to_a
45
44
  else
45
+ index = TYPES_REGEXP.index(type)
46
46
  matches = page.body.match(value)
47
47
  match = matches.nil? ? nil : matches[index]
48
48
  end
49
- else # css expression
50
- blk ||= ->(node){node.content.strip}
49
+ elsif TYPES_CSS.include?(type) # css expression
50
+ blk ||= ->(node){ node.content.strip }
51
51
  # content is Nokogiri::HTML::Document
52
52
  if type == 'css_one'
53
53
  match = page.doc.css(value).first
54
- elsif type == 'css_many' # css_many
54
+ else
55
55
  match = page.doc.css(value)
56
56
  end
57
+ elsif TYPES_JSON.include?(type)
58
+ json = page.send(type)
59
+ match = json_fetch(json, value)
57
60
  end
58
61
  if match.nil?
59
62
  nil
@@ -66,6 +69,23 @@ module Digger
66
69
  nil
67
70
  end
68
71
 
72
+ def json_fetch(json, keys)
73
+ if keys.is_a? String
74
+ # parse json keys like '$.k1.k2[0]'
75
+ parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
76
+ p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
77
+ end
78
+ json_fetch(json, parts)
79
+ elsif keys.is_a? Array
80
+ if keys.length == 0
81
+ json
82
+ else
83
+ pt = keys.shift
84
+ json_fetch(json[pt[:index] || pt[:key]], keys)
85
+ end
86
+ end
87
+ end
88
+
69
89
  class Nokogiri::XML::Node
70
90
  %w{one many}.each do |name|
71
91
  define_method "inner_#{name}" do |css, &block|
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.5"
3
3
  end
data/spec/digger_spec.rb CHANGED
@@ -7,6 +7,9 @@ pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
7
7
 
8
8
  class Item < Digger::Model
9
9
  css_many sites: '.sites>a>span'
10
+ css_one logo: '.logo'
11
+ validate_presence :sites
12
+ validate_includeness :sites, :logo
10
13
  end
11
14
 
12
15
  describe Digger do
@@ -24,6 +27,9 @@ describe Digger do
24
27
  expect(item[:sites].include?('读远')).to be(true)
25
28
  end
26
29
 
30
+ it "validation support" do
31
+ end
32
+
27
33
  it "index multiple threading" do
28
34
 
29
35
  end
data/spec/page_spec.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'digger'
2
+ require 'json'
3
+
4
+ describe Digger::Page do
5
+ it 'page json' do
6
+ json_str = '{"a":1,"b":[1,2,3]}'
7
+ j1 = Digger::Page.new('', body: json_str)
8
+ j2 = Digger::Page.new('', body: "hello(#{json_str});")
9
+ expect(j1.json['a']).to eq(1)
10
+ expect(j2.jsonp['a']).to eq(1)
11
+ expect(j1.json['b'][0]).to eq(1)
12
+ expect(j2.jsonp['b'][1]).to eq(2)
13
+ end
14
+ end
@@ -0,0 +1,15 @@
1
+ require 'digger'
2
+ require 'json'
3
+
4
+ describe Digger::Pattern do
5
+ it 'json fetch' do
6
+ json = JSON.parse('{"a":1,"b":[1,2,3]}')
7
+ pt = Digger::Pattern.new
8
+ expect(pt.json_fetch(json, '$')['a']).to eq(1)
9
+ expect(pt.json_fetch(json, '$.a')).to eq(1)
10
+ expect(pt.json_fetch(json, '$.b').length).to eq(3)
11
+ expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
12
+ end
13
+
14
+
15
+ end
@@ -0,0 +1,10 @@
1
+ require 'digger'
2
+
3
+ class Item < Digger::Model
4
+ # css_many sites: '.sites>a>span'
5
+ css_one logo: '.logo'
6
+ css_one title: '.title'
7
+
8
+ validate_presence :sites
9
+ validate_includeness :sites, :logo, :title
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-02 00:00:00.000000000 Z
11
+ date: 2021-12-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.7'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.7'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -87,11 +87,14 @@ files:
87
87
  - lib/digger/pattern.rb
88
88
  - lib/digger/version.rb
89
89
  - spec/digger_spec.rb
90
+ - spec/page_spec.rb
91
+ - spec/pattern_spec.rb
92
+ - spec/validate_spec.rb
90
93
  homepage: ''
91
94
  licenses:
92
95
  - MIT
93
96
  metadata: {}
94
- post_install_message:
97
+ post_install_message:
95
98
  rdoc_options: []
96
99
  require_paths:
97
100
  - lib
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
106
109
  - !ruby/object:Gem::Version
107
110
  version: '0'
108
111
  requirements: []
109
- rubyforge_project:
110
- rubygems_version: 2.2.2
111
- signing_key:
112
+ rubygems_version: 3.2.32
113
+ signing_key:
112
114
  specification_version: 4
113
115
  summary: Dig need stractual infomation from web page.
114
116
  test_files:
115
117
  - spec/digger_spec.rb
118
+ - spec/page_spec.rb
119
+ - spec/pattern_spec.rb
120
+ - spec/validate_spec.rb