digger 0.1.1 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 70e98c345c7da377cc9f84f68cb6b7f702f7b42b
4
- data.tar.gz: 5f3416916d1b726810075e5ea744a39ac3dd4487
2
+ SHA256:
3
+ metadata.gz: 5c4a94163a25d4b53ad5b477040f69b8fccca026adc313f8f61759317c1bf198
4
+ data.tar.gz: 307b443277c16708103c172e5fb4ef4d833f7d2631a7f85779570a3cbeac8925
5
5
  SHA512:
6
- metadata.gz: 5750dde57359a775d378e5dc84c75062bb3494af58f97708e84692cd5e6c9840719c40d6f0602244b542cc7e14d275dedd116b57119174366b05b04249674b99
7
- data.tar.gz: 2272ee6dd594a818b69cd0e4cb5eaba1f71954683d421f3f8654b9de1bc18a03e1f70cdf99a457b9d29f9add2736858bb2ffe22d31fa97073693bed4d8f64d44
6
+ metadata.gz: 9da40123fd09615d0c69ca5104d1141b82981813ef8d175bc567a0f35e8f7dd868ce235bc0f265308e2133710f63c3ce8325d6b528ae060eccb77904c12e3139
7
+ data.tar.gz: c05be67df6db25345acfdc3615690c0467029dd40ffde262863923b6f0786696a16fd547277ec4d3764ed39b399174dd28e10ca7eb31ffb55f773e08b04f2986
data/digger.gemspec CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_development_dependency "bundler", "~> 1.7"
21
+ spec.add_development_dependency "bundler", "~> 2.0"
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
23
 
24
24
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
data/lib/digger/http.rb CHANGED
@@ -49,7 +49,7 @@ module Digger
49
49
  url = URI(url)
50
50
  pages = []
51
51
  get(url, referer) do |response, code, location, redirect_to, response_time|
52
- handle_compression response
52
+ handle_compression response if handle_compression?
53
53
  pages << Page.new(location, body: response.body,
54
54
  code: code,
55
55
  headers: response.to_hash,
@@ -70,6 +70,13 @@ module Digger
70
70
  [Page.new(url, error: e, referer: referer, depth: depth)]
71
71
  end
72
72
 
73
+ #
74
+ # Accept response compression, may bring encoding error if true
75
+ #
76
+ def handle_compression?
77
+ @opts[:handle_compression]
78
+ end
79
+
73
80
  #
74
81
  # The maximum number of redirects to follow
75
82
  #
@@ -185,7 +192,7 @@ module Digger
185
192
  opts['User-Agent'] = user_agent if user_agent
186
193
  opts['Referer'] = referer.to_s if referer
187
194
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
188
- opts['Accept-Encoding'] = 'gzip,deflate'
195
+ opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
189
196
 
190
197
  retries = 0
191
198
  begin
data/lib/digger/model.rb CHANGED
@@ -17,6 +17,15 @@ module Digger
17
17
  }
18
18
  end
19
19
 
20
+ def validate_presence(*keys)
21
+ keys_all = pattern_config.keys
22
+ raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
23
+ end
24
+
25
+ def validate_includeness(*keys)
26
+ raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
27
+ end
28
+
20
29
  # index page
21
30
  def index_config
22
31
  @@digger_config['index'][self.name]
@@ -39,14 +48,14 @@ module Digger
39
48
  result
40
49
  end
41
50
 
42
- def dig_url(url)
43
- client = Digger::HTTP.new
51
+ def dig_url(url, opts = {})
52
+ client = Digger::HTTP.new(opts)
44
53
  page = client.fetch_page(url)
45
54
  match_page(page)
46
55
  end
47
56
 
48
- def dig_urls(urls, cocurrence = 1)
49
- Index.batch(urls, cocurrence){|url| dig_url(url) }
57
+ def dig_urls(urls, cocurrence = 1, opts = {})
58
+ Index.batch(urls, cocurrence){|url| dig_url(url, opts) }
50
59
  end
51
60
 
52
61
  def dig(cocurrence = 1)
data/lib/digger/page.rb CHANGED
@@ -95,7 +95,13 @@ module Digger
95
95
  end
96
96
  end
97
97
 
98
+ def json
99
+ @json ||= JSON.parse body
100
+ end
98
101
 
102
+ def jsonp
103
+ @jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
104
+ end
99
105
 
100
106
  #
101
107
  # Discard links, a next call of page.links will return an empty array
@@ -27,33 +27,36 @@ module Digger
27
27
  end
28
28
 
29
29
  MATCH_MAX = 3
30
-
31
- TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
32
30
 
33
- def regexp?
34
- TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
35
- end
31
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
32
+ TYPES_CSS = %w{css_one css_many}
33
+ TYPES_JSON = %w{json jsonp}
34
+
35
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
36
36
 
37
37
  def match_page(page, &callback)
38
38
  blk = callback || safe_block
39
- if regexp? # regular expression
40
- index = TYPES.index(type)
41
- blk ||= ->(text){text.strip}
39
+ if TYPES_REGEXP.include?(type) # regular expression
40
+ blk ||= ->(text){ text.strip }
42
41
  # content is String
43
42
  if type == 'match_many'
44
43
  match = page.body.gsub(value).to_a
45
44
  else
45
+ index = TYPES_REGEXP.index(type)
46
46
  matches = page.body.match(value)
47
47
  match = matches.nil? ? nil : matches[index]
48
48
  end
49
- else # css expression
50
- blk ||= ->(node){node.content.strip}
49
+ elsif TYPES_CSS.include?(type) # css expression
50
+ blk ||= ->(node){ node.content.strip }
51
51
  # content is Nokogiri::HTML::Document
52
52
  if type == 'css_one'
53
53
  match = page.doc.css(value).first
54
- elsif type == 'css_many' # css_many
54
+ else
55
55
  match = page.doc.css(value)
56
56
  end
57
+ elsif TYPES_JSON.include?(type)
58
+ json = page.send(type)
59
+ match = json_fetch(json, value)
57
60
  end
58
61
  if match.nil?
59
62
  nil
@@ -66,6 +69,23 @@ module Digger
66
69
  nil
67
70
  end
68
71
 
72
+ def json_fetch(json, keys)
73
+ if keys.is_a? String
74
+ # parse json keys like '$.k1.k2[0]'
75
+ parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
76
+ p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
77
+ end
78
+ json_fetch(json, parts)
79
+ elsif keys.is_a? Array
80
+ if keys.length == 0
81
+ json
82
+ else
83
+ pt = keys.shift
84
+ json_fetch(json[pt[:index] || pt[:key]], keys)
85
+ end
86
+ end
87
+ end
88
+
69
89
  class Nokogiri::XML::Node
70
90
  %w{one many}.each do |name|
71
91
  define_method "inner_#{name}" do |css, &block|
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.5"
3
3
  end
data/spec/digger_spec.rb CHANGED
@@ -7,6 +7,9 @@ pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
7
7
 
8
8
  class Item < Digger::Model
9
9
  css_many sites: '.sites>a>span'
10
+ css_one logo: '.logo'
11
+ validate_presence :sites
12
+ validate_includeness :sites, :logo
10
13
  end
11
14
 
12
15
  describe Digger do
@@ -24,6 +27,9 @@ describe Digger do
24
27
  expect(item[:sites].include?('读远')).to be(true)
25
28
  end
26
29
 
30
+ it "validation support" do
31
+ end
32
+
27
33
  it "index multiple threading" do
28
34
 
29
35
  end
data/spec/page_spec.rb ADDED
@@ -0,0 +1,14 @@
1
+ require 'digger'
2
+ require 'json'
3
+
4
+ describe Digger::Page do
5
+ it 'page json' do
6
+ json_str = '{"a":1,"b":[1,2,3]}'
7
+ j1 = Digger::Page.new('', body: json_str)
8
+ j2 = Digger::Page.new('', body: "hello(#{json_str});")
9
+ expect(j1.json['a']).to eq(1)
10
+ expect(j2.jsonp['a']).to eq(1)
11
+ expect(j1.json['b'][0]).to eq(1)
12
+ expect(j2.jsonp['b'][1]).to eq(2)
13
+ end
14
+ end
@@ -0,0 +1,15 @@
1
+ require 'digger'
2
+ require 'json'
3
+
4
+ describe Digger::Pattern do
5
+ it 'json fetch' do
6
+ json = JSON.parse('{"a":1,"b":[1,2,3]}')
7
+ pt = Digger::Pattern.new
8
+ expect(pt.json_fetch(json, '$')['a']).to eq(1)
9
+ expect(pt.json_fetch(json, '$.a')).to eq(1)
10
+ expect(pt.json_fetch(json, '$.b').length).to eq(3)
11
+ expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
12
+ end
13
+
14
+
15
+ end
@@ -0,0 +1,10 @@
1
+ require 'digger'
2
+
3
+ class Item < Digger::Model
4
+ # css_many sites: '.sites>a>span'
5
+ css_one logo: '.logo'
6
+ css_one title: '.title'
7
+
8
+ validate_presence :sites
9
+ validate_includeness :sites, :logo, :title
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-02 00:00:00.000000000 Z
11
+ date: 2021-12-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.7'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.7'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -87,11 +87,14 @@ files:
87
87
  - lib/digger/pattern.rb
88
88
  - lib/digger/version.rb
89
89
  - spec/digger_spec.rb
90
+ - spec/page_spec.rb
91
+ - spec/pattern_spec.rb
92
+ - spec/validate_spec.rb
90
93
  homepage: ''
91
94
  licenses:
92
95
  - MIT
93
96
  metadata: {}
94
- post_install_message:
97
+ post_install_message:
95
98
  rdoc_options: []
96
99
  require_paths:
97
100
  - lib
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
106
109
  - !ruby/object:Gem::Version
107
110
  version: '0'
108
111
  requirements: []
109
- rubyforge_project:
110
- rubygems_version: 2.2.2
111
- signing_key:
112
+ rubygems_version: 3.2.32
113
+ signing_key:
112
114
  specification_version: 4
113
115
  summary: Dig need stractual infomation from web page.
114
116
  test_files:
115
117
  - spec/digger_spec.rb
118
+ - spec/page_spec.rb
119
+ - spec/pattern_spec.rb
120
+ - spec/validate_spec.rb