digger 0.1.2 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: ed6cbdd1efd570809d15e3fec518c6f4b2a7e2a0
4
- data.tar.gz: 5c59c6ab0243aca0c6f130a3073fc0433dfe8211
2
+ SHA256:
3
+ metadata.gz: f3e89f179fa868ecd2879180d1fbfbf03ba0ebee3731b9c8b4741d22663ff4aa
4
+ data.tar.gz: 1b27e4a1446e9835203bf5497aeebc3bc4ab58998a0fc443eeaaf7e7ec86c2c7
5
5
  SHA512:
6
- metadata.gz: d7c57067e3a91f488791968f1752ebf6f89aaab10200508d89d02cf7f19a4d6c6de8ebf29f826db6ae70cd345a604f65dd4c9264b1f1575ff2e81c64b572712b
7
- data.tar.gz: 870cf5d2a98d005a7140fe400c79afdef9c37eb29778fe9efb406061e47f85624645d180449001c3e4827b98080b41bc978d6cd2039f55e81a8565e8c0f36153
6
+ metadata.gz: 5671e5d2484ca744e5c75f97beeb473e1970291fc094c0274714af27cc847ef47bcf3b1b81534e6f299f35648fdf3aabec9d90777e4b2fbe49dc2629c048f610
7
+ data.tar.gz: 496534bb394d17792dc7173759b83c0dd62b8569a241b2a6a57e60a14ea99ddafd7e030fe16480c5782ec319f6dc1d3563919f7202026fb008522dd26cae6c01
data/digger.gemspec CHANGED
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_development_dependency "bundler", "~> 1.7"
22
- spec.add_development_dependency "rake", "~> 10.0"
21
+ spec.add_development_dependency "bundler", "~> 2.0"
22
+ spec.add_development_dependency "rake", ">= 12.3.3"
23
23
 
24
24
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
25
25
  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
data/lib/digger/http.rb CHANGED
@@ -49,7 +49,7 @@ module Digger
49
49
  url = URI(url)
50
50
  pages = []
51
51
  get(url, referer) do |response, code, location, redirect_to, response_time|
52
- handle_compression response
52
+ handle_compression response if handle_compression?
53
53
  pages << Page.new(location, body: response.body,
54
54
  code: code,
55
55
  headers: response.to_hash,
@@ -70,6 +70,13 @@ module Digger
70
70
  [Page.new(url, error: e, referer: referer, depth: depth)]
71
71
  end
72
72
 
73
+ #
74
+ # Accept response compression, may bring encoding error if true
75
+ #
76
+ def handle_compression?
77
+ @opts[:handle_compression]
78
+ end
79
+
73
80
  #
74
81
  # The maximum number of redirects to follow
75
82
  #
@@ -185,7 +192,7 @@ module Digger
185
192
  opts['User-Agent'] = user_agent if user_agent
186
193
  opts['Referer'] = referer.to_s if referer
187
194
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
188
- opts['Accept-Encoding'] = 'gzip,deflate'
195
+ opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
189
196
 
190
197
  retries = 0
191
198
  begin
data/lib/digger/model.rb CHANGED
@@ -17,6 +17,15 @@ module Digger
17
17
  }
18
18
  end
19
19
 
20
+ def validate_presence(*keys)
21
+ keys_all = pattern_config.keys
22
+ raise "Pattern keys #{(keys - keys_all).join(', ')} should be present" unless keys.all?{|k| keys_all.include?(k) }
23
+ end
24
+
25
+ def validate_includeness(*keys)
26
+ raise "Pattern keys #{(pattern_config.keys - keys).join(', ')} should not be included" unless pattern_config.keys.all?{|k| keys.include?(k)}
27
+ end
28
+
20
29
  # index page
21
30
  def index_config
22
31
  @@digger_config['index'][self.name]
data/lib/digger/page.rb CHANGED
@@ -3,6 +3,7 @@ require 'json'
3
3
  require 'ostruct'
4
4
  require 'set'
5
5
  require 'kconv'
6
+ require 'uri'
6
7
 
7
8
  # https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
8
9
  module Digger
@@ -95,7 +96,13 @@ module Digger
95
96
  end
96
97
  end
97
98
 
99
+ def json
100
+ @json ||= JSON.parse body
101
+ end
98
102
 
103
+ def jsonp
104
+ @jsonp ||= JSON.parse body.match(/^[^\(]+?\((.+)\)[^\)]*$/)[1]
105
+ end
99
106
 
100
107
  #
101
108
  # Discard links, a next call of page.links will return an empty array
@@ -180,16 +187,7 @@ module Digger
180
187
  def to_absolute(link)
181
188
  return nil if link.nil?
182
189
 
183
- # link = link.to_s.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
184
-
185
- # remove anchor
186
- link =
187
- begin
188
- URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
189
- rescue URI::Error
190
- return nil
191
- end
192
-
190
+ link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
193
191
  relative = begin
194
192
  URI(link)
195
193
  rescue URI::Error
@@ -27,33 +27,36 @@ module Digger
27
27
  end
28
28
 
29
29
  MATCH_MAX = 3
30
-
31
- TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
32
30
 
33
- def regexp?
34
- TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
35
- end
31
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
32
+ TYPES_CSS = %w{css_one css_many}
33
+ TYPES_JSON = %w{json jsonp}
34
+
35
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
36
36
 
37
37
  def match_page(page, &callback)
38
38
  blk = callback || safe_block
39
- if regexp? # regular expression
40
- index = TYPES.index(type)
41
- blk ||= ->(text){text.strip}
39
+ if TYPES_REGEXP.include?(type) # regular expression
40
+ blk ||= ->(text){ text.strip }
42
41
  # content is String
43
42
  if type == 'match_many'
44
43
  match = page.body.gsub(value).to_a
45
44
  else
45
+ index = TYPES_REGEXP.index(type)
46
46
  matches = page.body.match(value)
47
47
  match = matches.nil? ? nil : matches[index]
48
48
  end
49
- else # css expression
50
- blk ||= ->(node){node.content.strip}
49
+ elsif TYPES_CSS.include?(type) # css expression
50
+ blk ||= ->(node){ node.content.strip }
51
51
  # content is Nokogiri::HTML::Document
52
52
  if type == 'css_one'
53
53
  match = page.doc.css(value).first
54
- elsif type == 'css_many' # css_many
54
+ else
55
55
  match = page.doc.css(value)
56
56
  end
57
+ elsif TYPES_JSON.include?(type)
58
+ json = page.send(type)
59
+ match = json_fetch(json, value)
57
60
  end
58
61
  if match.nil?
59
62
  nil
@@ -66,6 +69,23 @@ module Digger
66
69
  nil
67
70
  end
68
71
 
72
+ def json_fetch(json, keys)
73
+ if keys.is_a? String
74
+ # parse json keys like '$.k1.k2[0]'
75
+ parts = keys.match(/^\$[\S]*$/)[0].scan(/(\.([\w]+)|\[([\d]+)\])/).map do |p|
76
+ p[1].nil? ? { index: p[2].to_i } : { key: p[1] }
77
+ end
78
+ json_fetch(json, parts)
79
+ elsif keys.is_a? Array
80
+ if keys.length == 0
81
+ json
82
+ else
83
+ pt = keys.shift
84
+ json_fetch(json[pt[:index] || pt[:key]], keys)
85
+ end
86
+ end
87
+ end
88
+
69
89
  class Nokogiri::XML::Node
70
90
  %w{one many}.each do |name|
71
91
  define_method "inner_#{name}" do |css, &block|
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.6"
3
3
  end
data/spec/digger_spec.rb CHANGED
@@ -1,12 +1,14 @@
1
1
  require 'digger'
2
2
 
3
3
  http = Digger::HTTP.new
4
- page = http.fetch_page('http://nan.so/')
4
+ page = http.fetch_page('http://www.baidu.com/')
5
5
 
6
- pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
6
+ pattern = Digger::Pattern.new({ type: 'css_many', value: '#s-top-left>a' })
7
7
 
8
8
  class Item < Digger::Model
9
- css_many sites: '.sites>a>span'
9
+ css_many sites: '#s-top-left>a'
10
+ validate_presence :sites
11
+ validate_includeness :sites
10
12
  end
11
13
 
12
14
  describe Digger do
@@ -16,12 +18,15 @@ describe Digger do
16
18
 
17
19
  it "pattern should match content" do
18
20
  sites = pattern.match_page(page)
19
- expect(sites.include?('百度网盘')).to eq(true)
21
+ expect(sites.include?('新闻')).to eq(true)
20
22
  end
21
23
 
22
24
  it "model should dig content" do
23
25
  item = Item.new.match_page(page)
24
- expect(item[:sites].include?('读远')).to be(true)
26
+ expect(item[:sites].include?('新闻')).to be(true)
27
+ end
28
+
29
+ it "validation support" do
25
30
  end
26
31
 
27
32
  it "index multiple threading" do
data/spec/page_spec.rb ADDED
@@ -0,0 +1,27 @@
1
+ require 'digger'
2
+ require 'json'
3
+ require 'uri'
4
+
5
+ describe Digger::Page do
6
+ it 'page json' do
7
+ json_str = '{"a":1,"b":[1,2,3]}'
8
+ j1 = Digger::Page.new('', body: json_str)
9
+ j2 = Digger::Page.new('', body: "hello(#{json_str});")
10
+ expect(j1.json['a']).to eq(1)
11
+ expect(j2.jsonp['a']).to eq(1)
12
+ expect(j1.json['b'][0]).to eq(1)
13
+ expect(j2.jsonp['b'][1]).to eq(2)
14
+ end
15
+
16
+ it 'fetch baidu' do
17
+ http = Digger::HTTP.new
18
+ page = http.fetch_page('http://www.baidu.com/')
19
+ expect(page.code).to eq(200)
20
+ end
21
+
22
+ it 'page uri' do
23
+ link ='https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
24
+ link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
25
+ p link
26
+ end
27
+ end
@@ -0,0 +1,15 @@
1
+ require 'digger'
2
+ require 'json'
3
+
4
+ describe Digger::Pattern do
5
+ it 'json fetch' do
6
+ json = JSON.parse('{"a":1,"b":[1,2,3]}')
7
+ pt = Digger::Pattern.new
8
+ expect(pt.json_fetch(json, '$')['a']).to eq(1)
9
+ expect(pt.json_fetch(json, '$.a')).to eq(1)
10
+ expect(pt.json_fetch(json, '$.b').length).to eq(3)
11
+ expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
12
+ end
13
+
14
+
15
+ end
@@ -0,0 +1,10 @@
1
+ require 'digger'
2
+
3
+ class Item < Digger::Model
4
+ # css_many sites: '.sites>a>span'
5
+ css_one logo: '.logo'
6
+ css_one title: '.title'
7
+
8
+ validate_presence :sites
9
+ validate_includeness :sites, :logo, :title
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-17 00:00:00.000000000 Z
11
+ date: 2021-12-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.7'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.7'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -87,11 +87,14 @@ files:
87
87
  - lib/digger/pattern.rb
88
88
  - lib/digger/version.rb
89
89
  - spec/digger_spec.rb
90
+ - spec/page_spec.rb
91
+ - spec/pattern_spec.rb
92
+ - spec/validate_spec.rb
90
93
  homepage: ''
91
94
  licenses:
92
95
  - MIT
93
96
  metadata: {}
94
- post_install_message:
97
+ post_install_message:
95
98
  rdoc_options: []
96
99
  require_paths:
97
100
  - lib
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
106
109
  - !ruby/object:Gem::Version
107
110
  version: '0'
108
111
  requirements: []
109
- rubyforge_project:
110
- rubygems_version: 2.2.2
111
- signing_key:
112
+ rubygems_version: 3.2.32
113
+ signing_key:
112
114
  specification_version: 4
113
115
  summary: Dig need stractual infomation from web page.
114
116
  test_files:
115
117
  - spec/digger_spec.rb
118
+ - spec/page_spec.rb
119
+ - spec/pattern_spec.rb
120
+ - spec/validate_spec.rb