digger 0.1.2 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: ed6cbdd1efd570809d15e3fec518c6f4b2a7e2a0
4
- data.tar.gz: 5c59c6ab0243aca0c6f130a3073fc0433dfe8211
2
+ SHA256:
3
+ metadata.gz: f3e89f179fa868ecd2879180d1fbfbf03ba0ebee3731b9c8b4741d22663ff4aa
4
+ data.tar.gz: 1b27e4a1446e9835203bf5497aeebc3bc4ab58998a0fc443eeaaf7e7ec86c2c7
5
5
  SHA512:
6
- metadata.gz: d7c57067e3a91f488791968f1752ebf6f89aaab10200508d89d02cf7f19a4d6c6de8ebf29f826db6ae70cd345a604f65dd4c9264b1f1575ff2e81c64b572712b
7
- data.tar.gz: 870cf5d2a98d005a7140fe400c79afdef9c37eb29778fe9efb406061e47f85624645d180449001c3e4827b98080b41bc978d6cd2039f55e81a8565e8c0f36153
6
+ metadata.gz: 5671e5d2484ca744e5c75f97beeb473e1970291fc094c0274714af27cc847ef47bcf3b1b81534e6f299f35648fdf3aabec9d90777e4b2fbe49dc2629c048f610
7
+ data.tar.gz: 496534bb394d17792dc7173759b83c0dd62b8569a241b2a6a57e60a14ea99ddafd7e030fe16480c5782ec319f6dc1d3563919f7202026fb008522dd26cae6c01
data/digger.gemspec CHANGED
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_development_dependency "bundler", "~> 1.7"
22
- spec.add_development_dependency "rake", "~> 10.0"
21
+ spec.add_development_dependency "bundler", "~> 2.0"
22
+ spec.add_development_dependency "rake", ">= 12.3.3"
23
23
 
24
24
  spec.add_runtime_dependency 'nokogiri', '~> 1.6'
25
25
  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
data/lib/digger/http.rb CHANGED
@@ -49,7 +49,7 @@ module Digger
49
49
  url = URI(url)
50
50
  pages = []
51
51
  get(url, referer) do |response, code, location, redirect_to, response_time|
52
- handle_compression response
52
+ handle_compression response if handle_compression?
53
53
  pages << Page.new(location, body: response.body,
54
54
  code: code,
55
55
  headers: response.to_hash,
@@ -70,6 +70,13 @@ module Digger
70
70
  [Page.new(url, error: e, referer: referer, depth: depth)]
71
71
  end
72
72
 
73
+ #
74
+ # Whether compressed responses are requested and decoded; enabling this may cause encoding errors
75
+ #
76
+ def handle_compression?
77
+ @opts[:handle_compression]
78
+ end
79
+
73
80
  #
74
81
  # The maximum number of redirects to follow
75
82
  #
@@ -185,7 +192,7 @@ module Digger
185
192
  opts['User-Agent'] = user_agent if user_agent
186
193
  opts['Referer'] = referer.to_s if referer
187
194
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
188
- opts['Accept-Encoding'] = 'gzip,deflate'
195
+ opts['Accept-Encoding'] = 'gzip,deflate' if handle_compression?
189
196
 
190
197
  retries = 0
191
198
  begin
data/lib/digger/model.rb CHANGED
@@ -17,6 +17,15 @@ module Digger
17
17
  }
18
18
  end
19
19
 
20
# Ensures that every key in +keys+ is declared in pattern_config.
#
# NOTE(review): pattern_config is defined outside this hunk; it is assumed to
# respond to #keys with an Array -- confirm against the model class.
#
# @param keys [Array] pattern keys that must be declared
# @return [nil] when every key is present
# @raise [RuntimeError] listing the keys that are not declared
def validate_presence(*keys)
  # Compute the offending keys once and reuse them for both the check and
  # the error message.
  missing = keys - pattern_config.keys
  raise "Pattern keys #{missing.join(', ')} should be present" unless missing.empty?
end

# Ensures that pattern_config declares no key outside of +keys+.
#
# @param keys [Array] the complete list of pattern keys that are allowed
# @return [nil] when no other key is declared
# @raise [RuntimeError] listing the declared keys that are not allowed
def validate_includeness(*keys)
  unexpected = pattern_config.keys - keys
  raise "Pattern keys #{unexpected.join(', ')} should not be included" unless unexpected.empty?
end
28
+
20
29
  # index page
21
30
  def index_config
22
31
  @@digger_config['index'][self.name]
data/lib/digger/page.rb CHANGED
@@ -3,6 +3,7 @@ require 'json'
3
3
  require 'ostruct'
4
4
  require 'set'
5
5
  require 'kconv'
6
+ require 'uri'
6
7
 
7
8
  # https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
8
9
  module Digger
@@ -95,7 +96,13 @@ module Digger
95
96
  end
96
97
  end
97
98
 
99
# Parses the page body as a JSON document.
#
# The parsed value is memoized in @json.
#
# @return [Object] the value JSON.parse produces for the body
def json
  @json ||= JSON.parse body
end

# Parses the page body as JSONP, i.e. a JSON document wrapped in a single
# callback invocation such as `callback({...});`.
#
# The parsed value is memoized in @jsonp.
#
# @return [Object] the parsed payload found between the outermost parentheses
# @raise [JSON::ParserError] if the body is not a callback-wrapped payload or
#   the payload itself is not valid JSON
def jsonp
  @jsonp ||= begin
    # \A/\z together with the m flag treat the body as one string, so a
    # payload that spans several lines is captured in full.
    wrapped = body.to_s.match(/\A[^(]+?\((.+)\)[^)]*\z/m)
    raise JSON::ParserError, 'body is not a JSONP payload' if wrapped.nil?

    JSON.parse(wrapped[1])
  end
end
99
106
 
100
107
  #
101
108
  # Discard links, a next call of page.links will return an empty array
@@ -180,16 +187,7 @@ module Digger
180
187
  def to_absolute(link)
181
188
  return nil if link.nil?
182
189
 
183
- # link = link.to_s.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
184
-
185
- # remove anchor
186
- link =
187
- begin
188
- URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
189
- rescue URI::Error
190
- return nil
191
- end
192
-
190
+ link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
193
191
  relative = begin
194
192
  URI(link)
195
193
  rescue URI::Error
@@ -27,33 +27,36 @@ module Digger
27
27
  end
28
28
 
29
29
  MATCH_MAX = 3
30
-
31
- TYPES = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many css_one css_many}
32
30
 
33
- def regexp?
34
- TYPES.index(type) <= MATCH_MAX + 1 # match_many in addition
35
- end
31
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map{|i| "match_#{i}"} + %w{match_many}
32
+ TYPES_CSS = %w{css_one css_many}
33
+ TYPES_JSON = %w{json jsonp}
34
+
35
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
36
36
 
37
37
  def match_page(page, &callback)
38
38
  blk = callback || safe_block
39
- if regexp? # regular expression
40
- index = TYPES.index(type)
41
- blk ||= ->(text){text.strip}
39
+ if TYPES_REGEXP.include?(type) # regular expression
40
+ blk ||= ->(text){ text.strip }
42
41
  # content is String
43
42
  if type == 'match_many'
44
43
  match = page.body.gsub(value).to_a
45
44
  else
45
+ index = TYPES_REGEXP.index(type)
46
46
  matches = page.body.match(value)
47
47
  match = matches.nil? ? nil : matches[index]
48
48
  end
49
- else # css expression
50
- blk ||= ->(node){node.content.strip}
49
+ elsif TYPES_CSS.include?(type) # css expression
50
+ blk ||= ->(node){ node.content.strip }
51
51
  # content is Nokogiri::HTML::Document
52
52
  if type == 'css_one'
53
53
  match = page.doc.css(value).first
54
- elsif type == 'css_many' # css_many
54
+ else
55
55
  match = page.doc.css(value)
56
56
  end
57
+ elsif TYPES_JSON.include?(type)
58
+ json = page.send(type)
59
+ match = json_fetch(json, value)
57
60
  end
58
61
  if match.nil?
59
62
  nil
@@ -66,6 +69,23 @@ module Digger
66
69
  nil
67
70
  end
68
71
 
72
# Looks up a value inside parsed JSON data.
#
# +keys+ is either a path string such as '$.k1.k2[0]' ('$' alone selects the
# whole document) or an Array of already-parsed steps, each a Hash of the
# form { key: 'name' } or { index: 0 }.
#
# @param json [Object] parsed JSON data (Hash, Array or scalar)
# @param keys [String, Array<Hash>] path string or list of parsed steps
# @return [Object, nil] the value at the path; nil when an intermediate step
#   is missing, or when +keys+ is neither a String nor an Array
# @raise [ArgumentError] if a path string does not start with '$' or
#   contains whitespace
def json_fetch(json, keys)
  case keys
  when String
    path = keys[/^\$[\S]*$/]
    raise ArgumentError, "invalid JSON path: #{keys.inspect}" if path.nil?

    # Each scan result is [key, index]; exactly one of the two is non-nil.
    steps = path.scan(/\.(\w+)|\[(\d+)\]/).map do |key, index|
      key.nil? ? { index: index.to_i } : { key: key }
    end
    json_fetch(json, steps)
  when Array
    # Walk the steps without mutating the caller's array, and stop early
    # once a step resolves to nil instead of indexing into nil.
    keys.reduce(json) do |node, step|
      break nil if node.nil?

      node[step[:index] || step[:key]]
    end
  end
end
88
+
69
89
  class Nokogiri::XML::Node
70
90
  %w{one many}.each do |name|
71
91
  define_method "inner_#{name}" do |css, &block|
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.6"
3
3
  end
data/spec/digger_spec.rb CHANGED
@@ -1,12 +1,14 @@
1
1
  require 'digger'
2
2
 
3
3
  http = Digger::HTTP.new
4
- page = http.fetch_page('http://nan.so/')
4
+ page = http.fetch_page('http://www.baidu.com/')
5
5
 
6
- pattern = Digger::Pattern.new({type: 'css_many', value: '.sites>a>span' })
6
+ pattern = Digger::Pattern.new({ type: 'css_many', value: '#s-top-left>a' })
7
7
 
8
8
  class Item < Digger::Model
9
- css_many sites: '.sites>a>span'
9
+ css_many sites: '#s-top-left>a'
10
+ validate_presence :sites
11
+ validate_includeness :sites
10
12
  end
11
13
 
12
14
  describe Digger do
@@ -16,12 +18,15 @@ describe Digger do
16
18
 
17
19
  it "pattern should match content" do
18
20
  sites = pattern.match_page(page)
19
- expect(sites.include?('百度网盘')).to eq(true)
21
+ expect(sites.include?('新闻')).to eq(true)
20
22
  end
21
23
 
22
24
  it "model should dig content" do
23
25
  item = Item.new.match_page(page)
24
- expect(item[:sites].include?('读远')).to be(true)
26
+ expect(item[:sites].include?('新闻')).to be(true)
27
+ end
28
+
29
+ it "validation support" do
25
30
  end
26
31
 
27
32
  it "index multiple threading" do
data/spec/page_spec.rb ADDED
@@ -0,0 +1,27 @@
1
+ require 'digger'
2
+ require 'json'
3
+ require 'uri'
4
+
5
# Specs for Digger::Page: JSON/JSONP body parsing, a live fetch, and the
# link normalisation expression used when resolving links.
describe Digger::Page do
  it 'page json' do
    json_str = '{"a":1,"b":[1,2,3]}'
    j1 = Digger::Page.new('', body: json_str)
    j2 = Digger::Page.new('', body: "hello(#{json_str});")
    expect(j1.json['a']).to eq(1)
    expect(j2.jsonp['a']).to eq(1)
    expect(j1.json['b'][0]).to eq(1)
    expect(j2.jsonp['b'][1]).to eq(2)
  end

  # NOTE(review): this example performs a real HTTP request and therefore
  # depends on network access and on the remote site being reachable.
  it 'fetch baidu' do
    http = Digger::HTTP.new
    page = http.fetch_page('http://www.baidu.com/')
    expect(page.code).to eq(200)
  end

  it 'page uri' do
    link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
    link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
    # Assert the result instead of printing it: the trailing anchor is
    # stripped and the percent-encoded query is left untouched.
    expect(link).to eq('https://www.baidu.com/s?wd=%E5%93%88%E5%93%88')
  end
end
+ end
@@ -0,0 +1,15 @@
1
+ require 'digger'
2
+ require 'json'
3
+
4
+ describe Digger::Pattern do
5
+ it 'json fetch' do
6
+ json = JSON.parse('{"a":1,"b":[1,2,3]}')
7
+ pt = Digger::Pattern.new
8
+ expect(pt.json_fetch(json, '$')['a']).to eq(1)
9
+ expect(pt.json_fetch(json, '$.a')).to eq(1)
10
+ expect(pt.json_fetch(json, '$.b').length).to eq(3)
11
+ expect(pt.json_fetch(json, '$.b[2]')).to eq(3)
12
+ end
13
+
14
+
15
+ end
@@ -0,0 +1,10 @@
1
+ require 'digger'
2
+
3
+ class Item < Digger::Model
4
+ # css_many sites: '.sites>a>span'
5
+ css_one logo: '.logo'
6
+ css_one title: '.title'
7
+
8
+ validate_presence :sites
9
+ validate_includeness :sites, :logo, :title
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-17 00:00:00.000000000 Z
11
+ date: 2021-12-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.7'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.7'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -87,11 +87,14 @@ files:
87
87
  - lib/digger/pattern.rb
88
88
  - lib/digger/version.rb
89
89
  - spec/digger_spec.rb
90
+ - spec/page_spec.rb
91
+ - spec/pattern_spec.rb
92
+ - spec/validate_spec.rb
90
93
  homepage: ''
91
94
  licenses:
92
95
  - MIT
93
96
  metadata: {}
94
- post_install_message:
97
+ post_install_message:
95
98
  rdoc_options: []
96
99
  require_paths:
97
100
  - lib
@@ -106,10 +109,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
106
109
  - !ruby/object:Gem::Version
107
110
  version: '0'
108
111
  requirements: []
109
- rubyforge_project:
110
- rubygems_version: 2.2.2
111
- signing_key:
112
+ rubygems_version: 3.2.32
113
+ signing_key:
112
114
  specification_version: 4
113
115
  summary: Dig need stractual infomation from web page.
114
116
  test_files:
115
117
  - spec/digger_spec.rb
118
+ - spec/page_spec.rb
119
+ - spec/pattern_spec.rb
120
+ - spec/validate_spec.rb