digger 0.1.7 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2f506fd615df8b9d732d6b67bedc72644df26dc3f5725cfe5dba10c1098bae0b
4
- data.tar.gz: 0e9afcb19ba0be5ce4a90787d54c3b43d58100bac3ec45bf5ef93781a60b6eb1
3
+ metadata.gz: b036f2d202aa8360cf3b07822168fb8e3e3e084d4b2f5ccb5eca026dd4c47981
4
+ data.tar.gz: 85d13e567add73e38c25738852dc3b30e3ad1900579dca815d5aef4cb16dbca8
5
5
  SHA512:
6
- metadata.gz: 8608a2ee8e06ddd846772d40dc3e417560229729b7b736eda8a7f50977a7d2c6fc523f86fe64480b5172c5295137eae7c85f945f7ca310502c54c8b90dd75e8d
7
- data.tar.gz: 59fef1a13adc8f983c16428ee4d08d6ccdecea09b7583452b6ca07689727c3d6a386f5a5b767a20d117c8c4bc9775a2a6b32fd4caff1c1a5e85ee2af82e39d1b
6
+ metadata.gz: decb9abb96f56dc7f75ba95151b02c7415018a789583da1324ab4786317d7e0bfc272a83b47399b51672bdb30b3d90c502f25358d7438b9ee26a8d39764a58dd
7
+ data.tar.gz: 73f7dfd179175bcda4a24a120cad95a61093f17166e389e44b37a158ff8ee1ea927844d9b85773914f14d99473776f11752c7aa32fadfec1b8b703e584591c3f
data/digger.gemspec CHANGED
@@ -1,26 +1,27 @@
1
1
  # coding: utf-8
2
+
2
3
  lib = File.expand_path('../lib', __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
  require 'digger/version'
5
6
 
6
7
  Gem::Specification.new do |spec|
7
- spec.name = "digger"
8
+ spec.name = 'digger'
8
9
  spec.version = Digger::VERSION
9
- spec.authors = ["binz"]
10
- spec.email = ["xinkiang@gmail.com"]
10
+ spec.authors = ['binz']
11
+ spec.email = ['xinkiang@gmail.com']
11
12
  spec.summary = %q{Dig need stractual infomation from web page.}
12
13
  spec.description = %q{}
13
- spec.homepage = ""
14
- spec.license = "MIT"
14
+ spec.homepage = ''
15
+ spec.license = 'MIT'
15
16
 
16
17
  spec.files = `git ls-files -z`.split("\x0")
17
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
20
21
 
21
- spec.add_development_dependency "bundler", "~> 2.0"
22
- spec.add_development_dependency "rake", ">= 12.3.3"
22
+ spec.add_development_dependency 'rake', '>= 12.3.3'
23
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
24
 
24
- spec.add_runtime_dependency 'nokogiri', '~> 1.6'
25
25
  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
26
+ spec.add_runtime_dependency 'nokogiri', '~> 1.6'
26
27
  end
data/lib/digger/index.rb CHANGED
@@ -17,6 +17,35 @@ module Digger
17
17
  pattern.gsub('*').each_with_index { |_, i| arg[i] }
18
18
  end
19
19
 
20
+ def self.slow_down(entities, conf = {}, &block)
21
+ raise NoBlockError, 'No block given' unless block
22
+
23
+ config = {
24
+ sleep_range_seconds: 4...10, # 随机等待时间范围
25
+ fail_max_cnt: 10, # 最多失败次数
26
+ fail_unit_seconds: 10 * 60, # 失败等待时间
27
+ when_fail: ->(ent, e, failed_cnt) {}
28
+ }.merge(conf)
29
+ failed_cnt = 0
30
+ cursor = 0
31
+ result = []
32
+ while cursor < entities.length
33
+ begin
34
+ result << block.call(entities[cursor])
35
+ rescue StandardError => e
36
+ failed_cnt += 1
37
+ config[:when_fail].call(entities[cursor], e, failed_cnt)
38
+ break if failed_cnt >= config[:fail_max_cnt]
39
+
40
+ sleep(failed_cnt * config[:fail_unit_seconds])
41
+ else
42
+ cursor += 1
43
+ sleep(rand(config[:sleep_range_seconds]))
44
+ end
45
+ end
46
+ result
47
+ end
48
+
20
49
  def self.batch(entities, cocurrence = 1, &block)
21
50
  raise NoBlockError, 'No block given' unless block
22
51
 
data/lib/digger/page.rb CHANGED
@@ -4,6 +4,7 @@ require 'ostruct'
4
4
  require 'set'
5
5
  require 'kconv'
6
6
  require 'uri'
7
+ require 'http/cookie'
7
8
 
8
9
  # https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
9
10
  module Digger
@@ -101,6 +102,10 @@ module Digger
101
102
  @jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
102
103
  end
103
104
 
105
+ def cookies
106
+ @cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
107
+ end
108
+
104
109
  #
105
110
  # Discard links, a next call of page.links will return an empty array
106
111
  #
@@ -273,4 +278,4 @@ module Digger
273
278
  from_hash hash
274
279
  end
275
280
  end
276
- end
281
+ end
@@ -6,12 +6,14 @@ module Digger
6
6
  attr_accessor :type, :value, :block
7
7
 
8
8
  def initialize(hash = {})
9
- hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
9
+ hash.each_pair do |key, value|
10
+ send("#{key}=", value) if %w[type value block].include?(key.to_s)
11
+ end
10
12
  end
11
13
 
12
14
  def safe_block(&default_block)
13
15
  if block.nil? || (block.is_a?(String) && block.strip.empty?)
14
- default_block
16
+ default_block || ->(v) { v }
15
17
  elsif block.respond_to?(:call)
16
18
  block
17
19
  else
@@ -31,30 +33,56 @@ module Digger
31
33
  TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
32
34
  TYPES_CSS = %w[css_one css_many].freeze
33
35
  TYPES_JSON = %w[json jsonp].freeze
36
+ TYPES_OTHER = %w[cookie plain lines header body].freeze
34
37
 
35
- TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
38
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
36
39
 
37
40
  def match_page(page)
38
41
  return unless page.success?
42
+
39
43
  if TYPES_REGEXP.include?(type) # regular expression
40
44
  regexp_match(page.body)
41
45
  elsif TYPES_CSS.include?(type) # css expression
42
46
  css_match(page.doc)
43
47
  elsif TYPES_JSON.include?(type)
44
48
  json_match(page)
49
+ elsif TYPES_OTHER.include?(type)
50
+ send("get_#{type}", page)
45
51
  end
46
52
  end
47
53
 
54
+ def get_header(page)
55
+ header = (page.headers[value.to_s.downcase] || []).first
56
+ safe_block.call(header)
57
+ end
58
+
59
+ def get_body(page)
60
+ safe_block.call(page.body)
61
+ end
62
+
63
+ def get_plain(page)
64
+ safe_block.call(page.doc.text)
65
+ end
66
+
67
+ def get_lines(page)
68
+ block = safe_block
69
+ page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
70
+ end
71
+
72
+ def get_cookie(page)
73
+ cookie = page.cookies.find { |c| c.name == value }&.value
74
+ safe_block.call(cookie)
75
+ end
76
+
48
77
  def json_match(page)
49
- block = safe_block { |j| j }
50
78
  json = page.send(type)
51
79
  keys = json_index_keys(value)
52
80
  match = json_fetch(json, keys)
53
- block.call(match)
81
+ safe_block.call(match)
54
82
  end
55
83
 
56
84
  def css_match(doc)
57
- block = safe_block { |node| node.content.strip }
85
+ block = safe_block { |node| node&.content&.strip }
58
86
  # content is Nokogiri::HTML::Document
59
87
  contents = doc.css(value)
60
88
  if type == 'css_many'
@@ -68,7 +96,8 @@ module Digger
68
96
  block = safe_block(&:strip)
69
97
  # content is String
70
98
  if type == 'match_many'
71
- body.gsub(value).to_a.map { |node| block.call(node) }.uniq
99
+ regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
100
+ body.gsub(regexp).to_a.map { |node| block.call(node) }.uniq
72
101
  else
73
102
  index = TYPES_REGEXP.index(type)
74
103
  matches = body.match(value)
@@ -93,21 +122,24 @@ module Digger
93
122
 
94
123
  private :json_index_keys, :json_fetch
95
124
 
96
- # Nokogiri node methods
97
- class Nokogiri::XML::Node
98
- %w[one many].each do |name|
99
- define_method "inner_#{name}" do |css, &block|
100
- callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
101
- if name == 'one' # inner_one
102
- callback.call(self.css(css).first)
103
- else # inner_many
104
- self.css(css).map { |node| callback.call(node) }
105
- end
106
- end
125
+ class ::Nokogiri::XML::Node
126
+ def inner_one(expr, &block)
127
+ fn = block || ->(node) { node&.content&.strip }
128
+ fn.call(css(expr)&.first)
129
+ end
130
+
131
+ def inner_many(expr, &block)
132
+ fn = block || ->(node) { node&.content&.strip }
133
+ css(expr)&.map { |node| fn.call(node) }
107
134
  end
135
+
108
136
  def source
109
137
  to_xml
110
138
  end
139
+
140
+ def inner_number
141
+ content&.match(/\d+/).to_s.to_i
142
+ end
111
143
  end
112
144
  end
113
145
  end
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = '0.1.7'.freeze
2
+ VERSION = '0.2.1'.freeze
3
3
  end
data/spec/index_spec.rb CHANGED
@@ -9,4 +9,20 @@ describe Digger::Index do
9
9
  end
10
10
  expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
11
11
  end
12
+
13
+ it 'slow down' do
14
+ list = [1, 2, 3, 4]
15
+ conf = {
16
+ sleep_range_seconds: 1...2,
17
+ fail_unit_seconds: 1,
18
+ fail_max_cnt: 2,
19
+ when_fail: ->(_, e, nth) { puts "#{nth}: #{e.message}" }
20
+ }
21
+ pt = Digger::Index.slow_down(list, conf) do |num|
22
+ raise 'error' if num == 3
23
+ num
24
+ end
25
+ p pt
26
+ expect(pt.size).to eq(2)
27
+ end
12
28
  end
data/spec/page_spec.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'digger'
2
2
  require 'json'
3
3
  require 'uri'
4
+ require 'cgi'
4
5
 
5
6
  describe Digger::Page do
6
7
  it 'page json' do
@@ -15,13 +16,12 @@ describe Digger::Page do
15
16
 
16
17
  it 'fetch baidu' do
17
18
  http = Digger::HTTP.new
18
- page = http.fetch_page('http://www.baidu.com/')
19
+ page = http.fetch_page('http://baidu.com/')
19
20
  expect(page.code).to eq(200)
20
21
  end
21
22
 
22
- it 'page uri' do
23
- link ='https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
24
- link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
25
- p link
26
- end
27
- end
23
+ # it 'page uri' do
24
+ # link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
25
+ # link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
26
+ # end
27
+ end
data/spec/pattern_spec.rb CHANGED
@@ -2,14 +2,26 @@ require 'digger'
2
2
  require 'json'
3
3
 
4
4
  describe Digger::Pattern do
5
- it 'json fetch' do
6
- json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
7
- pt = Digger::Pattern.new
8
- expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
9
- expect(pt.json_fetch(json, '$[0].a')).to eq(1)
10
- expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
11
- expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
- end
13
-
5
+ # it 'json fetch' do
6
+ # json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
7
+ # pt = Digger::Pattern.new
8
+ # expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
9
+ # expect(pt.json_fetch(json, '$[0].a')).to eq(1)
10
+ # expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
11
+ # expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
+ # end
14
13
 
15
- end
14
+ it 'parse cookie & others' do
15
+ page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
16
+ p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
17
+ # cookie
18
+ result = p1.match_page(page)
19
+ expect(result.length).to eq(42)
20
+ # header
21
+ p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
22
+ expect(p2.match_page(page)).to eq('chunked')
23
+ # get_plain
24
+ p3 = Digger::Pattern.new({ type: 'plain' })
25
+ expect(p3.match_page(page).length).to be > 100
26
+ end
27
+ end
metadata CHANGED
@@ -1,71 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-26 00:00:00.000000000 Z
11
+ date: 2022-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.0'
19
+ version: 12.3.3
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.0'
26
+ version: 12.3.3
27
27
  - !ruby/object:Gem::Dependency
28
- name: rake
28
+ name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ">="
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 12.3.3
33
+ version: '2.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 12.3.3
40
+ version: '2.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: nokogiri
42
+ name: http-cookie
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.6'
47
+ version: '1.0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.6'
54
+ version: '1.0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: http-cookie
56
+ name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '1.0'
61
+ version: '1.6'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '1.0'
68
+ version: '1.6'
69
69
  description: ''
70
70
  email:
71
71
  - xinkiang@gmail.com