digger 0.1.7 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2f506fd615df8b9d732d6b67bedc72644df26dc3f5725cfe5dba10c1098bae0b
4
- data.tar.gz: 0e9afcb19ba0be5ce4a90787d54c3b43d58100bac3ec45bf5ef93781a60b6eb1
3
+ metadata.gz: b036f2d202aa8360cf3b07822168fb8e3e3e084d4b2f5ccb5eca026dd4c47981
4
+ data.tar.gz: 85d13e567add73e38c25738852dc3b30e3ad1900579dca815d5aef4cb16dbca8
5
5
  SHA512:
6
- metadata.gz: 8608a2ee8e06ddd846772d40dc3e417560229729b7b736eda8a7f50977a7d2c6fc523f86fe64480b5172c5295137eae7c85f945f7ca310502c54c8b90dd75e8d
7
- data.tar.gz: 59fef1a13adc8f983c16428ee4d08d6ccdecea09b7583452b6ca07689727c3d6a386f5a5b767a20d117c8c4bc9775a2a6b32fd4caff1c1a5e85ee2af82e39d1b
6
+ metadata.gz: decb9abb96f56dc7f75ba95151b02c7415018a789583da1324ab4786317d7e0bfc272a83b47399b51672bdb30b3d90c502f25358d7438b9ee26a8d39764a58dd
7
+ data.tar.gz: 73f7dfd179175bcda4a24a120cad95a61093f17166e389e44b37a158ff8ee1ea927844d9b85773914f14d99473776f11752c7aa32fadfec1b8b703e584591c3f
data/digger.gemspec CHANGED
@@ -1,26 +1,27 @@
1
1
  # coding: utf-8
2
+
2
3
  lib = File.expand_path('../lib', __FILE__)
3
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
5
  require 'digger/version'
5
6
 
6
7
  Gem::Specification.new do |spec|
7
- spec.name = "digger"
8
+ spec.name = 'digger'
8
9
  spec.version = Digger::VERSION
9
- spec.authors = ["binz"]
10
- spec.email = ["xinkiang@gmail.com"]
10
+ spec.authors = ['binz']
11
+ spec.email = ['xinkiang@gmail.com']
11
12
  spec.summary = %q{Dig need stractual infomation from web page.}
12
13
  spec.description = %q{}
13
- spec.homepage = ""
14
- spec.license = "MIT"
14
+ spec.homepage = ''
15
+ spec.license = 'MIT'
15
16
 
16
17
  spec.files = `git ls-files -z`.split("\x0")
17
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
20
21
 
21
- spec.add_development_dependency "bundler", "~> 2.0"
22
- spec.add_development_dependency "rake", ">= 12.3.3"
22
+ spec.add_development_dependency 'rake', '>= 12.3.3'
23
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
24
 
24
- spec.add_runtime_dependency 'nokogiri', '~> 1.6'
25
25
  spec.add_runtime_dependency 'http-cookie', '~> 1.0'
26
+ spec.add_runtime_dependency 'nokogiri', '~> 1.6'
26
27
  end
data/lib/digger/index.rb CHANGED
@@ -17,6 +17,35 @@ module Digger
17
17
  pattern.gsub('*').each_with_index { |_, i| arg[i] }
18
18
  end
19
19
 
20
+ def self.slow_down(entities, conf = {}, &block)
21
+ raise NoBlockError, 'No block given' unless block
22
+
23
+ config = {
24
+ sleep_range_seconds: 4...10, # 随机等待时间范围
25
+ fail_max_cnt: 10, # 最多失败次数
26
+ fail_unit_seconds: 10 * 60, # 失败等待时间
27
+ when_fail: ->(ent, e, failed_cnt) {}
28
+ }.merge(conf)
29
+ failed_cnt = 0
30
+ cursor = 0
31
+ result = []
32
+ while cursor < entities.length
33
+ begin
34
+ result << block.call(entities[cursor])
35
+ rescue StandardError => e
36
+ failed_cnt += 1
37
+ config[:when_fail].call(entities[cursor], e, failed_cnt)
38
+ break if failed_cnt >= config[:fail_max_cnt]
39
+
40
+ sleep(failed_cnt * config[:fail_unit_seconds])
41
+ else
42
+ cursor += 1
43
+ sleep(rand(config[:sleep_range_seconds]))
44
+ end
45
+ end
46
+ result
47
+ end
48
+
20
49
  def self.batch(entities, cocurrence = 1, &block)
21
50
  raise NoBlockError, 'No block given' unless block
22
51
 
data/lib/digger/page.rb CHANGED
@@ -4,6 +4,7 @@ require 'ostruct'
4
4
  require 'set'
5
5
  require 'kconv'
6
6
  require 'uri'
7
+ require 'http/cookie'
7
8
 
8
9
  # https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
9
10
  module Digger
@@ -101,6 +102,10 @@ module Digger
101
102
  @jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
102
103
  end
103
104
 
105
+ def cookies
106
+ @cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
107
+ end
108
+
104
109
  #
105
110
  # Discard links, a next call of page.links will return an empty array
106
111
  #
@@ -273,4 +278,4 @@ module Digger
273
278
  from_hash hash
274
279
  end
275
280
  end
276
- end
281
+ end
@@ -6,12 +6,14 @@ module Digger
6
6
  attr_accessor :type, :value, :block
7
7
 
8
8
  def initialize(hash = {})
9
- hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
9
+ hash.each_pair do |key, value|
10
+ send("#{key}=", value) if %w[type value block].include?(key.to_s)
11
+ end
10
12
  end
11
13
 
12
14
  def safe_block(&default_block)
13
15
  if block.nil? || (block.is_a?(String) && block.strip.empty?)
14
- default_block
16
+ default_block || ->(v) { v }
15
17
  elsif block.respond_to?(:call)
16
18
  block
17
19
  else
@@ -31,30 +33,56 @@ module Digger
31
33
  TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
32
34
  TYPES_CSS = %w[css_one css_many].freeze
33
35
  TYPES_JSON = %w[json jsonp].freeze
36
+ TYPES_OTHER = %w[cookie plain lines header body].freeze
34
37
 
35
- TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
38
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
36
39
 
37
40
  def match_page(page)
38
41
  return unless page.success?
42
+
39
43
  if TYPES_REGEXP.include?(type) # regular expression
40
44
  regexp_match(page.body)
41
45
  elsif TYPES_CSS.include?(type) # css expression
42
46
  css_match(page.doc)
43
47
  elsif TYPES_JSON.include?(type)
44
48
  json_match(page)
49
+ elsif TYPES_OTHER.include?(type)
50
+ send("get_#{type}", page)
45
51
  end
46
52
  end
47
53
 
54
+ def get_header(page)
55
+ header = (page.headers[value.to_s.downcase] || []).first
56
+ safe_block.call(header)
57
+ end
58
+
59
+ def get_body(page)
60
+ safe_block.call(page.body)
61
+ end
62
+
63
+ def get_plain(page)
64
+ safe_block.call(page.doc.text)
65
+ end
66
+
67
+ def get_lines(page)
68
+ block = safe_block
69
+ page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
70
+ end
71
+
72
+ def get_cookie(page)
73
+ cookie = page.cookies.find { |c| c.name == value }&.value
74
+ safe_block.call(cookie)
75
+ end
76
+
48
77
  def json_match(page)
49
- block = safe_block { |j| j }
50
78
  json = page.send(type)
51
79
  keys = json_index_keys(value)
52
80
  match = json_fetch(json, keys)
53
- block.call(match)
81
+ safe_block.call(match)
54
82
  end
55
83
 
56
84
  def css_match(doc)
57
- block = safe_block { |node| node.content.strip }
85
+ block = safe_block { |node| node&.content&.strip }
58
86
  # content is Nokogiri::HTML::Document
59
87
  contents = doc.css(value)
60
88
  if type == 'css_many'
@@ -68,7 +96,8 @@ module Digger
68
96
  block = safe_block(&:strip)
69
97
  # content is String
70
98
  if type == 'match_many'
71
- body.gsub(value).to_a.map { |node| block.call(node) }.uniq
99
+ regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
100
+ body.gsub(regexp).to_a.map { |node| block.call(node) }.uniq
72
101
  else
73
102
  index = TYPES_REGEXP.index(type)
74
103
  matches = body.match(value)
@@ -93,21 +122,24 @@ module Digger
93
122
 
94
123
  private :json_index_keys, :json_fetch
95
124
 
96
- # Nokogiri node methods
97
- class Nokogiri::XML::Node
98
- %w[one many].each do |name|
99
- define_method "inner_#{name}" do |css, &block|
100
- callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
101
- if name == 'one' # inner_one
102
- callback.call(self.css(css).first)
103
- else # inner_many
104
- self.css(css).map { |node| callback.call(node) }
105
- end
106
- end
125
+ class ::Nokogiri::XML::Node
126
+ def inner_one(expr, &block)
127
+ fn = block || ->(node) { node&.content&.strip }
128
+ fn.call(css(expr)&.first)
129
+ end
130
+
131
+ def inner_many(expr, &block)
132
+ fn = block || ->(node) { node&.content&.strip }
133
+ css(expr)&.map { |node| fn.call(node) }
107
134
  end
135
+
108
136
  def source
109
137
  to_xml
110
138
  end
139
+
140
+ def inner_number
141
+ content&.match(/\d+/).to_s.to_i
142
+ end
111
143
  end
112
144
  end
113
145
  end
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = '0.1.7'.freeze
2
+ VERSION = '0.2.1'.freeze
3
3
  end
data/spec/index_spec.rb CHANGED
@@ -9,4 +9,20 @@ describe Digger::Index do
9
9
  end
10
10
  expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
11
11
  end
12
+
13
+ it 'slow down' do
14
+ list = [1, 2, 3, 4]
15
+ conf = {
16
+ sleep_range_seconds: 1...2,
17
+ fail_unit_seconds: 1,
18
+ fail_max_cnt: 2,
19
+ when_fail: ->(_, e, nth) { puts "#{nth}: #{e.message}" }
20
+ }
21
+ pt = Digger::Index.slow_down(list, conf) do |num|
22
+ raise 'error' if num == 3
23
+ num
24
+ end
25
+ p pt
26
+ expect(pt.size).to eq(2)
27
+ end
12
28
  end
data/spec/page_spec.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require 'digger'
2
2
  require 'json'
3
3
  require 'uri'
4
+ require 'cgi'
4
5
 
5
6
  describe Digger::Page do
6
7
  it 'page json' do
@@ -15,13 +16,12 @@ describe Digger::Page do
15
16
 
16
17
  it 'fetch baidu' do
17
18
  http = Digger::HTTP.new
18
- page = http.fetch_page('http://www.baidu.com/')
19
+ page = http.fetch_page('http://baidu.com/')
19
20
  expect(page.code).to eq(200)
20
21
  end
21
22
 
22
- it 'page uri' do
23
- link ='https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
24
- link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#[\w]*$/, '')
25
- p link
26
- end
27
- end
23
+ # it 'page uri' do
24
+ # link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
25
+ # link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
26
+ # end
27
+ end
data/spec/pattern_spec.rb CHANGED
@@ -2,14 +2,26 @@ require 'digger'
2
2
  require 'json'
3
3
 
4
4
  describe Digger::Pattern do
5
- it 'json fetch' do
6
- json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
7
- pt = Digger::Pattern.new
8
- expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
9
- expect(pt.json_fetch(json, '$[0].a')).to eq(1)
10
- expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
11
- expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
- end
13
-
5
+ # it 'json fetch' do
6
+ # json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
7
+ # pt = Digger::Pattern.new
8
+ # expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
9
+ # expect(pt.json_fetch(json, '$[0].a')).to eq(1)
10
+ # expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
11
+ # expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
+ # end
14
13
 
15
- end
14
+ it 'parse cookie & others' do
15
+ page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
16
+ p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
17
+ # cookie
18
+ result = p1.match_page(page)
19
+ expect(result.length).to eq(42)
20
+ # header
21
+ p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
22
+ expect(p2.match_page(page)).to eq('chunked')
23
+ # get_plain
24
+ p3 = Digger::Pattern.new({ type: 'plain' })
25
+ expect(p3.match_page(page).length).to be > 100
26
+ end
27
+ end
metadata CHANGED
@@ -1,71 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-26 00:00:00.000000000 Z
11
+ date: 2022-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.0'
19
+ version: 12.3.3
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.0'
26
+ version: 12.3.3
27
27
  - !ruby/object:Gem::Dependency
28
- name: rake
28
+ name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ">="
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 12.3.3
33
+ version: '2.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 12.3.3
40
+ version: '2.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: nokogiri
42
+ name: http-cookie
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.6'
47
+ version: '1.0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.6'
54
+ version: '1.0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: http-cookie
56
+ name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '1.0'
61
+ version: '1.6'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '1.0'
68
+ version: '1.6'
69
69
  description: ''
70
70
  email:
71
71
  - xinkiang@gmail.com