digger 0.1.7 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/digger.gemspec +10 -9
- data/lib/digger/index.rb +29 -0
- data/lib/digger/page.rb +6 -1
- data/lib/digger/pattern.rb +50 -18
- data/lib/digger/version.rb +1 -1
- data/spec/index_spec.rb +16 -0
- data/spec/page_spec.rb +7 -7
- data/spec/pattern_spec.rb +22 -10
- metadata +18 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b036f2d202aa8360cf3b07822168fb8e3e3e084d4b2f5ccb5eca026dd4c47981
|
4
|
+
data.tar.gz: 85d13e567add73e38c25738852dc3b30e3ad1900579dca815d5aef4cb16dbca8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: decb9abb96f56dc7f75ba95151b02c7415018a789583da1324ab4786317d7e0bfc272a83b47399b51672bdb30b3d90c502f25358d7438b9ee26a8d39764a58dd
|
7
|
+
data.tar.gz: 73f7dfd179175bcda4a24a120cad95a61093f17166e389e44b37a158ff8ee1ea927844d9b85773914f14d99473776f11752c7aa32fadfec1b8b703e584591c3f
|
data/digger.gemspec
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'digger/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'digger'
|
8
9
|
spec.version = Digger::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
10
|
+
spec.authors = ['binz']
|
11
|
+
spec.email = ['xinkiang@gmail.com']
|
11
12
|
spec.summary = %q{Dig need stractual infomation from web page.}
|
12
13
|
spec.description = %q{}
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
14
|
+
spec.homepage = ''
|
15
|
+
spec.license = 'MIT'
|
15
16
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
22
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
23
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
24
|
|
24
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
26
27
|
end
|
data/lib/digger/index.rb
CHANGED
@@ -17,6 +17,35 @@ module Digger
|
|
17
17
|
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
18
18
|
end
|
19
19
|
|
20
|
+
def self.slow_down(entities, conf = {}, &block)
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
22
|
+
|
23
|
+
config = {
|
24
|
+
sleep_range_seconds: 4...10, # 随机等待时间范围
|
25
|
+
fail_max_cnt: 10, # 最多失败次数
|
26
|
+
fail_unit_seconds: 10 * 60, # 失败等待时间
|
27
|
+
when_fail: ->(ent, e, failed_cnt) {}
|
28
|
+
}.merge(conf)
|
29
|
+
failed_cnt = 0
|
30
|
+
cursor = 0
|
31
|
+
result = []
|
32
|
+
while cursor < entities.length
|
33
|
+
begin
|
34
|
+
result << block.call(entities[cursor])
|
35
|
+
rescue StandardError => e
|
36
|
+
failed_cnt += 1
|
37
|
+
config[:when_fail].call(entities[cursor], e, failed_cnt)
|
38
|
+
break if failed_cnt >= config[:fail_max_cnt]
|
39
|
+
|
40
|
+
sleep(failed_cnt * config[:fail_unit_seconds])
|
41
|
+
else
|
42
|
+
cursor += 1
|
43
|
+
sleep(rand(config[:sleep_range_seconds]))
|
44
|
+
end
|
45
|
+
end
|
46
|
+
result
|
47
|
+
end
|
48
|
+
|
20
49
|
def self.batch(entities, cocurrence = 1, &block)
|
21
50
|
raise NoBlockError, 'No block given' unless block
|
22
51
|
|
data/lib/digger/page.rb
CHANGED
@@ -4,6 +4,7 @@ require 'ostruct'
|
|
4
4
|
require 'set'
|
5
5
|
require 'kconv'
|
6
6
|
require 'uri'
|
7
|
+
require 'http/cookie'
|
7
8
|
|
8
9
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
9
10
|
module Digger
|
@@ -101,6 +102,10 @@ module Digger
|
|
101
102
|
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
102
103
|
end
|
103
104
|
|
105
|
+
def cookies
|
106
|
+
@cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
|
107
|
+
end
|
108
|
+
|
104
109
|
#
|
105
110
|
# Discard links, a next call of page.links will return an empty array
|
106
111
|
#
|
@@ -273,4 +278,4 @@ module Digger
|
|
273
278
|
from_hash hash
|
274
279
|
end
|
275
280
|
end
|
276
|
-
end
|
281
|
+
end
|
data/lib/digger/pattern.rb
CHANGED
@@ -6,12 +6,14 @@ module Digger
|
|
6
6
|
attr_accessor :type, :value, :block
|
7
7
|
|
8
8
|
def initialize(hash = {})
|
9
|
-
hash.each_pair
|
9
|
+
hash.each_pair do |key, value|
|
10
|
+
send("#{key}=", value) if %w[type value block].include?(key.to_s)
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
def safe_block(&default_block)
|
13
15
|
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
14
|
-
default_block
|
16
|
+
default_block || ->(v) { v }
|
15
17
|
elsif block.respond_to?(:call)
|
16
18
|
block
|
17
19
|
else
|
@@ -31,30 +33,56 @@ module Digger
|
|
31
33
|
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
34
|
TYPES_CSS = %w[css_one css_many].freeze
|
33
35
|
TYPES_JSON = %w[json jsonp].freeze
|
36
|
+
TYPES_OTHER = %w[cookie plain lines header body].freeze
|
34
37
|
|
35
|
-
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
38
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
|
36
39
|
|
37
40
|
def match_page(page)
|
38
41
|
return unless page.success?
|
42
|
+
|
39
43
|
if TYPES_REGEXP.include?(type) # regular expression
|
40
44
|
regexp_match(page.body)
|
41
45
|
elsif TYPES_CSS.include?(type) # css expression
|
42
46
|
css_match(page.doc)
|
43
47
|
elsif TYPES_JSON.include?(type)
|
44
48
|
json_match(page)
|
49
|
+
elsif TYPES_OTHER.include?(type)
|
50
|
+
send("get_#{type}", page)
|
45
51
|
end
|
46
52
|
end
|
47
53
|
|
54
|
+
def get_header(page)
|
55
|
+
header = (page.headers[value.to_s.downcase] || []).first
|
56
|
+
safe_block.call(header)
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_body(page)
|
60
|
+
safe_block.call(page.body)
|
61
|
+
end
|
62
|
+
|
63
|
+
def get_plain(page)
|
64
|
+
safe_block.call(page.doc.text)
|
65
|
+
end
|
66
|
+
|
67
|
+
def get_lines(page)
|
68
|
+
block = safe_block
|
69
|
+
page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
|
70
|
+
end
|
71
|
+
|
72
|
+
def get_cookie(page)
|
73
|
+
cookie = page.cookies.find { |c| c.name == value }&.value
|
74
|
+
safe_block.call(cookie)
|
75
|
+
end
|
76
|
+
|
48
77
|
def json_match(page)
|
49
|
-
block = safe_block { |j| j }
|
50
78
|
json = page.send(type)
|
51
79
|
keys = json_index_keys(value)
|
52
80
|
match = json_fetch(json, keys)
|
53
|
-
|
81
|
+
safe_block.call(match)
|
54
82
|
end
|
55
83
|
|
56
84
|
def css_match(doc)
|
57
|
-
block = safe_block { |node| node
|
85
|
+
block = safe_block { |node| node&.content&.strip }
|
58
86
|
# content is Nokogiri::HTML::Document
|
59
87
|
contents = doc.css(value)
|
60
88
|
if type == 'css_many'
|
@@ -68,7 +96,8 @@ module Digger
|
|
68
96
|
block = safe_block(&:strip)
|
69
97
|
# content is String
|
70
98
|
if type == 'match_many'
|
71
|
-
|
99
|
+
regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
|
100
|
+
body.gsub(regexp).to_a.map { |node| block.call(node) }.uniq
|
72
101
|
else
|
73
102
|
index = TYPES_REGEXP.index(type)
|
74
103
|
matches = body.match(value)
|
@@ -93,21 +122,24 @@ module Digger
|
|
93
122
|
|
94
123
|
private :json_index_keys, :json_fetch
|
95
124
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
end
|
106
|
-
end
|
125
|
+
class ::Nokogiri::XML::Node
|
126
|
+
def inner_one(expr, &block)
|
127
|
+
fn = block || ->(node) { node&.content&.strip }
|
128
|
+
fn.call(css(expr)&.first)
|
129
|
+
end
|
130
|
+
|
131
|
+
def inner_many(expr, &block)
|
132
|
+
fn = block || ->(node) { node&.content&.strip }
|
133
|
+
css(expr)&.map { |node| fn.call(node) }
|
107
134
|
end
|
135
|
+
|
108
136
|
def source
|
109
137
|
to_xml
|
110
138
|
end
|
139
|
+
|
140
|
+
def inner_number
|
141
|
+
content&.match(/\d+/).to_s.to_i
|
142
|
+
end
|
111
143
|
end
|
112
144
|
end
|
113
145
|
end
|
data/lib/digger/version.rb
CHANGED
data/spec/index_spec.rb
CHANGED
@@ -9,4 +9,20 @@ describe Digger::Index do
|
|
9
9
|
end
|
10
10
|
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
11
11
|
end
|
12
|
+
|
13
|
+
it 'slow down' do
|
14
|
+
list = [1, 2, 3, 4]
|
15
|
+
conf = {
|
16
|
+
sleep_range_seconds: 1...2,
|
17
|
+
fail_unit_seconds: 1,
|
18
|
+
fail_max_cnt: 2,
|
19
|
+
when_fail: ->(_, e, nth) { puts "#{nth}: #{e.message}" }
|
20
|
+
}
|
21
|
+
pt = Digger::Index.slow_down(list, conf) do |num|
|
22
|
+
raise 'error' if num == 3
|
23
|
+
num
|
24
|
+
end
|
25
|
+
p pt
|
26
|
+
expect(pt.size).to eq(2)
|
27
|
+
end
|
12
28
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'digger'
|
2
2
|
require 'json'
|
3
3
|
require 'uri'
|
4
|
+
require 'cgi'
|
4
5
|
|
5
6
|
describe Digger::Page do
|
6
7
|
it 'page json' do
|
@@ -15,13 +16,12 @@ describe Digger::Page do
|
|
15
16
|
|
16
17
|
it 'fetch baidu' do
|
17
18
|
http = Digger::HTTP.new
|
18
|
-
page = http.fetch_page('http://
|
19
|
+
page = http.fetch_page('http://baidu.com/')
|
19
20
|
expect(page.code).to eq(200)
|
20
21
|
end
|
21
22
|
|
22
|
-
it 'page uri' do
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
23
|
+
# it 'page uri' do
|
24
|
+
# link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
25
|
+
# link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
|
26
|
+
# end
|
27
|
+
end
|
data/spec/pattern_spec.rb
CHANGED
@@ -2,14 +2,26 @@ require 'digger'
|
|
2
2
|
require 'json'
|
3
3
|
|
4
4
|
describe Digger::Pattern do
|
5
|
-
it 'json fetch' do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
end
|
13
|
-
|
5
|
+
# it 'json fetch' do
|
6
|
+
# json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
7
|
+
# pt = Digger::Pattern.new
|
8
|
+
# expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
9
|
+
# expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
10
|
+
# expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
11
|
+
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
|
+
# end
|
14
13
|
|
15
|
-
|
14
|
+
it 'parse cookie & others' do
|
15
|
+
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
+
p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
# cookie
|
18
|
+
result = p1.match_page(page)
|
19
|
+
expect(result.length).to eq(42)
|
20
|
+
# header
|
21
|
+
p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
|
22
|
+
expect(p2.match_page(page)).to eq('chunked')
|
23
|
+
# get_plain
|
24
|
+
p3 = Digger::Pattern.new({ type: 'plain' })
|
25
|
+
expect(p3.match_page(page).length).to be > 100
|
26
|
+
end
|
27
|
+
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 12.3.3
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 12.3.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: http-cookie
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.
|
47
|
+
version: '1.0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
54
|
+
version: '1.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
61
|
+
version: '1.6'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.6'
|
69
69
|
description: ''
|
70
70
|
email:
|
71
71
|
- xinkiang@gmail.com
|