digger 0.1.7 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/digger.gemspec +10 -9
- data/lib/digger/index.rb +29 -0
- data/lib/digger/page.rb +6 -1
- data/lib/digger/pattern.rb +50 -18
- data/lib/digger/version.rb +1 -1
- data/spec/index_spec.rb +16 -0
- data/spec/page_spec.rb +7 -7
- data/spec/pattern_spec.rb +22 -10
- metadata +18 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b036f2d202aa8360cf3b07822168fb8e3e3e084d4b2f5ccb5eca026dd4c47981
|
4
|
+
data.tar.gz: 85d13e567add73e38c25738852dc3b30e3ad1900579dca815d5aef4cb16dbca8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: decb9abb96f56dc7f75ba95151b02c7415018a789583da1324ab4786317d7e0bfc272a83b47399b51672bdb30b3d90c502f25358d7438b9ee26a8d39764a58dd
|
7
|
+
data.tar.gz: 73f7dfd179175bcda4a24a120cad95a61093f17166e389e44b37a158ff8ee1ea927844d9b85773914f14d99473776f11752c7aa32fadfec1b8b703e584591c3f
|
data/digger.gemspec
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'digger/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'digger'
|
8
9
|
spec.version = Digger::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
10
|
+
spec.authors = ['binz']
|
11
|
+
spec.email = ['xinkiang@gmail.com']
|
11
12
|
spec.summary = %q{Dig need stractual infomation from web page.}
|
12
13
|
spec.description = %q{}
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
14
|
+
spec.homepage = ''
|
15
|
+
spec.license = 'MIT'
|
15
16
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
22
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
23
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
24
|
|
24
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
26
27
|
end
|
data/lib/digger/index.rb
CHANGED
@@ -17,6 +17,35 @@ module Digger
|
|
17
17
|
pattern.gsub('*').each_with_index { |_, i| arg[i] }
|
18
18
|
end
|
19
19
|
|
20
|
+
def self.slow_down(entities, conf = {}, &block)
|
21
|
+
raise NoBlockError, 'No block given' unless block
|
22
|
+
|
23
|
+
config = {
|
24
|
+
sleep_range_seconds: 4...10, # 随机等待时间范围
|
25
|
+
fail_max_cnt: 10, # 最多失败次数
|
26
|
+
fail_unit_seconds: 10 * 60, # 失败等待时间
|
27
|
+
when_fail: ->(ent, e, failed_cnt) {}
|
28
|
+
}.merge(conf)
|
29
|
+
failed_cnt = 0
|
30
|
+
cursor = 0
|
31
|
+
result = []
|
32
|
+
while cursor < entities.length
|
33
|
+
begin
|
34
|
+
result << block.call(entities[cursor])
|
35
|
+
rescue StandardError => e
|
36
|
+
failed_cnt += 1
|
37
|
+
config[:when_fail].call(entities[cursor], e, failed_cnt)
|
38
|
+
break if failed_cnt >= config[:fail_max_cnt]
|
39
|
+
|
40
|
+
sleep(failed_cnt * config[:fail_unit_seconds])
|
41
|
+
else
|
42
|
+
cursor += 1
|
43
|
+
sleep(rand(config[:sleep_range_seconds]))
|
44
|
+
end
|
45
|
+
end
|
46
|
+
result
|
47
|
+
end
|
48
|
+
|
20
49
|
def self.batch(entities, cocurrence = 1, &block)
|
21
50
|
raise NoBlockError, 'No block given' unless block
|
22
51
|
|
data/lib/digger/page.rb
CHANGED
@@ -4,6 +4,7 @@ require 'ostruct'
|
|
4
4
|
require 'set'
|
5
5
|
require 'kconv'
|
6
6
|
require 'uri'
|
7
|
+
require 'http/cookie'
|
7
8
|
|
8
9
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
9
10
|
module Digger
|
@@ -101,6 +102,10 @@ module Digger
|
|
101
102
|
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
102
103
|
end
|
103
104
|
|
105
|
+
def cookies
|
106
|
+
@cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
|
107
|
+
end
|
108
|
+
|
104
109
|
#
|
105
110
|
# Discard links, a next call of page.links will return an empty array
|
106
111
|
#
|
@@ -273,4 +278,4 @@ module Digger
|
|
273
278
|
from_hash hash
|
274
279
|
end
|
275
280
|
end
|
276
|
-
end
|
281
|
+
end
|
data/lib/digger/pattern.rb
CHANGED
@@ -6,12 +6,14 @@ module Digger
|
|
6
6
|
attr_accessor :type, :value, :block
|
7
7
|
|
8
8
|
def initialize(hash = {})
|
9
|
-
hash.each_pair
|
9
|
+
hash.each_pair do |key, value|
|
10
|
+
send("#{key}=", value) if %w[type value block].include?(key.to_s)
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
def safe_block(&default_block)
|
13
15
|
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
14
|
-
default_block
|
16
|
+
default_block || ->(v) { v }
|
15
17
|
elsif block.respond_to?(:call)
|
16
18
|
block
|
17
19
|
else
|
@@ -31,30 +33,56 @@ module Digger
|
|
31
33
|
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
34
|
TYPES_CSS = %w[css_one css_many].freeze
|
33
35
|
TYPES_JSON = %w[json jsonp].freeze
|
36
|
+
TYPES_OTHER = %w[cookie plain lines header body].freeze
|
34
37
|
|
35
|
-
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
38
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
|
36
39
|
|
37
40
|
def match_page(page)
|
38
41
|
return unless page.success?
|
42
|
+
|
39
43
|
if TYPES_REGEXP.include?(type) # regular expression
|
40
44
|
regexp_match(page.body)
|
41
45
|
elsif TYPES_CSS.include?(type) # css expression
|
42
46
|
css_match(page.doc)
|
43
47
|
elsif TYPES_JSON.include?(type)
|
44
48
|
json_match(page)
|
49
|
+
elsif TYPES_OTHER.include?(type)
|
50
|
+
send("get_#{type}", page)
|
45
51
|
end
|
46
52
|
end
|
47
53
|
|
54
|
+
def get_header(page)
|
55
|
+
header = (page.headers[value.to_s.downcase] || []).first
|
56
|
+
safe_block.call(header)
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_body(page)
|
60
|
+
safe_block.call(page.body)
|
61
|
+
end
|
62
|
+
|
63
|
+
def get_plain(page)
|
64
|
+
safe_block.call(page.doc.text)
|
65
|
+
end
|
66
|
+
|
67
|
+
def get_lines(page)
|
68
|
+
block = safe_block
|
69
|
+
page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
|
70
|
+
end
|
71
|
+
|
72
|
+
def get_cookie(page)
|
73
|
+
cookie = page.cookies.find { |c| c.name == value }&.value
|
74
|
+
safe_block.call(cookie)
|
75
|
+
end
|
76
|
+
|
48
77
|
def json_match(page)
|
49
|
-
block = safe_block { |j| j }
|
50
78
|
json = page.send(type)
|
51
79
|
keys = json_index_keys(value)
|
52
80
|
match = json_fetch(json, keys)
|
53
|
-
|
81
|
+
safe_block.call(match)
|
54
82
|
end
|
55
83
|
|
56
84
|
def css_match(doc)
|
57
|
-
block = safe_block { |node| node
|
85
|
+
block = safe_block { |node| node&.content&.strip }
|
58
86
|
# content is Nokogiri::HTML::Document
|
59
87
|
contents = doc.css(value)
|
60
88
|
if type == 'css_many'
|
@@ -68,7 +96,8 @@ module Digger
|
|
68
96
|
block = safe_block(&:strip)
|
69
97
|
# content is String
|
70
98
|
if type == 'match_many'
|
71
|
-
|
99
|
+
regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
|
100
|
+
body.gsub(regexp).to_a.map { |node| block.call(node) }.uniq
|
72
101
|
else
|
73
102
|
index = TYPES_REGEXP.index(type)
|
74
103
|
matches = body.match(value)
|
@@ -93,21 +122,24 @@ module Digger
|
|
93
122
|
|
94
123
|
private :json_index_keys, :json_fetch
|
95
124
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
end
|
106
|
-
end
|
125
|
+
class ::Nokogiri::XML::Node
|
126
|
+
def inner_one(expr, &block)
|
127
|
+
fn = block || ->(node) { node&.content&.strip }
|
128
|
+
fn.call(css(expr)&.first)
|
129
|
+
end
|
130
|
+
|
131
|
+
def inner_many(expr, &block)
|
132
|
+
fn = block || ->(node) { node&.content&.strip }
|
133
|
+
css(expr)&.map { |node| fn.call(node) }
|
107
134
|
end
|
135
|
+
|
108
136
|
def source
|
109
137
|
to_xml
|
110
138
|
end
|
139
|
+
|
140
|
+
def inner_number
|
141
|
+
content&.match(/\d+/).to_s.to_i
|
142
|
+
end
|
111
143
|
end
|
112
144
|
end
|
113
145
|
end
|
data/lib/digger/version.rb
CHANGED
data/spec/index_spec.rb
CHANGED
@@ -9,4 +9,20 @@ describe Digger::Index do
|
|
9
9
|
end
|
10
10
|
expect(pt.join).to eq(list.map { |num| "##{num}" }.join)
|
11
11
|
end
|
12
|
+
|
13
|
+
it 'slow down' do
|
14
|
+
list = [1, 2, 3, 4]
|
15
|
+
conf = {
|
16
|
+
sleep_range_seconds: 1...2,
|
17
|
+
fail_unit_seconds: 1,
|
18
|
+
fail_max_cnt: 2,
|
19
|
+
when_fail: ->(_, e, nth) { puts "#{nth}: #{e.message}" }
|
20
|
+
}
|
21
|
+
pt = Digger::Index.slow_down(list, conf) do |num|
|
22
|
+
raise 'error' if num == 3
|
23
|
+
num
|
24
|
+
end
|
25
|
+
p pt
|
26
|
+
expect(pt.size).to eq(2)
|
27
|
+
end
|
12
28
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'digger'
|
2
2
|
require 'json'
|
3
3
|
require 'uri'
|
4
|
+
require 'cgi'
|
4
5
|
|
5
6
|
describe Digger::Page do
|
6
7
|
it 'page json' do
|
@@ -15,13 +16,12 @@ describe Digger::Page do
|
|
15
16
|
|
16
17
|
it 'fetch baidu' do
|
17
18
|
http = Digger::HTTP.new
|
18
|
-
page = http.fetch_page('http://
|
19
|
+
page = http.fetch_page('http://baidu.com/')
|
19
20
|
expect(page.code).to eq(200)
|
20
21
|
end
|
21
22
|
|
22
|
-
it 'page uri' do
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
23
|
+
# it 'page uri' do
|
24
|
+
# link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
25
|
+
# link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
|
26
|
+
# end
|
27
|
+
end
|
data/spec/pattern_spec.rb
CHANGED
@@ -2,14 +2,26 @@ require 'digger'
|
|
2
2
|
require 'json'
|
3
3
|
|
4
4
|
describe Digger::Pattern do
|
5
|
-
it 'json fetch' do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
end
|
13
|
-
|
5
|
+
# it 'json fetch' do
|
6
|
+
# json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
7
|
+
# pt = Digger::Pattern.new
|
8
|
+
# expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
9
|
+
# expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
10
|
+
# expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
11
|
+
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
|
+
# end
|
14
13
|
|
15
|
-
|
14
|
+
it 'parse cookie & others' do
|
15
|
+
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
+
p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
# cookie
|
18
|
+
result = p1.match_page(page)
|
19
|
+
expect(result.length).to eq(42)
|
20
|
+
# header
|
21
|
+
p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
|
22
|
+
expect(p2.match_page(page)).to eq('chunked')
|
23
|
+
# get_plain
|
24
|
+
p3 = Digger::Pattern.new({ type: 'plain' })
|
25
|
+
expect(p3.match_page(page).length).to be > 100
|
26
|
+
end
|
27
|
+
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 12.3.3
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 12.3.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: http-cookie
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.
|
47
|
+
version: '1.0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
54
|
+
version: '1.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
61
|
+
version: '1.6'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.6'
|
69
69
|
description: ''
|
70
70
|
email:
|
71
71
|
- xinkiang@gmail.com
|