digger 0.1.9 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/digger/pattern.rb +29 -7
- data/lib/digger/version.rb +1 -1
- data/spec/pattern_spec.rb +10 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29c3945e9377348e1152eea7f46e0f11aa2e59cc5568fad57d25ecd3d271a9df
|
4
|
+
data.tar.gz: 1e4862f9939aa9c62e175a39df078fe12a2f51190af1f280f6d418cbab7e6390
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 177e393de76bc35e31d6cc0eeda839d543d13fd81c40eba7d08704a131ec01396872734ea689c49fb1d09ceb4ba604fae76c3c201a2706dc9c889161038e0323
|
7
|
+
data.tar.gz: da76004a179aaed5cf75a96f90da3ebc739416e0ec162ff95c6aa27625590826fcc179b749156ea96937e364fb6a251515cb335a14b317001f3a47ea30330aeb
|
data/lib/digger/pattern.rb
CHANGED
@@ -6,7 +6,9 @@ module Digger
|
|
6
6
|
attr_accessor :type, :value, :block
|
7
7
|
|
8
8
|
def initialize(hash = {})
|
9
|
-
hash.each_pair
|
9
|
+
hash.each_pair do |key, value|
|
10
|
+
send("#{key}=", value) if %w[type value block].include?(key.to_s)
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
def safe_block(&default_block)
|
@@ -31,8 +33,9 @@ module Digger
|
|
31
33
|
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
34
|
TYPES_CSS = %w[css_one css_many].freeze
|
33
35
|
TYPES_JSON = %w[json jsonp].freeze
|
36
|
+
TYPES_OTHER = %w[cookie plain lines header body].freeze
|
34
37
|
|
35
|
-
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON +
|
38
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
|
36
39
|
|
37
40
|
def match_page(page)
|
38
41
|
return unless page.success?
|
@@ -43,13 +46,31 @@ module Digger
|
|
43
46
|
css_match(page.doc)
|
44
47
|
elsif TYPES_JSON.include?(type)
|
45
48
|
json_match(page)
|
46
|
-
|
47
|
-
|
49
|
+
elsif TYPES_OTHER.include?(type)
|
50
|
+
send("get_#{type}", page)
|
48
51
|
end
|
49
52
|
end
|
50
53
|
|
51
|
-
def
|
52
|
-
|
54
|
+
def get_header(page)
|
55
|
+
header = (page.headers[value.to_s.downcase] || []).first
|
56
|
+
safe_block.call(header)
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_body(page)
|
60
|
+
safe_block.call(page.body)
|
61
|
+
end
|
62
|
+
|
63
|
+
def get_plain(page)
|
64
|
+
safe_block.call(page.doc.text)
|
65
|
+
end
|
66
|
+
|
67
|
+
def get_lines(page)
|
68
|
+
block = safe_block
|
69
|
+
page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
|
70
|
+
end
|
71
|
+
|
72
|
+
def get_cookie(page)
|
73
|
+
cookie = page.cookies.find { |c| c.name == value }&.value
|
53
74
|
safe_block.call(cookie)
|
54
75
|
end
|
55
76
|
|
@@ -75,7 +96,8 @@ module Digger
|
|
75
96
|
block = safe_block(&:strip)
|
76
97
|
# content is String
|
77
98
|
if type == 'match_many'
|
78
|
-
|
99
|
+
regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
|
100
|
+
body.gsub(regexp).to_a.map { |node| block.call(node) }.uniq
|
79
101
|
else
|
80
102
|
index = TYPES_REGEXP.index(type)
|
81
103
|
matches = body.match(value)
|
data/lib/digger/version.rb
CHANGED
data/spec/pattern_spec.rb
CHANGED
@@ -11,10 +11,17 @@ describe Digger::Pattern do
|
|
11
11
|
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
12
|
# end
|
13
13
|
|
14
|
-
it 'parse
|
14
|
+
it 'parse cookie & others' do
|
15
15
|
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
-
|
17
|
-
|
16
|
+
p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
# cookie
|
18
|
+
result = p1.match_page(page)
|
18
19
|
expect(result.length).to eq(42)
|
20
|
+
# header
|
21
|
+
p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
|
22
|
+
expect(p2.match_page(page)).to eq('chunked')
|
23
|
+
# get_plain
|
24
|
+
p3 = Digger::Pattern.new({ type: 'plain' })
|
25
|
+
expect(p3.match_page(page).length).to be > 100
|
19
26
|
end
|
20
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-12-
|
11
|
+
date: 2021-12-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|