digger 0.1.9 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/digger/pattern.rb +60 -24
- data/lib/digger/version.rb +1 -1
- data/spec/pattern_spec.rb +16 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 025bfc4f8d6dbf55994363070b67a450fd421457c4f12bf9b714fe965aefaad6
|
4
|
+
data.tar.gz: a8464f07a53c332ddb40457998b830b1dff99661431a7608d9dc075175f5e9f1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d84a2f8ee2d65c16d79b66795fc618b07fb5ba4517d902eec7e17107b69c240b36f68728de49f451fd143ddeb872743a906b608d413446971e5f38a632cf6fbe
|
7
|
+
data.tar.gz: 4f36e035bab956bd6702024d951eb9963f2f97812f355e00b0840148eaa4fef07949abac0f874f889c88703addd13ea2bc69efa61bac3872b14f371f7fa47ded
|
data/lib/digger/pattern.rb
CHANGED
@@ -6,7 +6,9 @@ module Digger
|
|
6
6
|
attr_accessor :type, :value, :block
|
7
7
|
|
8
8
|
def initialize(hash = {})
|
9
|
-
hash.each_pair
|
9
|
+
hash.each_pair do |key, value|
|
10
|
+
send("#{key}=", value) if %w[type value block].include?(key.to_s)
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
def safe_block(&default_block)
|
@@ -28,11 +30,12 @@ module Digger
|
|
28
30
|
|
29
31
|
MATCH_MAX = 3
|
30
32
|
|
31
|
-
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
|
32
|
-
TYPES_CSS = %w[css_one css_many].freeze
|
33
|
+
TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many match_all]
|
34
|
+
TYPES_CSS = %w[css_one css_many css_all].freeze
|
33
35
|
TYPES_JSON = %w[json jsonp].freeze
|
36
|
+
TYPES_OTHER = %w[cookie plain lines header body].freeze
|
34
37
|
|
35
|
-
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON +
|
38
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
|
36
39
|
|
37
40
|
def match_page(page)
|
38
41
|
return unless page.success?
|
@@ -43,13 +46,31 @@ module Digger
|
|
43
46
|
css_match(page.doc)
|
44
47
|
elsif TYPES_JSON.include?(type)
|
45
48
|
json_match(page)
|
46
|
-
|
47
|
-
|
49
|
+
elsif TYPES_OTHER.include?(type)
|
50
|
+
send("get_#{type}", page)
|
48
51
|
end
|
49
52
|
end
|
50
53
|
|
51
|
-
def
|
52
|
-
|
54
|
+
def get_header(page)
|
55
|
+
header = (page.headers[value.to_s.downcase] || []).first
|
56
|
+
safe_block.call(header)
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_body(page)
|
60
|
+
safe_block.call(page.body)
|
61
|
+
end
|
62
|
+
|
63
|
+
def get_plain(page)
|
64
|
+
safe_block.call(page.doc&.text)
|
65
|
+
end
|
66
|
+
|
67
|
+
def get_lines(page)
|
68
|
+
block = safe_block
|
69
|
+
page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
|
70
|
+
end
|
71
|
+
|
72
|
+
def get_cookie(page)
|
73
|
+
cookie = page.cookies.find { |c| c.name == value }&.value
|
53
74
|
safe_block.call(cookie)
|
54
75
|
end
|
55
76
|
|
@@ -61,24 +82,36 @@ module Digger
|
|
61
82
|
end
|
62
83
|
|
63
84
|
def css_match(doc)
|
64
|
-
block = safe_block { |node| node.content.strip }
|
65
85
|
# content is Nokogiri::HTML::Document
|
66
86
|
contents = doc.css(value)
|
67
87
|
if type == 'css_many'
|
68
|
-
|
88
|
+
block = safe_block { |node| node&.content&.strip }
|
89
|
+
contents.map { |node| block.call(node) }
|
90
|
+
elsif type == 'css_all'
|
91
|
+
block = safe_block
|
92
|
+
block.call(contents)
|
69
93
|
else
|
94
|
+
block = safe_block { |node| node&.content&.strip }
|
70
95
|
block.call(contents.first)
|
71
96
|
end
|
72
97
|
end
|
73
98
|
|
74
99
|
def regexp_match(body)
|
75
|
-
block = safe_block(&:strip)
|
76
100
|
# content is String
|
77
|
-
if
|
78
|
-
|
101
|
+
if %w[match_many match_all].include? type
|
102
|
+
regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
|
103
|
+
matches = body.gsub(regexp).to_a
|
104
|
+
if type == 'match_many'
|
105
|
+
block = safe_block(&:strip)
|
106
|
+
matches.map { |node| block.call(node) }
|
107
|
+
else
|
108
|
+
block = safe_block
|
109
|
+
block.call(matches)
|
110
|
+
end
|
79
111
|
else
|
80
112
|
index = TYPES_REGEXP.index(type)
|
81
113
|
matches = body.match(value)
|
114
|
+
block = safe_block(&:strip)
|
82
115
|
block.call(matches[index]) unless matches.nil?
|
83
116
|
end
|
84
117
|
end
|
@@ -100,21 +133,24 @@ module Digger
|
|
100
133
|
|
101
134
|
private :json_index_keys, :json_fetch
|
102
135
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
end
|
113
|
-
end
|
136
|
+
class ::Nokogiri::XML::Node
|
137
|
+
def inner_one(expr, &block)
|
138
|
+
fn = block || ->(node) { node&.content&.strip }
|
139
|
+
fn.call(css(expr)&.first)
|
140
|
+
end
|
141
|
+
|
142
|
+
def inner_many(expr, &block)
|
143
|
+
fn = block || ->(node) { node&.content&.strip }
|
144
|
+
css(expr)&.map { |node| fn.call(node) }
|
114
145
|
end
|
146
|
+
|
115
147
|
def source
|
116
148
|
to_xml
|
117
149
|
end
|
150
|
+
|
151
|
+
def inner_number
|
152
|
+
content&.match(/\d+/).to_s.to_i
|
153
|
+
end
|
118
154
|
end
|
119
155
|
end
|
120
156
|
end
|
data/lib/digger/version.rb
CHANGED
data/spec/pattern_spec.rb
CHANGED
@@ -11,10 +11,23 @@ describe Digger::Pattern do
|
|
11
11
|
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
12
|
# end
|
13
13
|
|
14
|
-
it 'parse
|
14
|
+
it 'parse cookie & others' do
|
15
15
|
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
-
|
17
|
-
|
16
|
+
p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
# cookie
|
18
|
+
result = p1.match_page(page)
|
18
19
|
expect(result.length).to eq(42)
|
20
|
+
# header
|
21
|
+
p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
|
22
|
+
expect(p2.match_page(page)).to eq('chunked')
|
23
|
+
# get_plain
|
24
|
+
p3 = Digger::Pattern.new({ type: 'plain' })
|
25
|
+
expect(p3.match_page(page).length).to be > 100
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'match_all & css_all' do
|
29
|
+
p = Digger::Pattern.new({ type: 'match_all', value: '[\d]+' })
|
30
|
+
m = p.regexp_match('123,12,1')
|
31
|
+
expect(m.length).to eq(3)
|
19
32
|
end
|
20
33
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|