digger 0.1.9 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ed96af1c5ae92569e1de4885958ac5852864c045f6c6337d5f17d91747d8ed80
4
- data.tar.gz: 82003ae80f54cd3f9b805757e5dcb4c7894bba91c4c376cf08ebd43e6de6e80b
3
+ metadata.gz: 025bfc4f8d6dbf55994363070b67a450fd421457c4f12bf9b714fe965aefaad6
4
+ data.tar.gz: a8464f07a53c332ddb40457998b830b1dff99661431a7608d9dc075175f5e9f1
5
5
  SHA512:
6
- metadata.gz: b7aad69fd46c7d1688026ece2e1efe14d7dea29b42f94656d794655e12a92677bd3e1034f0c776bf197bcd75c96bd49377df399433bd2e1c13507520af1addc5
7
- data.tar.gz: 60055a69ec3ad77e80fc4f1b50bb3a6c298e2274827ae21b64d1d82dae53a1d5338ccc99bcdb09c7d2e946abd007a488ff3034a753bc54586e9b651aeb3c5ce7
6
+ metadata.gz: d84a2f8ee2d65c16d79b66795fc618b07fb5ba4517d902eec7e17107b69c240b36f68728de49f451fd143ddeb872743a906b608d413446971e5f38a632cf6fbe
7
+ data.tar.gz: 4f36e035bab956bd6702024d951eb9963f2f97812f355e00b0840148eaa4fef07949abac0f874f889c88703addd13ea2bc69efa61bac3872b14f371f7fa47ded
@@ -6,7 +6,9 @@ module Digger
6
6
  attr_accessor :type, :value, :block
7
7
 
8
8
  def initialize(hash = {})
9
- hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
9
+ hash.each_pair do |key, value|
10
+ send("#{key}=", value) if %w[type value block].include?(key.to_s)
11
+ end
10
12
  end
11
13
 
12
14
  def safe_block(&default_block)
@@ -28,11 +30,12 @@ module Digger
28
30
 
29
31
  MATCH_MAX = 3
30
32
 
31
- TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
32
- TYPES_CSS = %w[css_one css_many].freeze
33
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many match_all]
34
+ TYPES_CSS = %w[css_one css_many css_all].freeze
33
35
  TYPES_JSON = %w[json jsonp].freeze
36
+ TYPES_OTHER = %w[cookie plain lines header body].freeze
34
37
 
35
- TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + ['cookie']
38
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
36
39
 
37
40
  def match_page(page)
38
41
  return unless page.success?
@@ -43,13 +46,31 @@ module Digger
43
46
  css_match(page.doc)
44
47
  elsif TYPES_JSON.include?(type)
45
48
  json_match(page)
46
- else
47
- cookie_get(page.cookies)
49
+ elsif TYPES_OTHER.include?(type)
50
+ send("get_#{type}", page)
48
51
  end
49
52
  end
50
53
 
51
- def cookie_get(cookies)
52
- cookie = cookies.find { |c| c.name == value }&.value
54
+ def get_header(page)
55
+ header = (page.headers[value.to_s.downcase] || []).first
56
+ safe_block.call(header)
57
+ end
58
+
59
+ def get_body(page)
60
+ safe_block.call(page.body)
61
+ end
62
+
63
+ def get_plain(page)
64
+ safe_block.call(page.doc&.text)
65
+ end
66
+
67
+ def get_lines(page)
68
+ block = safe_block
69
+ page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
70
+ end
71
+
72
+ def get_cookie(page)
73
+ cookie = page.cookies.find { |c| c.name == value }&.value
53
74
  safe_block.call(cookie)
54
75
  end
55
76
 
@@ -61,24 +82,36 @@ module Digger
61
82
  end
62
83
 
63
84
  def css_match(doc)
64
- block = safe_block { |node| node.content.strip }
65
85
  # content is Nokogiri::HTML::Document
66
86
  contents = doc.css(value)
67
87
  if type == 'css_many'
68
- contents.map { |node| block.call(node) }.uniq
88
+ block = safe_block { |node| node&.content&.strip }
89
+ contents.map { |node| block.call(node) }
90
+ elsif type == 'css_all'
91
+ block = safe_block
92
+ block.call(contents)
69
93
  else
94
+ block = safe_block { |node| node&.content&.strip }
70
95
  block.call(contents.first)
71
96
  end
72
97
  end
73
98
 
74
99
  def regexp_match(body)
75
- block = safe_block(&:strip)
76
100
  # content is String
77
- if type == 'match_many'
78
- body.gsub(value).to_a.map { |node| block.call(node) }.uniq
101
+ if %w[match_many match_all].include? type
102
+ regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
103
+ matches = body.gsub(regexp).to_a
104
+ if type == 'match_many'
105
+ block = safe_block(&:strip)
106
+ matches.map { |node| block.call(node) }
107
+ else
108
+ block = safe_block
109
+ block.call(matches)
110
+ end
79
111
  else
80
112
  index = TYPES_REGEXP.index(type)
81
113
  matches = body.match(value)
114
+ block = safe_block(&:strip)
82
115
  block.call(matches[index]) unless matches.nil?
83
116
  end
84
117
  end
@@ -100,21 +133,24 @@ module Digger
100
133
 
101
134
  private :json_index_keys, :json_fetch
102
135
 
103
- # Nokogiri node methods
104
- class Nokogiri::XML::Node
105
- %w[one many].each do |name|
106
- define_method "inner_#{name}" do |css, &block|
107
- callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
108
- if name == 'one' # inner_one
109
- callback.call(self.css(css).first)
110
- else # inner_many
111
- self.css(css).map { |node| callback.call(node) }
112
- end
113
- end
136
+ class ::Nokogiri::XML::Node
137
+ def inner_one(expr, &block)
138
+ fn = block || ->(node) { node&.content&.strip }
139
+ fn.call(css(expr)&.first)
140
+ end
141
+
142
+ def inner_many(expr, &block)
143
+ fn = block || ->(node) { node&.content&.strip }
144
+ css(expr)&.map { |node| fn.call(node) }
114
145
  end
146
+
115
147
  def source
116
148
  to_xml
117
149
  end
150
+
151
+ def inner_number
152
+ content&.match(/\d+/).to_s.to_i
153
+ end
118
154
  end
119
155
  end
120
156
  end
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = '0.1.9'.freeze
2
+ VERSION = '0.2.2'.freeze
3
3
  end
data/spec/pattern_spec.rb CHANGED
@@ -11,10 +11,23 @@ describe Digger::Pattern do
11
11
  # expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
12
  # end
13
13
 
14
- it 'parse cookoe' do
14
+ it 'parse cookie & others' do
15
15
  page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
16
- pt = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
17
- result = pt.match_page(page)
16
+ p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
17
+ # cookie
18
+ result = p1.match_page(page)
18
19
  expect(result.length).to eq(42)
20
+ # header
21
+ p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
22
+ expect(p2.match_page(page)).to eq('chunked')
23
+ # get_plain
24
+ p3 = Digger::Pattern.new({ type: 'plain' })
25
+ expect(p3.match_page(page).length).to be > 100
26
+ end
27
+
28
+ it 'match_all & css_all' do
29
+ p = Digger::Pattern.new({ type: 'match_all', value: '[\d]+' })
30
+ m = p.regexp_match('123,12,1')
31
+ expect(m.length).to eq(3)
19
32
  end
20
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-29 00:00:00.000000000 Z
11
+ date: 2022-03-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake