digger 0.1.9 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ed96af1c5ae92569e1de4885958ac5852864c045f6c6337d5f17d91747d8ed80
4
- data.tar.gz: 82003ae80f54cd3f9b805757e5dcb4c7894bba91c4c376cf08ebd43e6de6e80b
3
+ metadata.gz: 025bfc4f8d6dbf55994363070b67a450fd421457c4f12bf9b714fe965aefaad6
4
+ data.tar.gz: a8464f07a53c332ddb40457998b830b1dff99661431a7608d9dc075175f5e9f1
5
5
  SHA512:
6
- metadata.gz: b7aad69fd46c7d1688026ece2e1efe14d7dea29b42f94656d794655e12a92677bd3e1034f0c776bf197bcd75c96bd49377df399433bd2e1c13507520af1addc5
7
- data.tar.gz: 60055a69ec3ad77e80fc4f1b50bb3a6c298e2274827ae21b64d1d82dae53a1d5338ccc99bcdb09c7d2e946abd007a488ff3034a753bc54586e9b651aeb3c5ce7
6
+ metadata.gz: d84a2f8ee2d65c16d79b66795fc618b07fb5ba4517d902eec7e17107b69c240b36f68728de49f451fd143ddeb872743a906b608d413446971e5f38a632cf6fbe
7
+ data.tar.gz: 4f36e035bab956bd6702024d951eb9963f2f97812f355e00b0840148eaa4fef07949abac0f874f889c88703addd13ea2bc69efa61bac3872b14f371f7fa47ded
@@ -6,7 +6,9 @@ module Digger
6
6
  attr_accessor :type, :value, :block
7
7
 
8
8
  def initialize(hash = {})
9
- hash.each_pair { |key, value| send("#{key}=", value) if %w[type value block].include?(key.to_s)}
9
+ hash.each_pair do |key, value|
10
+ send("#{key}=", value) if %w[type value block].include?(key.to_s)
11
+ end
10
12
  end
11
13
 
12
14
  def safe_block(&default_block)
@@ -28,11 +30,12 @@ module Digger
28
30
 
29
31
  MATCH_MAX = 3
30
32
 
31
- TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many]
32
- TYPES_CSS = %w[css_one css_many].freeze
33
+ TYPES_REGEXP = 0.upto(MATCH_MAX).map { |i| "match_#{i}" } + %w[match_many match_all]
34
+ TYPES_CSS = %w[css_one css_many css_all].freeze
33
35
  TYPES_JSON = %w[json jsonp].freeze
36
+ TYPES_OTHER = %w[cookie plain lines header body].freeze
34
37
 
35
- TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + ['cookie']
38
+ TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + TYPES_OTHER
36
39
 
37
40
  def match_page(page)
38
41
  return unless page.success?
@@ -43,13 +46,31 @@ module Digger
43
46
  css_match(page.doc)
44
47
  elsif TYPES_JSON.include?(type)
45
48
  json_match(page)
46
- else
47
- cookie_get(page.cookies)
49
+ elsif TYPES_OTHER.include?(type)
50
+ send("get_#{type}", page)
48
51
  end
49
52
  end
50
53
 
51
- def cookie_get(cookies)
52
- cookie = cookies.find { |c| c.name == value }&.value
54
+ def get_header(page)
55
+ header = (page.headers[value.to_s.downcase] || []).first
56
+ safe_block.call(header)
57
+ end
58
+
59
+ def get_body(page)
60
+ safe_block.call(page.body)
61
+ end
62
+
63
+ def get_plain(page)
64
+ safe_block.call(page.doc&.text)
65
+ end
66
+
67
+ def get_lines(page)
68
+ block = safe_block
69
+ page.body.split("\n").map(&:strip).filter { |line| !line.empty? }.map { |line| block.call(line) }
70
+ end
71
+
72
+ def get_cookie(page)
73
+ cookie = page.cookies.find { |c| c.name == value }&.value
53
74
  safe_block.call(cookie)
54
75
  end
55
76
 
@@ -61,24 +82,36 @@ module Digger
61
82
  end
62
83
 
63
84
  def css_match(doc)
64
- block = safe_block { |node| node.content.strip }
65
85
  # content is Nokogiri::HTML::Document
66
86
  contents = doc.css(value)
67
87
  if type == 'css_many'
68
- contents.map { |node| block.call(node) }.uniq
88
+ block = safe_block { |node| node&.content&.strip }
89
+ contents.map { |node| block.call(node) }
90
+ elsif type == 'css_all'
91
+ block = safe_block
92
+ block.call(contents)
69
93
  else
94
+ block = safe_block { |node| node&.content&.strip }
70
95
  block.call(contents.first)
71
96
  end
72
97
  end
73
98
 
74
99
  def regexp_match(body)
75
- block = safe_block(&:strip)
76
100
  # content is String
77
- if type == 'match_many'
78
- body.gsub(value).to_a.map { |node| block.call(node) }.uniq
101
+ if %w[match_many match_all].include? type
102
+ regexp = value.is_a?(Regexp) ? value : Regexp.new(value.to_s)
103
+ matches = body.gsub(regexp).to_a
104
+ if type == 'match_many'
105
+ block = safe_block(&:strip)
106
+ matches.map { |node| block.call(node) }
107
+ else
108
+ block = safe_block
109
+ block.call(matches)
110
+ end
79
111
  else
80
112
  index = TYPES_REGEXP.index(type)
81
113
  matches = body.match(value)
114
+ block = safe_block(&:strip)
82
115
  block.call(matches[index]) unless matches.nil?
83
116
  end
84
117
  end
@@ -100,21 +133,24 @@ module Digger
100
133
 
101
134
  private :json_index_keys, :json_fetch
102
135
 
103
- # Nokogiri node methods
104
- class Nokogiri::XML::Node
105
- %w[one many].each do |name|
106
- define_method "inner_#{name}" do |css, &block|
107
- callback = ->(node) { (block || ->(n) { n.text.strip }).call(node) if node }
108
- if name == 'one' # inner_one
109
- callback.call(self.css(css).first)
110
- else # inner_many
111
- self.css(css).map { |node| callback.call(node) }
112
- end
113
- end
136
+ class ::Nokogiri::XML::Node
137
+ def inner_one(expr, &block)
138
+ fn = block || ->(node) { node&.content&.strip }
139
+ fn.call(css(expr)&.first)
140
+ end
141
+
142
+ def inner_many(expr, &block)
143
+ fn = block || ->(node) { node&.content&.strip }
144
+ css(expr)&.map { |node| fn.call(node) }
114
145
  end
146
+
115
147
  def source
116
148
  to_xml
117
149
  end
150
+
151
+ def inner_number
152
+ content&.match(/\d+/).to_s.to_i
153
+ end
118
154
  end
119
155
  end
120
156
  end
@@ -1,3 +1,3 @@
1
1
  module Digger
2
- VERSION = '0.1.9'.freeze
2
+ VERSION = '0.2.2'.freeze
3
3
  end
data/spec/pattern_spec.rb CHANGED
@@ -11,10 +11,23 @@ describe Digger::Pattern do
11
11
  # expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
12
12
  # end
13
13
 
14
- it 'parse cookoe' do
14
+ it 'parse cookie & others' do
15
15
  page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
16
- pt = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
17
- result = pt.match_page(page)
16
+ p1 = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
17
+ # cookie
18
+ result = p1.match_page(page)
18
19
  expect(result.length).to eq(42)
20
+ # header
21
+ p2 = Digger::Pattern.new({ type: 'header', value: 'transfer-encoding' })
22
+ expect(p2.match_page(page)).to eq('chunked')
23
+ # get_plain
24
+ p3 = Digger::Pattern.new({ type: 'plain' })
25
+ expect(p3.match_page(page).length).to be > 100
26
+ end
27
+
28
+ it 'match_all & css_all' do
29
+ p = Digger::Pattern.new({ type: 'match_all', value: '[\d]+' })
30
+ m = p.regexp_match('123,12,1')
31
+ expect(m.length).to eq(3)
19
32
  end
20
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - binz
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-29 00:00:00.000000000 Z
11
+ date: 2022-03-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake