digger 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/digger.gemspec +10 -9
- data/lib/digger/page.rb +6 -1
- data/lib/digger/pattern.rb +11 -4
- data/lib/digger/version.rb +1 -1
- data/spec/page_spec.rb +7 -7
- data/spec/pattern_spec.rb +15 -10
- metadata +18 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 19e59bc2161a078d80d00adf538a7c33891a53f9beeb453748eec7e0810c5b65
|
4
|
+
data.tar.gz: da1d93c663b42a6e0b7be2f136bea3b0f3a86c3c36c57a72347c70e7e0538508
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d13b06c8491f9cda42f8a8fd4fa9547aa6a95d075efbf52c789760adaa66d0c2bde24f810e23e4713df4362ae82cb82b0d61996386caaad2c685eb22f1a375db
|
7
|
+
data.tar.gz: a63a5cfe70b154b446dcbd0d2937d94a33fb033277c0d17aa1500a9ccd86dc8791e3cc40e8487ecc1c609eec304b91fb4a29b75b89a74b8efe24e29c49957d4e
|
data/digger.gemspec
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
2
3
|
lib = File.expand_path('../lib', __FILE__)
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
5
|
require 'digger/version'
|
5
6
|
|
6
7
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
8
|
+
spec.name = 'digger'
|
8
9
|
spec.version = Digger::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
10
|
+
spec.authors = ['binz']
|
11
|
+
spec.email = ['xinkiang@gmail.com']
|
11
12
|
spec.summary = %q{Dig need stractual infomation from web page.}
|
12
13
|
spec.description = %q{}
|
13
|
-
spec.homepage =
|
14
|
-
spec.license =
|
14
|
+
spec.homepage = ''
|
15
|
+
spec.license = 'MIT'
|
15
16
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
19
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
20
21
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
22
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
23
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
24
|
|
24
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
25
25
|
spec.add_runtime_dependency 'http-cookie', '~> 1.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.6'
|
26
27
|
end
|
data/lib/digger/page.rb
CHANGED
@@ -4,6 +4,7 @@ require 'ostruct'
|
|
4
4
|
require 'set'
|
5
5
|
require 'kconv'
|
6
6
|
require 'uri'
|
7
|
+
require 'http/cookie'
|
7
8
|
|
8
9
|
# https://github.com/taganaka/polipus/blob/master/lib/polipus/page.rb
|
9
10
|
module Digger
|
@@ -101,6 +102,10 @@ module Digger
|
|
101
102
|
@jsonp ||= JSON.parse body.match(/^[^(]+?\((.+)\)[^)]*$/)[1]
|
102
103
|
end
|
103
104
|
|
105
|
+
def cookies
|
106
|
+
@cookies ||= (headers['set-cookie'] || []).flat_map { |c| ::HTTP::Cookie.parse(c, url) }
|
107
|
+
end
|
108
|
+
|
104
109
|
#
|
105
110
|
# Discard links, a next call of page.links will return an empty array
|
106
111
|
#
|
@@ -273,4 +278,4 @@ module Digger
|
|
273
278
|
from_hash hash
|
274
279
|
end
|
275
280
|
end
|
276
|
-
end
|
281
|
+
end
|
data/lib/digger/pattern.rb
CHANGED
@@ -11,7 +11,7 @@ module Digger
|
|
11
11
|
|
12
12
|
def safe_block(&default_block)
|
13
13
|
if block.nil? || (block.is_a?(String) && block.strip.empty?)
|
14
|
-
default_block
|
14
|
+
default_block || ->(v) { v }
|
15
15
|
elsif block.respond_to?(:call)
|
16
16
|
block
|
17
17
|
else
|
@@ -32,25 +32,32 @@ module Digger
|
|
32
32
|
TYPES_CSS = %w[css_one css_many].freeze
|
33
33
|
TYPES_JSON = %w[json jsonp].freeze
|
34
34
|
|
35
|
-
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON
|
35
|
+
TYPES = TYPES_REGEXP + TYPES_CSS + TYPES_JSON + ['cookie']
|
36
36
|
|
37
37
|
def match_page(page)
|
38
38
|
return unless page.success?
|
39
|
+
|
39
40
|
if TYPES_REGEXP.include?(type) # regular expression
|
40
41
|
regexp_match(page.body)
|
41
42
|
elsif TYPES_CSS.include?(type) # css expression
|
42
43
|
css_match(page.doc)
|
43
44
|
elsif TYPES_JSON.include?(type)
|
44
45
|
json_match(page)
|
46
|
+
else
|
47
|
+
cookie_get(page.cookies)
|
45
48
|
end
|
46
49
|
end
|
47
50
|
|
51
|
+
def cookie_get(cookies)
|
52
|
+
cookie = cookies.find { |c| c.name == value }&.value
|
53
|
+
safe_block.call(cookie)
|
54
|
+
end
|
55
|
+
|
48
56
|
def json_match(page)
|
49
|
-
block = safe_block { |j| j }
|
50
57
|
json = page.send(type)
|
51
58
|
keys = json_index_keys(value)
|
52
59
|
match = json_fetch(json, keys)
|
53
|
-
|
60
|
+
safe_block.call(match)
|
54
61
|
end
|
55
62
|
|
56
63
|
def css_match(doc)
|
data/lib/digger/version.rb
CHANGED
data/spec/page_spec.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'digger'
|
2
2
|
require 'json'
|
3
3
|
require 'uri'
|
4
|
+
require 'cgi'
|
4
5
|
|
5
6
|
describe Digger::Page do
|
6
7
|
it 'page json' do
|
@@ -15,13 +16,12 @@ describe Digger::Page do
|
|
15
16
|
|
16
17
|
it 'fetch baidu' do
|
17
18
|
http = Digger::HTTP.new
|
18
|
-
page = http.fetch_page('http://
|
19
|
+
page = http.fetch_page('http://baidu.com/')
|
19
20
|
expect(page.code).to eq(200)
|
20
21
|
end
|
21
22
|
|
22
|
-
it 'page uri' do
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
23
|
+
# it 'page uri' do
|
24
|
+
# link = 'https://www.baidu.com/s?wd=%E5%93%88%E5%93%88#hello'
|
25
|
+
# link = link.to_s.encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '').gsub(/#\w*$/, '')
|
26
|
+
# end
|
27
|
+
end
|
data/spec/pattern_spec.rb
CHANGED
@@ -2,14 +2,19 @@ require 'digger'
|
|
2
2
|
require 'json'
|
3
3
|
|
4
4
|
describe Digger::Pattern do
|
5
|
-
it 'json fetch' do
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
end
|
13
|
-
|
5
|
+
# it 'json fetch' do
|
6
|
+
# json = JSON.parse('[{"a":1,"b":[1,2,3]}]')
|
7
|
+
# pt = Digger::Pattern.new
|
8
|
+
# expect(pt.json_fetch(json, '$[0]')['a']).to eq(1)
|
9
|
+
# expect(pt.json_fetch(json, '$[0].a')).to eq(1)
|
10
|
+
# expect(pt.json_fetch(json, '$[0].b').length).to eq(3)
|
11
|
+
# expect(pt.json_fetch(json, '$[0].b[2]')).to eq(3)
|
12
|
+
# end
|
14
13
|
|
15
|
-
|
14
|
+
it 'parse cookoe' do
|
15
|
+
page = Digger::HTTP.new.fetch_page('https://xueqiu.com/')
|
16
|
+
pt = Digger::Pattern.new({ type: 'cookie', value: 'xq_a_token', block: ->(v) { "!!#{v}" } })
|
17
|
+
result = pt.match_page(page)
|
18
|
+
expect(result.length).to eq(42)
|
19
|
+
end
|
20
|
+
end
|
metadata
CHANGED
@@ -1,71 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- binz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-12-
|
11
|
+
date: 2021-12-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 12.3.3
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 12.3.3
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '2.0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: '2.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: http-cookie
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '1.
|
47
|
+
version: '1.0'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
54
|
+
version: '1.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
61
|
+
version: '1.6'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '1.
|
68
|
+
version: '1.6'
|
69
69
|
description: ''
|
70
70
|
email:
|
71
71
|
- xinkiang@gmail.com
|