yasuri 1.9.11 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +26 -1
- data/USAGE.ja.md +49 -1
- data/USAGE.md +51 -1
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +41 -10
- data/lib/yasuri/yasuri_links_node.rb +2 -2
- data/lib/yasuri/yasuri_node.rb +2 -2
- data/lib/yasuri/yasuri_node_generator.rb +7 -9
- data/lib/yasuri/yasuri_paginate_node.rb +11 -4
- data/lib/yasuri/yasuri_struct_node.rb +3 -3
- data/lib/yasuri/yasuri_text_node.rb +5 -5
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/spec_helper.rb +0 -5
- data/spec/yasuri_links_node_spec.rb +12 -4
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +87 -3
- data/spec/yasuri_struct_node_spec.rb +43 -2
- data/yasuri.gemspec +2 -2
- metadata +20 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7f360d6efb02954a5a54e2fc308d0cd0c2e5c129c52eba727fb0dfe4a40ce502
|
4
|
+
data.tar.gz: 8d8805a55c7ce16c76eb50945b954ad19327a3a63183eca098dac6ac93d2203b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ffe02aee78de5f30f1e583b2aca8c0617324bdbf62d7c64e371e90d139bac8b1d26df23e9725df0b81b946c6a465283f88a7d51945872c56e7be892eac1b5e4e
|
7
|
+
data.tar.gz: c8983dc2cd283c7de0d97357d2a8164426ee3e1017e73c498c0676716a1c9ab4c42cc02a836bf7e559877d50ca23df6fa656c0197b5018a4881997e2fb4c57d0
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# This workflow uses actions that are not certified by GitHub.
|
2
|
+
# They are provided by a third-party and are governed by
|
3
|
+
# separate terms of service, privacy policy, and support
|
4
|
+
# documentation.
|
5
|
+
# This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
|
6
|
+
# For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
|
7
|
+
|
8
|
+
name: Ruby
|
9
|
+
|
10
|
+
on:
|
11
|
+
push:
|
12
|
+
branches: [ master ]
|
13
|
+
pull_request:
|
14
|
+
branches: [ master ]
|
15
|
+
|
16
|
+
jobs:
|
17
|
+
test:
|
18
|
+
|
19
|
+
runs-on: ubuntu-latest
|
20
|
+
strategy:
|
21
|
+
matrix:
|
22
|
+
ruby-version: ['2.6', '2.7', '3.0']
|
23
|
+
|
24
|
+
steps:
|
25
|
+
- uses: actions/checkout@v2
|
26
|
+
- name: Set up Ruby
|
27
|
+
# To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
|
28
|
+
# change this to (see https://github.com/ruby/setup-ruby#versioning):
|
29
|
+
# uses: ruby/setup-ruby@v1
|
30
|
+
uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
|
31
|
+
with:
|
32
|
+
ruby-version: ${{ matrix.ruby-version }}
|
33
|
+
bundler-cache: true # runs 'bundle install' and caches installed gems automatically
|
34
|
+
- name: Run tests
|
35
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
3.0.0
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
# Yasuri
|
1
|
+
# Yasuri
|
2
|
+
[Build Status](https://travis-ci.org/tac0x2a/yasuri) [Coverage Status](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [Maintainability](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
|
2
3
|
|
3
4
|
Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)".
|
4
5
|
|
@@ -52,6 +53,19 @@ root = Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
|
|
52
53
|
text_content '//*[@id="contents"]/p[1]'
|
53
54
|
end
|
54
55
|
|
56
|
+
|
57
|
+
# Node tree constructing by YAML
|
58
|
+
src = <<-EOYAML
|
59
|
+
root:
|
60
|
+
node: links
|
61
|
+
path: "//*[@id='menu']/ul/li/a"
|
62
|
+
children:
|
63
|
+
- title: { node: text, path: "//*[@id='contents']/h2" }
|
64
|
+
- content: { node: text, path: "//*[@id='contents']/p[1]" }
|
65
|
+
EOYAML
|
66
|
+
root = Yasuri.yaml2tree(src)
|
67
|
+
|
68
|
+
|
55
69
|
# Node tree constructing by JSON
|
56
70
|
src = <<-EOJSON
|
57
71
|
{ "node" : "links",
|
@@ -78,6 +92,17 @@ result = root.inject(agent, root_page)
|
|
78
92
|
# => [ {"title" => "PageTitle", "content" => "Page Contents" }, ... ]
|
79
93
|
```
|
80
94
|
|
95
|
+
## Dev
|
96
|
+
```sh
|
97
|
+
$ gem install bundler
|
98
|
+
$ bundle install
|
99
|
+
```
|
100
|
+
### Test
|
101
|
+
```sh
|
102
|
+
$ rake
|
103
|
+
# or
|
104
|
+
$ rspec spec/*spec.rb
|
105
|
+
```
|
81
106
|
|
82
107
|
## Contributing
|
83
108
|
|
data/USAGE.ja.md
CHANGED
@@ -67,7 +67,7 @@ page = agent.get(uri)
|
|
67
67
|
tree.inject(agent, page)
|
68
68
|
```
|
69
69
|
|
70
|
-
ツリーは、DSL
|
70
|
+
ツリーは、json,yaml,またはDSLで定義することができます.上の例ではDSLで定義しています.
|
71
71
|
以下は、jsonで上記と等価な解析ツリーを定義した例です.
|
72
72
|
|
73
73
|
```ruby
|
@@ -87,6 +87,19 @@ EOJSON
|
|
87
87
|
tree = Yasuri.json2tree(src)
|
88
88
|
```
|
89
89
|
|
90
|
+
```ruby
|
91
|
+
# yaml で構成する場合
|
92
|
+
src = <<-EOYAML
|
93
|
+
title:
|
94
|
+
node: links
|
95
|
+
path: "/html/body/a"
|
96
|
+
children:
|
97
|
+
- name:
|
98
|
+
node: text
|
99
|
+
path: "/html/body/p"
|
100
|
+
EOYAML
|
101
|
+
tree = Yasuri.yaml2tree(src)
|
102
|
+
```
|
90
103
|
|
91
104
|
### Node
|
92
105
|
ツリーは入れ子になった *Node* で構成されます.
|
@@ -431,3 +444,38 @@ node.inject(agent, page)
|
|
431
444
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
432
445
|
```
|
433
446
|
この場合、PaginateNode は最大2つまでのページを開いてパースします.ページネーションは4つのページを持っているようですが、`limit:2`が指定されているため、結果の配列には2つの結果のみが含まれています.
|
447
|
+
|
448
|
+
##### `flatten`
|
449
|
+
取得した各ページの結果を展開します.
|
450
|
+
|
451
|
+
```ruby
|
452
|
+
agent = Mechanize.new
|
453
|
+
page = agent.get("http://yasuri.example.net/page01.html")
|
454
|
+
|
455
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" do # flatten 指定なし(デフォルト)
|
456
|
+
text_title '/html/head/title'
|
457
|
+
text_content '/html/body/p'
|
458
|
+
end
|
459
|
+
node.inject(agent, page)
|
460
|
+
|
461
|
+
#=> [ {"title" => "Page01",
|
462
|
+
"content" => "Pagination01"},
|
463
|
+
{"title" => "Page02",
|
464
|
+
"content" => "Pagination02"},
|
465
|
+
{"title" => "Page03",
|
466
|
+
"content" => "Pagination03"}]
|
467
|
+
|
468
|
+
|
469
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
470
|
+
text_title '/html/head/title'
|
471
|
+
text_content '/html/body/p'
|
472
|
+
end
|
473
|
+
node.inject(agent, page)
|
474
|
+
|
475
|
+
#=> [ "Page01",
|
476
|
+
"Pagination01",
|
477
|
+
"Page02",
|
478
|
+
"Pagination02",
|
479
|
+
"Page03",
|
480
|
+
"Pagination03"]
|
481
|
+
```
|
data/USAGE.md
CHANGED
@@ -69,7 +69,7 @@ page = agent.get(uri)
|
|
69
69
|
tree.inject(agent, page)
|
70
70
|
```
|
71
71
|
|
72
|
-
Tree is definable by
|
72
|
+
A tree can be defined in 3 (+1) ways: JSON, YAML, or the DSL (or plain Ruby code). The example above uses the DSL.
|
73
73
|
|
74
74
|
```ruby
|
75
75
|
# Construct by json.
|
@@ -88,6 +88,21 @@ EOJSON
|
|
88
88
|
tree = Yasuri.json2tree(src)
|
89
89
|
```
|
90
90
|
|
91
|
+
```ruby
|
92
|
+
# Construct by yaml.
|
93
|
+
src = <<-EOYAML
|
94
|
+
title:
|
95
|
+
node: links
|
96
|
+
path: "/html/body/a"
|
97
|
+
children:
|
98
|
+
- name:
|
99
|
+
node: text
|
100
|
+
path: "/html/body/p"
|
101
|
+
EOYAML
|
102
|
+
tree = Yasuri.yaml2tree(src)
|
103
|
+
```
|
104
|
+
|
105
|
+
|
91
106
|
### Node
|
92
107
|
Tree is constructed by nested Nodes.
|
93
108
|
Node has `Type`, `Name`, `Path`, `Children`, and `Options`.
|
@@ -429,3 +444,38 @@ node.inject(agent, page)
|
|
429
444
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
430
445
|
```
|
431
446
|
The Paginate Node opens at most the number of pages given by `limit` (2 in this example). Here the pagination has 4 pages, but the result Array contains only 2 items because `limit:2` was given.
|
447
|
+
|
448
|
+
##### `flatten`
|
449
|
+
The `flatten` option expands the results of every page into a single flat array of values.
|
450
|
+
|
451
|
+
```ruby
|
452
|
+
agent = Mechanize.new
|
453
|
+
page = agent.get("http://yasuri.example.net/page01.html")
|
454
|
+
|
455
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" do # without flatten (default)
|
456
|
+
text_title '/html/head/title'
|
457
|
+
text_content '/html/body/p'
|
458
|
+
end
|
459
|
+
node.inject(agent, page)
|
460
|
+
|
461
|
+
#=> [ {"title" => "Page01",
|
462
|
+
"content" => "Pagination01"},
|
463
|
+
{"title" => "Page02",
|
464
|
+
"content" => "Pagination02"},
|
465
|
+
{"title" => "Page03",
|
466
|
+
"content" => "Pagination03"}]
|
467
|
+
|
468
|
+
|
469
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
470
|
+
text_title '/html/head/title'
|
471
|
+
text_content '/html/body/p'
|
472
|
+
end
|
473
|
+
node.inject(agent, page)
|
474
|
+
|
475
|
+
#=> [ "Page01",
|
476
|
+
"Pagination01",
|
477
|
+
"Page02",
|
478
|
+
"Pagination02",
|
479
|
+
"Page03",
|
480
|
+
"Pagination03"]
|
481
|
+
```
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
require 'mechanize'
|
6
6
|
require 'json'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
require_relative 'yasuri_node'
|
9
10
|
require_relative 'yasuri_text_node'
|
@@ -23,9 +24,39 @@ module Yasuri
|
|
23
24
|
Yasuri.node2hash(node).to_json
|
24
25
|
end
|
25
26
|
|
26
|
-
def self.
|
27
|
-
|
28
|
-
|
27
|
+
def self.yaml2tree(yaml_string)
|
28
|
+
raise RuntimeError if yaml_string.nil? or yaml_string.empty?
|
29
|
+
|
30
|
+
yaml = YAML.load(yaml_string)
|
31
|
+
raise RuntimeError if yaml.keys.size < 1
|
32
|
+
|
33
|
+
root_key, root = yaml.keys.first, yaml.values.first
|
34
|
+
hash = Yasuri.yaml2tree_sub(root_key, root)
|
35
|
+
|
36
|
+
Yasuri.hash2node(hash)
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
def self.yaml2tree_sub(name, body)
|
41
|
+
return nil if name.nil? or body.nil?
|
42
|
+
|
43
|
+
new_body = Hash[:name, name]
|
44
|
+
body.each{|k,v| new_body[k.to_sym] = v}
|
45
|
+
body = new_body
|
46
|
+
|
47
|
+
return body if body[:children].nil?
|
48
|
+
|
49
|
+
body[:children] = body[:children].map do |c|
|
50
|
+
k, b = c.keys.first, c.values.first
|
51
|
+
Yasuri.yaml2tree_sub(k, b)
|
52
|
+
end
|
53
|
+
|
54
|
+
body
|
55
|
+
end
|
56
|
+
|
57
|
+
def self.method_missing(node_name, pattern, **opt, &block)
|
58
|
+
generated = Yasuri::NodeGenerator.gen(node_name, pattern, **opt, &block)
|
59
|
+
generated || super(node_name, **opt)
|
29
60
|
end
|
30
61
|
|
31
62
|
private
|
@@ -37,16 +68,16 @@ module Yasuri
|
|
37
68
|
}
|
38
69
|
Node2Text = Text2Node.invert
|
39
70
|
|
40
|
-
ReservedKeys =
|
71
|
+
ReservedKeys = %i|node name path children|
|
41
72
|
def self.hash2node(node_h)
|
42
73
|
node, name, path, children = ReservedKeys.map do |key|
|
43
74
|
node_h[key]
|
44
75
|
end
|
45
76
|
children ||= []
|
46
77
|
|
47
|
-
fail "Not found 'node' value in
|
48
|
-
fail "Not found 'name' value in
|
49
|
-
fail "Not found 'path' value in
|
78
|
+
fail "Not found 'node' value in map" if node.nil?
|
79
|
+
fail "Not found 'name' value in map" if name.nil?
|
80
|
+
fail "Not found 'path' value in map" if path.nil?
|
50
81
|
|
51
82
|
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
52
83
|
ReservedKeys.each{|key| node_h.delete(key)}
|
@@ -54,7 +85,7 @@ module Yasuri
|
|
54
85
|
|
55
86
|
klass = Text2Node[node.to_sym]
|
56
87
|
fail "Undefined node type #{node}" if klass.nil?
|
57
|
-
klass.new(path, name, childnodes, opt)
|
88
|
+
klass.new(path, name, childnodes, **opt)
|
58
89
|
end
|
59
90
|
|
60
91
|
def self.node2hash(node)
|
@@ -78,8 +109,8 @@ module Yasuri
|
|
78
109
|
json
|
79
110
|
end
|
80
111
|
|
81
|
-
def self.NodeName(name,
|
82
|
-
symbolize_names =
|
112
|
+
def self.NodeName(name, opt)
|
113
|
+
symbolize_names = opt[:symbolize_names]
|
83
114
|
symbolize_names ? name.to_sym : name
|
84
115
|
end
|
85
116
|
|
@@ -6,10 +6,10 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
10
|
retry_count = opt[:retry_count] || 5
|
11
11
|
|
12
|
-
links =
|
12
|
+
links = element.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
14
14
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,11 +7,11 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], opt: {})
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
-
def inject(agent, page, opt = {})
|
14
|
+
def inject(agent, page, opt = {}, element = page)
|
15
15
|
fail "#{Kernel.__method__} is not implemented."
|
16
16
|
end
|
17
17
|
def opts
|
@@ -15,26 +15,24 @@ module Yasuri
|
|
15
15
|
@nodes
|
16
16
|
end
|
17
17
|
|
18
|
-
def method_missing(name,
|
19
|
-
node = NodeGenerator.gen(name,
|
18
|
+
def method_missing(name, pattern, **args, &block)
|
19
|
+
node = NodeGenerator.gen(name, pattern, **args, &block)
|
20
20
|
raise "Undefined Node Name '#{name}'" if node == nil
|
21
21
|
@nodes << node
|
22
22
|
end
|
23
23
|
|
24
|
-
def self.gen(name,
|
25
|
-
xpath, opt = *args
|
26
|
-
opt = [opt].flatten.compact
|
24
|
+
def self.gen(name, xpath, **opt, &block)
|
27
25
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
28
26
|
|
29
27
|
case name
|
30
28
|
when /^text_(.+)$/
|
31
|
-
Yasuri::TextNode.new(xpath, $1, children || [],
|
29
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
32
30
|
when /^struct_(.+)$/
|
33
|
-
Yasuri::StructNode.new(xpath, $1, children || [],
|
31
|
+
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
34
32
|
when /^links_(.+)$/
|
35
|
-
Yasuri::LinksNode.new(xpath, $1, children || [],
|
33
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
36
34
|
when /^pages_(.+)$/
|
37
|
-
Yasuri::PaginateNode.new(xpath, $1, children || [],
|
35
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
38
36
|
else
|
39
37
|
nil
|
40
38
|
end
|
@@ -7,14 +7,17 @@ module Yasuri
|
|
7
7
|
class PaginateNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, flatten: false)
|
11
11
|
super(xpath, name, children)
|
12
|
-
@
|
12
|
+
@flatten = flatten
|
13
|
+
@limit = limit
|
13
14
|
end
|
14
15
|
|
15
|
-
def inject(agent, page, opt = {})
|
16
|
+
def inject(agent, page, opt = {}, element = page)
|
16
17
|
retry_count = opt[:retry_count] || 5
|
17
18
|
|
19
|
+
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
|
+
|
18
21
|
child_results = []
|
19
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
20
23
|
while page
|
@@ -32,10 +35,14 @@ module Yasuri
|
|
32
35
|
break if (limit -= 1) <= 0
|
33
36
|
end
|
34
37
|
|
38
|
+
if @flatten == true
|
39
|
+
return child_results.map{|h| h.values}.flatten
|
40
|
+
end
|
41
|
+
|
35
42
|
child_results
|
36
43
|
end
|
37
44
|
def opts
|
38
|
-
{limit:@limit}
|
45
|
+
{limit:@limit, flatten:@flatten}
|
39
46
|
end
|
40
47
|
end
|
41
48
|
end
|
@@ -6,12 +6,12 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
|
+
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
13
|
child_name = Yasuri.NodeName(child_node.name, opt)
|
14
|
-
[child_name, child_node.inject(agent,
|
14
|
+
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
@@ -7,11 +7,11 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
-
truncate =
|
14
|
-
proc
|
13
|
+
truncate = opt[:truncate]
|
14
|
+
proc = opt[:proc]
|
15
15
|
|
16
16
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
17
17
|
@truncate = truncate
|
@@ -21,8 +21,8 @@ module Yasuri
|
|
21
21
|
|
22
22
|
end
|
23
23
|
|
24
|
-
def inject(agent, page, opt = {})
|
25
|
-
node =
|
24
|
+
def inject(agent, page, opt = {}, element = page)
|
25
|
+
node = element.search(@xpath)
|
26
26
|
text = node.text.to_s
|
27
27
|
|
28
28
|
if @truncate
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>StructualLinksTest</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
|
7
|
+
<table>
|
8
|
+
<thead>
|
9
|
+
<tr>
|
10
|
+
<th>Title</th>
|
11
|
+
<th>Links</th>
|
12
|
+
</tr>
|
13
|
+
</thead>
|
14
|
+
<tr>
|
15
|
+
<td>Child01,02</td>
|
16
|
+
<td><a href="../child01.html">Child01</a></td>
|
17
|
+
<td><a href="../child02.html">Child02</a></td>
|
18
|
+
<td>../child02.html</td>
|
19
|
+
</tr>
|
20
|
+
|
21
|
+
<tr>
|
22
|
+
<td>Child01,02,03</td>
|
23
|
+
<td><a href="../child01.html">Child01</a></td>
|
24
|
+
<td><a href="../child02.html">Child02</a></td>
|
25
|
+
<td><a href="../child03.html">Child03</a></td>
|
26
|
+
</tr>
|
27
|
+
</table>
|
28
|
+
|
29
|
+
</body>
|
30
|
+
</html>
|
File without changes
|
data/spec/spec_helper.rb
CHANGED
@@ -12,11 +12,6 @@ shared_context 'httpserver' do
|
|
12
12
|
}
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
17
|
-
# require "codeclimate-test-reporter"
|
18
|
-
# CodeClimate::TestReporter.start
|
19
|
-
|
20
15
|
require 'simplecov'
|
21
16
|
require 'coveralls'
|
22
17
|
Coveralls.wear!
|
@@ -59,10 +59,18 @@ describe 'Yasuri' do
|
|
59
59
|
]
|
60
60
|
expect(actual).to match expected
|
61
61
|
end
|
62
|
-
it 'can be defined by DSL, return
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
it 'can be defined by DSL, return no contains if no child node' do
|
63
|
+
root_node = Yasuri.links_title '/html/body/a'
|
64
|
+
actual = root_node.inject(@agent, @index_page)
|
65
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
66
|
+
expect(actual).to match expected
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'can be defined return no contains if no child node' do
|
70
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
71
|
+
actual = root_node.inject(@agent, @index_page)
|
72
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
73
|
+
expect(actual).to match expected
|
66
74
|
end
|
67
75
|
it 'can be defined by DSL, return nested contents under link' do
|
68
76
|
generated = Yasuri.links_title '/html/body/a' do
|
@@ -30,6 +30,49 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
+
it "scrape each paginated pages with flatten" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
+
Yasuri::TextNode.new('./a', "text"),
|
38
|
+
]),
|
39
|
+
], flatten: true)
|
40
|
+
actual = root_node.inject(@agent, @page)
|
41
|
+
expected = [
|
42
|
+
"PaginationTest01",
|
43
|
+
{"text"=>""},
|
44
|
+
{"text"=>""},
|
45
|
+
{"text" => "2"},
|
46
|
+
{"text" => "3"},
|
47
|
+
{"text" => "4"},
|
48
|
+
{"text"=>"NextPage »"},
|
49
|
+
"PaginationTest02",
|
50
|
+
{"text"=>"« PreviousPage"},
|
51
|
+
{"text" => "1"},
|
52
|
+
{"text"=>""},
|
53
|
+
{"text" => "3"},
|
54
|
+
{"text" => "4"},
|
55
|
+
{"text"=>"NextPage »"},
|
56
|
+
"PaginationTest03",
|
57
|
+
{"text"=>"« PreviousPage"},
|
58
|
+
{"text" => "1"},
|
59
|
+
{"text" => "2"},
|
60
|
+
{"text"=>""},
|
61
|
+
{"text" => "4"},
|
62
|
+
{"text"=>"NextPage »"},
|
63
|
+
"PaginationTest04",
|
64
|
+
{"text"=>"« PreviousPage"},
|
65
|
+
{"text" => "1"},
|
66
|
+
{"text" => "2"},
|
67
|
+
{"text" => "3"},
|
68
|
+
{"text"=>""},
|
69
|
+
{"text"=>""},
|
70
|
+
]
|
71
|
+
|
72
|
+
expect(actual).to match expected
|
73
|
+
end
|
74
|
+
|
75
|
+
|
33
76
|
it "scrape each paginated pages limited" do
|
34
77
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
78
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -13,6 +13,89 @@ describe 'Yasuri' do
|
|
13
13
|
@index_page = @agent.get(@uri)
|
14
14
|
end
|
15
15
|
|
16
|
+
############
|
17
|
+
# yam2tree #
|
18
|
+
############
|
19
|
+
describe '.yaml2tree' do
|
20
|
+
it "fail if empty yaml" do
|
21
|
+
expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
|
22
|
+
end
|
23
|
+
|
24
|
+
it "return text node" do
|
25
|
+
src = <<-EOB
|
26
|
+
content:
|
27
|
+
node: text
|
28
|
+
path: "/html/body/p[1]"
|
29
|
+
EOB
|
30
|
+
generated = Yasuri.yaml2tree(src)
|
31
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
32
|
+
|
33
|
+
compare_generated_vs_original(generated, original, @index_page)
|
34
|
+
end
|
35
|
+
|
36
|
+
it "return text node as symbol" do
|
37
|
+
src = <<-EOB
|
38
|
+
:content:
|
39
|
+
:node: text
|
40
|
+
:path: "/html/body/p[1]"
|
41
|
+
EOB
|
42
|
+
generated = Yasuri.yaml2tree(src)
|
43
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
44
|
+
|
45
|
+
compare_generated_vs_original(generated, original, @index_page)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "return LinksNode/TextNode" do
|
49
|
+
|
50
|
+
src = <<-EOB
|
51
|
+
root:
|
52
|
+
node: links
|
53
|
+
path: "/html/body/a"
|
54
|
+
children:
|
55
|
+
- content:
|
56
|
+
node: text
|
57
|
+
path: "/html/body/p"
|
58
|
+
EOB
|
59
|
+
generated = Yasuri.yaml2tree(src)
|
60
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
61
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
62
|
+
])
|
63
|
+
|
64
|
+
compare_generated_vs_original(generated, original, @index_page)
|
65
|
+
end
|
66
|
+
|
67
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
68
|
+
src = <<-EOB
|
69
|
+
tables:
|
70
|
+
node: struct
|
71
|
+
path: "/html/body/table"
|
72
|
+
children:
|
73
|
+
- table:
|
74
|
+
node: struct
|
75
|
+
path: "./tr"
|
76
|
+
children:
|
77
|
+
- title:
|
78
|
+
node: text
|
79
|
+
path: "./td[1]"
|
80
|
+
- pub_date:
|
81
|
+
node: text
|
82
|
+
path: "./td[2]"
|
83
|
+
EOB
|
84
|
+
|
85
|
+
generated = Yasuri.yaml2tree(src)
|
86
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
87
|
+
Yasuri::StructNode.new('./tr', "table", [
|
88
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
89
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
90
|
+
])
|
91
|
+
])
|
92
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
93
|
+
compare_generated_vs_original(generated, original, page)
|
94
|
+
end
|
95
|
+
|
96
|
+
end # end of describe '.yaml2tree'
|
97
|
+
|
98
|
+
|
16
99
|
#############
|
17
100
|
# json2tree #
|
18
101
|
#############
|
@@ -39,7 +122,7 @@ describe 'Yasuri' do
|
|
39
122
|
"truncate" : "^[^,]+"
|
40
123
|
}|
|
41
124
|
generated = Yasuri.json2tree(src)
|
42
|
-
original = Yasuri::TextNode.new('/html/body/p[1]', "content",
|
125
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
43
126
|
compare_generated_vs_original(generated, original, @index_page)
|
44
127
|
end
|
45
128
|
|
@@ -126,7 +209,7 @@ describe 'Yasuri' do
|
|
126
209
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
210
|
])
|
128
211
|
])
|
129
|
-
page = @agent.get(@uri + "/structual_text.html")
|
212
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
130
213
|
compare_generated_vs_original(generated, original, page)
|
131
214
|
end
|
132
215
|
end
|
@@ -153,7 +236,7 @@ describe 'Yasuri' do
|
|
153
236
|
end
|
154
237
|
|
155
238
|
it "return text node with truncate_regexp" do
|
156
|
-
node = Yasuri::TextNode.new("/html/head/title", "title",
|
239
|
+
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
157
240
|
json = Yasuri.tree2json(node)
|
158
241
|
expected_str = %q| { "node": "text",
|
159
242
|
"name": "title",
|
@@ -193,6 +276,7 @@ describe 'Yasuri' do
|
|
193
276
|
"name" : "root",
|
194
277
|
"path" : "/html/body/nav/span/a[@class='next']",
|
195
278
|
"limit" : 10,
|
279
|
+
"flatten" : false,
|
196
280
|
"children" : [ { "node" : "text",
|
197
281
|
"name" : "content",
|
198
282
|
"path" : "/html/body/p"
|
@@ -12,7 +12,7 @@ describe 'Yasuri' do
|
|
12
12
|
describe '::StructNode' do
|
13
13
|
before do
|
14
14
|
@agent = Mechanize.new
|
15
|
-
@page = @agent.get(uri + "/structual_text.html")
|
15
|
+
@page = @agent.get(uri + "/struct/structual_text.html")
|
16
16
|
|
17
17
|
@table_1996 = [
|
18
18
|
{ "title" => "The Perfect Insider",
|
@@ -126,10 +126,51 @@ describe 'Yasuri' do
|
|
126
126
|
Yasuri::TextNode.new('./td[1]', "title"),
|
127
127
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
128
128
|
])
|
129
|
-
expected = @table_1996.map{|h|
|
129
|
+
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
130
130
|
actual = node.inject(@agent, @page, symbolize_names:true)
|
131
131
|
expect(actual).to match expected
|
132
132
|
end
|
133
133
|
|
134
134
|
end
|
135
|
+
|
136
|
+
describe '::StructNode::Links' do
|
137
|
+
before do
|
138
|
+
@agent = Mechanize.new
|
139
|
+
@page = @agent.get(uri + "/struct/structual_links.html")
|
140
|
+
|
141
|
+
@table = [
|
142
|
+
{ "title" => "Child01,02",
|
143
|
+
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}] },
|
144
|
+
|
145
|
+
{ "title" => "Child01,02,03",
|
146
|
+
"child" => [{"p" => "Child 01 page."}, {"p" => "Child 02 page."}, {"p" => "Child 03 page."}]}
|
147
|
+
]
|
148
|
+
end
|
149
|
+
|
150
|
+
it 'return child node in links inside struct' do
|
151
|
+
node = Yasuri::StructNode.new('/html/body/table/tr', "table", [
|
152
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
+
Yasuri::LinksNode.new('./td/a', "child", [
|
154
|
+
Yasuri::TextNode.new('/html/body/p', "p"),
|
155
|
+
])
|
156
|
+
])
|
157
|
+
expected = @table
|
158
|
+
actual = node.inject(@agent, @page)
|
159
|
+
expect(actual).to match expected
|
160
|
+
end
|
161
|
+
end # descrive
|
162
|
+
|
163
|
+
describe '::StructNode::Pages' do
|
164
|
+
before do
|
165
|
+
@agent = Mechanize.new
|
166
|
+
@page = @agent.get(uri + "/struct/structual_text.html") #dummy
|
167
|
+
end
|
168
|
+
|
169
|
+
it 'not supported' do
|
170
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
171
|
+
Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "pages", [])
|
172
|
+
])
|
173
|
+
expect{ node.inject(@agent, @page) }.to raise_error(NotImplementedError, "PagenateNode inside StructNode, Not Supported")
|
174
|
+
end
|
175
|
+
end
|
135
176
|
end
|
data/yasuri.gemspec
CHANGED
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_development_dependency "bundler"
|
22
|
-
spec.add_development_dependency "rake"
|
21
|
+
spec.add_development_dependency "bundler"
|
22
|
+
spec.add_development_dependency "rake"
|
23
23
|
spec.add_development_dependency "rspec"
|
24
24
|
spec.add_development_dependency "fuubar"
|
25
25
|
spec.add_development_dependency "glint"
|
metadata
CHANGED
@@ -1,43 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -144,8 +144,10 @@ extensions: []
|
|
144
144
|
extra_rdoc_files: []
|
145
145
|
files:
|
146
146
|
- ".coveralls.yml"
|
147
|
+
- ".github/workflows/ruby.yml"
|
147
148
|
- ".gitignore"
|
148
149
|
- ".rspec"
|
150
|
+
- ".ruby-version"
|
149
151
|
- ".travis.yml"
|
150
152
|
- Gemfile
|
151
153
|
- LICENSE
|
@@ -174,7 +176,8 @@ files:
|
|
174
176
|
- spec/htdocs/pagination/page02.html
|
175
177
|
- spec/htdocs/pagination/page03.html
|
176
178
|
- spec/htdocs/pagination/page04.html
|
177
|
-
- spec/htdocs/
|
179
|
+
- spec/htdocs/struct/structual_links.html
|
180
|
+
- spec/htdocs/struct/structual_text.html
|
178
181
|
- spec/servers/httpserver.rb
|
179
182
|
- spec/spec_helper.rb
|
180
183
|
- spec/yasuri_links_node_spec.rb
|
@@ -188,7 +191,7 @@ homepage: https://github.com/tac0x2a/yasuri
|
|
188
191
|
licenses:
|
189
192
|
- MIT
|
190
193
|
metadata: {}
|
191
|
-
post_install_message:
|
194
|
+
post_install_message:
|
192
195
|
rdoc_options: []
|
193
196
|
require_paths:
|
194
197
|
- lib
|
@@ -203,9 +206,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
203
206
|
- !ruby/object:Gem::Version
|
204
207
|
version: '0'
|
205
208
|
requirements: []
|
206
|
-
|
207
|
-
|
208
|
-
signing_key:
|
209
|
+
rubygems_version: 3.2.3
|
210
|
+
signing_key:
|
209
211
|
specification_version: 4
|
210
212
|
summary: Yasuri is easy scraping library.
|
211
213
|
test_files:
|
@@ -220,7 +222,8 @@ test_files:
|
|
220
222
|
- spec/htdocs/pagination/page02.html
|
221
223
|
- spec/htdocs/pagination/page03.html
|
222
224
|
- spec/htdocs/pagination/page04.html
|
223
|
-
- spec/htdocs/
|
225
|
+
- spec/htdocs/struct/structual_links.html
|
226
|
+
- spec/htdocs/struct/structual_text.html
|
224
227
|
- spec/servers/httpserver.rb
|
225
228
|
- spec/spec_helper.rb
|
226
229
|
- spec/yasuri_links_node_spec.rb
|