yasuri 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +38 -127
- data/lib/yasuri/yasuri_links_node.rb +23 -0
- data/lib/yasuri/yasuri_node.rb +21 -0
- data/lib/yasuri/yasuri_node_generator.rb +46 -0
- data/lib/yasuri/yasuri_paginate_node.rb +39 -0
- data/lib/yasuri/yasuri_struct_node.rb +19 -0
- data/lib/yasuri/yasuri_text_node.rb +32 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/yasuri_links_node_spec.rb +94 -0
- data/spec/yasuri_node_spec.rb +11 -0
- data/spec/yasuri_paginate_node_spec.rb +85 -0
- data/spec/yasuri_spec.rb +93 -319
- data/spec/yasuri_struct_node_spec.rb +114 -0
- data/spec/yasuri_text_node_spec.rb +61 -0
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 18c8b6da6ca1f9d5433adc128b83ed5d5a8e353e
|
4
|
+
data.tar.gz: 07ba467f8d62982e4a8e969da42b839d8ee07664
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 232a5893c4511b0ef80b34a95d58af4d1bb8683512a1cb41b7f9a6d19def75d3c825b12c383bf51bab09e12fe8bd54cd91b79b32640319b60d6185a46ed7f086
|
7
|
+
data.tar.gz: 4cb31e01b60861d13770b8b9d033cef35fe6bbdd226d11703826040840ac979fcfd98c162b18322baa8f9102fd2ed059ac0c15c277c49217feaf39173800dea6
|
data/README.md
CHANGED
@@ -2,6 +2,13 @@
|
|
2
2
|
|
3
3
|
Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)".
|
4
4
|
|
5
|
+
|
6
|
+
## Sample
|
7
|
+
|
8
|
+
https://yasuri-sample.herokuapp.com/
|
9
|
+
|
10
|
+
(source code: https://github.com/tac0x2a/yasuri-sample)
|
11
|
+
|
5
12
|
## Installation
|
6
13
|
|
7
14
|
Add this line to your application's Gemfile:
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -5,148 +5,38 @@
|
|
5
5
|
require 'mechanize'
|
6
6
|
require 'json'
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
@xpath, @name, @children = xpath, name, children
|
15
|
-
end
|
16
|
-
|
17
|
-
def inject(agent, page)
|
18
|
-
fail "#{Kernel.__method__} is not implemented."
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
class TextNode
|
23
|
-
include Node
|
24
|
-
def initialize(xpath, name, children = [], truncate_regexp: nil, opt: {})
|
25
|
-
super(xpath, name, children)
|
26
|
-
@truncate_regexp = truncate_regexp
|
27
|
-
end
|
28
|
-
def inject(agent, page, retry_count = 5)
|
29
|
-
node = page.search(@xpath)
|
30
|
-
text = node.text.to_s
|
31
|
-
|
32
|
-
text = text[@truncate_regexp, 0] if @truncate_regexp
|
33
|
-
|
34
|
-
text.to_s
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
class StructNode
|
39
|
-
include Node
|
40
|
-
def inject(agent, page, retry_count = 5)
|
41
|
-
sub_tags = page.search(@xpath)
|
42
|
-
sub_tags.map do |sub_tag|
|
43
|
-
child_results_kv = @children.map do |child_node|
|
44
|
-
[child_node.name, child_node.inject(agent, sub_tag, retry_count)]
|
45
|
-
end
|
46
|
-
Hash[child_results_kv]
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
class LinksNode
|
52
|
-
include Node
|
53
|
-
def inject(agent, page, retry_count = 5)
|
54
|
-
links = page.search(@xpath) || [] # links expected
|
55
|
-
links.map do |link|
|
56
|
-
link_button = Mechanize::Page::Link.new(link, agent, page)
|
57
|
-
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
58
|
-
|
59
|
-
child_results_kv = @children.map do |child_node|
|
60
|
-
[child_node.name, child_node.inject(agent, child_page, retry_count)]
|
61
|
-
end
|
62
|
-
|
63
|
-
Hash[child_results_kv]
|
64
|
-
end # each named child node
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
class PaginateNode
|
69
|
-
include Node
|
70
|
-
|
71
|
-
def initialize(xpath, name, children = [], limit: nil, opt: {})
|
72
|
-
super(xpath, name, children)
|
73
|
-
@limit = limit || opt["limit"] || Float::MAX
|
74
|
-
end
|
75
|
-
|
76
|
-
def inject(agent, page, retry_count = 5)
|
8
|
+
require_relative 'yasuri_node'
|
9
|
+
require_relative 'yasuri_text_node'
|
10
|
+
require_relative 'yasuri_struct_node'
|
11
|
+
require_relative 'yasuri_paginate_node'
|
12
|
+
require_relative 'yasuri_links_node'
|
13
|
+
require_relative 'yasuri_node_generator'
|
77
14
|
|
78
|
-
|
79
|
-
while page
|
80
|
-
child_results_kv = @children.map do |child_node|
|
81
|
-
[child_node.name, child_node.inject(agent, page, retry_count)]
|
82
|
-
end
|
83
|
-
child_results << Hash[child_results_kv]
|
84
|
-
|
85
|
-
link = page.search(@xpath).first
|
86
|
-
break if link == nil
|
87
|
-
|
88
|
-
link_button = Mechanize::Page::Link.new(link, agent, page)
|
89
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
90
|
-
break if (@limit -= 1) <= 0
|
91
|
-
end
|
92
|
-
|
93
|
-
child_results
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
class NodeGenerator
|
98
|
-
def gen_recursive(&block)
|
99
|
-
@nodes = []
|
100
|
-
instance_eval(&block)
|
101
|
-
@nodes
|
102
|
-
end
|
103
|
-
|
104
|
-
def method_missing(name, *args, &block)
|
105
|
-
node = NodeGenerator.gen(name, *args, &block)
|
106
|
-
raise "Undefined Node Name '#{name}'" if node == nil
|
107
|
-
@nodes << node
|
108
|
-
end
|
109
|
-
|
110
|
-
def self.gen(name, *args, &block)
|
111
|
-
xpath, opt = *args
|
112
|
-
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
113
|
-
|
114
|
-
case name
|
115
|
-
when /^text_(.+)$/
|
116
|
-
truncate_regexp = opt
|
117
|
-
Yasuri::TextNode.new(xpath, $1, truncate_regexp)
|
118
|
-
when /^struct_(.+)$/
|
119
|
-
Yasuri::StructNode.new(xpath, $1, children || [])
|
120
|
-
when /^links_(.+)$/
|
121
|
-
Yasuri::LinksNode.new(xpath, $1, children || [])
|
122
|
-
when /^pages_(.+)$/
|
123
|
-
xpath, limit = *args
|
124
|
-
limit = limit || Float::MAX
|
125
|
-
Yasuri::PaginateNode.new(xpath, $1, children || [], limit: limit)
|
126
|
-
else
|
127
|
-
nil
|
128
|
-
end
|
129
|
-
end # of self.gen(name, *args, &block)
|
130
|
-
end # of class NodeGenerator
|
15
|
+
module Yasuri
|
131
16
|
|
132
17
|
def self.json2tree(json_string)
|
133
18
|
json = JSON.parse(json_string)
|
134
19
|
Yasuri.hash2node(json)
|
135
20
|
end
|
136
21
|
|
22
|
+
def self.tree2json(node)
|
23
|
+
Yasuri.node2hash(node).to_json
|
24
|
+
end
|
25
|
+
|
137
26
|
def self.method_missing(name, *args, &block)
|
138
27
|
generated = Yasuri::NodeGenerator.gen(name, *args, &block)
|
139
28
|
generated || super(name, args)
|
140
29
|
end
|
141
30
|
|
142
|
-
|
143
31
|
private
|
144
32
|
Text2Node = {
|
145
|
-
"text" => TextNode,
|
146
|
-
"struct" => StructNode,
|
147
|
-
"links" => LinksNode,
|
148
|
-
"pages" => PaginateNode
|
33
|
+
"text" => Yasuri::TextNode,
|
34
|
+
"struct" => Yasuri::StructNode,
|
35
|
+
"links" => Yasuri::LinksNode,
|
36
|
+
"pages" => Yasuri::PaginateNode
|
149
37
|
}
|
38
|
+
Node2Text = Text2Node.invert
|
39
|
+
|
150
40
|
ReservedKeys = %w|node name path children|
|
151
41
|
def self.hash2node(node_h)
|
152
42
|
node, name, path, children = ReservedKeys.map do |key|
|
@@ -162,6 +52,27 @@ module Yasuri
|
|
162
52
|
klass ? klass.new(path, name, childnodes, opt: opt) : nil
|
163
53
|
end
|
164
54
|
|
55
|
+
def self.node2hash(node)
|
56
|
+
json = JSON.parse("{}")
|
57
|
+
return json if node.nil?
|
58
|
+
|
59
|
+
klass = node.class
|
60
|
+
klass_str = Node2Text[klass]
|
61
|
+
|
62
|
+
json["node"] = klass_str
|
63
|
+
json["name"] = node.name
|
64
|
+
json["path"] = node.xpath
|
65
|
+
|
66
|
+
children = node.children.map{|c| Yasuri.node2hash(c)}
|
67
|
+
json["children"] = children if not children.empty?
|
68
|
+
|
69
|
+
node.opts.each do |key,value|
|
70
|
+
json[key] = value if not value.nil?
|
71
|
+
end
|
72
|
+
|
73
|
+
json
|
74
|
+
end
|
75
|
+
|
165
76
|
def self.with_retry(retry_count = 5)
|
166
77
|
begin
|
167
78
|
return yield() if block_given?
|
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class LinksNode
|
8
|
+
include Node
|
9
|
+
def inject(agent, page, retry_count = 5)
|
10
|
+
links = page.search(@xpath) || [] # links expected
|
11
|
+
links.map do |link|
|
12
|
+
link_button = Mechanize::Page::Link.new(link, agent, page)
|
13
|
+
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
14
|
+
|
15
|
+
child_results_kv = @children.map do |child_node|
|
16
|
+
[child_node.name, child_node.inject(agent, child_page, retry_count)]
|
17
|
+
end
|
18
|
+
|
19
|
+
Hash[child_results_kv]
|
20
|
+
end # each named child node
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
module Node
|
8
|
+
attr_reader :url, :xpath, :name, :children
|
9
|
+
|
10
|
+
def initialize(xpath, name, children = [], opt: {})
|
11
|
+
@xpath, @name, @children = xpath, name, children
|
12
|
+
end
|
13
|
+
|
14
|
+
def inject(agent, page)
|
15
|
+
fail "#{Kernel.__method__} is not implemented."
|
16
|
+
end
|
17
|
+
def opts
|
18
|
+
{}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
require_relative 'yasuri_text_node'
|
6
|
+
require_relative 'yasuri_struct_node'
|
7
|
+
require_relative 'yasuri_links_node'
|
8
|
+
require_relative 'yasuri_paginate_node'
|
9
|
+
|
10
|
+
module Yasuri
|
11
|
+
class NodeGenerator
|
12
|
+
def gen_recursive(&block)
|
13
|
+
@nodes = []
|
14
|
+
instance_eval(&block)
|
15
|
+
@nodes
|
16
|
+
end
|
17
|
+
|
18
|
+
def method_missing(name, *args, &block)
|
19
|
+
node = NodeGenerator.gen(name, *args, &block)
|
20
|
+
raise "Undefined Node Name '#{name}'" if node == nil
|
21
|
+
@nodes << node
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.gen(name, *args, &block)
|
25
|
+
xpath, opt = *args
|
26
|
+
opt = [opt].flatten.compact
|
27
|
+
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
28
|
+
|
29
|
+
case name
|
30
|
+
when /^text_(.+)$/
|
31
|
+
truncate, dummy = *opt
|
32
|
+
Yasuri::TextNode.new(xpath, $1, children || [], truncate: truncate)
|
33
|
+
when /^struct_(.+)$/
|
34
|
+
Yasuri::StructNode.new(xpath, $1, children || [])
|
35
|
+
when /^links_(.+)$/
|
36
|
+
Yasuri::LinksNode.new(xpath, $1, children || [])
|
37
|
+
when /^pages_(.+)$/
|
38
|
+
limit, dummy = *opt
|
39
|
+
limit = limit || Float::MAX
|
40
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], limit: limit)
|
41
|
+
else
|
42
|
+
nil
|
43
|
+
end
|
44
|
+
end # of self.gen(name, *args, &block)
|
45
|
+
end # of class NodeGenerator
|
46
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class PaginateNode
|
8
|
+
include Node
|
9
|
+
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, opt: {})
|
11
|
+
super(xpath, name, children)
|
12
|
+
@limit = limit || opt["limit"]
|
13
|
+
end
|
14
|
+
|
15
|
+
def inject(agent, page, retry_count = 5)
|
16
|
+
|
17
|
+
child_results = []
|
18
|
+
limit = @limit.nil? ? Float::MAX : @limit
|
19
|
+
while page
|
20
|
+
child_results_kv = @children.map do |child_node|
|
21
|
+
[child_node.name, child_node.inject(agent, page, retry_count)]
|
22
|
+
end
|
23
|
+
child_results << Hash[child_results_kv]
|
24
|
+
|
25
|
+
link = page.search(@xpath).first
|
26
|
+
break if link == nil
|
27
|
+
|
28
|
+
link_button = Mechanize::Page::Link.new(link, agent, page)
|
29
|
+
page = Yasuri.with_retry(retry_count) { link_button.click }
|
30
|
+
break if (limit -= 1) <= 0
|
31
|
+
end
|
32
|
+
|
33
|
+
child_results
|
34
|
+
end
|
35
|
+
def opts
|
36
|
+
{limit:@limit}
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class StructNode
|
8
|
+
include Node
|
9
|
+
def inject(agent, page, retry_count = 5)
|
10
|
+
sub_tags = page.search(@xpath)
|
11
|
+
sub_tags.map do |sub_tag|
|
12
|
+
child_results_kv = @children.map do |child_node|
|
13
|
+
[child_node.name, child_node.inject(agent, sub_tag, retry_count)]
|
14
|
+
end
|
15
|
+
Hash[child_results_kv]
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class TextNode
|
8
|
+
include Node
|
9
|
+
def initialize(xpath, name, children = [], truncate: nil, opt: {})
|
10
|
+
super(xpath, name, children)
|
11
|
+
|
12
|
+
truncate_opt = opt["truncate"] #str
|
13
|
+
truncate_opt = Regexp.new(truncate_opt) if not truncate_opt.nil? # regexp or nil
|
14
|
+
|
15
|
+
@truncate = truncate || truncate_opt || nil # regexp or nil
|
16
|
+
|
17
|
+
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
18
|
+
|
19
|
+
end
|
20
|
+
def inject(agent, page, retry_count = 5)
|
21
|
+
node = page.search(@xpath)
|
22
|
+
text = node.text.to_s
|
23
|
+
|
24
|
+
text = text[@truncate, 0] if @truncate
|
25
|
+
|
26
|
+
text.to_s
|
27
|
+
end
|
28
|
+
def opts
|
29
|
+
{truncate:@truncate}
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -29,3 +29,9 @@ SimpleCov.start
|
|
29
29
|
|
30
30
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
31
31
|
require 'yasuri'
|
32
|
+
|
33
|
+
def compare_generated_vs_original(generated, original, page)
|
34
|
+
expected = original.inject(@agent, page)
|
35
|
+
actual = generated.inject(@agent, page)
|
36
|
+
expect(actual).to match expected
|
37
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
#########
|
7
|
+
# Links #
|
8
|
+
#########
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
describe '::LinksNode' do
|
13
|
+
before do
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@uri = uri
|
16
|
+
@index_page = @agent.get(@uri)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'scrape links' do
|
20
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
21
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
|
+
])
|
23
|
+
|
24
|
+
actual = root_node.inject(@agent, @index_page)
|
25
|
+
expected = [
|
26
|
+
{"content" => "Child 01 page."},
|
27
|
+
{"content" => "Child 02 page."},
|
28
|
+
{"content" => "Child 03 page."},
|
29
|
+
]
|
30
|
+
expect(actual).to match expected
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'return empty set if no match node' do
|
34
|
+
missing_xpath = '/html/body/b'
|
35
|
+
root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
|
36
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
37
|
+
])
|
38
|
+
|
39
|
+
actual = root_node.inject(@agent, @index_page)
|
40
|
+
expect(actual).to be_empty
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'scrape links, recursive' do
|
44
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
45
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
46
|
+
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
47
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
48
|
+
]),
|
49
|
+
])
|
50
|
+
actual = root_node.inject(@agent, @index_page)
|
51
|
+
expected = [
|
52
|
+
{"content" => "Child 01 page.",
|
53
|
+
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
54
|
+
{"sub_page_title" => "Child 02 SubPage Test"}],},
|
55
|
+
{"content" => "Child 02 page.",
|
56
|
+
"sub_link" => [],},
|
57
|
+
{"content" => "Child 03 page.",
|
58
|
+
"sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
|
59
|
+
]
|
60
|
+
expect(actual).to match expected
|
61
|
+
end
|
62
|
+
it 'can be defined by DSL, return single LinkNode title' do
|
63
|
+
generated = Yasuri.links_title '/html/body/a'
|
64
|
+
original = Yasuri::LinksNode.new('/html/body/a', "title")
|
65
|
+
compare_generated_vs_original(generated, original, @index_page)
|
66
|
+
end
|
67
|
+
it 'can be defined by DSL, return nested contents under link' do
|
68
|
+
generated = Yasuri.links_title '/html/body/a' do
|
69
|
+
text_name '/html/body/p'
|
70
|
+
end
|
71
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
72
|
+
Yasuri::TextNode.new('/html/body/p', "name"),
|
73
|
+
])
|
74
|
+
compare_generated_vs_original(generated, original, @index_page)
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'can be defined by DSL, return recursive links node' do
|
78
|
+
generated = Yasuri.links_root '/html/body/a' do
|
79
|
+
text_content '/html/body/p'
|
80
|
+
links_sub_link '/html/body/ul/li/a' do
|
81
|
+
text_sub_page_title '/html/head/title'
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
86
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
87
|
+
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
88
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
89
|
+
]),
|
90
|
+
])
|
91
|
+
compare_generated_vs_original(generated, original, @index_page)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
############
|
7
|
+
# Paginate #
|
8
|
+
############
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
describe '::PaginateNode' do
|
13
|
+
before do
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@uri = uri + "/pagination/page01.html"
|
16
|
+
@page = @agent.get(@uri)
|
17
|
+
end
|
18
|
+
|
19
|
+
it "scrape each paginated pages" do
|
20
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
21
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
|
+
])
|
23
|
+
actual = root_node.inject(@agent, @page)
|
24
|
+
expected = [
|
25
|
+
{"content" => "PaginationTest01"},
|
26
|
+
{"content" => "PaginationTest02"},
|
27
|
+
{"content" => "PaginationTest03"},
|
28
|
+
{"content" => "PaginationTest04"},
|
29
|
+
]
|
30
|
+
expect(actual).to match expected
|
31
|
+
end
|
32
|
+
|
33
|
+
it "scrape each paginated pages limited" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
], limit:3)
|
37
|
+
actual = root_node.inject(@agent, @page)
|
38
|
+
expected = [
|
39
|
+
{"content" => "PaginationTest01"},
|
40
|
+
{"content" => "PaginationTest02"},
|
41
|
+
{"content" => "PaginationTest03"},
|
42
|
+
]
|
43
|
+
expect(actual).to match expected
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'return first content if paginate link node is not found' do
|
47
|
+
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
48
|
+
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
49
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
50
|
+
])
|
51
|
+
actual = root_node.inject(@agent, @page)
|
52
|
+
expected = [ {"content" => "PaginationTest01"}, ]
|
53
|
+
expect(actual).to match_array expected
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'return empty hashes if content node is not found' do
|
57
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
58
|
+
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
59
|
+
])
|
60
|
+
actual = root_node.inject(@agent, @page)
|
61
|
+
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
62
|
+
expect(actual).to match_array expected
|
63
|
+
end
|
64
|
+
|
65
|
+
it 'can be defined by DSL, return single PaginateNode content' do
|
66
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
67
|
+
text_content '/html/body/p'
|
68
|
+
end
|
69
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
70
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
71
|
+
])
|
72
|
+
compare_generated_vs_original(generated, original, @page)
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'can be defined by DSL, return single PaginateNode content limited' do
|
76
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", 2 do
|
77
|
+
text_content '/html/body/p'
|
78
|
+
end
|
79
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
80
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
81
|
+
], limit: 2)
|
82
|
+
compare_generated_vs_original(generated, original, @page)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# Author:: TAC (tac@tac42.net)
|
4
4
|
|
5
5
|
require_relative 'spec_helper'
|
6
|
+
require_relative 'yasuri_text_node_spec'
|
6
7
|
|
7
8
|
#require_relative '../lib/yasuri/yasuri'
|
8
9
|
|
@@ -15,323 +16,6 @@ describe 'Yasuri' do
|
|
15
16
|
@index_page = @agent.get(@uri)
|
16
17
|
end
|
17
18
|
|
18
|
-
########
|
19
|
-
# Node #
|
20
|
-
########
|
21
|
-
def compare_generated_vs_original(generated, original, page = @index_page)
|
22
|
-
expected = original.inject(@agent, page)
|
23
|
-
actual = generated.inject(@agent, page)
|
24
|
-
expect(actual).to match expected
|
25
|
-
end
|
26
|
-
|
27
|
-
########
|
28
|
-
# Text #
|
29
|
-
########
|
30
|
-
describe '::TextNode' do
|
31
|
-
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
32
|
-
|
33
|
-
it 'scrape text text <p>Hello,Yasuri</p>' do
|
34
|
-
actual = @node.inject(@agent, @index_page)
|
35
|
-
expect(actual).to eq "Hello,Yasuri"
|
36
|
-
end
|
37
|
-
|
38
|
-
it 'return empty text if no match node' do
|
39
|
-
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
40
|
-
actual = no_match_node.inject(@agent, @index_page)
|
41
|
-
expect(actual).to be_empty
|
42
|
-
end
|
43
|
-
|
44
|
-
it 'fail with invalid xpath' do
|
45
|
-
invalid_xpath = '/html/body/no_match_node['
|
46
|
-
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
47
|
-
expect { node.inject(@agent, @index_page) }.to raise_error
|
48
|
-
end
|
49
|
-
|
50
|
-
it "can be defined by DSL, return single TextNode title" do
|
51
|
-
generated = Yasuri.text_title '/html/body/p[1]'
|
52
|
-
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
53
|
-
compare_generated_vs_original(generated, original)
|
54
|
-
end
|
55
|
-
|
56
|
-
it "can be truncated with regexp" do
|
57
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^[^,]+/
|
58
|
-
actual = node.inject(@agent, @index_page)
|
59
|
-
expect(actual).to eq "Hello"
|
60
|
-
end
|
61
|
-
|
62
|
-
it "can be truncated with regexp" do
|
63
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/[^,]+$/
|
64
|
-
actual = node.inject(@agent, @index_page)
|
65
|
-
expect(actual).to eq "Yasuri"
|
66
|
-
end
|
67
|
-
|
68
|
-
it "return empty string if truncated with no match to regexp" do
|
69
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^hoge/
|
70
|
-
actual = node.inject(@agent, @index_page)
|
71
|
-
expect(actual).to be_empty
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
##########
|
76
|
-
# Struct #
|
77
|
-
##########
|
78
|
-
describe '::StructNode' do
|
79
|
-
before do
|
80
|
-
@page = @agent.get(@uri + "/structual_text.html")
|
81
|
-
@table_1996 = [
|
82
|
-
{ "title" => "The Perfect Insider",
|
83
|
-
"pub_date" => "1996/4/5" },
|
84
|
-
{ "title" => "Doctors in Isolated Room",
|
85
|
-
"pub_date" => "1996/7/5" },
|
86
|
-
{ "title" => "Mathematical Goodbye",
|
87
|
-
"pub_date" => "1996/9/5" },
|
88
|
-
]
|
89
|
-
@table_1997 = [
|
90
|
-
{ "title" => "Jack the Poetical Private",
|
91
|
-
"pub_date" => "1997/1/5" },
|
92
|
-
{ "title" => "Who Inside",
|
93
|
-
"pub_date" => "1997/4/5" },
|
94
|
-
{ "title" => "Illusion Acts Like Magic",
|
95
|
-
"pub_date" => "1997/10/5" },
|
96
|
-
]
|
97
|
-
@table_1998 = [
|
98
|
-
{ "title" => "Replaceable Summer",
|
99
|
-
"pub_date" => "1998/1/7" },
|
100
|
-
{ "title" => "Switch Back",
|
101
|
-
"pub_date" => "1998/4/5" },
|
102
|
-
{ "title" => "Numerical Models",
|
103
|
-
"pub_date" => "1998/7/5" },
|
104
|
-
{ "title" => "The Perfect Outsider",
|
105
|
-
"pub_date" => "1998/10/5" },
|
106
|
-
]
|
107
|
-
@all_tables = [
|
108
|
-
{"table" => @table_1996},
|
109
|
-
{"table" => @table_1997},
|
110
|
-
{"table" => @table_1998},
|
111
|
-
]
|
112
|
-
end
|
113
|
-
it 'scrape single table contents' do
|
114
|
-
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
115
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
116
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
117
|
-
])
|
118
|
-
expected = @table_1996
|
119
|
-
actual = node.inject(@agent, @page)
|
120
|
-
expect(actual).to match expected
|
121
|
-
end
|
122
|
-
|
123
|
-
it 'return empty text if no match node' do
|
124
|
-
no_match_xpath = '/html/body/table[1]/t'
|
125
|
-
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
126
|
-
Yasuri::TextNode.new('./td[1]', "title")
|
127
|
-
])
|
128
|
-
actual = node.inject(@agent, @page)
|
129
|
-
expect(actual).to be_empty
|
130
|
-
end
|
131
|
-
|
132
|
-
it 'fail with invalid xpath' do
|
133
|
-
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
134
|
-
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
135
|
-
Yasuri::TextNode.new('./td[1]', "title")
|
136
|
-
])
|
137
|
-
expect { node.inject(@agent, @page) }.to raise_error
|
138
|
-
end
|
139
|
-
|
140
|
-
it 'fail with invalid xpath in children' do
|
141
|
-
invalid_xpath = './td[1]['
|
142
|
-
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
143
|
-
Yasuri::TextNode.new(invalid_xpath, "title"),
|
144
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
145
|
-
])
|
146
|
-
expect { node.inject(@agent, @page) }.to raise_error
|
147
|
-
end
|
148
|
-
|
149
|
-
it 'scrape all tables' do
|
150
|
-
node = Yasuri::StructNode.new('/html/body/table', "tables", [
|
151
|
-
Yasuri::StructNode.new('./tr', "table", [
|
152
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
154
|
-
])
|
155
|
-
])
|
156
|
-
expected = @all_tables
|
157
|
-
actual = node.inject(@agent, @page)
|
158
|
-
expect(actual).to match expected
|
159
|
-
end
|
160
|
-
|
161
|
-
it 'can be defined by DSL, scrape all tables' do
|
162
|
-
generated = Yasuri.struct_tables '/html/body/table' do
|
163
|
-
struct_table './tr' do
|
164
|
-
text_title './td[1]'
|
165
|
-
text_pub_date './td[2]'
|
166
|
-
end
|
167
|
-
end
|
168
|
-
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
169
|
-
Yasuri::StructNode.new('./tr', "table", [
|
170
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
171
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
172
|
-
])
|
173
|
-
])
|
174
|
-
compare_generated_vs_original(generated, original)
|
175
|
-
end
|
176
|
-
end
|
177
|
-
|
178
|
-
#########
|
179
|
-
# Links #
|
180
|
-
#########
|
181
|
-
describe '::LinksNode' do
|
182
|
-
it 'scrape links' do
|
183
|
-
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
184
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
185
|
-
])
|
186
|
-
|
187
|
-
actual = root_node.inject(@agent, @index_page)
|
188
|
-
expected = [
|
189
|
-
{"content" => "Child 01 page."},
|
190
|
-
{"content" => "Child 02 page."},
|
191
|
-
{"content" => "Child 03 page."},
|
192
|
-
]
|
193
|
-
expect(actual).to match expected
|
194
|
-
end
|
195
|
-
|
196
|
-
it 'return empty set if no match node' do
|
197
|
-
missing_xpath = '/html/body/b'
|
198
|
-
root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
|
199
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
200
|
-
])
|
201
|
-
|
202
|
-
actual = root_node.inject(@agent, @index_page)
|
203
|
-
expect(actual).to be_empty
|
204
|
-
end
|
205
|
-
|
206
|
-
it 'scrape links, recursive' do
|
207
|
-
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
208
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
209
|
-
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
210
|
-
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
211
|
-
]),
|
212
|
-
])
|
213
|
-
actual = root_node.inject(@agent, @index_page)
|
214
|
-
expected = [
|
215
|
-
{"content" => "Child 01 page.",
|
216
|
-
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
217
|
-
{"sub_page_title" => "Child 02 SubPage Test"}],},
|
218
|
-
{"content" => "Child 02 page.",
|
219
|
-
"sub_link" => [],},
|
220
|
-
{"content" => "Child 03 page.",
|
221
|
-
"sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
|
222
|
-
]
|
223
|
-
expect(actual).to match expected
|
224
|
-
end
|
225
|
-
it 'can be defined by DSL, return single LinkNode title' do
|
226
|
-
generated = Yasuri.links_title '/html/body/a'
|
227
|
-
original = Yasuri::LinksNode.new('/html/body/a', "title")
|
228
|
-
compare_generated_vs_original(generated, original)
|
229
|
-
end
|
230
|
-
it 'can be defined by DSL, return nested contents under link' do
|
231
|
-
generated = Yasuri.links_title '/html/body/a' do
|
232
|
-
text_name '/html/body/p'
|
233
|
-
end
|
234
|
-
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
235
|
-
Yasuri::TextNode.new('/html/body/p', "name"),
|
236
|
-
])
|
237
|
-
compare_generated_vs_original(generated, original)
|
238
|
-
end
|
239
|
-
|
240
|
-
it 'can be defined by DSL, return recursive links node' do
|
241
|
-
generated = Yasuri.links_root '/html/body/a' do
|
242
|
-
text_content '/html/body/p'
|
243
|
-
links_sub_link '/html/body/ul/li/a' do
|
244
|
-
text_sub_page_title '/html/head/title'
|
245
|
-
end
|
246
|
-
end
|
247
|
-
|
248
|
-
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
249
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
250
|
-
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
251
|
-
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
252
|
-
]),
|
253
|
-
])
|
254
|
-
compare_generated_vs_original(generated, original)
|
255
|
-
end
|
256
|
-
end
|
257
|
-
|
258
|
-
############
|
259
|
-
# Paginate #
|
260
|
-
############
|
261
|
-
describe '::PaginateNode' do
|
262
|
-
before do
|
263
|
-
@uri += "/pagination/page01.html"
|
264
|
-
@page = @agent.get(@uri)
|
265
|
-
end
|
266
|
-
|
267
|
-
it "scrape each paginated pages" do
|
268
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
269
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
270
|
-
])
|
271
|
-
actual = root_node.inject(@agent, @page)
|
272
|
-
expected = [
|
273
|
-
{"content" => "PaginationTest01"},
|
274
|
-
{"content" => "PaginationTest02"},
|
275
|
-
{"content" => "PaginationTest03"},
|
276
|
-
{"content" => "PaginationTest04"},
|
277
|
-
]
|
278
|
-
expect(actual).to match expected
|
279
|
-
end
|
280
|
-
|
281
|
-
it "scrape each paginated pages limited" do
|
282
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
283
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
284
|
-
], limit:3)
|
285
|
-
actual = root_node.inject(@agent, @page)
|
286
|
-
expected = [
|
287
|
-
{"content" => "PaginationTest01"},
|
288
|
-
{"content" => "PaginationTest02"},
|
289
|
-
{"content" => "PaginationTest03"},
|
290
|
-
]
|
291
|
-
expect(actual).to match expected
|
292
|
-
end
|
293
|
-
|
294
|
-
|
295
|
-
it 'return first content if paginate link node is not found' do
|
296
|
-
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
297
|
-
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
298
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
299
|
-
])
|
300
|
-
actual = root_node.inject(@agent, @page)
|
301
|
-
expected = [ {"content" => "PaginationTest01"}, ]
|
302
|
-
expect(actual).to match_array expected
|
303
|
-
end
|
304
|
-
|
305
|
-
it 'return empty hashes if content node is not found' do
|
306
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
307
|
-
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
308
|
-
])
|
309
|
-
actual = root_node.inject(@agent, @page)
|
310
|
-
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
311
|
-
expect(actual).to match_array expected
|
312
|
-
end
|
313
|
-
|
314
|
-
it 'can be defined by DSL, return single PaginateNode content' do
|
315
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
316
|
-
text_content '/html/body/p'
|
317
|
-
end
|
318
|
-
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
319
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
320
|
-
])
|
321
|
-
compare_generated_vs_original(generated, original, @page)
|
322
|
-
end
|
323
|
-
|
324
|
-
it 'can be defined by DSL, return single PaginateNode content limited' do
|
325
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", 2 do
|
326
|
-
text_content '/html/body/p'
|
327
|
-
end
|
328
|
-
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
329
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
330
|
-
], limit: 2)
|
331
|
-
compare_generated_vs_original(generated, original, @page)
|
332
|
-
end
|
333
|
-
end
|
334
|
-
|
335
19
|
#############
|
336
20
|
# json2tree #
|
337
21
|
#############
|
@@ -348,9 +32,21 @@ describe 'Yasuri' do
|
|
348
32
|
}|
|
349
33
|
generated = Yasuri.json2tree(src)
|
350
34
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
351
|
-
compare_generated_vs_original(generated, original)
|
35
|
+
compare_generated_vs_original(generated, original, @index_page)
|
36
|
+
end
|
37
|
+
|
38
|
+
it "return TextNode with truncate_regexp" do
|
39
|
+
src = %q| { "node" : "text",
|
40
|
+
"name" : "content",
|
41
|
+
"path" : "/html/body/p[1]",
|
42
|
+
"truncate" : "^[^,]+"
|
43
|
+
}|
|
44
|
+
generated = Yasuri.json2tree(src)
|
45
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
46
|
+
compare_generated_vs_original(generated, original, @index_page)
|
352
47
|
end
|
353
48
|
|
49
|
+
|
354
50
|
it "return LinksNode/TextNode" do
|
355
51
|
src = %q| { "node" : "links",
|
356
52
|
"name" : "root",
|
@@ -364,7 +60,8 @@ describe 'Yasuri' do
|
|
364
60
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
365
61
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
366
62
|
])
|
367
|
-
|
63
|
+
|
64
|
+
compare_generated_vs_original(generated, original, @index_page)
|
368
65
|
end
|
369
66
|
|
370
67
|
it "return PaginateNode/TextNode" do
|
@@ -437,6 +134,83 @@ describe 'Yasuri' do
|
|
437
134
|
end
|
438
135
|
end
|
439
136
|
|
137
|
+
#############
|
138
|
+
# tree2json #
|
139
|
+
#############
|
140
|
+
describe '.tree2json' do
|
141
|
+
it "return empty json" do
|
142
|
+
json = Yasuri.tree2json(nil)
|
143
|
+
expect(json).to match "{}"
|
144
|
+
end
|
145
|
+
|
146
|
+
it "return text node" do
|
147
|
+
node = Yasuri::TextNode.new("/html/head/title", "title")
|
148
|
+
json = Yasuri.tree2json(node)
|
149
|
+
expected_str = %q| { "node": "text",
|
150
|
+
"name": "title",
|
151
|
+
"path": "/html/head/title"
|
152
|
+
} |
|
153
|
+
expected = JSON.parse(expected_str)
|
154
|
+
actual = JSON.parse(json)
|
155
|
+
expect(actual).to match expected
|
156
|
+
end
|
157
|
+
|
158
|
+
it "return text node with truncate_regexp" do
|
159
|
+
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
160
|
+
json = Yasuri.tree2json(node)
|
161
|
+
expected_str = %q| { "node": "text",
|
162
|
+
"name": "title",
|
163
|
+
"path": "/html/head/title",
|
164
|
+
"truncate": "^[^,]+"
|
165
|
+
} |
|
166
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
167
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
168
|
+
expect(actual).to match expected
|
169
|
+
end
|
170
|
+
|
171
|
+
it "return LinksNode/TextNode" do
|
172
|
+
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
173
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
174
|
+
])
|
175
|
+
json = Yasuri.tree2json(tree)
|
176
|
+
expected_src = %q| { "node" : "links",
|
177
|
+
"name" : "root",
|
178
|
+
"path" : "/html/body/a",
|
179
|
+
"children" : [ { "node" : "text",
|
180
|
+
"name" : "content",
|
181
|
+
"path" : "/html/body/p"
|
182
|
+
} ]
|
183
|
+
}|
|
184
|
+
expected = JSON.parse(expected_src)
|
185
|
+
actual = JSON.parse(json)
|
186
|
+
expect(actual).to match expected
|
187
|
+
end
|
188
|
+
|
189
|
+
it "return PaginateNode/TextNode with limit" do
|
190
|
+
tree = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
191
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
192
|
+
], limit:10)
|
193
|
+
|
194
|
+
json = Yasuri.tree2json(tree)
|
195
|
+
expected_src = %q| { "node" : "pages",
|
196
|
+
"name" : "root",
|
197
|
+
"path" : "/html/body/nav/span/a[@class='next']",
|
198
|
+
"limit" : 10,
|
199
|
+
"children" : [ { "node" : "text",
|
200
|
+
"name" : "content",
|
201
|
+
"path" : "/html/body/p"
|
202
|
+
} ]
|
203
|
+
}|
|
204
|
+
expected = JSON.parse(expected_src)
|
205
|
+
actual = JSON.parse(json)
|
206
|
+
expect(actual).to match expected
|
207
|
+
end
|
208
|
+
|
209
|
+
|
210
|
+
|
211
|
+
end
|
212
|
+
|
213
|
+
|
440
214
|
it 'has a version number' do
|
441
215
|
expect(Yasuri::VERSION).not_to be nil
|
442
216
|
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
##########
|
7
|
+
# Struct #
|
8
|
+
##########
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
describe '::StructNode' do
|
13
|
+
before do
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@page = @agent.get(uri + "/structual_text.html")
|
16
|
+
|
17
|
+
@table_1996 = [
|
18
|
+
{ "title" => "The Perfect Insider",
|
19
|
+
"pub_date" => "1996/4/5" },
|
20
|
+
{ "title" => "Doctors in Isolated Room",
|
21
|
+
"pub_date" => "1996/7/5" },
|
22
|
+
{ "title" => "Mathematical Goodbye",
|
23
|
+
"pub_date" => "1996/9/5" },
|
24
|
+
]
|
25
|
+
@table_1997 = [
|
26
|
+
{ "title" => "Jack the Poetical Private",
|
27
|
+
"pub_date" => "1997/1/5" },
|
28
|
+
{ "title" => "Who Inside",
|
29
|
+
"pub_date" => "1997/4/5" },
|
30
|
+
{ "title" => "Illusion Acts Like Magic",
|
31
|
+
"pub_date" => "1997/10/5" },
|
32
|
+
]
|
33
|
+
@table_1998 = [
|
34
|
+
{ "title" => "Replaceable Summer",
|
35
|
+
"pub_date" => "1998/1/7" },
|
36
|
+
{ "title" => "Switch Back",
|
37
|
+
"pub_date" => "1998/4/5" },
|
38
|
+
{ "title" => "Numerical Models",
|
39
|
+
"pub_date" => "1998/7/5" },
|
40
|
+
{ "title" => "The Perfect Outsider",
|
41
|
+
"pub_date" => "1998/10/5" },
|
42
|
+
]
|
43
|
+
@all_tables = [
|
44
|
+
{"table" => @table_1996},
|
45
|
+
{"table" => @table_1997},
|
46
|
+
{"table" => @table_1998},
|
47
|
+
]
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'scrape single table contents' do
|
51
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
52
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
53
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
54
|
+
])
|
55
|
+
expected = @table_1996
|
56
|
+
actual = node.inject(@agent, @page)
|
57
|
+
expect(actual).to match expected
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'return empty text if no match node' do
|
61
|
+
no_match_xpath = '/html/body/table[1]/t'
|
62
|
+
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
63
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
64
|
+
])
|
65
|
+
actual = node.inject(@agent, @page)
|
66
|
+
expect(actual).to be_empty
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'fail with invalid xpath' do
|
70
|
+
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
71
|
+
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
72
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
73
|
+
])
|
74
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'fail with invalid xpath in children' do
|
78
|
+
invalid_xpath = './td[1]['
|
79
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
80
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
81
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
82
|
+
])
|
83
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'scrape all tables' do
|
87
|
+
node = Yasuri::StructNode.new('/html/body/table', "tables", [
|
88
|
+
Yasuri::StructNode.new('./tr', "table", [
|
89
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
90
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
91
|
+
])
|
92
|
+
])
|
93
|
+
expected = @all_tables
|
94
|
+
actual = node.inject(@agent, @page)
|
95
|
+
expect(actual).to match expected
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'can be defined by DSL, scrape all tables' do
|
99
|
+
generated = Yasuri.struct_tables '/html/body/table' do
|
100
|
+
struct_table './tr' do
|
101
|
+
text_title './td[1]'
|
102
|
+
text_pub_date './td[2]'
|
103
|
+
end
|
104
|
+
end
|
105
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
106
|
+
Yasuri::StructNode.new('./tr', "table", [
|
107
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
108
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
109
|
+
])
|
110
|
+
])
|
111
|
+
compare_generated_vs_original(generated, original, @page)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
########
|
7
|
+
# Text #
|
8
|
+
########
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
before do
|
13
|
+
@agent = Mechanize.new
|
14
|
+
@index_page = @agent.get(uri)
|
15
|
+
end
|
16
|
+
|
17
|
+
describe '::TextNode' do
|
18
|
+
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
|
+
|
20
|
+
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
+
actual = @node.inject(@agent, @index_page)
|
22
|
+
expect(actual).to eq "Hello,Yasuri"
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'return empty text if no match node' do
|
26
|
+
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
+
actual = no_match_node.inject(@agent, @index_page)
|
28
|
+
expect(actual).to be_empty
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'fail with invalid xpath' do
|
32
|
+
invalid_xpath = '/html/body/no_match_node['
|
33
|
+
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
+
expect { node.inject(@agent, @index_page) }.to raise_error
|
35
|
+
end
|
36
|
+
|
37
|
+
it "can be defined by DSL, return single TextNode title" do
|
38
|
+
generated = Yasuri.text_title '/html/body/p[1]'
|
39
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
40
|
+
compare_generated_vs_original(generated, original, @index_page)
|
41
|
+
end
|
42
|
+
|
43
|
+
it "can be truncated with regexp" do
|
44
|
+
node = Yasuri.text_title '/html/body/p[1]', /^[^,]+/
|
45
|
+
actual = node.inject(@agent, @index_page)
|
46
|
+
expect(actual).to eq "Hello"
|
47
|
+
end
|
48
|
+
|
49
|
+
it "can be truncated with regexp" do
|
50
|
+
node = Yasuri.text_title '/html/body/p[1]', /[^,]+$/
|
51
|
+
actual = node.inject(@agent, @index_page)
|
52
|
+
expect(actual).to eq "Yasuri"
|
53
|
+
end
|
54
|
+
|
55
|
+
it "return empty string if truncated with no match to regexp" do
|
56
|
+
node = Yasuri.text_title '/html/body/p[1]', /^hoge/
|
57
|
+
actual = node.inject(@agent, @index_page)
|
58
|
+
expect(actual).to be_empty
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -155,6 +155,12 @@ files:
|
|
155
155
|
- lib/yasuri.rb
|
156
156
|
- lib/yasuri/version.rb
|
157
157
|
- lib/yasuri/yasuri.rb
|
158
|
+
- lib/yasuri/yasuri_links_node.rb
|
159
|
+
- lib/yasuri/yasuri_node.rb
|
160
|
+
- lib/yasuri/yasuri_node_generator.rb
|
161
|
+
- lib/yasuri/yasuri_paginate_node.rb
|
162
|
+
- lib/yasuri/yasuri_struct_node.rb
|
163
|
+
- lib/yasuri/yasuri_text_node.rb
|
158
164
|
- spec/htdocs/child01.html
|
159
165
|
- spec/htdocs/child01_sub.html
|
160
166
|
- spec/htdocs/child02.html
|
@@ -169,7 +175,12 @@ files:
|
|
169
175
|
- spec/htdocs/structual_text.html
|
170
176
|
- spec/servers/httpserver.rb
|
171
177
|
- spec/spec_helper.rb
|
178
|
+
- spec/yasuri_links_node_spec.rb
|
179
|
+
- spec/yasuri_node_spec.rb
|
180
|
+
- spec/yasuri_paginate_node_spec.rb
|
172
181
|
- spec/yasuri_spec.rb
|
182
|
+
- spec/yasuri_struct_node_spec.rb
|
183
|
+
- spec/yasuri_text_node_spec.rb
|
173
184
|
- yasuri.gemspec
|
174
185
|
homepage: https://github.com/tac0x2a/yasuri
|
175
186
|
licenses:
|
@@ -210,4 +221,9 @@ test_files:
|
|
210
221
|
- spec/htdocs/structual_text.html
|
211
222
|
- spec/servers/httpserver.rb
|
212
223
|
- spec/spec_helper.rb
|
224
|
+
- spec/yasuri_links_node_spec.rb
|
225
|
+
- spec/yasuri_node_spec.rb
|
226
|
+
- spec/yasuri_paginate_node_spec.rb
|
213
227
|
- spec/yasuri_spec.rb
|
228
|
+
- spec/yasuri_struct_node_spec.rb
|
229
|
+
- spec/yasuri_text_node_spec.rb
|