yasuri 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +38 -127
- data/lib/yasuri/yasuri_links_node.rb +23 -0
- data/lib/yasuri/yasuri_node.rb +21 -0
- data/lib/yasuri/yasuri_node_generator.rb +46 -0
- data/lib/yasuri/yasuri_paginate_node.rb +39 -0
- data/lib/yasuri/yasuri_struct_node.rb +19 -0
- data/lib/yasuri/yasuri_text_node.rb +32 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/yasuri_links_node_spec.rb +94 -0
- data/spec/yasuri_node_spec.rb +11 -0
- data/spec/yasuri_paginate_node_spec.rb +85 -0
- data/spec/yasuri_spec.rb +93 -319
- data/spec/yasuri_struct_node_spec.rb +114 -0
- data/spec/yasuri_text_node_spec.rb +61 -0
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 18c8b6da6ca1f9d5433adc128b83ed5d5a8e353e
|
4
|
+
data.tar.gz: 07ba467f8d62982e4a8e969da42b839d8ee07664
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 232a5893c4511b0ef80b34a95d58af4d1bb8683512a1cb41b7f9a6d19def75d3c825b12c383bf51bab09e12fe8bd54cd91b79b32640319b60d6185a46ed7f086
|
7
|
+
data.tar.gz: 4cb31e01b60861d13770b8b9d033cef35fe6bbdd226d11703826040840ac979fcfd98c162b18322baa8f9102fd2ed059ac0c15c277c49217feaf39173800dea6
|
data/README.md
CHANGED
@@ -2,6 +2,13 @@
|
|
2
2
|
|
3
3
|
Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)".
|
4
4
|
|
5
|
+
|
6
|
+
## Sample
|
7
|
+
|
8
|
+
https://yasuri-sample.herokuapp.com/
|
9
|
+
|
10
|
+
(source code: https://github.com/tac0x2a/yasuri-sample)
|
11
|
+
|
5
12
|
## Installation
|
6
13
|
|
7
14
|
Add this line to your application's Gemfile:
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -5,148 +5,38 @@
|
|
5
5
|
require 'mechanize'
|
6
6
|
require 'json'
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
@xpath, @name, @children = xpath, name, children
|
15
|
-
end
|
16
|
-
|
17
|
-
def inject(agent, page)
|
18
|
-
fail "#{Kernel.__method__} is not implemented."
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
class TextNode
|
23
|
-
include Node
|
24
|
-
def initialize(xpath, name, children = [], truncate_regexp: nil, opt: {})
|
25
|
-
super(xpath, name, children)
|
26
|
-
@truncate_regexp = truncate_regexp
|
27
|
-
end
|
28
|
-
def inject(agent, page, retry_count = 5)
|
29
|
-
node = page.search(@xpath)
|
30
|
-
text = node.text.to_s
|
31
|
-
|
32
|
-
text = text[@truncate_regexp, 0] if @truncate_regexp
|
33
|
-
|
34
|
-
text.to_s
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
class StructNode
|
39
|
-
include Node
|
40
|
-
def inject(agent, page, retry_count = 5)
|
41
|
-
sub_tags = page.search(@xpath)
|
42
|
-
sub_tags.map do |sub_tag|
|
43
|
-
child_results_kv = @children.map do |child_node|
|
44
|
-
[child_node.name, child_node.inject(agent, sub_tag, retry_count)]
|
45
|
-
end
|
46
|
-
Hash[child_results_kv]
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
class LinksNode
|
52
|
-
include Node
|
53
|
-
def inject(agent, page, retry_count = 5)
|
54
|
-
links = page.search(@xpath) || [] # links expected
|
55
|
-
links.map do |link|
|
56
|
-
link_button = Mechanize::Page::Link.new(link, agent, page)
|
57
|
-
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
58
|
-
|
59
|
-
child_results_kv = @children.map do |child_node|
|
60
|
-
[child_node.name, child_node.inject(agent, child_page, retry_count)]
|
61
|
-
end
|
62
|
-
|
63
|
-
Hash[child_results_kv]
|
64
|
-
end # each named child node
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
class PaginateNode
|
69
|
-
include Node
|
70
|
-
|
71
|
-
def initialize(xpath, name, children = [], limit: nil, opt: {})
|
72
|
-
super(xpath, name, children)
|
73
|
-
@limit = limit || opt["limit"] || Float::MAX
|
74
|
-
end
|
75
|
-
|
76
|
-
def inject(agent, page, retry_count = 5)
|
8
|
+
require_relative 'yasuri_node'
|
9
|
+
require_relative 'yasuri_text_node'
|
10
|
+
require_relative 'yasuri_struct_node'
|
11
|
+
require_relative 'yasuri_paginate_node'
|
12
|
+
require_relative 'yasuri_links_node'
|
13
|
+
require_relative 'yasuri_node_generator'
|
77
14
|
|
78
|
-
|
79
|
-
while page
|
80
|
-
child_results_kv = @children.map do |child_node|
|
81
|
-
[child_node.name, child_node.inject(agent, page, retry_count)]
|
82
|
-
end
|
83
|
-
child_results << Hash[child_results_kv]
|
84
|
-
|
85
|
-
link = page.search(@xpath).first
|
86
|
-
break if link == nil
|
87
|
-
|
88
|
-
link_button = Mechanize::Page::Link.new(link, agent, page)
|
89
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
90
|
-
break if (@limit -= 1) <= 0
|
91
|
-
end
|
92
|
-
|
93
|
-
child_results
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
class NodeGenerator
|
98
|
-
def gen_recursive(&block)
|
99
|
-
@nodes = []
|
100
|
-
instance_eval(&block)
|
101
|
-
@nodes
|
102
|
-
end
|
103
|
-
|
104
|
-
def method_missing(name, *args, &block)
|
105
|
-
node = NodeGenerator.gen(name, *args, &block)
|
106
|
-
raise "Undefined Node Name '#{name}'" if node == nil
|
107
|
-
@nodes << node
|
108
|
-
end
|
109
|
-
|
110
|
-
def self.gen(name, *args, &block)
|
111
|
-
xpath, opt = *args
|
112
|
-
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
113
|
-
|
114
|
-
case name
|
115
|
-
when /^text_(.+)$/
|
116
|
-
truncate_regexp = opt
|
117
|
-
Yasuri::TextNode.new(xpath, $1, truncate_regexp)
|
118
|
-
when /^struct_(.+)$/
|
119
|
-
Yasuri::StructNode.new(xpath, $1, children || [])
|
120
|
-
when /^links_(.+)$/
|
121
|
-
Yasuri::LinksNode.new(xpath, $1, children || [])
|
122
|
-
when /^pages_(.+)$/
|
123
|
-
xpath, limit = *args
|
124
|
-
limit = limit || Float::MAX
|
125
|
-
Yasuri::PaginateNode.new(xpath, $1, children || [], limit: limit)
|
126
|
-
else
|
127
|
-
nil
|
128
|
-
end
|
129
|
-
end # of self.gen(name, *args, &block)
|
130
|
-
end # of class NodeGenerator
|
15
|
+
module Yasuri
|
131
16
|
|
132
17
|
def self.json2tree(json_string)
|
133
18
|
json = JSON.parse(json_string)
|
134
19
|
Yasuri.hash2node(json)
|
135
20
|
end
|
136
21
|
|
22
|
+
def self.tree2json(node)
|
23
|
+
Yasuri.node2hash(node).to_json
|
24
|
+
end
|
25
|
+
|
137
26
|
def self.method_missing(name, *args, &block)
|
138
27
|
generated = Yasuri::NodeGenerator.gen(name, *args, &block)
|
139
28
|
generated || super(name, args)
|
140
29
|
end
|
141
30
|
|
142
|
-
|
143
31
|
private
|
144
32
|
Text2Node = {
|
145
|
-
"text" => TextNode,
|
146
|
-
"struct" => StructNode,
|
147
|
-
"links" => LinksNode,
|
148
|
-
"pages" => PaginateNode
|
33
|
+
"text" => Yasuri::TextNode,
|
34
|
+
"struct" => Yasuri::StructNode,
|
35
|
+
"links" => Yasuri::LinksNode,
|
36
|
+
"pages" => Yasuri::PaginateNode
|
149
37
|
}
|
38
|
+
Node2Text = Text2Node.invert
|
39
|
+
|
150
40
|
ReservedKeys = %w|node name path children|
|
151
41
|
def self.hash2node(node_h)
|
152
42
|
node, name, path, children = ReservedKeys.map do |key|
|
@@ -162,6 +52,27 @@ module Yasuri
|
|
162
52
|
klass ? klass.new(path, name, childnodes, opt: opt) : nil
|
163
53
|
end
|
164
54
|
|
55
|
+
def self.node2hash(node)
|
56
|
+
json = JSON.parse("{}")
|
57
|
+
return json if node.nil?
|
58
|
+
|
59
|
+
klass = node.class
|
60
|
+
klass_str = Node2Text[klass]
|
61
|
+
|
62
|
+
json["node"] = klass_str
|
63
|
+
json["name"] = node.name
|
64
|
+
json["path"] = node.xpath
|
65
|
+
|
66
|
+
children = node.children.map{|c| Yasuri.node2hash(c)}
|
67
|
+
json["children"] = children if not children.empty?
|
68
|
+
|
69
|
+
node.opts.each do |key,value|
|
70
|
+
json[key] = value if not value.nil?
|
71
|
+
end
|
72
|
+
|
73
|
+
json
|
74
|
+
end
|
75
|
+
|
165
76
|
def self.with_retry(retry_count = 5)
|
166
77
|
begin
|
167
78
|
return yield() if block_given?
|
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class LinksNode
|
8
|
+
include Node
|
9
|
+
def inject(agent, page, retry_count = 5)
|
10
|
+
links = page.search(@xpath) || [] # links expected
|
11
|
+
links.map do |link|
|
12
|
+
link_button = Mechanize::Page::Link.new(link, agent, page)
|
13
|
+
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
14
|
+
|
15
|
+
child_results_kv = @children.map do |child_node|
|
16
|
+
[child_node.name, child_node.inject(agent, child_page, retry_count)]
|
17
|
+
end
|
18
|
+
|
19
|
+
Hash[child_results_kv]
|
20
|
+
end # each named child node
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
module Node
|
8
|
+
attr_reader :url, :xpath, :name, :children
|
9
|
+
|
10
|
+
def initialize(xpath, name, children = [], opt: {})
|
11
|
+
@xpath, @name, @children = xpath, name, children
|
12
|
+
end
|
13
|
+
|
14
|
+
def inject(agent, page)
|
15
|
+
fail "#{Kernel.__method__} is not implemented."
|
16
|
+
end
|
17
|
+
def opts
|
18
|
+
{}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
require_relative 'yasuri_text_node'
|
6
|
+
require_relative 'yasuri_struct_node'
|
7
|
+
require_relative 'yasuri_links_node'
|
8
|
+
require_relative 'yasuri_paginate_node'
|
9
|
+
|
10
|
+
module Yasuri
|
11
|
+
class NodeGenerator
|
12
|
+
def gen_recursive(&block)
|
13
|
+
@nodes = []
|
14
|
+
instance_eval(&block)
|
15
|
+
@nodes
|
16
|
+
end
|
17
|
+
|
18
|
+
def method_missing(name, *args, &block)
|
19
|
+
node = NodeGenerator.gen(name, *args, &block)
|
20
|
+
raise "Undefined Node Name '#{name}'" if node == nil
|
21
|
+
@nodes << node
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.gen(name, *args, &block)
|
25
|
+
xpath, opt = *args
|
26
|
+
opt = [opt].flatten.compact
|
27
|
+
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
28
|
+
|
29
|
+
case name
|
30
|
+
when /^text_(.+)$/
|
31
|
+
truncate, dummy = *opt
|
32
|
+
Yasuri::TextNode.new(xpath, $1, children || [], truncate: truncate)
|
33
|
+
when /^struct_(.+)$/
|
34
|
+
Yasuri::StructNode.new(xpath, $1, children || [])
|
35
|
+
when /^links_(.+)$/
|
36
|
+
Yasuri::LinksNode.new(xpath, $1, children || [])
|
37
|
+
when /^pages_(.+)$/
|
38
|
+
limit, dummy = *opt
|
39
|
+
limit = limit || Float::MAX
|
40
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], limit: limit)
|
41
|
+
else
|
42
|
+
nil
|
43
|
+
end
|
44
|
+
end # of self.gen(name, *args, &block)
|
45
|
+
end # of class NodeGenerator
|
46
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class PaginateNode
|
8
|
+
include Node
|
9
|
+
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, opt: {})
|
11
|
+
super(xpath, name, children)
|
12
|
+
@limit = limit || opt["limit"]
|
13
|
+
end
|
14
|
+
|
15
|
+
def inject(agent, page, retry_count = 5)
|
16
|
+
|
17
|
+
child_results = []
|
18
|
+
limit = @limit.nil? ? Float::MAX : @limit
|
19
|
+
while page
|
20
|
+
child_results_kv = @children.map do |child_node|
|
21
|
+
[child_node.name, child_node.inject(agent, page, retry_count)]
|
22
|
+
end
|
23
|
+
child_results << Hash[child_results_kv]
|
24
|
+
|
25
|
+
link = page.search(@xpath).first
|
26
|
+
break if link == nil
|
27
|
+
|
28
|
+
link_button = Mechanize::Page::Link.new(link, agent, page)
|
29
|
+
page = Yasuri.with_retry(retry_count) { link_button.click }
|
30
|
+
break if (limit -= 1) <= 0
|
31
|
+
end
|
32
|
+
|
33
|
+
child_results
|
34
|
+
end
|
35
|
+
def opts
|
36
|
+
{limit:@limit}
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class StructNode
|
8
|
+
include Node
|
9
|
+
def inject(agent, page, retry_count = 5)
|
10
|
+
sub_tags = page.search(@xpath)
|
11
|
+
sub_tags.map do |sub_tag|
|
12
|
+
child_results_kv = @children.map do |child_node|
|
13
|
+
[child_node.name, child_node.inject(agent, sub_tag, retry_count)]
|
14
|
+
end
|
15
|
+
Hash[child_results_kv]
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'yasuri_node'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class TextNode
|
8
|
+
include Node
|
9
|
+
def initialize(xpath, name, children = [], truncate: nil, opt: {})
|
10
|
+
super(xpath, name, children)
|
11
|
+
|
12
|
+
truncate_opt = opt["truncate"] #str
|
13
|
+
truncate_opt = Regexp.new(truncate_opt) if not truncate_opt.nil? # regexp or nil
|
14
|
+
|
15
|
+
@truncate = truncate || truncate_opt || nil # regexp or nil
|
16
|
+
|
17
|
+
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
18
|
+
|
19
|
+
end
|
20
|
+
def inject(agent, page, retry_count = 5)
|
21
|
+
node = page.search(@xpath)
|
22
|
+
text = node.text.to_s
|
23
|
+
|
24
|
+
text = text[@truncate, 0] if @truncate
|
25
|
+
|
26
|
+
text.to_s
|
27
|
+
end
|
28
|
+
def opts
|
29
|
+
{truncate:@truncate}
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -29,3 +29,9 @@ SimpleCov.start
|
|
29
29
|
|
30
30
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
31
31
|
require 'yasuri'
|
32
|
+
|
33
|
+
def compare_generated_vs_original(generated, original, page)
|
34
|
+
expected = original.inject(@agent, page)
|
35
|
+
actual = generated.inject(@agent, page)
|
36
|
+
expect(actual).to match expected
|
37
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
#########
|
7
|
+
# Links #
|
8
|
+
#########
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
describe '::LinksNode' do
|
13
|
+
before do
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@uri = uri
|
16
|
+
@index_page = @agent.get(@uri)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'scrape links' do
|
20
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
21
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
|
+
])
|
23
|
+
|
24
|
+
actual = root_node.inject(@agent, @index_page)
|
25
|
+
expected = [
|
26
|
+
{"content" => "Child 01 page."},
|
27
|
+
{"content" => "Child 02 page."},
|
28
|
+
{"content" => "Child 03 page."},
|
29
|
+
]
|
30
|
+
expect(actual).to match expected
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'return empty set if no match node' do
|
34
|
+
missing_xpath = '/html/body/b'
|
35
|
+
root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
|
36
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
37
|
+
])
|
38
|
+
|
39
|
+
actual = root_node.inject(@agent, @index_page)
|
40
|
+
expect(actual).to be_empty
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'scrape links, recursive' do
|
44
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
45
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
46
|
+
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
47
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
48
|
+
]),
|
49
|
+
])
|
50
|
+
actual = root_node.inject(@agent, @index_page)
|
51
|
+
expected = [
|
52
|
+
{"content" => "Child 01 page.",
|
53
|
+
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
54
|
+
{"sub_page_title" => "Child 02 SubPage Test"}],},
|
55
|
+
{"content" => "Child 02 page.",
|
56
|
+
"sub_link" => [],},
|
57
|
+
{"content" => "Child 03 page.",
|
58
|
+
"sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
|
59
|
+
]
|
60
|
+
expect(actual).to match expected
|
61
|
+
end
|
62
|
+
it 'can be defined by DSL, return single LinkNode title' do
|
63
|
+
generated = Yasuri.links_title '/html/body/a'
|
64
|
+
original = Yasuri::LinksNode.new('/html/body/a', "title")
|
65
|
+
compare_generated_vs_original(generated, original, @index_page)
|
66
|
+
end
|
67
|
+
it 'can be defined by DSL, return nested contents under link' do
|
68
|
+
generated = Yasuri.links_title '/html/body/a' do
|
69
|
+
text_name '/html/body/p'
|
70
|
+
end
|
71
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
72
|
+
Yasuri::TextNode.new('/html/body/p', "name"),
|
73
|
+
])
|
74
|
+
compare_generated_vs_original(generated, original, @index_page)
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'can be defined by DSL, return recursive links node' do
|
78
|
+
generated = Yasuri.links_root '/html/body/a' do
|
79
|
+
text_content '/html/body/p'
|
80
|
+
links_sub_link '/html/body/ul/li/a' do
|
81
|
+
text_sub_page_title '/html/head/title'
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
86
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
87
|
+
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
88
|
+
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
89
|
+
]),
|
90
|
+
])
|
91
|
+
compare_generated_vs_original(generated, original, @index_page)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
############
|
7
|
+
# Paginate #
|
8
|
+
############
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
describe '::PaginateNode' do
|
13
|
+
before do
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@uri = uri + "/pagination/page01.html"
|
16
|
+
@page = @agent.get(@uri)
|
17
|
+
end
|
18
|
+
|
19
|
+
it "scrape each paginated pages" do
|
20
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
21
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
|
+
])
|
23
|
+
actual = root_node.inject(@agent, @page)
|
24
|
+
expected = [
|
25
|
+
{"content" => "PaginationTest01"},
|
26
|
+
{"content" => "PaginationTest02"},
|
27
|
+
{"content" => "PaginationTest03"},
|
28
|
+
{"content" => "PaginationTest04"},
|
29
|
+
]
|
30
|
+
expect(actual).to match expected
|
31
|
+
end
|
32
|
+
|
33
|
+
it "scrape each paginated pages limited" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
], limit:3)
|
37
|
+
actual = root_node.inject(@agent, @page)
|
38
|
+
expected = [
|
39
|
+
{"content" => "PaginationTest01"},
|
40
|
+
{"content" => "PaginationTest02"},
|
41
|
+
{"content" => "PaginationTest03"},
|
42
|
+
]
|
43
|
+
expect(actual).to match expected
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'return first content if paginate link node is not found' do
|
47
|
+
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
48
|
+
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
49
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
50
|
+
])
|
51
|
+
actual = root_node.inject(@agent, @page)
|
52
|
+
expected = [ {"content" => "PaginationTest01"}, ]
|
53
|
+
expect(actual).to match_array expected
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'return empty hashes if content node is not found' do
|
57
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
58
|
+
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
59
|
+
])
|
60
|
+
actual = root_node.inject(@agent, @page)
|
61
|
+
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
62
|
+
expect(actual).to match_array expected
|
63
|
+
end
|
64
|
+
|
65
|
+
it 'can be defined by DSL, return single PaginateNode content' do
|
66
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
67
|
+
text_content '/html/body/p'
|
68
|
+
end
|
69
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
70
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
71
|
+
])
|
72
|
+
compare_generated_vs_original(generated, original, @page)
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'can be defined by DSL, return single PaginateNode content limited' do
|
76
|
+
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", 2 do
|
77
|
+
text_content '/html/body/p'
|
78
|
+
end
|
79
|
+
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
80
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
81
|
+
], limit: 2)
|
82
|
+
compare_generated_vs_original(generated, original, @page)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# Author:: TAC (tac@tac42.net)
|
4
4
|
|
5
5
|
require_relative 'spec_helper'
|
6
|
+
require_relative 'yasuri_text_node_spec'
|
6
7
|
|
7
8
|
#require_relative '../lib/yasuri/yasuri'
|
8
9
|
|
@@ -15,323 +16,6 @@ describe 'Yasuri' do
|
|
15
16
|
@index_page = @agent.get(@uri)
|
16
17
|
end
|
17
18
|
|
18
|
-
########
|
19
|
-
# Node #
|
20
|
-
########
|
21
|
-
def compare_generated_vs_original(generated, original, page = @index_page)
|
22
|
-
expected = original.inject(@agent, page)
|
23
|
-
actual = generated.inject(@agent, page)
|
24
|
-
expect(actual).to match expected
|
25
|
-
end
|
26
|
-
|
27
|
-
########
|
28
|
-
# Text #
|
29
|
-
########
|
30
|
-
describe '::TextNode' do
|
31
|
-
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
32
|
-
|
33
|
-
it 'scrape text text <p>Hello,Yasuri</p>' do
|
34
|
-
actual = @node.inject(@agent, @index_page)
|
35
|
-
expect(actual).to eq "Hello,Yasuri"
|
36
|
-
end
|
37
|
-
|
38
|
-
it 'return empty text if no match node' do
|
39
|
-
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
40
|
-
actual = no_match_node.inject(@agent, @index_page)
|
41
|
-
expect(actual).to be_empty
|
42
|
-
end
|
43
|
-
|
44
|
-
it 'fail with invalid xpath' do
|
45
|
-
invalid_xpath = '/html/body/no_match_node['
|
46
|
-
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
47
|
-
expect { node.inject(@agent, @index_page) }.to raise_error
|
48
|
-
end
|
49
|
-
|
50
|
-
it "can be defined by DSL, return single TextNode title" do
|
51
|
-
generated = Yasuri.text_title '/html/body/p[1]'
|
52
|
-
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
53
|
-
compare_generated_vs_original(generated, original)
|
54
|
-
end
|
55
|
-
|
56
|
-
it "can be truncated with regexp" do
|
57
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^[^,]+/
|
58
|
-
actual = node.inject(@agent, @index_page)
|
59
|
-
expect(actual).to eq "Hello"
|
60
|
-
end
|
61
|
-
|
62
|
-
it "can be truncated with regexp" do
|
63
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/[^,]+$/
|
64
|
-
actual = node.inject(@agent, @index_page)
|
65
|
-
expect(actual).to eq "Yasuri"
|
66
|
-
end
|
67
|
-
|
68
|
-
it "return empty string if truncated with no match to regexp" do
|
69
|
-
node = Yasuri.text_title '/html/body/p[1]', truncate_regexp:/^hoge/
|
70
|
-
actual = node.inject(@agent, @index_page)
|
71
|
-
expect(actual).to be_empty
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
##########
|
76
|
-
# Struct #
|
77
|
-
##########
|
78
|
-
describe '::StructNode' do
|
79
|
-
before do
|
80
|
-
@page = @agent.get(@uri + "/structual_text.html")
|
81
|
-
@table_1996 = [
|
82
|
-
{ "title" => "The Perfect Insider",
|
83
|
-
"pub_date" => "1996/4/5" },
|
84
|
-
{ "title" => "Doctors in Isolated Room",
|
85
|
-
"pub_date" => "1996/7/5" },
|
86
|
-
{ "title" => "Mathematical Goodbye",
|
87
|
-
"pub_date" => "1996/9/5" },
|
88
|
-
]
|
89
|
-
@table_1997 = [
|
90
|
-
{ "title" => "Jack the Poetical Private",
|
91
|
-
"pub_date" => "1997/1/5" },
|
92
|
-
{ "title" => "Who Inside",
|
93
|
-
"pub_date" => "1997/4/5" },
|
94
|
-
{ "title" => "Illusion Acts Like Magic",
|
95
|
-
"pub_date" => "1997/10/5" },
|
96
|
-
]
|
97
|
-
@table_1998 = [
|
98
|
-
{ "title" => "Replaceable Summer",
|
99
|
-
"pub_date" => "1998/1/7" },
|
100
|
-
{ "title" => "Switch Back",
|
101
|
-
"pub_date" => "1998/4/5" },
|
102
|
-
{ "title" => "Numerical Models",
|
103
|
-
"pub_date" => "1998/7/5" },
|
104
|
-
{ "title" => "The Perfect Outsider",
|
105
|
-
"pub_date" => "1998/10/5" },
|
106
|
-
]
|
107
|
-
@all_tables = [
|
108
|
-
{"table" => @table_1996},
|
109
|
-
{"table" => @table_1997},
|
110
|
-
{"table" => @table_1998},
|
111
|
-
]
|
112
|
-
end
|
113
|
-
it 'scrape single table contents' do
|
114
|
-
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
115
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
116
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
117
|
-
])
|
118
|
-
expected = @table_1996
|
119
|
-
actual = node.inject(@agent, @page)
|
120
|
-
expect(actual).to match expected
|
121
|
-
end
|
122
|
-
|
123
|
-
it 'return empty text if no match node' do
|
124
|
-
no_match_xpath = '/html/body/table[1]/t'
|
125
|
-
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
126
|
-
Yasuri::TextNode.new('./td[1]', "title")
|
127
|
-
])
|
128
|
-
actual = node.inject(@agent, @page)
|
129
|
-
expect(actual).to be_empty
|
130
|
-
end
|
131
|
-
|
132
|
-
it 'fail with invalid xpath' do
|
133
|
-
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
134
|
-
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
135
|
-
Yasuri::TextNode.new('./td[1]', "title")
|
136
|
-
])
|
137
|
-
expect { node.inject(@agent, @page) }.to raise_error
|
138
|
-
end
|
139
|
-
|
140
|
-
it 'fail with invalid xpath in children' do
|
141
|
-
invalid_xpath = './td[1]['
|
142
|
-
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
143
|
-
Yasuri::TextNode.new(invalid_xpath, "title"),
|
144
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
145
|
-
])
|
146
|
-
expect { node.inject(@agent, @page) }.to raise_error
|
147
|
-
end
|
148
|
-
|
149
|
-
it 'scrape all tables' do
|
150
|
-
node = Yasuri::StructNode.new('/html/body/table', "tables", [
|
151
|
-
Yasuri::StructNode.new('./tr', "table", [
|
152
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
153
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
154
|
-
])
|
155
|
-
])
|
156
|
-
expected = @all_tables
|
157
|
-
actual = node.inject(@agent, @page)
|
158
|
-
expect(actual).to match expected
|
159
|
-
end
|
160
|
-
|
161
|
-
it 'can be defined by DSL, scrape all tables' do
|
162
|
-
generated = Yasuri.struct_tables '/html/body/table' do
|
163
|
-
struct_table './tr' do
|
164
|
-
text_title './td[1]'
|
165
|
-
text_pub_date './td[2]'
|
166
|
-
end
|
167
|
-
end
|
168
|
-
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
169
|
-
Yasuri::StructNode.new('./tr', "table", [
|
170
|
-
Yasuri::TextNode.new('./td[1]', "title"),
|
171
|
-
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
172
|
-
])
|
173
|
-
])
|
174
|
-
compare_generated_vs_original(generated, original)
|
175
|
-
end
|
176
|
-
end
|
177
|
-
|
178
|
-
#########
|
179
|
-
# Links #
|
180
|
-
#########
|
181
|
-
describe '::LinksNode' do
|
182
|
-
it 'scrape links' do
|
183
|
-
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
184
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
185
|
-
])
|
186
|
-
|
187
|
-
actual = root_node.inject(@agent, @index_page)
|
188
|
-
expected = [
|
189
|
-
{"content" => "Child 01 page."},
|
190
|
-
{"content" => "Child 02 page."},
|
191
|
-
{"content" => "Child 03 page."},
|
192
|
-
]
|
193
|
-
expect(actual).to match expected
|
194
|
-
end
|
195
|
-
|
196
|
-
it 'return empty set if no match node' do
|
197
|
-
missing_xpath = '/html/body/b'
|
198
|
-
root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
|
199
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
200
|
-
])
|
201
|
-
|
202
|
-
actual = root_node.inject(@agent, @index_page)
|
203
|
-
expect(actual).to be_empty
|
204
|
-
end
|
205
|
-
|
206
|
-
it 'scrape links, recursive' do
|
207
|
-
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
208
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
209
|
-
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
210
|
-
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
211
|
-
]),
|
212
|
-
])
|
213
|
-
actual = root_node.inject(@agent, @index_page)
|
214
|
-
expected = [
|
215
|
-
{"content" => "Child 01 page.",
|
216
|
-
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
217
|
-
{"sub_page_title" => "Child 02 SubPage Test"}],},
|
218
|
-
{"content" => "Child 02 page.",
|
219
|
-
"sub_link" => [],},
|
220
|
-
{"content" => "Child 03 page.",
|
221
|
-
"sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
|
222
|
-
]
|
223
|
-
expect(actual).to match expected
|
224
|
-
end
|
225
|
-
it 'can be defined by DSL, return single LinkNode title' do
|
226
|
-
generated = Yasuri.links_title '/html/body/a'
|
227
|
-
original = Yasuri::LinksNode.new('/html/body/a', "title")
|
228
|
-
compare_generated_vs_original(generated, original)
|
229
|
-
end
|
230
|
-
it 'can be defined by DSL, return nested contents under link' do
|
231
|
-
generated = Yasuri.links_title '/html/body/a' do
|
232
|
-
text_name '/html/body/p'
|
233
|
-
end
|
234
|
-
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
235
|
-
Yasuri::TextNode.new('/html/body/p', "name"),
|
236
|
-
])
|
237
|
-
compare_generated_vs_original(generated, original)
|
238
|
-
end
|
239
|
-
|
240
|
-
it 'can be defined by DSL, return recursive links node' do
|
241
|
-
generated = Yasuri.links_root '/html/body/a' do
|
242
|
-
text_content '/html/body/p'
|
243
|
-
links_sub_link '/html/body/ul/li/a' do
|
244
|
-
text_sub_page_title '/html/head/title'
|
245
|
-
end
|
246
|
-
end
|
247
|
-
|
248
|
-
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
249
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
250
|
-
Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
|
251
|
-
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
252
|
-
]),
|
253
|
-
])
|
254
|
-
compare_generated_vs_original(generated, original)
|
255
|
-
end
|
256
|
-
end
|
257
|
-
|
258
|
-
############
|
259
|
-
# Paginate #
|
260
|
-
############
|
261
|
-
describe '::PaginateNode' do
|
262
|
-
before do
|
263
|
-
@uri += "/pagination/page01.html"
|
264
|
-
@page = @agent.get(@uri)
|
265
|
-
end
|
266
|
-
|
267
|
-
it "scrape each paginated pages" do
|
268
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
269
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
270
|
-
])
|
271
|
-
actual = root_node.inject(@agent, @page)
|
272
|
-
expected = [
|
273
|
-
{"content" => "PaginationTest01"},
|
274
|
-
{"content" => "PaginationTest02"},
|
275
|
-
{"content" => "PaginationTest03"},
|
276
|
-
{"content" => "PaginationTest04"},
|
277
|
-
]
|
278
|
-
expect(actual).to match expected
|
279
|
-
end
|
280
|
-
|
281
|
-
it "scrape each paginated pages limited" do
|
282
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
283
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
284
|
-
], limit:3)
|
285
|
-
actual = root_node.inject(@agent, @page)
|
286
|
-
expected = [
|
287
|
-
{"content" => "PaginationTest01"},
|
288
|
-
{"content" => "PaginationTest02"},
|
289
|
-
{"content" => "PaginationTest03"},
|
290
|
-
]
|
291
|
-
expect(actual).to match expected
|
292
|
-
end
|
293
|
-
|
294
|
-
|
295
|
-
it 'return first content if paginate link node is not found' do
|
296
|
-
missing_xpath = "/html/body/nav/span/b[@class='next']"
|
297
|
-
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
298
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
299
|
-
])
|
300
|
-
actual = root_node.inject(@agent, @page)
|
301
|
-
expected = [ {"content" => "PaginationTest01"}, ]
|
302
|
-
expect(actual).to match_array expected
|
303
|
-
end
|
304
|
-
|
305
|
-
it 'return empty hashes if content node is not found' do
|
306
|
-
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
307
|
-
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
308
|
-
])
|
309
|
-
actual = root_node.inject(@agent, @page)
|
310
|
-
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
311
|
-
expect(actual).to match_array expected
|
312
|
-
end
|
313
|
-
|
314
|
-
it 'can be defined by DSL, return single PaginateNode content' do
|
315
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
|
316
|
-
text_content '/html/body/p'
|
317
|
-
end
|
318
|
-
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
319
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
320
|
-
])
|
321
|
-
compare_generated_vs_original(generated, original, @page)
|
322
|
-
end
|
323
|
-
|
324
|
-
it 'can be defined by DSL, return single PaginateNode content limited' do
|
325
|
-
generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", 2 do
|
326
|
-
text_content '/html/body/p'
|
327
|
-
end
|
328
|
-
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
329
|
-
Yasuri::TextNode.new('/html/body/p', "content"),
|
330
|
-
], limit: 2)
|
331
|
-
compare_generated_vs_original(generated, original, @page)
|
332
|
-
end
|
333
|
-
end
|
334
|
-
|
335
19
|
#############
|
336
20
|
# json2tree #
|
337
21
|
#############
|
@@ -348,9 +32,21 @@ describe 'Yasuri' do
|
|
348
32
|
}|
|
349
33
|
generated = Yasuri.json2tree(src)
|
350
34
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
351
|
-
compare_generated_vs_original(generated, original)
|
35
|
+
compare_generated_vs_original(generated, original, @index_page)
|
36
|
+
end
|
37
|
+
|
38
|
+
it "return TextNode with truncate_regexp" do
|
39
|
+
src = %q| { "node" : "text",
|
40
|
+
"name" : "content",
|
41
|
+
"path" : "/html/body/p[1]",
|
42
|
+
"truncate" : "^[^,]+"
|
43
|
+
}|
|
44
|
+
generated = Yasuri.json2tree(src)
|
45
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
46
|
+
compare_generated_vs_original(generated, original, @index_page)
|
352
47
|
end
|
353
48
|
|
49
|
+
|
354
50
|
it "return LinksNode/TextNode" do
|
355
51
|
src = %q| { "node" : "links",
|
356
52
|
"name" : "root",
|
@@ -364,7 +60,8 @@ describe 'Yasuri' do
|
|
364
60
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
365
61
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
366
62
|
])
|
367
|
-
|
63
|
+
|
64
|
+
compare_generated_vs_original(generated, original, @index_page)
|
368
65
|
end
|
369
66
|
|
370
67
|
it "return PaginateNode/TextNode" do
|
@@ -437,6 +134,83 @@ describe 'Yasuri' do
|
|
437
134
|
end
|
438
135
|
end
|
439
136
|
|
137
|
+
#############
|
138
|
+
# tree2json #
|
139
|
+
#############
|
140
|
+
describe '.tree2json' do
|
141
|
+
it "return empty json" do
|
142
|
+
json = Yasuri.tree2json(nil)
|
143
|
+
expect(json).to match "{}"
|
144
|
+
end
|
145
|
+
|
146
|
+
it "return text node" do
|
147
|
+
node = Yasuri::TextNode.new("/html/head/title", "title")
|
148
|
+
json = Yasuri.tree2json(node)
|
149
|
+
expected_str = %q| { "node": "text",
|
150
|
+
"name": "title",
|
151
|
+
"path": "/html/head/title"
|
152
|
+
} |
|
153
|
+
expected = JSON.parse(expected_str)
|
154
|
+
actual = JSON.parse(json)
|
155
|
+
expect(actual).to match expected
|
156
|
+
end
|
157
|
+
|
158
|
+
it "return text node with truncate_regexp" do
|
159
|
+
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
160
|
+
json = Yasuri.tree2json(node)
|
161
|
+
expected_str = %q| { "node": "text",
|
162
|
+
"name": "title",
|
163
|
+
"path": "/html/head/title",
|
164
|
+
"truncate": "^[^,]+"
|
165
|
+
} |
|
166
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
167
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
168
|
+
expect(actual).to match expected
|
169
|
+
end
|
170
|
+
|
171
|
+
it "return LinksNode/TextNode" do
|
172
|
+
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
173
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
174
|
+
])
|
175
|
+
json = Yasuri.tree2json(tree)
|
176
|
+
expected_src = %q| { "node" : "links",
|
177
|
+
"name" : "root",
|
178
|
+
"path" : "/html/body/a",
|
179
|
+
"children" : [ { "node" : "text",
|
180
|
+
"name" : "content",
|
181
|
+
"path" : "/html/body/p"
|
182
|
+
} ]
|
183
|
+
}|
|
184
|
+
expected = JSON.parse(expected_src)
|
185
|
+
actual = JSON.parse(json)
|
186
|
+
expect(actual).to match expected
|
187
|
+
end
|
188
|
+
|
189
|
+
it "return PaginateNode/TextNode with limit" do
|
190
|
+
tree = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
191
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
192
|
+
], limit:10)
|
193
|
+
|
194
|
+
json = Yasuri.tree2json(tree)
|
195
|
+
expected_src = %q| { "node" : "pages",
|
196
|
+
"name" : "root",
|
197
|
+
"path" : "/html/body/nav/span/a[@class='next']",
|
198
|
+
"limit" : 10,
|
199
|
+
"children" : [ { "node" : "text",
|
200
|
+
"name" : "content",
|
201
|
+
"path" : "/html/body/p"
|
202
|
+
} ]
|
203
|
+
}|
|
204
|
+
expected = JSON.parse(expected_src)
|
205
|
+
actual = JSON.parse(json)
|
206
|
+
expect(actual).to match expected
|
207
|
+
end
|
208
|
+
|
209
|
+
|
210
|
+
|
211
|
+
end
|
212
|
+
|
213
|
+
|
440
214
|
it 'has a version number' do
|
441
215
|
expect(Yasuri::VERSION).not_to be nil
|
442
216
|
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
##########
|
7
|
+
# Struct #
|
8
|
+
##########
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
describe '::StructNode' do
|
13
|
+
before do
|
14
|
+
@agent = Mechanize.new
|
15
|
+
@page = @agent.get(uri + "/structual_text.html")
|
16
|
+
|
17
|
+
@table_1996 = [
|
18
|
+
{ "title" => "The Perfect Insider",
|
19
|
+
"pub_date" => "1996/4/5" },
|
20
|
+
{ "title" => "Doctors in Isolated Room",
|
21
|
+
"pub_date" => "1996/7/5" },
|
22
|
+
{ "title" => "Mathematical Goodbye",
|
23
|
+
"pub_date" => "1996/9/5" },
|
24
|
+
]
|
25
|
+
@table_1997 = [
|
26
|
+
{ "title" => "Jack the Poetical Private",
|
27
|
+
"pub_date" => "1997/1/5" },
|
28
|
+
{ "title" => "Who Inside",
|
29
|
+
"pub_date" => "1997/4/5" },
|
30
|
+
{ "title" => "Illusion Acts Like Magic",
|
31
|
+
"pub_date" => "1997/10/5" },
|
32
|
+
]
|
33
|
+
@table_1998 = [
|
34
|
+
{ "title" => "Replaceable Summer",
|
35
|
+
"pub_date" => "1998/1/7" },
|
36
|
+
{ "title" => "Switch Back",
|
37
|
+
"pub_date" => "1998/4/5" },
|
38
|
+
{ "title" => "Numerical Models",
|
39
|
+
"pub_date" => "1998/7/5" },
|
40
|
+
{ "title" => "The Perfect Outsider",
|
41
|
+
"pub_date" => "1998/10/5" },
|
42
|
+
]
|
43
|
+
@all_tables = [
|
44
|
+
{"table" => @table_1996},
|
45
|
+
{"table" => @table_1997},
|
46
|
+
{"table" => @table_1998},
|
47
|
+
]
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'scrape single table contents' do
|
51
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
52
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
53
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
54
|
+
])
|
55
|
+
expected = @table_1996
|
56
|
+
actual = node.inject(@agent, @page)
|
57
|
+
expect(actual).to match expected
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'return empty text if no match node' do
|
61
|
+
no_match_xpath = '/html/body/table[1]/t'
|
62
|
+
node = Yasuri::StructNode.new(no_match_xpath, "table", [
|
63
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
64
|
+
])
|
65
|
+
actual = node.inject(@agent, @page)
|
66
|
+
expect(actual).to be_empty
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'fail with invalid xpath' do
|
70
|
+
invalid_xpath = '/html/body/table[1]/table[1]/tr['
|
71
|
+
node = Yasuri::StructNode.new(invalid_xpath, "table", [
|
72
|
+
Yasuri::TextNode.new('./td[1]', "title")
|
73
|
+
])
|
74
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'fail with invalid xpath in children' do
|
78
|
+
invalid_xpath = './td[1]['
|
79
|
+
node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
|
80
|
+
Yasuri::TextNode.new(invalid_xpath, "title"),
|
81
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
82
|
+
])
|
83
|
+
expect { node.inject(@agent, @page) }.to raise_error
|
84
|
+
end
|
85
|
+
|
86
|
+
it 'scrape all tables' do
|
87
|
+
node = Yasuri::StructNode.new('/html/body/table', "tables", [
|
88
|
+
Yasuri::StructNode.new('./tr', "table", [
|
89
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
90
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
91
|
+
])
|
92
|
+
])
|
93
|
+
expected = @all_tables
|
94
|
+
actual = node.inject(@agent, @page)
|
95
|
+
expect(actual).to match expected
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'can be defined by DSL, scrape all tables' do
|
99
|
+
generated = Yasuri.struct_tables '/html/body/table' do
|
100
|
+
struct_table './tr' do
|
101
|
+
text_title './td[1]'
|
102
|
+
text_pub_date './td[2]'
|
103
|
+
end
|
104
|
+
end
|
105
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
106
|
+
Yasuri::StructNode.new('./tr', "table", [
|
107
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
108
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
109
|
+
])
|
110
|
+
])
|
111
|
+
compare_generated_vs_original(generated, original, @page)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require_relative 'spec_helper'
|
5
|
+
|
6
|
+
########
|
7
|
+
# Text #
|
8
|
+
########
|
9
|
+
describe 'Yasuri' do
|
10
|
+
include_context 'httpserver'
|
11
|
+
|
12
|
+
before do
|
13
|
+
@agent = Mechanize.new
|
14
|
+
@index_page = @agent.get(uri)
|
15
|
+
end
|
16
|
+
|
17
|
+
describe '::TextNode' do
|
18
|
+
before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
|
19
|
+
|
20
|
+
it 'scrape text text <p>Hello,Yasuri</p>' do
|
21
|
+
actual = @node.inject(@agent, @index_page)
|
22
|
+
expect(actual).to eq "Hello,Yasuri"
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'return empty text if no match node' do
|
26
|
+
no_match_node = Yasuri::TextNode.new('/html/body/no_match_node', "title")
|
27
|
+
actual = no_match_node.inject(@agent, @index_page)
|
28
|
+
expect(actual).to be_empty
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'fail with invalid xpath' do
|
32
|
+
invalid_xpath = '/html/body/no_match_node['
|
33
|
+
node = Yasuri::TextNode.new(invalid_xpath, "title")
|
34
|
+
expect { node.inject(@agent, @index_page) }.to raise_error
|
35
|
+
end
|
36
|
+
|
37
|
+
it "can be defined by DSL, return single TextNode title" do
|
38
|
+
generated = Yasuri.text_title '/html/body/p[1]'
|
39
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "title")
|
40
|
+
compare_generated_vs_original(generated, original, @index_page)
|
41
|
+
end
|
42
|
+
|
43
|
+
it "can be truncated with regexp" do
|
44
|
+
node = Yasuri.text_title '/html/body/p[1]', /^[^,]+/
|
45
|
+
actual = node.inject(@agent, @index_page)
|
46
|
+
expect(actual).to eq "Hello"
|
47
|
+
end
|
48
|
+
|
49
|
+
it "can be truncated with regexp" do
|
50
|
+
node = Yasuri.text_title '/html/body/p[1]', /[^,]+$/
|
51
|
+
actual = node.inject(@agent, @index_page)
|
52
|
+
expect(actual).to eq "Yasuri"
|
53
|
+
end
|
54
|
+
|
55
|
+
it "return empty string if truncated with no match to regexp" do
|
56
|
+
node = Yasuri.text_title '/html/body/p[1]', /^hoge/
|
57
|
+
actual = node.inject(@agent, @index_page)
|
58
|
+
expect(actual).to be_empty
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yasuri
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- TAC
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-02-
|
11
|
+
date: 2015-02-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -155,6 +155,12 @@ files:
|
|
155
155
|
- lib/yasuri.rb
|
156
156
|
- lib/yasuri/version.rb
|
157
157
|
- lib/yasuri/yasuri.rb
|
158
|
+
- lib/yasuri/yasuri_links_node.rb
|
159
|
+
- lib/yasuri/yasuri_node.rb
|
160
|
+
- lib/yasuri/yasuri_node_generator.rb
|
161
|
+
- lib/yasuri/yasuri_paginate_node.rb
|
162
|
+
- lib/yasuri/yasuri_struct_node.rb
|
163
|
+
- lib/yasuri/yasuri_text_node.rb
|
158
164
|
- spec/htdocs/child01.html
|
159
165
|
- spec/htdocs/child01_sub.html
|
160
166
|
- spec/htdocs/child02.html
|
@@ -169,7 +175,12 @@ files:
|
|
169
175
|
- spec/htdocs/structual_text.html
|
170
176
|
- spec/servers/httpserver.rb
|
171
177
|
- spec/spec_helper.rb
|
178
|
+
- spec/yasuri_links_node_spec.rb
|
179
|
+
- spec/yasuri_node_spec.rb
|
180
|
+
- spec/yasuri_paginate_node_spec.rb
|
172
181
|
- spec/yasuri_spec.rb
|
182
|
+
- spec/yasuri_struct_node_spec.rb
|
183
|
+
- spec/yasuri_text_node_spec.rb
|
173
184
|
- yasuri.gemspec
|
174
185
|
homepage: https://github.com/tac0x2a/yasuri
|
175
186
|
licenses:
|
@@ -210,4 +221,9 @@ test_files:
|
|
210
221
|
- spec/htdocs/structual_text.html
|
211
222
|
- spec/servers/httpserver.rb
|
212
223
|
- spec/spec_helper.rb
|
224
|
+
- spec/yasuri_links_node_spec.rb
|
225
|
+
- spec/yasuri_node_spec.rb
|
226
|
+
- spec/yasuri_paginate_node_spec.rb
|
213
227
|
- spec/yasuri_spec.rb
|
228
|
+
- spec/yasuri_struct_node_spec.rb
|
229
|
+
- spec/yasuri_text_node_spec.rb
|