yasuri 2.0.11 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +88 -19
- data/USAGE.ja.md +325 -63
- data/USAGE.md +335 -69
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +80 -39
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +39 -0
- data/lib/yasuri/yasuri_node.rb +24 -3
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +18 -6
- data/lib/yasuri/yasuri_struct_node.rb +8 -4
- data/lib/yasuri/yasuri_text_node.rb +11 -4
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/spec_helper.rb +1 -6
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_links_node_spec.rb +12 -4
- data/spec/yasuri_map_spec.rb +76 -0
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +199 -84
- data/spec/yasuri_struct_node_spec.rb +42 -1
- data/yasuri.gemspec +5 -3
- metadata +52 -19
@@ -6,16 +6,20 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
|
+
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
14
|
-
[child_name, child_node.inject(agent,
|
13
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
|
+
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
tree.size == 1 ? tree.first : tree
|
19
19
|
end # inject
|
20
|
+
|
21
|
+
def node_type_str
|
22
|
+
"struct".freeze
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
@@ -7,19 +7,21 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
+
truncate = opt[:truncate]
|
14
|
+
proc = opt[:proc]
|
15
|
+
|
13
16
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
14
17
|
@truncate = truncate
|
15
18
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
16
19
|
|
17
20
|
@proc = proc.nil? ? nil : proc.to_sym
|
18
|
-
|
19
21
|
end
|
20
22
|
|
21
|
-
def inject(agent, page, opt = {})
|
22
|
-
node =
|
23
|
+
def inject(agent, page, opt = {}, element = page)
|
24
|
+
node = element.search(@xpath)
|
23
25
|
text = node.text.to_s
|
24
26
|
|
25
27
|
if @truncate
|
@@ -28,11 +30,16 @@ module Yasuri
|
|
28
30
|
end
|
29
31
|
|
30
32
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
33
|
+
|
31
34
|
text
|
32
35
|
end
|
33
36
|
|
34
37
|
def opts
|
35
38
|
{truncate:@truncate, proc:@proc}
|
36
39
|
end
|
40
|
+
|
41
|
+
def node_type_str
|
42
|
+
"text".freeze
|
43
|
+
end
|
37
44
|
end
|
38
45
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>StructualLinksTest</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
|
7
|
+
<table>
|
8
|
+
<thead>
|
9
|
+
<tr>
|
10
|
+
<th>Title</th>
|
11
|
+
<th>Links</th>
|
12
|
+
</tr>
|
13
|
+
</thead>
|
14
|
+
<tr>
|
15
|
+
<td>Child01,02</td>
|
16
|
+
<td><a href="../child01.html">Child01</a></td>
|
17
|
+
<td><a href="../child02.html">Child02</a></td>
|
18
|
+
<td>../child02.html</td>
|
19
|
+
</tr>
|
20
|
+
|
21
|
+
<tr>
|
22
|
+
<td>Child01,02,03</td>
|
23
|
+
<td><a href="../child01.html">Child01</a></td>
|
24
|
+
<td><a href="../child02.html">Child02</a></td>
|
25
|
+
<td><a href="../child03.html">Child03</a></td>
|
26
|
+
</tr>
|
27
|
+
</table>
|
28
|
+
|
29
|
+
</body>
|
30
|
+
</html>
|
File without changes
|
data/spec/spec_helper.rb
CHANGED
@@ -12,16 +12,11 @@ shared_context 'httpserver' do
|
|
12
12
|
}
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
17
|
-
# require "codeclimate-test-reporter"
|
18
|
-
# CodeClimate::TestReporter.start
|
19
|
-
|
20
15
|
require 'simplecov'
|
21
16
|
require 'coveralls'
|
22
17
|
Coveralls.wear!
|
23
18
|
|
24
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
19
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
25
20
|
SimpleCov::Formatter::HTMLFormatter,
|
26
21
|
Coveralls::SimpleCov::Formatter
|
27
22
|
]
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
|
10
|
+
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe 'cli scrape' do
|
14
|
+
it "require --file or --json option" do
|
15
|
+
expect {
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it "only one of --file or --json option" do
|
21
|
+
expect {
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
|
23
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it "require --file option is not empty string" do
|
27
|
+
expect {
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
|
29
|
+
}.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it "require --json option is not empty string" do
|
33
|
+
expect {
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
|
35
|
+
}.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
it "display text node as simple string" do
|
40
|
+
expect {
|
41
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
|
42
|
+
}.to output("Yasuri Test\n").to_stdout
|
43
|
+
end
|
44
|
+
|
45
|
+
it "display texts in single json" do
|
46
|
+
expect {
|
47
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
+
}.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
it "display text node as simple string via json file" do
|
53
|
+
expect {
|
54
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.json"})
|
55
|
+
}.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
|
56
|
+
end
|
57
|
+
it "display text node as simple string via yaml file" do
|
58
|
+
expect {
|
59
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.yml"})
|
60
|
+
}.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
|
61
|
+
end
|
62
|
+
|
63
|
+
|
64
|
+
it "display ERROR when json string is wrong" do
|
65
|
+
wrong_json = '{,,}'
|
66
|
+
expect {
|
67
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
68
|
+
}.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
|
69
|
+
end
|
70
|
+
it "display ERROR when json file contains is wrong" do
|
71
|
+
file_path = "#{@res_dir}/tree_wrong.json"
|
72
|
+
expect {
|
73
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
74
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
|
75
|
+
end
|
76
|
+
it "display ERROR when yaml file contains is wrong" do
|
77
|
+
file_path = "#{@res_dir}/tree_wrong.yml"
|
78
|
+
expect {
|
79
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
80
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -59,10 +59,18 @@ describe 'Yasuri' do
|
|
59
59
|
]
|
60
60
|
expect(actual).to match expected
|
61
61
|
end
|
62
|
-
it 'can be defined by DSL, return
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
it 'can be defined by DSL, return no contains if no child node' do
|
63
|
+
root_node = Yasuri.links_title '/html/body/a'
|
64
|
+
actual = root_node.inject(@agent, @index_page)
|
65
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
66
|
+
expect(actual).to match expected
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'can be defined return no contains if no child node' do
|
70
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
71
|
+
actual = root_node.inject(@agent, @index_page)
|
72
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
73
|
+
expect(actual).to match expected
|
66
74
|
end
|
67
75
|
it 'can be defined by DSL, return nested contents under link' do
|
68
76
|
generated = Yasuri.links_title '/html/body/a' do
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
end
|
10
|
+
|
11
|
+
describe '::MapNode' do
|
12
|
+
it "multi scrape in singe page" do
|
13
|
+
map = Yasuri.map_sample do
|
14
|
+
text_title '/html/head/title'
|
15
|
+
text_body_p '/html/body/p[1]'
|
16
|
+
end
|
17
|
+
actual = map.inject(@agent, @index_page)
|
18
|
+
|
19
|
+
expected = {
|
20
|
+
"title" => "Yasuri Test",
|
21
|
+
"body_p" => "Hello,Yasuri"
|
22
|
+
}
|
23
|
+
expect(actual).to include expected
|
24
|
+
end
|
25
|
+
|
26
|
+
it "nested multi scrape in singe page" do
|
27
|
+
map = Yasuri.map_sample do
|
28
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
29
|
+
map_group2 do
|
30
|
+
text_child01 '/html/body/a[1]'
|
31
|
+
text_child03 '/html/body/a[3]'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
actual = map.inject(@agent, @index_page)
|
35
|
+
|
36
|
+
expected = {
|
37
|
+
"group1" => {
|
38
|
+
"child01" => "child01"
|
39
|
+
},
|
40
|
+
"group2" => {
|
41
|
+
"child01" => "child01",
|
42
|
+
"child03" => "child03"
|
43
|
+
}
|
44
|
+
}
|
45
|
+
expect(actual).to include expected
|
46
|
+
end
|
47
|
+
|
48
|
+
it "scrape with links node" do
|
49
|
+
map = Yasuri.map_sample do
|
50
|
+
map_group1 do
|
51
|
+
links_a '/html/body/a' do
|
52
|
+
text_content '/html/body/p'
|
53
|
+
end
|
54
|
+
text_child01 '/html/body/a[1]'
|
55
|
+
end
|
56
|
+
map_group2 do
|
57
|
+
text_child03 '/html/body/a[3]'
|
58
|
+
end
|
59
|
+
end
|
60
|
+
actual = map.inject(@agent, @index_page)
|
61
|
+
|
62
|
+
expected = {
|
63
|
+
"group1" => {
|
64
|
+
"a" => [
|
65
|
+
{"content" => "Child 01 page."},
|
66
|
+
{"content" => "Child 02 page."},
|
67
|
+
{"content" => "Child 03 page."},
|
68
|
+
],
|
69
|
+
"child01" => "child01"
|
70
|
+
},
|
71
|
+
"group2" => { "child03" => "child03" }
|
72
|
+
}
|
73
|
+
expect(actual).to include expected
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -30,6 +30,49 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
+
it "scrape each paginated pages with flatten" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
+
Yasuri::TextNode.new('./a', "text"),
|
38
|
+
]),
|
39
|
+
], flatten: true)
|
40
|
+
actual = root_node.inject(@agent, @page)
|
41
|
+
expected = [
|
42
|
+
"PaginationTest01",
|
43
|
+
{"text"=>""},
|
44
|
+
{"text"=>""},
|
45
|
+
{"text" => "2"},
|
46
|
+
{"text" => "3"},
|
47
|
+
{"text" => "4"},
|
48
|
+
{"text"=>"NextPage »"},
|
49
|
+
"PaginationTest02",
|
50
|
+
{"text"=>"« PreviousPage"},
|
51
|
+
{"text" => "1"},
|
52
|
+
{"text"=>""},
|
53
|
+
{"text" => "3"},
|
54
|
+
{"text" => "4"},
|
55
|
+
{"text"=>"NextPage »"},
|
56
|
+
"PaginationTest03",
|
57
|
+
{"text"=>"« PreviousPage"},
|
58
|
+
{"text" => "1"},
|
59
|
+
{"text" => "2"},
|
60
|
+
{"text"=>""},
|
61
|
+
{"text" => "4"},
|
62
|
+
{"text"=>"NextPage »"},
|
63
|
+
"PaginationTest04",
|
64
|
+
{"text"=>"« PreviousPage"},
|
65
|
+
{"text" => "1"},
|
66
|
+
{"text" => "2"},
|
67
|
+
{"text" => "3"},
|
68
|
+
{"text"=>""},
|
69
|
+
{"text"=>""},
|
70
|
+
]
|
71
|
+
|
72
|
+
expect(actual).to match expected
|
73
|
+
end
|
74
|
+
|
75
|
+
|
33
76
|
it "scrape each paginated pages limited" do
|
34
77
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
78
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -13,6 +13,75 @@ describe 'Yasuri' do
|
|
13
13
|
@index_page = @agent.get(@uri)
|
14
14
|
end
|
15
15
|
|
16
|
+
|
17
|
+
############
|
18
|
+
# yam2tree #
|
19
|
+
############
|
20
|
+
describe '.yaml2tree' do
|
21
|
+
it "fail if empty yaml" do
|
22
|
+
expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
|
23
|
+
end
|
24
|
+
|
25
|
+
it "return text node" do
|
26
|
+
src = <<-EOB
|
27
|
+
text_content: "/html/body/p[1]"
|
28
|
+
EOB
|
29
|
+
generated = Yasuri.yaml2tree(src)
|
30
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
|
+
|
32
|
+
compare_generated_vs_original(generated, original, @index_page)
|
33
|
+
end
|
34
|
+
|
35
|
+
it "return text node as symbol" do
|
36
|
+
src = <<-EOB
|
37
|
+
:text_content:
|
38
|
+
:path: "/html/body/p[1]"
|
39
|
+
EOB
|
40
|
+
generated = Yasuri.yaml2tree(src)
|
41
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
42
|
+
|
43
|
+
compare_generated_vs_original(generated, original, @index_page)
|
44
|
+
end
|
45
|
+
|
46
|
+
it "return LinksNode/TextNode" do
|
47
|
+
|
48
|
+
src = <<-EOB
|
49
|
+
links_root:
|
50
|
+
path: "/html/body/a"
|
51
|
+
text_content: "/html/body/p"
|
52
|
+
EOB
|
53
|
+
generated = Yasuri.yaml2tree(src)
|
54
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
55
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
56
|
+
])
|
57
|
+
|
58
|
+
compare_generated_vs_original(generated, original, @index_page)
|
59
|
+
end
|
60
|
+
|
61
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
62
|
+
src = <<-EOB
|
63
|
+
struct_tables:
|
64
|
+
path: "/html/body/table"
|
65
|
+
struct_table:
|
66
|
+
path: "./tr"
|
67
|
+
text_title: "./td[1]"
|
68
|
+
text_pub_date: "./td[2]"
|
69
|
+
EOB
|
70
|
+
|
71
|
+
generated = Yasuri.yaml2tree(src)
|
72
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
73
|
+
Yasuri::StructNode.new('./tr', "table", [
|
74
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
75
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
76
|
+
])
|
77
|
+
])
|
78
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
79
|
+
compare_generated_vs_original(generated, original, page)
|
80
|
+
end
|
81
|
+
|
82
|
+
end # end of describe '.yaml2tree'
|
83
|
+
|
84
|
+
|
16
85
|
#############
|
17
86
|
# json2tree #
|
18
87
|
#############
|
@@ -22,10 +91,10 @@ describe 'Yasuri' do
|
|
22
91
|
end
|
23
92
|
|
24
93
|
it "return TextNode" do
|
25
|
-
src = %q|
|
26
|
-
|
27
|
-
|
28
|
-
|
94
|
+
src = %q|
|
95
|
+
{
|
96
|
+
"text_content": "/html/body/p[1]"
|
97
|
+
}|
|
29
98
|
generated = Yasuri.json2tree(src)
|
30
99
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
100
|
|
@@ -33,26 +102,41 @@ describe 'Yasuri' do
|
|
33
102
|
end
|
34
103
|
|
35
104
|
it "return TextNode with truncate_regexp" do
|
36
|
-
src = %q|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
105
|
+
src = %q|
|
106
|
+
{
|
107
|
+
"text_content": {
|
108
|
+
"path": "/html/body/p[1]",
|
109
|
+
"truncate" : "^[^,]+"
|
110
|
+
}
|
111
|
+
}|
|
41
112
|
generated = Yasuri.json2tree(src)
|
42
113
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
43
114
|
compare_generated_vs_original(generated, original, @index_page)
|
44
115
|
end
|
45
116
|
|
117
|
+
it "return MapNode with TextNodes" do
|
118
|
+
src = %q|
|
119
|
+
{
|
120
|
+
"text_content01": "/html/body/p[1]",
|
121
|
+
"text_content02": "/html/body/p[2]"
|
122
|
+
}|
|
123
|
+
generated = Yasuri.json2tree(src)
|
124
|
+
original = Yasuri::MapNode.new('parent', [
|
125
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
126
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
127
|
+
])
|
128
|
+
compare_generated_vs_original(generated, original, @index_page)
|
129
|
+
end
|
46
130
|
|
47
131
|
it "return LinksNode/TextNode" do
|
48
|
-
src = %q|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
132
|
+
src = %q|
|
133
|
+
{
|
134
|
+
"links_root": {
|
135
|
+
"path": "/html/body/a",
|
136
|
+
"text_content": "/html/body/p"
|
137
|
+
}
|
138
|
+
}|
|
139
|
+
|
56
140
|
generated = Yasuri.json2tree(src)
|
57
141
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
58
142
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -62,14 +146,13 @@ describe 'Yasuri' do
|
|
62
146
|
end
|
63
147
|
|
64
148
|
it "return PaginateNode/TextNode" do
|
65
|
-
src = %q|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
}|
|
149
|
+
src = %q|
|
150
|
+
{
|
151
|
+
"pages_root": {
|
152
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
153
|
+
"text_content": "/html/body/p"
|
154
|
+
}
|
155
|
+
}|
|
73
156
|
generated = Yasuri.json2tree(src)
|
74
157
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
75
158
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -81,15 +164,14 @@ describe 'Yasuri' do
|
|
81
164
|
end
|
82
165
|
|
83
166
|
it "return PaginateNode/TextNode with limit" do
|
84
|
-
src = %q|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
}|
|
167
|
+
src = %q|
|
168
|
+
{
|
169
|
+
"pages_root": {
|
170
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
171
|
+
"limit": 2,
|
172
|
+
"text_content": "/html/body/p"
|
173
|
+
}
|
174
|
+
}|
|
93
175
|
generated = Yasuri.json2tree(src)
|
94
176
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
95
177
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -101,24 +183,17 @@ describe 'Yasuri' do
|
|
101
183
|
end
|
102
184
|
|
103
185
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
},
|
116
|
-
{ "node" : "text",
|
117
|
-
"name" : "pub_date",
|
118
|
-
"path" : "./td[2]"
|
119
|
-
}]
|
120
|
-
}]
|
121
|
-
}|
|
186
|
+
src = %q|
|
187
|
+
{
|
188
|
+
"struct_tables": {
|
189
|
+
"path": "/html/body/table",
|
190
|
+
"struct_table": {
|
191
|
+
"path": "./tr",
|
192
|
+
"text_title": "./td[1]",
|
193
|
+
"text_pub_date": "./td[2]"
|
194
|
+
}
|
195
|
+
}
|
196
|
+
}|
|
122
197
|
generated = Yasuri.json2tree(src)
|
123
198
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
124
199
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -126,27 +201,27 @@ describe 'Yasuri' do
|
|
126
201
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
202
|
])
|
128
203
|
])
|
129
|
-
page = @agent.get(@uri + "/structual_text.html")
|
204
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
130
205
|
compare_generated_vs_original(generated, original, page)
|
131
206
|
end
|
132
207
|
end
|
133
208
|
|
209
|
+
|
134
210
|
#############
|
135
211
|
# tree2json #
|
136
212
|
#############
|
137
213
|
describe '.tree2json' do
|
138
214
|
it "return empty json" do
|
139
|
-
|
140
|
-
expect(json).to match "{}"
|
215
|
+
expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
|
141
216
|
end
|
142
217
|
|
143
218
|
it "return text node" do
|
144
219
|
node = Yasuri::TextNode.new("/html/head/title", "title")
|
145
220
|
json = Yasuri.tree2json(node)
|
146
|
-
expected_str = %q|
|
147
|
-
|
148
|
-
|
149
|
-
|
221
|
+
expected_str = %q|
|
222
|
+
{
|
223
|
+
"text_title": "/html/head/title"
|
224
|
+
}|
|
150
225
|
expected = JSON.parse(expected_str)
|
151
226
|
actual = JSON.parse(json)
|
152
227
|
expect(actual).to match expected
|
@@ -155,29 +230,49 @@ describe 'Yasuri' do
|
|
155
230
|
it "return text node with truncate_regexp" do
|
156
231
|
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
157
232
|
json = Yasuri.tree2json(node)
|
158
|
-
expected_str = %q|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
233
|
+
expected_str = %q|
|
234
|
+
{
|
235
|
+
"text_title": {
|
236
|
+
"path": "/html/head/title",
|
237
|
+
"truncate": "^[^,]+"
|
238
|
+
}
|
239
|
+
}|
|
163
240
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
164
241
|
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
165
242
|
expect(actual).to match expected
|
166
243
|
end
|
167
244
|
|
245
|
+
it "return map node with text nodes" do
|
246
|
+
tree = Yasuri::MapNode.new('parent', [
|
247
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
248
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
249
|
+
])
|
250
|
+
actual_json = Yasuri.tree2json(tree)
|
251
|
+
|
252
|
+
expected_json = %q|
|
253
|
+
{
|
254
|
+
"text_content01": "/html/body/p[1]",
|
255
|
+
"text_content02": "/html/body/p[2]"
|
256
|
+
}|
|
257
|
+
|
258
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
259
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
260
|
+
expect(actual).to match expected
|
261
|
+
end
|
262
|
+
|
168
263
|
it "return LinksNode/TextNode" do
|
169
264
|
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
170
265
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
171
266
|
])
|
172
267
|
json = Yasuri.tree2json(tree)
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
268
|
+
|
269
|
+
expected_src = %q|
|
270
|
+
{
|
271
|
+
"links_root": {
|
272
|
+
"path": "/html/body/a",
|
273
|
+
"text_content":"/html/body/p"
|
274
|
+
}
|
275
|
+
}|
|
181
276
|
expected = JSON.parse(expected_src)
|
182
277
|
actual = JSON.parse(json)
|
183
278
|
expect(actual).to match expected
|
@@ -189,24 +284,44 @@ describe 'Yasuri' do
|
|
189
284
|
], limit:10)
|
190
285
|
|
191
286
|
json = Yasuri.tree2json(tree)
|
192
|
-
expected_src = %q|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
287
|
+
expected_src = %q|
|
288
|
+
{
|
289
|
+
"pages_root": {
|
290
|
+
"path": "/html/body/nav/span/a[@class='next']",
|
291
|
+
"limit": 10,
|
292
|
+
"flatten": false,
|
293
|
+
"text_content": "/html/body/p"
|
294
|
+
}
|
295
|
+
}|
|
201
296
|
expected = JSON.parse(expected_src)
|
202
297
|
actual = JSON.parse(json)
|
203
298
|
expect(actual).to match expected
|
204
299
|
end
|
205
|
-
|
206
|
-
|
207
|
-
|
208
300
|
end
|
209
301
|
|
302
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
303
|
+
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
304
|
+
Yasuri::StructNode.new('./tr', "table", [
|
305
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
306
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
307
|
+
])
|
308
|
+
])
|
309
|
+
json = Yasuri.tree2json(tree)
|
310
|
+
expected_src = %q|
|
311
|
+
{
|
312
|
+
"struct_tables": {
|
313
|
+
"path": "/html/body/table",
|
314
|
+
"struct_table": {
|
315
|
+
"path": "./tr",
|
316
|
+
"text_title": "./td[1]",
|
317
|
+
"text_pub_date": "./td[2]"
|
318
|
+
}
|
319
|
+
}
|
320
|
+
}|
|
321
|
+
expected = JSON.parse(expected_src)
|
322
|
+
actual = JSON.parse(json)
|
323
|
+
expect(actual).to match expected
|
324
|
+
end
|
210
325
|
|
211
326
|
it 'has a version number' do
|
212
327
|
expect(Yasuri::VERSION).not_to be nil
|