yasuri 2.0.11 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +88 -19
- data/USAGE.ja.md +325 -63
- data/USAGE.md +335 -69
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +80 -39
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +39 -0
- data/lib/yasuri/yasuri_node.rb +24 -3
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +18 -6
- data/lib/yasuri/yasuri_struct_node.rb +8 -4
- data/lib/yasuri/yasuri_text_node.rb +11 -4
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/htdocs/struct/structual_links.html +30 -0
- data/spec/htdocs/{structual_text.html → struct/structual_text.html} +0 -0
- data/spec/spec_helper.rb +1 -6
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_links_node_spec.rb +12 -4
- data/spec/yasuri_map_spec.rb +76 -0
- data/spec/yasuri_paginate_node_spec.rb +43 -0
- data/spec/yasuri_spec.rb +199 -84
- data/spec/yasuri_struct_node_spec.rb +42 -1
- data/yasuri.gemspec +5 -3
- metadata +52 -19
@@ -6,16 +6,20 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class StructNode
|
8
8
|
include Node
|
9
|
-
def inject(agent, page, opt = {})
|
10
|
-
sub_tags =
|
9
|
+
def inject(agent, page, opt = {}, element = page)
|
10
|
+
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
14
|
-
[child_name, child_node.inject(agent,
|
13
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
|
+
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
tree.size == 1 ? tree.first : tree
|
19
19
|
end # inject
|
20
|
+
|
21
|
+
def node_type_str
|
22
|
+
"struct".freeze
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
@@ -7,19 +7,21 @@ module Yasuri
|
|
7
7
|
class TextNode
|
8
8
|
include Node
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [],
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
super(xpath, name, children)
|
12
12
|
|
13
|
+
truncate = opt[:truncate]
|
14
|
+
proc = opt[:proc]
|
15
|
+
|
13
16
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
14
17
|
@truncate = truncate
|
15
18
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
16
19
|
|
17
20
|
@proc = proc.nil? ? nil : proc.to_sym
|
18
|
-
|
19
21
|
end
|
20
22
|
|
21
|
-
def inject(agent, page, opt = {})
|
22
|
-
node =
|
23
|
+
def inject(agent, page, opt = {}, element = page)
|
24
|
+
node = element.search(@xpath)
|
23
25
|
text = node.text.to_s
|
24
26
|
|
25
27
|
if @truncate
|
@@ -28,11 +30,16 @@ module Yasuri
|
|
28
30
|
end
|
29
31
|
|
30
32
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
33
|
+
|
31
34
|
text
|
32
35
|
end
|
33
36
|
|
34
37
|
def opts
|
35
38
|
{truncate:@truncate, proc:@proc}
|
36
39
|
end
|
40
|
+
|
41
|
+
def node_type_str
|
42
|
+
"text".freeze
|
43
|
+
end
|
37
44
|
end
|
38
45
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>StructualLinksTest</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
|
7
|
+
<table>
|
8
|
+
<thead>
|
9
|
+
<tr>
|
10
|
+
<th>Title</th>
|
11
|
+
<th>Links</th>
|
12
|
+
</tr>
|
13
|
+
</thead>
|
14
|
+
<tr>
|
15
|
+
<td>Child01,02</td>
|
16
|
+
<td><a href="../child01.html">Child01</a></td>
|
17
|
+
<td><a href="../child02.html">Child02</a></td>
|
18
|
+
<td>../child02.html</td>
|
19
|
+
</tr>
|
20
|
+
|
21
|
+
<tr>
|
22
|
+
<td>Child01,02,03</td>
|
23
|
+
<td><a href="../child01.html">Child01</a></td>
|
24
|
+
<td><a href="../child02.html">Child02</a></td>
|
25
|
+
<td><a href="../child03.html">Child03</a></td>
|
26
|
+
</tr>
|
27
|
+
</table>
|
28
|
+
|
29
|
+
</body>
|
30
|
+
</html>
|
File without changes
|
data/spec/spec_helper.rb
CHANGED
@@ -12,16 +12,11 @@ shared_context 'httpserver' do
|
|
12
12
|
}
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
17
|
-
# require "codeclimate-test-reporter"
|
18
|
-
# CodeClimate::TestReporter.start
|
19
|
-
|
20
15
|
require 'simplecov'
|
21
16
|
require 'coveralls'
|
22
17
|
Coveralls.wear!
|
23
18
|
|
24
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
19
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
25
20
|
SimpleCov::Formatter::HTMLFormatter,
|
26
21
|
Coveralls::SimpleCov::Formatter
|
27
22
|
]
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
|
10
|
+
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe 'cli scrape' do
|
14
|
+
it "require --file or --json option" do
|
15
|
+
expect {
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it "only one of --file or --json option" do
|
21
|
+
expect {
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
|
23
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it "require --file option is not empty string" do
|
27
|
+
expect {
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
|
29
|
+
}.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it "require --json option is not empty string" do
|
33
|
+
expect {
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
|
35
|
+
}.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
it "display text node as simple string" do
|
40
|
+
expect {
|
41
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
|
42
|
+
}.to output("Yasuri Test\n").to_stdout
|
43
|
+
end
|
44
|
+
|
45
|
+
it "display texts in single json" do
|
46
|
+
expect {
|
47
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
+
}.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
it "display text node as simple string via json file" do
|
53
|
+
expect {
|
54
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.json"})
|
55
|
+
}.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
|
56
|
+
end
|
57
|
+
it "display text node as simple string via yaml file" do
|
58
|
+
expect {
|
59
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.yml"})
|
60
|
+
}.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
|
61
|
+
end
|
62
|
+
|
63
|
+
|
64
|
+
it "display ERROR when json string is wrong" do
|
65
|
+
wrong_json = '{,,}'
|
66
|
+
expect {
|
67
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
68
|
+
}.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
|
69
|
+
end
|
70
|
+
it "display ERROR when json file contains is wrong" do
|
71
|
+
file_path = "#{@res_dir}/tree_wrong.json"
|
72
|
+
expect {
|
73
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
74
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
|
75
|
+
end
|
76
|
+
it "display ERROR when yaml file contains is wrong" do
|
77
|
+
file_path = "#{@res_dir}/tree_wrong.yml"
|
78
|
+
expect {
|
79
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
80
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -59,10 +59,18 @@ describe 'Yasuri' do
|
|
59
59
|
]
|
60
60
|
expect(actual).to match expected
|
61
61
|
end
|
62
|
-
it 'can be defined by DSL, return
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
it 'can be defined by DSL, return no contains if no child node' do
|
63
|
+
root_node = Yasuri.links_title '/html/body/a'
|
64
|
+
actual = root_node.inject(@agent, @index_page)
|
65
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
66
|
+
expect(actual).to match expected
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'can be defined return no contains if no child node' do
|
70
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
71
|
+
actual = root_node.inject(@agent, @index_page)
|
72
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
73
|
+
expect(actual).to match expected
|
66
74
|
end
|
67
75
|
it 'can be defined by DSL, return nested contents under link' do
|
68
76
|
generated = Yasuri.links_title '/html/body/a' do
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
end
|
10
|
+
|
11
|
+
describe '::MapNode' do
|
12
|
+
it "multi scrape in singe page" do
|
13
|
+
map = Yasuri.map_sample do
|
14
|
+
text_title '/html/head/title'
|
15
|
+
text_body_p '/html/body/p[1]'
|
16
|
+
end
|
17
|
+
actual = map.inject(@agent, @index_page)
|
18
|
+
|
19
|
+
expected = {
|
20
|
+
"title" => "Yasuri Test",
|
21
|
+
"body_p" => "Hello,Yasuri"
|
22
|
+
}
|
23
|
+
expect(actual).to include expected
|
24
|
+
end
|
25
|
+
|
26
|
+
it "nested multi scrape in singe page" do
|
27
|
+
map = Yasuri.map_sample do
|
28
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
29
|
+
map_group2 do
|
30
|
+
text_child01 '/html/body/a[1]'
|
31
|
+
text_child03 '/html/body/a[3]'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
actual = map.inject(@agent, @index_page)
|
35
|
+
|
36
|
+
expected = {
|
37
|
+
"group1" => {
|
38
|
+
"child01" => "child01"
|
39
|
+
},
|
40
|
+
"group2" => {
|
41
|
+
"child01" => "child01",
|
42
|
+
"child03" => "child03"
|
43
|
+
}
|
44
|
+
}
|
45
|
+
expect(actual).to include expected
|
46
|
+
end
|
47
|
+
|
48
|
+
it "scrape with links node" do
|
49
|
+
map = Yasuri.map_sample do
|
50
|
+
map_group1 do
|
51
|
+
links_a '/html/body/a' do
|
52
|
+
text_content '/html/body/p'
|
53
|
+
end
|
54
|
+
text_child01 '/html/body/a[1]'
|
55
|
+
end
|
56
|
+
map_group2 do
|
57
|
+
text_child03 '/html/body/a[3]'
|
58
|
+
end
|
59
|
+
end
|
60
|
+
actual = map.inject(@agent, @index_page)
|
61
|
+
|
62
|
+
expected = {
|
63
|
+
"group1" => {
|
64
|
+
"a" => [
|
65
|
+
{"content" => "Child 01 page."},
|
66
|
+
{"content" => "Child 02 page."},
|
67
|
+
{"content" => "Child 03 page."},
|
68
|
+
],
|
69
|
+
"child01" => "child01"
|
70
|
+
},
|
71
|
+
"group2" => { "child03" => "child03" }
|
72
|
+
}
|
73
|
+
expect(actual).to include expected
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -30,6 +30,49 @@ describe 'Yasuri' do
|
|
30
30
|
expect(actual).to match expected
|
31
31
|
end
|
32
32
|
|
33
|
+
it "scrape each paginated pages with flatten" do
|
34
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
36
|
+
Yasuri::StructNode.new('/html/body/nav/span', "span", [
|
37
|
+
Yasuri::TextNode.new('./a', "text"),
|
38
|
+
]),
|
39
|
+
], flatten: true)
|
40
|
+
actual = root_node.inject(@agent, @page)
|
41
|
+
expected = [
|
42
|
+
"PaginationTest01",
|
43
|
+
{"text"=>""},
|
44
|
+
{"text"=>""},
|
45
|
+
{"text" => "2"},
|
46
|
+
{"text" => "3"},
|
47
|
+
{"text" => "4"},
|
48
|
+
{"text"=>"NextPage »"},
|
49
|
+
"PaginationTest02",
|
50
|
+
{"text"=>"« PreviousPage"},
|
51
|
+
{"text" => "1"},
|
52
|
+
{"text"=>""},
|
53
|
+
{"text" => "3"},
|
54
|
+
{"text" => "4"},
|
55
|
+
{"text"=>"NextPage »"},
|
56
|
+
"PaginationTest03",
|
57
|
+
{"text"=>"« PreviousPage"},
|
58
|
+
{"text" => "1"},
|
59
|
+
{"text" => "2"},
|
60
|
+
{"text"=>""},
|
61
|
+
{"text" => "4"},
|
62
|
+
{"text"=>"NextPage »"},
|
63
|
+
"PaginationTest04",
|
64
|
+
{"text"=>"« PreviousPage"},
|
65
|
+
{"text" => "1"},
|
66
|
+
{"text" => "2"},
|
67
|
+
{"text" => "3"},
|
68
|
+
{"text"=>""},
|
69
|
+
{"text"=>""},
|
70
|
+
]
|
71
|
+
|
72
|
+
expect(actual).to match expected
|
73
|
+
end
|
74
|
+
|
75
|
+
|
33
76
|
it "scrape each paginated pages limited" do
|
34
77
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
35
78
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
data/spec/yasuri_spec.rb
CHANGED
@@ -13,6 +13,75 @@ describe 'Yasuri' do
|
|
13
13
|
@index_page = @agent.get(@uri)
|
14
14
|
end
|
15
15
|
|
16
|
+
|
17
|
+
############
|
18
|
+
# yam2tree #
|
19
|
+
############
|
20
|
+
describe '.yaml2tree' do
|
21
|
+
it "fail if empty yaml" do
|
22
|
+
expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
|
23
|
+
end
|
24
|
+
|
25
|
+
it "return text node" do
|
26
|
+
src = <<-EOB
|
27
|
+
text_content: "/html/body/p[1]"
|
28
|
+
EOB
|
29
|
+
generated = Yasuri.yaml2tree(src)
|
30
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
|
+
|
32
|
+
compare_generated_vs_original(generated, original, @index_page)
|
33
|
+
end
|
34
|
+
|
35
|
+
it "return text node as symbol" do
|
36
|
+
src = <<-EOB
|
37
|
+
:text_content:
|
38
|
+
:path: "/html/body/p[1]"
|
39
|
+
EOB
|
40
|
+
generated = Yasuri.yaml2tree(src)
|
41
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
42
|
+
|
43
|
+
compare_generated_vs_original(generated, original, @index_page)
|
44
|
+
end
|
45
|
+
|
46
|
+
it "return LinksNode/TextNode" do
|
47
|
+
|
48
|
+
src = <<-EOB
|
49
|
+
links_root:
|
50
|
+
path: "/html/body/a"
|
51
|
+
text_content: "/html/body/p"
|
52
|
+
EOB
|
53
|
+
generated = Yasuri.yaml2tree(src)
|
54
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
55
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
56
|
+
])
|
57
|
+
|
58
|
+
compare_generated_vs_original(generated, original, @index_page)
|
59
|
+
end
|
60
|
+
|
61
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
62
|
+
src = <<-EOB
|
63
|
+
struct_tables:
|
64
|
+
path: "/html/body/table"
|
65
|
+
struct_table:
|
66
|
+
path: "./tr"
|
67
|
+
text_title: "./td[1]"
|
68
|
+
text_pub_date: "./td[2]"
|
69
|
+
EOB
|
70
|
+
|
71
|
+
generated = Yasuri.yaml2tree(src)
|
72
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
73
|
+
Yasuri::StructNode.new('./tr', "table", [
|
74
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
75
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
76
|
+
])
|
77
|
+
])
|
78
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
79
|
+
compare_generated_vs_original(generated, original, page)
|
80
|
+
end
|
81
|
+
|
82
|
+
end # end of describe '.yaml2tree'
|
83
|
+
|
84
|
+
|
16
85
|
#############
|
17
86
|
# json2tree #
|
18
87
|
#############
|
@@ -22,10 +91,10 @@ describe 'Yasuri' do
|
|
22
91
|
end
|
23
92
|
|
24
93
|
it "return TextNode" do
|
25
|
-
src = %q|
|
26
|
-
|
27
|
-
|
28
|
-
|
94
|
+
src = %q|
|
95
|
+
{
|
96
|
+
"text_content": "/html/body/p[1]"
|
97
|
+
}|
|
29
98
|
generated = Yasuri.json2tree(src)
|
30
99
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
31
100
|
|
@@ -33,26 +102,41 @@ describe 'Yasuri' do
|
|
33
102
|
end
|
34
103
|
|
35
104
|
it "return TextNode with truncate_regexp" do
|
36
|
-
src = %q|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
105
|
+
src = %q|
|
106
|
+
{
|
107
|
+
"text_content": {
|
108
|
+
"path": "/html/body/p[1]",
|
109
|
+
"truncate" : "^[^,]+"
|
110
|
+
}
|
111
|
+
}|
|
41
112
|
generated = Yasuri.json2tree(src)
|
42
113
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
43
114
|
compare_generated_vs_original(generated, original, @index_page)
|
44
115
|
end
|
45
116
|
|
117
|
+
it "return MapNode with TextNodes" do
|
118
|
+
src = %q|
|
119
|
+
{
|
120
|
+
"text_content01": "/html/body/p[1]",
|
121
|
+
"text_content02": "/html/body/p[2]"
|
122
|
+
}|
|
123
|
+
generated = Yasuri.json2tree(src)
|
124
|
+
original = Yasuri::MapNode.new('parent', [
|
125
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
126
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
127
|
+
])
|
128
|
+
compare_generated_vs_original(generated, original, @index_page)
|
129
|
+
end
|
46
130
|
|
47
131
|
it "return LinksNode/TextNode" do
|
48
|
-
src = %q|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
132
|
+
src = %q|
|
133
|
+
{
|
134
|
+
"links_root": {
|
135
|
+
"path": "/html/body/a",
|
136
|
+
"text_content": "/html/body/p"
|
137
|
+
}
|
138
|
+
}|
|
139
|
+
|
56
140
|
generated = Yasuri.json2tree(src)
|
57
141
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
58
142
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -62,14 +146,13 @@ describe 'Yasuri' do
|
|
62
146
|
end
|
63
147
|
|
64
148
|
it "return PaginateNode/TextNode" do
|
65
|
-
src = %q|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
}|
|
149
|
+
src = %q|
|
150
|
+
{
|
151
|
+
"pages_root": {
|
152
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
153
|
+
"text_content": "/html/body/p"
|
154
|
+
}
|
155
|
+
}|
|
73
156
|
generated = Yasuri.json2tree(src)
|
74
157
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
75
158
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -81,15 +164,14 @@ describe 'Yasuri' do
|
|
81
164
|
end
|
82
165
|
|
83
166
|
it "return PaginateNode/TextNode with limit" do
|
84
|
-
src = %q|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
}|
|
167
|
+
src = %q|
|
168
|
+
{
|
169
|
+
"pages_root": {
|
170
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
171
|
+
"limit": 2,
|
172
|
+
"text_content": "/html/body/p"
|
173
|
+
}
|
174
|
+
}|
|
93
175
|
generated = Yasuri.json2tree(src)
|
94
176
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
95
177
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -101,24 +183,17 @@ describe 'Yasuri' do
|
|
101
183
|
end
|
102
184
|
|
103
185
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
},
|
116
|
-
{ "node" : "text",
|
117
|
-
"name" : "pub_date",
|
118
|
-
"path" : "./td[2]"
|
119
|
-
}]
|
120
|
-
}]
|
121
|
-
}|
|
186
|
+
src = %q|
|
187
|
+
{
|
188
|
+
"struct_tables": {
|
189
|
+
"path": "/html/body/table",
|
190
|
+
"struct_table": {
|
191
|
+
"path": "./tr",
|
192
|
+
"text_title": "./td[1]",
|
193
|
+
"text_pub_date": "./td[2]"
|
194
|
+
}
|
195
|
+
}
|
196
|
+
}|
|
122
197
|
generated = Yasuri.json2tree(src)
|
123
198
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
124
199
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -126,27 +201,27 @@ describe 'Yasuri' do
|
|
126
201
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
127
202
|
])
|
128
203
|
])
|
129
|
-
page = @agent.get(@uri + "/structual_text.html")
|
204
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
130
205
|
compare_generated_vs_original(generated, original, page)
|
131
206
|
end
|
132
207
|
end
|
133
208
|
|
209
|
+
|
134
210
|
#############
|
135
211
|
# tree2json #
|
136
212
|
#############
|
137
213
|
describe '.tree2json' do
|
138
214
|
it "return empty json" do
|
139
|
-
|
140
|
-
expect(json).to match "{}"
|
215
|
+
expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
|
141
216
|
end
|
142
217
|
|
143
218
|
it "return text node" do
|
144
219
|
node = Yasuri::TextNode.new("/html/head/title", "title")
|
145
220
|
json = Yasuri.tree2json(node)
|
146
|
-
expected_str = %q|
|
147
|
-
|
148
|
-
|
149
|
-
|
221
|
+
expected_str = %q|
|
222
|
+
{
|
223
|
+
"text_title": "/html/head/title"
|
224
|
+
}|
|
150
225
|
expected = JSON.parse(expected_str)
|
151
226
|
actual = JSON.parse(json)
|
152
227
|
expect(actual).to match expected
|
@@ -155,29 +230,49 @@ describe 'Yasuri' do
|
|
155
230
|
it "return text node with truncate_regexp" do
|
156
231
|
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
157
232
|
json = Yasuri.tree2json(node)
|
158
|
-
expected_str = %q|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
233
|
+
expected_str = %q|
|
234
|
+
{
|
235
|
+
"text_title": {
|
236
|
+
"path": "/html/head/title",
|
237
|
+
"truncate": "^[^,]+"
|
238
|
+
}
|
239
|
+
}|
|
163
240
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
164
241
|
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
165
242
|
expect(actual).to match expected
|
166
243
|
end
|
167
244
|
|
245
|
+
it "return map node with text nodes" do
|
246
|
+
tree = Yasuri::MapNode.new('parent', [
|
247
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
248
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
249
|
+
])
|
250
|
+
actual_json = Yasuri.tree2json(tree)
|
251
|
+
|
252
|
+
expected_json = %q|
|
253
|
+
{
|
254
|
+
"text_content01": "/html/body/p[1]",
|
255
|
+
"text_content02": "/html/body/p[2]"
|
256
|
+
}|
|
257
|
+
|
258
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
259
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
260
|
+
expect(actual).to match expected
|
261
|
+
end
|
262
|
+
|
168
263
|
it "return LinksNode/TextNode" do
|
169
264
|
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
170
265
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
171
266
|
])
|
172
267
|
json = Yasuri.tree2json(tree)
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
268
|
+
|
269
|
+
expected_src = %q|
|
270
|
+
{
|
271
|
+
"links_root": {
|
272
|
+
"path": "/html/body/a",
|
273
|
+
"text_content":"/html/body/p"
|
274
|
+
}
|
275
|
+
}|
|
181
276
|
expected = JSON.parse(expected_src)
|
182
277
|
actual = JSON.parse(json)
|
183
278
|
expect(actual).to match expected
|
@@ -189,24 +284,44 @@ describe 'Yasuri' do
|
|
189
284
|
], limit:10)
|
190
285
|
|
191
286
|
json = Yasuri.tree2json(tree)
|
192
|
-
expected_src = %q|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
287
|
+
expected_src = %q|
|
288
|
+
{
|
289
|
+
"pages_root": {
|
290
|
+
"path": "/html/body/nav/span/a[@class='next']",
|
291
|
+
"limit": 10,
|
292
|
+
"flatten": false,
|
293
|
+
"text_content": "/html/body/p"
|
294
|
+
}
|
295
|
+
}|
|
201
296
|
expected = JSON.parse(expected_src)
|
202
297
|
actual = JSON.parse(json)
|
203
298
|
expect(actual).to match expected
|
204
299
|
end
|
205
|
-
|
206
|
-
|
207
|
-
|
208
300
|
end
|
209
301
|
|
302
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
303
|
+
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
304
|
+
Yasuri::StructNode.new('./tr', "table", [
|
305
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
306
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
307
|
+
])
|
308
|
+
])
|
309
|
+
json = Yasuri.tree2json(tree)
|
310
|
+
expected_src = %q|
|
311
|
+
{
|
312
|
+
"struct_tables": {
|
313
|
+
"path": "/html/body/table",
|
314
|
+
"struct_table": {
|
315
|
+
"path": "./tr",
|
316
|
+
"text_title": "./td[1]",
|
317
|
+
"text_pub_date": "./td[2]"
|
318
|
+
}
|
319
|
+
}
|
320
|
+
}|
|
321
|
+
expected = JSON.parse(expected_src)
|
322
|
+
actual = JSON.parse(json)
|
323
|
+
expect(actual).to match expected
|
324
|
+
end
|
210
325
|
|
211
326
|
it 'has a version number' do
|
212
327
|
expect(Yasuri::VERSION).not_to be nil
|