yasuri 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +54 -24
- data/USAGE.ja.md +216 -72
- data/USAGE.md +225 -78
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +71 -36
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +3 -3
- data/lib/yasuri/yasuri_map_node.rb +12 -27
- data/lib/yasuri/yasuri_node.rb +15 -37
- data/lib/yasuri/yasuri_paginate_node.rb +5 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +5 -5
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_spec.rb +125 -140
- data/yasuri.gemspec +3 -1
- metadata +31 -4
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
require 'mechanize'
|
5
|
+
|
6
|
+
module Yasuri
|
7
|
+
class CLI < Thor
|
8
|
+
package_name "yasuri"
|
9
|
+
|
10
|
+
default_command :scrape
|
11
|
+
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
|
12
|
+
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
|
13
|
+
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
|
14
|
+
def scrape(uri)
|
15
|
+
# argument validations
|
16
|
+
if [options[:file], options[:json]].compact.count != 1
|
17
|
+
$stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
|
18
|
+
return -1
|
19
|
+
end
|
20
|
+
if options[:file]&.empty? or options[:file] == "file" or options[:json]&.empty?
|
21
|
+
$stderr.puts "ERROR: --file option require not empty argument."
|
22
|
+
return -1
|
23
|
+
end
|
24
|
+
if options[:json]&.empty? or options[:json] == "json"
|
25
|
+
$stderr.puts "ERROR: --json option require not empty argument."
|
26
|
+
return -1
|
27
|
+
end
|
28
|
+
|
29
|
+
tree = if options[:file]
|
30
|
+
src = File.read(options[:file])
|
31
|
+
|
32
|
+
begin
|
33
|
+
Yasuri.json2tree(src)
|
34
|
+
rescue
|
35
|
+
begin
|
36
|
+
Yasuri.yaml2tree(src)
|
37
|
+
rescue => e
|
38
|
+
$stderr.puts "ERROR: Failed to convert to yasuri tree `#{options[:file]}`. #{e.message}"
|
39
|
+
return -1
|
40
|
+
end
|
41
|
+
end
|
42
|
+
else
|
43
|
+
begin
|
44
|
+
Yasuri.json2tree(options[:json])
|
45
|
+
rescue => e
|
46
|
+
$stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
|
47
|
+
return -1
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
agent = Mechanize.new
|
52
|
+
root_page = agent.get(uri)
|
53
|
+
result = tree.inject(agent, root_page)
|
54
|
+
|
55
|
+
if result.instance_of?(String)
|
56
|
+
puts result
|
57
|
+
else
|
58
|
+
j result
|
59
|
+
end
|
60
|
+
|
61
|
+
return 0
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -7,7 +7,7 @@ module Yasuri
|
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
9
|
def inject(agent, page, opt = {}, element = page)
|
10
|
-
retry_count = opt[:retry_count] ||
|
10
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
11
11
|
|
12
12
|
links = element.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
@@ -15,7 +15,7 @@ module Yasuri
|
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
16
16
|
|
17
17
|
child_results_kv = @children.map do |child_node|
|
18
|
-
child_name = Yasuri.
|
18
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
19
19
|
[child_name, child_node.inject(agent, child_page, opt)]
|
20
20
|
end
|
21
21
|
|
@@ -24,7 +24,7 @@ module Yasuri
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def node_type_str
|
27
|
-
"links"
|
27
|
+
"links".freeze
|
28
28
|
end
|
29
29
|
end # class
|
30
30
|
end # module
|
@@ -3,7 +3,7 @@ module Yasuri
|
|
3
3
|
class MapNode
|
4
4
|
attr_reader :name, :children
|
5
5
|
|
6
|
-
def initialize(name, children, opt
|
6
|
+
def initialize(name, children, **opt)
|
7
7
|
@name = name
|
8
8
|
@children = children
|
9
9
|
@opt = opt
|
@@ -16,39 +16,24 @@ module Yasuri
|
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
|
19
|
-
def opts
|
20
|
-
{}
|
21
|
-
end
|
22
|
-
|
23
19
|
def to_h
|
24
|
-
|
25
|
-
|
26
|
-
h["name"] = self.name
|
27
|
-
h["children"] = self.children.map{|c| c.to_h} if not children.empty?
|
20
|
+
node_hash = {}
|
21
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
28
22
|
|
29
|
-
|
30
|
-
|
23
|
+
children.each do |child|
|
24
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
25
|
+
node_hash[child_node_name] = child.to_h
|
31
26
|
end
|
32
27
|
|
33
|
-
|
28
|
+
node_hash
|
34
29
|
end
|
35
30
|
|
36
|
-
def
|
37
|
-
|
38
|
-
|
39
|
-
node, name, children = reservedKeys.map do |key|
|
40
|
-
node_h[key]
|
41
|
-
end
|
42
|
-
|
43
|
-
fail "Not found 'name' value in map" if name.nil?
|
44
|
-
fail "Not found 'children' value in map" if children.nil?
|
45
|
-
children ||= []
|
46
|
-
|
47
|
-
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
48
|
-
reservedKeys.each{|key| node_h.delete(key)}
|
49
|
-
opt = node_h
|
31
|
+
def opts
|
32
|
+
{}
|
33
|
+
end
|
50
34
|
|
51
|
-
|
35
|
+
def node_type_str
|
36
|
+
"map".freeze
|
52
37
|
end
|
53
38
|
end
|
54
39
|
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,7 +7,7 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
@@ -15,50 +15,28 @@ module Yasuri
|
|
15
15
|
fail "#{Kernel.__method__} is not implemented in included class."
|
16
16
|
end
|
17
17
|
|
18
|
-
def opts
|
19
|
-
{}
|
20
|
-
end
|
21
|
-
|
22
18
|
def to_h
|
23
|
-
|
24
|
-
h["node"] = self.node_type_str
|
25
|
-
h["name"] = self.name
|
26
|
-
h["path"] = self.xpath
|
27
|
-
h["children"] = self.children.map{|c| c.to_h} if not children.empty?
|
28
|
-
|
29
|
-
self.opts.each do |key,value|
|
30
|
-
h[key] = value if not value.nil?
|
31
|
-
end
|
32
|
-
|
33
|
-
h
|
34
|
-
end
|
19
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
35
20
|
|
36
|
-
|
37
|
-
|
38
|
-
reservedKeys = %i|node name path children|
|
21
|
+
node_hash = {}
|
22
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
39
23
|
|
40
|
-
|
41
|
-
node_h[key]
|
42
|
-
end
|
24
|
+
node_hash[:path] = @xpath if @xpath
|
43
25
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
49
|
-
reservedKeys.each{|key| node_h.delete(key)}
|
50
|
-
opt = node_h
|
51
|
-
|
52
|
-
self.new(path, name, childnodes, **opt)
|
26
|
+
children.each do |child|
|
27
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
28
|
+
node_hash[child_node_name] = child.to_h
|
53
29
|
end
|
54
30
|
|
55
|
-
|
56
|
-
|
57
|
-
|
31
|
+
node_hash
|
32
|
+
end
|
33
|
+
|
34
|
+
def opts
|
35
|
+
{}
|
58
36
|
end
|
59
37
|
|
60
|
-
def
|
61
|
-
|
38
|
+
def node_type_str
|
39
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
62
40
|
end
|
63
41
|
end
|
64
42
|
end
|
@@ -14,7 +14,7 @@ module Yasuri
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def inject(agent, page, opt = {}, element = page)
|
17
|
-
retry_count = opt[:retry_count] ||
|
17
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
18
|
|
19
19
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
20
|
|
@@ -22,12 +22,12 @@ module Yasuri
|
|
22
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
23
23
|
while page
|
24
24
|
child_results_kv = @children.map do |child_node|
|
25
|
-
child_name = Yasuri.
|
25
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
26
26
|
[child_name, child_node.inject(agent, page, opt)]
|
27
27
|
end
|
28
28
|
child_results << Hash[child_results_kv]
|
29
29
|
|
30
|
-
link = page.search(@xpath).first
|
30
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
31
31
|
break if link == nil
|
32
32
|
|
33
33
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
@@ -41,12 +41,13 @@ module Yasuri
|
|
41
41
|
|
42
42
|
child_results
|
43
43
|
end
|
44
|
+
|
44
45
|
def opts
|
45
46
|
{limit:@limit, flatten:@flatten}
|
46
47
|
end
|
47
48
|
|
48
49
|
def node_type_str
|
49
|
-
"pages"
|
50
|
+
"pages".freeze
|
50
51
|
end
|
51
52
|
end
|
52
53
|
end
|
@@ -10,12 +10,16 @@ module Yasuri
|
|
10
10
|
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
13
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
14
|
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
tree.size == 1 ? tree.first : tree
|
19
19
|
end # inject
|
20
|
+
|
21
|
+
def node_type_str
|
22
|
+
"struct".freeze
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
@@ -18,7 +18,6 @@ module Yasuri
|
|
18
18
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
19
19
|
|
20
20
|
@proc = proc.nil? ? nil : proc.to_sym
|
21
|
-
|
22
21
|
end
|
23
22
|
|
24
23
|
def inject(agent, page, opt = {}, element = page)
|
@@ -31,15 +30,16 @@ module Yasuri
|
|
31
30
|
end
|
32
31
|
|
33
32
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
34
|
-
text
|
35
|
-
end
|
36
33
|
|
37
|
-
|
38
|
-
"text"
|
34
|
+
text
|
39
35
|
end
|
40
36
|
|
41
37
|
def opts
|
42
38
|
{truncate:@truncate, proc:@proc}
|
43
39
|
end
|
40
|
+
|
41
|
+
def node_type_str
|
42
|
+
"text".freeze
|
43
|
+
end
|
44
44
|
end
|
45
45
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -16,7 +16,7 @@ require 'simplecov'
|
|
16
16
|
require 'coveralls'
|
17
17
|
Coveralls.wear!
|
18
18
|
|
19
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
19
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
20
20
|
SimpleCov::Formatter::HTMLFormatter,
|
21
21
|
Coveralls::SimpleCov::Formatter
|
22
22
|
]
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
|
4
|
+
include_context 'httpserver'
|
5
|
+
|
6
|
+
before do
|
7
|
+
@agent = Mechanize.new
|
8
|
+
@index_page = @agent.get(uri)
|
9
|
+
|
10
|
+
@res_dir = File.expand_path('../cli_resources', __FILE__)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe 'cli scrape' do
|
14
|
+
it "require --file or --json option" do
|
15
|
+
expect {
|
16
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {})
|
17
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
18
|
+
end
|
19
|
+
|
20
|
+
it "only one of --file or --json option" do
|
21
|
+
expect {
|
22
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "path.json", json: '{"text_title": "/html/head/title"}'})
|
23
|
+
}.to output("ERROR: Only one of `--file` or `--json` option should be specified.\n").to_stderr
|
24
|
+
end
|
25
|
+
|
26
|
+
it "require --file option is not empty string" do
|
27
|
+
expect {
|
28
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "file"})
|
29
|
+
}.to output("ERROR: --file option require not empty argument.\n").to_stderr
|
30
|
+
end
|
31
|
+
|
32
|
+
it "require --json option is not empty string" do
|
33
|
+
expect {
|
34
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: "json"})
|
35
|
+
}.to output("ERROR: --json option require not empty argument.\n").to_stderr
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
it "display text node as simple string" do
|
40
|
+
expect {
|
41
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_title": "/html/head/title"}'})
|
42
|
+
}.to output("Yasuri Test\n").to_stdout
|
43
|
+
end
|
44
|
+
|
45
|
+
it "display texts in single json" do
|
46
|
+
expect {
|
47
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'})
|
48
|
+
}.to output('{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n").to_stdout
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
it "display text node as simple string via json file" do
|
53
|
+
expect {
|
54
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.json"})
|
55
|
+
}.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
|
56
|
+
end
|
57
|
+
it "display text node as simple string via yaml file" do
|
58
|
+
expect {
|
59
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.yml"})
|
60
|
+
}.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
|
61
|
+
end
|
62
|
+
|
63
|
+
|
64
|
+
it "display ERROR when json string is wrong" do
|
65
|
+
wrong_json = '{,,}'
|
66
|
+
expect {
|
67
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {json: wrong_json})
|
68
|
+
}.to output("ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n").to_stderr
|
69
|
+
end
|
70
|
+
it "display ERROR when json file contains is wrong" do
|
71
|
+
file_path = "#{@res_dir}/tree_wrong.json"
|
72
|
+
expect {
|
73
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
74
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n").to_stderr
|
75
|
+
end
|
76
|
+
it "display ERROR when yaml file contains is wrong" do
|
77
|
+
file_path = "#{@res_dir}/tree_wrong.yml"
|
78
|
+
expect {
|
79
|
+
Yasuri::CLI.new.invoke(:scrape, [uri], {file: file_path})
|
80
|
+
}.to output("ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n").to_stderr
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -13,6 +13,7 @@ describe 'Yasuri' do
|
|
13
13
|
@index_page = @agent.get(@uri)
|
14
14
|
end
|
15
15
|
|
16
|
+
|
16
17
|
############
|
17
18
|
# yam2tree #
|
18
19
|
############
|
@@ -23,10 +24,8 @@ describe 'Yasuri' do
|
|
23
24
|
|
24
25
|
it "return text node" do
|
25
26
|
src = <<-EOB
|
26
|
-
|
27
|
-
|
28
|
-
path: "/html/body/p[1]"
|
29
|
-
EOB
|
27
|
+
text_content: "/html/body/p[1]"
|
28
|
+
EOB
|
30
29
|
generated = Yasuri.yaml2tree(src)
|
31
30
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
32
31
|
|
@@ -35,10 +34,9 @@ EOB
|
|
35
34
|
|
36
35
|
it "return text node as symbol" do
|
37
36
|
src = <<-EOB
|
38
|
-
:
|
39
|
-
|
40
|
-
|
41
|
-
EOB
|
37
|
+
:text_content:
|
38
|
+
:path: "/html/body/p[1]"
|
39
|
+
EOB
|
42
40
|
generated = Yasuri.yaml2tree(src)
|
43
41
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
44
42
|
|
@@ -48,14 +46,10 @@ EOB
|
|
48
46
|
it "return LinksNode/TextNode" do
|
49
47
|
|
50
48
|
src = <<-EOB
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
- content:
|
56
|
-
node: text
|
57
|
-
path: "/html/body/p"
|
58
|
-
EOB
|
49
|
+
links_root:
|
50
|
+
path: "/html/body/a"
|
51
|
+
text_content: "/html/body/p"
|
52
|
+
EOB
|
59
53
|
generated = Yasuri.yaml2tree(src)
|
60
54
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
61
55
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -66,21 +60,13 @@ EOB
|
|
66
60
|
|
67
61
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
68
62
|
src = <<-EOB
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
children:
|
77
|
-
- title:
|
78
|
-
node: text
|
79
|
-
path: "./td[1]"
|
80
|
-
- pub_date:
|
81
|
-
node: text
|
82
|
-
path: "./td[2]"
|
83
|
-
EOB
|
63
|
+
struct_tables:
|
64
|
+
path: "/html/body/table"
|
65
|
+
struct_table:
|
66
|
+
path: "./tr"
|
67
|
+
text_title: "./td[1]"
|
68
|
+
text_pub_date: "./td[2]"
|
69
|
+
EOB
|
84
70
|
|
85
71
|
generated = Yasuri.yaml2tree(src)
|
86
72
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
@@ -105,10 +91,10 @@ EOB
|
|
105
91
|
end
|
106
92
|
|
107
93
|
it "return TextNode" do
|
108
|
-
src = %q|
|
109
|
-
|
110
|
-
|
111
|
-
|
94
|
+
src = %q|
|
95
|
+
{
|
96
|
+
"text_content": "/html/body/p[1]"
|
97
|
+
}|
|
112
98
|
generated = Yasuri.json2tree(src)
|
113
99
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
114
100
|
|
@@ -116,30 +102,24 @@ EOB
|
|
116
102
|
end
|
117
103
|
|
118
104
|
it "return TextNode with truncate_regexp" do
|
119
|
-
src = %q|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
105
|
+
src = %q|
|
106
|
+
{
|
107
|
+
"text_content": {
|
108
|
+
"path": "/html/body/p[1]",
|
109
|
+
"truncate" : "^[^,]+"
|
110
|
+
}
|
111
|
+
}|
|
124
112
|
generated = Yasuri.json2tree(src)
|
125
113
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
126
114
|
compare_generated_vs_original(generated, original, @index_page)
|
127
115
|
end
|
128
116
|
|
129
117
|
it "return MapNode with TextNodes" do
|
130
|
-
src = %q|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
"path" : "/html/body/p[1]"
|
136
|
-
},
|
137
|
-
{ "node" : "text",
|
138
|
-
"name" : "content02",
|
139
|
-
"path" : "/html/body/p[2]"
|
140
|
-
}
|
141
|
-
]
|
142
|
-
}|
|
118
|
+
src = %q|
|
119
|
+
{
|
120
|
+
"text_content01": "/html/body/p[1]",
|
121
|
+
"text_content02": "/html/body/p[2]"
|
122
|
+
}|
|
143
123
|
generated = Yasuri.json2tree(src)
|
144
124
|
original = Yasuri::MapNode.new('parent', [
|
145
125
|
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
@@ -149,14 +129,14 @@ EOB
|
|
149
129
|
end
|
150
130
|
|
151
131
|
it "return LinksNode/TextNode" do
|
152
|
-
src = %q|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
132
|
+
src = %q|
|
133
|
+
{
|
134
|
+
"links_root": {
|
135
|
+
"path": "/html/body/a",
|
136
|
+
"text_content": "/html/body/p"
|
137
|
+
}
|
138
|
+
}|
|
139
|
+
|
160
140
|
generated = Yasuri.json2tree(src)
|
161
141
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
162
142
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -166,14 +146,13 @@ EOB
|
|
166
146
|
end
|
167
147
|
|
168
148
|
it "return PaginateNode/TextNode" do
|
169
|
-
src = %q|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
}|
|
149
|
+
src = %q|
|
150
|
+
{
|
151
|
+
"pages_root": {
|
152
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
153
|
+
"text_content": "/html/body/p"
|
154
|
+
}
|
155
|
+
}|
|
177
156
|
generated = Yasuri.json2tree(src)
|
178
157
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
179
158
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -185,15 +164,14 @@ EOB
|
|
185
164
|
end
|
186
165
|
|
187
166
|
it "return PaginateNode/TextNode with limit" do
|
188
|
-
src = %q|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
}|
|
167
|
+
src = %q|
|
168
|
+
{
|
169
|
+
"pages_root": {
|
170
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
171
|
+
"limit": 2,
|
172
|
+
"text_content": "/html/body/p"
|
173
|
+
}
|
174
|
+
}|
|
197
175
|
generated = Yasuri.json2tree(src)
|
198
176
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
199
177
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -205,24 +183,17 @@ EOB
|
|
205
183
|
end
|
206
184
|
|
207
185
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
},
|
220
|
-
{ "node" : "text",
|
221
|
-
"name" : "pub_date",
|
222
|
-
"path" : "./td[2]"
|
223
|
-
}]
|
224
|
-
}]
|
225
|
-
}|
|
186
|
+
src = %q|
|
187
|
+
{
|
188
|
+
"struct_tables": {
|
189
|
+
"path": "/html/body/table",
|
190
|
+
"struct_table": {
|
191
|
+
"path": "./tr",
|
192
|
+
"text_title": "./td[1]",
|
193
|
+
"text_pub_date": "./td[2]"
|
194
|
+
}
|
195
|
+
}
|
196
|
+
}|
|
226
197
|
generated = Yasuri.json2tree(src)
|
227
198
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
228
199
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -235,22 +206,22 @@ EOB
|
|
235
206
|
end
|
236
207
|
end
|
237
208
|
|
209
|
+
|
238
210
|
#############
|
239
211
|
# tree2json #
|
240
212
|
#############
|
241
213
|
describe '.tree2json' do
|
242
214
|
it "return empty json" do
|
243
|
-
|
244
|
-
expect(json).to match "{}"
|
215
|
+
expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
|
245
216
|
end
|
246
217
|
|
247
218
|
it "return text node" do
|
248
219
|
node = Yasuri::TextNode.new("/html/head/title", "title")
|
249
220
|
json = Yasuri.tree2json(node)
|
250
|
-
expected_str = %q|
|
251
|
-
|
252
|
-
|
253
|
-
|
221
|
+
expected_str = %q|
|
222
|
+
{
|
223
|
+
"text_title": "/html/head/title"
|
224
|
+
}|
|
254
225
|
expected = JSON.parse(expected_str)
|
255
226
|
actual = JSON.parse(json)
|
256
227
|
expect(actual).to match expected
|
@@ -259,11 +230,13 @@ EOB
|
|
259
230
|
it "return text node with truncate_regexp" do
|
260
231
|
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
261
232
|
json = Yasuri.tree2json(node)
|
262
|
-
expected_str = %q|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
233
|
+
expected_str = %q|
|
234
|
+
{
|
235
|
+
"text_title": {
|
236
|
+
"path": "/html/head/title",
|
237
|
+
"truncate": "^[^,]+"
|
238
|
+
}
|
239
|
+
}|
|
267
240
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
268
241
|
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
269
242
|
expect(actual).to match expected
|
@@ -276,19 +249,12 @@ EOB
|
|
276
249
|
])
|
277
250
|
actual_json = Yasuri.tree2json(tree)
|
278
251
|
|
279
|
-
expected_json = %q|
|
280
|
-
|
281
|
-
"
|
282
|
-
|
283
|
-
"name" : "content01",
|
284
|
-
"path" : "/html/body/p[1]"
|
285
|
-
},
|
286
|
-
{ "node" : "text",
|
287
|
-
"name" : "content02",
|
288
|
-
"path" : "/html/body/p[2]"
|
289
|
-
}
|
290
|
-
]
|
252
|
+
expected_json = %q|
|
253
|
+
{
|
254
|
+
"text_content01": "/html/body/p[1]",
|
255
|
+
"text_content02": "/html/body/p[2]"
|
291
256
|
}|
|
257
|
+
|
292
258
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
293
259
|
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
294
260
|
expect(actual).to match expected
|
@@ -299,14 +265,14 @@ EOB
|
|
299
265
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
300
266
|
])
|
301
267
|
json = Yasuri.tree2json(tree)
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
268
|
+
|
269
|
+
expected_src = %q|
|
270
|
+
{
|
271
|
+
"links_root": {
|
272
|
+
"path": "/html/body/a",
|
273
|
+
"text_content":"/html/body/p"
|
274
|
+
}
|
275
|
+
}|
|
310
276
|
expected = JSON.parse(expected_src)
|
311
277
|
actual = JSON.parse(json)
|
312
278
|
expect(actual).to match expected
|
@@ -318,25 +284,44 @@ EOB
|
|
318
284
|
], limit:10)
|
319
285
|
|
320
286
|
json = Yasuri.tree2json(tree)
|
321
|
-
expected_src = %q|
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
}|
|
287
|
+
expected_src = %q|
|
288
|
+
{
|
289
|
+
"pages_root": {
|
290
|
+
"path": "/html/body/nav/span/a[@class='next']",
|
291
|
+
"limit": 10,
|
292
|
+
"flatten": false,
|
293
|
+
"text_content": "/html/body/p"
|
294
|
+
}
|
295
|
+
}|
|
331
296
|
expected = JSON.parse(expected_src)
|
332
297
|
actual = JSON.parse(json)
|
333
298
|
expect(actual).to match expected
|
334
299
|
end
|
335
|
-
|
336
|
-
|
337
|
-
|
338
300
|
end
|
339
301
|
|
302
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
303
|
+
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
304
|
+
Yasuri::StructNode.new('./tr', "table", [
|
305
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
306
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
307
|
+
])
|
308
|
+
])
|
309
|
+
json = Yasuri.tree2json(tree)
|
310
|
+
expected_src = %q|
|
311
|
+
{
|
312
|
+
"struct_tables": {
|
313
|
+
"path": "/html/body/table",
|
314
|
+
"struct_table": {
|
315
|
+
"path": "./tr",
|
316
|
+
"text_title": "./td[1]",
|
317
|
+
"text_pub_date": "./td[2]"
|
318
|
+
}
|
319
|
+
}
|
320
|
+
}|
|
321
|
+
expected = JSON.parse(expected_src)
|
322
|
+
actual = JSON.parse(json)
|
323
|
+
expect(actual).to match expected
|
324
|
+
end
|
340
325
|
|
341
326
|
it 'has a version number' do
|
342
327
|
expect(Yasuri::VERSION).not_to be nil
|