yasuri 3.1.0 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +54 -24
- data/USAGE.ja.md +216 -72
- data/USAGE.md +225 -78
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +71 -36
- data/lib/yasuri/yasuri_cli.rb +64 -0
- data/lib/yasuri/yasuri_links_node.rb +3 -3
- data/lib/yasuri/yasuri_map_node.rb +12 -27
- data/lib/yasuri/yasuri_node.rb +15 -37
- data/lib/yasuri/yasuri_paginate_node.rb +5 -4
- data/lib/yasuri/yasuri_struct_node.rb +5 -1
- data/lib/yasuri/yasuri_text_node.rb +5 -5
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/yasuri_cli_spec.rb +83 -0
- data/spec/yasuri_spec.rb +125 -140
- data/yasuri.gemspec +3 -1
- metadata +31 -4
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'json'
|
3
|
+
require 'yasuri'
|
4
|
+
require 'mechanize'
|
5
|
+
|
6
|
+
module Yasuri
  # Command line front end for Yasuri, implemented as a Thor command set.
  # `scrape` is the default command.
  class CLI < Thor
    package_name "yasuri"

    default_command :scrape
    desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
    option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
    option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
    # Fetches the page at +uri+, applies a Yasuri tree to it and prints the
    # result to stdout.
    #
    # The tree comes from exactly one of two options:
    # * +--file+: path to a file whose content is parsed as JSON first and,
    #   if that fails, as YAML.
    # * +--json+: a JSON string.
    #
    # A String result is printed as-is; any other result is printed as JSON.
    #
    # uri:: address of the page to scrape.
    #
    # Returns 0 on success. On any argument or parsing problem an
    # "ERROR: ..." line is written to stderr and -1 is returned.
    def scrape(uri)
      tree_file = options[:file]
      tree_json = options[:json]

      # Exactly one tree source has to be given: neither both nor none.
      if [tree_file, tree_json].compact.count != 1
        $stderr.puts "ERROR: Only one of `--file` or `--json` option should be specified."
        return -1
      end

      # A value equal to the option's own name is rejected like an empty one.
      # NOTE(review): presumably this is what Thor supplies when the flag is
      # given without a value -- confirm against Thor's option parsing.
      #
      # Each option is validated on its own so that an empty `--json` is
      # reported with the `--json` message rather than the `--file` one.
      if tree_file&.empty? || tree_file == "file"
        $stderr.puts "ERROR: --file option require not empty argument."
        return -1
      end
      if tree_json&.empty? || tree_json == "json"
        $stderr.puts "ERROR: --json option require not empty argument."
        return -1
      end

      tree = if tree_file
               # Report an unreadable/missing file the same way as every
               # other failure instead of letting the exception escape.
               begin
                 src = File.read(tree_file)
               rescue SystemCallError, IOError => e
                 $stderr.puts "ERROR: Failed to read `#{tree_file}`. #{e.message}"
                 return -1
               end

               # Try JSON first; anything that is not valid JSON gets a
               # second chance as YAML. Only the YAML error is reported.
               begin
                 Yasuri.json2tree(src)
               rescue
                 begin
                   Yasuri.yaml2tree(src)
                 rescue => e
                   $stderr.puts "ERROR: Failed to convert to yasuri tree `#{tree_file}`. #{e.message}"
                   return -1
                 end
               end
             else
               begin
                 Yasuri.json2tree(tree_json)
               rescue => e
                 $stderr.puts "ERROR: Failed to convert json to yasuri tree. #{e.message}"
                 return -1
               end
             end

      agent = Mechanize.new
      root_page = agent.get(uri)
      result = tree.inject(agent, root_page)

      if result.instance_of?(String)
        puts result
      else
        # Kernel#j (from the json library) prints the object as one JSON line.
        j result
      end

      0
    end
  end
end
|
@@ -7,7 +7,7 @@ module Yasuri
|
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
9
|
def inject(agent, page, opt = {}, element = page)
|
10
|
-
retry_count = opt[:retry_count] ||
|
10
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
11
11
|
|
12
12
|
links = element.search(@xpath) || [] # links expected
|
13
13
|
links.map do |link|
|
@@ -15,7 +15,7 @@ module Yasuri
|
|
15
15
|
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
16
16
|
|
17
17
|
child_results_kv = @children.map do |child_node|
|
18
|
-
child_name = Yasuri.
|
18
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
19
19
|
[child_name, child_node.inject(agent, child_page, opt)]
|
20
20
|
end
|
21
21
|
|
@@ -24,7 +24,7 @@ module Yasuri
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def node_type_str
|
27
|
-
"links"
|
27
|
+
"links".freeze
|
28
28
|
end
|
29
29
|
end # class
|
30
30
|
end # module
|
@@ -3,7 +3,7 @@ module Yasuri
|
|
3
3
|
class MapNode
|
4
4
|
attr_reader :name, :children
|
5
5
|
|
6
|
-
def initialize(name, children, opt
|
6
|
+
def initialize(name, children, **opt)
|
7
7
|
@name = name
|
8
8
|
@children = children
|
9
9
|
@opt = opt
|
@@ -16,39 +16,24 @@ module Yasuri
|
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
|
19
|
-
def opts
|
20
|
-
{}
|
21
|
-
end
|
22
|
-
|
23
19
|
def to_h
|
24
|
-
|
25
|
-
|
26
|
-
h["name"] = self.name
|
27
|
-
h["children"] = self.children.map{|c| c.to_h} if not children.empty?
|
20
|
+
node_hash = {}
|
21
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
28
22
|
|
29
|
-
|
30
|
-
|
23
|
+
children.each do |child|
|
24
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
25
|
+
node_hash[child_node_name] = child.to_h
|
31
26
|
end
|
32
27
|
|
33
|
-
|
28
|
+
node_hash
|
34
29
|
end
|
35
30
|
|
36
|
-
def
|
37
|
-
|
38
|
-
|
39
|
-
node, name, children = reservedKeys.map do |key|
|
40
|
-
node_h[key]
|
41
|
-
end
|
42
|
-
|
43
|
-
fail "Not found 'name' value in map" if name.nil?
|
44
|
-
fail "Not found 'children' value in map" if children.nil?
|
45
|
-
children ||= []
|
46
|
-
|
47
|
-
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
48
|
-
reservedKeys.each{|key| node_h.delete(key)}
|
49
|
-
opt = node_h
|
31
|
+
def opts
|
32
|
+
{}
|
33
|
+
end
|
50
34
|
|
51
|
-
|
35
|
+
def node_type_str
|
36
|
+
"map".freeze
|
52
37
|
end
|
53
38
|
end
|
54
39
|
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -7,7 +7,7 @@ module Yasuri
|
|
7
7
|
module Node
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
9
9
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
@@ -15,50 +15,28 @@ module Yasuri
|
|
15
15
|
fail "#{Kernel.__method__} is not implemented in included class."
|
16
16
|
end
|
17
17
|
|
18
|
-
def opts
|
19
|
-
{}
|
20
|
-
end
|
21
|
-
|
22
18
|
def to_h
|
23
|
-
|
24
|
-
h["node"] = self.node_type_str
|
25
|
-
h["name"] = self.name
|
26
|
-
h["path"] = self.xpath
|
27
|
-
h["children"] = self.children.map{|c| c.to_h} if not children.empty?
|
28
|
-
|
29
|
-
self.opts.each do |key,value|
|
30
|
-
h[key] = value if not value.nil?
|
31
|
-
end
|
32
|
-
|
33
|
-
h
|
34
|
-
end
|
19
|
+
return @xpath if @xpath and @children.empty? and self.opts.values.compact.empty?
|
35
20
|
|
36
|
-
|
37
|
-
|
38
|
-
reservedKeys = %i|node name path children|
|
21
|
+
node_hash = {}
|
22
|
+
self.opts.each{|k, v| node_hash[k] = v if not v.nil?}
|
39
23
|
|
40
|
-
|
41
|
-
node_h[key]
|
42
|
-
end
|
24
|
+
node_hash[:path] = @xpath if @xpath
|
43
25
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
49
|
-
reservedKeys.each{|key| node_h.delete(key)}
|
50
|
-
opt = node_h
|
51
|
-
|
52
|
-
self.new(path, name, childnodes, **opt)
|
26
|
+
children.each do |child|
|
27
|
+
child_node_name = "#{child.node_type_str}_#{child.name}"
|
28
|
+
node_hash[child_node_name] = child.to_h
|
53
29
|
end
|
54
30
|
|
55
|
-
|
56
|
-
|
57
|
-
|
31
|
+
node_hash
|
32
|
+
end
|
33
|
+
|
34
|
+
def opts
|
35
|
+
{}
|
58
36
|
end
|
59
37
|
|
60
|
-
def
|
61
|
-
|
38
|
+
def node_type_str
|
39
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
62
40
|
end
|
63
41
|
end
|
64
42
|
end
|
@@ -14,7 +14,7 @@ module Yasuri
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def inject(agent, page, opt = {}, element = page)
|
17
|
-
retry_count = opt[:retry_count] ||
|
17
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
18
|
|
19
19
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
20
|
|
@@ -22,12 +22,12 @@ module Yasuri
|
|
22
22
|
limit = @limit.nil? ? Float::MAX : @limit
|
23
23
|
while page
|
24
24
|
child_results_kv = @children.map do |child_node|
|
25
|
-
child_name = Yasuri.
|
25
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
26
26
|
[child_name, child_node.inject(agent, page, opt)]
|
27
27
|
end
|
28
28
|
child_results << Hash[child_results_kv]
|
29
29
|
|
30
|
-
link = page.search(@xpath).first
|
30
|
+
link = page.search(@xpath).first # Todo raise: link is not found
|
31
31
|
break if link == nil
|
32
32
|
|
33
33
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
@@ -41,12 +41,13 @@ module Yasuri
|
|
41
41
|
|
42
42
|
child_results
|
43
43
|
end
|
44
|
+
|
44
45
|
def opts
|
45
46
|
{limit:@limit, flatten:@flatten}
|
46
47
|
end
|
47
48
|
|
48
49
|
def node_type_str
|
49
|
-
"pages"
|
50
|
+
"pages".freeze
|
50
51
|
end
|
51
52
|
end
|
52
53
|
end
|
@@ -10,12 +10,16 @@ module Yasuri
|
|
10
10
|
sub_tags = element.search(@xpath)
|
11
11
|
tree = sub_tags.map do |sub_tag|
|
12
12
|
child_results_kv = @children.map do |child_node|
|
13
|
-
child_name = Yasuri.
|
13
|
+
child_name = Yasuri.node_name(child_node.name, opt)
|
14
14
|
[child_name, child_node.inject(agent, page, opt, sub_tag)]
|
15
15
|
end
|
16
16
|
Hash[child_results_kv]
|
17
17
|
end
|
18
18
|
tree.size == 1 ? tree.first : tree
|
19
19
|
end # inject
|
20
|
+
|
21
|
+
def node_type_str
|
22
|
+
"struct".freeze
|
23
|
+
end
|
20
24
|
end
|
21
25
|
end
|
@@ -18,7 +18,6 @@ module Yasuri
|
|
18
18
|
@truncate = Regexp.new(@truncate.to_s) if not @truncate.nil?
|
19
19
|
|
20
20
|
@proc = proc.nil? ? nil : proc.to_sym
|
21
|
-
|
22
21
|
end
|
23
22
|
|
24
23
|
def inject(agent, page, opt = {}, element = page)
|
@@ -31,15 +30,16 @@ module Yasuri
|
|
31
30
|
end
|
32
31
|
|
33
32
|
text = text.__send__(@proc) if @proc && text.respond_to?(@proc)
|
34
|
-
text
|
35
|
-
end
|
36
33
|
|
37
|
-
|
38
|
-
"text"
|
34
|
+
text
|
39
35
|
end
|
40
36
|
|
41
37
|
def opts
|
42
38
|
{truncate:@truncate, proc:@proc}
|
43
39
|
end
|
40
|
+
|
41
|
+
def node_type_str
|
42
|
+
"text".freeze
|
43
|
+
end
|
44
44
|
end
|
45
45
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -16,7 +16,7 @@ require 'simplecov'
|
|
16
16
|
require 'coveralls'
|
17
17
|
Coveralls.wear!
|
18
18
|
|
19
|
-
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
19
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new [
|
20
20
|
SimpleCov::Formatter::HTMLFormatter,
|
21
21
|
Coveralls::SimpleCov::Formatter
|
22
22
|
]
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
|
3
|
+
describe 'Yasuri' do
  include_context 'httpserver'

  before do
    @agent = Mechanize.new
    @index_page = @agent.get(uri)

    @res_dir = File.expand_path('../cli_resources', __FILE__)
  end

  # Invokes the CLI `scrape` command against the test server's uri with the
  # given option hash, exactly as each example needs it.
  def run_scrape(cli_options)
    Yasuri::CLI.new.invoke(:scrape, [uri], cli_options)
  end

  describe 'cli scrape' do
    # --- option validation -------------------------------------------------

    it "require --file or --json option" do
      expected = "ERROR: Only one of `--file` or `--json` option should be specified.\n"
      expect { run_scrape({}) }.to output(expected).to_stderr
    end

    it "only one of --file or --json option" do
      both_options = {file: "path.json", json: '{"text_title": "/html/head/title"}'}
      expected = "ERROR: Only one of `--file` or `--json` option should be specified.\n"
      expect { run_scrape(both_options) }.to output(expected).to_stderr
    end

    it "require --file option is not empty string" do
      expected = "ERROR: --file option require not empty argument.\n"
      expect { run_scrape({file: "file"}) }.to output(expected).to_stderr
    end

    it "require --json option is not empty string" do
      expected = "ERROR: --json option require not empty argument.\n"
      expect { run_scrape({json: "json"}) }.to output(expected).to_stderr
    end

    # --- successful scraping from a JSON string ----------------------------

    it "display text node as simple string" do
      tree_json = '{"text_title": "/html/head/title"}'
      expect { run_scrape({json: tree_json}) }.to output("Yasuri Test\n").to_stdout
    end

    it "display texts in single json" do
      tree_json = '{"text_c1":"/html/body/p[1]", "text_c2":"/html/body/p[2]"}'
      expected = '{"c1":"Hello,Yasuri","c2":"Last Modify - 2015/02/14"}'+"\n"
      expect { run_scrape({json: tree_json}) }.to output(expected).to_stdout
    end

    # --- successful scraping from tree files -------------------------------

    it "display text node as simple string via json file" do
      expected = '[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n"
      expect { run_scrape({file: "#{@res_dir}/tree.json"}) }.to output(expected).to_stdout
    end
    it "display text node as simple string via yaml file" do
      expected = '[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n"
      expect { run_scrape({file: "#{@res_dir}/tree.yml"}) }.to output(expected).to_stdout
    end

    # --- malformed trees ---------------------------------------------------

    it "display ERROR when json string is wrong" do
      wrong_json = '{,,}'
      expected = "ERROR: Failed to convert json to yasuri tree. 809: unexpected token at '#{wrong_json}'\n"
      expect { run_scrape({json: wrong_json}) }.to output(expected).to_stderr
    end
    it "display ERROR when json file contains is wrong" do
      file_path = "#{@res_dir}/tree_wrong.json"
      expected = "ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a flow node at line 2 column 3\n"
      expect { run_scrape({file: file_path}) }.to output(expected).to_stderr
    end
    it "display ERROR when yaml file contains is wrong" do
      file_path = "#{@res_dir}/tree_wrong.yml"
      expected = "ERROR: Failed to convert to yasuri tree `#{file_path}`. (<unknown>): did not find expected node content while parsing a block node at line 1 column 1\n"
      expect { run_scrape({file: file_path}) }.to output(expected).to_stderr
    end
  end
end
|
data/spec/yasuri_spec.rb
CHANGED
@@ -13,6 +13,7 @@ describe 'Yasuri' do
|
|
13
13
|
@index_page = @agent.get(@uri)
|
14
14
|
end
|
15
15
|
|
16
|
+
|
16
17
|
############
|
17
18
|
# yam2tree #
|
18
19
|
############
|
@@ -23,10 +24,8 @@ describe 'Yasuri' do
|
|
23
24
|
|
24
25
|
it "return text node" do
|
25
26
|
src = <<-EOB
|
26
|
-
|
27
|
-
|
28
|
-
path: "/html/body/p[1]"
|
29
|
-
EOB
|
27
|
+
text_content: "/html/body/p[1]"
|
28
|
+
EOB
|
30
29
|
generated = Yasuri.yaml2tree(src)
|
31
30
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
32
31
|
|
@@ -35,10 +34,9 @@ EOB
|
|
35
34
|
|
36
35
|
it "return text node as symbol" do
|
37
36
|
src = <<-EOB
|
38
|
-
:
|
39
|
-
|
40
|
-
|
41
|
-
EOB
|
37
|
+
:text_content:
|
38
|
+
:path: "/html/body/p[1]"
|
39
|
+
EOB
|
42
40
|
generated = Yasuri.yaml2tree(src)
|
43
41
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
44
42
|
|
@@ -48,14 +46,10 @@ EOB
|
|
48
46
|
it "return LinksNode/TextNode" do
|
49
47
|
|
50
48
|
src = <<-EOB
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
- content:
|
56
|
-
node: text
|
57
|
-
path: "/html/body/p"
|
58
|
-
EOB
|
49
|
+
links_root:
|
50
|
+
path: "/html/body/a"
|
51
|
+
text_content: "/html/body/p"
|
52
|
+
EOB
|
59
53
|
generated = Yasuri.yaml2tree(src)
|
60
54
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
61
55
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -66,21 +60,13 @@ EOB
|
|
66
60
|
|
67
61
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
68
62
|
src = <<-EOB
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
children:
|
77
|
-
- title:
|
78
|
-
node: text
|
79
|
-
path: "./td[1]"
|
80
|
-
- pub_date:
|
81
|
-
node: text
|
82
|
-
path: "./td[2]"
|
83
|
-
EOB
|
63
|
+
struct_tables:
|
64
|
+
path: "/html/body/table"
|
65
|
+
struct_table:
|
66
|
+
path: "./tr"
|
67
|
+
text_title: "./td[1]"
|
68
|
+
text_pub_date: "./td[2]"
|
69
|
+
EOB
|
84
70
|
|
85
71
|
generated = Yasuri.yaml2tree(src)
|
86
72
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
@@ -105,10 +91,10 @@ EOB
|
|
105
91
|
end
|
106
92
|
|
107
93
|
it "return TextNode" do
|
108
|
-
src = %q|
|
109
|
-
|
110
|
-
|
111
|
-
|
94
|
+
src = %q|
|
95
|
+
{
|
96
|
+
"text_content": "/html/body/p[1]"
|
97
|
+
}|
|
112
98
|
generated = Yasuri.json2tree(src)
|
113
99
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
114
100
|
|
@@ -116,30 +102,24 @@ EOB
|
|
116
102
|
end
|
117
103
|
|
118
104
|
it "return TextNode with truncate_regexp" do
|
119
|
-
src = %q|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
105
|
+
src = %q|
|
106
|
+
{
|
107
|
+
"text_content": {
|
108
|
+
"path": "/html/body/p[1]",
|
109
|
+
"truncate" : "^[^,]+"
|
110
|
+
}
|
111
|
+
}|
|
124
112
|
generated = Yasuri.json2tree(src)
|
125
113
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
126
114
|
compare_generated_vs_original(generated, original, @index_page)
|
127
115
|
end
|
128
116
|
|
129
117
|
it "return MapNode with TextNodes" do
|
130
|
-
src = %q|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
"path" : "/html/body/p[1]"
|
136
|
-
},
|
137
|
-
{ "node" : "text",
|
138
|
-
"name" : "content02",
|
139
|
-
"path" : "/html/body/p[2]"
|
140
|
-
}
|
141
|
-
]
|
142
|
-
}|
|
118
|
+
src = %q|
|
119
|
+
{
|
120
|
+
"text_content01": "/html/body/p[1]",
|
121
|
+
"text_content02": "/html/body/p[2]"
|
122
|
+
}|
|
143
123
|
generated = Yasuri.json2tree(src)
|
144
124
|
original = Yasuri::MapNode.new('parent', [
|
145
125
|
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
@@ -149,14 +129,14 @@ EOB
|
|
149
129
|
end
|
150
130
|
|
151
131
|
it "return LinksNode/TextNode" do
|
152
|
-
src = %q|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
132
|
+
src = %q|
|
133
|
+
{
|
134
|
+
"links_root": {
|
135
|
+
"path": "/html/body/a",
|
136
|
+
"text_content": "/html/body/p"
|
137
|
+
}
|
138
|
+
}|
|
139
|
+
|
160
140
|
generated = Yasuri.json2tree(src)
|
161
141
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
162
142
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -166,14 +146,13 @@ EOB
|
|
166
146
|
end
|
167
147
|
|
168
148
|
it "return PaginateNode/TextNode" do
|
169
|
-
src = %q|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
}|
|
149
|
+
src = %q|
|
150
|
+
{
|
151
|
+
"pages_root": {
|
152
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
153
|
+
"text_content": "/html/body/p"
|
154
|
+
}
|
155
|
+
}|
|
177
156
|
generated = Yasuri.json2tree(src)
|
178
157
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
179
158
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -185,15 +164,14 @@ EOB
|
|
185
164
|
end
|
186
165
|
|
187
166
|
it "return PaginateNode/TextNode with limit" do
|
188
|
-
src = %q|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
}|
|
167
|
+
src = %q|
|
168
|
+
{
|
169
|
+
"pages_root": {
|
170
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
171
|
+
"limit": 2,
|
172
|
+
"text_content": "/html/body/p"
|
173
|
+
}
|
174
|
+
}|
|
197
175
|
generated = Yasuri.json2tree(src)
|
198
176
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
199
177
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
@@ -205,24 +183,17 @@ EOB
|
|
205
183
|
end
|
206
184
|
|
207
185
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
},
|
220
|
-
{ "node" : "text",
|
221
|
-
"name" : "pub_date",
|
222
|
-
"path" : "./td[2]"
|
223
|
-
}]
|
224
|
-
}]
|
225
|
-
}|
|
186
|
+
src = %q|
|
187
|
+
{
|
188
|
+
"struct_tables": {
|
189
|
+
"path": "/html/body/table",
|
190
|
+
"struct_table": {
|
191
|
+
"path": "./tr",
|
192
|
+
"text_title": "./td[1]",
|
193
|
+
"text_pub_date": "./td[2]"
|
194
|
+
}
|
195
|
+
}
|
196
|
+
}|
|
226
197
|
generated = Yasuri.json2tree(src)
|
227
198
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
228
199
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -235,22 +206,22 @@ EOB
|
|
235
206
|
end
|
236
207
|
end
|
237
208
|
|
209
|
+
|
238
210
|
#############
|
239
211
|
# tree2json #
|
240
212
|
#############
|
241
213
|
describe '.tree2json' do
|
242
214
|
it "return empty json" do
|
243
|
-
|
244
|
-
expect(json).to match "{}"
|
215
|
+
expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
|
245
216
|
end
|
246
217
|
|
247
218
|
it "return text node" do
|
248
219
|
node = Yasuri::TextNode.new("/html/head/title", "title")
|
249
220
|
json = Yasuri.tree2json(node)
|
250
|
-
expected_str = %q|
|
251
|
-
|
252
|
-
|
253
|
-
|
221
|
+
expected_str = %q|
|
222
|
+
{
|
223
|
+
"text_title": "/html/head/title"
|
224
|
+
}|
|
254
225
|
expected = JSON.parse(expected_str)
|
255
226
|
actual = JSON.parse(json)
|
256
227
|
expect(actual).to match expected
|
@@ -259,11 +230,13 @@ EOB
|
|
259
230
|
it "return text node with truncate_regexp" do
|
260
231
|
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
261
232
|
json = Yasuri.tree2json(node)
|
262
|
-
expected_str = %q|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
233
|
+
expected_str = %q|
|
234
|
+
{
|
235
|
+
"text_title": {
|
236
|
+
"path": "/html/head/title",
|
237
|
+
"truncate": "^[^,]+"
|
238
|
+
}
|
239
|
+
}|
|
267
240
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
268
241
|
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
269
242
|
expect(actual).to match expected
|
@@ -276,19 +249,12 @@ EOB
|
|
276
249
|
])
|
277
250
|
actual_json = Yasuri.tree2json(tree)
|
278
251
|
|
279
|
-
expected_json = %q|
|
280
|
-
|
281
|
-
"
|
282
|
-
|
283
|
-
"name" : "content01",
|
284
|
-
"path" : "/html/body/p[1]"
|
285
|
-
},
|
286
|
-
{ "node" : "text",
|
287
|
-
"name" : "content02",
|
288
|
-
"path" : "/html/body/p[2]"
|
289
|
-
}
|
290
|
-
]
|
252
|
+
expected_json = %q|
|
253
|
+
{
|
254
|
+
"text_content01": "/html/body/p[1]",
|
255
|
+
"text_content02": "/html/body/p[2]"
|
291
256
|
}|
|
257
|
+
|
292
258
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
293
259
|
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
294
260
|
expect(actual).to match expected
|
@@ -299,14 +265,14 @@ EOB
|
|
299
265
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
300
266
|
])
|
301
267
|
json = Yasuri.tree2json(tree)
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
268
|
+
|
269
|
+
expected_src = %q|
|
270
|
+
{
|
271
|
+
"links_root": {
|
272
|
+
"path": "/html/body/a",
|
273
|
+
"text_content":"/html/body/p"
|
274
|
+
}
|
275
|
+
}|
|
310
276
|
expected = JSON.parse(expected_src)
|
311
277
|
actual = JSON.parse(json)
|
312
278
|
expect(actual).to match expected
|
@@ -318,25 +284,44 @@ EOB
|
|
318
284
|
], limit:10)
|
319
285
|
|
320
286
|
json = Yasuri.tree2json(tree)
|
321
|
-
expected_src = %q|
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
}|
|
287
|
+
expected_src = %q|
|
288
|
+
{
|
289
|
+
"pages_root": {
|
290
|
+
"path": "/html/body/nav/span/a[@class='next']",
|
291
|
+
"limit": 10,
|
292
|
+
"flatten": false,
|
293
|
+
"text_content": "/html/body/p"
|
294
|
+
}
|
295
|
+
}|
|
331
296
|
expected = JSON.parse(expected_src)
|
332
297
|
actual = JSON.parse(json)
|
333
298
|
expect(actual).to match expected
|
334
299
|
end
|
335
|
-
|
336
|
-
|
337
|
-
|
338
300
|
end
|
339
301
|
|
302
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
303
|
+
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
304
|
+
Yasuri::StructNode.new('./tr', "table", [
|
305
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
306
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
307
|
+
])
|
308
|
+
])
|
309
|
+
json = Yasuri.tree2json(tree)
|
310
|
+
expected_src = %q|
|
311
|
+
{
|
312
|
+
"struct_tables": {
|
313
|
+
"path": "/html/body/table",
|
314
|
+
"struct_table": {
|
315
|
+
"path": "./tr",
|
316
|
+
"text_title": "./td[1]",
|
317
|
+
"text_pub_date": "./td[2]"
|
318
|
+
}
|
319
|
+
}
|
320
|
+
}|
|
321
|
+
expected = JSON.parse(expected_src)
|
322
|
+
actual = JSON.parse(json)
|
323
|
+
expect(actual).to match expected
|
324
|
+
end
|
340
325
|
|
341
326
|
it 'has a version number' do
|
342
327
|
expect(Yasuri::VERSION).not_to be nil
|