yasuri 3.2.0 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -7
- data/USAGE.ja.md +107 -86
- data/USAGE.md +106 -87
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +6 -2
- data/lib/yasuri/yasuri_cli.rb +6 -6
- data/lib/yasuri/yasuri_links_node.rb +3 -1
- data/lib/yasuri/yasuri_map_node.rb +1 -0
- data/lib/yasuri/yasuri_node.rb +14 -0
- data/lib/yasuri/yasuri_paginate_node.rb +2 -1
- data/spec/spec_helper.rb +3 -3
- data/spec/yasuri_cli_spec.rb +17 -4
- data/spec/yasuri_links_node_spec.rb +24 -10
- data/spec/yasuri_map_spec.rb +4 -5
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +55 -19
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- metadata +6 -3
- data/app.rb +0 -52
data/examples/github.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
# yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
|
3
|
+
text_title: /html/head/title
|
4
|
+
links_repo:
|
5
|
+
path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
|
6
|
+
text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
|
7
|
+
text_desc:
|
8
|
+
path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
|
9
|
+
proc: :strip
|
10
|
+
text_stars:
|
11
|
+
path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
|
12
|
+
proc: :to_i
|
13
|
+
text_forks:
|
14
|
+
path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
|
15
|
+
proc: :to_i
|
data/examples/sample.yml
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# yasuri scrape "https://www.tac42.net/" -f sample.yml
|
2
|
+
links_each:
|
3
|
+
path: //*[@id="posts"]/article/header/h1/a
|
4
|
+
text_title: //*[@id="content"]/article/header/h1
|
5
|
+
text_description: /html/head/meta[12]/@content
|
6
|
+
text_date:
|
7
|
+
path: //*[@id="content"]/article/header/div/span
|
8
|
+
proc: :strip
|
9
|
+
text_length:
|
10
|
+
path: //*[@id="content"]
|
11
|
+
proc: :size
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
@@ -17,6 +17,7 @@ require_relative 'yasuri_node_generator'
|
|
17
17
|
module Yasuri
|
18
18
|
|
19
19
|
DefaultRetryCount = 5
|
20
|
+
DefaultInterval_ms = 0
|
20
21
|
|
21
22
|
def self.json2tree(json_string)
|
22
23
|
raise RuntimeError if json_string.nil? or json_string.empty?
|
@@ -112,12 +113,15 @@ module Yasuri
|
|
112
113
|
symbolize_names ? name.to_sym : name
|
113
114
|
end
|
114
115
|
|
115
|
-
def self.with_retry(
|
116
|
+
def self.with_retry(
|
117
|
+
retry_count = DefaultRetryCount,
|
118
|
+
interval_ms = DefaultInterval_ms)
|
119
|
+
|
116
120
|
begin
|
121
|
+
Kernel.sleep(interval_ms * 0.001)
|
117
122
|
return yield() if block_given?
|
118
123
|
rescue => e
|
119
124
|
if retry_count > 0
|
120
|
-
pp "retry #{retry_count}"
|
121
125
|
retry_count -= 1
|
122
126
|
retry
|
123
127
|
end
|
data/lib/yasuri/yasuri_cli.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'thor'
|
2
2
|
require 'json'
|
3
3
|
require 'yasuri'
|
4
|
-
require 'mechanize'
|
5
4
|
|
6
5
|
module Yasuri
|
7
6
|
class CLI < Thor
|
@@ -9,8 +8,9 @@ module Yasuri
|
|
9
8
|
|
10
9
|
default_command :scrape
|
11
10
|
desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
|
12
|
-
option :file,
|
13
|
-
option :json,
|
11
|
+
option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
|
12
|
+
option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
|
13
|
+
option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
|
14
14
|
def scrape(uri)
|
15
15
|
# argument validations
|
16
16
|
if [options[:file], options[:json]].compact.count != 1
|
@@ -26,6 +26,8 @@ module Yasuri
|
|
26
26
|
return -1
|
27
27
|
end
|
28
28
|
|
29
|
+
interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
|
30
|
+
|
29
31
|
tree = if options[:file]
|
30
32
|
src = File.read(options[:file])
|
31
33
|
|
@@ -48,9 +50,7 @@ module Yasuri
|
|
48
50
|
end
|
49
51
|
end
|
50
52
|
|
51
|
-
|
52
|
-
root_page = agent.get(uri)
|
53
|
-
result = tree.inject(agent, root_page)
|
53
|
+
result = tree.scrape(uri, interval_ms: interval_ms)
|
54
54
|
|
55
55
|
if result.instance_of?(String)
|
56
56
|
puts result
|
@@ -6,13 +6,15 @@ require_relative 'yasuri_node'
|
|
6
6
|
module Yasuri
|
7
7
|
class LinksNode
|
8
8
|
include Node
|
9
|
+
|
9
10
|
def inject(agent, page, opt = {}, element = page)
|
10
11
|
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
12
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
11
13
|
|
12
14
|
links = element.search(@xpath) || [] # links expected
|
13
15
|
links.map do |link|
|
14
16
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
15
|
-
child_page = Yasuri.with_retry(retry_count) { link_button.click }
|
17
|
+
child_page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
16
18
|
|
17
19
|
child_results_kv = @children.map do |child_node|
|
18
20
|
child_name = Yasuri.node_name(child_node.name, opt)
|
data/lib/yasuri/yasuri_node.rb
CHANGED
@@ -11,6 +11,20 @@ module Yasuri
|
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
12
12
|
end
|
13
13
|
|
14
|
+
def scrape(uri, opt = {})
|
15
|
+
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
16
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
17
|
+
|
18
|
+
agent = Mechanize.new
|
19
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
|
20
|
+
scrape_with_agent(uri, agent, opt)
|
21
|
+
end
|
22
|
+
|
23
|
+
def scrape_with_agent(uri, agent, opt = {})
|
24
|
+
page = agent.get(uri)
|
25
|
+
inject(agent, page, opt)
|
26
|
+
end
|
27
|
+
|
14
28
|
def inject(agent, page, opt = {}, element = page)
|
15
29
|
fail "#{Kernel.__method__} is not implemented in included class."
|
16
30
|
end
|
@@ -15,6 +15,7 @@ module Yasuri
|
|
15
15
|
|
16
16
|
def inject(agent, page, opt = {}, element = page)
|
17
17
|
retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
|
18
|
+
interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
|
18
19
|
|
19
20
|
raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
|
20
21
|
|
@@ -31,7 +32,7 @@ module Yasuri
|
|
31
32
|
break if link == nil
|
32
33
|
|
33
34
|
link_button = Mechanize::Page::Link.new(link, agent, page)
|
34
|
-
page = Yasuri.with_retry(retry_count) { link_button.click }
|
35
|
+
page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
|
35
36
|
break if (limit -= 1) <= 0
|
36
37
|
end
|
37
38
|
|
data/spec/spec_helper.rb
CHANGED
@@ -26,8 +26,8 @@ SimpleCov.start
|
|
26
26
|
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
27
27
|
require 'yasuri'
|
28
28
|
|
29
|
-
def compare_generated_vs_original(generated, original,
|
30
|
-
expected = original.
|
31
|
-
actual = generated.
|
29
|
+
def compare_generated_vs_original(generated, original, uri)
|
30
|
+
expected = original.scrape(uri)
|
31
|
+
actual = generated.scrape(uri)
|
32
32
|
expect(actual).to match expected
|
33
33
|
end
|
data/spec/yasuri_cli_spec.rb
CHANGED
@@ -51,15 +51,28 @@ describe 'Yasuri' do
|
|
51
51
|
|
52
52
|
it "display text node as simple string via json file" do
|
53
53
|
expect {
|
54
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.json"})
|
55
|
-
}.to output('[{"content":"
|
54
|
+
Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
|
55
|
+
}.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
|
56
56
|
end
|
57
57
|
it "display text node as simple string via yaml file" do
|
58
58
|
expect {
|
59
|
-
Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.yml"})
|
60
|
-
}.to output('[{"content":"
|
59
|
+
Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
|
60
|
+
}.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
|
61
61
|
end
|
62
62
|
|
63
|
+
it "interval option is effect for each request" do
|
64
|
+
allow(Kernel).to receive(:sleep)
|
65
|
+
|
66
|
+
Yasuri::CLI.new.invoke(
|
67
|
+
:scrape,
|
68
|
+
[uri+"/pagination/page01.html"],
|
69
|
+
{file: "#{@res_dir}/tree.yml", interval: 500}
|
70
|
+
)
|
71
|
+
|
72
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
73
|
+
expect(interval_sec).to match 0.5
|
74
|
+
end
|
75
|
+
end
|
63
76
|
|
64
77
|
it "display ERROR when json string is wrong" do
|
65
78
|
wrong_json = '{,,}'
|
@@ -11,9 +11,7 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::LinksNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
14
|
@uri = uri
|
16
|
-
@index_page = @agent.get(@uri)
|
17
15
|
end
|
18
16
|
|
19
17
|
it 'scrape links' do
|
@@ -21,7 +19,7 @@ describe 'Yasuri' do
|
|
21
19
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
20
|
])
|
23
21
|
|
24
|
-
actual = root_node.
|
22
|
+
actual = root_node.scrape(@uri)
|
25
23
|
expected = [
|
26
24
|
{"content" => "Child 01 page."},
|
27
25
|
{"content" => "Child 02 page."},
|
@@ -36,7 +34,7 @@ describe 'Yasuri' do
|
|
36
34
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
37
35
|
])
|
38
36
|
|
39
|
-
actual = root_node.
|
37
|
+
actual = root_node.scrape(@uri)
|
40
38
|
expect(actual).to be_empty
|
41
39
|
end
|
42
40
|
|
@@ -47,7 +45,7 @@ describe 'Yasuri' do
|
|
47
45
|
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
48
46
|
]),
|
49
47
|
])
|
50
|
-
actual = root_node.
|
48
|
+
actual = root_node.scrape(@uri)
|
51
49
|
expected = [
|
52
50
|
{"content" => "Child 01 page.",
|
53
51
|
"sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
|
@@ -61,14 +59,14 @@ describe 'Yasuri' do
|
|
61
59
|
end
|
62
60
|
it 'can be defined by DSL, return no contains if no child node' do
|
63
61
|
root_node = Yasuri.links_title '/html/body/a'
|
64
|
-
actual = root_node.
|
62
|
+
actual = root_node.scrape(@uri)
|
65
63
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
66
64
|
expect(actual).to match expected
|
67
65
|
end
|
68
66
|
|
69
67
|
it 'can be defined return no contains if no child node' do
|
70
68
|
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
71
|
-
actual = root_node.
|
69
|
+
actual = root_node.scrape(@uri)
|
72
70
|
expected = [{}, {}, {}] # Empty if no child node under links node.
|
73
71
|
expect(actual).to match expected
|
74
72
|
end
|
@@ -79,7 +77,7 @@ describe 'Yasuri' do
|
|
79
77
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
80
78
|
Yasuri::TextNode.new('/html/body/p', "name"),
|
81
79
|
])
|
82
|
-
compare_generated_vs_original(generated, original, @
|
80
|
+
compare_generated_vs_original(generated, original, @uri)
|
83
81
|
end
|
84
82
|
|
85
83
|
it 'can be defined by DSL, return recursive links node' do
|
@@ -96,7 +94,7 @@ describe 'Yasuri' do
|
|
96
94
|
Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
|
97
95
|
]),
|
98
96
|
])
|
99
|
-
compare_generated_vs_original(generated, original, @
|
97
|
+
compare_generated_vs_original(generated, original, @uri)
|
100
98
|
end
|
101
99
|
|
102
100
|
it 'return child node as symbol' do
|
@@ -104,7 +102,7 @@ describe 'Yasuri' do
|
|
104
102
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
105
103
|
])
|
106
104
|
|
107
|
-
actual = root_node.
|
105
|
+
actual = root_node.scrape(@uri, symbolize_names: true )
|
108
106
|
expected = [
|
109
107
|
{:content => "Child 01 page."},
|
110
108
|
{:content => "Child 02 page."},
|
@@ -112,5 +110,21 @@ describe 'Yasuri' do
|
|
112
110
|
]
|
113
111
|
expect(actual).to match expected
|
114
112
|
end
|
113
|
+
|
114
|
+
it 'scrape with interval for each request' do
|
115
|
+
allow(Kernel).to receive(:sleep)
|
116
|
+
|
117
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
|
118
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
119
|
+
])
|
120
|
+
actual = root_node.scrape(@uri, interval_ms: 100)
|
121
|
+
|
122
|
+
expect(actual.size).to match 3
|
123
|
+
|
124
|
+
# Kernel.sleep is expected 4 (1+3) times: once before the root page request and once before each of the 3 link requests
|
125
|
+
expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
|
126
|
+
expect(interval_sec).to match 0.1
|
127
|
+
end
|
128
|
+
end
|
115
129
|
end
|
116
130
|
end
|
data/spec/yasuri_map_spec.rb
CHANGED
@@ -4,8 +4,7 @@ describe 'Yasuri' do
|
|
4
4
|
include_context 'httpserver'
|
5
5
|
|
6
6
|
before do
|
7
|
-
@
|
8
|
-
@index_page = @agent.get(uri)
|
7
|
+
@uri = uri
|
9
8
|
end
|
10
9
|
|
11
10
|
describe '::MapNode' do
|
@@ -14,7 +13,7 @@ describe 'Yasuri' do
|
|
14
13
|
text_title '/html/head/title'
|
15
14
|
text_body_p '/html/body/p[1]'
|
16
15
|
end
|
17
|
-
actual = map.
|
16
|
+
actual = map.scrape(@uri)
|
18
17
|
|
19
18
|
expected = {
|
20
19
|
"title" => "Yasuri Test",
|
@@ -31,7 +30,7 @@ describe 'Yasuri' do
|
|
31
30
|
text_child03 '/html/body/a[3]'
|
32
31
|
end
|
33
32
|
end
|
34
|
-
actual = map.
|
33
|
+
actual = map.scrape(@uri)
|
35
34
|
|
36
35
|
expected = {
|
37
36
|
"group1" => {
|
@@ -57,7 +56,7 @@ describe 'Yasuri' do
|
|
57
56
|
text_child03 '/html/body/a[3]'
|
58
57
|
end
|
59
58
|
end
|
60
|
-
actual = map.
|
59
|
+
actual = map.scrape(@uri)
|
61
60
|
|
62
61
|
expected = {
|
63
62
|
"group1" => {
|
@@ -11,16 +11,14 @@ describe 'Yasuri' do
|
|
11
11
|
|
12
12
|
describe '::PaginateNode' do
|
13
13
|
before do
|
14
|
-
@agent = Mechanize.new
|
15
14
|
@uri = uri + "/pagination/page01.html"
|
16
|
-
@page = @agent.get(@uri)
|
17
15
|
end
|
18
16
|
|
19
17
|
it "scrape each paginated pages" do
|
20
18
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
21
19
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
22
20
|
])
|
23
|
-
actual = root_node.
|
21
|
+
actual = root_node.scrape(@uri)
|
24
22
|
expected = [
|
25
23
|
{"content" => "PaginationTest01"},
|
26
24
|
{"content" => "PaginationTest02"},
|
@@ -37,7 +35,7 @@ describe 'Yasuri' do
|
|
37
35
|
Yasuri::TextNode.new('./a', "text"),
|
38
36
|
]),
|
39
37
|
], flatten: true)
|
40
|
-
actual = root_node.
|
38
|
+
actual = root_node.scrape(@uri)
|
41
39
|
expected = [
|
42
40
|
"PaginationTest01",
|
43
41
|
{"text"=>""},
|
@@ -77,7 +75,7 @@ describe 'Yasuri' do
|
|
77
75
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
78
76
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
79
77
|
], limit:3)
|
80
|
-
actual = root_node.
|
78
|
+
actual = root_node.scrape(@uri)
|
81
79
|
expected = [
|
82
80
|
{"content" => "PaginationTest01"},
|
83
81
|
{"content" => "PaginationTest02"},
|
@@ -91,7 +89,7 @@ describe 'Yasuri' do
|
|
91
89
|
root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
|
92
90
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
93
91
|
])
|
94
|
-
actual = root_node.
|
92
|
+
actual = root_node.scrape(@uri)
|
95
93
|
expected = [ {"content" => "PaginationTest01"}, ]
|
96
94
|
expect(actual).to match_array expected
|
97
95
|
end
|
@@ -100,7 +98,7 @@ describe 'Yasuri' do
|
|
100
98
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
101
99
|
Yasuri::TextNode.new('/html/body/hoge', "content"),
|
102
100
|
])
|
103
|
-
actual = root_node.
|
101
|
+
actual = root_node.scrape(@uri)
|
104
102
|
expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
|
105
103
|
expect(actual).to match_array expected
|
106
104
|
end
|
@@ -112,7 +110,7 @@ describe 'Yasuri' do
|
|
112
110
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
113
111
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
114
112
|
])
|
115
|
-
compare_generated_vs_original(generated, original, @
|
113
|
+
compare_generated_vs_original(generated, original, @uri)
|
116
114
|
end
|
117
115
|
|
118
116
|
it 'can be defined by DSL, return single PaginateNode content limited' do
|
@@ -122,14 +120,14 @@ describe 'Yasuri' do
|
|
122
120
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
123
121
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
124
122
|
], limit: 2)
|
125
|
-
compare_generated_vs_original(generated, original, @
|
123
|
+
compare_generated_vs_original(generated, original, @uri)
|
126
124
|
end
|
127
125
|
|
128
126
|
it "return child node as symbol" do
|
129
127
|
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
130
128
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
131
129
|
])
|
132
|
-
actual = root_node.
|
130
|
+
actual = root_node.scrape(@uri, symbolize_names:true)
|
133
131
|
expected = [
|
134
132
|
{:content => "PaginationTest01"},
|
135
133
|
{:content => "PaginationTest02"},
|
@@ -138,5 +136,19 @@ describe 'Yasuri' do
|
|
138
136
|
]
|
139
137
|
expect(actual).to match expected
|
140
138
|
end
|
139
|
+
|
140
|
+
it "scrape with interval for each request" do
|
141
|
+
allow(Kernel).to receive(:sleep)
|
142
|
+
|
143
|
+
root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
144
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
145
|
+
])
|
146
|
+
actual = root_node.scrape(@uri, interval_ms: 1000)
|
147
|
+
expect(actual.size).to match 4
|
148
|
+
|
149
|
+
expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
|
150
|
+
expect(interval_sec).to match 1.0
|
151
|
+
end
|
152
|
+
end
|
141
153
|
end
|
142
154
|
end
|