yasuri 3.2.0 → 3.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+
2
+ # yasuri scrape "https://github.com/tac0x2a?tab=repositories" -f github.yml
3
+ text_title: /html/head/title
4
+ links_repo:
5
+ path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
6
+ text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
7
+ text_desc:
8
+ path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
9
+ proc: :strip
10
+ text_stars:
11
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
12
+ proc: :to_i
13
+ text_forks:
14
+ path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
15
+ proc: :to_i
@@ -0,0 +1,4 @@
1
+ {
2
+ "text_title": "/html/head/title",
3
+ "text_desc": "//*[@id=\"intro\"]/p"
4
+ }
@@ -0,0 +1,11 @@
1
+ # yasuri scrape "https://www.tac42.net/" -f sample.yml
2
+ links_each:
3
+ path: //*[@id="posts"]/article/header/h1/a
4
+ text_title: //*[@id="content"]/article/header/h1
5
+ text_description: /html/head/meta[12]/@content
6
+ text_date:
7
+ path: //*[@id="content"]/article/header/div/span
8
+ proc: :strip
9
+ text_length:
10
+ path: //*[@id="content"]
11
+ proc: :size
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "3.2.0"
2
+ VERSION = "3.3.0"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -17,6 +17,7 @@ require_relative 'yasuri_node_generator'
17
17
  module Yasuri
18
18
 
19
19
  DefaultRetryCount = 5
20
+ DefaultInterval_ms = 0
20
21
 
21
22
  def self.json2tree(json_string)
22
23
  raise RuntimeError if json_string.nil? or json_string.empty?
@@ -112,12 +113,15 @@ module Yasuri
112
113
  symbolize_names ? name.to_sym : name
113
114
  end
114
115
 
115
- def self.with_retry(retry_count = 5)
116
+ def self.with_retry(
117
+ retry_count = DefaultRetryCount,
118
+ interval_ms = DefaultInterval_ms)
119
+
116
120
  begin
121
+ Kernel.sleep(interval_ms * 0.001)
117
122
  return yield() if block_given?
118
123
  rescue => e
119
124
  if retry_count > 0
120
- pp "retry #{retry_count}"
121
125
  retry_count -= 1
122
126
  retry
123
127
  end
@@ -1,7 +1,6 @@
1
1
  require 'thor'
2
2
  require 'json'
3
3
  require 'yasuri'
4
- require 'mechanize'
5
4
 
6
5
  module Yasuri
7
6
  class CLI < Thor
@@ -9,8 +8,9 @@ module Yasuri
9
8
 
10
9
  default_command :scrape
11
10
  desc "scrape <URI> [[--file <TREE_FILE>] or [--json <JSON>]]", "Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string."
12
- option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
13
- option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
11
+ option :file, {aliases: 'f', desc: "path to file that written yasuri tree as json or yaml", type: :string}
12
+ option :json, {aliases: 'j', desc: "yasuri tree format json string", type: :string}
13
+ option :interval, {aliases: 'i', desc: "interval each request [ms]", type: :numeric}
14
14
  def scrape(uri)
15
15
  # argument validations
16
16
  if [options[:file], options[:json]].compact.count != 1
@@ -26,6 +26,8 @@ module Yasuri
26
26
  return -1
27
27
  end
28
28
 
29
+ interval_ms = options[:interval] || Yasuri::DefaultInterval_ms
30
+
29
31
  tree = if options[:file]
30
32
  src = File.read(options[:file])
31
33
 
@@ -48,9 +50,7 @@ module Yasuri
48
50
  end
49
51
  end
50
52
 
51
- agent = Mechanize.new
52
- root_page = agent.get(uri)
53
- result = tree.inject(agent, root_page)
53
+ result = tree.scrape(uri, interval_ms: interval_ms)
54
54
 
55
55
  if result.instance_of?(String)
56
56
  puts result
@@ -6,13 +6,15 @@ require_relative 'yasuri_node'
6
6
  module Yasuri
7
7
  class LinksNode
8
8
  include Node
9
+
9
10
  def inject(agent, page, opt = {}, element = page)
10
11
  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
12
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
11
13
 
12
14
  links = element.search(@xpath) || [] # links expected
13
15
  links.map do |link|
14
16
  link_button = Mechanize::Page::Link.new(link, agent, page)
15
- child_page = Yasuri.with_retry(retry_count) { link_button.click }
17
+ child_page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
16
18
 
17
19
  child_results_kv = @children.map do |child_node|
18
20
  child_name = Yasuri.node_name(child_node.name, opt)
@@ -1,6 +1,7 @@
1
1
 
2
2
  module Yasuri
3
3
  class MapNode
4
+ include Node
4
5
  attr_reader :name, :children
5
6
 
6
7
  def initialize(name, children, **opt)
@@ -11,6 +11,20 @@ module Yasuri
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
+ def scrape(uri, opt = {})
15
+ retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
16
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
17
+
18
+ agent = Mechanize.new
19
+ page = Yasuri.with_retry(retry_count, interval_ms) { agent.get(uri) }
20
+ scrape_with_agent(uri, agent, opt)
21
+ end
22
+
23
+ def scrape_with_agent(uri, agent, opt = {})
24
+ page = agent.get(uri)
25
+ inject(agent, page, opt)
26
+ end
27
+
14
28
  def inject(agent, page, opt = {}, element = page)
15
29
  fail "#{Kernel.__method__} is not implemented in included class."
16
30
  end
@@ -15,6 +15,7 @@ module Yasuri
15
15
 
16
16
  def inject(agent, page, opt = {}, element = page)
17
17
  retry_count = opt[:retry_count] || Yasuri::DefaultRetryCount
18
+ interval_ms = opt[:interval_ms] || Yasuri::DefaultInterval_ms
18
19
 
19
20
  raise NotImplementedError.new("PagenateNode inside StructNode, Not Supported") if page != element
20
21
 
@@ -31,7 +32,7 @@ module Yasuri
31
32
  break if link == nil
32
33
 
33
34
  link_button = Mechanize::Page::Link.new(link, agent, page)
34
- page = Yasuri.with_retry(retry_count) { link_button.click }
35
+ page = Yasuri.with_retry(retry_count, interval_ms) { link_button.click }
35
36
  break if (limit -= 1) <= 0
36
37
  end
37
38
 
data/spec/spec_helper.rb CHANGED
@@ -26,8 +26,8 @@ SimpleCov.start
26
26
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
27
27
  require 'yasuri'
28
28
 
29
- def compare_generated_vs_original(generated, original, page)
30
- expected = original.inject(@agent, page)
31
- actual = generated.inject(@agent, page)
29
+ def compare_generated_vs_original(generated, original, uri)
30
+ expected = original.scrape(uri)
31
+ actual = generated.scrape(uri)
32
32
  expect(actual).to match expected
33
33
  end
@@ -51,15 +51,28 @@ describe 'Yasuri' do
51
51
 
52
52
  it "display text node as simple string via json file" do
53
53
  expect {
54
- Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.json"})
55
- }.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
54
+ Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.json"})
55
+ }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
56
56
  end
57
57
  it "display text node as simple string via yaml file" do
58
58
  expect {
59
- Yasuri::CLI.new.invoke(:scrape, [uri], {file: "#{@res_dir}/tree.yml"})
60
- }.to output('[{"content":"Hello,YasuriLast Modify - 2015/02/14"}]' + "\n").to_stdout
59
+ Yasuri::CLI.new.invoke(:scrape, [uri+"/pagination/page01.html"], {file: "#{@res_dir}/tree.yml"})
60
+ }.to output('[{"content":"PaginationTest01"},{"content":"PaginationTest02"},{"content":"PaginationTest03"},{"content":"PaginationTest04"}]' + "\n").to_stdout
61
61
  end
62
62
 
63
+ it "interval option is effect for each request" do
64
+ allow(Kernel).to receive(:sleep)
65
+
66
+ Yasuri::CLI.new.invoke(
67
+ :scrape,
68
+ [uri+"/pagination/page01.html"],
69
+ {file: "#{@res_dir}/tree.yml", interval: 500}
70
+ )
71
+
72
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
73
+ expect(interval_sec).to match 0.5
74
+ end
75
+ end
63
76
 
64
77
  it "display ERROR when json string is wrong" do
65
78
  wrong_json = '{,,}'
@@ -11,9 +11,7 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::LinksNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
14
  @uri = uri
16
- @index_page = @agent.get(@uri)
17
15
  end
18
16
 
19
17
  it 'scrape links' do
@@ -21,7 +19,7 @@ describe 'Yasuri' do
21
19
  Yasuri::TextNode.new('/html/body/p', "content"),
22
20
  ])
23
21
 
24
- actual = root_node.inject(@agent, @index_page)
22
+ actual = root_node.scrape(@uri)
25
23
  expected = [
26
24
  {"content" => "Child 01 page."},
27
25
  {"content" => "Child 02 page."},
@@ -36,7 +34,7 @@ describe 'Yasuri' do
36
34
  Yasuri::TextNode.new('/html/body/p', "content"),
37
35
  ])
38
36
 
39
- actual = root_node.inject(@agent, @index_page)
37
+ actual = root_node.scrape(@uri)
40
38
  expect(actual).to be_empty
41
39
  end
42
40
 
@@ -47,7 +45,7 @@ describe 'Yasuri' do
47
45
  Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
48
46
  ]),
49
47
  ])
50
- actual = root_node.inject(@agent, @index_page)
48
+ actual = root_node.scrape(@uri)
51
49
  expected = [
52
50
  {"content" => "Child 01 page.",
53
51
  "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
@@ -61,14 +59,14 @@ describe 'Yasuri' do
61
59
  end
62
60
  it 'can be defined by DSL, return no contains if no child node' do
63
61
  root_node = Yasuri.links_title '/html/body/a'
64
- actual = root_node.inject(@agent, @index_page)
62
+ actual = root_node.scrape(@uri)
65
63
  expected = [{}, {}, {}] # Empty if no child node under links node.
66
64
  expect(actual).to match expected
67
65
  end
68
66
 
69
67
  it 'can be defined return no contains if no child node' do
70
68
  root_node = Yasuri::LinksNode.new('/html/body/a', "title")
71
- actual = root_node.inject(@agent, @index_page)
69
+ actual = root_node.scrape(@uri)
72
70
  expected = [{}, {}, {}] # Empty if no child node under links node.
73
71
  expect(actual).to match expected
74
72
  end
@@ -79,7 +77,7 @@ describe 'Yasuri' do
79
77
  original = Yasuri::LinksNode.new('/html/body/a', "root", [
80
78
  Yasuri::TextNode.new('/html/body/p', "name"),
81
79
  ])
82
- compare_generated_vs_original(generated, original, @index_page)
80
+ compare_generated_vs_original(generated, original, @uri)
83
81
  end
84
82
 
85
83
  it 'can be defined by DSL, return recursive links node' do
@@ -96,7 +94,7 @@ describe 'Yasuri' do
96
94
  Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
97
95
  ]),
98
96
  ])
99
- compare_generated_vs_original(generated, original, @index_page)
97
+ compare_generated_vs_original(generated, original, @uri)
100
98
  end
101
99
 
102
100
  it 'return child node as symbol' do
@@ -104,7 +102,7 @@ describe 'Yasuri' do
104
102
  Yasuri::TextNode.new('/html/body/p', "content"),
105
103
  ])
106
104
 
107
- actual = root_node.inject(@agent, @index_page, symbolize_names: true )
105
+ actual = root_node.scrape(@uri, symbolize_names: true )
108
106
  expected = [
109
107
  {:content => "Child 01 page."},
110
108
  {:content => "Child 02 page."},
@@ -112,5 +110,21 @@ describe 'Yasuri' do
112
110
  ]
113
111
  expect(actual).to match expected
114
112
  end
113
+
114
+ it 'scrape with interval for each request' do
115
+ allow(Kernel).to receive(:sleep)
116
+
117
+ root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
118
+ Yasuri::TextNode.new('/html/body/p', "content"),
119
+ ])
120
+ actual = root_node.scrape(@uri, interval_ms: 100)
121
+
122
+ expect(actual.size).to match 3
123
+
124
+ # request will be run 4(1+3) times because root page will be requested
125
+ expect(Kernel).to have_received(:sleep).exactly(1+3).times do |interval_sec|
126
+ expect(interval_sec).to match 0.1
127
+ end
128
+ end
115
129
  end
116
130
  end
@@ -4,8 +4,7 @@ describe 'Yasuri' do
4
4
  include_context 'httpserver'
5
5
 
6
6
  before do
7
- @agent = Mechanize.new
8
- @index_page = @agent.get(uri)
7
+ @uri = uri
9
8
  end
10
9
 
11
10
  describe '::MapNode' do
@@ -14,7 +13,7 @@ describe 'Yasuri' do
14
13
  text_title '/html/head/title'
15
14
  text_body_p '/html/body/p[1]'
16
15
  end
17
- actual = map.inject(@agent, @index_page)
16
+ actual = map.scrape(@uri)
18
17
 
19
18
  expected = {
20
19
  "title" => "Yasuri Test",
@@ -31,7 +30,7 @@ describe 'Yasuri' do
31
30
  text_child03 '/html/body/a[3]'
32
31
  end
33
32
  end
34
- actual = map.inject(@agent, @index_page)
33
+ actual = map.scrape(@uri)
35
34
 
36
35
  expected = {
37
36
  "group1" => {
@@ -57,7 +56,7 @@ describe 'Yasuri' do
57
56
  text_child03 '/html/body/a[3]'
58
57
  end
59
58
  end
60
- actual = map.inject(@agent, @index_page)
59
+ actual = map.scrape(@uri)
61
60
 
62
61
  expected = {
63
62
  "group1" => {
@@ -11,16 +11,14 @@ describe 'Yasuri' do
11
11
 
12
12
  describe '::PaginateNode' do
13
13
  before do
14
- @agent = Mechanize.new
15
14
  @uri = uri + "/pagination/page01.html"
16
- @page = @agent.get(@uri)
17
15
  end
18
16
 
19
17
  it "scrape each paginated pages" do
20
18
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
21
19
  Yasuri::TextNode.new('/html/body/p', "content"),
22
20
  ])
23
- actual = root_node.inject(@agent, @page)
21
+ actual = root_node.scrape(@uri)
24
22
  expected = [
25
23
  {"content" => "PaginationTest01"},
26
24
  {"content" => "PaginationTest02"},
@@ -37,7 +35,7 @@ describe 'Yasuri' do
37
35
  Yasuri::TextNode.new('./a', "text"),
38
36
  ]),
39
37
  ], flatten: true)
40
- actual = root_node.inject(@agent, @page)
38
+ actual = root_node.scrape(@uri)
41
39
  expected = [
42
40
  "PaginationTest01",
43
41
  {"text"=>""},
@@ -77,7 +75,7 @@ describe 'Yasuri' do
77
75
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
78
76
  Yasuri::TextNode.new('/html/body/p', "content"),
79
77
  ], limit:3)
80
- actual = root_node.inject(@agent, @page)
78
+ actual = root_node.scrape(@uri)
81
79
  expected = [
82
80
  {"content" => "PaginationTest01"},
83
81
  {"content" => "PaginationTest02"},
@@ -91,7 +89,7 @@ describe 'Yasuri' do
91
89
  root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
92
90
  Yasuri::TextNode.new('/html/body/p', "content"),
93
91
  ])
94
- actual = root_node.inject(@agent, @page)
92
+ actual = root_node.scrape(@uri)
95
93
  expected = [ {"content" => "PaginationTest01"}, ]
96
94
  expect(actual).to match_array expected
97
95
  end
@@ -100,7 +98,7 @@ describe 'Yasuri' do
100
98
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
101
99
  Yasuri::TextNode.new('/html/body/hoge', "content"),
102
100
  ])
103
- actual = root_node.inject(@agent, @page)
101
+ actual = root_node.scrape(@uri)
104
102
  expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
105
103
  expect(actual).to match_array expected
106
104
  end
@@ -112,7 +110,7 @@ describe 'Yasuri' do
112
110
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
113
111
  Yasuri::TextNode.new('/html/body/p', "content"),
114
112
  ])
115
- compare_generated_vs_original(generated, original, @page)
113
+ compare_generated_vs_original(generated, original, @uri)
116
114
  end
117
115
 
118
116
  it 'can be defined by DSL, return single PaginateNode content limited' do
@@ -122,14 +120,14 @@ describe 'Yasuri' do
122
120
  original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
123
121
  Yasuri::TextNode.new('/html/body/p', "content"),
124
122
  ], limit: 2)
125
- compare_generated_vs_original(generated, original, @page)
123
+ compare_generated_vs_original(generated, original, @uri)
126
124
  end
127
125
 
128
126
  it "return child node as symbol" do
129
127
  root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
130
128
  Yasuri::TextNode.new('/html/body/p', "content"),
131
129
  ])
132
- actual = root_node.inject(@agent, @page, symbolize_names:true)
130
+ actual = root_node.scrape(@uri, symbolize_names:true)
133
131
  expected = [
134
132
  {:content => "PaginationTest01"},
135
133
  {:content => "PaginationTest02"},
@@ -138,5 +136,19 @@ describe 'Yasuri' do
138
136
  ]
139
137
  expect(actual).to match expected
140
138
  end
139
+
140
+ it "scrape with interval for each request" do
141
+ allow(Kernel).to receive(:sleep)
142
+
143
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
144
+ Yasuri::TextNode.new('/html/body/p', "content"),
145
+ ])
146
+ actual = root_node.scrape(@uri, interval_ms: 1000)
147
+ expect(actual.size).to match 4
148
+
149
+ expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
150
+ expect(interval_sec).to match 1.0
151
+ end
152
+ end
141
153
  end
142
154
  end