RubyGems - yasuri - Versions diffs - 3.0.0 → 3.3.2 - Mend

yasuri 3.0.0 → 3.3.2

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (41)

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +1 -1
data/.rubocop.yml +49 -0
data/.rubocop_todo.yml +0 -0
data/README.md +70 -27
data/Rakefile +1 -1
data/USAGE.ja.md +366 -131
data/USAGE.md +371 -136
data/examples/example.rb +78 -0
data/examples/github.yml +15 -0
data/examples/sample.json +4 -0
data/examples/sample.yml +11 -0
data/exe/yasuri +5 -0
data/lib/yasuri.rb +1 -0
data/lib/yasuri/version.rb +1 -1
data/lib/yasuri/yasuri.rb +96 -76
data/lib/yasuri/yasuri_cli.rb +78 -0
data/lib/yasuri/yasuri_links_node.rb +10 -6
data/lib/yasuri/yasuri_map_node.rb +40 -0
data/lib/yasuri/yasuri_node.rb +36 -4
data/lib/yasuri/yasuri_node_generator.rb +14 -9
data/lib/yasuri/yasuri_paginate_node.rb +26 -16
data/lib/yasuri/yasuri_struct_node.rb +6 -4
data/lib/yasuri/yasuri_text_node.rb +9 -7
data/spec/cli_resources/tree.json +8 -0
data/spec/cli_resources/tree.yml +5 -0
data/spec/cli_resources/tree_wrong.json +9 -0
data/spec/cli_resources/tree_wrong.yml +6 -0
data/spec/servers/httpserver.rb +0 -2
data/spec/spec_helper.rb +4 -6
data/spec/yasuri_cli_spec.rb +114 -0
data/spec/yasuri_links_node_spec.rb +82 -58
data/spec/yasuri_map_spec.rb +71 -0
data/spec/yasuri_paginate_node_spec.rb +99 -88
data/spec/yasuri_spec.rb +196 -138
data/spec/yasuri_struct_node_spec.rb +120 -100
data/spec/yasuri_text_node_spec.rb +22 -32
data/yasuri.gemspec +29 -22
metadata +105 -15
data/app.rb +0 -52
data/spec/yasuri_node_spec.rb +0 -11

data/spec/yasuri_links_node_spec.rb CHANGED Viewed

@@ -1,85 +1,86 @@
-# Author::    TAC (tac@tac42.net)
 require_relative 'spec_helper'
-#########
-# Links #
-#########
 describe 'Yasuri' do
   include_context 'httpserver'
   describe '::LinksNode' do
-    before do
-      @agent = Mechanize.new
-      @uri = uri
-      @index_page = @agent.get(@uri)
-    end
     it 'scrape links' do
-      root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
+      root_node = Yasuri::LinksNode.new(
+        '/html/body/a', "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
-      actual = root_node.inject(@agent, @index_page)
+      actual = root_node.scrape(uri)
       expected = [
-        {"content" => "Child 01 page."},
-        {"content" => "Child 02 page."},
-        {"content" => "Child 03 page."},
+        { "content" => "Child 01 page." },
+        { "content" => "Child 02 page." },
+        { "content" => "Child 03 page." }
       ]
       expect(actual).to match expected
     end
     it 'return empty set if no match node' do
       missing_xpath = '/html/body/b'
-      root_node = Yasuri::LinksNode.new(missing_xpath, "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
+      root_node = Yasuri::LinksNode.new(
+        missing_xpath, "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
-      actual = root_node.inject(@agent, @index_page)
+      actual = root_node.scrape(uri)
       expect(actual).to be_empty
     end
     it 'scrape links, recursive' do
-      root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-        Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
-          Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
-        ]),
-      ])
-      actual = root_node.inject(@agent, @index_page)
+      root_node = Yasuri::LinksNode.new(
+        '/html/body/a', "root", [
+          Yasuri::TextNode.new('/html/body/p', "content"),
+          Yasuri::LinksNode.new(
+            '/html/body/ul/li/a', "sub_link", [
+              Yasuri::TextNode.new('/html/head/title', "sub_page_title")
+            ]
+          )
+        ]
+      )
+      actual = root_node.scrape(uri)
       expected = [
-        {"content"  => "Child 01 page.",
-         "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
-                        {"sub_page_title" => "Child 02 SubPage Test"}],},
-        {"content" => "Child 02 page.",
-         "sub_link" => [],},
-        {"content" => "Child 03 page.",
-         "sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
+        { "content" => "Child 01 page.",
+          "sub_link" => [{ "sub_page_title" => "Child 01 SubPage Test" },
+                         { "sub_page_title" => "Child 02 SubPage Test" }] },
+        { "content" => "Child 02 page.",
+          "sub_link" => [] },
+        { "content" => "Child 03 page.",
+          "sub_link" => [{ "sub_page_title" => "Child 03 SubPage Test" }] }
       ]
       expect(actual).to match expected
     end
     it 'can be defined by DSL, return no contains if no child node' do
       root_node = Yasuri.links_title '/html/body/a'
-      actual = root_node.inject(@agent, @index_page)
+      actual = root_node.scrape(uri)
       expected = [{}, {}, {}] # Empty if no child node under links node.
       expect(actual).to match expected
     end
     it 'can be defined return no contains if no child node' do
       root_node = Yasuri::LinksNode.new('/html/body/a', "title")
-      actual = root_node.inject(@agent, @index_page)
+      actual = root_node.scrape(uri)
       expected = [{}, {}, {}] # Empty if no child node under links node.
       expect(actual).to match expected
     end
     it 'can be defined by DSL, return nested contents under link' do
       generated = Yasuri.links_title '/html/body/a' do
-                     text_name '/html/body/p'
-                  end
-      original = Yasuri::LinksNode.new('/html/body/a', "root", [
-        Yasuri::TextNode.new('/html/body/p', "name"),
-      ])
-      compare_generated_vs_original(generated, original, @index_page)
+        text_name '/html/body/p'
+      end
+      original = Yasuri::LinksNode.new(
+        '/html/body/a', "root", [
+          Yasuri::TextNode.new('/html/body/p', "name")
+        ]
+      )
+      compare_generated_vs_original(generated, original, uri)
     end
     it 'can be defined by DSL, return recursive links node' do
@@ -90,27 +91,50 @@ describe 'Yasuri' do
         end
       end
-      original = Yasuri::LinksNode.new('/html/body/a', "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-        Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
-          Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
-        ]),
-      ])
-      compare_generated_vs_original(generated, original, @index_page)
+      original = Yasuri::LinksNode.new(
+        '/html/body/a', "root", [
+          Yasuri::TextNode.new('/html/body/p', "content"),
+          Yasuri::LinksNode.new(
+            '/html/body/ul/li/a', "sub_link", [
+              Yasuri::TextNode.new('/html/head/title', "sub_page_title")
+            ]
+          )
+        ]
+      )
+      compare_generated_vs_original(generated, original, uri)
     end
     it 'return child node as symbol' do
-      root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
+      root_node = Yasuri::LinksNode.new(
+        '/html/body/a', "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
-      actual = root_node.inject(@agent, @index_page, symbolize_names: true )
+      actual = root_node.scrape(uri, symbolize_names: true)
       expected = [
-        {:content => "Child 01 page."},
-        {:content => "Child 02 page."},
-        {:content => "Child 03 page."},
+        { content: "Child 01 page." },
+        { content: "Child 02 page." },
+        { content: "Child 03 page." }
       ]
       expect(actual).to match expected
     end
+    it 'scrape with interval for each request' do
+      allow(Kernel).to receive(:sleep)
+      root_node = Yasuri::LinksNode.new(
+        '/html/body/a', "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
+      actual = root_node.scrape(uri, interval_ms: 100)
+      expect(actual.size).to match 3
+      # request will be run 4(1+3) times because root page will be requested
+      expect(Kernel).to have_received(:sleep).exactly(1 + 3).times do |interval_sec|
+        expect(interval_sec).to match 0.1
+      end
+    end
   end
 end

data/spec/yasuri_map_spec.rb ADDED Viewed

@@ -0,0 +1,71 @@
+require_relative 'spec_helper'
+describe 'Yasuri' do
+  include_context 'httpserver'
+  describe '::MapNode' do
+    it "multi scrape in singe page" do
+      map = Yasuri.map_sample do
+        text_title  '/html/head/title'
+        text_body_p '/html/body/p[1]'
+      end
+      actual = map.scrape(uri)
+      expected = {
+        "title" => "Yasuri Test",
+        "body_p" => "Hello,Yasuri"
+      }
+      expect(actual).to include expected
+    end
+    it "nested multi scrape in singe page" do
+      map = Yasuri.map_sample do
+        map_group1 { text_child01 '/html/body/a[1]' }
+        map_group2 do
+          text_child01 '/html/body/a[1]'
+          text_child03 '/html/body/a[3]'
+        end
+      end
+      actual = map.scrape(uri)
+      expected = {
+        "group1" => {
+          "child01" => "child01"
+        },
+        "group2" => {
+          "child01" => "child01",
+          "child03" => "child03"
+        }
+      }
+      expect(actual).to include expected
+    end
+    it "scrape with links node" do
+      map = Yasuri.map_sample do
+        map_group1 do
+          links_a '/html/body/a' do
+            text_content '/html/body/p'
+          end
+          text_child01 '/html/body/a[1]'
+        end
+        map_group2 do
+          text_child03 '/html/body/a[3]'
+        end
+      end
+      actual = map.scrape(uri)
+      expected = {
+        "group1" => {
+          "a" => [
+            { "content" => "Child 01 page." },
+            { "content" => "Child 02 page." },
+            { "content" => "Child 03 page." }
+          ],
+          "child01" => "child01"
+        },
+        "group2" => { "child03" => "child03" }
+      }
+      expect(actual).to include expected
+    end
+  end
+end

data/spec/yasuri_paginate_node_spec.rb CHANGED Viewed

@@ -1,107 +1,96 @@
-# Author::    TAC (tac@tac42.net)
 require_relative 'spec_helper'
-############
-# Paginate #
-############
 describe 'Yasuri' do
   include_context 'httpserver'
   describe '::PaginateNode' do
-    before do
-      @agent = Mechanize.new
-      @uri = uri + "/pagination/page01.html"
-      @page = @agent.get(@uri)
-    end
+    let(:uri_paginate) { "#{uri}/pagination/page01.html" }
     it "scrape each paginated pages" do
-      root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
-      actual = root_node.inject(@agent, @page)
+      root_node = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
+      actual = root_node.scrape(uri_paginate)
       expected = [
-        {"content" => "PaginationTest01"},
-        {"content" => "PaginationTest02"},
-        {"content" => "PaginationTest03"},
-        {"content" => "PaginationTest04"},
+        { "content" => "PaginationTest01" },
+        { "content" => "PaginationTest02" },
+        { "content" => "PaginationTest03" },
+        { "content" => "PaginationTest04" }
       ]
       expect(actual).to match expected
     end
     it "scrape each paginated pages with flatten" do
-      root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-        Yasuri::StructNode.new('/html/body/nav/span', "span", [
-          Yasuri::TextNode.new('./a', "text"),
-        ]),
-      ], flatten: true)
-      actual = root_node.inject(@agent, @page)
+      root_node = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content"),
+          Yasuri::StructNode.new(
+            '/html/body/nav/span', "span", [
+              Yasuri::TextNode.new('./a', "text")
+            ]
+          )
+        ], flatten: true
+      )
+      actual = root_node.scrape(uri_paginate)
       expected = [
-        "PaginationTest01",
-        {"text"=>""},
-        {"text"=>""},
-        {"text" => "2"},
-        {"text" => "3"},
-        {"text" => "4"},
-        {"text"=>"NextPage »"},
-        "PaginationTest02",
-        {"text"=>"« PreviousPage"},
-        {"text" => "1"},
-        {"text"=>""},
-        {"text" => "3"},
-        {"text" => "4"},
-        {"text"=>"NextPage »"},
-        "PaginationTest03",
-        {"text"=>"« PreviousPage"},
-        {"text" => "1"},
-        {"text" => "2"},
-        {"text"=>""},
-        {"text" => "4"},
-        {"text"=>"NextPage »"},
-        "PaginationTest04",
-        {"text"=>"« PreviousPage"},
-        {"text" => "1"},
-        {"text" => "2"},
-        {"text" => "3"},
-        {"text"=>""},
-        {"text"=>""},
+        "PaginationTest01", { "text" => "" },
+        { "text" => "" }, { "text" => "2" }, { "text" => "3" }, { "text" => "4" },
+        { "text" => "NextPage »" },
+        "PaginationTest02", { "text" => "« PreviousPage" },
+        { "text" => "1" }, { "text" => "" }, { "text" => "3" }, { "text" => "4" },
+        { "text" => "NextPage »" },
+        "PaginationTest03", { "text" => "« PreviousPage" },
+        { "text" => "1" }, { "text" => "2" }, { "text" => "" }, { "text" => "4" },
+        { "text" => "NextPage »" },
+        "PaginationTest04", { "text" => "« PreviousPage" },
+        { "text" => "1" }, { "text" => "2" }, { "text" => "3" }, { "text" => "" },
+        { "text" => "" }
       ]
       expect(actual).to match expected
     end
     it "scrape each paginated pages limited" do
-      root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ], limit:3)
-      actual = root_node.inject(@agent, @page)
+      root_node = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ], limit: 3
+      )
+      actual = root_node.scrape(uri_paginate)
       expected = [
-        {"content" => "PaginationTest01"},
-        {"content" => "PaginationTest02"},
-        {"content" => "PaginationTest03"},
+        { "content" => "PaginationTest01" },
+        { "content" => "PaginationTest02" },
+        { "content" => "PaginationTest03" }
       ]
       expect(actual).to match expected
     end
     it 'return first content if paginate link node is not found' do
       missing_xpath = "/html/body/nav/span/b[@class='next']"
-      root_node = Yasuri::PaginateNode.new(missing_xpath, "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
-      actual = root_node.inject(@agent, @page)
-      expected = [ {"content" => "PaginationTest01"}, ]
+      root_node = Yasuri::PaginateNode.new(
+        missing_xpath, "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
+      actual = root_node.scrape(uri_paginate)
+      expected = [{ "content" => "PaginationTest01" }]
       expect(actual).to match_array expected
     end
     it 'return empty hashes if content node is not found' do
-      root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/hoge', "content"),
-      ])
-      actual = root_node.inject(@agent, @page)
-      expected = [ {"content" => ""}, {"content" => ""}, {"content" => ""}, {"content" => ""},]
+      root_node = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/hoge', "content")
+        ]
+      )
+      actual = root_node.scrape(uri_paginate)
+      expected = [{ "content" => "" }, { "content" => "" }, { "content" => "" }, { "content" => "" }]
       expect(actual).to match_array expected
     end
@@ -109,34 +98,56 @@ describe 'Yasuri' do
       generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']" do
         text_content '/html/body/p'
       end
-      original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
-      compare_generated_vs_original(generated, original, @page)
+      original = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
+      compare_generated_vs_original(generated, original, uri_paginate)
     end
     it 'can be defined by DSL, return single PaginateNode content limited' do
-      generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit:2 do
+      generated = Yasuri.pages_next "/html/body/nav/span/a[@class='next']", limit: 2 do
         text_content '/html/body/p'
       end
-      original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ], limit: 2)
-      compare_generated_vs_original(generated, original, @page)
+      original = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ], limit: 2
+      )
+      compare_generated_vs_original(generated, original, uri_paginate)
     end
     it "return child node as symbol" do
-      root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
-        Yasuri::TextNode.new('/html/body/p', "content"),
-      ])
-      actual = root_node.inject(@agent, @page, symbolize_names:true)
+      root_node = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
+      actual = root_node.scrape(uri_paginate, symbolize_names: true)
       expected = [
-        {:content => "PaginationTest01"},
-        {:content => "PaginationTest02"},
-        {:content => "PaginationTest03"},
-        {:content => "PaginationTest04"},
+        { content: "PaginationTest01" },
+        { content: "PaginationTest02" },
+        { content: "PaginationTest03" },
+        { content: "PaginationTest04" }
       ]
       expect(actual).to match expected
     end
+    it "scrape with interval for each request" do
+      allow(Kernel).to receive(:sleep)
+      root_node = Yasuri::PaginateNode.new(
+        "/html/body/nav/span/a[@class='next']", "root", [
+          Yasuri::TextNode.new('/html/body/p', "content")
+        ]
+      )
+      actual = root_node.scrape(uri_paginate, interval_ms: 1000)
+      expect(actual.size).to match 4
+      expect(Kernel).to have_received(:sleep).exactly(4).times do |interval_sec|
+        expect(interval_sec).to match 1.0
+      end
+    end
   end
 end