yasuri 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1573790f94cdd8bf8621b1178f7c5ef5e00eee15
4
+ data.tar.gz: b37d3a38cf679b38a1d5d9dba2bc785e54d6d2b3
5
+ SHA512:
6
+ metadata.gz: 8c8da39e14541e21ed520c051a4e425d6a5aed2f530b7b50e1652894ead3678d4717db7aa686a64f6c6f905d3df1e1bbfe71d1226381b0f08778a043fafaa2ba
7
+ data.tar.gz: 1ee0447e8757ea26ff7b4dc36cc089dda9003be8de746259e8f430d27c608e0519e2558daf37b145f009e69b909c6647990f1e46876c47d47df7ba24b75f3862
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.gitignore ADDED
@@ -0,0 +1,70 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /test/tmp/
9
+ /test/version_tmp/
10
+ /tmp/
11
+
12
+ ## Specific to RubyMotion:
13
+ .dat*
14
+ .repl_history
15
+ build/
16
+
17
+ ## Documentation cache and generated files:
18
+ /.yardoc/
19
+ /_yardoc/
20
+ /doc/
21
+ /rdoc/
22
+
23
+ ## Environment normalisation:
24
+ /.bundle/
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ # Gemfile.lock
30
+ # .ruby-version
31
+ # .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
35
+
36
+ # -*- mode: gitignore; -*-
37
+ *~
38
+ \#*\#
39
+ /.emacs.desktop
40
+ /.emacs.desktop.lock
41
+ *.elc
42
+ auto-save-list
43
+ tramp
44
+ .\#*
45
+
46
+ # Org-mode
47
+ .org-id-locations
48
+ *_archive
49
+
50
+ # flymake-mode
51
+ *_flymake.*
52
+
53
+ # eshell files
54
+ /eshell/history
55
+ /eshell/lastdir
56
+
57
+ # elpa packages
58
+ /elpa/
59
+
60
+ # reftex files
61
+ *.rel
62
+
63
+ # AUCTeX auto folder
64
+ /auto/
65
+
66
+ # cask packages
67
+ .cask/
68
+
69
+ .ruby-version
70
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
4
+ script:
5
+ - ruby --version
6
+ - rspec spec
7
+ addons:
8
+ code_climate:
9
+ repo_token: 0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in yasuri.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 TAC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # Yasuri [![Build Status](https://travis-ci.org/tac0x2a/yasuri.svg?branch=master)](https://travis-ci.org/tac0x2a/yasuri) [![Coverage Status](https://coveralls.io/repos/tac0x2a/yasuri/badge.svg?branch=master)](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [![Code Climate](https://codeclimate.com/github/tac0x2a/yasuri/badges/gpa.svg)](https://codeclimate.com/github/tac0x2a/yasuri)
2
+
3
+ Yasuri (鑢) is a library that makes web scraping with "[Mechanize](https://github.com/sparklemotion/mechanize)" easy.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'yasuri'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install yasuri
20
+
21
+ ## Usage
22
+
23
+ ```ruby
24
+ # Construct the node tree with the DSL
25
+ root = links_root '//*[@id="menu"]/ul/li/a' do
26
+ text_title '//*[@id="contents"]/h2'
27
+ text_content '//*[@id="contents"]/p[1]'
28
+ end
29
+
30
+ # Construct the same node tree from JSON
31
+ src = <<-EOJSON
32
+ { "node" : "links",
33
+ "name" : "root",
34
+ "path" : "//*[@id='menu']/ul/li/a",
35
+ "children" : [
36
+ { "node" : "text",
37
+ "name" : "title",
38
+ "path" : "//*[@id='contents']/h2"
39
+ },
40
+ { "node" : "text",
41
+ "name" : "content",
42
+ "path" : "//*[@id='contents']/p[1]"
43
+ }
44
+ ]
45
+ }
46
+ EOJSON
47
+ root = Yasuri.json2tree(src)
48
+
49
+ agent = Mechanize.new
50
+ root_page = agent.get("http://some.scraping.page.net/")
51
+
52
+ result = root.inject(agent, root_page)
53
+ # => [ {"title" => "PageTitle", "content" => "Page Contents" }, ... ]
54
+ ```
55
+
56
+
57
+ ## Contributing
58
+
59
+ 1. Fork it ( https://github.com/tac0x2a/yasuri/fork )
60
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
61
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
62
+ 4. Push to the branch (`git push origin my-new-feature`)
63
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
7
+
data/app.rb ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Author:: TAC (tac@tac42.net)
5
+
6
+ require 'pp'
7
+ require 'time'
8
+ require 'mechanize'
9
+
10
+ require_relative 'lib/yasuri/yasuri'
11
+
12
+ agent = Mechanize.new
13
+
14
+ uri = "http://www.asahi.com/"
15
+
16
+ # Node tree constructing by DSL
17
+ root = links_top '//*[@id="MainInner"]/div[1]/ul/li/a' do
18
+ text_title '//*[@id="MainInner"]/div[1]/div/h1'
19
+ text_article '//*[@id="MainInner"]/div/div[@class="ArticleText"]'
20
+ end
21
+
22
+ # Node tree constructing by JSON
23
+ src = <<-EOJSON
24
+ { "node" : "links",
25
+ "name" : "root",
26
+ "path" : "//*[@id='MainInner']/div[1]/ul/li/a",
27
+ "children" : [
28
+ { "node" : "text",
29
+ "name" : "title",
30
+ "path" : "//*[@id='MainInner']/div[1]/div/h1"
31
+ },
32
+ { "node" : "text",
33
+ "name" : "article",
34
+ "path" : "//*[@id='MainInner']/div/div[@class='ArticleText']"
35
+ }
36
+ ]
37
+ }
38
+ EOJSON
39
+ root = Yasuri.json2tree(src)
40
+
41
+ # Access to parsed resources
42
+ page = agent.get(uri)
43
+ contents = root.inject(agent, page)
44
+
45
+ contents.each do |h|
46
+ t = h['title']
47
+ a = h['article']
48
+
49
+ puts t
50
+ puts a
51
+ puts "=" * 100
52
+ end
@@ -0,0 +1,3 @@
1
module Yasuri
  # Gem version string; yasuri.gemspec assigns it to spec.version.
  # Frozen so the constant cannot be mutated in place at runtime.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,143 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Author:: TAC (tac@tac42.net)
4
+
5
+ require 'mechanize'
6
+ require 'json'
7
+
8
# Declarative scraping trees on top of Mechanize.
#
# A tree is made of nodes (text / struct / links / pages). Every node has an
# XPath, a name and optional children, and exposes +inject(agent, page)+,
# which evaluates the node against +page+ and returns plain Ruby data.
# Trees can be built directly, through the DSL (NodeGenerator) or from JSON
# (json2tree).
module Yasuri

  # Behaviour shared by all node classes.
  module Node
    # NOTE: @url is never assigned in this module, so +url+ returns nil
    # unless an including class sets it. The reader is kept for compatibility.
    attr_reader :url, :xpath, :name

    # xpath::    XPath expression evaluated by +inject+.
    # name::     key under which a parent node stores this node's result.
    # children:: child nodes evaluated for each match (default: none).
    def initialize(xpath, name, children = [])
      @xpath, @name, @children = xpath, name, children
    end

    # Evaluates the node against +page+. Including classes must override it;
    # the default raises a RuntimeError.
    def inject(agent, page)
      fail "#{__method__} is not implemented."
    end

    private

    # Runs every child node against +page+ and returns
    # { child.name => child result }.
    def inject_children(agent, page)
      pairs = @children.map { |child| [child.name, child.inject(agent, page)] }
      Hash[pairs]
    end
  end

  # Leaf node: returns the text of whatever the XPath matches, as a String.
  class TextNode
    include Node
    def inject(agent, page)
      page.search(@xpath).text.to_s
    end
  end

  # Returns one Hash of child results per element matched by the XPath.
  # Children are evaluated relative to each matched element.
  class StructNode
    include Node
    def inject(agent, page)
      page.search(@xpath).map { |sub_tag| inject_children(agent, sub_tag) }
    end
  end

  # Follows every link matched by the XPath and returns one Hash of child
  # results per linked page.
  class LinksNode
    include Node
    def inject(agent, page)
      links = page.search(@xpath) || [] # links expected
      links.map do |link|
        child_page = Mechanize::Page::Link.new(link, agent, page).click
        inject_children(agent, child_page)
      end
    end
  end

  # Evaluates the children on +page+, then keeps following the first element
  # matched by the XPath (the "next page" link) until no such element is
  # found. Returns one Hash of child results per visited page.
  class PaginateNode
    include Node
    def inject(agent, page)
      results = []
      while page
        results << inject_children(agent, page)

        next_link = page.search(@xpath).first
        break unless next_link

        page = Mechanize::Page::Link.new(next_link, agent, page).click
      end
      results
    end
  end

  # Maps a node type name to its class. It is used both for the "node" field
  # of JSON definitions and for the prefix of DSL method names.
  # NOTE: the original file placed a bare `private` before this constant and
  # `def self.hash2node`; it had no effect on either (both are publicly
  # reachable), so it has been dropped rather than silently changing the API.
  Text2Node = {
    "text"   => TextNode,
    "struct" => StructNode,
    "links"  => LinksNode,
    "pages"  => PaginateNode
  }

  # Builds nodes from DSL calls named "<type>_<name>", e.g. +text_title+.
  class NodeGenerator
    # "<type>_<name>": the type ends at the first underscore.
    DSL_NAME = /\A([^_]+)_(.+)\z/

    # Evaluates +block+ with this generator as +self+ and returns the nodes
    # created by the DSL calls made inside it.
    def gen_recursive(&block)
      @nodes = []
      instance_eval(&block)
      @nodes
    end

    # Treats any unknown method call as a DSL node definition and collects
    # the resulting node. Raises a RuntimeError if the name is not a valid
    # DSL name.
    def method_missing(name, *args, &block)
      node = NodeGenerator.gen(name, *args, &block)
      raise "Undefined Node Name '#{name}'" unless node
      @nodes << node
    end

    # Keeps respond_to? consistent with method_missing.
    def respond_to_missing?(name, include_private = false)
      NodeGenerator.resolve(name) ? true : super
    end

    # Splits a DSL method name into [node class, node name].
    # Returns nil when +name+ is not of the form "<known type>_<name>".
    def self.resolve(name)
      matched = DSL_NAME.match(name.to_s)
      klass = matched && Text2Node[matched[1]]
      klass ? [klass, matched[2]] : nil
    end

    # Creates the node described by the DSL call +name+(xpath, children).
    # A given block takes precedence over the +children+ argument and is
    # evaluated to produce the child nodes. Returns nil for unknown names.
    def self.gen(name, *args, &block)
      xpath, children = *args
      children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?

      klass, node_name = resolve(name)
      return nil unless klass

      klass.new(xpath, node_name, children || [])
    end # of self.gen(name, *args, &block)
  end # of class NodeGenerator

  # Parses a JSON node definition (keys: "node", "name", "path", optional
  # "children") and returns the root node, or nil when the "node" type is
  # missing or unknown.
  def self.json2tree(json_string)
    json = JSON.parse(json_string)
    Yasuri.hash2node(json)
  end

  # Converts one parsed definition Hash (string keys) into a node,
  # recursively converting its "children". Returns nil when the "node" type
  # is missing or unknown; such a child ends up as nil in its parent's
  # child list.
  def self.hash2node(node_h)
    node, name, path, children = %w|node name path children|.map do |key|
      node_h[key]
    end
    childnodes = (children || []).map { |child_h| Yasuri.hash2node(child_h) }

    klass = Text2Node[node]
    klass ? klass.new(path, name, childnodes) : nil
  end
end
138
+
139
+ # alias for DSL
140
# Top-level DSL hook: an otherwise undefined call such as
# `text_title '/html/body/p'` is turned into a Yasuri node by
# Yasuri::NodeGenerator.gen. Names the generator does not recognise fall
# through to the default method_missing.
def method_missing(name, *args, &block)
  generated = Yasuri::NodeGenerator.gen(name, *args, &block)
  # Bare `super` forwards the original name, arguments and block unchanged,
  # so the resulting NoMethodError describes the real call. The previous
  # `super(name, args)` wrapped the arguments in one array and dropped the
  # block.
  generated || super
end
data/lib/yasuri.rb ADDED
@@ -0,0 +1,6 @@
1
+ require "yasuri/version"
2
+ require "yasuri/yasuri"
3
+
4
+ module Yasuri
5
+ # Your code goes here...
6
+ end
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <head><title>Child 01 Test</title></head>
3
+ <body>
4
+ <p>Child 01 page.</p>
5
+ <ul>
6
+ <li><a href="./child01_sub.html">Child01_Sub</a></li>
7
+ <li><a href="./child02_sub.html">Child02_Sub</a></li>
8
+ </ul>
9
+ </body>
10
+ </html>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <head><title>Child 01 SubPage Test</title></head>
3
+ <body>
4
+ <p>Child 01 sub page.</p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <head><title>Child 02 Test</title></head>
3
+ <body>
4
+ <p>Child 02 page.</p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <head><title>Child 02 SubPage Test</title></head>
3
+ <body>
4
+ <p>Child 02 sub page.</p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head><title>Child 03 Test</title></head>
3
+ <body>
4
+ <p>Child 03 page.</p>
5
+ <ul>
6
+ <li><a href="./child03_sub.html">Child03_Sub</a></li>
7
+ </ul>
8
+ </body>
9
+ </html>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <head><title>Child 03 SubPage Test</title></head>
3
+ <body>
4
+ <p>Child 03 sub page.</p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <head><title>Yasuri Test</title></head>
3
+ <body>
4
+ <p>Hello,Yasuri</p>
5
+ <a href="./child01.html">child01</a>
6
+ <a href="./child02.html">child02</a>
7
+ <a href="./child03.html">child03</a>
8
+ <p>Last Modify - 2015/02/14</p>
9
+ </body>
10
+ </html>
@@ -0,0 +1,26 @@
1
+ <html>
2
+ <head><title>Page01Test</title></head>
3
+ <body>
4
+ <p>PaginationTest01</p>
5
+ <nav class='pagination'>
6
+ <span class='prev'>
7
+ &laquo; PreviousPage
8
+ </span>
9
+ <span class='page'>
10
+ 1
11
+ </span>
12
+ <span class='page'>
13
+ <a href="./page02.html">2</a>
14
+ </span>
15
+ <span class='page'>
16
+ <a href="./page03.html">3</a>
17
+ </span>
18
+ <span class='page'>
19
+ <a href="./page04.html">4</a>
20
+ </span>
21
+ <span class='next'>
22
+ <a href="./page02.html" class="next" rel="next">NextPage &raquo;</a>
23
+ </span>
24
+ </nav>
25
+ </body>
26
+ </html>
@@ -0,0 +1,26 @@
1
+ <html>
2
+ <head><title>Page02Test</title></head>
3
+ <body>
4
+ <p>PaginationTest02</p>
5
+ <nav class='pagination'>
6
+ <span class='prev'>
7
+ <a href="./page01.html" class="prev" rel="prev">&laquo; PreviousPage</a>
8
+ </span>
9
+ <span class='page'>
10
+ <a href="./page01.html">1</a>
11
+ </span>
12
+ <span class='page'>
13
+ 2
14
+ </span>
15
+ <span class='page'>
16
+ <a href="./page03.html">3</a>
17
+ </span>
18
+ <span class='page'>
19
+ <a href="./page04.html">4</a>
20
+ </span>
21
+ <span class='next'>
22
+ <a href="./page03.html" class="next" rel="next">NextPage &raquo;</a>
23
+ </span>
24
+ </nav>
25
+ </body>
26
+ </html>
@@ -0,0 +1,26 @@
1
+ <html>
2
+ <head><title>Page03Test</title></head>
3
+ <body>
4
+ <p>PaginationTest03</p>
5
+ <nav class='pagination'>
6
+ <span class='prev'>
7
+ <a href="./page02.html" class="prev" rel="prev">&laquo; PreviousPage</a>
8
+ </span>
9
+ <span class='page'>
10
+ <a href="./page01.html">1</a>
11
+ </span>
12
+ <span class='page'>
13
+ <a href="./page02.html">2</a>
14
+ </span>
15
+ <span class='page'>
16
+ 3
17
+ </span>
18
+ <span class='page'>
19
+ <a href="./page04.html">4</a>
20
+ </span>
21
+ <span class='next'>
22
+ <a href="./page04.html" class="next" rel="next">NextPage &raquo;</a>
23
+ </span>
24
+ </nav>
25
+ </body>
26
+ </html>
@@ -0,0 +1,26 @@
1
+ <html>
2
+ <head><title>Page04Test</title></head>
3
+ <body>
4
+ <p>PaginationTest04</p>
5
+ <nav class='pagination'>
6
+ <span class='prev'>
7
+ <a href="./page03.html" class="prev" rel="prev">&laquo; PreviousPage</a>
8
+ </span>
9
+ <span class='page'>
10
+ <a href="./page01.html">1</a>
11
+ </span>
12
+ <span class='page'>
13
+ <a href="./page02.html">2</a>
14
+ </span>
15
+ <span class='page'>
16
+ <a href="./page03.html">3</a>
17
+ </span>
18
+ <span class='page'>
19
+ 4
20
+ </span>
21
+ <span class='next'>
22
+ NextPage &raquo;
23
+ </span>
24
+ </nav>
25
+ </body>
26
+ </html>
@@ -0,0 +1,77 @@
1
+ <html>
2
+ <head>
3
+ <title>StructualTextTest</title>
4
+ </head>
5
+ <body>
6
+
7
+ <h1>1996</h1>
8
+ <table>
9
+ <thead>
10
+ <tr>
11
+ <th>Title</th>
12
+ <th>Publication Date</th>
13
+ </tr>
14
+ </thead>
15
+ <tr>
16
+ <td>The Perfect Insider</td>
17
+ <td>1996/4/5</td>
18
+ </tr>
19
+ <tr>
20
+ <td>Doctors in Isolated Room</td>
21
+ <td>1996/7/5</td>
22
+ </tr>
23
+ <tr>
24
+ <td>Mathematical Goodbye</td>
25
+ <td>1996/9/5</td>
26
+ </tr>
27
+ </table>
28
+
29
+ <h1>1997</h1>
30
+ <table>
31
+ <thead>
32
+ <tr>
33
+ <th>Title</th>
34
+ <th>Publication Date</th>
35
+ </tr>
36
+ </thead>
37
+ <tr>
38
+ <td>Jack the Poetical Private</td>
39
+ <td>1997/1/5</td>
40
+ </tr>
41
+ <tr>
42
+ <td>Who Inside</td>
43
+ <td>1997/4/5</td>
44
+ </tr>
45
+ <tr>
46
+ <td>Illusion Acts Like Magic</td>
47
+ <td>1997/10/5</td>
48
+ </tr>
49
+ </table>
50
+
51
+ <h1>1998</h1>
52
+ <table>
53
+ <thead>
54
+ <tr>
55
+ <th>Title</th>
56
+ <th>Publication Date</th>
57
+ </tr>
58
+ </thead>
59
+ <tr>
60
+ <td>Replaceable Summer</td>
61
+ <td>1998/1/7</td>
62
+ </tr>
63
+ <tr>
64
+ <td>Switch Back</td>
65
+ <td>1998/4/5</td>
66
+ </tr>
67
+ <tr>
68
+ <td>Numerical Models</td>
69
+ <td>1998/7/5</td>
70
+ </tr>
71
+ <tr>
72
+ <td>The Perfect Outsider</td>
73
+ <td>1998/10/5</td>
74
+ </tr>
75
+ </table>
76
+ </body>
77
+ </html>
@@ -0,0 +1,26 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Author:: TAC (tac@tac42.net)
3
+
4
+ require 'glint'
5
+
6
+ server = Glint::Server.new do |port|
7
+
8
+ require 'webrick'
9
+ http = WEBrick::HTTPServer.new({
10
+ DocumentRoot: 'spec/htdocs',
11
+ BindAddress: '127.0.0.1',
12
+ Port: port,
13
+ AccessLog: []
14
+ })
15
+
16
+ trap(:INT) { http.shutdown }
17
+ trap(:TERM) { http.shutdown }
18
+ http.start
19
+ end
20
+
21
+ server.start
22
+
23
+ Glint::Server.info[:httpserver] = {
24
+ host: "127.0.0.1",
25
+ port: server.port
26
+ }
@@ -0,0 +1,31 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Author:: TAC (tac@tac42.net)
3
+
4
+ require 'glint'
5
+ Dir[File.expand_path("../servers/*.rb", __FILE__)].each {|f| require f}
6
+
7
+ require 'rspec'
8
+ shared_context 'httpserver' do
9
+ require 'net/http'
10
+ let(:uri) {
11
+ "http://#{Glint::Server.info[:httpserver][:host]}:#{Glint::Server.info[:httpserver][:port]}"
12
+ }
13
+ end
14
+
15
+
16
+ # ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
17
+ require "codeclimate-test-reporter"
18
+ CodeClimate::TestReporter.start
19
+
20
+ require 'simplecov'
21
+ require 'coveralls'
22
+ Coveralls.wear!
23
+
24
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
25
+ SimpleCov::Formatter::HTMLFormatter,
26
+ Coveralls::SimpleCov::Formatter
27
+ ]
28
+ SimpleCov.start
29
+
30
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
31
+ require 'yasuri'
@@ -0,0 +1,299 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Author:: TAC (tac@tac42.net)
4
+
5
+ require_relative 'spec_helper'
6
+
7
+ #require_relative '../lib/yasuri/yasuri'
8
+
9
+ describe 'Yasuri' do
10
+ include_context 'httpserver'
11
+
12
+ before do
13
+ @agent = Mechanize.new
14
+ @uri = uri
15
+ @index_page = @agent.get(@uri)
16
+ end
17
+
18
+ ########
19
+ # Node #
20
+ ########
21
+ def compare_generated_vs_original(generated, original, page = @index_page)
22
+ expected = original.inject(@agent, page)
23
+ actual = generated.inject(@agent, page)
24
+ expect(actual).to match expected
25
+ end
26
+
27
+ describe '::TextNode' do
28
+ before { @node = Yasuri::TextNode.new('/html/body/p[1]', "title") }
29
+
30
+ it 'scrape text text <p>Hello,Yasuri</p>' do
31
+ actual = @node.inject(@agent, @index_page)
32
+ expect(actual).to eq "Hello,Yasuri"
33
+ end
34
+
35
+ it "can be defined by DSL, return single TextNode title" do
36
+ generated = text_title '/html/body/p[1]'
37
+ original = Yasuri::TextNode.new('/html/body/p[1]', "title")
38
+ compare_generated_vs_original(generated, original)
39
+ end
40
+ end
41
+
42
+ describe '::StructNode' do
43
+ before do
44
+ @page = @agent.get(@uri + "/structual_text.html")
45
+ @table_1996 = [
46
+ { "title" => "The Perfect Insider",
47
+ "pub_date" => "1996/4/5" },
48
+ { "title" => "Doctors in Isolated Room",
49
+ "pub_date" => "1996/7/5" },
50
+ { "title" => "Mathematical Goodbye",
51
+ "pub_date" => "1996/9/5" },
52
+ ]
53
+ @table_1997 = [
54
+ { "title" => "Jack the Poetical Private",
55
+ "pub_date" => "1997/1/5" },
56
+ { "title" => "Who Inside",
57
+ "pub_date" => "1997/4/5" },
58
+ { "title" => "Illusion Acts Like Magic",
59
+ "pub_date" => "1997/10/5" },
60
+ ]
61
+ @table_1998 = [
62
+ { "title" => "Replaceable Summer",
63
+ "pub_date" => "1998/1/7" },
64
+ { "title" => "Switch Back",
65
+ "pub_date" => "1998/4/5" },
66
+ { "title" => "Numerical Models",
67
+ "pub_date" => "1998/7/5" },
68
+ { "title" => "The Perfect Outsider",
69
+ "pub_date" => "1998/10/5" },
70
+ ]
71
+ @all_tables = [
72
+ {"table" => @table_1996},
73
+ {"table" => @table_1997},
74
+ {"table" => @table_1998},
75
+ ]
76
+ end
77
+ it 'scrape single table contents' do
78
+ node = Yasuri::StructNode.new('/html/body/table[1]/tr', "table", [
79
+ Yasuri::TextNode.new('./td[1]', "title"),
80
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
81
+ ])
82
+ expected = @table_1996
83
+ actual = node.inject(@agent, @page)
84
+ expect(actual).to match expected
85
+ end
86
+
87
+ it 'scrape all tables' do
88
+ node = Yasuri::StructNode.new('/html/body/table', "tables", [
89
+ Yasuri::StructNode.new('./tr', "table", [
90
+ Yasuri::TextNode.new('./td[1]', "title"),
91
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
92
+ ])
93
+ ])
94
+ expected = @all_tables
95
+ actual = node.inject(@agent, @page)
96
+ expect(actual).to match expected
97
+ end
98
+
99
+ it 'can be defined by DSL, scrape all tables' do
100
+ generated = struct_tables '/html/body/table' do
101
+ struct_table './tr' do
102
+ text_title './td[1]'
103
+ text_pub_date './td[2]'
104
+ end
105
+ end
106
+ original = Yasuri::StructNode.new('/html/body/table', "tables", [
107
+ Yasuri::StructNode.new('./tr', "table", [
108
+ Yasuri::TextNode.new('./td[1]', "title"),
109
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
110
+ ])
111
+ ])
112
+ compare_generated_vs_original(generated, original)
113
+ end
114
+ end
115
+
116
+ describe '::LinksNode' do
117
+ it 'scrape links' do
118
+ root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
119
+ Yasuri::TextNode.new('/html/body/p', "content"),
120
+ ])
121
+
122
+ actual = root_node.inject(@agent, @index_page)
123
+ expected = [
124
+ {"content" => "Child 01 page."},
125
+ {"content" => "Child 02 page."},
126
+ {"content" => "Child 03 page."},
127
+ ]
128
+ expect(actual).to match expected
129
+ end
130
+
131
+ it 'scrape links, recursive' do
132
+ root_node = Yasuri::LinksNode.new('/html/body/a', "root", [
133
+ Yasuri::TextNode.new('/html/body/p', "content"),
134
+ Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
135
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
136
+ ]),
137
+ ])
138
+ actual = root_node.inject(@agent, @index_page)
139
+ expected = [
140
+ {"content" => "Child 01 page.",
141
+ "sub_link" => [{"sub_page_title" => "Child 01 SubPage Test"},
142
+ {"sub_page_title" => "Child 02 SubPage Test"}],},
143
+ {"content" => "Child 02 page.",
144
+ "sub_link" => [],},
145
+ {"content" => "Child 03 page.",
146
+ "sub_link" => [{"sub_page_title" => "Child 03 SubPage Test"}],},
147
+ ]
148
+ expect(actual).to match expected
149
+ end
150
+ it 'can be defined by DSL, return single LinkNode title' do
151
+ generated = links_title '/html/body/a'
152
+ original = Yasuri::LinksNode.new('/html/body/a', "title")
153
+ compare_generated_vs_original(generated, original)
154
+ end
155
+ it 'can be defined by DSL, return nested contents under link' do
156
+ generated = links_title '/html/body/a' do
157
+ text_name '/html/body/p'
158
+ end
159
+ original = Yasuri::LinksNode.new('/html/body/a', "root", [
160
+ Yasuri::TextNode.new('/html/body/p', "name"),
161
+ ])
162
+ compare_generated_vs_original(generated, original)
163
+ end
164
+
165
+ it 'can be defined by DSL, return recursive links node' do
166
+ generated = links_root '/html/body/a' do
167
+ text_content '/html/body/p'
168
+ links_sub_link '/html/body/ul/li/a' do
169
+ text_sub_page_title '/html/head/title'
170
+ end
171
+ end
172
+
173
+ original = Yasuri::LinksNode.new('/html/body/a', "root", [
174
+ Yasuri::TextNode.new('/html/body/p', "content"),
175
+ Yasuri::LinksNode.new('/html/body/ul/li/a', "sub_link", [
176
+ Yasuri::TextNode.new('/html/head/title', "sub_page_title"),
177
+ ]),
178
+ ])
179
+ compare_generated_vs_original(generated, original)
180
+ end
181
+ end
182
+
183
+ describe '::PaginateNode' do
184
+ before do
185
+ @uri += "/pagination/page01.html"
186
+ @page = @agent.get(@uri)
187
+ end
188
+
189
+ it "scrape each paginated pages" do
190
+ root_node = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
191
+ Yasuri::TextNode.new('/html/body/p', "content"),
192
+ ])
193
+ actual = root_node.inject(@agent, @page)
194
+ expected = [
195
+ {"content" => "PaginationTest01"},
196
+ {"content" => "PaginationTest02"},
197
+ {"content" => "PaginationTest03"},
198
+ {"content" => "PaginationTest04"},
199
+ ]
200
+ expect(actual).to match expected
201
+ end
202
+
203
+ it 'can be defined by DSL, return single PaginateNode content' do
204
+ generated = pages_next "/html/body/nav/span/a[@class='next']" do
205
+ text_content '/html/body/p'
206
+ end
207
+ original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
208
+ Yasuri::TextNode.new('/html/body/p', "content"),
209
+ ])
210
+ compare_generated_vs_original(generated, original)
211
+ end
212
+ end
213
+
214
+ describe '.json2tree' do
215
+ it "return empty tree" do
216
+ tree = Yasuri.json2tree("{}")
217
+ expect(tree).to be_nil
218
+ end
219
+
220
+ it "return TextNode" do
221
+ src = %q| { "node" : "text",
222
+ "name" : "content",
223
+ "path" : "/html/body/p[1]"
224
+ }|
225
+ generated = Yasuri.json2tree(src)
226
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
227
+ compare_generated_vs_original(generated, original)
228
+ end
229
+
230
+ it "return LinksNode/TextNode" do
231
+ src = %q| { "node" : "links",
232
+ "name" : "root",
233
+ "path" : "/html/body/a",
234
+ "children" : [ { "node" : "text",
235
+ "name" : "content",
236
+ "path" : "/html/body/p"
237
+ } ]
238
+ }|
239
+ generated = Yasuri.json2tree(src)
240
+ original = Yasuri::LinksNode.new('/html/body/a', "root", [
241
+ Yasuri::TextNode.new('/html/body/p', "content"),
242
+ ])
243
+ compare_generated_vs_original(generated, original)
244
+ end
245
+
246
+ it "return PaginateNode/TextNode" do
247
+ src = %q|{ "node" : "pages",
248
+ "name" : "root",
249
+ "path" : "/html/body/nav/span/a[@class=\'next\']",
250
+ "children" : [ { "node" : "text",
251
+ "name" : "content",
252
+ "path" : "/html/body/p"
253
+ } ]
254
+ }|
255
+ generated = Yasuri.json2tree(src)
256
+ original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
257
+ Yasuri::TextNode.new('/html/body/p', "content"),
258
+ ])
259
+
260
+ paginate_test_uri = @uri + "/pagination/page01.html"
261
+ paginate_test_page = @agent.get(paginate_test_uri)
262
+ compare_generated_vs_original(generated, original, paginate_test_page)
263
+ end
264
+
265
+ it "return StructNode/StructNode/[TextNode,TextNode]" do
266
+ src = %q| { "node" : "struct",
267
+ "name" : "tables",
268
+ "path" : "/html/body/table",
269
+ "children" : [
270
+ { "node" : "struct",
271
+ "name" : "table",
272
+ "path" : "./tr",
273
+ "children" : [
274
+ { "node" : "text",
275
+ "name" : "title",
276
+ "path" : "./td[1]"
277
+ },
278
+ { "node" : "text",
279
+ "name" : "pub_date",
280
+ "path" : "./td[2]"
281
+ }]
282
+ }]
283
+ }|
284
+ generated = Yasuri.json2tree(src)
285
+ original = Yasuri::StructNode.new('/html/body/table', "tables", [
286
+ Yasuri::StructNode.new('./tr', "table", [
287
+ Yasuri::TextNode.new('./td[1]', "title"),
288
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
289
+ ])
290
+ ])
291
+ page = @agent.get(@uri + "/structual_text.html")
292
+ compare_generated_vs_original(generated, original, page)
293
+ end
294
+ end
295
+
296
+ it 'has a version number' do
297
+ expect(Yasuri::VERSION).not_to be nil
298
+ end
299
+ end
data/yasuri.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'yasuri/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "yasuri"
8
+ spec.version = Yasuri::VERSION
9
+ spec.authors = ["TAC"]
10
+ spec.email = ["tac@tac42.net"]
11
+ spec.summary = %q{Yasuri is easy scraping library.}
12
+ spec.description = %q{Yasuri is an easy web-scraping library for supporting "Mechanize".}
13
+ spec.homepage = "https://github.com/tac0x2a/yasuri"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.7"
22
+ spec.add_development_dependency "rake", "~> 10.0"
23
+ spec.add_development_dependency "rspec"
24
+ spec.add_development_dependency "fuubar"
25
+ spec.add_development_dependency "glint"
26
+ spec.add_development_dependency "coveralls"
27
+ spec.add_development_dependency "simplecov"
28
+ spec.add_development_dependency "codeclimate-test-reporter"
29
+
30
+ spec.add_dependency "mechanize"
31
+ end
metadata ADDED
@@ -0,0 +1,213 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: yasuri
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - TAC
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fuubar
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: glint
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: coveralls
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: codeclimate-test-reporter
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: mechanize
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: Yasuri is an easy web-scraping library for supporting "Mechanize".
140
+ email:
141
+ - tac@tac42.net
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files: []
145
+ files:
146
+ - ".coveralls.yml"
147
+ - ".gitignore"
148
+ - ".rspec"
149
+ - ".travis.yml"
150
+ - Gemfile
151
+ - LICENSE
152
+ - README.md
153
+ - Rakefile
154
+ - app.rb
155
+ - lib/yasuri.rb
156
+ - lib/yasuri/version.rb
157
+ - lib/yasuri/yasuri.rb
158
+ - spec/htdocs/child01.html
159
+ - spec/htdocs/child01_sub.html
160
+ - spec/htdocs/child02.html
161
+ - spec/htdocs/child02_sub.html
162
+ - spec/htdocs/child03.html
163
+ - spec/htdocs/child03_sub.html
164
+ - spec/htdocs/index.html
165
+ - spec/htdocs/pagination/page01.html
166
+ - spec/htdocs/pagination/page02.html
167
+ - spec/htdocs/pagination/page03.html
168
+ - spec/htdocs/pagination/page04.html
169
+ - spec/htdocs/structual_text.html
170
+ - spec/servers/httpserver.rb
171
+ - spec/spec_helper.rb
172
+ - spec/yasuri_spec.rb
173
+ - yasuri.gemspec
174
+ homepage: https://github.com/tac0x2a/yasuri
175
+ licenses:
176
+ - MIT
177
+ metadata: {}
178
+ post_install_message:
179
+ rdoc_options: []
180
+ require_paths:
181
+ - lib
182
+ required_ruby_version: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
187
+ required_rubygems_version: !ruby/object:Gem::Requirement
188
+ requirements:
189
+ - - ">="
190
+ - !ruby/object:Gem::Version
191
+ version: '0'
192
+ requirements: []
193
+ rubyforge_project:
194
+ rubygems_version: 2.4.5
195
+ signing_key:
196
+ specification_version: 4
197
+ summary: Yasuri is easy scraping library.
198
+ test_files:
199
+ - spec/htdocs/child01.html
200
+ - spec/htdocs/child01_sub.html
201
+ - spec/htdocs/child02.html
202
+ - spec/htdocs/child02_sub.html
203
+ - spec/htdocs/child03.html
204
+ - spec/htdocs/child03_sub.html
205
+ - spec/htdocs/index.html
206
+ - spec/htdocs/pagination/page01.html
207
+ - spec/htdocs/pagination/page02.html
208
+ - spec/htdocs/pagination/page03.html
209
+ - spec/htdocs/pagination/page04.html
210
+ - spec/htdocs/structual_text.html
211
+ - spec/servers/httpserver.rb
212
+ - spec/spec_helper.rb
213
+ - spec/yasuri_spec.rb