yasuri 1.9.12 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 87406480e622911dca3649ba2a8e5b134ccffb36
4
- data.tar.gz: 2fde6a481bed02e569a5c08f6df96c09925abee0
2
+ SHA256:
3
+ metadata.gz: f3542a2cc0959a4534520f6104fc2922bdf0dbd368fcd4c149c3d251c2fc2198
4
+ data.tar.gz: 6fdb960db697e9a4ec1d87f2b83bf0e9914e3c9efe90764536bbee6d68774353
5
5
  SHA512:
6
- metadata.gz: 7c274f2316495aea66d737f053119dd71c154f7411b9cd54b102c71ee2e7ac36602dd0e44d78d0d06895ee0e27f934a9ac0e2f45d7e52ce4629f60f9fd905cf3
7
- data.tar.gz: a5696ca1fac061c542c7f0586bfecdf623962443e98207f897ff76ff0204cc78e53505b070f18467d2e72c0ddb527383224fb75ff2d05b5ff6e5a5149caaa20a
6
+ metadata.gz: 9df576243bea289f4c285c46f1bd2137b7b69b79b24e0c657e4ac952114dd7bcf82a5f95cd2dae88c6eac4e3e468273b7dbd6ead9d05ffdc8d25861921702333
7
+ data.tar.gz: 13f2ae72b3e8fa6d3ef58932daa2acad49f5d4f57c80f34e5215394940fc2305bc016d949760efe9f43ae2b8c3796064a1b0bd9bccf236cfe3789c2c291dfd8b
@@ -0,0 +1,35 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+ # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
6
+ # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
7
+
8
+ name: Ruby
9
+
10
+ on:
11
+ push:
12
+ branches: [ master ]
13
+ pull_request:
14
+ branches: [ master ]
15
+
16
+ jobs:
17
+ test:
18
+
19
+ runs-on: ubuntu-latest
20
+ strategy:
21
+ matrix:
22
+ ruby-version: ['2.6', '2.7', '3.0']
23
+
24
+ steps:
25
+ - uses: actions/checkout@v2
26
+ - name: Set up Ruby
27
+ # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
28
+ # change this to (see https://github.com/ruby/setup-ruby#versioning):
29
+ # uses: ruby/setup-ruby@v1
30
+ uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
31
+ with:
32
+ ruby-version: ${{ matrix.ruby-version }}
33
+ bundler-cache: true # runs 'bundle install' and caches installed gems automatically
34
+ - name: Run tests
35
+ run: bundle exec rake
data/.gitignore CHANGED
@@ -66,5 +66,4 @@ tramp
66
66
  # cask packages
67
67
  .cask/
68
68
 
69
- .ruby-version
70
- Gemfile.lock
69
+ Gemfile.lock
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 3.0.0
data/.travis.yml CHANGED
@@ -1,9 +1,7 @@
1
1
  language: ruby
2
- rvm:
3
- - 2.2.0
4
2
  script:
5
3
  - ruby --version
6
4
  - rspec spec
7
5
  addons:
8
6
  code_climate:
9
- repo_token: 0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151
7
+ repo_token: 0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151
data/README.md CHANGED
@@ -1,4 +1,6 @@
1
- # Yasuri [![Build Status](https://travis-ci.org/tac0x2a/yasuri.svg?branch=master)](https://travis-ci.org/tac0x2a/yasuri) [![Coverage Status](https://coveralls.io/repos/tac0x2a/yasuri/badge.svg?branch=master)](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [![Code Climate](https://codeclimate.com/github/tac0x2a/yasuri/badges/gpa.svg)](https://codeclimate.com/github/tac0x2a/yasuri)
1
+ # Yasuri
2
+ [![Build Status](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml/badge.svg)](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml)
3
+ [![Coverage Status](https://coveralls.io/repos/tac0x2a/yasuri/badge.svg?branch=master)](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [![Maintainability](https://api.codeclimate.com/v1/badges/c29480fea1305afe999f/maintainability)](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
2
4
 
3
5
  Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)".
4
6
 
@@ -32,6 +34,9 @@ or
32
34
  ```ruby
33
35
  # for Ruby 1.9.3 or lower
34
36
  gem 'yasuri', '~> 1.9'
37
+
38
+ # for Ruby 3.0.0 or lower
39
+ gem 'yasuri', '~> 3.0.1'
35
40
  ```
36
41
 
37
42
 
@@ -52,6 +57,19 @@ root = Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
52
57
  text_content '//*[@id="contents"]/p[1]'
53
58
  end
54
59
 
60
+
61
+ # Node tree constructing by YAML
62
+ src = <<-EOYAML
63
+ root:
64
+ node: links
65
+ path: "//*[@id='menu']/ul/li/a"
66
+ children:
67
+ - title: { node: text, path: "//*[@id='contents']/h2" }
68
+ - content: { node: text, path: "//*[@id='contents']/p[1]" }
69
+ EOYAML
70
+ root = Yasuri.yaml2tree(src)
71
+
72
+
55
73
  # Node tree constructing by JSON
56
74
  src = <<-EOJSON
57
75
  { "node" : "links",
@@ -78,6 +96,27 @@ result = root.inject(agent, root_page)
78
96
  # => [ {"title" => "PageTitle", "content" => "Page Contents" }, ... ]
79
97
  ```
80
98
 
99
+ ## Dev
100
+ ```sh
101
+ $ gem install bundler
102
+ $ bundle install
103
+ ```
104
+ ### Test
105
+ ```sh
106
+ $ rake
107
+ # or
108
+ $ rspec spec/*spec.rb
109
+ ```
110
+
111
+ ### Release RubyGems
112
+ ```sh
113
+ # Only first time
114
+ $ curl -u <user_name> https://rubygems.org/api/v1/api_key.yaml > ~/.gem/credentials
115
+ $ chmod 0600 ~/.gem/credentials
116
+
117
+ $ nano lib/yasuri/version.rb # edit gem version
118
+ $ rake release
119
+ ```
81
120
 
82
121
  ## Contributing
83
122
 
data/USAGE.ja.md CHANGED
@@ -67,7 +67,7 @@ page = agent.get(uri)
67
67
  tree.inject(agent, page)
68
68
  ```
69
69
 
70
- ツリーは、DSLまたはjsonで定義することができます.上の例ではDSLで定義しています.
70
+ ツリーは、json,yaml,またはDSLで定義することができます.上の例ではDSLで定義しています.
71
71
  以下は、jsonで上記と等価な解析ツリーを定義した例です.
72
72
 
73
73
  ```ruby
@@ -87,25 +87,54 @@ EOJSON
87
87
  tree = Yasuri.json2tree(src)
88
88
  ```
89
89
 
90
+ ```ruby
91
+ # yaml で構成する場合
92
+ src = <<-EOYAML
93
+ title:
94
+ node: links
95
+ path: "/html/body/a"
96
+ children:
97
+ - name:
98
+ node: text
99
+ path: "/html/body/p"
100
+ EOYAML
101
+ tree = Yasuri.yaml2tree(src)
102
+ ```
90
103
 
91
104
  ### Node
92
105
  ツリーは入れ子になった *Node* で構成されます.
93
106
  Node は `Type`, `Name`, `Path`, `Children`, `Options` を持っています.
107
+ (ただし、`MapNode` のみ `Path` を持ちません)
94
108
 
95
109
  Nodeは以下のフォーマットで定義されます.
96
110
 
97
111
  ```ruby
98
- # トップレベル
99
112
  Yasuri.<Type>_<Name> <Path> [,<Options>]
100
113
 
101
114
  # 入れ子になっている場合
102
115
  Yasuri.<Type>_<Name> <Path> [,<Options>] do
103
116
  <Type>_<Name> <Path> [,<Options>] do
104
- <Children>
117
+ <Type>_<Name> <Path> [,<Options>]
118
+ ...
105
119
  end
106
120
  end
107
121
  ```
108
122
 
123
+
124
+
125
+ ```ruby
126
+ Yasuri.text_title '/html/head/title', truncate:/^[^,]+/
127
+
128
+ # 入れ子になっている場合
129
+ Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
130
+ struct_table './tr' do
131
+ text_title './td[1]'
132
+ text_pub_date './td[2]'
133
+ end
134
+ end
135
+ ```
136
+
137
+
109
138
  #### Type
110
139
  *Type* は Nodeの振る舞いを示します.Typeには以下のものがあります.
111
140
 
@@ -113,18 +142,19 @@ end
113
142
  - *Struct*
114
143
  - *Links*
115
144
  - *Paginate*
145
+ - *Map*
116
146
 
117
- ### Name
147
+ #### Name
118
148
  *Name* は 解析結果のHashにおけるキーになります.
119
149
 
120
- ### Path
150
+ #### Path
121
151
  *Path* は xpath あるいは css セレクタによって、HTML上の特定のノードを指定します.
122
152
  これは Mechanize の `search` で使用されます.
123
153
 
124
- ### Childlen
154
+ #### Children
125
155
  入れ子になっているノードの子ノードです.TextNodeはツリーの葉に当たるため、子ノードを持ちません.
126
156
 
127
- ### Options
157
+ #### Options
128
158
  パースのオプションです.オプションはTypeごとに異なります.
129
159
  各ノードに対して、`opt`メソッドをコールすることで、利用可能なオプションを取得できます.
130
160
 
@@ -156,13 +186,15 @@ page = agent.get("http://yasuri.example.net")
156
186
 
157
187
  p1 = Yasuri.text_title '/html/body/p[1]'
158
188
  p1t = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
159
- p2u = Yasuri.text_title '/html/body/p[2]', proc: :upcase
189
+ p2u = Yasuri.text_title '/html/body/p[1]', proc: :upcase
160
190
 
161
- p1.inject(agent, page) #=> { "title" => "Hello,World" }
162
- p1t.inject(agent, page) #=> { "title" => "Hello" }
163
- node.inject(agent, page) #=> { "title" => "HELLO,YASURI" }
191
+ p1.inject(agent, page) #=> "Hello,World"
192
+ p1t.inject(agent, page) #=> "Hello"
193
+ p2u.inject(agent, page) #=> "HELLO,WORLD"
164
194
  ```
165
195
 
196
+ なお、同じページ内の複数の要素を一度にスクレイピングする場合は、`MapNode`を使用します。
197
+
166
198
  ### オプション
167
199
  ##### `truncate`
168
200
  正規表現にマッチした文字列を取り出します.グループを指定した場合、最初にマッチしたグループだけを返します.
@@ -466,3 +498,54 @@ node.inject(agent, page)
466
498
  "Page03",
467
499
  "Patination03"]
468
500
  ```
501
+
502
+ ## Map Node
503
+ *MapNode* はスクレイピングした結果をまとめるノードです.このノードはパースツリーにおいて常に節です.
504
+
505
+ ### 例
506
+
507
+ ```html
508
+ <!-- http://yasuri.example.net -->
509
+ <html>
510
+ <head><title>Yasuri Example</title></head>
511
+ <body>
512
+ <p>Hello,World</p>
513
+ <p>Hello,Yasuri</p>
514
+ </body>
515
+ </html>
516
+ ```
517
+
518
+ ```ruby
519
+ agent = Mechanize.new
520
+ page = agent.get("http://yasuri.example.net")
521
+
522
+
523
+ tree = Yasuri.map_root do
524
+ text_title '/html/head/title'
525
+ text_body_p '/html/body/p[1]'
526
+ end
527
+
528
+ tree.inject(agent, page) #=> { "title" => "Yasuri Example", "body_p" => "Hello,World" }
529
+
530
+
531
+ tree = Yasuri.map_root do
532
+ map_group1 { text_child01 '/html/body/a[1]' }
533
+ map_group2 do
534
+ text_child01 '/html/body/a[1]'
535
+ text_child03 '/html/body/a[3]'
536
+ end
537
+ end
538
+
539
+ tree.inject(agent, page) #=> {
540
+ # "group1" => {
541
+ # "child01" => "child01"
542
+ # },
543
+ # "group2" => {
544
+ # "child01" => "child01",
545
+ # "child03" => "child03"
546
+ # }
547
+ # }
548
+ ```
549
+
550
+ ### オプション
551
+ なし
data/USAGE.md CHANGED
@@ -69,7 +69,7 @@ page = agent.get(uri)
69
69
  tree.inject(agent, page)
70
70
  ```
71
71
 
72
- Tree is definable by 2(+1) ways, DSL and json (and basic ruby code). In above example, DSL.
72
+ A tree can be defined in 3 (+1) ways: JSON, YAML, or the DSL (or plain Ruby code). The example above uses the DSL.
73
73
 
74
74
  ```ruby
75
75
  # Construct by json.
@@ -88,21 +88,51 @@ EOJSON
88
88
  tree = Yasuri.json2tree(src)
89
89
  ```
90
90
 
91
+ ```ruby
92
+ # Construct by yaml.
93
+ src = <<-EOYAML
94
+ title:
95
+ node: links
96
+ path: "/html/body/a"
97
+ children:
98
+ - name:
99
+ node: text
100
+ path: "/html/body/p"
101
+ EOYAML
102
+ tree = Yasuri.yaml2tree(src)
103
+ ```
104
+
105
+
91
106
  ### Node
92
107
  Tree is constructed by nested Nodes.
93
108
  Node has `Type`, `Name`, `Path`, `Children`, and `Options`.
109
+ (`MapNode` is the only exception: it does not have a `Path`.)
94
110
 
95
111
  Node is defined by this format.
96
112
 
97
113
 
98
114
  ```ruby
99
- # Top Level
100
115
  Yasuri.<Type>_<Name> <Path> [,<Options>]
101
116
 
102
- # Nested
117
+ # Nested case
103
118
  Yasuri.<Type>_<Name> <Path> [,<Options>] do
104
119
  <Type>_<Name> <Path> [,<Options>] do
105
- <Children>
120
+ <Type>_<Name> <Path> [,<Options>]
121
+ ...
122
+ end
123
+ end
124
+ ```
125
+
126
+ Example
127
+
128
+ ```ruby
129
+ Yasuri.text_title '/html/head/title', truncate:/^[^,]+/
130
+
131
+ # Nested case
132
+ Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
133
+ struct_table './tr' do
134
+ text_title './td[1]'
135
+ text_pub_date './td[2]'
106
136
  end
107
137
  end
108
138
  ```
@@ -114,17 +144,18 @@ Type meen behavior of Node.
114
144
  - *Struct*
115
145
  - *Links*
116
146
  - *Paginate*
147
+ - *Map*
117
148
 
118
- ### Name
149
+ #### Name
119
150
  Name is used as the key in the returned hash.
120
151
 
121
- ### Path
152
+ #### Path
122
153
  Path selects the target node by XPath or CSS selector. It is used by Mechanize's `search`.
123
154
 
124
- ### Childlen
155
+ #### Children
125
156
  Child nodes. A TextNode always has an empty set of children, because a TextNode is a leaf.
126
157
 
127
- ### Options
158
+ #### Options
128
159
  Parse options. They differ for each type. You can get the options and their values with the `opt` method.
129
160
 
130
161
  ```ruby
@@ -155,13 +186,15 @@ page = agent.get("http://yasuri.example.net")
155
186
 
156
187
  p1 = Yasuri.text_title '/html/body/p[1]'
157
188
  p1t = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
158
- p2u = Yasuri.text_title '/html/body/p[2]', proc: :upcase
189
+ p2u = Yasuri.text_title '/html/body/p[1]', proc: :upcase
159
190
 
160
- p1.inject(agent, page) #=> { "title" => "Hello,World" }
161
- p1t.inject(agent, page) #=> { "title" => "Hello" }
162
- node.inject(agent, page) #=> { "title" => "HELLO,YASURI" }
191
+ p1.inject(agent, page) #=> "Hello,World"
192
+ p1t.inject(agent, page) #=> "Hello"
193
+ p2u.inject(agent, page) #=> "HELLO,WORLD"
163
194
  ```
164
195
 
196
+ Note that if you want to scrape multiple elements in the same page at once, use `MapNode`. See the `MapNode` example for details.
197
+
165
198
  ### Options
166
199
  ##### `truncate`
167
200
  Match to regexp, and truncate text. When you use group, it will return first matched group only.
@@ -464,3 +497,54 @@ node.inject(agent, page)
464
497
  "Page03",
465
498
  "Patination03"]
466
499
  ```
500
+
501
+ ## Map Node
502
+ *MapNode* is a node that summarizes the results of scraping. This node is always a branch node in the parse tree.
503
+
504
+ ### Example
505
+
506
+ ```html
507
+ <!-- http://yasuri.example.net -->
508
+ <html>
509
+ <head><title>Yasuri Example</title></head>
510
+ <body>
511
+ <p>Hello,World</p>
512
+ <p>Hello,Yasuri</p>
513
+ </body>
514
+ </html>
515
+ ```
516
+
517
+ ```ruby
518
+ agent = Mechanize.new
519
+ page = agent.get("http://yasuri.example.net")
520
+
521
+
522
+ tree = Yasuri.map_root do
523
+ text_title '/html/head/title'
524
+ text_body_p '/html/body/p[1]'
525
+ end
526
+
527
+ tree.inject(agent, page) #=> { "title" => "Yasuri Example", "body_p" => "Hello,World" }
528
+
529
+
530
+ tree = Yasuri.map_root do
531
+ map_group1 { text_child01 '/html/body/a[1]' }
532
+ map_group2 do
533
+ text_child01 '/html/body/a[1]'
534
+ text_child03 '/html/body/a[3]'
535
+ end
536
+ end
537
+
538
+ tree.inject(agent, page) #=> {
539
+ # "group1" => {
540
+ # "child01" => "child01"
541
+ # },
542
+ # "group2" => {
543
+ # "child01" => "child01",
544
+ # "child03" => "child03"
545
+ # }
546
+ # }
547
+ ```
548
+
549
+ ### Options
550
+ None.
@@ -1,3 +1,3 @@
1
1
  module Yasuri
2
- VERSION = "1.9.12"
2
+ VERSION = "3.1.0"
3
3
  end
data/lib/yasuri/yasuri.rb CHANGED
@@ -4,12 +4,14 @@
4
4
 
5
5
  require 'mechanize'
6
6
  require 'json'
7
+ require 'yaml'
7
8
 
8
9
  require_relative 'yasuri_node'
9
10
  require_relative 'yasuri_text_node'
10
11
  require_relative 'yasuri_struct_node'
11
12
  require_relative 'yasuri_paginate_node'
12
13
  require_relative 'yasuri_links_node'
14
+ require_relative 'yasuri_map_node'
13
15
  require_relative 'yasuri_node_generator'
14
16
 
15
17
  module Yasuri
@@ -23,9 +25,39 @@ module Yasuri
23
25
  Yasuri.node2hash(node).to_json
24
26
  end
25
27
 
26
- def self.method_missing(name, *args, &block)
27
- generated = Yasuri::NodeGenerator.gen(name, *args, &block)
28
- generated || super(name, args)
28
+ def self.yaml2tree(yaml_string)
29
+ raise RuntimeError if yaml_string.nil? or yaml_string.empty?
30
+
31
+ yaml = YAML.load(yaml_string)
32
+ raise RuntimeError if yaml.keys.size < 1
33
+
34
+ root_key, root = yaml.keys.first, yaml.values.first
35
+ hash = Yasuri.yaml2tree_sub(root_key, root)
36
+
37
+ Yasuri.hash2node(hash)
38
+ end
39
+
40
+ private
41
+ def self.yaml2tree_sub(name, body)
42
+ return nil if name.nil? or body.nil?
43
+
44
+ new_body = Hash[:name, name]
45
+ body.each{|k,v| new_body[k.to_sym] = v}
46
+ body = new_body
47
+
48
+ return body if body[:children].nil?
49
+
50
+ body[:children] = body[:children].map do |c|
51
+ k, b = c.keys.first, c.values.first
52
+ Yasuri.yaml2tree_sub(k, b)
53
+ end
54
+
55
+ body
56
+ end
57
+
58
+ def self.method_missing(method_name, pattern=nil, **opt, &block)
59
+ generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
60
+ generated || super(method_name, **opt)
29
61
  end
30
62
 
31
63
  private
@@ -33,53 +65,26 @@ module Yasuri
33
65
  text: Yasuri::TextNode,
34
66
  struct: Yasuri::StructNode,
35
67
  links: Yasuri::LinksNode,
36
- pages: Yasuri::PaginateNode
68
+ pages: Yasuri::PaginateNode,
69
+ map: Yasuri::MapNode
37
70
  }
38
71
  Node2Text = Text2Node.invert
39
72
 
40
- ReservedKeys = [:node, :name, :path, :children]
73
+ ReservedKeys = %i|node name path children|
41
74
  def self.hash2node(node_h)
42
- node, name, path, children = ReservedKeys.map do |key|
43
- node_h[key]
44
- end
45
- children ||= []
46
-
47
- fail "Not found 'node' value in json" if node.nil?
48
- fail "Not found 'name' value in json" if name.nil?
49
- fail "Not found 'path' value in json" if path.nil?
50
-
51
- childnodes = children.map{|c| Yasuri.hash2node(c) }
52
- ReservedKeys.each{|key| node_h.delete(key)}
53
- opt = node_h
75
+ node = node_h[:node]
54
76
 
77
+ fail "Not found 'node' value in map" if node.nil?
55
78
  klass = Text2Node[node.to_sym]
56
- fail "Undefined node type #{node}" if klass.nil?
57
- klass.new(path, name, childnodes, opt)
79
+ klass::hash2node(node_h)
58
80
  end
59
81
 
60
82
  def self.node2hash(node)
61
- json = JSON.parse("{}")
62
- return json if node.nil?
63
-
64
- klass = node.class
65
- klass_str = Node2Text[klass]
66
-
67
- json["node"] = klass_str
68
- json["name"] = node.name
69
- json["path"] = node.xpath
70
-
71
- children = node.children.map{|c| Yasuri.node2hash(c)}
72
- json["children"] = children if not children.empty?
73
-
74
- node.opts.each do |key,value|
75
- json[key] = value if not value.nil?
76
- end
77
-
78
- json
83
+ node.to_h
79
84
  end
80
85
 
81
- def self.NodeName(name, hash = {})
82
- symbolize_names = hash[:symbolize_names] || false
86
+ def self.NodeName(name, opt)
87
+ symbolize_names = opt[:symbolize_names]
83
88
  symbolize_names ? name.to_sym : name
84
89
  end
85
90
 
@@ -22,5 +22,9 @@ module Yasuri
22
22
  Hash[child_results_kv]
23
23
  end # each named child node
24
24
  end
25
- end
26
- end
25
+
26
+ def node_type_str
27
+ "links"
28
+ end
29
+ end # class
30
+ end # module
@@ -0,0 +1,54 @@
1
+
2
+ module Yasuri
3
+ class MapNode
4
+ attr_reader :name, :children
5
+
6
+ def initialize(name, children, opt: {})
7
+ @name = name
8
+ @children = children
9
+ @opt = opt
10
+ end
11
+
12
+ def inject(agent, page, opt = {}, element = page)
13
+ child_results_kv = @children.map do |node|
14
+ [node.name, node.inject(agent, page, opt)]
15
+ end
16
+ Hash[child_results_kv]
17
+ end
18
+
19
+ def opts
20
+ {}
21
+ end
22
+
23
+ def to_h
24
+ h = {}
25
+ h["node"] = "map"
26
+ h["name"] = self.name
27
+ h["children"] = self.children.map{|c| c.to_h} if not children.empty?
28
+
29
+ self.opts.each do |key,value|
30
+ h[key] = value if not value.nil?
31
+ end
32
+
33
+ h
34
+ end
35
+
36
+ def self.hash2node(node_h)
37
+ reservedKeys = %i|node name children|
38
+
39
+ node, name, children = reservedKeys.map do |key|
40
+ node_h[key]
41
+ end
42
+
43
+ fail "Not found 'name' value in map" if name.nil?
44
+ fail "Not found 'children' value in map" if children.nil?
45
+ children ||= []
46
+
47
+ childnodes = children.map{|c| Yasuri.hash2node(c) }
48
+ reservedKeys.each{|key| node_h.delete(key)}
49
+ opt = node_h
50
+
51
+ self.new(name, childnodes, **opt)
52
+ end
53
+ end
54
+ end
@@ -7,15 +7,58 @@ module Yasuri
7
7
  module Node
8
8
  attr_reader :url, :xpath, :name, :children
9
9
 
10
- def initialize(xpath, name, children = [], opt = {})
10
+ def initialize(xpath, name, children = [], opt: {})
11
11
  @xpath, @name, @children = xpath, name, children
12
12
  end
13
13
 
14
14
  def inject(agent, page, opt = {}, element = page)
15
- fail "#{Kernel.__method__} is not implemented."
15
+ fail "#{Kernel.__method__} is not implemented in included class."
16
16
  end
17
+
17
18
  def opts
18
19
  {}
19
20
  end
21
+
22
+ def to_h
23
+ h = {}
24
+ h["node"] = self.node_type_str
25
+ h["name"] = self.name
26
+ h["path"] = self.xpath
27
+ h["children"] = self.children.map{|c| c.to_h} if not children.empty?
28
+
29
+ self.opts.each do |key,value|
30
+ h[key] = value if not value.nil?
31
+ end
32
+
33
+ h
34
+ end
35
+
36
+ module ClassMethods
37
+ def hash2node(node_h)
38
+ reservedKeys = %i|node name path children|
39
+
40
+ node, name, path, children = ReservedKeys.map do |key|
41
+ node_h[key]
42
+ end
43
+
44
+ fail "Not found 'name' value in map" if name.nil?
45
+ fail "Not found 'path' value in map" if path.nil?
46
+ children ||= []
47
+
48
+ childnodes = children.map{|c| Yasuri.hash2node(c) }
49
+ reservedKeys.each{|key| node_h.delete(key)}
50
+ opt = node_h
51
+
52
+ self.new(path, name, childnodes, **opt)
53
+ end
54
+
55
+ def node_type_str
56
+ fail "#{Kernel.__method__} is not implemented in included class."
57
+ end
58
+ end
59
+
60
+ def self.included(base)
61
+ base.extend(ClassMethods)
62
+ end
20
63
  end
21
64
  end
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
6
6
  require_relative 'yasuri_struct_node'
7
7
  require_relative 'yasuri_links_node'
8
8
  require_relative 'yasuri_paginate_node'
9
+ require_relative 'yasuri_map_node'
9
10
 
10
11
  module Yasuri
11
12
  class NodeGenerator
@@ -15,29 +16,33 @@ module Yasuri
15
16
  @nodes
16
17
  end
17
18
 
18
- def method_missing(name, *args, &block)
19
- node = NodeGenerator.gen(name, *args, &block)
19
+ def method_missing(name, pattern=nil, **args, &block)
20
+ node = NodeGenerator.gen(name, pattern, **args, &block)
20
21
  raise "Undefined Node Name '#{name}'" if node == nil
21
22
  @nodes << node
22
23
  end
23
24
 
24
- def self.gen(name, *args, &block)
25
- xpath, opt = *args
26
- opt = [opt].flatten.compact
25
+ def self.gen(method_name, xpath, **opt, &block)
27
26
  children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
28
27
 
29
- case name
28
+ case method_name
30
29
  when /^text_(.+)$/
31
- Yasuri::TextNode.new(xpath, $1, children || [], *opt)
30
+ # TODO: raise an error if xpath is not valid
31
+ Yasuri::TextNode.new(xpath, $1, children || [], **opt)
32
32
  when /^struct_(.+)$/
33
- Yasuri::StructNode.new(xpath, $1, children || [], *opt)
33
+ # TODO: raise an error if xpath is not valid
34
+ Yasuri::StructNode.new(xpath, $1, children || [], **opt)
34
35
  when /^links_(.+)$/
35
- Yasuri::LinksNode.new(xpath, $1, children || [], *opt)
36
+ # TODO: raise an error if xpath is not valid
37
+ Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
36
38
  when /^pages_(.+)$/
37
- Yasuri::PaginateNode.new(xpath, $1, children || [], *opt)
39
+ # TODO: raise an error if xpath is not valid
40
+ Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
41
+ when /^map_(.+)$/
42
+ Yasuri::MapNode.new($1, children, **opt)
38
43
  else
39
44
  nil
40
45
  end
41
- end # of self.gen(name, *args, &block)
46
+ end # of self.gen(method_name, xpath, **opt, &block)
42
47
  end # of class NodeGenerator
43
48
  end
@@ -7,10 +7,10 @@ module Yasuri
7
7
  class PaginateNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], hash = {})
10
+ def initialize(xpath, name, children = [], limit: nil, flatten: false)
11
11
  super(xpath, name, children)
12
- @limit = hash[:limit]
13
- @flatten = hash[:flatten] || false
12
+ @flatten = flatten
13
+ @limit = limit
14
14
  end
15
15
 
16
16
  def inject(agent, page, opt = {}, element = page)
@@ -44,5 +44,9 @@ module Yasuri
44
44
  def opts
45
45
  {limit:@limit, flatten:@flatten}
46
46
  end
47
+
48
+ def node_type_str
49
+ "pages"
50
+ end
47
51
  end
48
52
  end
@@ -7,11 +7,11 @@ module Yasuri
7
7
  class TextNode
8
8
  include Node
9
9
 
10
- def initialize(xpath, name, children = [], hash = {})
10
+ def initialize(xpath, name, children = [], **opt)
11
11
  super(xpath, name, children)
12
12
 
13
- truncate = hash[:truncate]
14
- proc = hash[:proc]
13
+ truncate = opt[:truncate]
14
+ proc = opt[:proc]
15
15
 
16
16
  truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
17
17
  @truncate = truncate
@@ -34,6 +34,10 @@ module Yasuri
34
34
  text
35
35
  end
36
36
 
37
+ def node_type_str
38
+ "text"
39
+ end
40
+
37
41
  def opts
38
42
  {truncate:@truncate, proc:@proc}
39
43
  end
data/spec/spec_helper.rb CHANGED
@@ -12,11 +12,6 @@ shared_context 'httpserver' do
12
12
  }
13
13
  end
14
14
 
15
-
16
- # ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
17
- # require "codeclimate-test-reporter"
18
- # CodeClimate::TestReporter.start
19
-
20
15
  require 'simplecov'
21
16
  require 'coveralls'
22
17
  Coveralls.wear!
@@ -59,10 +59,18 @@ describe 'Yasuri' do
59
59
  ]
60
60
  expect(actual).to match expected
61
61
  end
62
- it 'can be defined by DSL, return single LinkNode title' do
63
- generated = Yasuri.links_title '/html/body/a'
64
- original = Yasuri::LinksNode.new('/html/body/a', "title")
65
- compare_generated_vs_original(generated, original, @index_page)
62
+ it 'can be defined by DSL, return no contains if no child node' do
63
+ root_node = Yasuri.links_title '/html/body/a'
64
+ actual = root_node.inject(@agent, @index_page)
65
+ expected = [{}, {}, {}] # Empty if no child node under links node.
66
+ expect(actual).to match expected
67
+ end
68
+
69
+ it 'can be defined return no contains if no child node' do
70
+ root_node = Yasuri::LinksNode.new('/html/body/a', "title")
71
+ actual = root_node.inject(@agent, @index_page)
72
+ expected = [{}, {}, {}] # Empty if no child node under links node.
73
+ expect(actual).to match expected
66
74
  end
67
75
  it 'can be defined by DSL, return nested contents under link' do
68
76
  generated = Yasuri.links_title '/html/body/a' do
@@ -0,0 +1,76 @@
1
+ require_relative 'spec_helper'
2
+
3
+ describe 'Yasuri' do
4
+ include_context 'httpserver'
5
+
6
+ before do
7
+ @agent = Mechanize.new
8
+ @index_page = @agent.get(uri)
9
+ end
10
+
11
+ describe '::MapNode' do
12
+ it "multi scrape in singe page" do
13
+ map = Yasuri.map_sample do
14
+ text_title '/html/head/title'
15
+ text_body_p '/html/body/p[1]'
16
+ end
17
+ actual = map.inject(@agent, @index_page)
18
+
19
+ expected = {
20
+ "title" => "Yasuri Test",
21
+ "body_p" => "Hello,Yasuri"
22
+ }
23
+ expect(actual).to include expected
24
+ end
25
+
26
+ it "nested multi scrape in singe page" do
27
+ map = Yasuri.map_sample do
28
+ map_group1 { text_child01 '/html/body/a[1]' }
29
+ map_group2 do
30
+ text_child01 '/html/body/a[1]'
31
+ text_child03 '/html/body/a[3]'
32
+ end
33
+ end
34
+ actual = map.inject(@agent, @index_page)
35
+
36
+ expected = {
37
+ "group1" => {
38
+ "child01" => "child01"
39
+ },
40
+ "group2" => {
41
+ "child01" => "child01",
42
+ "child03" => "child03"
43
+ }
44
+ }
45
+ expect(actual).to include expected
46
+ end
47
+
48
+ it "scrape with links node" do
49
+ map = Yasuri.map_sample do
50
+ map_group1 do
51
+ links_a '/html/body/a' do
52
+ text_content '/html/body/p'
53
+ end
54
+ text_child01 '/html/body/a[1]'
55
+ end
56
+ map_group2 do
57
+ text_child03 '/html/body/a[3]'
58
+ end
59
+ end
60
+ actual = map.inject(@agent, @index_page)
61
+
62
+ expected = {
63
+ "group1" => {
64
+ "a" => [
65
+ {"content" => "Child 01 page."},
66
+ {"content" => "Child 02 page."},
67
+ {"content" => "Child 03 page."},
68
+ ],
69
+ "child01" => "child01"
70
+ },
71
+ "group2" => { "child03" => "child03" }
72
+ }
73
+ expect(actual).to include expected
74
+ end
75
+ end
76
+ end
data/spec/yasuri_spec.rb CHANGED
@@ -13,6 +13,89 @@ describe 'Yasuri' do
13
13
  @index_page = @agent.get(@uri)
14
14
  end
15
15
 
16
+ #############
17
+ # yaml2tree #
18
+ #############
19
+ describe '.yaml2tree' do
20
+ it "fail if empty yaml" do
21
+ expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
22
+ end
23
+
24
+ it "return text node" do
25
+ src = <<-EOB
26
+ content:
27
+ node: text
28
+ path: "/html/body/p[1]"
29
+ EOB
30
+ generated = Yasuri.yaml2tree(src)
31
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
32
+
33
+ compare_generated_vs_original(generated, original, @index_page)
34
+ end
35
+
36
+ it "return text node as symbol" do
37
+ src = <<-EOB
38
+ :content:
39
+ :node: text
40
+ :path: "/html/body/p[1]"
41
+ EOB
42
+ generated = Yasuri.yaml2tree(src)
43
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content")
44
+
45
+ compare_generated_vs_original(generated, original, @index_page)
46
+ end
47
+
48
+ it "return LinksNode/TextNode" do
49
+
50
+ src = <<-EOB
51
+ root:
52
+ node: links
53
+ path: "/html/body/a"
54
+ children:
55
+ - content:
56
+ node: text
57
+ path: "/html/body/p"
58
+ EOB
59
+ generated = Yasuri.yaml2tree(src)
60
+ original = Yasuri::LinksNode.new('/html/body/a', "root", [
61
+ Yasuri::TextNode.new('/html/body/p', "content"),
62
+ ])
63
+
64
+ compare_generated_vs_original(generated, original, @index_page)
65
+ end
66
+
67
+ it "return StructNode/StructNode/[TextNode,TextNode]" do
68
+ src = <<-EOB
69
+ tables:
70
+ node: struct
71
+ path: "/html/body/table"
72
+ children:
73
+ - table:
74
+ node: struct
75
+ path: "./tr"
76
+ children:
77
+ - title:
78
+ node: text
79
+ path: "./td[1]"
80
+ - pub_date:
81
+ node: text
82
+ path: "./td[2]"
83
+ EOB
84
+
85
+ generated = Yasuri.yaml2tree(src)
86
+ original = Yasuri::StructNode.new('/html/body/table', "tables", [
87
+ Yasuri::StructNode.new('./tr', "table", [
88
+ Yasuri::TextNode.new('./td[1]', "title"),
89
+ Yasuri::TextNode.new('./td[2]', "pub_date"),
90
+ ])
91
+ ])
92
+ page = @agent.get(@uri + "/struct/structual_text.html")
93
+ compare_generated_vs_original(generated, original, page)
94
+ end
95
+
96
+ end # end of describe '.yaml2tree'
97
+
98
+
16
99
  #############
17
100
  # json2tree #
18
101
  #############
@@ -39,10 +122,31 @@ describe 'Yasuri' do
39
122
  "truncate" : "^[^,]+"
40
123
  }|
41
124
  generated = Yasuri.json2tree(src)
42
- original = Yasuri::TextNode.new('/html/body/p[1]', "content", {}, truncate:/^[^,]+/)
125
+ original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
43
126
  compare_generated_vs_original(generated, original, @index_page)
44
127
  end
45
128
 
129
+ it "return MapNode with TextNodes" do
130
+ src = %q| { "node" : "map",
131
+ "name" : "parent",
132
+ "children" : [
133
+ { "node" : "text",
134
+ "name" : "content01",
135
+ "path" : "/html/body/p[1]"
136
+ },
137
+ { "node" : "text",
138
+ "name" : "content02",
139
+ "path" : "/html/body/p[2]"
140
+ }
141
+ ]
142
+ }|
143
+ generated = Yasuri.json2tree(src)
144
+ original = Yasuri::MapNode.new('parent', [
145
+ Yasuri::TextNode.new('/html/body/p[1]', "content01"),
146
+ Yasuri::TextNode.new('/html/body/p[2]', "content02"),
147
+ ])
148
+ compare_generated_vs_original(generated, original, @index_page)
149
+ end
46
150
 
47
151
  it "return LinksNode/TextNode" do
48
152
  src = %q| { "node" : "links",
@@ -153,7 +257,7 @@ describe 'Yasuri' do
153
257
  end
154
258
 
155
259
  it "return text node with truncate_regexp" do
156
- node = Yasuri::TextNode.new("/html/head/title", "title", {}, truncate:/^[^,]+/)
260
+ node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
157
261
  json = Yasuri.tree2json(node)
158
262
  expected_str = %q| { "node": "text",
159
263
  "name": "title",
@@ -165,6 +269,31 @@ describe 'Yasuri' do
165
269
  expect(actual).to match expected
166
270
  end
167
271
 
272
+ it "return map node with text nodes" do
273
+ tree = Yasuri::MapNode.new('parent', [
274
+ Yasuri::TextNode.new('/html/body/p[1]', "content01"),
275
+ Yasuri::TextNode.new('/html/body/p[2]', "content02"),
276
+ ])
277
+ actual_json = Yasuri.tree2json(tree)
278
+
279
+ expected_json = %q| { "node" : "map",
280
+ "name" : "parent",
281
+ "children" : [
282
+ { "node" : "text",
283
+ "name" : "content01",
284
+ "path" : "/html/body/p[1]"
285
+ },
286
+ { "node" : "text",
287
+ "name" : "content02",
288
+ "path" : "/html/body/p[2]"
289
+ }
290
+ ]
291
+ }|
292
+ expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
293
+ actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
294
+ expect(actual).to match expected
295
+ end
296
+
168
297
  it "return LinksNode/TextNode" do
169
298
  tree = Yasuri::LinksNode.new('/html/body/a', "root", [
170
299
  Yasuri::TextNode.new('/html/body/p', "content"),
@@ -126,7 +126,7 @@ describe 'Yasuri' do
126
126
  Yasuri::TextNode.new('./td[1]', "title"),
127
127
  Yasuri::TextNode.new('./td[2]', "pub_date"),
128
128
  ])
129
- expected = @table_1996.map{|h| Hash[h.map{|k,v| [k.to_sym, v] }] }
129
+ expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
130
130
  actual = node.inject(@agent, @page, symbolize_names:true)
131
131
  expect(actual).to match expected
132
132
  end
data/yasuri.gemspec CHANGED
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_development_dependency "bundler", "~> 1.7"
22
- spec.add_development_dependency "rake", "~> 10.0"
21
+ spec.add_development_dependency "bundler"
22
+ spec.add_development_dependency "rake"
23
23
  spec.add_development_dependency "rspec"
24
24
  spec.add_development_dependency "fuubar"
25
25
  spec.add_development_dependency "glint"
metadata CHANGED
@@ -1,43 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yasuri
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.12
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - TAC
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-15 00:00:00.000000000 Z
11
+ date: 2021-03-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.7'
19
+ version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.7'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -144,8 +144,10 @@ extensions: []
144
144
  extra_rdoc_files: []
145
145
  files:
146
146
  - ".coveralls.yml"
147
+ - ".github/workflows/ruby.yml"
147
148
  - ".gitignore"
148
149
  - ".rspec"
150
+ - ".ruby-version"
149
151
  - ".travis.yml"
150
152
  - Gemfile
151
153
  - LICENSE
@@ -158,6 +160,7 @@ files:
158
160
  - lib/yasuri/version.rb
159
161
  - lib/yasuri/yasuri.rb
160
162
  - lib/yasuri/yasuri_links_node.rb
163
+ - lib/yasuri/yasuri_map_node.rb
161
164
  - lib/yasuri/yasuri_node.rb
162
165
  - lib/yasuri/yasuri_node_generator.rb
163
166
  - lib/yasuri/yasuri_paginate_node.rb
@@ -179,6 +182,7 @@ files:
179
182
  - spec/servers/httpserver.rb
180
183
  - spec/spec_helper.rb
181
184
  - spec/yasuri_links_node_spec.rb
185
+ - spec/yasuri_map_spec.rb
182
186
  - spec/yasuri_node_spec.rb
183
187
  - spec/yasuri_paginate_node_spec.rb
184
188
  - spec/yasuri_spec.rb
@@ -189,7 +193,7 @@ homepage: https://github.com/tac0x2a/yasuri
189
193
  licenses:
190
194
  - MIT
191
195
  metadata: {}
192
- post_install_message:
196
+ post_install_message:
193
197
  rdoc_options: []
194
198
  require_paths:
195
199
  - lib
@@ -204,9 +208,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
204
208
  - !ruby/object:Gem::Version
205
209
  version: '0'
206
210
  requirements: []
207
- rubyforge_project:
208
- rubygems_version: 2.5.2
209
- signing_key:
211
+ rubygems_version: 3.2.3
212
+ signing_key:
210
213
  specification_version: 4
211
214
  summary: Yasuri is easy scraping library.
212
215
  test_files:
@@ -226,6 +229,7 @@ test_files:
226
229
  - spec/servers/httpserver.rb
227
230
  - spec/spec_helper.rb
228
231
  - spec/yasuri_links_node_spec.rb
232
+ - spec/yasuri_map_spec.rb
229
233
  - spec/yasuri_node_spec.rb
230
234
  - spec/yasuri_paginate_node_spec.rb
231
235
  - spec/yasuri_spec.rb