yasuri 1.9.12 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.gitignore +1 -2
- data/.ruby-version +1 -0
- data/.travis.yml +1 -3
- data/README.md +40 -1
- data/USAGE.ja.md +94 -11
- data/USAGE.md +96 -12
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +44 -39
- data/lib/yasuri/yasuri_links_node.rb +6 -2
- data/lib/yasuri/yasuri_map_node.rb +54 -0
- data/lib/yasuri/yasuri_node.rb +45 -2
- data/lib/yasuri/yasuri_node_generator.rb +16 -11
- data/lib/yasuri/yasuri_paginate_node.rb +7 -3
- data/lib/yasuri/yasuri_text_node.rb +7 -3
- data/spec/spec_helper.rb +0 -5
- data/spec/yasuri_links_node_spec.rb +12 -4
- data/spec/yasuri_map_spec.rb +76 -0
- data/spec/yasuri_spec.rb +131 -2
- data/spec/yasuri_struct_node_spec.rb +1 -1
- data/yasuri.gemspec +2 -2
- metadata +19 -15
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: f3542a2cc0959a4534520f6104fc2922bdf0dbd368fcd4c149c3d251c2fc2198
|
|
4
|
+
data.tar.gz: 6fdb960db697e9a4ec1d87f2b83bf0e9914e3c9efe90764536bbee6d68774353
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9df576243bea289f4c285c46f1bd2137b7b69b79b24e0c657e4ac952114dd7bcf82a5f95cd2dae88c6eac4e3e468273b7dbd6ead9d05ffdc8d25861921702333
|
|
7
|
+
data.tar.gz: 13f2ae72b3e8fa6d3ef58932daa2acad49f5d4f57c80f34e5215394940fc2305bc016d949760efe9f43ae2b8c3796064a1b0bd9bccf236cfe3789c2c291dfd8b
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# This workflow uses actions that are not certified by GitHub.
|
|
2
|
+
# They are provided by a third-party and are governed by
|
|
3
|
+
# separate terms of service, privacy policy, and support
|
|
4
|
+
# documentation.
|
|
5
|
+
# This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
|
|
6
|
+
# For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
|
|
7
|
+
|
|
8
|
+
name: Ruby
|
|
9
|
+
|
|
10
|
+
on:
|
|
11
|
+
push:
|
|
12
|
+
branches: [ master ]
|
|
13
|
+
pull_request:
|
|
14
|
+
branches: [ master ]
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
test:
|
|
18
|
+
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
strategy:
|
|
21
|
+
matrix:
|
|
22
|
+
ruby-version: ['2.6', '2.7', '3.0']
|
|
23
|
+
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v2
|
|
26
|
+
- name: Set up Ruby
|
|
27
|
+
# To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
|
|
28
|
+
# change this to (see https://github.com/ruby/setup-ruby#versioning):
|
|
29
|
+
# uses: ruby/setup-ruby@v1
|
|
30
|
+
uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
|
|
31
|
+
with:
|
|
32
|
+
ruby-version: ${{ matrix.ruby-version }}
|
|
33
|
+
bundler-cache: true # runs 'bundle install' and caches installed gems automatically
|
|
34
|
+
- name: Run tests
|
|
35
|
+
run: bundle exec rake
|
data/.gitignore
CHANGED
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.0.0
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
# Yasuri
|
|
1
|
+
# Yasuri
|
|
2
|
+
[](https://github.com/tac0x2a/yasuri/actions/workflows/ruby.yml)
|
|
3
|
+
[](https://coveralls.io/r/tac0x2a/yasuri?branch=master) [](https://codeclimate.com/github/tac0x2a/yasuri/maintainability)
|
|
2
4
|
|
|
3
5
|
Yasuri (鑢) is an easy web-scraping library for supporting "[Mechanize](https://github.com/sparklemotion/mechanize)".
|
|
4
6
|
|
|
@@ -32,6 +34,9 @@ or
|
|
|
32
34
|
```ruby
|
|
33
35
|
# for Ruby 1.9.3 or lower
|
|
34
36
|
gem 'yasuri', '~> 1.9'
|
|
37
|
+
|
|
38
|
+
# for Ruby 3.0.0 or lower
|
|
39
|
+
gem 'yasuri', '~> 3.0.1'
|
|
35
40
|
```
|
|
36
41
|
|
|
37
42
|
|
|
@@ -52,6 +57,19 @@ root = Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
|
|
|
52
57
|
text_content '//*[@id="contents"]/p[1]'
|
|
53
58
|
end
|
|
54
59
|
|
|
60
|
+
|
|
61
|
+
# Node tree constructing by YAML
|
|
62
|
+
src = <<-EOYAML
|
|
63
|
+
root:
|
|
64
|
+
node: links
|
|
65
|
+
path: "//*[@id='menu']/ul/li/a"
|
|
66
|
+
children:
|
|
67
|
+
- title: { node: text, path: "//*[@id='contents']/h2" }
|
|
68
|
+
- content: { node: text, path: "//*[@id='contents']/p[1]" }
|
|
69
|
+
EOYAML
|
|
70
|
+
root = Yasuri.yaml2tree(src)
|
|
71
|
+
|
|
72
|
+
|
|
55
73
|
# Node tree constructing by JSON
|
|
56
74
|
src = <<-EOJSON
|
|
57
75
|
{ "node" : "links",
|
|
@@ -78,6 +96,27 @@ result = root.inject(agent, root_page)
|
|
|
78
96
|
# => [ {"title" => "PageTitle", "content" => "Page Contents" }, ... ]
|
|
79
97
|
```
|
|
80
98
|
|
|
99
|
+
## Dev
|
|
100
|
+
```sh
|
|
101
|
+
$ gem install bundler
|
|
102
|
+
$ bundle install
|
|
103
|
+
```
|
|
104
|
+
### Test
|
|
105
|
+
```sh
|
|
106
|
+
$ rake
|
|
107
|
+
# or
|
|
108
|
+
$ rspec spec/*spec.rb
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Release RubyGems
|
|
112
|
+
```sh
|
|
113
|
+
# Only first time
|
|
114
|
+
$ curl -u <user_name> https://rubygems.org/api/v1/api_key.yaml > ~/.gem/credentials
|
|
115
|
+
$ chmod 0600 ~/.gem/credentials
|
|
116
|
+
|
|
117
|
+
$ nano lib/yasuri/version.rb # edit gem version
|
|
118
|
+
$ rake release
|
|
119
|
+
```
|
|
81
120
|
|
|
82
121
|
## Contributing
|
|
83
122
|
|
data/USAGE.ja.md
CHANGED
|
@@ -67,7 +67,7 @@ page = agent.get(uri)
|
|
|
67
67
|
tree.inject(agent, page)
|
|
68
68
|
```
|
|
69
69
|
|
|
70
|
-
ツリーは、DSL
|
|
70
|
+
ツリーは、json,yaml,またはDSLで定義することができます.上の例ではDSLで定義しています.
|
|
71
71
|
以下は、jsonで上記と等価な解析ツリーを定義した例です.
|
|
72
72
|
|
|
73
73
|
```ruby
|
|
@@ -87,25 +87,54 @@ EOJSON
|
|
|
87
87
|
tree = Yasuri.json2tree(src)
|
|
88
88
|
```
|
|
89
89
|
|
|
90
|
+
```ruby
|
|
91
|
+
# yaml で構成する場合
|
|
92
|
+
src = <<-EOYAML
|
|
93
|
+
title:
|
|
94
|
+
node: links
|
|
95
|
+
path: "/html/body/a"
|
|
96
|
+
children:
|
|
97
|
+
- name:
|
|
98
|
+
node: text
|
|
99
|
+
path: "/html/body/p"
|
|
100
|
+
EOYAML
|
|
101
|
+
tree = Yasuri.yaml2tree(src)
|
|
102
|
+
```
|
|
90
103
|
|
|
91
104
|
### Node
|
|
92
105
|
ツリーは入れ子になった *Node* で構成されます.
|
|
93
106
|
Node は `Type`, `Name`, `Path`, `Children`, `Options` を持っています.
|
|
107
|
+
(ただし、`MapNode` のみ `Path` を持ちません)
|
|
94
108
|
|
|
95
109
|
Nodeは以下のフォーマットで定義されます.
|
|
96
110
|
|
|
97
111
|
```ruby
|
|
98
|
-
# トップレベル
|
|
99
112
|
Yasuri.<Type>_<Name> <Path> [,<Options>]
|
|
100
113
|
|
|
101
114
|
# 入れ子になっている場合
|
|
102
115
|
Yasuri.<Type>_<Name> <Path> [,<Options>] do
|
|
103
116
|
<Type>_<Name> <Path> [,<Options>] do
|
|
104
|
-
<
|
|
117
|
+
<Type>_<Name> <Path> [,<Options>]
|
|
118
|
+
...
|
|
105
119
|
end
|
|
106
120
|
end
|
|
107
121
|
```
|
|
108
122
|
|
|
123
|
+
例
|
|
124
|
+
|
|
125
|
+
```ruby
|
|
126
|
+
Yasuri.text_title '/html/head/title', truncate:/^[^,]+/
|
|
127
|
+
|
|
128
|
+
# 入れ子になっている場合
|
|
129
|
+
Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
|
|
130
|
+
struct_table './tr' do
|
|
131
|
+
text_title './td[1]'
|
|
132
|
+
text_pub_date './td[2]'
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
|
|
109
138
|
#### Type
|
|
110
139
|
*Type* は Nodeの振る舞いを示します.Typeには以下のものがあります.
|
|
111
140
|
|
|
@@ -113,18 +142,19 @@ end
|
|
|
113
142
|
- *Struct*
|
|
114
143
|
- *Links*
|
|
115
144
|
- *Paginate*
|
|
145
|
+
- *Map*
|
|
116
146
|
|
|
117
|
-
|
|
147
|
+
#### Name
|
|
118
148
|
*Name* は 解析結果のHashにおけるキーになります.
|
|
119
149
|
|
|
120
|
-
|
|
150
|
+
#### Path
|
|
121
151
|
*Path* は xpath あるいは css セレクタによって、HTML上の特定のノードを指定します.
|
|
122
152
|
これは Mechanize の `search` で使用されます.
|
|
123
153
|
|
|
124
|
-
|
|
154
|
+
#### Children
|
|
125
155
|
入れ子になっているノードの子ノードです.TextNodeはツリーの葉に当たるため、子ノードを持ちません.
|
|
126
156
|
|
|
127
|
-
|
|
157
|
+
#### Options
|
|
128
158
|
パースのオプションです.オプションはTypeごとに異なります.
|
|
129
159
|
各ノードに対して、`opt`メソッドをコールすることで、利用可能なオプションを取得できます.
|
|
130
160
|
|
|
@@ -156,13 +186,15 @@ page = agent.get("http://yasuri.example.net")
|
|
|
156
186
|
|
|
157
187
|
p1 = Yasuri.text_title '/html/body/p[1]'
|
|
158
188
|
p1t = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
|
159
|
-
p2u = Yasuri.text_title '/html/body/p[
|
|
189
|
+
p2u = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
|
160
190
|
|
|
161
|
-
p1.inject(agent, page) #=>
|
|
162
|
-
p1t.inject(agent, page) #=>
|
|
163
|
-
|
|
191
|
+
p1.inject(agent, page) #=> "Hello,World"
|
|
192
|
+
p1t.inject(agent, page) #=> "Hello"
|
|
193
|
+
p2u.inject(agent, page) #=> "HELLO,WORLD"
|
|
164
194
|
```
|
|
165
195
|
|
|
196
|
+
なお、同じページ内の複数の要素を一度にスクレイピングする場合は、`MapNode`を使用します。
|
|
197
|
+
|
|
166
198
|
### オプション
|
|
167
199
|
##### `truncate`
|
|
168
200
|
正規表現にマッチした文字列を取り出します.グループを指定した場合、最初にマッチしたグループだけを返します.
|
|
@@ -466,3 +498,54 @@ node.inject(agent, page)
|
|
|
466
498
|
"Page03",
|
|
467
499
|
"Patination03"]
|
|
468
500
|
```
|
|
501
|
+
|
|
502
|
+
## Map Node
|
|
503
|
+
*MapNode* はスクレイピングした結果をまとめるノードです.このノードはパースツリーにおいて常に節です.
|
|
504
|
+
|
|
505
|
+
### 例
|
|
506
|
+
|
|
507
|
+
```html
|
|
508
|
+
<!-- http://yasuri.example.net -->
|
|
509
|
+
<html>
|
|
510
|
+
<head><title>Yasuri Example</title></head>
|
|
511
|
+
<body>
|
|
512
|
+
<p>Hello,World</p>
|
|
513
|
+
<p>Hello,Yasuri</p>
|
|
514
|
+
</body>
|
|
515
|
+
</html>
|
|
516
|
+
```
|
|
517
|
+
|
|
518
|
+
```ruby
|
|
519
|
+
agent = Mechanize.new
|
|
520
|
+
page = agent.get("http://yasuri.example.net")
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
tree = Yasuri.map_root do
|
|
524
|
+
text_title '/html/head/title'
|
|
525
|
+
text_body_p '/html/body/p[1]'
|
|
526
|
+
end
|
|
527
|
+
|
|
528
|
+
tree.inject(agent, page) #=> { "title" => "Yasuri Example", "body_p" => "Hello,World" }
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
tree = Yasuri.map_root do
|
|
532
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
|
533
|
+
map_group2 do
|
|
534
|
+
text_child01 '/html/body/a[1]'
|
|
535
|
+
text_child03 '/html/body/a[3]'
|
|
536
|
+
end
|
|
537
|
+
end
|
|
538
|
+
|
|
539
|
+
tree.inject(agent, page) #=> {
|
|
540
|
+
# "group1" => {
|
|
541
|
+
# "child01" => "child01"
|
|
542
|
+
# },
|
|
543
|
+
# "group2" => {
|
|
544
|
+
# "child01" => "child01",
|
|
545
|
+
# "child03" => "child03"
|
|
546
|
+
# }
|
|
547
|
+
# }
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
### オプション
|
|
551
|
+
なし
|
data/USAGE.md
CHANGED
|
@@ -69,7 +69,7 @@ page = agent.get(uri)
|
|
|
69
69
|
tree.inject(agent, page)
|
|
70
70
|
```
|
|
71
71
|
|
|
72
|
-
Tree is definable by
|
|
72
|
+
A tree can be defined in 3(+1) ways: json, yaml, and DSL (or basic Ruby code). The example above uses the DSL.
|
|
73
73
|
|
|
74
74
|
```ruby
|
|
75
75
|
# Construct by json.
|
|
@@ -88,21 +88,51 @@ EOJSON
|
|
|
88
88
|
tree = Yasuri.json2tree(src)
|
|
89
89
|
```
|
|
90
90
|
|
|
91
|
+
```ruby
|
|
92
|
+
# Construct by yaml.
|
|
93
|
+
src = <<-EOYAML
|
|
94
|
+
title:
|
|
95
|
+
node: links
|
|
96
|
+
path: "/html/body/a"
|
|
97
|
+
children:
|
|
98
|
+
- name:
|
|
99
|
+
node: text
|
|
100
|
+
path: "/html/body/p"
|
|
101
|
+
EOYAML
|
|
102
|
+
tree = Yasuri.yaml2tree(src)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
|
|
91
106
|
### Node
|
|
92
107
|
Tree is constructed by nested Nodes.
|
|
93
108
|
Node has `Type`, `Name`, `Path`, `Children`, and `Options`.
|
|
109
|
+
(But only `MapNode` does not have `Path`.)
|
|
94
110
|
|
|
95
111
|
Node is defined by this format.
|
|
96
112
|
|
|
97
113
|
|
|
98
114
|
```ruby
|
|
99
|
-
# Top Level
|
|
100
115
|
Yasuri.<Type>_<Name> <Path> [,<Options>]
|
|
101
116
|
|
|
102
|
-
# Nested
|
|
117
|
+
# Nested case
|
|
103
118
|
Yasuri.<Type>_<Name> <Path> [,<Options>] do
|
|
104
119
|
<Type>_<Name> <Path> [,<Options>] do
|
|
105
|
-
<
|
|
120
|
+
<Type>_<Name> <Path> [,<Options>]
|
|
121
|
+
...
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Example
|
|
127
|
+
|
|
128
|
+
```ruby
|
|
129
|
+
Yasuri.text_title '/html/head/title', truncate:/^[^,]+/
|
|
130
|
+
|
|
131
|
+
# Nested case
|
|
132
|
+
Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
|
|
133
|
+
struct_table './tr' do
|
|
134
|
+
text_title './td[1]'
|
|
135
|
+
text_pub_date './td[2]'
|
|
106
136
|
end
|
|
107
137
|
end
|
|
108
138
|
```
|
|
@@ -114,17 +144,18 @@ Type meen behavior of Node.
|
|
|
114
144
|
- *Struct*
|
|
115
145
|
- *Links*
|
|
116
146
|
- *Paginate*
|
|
147
|
+
- *Map*
|
|
117
148
|
|
|
118
|
-
|
|
149
|
+
#### Name
|
|
119
150
|
Name is used as the key in the returned hash.
|
|
120
151
|
|
|
121
|
-
|
|
152
|
+
#### Path
|
|
122
153
|
Path determines the target node by xpath or css selector. It is used by Mechanize's `search`.
|
|
123
154
|
|
|
124
|
-
|
|
155
|
+
#### Children
|
|
125
156
|
Child nodes. TextNode has always empty set, because TextNode is leaf.
|
|
126
157
|
|
|
127
|
-
|
|
158
|
+
#### Options
|
|
128
159
|
Parse options. They differ for each type. You can get the options and their values with the `opt` method.
|
|
129
160
|
|
|
130
161
|
```ruby
|
|
@@ -155,13 +186,15 @@ page = agent.get("http://yasuri.example.net")
|
|
|
155
186
|
|
|
156
187
|
p1 = Yasuri.text_title '/html/body/p[1]'
|
|
157
188
|
p1t = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
|
158
|
-
p2u = Yasuri.text_title '/html/body/p[
|
|
189
|
+
p2u = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
|
159
190
|
|
|
160
|
-
p1.inject(agent, page) #=>
|
|
161
|
-
p1t.inject(agent, page) #=>
|
|
162
|
-
|
|
191
|
+
p1.inject(agent, page) #=> "Hello,World"
|
|
192
|
+
p1t.inject(agent, page) #=> "Hello"
|
|
193
|
+
p2u.inject(agent, page) #=> "HELLO,WORLD"
|
|
163
194
|
```
|
|
164
195
|
|
|
196
|
+
Note that if you want to scrape multiple elements in the same page at once, use `MapNode`. See the `MapNode` example for details.
|
|
197
|
+
|
|
165
198
|
### Options
|
|
166
199
|
##### `truncate`
|
|
167
200
|
Match to regexp, and truncate text. When you use group, it will return first matched group only.
|
|
@@ -464,3 +497,54 @@ node.inject(agent, page)
|
|
|
464
497
|
"Page03",
|
|
465
498
|
"Patination03"]
|
|
466
499
|
```
|
|
500
|
+
|
|
501
|
+
## Map Node
|
|
502
|
+
*MapNode* is a node that summarizes the results of scraping. This node is always a branch node in the parse tree.
|
|
503
|
+
|
|
504
|
+
### Example
|
|
505
|
+
|
|
506
|
+
```html
|
|
507
|
+
<!-- http://yasuri.example.net -->
|
|
508
|
+
<html>
|
|
509
|
+
<head><title>Yasuri Example</title></head>
|
|
510
|
+
<body>
|
|
511
|
+
<p>Hello,World</p>
|
|
512
|
+
<p>Hello,Yasuri</p>
|
|
513
|
+
</body>
|
|
514
|
+
</html>
|
|
515
|
+
```
|
|
516
|
+
|
|
517
|
+
```ruby
|
|
518
|
+
agent = Mechanize.new
|
|
519
|
+
page = agent.get("http://yasuri.example.net")
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
tree = Yasuri.map_root do
|
|
523
|
+
text_title '/html/head/title'
|
|
524
|
+
text_body_p '/html/body/p[1]'
|
|
525
|
+
end
|
|
526
|
+
|
|
527
|
+
tree.inject(agent, page) #=> { "title" => "Yasuri Example", "body_p" => "Hello,World" }
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
tree = Yasuri.map_root do
|
|
531
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
|
532
|
+
map_group2 do
|
|
533
|
+
text_child01 '/html/body/a[1]'
|
|
534
|
+
text_child03 '/html/body/a[3]'
|
|
535
|
+
end
|
|
536
|
+
end
|
|
537
|
+
|
|
538
|
+
tree.inject(agent, page) #=> {
|
|
539
|
+
# "group1" => {
|
|
540
|
+
# "child01" => "child01"
|
|
541
|
+
# },
|
|
542
|
+
# "group2" => {
|
|
543
|
+
# "child01" => "child01",
|
|
544
|
+
# "child03" => "child03"
|
|
545
|
+
# }
|
|
546
|
+
# }
|
|
547
|
+
```
|
|
548
|
+
|
|
549
|
+
### Options
|
|
550
|
+
None.
|
data/lib/yasuri/version.rb
CHANGED
data/lib/yasuri/yasuri.rb
CHANGED
|
@@ -4,12 +4,14 @@
|
|
|
4
4
|
|
|
5
5
|
require 'mechanize'
|
|
6
6
|
require 'json'
|
|
7
|
+
require 'yaml'
|
|
7
8
|
|
|
8
9
|
require_relative 'yasuri_node'
|
|
9
10
|
require_relative 'yasuri_text_node'
|
|
10
11
|
require_relative 'yasuri_struct_node'
|
|
11
12
|
require_relative 'yasuri_paginate_node'
|
|
12
13
|
require_relative 'yasuri_links_node'
|
|
14
|
+
require_relative 'yasuri_map_node'
|
|
13
15
|
require_relative 'yasuri_node_generator'
|
|
14
16
|
|
|
15
17
|
module Yasuri
|
|
@@ -23,9 +25,39 @@ module Yasuri
|
|
|
23
25
|
Yasuri.node2hash(node).to_json
|
|
24
26
|
end
|
|
25
27
|
|
|
26
|
-
def self.
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
def self.yaml2tree(yaml_string)
|
|
29
|
+
raise RuntimeError if yaml_string.nil? or yaml_string.empty?
|
|
30
|
+
|
|
31
|
+
yaml = YAML.load(yaml_string)
|
|
32
|
+
raise RuntimeError if yaml.keys.size < 1
|
|
33
|
+
|
|
34
|
+
root_key, root = yaml.keys.first, yaml.values.first
|
|
35
|
+
hash = Yasuri.yaml2tree_sub(root_key, root)
|
|
36
|
+
|
|
37
|
+
Yasuri.hash2node(hash)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
private
|
|
41
|
+
def self.yaml2tree_sub(name, body)
|
|
42
|
+
return nil if name.nil? or body.nil?
|
|
43
|
+
|
|
44
|
+
new_body = Hash[:name, name]
|
|
45
|
+
body.each{|k,v| new_body[k.to_sym] = v}
|
|
46
|
+
body = new_body
|
|
47
|
+
|
|
48
|
+
return body if body[:children].nil?
|
|
49
|
+
|
|
50
|
+
body[:children] = body[:children].map do |c|
|
|
51
|
+
k, b = c.keys.first, c.values.first
|
|
52
|
+
Yasuri.yaml2tree_sub(k, b)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
body
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def self.method_missing(method_name, pattern=nil, **opt, &block)
|
|
59
|
+
generated = Yasuri::NodeGenerator.gen(method_name, pattern, **opt, &block)
|
|
60
|
+
generated || super(method_name, **opt)
|
|
29
61
|
end
|
|
30
62
|
|
|
31
63
|
private
|
|
@@ -33,53 +65,26 @@ module Yasuri
|
|
|
33
65
|
text: Yasuri::TextNode,
|
|
34
66
|
struct: Yasuri::StructNode,
|
|
35
67
|
links: Yasuri::LinksNode,
|
|
36
|
-
pages: Yasuri::PaginateNode
|
|
68
|
+
pages: Yasuri::PaginateNode,
|
|
69
|
+
map: Yasuri::MapNode
|
|
37
70
|
}
|
|
38
71
|
Node2Text = Text2Node.invert
|
|
39
72
|
|
|
40
|
-
ReservedKeys =
|
|
73
|
+
ReservedKeys = %i|node name path children|
|
|
41
74
|
def self.hash2node(node_h)
|
|
42
|
-
node
|
|
43
|
-
node_h[key]
|
|
44
|
-
end
|
|
45
|
-
children ||= []
|
|
46
|
-
|
|
47
|
-
fail "Not found 'node' value in json" if node.nil?
|
|
48
|
-
fail "Not found 'name' value in json" if name.nil?
|
|
49
|
-
fail "Not found 'path' value in json" if path.nil?
|
|
50
|
-
|
|
51
|
-
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
|
52
|
-
ReservedKeys.each{|key| node_h.delete(key)}
|
|
53
|
-
opt = node_h
|
|
75
|
+
node = node_h[:node]
|
|
54
76
|
|
|
77
|
+
fail "Not found 'node' value in map" if node.nil?
|
|
55
78
|
klass = Text2Node[node.to_sym]
|
|
56
|
-
|
|
57
|
-
klass.new(path, name, childnodes, opt)
|
|
79
|
+
klass::hash2node(node_h)
|
|
58
80
|
end
|
|
59
81
|
|
|
60
82
|
def self.node2hash(node)
|
|
61
|
-
|
|
62
|
-
return json if node.nil?
|
|
63
|
-
|
|
64
|
-
klass = node.class
|
|
65
|
-
klass_str = Node2Text[klass]
|
|
66
|
-
|
|
67
|
-
json["node"] = klass_str
|
|
68
|
-
json["name"] = node.name
|
|
69
|
-
json["path"] = node.xpath
|
|
70
|
-
|
|
71
|
-
children = node.children.map{|c| Yasuri.node2hash(c)}
|
|
72
|
-
json["children"] = children if not children.empty?
|
|
73
|
-
|
|
74
|
-
node.opts.each do |key,value|
|
|
75
|
-
json[key] = value if not value.nil?
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
json
|
|
83
|
+
node.to_h
|
|
79
84
|
end
|
|
80
85
|
|
|
81
|
-
def self.NodeName(name,
|
|
82
|
-
symbolize_names =
|
|
86
|
+
def self.NodeName(name, opt)
|
|
87
|
+
symbolize_names = opt[:symbolize_names]
|
|
83
88
|
symbolize_names ? name.to_sym : name
|
|
84
89
|
end
|
|
85
90
|
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
|
|
2
|
+
module Yasuri
|
|
3
|
+
class MapNode
|
|
4
|
+
attr_reader :name, :children
|
|
5
|
+
|
|
6
|
+
def initialize(name, children, opt: {})
|
|
7
|
+
@name = name
|
|
8
|
+
@children = children
|
|
9
|
+
@opt = opt
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def inject(agent, page, opt = {}, element = page)
|
|
13
|
+
child_results_kv = @children.map do |node|
|
|
14
|
+
[node.name, node.inject(agent, page, opt)]
|
|
15
|
+
end
|
|
16
|
+
Hash[child_results_kv]
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def opts
|
|
20
|
+
{}
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def to_h
|
|
24
|
+
h = {}
|
|
25
|
+
h["node"] = "map"
|
|
26
|
+
h["name"] = self.name
|
|
27
|
+
h["children"] = self.children.map{|c| c.to_h} if not children.empty?
|
|
28
|
+
|
|
29
|
+
self.opts.each do |key,value|
|
|
30
|
+
h[key] = value if not value.nil?
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
h
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def self.hash2node(node_h)
|
|
37
|
+
reservedKeys = %i|node name children|
|
|
38
|
+
|
|
39
|
+
node, name, children = reservedKeys.map do |key|
|
|
40
|
+
node_h[key]
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
fail "Not found 'name' value in map" if name.nil?
|
|
44
|
+
fail "Not found 'children' value in map" if children.nil?
|
|
45
|
+
children ||= []
|
|
46
|
+
|
|
47
|
+
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
|
48
|
+
reservedKeys.each{|key| node_h.delete(key)}
|
|
49
|
+
opt = node_h
|
|
50
|
+
|
|
51
|
+
self.new(name, childnodes, **opt)
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
data/lib/yasuri/yasuri_node.rb
CHANGED
|
@@ -7,15 +7,58 @@ module Yasuri
|
|
|
7
7
|
module Node
|
|
8
8
|
attr_reader :url, :xpath, :name, :children
|
|
9
9
|
|
|
10
|
-
def initialize(xpath, name, children = [], opt
|
|
10
|
+
def initialize(xpath, name, children = [], opt: {})
|
|
11
11
|
@xpath, @name, @children = xpath, name, children
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
def inject(agent, page, opt = {}, element = page)
|
|
15
|
-
fail "#{Kernel.__method__} is not implemented."
|
|
15
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
|
16
16
|
end
|
|
17
|
+
|
|
17
18
|
def opts
|
|
18
19
|
{}
|
|
19
20
|
end
|
|
21
|
+
|
|
22
|
+
def to_h
|
|
23
|
+
h = {}
|
|
24
|
+
h["node"] = self.node_type_str
|
|
25
|
+
h["name"] = self.name
|
|
26
|
+
h["path"] = self.xpath
|
|
27
|
+
h["children"] = self.children.map{|c| c.to_h} if not children.empty?
|
|
28
|
+
|
|
29
|
+
self.opts.each do |key,value|
|
|
30
|
+
h[key] = value if not value.nil?
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
h
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
module ClassMethods
|
|
37
|
+
def hash2node(node_h)
|
|
38
|
+
reservedKeys = %i|node name path children|
|
|
39
|
+
|
|
40
|
+
node, name, path, children = ReservedKeys.map do |key|
|
|
41
|
+
node_h[key]
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
fail "Not found 'name' value in map" if name.nil?
|
|
45
|
+
fail "Not found 'path' value in map" if path.nil?
|
|
46
|
+
children ||= []
|
|
47
|
+
|
|
48
|
+
childnodes = children.map{|c| Yasuri.hash2node(c) }
|
|
49
|
+
reservedKeys.each{|key| node_h.delete(key)}
|
|
50
|
+
opt = node_h
|
|
51
|
+
|
|
52
|
+
self.new(path, name, childnodes, **opt)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def node_type_str
|
|
56
|
+
fail "#{Kernel.__method__} is not implemented in included class."
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def self.included(base)
|
|
61
|
+
base.extend(ClassMethods)
|
|
62
|
+
end
|
|
20
63
|
end
|
|
21
64
|
end
|
|
@@ -6,6 +6,7 @@ require_relative 'yasuri_text_node'
|
|
|
6
6
|
require_relative 'yasuri_struct_node'
|
|
7
7
|
require_relative 'yasuri_links_node'
|
|
8
8
|
require_relative 'yasuri_paginate_node'
|
|
9
|
+
require_relative 'yasuri_map_node'
|
|
9
10
|
|
|
10
11
|
module Yasuri
|
|
11
12
|
class NodeGenerator
|
|
@@ -15,29 +16,33 @@ module Yasuri
|
|
|
15
16
|
@nodes
|
|
16
17
|
end
|
|
17
18
|
|
|
18
|
-
def method_missing(name,
|
|
19
|
-
node = NodeGenerator.gen(name,
|
|
19
|
+
def method_missing(name, pattern=nil, **args, &block)
|
|
20
|
+
node = NodeGenerator.gen(name, pattern, **args, &block)
|
|
20
21
|
raise "Undefined Node Name '#{name}'" if node == nil
|
|
21
22
|
@nodes << node
|
|
22
23
|
end
|
|
23
24
|
|
|
24
|
-
def self.gen(
|
|
25
|
-
xpath, opt = *args
|
|
26
|
-
opt = [opt].flatten.compact
|
|
25
|
+
def self.gen(method_name, xpath, **opt, &block)
|
|
27
26
|
children = Yasuri::NodeGenerator.new.gen_recursive(&block) if block_given?
|
|
28
27
|
|
|
29
|
-
case
|
|
28
|
+
case method_name
|
|
30
29
|
when /^text_(.+)$/
|
|
31
|
-
|
|
30
|
+
# Todo raise error xpath is not valid
|
|
31
|
+
Yasuri::TextNode.new(xpath, $1, children || [], **opt)
|
|
32
32
|
when /^struct_(.+)$/
|
|
33
|
-
|
|
33
|
+
# Todo raise error xpath is not valid
|
|
34
|
+
Yasuri::StructNode.new(xpath, $1, children || [], **opt)
|
|
34
35
|
when /^links_(.+)$/
|
|
35
|
-
|
|
36
|
+
# Todo raise error xpath is not valid
|
|
37
|
+
Yasuri::LinksNode.new(xpath, $1, children || [], **opt)
|
|
36
38
|
when /^pages_(.+)$/
|
|
37
|
-
|
|
39
|
+
# Todo raise error xpath is not valid
|
|
40
|
+
Yasuri::PaginateNode.new(xpath, $1, children || [], **opt)
|
|
41
|
+
when /^map_(.+)$/
|
|
42
|
+
Yasuri::MapNode.new($1, children, **opt)
|
|
38
43
|
else
|
|
39
44
|
nil
|
|
40
45
|
end
|
|
41
|
-
end # of self.gen(
|
|
46
|
+
end # of self.gen(method_name, xpath, **opt, &block)
|
|
42
47
|
end # of class NodeGenerator
|
|
43
48
|
end
|
|
@@ -7,10 +7,10 @@ module Yasuri
|
|
|
7
7
|
class PaginateNode
|
|
8
8
|
include Node
|
|
9
9
|
|
|
10
|
-
def initialize(xpath, name, children = [],
|
|
10
|
+
def initialize(xpath, name, children = [], limit: nil, flatten: false)
|
|
11
11
|
super(xpath, name, children)
|
|
12
|
-
@
|
|
13
|
-
@
|
|
12
|
+
@flatten = flatten
|
|
13
|
+
@limit = limit
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def inject(agent, page, opt = {}, element = page)
|
|
@@ -44,5 +44,9 @@ module Yasuri
|
|
|
44
44
|
def opts
|
|
45
45
|
{limit:@limit, flatten:@flatten}
|
|
46
46
|
end
|
|
47
|
+
|
|
48
|
+
def node_type_str
|
|
49
|
+
"pages"
|
|
50
|
+
end
|
|
47
51
|
end
|
|
48
52
|
end
|
|
@@ -7,11 +7,11 @@ module Yasuri
|
|
|
7
7
|
class TextNode
|
|
8
8
|
include Node
|
|
9
9
|
|
|
10
|
-
def initialize(xpath, name, children = [],
|
|
10
|
+
def initialize(xpath, name, children = [], **opt)
|
|
11
11
|
super(xpath, name, children)
|
|
12
12
|
|
|
13
|
-
truncate =
|
|
14
|
-
proc
|
|
13
|
+
truncate = opt[:truncate]
|
|
14
|
+
proc = opt[:proc]
|
|
15
15
|
|
|
16
16
|
truncate = Regexp.new(truncate) if not truncate.nil? # regexp or nil
|
|
17
17
|
@truncate = truncate
|
|
@@ -34,6 +34,10 @@ module Yasuri
|
|
|
34
34
|
text
|
|
35
35
|
end
|
|
36
36
|
|
|
37
|
+
def node_type_str
|
|
38
|
+
"text"
|
|
39
|
+
end
|
|
40
|
+
|
|
37
41
|
def opts
|
|
38
42
|
{truncate:@truncate, proc:@proc}
|
|
39
43
|
end
|
data/spec/spec_helper.rb
CHANGED
|
@@ -12,11 +12,6 @@ shared_context 'httpserver' do
|
|
|
12
12
|
}
|
|
13
13
|
end
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
# ENV['CODECLIMATE_REPO_TOKEN'] = "0dc78d33107a7f11f257c0218ac1a37e0073005bb9734f2fd61d0f7e803fc151"
|
|
17
|
-
# require "codeclimate-test-reporter"
|
|
18
|
-
# CodeClimate::TestReporter.start
|
|
19
|
-
|
|
20
15
|
require 'simplecov'
|
|
21
16
|
require 'coveralls'
|
|
22
17
|
Coveralls.wear!
|
|
@@ -59,10 +59,18 @@ describe 'Yasuri' do
|
|
|
59
59
|
]
|
|
60
60
|
expect(actual).to match expected
|
|
61
61
|
end
|
|
62
|
-
it 'can be defined by DSL, return
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
62
|
+
it 'can be defined by DSL, return no contains if no child node' do
|
|
63
|
+
root_node = Yasuri.links_title '/html/body/a'
|
|
64
|
+
actual = root_node.inject(@agent, @index_page)
|
|
65
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
|
66
|
+
expect(actual).to match expected
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
it 'can be defined return no contains if no child node' do
|
|
70
|
+
root_node = Yasuri::LinksNode.new('/html/body/a', "title")
|
|
71
|
+
actual = root_node.inject(@agent, @index_page)
|
|
72
|
+
expected = [{}, {}, {}] # Empty if no child node under links node.
|
|
73
|
+
expect(actual).to match expected
|
|
66
74
|
end
|
|
67
75
|
it 'can be defined by DSL, return nested contents under link' do
|
|
68
76
|
generated = Yasuri.links_title '/html/body/a' do
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
require_relative 'spec_helper'
|
|
2
|
+
|
|
3
|
+
describe 'Yasuri' do
|
|
4
|
+
include_context 'httpserver'
|
|
5
|
+
|
|
6
|
+
before do
|
|
7
|
+
@agent = Mechanize.new
|
|
8
|
+
@index_page = @agent.get(uri)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
describe '::MapNode' do
|
|
12
|
+
it "multi scrape in singe page" do
|
|
13
|
+
map = Yasuri.map_sample do
|
|
14
|
+
text_title '/html/head/title'
|
|
15
|
+
text_body_p '/html/body/p[1]'
|
|
16
|
+
end
|
|
17
|
+
actual = map.inject(@agent, @index_page)
|
|
18
|
+
|
|
19
|
+
expected = {
|
|
20
|
+
"title" => "Yasuri Test",
|
|
21
|
+
"body_p" => "Hello,Yasuri"
|
|
22
|
+
}
|
|
23
|
+
expect(actual).to include expected
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it "nested multi scrape in singe page" do
|
|
27
|
+
map = Yasuri.map_sample do
|
|
28
|
+
map_group1 { text_child01 '/html/body/a[1]' }
|
|
29
|
+
map_group2 do
|
|
30
|
+
text_child01 '/html/body/a[1]'
|
|
31
|
+
text_child03 '/html/body/a[3]'
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
actual = map.inject(@agent, @index_page)
|
|
35
|
+
|
|
36
|
+
expected = {
|
|
37
|
+
"group1" => {
|
|
38
|
+
"child01" => "child01"
|
|
39
|
+
},
|
|
40
|
+
"group2" => {
|
|
41
|
+
"child01" => "child01",
|
|
42
|
+
"child03" => "child03"
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
expect(actual).to include expected
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
it "scrape with links node" do
|
|
49
|
+
map = Yasuri.map_sample do
|
|
50
|
+
map_group1 do
|
|
51
|
+
links_a '/html/body/a' do
|
|
52
|
+
text_content '/html/body/p'
|
|
53
|
+
end
|
|
54
|
+
text_child01 '/html/body/a[1]'
|
|
55
|
+
end
|
|
56
|
+
map_group2 do
|
|
57
|
+
text_child03 '/html/body/a[3]'
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
actual = map.inject(@agent, @index_page)
|
|
61
|
+
|
|
62
|
+
expected = {
|
|
63
|
+
"group1" => {
|
|
64
|
+
"a" => [
|
|
65
|
+
{"content" => "Child 01 page."},
|
|
66
|
+
{"content" => "Child 02 page."},
|
|
67
|
+
{"content" => "Child 03 page."},
|
|
68
|
+
],
|
|
69
|
+
"child01" => "child01"
|
|
70
|
+
},
|
|
71
|
+
"group2" => { "child03" => "child03" }
|
|
72
|
+
}
|
|
73
|
+
expect(actual).to include expected
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
data/spec/yasuri_spec.rb
CHANGED
|
@@ -13,6 +13,89 @@ describe 'Yasuri' do
|
|
|
13
13
|
@index_page = @agent.get(@uri)
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
############
|
|
17
|
+
# yam2tree #
|
|
18
|
+
############
|
|
19
|
+
describe '.yaml2tree' do
|
|
20
|
+
it "fail if empty yaml" do
|
|
21
|
+
expect { Yasuri.yaml2tree(nil) }.to raise_error(RuntimeError)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
it "return text node" do
|
|
25
|
+
src = <<-EOB
|
|
26
|
+
content:
|
|
27
|
+
node: text
|
|
28
|
+
path: "/html/body/p[1]"
|
|
29
|
+
EOB
|
|
30
|
+
generated = Yasuri.yaml2tree(src)
|
|
31
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
|
32
|
+
|
|
33
|
+
compare_generated_vs_original(generated, original, @index_page)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it "return text node as symbol" do
|
|
37
|
+
src = <<-EOB
|
|
38
|
+
:content:
|
|
39
|
+
:node: text
|
|
40
|
+
:path: "/html/body/p[1]"
|
|
41
|
+
EOB
|
|
42
|
+
generated = Yasuri.yaml2tree(src)
|
|
43
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
|
44
|
+
|
|
45
|
+
compare_generated_vs_original(generated, original, @index_page)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
it "return LinksNode/TextNode" do
|
|
49
|
+
|
|
50
|
+
src = <<-EOB
|
|
51
|
+
root:
|
|
52
|
+
node: links
|
|
53
|
+
path: "/html/body/a"
|
|
54
|
+
children:
|
|
55
|
+
- content:
|
|
56
|
+
node: text
|
|
57
|
+
path: "/html/body/p"
|
|
58
|
+
EOB
|
|
59
|
+
generated = Yasuri.yaml2tree(src)
|
|
60
|
+
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
|
61
|
+
Yasuri::TextNode.new('/html/body/p', "content"),
|
|
62
|
+
])
|
|
63
|
+
|
|
64
|
+
compare_generated_vs_original(generated, original, @index_page)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
|
68
|
+
src = <<-EOB
|
|
69
|
+
tables:
|
|
70
|
+
node: struct
|
|
71
|
+
path: "/html/body/table"
|
|
72
|
+
children:
|
|
73
|
+
- table:
|
|
74
|
+
node: struct
|
|
75
|
+
path: "./tr"
|
|
76
|
+
children:
|
|
77
|
+
- title:
|
|
78
|
+
node: text
|
|
79
|
+
path: "./td[1]"
|
|
80
|
+
- pub_date:
|
|
81
|
+
node: text
|
|
82
|
+
path: "./td[2]"
|
|
83
|
+
EOB
|
|
84
|
+
|
|
85
|
+
generated = Yasuri.yaml2tree(src)
|
|
86
|
+
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
|
87
|
+
Yasuri::StructNode.new('./tr', "table", [
|
|
88
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
|
89
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
|
90
|
+
])
|
|
91
|
+
])
|
|
92
|
+
page = @agent.get(@uri + "/struct/structual_text.html")
|
|
93
|
+
compare_generated_vs_original(generated, original, page)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
end # end of describe '.yaml2tree'
|
|
97
|
+
|
|
98
|
+
|
|
16
99
|
#############
|
|
17
100
|
# json2tree #
|
|
18
101
|
#############
|
|
@@ -39,10 +122,31 @@ describe 'Yasuri' do
|
|
|
39
122
|
"truncate" : "^[^,]+"
|
|
40
123
|
}|
|
|
41
124
|
generated = Yasuri.json2tree(src)
|
|
42
|
-
original = Yasuri::TextNode.new('/html/body/p[1]', "content",
|
|
125
|
+
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
|
43
126
|
compare_generated_vs_original(generated, original, @index_page)
|
|
44
127
|
end
|
|
45
128
|
|
|
129
|
+
it "return MapNode with TextNodes" do
|
|
130
|
+
src = %q| { "node" : "map",
|
|
131
|
+
"name" : "parent",
|
|
132
|
+
"children" : [
|
|
133
|
+
{ "node" : "text",
|
|
134
|
+
"name" : "content01",
|
|
135
|
+
"path" : "/html/body/p[1]"
|
|
136
|
+
},
|
|
137
|
+
{ "node" : "text",
|
|
138
|
+
"name" : "content02",
|
|
139
|
+
"path" : "/html/body/p[2]"
|
|
140
|
+
}
|
|
141
|
+
]
|
|
142
|
+
}|
|
|
143
|
+
generated = Yasuri.json2tree(src)
|
|
144
|
+
original = Yasuri::MapNode.new('parent', [
|
|
145
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
|
146
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
|
147
|
+
])
|
|
148
|
+
compare_generated_vs_original(generated, original, @index_page)
|
|
149
|
+
end
|
|
46
150
|
|
|
47
151
|
it "return LinksNode/TextNode" do
|
|
48
152
|
src = %q| { "node" : "links",
|
|
@@ -153,7 +257,7 @@ describe 'Yasuri' do
|
|
|
153
257
|
end
|
|
154
258
|
|
|
155
259
|
it "return text node with truncate_regexp" do
|
|
156
|
-
node = Yasuri::TextNode.new("/html/head/title", "title",
|
|
260
|
+
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
|
157
261
|
json = Yasuri.tree2json(node)
|
|
158
262
|
expected_str = %q| { "node": "text",
|
|
159
263
|
"name": "title",
|
|
@@ -165,6 +269,31 @@ describe 'Yasuri' do
|
|
|
165
269
|
expect(actual).to match expected
|
|
166
270
|
end
|
|
167
271
|
|
|
272
|
+
it "return map node with text nodes" do
|
|
273
|
+
tree = Yasuri::MapNode.new('parent', [
|
|
274
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
|
275
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
|
276
|
+
])
|
|
277
|
+
actual_json = Yasuri.tree2json(tree)
|
|
278
|
+
|
|
279
|
+
expected_json = %q| { "node" : "map",
|
|
280
|
+
"name" : "parent",
|
|
281
|
+
"children" : [
|
|
282
|
+
{ "node" : "text",
|
|
283
|
+
"name" : "content01",
|
|
284
|
+
"path" : "/html/body/p[1]"
|
|
285
|
+
},
|
|
286
|
+
{ "node" : "text",
|
|
287
|
+
"name" : "content02",
|
|
288
|
+
"path" : "/html/body/p[2]"
|
|
289
|
+
}
|
|
290
|
+
]
|
|
291
|
+
}|
|
|
292
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
|
293
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
|
294
|
+
expect(actual).to match expected
|
|
295
|
+
end
|
|
296
|
+
|
|
168
297
|
it "return LinksNode/TextNode" do
|
|
169
298
|
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
|
170
299
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
|
@@ -126,7 +126,7 @@ describe 'Yasuri' do
|
|
|
126
126
|
Yasuri::TextNode.new('./td[1]', "title"),
|
|
127
127
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
|
128
128
|
])
|
|
129
|
-
expected = @table_1996.map{|h|
|
|
129
|
+
expected = @table_1996.map{|h| h.map{|k,v| [k.to_sym, v] }.to_h }
|
|
130
130
|
actual = node.inject(@agent, @page, symbolize_names:true)
|
|
131
131
|
expect(actual).to match expected
|
|
132
132
|
end
|
data/yasuri.gemspec
CHANGED
|
@@ -18,8 +18,8 @@ Gem::Specification.new do |spec|
|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
|
19
19
|
spec.require_paths = ["lib"]
|
|
20
20
|
|
|
21
|
-
spec.add_development_dependency "bundler"
|
|
22
|
-
spec.add_development_dependency "rake"
|
|
21
|
+
spec.add_development_dependency "bundler"
|
|
22
|
+
spec.add_development_dependency "rake"
|
|
23
23
|
spec.add_development_dependency "rspec"
|
|
24
24
|
spec.add_development_dependency "fuubar"
|
|
25
25
|
spec.add_development_dependency "glint"
|
metadata
CHANGED
|
@@ -1,43 +1,43 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: yasuri
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.
|
|
4
|
+
version: 3.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- TAC
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-03-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - "
|
|
17
|
+
- - ">="
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '
|
|
19
|
+
version: '0'
|
|
20
20
|
type: :development
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - "
|
|
24
|
+
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '
|
|
26
|
+
version: '0'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: rake
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- - "
|
|
31
|
+
- - ">="
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '
|
|
33
|
+
version: '0'
|
|
34
34
|
type: :development
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- - "
|
|
38
|
+
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: '
|
|
40
|
+
version: '0'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: rspec
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -144,8 +144,10 @@ extensions: []
|
|
|
144
144
|
extra_rdoc_files: []
|
|
145
145
|
files:
|
|
146
146
|
- ".coveralls.yml"
|
|
147
|
+
- ".github/workflows/ruby.yml"
|
|
147
148
|
- ".gitignore"
|
|
148
149
|
- ".rspec"
|
|
150
|
+
- ".ruby-version"
|
|
149
151
|
- ".travis.yml"
|
|
150
152
|
- Gemfile
|
|
151
153
|
- LICENSE
|
|
@@ -158,6 +160,7 @@ files:
|
|
|
158
160
|
- lib/yasuri/version.rb
|
|
159
161
|
- lib/yasuri/yasuri.rb
|
|
160
162
|
- lib/yasuri/yasuri_links_node.rb
|
|
163
|
+
- lib/yasuri/yasuri_map_node.rb
|
|
161
164
|
- lib/yasuri/yasuri_node.rb
|
|
162
165
|
- lib/yasuri/yasuri_node_generator.rb
|
|
163
166
|
- lib/yasuri/yasuri_paginate_node.rb
|
|
@@ -179,6 +182,7 @@ files:
|
|
|
179
182
|
- spec/servers/httpserver.rb
|
|
180
183
|
- spec/spec_helper.rb
|
|
181
184
|
- spec/yasuri_links_node_spec.rb
|
|
185
|
+
- spec/yasuri_map_spec.rb
|
|
182
186
|
- spec/yasuri_node_spec.rb
|
|
183
187
|
- spec/yasuri_paginate_node_spec.rb
|
|
184
188
|
- spec/yasuri_spec.rb
|
|
@@ -189,7 +193,7 @@ homepage: https://github.com/tac0x2a/yasuri
|
|
|
189
193
|
licenses:
|
|
190
194
|
- MIT
|
|
191
195
|
metadata: {}
|
|
192
|
-
post_install_message:
|
|
196
|
+
post_install_message:
|
|
193
197
|
rdoc_options: []
|
|
194
198
|
require_paths:
|
|
195
199
|
- lib
|
|
@@ -204,9 +208,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
204
208
|
- !ruby/object:Gem::Version
|
|
205
209
|
version: '0'
|
|
206
210
|
requirements: []
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
signing_key:
|
|
211
|
+
rubygems_version: 3.2.3
|
|
212
|
+
signing_key:
|
|
210
213
|
specification_version: 4
|
|
211
214
|
summary: Yasuri is easy scraping library.
|
|
212
215
|
test_files:
|
|
@@ -226,6 +229,7 @@ test_files:
|
|
|
226
229
|
- spec/servers/httpserver.rb
|
|
227
230
|
- spec/spec_helper.rb
|
|
228
231
|
- spec/yasuri_links_node_spec.rb
|
|
232
|
+
- spec/yasuri_map_spec.rb
|
|
229
233
|
- spec/yasuri_node_spec.rb
|
|
230
234
|
- spec/yasuri_paginate_node_spec.rb
|
|
231
235
|
- spec/yasuri_spec.rb
|