yasuri 2.0.13 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/ruby.yml +35 -0
- data/.rubocop.yml +49 -0
- data/.rubocop_todo.yml +0 -0
- data/.ruby-version +1 -1
- data/README.md +82 -31
- data/Rakefile +1 -1
- data/USAGE.ja.md +366 -131
- data/USAGE.md +371 -136
- data/examples/example.rb +78 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/exe/yasuri +5 -0
- data/lib/yasuri.rb +1 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +96 -75
- data/lib/yasuri/yasuri_cli.rb +78 -0
- data/lib/yasuri/yasuri_links_node.rb +10 -6
- data/lib/yasuri/yasuri_map_node.rb +40 -0
- data/lib/yasuri/yasuri_node.rb +36 -4
- data/lib/yasuri/yasuri_node_generator.rb +17 -14
- data/lib/yasuri/yasuri_paginate_node.rb +26 -16
- data/lib/yasuri/yasuri_struct_node.rb +6 -4
- data/lib/yasuri/yasuri_text_node.rb +13 -8
- data/spec/cli_resources/tree.json +8 -0
- data/spec/cli_resources/tree.yml +5 -0
- data/spec/cli_resources/tree_wrong.json +9 -0
- data/spec/cli_resources/tree_wrong.yml +6 -0
- data/spec/servers/httpserver.rb +0 -2
- data/spec/spec_helper.rb +4 -11
- data/spec/yasuri_cli_spec.rb +114 -0
- data/spec/yasuri_links_node_spec.rb +92 -60
- data/spec/yasuri_map_spec.rb +71 -0
- data/spec/yasuri_paginate_node_spec.rb +99 -88
- data/spec/yasuri_spec.rb +196 -138
- data/spec/yasuri_struct_node_spec.rb +120 -100
- data/spec/yasuri_text_node_spec.rb +22 -32
- data/yasuri.gemspec +29 -22
- metadata +108 -19
- data/app.rb +0 -52
- data/spec/yasuri_node_spec.rb +0 -11
data/spec/yasuri_spec.rb
CHANGED
@@ -1,18 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
|
3
|
-
# Author:: TAC (tac@tac42.net)
|
4
|
-
|
5
1
|
require_relative 'spec_helper'
|
6
2
|
|
7
3
|
describe 'Yasuri' do
|
8
4
|
include_context 'httpserver'
|
9
5
|
|
10
|
-
before do
|
11
|
-
@agent = Mechanize.new
|
12
|
-
@uri = uri
|
13
|
-
@index_page = @agent.get(@uri)
|
14
|
-
end
|
15
|
-
|
16
6
|
############
|
17
7
|
# yam2tree #
|
18
8
|
############
|
@@ -23,64 +13,49 @@ describe 'Yasuri' do
|
|
23
13
|
|
24
14
|
it "return text node" do
|
25
15
|
src = <<-EOB
|
26
|
-
|
27
|
-
|
28
|
-
path: "/html/body/p[1]"
|
29
|
-
EOB
|
16
|
+
text_content: "/html/body/p[1]"
|
17
|
+
EOB
|
30
18
|
generated = Yasuri.yaml2tree(src)
|
31
19
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
32
20
|
|
33
|
-
compare_generated_vs_original(generated, original,
|
21
|
+
compare_generated_vs_original(generated, original, uri)
|
34
22
|
end
|
35
23
|
|
36
24
|
it "return text node as symbol" do
|
37
25
|
src = <<-EOB
|
38
|
-
:
|
39
|
-
|
40
|
-
|
41
|
-
EOB
|
26
|
+
:text_content:
|
27
|
+
:path: "/html/body/p[1]"
|
28
|
+
EOB
|
42
29
|
generated = Yasuri.yaml2tree(src)
|
43
30
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
44
31
|
|
45
|
-
compare_generated_vs_original(generated, original,
|
32
|
+
compare_generated_vs_original(generated, original, uri)
|
46
33
|
end
|
47
34
|
|
48
35
|
it "return LinksNode/TextNode" do
|
49
36
|
|
50
37
|
src = <<-EOB
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
- content:
|
56
|
-
node: text
|
57
|
-
path: "/html/body/p"
|
58
|
-
EOB
|
38
|
+
links_root:
|
39
|
+
path: "/html/body/a"
|
40
|
+
text_content: "/html/body/p"
|
41
|
+
EOB
|
59
42
|
generated = Yasuri.yaml2tree(src)
|
60
43
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
61
44
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
62
45
|
])
|
63
46
|
|
64
|
-
compare_generated_vs_original(generated, original,
|
47
|
+
compare_generated_vs_original(generated, original, uri)
|
65
48
|
end
|
66
49
|
|
67
50
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
68
51
|
src = <<-EOB
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
children:
|
77
|
-
- title:
|
78
|
-
node: text
|
79
|
-
path: "./td[1]"
|
80
|
-
- pub_date:
|
81
|
-
node: text
|
82
|
-
path: "./td[2]"
|
83
|
-
EOB
|
52
|
+
struct_tables:
|
53
|
+
path: "/html/body/table"
|
54
|
+
struct_table:
|
55
|
+
path: "./tr"
|
56
|
+
text_title: "./td[1]"
|
57
|
+
text_pub_date: "./td[2]"
|
58
|
+
EOB
|
84
59
|
|
85
60
|
generated = Yasuri.yaml2tree(src)
|
86
61
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
@@ -89,8 +64,8 @@ EOB
|
|
89
64
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
90
65
|
])
|
91
66
|
])
|
92
|
-
|
93
|
-
compare_generated_vs_original(generated, original,
|
67
|
+
test_uri = uri + "/struct/structual_text.html"
|
68
|
+
compare_generated_vs_original(generated, original, test_uri)
|
94
69
|
end
|
95
70
|
|
96
71
|
end # end of describe '.yaml2tree'
|
@@ -105,103 +80,107 @@ EOB
|
|
105
80
|
end
|
106
81
|
|
107
82
|
it "return TextNode" do
|
108
|
-
src = %q|
|
109
|
-
|
110
|
-
|
111
|
-
|
83
|
+
src = %q|
|
84
|
+
{
|
85
|
+
"text_content": "/html/body/p[1]"
|
86
|
+
}|
|
112
87
|
generated = Yasuri.json2tree(src)
|
113
88
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content")
|
114
89
|
|
115
|
-
compare_generated_vs_original(generated, original,
|
90
|
+
compare_generated_vs_original(generated, original, uri)
|
116
91
|
end
|
117
92
|
|
118
93
|
it "return TextNode with truncate_regexp" do
|
119
|
-
src = %q|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
94
|
+
src = %q|
|
95
|
+
{
|
96
|
+
"text_content": {
|
97
|
+
"path": "/html/body/p[1]",
|
98
|
+
"truncate" : "^[^,]+"
|
99
|
+
}
|
100
|
+
}|
|
124
101
|
generated = Yasuri.json2tree(src)
|
125
102
|
original = Yasuri::TextNode.new('/html/body/p[1]', "content", truncate:/^[^,]+/)
|
126
|
-
compare_generated_vs_original(generated, original,
|
103
|
+
compare_generated_vs_original(generated, original, uri)
|
127
104
|
end
|
128
105
|
|
106
|
+
it "return MapNode with TextNodes" do
|
107
|
+
src = %q|
|
108
|
+
{
|
109
|
+
"text_content01": "/html/body/p[1]",
|
110
|
+
"text_content02": "/html/body/p[2]"
|
111
|
+
}|
|
112
|
+
generated = Yasuri.json2tree(src)
|
113
|
+
original = Yasuri::MapNode.new('parent', [
|
114
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
115
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
116
|
+
])
|
117
|
+
compare_generated_vs_original(generated, original, uri)
|
118
|
+
end
|
129
119
|
|
130
120
|
it "return LinksNode/TextNode" do
|
131
|
-
src = %q|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
121
|
+
src = %q|
|
122
|
+
{
|
123
|
+
"links_root": {
|
124
|
+
"path": "/html/body/a",
|
125
|
+
"text_content": "/html/body/p"
|
126
|
+
}
|
127
|
+
}|
|
128
|
+
|
139
129
|
generated = Yasuri.json2tree(src)
|
140
130
|
original = Yasuri::LinksNode.new('/html/body/a', "root", [
|
141
131
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
142
132
|
])
|
143
133
|
|
144
|
-
compare_generated_vs_original(generated, original,
|
134
|
+
compare_generated_vs_original(generated, original, uri)
|
145
135
|
end
|
146
136
|
|
147
137
|
it "return PaginateNode/TextNode" do
|
148
|
-
src = %q|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
}|
|
138
|
+
src = %q|
|
139
|
+
{
|
140
|
+
"pages_root": {
|
141
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
142
|
+
"text_content": "/html/body/p"
|
143
|
+
}
|
144
|
+
}|
|
156
145
|
generated = Yasuri.json2tree(src)
|
157
146
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
158
147
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
159
148
|
])
|
160
149
|
|
161
|
-
|
162
|
-
|
163
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
150
|
+
test_uri = uri + "/pagination/page01.html"
|
151
|
+
compare_generated_vs_original(generated, original, test_uri)
|
164
152
|
end
|
165
153
|
|
166
154
|
it "return PaginateNode/TextNode with limit" do
|
167
|
-
src = %q|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
}|
|
155
|
+
src = %q|
|
156
|
+
{
|
157
|
+
"pages_root": {
|
158
|
+
"path": "/html/body/nav/span/a[@class=\'next\']",
|
159
|
+
"limit": 2,
|
160
|
+
"text_content": "/html/body/p"
|
161
|
+
}
|
162
|
+
}|
|
176
163
|
generated = Yasuri.json2tree(src)
|
177
164
|
original = Yasuri::PaginateNode.new("/html/body/nav/span/a[@class='next']", "root", [
|
178
165
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
179
166
|
], limit:2)
|
180
167
|
|
181
|
-
|
182
|
-
|
183
|
-
compare_generated_vs_original(generated, original, paginate_test_page)
|
168
|
+
test_uri = uri + "/pagination/page01.html"
|
169
|
+
compare_generated_vs_original(generated, original, test_uri)
|
184
170
|
end
|
185
171
|
|
186
172
|
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
},
|
199
|
-
{ "node" : "text",
|
200
|
-
"name" : "pub_date",
|
201
|
-
"path" : "./td[2]"
|
202
|
-
}]
|
203
|
-
}]
|
204
|
-
}|
|
173
|
+
src = %q|
|
174
|
+
{
|
175
|
+
"struct_tables": {
|
176
|
+
"path": "/html/body/table",
|
177
|
+
"struct_table": {
|
178
|
+
"path": "./tr",
|
179
|
+
"text_title": "./td[1]",
|
180
|
+
"text_pub_date": "./td[2]"
|
181
|
+
}
|
182
|
+
}
|
183
|
+
}|
|
205
184
|
generated = Yasuri.json2tree(src)
|
206
185
|
original = Yasuri::StructNode.new('/html/body/table', "tables", [
|
207
186
|
Yasuri::StructNode.new('./tr', "table", [
|
@@ -209,27 +188,27 @@ EOB
|
|
209
188
|
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
210
189
|
])
|
211
190
|
])
|
212
|
-
|
213
|
-
compare_generated_vs_original(generated, original,
|
191
|
+
test_uri = uri + "/struct/structual_text.html"
|
192
|
+
compare_generated_vs_original(generated, original, test_uri)
|
214
193
|
end
|
215
194
|
end
|
216
195
|
|
196
|
+
|
217
197
|
#############
|
218
198
|
# tree2json #
|
219
199
|
#############
|
220
200
|
describe '.tree2json' do
|
221
201
|
it "return empty json" do
|
222
|
-
|
223
|
-
expect(json).to match "{}"
|
202
|
+
expect { Yasuri.tree2json(nil) }.to raise_error(RuntimeError)
|
224
203
|
end
|
225
204
|
|
226
205
|
it "return text node" do
|
227
206
|
node = Yasuri::TextNode.new("/html/head/title", "title")
|
228
207
|
json = Yasuri.tree2json(node)
|
229
|
-
expected_str = %q|
|
230
|
-
|
231
|
-
|
232
|
-
|
208
|
+
expected_str = %q|
|
209
|
+
{
|
210
|
+
"text_title": "/html/head/title"
|
211
|
+
}|
|
233
212
|
expected = JSON.parse(expected_str)
|
234
213
|
actual = JSON.parse(json)
|
235
214
|
expect(actual).to match expected
|
@@ -238,29 +217,49 @@ EOB
|
|
238
217
|
it "return text node with truncate_regexp" do
|
239
218
|
node = Yasuri::TextNode.new("/html/head/title", "title", truncate:/^[^,]+/)
|
240
219
|
json = Yasuri.tree2json(node)
|
241
|
-
expected_str = %q|
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
220
|
+
expected_str = %q|
|
221
|
+
{
|
222
|
+
"text_title": {
|
223
|
+
"path": "/html/head/title",
|
224
|
+
"truncate": "^[^,]+"
|
225
|
+
}
|
226
|
+
}|
|
246
227
|
expected = Yasuri.tree2json(Yasuri.json2tree(expected_str))
|
247
228
|
actual = Yasuri.tree2json(Yasuri.json2tree(json))
|
248
229
|
expect(actual).to match expected
|
249
230
|
end
|
250
231
|
|
232
|
+
it "return map node with text nodes" do
|
233
|
+
tree = Yasuri::MapNode.new('parent', [
|
234
|
+
Yasuri::TextNode.new('/html/body/p[1]', "content01"),
|
235
|
+
Yasuri::TextNode.new('/html/body/p[2]', "content02"),
|
236
|
+
])
|
237
|
+
actual_json = Yasuri.tree2json(tree)
|
238
|
+
|
239
|
+
expected_json = %q|
|
240
|
+
{
|
241
|
+
"text_content01": "/html/body/p[1]",
|
242
|
+
"text_content02": "/html/body/p[2]"
|
243
|
+
}|
|
244
|
+
|
245
|
+
expected = Yasuri.tree2json(Yasuri.json2tree(expected_json))
|
246
|
+
actual = Yasuri.tree2json(Yasuri.json2tree(actual_json))
|
247
|
+
expect(actual).to match expected
|
248
|
+
end
|
249
|
+
|
251
250
|
it "return LinksNode/TextNode" do
|
252
251
|
tree = Yasuri::LinksNode.new('/html/body/a', "root", [
|
253
252
|
Yasuri::TextNode.new('/html/body/p', "content"),
|
254
253
|
])
|
255
254
|
json = Yasuri.tree2json(tree)
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
255
|
+
|
256
|
+
expected_src = %q|
|
257
|
+
{
|
258
|
+
"links_root": {
|
259
|
+
"path": "/html/body/a",
|
260
|
+
"text_content":"/html/body/p"
|
261
|
+
}
|
262
|
+
}|
|
264
263
|
expected = JSON.parse(expected_src)
|
265
264
|
actual = JSON.parse(json)
|
266
265
|
expect(actual).to match expected
|
@@ -272,25 +271,84 @@ EOB
|
|
272
271
|
], limit:10)
|
273
272
|
|
274
273
|
json = Yasuri.tree2json(tree)
|
275
|
-
expected_src = %q|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
}|
|
274
|
+
expected_src = %q|
|
275
|
+
{
|
276
|
+
"pages_root": {
|
277
|
+
"path": "/html/body/nav/span/a[@class='next']",
|
278
|
+
"limit": 10,
|
279
|
+
"flatten": false,
|
280
|
+
"text_content": "/html/body/p"
|
281
|
+
}
|
282
|
+
}|
|
285
283
|
expected = JSON.parse(expected_src)
|
286
284
|
actual = JSON.parse(json)
|
287
285
|
expect(actual).to match expected
|
288
286
|
end
|
287
|
+
end
|
288
|
+
|
289
|
+
describe '.with_retry' do
|
290
|
+
it "call once if success" do
|
291
|
+
actual = Yasuri.with_retry(0){ 42 }
|
292
|
+
expect(actual).to match 42
|
293
|
+
end
|
294
|
+
|
295
|
+
it "call untile success" do
|
296
|
+
i = [1,1,0,0]
|
297
|
+
actual = Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
298
|
+
expect(actual).to match 42/1
|
299
|
+
end
|
300
|
+
|
301
|
+
it "raise error when exceed retry count" do
|
302
|
+
i = [1,0,0,0]
|
303
|
+
expect {
|
304
|
+
Yasuri.with_retry(2){42 / i.pop } # do this 3 times
|
305
|
+
}.to raise_error(Exception)
|
306
|
+
end
|
307
|
+
|
308
|
+
it "wait interval before run" do
|
309
|
+
allow(Kernel).to receive(:sleep)
|
310
|
+
Yasuri.with_retry(0){ 42 }
|
311
|
+
expect(Kernel).to have_received(:sleep).once
|
312
|
+
end
|
289
313
|
|
314
|
+
it "wait interval before run" do
|
315
|
+
allow(Kernel).to receive(:sleep)
|
316
|
+
Yasuri.with_retry(0){ 42 }
|
317
|
+
expect(Kernel).to have_received(:sleep).once
|
318
|
+
end
|
290
319
|
|
320
|
+
it "wait interval for each runs" do
|
321
|
+
allow(Kernel).to receive(:sleep)
|
291
322
|
|
323
|
+
i = [1,1,0,0]
|
324
|
+
Yasuri.with_retry(2){42 / i.pop } # 3 times in max
|
325
|
+
expect(Kernel).to have_received(:sleep).exactly(3).times
|
326
|
+
end
|
292
327
|
end
|
293
328
|
|
329
|
+
it "return StructNode/StructNode/[TextNode,TextNode]" do
|
330
|
+
tree = Yasuri::StructNode.new('/html/body/table', "tables", [
|
331
|
+
Yasuri::StructNode.new('./tr', "table", [
|
332
|
+
Yasuri::TextNode.new('./td[1]', "title"),
|
333
|
+
Yasuri::TextNode.new('./td[2]', "pub_date"),
|
334
|
+
])
|
335
|
+
])
|
336
|
+
json = Yasuri.tree2json(tree)
|
337
|
+
expected_src = %q|
|
338
|
+
{
|
339
|
+
"struct_tables": {
|
340
|
+
"path": "/html/body/table",
|
341
|
+
"struct_table": {
|
342
|
+
"path": "./tr",
|
343
|
+
"text_title": "./td[1]",
|
344
|
+
"text_pub_date": "./td[2]"
|
345
|
+
}
|
346
|
+
}
|
347
|
+
}|
|
348
|
+
expected = JSON.parse(expected_src)
|
349
|
+
actual = JSON.parse(json)
|
350
|
+
expect(actual).to match expected
|
351
|
+
end
|
294
352
|
|
295
353
|
it 'has a version number' do
|
296
354
|
expect(Yasuri::VERSION).not_to be nil
|