yasuri 3.2.0 → 3.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -7
- data/USAGE.ja.md +107 -86
- data/USAGE.md +106 -87
- data/examples/example.rb +79 -0
- data/examples/github.yml +15 -0
- data/examples/sample.json +4 -0
- data/examples/sample.yml +11 -0
- data/lib/yasuri/version.rb +1 -1
- data/lib/yasuri/yasuri.rb +6 -2
- data/lib/yasuri/yasuri_cli.rb +6 -6
- data/lib/yasuri/yasuri_links_node.rb +3 -1
- data/lib/yasuri/yasuri_map_node.rb +1 -0
- data/lib/yasuri/yasuri_node.rb +14 -0
- data/lib/yasuri/yasuri_paginate_node.rb +2 -1
- data/spec/spec_helper.rb +3 -3
- data/spec/yasuri_cli_spec.rb +17 -4
- data/spec/yasuri_links_node_spec.rb +24 -10
- data/spec/yasuri_map_spec.rb +4 -5
- data/spec/yasuri_paginate_node_spec.rb +22 -10
- data/spec/yasuri_spec.rb +55 -19
- data/spec/yasuri_struct_node_spec.rb +13 -17
- data/spec/yasuri_text_node_spec.rb +11 -12
- metadata +6 -3
- data/app.rb +0 -52
data/USAGE.md
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
# Yasuri
|
2
2
|
|
3
3
|
## What is Yasuri
|
4
|
-
`Yasuri` (鑢) is a library for declarative web scraping and a command line tool for scraping with it.
|
4
|
+
`Yasuri` (鑢) is a library for declarative web scraping and a command line tool for scraping with it.
|
5
|
+
|
6
|
+
It performs scraping by simply describing the expected result in a simple declarative notation.
|
5
7
|
|
6
8
|
Yasuri makes it easy to write common scraping operations.
|
7
9
|
For example, the following processes can be easily implemented.
|
@@ -11,7 +13,6 @@ For example, the following processes can be easily implemented.
|
|
11
13
|
+ Scrape each table that appears repeatedly in the page and get the result as an array
|
12
14
|
+ Scrape only the first three pages of each page provided by pagination
|
13
15
|
|
14
|
-
|
15
16
|
## Quick Start
|
16
17
|
|
17
18
|
|
@@ -36,10 +37,7 @@ root = Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
|
|
36
37
|
text_content '//*[@id="contents"]/p[1]'
|
37
38
|
end
|
38
39
|
|
39
|
-
|
40
|
-
root_page = agent.get("http://some.scraping.page.tac42.net/")
|
41
|
-
|
42
|
-
result = root.inject(agent, root_page)
|
40
|
+
result = root.scrape("http://some.scraping.page.tac42.net/")
|
43
41
|
# => [
|
44
42
|
# {"title" => "PageTitle 01", "content" => "Page Contents 01" },
|
45
43
|
# {"title" => "PageTitle 02", "content" => "Page Contents 02" },
|
@@ -171,7 +169,51 @@ In json or yaml format, an attribute can directly specify `path` as a value if it
|
|
171
169
|
}
|
172
170
|
}
|
173
171
|
```
|
172
|
+
### Run ParseTree
|
173
|
+
Call the `Node#scrape(uri, opt={})` method on the root node of the parse tree.
|
174
|
+
|
175
|
+
**Example**
|
176
|
+
```ruby
|
177
|
+
root = Yasuri.links_root '//*[@id="menu"]/ul/li/a' do
|
178
|
+
text_title '//*[@id="contents"]/h2'
|
179
|
+
text_content '//*[@id="contents"]/p[1]'
|
180
|
+
end
|
181
|
+
|
182
|
+
result = root.scrape("http://some.scraping.page.tac42.net/", interval_ms: 1000)
|
183
|
+
```
|
184
|
+
|
185
|
+
+ `uri` is the URI of the page to be scraped.
|
186
|
+
+ `opt` is a Hash of options. The following options are available.
|
187
|
+
|
188
|
+
Yasuri uses `Mechanize` internally as an agent to do scraping.
|
189
|
+
If you want to specify this instance, call `Node#scrape_with_agent(uri, agent, opt={})`.
|
190
|
+
|
191
|
+
```ruby
|
192
|
+
require 'logger'
|
193
|
+
|
194
|
+
agent = Mechanize.new
|
195
|
+
agent.log = Logger.new $stderr
|
196
|
+
agent.request_headers = {
|
197
|
+
# ...
|
198
|
+
}
|
199
|
+
|
200
|
+
result = root.scrape_with_agent(
|
201
|
+
"http://some.scraping.page.tac42.net/",
|
202
|
+
agent,
|
203
|
+
interval_ms: 1000)
|
204
|
+
```
|
205
|
+
|
206
|
+
### `opt`
|
207
|
+
#### `interval_ms`
|
208
|
+
Interval [milliseconds] for requesting multiple pages.
|
209
|
+
|
210
|
+
If omitted, requests will be made continuously without an interval, but if requests to many pages are expected, it is strongly recommended to specify an interval time to avoid high load on the target host.
|
211
|
+
|
212
|
+
#### `retry_count`
|
213
|
+
Number of retries when page acquisition fails. If omitted, it will retry 5 times.
|
174
214
|
|
215
|
+
#### `symbolize_names`
|
216
|
+
If true, returns the keys of the result set as symbols.
|
175
217
|
|
176
218
|
--------------------------
|
177
219
|
## Node
|
@@ -216,7 +258,7 @@ TextNode return scraped text. This node have to be leaf.
|
|
216
258
|
### Example
|
217
259
|
|
218
260
|
```html
|
219
|
-
<!-- http://yasuri.example.net -->
|
261
|
+
<!-- http://yasuri.example.tac42.net -->
|
220
262
|
<html>
|
221
263
|
<head></head>
|
222
264
|
<body>
|
@@ -227,16 +269,13 @@ TextNode return scraped text. This node have to be leaf.
|
|
227
269
|
```
|
228
270
|
|
229
271
|
```ruby
|
230
|
-
agent = Mechanize.new
|
231
|
-
page = agent.get("http://yasuri.example.net")
|
232
|
-
|
233
272
|
p1 = Yasuri.text_title '/html/body/p[1]'
|
234
273
|
p1t = Yasuri.text_title '/html/body/p[1]', truncate:/^[^,]+/
|
235
274
|
p2u = Yasuri.text_title '/html/body/p[1]', proc: :upcase
|
236
275
|
|
237
|
-
p1.
|
238
|
-
p1t.
|
239
|
-
p2u.
|
276
|
+
p1.scrape("http://yasuri.example.tac42.net") #=> "Hello,World"
|
277
|
+
p1t.scrape("http://yasuri.example.tac42.net") #=> "Hello"
|
278
|
+
p2u.scrape("http://yasuri.example.tac42.net") #=> "HELLO,WORLD"
|
240
279
|
```
|
241
280
|
|
242
281
|
Note that if you want to scrape multiple elements in the same page at once, use `MapNode`. See the `MapNode` example for details.
|
@@ -247,7 +286,7 @@ Match to regexp, and truncate text. When you use group, it will return first mat
|
|
247
286
|
|
248
287
|
```ruby
|
249
288
|
node = Yasuri.text_example '/html/body/p[1]', truncate:/H(.+)i/
|
250
|
-
node.
|
289
|
+
node.scrape(uri)
|
251
290
|
#=> { "example" => "ello,Yasur" }
|
252
291
|
```
|
253
292
|
|
@@ -258,21 +297,22 @@ If it is given `truncate` option, apply method after truncated.
|
|
258
297
|
|
259
298
|
```ruby
|
260
299
|
node = Yasuri.text_example '/html/body/p[1]', proc: :upcase, truncate:/H(.+)i/
|
261
|
-
node.
|
300
|
+
node.scrape(uri)
|
262
301
|
#=> { "example" => "ELLO,YASUR" }
|
263
302
|
```
|
264
303
|
|
265
304
|
## Struct Node
|
266
305
|
Struct Node returns structured text.
|
267
306
|
|
268
|
-
At first, Struct Node narrow down sub-tags by `Path`.
|
307
|
+
At first, Struct Node narrow down sub-tags by `Path`.
|
308
|
+
Child nodes parse narrowed tags, and struct node returns hash contains parsed result.
|
269
309
|
|
270
310
|
If Struct Node `Path` matches multi sub-tags, child nodes parse each sub-tags and struct node returns array.
|
271
311
|
|
272
312
|
### Example
|
273
313
|
|
274
314
|
```html
|
275
|
-
<!-- http://yasuri.example.net -->
|
315
|
+
<!-- http://yasuri.example.tac42.net -->
|
276
316
|
<html>
|
277
317
|
<head>
|
278
318
|
<title>Books</title>
|
@@ -313,15 +353,12 @@ If Struct Node `Path` matches multi sub-tags, child nodes parse each sub-tags an
|
|
313
353
|
```
|
314
354
|
|
315
355
|
```ruby
|
316
|
-
agent = Mechanize.new
|
317
|
-
page = agent.get("http://yasuri.example.net")
|
318
|
-
|
319
356
|
node = Yasuri.struct_table '/html/body/table[1]/tr' do
|
320
357
|
text_title './td[1]'
|
321
358
|
text_pub_date './td[2]'
|
322
|
-
|
359
|
+
end
|
323
360
|
|
324
|
-
node.
|
361
|
+
node.scrape("http://yasuri.example.tac42.net")
|
325
362
|
#=> [ { "title" => "The Perfect Insider",
|
326
363
|
# "pub_date" => "1996/4/5" },
|
327
364
|
# { "title" => "Doctors in Isolated Room",
|
@@ -340,17 +377,14 @@ Struct node can contain not only Text node.
|
|
340
377
|
### Example
|
341
378
|
|
342
379
|
```ruby
|
343
|
-
agent = Mechanize.new
|
344
|
-
page = agent.get("http://yasuri.example.net")
|
345
|
-
|
346
380
|
node = Yasuri.strucre_tables '/html/body/table' do
|
347
381
|
struct_table './tr' do
|
348
382
|
text_title './td[1]'
|
349
383
|
text_pub_date './td[2]'
|
350
384
|
end
|
351
|
-
|
385
|
+
end
|
352
386
|
|
353
|
-
node.
|
387
|
+
node.scrape("http://yasuri.example.tac42.net")
|
354
388
|
|
355
389
|
#=> [ { "table" => [ { "title" => "The Perfect Insider",
|
356
390
|
# "pub_date" => "1996/4/5" },
|
@@ -383,7 +417,7 @@ Links Node returns parsed text in each linked pages.
|
|
383
417
|
|
384
418
|
### Example
|
385
419
|
```html
|
386
|
-
<!-- http://yasuri.example.net -->
|
420
|
+
<!-- http://yasuri.example.tac42.net -->
|
387
421
|
<html>
|
388
422
|
<head><title>Yasuri Test</title></head>
|
389
423
|
<body>
|
@@ -396,7 +430,7 @@ Links Node returns parsed text in each linked pages.
|
|
396
430
|
```
|
397
431
|
|
398
432
|
```html
|
399
|
-
<!-- http://yasuri.example.net/child01.html -->
|
433
|
+
<!-- http://yasuri.example.tac42.net/child01.html -->
|
400
434
|
<html>
|
401
435
|
<head><title>Child 01 Test</title></head>
|
402
436
|
<body>
|
@@ -410,7 +444,7 @@ Links Node returns parsed text in each linked pages.
|
|
410
444
|
```
|
411
445
|
|
412
446
|
```html
|
413
|
-
<!-- http://yasuri.example.net/child02.html -->
|
447
|
+
<!-- http://yasuri.example.tac42.net/child02.html -->
|
414
448
|
<html>
|
415
449
|
<head><title>Child 02 Test</title></head>
|
416
450
|
<body>
|
@@ -420,7 +454,7 @@ Links Node returns parsed text in each linked pages.
|
|
420
454
|
```
|
421
455
|
|
422
456
|
```html
|
423
|
-
<!-- http://yasuri.example.net/child03.html -->
|
457
|
+
<!-- http://yasuri.example.tac42.net/child03.html -->
|
424
458
|
<html>
|
425
459
|
<head><title>Child 03 Test</title></head>
|
426
460
|
<body>
|
@@ -433,20 +467,17 @@ Links Node returns parsed text in each linked pages.
|
|
433
467
|
```
|
434
468
|
|
435
469
|
```ruby
|
436
|
-
agent = Mechanize.new
|
437
|
-
page = agent.get("http://yasuri.example.net")
|
438
|
-
|
439
470
|
node = Yasuri.links_title '/html/body/a' do
|
440
471
|
text_content '/html/body/p'
|
441
472
|
end
|
442
473
|
|
443
|
-
node.
|
474
|
+
node.scrape("http://yasuri.example.tac42.net")
|
444
475
|
#=> [ {"content" => "Child 01 page."},
|
445
476
|
{"content" => "Child 02 page."},
|
446
477
|
{"content" => "Child 03 page."}]
|
447
478
|
```
|
448
479
|
|
449
|
-
At first, Links Node find all links in the page by path. In this case, LinksNode find `/html/body/a` tags in `http://yasuri.example.net`. Then, open href attributes (`./child01.html`, `./child02.html` and `./child03.html`).
|
480
|
+
At first, Links Node finds all links in the page by path. In this case, LinksNode finds `/html/body/a` tags in `http://yasuri.example.tac42.net`. Then, it opens the href attributes (`./child01.html`, `./child02.html` and `./child03.html`).
|
450
481
|
|
451
482
|
Then, Links Node applies its child nodes to each opened page, and returns the results as an array.
|
452
483
|
|
@@ -460,7 +491,7 @@ Paginate Node parses and returns each page provided by pagination.
|
|
460
491
|
Target page `page01.html` is like this. `page02.html` to `page04.html` are similar.
|
461
492
|
|
462
493
|
```html
|
463
|
-
<!-- http://yasuri.example.net/page01.html -->
|
494
|
+
<!-- http://yasuri.example.tac42.net/page01.html -->
|
464
495
|
<html>
|
465
496
|
<head><title>Page01</title></head>
|
466
497
|
<body>
|
@@ -480,21 +511,17 @@ Target page `page01.html` is like this. `page02.html` to `page04.html` are simil
|
|
480
511
|
```
|
481
512
|
|
482
513
|
```ruby
|
483
|
-
|
484
|
-
page = agent.get("http://yasuri.example.net/page01.html")
|
485
|
-
|
486
|
-
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" do
|
514
|
+
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , limit:3 do
|
487
515
|
text_content '/html/body/p'
|
488
516
|
end
|
489
517
|
|
490
|
-
node.
|
491
|
-
#=> [ {"content" => "
|
492
|
-
|
493
|
-
|
494
|
-
{"content" => "Pagination04"}]
|
518
|
+
node.scrape("http://yasuri.example.tac42.net/page01.html")
|
519
|
+
#=> [ {"content" => "Patination01"},
|
520
|
+
# {"content" => "Patination02"},
|
521
|
+
# {"content" => "Patination03"}]
|
495
522
|
```
|
496
|
-
|
497
|
-
|
523
|
+
Paginate Node requires a link to the next page.
|
524
|
+
In this case, it is `NextPage` `/html/body/nav/span/a[@class='next']`.
|
498
525
|
|
499
526
|
### Options
|
500
527
|
##### `limit`
|
@@ -504,7 +531,7 @@ Upper limit of open pages in pagination.
|
|
504
531
|
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , limit:2 do
|
505
532
|
text_content '/html/body/p'
|
506
533
|
end
|
507
|
-
node.
|
534
|
+
node.scrape(uri)
|
508
535
|
#=> [ {"content" => "Pagination01"}, {"content" => "Pagination02"}]
|
509
536
|
```
|
510
537
|
Paginate Node open upto 2 given by `limit`. In this situation, pagination has 4 pages, but result Array has 2 texts because given `limit:2`.
|
@@ -513,35 +540,32 @@ Paginate Node open upto 2 given by `limit`. In this situation, pagination has 4
|
|
513
540
|
`flatten` option expands each page results.
|
514
541
|
|
515
542
|
```ruby
|
516
|
-
agent = Mechanize.new
|
517
|
-
page = agent.get("http://yasuri.example.net/page01.html")
|
518
|
-
|
519
543
|
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
520
544
|
text_title '/html/head/title'
|
521
545
|
text_content '/html/body/p'
|
522
546
|
end
|
523
|
-
node.
|
547
|
+
node.scrape("http://yasuri.example.tac42.net/page01.html")
|
524
548
|
|
525
549
|
#=> [ {"title" => "Page01",
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
550
|
+
# "content" => "Patination01"},
|
551
|
+
# {"title" => "Page01",
|
552
|
+
# "content" => "Patination02"},
|
553
|
+
# {"title" => "Page01",
|
554
|
+
# "content" => "Patination03"}]
|
531
555
|
|
532
556
|
|
533
557
|
node = Yasuri.pages_root "/html/body/nav/span/a[@class='next']" , flatten:true do
|
534
558
|
text_title '/html/head/title'
|
535
559
|
text_content '/html/body/p'
|
536
560
|
end
|
537
|
-
node.
|
561
|
+
node.scrape("http://yasuri.example.tac42.net/page01.html")
|
538
562
|
|
539
563
|
#=> [ "Page01",
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
564
|
+
# "Patination01",
|
565
|
+
# "Page02",
|
566
|
+
# "Patination02",
|
567
|
+
# "Page03",
|
568
|
+
# "Patination03"]
|
545
569
|
```
|
546
570
|
|
547
571
|
## Map Node
|
@@ -550,7 +574,7 @@ node.inject(agent, page)
|
|
550
574
|
### Example
|
551
575
|
|
552
576
|
```html
|
553
|
-
<!-- http://yasuri.example.net -->
|
577
|
+
<!-- http://yasuri.example.tac42.net -->
|
554
578
|
<html>
|
555
579
|
<head><title>Yasuri Example</title></head>
|
556
580
|
<body>
|
@@ -561,16 +585,12 @@ node.inject(agent, page)
|
|
561
585
|
```
|
562
586
|
|
563
587
|
```ruby
|
564
|
-
agent = Mechanize.new
|
565
|
-
page = agent.get("http://yasuri.example.net")
|
566
|
-
|
567
|
-
|
568
588
|
tree = Yasuri.map_root do
|
569
589
|
text_title '/html/head/title'
|
570
590
|
text_body_p '/html/body/p[1]'
|
571
591
|
end
|
572
592
|
|
573
|
-
tree.
|
593
|
+
tree.scrape("http://yasuri.example.tac42.net") #=> { "title" => "Yasuri Example", "body_p" => "Hello,World" }
|
574
594
|
|
575
595
|
|
576
596
|
tree = Yasuri.map_root do
|
@@ -581,7 +601,7 @@ tree = Yasuri.map_root do
|
|
581
601
|
end
|
582
602
|
end
|
583
603
|
|
584
|
-
tree.
|
604
|
+
tree.scrape("http://yasuri.example.tac42.net") #=> {
|
585
605
|
# "group1" => {
|
586
606
|
# "child01" => "child01"
|
587
607
|
# },
|
@@ -596,18 +616,15 @@ tree.inject(agent, page) #=> {
|
|
596
616
|
None.
|
597
617
|
|
598
618
|
|
599
|
-
|
600
|
-
|
601
619
|
-------------------------
|
602
620
|
## Usage
|
603
621
|
|
604
|
-
|
622
|
+
### Use as library
|
605
623
|
When used as a library, the tree can be defined in DSL, json, or yaml format.
|
624
|
+
|
606
625
|
```ruby
|
607
|
-
require 'mechanize'
|
608
626
|
require 'yasuri'
|
609
627
|
|
610
|
-
|
611
628
|
# 1. Create a parse tree.
|
612
629
|
# Define by Ruby's DSL
|
613
630
|
tree = Yasuri.links_title '/html/body/a' do
|
@@ -634,17 +651,11 @@ links_title:
|
|
634
651
|
EOYAML
|
635
652
|
tree = Yasuri.yaml2tree(src)
|
636
653
|
|
637
|
-
|
638
|
-
|
639
|
-
# 2. Give the Mechanize agent and the target page to start parsing
|
640
|
-
agent = Mechanize.new
|
641
|
-
page = agent.get(uri)
|
642
|
-
|
643
|
-
|
644
|
-
tree.inject(agent, page)
|
654
|
+
# 2. Give the URL to start parsing
|
655
|
+
tree.inject(uri)
|
645
656
|
```
|
646
657
|
|
647
|
-
|
658
|
+
### Use as CLI tool
|
648
659
|
|
649
660
|
**Help**
|
650
661
|
```sh
|
@@ -655,13 +666,14 @@ Usage:
|
|
655
666
|
Options:
|
656
667
|
f, [--file=FILE] # path to file that written yasuri tree as json or yaml
|
657
668
|
j, [--json=JSON] # yasuri tree format json string
|
669
|
+
i, [--interval=N] # interval each request [ms]
|
658
670
|
|
659
671
|
Getting from <URI> and scrape it. with <JSON> or json/yml from <TREE_FILE>. They should be Yasuri's format json or yaml string.
|
660
672
|
```
|
661
673
|
|
662
674
|
In the CLI tool, you can specify the parse tree in either of the following ways.
|
663
|
-
+ `--file`, `-f`
|
664
|
-
+ `--json`, `-j`
|
675
|
+
+ `--file`, `-f` : option to read the parse tree in json or yaml format output to a file.
|
676
|
+
+ `--json`, `-j` : option to specify the parse tree directly as a string.
|
665
677
|
|
666
678
|
|
667
679
|
**Example of specifying a parse tree as a file**
|
@@ -695,3 +707,10 @@ $ yasuri scrape "https://www.ruby-lang.org/en/" -j '
|
|
695
707
|
|
696
708
|
{"title":"Ruby Programming Language","desc":"\n A dynamic, open source programming language with a focus on\n simplicity and productivity. It has an elegant syntax that is\n natural to read and easy to write.\n "}
|
697
709
|
```
|
710
|
+
|
711
|
+
#### Other options
|
712
|
+
+ `--interval`, `-i` : The interval [milliseconds] for requesting multiple pages.
|
713
|
+
**Example: Request at 1 second intervals**
|
714
|
+
```sh
|
715
|
+
$ yasuri scrape "https://www.ruby-lang.org/en/" --file sample.yml --interval 1000
|
716
|
+
```
|
data/examples/example.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Author:: TAC (tac@tac42.net)
|
3
|
+
|
4
|
+
require 'yasuri'
|
5
|
+
uri = "https://github.com/tac0x2a?tab=repositories"
|
6
|
+
|
7
|
+
# Node tree constructing by DSL
|
8
|
+
root = Yasuri.map_root do
|
9
|
+
text_title '/html/head/title'
|
10
|
+
links_repo '//*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a' do
|
11
|
+
text_name '//*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a'
|
12
|
+
text_desc '//*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p', proc: :strip
|
13
|
+
text_stars '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]', proc: :to_i
|
14
|
+
text_forks '//*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span', proc: :to_i
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# Node tree constructing by YAML
|
19
|
+
# src = <<-EOYML
|
20
|
+
# text_title: /html/head/title
|
21
|
+
# links_repo:
|
22
|
+
# path: //*[@id="user-repositories-list"]/ul/li/div[1]/div[1]/h3/a
|
23
|
+
# text_name: //*[@id="js-repo-pjax-container"]/div[1]/div[1]/div/h1/strong/a
|
24
|
+
# text_desc:
|
25
|
+
# path: //*[@id="repo-content-pjax-container"]/div/div[2]/div[2]/div/div[1]/div/p
|
26
|
+
# proc: :strip
|
27
|
+
# text_stars:
|
28
|
+
# path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[1]
|
29
|
+
# proc: :to_i
|
30
|
+
# text_forks:
|
31
|
+
# path: //*[@id="js-repo-pjax-container"]/div[1]/div[2]/div[2]/a[2]/span
|
32
|
+
# proc: :to_i
|
33
|
+
# EOYML
|
34
|
+
# root = Yasuri.yaml2tree(src)
|
35
|
+
|
36
|
+
contents = root.scrape(uri, interval_ms: 100)
|
37
|
+
# jj contents
|
38
|
+
# {
|
39
|
+
# "title": "tac0x2a (TAC) / Repositories · GitHub",
|
40
|
+
# "repo": [
|
41
|
+
# {
|
42
|
+
# "name": "o-namazu",
|
43
|
+
# "desc": "Oh Namazu (Catfish) in datalake",
|
44
|
+
# "stars": 1,
|
45
|
+
# "forks": 0
|
46
|
+
# },
|
47
|
+
# {
|
48
|
+
# "name": "grebe",
|
49
|
+
# "desc": "grebe in datalake",
|
50
|
+
# "stars": 2,
|
51
|
+
# "forks": 0
|
52
|
+
# },
|
53
|
+
# {
|
54
|
+
# "name": "yasuri",
|
55
|
+
# "desc": "Yasuri (鑢) is easy web scraping library.",
|
56
|
+
# "stars": 43,
|
57
|
+
# "forks": 1
|
58
|
+
# },
|
59
|
+
# {
|
60
|
+
# "name": "dotfiles",
|
61
|
+
# "desc": "dotfiles",
|
62
|
+
# "stars": 0,
|
63
|
+
# "forks": 0
|
64
|
+
# }
|
65
|
+
# ...
|
66
|
+
# ]
|
67
|
+
# }
|
68
|
+
|
69
|
+
# Output as markdown
|
70
|
+
puts "# #{contents['title']}"
|
71
|
+
contents['repo'].each do |h|
|
72
|
+
puts "-----"
|
73
|
+
puts "## #{h['name']}"
|
74
|
+
puts h['desc']
|
75
|
+
puts ""
|
76
|
+
puts "* Stars: #{h['stars']}"
|
77
|
+
puts "* Forks: #{h['forks']}"
|
78
|
+
puts ""
|
79
|
+
end
|