nhkore 0.3.13 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -1
- data/Gemfile.lock +41 -41
- data/README.md +4 -5
- data/Rakefile +5 -3
- data/lib/nhkore/app.rb +8 -19
- data/lib/nhkore/article.rb +5 -9
- data/lib/nhkore/article_scraper.rb +15 -14
- data/lib/nhkore/cleaner.rb +0 -12
- data/lib/nhkore/cli/fx_cmd.rb +0 -4
- data/lib/nhkore/cli/get_cmd.rb +0 -4
- data/lib/nhkore/cli/news_cmd.rb +29 -17
- data/lib/nhkore/cli/search_cmd.rb +45 -35
- data/lib/nhkore/cli/sift_cmd.rb +1 -5
- data/lib/nhkore/datetime_parser.rb +1 -5
- data/lib/nhkore/defn.rb +1 -5
- data/lib/nhkore/dict.rb +2 -5
- data/lib/nhkore/dict_scraper.rb +2 -6
- data/lib/nhkore/entry.rb +3 -9
- data/lib/nhkore/error.rb +1 -11
- data/lib/nhkore/fileable.rb +0 -4
- data/lib/nhkore/lib.rb +0 -3
- data/lib/nhkore/missingno.rb +2 -6
- data/lib/nhkore/news.rb +3 -15
- data/lib/nhkore/polisher.rb +0 -12
- data/lib/nhkore/scraper.rb +8 -5
- data/lib/nhkore/search_link.rb +9 -17
- data/lib/nhkore/search_scraper.rb +34 -24
- data/lib/nhkore/sifter.rb +7 -8
- data/lib/nhkore/splitter.rb +0 -18
- data/lib/nhkore/user_agents.rb +1 -4
- data/lib/nhkore/util.rb +0 -4
- data/lib/nhkore/variator.rb +0 -14
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +0 -4
- data/lib/nhkore.rb +0 -5
- data/nhkore.gemspec +40 -37
- data/samples/looper.rb +0 -3
- metadata +24 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f23192b04fc6a0c1cf225db4a029e3226346d27c4fe977ee05f1b522c40708bb
|
4
|
+
data.tar.gz: 3e71d5ef9eb60327cb9ced89a819de77b5cd8910e755332d5764dc95e8263f71
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2cec06bd51e86ddd7b30a052b4078a2932344996bd6d97c4b7d927d0bc8fbcecbd2fb21680957551857d3a152d33409aa7ee8e1dd496a271cb31c63cbe2eb2e9
|
7
|
+
data.tar.gz: 186c42412e35567aebe2afee71636fb58a28510bd6ab3e8b65df6adef5373860f25e87be71e5d8499dcf6b6babb0ee3aae796a01f818732bc53ad682e584e701
|
data/CHANGELOG.md
CHANGED
@@ -5,10 +5,26 @@ All notable changes to this project will be documented in this file.
|
|
5
5
|
Format is based on [Keep a Changelog v1.0.0](https://keepachangelog.com/en/1.0.0),
|
6
6
|
and this project adheres to [Semantic Versioning v2.0.0](https://semver.org/spec/v2.0.0.html).
|
7
7
|
|
8
|
-
## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.13...HEAD)
|
8
|
+
## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.16...HEAD)
|
9
9
|
-
|
10
10
|
|
11
11
|
|
12
|
+
## [v0.3.16] - 2024-08-14
|
13
|
+
|
14
|
+
### Fixed
|
15
|
+
- Fixed to work with new NHK pages.
|
16
|
+
- Updated gems.
|
17
|
+
|
18
|
+
|
19
|
+
## [v0.3.14] - 2022-07-24
|
20
|
+
|
21
|
+
### Added
|
22
|
+
- `--loop` option to `search` command so that the web search (search engine) can be run multiple times, since a single run usually doesn't return all of the results.
|
23
|
+
|
24
|
+
### Fixed
|
25
|
+
- Updated gems (`nokogiri`).
|
26
|
+
|
27
|
+
|
12
28
|
## [v0.3.13] - 2022-04-27
|
13
29
|
|
14
30
|
### Fixed
|
data/Gemfile.lock
CHANGED
@@ -1,19 +1,19 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
nhkore (0.3.13)
|
4
|
+
nhkore (0.3.16)
|
5
5
|
attr_bool (~> 0.2)
|
6
6
|
bimyou_segmenter (~> 1.2)
|
7
7
|
cri (~> 2.15)
|
8
|
-
down (~> 5.
|
9
|
-
highline (~>
|
8
|
+
down (~> 5.4)
|
9
|
+
highline (~> 3.1)
|
10
10
|
http-cookie (~> 1.0)
|
11
11
|
japanese_deinflector (~> 0.0)
|
12
|
-
nokogiri (~> 1.
|
12
|
+
nokogiri (~> 1.16)
|
13
13
|
psychgus (~> 1.3)
|
14
|
-
public_suffix (~>
|
14
|
+
public_suffix (~> 6.0)
|
15
15
|
rainbow (~> 3.1)
|
16
|
-
rss (~> 0.
|
16
|
+
rss (~> 0.3)
|
17
17
|
rubyzip (~> 2.3)
|
18
18
|
tiny_segmenter (~> 0.0)
|
19
19
|
tty-progressbar (~> 0.18)
|
@@ -22,43 +22,48 @@ PATH
|
|
22
22
|
GEM
|
23
23
|
remote: https://rubygems.org/
|
24
24
|
specs:
|
25
|
-
addressable (2.8.
|
26
|
-
public_suffix (>= 2.0.2, <
|
25
|
+
addressable (2.8.7)
|
26
|
+
public_suffix (>= 2.0.2, < 7.0)
|
27
27
|
attr_bool (0.2.2)
|
28
28
|
bimyou_segmenter (1.2.0)
|
29
|
-
cri (2.15.
|
30
|
-
domain_name (0.
|
31
|
-
|
32
|
-
down (5.3.1)
|
29
|
+
cri (2.15.12)
|
30
|
+
domain_name (0.6.20240107)
|
31
|
+
down (5.4.2)
|
33
32
|
addressable (~> 2.8)
|
34
|
-
highline (
|
35
|
-
|
33
|
+
highline (3.1.0)
|
34
|
+
reline
|
35
|
+
http-cookie (1.0.6)
|
36
36
|
domain_name (~> 0.5)
|
37
|
+
io-console (0.7.2)
|
37
38
|
japanese_deinflector (0.0.2)
|
38
|
-
mini_portile2 (2.8.
|
39
|
-
minitest (5.
|
40
|
-
nokogiri (1.
|
41
|
-
mini_portile2 (~> 2.8.
|
39
|
+
mini_portile2 (2.8.7)
|
40
|
+
minitest (5.25.0)
|
41
|
+
nokogiri (1.16.7)
|
42
|
+
mini_portile2 (~> 2.8.2)
|
42
43
|
racc (~> 1.4)
|
43
|
-
psych (
|
44
|
+
psych (5.1.2)
|
44
45
|
stringio
|
45
46
|
psychgus (1.3.4)
|
46
47
|
psych (>= 3.0)
|
47
|
-
public_suffix (
|
48
|
-
racc (1.
|
48
|
+
public_suffix (6.0.1)
|
49
|
+
racc (1.8.1)
|
49
50
|
rainbow (3.1.1)
|
50
|
-
rake (13.
|
51
|
+
rake (13.2.1)
|
51
52
|
raketeer (0.2.13)
|
52
53
|
rake
|
53
|
-
rdoc (6.
|
54
|
+
rdoc (6.7.0)
|
54
55
|
psych (>= 4.0.0)
|
55
|
-
redcarpet (3.
|
56
|
-
|
57
|
-
|
56
|
+
redcarpet (3.6.0)
|
57
|
+
reline (0.5.9)
|
58
|
+
io-console (~> 0.5)
|
59
|
+
rexml (3.3.5)
|
60
|
+
strscan
|
61
|
+
rss (0.3.1)
|
58
62
|
rexml
|
59
63
|
rubyzip (2.3.2)
|
60
|
-
stringio (3.
|
64
|
+
stringio (3.1.1)
|
61
65
|
strings-ansi (0.2.0)
|
66
|
+
strscan (3.1.0)
|
62
67
|
tiny_segmenter (0.0.6)
|
63
68
|
tty-cursor (0.7.1)
|
64
69
|
tty-progressbar (0.18.2)
|
@@ -66,16 +71,11 @@ GEM
|
|
66
71
|
tty-cursor (~> 0.7)
|
67
72
|
tty-screen (~> 0.8)
|
68
73
|
unicode-display_width (>= 1.6, < 3.0)
|
69
|
-
tty-screen (0.8.
|
74
|
+
tty-screen (0.8.2)
|
70
75
|
tty-spinner (0.9.3)
|
71
76
|
tty-cursor (~> 0.7)
|
72
|
-
|
73
|
-
|
74
|
-
unf_ext (0.0.8.1)
|
75
|
-
unicode-display_width (2.1.0)
|
76
|
-
webrick (1.7.0)
|
77
|
-
yard (0.9.27)
|
78
|
-
webrick (~> 1.7.0)
|
77
|
+
unicode-display_width (2.5.0)
|
78
|
+
yard (0.9.36)
|
79
79
|
yard_ghurt (1.2.1)
|
80
80
|
rake
|
81
81
|
yard
|
@@ -84,15 +84,15 @@ PLATFORMS
|
|
84
84
|
ruby
|
85
85
|
|
86
86
|
DEPENDENCIES
|
87
|
-
bundler (~> 2.
|
88
|
-
minitest (~> 5.
|
87
|
+
bundler (~> 2.5)
|
88
|
+
minitest (~> 5.25)
|
89
89
|
nhkore!
|
90
|
-
rake (~> 13.
|
90
|
+
rake (~> 13.2)
|
91
91
|
raketeer (~> 0.2)
|
92
|
-
rdoc (~> 6.
|
93
|
-
redcarpet (~> 3.
|
92
|
+
rdoc (~> 6.7)
|
93
|
+
redcarpet (~> 3.6)
|
94
94
|
yard (~> 0.9)
|
95
95
|
yard_ghurt (~> 1.2)
|
96
96
|
|
97
97
|
BUNDLED WITH
|
98
|
-
2.
|
98
|
+
2.5.17
|
data/README.md
CHANGED
@@ -867,12 +867,11 @@ This will update *core/* for you:
|
|
867
867
|
2. Update *core* package:
|
868
868
|
- `$ bundle exec rake update_core`
|
869
869
|
- `$ bundle exec rake clobber pkg_core`
|
870
|
-
3.
|
870
|
+
3. Commit & Push.
|
871
|
+
4. Create a new tag & release:
|
871
872
|
- Note: make sure to add *pkg/nhkore-core.zip*
|
872
873
|
- `$ gh release create v0 pkg/*.gem pkg/*.zip`
|
873
|
-
- `$ git pull`
|
874
|
-
4. Release to *GitHub Packages*:
|
875
|
-
- With *Raketary*: `$ raketary github_pkg`
|
874
|
+
- `$ git pull && git fetch`
|
876
875
|
5. Release to *RubyGems*:
|
877
876
|
- `$ bundle exec rake release`
|
878
877
|
|
@@ -885,7 +884,7 @@ Releasing new HTML file for website:
|
|
885
884
|
[GNU LGPL v3+](LICENSE.txt)
|
886
885
|
|
887
886
|
> NHKore (<https://github.com/esotericpig/nhkore>)
|
888
|
-
> Copyright (c) 2020-
|
887
|
+
> Copyright (c) 2020-2022 Jonathan Bradley Whited
|
889
888
|
>
|
890
889
|
> NHKore is free software: you can redistribute it and/or modify
|
891
890
|
> it under the terms of the GNU Lesser General Public License as published by
|
data/Rakefile
CHANGED
@@ -56,11 +56,14 @@ task :update_core do |task|
|
|
56
56
|
cmd = ['ruby','-w','./lib/nhkore.rb','-t','300','-m','10']
|
57
57
|
hl = HighLine.new
|
58
58
|
|
59
|
-
next unless sh(*cmd,'se','
|
59
|
+
next unless sh(*cmd,'se','--show-count','ez')
|
60
|
+
puts
|
61
|
+
|
62
|
+
next unless sh(*cmd,'se','-l','10','ez','bing')
|
60
63
|
next unless hl.agree(continue_msg)
|
61
64
|
puts
|
62
65
|
|
63
|
-
next unless sh(*cmd,'news','-s','
|
66
|
+
next unless sh(*cmd,'news','-s','1000','ez')
|
64
67
|
next unless hl.agree(continue_msg)
|
65
68
|
puts
|
66
69
|
|
@@ -74,7 +77,6 @@ task :update_core do |task|
|
|
74
77
|
puts
|
75
78
|
end
|
76
79
|
|
77
|
-
# @since 0.3.6
|
78
80
|
desc 'Update showcase file for release'
|
79
81
|
task :update_showcase do |task|
|
80
82
|
require 'highline'
|
data/lib/nhkore/app.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020-
|
6
|
+
# Copyright (c) 2020-2022 Jonathan Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
@@ -27,18 +27,11 @@ require 'nhkore/cli/sift_cmd'
|
|
27
27
|
|
28
28
|
|
29
29
|
module NHKore
|
30
|
-
###
|
31
|
-
# @author Jonathan Bradley Whited
|
32
|
-
# @since 0.2.0
|
33
|
-
###
|
34
30
|
module CLI
|
35
31
|
end
|
36
32
|
|
37
33
|
###
|
38
34
|
# For disabling/enabling color output.
|
39
|
-
#
|
40
|
-
# @author Jonathan Bradley Whited
|
41
|
-
# @since 0.2.1
|
42
35
|
###
|
43
36
|
module CriColorExt
|
44
37
|
@color = true
|
@@ -52,10 +45,6 @@ module NHKore
|
|
52
45
|
end
|
53
46
|
end
|
54
47
|
|
55
|
-
###
|
56
|
-
# @author Jonathan Bradley Whited
|
57
|
-
# @since 0.2.0
|
58
|
-
###
|
59
48
|
class App
|
60
49
|
include CLI::FXCmd
|
61
50
|
include CLI::GetCmd
|
@@ -537,7 +526,11 @@ module NHKore
|
|
537
526
|
end
|
538
527
|
|
539
528
|
def sleep_scraper
|
540
|
-
|
529
|
+
# Do a range to better emulate being a human.
|
530
|
+
r = rand(@sleep_time..(@sleep_time + 0.1111))
|
531
|
+
s = r.round(3) # Within 1000ms (0.000 - 0.999).
|
532
|
+
|
533
|
+
sleep(s)
|
541
534
|
end
|
542
535
|
|
543
536
|
def start_spin(title,detail: '')
|
@@ -572,10 +565,6 @@ module NHKore
|
|
572
565
|
end
|
573
566
|
end
|
574
567
|
|
575
|
-
###
|
576
|
-
# @author Jonathan Bradley Whited
|
577
|
-
# @since 0.2.0
|
578
|
-
###
|
579
568
|
class NoProgressBar
|
580
569
|
MSG = '%{title}... %{percent}%%'
|
581
570
|
PUT_INTERVAL = 100.0 / 6.25
|
@@ -623,7 +612,7 @@ module NHKore
|
|
623
612
|
|
624
613
|
@tokens[:advance] = percent
|
625
614
|
|
626
|
-
puts
|
615
|
+
puts self
|
627
616
|
end
|
628
617
|
|
629
618
|
def finish
|
@@ -631,7 +620,7 @@ module NHKore
|
|
631
620
|
end
|
632
621
|
|
633
622
|
def start
|
634
|
-
puts
|
623
|
+
puts self
|
635
624
|
end
|
636
625
|
|
637
626
|
def to_s
|
data/lib/nhkore/article.rb
CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/word'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class Article
|
24
20
|
attr_reader :datetime
|
25
21
|
attr_reader :futsuurl
|
@@ -29,7 +25,7 @@ module NHKore
|
|
29
25
|
attr_reader :words
|
30
26
|
|
31
27
|
def initialize
|
32
|
-
super
|
28
|
+
super
|
33
29
|
|
34
30
|
@datetime = nil
|
35
31
|
@futsuurl = nil
|
@@ -101,13 +97,13 @@ module NHKore
|
|
101
97
|
end
|
102
98
|
|
103
99
|
def futsuurl=(value)
|
104
|
-
# Don't store URI, store String.
|
105
|
-
@futsuurl = value
|
100
|
+
# Don't store URI, store String or nil.
|
101
|
+
@futsuurl = value&.to_s
|
106
102
|
end
|
107
103
|
|
108
104
|
def url=(value)
|
109
|
-
# Don't store URI, store String.
|
110
|
-
@url = value
|
105
|
+
# Don't store URI, store String or nil.
|
106
|
+
@url = value&.to_s
|
111
107
|
end
|
112
108
|
|
113
109
|
def to_s(mini: false)
|
@@ -26,10 +26,6 @@ require 'nhkore/word'
|
|
26
26
|
|
27
27
|
|
28
28
|
module NHKore
|
29
|
-
###
|
30
|
-
# @author Jonathan Bradley Whited
|
31
|
-
# @since 0.2.0
|
32
|
-
###
|
33
29
|
class ArticleScraper < Scraper
|
34
30
|
extend AttrBool::Ext
|
35
31
|
|
@@ -139,7 +135,13 @@ module NHKore
|
|
139
135
|
end
|
140
136
|
|
141
137
|
def parse_dicwin_id(str)
|
142
|
-
str = str.
|
138
|
+
str = str.to_s.strip.downcase
|
139
|
+
|
140
|
+
if str.start_with?('id-') # 'id-0000'
|
141
|
+
str = str.gsub(/\D+/,'')
|
142
|
+
else # 'RSHOK-K-003806'
|
143
|
+
# Same.
|
144
|
+
end
|
143
145
|
|
144
146
|
return nil if str.empty?
|
145
147
|
return str
|
@@ -235,8 +237,6 @@ module NHKore
|
|
235
237
|
# Ignore; try again below.
|
236
238
|
Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
|
237
239
|
end
|
238
|
-
|
239
|
-
return datetime
|
240
240
|
end
|
241
241
|
|
242
242
|
# Third, try body's id.
|
@@ -393,7 +393,6 @@ module NHKore
|
|
393
393
|
return link
|
394
394
|
end
|
395
395
|
|
396
|
-
# @since 0.3.8
|
397
396
|
# @see https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
|
398
397
|
def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
|
399
398
|
words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
|
@@ -489,15 +488,21 @@ module NHKore
|
|
489
488
|
end
|
490
489
|
|
491
490
|
def scrape_title(doc,article)
|
491
|
+
# Not grabbing `<head><title>` because it doesn't have `<ruby>` tags.
|
492
|
+
|
492
493
|
tag = doc.css('h1.article-main__title')
|
493
494
|
tag_name = nil
|
494
495
|
|
496
|
+
if tag.length < 1
|
497
|
+
# - https://www3.nhk.or.jp/news/easy/em2024081312029/em2024081312029.html
|
498
|
+
tag = doc.css('h1.article-title') # No warning.
|
499
|
+
end
|
500
|
+
|
495
501
|
if tag.length < 1
|
496
502
|
# - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
|
497
503
|
tag_name = 'h1.article-eq__title'
|
498
504
|
tag = doc.css(tag_name)
|
499
505
|
end
|
500
|
-
|
501
506
|
if tag.length < 1 && !@strict
|
502
507
|
# This shouldn't be used except for select sites.
|
503
508
|
# - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
|
@@ -632,16 +637,12 @@ module NHKore
|
|
632
637
|
end
|
633
638
|
end
|
634
639
|
|
635
|
-
###
|
636
|
-
# @author Jonathan Bradley Whited
|
637
|
-
# @since 0.2.0
|
638
|
-
###
|
639
640
|
class ScrapeWordsResult
|
640
641
|
attr_reader :text
|
641
642
|
attr_reader :words
|
642
643
|
|
643
644
|
def initialize
|
644
|
-
super
|
645
|
+
super
|
645
646
|
|
646
647
|
@text = ''.dup
|
647
648
|
@words = []
|
data/lib/nhkore/cleaner.rb
CHANGED
@@ -14,10 +14,6 @@ require 'nhkore/word'
|
|
14
14
|
|
15
15
|
|
16
16
|
module NHKore
|
17
|
-
###
|
18
|
-
# @author Jonathan Bradley Whited
|
19
|
-
# @since 0.2.0
|
20
|
-
###
|
21
17
|
class Cleaner
|
22
18
|
def begin_clean(str)
|
23
19
|
return str
|
@@ -53,10 +49,6 @@ module NHKore
|
|
53
49
|
end
|
54
50
|
end
|
55
51
|
|
56
|
-
###
|
57
|
-
# @author Jonathan Bradley Whited
|
58
|
-
# @since 0.2.0
|
59
|
-
###
|
60
52
|
class BasicCleaner < Cleaner
|
61
53
|
def end_clean(str)
|
62
54
|
# This is very simple, as Splitter will split on punctuation,
|
@@ -70,10 +62,6 @@ module NHKore
|
|
70
62
|
end
|
71
63
|
end
|
72
64
|
|
73
|
-
###
|
74
|
-
# @author Jonathan Bradley Whited
|
75
|
-
# @since 0.2.0
|
76
|
-
###
|
77
65
|
class BestCleaner < BasicCleaner
|
78
66
|
end
|
79
67
|
end
|
data/lib/nhkore/cli/fx_cmd.rb
CHANGED
data/lib/nhkore/cli/get_cmd.rb
CHANGED
@@ -14,10 +14,6 @@ require 'nhkore/util'
|
|
14
14
|
|
15
15
|
module NHKore
|
16
16
|
module CLI
|
17
|
-
###
|
18
|
-
# @author Jonathan Bradley Whited
|
19
|
-
# @since 0.2.0
|
20
|
-
###
|
21
17
|
module GetCmd
|
22
18
|
DEFAULT_GET_CHUNK_SIZE = 4 * 1024
|
23
19
|
DEFAULT_GET_URL_LENGTH = 11_000_000 # Just a generous estimation used as a fallback; may be outdated.
|
data/lib/nhkore/cli/news_cmd.rb
CHANGED
@@ -21,10 +21,6 @@ require 'nhkore/util'
|
|
21
21
|
|
22
22
|
module NHKore
|
23
23
|
module CLI
|
24
|
-
###
|
25
|
-
# @author Jonathan Bradley Whited
|
26
|
-
# @since 0.2.0
|
27
|
-
###
|
28
24
|
module NewsCmd
|
29
25
|
DEFAULT_NEWS_SCRAPE = 1
|
30
26
|
|
@@ -255,16 +251,20 @@ module CLI
|
|
255
251
|
next if !redo_scrapes && scraped_news_article?(news,link)
|
256
252
|
|
257
253
|
url = link.url
|
254
|
+
result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
|
258
255
|
|
259
|
-
if
|
256
|
+
if result == :scraped
|
257
|
+
scrape_count += 1
|
258
|
+
elsif result == :unscraped
|
259
|
+
next
|
260
|
+
else
|
260
261
|
# --show-dict
|
261
|
-
url =
|
262
|
-
scrape_count = max_scrapes
|
262
|
+
url = result
|
263
|
+
scrape_count = max_scrapes # Break on next iteration for update_spin_detail().
|
263
264
|
end
|
264
265
|
|
265
266
|
# Break on next iteration for update_spin_detail().
|
266
|
-
next if
|
267
|
-
|
267
|
+
next if scrape_count >= max_scrapes
|
268
268
|
sleep_scraper
|
269
269
|
end
|
270
270
|
else
|
@@ -275,9 +275,8 @@ module CLI
|
|
275
275
|
links.add_link(link)
|
276
276
|
end
|
277
277
|
|
278
|
-
scrape_news_article(url,link: link,new_articles: new_articles,news: news)
|
279
|
-
|
280
|
-
scrape_count += 1
|
278
|
+
result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
|
279
|
+
scrape_count += 1 if result != :unscraped
|
281
280
|
end
|
282
281
|
|
283
282
|
stop_spin
|
@@ -338,9 +337,17 @@ module CLI
|
|
338
337
|
return scraper.url
|
339
338
|
end
|
340
339
|
|
341
|
-
scraper =
|
342
|
-
|
340
|
+
scraper = nil
|
341
|
+
|
342
|
+
begin
|
343
|
+
scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
|
344
|
+
rescue Http404Error
|
345
|
+
# - https://www3.nhk.or.jp/news/easy/k10014157491000/k10014157491000.html
|
346
|
+
Util.warn("Ignoring URL with 404 error: #{url}.")
|
347
|
+
return :unscraped
|
348
|
+
end
|
343
349
|
|
350
|
+
article = scraper.scrape
|
344
351
|
# run_news_cmd() handles overwriting with --redo or not
|
345
352
|
# using scraped_news_article?().
|
346
353
|
news.add_article(article,overwrite: true)
|
@@ -350,7 +357,7 @@ module CLI
|
|
350
357
|
|
351
358
|
new_articles << article
|
352
359
|
|
353
|
-
return
|
360
|
+
return :scraped # No --show-dict
|
354
361
|
end
|
355
362
|
|
356
363
|
def scraped_news_article?(news,link)
|
@@ -366,10 +373,15 @@ module CLI
|
|
366
373
|
end
|
367
374
|
|
368
375
|
if article.nil?
|
369
|
-
scraper =
|
376
|
+
scraper = nil
|
370
377
|
|
371
|
-
|
378
|
+
begin
|
379
|
+
scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
|
380
|
+
rescue Http404Error
|
381
|
+
return false
|
382
|
+
end
|
372
383
|
|
384
|
+
sha256 = scraper.scrape_sha256_only
|
373
385
|
article = news.article_with_sha256(sha256) if news.sha256?(sha256)
|
374
386
|
end
|
375
387
|
end
|