nhkore 0.3.13 → 0.3.16
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -1
- data/Gemfile.lock +41 -41
- data/README.md +4 -5
- data/Rakefile +5 -3
- data/lib/nhkore/app.rb +8 -19
- data/lib/nhkore/article.rb +5 -9
- data/lib/nhkore/article_scraper.rb +15 -14
- data/lib/nhkore/cleaner.rb +0 -12
- data/lib/nhkore/cli/fx_cmd.rb +0 -4
- data/lib/nhkore/cli/get_cmd.rb +0 -4
- data/lib/nhkore/cli/news_cmd.rb +29 -17
- data/lib/nhkore/cli/search_cmd.rb +45 -35
- data/lib/nhkore/cli/sift_cmd.rb +1 -5
- data/lib/nhkore/datetime_parser.rb +1 -5
- data/lib/nhkore/defn.rb +1 -5
- data/lib/nhkore/dict.rb +2 -5
- data/lib/nhkore/dict_scraper.rb +2 -6
- data/lib/nhkore/entry.rb +3 -9
- data/lib/nhkore/error.rb +1 -11
- data/lib/nhkore/fileable.rb +0 -4
- data/lib/nhkore/lib.rb +0 -3
- data/lib/nhkore/missingno.rb +2 -6
- data/lib/nhkore/news.rb +3 -15
- data/lib/nhkore/polisher.rb +0 -12
- data/lib/nhkore/scraper.rb +8 -5
- data/lib/nhkore/search_link.rb +9 -17
- data/lib/nhkore/search_scraper.rb +34 -24
- data/lib/nhkore/sifter.rb +7 -8
- data/lib/nhkore/splitter.rb +0 -18
- data/lib/nhkore/user_agents.rb +1 -4
- data/lib/nhkore/util.rb +0 -4
- data/lib/nhkore/variator.rb +0 -14
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +0 -4
- data/lib/nhkore.rb +0 -5
- data/nhkore.gemspec +40 -37
- data/samples/looper.rb +0 -3
- metadata +24 -24
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020-
|
6
|
+
# Copyright (c) 2020-2022 Jonathan Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
@@ -17,10 +17,6 @@ require 'nhkore/util'
|
|
17
17
|
|
18
18
|
module NHKore
|
19
19
|
module CLI
|
20
|
-
###
|
21
|
-
# @author Jonathan Bradley Whited
|
22
|
-
# @since 0.3.0
|
23
|
-
###
|
24
20
|
module SearchCmd
|
25
21
|
def build_search_cmd
|
26
22
|
app = self
|
@@ -42,6 +38,12 @@ module CLI
|
|
42
38
|
DESC
|
43
39
|
app.check_empty_opt(:in,value)
|
44
40
|
}
|
41
|
+
option :l,:loop,'number of times to repeat the search to ensure results',argument: :required,
|
42
|
+
transform: lambda { |value|
|
43
|
+
value = value.to_i
|
44
|
+
value = 1 if value < 1
|
45
|
+
value
|
46
|
+
}
|
45
47
|
option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
|
46
48
|
'directory/file' to save links to; if you only specify a directory or a file, it will attach the
|
47
49
|
appropriate default directory/file name
|
@@ -164,6 +166,8 @@ module CLI
|
|
164
166
|
|
165
167
|
dry_run = @cmd_opts[:dry_run]
|
166
168
|
in_file = @cmd_opts[:in]
|
169
|
+
loop_times = @cmd_opts[:loop]
|
170
|
+
loop_times = 1 if loop_times.nil? || loop_times < 1
|
167
171
|
out_file = @cmd_opts[:out]
|
168
172
|
result_count = @cmd_opts[:results]
|
169
173
|
result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
|
@@ -174,9 +178,6 @@ module CLI
|
|
174
178
|
is_file = !in_file.nil?
|
175
179
|
links = nil
|
176
180
|
new_links = [] # For --dry-run
|
177
|
-
next_page = NextPage.new
|
178
|
-
page_count = 0
|
179
|
-
page_num = 1
|
180
181
|
url = in_file # nil will use default URL, else a file
|
181
182
|
|
182
183
|
# Load previous links for 'scraped?' vars.
|
@@ -196,43 +197,52 @@ module CLI
|
|
196
197
|
end
|
197
198
|
|
198
199
|
puts "#{scraped_count} of #{links_count} links scraped."
|
199
|
-
|
200
200
|
return
|
201
201
|
end
|
202
202
|
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
203
|
+
1.upto(loop_times) do |loop_i|
|
204
|
+
page_range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
|
205
|
+
|
206
|
+
next_page = NextPage.new
|
207
|
+
page_count = 0
|
208
|
+
page_num = 1
|
209
|
+
|
210
|
+
case search_type
|
211
|
+
# Anything that extends SearchScraper.
|
212
|
+
when :bing
|
213
|
+
page_range.each do
|
214
|
+
scraper = nil
|
215
|
+
|
216
|
+
case search_type
|
217
|
+
when :bing
|
218
|
+
scraper = BingScraper.new(
|
219
|
+
nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs
|
220
|
+
)
|
221
|
+
else
|
222
|
+
raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
|
223
|
+
end
|
217
224
|
|
218
|
-
|
225
|
+
next_page = scraper.scrape(links,next_page)
|
219
226
|
|
220
|
-
|
221
|
-
|
222
|
-
|
227
|
+
new_links.concat(links.links.values[links_count..])
|
228
|
+
links_count = links.length
|
229
|
+
page_count = next_page.count if next_page.count > 0
|
223
230
|
|
224
|
-
|
225
|
-
|
231
|
+
update_spin_detail(
|
232
|
+
format(' (%d/%d, page=%d, count=%d, links=%d, new_links=%d)',
|
233
|
+
loop_i,loop_times,page_num,page_count,links.length,new_links.length)
|
234
|
+
)
|
226
235
|
|
227
|
-
|
236
|
+
break if next_page.empty?
|
228
237
|
|
229
|
-
|
230
|
-
|
238
|
+
page_num += 1
|
239
|
+
url = next_page.url
|
231
240
|
|
232
|
-
|
241
|
+
sleep_scraper
|
242
|
+
end
|
243
|
+
else
|
244
|
+
raise ArgumentError,"invalid search_type[#{search_type}]"
|
233
245
|
end
|
234
|
-
else
|
235
|
-
raise ArgumentError,"invalid search_type[#{search_type}]"
|
236
246
|
end
|
237
247
|
|
238
248
|
stop_spin
|
data/lib/nhkore/cli/sift_cmd.rb
CHANGED
@@ -20,10 +20,6 @@ require 'nhkore/util'
|
|
20
20
|
|
21
21
|
module NHKore
|
22
22
|
module CLI
|
23
|
-
###
|
24
|
-
# @author Jonathan Bradley Whited
|
25
|
-
# @since 0.2.0
|
26
|
-
###
|
27
23
|
module SiftCmd
|
28
24
|
DEFAULT_SIFT_EXT = :csv
|
29
25
|
DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}"
|
@@ -260,7 +256,7 @@ module CLI
|
|
260
256
|
puts
|
261
257
|
|
262
258
|
if dry_run
|
263
|
-
puts sifter
|
259
|
+
puts sifter
|
264
260
|
else
|
265
261
|
start_spin('Saving sifted data to file')
|
266
262
|
|
@@ -17,10 +17,6 @@ require 'nhkore/util'
|
|
17
17
|
|
18
18
|
|
19
19
|
module NHKore
|
20
|
-
###
|
21
|
-
# @author Jonathan Bradley Whited
|
22
|
-
# @since 0.3.4
|
23
|
-
###
|
24
20
|
class DatetimeParser
|
25
21
|
extend AttrBool::Ext
|
26
22
|
|
@@ -181,7 +177,7 @@ module NHKore
|
|
181
177
|
return self if @min_or_max
|
182
178
|
|
183
179
|
has_small = false
|
184
|
-
jst_now = Util.jst_now
|
180
|
+
jst_now = Util.jst_now
|
185
181
|
|
186
182
|
# Must be from smallest to biggest.
|
187
183
|
|
data/lib/nhkore/defn.rb
CHANGED
@@ -16,17 +16,13 @@ require 'nhkore/word'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class Defn
|
24
20
|
attr_reader :hyoukis
|
25
21
|
attr_accessor :text
|
26
22
|
attr_reader :words
|
27
23
|
|
28
24
|
def initialize
|
29
|
-
super
|
25
|
+
super
|
30
26
|
|
31
27
|
@hyoukis = []
|
32
28
|
@text = ''.dup
|
data/lib/nhkore/dict.rb
CHANGED
@@ -14,15 +14,11 @@ require 'nhkore/error'
|
|
14
14
|
|
15
15
|
|
16
16
|
module NHKore
|
17
|
-
###
|
18
|
-
# @author Jonathan Bradley Whited
|
19
|
-
# @since 0.2.0
|
20
|
-
###
|
21
17
|
class Dict
|
22
18
|
attr_reader :entries
|
23
19
|
|
24
20
|
def initialize
|
25
|
-
super
|
21
|
+
super
|
26
22
|
|
27
23
|
@entries = {}
|
28
24
|
end
|
@@ -39,6 +35,7 @@ module NHKore
|
|
39
35
|
dict = Dict.new
|
40
36
|
|
41
37
|
hash.each do |id,array|
|
38
|
+
id = id.to_s.strip.downcase # 'RSHOK-K-003806', '0000'
|
42
39
|
entry = Entry.scrape(id,array,missingno: missingno,url: url)
|
43
40
|
|
44
41
|
next if entry.nil?
|
data/lib/nhkore/dict_scraper.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020-
|
6
|
+
# Copyright (c) 2020-2022 Jonathan Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
@@ -16,10 +16,6 @@ require 'nhkore/util'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class DictScraper < Scraper
|
24
20
|
attr_accessor :missingno
|
25
21
|
|
@@ -39,7 +35,7 @@ module NHKore
|
|
39
35
|
i = url.rindex(%r{[/\\]}) # Can be a URL or a file
|
40
36
|
i = i.nil? ? 0 : (i + 1) # If no match found, no path
|
41
37
|
|
42
|
-
basename = File.basename(url[i
|
38
|
+
basename = File.basename(url[i..],'.*') if basename.nil?
|
43
39
|
path = url[0...i]
|
44
40
|
|
45
41
|
return "#{path}#{basename}.out.dic"
|
data/lib/nhkore/entry.rb
CHANGED
@@ -14,10 +14,6 @@ require 'nhkore/util'
|
|
14
14
|
|
15
15
|
|
16
16
|
module NHKore
|
17
|
-
###
|
18
|
-
# @author Jonathan Bradley Whited
|
19
|
-
# @since 0.2.0
|
20
|
-
###
|
21
17
|
class Entry
|
22
18
|
HYOUKI_SEP = '・'
|
23
19
|
|
@@ -25,18 +21,16 @@ module NHKore
|
|
25
21
|
attr_accessor :id
|
26
22
|
|
27
23
|
def initialize
|
28
|
-
super
|
24
|
+
super
|
29
25
|
|
30
26
|
@defns = []
|
31
27
|
@id = nil
|
32
28
|
end
|
33
29
|
|
34
30
|
def build_defn
|
35
|
-
defns = []
|
36
31
|
i = 0
|
37
|
-
|
38
|
-
|
39
|
-
defns << "#{i += 1})#{defn}" # Japanese parenthesis
|
32
|
+
defns = @defns.map do |defn|
|
33
|
+
"#{i += 1})#{defn}" # Japanese parenthesis
|
40
34
|
end
|
41
35
|
|
42
36
|
return defns.join("\n")
|
data/lib/nhkore/error.rb
CHANGED
@@ -10,21 +10,11 @@
|
|
10
10
|
|
11
11
|
|
12
12
|
module NHKore
|
13
|
-
###
|
14
|
-
# @author Jonathan Bradley Whited
|
15
|
-
# @since 0.2.0
|
16
|
-
###
|
17
13
|
class Error < ::StandardError; end
|
18
14
|
|
19
|
-
# @since 0.2.0
|
20
15
|
class CLIError < Error; end
|
21
|
-
|
22
|
-
# @since 0.2.0
|
16
|
+
class Http404Error < Error; end
|
23
17
|
class ParseError < Error; end
|
24
|
-
|
25
|
-
# @since 0.2.0
|
26
18
|
class ScrapeError < Error; end
|
27
|
-
|
28
|
-
# @since 0.2.0
|
29
19
|
class ZipError < Error; end
|
30
20
|
end
|
data/lib/nhkore/fileable.rb
CHANGED
data/lib/nhkore/lib.rb
CHANGED
data/lib/nhkore/missingno.rb
CHANGED
@@ -13,10 +13,6 @@ require 'nhkore/util'
|
|
13
13
|
|
14
14
|
|
15
15
|
module NHKore
|
16
|
-
###
|
17
|
-
# @author Jonathan Bradley Whited
|
18
|
-
# @since 0.2.0
|
19
|
-
###
|
20
16
|
class Missingno
|
21
17
|
attr_reader :kanas
|
22
18
|
attr_reader :kanjis
|
@@ -68,13 +64,13 @@ module NHKore
|
|
68
64
|
def kana_from_kanji(kanji)
|
69
65
|
word = @kanjis[kanji]
|
70
66
|
|
71
|
-
return word
|
67
|
+
return word&.kana
|
72
68
|
end
|
73
69
|
|
74
70
|
def kanji_from_kana(kana)
|
75
71
|
word = @kanas[kana]
|
76
72
|
|
77
|
-
return word
|
73
|
+
return word&.kanji
|
78
74
|
end
|
79
75
|
end
|
80
76
|
end
|
data/lib/nhkore/news.rb
CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/util'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class News
|
24
20
|
include Fileable
|
25
21
|
|
@@ -30,7 +26,7 @@ module NHKore
|
|
30
26
|
attr_reader :sha256s
|
31
27
|
|
32
28
|
def initialize
|
33
|
-
super
|
29
|
+
super
|
34
30
|
|
35
31
|
@articles = {}
|
36
32
|
@sha256s = {}
|
@@ -127,10 +123,6 @@ module NHKore
|
|
127
123
|
end
|
128
124
|
end
|
129
125
|
|
130
|
-
###
|
131
|
-
# @author Jonathan Bradley Whited
|
132
|
-
# @since 0.2.0
|
133
|
-
###
|
134
126
|
class FutsuuNews < News
|
135
127
|
DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
|
136
128
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
@@ -144,14 +136,10 @@ module NHKore
|
|
144
136
|
end
|
145
137
|
|
146
138
|
def save_file(file=DEFAULT_FILE,**kargs)
|
147
|
-
super
|
139
|
+
super
|
148
140
|
end
|
149
141
|
end
|
150
142
|
|
151
|
-
###
|
152
|
-
# @author Jonathan Bradley Whited
|
153
|
-
# @since 0.2.0
|
154
|
-
###
|
155
143
|
class YasashiiNews < News
|
156
144
|
DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
|
157
145
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
@@ -165,7 +153,7 @@ module NHKore
|
|
165
153
|
end
|
166
154
|
|
167
155
|
def save_file(file=DEFAULT_FILE,**kargs)
|
168
|
-
super
|
156
|
+
super
|
169
157
|
end
|
170
158
|
end
|
171
159
|
end
|
data/lib/nhkore/polisher.rb
CHANGED
@@ -13,10 +13,6 @@ require 'nhkore/word'
|
|
13
13
|
|
14
14
|
|
15
15
|
module NHKore
|
16
|
-
###
|
17
|
-
# @author Jonathan Bradley Whited
|
18
|
-
# @since 0.2.0
|
19
|
-
###
|
20
16
|
class Polisher
|
21
17
|
def begin_polish(str)
|
22
18
|
return str
|
@@ -52,10 +48,6 @@ module NHKore
|
|
52
48
|
end
|
53
49
|
end
|
54
50
|
|
55
|
-
###
|
56
|
-
# @author Jonathan Bradley Whited
|
57
|
-
# @since 0.2.0
|
58
|
-
###
|
59
51
|
class BasicPolisher < Polisher
|
60
52
|
def end_polish(str)
|
61
53
|
# Keep Japanese dots in names:
|
@@ -72,10 +64,6 @@ module NHKore
|
|
72
64
|
end
|
73
65
|
end
|
74
66
|
|
75
|
-
###
|
76
|
-
# @author Jonathan Bradley Whited
|
77
|
-
# @since 0.2.0
|
78
|
-
###
|
79
67
|
class BestPolisher < BasicPolisher
|
80
68
|
end
|
81
69
|
end
|
data/lib/nhkore/scraper.rb
CHANGED
@@ -13,15 +13,12 @@ require 'attr_bool'
|
|
13
13
|
require 'nokogiri'
|
14
14
|
require 'open-uri'
|
15
15
|
|
16
|
+
require 'nhkore/error'
|
16
17
|
require 'nhkore/user_agents'
|
17
18
|
require 'nhkore/util'
|
18
19
|
|
19
20
|
|
20
21
|
module NHKore
|
21
|
-
###
|
22
|
-
# @author Jonathan Bradley Whited
|
23
|
-
# @since 0.2.0
|
24
|
-
###
|
25
22
|
class Scraper
|
26
23
|
extend AttrBool::Ext
|
27
24
|
|
@@ -177,7 +174,13 @@ module NHKore
|
|
177
174
|
retry
|
178
175
|
# Must come after HTTPRedirect since a subclass of HTTPError.
|
179
176
|
rescue OpenURI::HTTPError => e
|
180
|
-
|
177
|
+
msg = "HTTP error[#{e}] at URL[#{url}]"
|
178
|
+
|
179
|
+
if e.to_s.include?('404 Not Found')
|
180
|
+
raise Http404Error,msg
|
181
|
+
else
|
182
|
+
raise e.exception(msg)
|
183
|
+
end
|
181
184
|
rescue SocketError => e
|
182
185
|
if (max_retries -= 1) < 0
|
183
186
|
raise e.exception("Socket error[#{e}] at URL[#{url}]")
|
data/lib/nhkore/search_link.rb
CHANGED
@@ -17,10 +17,6 @@ require 'nhkore/util'
|
|
17
17
|
|
18
18
|
|
19
19
|
module NHKore
|
20
|
-
###
|
21
|
-
# @author Jonathan Bradley Whited
|
22
|
-
# @since 0.2.0
|
23
|
-
###
|
24
20
|
class SearchLink
|
25
21
|
extend AttrBool::Ext
|
26
22
|
|
@@ -45,11 +41,11 @@ module NHKore
|
|
45
41
|
def encode_with(coder)
|
46
42
|
# Order matters.
|
47
43
|
|
48
|
-
coder[:url] = @url
|
44
|
+
coder[:url] = @url&.to_s
|
49
45
|
coder[:scraped] = @scraped
|
50
|
-
coder[:datetime] = @datetime
|
46
|
+
coder[:datetime] = @datetime&.iso8601
|
51
47
|
coder[:title] = @title
|
52
|
-
coder[:futsuurl] = @futsuurl
|
48
|
+
coder[:futsuurl] = @futsuurl&.to_s
|
53
49
|
coder[:sha256] = @sha256
|
54
50
|
end
|
55
51
|
|
@@ -86,13 +82,13 @@ module NHKore
|
|
86
82
|
end
|
87
83
|
|
88
84
|
def futsuurl=(value)
|
89
|
-
# Don't store URI, store String.
|
90
|
-
@futsuurl = value
|
85
|
+
# Don't store URI, store String or nil.
|
86
|
+
@futsuurl = value&.to_s
|
91
87
|
end
|
92
88
|
|
93
89
|
def url=(value)
|
94
|
-
# Don't store URI, store String.
|
95
|
-
@url = value
|
90
|
+
# Don't store URI, store String or nil.
|
91
|
+
@url = value&.to_s
|
96
92
|
end
|
97
93
|
|
98
94
|
def to_s(mini: false)
|
@@ -114,10 +110,6 @@ module NHKore
|
|
114
110
|
end
|
115
111
|
end
|
116
112
|
|
117
|
-
###
|
118
|
-
# @author Jonathan Bradley Whited
|
119
|
-
# @since 0.2.0
|
120
|
-
###
|
121
113
|
class SearchLinks
|
122
114
|
include Fileable
|
123
115
|
|
@@ -136,13 +128,13 @@ module NHKore
|
|
136
128
|
attr_reader :links
|
137
129
|
|
138
130
|
def initialize
|
139
|
-
super
|
131
|
+
super
|
140
132
|
|
141
133
|
@links = {}
|
142
134
|
end
|
143
135
|
|
144
136
|
def add_link(link)
|
145
|
-
url = link.url
|
137
|
+
url = link.url&.to_s
|
146
138
|
|
147
139
|
return self if @links.key?(url)
|
148
140
|
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#++
|
10
10
|
|
11
11
|
|
12
|
+
require 'net/http'
|
12
13
|
require 'uri'
|
13
14
|
|
14
15
|
require 'nhkore/error'
|
@@ -18,10 +19,6 @@ require 'nhkore/util'
|
|
18
19
|
|
19
20
|
|
20
21
|
module NHKore
|
21
|
-
###
|
22
|
-
# @author Jonathan Bradley Whited
|
23
|
-
# @since 0.2.0
|
24
|
-
###
|
25
22
|
class SearchScraper < Scraper
|
26
23
|
DEFAULT_RESULT_COUNT = 100
|
27
24
|
FUTSUU_SITE = 'nhk.or.jp/news/html/'
|
@@ -34,10 +31,11 @@ module NHKore
|
|
34
31
|
YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
|
35
32
|
|
36
33
|
IGNORE_LINK_REGEX = %r{
|
37
|
-
/about\.html?
|
38
|
-
|/movieplayer\.html?
|
39
|
-
|/audio\.html?
|
40
|
-
|/news/easy/index\.html?
|
34
|
+
/about\.html? # https://www3.nhk.or.jp/news/easy/about.html
|
35
|
+
|/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
|
36
|
+
|/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
|
37
|
+
|/news/easy/index\.html? # https://www3.nhk.or.jp/news/easy/index.html
|
38
|
+
|/disaster_earthquake.html # https://www3.nhk.or.jp/news/easy/article/disaster_earthquake.html
|
41
39
|
|
42
40
|
# https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
|
43
41
|
# https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
|
@@ -47,7 +45,7 @@ module NHKore
|
|
47
45
|
# Search Engines are strict, so trigger using the default HTTP header fields
|
48
46
|
# with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
|
49
47
|
def initialize(url,eat_cookie: true,header: {},**kargs)
|
50
|
-
super
|
48
|
+
super
|
51
49
|
end
|
52
50
|
|
53
51
|
def ignore_link?(link,cleaned: true)
|
@@ -56,17 +54,35 @@ module NHKore
|
|
56
54
|
link = Util.unspace_web_str(link).downcase unless cleaned
|
57
55
|
|
58
56
|
return true if link.empty?
|
59
|
-
|
60
57
|
return true if IGNORE_LINK_REGEX.match?(link)
|
61
|
-
|
62
58
|
return false
|
63
59
|
end
|
60
|
+
|
61
|
+
# Example: https://www3.nhk.or.jp/news/easy/k10014150691000/k10014150691000.html
|
62
|
+
def fetch_valid_link?(link)
|
63
|
+
uri = begin
|
64
|
+
URI(link)
|
65
|
+
rescue StandardError
|
66
|
+
return false # Bad URL.
|
67
|
+
end
|
68
|
+
|
69
|
+
begin
|
70
|
+
ssl = uri.scheme.to_s.strip.downcase.include?('https')
|
71
|
+
|
72
|
+
Net::HTTP.start(uri.host,uri.port,use_ssl: ssl) do |http|
|
73
|
+
resp = http.head(uri.request_uri)
|
74
|
+
code = resp.code
|
75
|
+
|
76
|
+
return code != '404'
|
77
|
+
end
|
78
|
+
rescue StandardError
|
79
|
+
# Ignore; try actually scraping the article anyway.
|
80
|
+
end
|
81
|
+
|
82
|
+
return true
|
83
|
+
end
|
64
84
|
end
|
65
85
|
|
66
|
-
###
|
67
|
-
# @author Jonathan Bradley Whited
|
68
|
-
# @since 0.2.0
|
69
|
-
###
|
70
86
|
class BingScraper < SearchScraper
|
71
87
|
attr_reader :regex
|
72
88
|
attr_reader :site
|
@@ -136,9 +152,8 @@ module NHKore
|
|
136
152
|
next_page.count = count
|
137
153
|
next_page.url = join_url(href)
|
138
154
|
end
|
139
|
-
elsif href =~ regex
|
155
|
+
elsif href =~ regex && fetch_valid_link?(href)
|
140
156
|
slinks.add_link(SearchLink.new(href))
|
141
|
-
|
142
157
|
link_count += 1
|
143
158
|
end
|
144
159
|
end
|
@@ -165,10 +180,9 @@ module NHKore
|
|
165
180
|
rss_links << link
|
166
181
|
|
167
182
|
next if ignore_link?(link)
|
168
|
-
next if link !~ regex
|
183
|
+
next if link !~ regex || !fetch_valid_link?(link)
|
169
184
|
|
170
185
|
slinks.add_link(SearchLink.new(link))
|
171
|
-
|
172
186
|
link_count += 1
|
173
187
|
end
|
174
188
|
|
@@ -192,17 +206,13 @@ module NHKore
|
|
192
206
|
end
|
193
207
|
end
|
194
208
|
|
195
|
-
###
|
196
|
-
# @author Jonathan Bradley Whited
|
197
|
-
# @since 0.2.0
|
198
|
-
###
|
199
209
|
class NextPage
|
200
210
|
attr_accessor :count
|
201
211
|
attr_accessor :rss_links
|
202
212
|
attr_accessor :url
|
203
213
|
|
204
214
|
def initialize
|
205
|
-
super
|
215
|
+
super
|
206
216
|
|
207
217
|
@count = -1
|
208
218
|
@rss_links = nil
|