nhkore 0.3.13 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -1
- data/Gemfile.lock +41 -41
- data/README.md +4 -5
- data/Rakefile +5 -3
- data/lib/nhkore/app.rb +8 -19
- data/lib/nhkore/article.rb +5 -9
- data/lib/nhkore/article_scraper.rb +15 -14
- data/lib/nhkore/cleaner.rb +0 -12
- data/lib/nhkore/cli/fx_cmd.rb +0 -4
- data/lib/nhkore/cli/get_cmd.rb +0 -4
- data/lib/nhkore/cli/news_cmd.rb +29 -17
- data/lib/nhkore/cli/search_cmd.rb +45 -35
- data/lib/nhkore/cli/sift_cmd.rb +1 -5
- data/lib/nhkore/datetime_parser.rb +1 -5
- data/lib/nhkore/defn.rb +1 -5
- data/lib/nhkore/dict.rb +2 -5
- data/lib/nhkore/dict_scraper.rb +2 -6
- data/lib/nhkore/entry.rb +3 -9
- data/lib/nhkore/error.rb +1 -11
- data/lib/nhkore/fileable.rb +0 -4
- data/lib/nhkore/lib.rb +0 -3
- data/lib/nhkore/missingno.rb +2 -6
- data/lib/nhkore/news.rb +3 -15
- data/lib/nhkore/polisher.rb +0 -12
- data/lib/nhkore/scraper.rb +8 -5
- data/lib/nhkore/search_link.rb +9 -17
- data/lib/nhkore/search_scraper.rb +34 -24
- data/lib/nhkore/sifter.rb +7 -8
- data/lib/nhkore/splitter.rb +0 -18
- data/lib/nhkore/user_agents.rb +1 -4
- data/lib/nhkore/util.rb +0 -4
- data/lib/nhkore/variator.rb +0 -14
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +0 -4
- data/lib/nhkore.rb +0 -5
- data/nhkore.gemspec +40 -37
- data/samples/looper.rb +0 -3
- metadata +24 -24
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020-
|
6
|
+
# Copyright (c) 2020-2022 Jonathan Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
@@ -17,10 +17,6 @@ require 'nhkore/util'
|
|
17
17
|
|
18
18
|
module NHKore
|
19
19
|
module CLI
|
20
|
-
###
|
21
|
-
# @author Jonathan Bradley Whited
|
22
|
-
# @since 0.3.0
|
23
|
-
###
|
24
20
|
module SearchCmd
|
25
21
|
def build_search_cmd
|
26
22
|
app = self
|
@@ -42,6 +38,12 @@ module CLI
|
|
42
38
|
DESC
|
43
39
|
app.check_empty_opt(:in,value)
|
44
40
|
}
|
41
|
+
option :l,:loop,'number of times to repeat the search to ensure results',argument: :required,
|
42
|
+
transform: lambda { |value|
|
43
|
+
value = value.to_i
|
44
|
+
value = 1 if value < 1
|
45
|
+
value
|
46
|
+
}
|
45
47
|
option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
|
46
48
|
'directory/file' to save links to; if you only specify a directory or a file, it will attach the
|
47
49
|
appropriate default directory/file name
|
@@ -164,6 +166,8 @@ module CLI
|
|
164
166
|
|
165
167
|
dry_run = @cmd_opts[:dry_run]
|
166
168
|
in_file = @cmd_opts[:in]
|
169
|
+
loop_times = @cmd_opts[:loop]
|
170
|
+
loop_times = 1 if loop_times.nil? || loop_times < 1
|
167
171
|
out_file = @cmd_opts[:out]
|
168
172
|
result_count = @cmd_opts[:results]
|
169
173
|
result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
|
@@ -174,9 +178,6 @@ module CLI
|
|
174
178
|
is_file = !in_file.nil?
|
175
179
|
links = nil
|
176
180
|
new_links = [] # For --dry-run
|
177
|
-
next_page = NextPage.new
|
178
|
-
page_count = 0
|
179
|
-
page_num = 1
|
180
181
|
url = in_file # nil will use default URL, else a file
|
181
182
|
|
182
183
|
# Load previous links for 'scraped?' vars.
|
@@ -196,43 +197,52 @@ module CLI
|
|
196
197
|
end
|
197
198
|
|
198
199
|
puts "#{scraped_count} of #{links_count} links scraped."
|
199
|
-
|
200
200
|
return
|
201
201
|
end
|
202
202
|
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
203
|
+
1.upto(loop_times) do |loop_i|
|
204
|
+
page_range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
|
205
|
+
|
206
|
+
next_page = NextPage.new
|
207
|
+
page_count = 0
|
208
|
+
page_num = 1
|
209
|
+
|
210
|
+
case search_type
|
211
|
+
# Anything that extends SearchScraper.
|
212
|
+
when :bing
|
213
|
+
page_range.each do
|
214
|
+
scraper = nil
|
215
|
+
|
216
|
+
case search_type
|
217
|
+
when :bing
|
218
|
+
scraper = BingScraper.new(
|
219
|
+
nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs
|
220
|
+
)
|
221
|
+
else
|
222
|
+
raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
|
223
|
+
end
|
217
224
|
|
218
|
-
|
225
|
+
next_page = scraper.scrape(links,next_page)
|
219
226
|
|
220
|
-
|
221
|
-
|
222
|
-
|
227
|
+
new_links.concat(links.links.values[links_count..])
|
228
|
+
links_count = links.length
|
229
|
+
page_count = next_page.count if next_page.count > 0
|
223
230
|
|
224
|
-
|
225
|
-
|
231
|
+
update_spin_detail(
|
232
|
+
format(' (%d/%d, page=%d, count=%d, links=%d, new_links=%d)',
|
233
|
+
loop_i,loop_times,page_num,page_count,links.length,new_links.length)
|
234
|
+
)
|
226
235
|
|
227
|
-
|
236
|
+
break if next_page.empty?
|
228
237
|
|
229
|
-
|
230
|
-
|
238
|
+
page_num += 1
|
239
|
+
url = next_page.url
|
231
240
|
|
232
|
-
|
241
|
+
sleep_scraper
|
242
|
+
end
|
243
|
+
else
|
244
|
+
raise ArgumentError,"invalid search_type[#{search_type}]"
|
233
245
|
end
|
234
|
-
else
|
235
|
-
raise ArgumentError,"invalid search_type[#{search_type}]"
|
236
246
|
end
|
237
247
|
|
238
248
|
stop_spin
|
data/lib/nhkore/cli/sift_cmd.rb
CHANGED
@@ -20,10 +20,6 @@ require 'nhkore/util'
|
|
20
20
|
|
21
21
|
module NHKore
|
22
22
|
module CLI
|
23
|
-
###
|
24
|
-
# @author Jonathan Bradley Whited
|
25
|
-
# @since 0.2.0
|
26
|
-
###
|
27
23
|
module SiftCmd
|
28
24
|
DEFAULT_SIFT_EXT = :csv
|
29
25
|
DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}"
|
@@ -260,7 +256,7 @@ module CLI
|
|
260
256
|
puts
|
261
257
|
|
262
258
|
if dry_run
|
263
|
-
puts sifter
|
259
|
+
puts sifter
|
264
260
|
else
|
265
261
|
start_spin('Saving sifted data to file')
|
266
262
|
|
@@ -17,10 +17,6 @@ require 'nhkore/util'
|
|
17
17
|
|
18
18
|
|
19
19
|
module NHKore
|
20
|
-
###
|
21
|
-
# @author Jonathan Bradley Whited
|
22
|
-
# @since 0.3.4
|
23
|
-
###
|
24
20
|
class DatetimeParser
|
25
21
|
extend AttrBool::Ext
|
26
22
|
|
@@ -181,7 +177,7 @@ module NHKore
|
|
181
177
|
return self if @min_or_max
|
182
178
|
|
183
179
|
has_small = false
|
184
|
-
jst_now = Util.jst_now
|
180
|
+
jst_now = Util.jst_now
|
185
181
|
|
186
182
|
# Must be from smallest to biggest.
|
187
183
|
|
data/lib/nhkore/defn.rb
CHANGED
@@ -16,17 +16,13 @@ require 'nhkore/word'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class Defn
|
24
20
|
attr_reader :hyoukis
|
25
21
|
attr_accessor :text
|
26
22
|
attr_reader :words
|
27
23
|
|
28
24
|
def initialize
|
29
|
-
super
|
25
|
+
super
|
30
26
|
|
31
27
|
@hyoukis = []
|
32
28
|
@text = ''.dup
|
data/lib/nhkore/dict.rb
CHANGED
@@ -14,15 +14,11 @@ require 'nhkore/error'
|
|
14
14
|
|
15
15
|
|
16
16
|
module NHKore
|
17
|
-
###
|
18
|
-
# @author Jonathan Bradley Whited
|
19
|
-
# @since 0.2.0
|
20
|
-
###
|
21
17
|
class Dict
|
22
18
|
attr_reader :entries
|
23
19
|
|
24
20
|
def initialize
|
25
|
-
super
|
21
|
+
super
|
26
22
|
|
27
23
|
@entries = {}
|
28
24
|
end
|
@@ -39,6 +35,7 @@ module NHKore
|
|
39
35
|
dict = Dict.new
|
40
36
|
|
41
37
|
hash.each do |id,array|
|
38
|
+
id = id.to_s.strip.downcase # 'RSHOK-K-003806', '0000'
|
42
39
|
entry = Entry.scrape(id,array,missingno: missingno,url: url)
|
43
40
|
|
44
41
|
next if entry.nil?
|
data/lib/nhkore/dict_scraper.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020-
|
6
|
+
# Copyright (c) 2020-2022 Jonathan Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
@@ -16,10 +16,6 @@ require 'nhkore/util'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class DictScraper < Scraper
|
24
20
|
attr_accessor :missingno
|
25
21
|
|
@@ -39,7 +35,7 @@ module NHKore
|
|
39
35
|
i = url.rindex(%r{[/\\]}) # Can be a URL or a file
|
40
36
|
i = i.nil? ? 0 : (i + 1) # If no match found, no path
|
41
37
|
|
42
|
-
basename = File.basename(url[i
|
38
|
+
basename = File.basename(url[i..],'.*') if basename.nil?
|
43
39
|
path = url[0...i]
|
44
40
|
|
45
41
|
return "#{path}#{basename}.out.dic"
|
data/lib/nhkore/entry.rb
CHANGED
@@ -14,10 +14,6 @@ require 'nhkore/util'
|
|
14
14
|
|
15
15
|
|
16
16
|
module NHKore
|
17
|
-
###
|
18
|
-
# @author Jonathan Bradley Whited
|
19
|
-
# @since 0.2.0
|
20
|
-
###
|
21
17
|
class Entry
|
22
18
|
HYOUKI_SEP = '・'
|
23
19
|
|
@@ -25,18 +21,16 @@ module NHKore
|
|
25
21
|
attr_accessor :id
|
26
22
|
|
27
23
|
def initialize
|
28
|
-
super
|
24
|
+
super
|
29
25
|
|
30
26
|
@defns = []
|
31
27
|
@id = nil
|
32
28
|
end
|
33
29
|
|
34
30
|
def build_defn
|
35
|
-
defns = []
|
36
31
|
i = 0
|
37
|
-
|
38
|
-
|
39
|
-
defns << "#{i += 1})#{defn}" # Japanese parenthesis
|
32
|
+
defns = @defns.map do |defn|
|
33
|
+
"#{i += 1})#{defn}" # Japanese parenthesis
|
40
34
|
end
|
41
35
|
|
42
36
|
return defns.join("\n")
|
data/lib/nhkore/error.rb
CHANGED
@@ -10,21 +10,11 @@
|
|
10
10
|
|
11
11
|
|
12
12
|
module NHKore
|
13
|
-
###
|
14
|
-
# @author Jonathan Bradley Whited
|
15
|
-
# @since 0.2.0
|
16
|
-
###
|
17
13
|
class Error < ::StandardError; end
|
18
14
|
|
19
|
-
# @since 0.2.0
|
20
15
|
class CLIError < Error; end
|
21
|
-
|
22
|
-
# @since 0.2.0
|
16
|
+
class Http404Error < Error; end
|
23
17
|
class ParseError < Error; end
|
24
|
-
|
25
|
-
# @since 0.2.0
|
26
18
|
class ScrapeError < Error; end
|
27
|
-
|
28
|
-
# @since 0.2.0
|
29
19
|
class ZipError < Error; end
|
30
20
|
end
|
data/lib/nhkore/fileable.rb
CHANGED
data/lib/nhkore/lib.rb
CHANGED
data/lib/nhkore/missingno.rb
CHANGED
@@ -13,10 +13,6 @@ require 'nhkore/util'
|
|
13
13
|
|
14
14
|
|
15
15
|
module NHKore
|
16
|
-
###
|
17
|
-
# @author Jonathan Bradley Whited
|
18
|
-
# @since 0.2.0
|
19
|
-
###
|
20
16
|
class Missingno
|
21
17
|
attr_reader :kanas
|
22
18
|
attr_reader :kanjis
|
@@ -68,13 +64,13 @@ module NHKore
|
|
68
64
|
def kana_from_kanji(kanji)
|
69
65
|
word = @kanjis[kanji]
|
70
66
|
|
71
|
-
return word
|
67
|
+
return word&.kana
|
72
68
|
end
|
73
69
|
|
74
70
|
def kanji_from_kana(kana)
|
75
71
|
word = @kanas[kana]
|
76
72
|
|
77
|
-
return word
|
73
|
+
return word&.kanji
|
78
74
|
end
|
79
75
|
end
|
80
76
|
end
|
data/lib/nhkore/news.rb
CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/util'
|
|
16
16
|
|
17
17
|
|
18
18
|
module NHKore
|
19
|
-
###
|
20
|
-
# @author Jonathan Bradley Whited
|
21
|
-
# @since 0.2.0
|
22
|
-
###
|
23
19
|
class News
|
24
20
|
include Fileable
|
25
21
|
|
@@ -30,7 +26,7 @@ module NHKore
|
|
30
26
|
attr_reader :sha256s
|
31
27
|
|
32
28
|
def initialize
|
33
|
-
super
|
29
|
+
super
|
34
30
|
|
35
31
|
@articles = {}
|
36
32
|
@sha256s = {}
|
@@ -127,10 +123,6 @@ module NHKore
|
|
127
123
|
end
|
128
124
|
end
|
129
125
|
|
130
|
-
###
|
131
|
-
# @author Jonathan Bradley Whited
|
132
|
-
# @since 0.2.0
|
133
|
-
###
|
134
126
|
class FutsuuNews < News
|
135
127
|
DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
|
136
128
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
@@ -144,14 +136,10 @@ module NHKore
|
|
144
136
|
end
|
145
137
|
|
146
138
|
def save_file(file=DEFAULT_FILE,**kargs)
|
147
|
-
super
|
139
|
+
super
|
148
140
|
end
|
149
141
|
end
|
150
142
|
|
151
|
-
###
|
152
|
-
# @author Jonathan Bradley Whited
|
153
|
-
# @since 0.2.0
|
154
|
-
###
|
155
143
|
class YasashiiNews < News
|
156
144
|
DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
|
157
145
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
@@ -165,7 +153,7 @@ module NHKore
|
|
165
153
|
end
|
166
154
|
|
167
155
|
def save_file(file=DEFAULT_FILE,**kargs)
|
168
|
-
super
|
156
|
+
super
|
169
157
|
end
|
170
158
|
end
|
171
159
|
end
|
data/lib/nhkore/polisher.rb
CHANGED
@@ -13,10 +13,6 @@ require 'nhkore/word'
|
|
13
13
|
|
14
14
|
|
15
15
|
module NHKore
|
16
|
-
###
|
17
|
-
# @author Jonathan Bradley Whited
|
18
|
-
# @since 0.2.0
|
19
|
-
###
|
20
16
|
class Polisher
|
21
17
|
def begin_polish(str)
|
22
18
|
return str
|
@@ -52,10 +48,6 @@ module NHKore
|
|
52
48
|
end
|
53
49
|
end
|
54
50
|
|
55
|
-
###
|
56
|
-
# @author Jonathan Bradley Whited
|
57
|
-
# @since 0.2.0
|
58
|
-
###
|
59
51
|
class BasicPolisher < Polisher
|
60
52
|
def end_polish(str)
|
61
53
|
# Keep Japanese dots in names:
|
@@ -72,10 +64,6 @@ module NHKore
|
|
72
64
|
end
|
73
65
|
end
|
74
66
|
|
75
|
-
###
|
76
|
-
# @author Jonathan Bradley Whited
|
77
|
-
# @since 0.2.0
|
78
|
-
###
|
79
67
|
class BestPolisher < BasicPolisher
|
80
68
|
end
|
81
69
|
end
|
data/lib/nhkore/scraper.rb
CHANGED
@@ -13,15 +13,12 @@ require 'attr_bool'
|
|
13
13
|
require 'nokogiri'
|
14
14
|
require 'open-uri'
|
15
15
|
|
16
|
+
require 'nhkore/error'
|
16
17
|
require 'nhkore/user_agents'
|
17
18
|
require 'nhkore/util'
|
18
19
|
|
19
20
|
|
20
21
|
module NHKore
|
21
|
-
###
|
22
|
-
# @author Jonathan Bradley Whited
|
23
|
-
# @since 0.2.0
|
24
|
-
###
|
25
22
|
class Scraper
|
26
23
|
extend AttrBool::Ext
|
27
24
|
|
@@ -177,7 +174,13 @@ module NHKore
|
|
177
174
|
retry
|
178
175
|
# Must come after HTTPRedirect since a subclass of HTTPError.
|
179
176
|
rescue OpenURI::HTTPError => e
|
180
|
-
|
177
|
+
msg = "HTTP error[#{e}] at URL[#{url}]"
|
178
|
+
|
179
|
+
if e.to_s.include?('404 Not Found')
|
180
|
+
raise Http404Error,msg
|
181
|
+
else
|
182
|
+
raise e.exception(msg)
|
183
|
+
end
|
181
184
|
rescue SocketError => e
|
182
185
|
if (max_retries -= 1) < 0
|
183
186
|
raise e.exception("Socket error[#{e}] at URL[#{url}]")
|
data/lib/nhkore/search_link.rb
CHANGED
@@ -17,10 +17,6 @@ require 'nhkore/util'
|
|
17
17
|
|
18
18
|
|
19
19
|
module NHKore
|
20
|
-
###
|
21
|
-
# @author Jonathan Bradley Whited
|
22
|
-
# @since 0.2.0
|
23
|
-
###
|
24
20
|
class SearchLink
|
25
21
|
extend AttrBool::Ext
|
26
22
|
|
@@ -45,11 +41,11 @@ module NHKore
|
|
45
41
|
def encode_with(coder)
|
46
42
|
# Order matters.
|
47
43
|
|
48
|
-
coder[:url] = @url
|
44
|
+
coder[:url] = @url&.to_s
|
49
45
|
coder[:scraped] = @scraped
|
50
|
-
coder[:datetime] = @datetime
|
46
|
+
coder[:datetime] = @datetime&.iso8601
|
51
47
|
coder[:title] = @title
|
52
|
-
coder[:futsuurl] = @futsuurl
|
48
|
+
coder[:futsuurl] = @futsuurl&.to_s
|
53
49
|
coder[:sha256] = @sha256
|
54
50
|
end
|
55
51
|
|
@@ -86,13 +82,13 @@ module NHKore
|
|
86
82
|
end
|
87
83
|
|
88
84
|
def futsuurl=(value)
|
89
|
-
# Don't store URI, store String.
|
90
|
-
@futsuurl = value
|
85
|
+
# Don't store URI, store String or nil.
|
86
|
+
@futsuurl = value&.to_s
|
91
87
|
end
|
92
88
|
|
93
89
|
def url=(value)
|
94
|
-
# Don't store URI, store String.
|
95
|
-
@url = value
|
90
|
+
# Don't store URI, store String or nil.
|
91
|
+
@url = value&.to_s
|
96
92
|
end
|
97
93
|
|
98
94
|
def to_s(mini: false)
|
@@ -114,10 +110,6 @@ module NHKore
|
|
114
110
|
end
|
115
111
|
end
|
116
112
|
|
117
|
-
###
|
118
|
-
# @author Jonathan Bradley Whited
|
119
|
-
# @since 0.2.0
|
120
|
-
###
|
121
113
|
class SearchLinks
|
122
114
|
include Fileable
|
123
115
|
|
@@ -136,13 +128,13 @@ module NHKore
|
|
136
128
|
attr_reader :links
|
137
129
|
|
138
130
|
def initialize
|
139
|
-
super
|
131
|
+
super
|
140
132
|
|
141
133
|
@links = {}
|
142
134
|
end
|
143
135
|
|
144
136
|
def add_link(link)
|
145
|
-
url = link.url
|
137
|
+
url = link.url&.to_s
|
146
138
|
|
147
139
|
return self if @links.key?(url)
|
148
140
|
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#++
|
10
10
|
|
11
11
|
|
12
|
+
require 'net/http'
|
12
13
|
require 'uri'
|
13
14
|
|
14
15
|
require 'nhkore/error'
|
@@ -18,10 +19,6 @@ require 'nhkore/util'
|
|
18
19
|
|
19
20
|
|
20
21
|
module NHKore
|
21
|
-
###
|
22
|
-
# @author Jonathan Bradley Whited
|
23
|
-
# @since 0.2.0
|
24
|
-
###
|
25
22
|
class SearchScraper < Scraper
|
26
23
|
DEFAULT_RESULT_COUNT = 100
|
27
24
|
FUTSUU_SITE = 'nhk.or.jp/news/html/'
|
@@ -34,10 +31,11 @@ module NHKore
|
|
34
31
|
YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
|
35
32
|
|
36
33
|
IGNORE_LINK_REGEX = %r{
|
37
|
-
/about\.html?
|
38
|
-
|/movieplayer\.html?
|
39
|
-
|/audio\.html?
|
40
|
-
|/news/easy/index\.html?
|
34
|
+
/about\.html? # https://www3.nhk.or.jp/news/easy/about.html
|
35
|
+
|/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
|
36
|
+
|/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
|
37
|
+
|/news/easy/index\.html? # https://www3.nhk.or.jp/news/easy/index.html
|
38
|
+
|/disaster_earthquake.html # https://www3.nhk.or.jp/news/easy/article/disaster_earthquake.html
|
41
39
|
|
42
40
|
# https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
|
43
41
|
# https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
|
@@ -47,7 +45,7 @@ module NHKore
|
|
47
45
|
# Search Engines are strict, so trigger using the default HTTP header fields
|
48
46
|
# with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
|
49
47
|
def initialize(url,eat_cookie: true,header: {},**kargs)
|
50
|
-
super
|
48
|
+
super
|
51
49
|
end
|
52
50
|
|
53
51
|
def ignore_link?(link,cleaned: true)
|
@@ -56,17 +54,35 @@ module NHKore
|
|
56
54
|
link = Util.unspace_web_str(link).downcase unless cleaned
|
57
55
|
|
58
56
|
return true if link.empty?
|
59
|
-
|
60
57
|
return true if IGNORE_LINK_REGEX.match?(link)
|
61
|
-
|
62
58
|
return false
|
63
59
|
end
|
60
|
+
|
61
|
+
# Example: https://www3.nhk.or.jp/news/easy/k10014150691000/k10014150691000.html
|
62
|
+
def fetch_valid_link?(link)
|
63
|
+
uri = begin
|
64
|
+
URI(link)
|
65
|
+
rescue StandardError
|
66
|
+
return false # Bad URL.
|
67
|
+
end
|
68
|
+
|
69
|
+
begin
|
70
|
+
ssl = uri.scheme.to_s.strip.downcase.include?('https')
|
71
|
+
|
72
|
+
Net::HTTP.start(uri.host,uri.port,use_ssl: ssl) do |http|
|
73
|
+
resp = http.head(uri.request_uri)
|
74
|
+
code = resp.code
|
75
|
+
|
76
|
+
return code != '404'
|
77
|
+
end
|
78
|
+
rescue StandardError
|
79
|
+
# Ignore; try actually scraping the article anyway.
|
80
|
+
end
|
81
|
+
|
82
|
+
return true
|
83
|
+
end
|
64
84
|
end
|
65
85
|
|
66
|
-
###
|
67
|
-
# @author Jonathan Bradley Whited
|
68
|
-
# @since 0.2.0
|
69
|
-
###
|
70
86
|
class BingScraper < SearchScraper
|
71
87
|
attr_reader :regex
|
72
88
|
attr_reader :site
|
@@ -136,9 +152,8 @@ module NHKore
|
|
136
152
|
next_page.count = count
|
137
153
|
next_page.url = join_url(href)
|
138
154
|
end
|
139
|
-
elsif href =~ regex
|
155
|
+
elsif href =~ regex && fetch_valid_link?(href)
|
140
156
|
slinks.add_link(SearchLink.new(href))
|
141
|
-
|
142
157
|
link_count += 1
|
143
158
|
end
|
144
159
|
end
|
@@ -165,10 +180,9 @@ module NHKore
|
|
165
180
|
rss_links << link
|
166
181
|
|
167
182
|
next if ignore_link?(link)
|
168
|
-
next if link !~ regex
|
183
|
+
next if link !~ regex || !fetch_valid_link?(link)
|
169
184
|
|
170
185
|
slinks.add_link(SearchLink.new(link))
|
171
|
-
|
172
186
|
link_count += 1
|
173
187
|
end
|
174
188
|
|
@@ -192,17 +206,13 @@ module NHKore
|
|
192
206
|
end
|
193
207
|
end
|
194
208
|
|
195
|
-
###
|
196
|
-
# @author Jonathan Bradley Whited
|
197
|
-
# @since 0.2.0
|
198
|
-
###
|
199
209
|
class NextPage
|
200
210
|
attr_accessor :count
|
201
211
|
attr_accessor :rss_links
|
202
212
|
attr_accessor :url
|
203
213
|
|
204
214
|
def initialize
|
205
|
-
super
|
215
|
+
super
|
206
216
|
|
207
217
|
@count = -1
|
208
218
|
@rss_links = nil
|