nhkore 0.3.17 → 0.3.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +1 -1
- data/CHANGELOG.md +24 -1
- data/Gemfile +14 -1
- data/Gemfile.lock +29 -29
- data/README.md +2 -7
- data/Rakefile +19 -52
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +41 -46
- data/lib/nhkore/article.rb +9 -11
- data/lib/nhkore/article_scraper.rb +30 -29
- data/lib/nhkore/cleaner.rb +1 -3
- data/lib/nhkore/cli/fx_cmd.rb +17 -22
- data/lib/nhkore/cli/get_cmd.rb +5 -7
- data/lib/nhkore/cli/news_cmd.rb +14 -19
- data/lib/nhkore/cli/search_cmd.rb +11 -14
- data/lib/nhkore/cli/sift_cmd.rb +13 -15
- data/lib/nhkore/datetime_parser.rb +35 -37
- data/lib/nhkore/defn.rb +2 -4
- data/lib/nhkore/dict.rb +1 -3
- data/lib/nhkore/dict_scraper.rb +1 -3
- data/lib/nhkore/entry.rb +1 -3
- data/lib/nhkore/error.rb +1 -2
- data/lib/nhkore/fileable.rb +1 -2
- data/lib/nhkore/lib.rb +5 -12
- data/lib/nhkore/missingno.rb +1 -3
- data/lib/nhkore/news.rb +7 -10
- data/lib/nhkore/polisher.rb +1 -3
- data/lib/nhkore/scraper.rb +23 -13
- data/lib/nhkore/search_link.rb +11 -13
- data/lib/nhkore/search_scraper.rb +26 -15
- data/lib/nhkore/sifter.rb +7 -9
- data/lib/nhkore/splitter.rb +1 -3
- data/lib/nhkore/util.rb +8 -8
- data/lib/nhkore/variator.rb +3 -4
- data/lib/nhkore/version.rb +2 -3
- data/lib/nhkore/word.rb +8 -10
- data/lib/nhkore.rb +3 -11
- data/nhkore.gemspec +41 -47
- data/samples/looper.rb +1 -2
- data/test/nhkore/test_helper.rb +1 -8
- data/test/nhkore_test.rb +5 -9
- metadata +55 -139
- data/lib/nhkore/user_agents.rb +0 -1172
- data/yard/templates/default/layout/html/footer.erb +0 -5
@@ -3,19 +3,17 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'attr_bool'
|
13
12
|
require 'date'
|
14
13
|
require 'time'
|
15
14
|
|
16
15
|
require 'nhkore/util'
|
17
16
|
|
18
|
-
|
19
17
|
module NHKore
|
20
18
|
class DatetimeParser
|
21
19
|
extend AttrBool::Ext
|
@@ -55,10 +53,10 @@ module NHKore
|
|
55
53
|
# Assume this millennium.
|
56
54
|
# So if the current year is 2200, and year is 150,
|
57
55
|
# then it will be 2000 + 150 = 2150.
|
58
|
-
|
56
|
+
elsif millennium >= 1000
|
59
57
|
# Assume previous millennium (2000 -> 1000),
|
60
58
|
# so year 999 will become 1999.
|
61
|
-
millennium -= 1000
|
59
|
+
millennium -= 1000
|
62
60
|
end
|
63
61
|
|
64
62
|
year = millennium + year
|
@@ -155,7 +153,7 @@ module NHKore
|
|
155
153
|
|
156
154
|
attr_reader? :min_or_max
|
157
155
|
|
158
|
-
def initialize(year=nil,month=nil,day=nil,hour=nil,min=nil,sec=nil)
|
156
|
+
def initialize(year = nil,month = nil,day = nil,hour = nil,min = nil,sec = nil)
|
159
157
|
super()
|
160
158
|
|
161
159
|
set!(year,month,day,hour,min,sec)
|
@@ -185,66 +183,66 @@ module NHKore
|
|
185
183
|
@sec = other.sec unless @has_sec
|
186
184
|
has_small = true
|
187
185
|
else
|
188
|
-
if has_small
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
186
|
+
@sec = if has_small
|
187
|
+
jst_now.sec
|
188
|
+
else
|
189
|
+
is_from ? 0 : 59
|
190
|
+
end
|
193
191
|
end
|
194
192
|
|
195
193
|
if @has_min || other.has_min?
|
196
194
|
@min = other.min unless @has_min
|
197
195
|
has_small = true
|
198
196
|
else
|
199
|
-
if has_small
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
197
|
+
@min = if has_small
|
198
|
+
jst_now.min
|
199
|
+
else
|
200
|
+
is_from ? 0 : 59
|
201
|
+
end
|
204
202
|
end
|
205
203
|
|
206
204
|
if @has_hour || other.has_hour?
|
207
205
|
@hour = other.hour unless @has_hour
|
208
206
|
has_small = true
|
209
207
|
else
|
210
|
-
if has_small
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
208
|
+
@hour = if has_small
|
209
|
+
jst_now.hour
|
210
|
+
else
|
211
|
+
is_from ? 0 : 23
|
212
|
+
end
|
215
213
|
end
|
216
214
|
|
217
215
|
if @has_day || other.has_day?
|
218
216
|
@day = other.day unless @has_day
|
219
217
|
has_small = true
|
220
218
|
else
|
221
|
-
if has_small
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
219
|
+
@day = if has_small
|
220
|
+
jst_now.day
|
221
|
+
else
|
222
|
+
is_from ? 1 : :last_day
|
223
|
+
end
|
226
224
|
end
|
227
225
|
|
228
226
|
if @has_month || other.has_month?
|
229
227
|
@month = other.month unless @has_month
|
230
228
|
has_small = true
|
231
229
|
else
|
232
|
-
if has_small
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
230
|
+
@month = if has_small
|
231
|
+
jst_now.month
|
232
|
+
else
|
233
|
+
is_from ? 1 : 12
|
234
|
+
end
|
237
235
|
end
|
238
236
|
|
239
237
|
if @has_year || other.has_year?
|
240
238
|
@year = other.year unless @has_year
|
241
239
|
has_small = true # rubocop:disable Lint/UselessAssignment
|
242
240
|
else
|
243
|
-
if has_small
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
241
|
+
@year = if has_small
|
242
|
+
jst_now.year
|
243
|
+
else
|
244
|
+
is_from ? Util::MIN_SANE_YEAR : jst_now.year
|
245
|
+
end
|
248
246
|
end
|
249
247
|
|
250
248
|
# Must be after setting @year & @month.
|
@@ -289,7 +287,7 @@ module NHKore
|
|
289
287
|
return self
|
290
288
|
end
|
291
289
|
|
292
|
-
def set!(year=nil,month=nil,day=nil,hour=nil,min=nil,sec=nil)
|
290
|
+
def set!(year = nil,month = nil,day = nil,hour = nil,min = nil,sec = nil)
|
293
291
|
@year = year
|
294
292
|
@month = month
|
295
293
|
@day = day
|
data/lib/nhkore/defn.rb
CHANGED
@@ -3,18 +3,16 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nokogiri'
|
13
12
|
|
14
13
|
require 'nhkore/util'
|
15
14
|
require 'nhkore/word'
|
16
15
|
|
17
|
-
|
18
16
|
module NHKore
|
19
17
|
class Defn
|
20
18
|
attr_reader :hyoukis
|
@@ -35,7 +33,7 @@ module NHKore
|
|
35
33
|
|
36
34
|
hyoukis = hash['hyouki']
|
37
35
|
|
38
|
-
hyoukis&.each
|
36
|
+
hyoukis&.each do |hyouki|
|
39
37
|
next if hyouki.nil?
|
40
38
|
next if (hyouki = Util.strip_web_str(hyouki)).empty?
|
41
39
|
|
data/lib/nhkore/dict.rb
CHANGED
@@ -3,16 +3,14 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nhkore/entry'
|
13
12
|
require 'nhkore/error'
|
14
13
|
|
15
|
-
|
16
14
|
module NHKore
|
17
15
|
class Dict
|
18
16
|
attr_reader :entries
|
data/lib/nhkore/dict_scraper.rb
CHANGED
@@ -3,18 +3,16 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nhkore/dict'
|
13
12
|
require 'nhkore/error'
|
14
13
|
require 'nhkore/scraper'
|
15
14
|
require 'nhkore/util'
|
16
15
|
|
17
|
-
|
18
16
|
module NHKore
|
19
17
|
class DictScraper < Scraper
|
20
18
|
attr_accessor :missingno
|
data/lib/nhkore/entry.rb
CHANGED
@@ -3,16 +3,14 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nhkore/defn'
|
13
12
|
require 'nhkore/util'
|
14
13
|
|
15
|
-
|
16
14
|
module NHKore
|
17
15
|
class Entry
|
18
16
|
HYOUKI_SEP = '・'
|
data/lib/nhkore/error.rb
CHANGED
data/lib/nhkore/fileable.rb
CHANGED
@@ -3,12 +3,11 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
module NHKore
|
13
12
|
module Fileable
|
14
13
|
def self.included(mod)
|
data/lib/nhkore/lib.rb
CHANGED
@@ -3,11 +3,15 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
+
###
|
12
|
+
# Include this file to only require the files needed to use this
|
13
|
+
# Gem as a library (i.e., don't include CLI-related files).
|
14
|
+
###
|
11
15
|
|
12
16
|
require 'nhkore/article'
|
13
17
|
require 'nhkore/article_scraper'
|
@@ -27,18 +31,7 @@ require 'nhkore/search_link'
|
|
27
31
|
require 'nhkore/search_scraper'
|
28
32
|
require 'nhkore/sifter'
|
29
33
|
require 'nhkore/splitter'
|
30
|
-
require 'nhkore/user_agents'
|
31
34
|
require 'nhkore/util'
|
32
35
|
require 'nhkore/variator'
|
33
36
|
require 'nhkore/version'
|
34
37
|
require 'nhkore/word'
|
35
|
-
|
36
|
-
|
37
|
-
module NHKore
|
38
|
-
###
|
39
|
-
# Include this file to only require the files needed to use this
|
40
|
-
# Gem as a library (i.e., don't include CLI-related files).
|
41
|
-
###
|
42
|
-
module Lib
|
43
|
-
end
|
44
|
-
end
|
data/lib/nhkore/missingno.rb
CHANGED
@@ -3,15 +3,13 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nhkore/util'
|
13
12
|
|
14
|
-
|
15
13
|
module NHKore
|
16
14
|
class Missingno
|
17
15
|
attr_reader :kanas
|
data/lib/nhkore/news.rb
CHANGED
@@ -3,18 +3,16 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nhkore/article'
|
13
12
|
require 'nhkore/error'
|
14
13
|
require 'nhkore/fileable'
|
15
14
|
require 'nhkore/util'
|
16
15
|
|
17
|
-
|
18
16
|
module NHKore
|
19
17
|
class News
|
20
18
|
include Fileable
|
@@ -60,14 +58,14 @@ module NHKore
|
|
60
58
|
coder[:articles] = @articles
|
61
59
|
end
|
62
60
|
|
63
|
-
def self.load_data(data,article_class: Article,file: nil,news_class: News,overwrite: false,**
|
61
|
+
def self.load_data(data,article_class: Article,file: nil,news_class: News,overwrite: false,**_kargs)
|
64
62
|
data = Util.load_yaml(data,file: file)
|
65
63
|
|
66
64
|
articles = data[:articles]
|
67
65
|
|
68
66
|
news = news_class.new
|
69
67
|
|
70
|
-
articles&.each
|
68
|
+
articles&.each do |key,hash|
|
71
69
|
key = key.to_s # Change from a symbol
|
72
70
|
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
73
71
|
end
|
@@ -99,7 +97,6 @@ module NHKore
|
|
99
97
|
@articles.each_value do |a|
|
100
98
|
if a.sha256 == sha256
|
101
99
|
article = a
|
102
|
-
|
103
100
|
break
|
104
101
|
end
|
105
102
|
end
|
@@ -131,11 +128,11 @@ module NHKore
|
|
131
128
|
return News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
|
132
129
|
end
|
133
130
|
|
134
|
-
def self.load_file(file=DEFAULT_FILE,**kargs)
|
131
|
+
def self.load_file(file = DEFAULT_FILE,**kargs)
|
135
132
|
return News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
|
136
133
|
end
|
137
134
|
|
138
|
-
def save_file(file=DEFAULT_FILE,**kargs)
|
135
|
+
def save_file(file = DEFAULT_FILE,**kargs)
|
139
136
|
super
|
140
137
|
end
|
141
138
|
end
|
@@ -148,11 +145,11 @@ module NHKore
|
|
148
145
|
return News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
|
149
146
|
end
|
150
147
|
|
151
|
-
def self.load_file(file=DEFAULT_FILE,**kargs)
|
148
|
+
def self.load_file(file = DEFAULT_FILE,**kargs)
|
152
149
|
return News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
|
153
150
|
end
|
154
151
|
|
155
|
-
def save_file(file=DEFAULT_FILE,**kargs)
|
152
|
+
def save_file(file = DEFAULT_FILE,**kargs)
|
156
153
|
super
|
157
154
|
end
|
158
155
|
end
|
data/lib/nhkore/polisher.rb
CHANGED
@@ -3,15 +3,13 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'nhkore/word'
|
13
12
|
|
14
|
-
|
15
13
|
module NHKore
|
16
14
|
class Polisher
|
17
15
|
def begin_polish(str)
|
data/lib/nhkore/scraper.rb
CHANGED
@@ -3,30 +3,36 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'attr_bool'
|
13
12
|
require 'nokogiri'
|
14
13
|
require 'open-uri'
|
14
|
+
require 'ronin/web/user_agents'
|
15
15
|
|
16
16
|
require 'nhkore/error'
|
17
|
-
require 'nhkore/user_agents'
|
18
17
|
require 'nhkore/util'
|
19
18
|
|
20
|
-
|
21
19
|
module NHKore
|
22
20
|
class Scraper
|
23
21
|
extend AttrBool::Ext
|
24
22
|
|
25
23
|
DEFAULT_HEADER = {
|
26
|
-
|
27
|
-
|
28
|
-
|
24
|
+
# See for better ones:
|
25
|
+
# - https://www.useragentstring.com/pages/Chrome/
|
26
|
+
'user-agent' => Ronin::Web::UserAgents.random,
|
27
|
+
|
28
|
+
'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;' \
|
29
|
+
'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
30
|
+
'accept-language' => 'en;q=0.9,ja-JP;q=0.8,ja',
|
31
|
+
'cache-control' => 'max-age=0',
|
29
32
|
'dnt' => '1',
|
33
|
+
'ect' => '4g',
|
34
|
+
'priority' => 'u=0, i',
|
35
|
+
'upgrade-insecure-requests' => '1',
|
30
36
|
}.freeze
|
31
37
|
|
32
38
|
attr_accessor? :eat_cookie
|
@@ -48,7 +54,7 @@ module NHKore
|
|
48
54
|
# (time-consuming) operation since it opens the URL again, but necessary for some URLs.
|
49
55
|
# @param redirect_rule [nil,:lenient,:strict]
|
50
56
|
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
|
51
|
-
|
57
|
+
redirect_rule: :strict,str_or_io: nil,**kargs)
|
52
58
|
super()
|
53
59
|
|
54
60
|
if !header.nil? && !is_file
|
@@ -106,7 +112,7 @@ module NHKore
|
|
106
112
|
return URI.join(@url,relative_url)
|
107
113
|
end
|
108
114
|
|
109
|
-
def open(url,str_or_io=nil,is_file: @is_file)
|
115
|
+
def open(url,str_or_io = nil,is_file: @is_file)
|
110
116
|
@is_file = is_file
|
111
117
|
@str_or_io = str_or_io
|
112
118
|
@url = url
|
@@ -155,16 +161,20 @@ module NHKore
|
|
155
161
|
case @redirect_rule
|
156
162
|
when :lenient,:strict
|
157
163
|
if redirect_uri.scheme != top_uri.scheme
|
158
|
-
raise redirect.exception(
|
159
|
-
"scheme[#{
|
164
|
+
raise redirect.exception(
|
165
|
+
"redirect scheme[#{redirect_uri.scheme}] does not match original " \
|
166
|
+
"scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}"
|
167
|
+
)
|
160
168
|
end
|
161
169
|
|
162
170
|
if @redirect_rule == :strict
|
163
171
|
redirect_domain = Util.domain(redirect_uri.host)
|
164
172
|
|
165
173
|
if redirect_domain != top_domain
|
166
|
-
raise redirect.exception(
|
167
|
-
"domain[#{
|
174
|
+
raise redirect.exception(
|
175
|
+
"redirect domain[#{redirect_domain}] does not match original " \
|
176
|
+
"domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}"
|
177
|
+
)
|
168
178
|
end
|
169
179
|
end
|
170
180
|
end
|
data/lib/nhkore/search_link.rb
CHANGED
@@ -3,19 +3,17 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'attr_bool'
|
13
12
|
require 'time'
|
14
13
|
|
15
14
|
require 'nhkore/fileable'
|
16
15
|
require 'nhkore/util'
|
17
16
|
|
18
|
-
|
19
17
|
module NHKore
|
20
18
|
class SearchLink
|
21
19
|
extend AttrBool::Ext
|
@@ -49,7 +47,7 @@ module NHKore
|
|
49
47
|
coder[:sha256] = @sha256
|
50
48
|
end
|
51
49
|
|
52
|
-
def self.load_data(
|
50
|
+
def self.load_data(_key,hash)
|
53
51
|
slink = SearchLink.new(
|
54
52
|
hash[:url],
|
55
53
|
scraped: hash[:scraped],
|
@@ -74,11 +72,11 @@ module NHKore
|
|
74
72
|
end
|
75
73
|
|
76
74
|
def datetime=(value)
|
77
|
-
if value.is_a?(Time)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
75
|
+
@datetime = if value.is_a?(Time)
|
76
|
+
value
|
77
|
+
else
|
78
|
+
Util.empty_web_str?(value) ? nil : Time.iso8601(value)
|
79
|
+
end
|
82
80
|
end
|
83
81
|
|
84
82
|
def futsuurl=(value)
|
@@ -143,8 +141,8 @@ module NHKore
|
|
143
141
|
return self
|
144
142
|
end
|
145
143
|
|
146
|
-
def each(&
|
147
|
-
return @links.each(&
|
144
|
+
def each(&)
|
145
|
+
return @links.each(&)
|
148
146
|
end
|
149
147
|
|
150
148
|
def encode_with(coder)
|
@@ -153,14 +151,14 @@ module NHKore
|
|
153
151
|
coder[:links] = @links
|
154
152
|
end
|
155
153
|
|
156
|
-
def self.load_data(data,file: nil,**
|
154
|
+
def self.load_data(data,file: nil,**_kargs)
|
157
155
|
data = Util.load_yaml(data,file: file)
|
158
156
|
|
159
157
|
links = data[:links]
|
160
158
|
|
161
159
|
slinks = SearchLinks.new
|
162
160
|
|
163
|
-
links&.each
|
161
|
+
links&.each do |key,hash|
|
164
162
|
key = key.to_s unless key.nil?
|
165
163
|
slinks.links[key] = SearchLink.load_data(key,hash)
|
166
164
|
end
|
@@ -3,12 +3,11 @@
|
|
3
3
|
|
4
4
|
#--
|
5
5
|
# This file is part of NHKore.
|
6
|
-
# Copyright (c) 2020
|
6
|
+
# Copyright (c) 2020 Bradley Whited
|
7
7
|
#
|
8
8
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
9
9
|
#++
|
10
10
|
|
11
|
-
|
12
11
|
require 'net/http'
|
13
12
|
require 'uri'
|
14
13
|
|
@@ -17,7 +16,6 @@ require 'nhkore/scraper'
|
|
17
16
|
require 'nhkore/search_link'
|
18
17
|
require 'nhkore/util'
|
19
18
|
|
20
|
-
|
21
19
|
module NHKore
|
22
20
|
class SearchScraper < Scraper
|
23
21
|
DEFAULT_RESULT_COUNT = 100
|
@@ -60,8 +58,10 @@ module NHKore
|
|
60
58
|
|
61
59
|
# Example: https://www3.nhk.or.jp/news/easy/k10014150691000/k10014150691000.html
|
62
60
|
def fetch_valid_link?(link)
|
63
|
-
uri =
|
64
|
-
|
61
|
+
uri = nil
|
62
|
+
|
63
|
+
begin
|
64
|
+
uri = URI(link)
|
65
65
|
rescue StandardError
|
66
66
|
return false # Bad URL.
|
67
67
|
end
|
@@ -111,29 +111,40 @@ module NHKore
|
|
111
111
|
super(url,**kargs)
|
112
112
|
end
|
113
113
|
|
114
|
-
|
114
|
+
# FIXME: Bing no longer allows `count`.
|
115
|
+
# rubocop:disable Lint/UnusedMethodArgument
|
116
|
+
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**_kargs)
|
115
117
|
url = ''.dup
|
116
118
|
|
117
119
|
url << 'https://www.bing.com/search?'
|
118
120
|
url << URI.encode_www_form(
|
119
121
|
q: "site:#{site}",
|
120
|
-
|
122
|
+
qs: 'n',
|
123
|
+
sp: '-1',
|
124
|
+
lq: '0',
|
125
|
+
pq: "site:#{site}",
|
126
|
+
sc: '1-25',
|
127
|
+
sk: '',
|
128
|
+
first: '1',
|
129
|
+
FORM: 'PERE',
|
121
130
|
)
|
122
131
|
|
123
132
|
return url
|
124
133
|
end
|
134
|
+
# rubocop:enable Lint/UnusedMethodArgument
|
125
135
|
|
126
|
-
def scrape(slinks,page=NextPage.new())
|
127
|
-
next_page,
|
136
|
+
def scrape(slinks,page = NextPage.new())
|
137
|
+
next_page,_link_count = scrape_html(slinks,page)
|
128
138
|
|
129
|
-
|
130
|
-
|
131
|
-
|
139
|
+
# FIXME: Bing no longer allows RSS pages after the first page.
|
140
|
+
# if link_count <= 0
|
141
|
+
# scrape_rss(slinks,page,next_page)
|
142
|
+
# end
|
132
143
|
|
133
144
|
return next_page
|
134
145
|
end
|
135
146
|
|
136
|
-
def scrape_html(slinks,page,next_page=NextPage.new())
|
147
|
+
def scrape_html(slinks,page,next_page = NextPage.new())
|
137
148
|
doc = html_doc
|
138
149
|
link_count = 0
|
139
150
|
|
@@ -145,7 +156,7 @@ module NHKore
|
|
145
156
|
|
146
157
|
next if ignore_link?(href)
|
147
158
|
|
148
|
-
if (md = href.match(/first=(\d+)/))
|
159
|
+
if (md = href.match(/first=(\d+)/i)) && href =~ /FORM=PERE/i
|
149
160
|
count = md[1].to_i
|
150
161
|
|
151
162
|
if count > page.count && (next_page.count < 0 || count < next_page.count)
|
@@ -161,7 +172,7 @@ module NHKore
|
|
161
172
|
return [next_page,link_count]
|
162
173
|
end
|
163
174
|
|
164
|
-
def scrape_rss(slinks,page,next_page=NextPage.new())
|
175
|
+
def scrape_rss(slinks,page,next_page = NextPage.new())
|
165
176
|
link_count = 0
|
166
177
|
|
167
178
|
if !@is_file
|