nhkore 0.3.1 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +81 -3
- data/README.md +505 -9
- data/Rakefile +48 -8
- data/lib/nhkore.rb +1 -22
- data/lib/nhkore/app.rb +3 -1
- data/lib/nhkore/article.rb +24 -7
- data/lib/nhkore/article_scraper.rb +21 -16
- data/lib/nhkore/cli/news_cmd.rb +3 -2
- data/lib/nhkore/cli/search_cmd.rb +2 -2
- data/lib/nhkore/cli/sift_cmd.rb +9 -112
- data/lib/nhkore/datetime_parser.rb +342 -0
- data/lib/nhkore/dict_scraper.rb +1 -1
- data/lib/nhkore/lib.rb +59 -0
- data/lib/nhkore/news.rb +13 -4
- data/lib/nhkore/scraper.rb +21 -9
- data/lib/nhkore/search_link.rb +37 -19
- data/lib/nhkore/search_scraper.rb +1 -0
- data/lib/nhkore/sifter.rb +106 -51
- data/lib/nhkore/util.rb +12 -21
- data/lib/nhkore/variator.rb +1 -0
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +12 -7
- metadata +21 -5
@@ -0,0 +1,342 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'attr_bool'
|
25
|
+
require 'date'
|
26
|
+
require 'time'
|
27
|
+
|
28
|
+
require 'nhkore/util'
|
29
|
+
|
30
|
+
|
31
|
+
module NHKore
|
32
|
+
###
|
33
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
34
|
+
# @since 0.3.4
|
35
|
+
###
|
36
|
+
class DatetimeParser
|
37
|
+
# strptime() formats that parse_range() tries, top to bottom, against each
# side of a range; the first format that parses wins.
#
# Order matters! Longer formats must come before shorter ones, and each
# "%d ..." format must come before its "%Y ..." twin, because parse_range()
# only skips a "%d ..." format when the first token is longer than 2 chars.
#
# Only '%Y' is used for years (never '%y'), because parse!() only detects a
# year via '%Y'; short years are expanded by guess_year().
FMTS = [
  '%Y-%m-%d %H:%M',
  '%Y-%m-%d %H',
  '%Y-%m-%d',
  '%m-%d %H:%M',
  '%Y-%m %H:%M',
  '%m-%d %H',
  '%Y-%m %H',
  '%m-%d',
  '%Y-%m',
  '%d %H:%M',
  '%Y %H:%M', # Was '%y %H:%M', which dropped the year (parse!() ignores '%y')
  '%d %H',
  '%Y %H',
  '%H:%M',
  '%d',
  '%Y',
].freeze
|
56
|
+
|
57
|
+
# Expands a short year (less than 1000) into a full year, relative to
# Util::JST_YEAR; years of 1000 or more are returned untouched.
#
# - 0 up to (current 2-digit year + 1): this century.
# - Other values below 100: the previous century.
# - 100..999: this millennium if that is not past next year,
#   else the previous millennium (so 999 becomes 1999).
#
# @param year [Integer] the parsed year
# @return [Integer] the guessed full year
def self.guess_year(year)
  return year unless year < 1000

  current = Util::JST_YEAR
  century = current / 100 * 100 # 2120 -> 2100
  millennium = current / 1000 * 1000 # 2120 -> 2000

  # If year <= 23 (2022 -> 23), assume this century.
  return century + year if year <= ((current % 100) + 1)

  if year >= 100
    # If (2000 + 150) <= 2201 (if current year is 2200), keep this
    # millennium: 2000 + 150 = 2150. Else, step back a millennium
    # (2000 -> 1000), so 999 becomes 1999.
    fits_this_millennium = (millennium + year) <= (current + 1)
    millennium -= 1000 if !fits_this_millennium && millennium >= 1000

    millennium + year
  else
    # Assume previous century (2000 -> 1900).
    century -= 100 if century >= 100

    century + year
  end
end
|
88
|
+
|
89
|
+
# Parses a date time range into a "from" & "to" pair.
#
# Accepted shapes (each side is tried against FMTS, in order):
# - 'A...B' => from A to B
# - 'A...'  => from A to the maximum date time (see max!())
# - '...B'  => from the minimum date time (see min!()) to B
# - 'A'     => only "from" is parsed; "to" is derived from it by autofill!()
#
# @param value [String] the range to parse
# @return [Array,nil] [from,to] as returned by jst_time() (presumably Time
#         objects -- confirm against Util.jst_time()); nil if +value+ is
#         blank or just '...'
# @raise [ArgumentError] if a side of the range does not match any of FMTS
def self.parse_range(value)
  # Do not use unspace_web_str(), want spaces for formats.
  value = Util.strip_web_str(Util.reduce_space(value))
  values = value.split('...',2)

  # For '' (split() gives []) or '...' (split() gives ['','']).
  # If '...' is not caught here, the blank "from" side reaches the '%d'
  # check below & crashes on nil.length() with a NoMethodError.
  return nil if values.all? { |v| Util.empty_web_str?(v) }

  # For '2020...' or '...2020'.
  if value.include?('...')
    # values.length is always 2 because of 2 in split() above.

    # For '2020...'.
    if Util.empty_web_str?(values[1])
      values[1] = :infinity
    # For '...2020'.
    elsif Util.empty_web_str?(values[0])
      values[0] = :infinity
    end
  end

  datetimes = [
    DatetimeParser.new(), # "From" date time
    DatetimeParser.new(), # "To" date time
  ]
  last_fmt_index = FMTS.length - 1

  values.each_with_index() do |v,i|
    dt = datetimes[i]

    # Minimum/Maximum date time for '2020...' or '...2020'.
    if v == :infinity
      # "From" date time is index 0; "To" date time is index 1.
      (i == 0) ? dt.min!() : dt.max!()
      next
    end

    v = Util.strip_web_str(v)

    FMTS.each_with_index() do |fmt,fmt_index|
      begin
        # If don't do this, "%d" values will be parsed using "%d %H".
        # It seems as though strptime() ignores space.
        raise ArgumentError if fmt.include?(' ') && !v.include?(' ')

        # If don't do this, "%y..." values will be parsed using "%d...".
        # to_s() guards against a blank value (split() gives []).
        raise ArgumentError if fmt.start_with?('%d') && v.split(' ')[0].to_s().length > 2

        dt.parse!(v,fmt)

        break # No problem; this format worked
      rescue ArgumentError
        # Out of formats.
        raise if fmt_index >= last_fmt_index
      end
    end
  end

  from = datetimes[0]
  to = datetimes[1]

  from.autofill!(:from,to)
  to.autofill!(:to,from)

  return [from.jst_time(),to.jst_time()]
end
|
157
|
+
|
158
|
+
# Date time parts; set by set!(), parse!(), & autofill!().
attr_accessor :day,:hour,:min,:month,:sec,:year

# Flags for which parts were present in the format given to parse!().
# The trailing "?" macros come from the attr_bool Gem and define the
# has_*?() query methods that autofill!() calls.
attr_accessor? :has_day
attr_accessor? :has_hour
attr_accessor? :has_min
attr_accessor? :has_month
attr_accessor? :has_sec
attr_accessor? :has_year

# True once min!() or max!() has been called; autofill!() then does nothing.
attr_reader? :min_or_max
|
173
|
+
|
174
|
+
# Creates a parser holding the given parts, with every has_* flag off
# and not marked as a minimum/maximum date time.
#
# All parts are optional and default to nil.
def initialize(year=nil,month=nil,day=nil,hour=nil,min=nil,sec=nil)
  super()

  @min_or_max = false

  set!(year,month,day,hour,min,sec)

  # Nothing has been parsed yet, so no part counts as "present".
  self.has = false
end
|
182
|
+
|
183
|
+
# Fills in every part that was not parsed, using the other side of the
# range, the current JST time, or a boundary value.
#
# For each part, from smallest (sec) to biggest (year):
# - If this side has the part, keep it.
# - Else if the other side has it, copy the other side's value.
# - Else if a smaller part was present on either side, use the current
#   JST time's value.
# - Else use the boundary: lowest value for :from, highest for :to.
#
# Does nothing if this is a minimum/maximum date time (min!()/max!()).
#
# @param type [Symbol] :from or :to
# @param other [DatetimeParser] the other side of the range
# @return [self]
# @raise [ArgumentError] if +type+ is not :from or :to
def autofill!(type,other)
  raise ArgumentError,"invalid type[#{type}]" unless %i[from to].include?(type)

  is_from = (type == :from)

  return self if @min_or_max

  saw_smaller = false
  now = Util.jst_now

  # [part, "from" boundary, "to" boundary].
  # Must be from smallest to biggest.
  boundaries = [
    [:sec,0,59],
    [:min,0,59],
    [:hour,0,23],
    [:day,1,:last_day],
    [:month,1,12],
    [:year,Util::MIN_SANE_YEAR,now.year],
  ]

  boundaries.each do |part,lowest,highest|
    part_var = :"@#{part}"

    if instance_variable_get(:"@has_#{part}")
      saw_smaller = true
    elsif other.public_send(:"has_#{part}?")
      instance_variable_set(part_var,other.public_send(part))
      saw_smaller = true
    elsif saw_smaller
      instance_variable_set(part_var,now.public_send(part))
    else
      instance_variable_set(part_var,is_from ? lowest : highest)
    end
  end

  # Must be after setting @year & @month.
  @day = Date.new(@year,@month,-1).day if @day == :last_day

  self
end
|
273
|
+
|
274
|
+
# Turns this into the maximum date time: the last second of
# Util::JST_YEAR (ex: 2020-12-31 23:59:59).
#
# Also marks this as a minimum/maximum, so autofill!() leaves it alone.
#
# @return [self]
def max!
  @min_or_max = true

  set!(Util::JST_YEAR,12,31,23,59,59)
end
|
280
|
+
|
281
|
+
# Turns this into the minimum date time: the first second of
# Util::MIN_SANE_YEAR (ex: 1924-01-01 00:00:00).
#
# Also marks this as a minimum/maximum, so autofill!() leaves it alone.
#
# @return [self]
def min!
  @min_or_max = true

  set!(Util::MIN_SANE_YEAR,1,1,0,0,0)
end
|
287
|
+
|
288
|
+
# Parses +value+ with the strptime() format +fmt+, storing only the parts
# that +fmt+ contains and flagging them in the matching has_* variable.
#
# A parsed year is passed through the class's guess_year() by strptime().
# If parsing fails, nothing is changed.
#
# @param value [String] the date time to parse
# @param fmt [String] the strptime() format, using only the directives
#        %Y, %m, %d, %H, %M, & %S
# @return [self]
# @raise [ArgumentError] from Time.strptime() if +value+ does not match
def parse!(value,fmt)
  parsed = Time.strptime(value,fmt) { |year| self.class.guess_year(year) }

  directives = {
    day: '%d',
    hour: '%H',
    min: '%M',
    month: '%m',
    sec: '%S',
    year: '%Y',
  }

  directives.each do |part,directive|
    present = fmt.include?(directive)

    instance_variable_set(:"@has_#{part}",present)
    instance_variable_set(:"@#{part}",parsed.public_send(part)) if present
  end

  self
end
|
307
|
+
|
308
|
+
# Overwrites all six parts at once; any omitted part becomes nil.
# The has_* flags are not touched.
#
# @return [self]
def set!(year=nil,month=nil,day=nil,hour=nil,min=nil,sec=nil)
  @year,@month,@day = year,month,day
  @hour,@min,@sec = hour,min,sec

  self
end
|
318
|
+
|
319
|
+
# Sets every has_* flag to the same value.
#
# @param value [true,false] the new value for all of the flags
# @return [self] (only visible when called via send(); a plain
#         assignment evaluates to +value+)
def has=(value)
  @has_day = @has_hour = @has_min = @has_month = @has_sec = @has_year = value

  self
end
|
329
|
+
|
330
|
+
# Builds time() and passes it through Util.jst_time().
#
# @return the result of Util.jst_time() (presumably a Time in JST -- confirm)
def jst_time
  Util.jst_time(time)
end
|
333
|
+
|
334
|
+
# Builds a Time from the current parts, using Time.new()
# (so in the local time zone).
#
# @return [Time]
def time
  parts = [@year,@month,@day,@hour,@min,@sec]

  Time.new(*parts)
end
|
337
|
+
|
338
|
+
# Formats the raw parts as "year-month-day hour:min:sec", without any
# zero padding; a nil part becomes an empty string.
#
# @return [String]
def to_s
  date = [@year,@month,@day].join('-')
  clock = [@hour,@min,@sec].join(':')

  "#{date} #{clock}"
end
|
341
|
+
end
|
342
|
+
end
|
data/lib/nhkore/dict_scraper.rb
CHANGED
data/lib/nhkore/lib.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/article'
|
25
|
+
require 'nhkore/article_scraper'
|
26
|
+
require 'nhkore/cleaner'
|
27
|
+
require 'nhkore/datetime_parser'
|
28
|
+
require 'nhkore/defn'
|
29
|
+
require 'nhkore/dict'
|
30
|
+
require 'nhkore/dict_scraper'
|
31
|
+
require 'nhkore/entry'
|
32
|
+
require 'nhkore/error'
|
33
|
+
require 'nhkore/fileable'
|
34
|
+
require 'nhkore/missingno'
|
35
|
+
require 'nhkore/news'
|
36
|
+
require 'nhkore/polisher'
|
37
|
+
require 'nhkore/scraper'
|
38
|
+
require 'nhkore/search_link'
|
39
|
+
require 'nhkore/search_scraper'
|
40
|
+
require 'nhkore/sifter'
|
41
|
+
require 'nhkore/splitter'
|
42
|
+
require 'nhkore/user_agents'
|
43
|
+
require 'nhkore/util'
|
44
|
+
require 'nhkore/variator'
|
45
|
+
require 'nhkore/version'
|
46
|
+
require 'nhkore/word'
|
47
|
+
|
48
|
+
|
49
|
+
module NHKore
  ###
  # Empty marker module for this file.
  #
  # Require this file to load only what is needed for using this Gem as
  # a library (i.e., none of the CLI-related files are required).
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since 0.3.2
  ###
  module Lib; end
end
|
data/lib/nhkore/news.rb
CHANGED
@@ -49,7 +49,10 @@ module NHKore
|
|
49
49
|
end
|
50
50
|
|
51
51
|
def add_article(article,key: nil,overwrite: false)
|
52
|
-
|
52
|
+
url = article.url
|
53
|
+
url = url.to_s() unless url.nil?()
|
54
|
+
|
55
|
+
key = key.nil?() ? url : key.to_s()
|
53
56
|
|
54
57
|
if !overwrite
|
55
58
|
raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
|
@@ -57,7 +60,7 @@ module NHKore
|
|
57
60
|
end
|
58
61
|
|
59
62
|
@articles[key] = article
|
60
|
-
@sha256s[article.sha256] =
|
63
|
+
@sha256s[article.sha256] = url
|
61
64
|
|
62
65
|
return self
|
63
66
|
end
|
@@ -91,16 +94,20 @@ module NHKore
|
|
91
94
|
end
|
92
95
|
|
93
96
|
# Re-keys an article under a favored (https) URL.
#
# Does nothing if the article's URL is already favored, or if the new
# URL is not favored (which includes nil).
#
# @param article the article to update; must respond to url() & url=()
# @param url the new URL; converted to a String unless nil
# @return the new URL as a String if updated; else nil
def update_article(article,url)
  url = url.to_s unless url.nil?

  # Favor https.
  already_favored = article.url.to_s =~ FAVORED_URL

  return if already_favored || url !~ FAVORED_URL

  # Probably no to_s() here: the old key is removed using the URL exactly
  # as the article stores it.
  @articles.delete(article.url)
  @articles[url] = article
  article.url = url
end
|
102
107
|
|
103
108
|
# Looks up an article by its key.
#
# @param key the key; converted to a String unless nil
# @return the stored article, or nil if there is none for +key+
def article(key)
  lookup = key.nil? ? key : key.to_s

  @articles[lookup]
end
|
106
113
|
|
@@ -119,6 +126,8 @@ module NHKore
|
|
119
126
|
end
|
120
127
|
|
121
128
|
# Checks if an article is stored under a key.
#
# @param key the key; converted to a String unless nil
# @return [true,false] whether +key+ exists
def article?(key)
  lookup = key.nil? ? key : key.to_s

  @articles.key?(lookup)
end
|
124
133
|
|