nhkore 0.3.1 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +81 -3
- data/README.md +505 -9
- data/Rakefile +48 -8
- data/lib/nhkore.rb +1 -22
- data/lib/nhkore/app.rb +3 -1
- data/lib/nhkore/article.rb +24 -7
- data/lib/nhkore/article_scraper.rb +21 -16
- data/lib/nhkore/cli/news_cmd.rb +3 -2
- data/lib/nhkore/cli/search_cmd.rb +2 -2
- data/lib/nhkore/cli/sift_cmd.rb +9 -112
- data/lib/nhkore/datetime_parser.rb +342 -0
- data/lib/nhkore/dict_scraper.rb +1 -1
- data/lib/nhkore/lib.rb +59 -0
- data/lib/nhkore/news.rb +13 -4
- data/lib/nhkore/scraper.rb +21 -9
- data/lib/nhkore/search_link.rb +37 -19
- data/lib/nhkore/search_scraper.rb +1 -0
- data/lib/nhkore/sifter.rb +106 -51
- data/lib/nhkore/util.rb +12 -21
- data/lib/nhkore/variator.rb +1 -0
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +12 -7
- metadata +21 -5
data/lib/nhkore/scraper.rb
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
+
require 'attr_bool'
|
24
25
|
require 'nokogiri'
|
25
26
|
require 'open-uri'
|
26
27
|
|
@@ -40,8 +41,8 @@ module NHKore
|
|
40
41
|
'dnt' => '1',
|
41
42
|
}
|
42
43
|
|
43
|
-
attr_accessor :eat_cookie
|
44
|
-
attr_accessor :is_file
|
44
|
+
attr_accessor? :eat_cookie
|
45
|
+
attr_accessor? :is_file
|
45
46
|
attr_reader :kargs
|
46
47
|
attr_accessor :max_redirects
|
47
48
|
attr_accessor :max_retries
|
@@ -49,9 +50,6 @@ module NHKore
|
|
49
50
|
attr_accessor :str_or_io
|
50
51
|
attr_accessor :url
|
51
52
|
|
52
|
-
alias_method :eat_cookie?,:eat_cookie
|
53
|
-
alias_method :is_file?,:is_file
|
54
|
-
|
55
53
|
# +max_redirects+ defaults to 3 for safety (infinite-loop attack).
|
56
54
|
#
|
57
55
|
# All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
|
@@ -82,7 +80,7 @@ module NHKore
|
|
82
80
|
@max_retries = max_retries
|
83
81
|
@redirect_rule = redirect_rule
|
84
82
|
|
85
|
-
open(url,str_or_io)
|
83
|
+
open(url,str_or_io,is_file: is_file)
|
86
84
|
end
|
87
85
|
|
88
86
|
def fetch_cookie(url)
|
@@ -119,14 +117,14 @@ module NHKore
|
|
119
117
|
return URI::join(@url,relative_url)
|
120
118
|
end
|
121
119
|
|
122
|
-
def open(url,str_or_io=nil)
|
120
|
+
def open(url,str_or_io=nil,is_file: @is_file)
|
121
|
+
@is_file = is_file
|
123
122
|
@str_or_io = str_or_io
|
124
123
|
@url = url
|
125
124
|
|
126
125
|
if str_or_io.nil?()
|
127
126
|
if @is_file
|
128
|
-
|
129
|
-
@str_or_io = File.open(url,'rt:UTF-8',**@kargs)
|
127
|
+
open_file(url)
|
130
128
|
else
|
131
129
|
fetch_cookie(url) if @eat_cookie
|
132
130
|
open_url(url)
|
@@ -136,6 +134,16 @@ module NHKore
|
|
136
134
|
return self
|
137
135
|
end
|
138
136
|
|
137
|
+
def open_file(file)
|
138
|
+
@is_file = true
|
139
|
+
@url = file
|
140
|
+
|
141
|
+
# NHK's website tends to always use UTF-8.
|
142
|
+
@str_or_io = File.open(file,'rt:UTF-8',**@kargs)
|
143
|
+
|
144
|
+
return self
|
145
|
+
end
|
146
|
+
|
139
147
|
def open_url(url)
|
140
148
|
max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
|
141
149
|
max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
|
@@ -194,6 +202,10 @@ module NHKore
|
|
194
202
|
return @str_or_io
|
195
203
|
end
|
196
204
|
|
205
|
+
def reopen()
|
206
|
+
return open(@url)
|
207
|
+
end
|
208
|
+
|
197
209
|
def rss_doc()
|
198
210
|
require 'rss'
|
199
211
|
|
data/lib/nhkore/search_link.rb
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
+
require 'attr_bool'
|
24
25
|
require 'time'
|
25
26
|
|
26
27
|
require 'nhkore/fileable'
|
@@ -33,14 +34,12 @@ module NHKore
|
|
33
34
|
# @since 0.2.0
|
34
35
|
###
|
35
36
|
class SearchLink
|
36
|
-
|
37
|
-
|
38
|
-
attr_accessor :scraped
|
37
|
+
attr_reader :datetime
|
38
|
+
attr_reader :futsuurl
|
39
|
+
attr_accessor? :scraped
|
39
40
|
attr_accessor :sha256
|
40
41
|
attr_accessor :title
|
41
|
-
|
42
|
-
|
43
|
-
alias_method :scraped?,:scraped
|
42
|
+
attr_reader :url
|
44
43
|
|
45
44
|
def initialize(url,scraped: false)
|
46
45
|
super()
|
@@ -50,29 +49,27 @@ module NHKore
|
|
50
49
|
@scraped = scraped
|
51
50
|
@sha256 = sha256
|
52
51
|
@title = nil
|
53
|
-
|
52
|
+
self.url = url
|
54
53
|
end
|
55
54
|
|
56
55
|
def encode_with(coder)
|
57
56
|
# Order matters.
|
58
57
|
|
59
|
-
coder[:url] = @url
|
58
|
+
coder[:url] = @url.nil?() ? nil : @url.to_s()
|
60
59
|
coder[:scraped] = @scraped
|
61
|
-
coder[:datetime] = @datetime.nil?() ?
|
60
|
+
coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
|
62
61
|
coder[:title] = @title
|
63
|
-
coder[:futsuurl] = @futsuurl
|
62
|
+
coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
|
64
63
|
coder[:sha256] = @sha256
|
65
64
|
end
|
66
65
|
|
67
66
|
def self.load_data(key,hash)
|
68
|
-
datetime = hash[:datetime]
|
69
|
-
|
70
67
|
slink = SearchLink.new(
|
71
68
|
hash[:url],
|
72
|
-
scraped: hash[:scraped]
|
69
|
+
scraped: hash[:scraped],
|
73
70
|
)
|
74
71
|
|
75
|
-
slink.datetime =
|
72
|
+
slink.datetime = hash[:datetime]
|
76
73
|
slink.futsuurl = hash[:futsuurl]
|
77
74
|
slink.sha256 = hash[:sha256]
|
78
75
|
slink.title = hash[:title]
|
@@ -83,13 +80,31 @@ module NHKore
|
|
83
80
|
def update_from_article(article)
|
84
81
|
# Don't update the url, as it may be different (e.g., http vs https).
|
85
82
|
|
86
|
-
|
87
|
-
|
83
|
+
self.datetime = article.datetime if @datetime.nil?()
|
84
|
+
self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
|
88
85
|
@scraped = true # If we have an article, it's been scraped
|
89
86
|
@sha256 = article.sha256 if Util.empty_web_str?(@sha256)
|
90
87
|
@title = article.title if Util.empty_web_str?(@title)
|
91
88
|
end
|
92
89
|
|
90
|
+
def datetime=(value)
|
91
|
+
if value.is_a?(Time)
|
92
|
+
@datetime = value
|
93
|
+
else
|
94
|
+
@datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
def futsuurl=(value)
|
99
|
+
# Don't store URI, store String.
|
100
|
+
@futsuurl = value.nil?() ? nil : value.to_s()
|
101
|
+
end
|
102
|
+
|
103
|
+
def url=(value)
|
104
|
+
# Don't store URI, store String.
|
105
|
+
@url = value.nil?() ? nil : value.to_s()
|
106
|
+
end
|
107
|
+
|
93
108
|
def to_s(mini: false)
|
94
109
|
s = ''.dup()
|
95
110
|
|
@@ -137,9 +152,11 @@ module NHKore
|
|
137
152
|
end
|
138
153
|
|
139
154
|
def add_link(link)
|
140
|
-
|
155
|
+
url = link.url.nil?() ? nil : link.url.to_s()
|
156
|
+
|
157
|
+
return self if @links.key?(url)
|
141
158
|
|
142
|
-
@links[
|
159
|
+
@links[url] = link
|
143
160
|
|
144
161
|
return self
|
145
162
|
end
|
@@ -163,7 +180,7 @@ module NHKore
|
|
163
180
|
|
164
181
|
if !links.nil?()
|
165
182
|
links.each() do |key,hash|
|
166
|
-
key = key.to_s()
|
183
|
+
key = key.to_s() unless key.nil?()
|
167
184
|
slinks.links[key] = SearchLink.load_data(key,hash)
|
168
185
|
end
|
169
186
|
end
|
@@ -173,6 +190,7 @@ module NHKore
|
|
173
190
|
|
174
191
|
def [](url)
|
175
192
|
url = url.url if url.respond_to?(:url)
|
193
|
+
url = url.to_s() unless url.nil?()
|
176
194
|
|
177
195
|
return @links[url]
|
178
196
|
end
|
@@ -61,6 +61,7 @@ module NHKore
|
|
61
61
|
return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
|
62
62
|
return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
|
63
63
|
return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
|
64
|
+
return true if link =~ /cgi2.*enqform/ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
|
64
65
|
|
65
66
|
return false
|
66
67
|
end
|
data/lib/nhkore/sifter.rb
CHANGED
@@ -60,6 +60,40 @@ module NHKore
|
|
60
60
|
@output = nil
|
61
61
|
end
|
62
62
|
|
63
|
+
def build_header()
|
64
|
+
header = []
|
65
|
+
|
66
|
+
header << 'Frequency' unless @ignores[:freq]
|
67
|
+
header << 'Word' unless @ignores[:word]
|
68
|
+
header << 'Kana' unless @ignores[:kana]
|
69
|
+
header << 'English' unless @ignores[:eng]
|
70
|
+
header << 'Definition' unless @ignores[:defn]
|
71
|
+
|
72
|
+
return header
|
73
|
+
end
|
74
|
+
|
75
|
+
def build_rows(words)
|
76
|
+
rows = []
|
77
|
+
|
78
|
+
words.each() do |word|
|
79
|
+
rows << build_word_row(word)
|
80
|
+
end
|
81
|
+
|
82
|
+
return rows
|
83
|
+
end
|
84
|
+
|
85
|
+
def build_word_row(word)
|
86
|
+
row = []
|
87
|
+
|
88
|
+
row << word.freq unless @ignores[:freq]
|
89
|
+
row << word.word unless @ignores[:word]
|
90
|
+
row << word.kana unless @ignores[:kana]
|
91
|
+
row << word.eng unless @ignores[:eng]
|
92
|
+
row << word.defn unless @ignores[:defn]
|
93
|
+
|
94
|
+
return row
|
95
|
+
end
|
96
|
+
|
63
97
|
def filter?(article)
|
64
98
|
return false if @filters.empty?()
|
65
99
|
|
@@ -93,24 +127,29 @@ module NHKore
|
|
93
127
|
return false
|
94
128
|
end
|
95
129
|
|
96
|
-
def filter_by_datetime(datetime_filter=nil,
|
130
|
+
def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
|
97
131
|
if !datetime_filter.nil?()
|
98
|
-
|
99
|
-
|
100
|
-
|
132
|
+
if datetime_filter.respond_to?(:'[]')
|
133
|
+
# If out-of-bounds, just nil.
|
134
|
+
from = datetime_filter[0] if from.nil?()
|
135
|
+
to = datetime_filter[1] if to.nil?()
|
136
|
+
else
|
137
|
+
from = datetime_filter if from.nil?()
|
138
|
+
to = datetime_filter if to.nil?()
|
139
|
+
end
|
101
140
|
end
|
102
141
|
|
103
|
-
|
104
|
-
|
142
|
+
from = to if from.nil?()
|
143
|
+
to = from if to.nil?()
|
105
144
|
|
106
|
-
|
107
|
-
|
145
|
+
from = Util.jst_time(from) unless from.nil?()
|
146
|
+
to = Util.jst_time(to) unless to.nil?()
|
108
147
|
|
109
|
-
datetime_filter = [
|
148
|
+
datetime_filter = [from,to]
|
110
149
|
|
111
150
|
return self if datetime_filter.flatten().compact().empty?()
|
112
151
|
|
113
|
-
@filters[:datetime] = {from:
|
152
|
+
@filters[:datetime] = {from: from,to: to}
|
114
153
|
|
115
154
|
return self
|
116
155
|
end
|
@@ -146,26 +185,10 @@ module NHKore
|
|
146
185
|
words = sift()
|
147
186
|
|
148
187
|
@output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
|
149
|
-
|
150
|
-
|
151
|
-
row << 'Frequency' unless @ignores[:freq]
|
152
|
-
row << 'Word' unless @ignores[:word]
|
153
|
-
row << 'Kana' unless @ignores[:kana]
|
154
|
-
row << 'English' unless @ignores[:eng]
|
155
|
-
row << 'Definition' unless @ignores[:defn]
|
156
|
-
|
157
|
-
csv << row
|
188
|
+
csv << build_header()
|
158
189
|
|
159
190
|
words.each() do |word|
|
160
|
-
|
161
|
-
|
162
|
-
row << word.freq unless @ignores[:freq]
|
163
|
-
row << word.word unless @ignores[:word]
|
164
|
-
row << word.kana unless @ignores[:kana]
|
165
|
-
row << word.eng unless @ignores[:eng]
|
166
|
-
row << word.defn unless @ignores[:defn]
|
167
|
-
|
168
|
-
csv << row
|
191
|
+
csv << build_word_row(word)
|
169
192
|
end
|
170
193
|
end
|
171
194
|
|
@@ -227,7 +250,7 @@ module NHKore
|
|
227
250
|
<h2>#{@caption}</h2>
|
228
251
|
<table>
|
229
252
|
EOH
|
230
|
-
#"
|
253
|
+
#"
|
231
254
|
|
232
255
|
# If have too few or too many '<col>', invalid HTML.
|
233
256
|
@output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
|
@@ -237,20 +260,20 @@ module NHKore
|
|
237
260
|
@output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
|
238
261
|
|
239
262
|
@output << '<tr>'
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
263
|
+
|
264
|
+
build_header().each() do |h|
|
265
|
+
@output << "<th>#{h}</th>"
|
266
|
+
end
|
267
|
+
|
245
268
|
@output << "</tr>\n"
|
246
269
|
|
247
270
|
words.each() do |word|
|
248
271
|
@output << '<tr>'
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
272
|
+
|
273
|
+
build_word_row(word).each() do |w|
|
274
|
+
@output << "<td>#{Util.escape_html(w.to_s())}</td>"
|
275
|
+
end
|
276
|
+
|
254
277
|
@output << "</tr>\n"
|
255
278
|
end
|
256
279
|
|
@@ -259,31 +282,63 @@ module NHKore
|
|
259
282
|
</body>
|
260
283
|
</html>
|
261
284
|
EOH
|
262
|
-
#/
|
285
|
+
#/
|
263
286
|
|
264
287
|
return @output
|
265
288
|
end
|
266
289
|
|
267
|
-
def
|
290
|
+
def put_json!()
|
291
|
+
require 'json'
|
292
|
+
|
268
293
|
words = sift()
|
269
294
|
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
295
|
+
@output = ''.dup()
|
296
|
+
|
297
|
+
@output << <<~EOJ
|
298
|
+
{
|
299
|
+
"caption": #{JSON.generate(@caption)},
|
300
|
+
"header": #{JSON.generate(build_header())},
|
301
|
+
"words": [
|
302
|
+
EOJ
|
303
|
+
|
304
|
+
if !words.empty?()
|
305
|
+
0.upto(words.length - 2) do |i|
|
306
|
+
@output << " #{JSON.generate(build_word_row(words[i]))},\n"
|
277
307
|
end
|
308
|
+
|
309
|
+
@output << " #{JSON.generate(build_word_row(words[-1]))}\n"
|
278
310
|
end
|
279
311
|
|
312
|
+
@output << "]\n}\n"
|
313
|
+
|
314
|
+
return @output
|
315
|
+
end
|
316
|
+
|
317
|
+
def put_yaml!()
|
318
|
+
require 'psychgus'
|
319
|
+
|
320
|
+
words = sift()
|
321
|
+
|
280
322
|
yaml = {
|
281
323
|
caption: @caption,
|
282
|
-
|
324
|
+
header: build_header(),
|
325
|
+
words: build_rows(words),
|
283
326
|
}
|
284
327
|
|
328
|
+
header_styler = Class.new() do
|
329
|
+
include Psychgus::Styler
|
330
|
+
|
331
|
+
def style_sequence(sniffer,node)
|
332
|
+
parent = sniffer.parent
|
333
|
+
|
334
|
+
if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
|
335
|
+
node.style = Psychgus::SEQUENCE_FLOW
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
|
285
340
|
# Put each Word on one line (flow/inline style).
|
286
|
-
@output = Util.dump_yaml(yaml,flow_level: 4)
|
341
|
+
@output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
|
287
342
|
|
288
343
|
return @output
|
289
344
|
end
|
@@ -301,7 +356,7 @@ module NHKore
|
|
301
356
|
|
302
357
|
words = master_article.words.values()
|
303
358
|
|
304
|
-
words
|
359
|
+
words.sort!() do |word1,word2|
|
305
360
|
# Order by freq DESC (most frequent words to top).
|
306
361
|
i = (word2.freq <=> word1.freq)
|
307
362
|
|
data/lib/nhkore/util.rb
CHANGED
@@ -22,8 +22,7 @@
|
|
22
22
|
|
23
23
|
|
24
24
|
require 'cgi'
|
25
|
-
require '
|
26
|
-
require 'public_suffix'
|
25
|
+
require 'set'
|
27
26
|
require 'time'
|
28
27
|
require 'uri'
|
29
28
|
|
@@ -68,21 +67,28 @@ module NHKore
|
|
68
67
|
end
|
69
68
|
|
70
69
|
def self.domain(host,clean: true)
|
70
|
+
require 'public_suffix'
|
71
|
+
|
71
72
|
domain = PublicSuffix.domain(host)
|
72
73
|
domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
|
73
74
|
|
74
75
|
return domain
|
75
76
|
end
|
76
77
|
|
77
|
-
def self.dump_yaml(obj,flow_level: 8)
|
78
|
+
def self.dump_yaml(obj,flow_level: 8,stylers: nil)
|
79
|
+
require 'psychgus'
|
80
|
+
|
81
|
+
stylers = Array(stylers)
|
82
|
+
|
78
83
|
return Psychgus.dump(obj,
|
79
84
|
deref_aliases: true, # Dereference aliases for load_yaml()
|
85
|
+
header: true, # %YAML [version]
|
80
86
|
line_width: 10000, # Try not to wrap; ichiman!
|
81
87
|
stylers: [
|
82
88
|
Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
|
83
89
|
Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
|
84
90
|
Psychgus::NoTagStyler.new(), # Remove class names (tags)
|
85
|
-
],
|
91
|
+
].concat(stylers),
|
86
92
|
)
|
87
93
|
end
|
88
94
|
|
@@ -102,23 +108,6 @@ module NHKore
|
|
102
108
|
return !str.match?(/[\/\\]/)
|
103
109
|
end
|
104
110
|
|
105
|
-
def self.guess_year(year)
|
106
|
-
if year < 100
|
107
|
-
# 2021 -> 2000.
|
108
|
-
millennium = JST_YEAR / 100 * 100
|
109
|
-
|
110
|
-
# If year <= (2021 -> 21), assume this century.
|
111
|
-
if year <= (JST_YEAR % 100)
|
112
|
-
year = millennium + year
|
113
|
-
else
|
114
|
-
# Assume previous century (2000 -> 1900).
|
115
|
-
year = (millennium - 100) + year
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
return year
|
120
|
-
end
|
121
|
-
|
122
111
|
def self.hiragana?(str)
|
123
112
|
return HIRAGANA_REGEX =~ str
|
124
113
|
end
|
@@ -142,6 +131,8 @@ module NHKore
|
|
142
131
|
end
|
143
132
|
|
144
133
|
def self.load_yaml(data,file: nil,**kargs)
|
134
|
+
require 'psychgus'
|
135
|
+
|
145
136
|
return Psych.safe_load(data,
|
146
137
|
aliases: false,
|
147
138
|
filename: file,
|