nhkore 0.3.1 → 0.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +81 -3
- data/README.md +505 -9
- data/Rakefile +48 -8
- data/lib/nhkore.rb +1 -22
- data/lib/nhkore/app.rb +3 -1
- data/lib/nhkore/article.rb +24 -7
- data/lib/nhkore/article_scraper.rb +21 -16
- data/lib/nhkore/cli/news_cmd.rb +3 -2
- data/lib/nhkore/cli/search_cmd.rb +2 -2
- data/lib/nhkore/cli/sift_cmd.rb +9 -112
- data/lib/nhkore/datetime_parser.rb +342 -0
- data/lib/nhkore/dict_scraper.rb +1 -1
- data/lib/nhkore/lib.rb +59 -0
- data/lib/nhkore/news.rb +13 -4
- data/lib/nhkore/scraper.rb +21 -9
- data/lib/nhkore/search_link.rb +37 -19
- data/lib/nhkore/search_scraper.rb +1 -0
- data/lib/nhkore/sifter.rb +106 -51
- data/lib/nhkore/util.rb +12 -21
- data/lib/nhkore/variator.rb +1 -0
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +12 -7
- metadata +21 -5
data/lib/nhkore/scraper.rb
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
+
require 'attr_bool'
|
24
25
|
require 'nokogiri'
|
25
26
|
require 'open-uri'
|
26
27
|
|
@@ -40,8 +41,8 @@ module NHKore
|
|
40
41
|
'dnt' => '1',
|
41
42
|
}
|
42
43
|
|
43
|
-
attr_accessor :eat_cookie
|
44
|
-
attr_accessor :is_file
|
44
|
+
attr_accessor? :eat_cookie
|
45
|
+
attr_accessor? :is_file
|
45
46
|
attr_reader :kargs
|
46
47
|
attr_accessor :max_redirects
|
47
48
|
attr_accessor :max_retries
|
@@ -49,9 +50,6 @@ module NHKore
|
|
49
50
|
attr_accessor :str_or_io
|
50
51
|
attr_accessor :url
|
51
52
|
|
52
|
-
alias_method :eat_cookie?,:eat_cookie
|
53
|
-
alias_method :is_file?,:is_file
|
54
|
-
|
55
53
|
# +max_redirects+ defaults to 3 for safety (infinite-loop attack).
|
56
54
|
#
|
57
55
|
# All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
|
@@ -82,7 +80,7 @@ module NHKore
|
|
82
80
|
@max_retries = max_retries
|
83
81
|
@redirect_rule = redirect_rule
|
84
82
|
|
85
|
-
open(url,str_or_io)
|
83
|
+
open(url,str_or_io,is_file: is_file)
|
86
84
|
end
|
87
85
|
|
88
86
|
def fetch_cookie(url)
|
@@ -119,14 +117,14 @@ module NHKore
|
|
119
117
|
return URI::join(@url,relative_url)
|
120
118
|
end
|
121
119
|
|
122
|
-
def open(url,str_or_io=nil)
|
120
|
+
def open(url,str_or_io=nil,is_file: @is_file)
|
121
|
+
@is_file = is_file
|
123
122
|
@str_or_io = str_or_io
|
124
123
|
@url = url
|
125
124
|
|
126
125
|
if str_or_io.nil?()
|
127
126
|
if @is_file
|
128
|
-
|
129
|
-
@str_or_io = File.open(url,'rt:UTF-8',**@kargs)
|
127
|
+
open_file(url)
|
130
128
|
else
|
131
129
|
fetch_cookie(url) if @eat_cookie
|
132
130
|
open_url(url)
|
@@ -136,6 +134,16 @@ module NHKore
|
|
136
134
|
return self
|
137
135
|
end
|
138
136
|
|
137
|
+
def open_file(file)
|
138
|
+
@is_file = true
|
139
|
+
@url = file
|
140
|
+
|
141
|
+
# NHK's website tends to always use UTF-8.
|
142
|
+
@str_or_io = File.open(file,'rt:UTF-8',**@kargs)
|
143
|
+
|
144
|
+
return self
|
145
|
+
end
|
146
|
+
|
139
147
|
def open_url(url)
|
140
148
|
max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
|
141
149
|
max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
|
@@ -194,6 +202,10 @@ module NHKore
|
|
194
202
|
return @str_or_io
|
195
203
|
end
|
196
204
|
|
205
|
+
def reopen()
|
206
|
+
return open(@url)
|
207
|
+
end
|
208
|
+
|
197
209
|
def rss_doc()
|
198
210
|
require 'rss'
|
199
211
|
|
data/lib/nhkore/search_link.rb
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
+
require 'attr_bool'
|
24
25
|
require 'time'
|
25
26
|
|
26
27
|
require 'nhkore/fileable'
|
@@ -33,14 +34,12 @@ module NHKore
|
|
33
34
|
# @since 0.2.0
|
34
35
|
###
|
35
36
|
class SearchLink
|
36
|
-
|
37
|
-
|
38
|
-
attr_accessor :scraped
|
37
|
+
attr_reader :datetime
|
38
|
+
attr_reader :futsuurl
|
39
|
+
attr_accessor? :scraped
|
39
40
|
attr_accessor :sha256
|
40
41
|
attr_accessor :title
|
41
|
-
|
42
|
-
|
43
|
-
alias_method :scraped?,:scraped
|
42
|
+
attr_reader :url
|
44
43
|
|
45
44
|
def initialize(url,scraped: false)
|
46
45
|
super()
|
@@ -50,29 +49,27 @@ module NHKore
|
|
50
49
|
@scraped = scraped
|
51
50
|
@sha256 = sha256
|
52
51
|
@title = nil
|
53
|
-
|
52
|
+
self.url = url
|
54
53
|
end
|
55
54
|
|
56
55
|
def encode_with(coder)
|
57
56
|
# Order matters.
|
58
57
|
|
59
|
-
coder[:url] = @url
|
58
|
+
coder[:url] = @url.nil?() ? nil : @url.to_s()
|
60
59
|
coder[:scraped] = @scraped
|
61
|
-
coder[:datetime] = @datetime.nil?() ?
|
60
|
+
coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
|
62
61
|
coder[:title] = @title
|
63
|
-
coder[:futsuurl] = @futsuurl
|
62
|
+
coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
|
64
63
|
coder[:sha256] = @sha256
|
65
64
|
end
|
66
65
|
|
67
66
|
def self.load_data(key,hash)
|
68
|
-
datetime = hash[:datetime]
|
69
|
-
|
70
67
|
slink = SearchLink.new(
|
71
68
|
hash[:url],
|
72
|
-
scraped: hash[:scraped]
|
69
|
+
scraped: hash[:scraped],
|
73
70
|
)
|
74
71
|
|
75
|
-
slink.datetime =
|
72
|
+
slink.datetime = hash[:datetime]
|
76
73
|
slink.futsuurl = hash[:futsuurl]
|
77
74
|
slink.sha256 = hash[:sha256]
|
78
75
|
slink.title = hash[:title]
|
@@ -83,13 +80,31 @@ module NHKore
|
|
83
80
|
def update_from_article(article)
|
84
81
|
# Don't update the url, as it may be different (e.g., http vs https).
|
85
82
|
|
86
|
-
|
87
|
-
|
83
|
+
self.datetime = article.datetime if @datetime.nil?()
|
84
|
+
self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
|
88
85
|
@scraped = true # If we have an article, it's been scraped
|
89
86
|
@sha256 = article.sha256 if Util.empty_web_str?(@sha256)
|
90
87
|
@title = article.title if Util.empty_web_str?(@title)
|
91
88
|
end
|
92
89
|
|
90
|
+
def datetime=(value)
|
91
|
+
if value.is_a?(Time)
|
92
|
+
@datetime = value
|
93
|
+
else
|
94
|
+
@datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
def futsuurl=(value)
|
99
|
+
# Don't store URI, store String.
|
100
|
+
@futsuurl = value.nil?() ? nil : value.to_s()
|
101
|
+
end
|
102
|
+
|
103
|
+
def url=(value)
|
104
|
+
# Don't store URI, store String.
|
105
|
+
@url = value.nil?() ? nil : value.to_s()
|
106
|
+
end
|
107
|
+
|
93
108
|
def to_s(mini: false)
|
94
109
|
s = ''.dup()
|
95
110
|
|
@@ -137,9 +152,11 @@ module NHKore
|
|
137
152
|
end
|
138
153
|
|
139
154
|
def add_link(link)
|
140
|
-
|
155
|
+
url = link.url.nil?() ? nil : link.url.to_s()
|
156
|
+
|
157
|
+
return self if @links.key?(url)
|
141
158
|
|
142
|
-
@links[link.url] = link
|
159
|
+
@links[url] = link
|
143
160
|
|
144
161
|
return self
|
145
162
|
end
|
@@ -163,7 +180,7 @@ module NHKore
|
|
163
180
|
|
164
181
|
if !links.nil?()
|
165
182
|
links.each() do |key,hash|
|
166
|
-
key = key.to_s()
|
183
|
+
key = key.to_s() unless key.nil?()
|
167
184
|
slinks.links[key] = SearchLink.load_data(key,hash)
|
168
185
|
end
|
169
186
|
end
|
@@ -173,6 +190,7 @@ module NHKore
|
|
173
190
|
|
174
191
|
def [](url)
|
175
192
|
url = url.url if url.respond_to?(:url)
|
193
|
+
url = url.to_s() unless url.nil?()
|
176
194
|
|
177
195
|
return @links[url]
|
178
196
|
end
|
data/lib/nhkore/search_scraper.rb
CHANGED
@@ -61,6 +61,7 @@ module NHKore
|
|
61
61
|
return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
|
62
62
|
return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
|
63
63
|
return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
|
64
|
+
return true if link =~ /cgi2.*enqform/ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
|
64
65
|
|
65
66
|
return false
|
66
67
|
end
|
data/lib/nhkore/sifter.rb
CHANGED
@@ -60,6 +60,40 @@ module NHKore
|
|
60
60
|
@output = nil
|
61
61
|
end
|
62
62
|
|
63
|
+
def build_header()
|
64
|
+
header = []
|
65
|
+
|
66
|
+
header << 'Frequency' unless @ignores[:freq]
|
67
|
+
header << 'Word' unless @ignores[:word]
|
68
|
+
header << 'Kana' unless @ignores[:kana]
|
69
|
+
header << 'English' unless @ignores[:eng]
|
70
|
+
header << 'Definition' unless @ignores[:defn]
|
71
|
+
|
72
|
+
return header
|
73
|
+
end
|
74
|
+
|
75
|
+
def build_rows(words)
|
76
|
+
rows = []
|
77
|
+
|
78
|
+
words.each() do |word|
|
79
|
+
rows << build_word_row(word)
|
80
|
+
end
|
81
|
+
|
82
|
+
return rows
|
83
|
+
end
|
84
|
+
|
85
|
+
def build_word_row(word)
|
86
|
+
row = []
|
87
|
+
|
88
|
+
row << word.freq unless @ignores[:freq]
|
89
|
+
row << word.word unless @ignores[:word]
|
90
|
+
row << word.kana unless @ignores[:kana]
|
91
|
+
row << word.eng unless @ignores[:eng]
|
92
|
+
row << word.defn unless @ignores[:defn]
|
93
|
+
|
94
|
+
return row
|
95
|
+
end
|
96
|
+
|
63
97
|
def filter?(article)
|
64
98
|
return false if @filters.empty?()
|
65
99
|
|
@@ -93,24 +127,29 @@ module NHKore
|
|
93
127
|
return false
|
94
128
|
end
|
95
129
|
|
96
|
-
def filter_by_datetime(datetime_filter=nil,
|
130
|
+
def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
|
97
131
|
if !datetime_filter.nil?()
|
98
|
-
|
99
|
-
|
100
|
-
|
132
|
+
if datetime_filter.respond_to?(:'[]')
|
133
|
+
# If out-of-bounds, just nil.
|
134
|
+
from = datetime_filter[0] if from.nil?()
|
135
|
+
to = datetime_filter[1] if to.nil?()
|
136
|
+
else
|
137
|
+
from = datetime_filter if from.nil?()
|
138
|
+
to = datetime_filter if to.nil?()
|
139
|
+
end
|
101
140
|
end
|
102
141
|
|
103
|
-
|
104
|
-
|
142
|
+
from = to if from.nil?()
|
143
|
+
to = from if to.nil?()
|
105
144
|
|
106
|
-
|
107
|
-
|
145
|
+
from = Util.jst_time(from) unless from.nil?()
|
146
|
+
to = Util.jst_time(to) unless to.nil?()
|
108
147
|
|
109
|
-
datetime_filter = [
|
148
|
+
datetime_filter = [from,to]
|
110
149
|
|
111
150
|
return self if datetime_filter.flatten().compact().empty?()
|
112
151
|
|
113
|
-
@filters[:datetime] = {from:
|
152
|
+
@filters[:datetime] = {from: from,to: to}
|
114
153
|
|
115
154
|
return self
|
116
155
|
end
|
@@ -146,26 +185,10 @@ module NHKore
|
|
146
185
|
words = sift()
|
147
186
|
|
148
187
|
@output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
|
149
|
-
|
150
|
-
|
151
|
-
row << 'Frequency' unless @ignores[:freq]
|
152
|
-
row << 'Word' unless @ignores[:word]
|
153
|
-
row << 'Kana' unless @ignores[:kana]
|
154
|
-
row << 'English' unless @ignores[:eng]
|
155
|
-
row << 'Definition' unless @ignores[:defn]
|
156
|
-
|
157
|
-
csv << row
|
188
|
+
csv << build_header()
|
158
189
|
|
159
190
|
words.each() do |word|
|
160
|
-
|
161
|
-
|
162
|
-
row << word.freq unless @ignores[:freq]
|
163
|
-
row << word.word unless @ignores[:word]
|
164
|
-
row << word.kana unless @ignores[:kana]
|
165
|
-
row << word.eng unless @ignores[:eng]
|
166
|
-
row << word.defn unless @ignores[:defn]
|
167
|
-
|
168
|
-
csv << row
|
191
|
+
csv << build_word_row(word)
|
169
192
|
end
|
170
193
|
end
|
171
194
|
|
@@ -227,7 +250,7 @@ module NHKore
|
|
227
250
|
<h2>#{@caption}</h2>
|
228
251
|
<table>
|
229
252
|
EOH
|
230
|
-
#"
|
253
|
+
#"
|
231
254
|
|
232
255
|
# If have too few or too many '<col>', invalid HTML.
|
233
256
|
@output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
|
@@ -237,20 +260,20 @@ module NHKore
|
|
237
260
|
@output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
|
238
261
|
|
239
262
|
@output << '<tr>'
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
263
|
+
|
264
|
+
build_header().each() do |h|
|
265
|
+
@output << "<th>#{h}</th>"
|
266
|
+
end
|
267
|
+
|
245
268
|
@output << "</tr>\n"
|
246
269
|
|
247
270
|
words.each() do |word|
|
248
271
|
@output << '<tr>'
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
272
|
+
|
273
|
+
build_word_row(word).each() do |w|
|
274
|
+
@output << "<td>#{Util.escape_html(w.to_s())}</td>"
|
275
|
+
end
|
276
|
+
|
254
277
|
@output << "</tr>\n"
|
255
278
|
end
|
256
279
|
|
@@ -259,31 +282,63 @@ module NHKore
|
|
259
282
|
</body>
|
260
283
|
</html>
|
261
284
|
EOH
|
262
|
-
#/
|
285
|
+
#/
|
263
286
|
|
264
287
|
return @output
|
265
288
|
end
|
266
289
|
|
267
|
-
def put_yaml!()
|
290
|
+
def put_json!()
|
291
|
+
require 'json'
|
292
|
+
|
268
293
|
words = sift()
|
269
294
|
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
295
|
+
@output = ''.dup()
|
296
|
+
|
297
|
+
@output << <<~EOJ
|
298
|
+
{
|
299
|
+
"caption": #{JSON.generate(@caption)},
|
300
|
+
"header": #{JSON.generate(build_header())},
|
301
|
+
"words": [
|
302
|
+
EOJ
|
303
|
+
|
304
|
+
if !words.empty?()
|
305
|
+
0.upto(words.length - 2) do |i|
|
306
|
+
@output << " #{JSON.generate(build_word_row(words[i]))},\n"
|
277
307
|
end
|
308
|
+
|
309
|
+
@output << " #{JSON.generate(build_word_row(words[-1]))}\n"
|
278
310
|
end
|
279
311
|
|
312
|
+
@output << "]\n}\n"
|
313
|
+
|
314
|
+
return @output
|
315
|
+
end
|
316
|
+
|
317
|
+
def put_yaml!()
|
318
|
+
require 'psychgus'
|
319
|
+
|
320
|
+
words = sift()
|
321
|
+
|
280
322
|
yaml = {
|
281
323
|
caption: @caption,
|
282
|
-
|
324
|
+
header: build_header(),
|
325
|
+
words: build_rows(words),
|
283
326
|
}
|
284
327
|
|
328
|
+
header_styler = Class.new() do
|
329
|
+
include Psychgus::Styler
|
330
|
+
|
331
|
+
def style_sequence(sniffer,node)
|
332
|
+
parent = sniffer.parent
|
333
|
+
|
334
|
+
if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
|
335
|
+
node.style = Psychgus::SEQUENCE_FLOW
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
|
285
340
|
# Put each Word on one line (flow/inline style).
|
286
|
-
@output = Util.dump_yaml(yaml,flow_level: 4)
|
341
|
+
@output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
|
287
342
|
|
288
343
|
return @output
|
289
344
|
end
|
@@ -301,7 +356,7 @@ module NHKore
|
|
301
356
|
|
302
357
|
words = master_article.words.values()
|
303
358
|
|
304
|
-
words
|
359
|
+
words.sort!() do |word1,word2|
|
305
360
|
# Order by freq DESC (most frequent words to top).
|
306
361
|
i = (word2.freq <=> word1.freq)
|
307
362
|
|
data/lib/nhkore/util.rb
CHANGED
@@ -22,8 +22,7 @@
|
|
22
22
|
|
23
23
|
|
24
24
|
require 'cgi'
|
25
|
-
require 'psychgus'
|
26
|
-
require 'public_suffix'
|
25
|
+
require 'set'
|
27
26
|
require 'time'
|
28
27
|
require 'uri'
|
29
28
|
|
@@ -68,21 +67,28 @@ module NHKore
|
|
68
67
|
end
|
69
68
|
|
70
69
|
def self.domain(host,clean: true)
|
70
|
+
require 'public_suffix'
|
71
|
+
|
71
72
|
domain = PublicSuffix.domain(host)
|
72
73
|
domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
|
73
74
|
|
74
75
|
return domain
|
75
76
|
end
|
76
77
|
|
77
|
-
def self.dump_yaml(obj,flow_level: 8)
|
78
|
+
def self.dump_yaml(obj,flow_level: 8,stylers: nil)
|
79
|
+
require 'psychgus'
|
80
|
+
|
81
|
+
stylers = Array(stylers)
|
82
|
+
|
78
83
|
return Psychgus.dump(obj,
|
79
84
|
deref_aliases: true, # Dereference aliases for load_yaml()
|
85
|
+
header: true, # %YAML [version]
|
80
86
|
line_width: 10000, # Try not to wrap; ichiman!
|
81
87
|
stylers: [
|
82
88
|
Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
|
83
89
|
Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
|
84
90
|
Psychgus::NoTagStyler.new(), # Remove class names (tags)
|
85
|
-
],
|
91
|
+
].concat(stylers),
|
86
92
|
)
|
87
93
|
end
|
88
94
|
|
@@ -102,23 +108,6 @@ module NHKore
|
|
102
108
|
return !str.match?(/[\/\\]/)
|
103
109
|
end
|
104
110
|
|
105
|
-
def self.guess_year(year)
|
106
|
-
if year < 100
|
107
|
-
# 2021 -> 2000.
|
108
|
-
millennium = JST_YEAR / 100 * 100
|
109
|
-
|
110
|
-
# If year <= (2021 -> 21), assume this century.
|
111
|
-
if year <= (JST_YEAR % 100)
|
112
|
-
year = millennium + year
|
113
|
-
else
|
114
|
-
# Assume previous century (2000 -> 1900).
|
115
|
-
year = (millennium - 100) + year
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
return year
|
120
|
-
end
|
121
|
-
|
122
111
|
def self.hiragana?(str)
|
123
112
|
return HIRAGANA_REGEX =~ str
|
124
113
|
end
|
@@ -142,6 +131,8 @@ module NHKore
|
|
142
131
|
end
|
143
132
|
|
144
133
|
def self.load_yaml(data,file: nil,**kargs)
|
134
|
+
require 'psychgus'
|
135
|
+
|
145
136
|
return Psych.safe_load(data,
|
146
137
|
aliases: false,
|
147
138
|
filename: file,
|