nhkore 0.3.1 → 0.3.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
 
24
+ require 'attr_bool'
24
25
  require 'nokogiri'
25
26
  require 'open-uri'
26
27
 
@@ -40,8 +41,8 @@ module NHKore
40
41
  'dnt' => '1',
41
42
  }
42
43
 
43
- attr_accessor :eat_cookie
44
- attr_accessor :is_file
44
+ attr_accessor? :eat_cookie
45
+ attr_accessor? :is_file
45
46
  attr_reader :kargs
46
47
  attr_accessor :max_redirects
47
48
  attr_accessor :max_retries
@@ -49,9 +50,6 @@ module NHKore
49
50
  attr_accessor :str_or_io
50
51
  attr_accessor :url
51
52
 
52
- alias_method :eat_cookie?,:eat_cookie
53
- alias_method :is_file?,:is_file
54
-
55
53
  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
56
54
  #
57
55
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
@@ -82,7 +80,7 @@ module NHKore
82
80
  @max_retries = max_retries
83
81
  @redirect_rule = redirect_rule
84
82
 
85
- open(url,str_or_io)
83
+ open(url,str_or_io,is_file: is_file)
86
84
  end
87
85
 
88
86
  def fetch_cookie(url)
@@ -119,14 +117,14 @@ module NHKore
119
117
  return URI::join(@url,relative_url)
120
118
  end
121
119
 
122
- def open(url,str_or_io=nil)
120
+ def open(url,str_or_io=nil,is_file: @is_file)
121
+ @is_file = is_file
123
122
  @str_or_io = str_or_io
124
123
  @url = url
125
124
 
126
125
  if str_or_io.nil?()
127
126
  if @is_file
128
- # NHK's website tends to always use UTF-8.
129
- @str_or_io = File.open(url,'rt:UTF-8',**@kargs)
127
+ open_file(url)
130
128
  else
131
129
  fetch_cookie(url) if @eat_cookie
132
130
  open_url(url)
@@ -136,6 +134,16 @@ module NHKore
136
134
  return self
137
135
  end
138
136
 
137
+ def open_file(file)
138
+ @is_file = true
139
+ @url = file
140
+
141
+ # NHK's website tends to always use UTF-8.
142
+ @str_or_io = File.open(file,'rt:UTF-8',**@kargs)
143
+
144
+ return self
145
+ end
146
+
139
147
  def open_url(url)
140
148
  max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
141
149
  max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
@@ -194,6 +202,10 @@ module NHKore
194
202
  return @str_or_io
195
203
  end
196
204
 
205
+ def reopen()
206
+ return open(@url)
207
+ end
208
+
197
209
  def rss_doc()
198
210
  require 'rss'
199
211
 
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
 
24
+ require 'attr_bool'
24
25
  require 'time'
25
26
 
26
27
  require 'nhkore/fileable'
@@ -33,14 +34,12 @@ module NHKore
33
34
  # @since 0.2.0
34
35
  ###
35
36
  class SearchLink
36
- attr_accessor :datetime
37
- attr_accessor :futsuurl
38
- attr_accessor :scraped
37
+ attr_reader :datetime
38
+ attr_reader :futsuurl
39
+ attr_accessor? :scraped
39
40
  attr_accessor :sha256
40
41
  attr_accessor :title
41
- attr_accessor :url
42
-
43
- alias_method :scraped?,:scraped
42
+ attr_reader :url
44
43
 
45
44
  def initialize(url,scraped: false)
46
45
  super()
@@ -50,29 +49,27 @@ module NHKore
50
49
  @scraped = scraped
51
50
  @sha256 = sha256
52
51
  @title = nil
53
- @url = url
52
+ self.url = url
54
53
  end
55
54
 
56
55
  def encode_with(coder)
57
56
  # Order matters.
58
57
 
59
- coder[:url] = @url
58
+ coder[:url] = @url.nil?() ? nil : @url.to_s()
60
59
  coder[:scraped] = @scraped
61
- coder[:datetime] = @datetime.nil?() ? @datetime : @datetime.iso8601()
60
+ coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
62
61
  coder[:title] = @title
63
- coder[:futsuurl] = @futsuurl
62
+ coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
64
63
  coder[:sha256] = @sha256
65
64
  end
66
65
 
67
66
  def self.load_data(key,hash)
68
- datetime = hash[:datetime]
69
-
70
67
  slink = SearchLink.new(
71
68
  hash[:url],
72
- scraped: hash[:scraped]
69
+ scraped: hash[:scraped],
73
70
  )
74
71
 
75
- slink.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
72
+ slink.datetime = hash[:datetime]
76
73
  slink.futsuurl = hash[:futsuurl]
77
74
  slink.sha256 = hash[:sha256]
78
75
  slink.title = hash[:title]
@@ -83,13 +80,31 @@ module NHKore
83
80
  def update_from_article(article)
84
81
  # Don't update the url, as it may be different (e.g., http vs https).
85
82
 
86
- @datetime = article.datetime if @datetime.nil?()
87
- @futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
83
+ self.datetime = article.datetime if @datetime.nil?()
84
+ self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
88
85
  @scraped = true # If we have an article, it's been scraped
89
86
  @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
90
87
  @title = article.title if Util.empty_web_str?(@title)
91
88
  end
92
89
 
90
+ def datetime=(value)
91
+ if value.is_a?(Time)
92
+ @datetime = value
93
+ else
94
+ @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
95
+ end
96
+ end
97
+
98
+ def futsuurl=(value)
99
+ # Don't store URI, store String.
100
+ @futsuurl = value.nil?() ? nil : value.to_s()
101
+ end
102
+
103
+ def url=(value)
104
+ # Don't store URI, store String.
105
+ @url = value.nil?() ? nil : value.to_s()
106
+ end
107
+
93
108
  def to_s(mini: false)
94
109
  s = ''.dup()
95
110
 
@@ -137,9 +152,11 @@ module NHKore
137
152
  end
138
153
 
139
154
  def add_link(link)
140
- return self if @links.key?(link.url)
155
+ url = link.url.nil?() ? nil : link.url.to_s()
156
+
157
+ return self if @links.key?(url)
141
158
 
142
- @links[link.url] = link
159
+ @links[url] = link
143
160
 
144
161
  return self
145
162
  end
@@ -163,7 +180,7 @@ module NHKore
163
180
 
164
181
  if !links.nil?()
165
182
  links.each() do |key,hash|
166
- key = key.to_s() # Change from a symbol
183
+ key = key.to_s() unless key.nil?()
167
184
  slinks.links[key] = SearchLink.load_data(key,hash)
168
185
  end
169
186
  end
@@ -173,6 +190,7 @@ module NHKore
173
190
 
174
191
  def [](url)
175
192
  url = url.url if url.respond_to?(:url)
193
+ url = url.to_s() unless url.nil?()
176
194
 
177
195
  return @links[url]
178
196
  end
@@ -61,6 +61,7 @@ module NHKore
61
61
  return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
62
62
  return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
63
63
  return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
64
+ return true if link =~ /cgi2.*enqform/ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
64
65
 
65
66
  return false
66
67
  end
@@ -60,6 +60,40 @@ module NHKore
60
60
  @output = nil
61
61
  end
62
62
 
63
+ def build_header()
64
+ header = []
65
+
66
+ header << 'Frequency' unless @ignores[:freq]
67
+ header << 'Word' unless @ignores[:word]
68
+ header << 'Kana' unless @ignores[:kana]
69
+ header << 'English' unless @ignores[:eng]
70
+ header << 'Definition' unless @ignores[:defn]
71
+
72
+ return header
73
+ end
74
+
75
+ def build_rows(words)
76
+ rows = []
77
+
78
+ words.each() do |word|
79
+ rows << build_word_row(word)
80
+ end
81
+
82
+ return rows
83
+ end
84
+
85
+ def build_word_row(word)
86
+ row = []
87
+
88
+ row << word.freq unless @ignores[:freq]
89
+ row << word.word unless @ignores[:word]
90
+ row << word.kana unless @ignores[:kana]
91
+ row << word.eng unless @ignores[:eng]
92
+ row << word.defn unless @ignores[:defn]
93
+
94
+ return row
95
+ end
96
+
63
97
  def filter?(article)
64
98
  return false if @filters.empty?()
65
99
 
@@ -93,24 +127,29 @@ module NHKore
93
127
  return false
94
128
  end
95
129
 
96
- def filter_by_datetime(datetime_filter=nil,from_filter: nil,to_filter: nil)
130
+ def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
97
131
  if !datetime_filter.nil?()
98
- # If out-of-bounds, just nil.
99
- from_filter = datetime_filter[0]
100
- to_filter = datetime_filter[1]
132
+ if datetime_filter.respond_to?(:'[]')
133
+ # If out-of-bounds, just nil.
134
+ from = datetime_filter[0] if from.nil?()
135
+ to = datetime_filter[1] if to.nil?()
136
+ else
137
+ from = datetime_filter if from.nil?()
138
+ to = datetime_filter if to.nil?()
139
+ end
101
140
  end
102
141
 
103
- from_filter = to_filter if from_filter.nil?()
104
- to_filter = from_filter if to_filter.nil?()
142
+ from = to if from.nil?()
143
+ to = from if to.nil?()
105
144
 
106
- from_filter = Util.jst_time(from_filter) unless from_filter.nil?()
107
- to_filter = Util.jst_time(to_filter) unless to_filter.nil?()
145
+ from = Util.jst_time(from) unless from.nil?()
146
+ to = Util.jst_time(to) unless to.nil?()
108
147
 
109
- datetime_filter = [from_filter,to_filter]
148
+ datetime_filter = [from,to]
110
149
 
111
150
  return self if datetime_filter.flatten().compact().empty?()
112
151
 
113
- @filters[:datetime] = {from: from_filter,to: to_filter}
152
+ @filters[:datetime] = {from: from,to: to}
114
153
 
115
154
  return self
116
155
  end
@@ -146,26 +185,10 @@ module NHKore
146
185
  words = sift()
147
186
 
148
187
  @output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
149
- row = []
150
-
151
- row << 'Frequency' unless @ignores[:freq]
152
- row << 'Word' unless @ignores[:word]
153
- row << 'Kana' unless @ignores[:kana]
154
- row << 'English' unless @ignores[:eng]
155
- row << 'Definition' unless @ignores[:defn]
156
-
157
- csv << row
188
+ csv << build_header()
158
189
 
159
190
  words.each() do |word|
160
- row = []
161
-
162
- row << word.freq unless @ignores[:freq]
163
- row << word.word unless @ignores[:word]
164
- row << word.kana unless @ignores[:kana]
165
- row << word.eng unless @ignores[:eng]
166
- row << word.defn unless @ignores[:defn]
167
-
168
- csv << row
191
+ csv << build_word_row(word)
169
192
  end
170
193
  end
171
194
 
@@ -227,7 +250,7 @@ module NHKore
227
250
  <h2>#{@caption}</h2>
228
251
  <table>
229
252
  EOH
230
- #" # Fix for editor
253
+ #"
231
254
 
232
255
  # If have too few or too many '<col>', invalid HTML.
233
256
  @output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
@@ -237,20 +260,20 @@ module NHKore
237
260
  @output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
238
261
 
239
262
  @output << '<tr>'
240
- @output << '<th>Frequency</th>' unless @ignores[:freq]
241
- @output << '<th>Word</th>' unless @ignores[:word]
242
- @output << '<th>Kana</th>' unless @ignores[:kana]
243
- @output << '<th>English</th>' unless @ignores[:eng]
244
- @output << '<th>Definition</th>' unless @ignores[:defn]
263
+
264
+ build_header().each() do |h|
265
+ @output << "<th>#{h}</th>"
266
+ end
267
+
245
268
  @output << "</tr>\n"
246
269
 
247
270
  words.each() do |word|
248
271
  @output << '<tr>'
249
- @output << "<td>#{Util.escape_html(word.freq.to_s())}</td>" unless @ignores[:freq]
250
- @output << "<td>#{Util.escape_html(word.word.to_s())}</td>" unless @ignores[:word]
251
- @output << "<td>#{Util.escape_html(word.kana.to_s())}</td>" unless @ignores[:kana]
252
- @output << "<td>#{Util.escape_html(word.eng.to_s())}</td>" unless @ignores[:eng]
253
- @output << "<td>#{Util.escape_html(word.defn.to_s())}</td>" unless @ignores[:defn]
272
+
273
+ build_word_row(word).each() do |w|
274
+ @output << "<td>#{Util.escape_html(w.to_s())}</td>"
275
+ end
276
+
254
277
  @output << "</tr>\n"
255
278
  end
256
279
 
@@ -259,31 +282,63 @@ module NHKore
259
282
  </body>
260
283
  </html>
261
284
  EOH
262
- #/ # Fix for editor
285
+ #/
263
286
 
264
287
  return @output
265
288
  end
266
289
 
267
- def put_yaml!()
290
+ def put_json!()
291
+ require 'json'
292
+
268
293
  words = sift()
269
294
 
270
- # Just blank out ignores.
271
- if !@ignores.empty?()
272
- words.each() do |word|
273
- # word/kanji/kana do not have setters/mutators.
274
- word.defn = nil if @ignores[:defn]
275
- word.eng = nil if @ignores[:eng]
276
- word.freq = nil if @ignores[:freq]
295
+ @output = ''.dup()
296
+
297
+ @output << <<~EOJ
298
+ {
299
+ "caption": #{JSON.generate(@caption)},
300
+ "header": #{JSON.generate(build_header())},
301
+ "words": [
302
+ EOJ
303
+
304
+ if !words.empty?()
305
+ 0.upto(words.length - 2) do |i|
306
+ @output << " #{JSON.generate(build_word_row(words[i]))},\n"
277
307
  end
308
+
309
+ @output << " #{JSON.generate(build_word_row(words[-1]))}\n"
278
310
  end
279
311
 
312
+ @output << "]\n}\n"
313
+
314
+ return @output
315
+ end
316
+
317
+ def put_yaml!()
318
+ require 'psychgus'
319
+
320
+ words = sift()
321
+
280
322
  yaml = {
281
323
  caption: @caption,
282
- words: words
324
+ header: build_header(),
325
+ words: build_rows(words),
283
326
  }
284
327
 
328
+ header_styler = Class.new() do
329
+ include Psychgus::Styler
330
+
331
+ def style_sequence(sniffer,node)
332
+ parent = sniffer.parent
333
+
334
+ if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
335
+ node.style = Psychgus::SEQUENCE_FLOW
336
+ end
337
+ end
338
+ end
339
+
285
340
  # Put each Word on one line (flow/inline style).
286
- @output = Util.dump_yaml(yaml,flow_level: 4)
341
+ @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
287
342
 
288
343
  return @output
289
344
  end
@@ -301,7 +356,7 @@ module NHKore
301
356
 
302
357
  words = master_article.words.values()
303
358
 
304
- words = words.sort() do |word1,word2|
359
+ words.sort!() do |word1,word2|
305
360
  # Order by freq DESC (most frequent words to top).
306
361
  i = (word2.freq <=> word1.freq)
307
362
 
@@ -22,8 +22,7 @@
22
22
 
23
23
 
24
24
  require 'cgi'
25
- require 'psychgus'
26
- require 'public_suffix'
25
+ require 'set'
27
26
  require 'time'
28
27
  require 'uri'
29
28
 
@@ -68,21 +67,28 @@ module NHKore
68
67
  end
69
68
 
70
69
  def self.domain(host,clean: true)
70
+ require 'public_suffix'
71
+
71
72
  domain = PublicSuffix.domain(host)
72
73
  domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
73
74
 
74
75
  return domain
75
76
  end
76
77
 
77
- def self.dump_yaml(obj,flow_level: 8)
78
+ def self.dump_yaml(obj,flow_level: 8,stylers: nil)
79
+ require 'psychgus'
80
+
81
+ stylers = Array(stylers)
82
+
78
83
  return Psychgus.dump(obj,
79
84
  deref_aliases: true, # Dereference aliases for load_yaml()
85
+ header: true, # %YAML [version]
80
86
  line_width: 10000, # Try not to wrap; ichiman!
81
87
  stylers: [
82
88
  Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
83
89
  Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
84
90
  Psychgus::NoTagStyler.new(), # Remove class names (tags)
85
- ],
91
+ ].concat(stylers),
86
92
  )
87
93
  end
88
94
 
@@ -102,23 +108,6 @@ module NHKore
102
108
  return !str.match?(/[\/\\]/)
103
109
  end
104
110
 
105
- def self.guess_year(year)
106
- if year < 100
107
- # 2021 -> 2000.
108
- millennium = JST_YEAR / 100 * 100
109
-
110
- # If year <= (2021 -> 21), assume this century.
111
- if year <= (JST_YEAR % 100)
112
- year = millennium + year
113
- else
114
- # Assume previous century (2000 -> 1900).
115
- year = (millennium - 100) + year
116
- end
117
- end
118
-
119
- return year
120
- end
121
-
122
111
  def self.hiragana?(str)
123
112
  return HIRAGANA_REGEX =~ str
124
113
  end
@@ -142,6 +131,8 @@ module NHKore
142
131
  end
143
132
 
144
133
  def self.load_yaml(data,file: nil,**kargs)
134
+ require 'psychgus'
135
+
145
136
  return Psych.safe_load(data,
146
137
  aliases: false,
147
138
  filename: file,