nhkore 0.3.1 → 0.3.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
 
24
+ require 'attr_bool'
24
25
  require 'nokogiri'
25
26
  require 'open-uri'
26
27
 
@@ -40,8 +41,8 @@ module NHKore
40
41
  'dnt' => '1',
41
42
  }
42
43
 
43
- attr_accessor :eat_cookie
44
- attr_accessor :is_file
44
+ attr_accessor? :eat_cookie
45
+ attr_accessor? :is_file
45
46
  attr_reader :kargs
46
47
  attr_accessor :max_redirects
47
48
  attr_accessor :max_retries
@@ -49,9 +50,6 @@ module NHKore
49
50
  attr_accessor :str_or_io
50
51
  attr_accessor :url
51
52
 
52
- alias_method :eat_cookie?,:eat_cookie
53
- alias_method :is_file?,:is_file
54
-
55
53
  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
56
54
  #
57
55
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
@@ -82,7 +80,7 @@ module NHKore
82
80
  @max_retries = max_retries
83
81
  @redirect_rule = redirect_rule
84
82
 
85
- open(url,str_or_io)
83
+ open(url,str_or_io,is_file: is_file)
86
84
  end
87
85
 
88
86
  def fetch_cookie(url)
@@ -119,14 +117,14 @@ module NHKore
119
117
  return URI::join(@url,relative_url)
120
118
  end
121
119
 
122
- def open(url,str_or_io=nil)
120
+ def open(url,str_or_io=nil,is_file: @is_file)
121
+ @is_file = is_file
123
122
  @str_or_io = str_or_io
124
123
  @url = url
125
124
 
126
125
  if str_or_io.nil?()
127
126
  if @is_file
128
- # NHK's website tends to always use UTF-8.
129
- @str_or_io = File.open(url,'rt:UTF-8',**@kargs)
127
+ open_file(url)
130
128
  else
131
129
  fetch_cookie(url) if @eat_cookie
132
130
  open_url(url)
@@ -136,6 +134,16 @@ module NHKore
136
134
  return self
137
135
  end
138
136
 
137
+ def open_file(file)
138
+ @is_file = true
139
+ @url = file
140
+
141
+ # NHK's website tends to always use UTF-8.
142
+ @str_or_io = File.open(file,'rt:UTF-8',**@kargs)
143
+
144
+ return self
145
+ end
146
+
139
147
  def open_url(url)
140
148
  max_redirects = (@max_redirects.nil?() || @max_redirects < 0) ? 10_000 : @max_redirects
141
149
  max_retries = (@max_retries.nil?() || @max_retries < 0) ? 10_000 : @max_retries
@@ -194,6 +202,10 @@ module NHKore
194
202
  return @str_or_io
195
203
  end
196
204
 
205
+ def reopen()
206
+ return open(@url)
207
+ end
208
+
197
209
  def rss_doc()
198
210
  require 'rss'
199
211
 
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
 
24
+ require 'attr_bool'
24
25
  require 'time'
25
26
 
26
27
  require 'nhkore/fileable'
@@ -33,14 +34,12 @@ module NHKore
33
34
  # @since 0.2.0
34
35
  ###
35
36
  class SearchLink
36
- attr_accessor :datetime
37
- attr_accessor :futsuurl
38
- attr_accessor :scraped
37
+ attr_reader :datetime
38
+ attr_reader :futsuurl
39
+ attr_accessor? :scraped
39
40
  attr_accessor :sha256
40
41
  attr_accessor :title
41
- attr_accessor :url
42
-
43
- alias_method :scraped?,:scraped
42
+ attr_reader :url
44
43
 
45
44
  def initialize(url,scraped: false)
46
45
  super()
@@ -50,29 +49,27 @@ module NHKore
50
49
  @scraped = scraped
51
50
  @sha256 = sha256
52
51
  @title = nil
53
- @url = url
52
+ self.url = url
54
53
  end
55
54
 
56
55
  def encode_with(coder)
57
56
  # Order matters.
58
57
 
59
- coder[:url] = @url
58
+ coder[:url] = @url.nil?() ? nil : @url.to_s()
60
59
  coder[:scraped] = @scraped
61
- coder[:datetime] = @datetime.nil?() ? @datetime : @datetime.iso8601()
60
+ coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
62
61
  coder[:title] = @title
63
- coder[:futsuurl] = @futsuurl
62
+ coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
64
63
  coder[:sha256] = @sha256
65
64
  end
66
65
 
67
66
  def self.load_data(key,hash)
68
- datetime = hash[:datetime]
69
-
70
67
  slink = SearchLink.new(
71
68
  hash[:url],
72
- scraped: hash[:scraped]
69
+ scraped: hash[:scraped],
73
70
  )
74
71
 
75
- slink.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
72
+ slink.datetime = hash[:datetime]
76
73
  slink.futsuurl = hash[:futsuurl]
77
74
  slink.sha256 = hash[:sha256]
78
75
  slink.title = hash[:title]
@@ -83,13 +80,31 @@ module NHKore
83
80
  def update_from_article(article)
84
81
  # Don't update the url, as it may be different (e.g., http vs https).
85
82
 
86
- @datetime = article.datetime if @datetime.nil?()
87
- @futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
83
+ self.datetime = article.datetime if @datetime.nil?()
84
+ self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
88
85
  @scraped = true # If we have an article, it's been scraped
89
86
  @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
90
87
  @title = article.title if Util.empty_web_str?(@title)
91
88
  end
92
89
 
90
+ def datetime=(value)
91
+ if value.is_a?(Time)
92
+ @datetime = value
93
+ else
94
+ @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
95
+ end
96
+ end
97
+
98
+ def futsuurl=(value)
99
+ # Don't store URI, store String.
100
+ @futsuurl = value.nil?() ? nil : value.to_s()
101
+ end
102
+
103
+ def url=(value)
104
+ # Don't store URI, store String.
105
+ @url = value.nil?() ? nil : value.to_s()
106
+ end
107
+
93
108
  def to_s(mini: false)
94
109
  s = ''.dup()
95
110
 
@@ -137,9 +152,11 @@ module NHKore
137
152
  end
138
153
 
139
154
  def add_link(link)
140
- return self if @links.key?(link.url)
155
+ url = link.url.nil?() ? nil : link.url.to_s()
156
+
157
+ return self if @links.key?(url)
141
158
 
142
- @links[link.url] = link
159
+ @links[url] = link
143
160
 
144
161
  return self
145
162
  end
@@ -163,7 +180,7 @@ module NHKore
163
180
 
164
181
  if !links.nil?()
165
182
  links.each() do |key,hash|
166
- key = key.to_s() # Change from a symbol
183
+ key = key.to_s() unless key.nil?()
167
184
  slinks.links[key] = SearchLink.load_data(key,hash)
168
185
  end
169
186
  end
@@ -173,6 +190,7 @@ module NHKore
173
190
 
174
191
  def [](url)
175
192
  url = url.url if url.respond_to?(:url)
193
+ url = url.to_s() unless url.nil?()
176
194
 
177
195
  return @links[url]
178
196
  end
@@ -61,6 +61,7 @@ module NHKore
61
61
  return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
62
62
  return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
63
63
  return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
64
+ return true if link =~ /cgi2.*enqform/ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
64
65
 
65
66
  return false
66
67
  end
@@ -60,6 +60,40 @@ module NHKore
60
60
  @output = nil
61
61
  end
62
62
 
63
+ def build_header()
64
+ header = []
65
+
66
+ header << 'Frequency' unless @ignores[:freq]
67
+ header << 'Word' unless @ignores[:word]
68
+ header << 'Kana' unless @ignores[:kana]
69
+ header << 'English' unless @ignores[:eng]
70
+ header << 'Definition' unless @ignores[:defn]
71
+
72
+ return header
73
+ end
74
+
75
+ def build_rows(words)
76
+ rows = []
77
+
78
+ words.each() do |word|
79
+ rows << build_word_row(word)
80
+ end
81
+
82
+ return rows
83
+ end
84
+
85
+ def build_word_row(word)
86
+ row = []
87
+
88
+ row << word.freq unless @ignores[:freq]
89
+ row << word.word unless @ignores[:word]
90
+ row << word.kana unless @ignores[:kana]
91
+ row << word.eng unless @ignores[:eng]
92
+ row << word.defn unless @ignores[:defn]
93
+
94
+ return row
95
+ end
96
+
63
97
  def filter?(article)
64
98
  return false if @filters.empty?()
65
99
 
@@ -93,24 +127,29 @@ module NHKore
93
127
  return false
94
128
  end
95
129
 
96
- def filter_by_datetime(datetime_filter=nil,from_filter: nil,to_filter: nil)
130
+ def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
97
131
  if !datetime_filter.nil?()
98
- # If out-of-bounds, just nil.
99
- from_filter = datetime_filter[0]
100
- to_filter = datetime_filter[1]
132
+ if datetime_filter.respond_to?(:'[]')
133
+ # If out-of-bounds, just nil.
134
+ from = datetime_filter[0] if from.nil?()
135
+ to = datetime_filter[1] if to.nil?()
136
+ else
137
+ from = datetime_filter if from.nil?()
138
+ to = datetime_filter if to.nil?()
139
+ end
101
140
  end
102
141
 
103
- from_filter = to_filter if from_filter.nil?()
104
- to_filter = from_filter if to_filter.nil?()
142
+ from = to if from.nil?()
143
+ to = from if to.nil?()
105
144
 
106
- from_filter = Util.jst_time(from_filter) unless from_filter.nil?()
107
- to_filter = Util.jst_time(to_filter) unless to_filter.nil?()
145
+ from = Util.jst_time(from) unless from.nil?()
146
+ to = Util.jst_time(to) unless to.nil?()
108
147
 
109
- datetime_filter = [from_filter,to_filter]
148
+ datetime_filter = [from,to]
110
149
 
111
150
  return self if datetime_filter.flatten().compact().empty?()
112
151
 
113
- @filters[:datetime] = {from: from_filter,to: to_filter}
152
+ @filters[:datetime] = {from: from,to: to}
114
153
 
115
154
  return self
116
155
  end
@@ -146,26 +185,10 @@ module NHKore
146
185
  words = sift()
147
186
 
148
187
  @output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
149
- row = []
150
-
151
- row << 'Frequency' unless @ignores[:freq]
152
- row << 'Word' unless @ignores[:word]
153
- row << 'Kana' unless @ignores[:kana]
154
- row << 'English' unless @ignores[:eng]
155
- row << 'Definition' unless @ignores[:defn]
156
-
157
- csv << row
188
+ csv << build_header()
158
189
 
159
190
  words.each() do |word|
160
- row = []
161
-
162
- row << word.freq unless @ignores[:freq]
163
- row << word.word unless @ignores[:word]
164
- row << word.kana unless @ignores[:kana]
165
- row << word.eng unless @ignores[:eng]
166
- row << word.defn unless @ignores[:defn]
167
-
168
- csv << row
191
+ csv << build_word_row(word)
169
192
  end
170
193
  end
171
194
 
@@ -227,7 +250,7 @@ module NHKore
227
250
  <h2>#{@caption}</h2>
228
251
  <table>
229
252
  EOH
230
- #" # Fix for editor
253
+ #"
231
254
 
232
255
  # If have too few or too many '<col>', invalid HTML.
233
256
  @output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
@@ -237,20 +260,20 @@ module NHKore
237
260
  @output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
238
261
 
239
262
  @output << '<tr>'
240
- @output << '<th>Frequency</th>' unless @ignores[:freq]
241
- @output << '<th>Word</th>' unless @ignores[:word]
242
- @output << '<th>Kana</th>' unless @ignores[:kana]
243
- @output << '<th>English</th>' unless @ignores[:eng]
244
- @output << '<th>Definition</th>' unless @ignores[:defn]
263
+
264
+ build_header().each() do |h|
265
+ @output << "<th>#{h}</th>"
266
+ end
267
+
245
268
  @output << "</tr>\n"
246
269
 
247
270
  words.each() do |word|
248
271
  @output << '<tr>'
249
- @output << "<td>#{Util.escape_html(word.freq.to_s())}</td>" unless @ignores[:freq]
250
- @output << "<td>#{Util.escape_html(word.word.to_s())}</td>" unless @ignores[:word]
251
- @output << "<td>#{Util.escape_html(word.kana.to_s())}</td>" unless @ignores[:kana]
252
- @output << "<td>#{Util.escape_html(word.eng.to_s())}</td>" unless @ignores[:eng]
253
- @output << "<td>#{Util.escape_html(word.defn.to_s())}</td>" unless @ignores[:defn]
272
+
273
+ build_word_row(word).each() do |w|
274
+ @output << "<td>#{Util.escape_html(w.to_s())}</td>"
275
+ end
276
+
254
277
  @output << "</tr>\n"
255
278
  end
256
279
 
@@ -259,31 +282,63 @@ module NHKore
259
282
  </body>
260
283
  </html>
261
284
  EOH
262
- #/ # Fix for editor
285
+ #/
263
286
 
264
287
  return @output
265
288
  end
266
289
 
267
- def put_yaml!()
290
+ def put_json!()
291
+ require 'json'
292
+
268
293
  words = sift()
269
294
 
270
- # Just blank out ignores.
271
- if !@ignores.empty?()
272
- words.each() do |word|
273
- # word/kanji/kana do not have setters/mutators.
274
- word.defn = nil if @ignores[:defn]
275
- word.eng = nil if @ignores[:eng]
276
- word.freq = nil if @ignores[:freq]
295
+ @output = ''.dup()
296
+
297
+ @output << <<~EOJ
298
+ {
299
+ "caption": #{JSON.generate(@caption)},
300
+ "header": #{JSON.generate(build_header())},
301
+ "words": [
302
+ EOJ
303
+
304
+ if !words.empty?()
305
+ 0.upto(words.length - 2) do |i|
306
+ @output << " #{JSON.generate(build_word_row(words[i]))},\n"
277
307
  end
308
+
309
+ @output << " #{JSON.generate(build_word_row(words[-1]))}\n"
278
310
  end
279
311
 
312
+ @output << "]\n}\n"
313
+
314
+ return @output
315
+ end
316
+
317
+ def put_yaml!()
318
+ require 'psychgus'
319
+
320
+ words = sift()
321
+
280
322
  yaml = {
281
323
  caption: @caption,
282
- words: words
324
+ header: build_header(),
325
+ words: build_rows(words),
283
326
  }
284
327
 
328
+ header_styler = Class.new() do
329
+ include Psychgus::Styler
330
+
331
+ def style_sequence(sniffer,node)
332
+ parent = sniffer.parent
333
+
334
+ if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
335
+ node.style = Psychgus::SEQUENCE_FLOW
336
+ end
337
+ end
338
+ end
339
+
285
340
  # Put each Word on one line (flow/inline style).
286
- @output = Util.dump_yaml(yaml,flow_level: 4)
341
+ @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
287
342
 
288
343
  return @output
289
344
  end
@@ -301,7 +356,7 @@ module NHKore
301
356
 
302
357
  words = master_article.words.values()
303
358
 
304
- words = words.sort() do |word1,word2|
359
+ words.sort!() do |word1,word2|
305
360
  # Order by freq DESC (most frequent words to top).
306
361
  i = (word2.freq <=> word1.freq)
307
362
 
@@ -22,8 +22,7 @@
22
22
 
23
23
 
24
24
  require 'cgi'
25
- require 'psychgus'
26
- require 'public_suffix'
25
+ require 'set'
27
26
  require 'time'
28
27
  require 'uri'
29
28
 
@@ -68,21 +67,28 @@ module NHKore
68
67
  end
69
68
 
70
69
  def self.domain(host,clean: true)
70
+ require 'public_suffix'
71
+
71
72
  domain = PublicSuffix.domain(host)
72
73
  domain = unspace_web_str(domain).downcase() if !domain.nil?() && clean
73
74
 
74
75
  return domain
75
76
  end
76
77
 
77
- def self.dump_yaml(obj,flow_level: 8)
78
+ def self.dump_yaml(obj,flow_level: 8,stylers: nil)
79
+ require 'psychgus'
80
+
81
+ stylers = Array(stylers)
82
+
78
83
  return Psychgus.dump(obj,
79
84
  deref_aliases: true, # Dereference aliases for load_yaml()
85
+ header: true, # %YAML [version]
80
86
  line_width: 10000, # Try not to wrap; ichiman!
81
87
  stylers: [
82
88
  Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
83
89
  Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
84
90
  Psychgus::NoTagStyler.new(), # Remove class names (tags)
85
- ],
91
+ ].concat(stylers),
86
92
  )
87
93
  end
88
94
 
@@ -102,23 +108,6 @@ module NHKore
102
108
  return !str.match?(/[\/\\]/)
103
109
  end
104
110
 
105
- def self.guess_year(year)
106
- if year < 100
107
- # 2021 -> 2000.
108
- millennium = JST_YEAR / 100 * 100
109
-
110
- # If year <= (2021 -> 21), assume this century.
111
- if year <= (JST_YEAR % 100)
112
- year = millennium + year
113
- else
114
- # Assume previous century (2000 -> 1900).
115
- year = (millennium - 100) + year
116
- end
117
- end
118
-
119
- return year
120
- end
121
-
122
111
  def self.hiragana?(str)
123
112
  return HIRAGANA_REGEX =~ str
124
113
  end
@@ -142,6 +131,8 @@ module NHKore
142
131
  end
143
132
 
144
133
  def self.load_yaml(data,file: nil,**kargs)
134
+ require 'psychgus'
135
+
145
136
  return Psych.safe_load(data,
146
137
  aliases: false,
147
138
  filename: file,