nhkore 0.3.7 → 0.3.11

Sign up to get free protection for your applications and to get access to all the features.
data/lib/nhkore/sifter.rb CHANGED
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -28,179 +16,179 @@ require 'nhkore/util'
28
16
 
29
17
  module NHKore
30
18
  ###
31
- # @author Jonathan Bradley Whited (@esotericpig)
19
+ # @author Jonathan Bradley Whited
32
20
  # @since 0.2.0
33
21
  ###
34
22
  class Sifter
35
23
  include Fileable
36
-
24
+
37
25
  DEFAULT_DIR = Util::CORE_DIR
38
-
26
+
39
27
  DEFAULT_FUTSUU_FILENAME = 'sift_nhk_news_web_regular'
40
28
  DEFAULT_YASASHII_FILENAME = 'sift_nhk_news_web_easy'
41
-
29
+
42
30
  def self.build_file(filename)
43
31
  return File.join(DEFAULT_DIR,filename)
44
32
  end
45
-
33
+
46
34
  DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
47
35
  DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
48
-
36
+
49
37
  attr_accessor :articles
50
38
  attr_accessor :caption
51
39
  attr_accessor :filters
52
40
  attr_accessor :ignores
53
41
  attr_accessor :output
54
-
42
+
55
43
  def initialize(news)
56
- @articles = news.articles.values.dup()
44
+ @articles = news.articles.values.dup
57
45
  @caption = nil
58
46
  @filters = {}
59
47
  @ignores = {}
60
48
  @output = nil
61
49
  end
62
-
63
- def build_header()
50
+
51
+ def build_header
64
52
  header = []
65
-
53
+
66
54
  header << 'Frequency' unless @ignores[:freq]
67
55
  header << 'Word' unless @ignores[:word]
68
56
  header << 'Kana' unless @ignores[:kana]
69
57
  header << 'English' unless @ignores[:eng]
70
58
  header << 'Definition' unless @ignores[:defn]
71
-
59
+
72
60
  return header
73
61
  end
74
-
62
+
75
63
  def build_rows(words)
76
64
  rows = []
77
-
78
- words.each() do |word|
65
+
66
+ words.each do |word|
79
67
  rows << build_word_row(word)
80
68
  end
81
-
69
+
82
70
  return rows
83
71
  end
84
-
72
+
85
73
  def build_word_row(word)
86
74
  row = []
87
-
75
+
88
76
  row << word.freq unless @ignores[:freq]
89
77
  row << word.word unless @ignores[:word]
90
78
  row << word.kana unless @ignores[:kana]
91
79
  row << word.eng unless @ignores[:eng]
92
80
  row << word.defn unless @ignores[:defn]
93
-
81
+
94
82
  return row
95
83
  end
96
-
84
+
97
85
  def filter?(article)
98
- return false if @filters.empty?()
99
-
86
+ return false if @filters.empty?
87
+
100
88
  datetime_filter = @filters[:datetime]
101
89
  title_filter = @filters[:title]
102
90
  url_filter = @filters[:url]
103
-
104
- if !datetime_filter.nil?()
91
+
92
+ if !datetime_filter.nil?
105
93
  datetime = article.datetime
106
-
107
- return true if datetime.nil?() ||
94
+
95
+ return true if datetime.nil? ||
108
96
  datetime < datetime_filter[:from] || datetime > datetime_filter[:to]
109
97
  end
110
-
111
- if !title_filter.nil?()
112
- title = article.title.to_s()
98
+
99
+ if !title_filter.nil?
100
+ title = article.title.to_s
113
101
  title = Util.unspace_web_str(title) if title_filter[:unspace]
114
- title = title.downcase() if title_filter[:uncase]
115
-
102
+ title = title.downcase if title_filter[:uncase]
103
+
116
104
  return true unless title.include?(title_filter[:filter])
117
105
  end
118
-
119
- if !url_filter.nil?()
120
- url = article.url.to_s()
106
+
107
+ if !url_filter.nil?
108
+ url = article.url.to_s
121
109
  url = Util.unspace_web_str(url) if url_filter[:unspace]
122
- url = url.downcase() if url_filter[:uncase]
123
-
110
+ url = url.downcase if url_filter[:uncase]
111
+
124
112
  return true unless url.include?(url_filter[:filter])
125
113
  end
126
-
114
+
127
115
  return false
128
116
  end
129
-
117
+
130
118
  def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
131
- if !datetime_filter.nil?()
132
- if datetime_filter.respond_to?(:'[]')
119
+ if !datetime_filter.nil?
120
+ if datetime_filter.respond_to?(:[])
133
121
  # If out-of-bounds, just nil.
134
- from = datetime_filter[0] if from.nil?()
135
- to = datetime_filter[1] if to.nil?()
122
+ from = datetime_filter[0] if from.nil?
123
+ to = datetime_filter[1] if to.nil?
136
124
  else
137
- from = datetime_filter if from.nil?()
138
- to = datetime_filter if to.nil?()
125
+ from = datetime_filter if from.nil?
126
+ to = datetime_filter if to.nil?
139
127
  end
140
128
  end
141
-
142
- from = to if from.nil?()
143
- to = from if to.nil?()
144
-
145
- from = Util.jst_time(from) unless from.nil?()
146
- to = Util.jst_time(to) unless to.nil?()
147
-
129
+
130
+ from = to if from.nil?
131
+ to = from if to.nil?
132
+
133
+ from = Util.jst_time(from) unless from.nil?
134
+ to = Util.jst_time(to) unless to.nil?
135
+
148
136
  datetime_filter = [from,to]
149
-
150
- return self if datetime_filter.flatten().compact().empty?()
151
-
137
+
138
+ return self if datetime_filter.flatten.compact.empty?
139
+
152
140
  @filters[:datetime] = {from: from,to: to}
153
-
141
+
154
142
  return self
155
143
  end
156
-
144
+
157
145
  def filter_by_title(title_filter,uncase: true,unspace: true)
158
146
  title_filter = Util.unspace_web_str(title_filter) if unspace
159
- title_filter = title_filter.downcase() if uncase
160
-
147
+ title_filter = title_filter.downcase if uncase
148
+
161
149
  @filters[:title] = {filter: title_filter,uncase: uncase,unspace: unspace}
162
-
150
+
163
151
  return self
164
152
  end
165
-
153
+
166
154
  def filter_by_url(url_filter,uncase: true,unspace: true)
167
155
  url_filter = Util.unspace_web_str(url_filter) if unspace
168
- url_filter = url_filter.downcase() if uncase
169
-
156
+ url_filter = url_filter.downcase if uncase
157
+
170
158
  @filters[:url] = {filter: url_filter,uncase: uncase,unspace: unspace}
171
-
159
+
172
160
  return self
173
161
  end
174
-
162
+
175
163
  def ignore(key)
176
164
  @ignores[key] = true
177
-
165
+
178
166
  return self
179
167
  end
180
-
168
+
181
169
  # This does not output {caption}.
182
- def put_csv!()
170
+ def put_csv!
183
171
  require 'csv'
184
-
185
- words = sift()
186
-
172
+
173
+ words = sift
174
+
187
175
  @output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
188
- csv << build_header()
189
-
190
- words.each() do |word|
176
+ csv << build_header
177
+
178
+ words.each do |word|
191
179
  csv << build_word_row(word)
192
180
  end
193
181
  end
194
-
182
+
195
183
  return @output
196
184
  end
197
-
198
- def put_html!()
199
- words = sift()
200
-
201
- @output = ''.dup()
202
-
203
- @output << <<~EOH
185
+
186
+ def put_html!
187
+ words = sift
188
+
189
+ @output = ''.dup
190
+
191
+ @output << <<~HTML
204
192
  <!DOCTYPE html>
205
193
  <html lang="ja">
206
194
  <head>
@@ -249,146 +237,144 @@ module NHKore
249
237
  <h1>NHKore</h1>
250
238
  <h2>#{@caption}</h2>
251
239
  <table>
252
- EOH
253
- #"
254
-
240
+ HTML
241
+
255
242
  # If have too few or too many '<col>', invalid HTML.
256
- @output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
257
- @output << %Q{<col style="width:17em;">\n} unless @ignores[:word]
258
- @output << %Q{<col style="width:17em;">\n} unless @ignores[:kana]
259
- @output << %Q{<col style="width:5em;">\n} unless @ignores[:eng]
243
+ @output << %Q(<col style="width:6em;">\n) unless @ignores[:freq]
244
+ @output << %Q(<col style="width:17em;">\n) unless @ignores[:word]
245
+ @output << %Q(<col style="width:17em;">\n) unless @ignores[:kana]
246
+ @output << %Q(<col style="width:5em;">\n) unless @ignores[:eng]
260
247
  @output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
261
-
248
+
262
249
  @output << '<tr>'
263
-
264
- build_header().each() do |h|
250
+
251
+ build_header.each do |h|
265
252
  @output << "<th>#{h}</th>"
266
253
  end
267
-
254
+
268
255
  @output << "</tr>\n"
269
-
270
- words.each() do |word|
256
+
257
+ words.each do |word|
271
258
  @output << '<tr>'
272
-
273
- build_word_row(word).each() do |w|
274
- @output << "<td>#{Util.escape_html(w.to_s())}</td>"
259
+
260
+ build_word_row(word).each do |w|
261
+ @output << "<td>#{Util.escape_html(w.to_s)}</td>"
275
262
  end
276
-
263
+
277
264
  @output << "</tr>\n"
278
265
  end
279
-
280
- @output << <<~EOH
266
+
267
+ @output << <<~HTML
281
268
  </table>
282
269
  </body>
283
270
  </html>
284
- EOH
285
- #/
286
-
271
+ HTML
272
+
287
273
  return @output
288
274
  end
289
-
290
- def put_json!()
275
+
276
+ def put_json!
291
277
  require 'json'
292
-
293
- words = sift()
294
-
295
- @output = ''.dup()
296
-
297
- @output << <<~EOJ
278
+
279
+ words = sift
280
+
281
+ @output = ''.dup
282
+
283
+ @output << <<~JSON
298
284
  {
299
285
  "caption": #{JSON.generate(@caption)},
300
- "header": #{JSON.generate(build_header())},
286
+ "header": #{JSON.generate(build_header)},
301
287
  "words": [
302
- EOJ
303
-
304
- if !words.empty?()
288
+ JSON
289
+
290
+ if !words.empty?
305
291
  0.upto(words.length - 2) do |i|
306
292
  @output << " #{JSON.generate(build_word_row(words[i]))},\n"
307
293
  end
308
-
294
+
309
295
  @output << " #{JSON.generate(build_word_row(words[-1]))}\n"
310
296
  end
311
-
297
+
312
298
  @output << "]\n}\n"
313
-
299
+
314
300
  return @output
315
301
  end
316
-
317
- def put_yaml!()
302
+
303
+ def put_yaml!
318
304
  require 'psychgus'
319
-
320
- words = sift()
321
-
305
+
306
+ words = sift
307
+
322
308
  yaml = {
323
309
  caption: @caption,
324
- header: build_header(),
310
+ header: build_header,
325
311
  words: build_rows(words),
326
312
  }
327
-
328
- header_styler = Class.new() do
313
+
314
+ header_styler = Class.new do
329
315
  include Psychgus::Styler
330
-
316
+
331
317
  def style_sequence(sniffer,node)
332
318
  parent = sniffer.parent
333
-
334
- if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
319
+
320
+ if !parent.nil? && parent.node.respond_to?(:value) && parent.value == 'header'
335
321
  node.style = Psychgus::SEQUENCE_FLOW
336
322
  end
337
323
  end
338
324
  end
339
-
325
+
340
326
  # Put each Word on one line (flow/inline style).
341
- @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
342
-
327
+ @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new)
328
+
343
329
  return @output
344
330
  end
345
-
346
- def sift()
347
- master_article = Article.new()
348
-
349
- @articles.each() do |article|
331
+
332
+ def sift
333
+ master_article = Article.new
334
+
335
+ @articles.each do |article|
350
336
  next if filter?(article)
351
-
352
- article.words.values().each() do |word|
337
+
338
+ article.words.each_value do |word|
353
339
  master_article.add_word(word,use_freq: true)
354
340
  end
355
341
  end
356
-
357
- words = master_article.words.values()
358
-
359
- words.sort!() do |word1,word2|
342
+
343
+ words = master_article.words.values
344
+
345
+ words.sort! do |word1,word2|
360
346
  # Order by freq DESC (most frequent words to top).
361
347
  i = (word2.freq <=> word1.freq)
362
-
348
+
363
349
  # Order by !defn.empty, word ASC, !kana.empty, kana ASC, defn.len DESC, defn ASC.
364
350
  i = compare_empty_str(word1.defn,word2.defn) if i == 0 # Favor words that have definitions
365
- i = (word1.word.to_s() <=> word2.word.to_s()) if i == 0
351
+ i = (word1.word.to_s <=> word2.word.to_s) if i == 0
366
352
  i = compare_empty_str(word1.kana,word2.kana) if i == 0 # Favor words that have kana
367
- i = (word1.kana.to_s() <=> word2.kana.to_s()) if i == 0
368
- i = (word2.defn.to_s().length <=> word1.defn.to_s().length) if i == 0 # Favor longer definitions
369
- i = (word1.defn.to_s() <=> word2.defn.to_s()) if i == 0
370
-
353
+ i = (word1.kana.to_s <=> word2.kana.to_s) if i == 0
354
+ i = (word2.defn.to_s.length <=> word1.defn.to_s.length) if i == 0 # Favor longer definitions
355
+ i = (word1.defn.to_s <=> word2.defn.to_s) if i == 0
356
+
371
357
  i
372
358
  end
373
-
359
+
374
360
  return words
375
361
  end
376
-
362
+
377
363
  def compare_empty_str(str1,str2)
378
364
  has_str1 = !Util.empty_web_str?(str1)
379
365
  has_str2 = !Util.empty_web_str?(str2)
380
-
366
+
381
367
  if has_str1 && !has_str2
382
368
  return -1 # Bubble word1 to top
383
369
  elsif !has_str1 && has_str2
384
370
  return 1 # Bubble word2 to top
385
371
  end
386
-
372
+
387
373
  return 0 # Further comparison needed
388
374
  end
389
-
390
- def to_s()
391
- return @output.to_s()
375
+
376
+ def to_s
377
+ return @output.to_s
392
378
  end
393
379
  end
394
380
  end
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -26,24 +14,24 @@ require 'nhkore/util'
26
14
 
27
15
  module NHKore
28
16
  ###
29
- # @author Jonathan Bradley Whited (@esotericpig)
17
+ # @author Jonathan Bradley Whited
30
18
  # @since 0.2.0
31
19
  ###
32
20
  class Splitter
33
21
  def begin_split(str)
34
22
  return str
35
23
  end
36
-
24
+
37
25
  def split(str)
38
26
  str = begin_split(str)
39
27
  str = end_split(str)
40
-
28
+
41
29
  return str
42
30
  end
43
31
  end
44
-
32
+
45
33
  ###
46
- # @author Jonathan Bradley Whited (@esotericpig)
34
+ # @author Jonathan Bradley Whited
47
35
  # @since 0.2.0
48
36
  ###
49
37
  class BasicSplitter < Splitter
@@ -51,43 +39,43 @@ module NHKore
51
39
  return str.split(Util::NORMALIZE_STR_REGEX)
52
40
  end
53
41
  end
54
-
42
+
55
43
  ###
56
44
  # @since 0.2.0
57
45
  ###
58
46
  class BimyouSplitter < Splitter
59
47
  def initialize(*)
60
48
  require 'bimyou_segmenter'
61
-
49
+
62
50
  super
63
51
  end
64
-
52
+
65
53
  def end_split(str)
66
54
  return BimyouSegmenter.segment(str,symbol: false,white_space: false)
67
55
  end
68
56
  end
69
-
57
+
70
58
  ###
71
59
  # @since 0.2.0
72
60
  ###
73
61
  class TinySplitter < Splitter
74
62
  attr_accessor :tiny
75
-
63
+
76
64
  def initialize(*)
77
65
  require 'tiny_segmenter'
78
-
66
+
79
67
  super
80
-
81
- @tiny = TinySegmenter.new()
68
+
69
+ @tiny = TinySegmenter.new
82
70
  end
83
-
71
+
84
72
  def end_split(str)
85
73
  return @tiny.segment(str,ignore_punctuation: true)
86
74
  end
87
75
  end
88
-
76
+
89
77
  ###
90
- # @author Jonathan Bradley Whited (@esotericpig)
78
+ # @author Jonathan Bradley Whited
91
79
  # @since 0.2.0
92
80
  ###
93
81
  class BestSplitter < BimyouSplitter