nhkore 0.3.7 → 0.3.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +36 -33
- data/README.md +36 -30
- data/Rakefile +38 -52
- data/bin/nhkore +4 -15
- data/lib/nhkore/app.rb +235 -234
- data/lib/nhkore/article.rb +39 -53
- data/lib/nhkore/article_scraper.rb +293 -285
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +143 -153
- data/lib/nhkore/cli/search_cmd.rb +108 -118
- data/lib/nhkore/cli/sift_cmd.rb +109 -120
- data/lib/nhkore/datetime_parser.rb +88 -104
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +5 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +58 -72
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +74 -83
- data/lib/nhkore/search_link.rb +62 -76
- data/lib/nhkore/search_scraper.rb +81 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -84
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +99 -97
- data/lib/nhkore.rb +8 -20
- data/nhkore.gemspec +30 -51
- data/samples/looper.rb +18 -29
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +33 -24
data/lib/nhkore/sifter.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -28,179 +16,179 @@ require 'nhkore/util'
|
|
28
16
|
|
29
17
|
module NHKore
|
30
18
|
###
|
31
|
-
# @author Jonathan Bradley Whited
|
19
|
+
# @author Jonathan Bradley Whited
|
32
20
|
# @since 0.2.0
|
33
21
|
###
|
34
22
|
class Sifter
|
35
23
|
include Fileable
|
36
|
-
|
24
|
+
|
37
25
|
DEFAULT_DIR = Util::CORE_DIR
|
38
|
-
|
26
|
+
|
39
27
|
DEFAULT_FUTSUU_FILENAME = 'sift_nhk_news_web_regular'
|
40
28
|
DEFAULT_YASASHII_FILENAME = 'sift_nhk_news_web_easy'
|
41
|
-
|
29
|
+
|
42
30
|
def self.build_file(filename)
|
43
31
|
return File.join(DEFAULT_DIR,filename)
|
44
32
|
end
|
45
|
-
|
33
|
+
|
46
34
|
DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
|
47
35
|
DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
|
48
|
-
|
36
|
+
|
49
37
|
attr_accessor :articles
|
50
38
|
attr_accessor :caption
|
51
39
|
attr_accessor :filters
|
52
40
|
attr_accessor :ignores
|
53
41
|
attr_accessor :output
|
54
|
-
|
42
|
+
|
55
43
|
def initialize(news)
|
56
|
-
@articles = news.articles.values.dup
|
44
|
+
@articles = news.articles.values.dup
|
57
45
|
@caption = nil
|
58
46
|
@filters = {}
|
59
47
|
@ignores = {}
|
60
48
|
@output = nil
|
61
49
|
end
|
62
|
-
|
63
|
-
def build_header
|
50
|
+
|
51
|
+
def build_header
|
64
52
|
header = []
|
65
|
-
|
53
|
+
|
66
54
|
header << 'Frequency' unless @ignores[:freq]
|
67
55
|
header << 'Word' unless @ignores[:word]
|
68
56
|
header << 'Kana' unless @ignores[:kana]
|
69
57
|
header << 'English' unless @ignores[:eng]
|
70
58
|
header << 'Definition' unless @ignores[:defn]
|
71
|
-
|
59
|
+
|
72
60
|
return header
|
73
61
|
end
|
74
|
-
|
62
|
+
|
75
63
|
def build_rows(words)
|
76
64
|
rows = []
|
77
|
-
|
78
|
-
words.each
|
65
|
+
|
66
|
+
words.each do |word|
|
79
67
|
rows << build_word_row(word)
|
80
68
|
end
|
81
|
-
|
69
|
+
|
82
70
|
return rows
|
83
71
|
end
|
84
|
-
|
72
|
+
|
85
73
|
def build_word_row(word)
|
86
74
|
row = []
|
87
|
-
|
75
|
+
|
88
76
|
row << word.freq unless @ignores[:freq]
|
89
77
|
row << word.word unless @ignores[:word]
|
90
78
|
row << word.kana unless @ignores[:kana]
|
91
79
|
row << word.eng unless @ignores[:eng]
|
92
80
|
row << word.defn unless @ignores[:defn]
|
93
|
-
|
81
|
+
|
94
82
|
return row
|
95
83
|
end
|
96
|
-
|
84
|
+
|
97
85
|
def filter?(article)
|
98
|
-
return false if @filters.empty?
|
99
|
-
|
86
|
+
return false if @filters.empty?
|
87
|
+
|
100
88
|
datetime_filter = @filters[:datetime]
|
101
89
|
title_filter = @filters[:title]
|
102
90
|
url_filter = @filters[:url]
|
103
|
-
|
104
|
-
if !datetime_filter.nil?
|
91
|
+
|
92
|
+
if !datetime_filter.nil?
|
105
93
|
datetime = article.datetime
|
106
|
-
|
107
|
-
return true if datetime.nil?
|
94
|
+
|
95
|
+
return true if datetime.nil? ||
|
108
96
|
datetime < datetime_filter[:from] || datetime > datetime_filter[:to]
|
109
97
|
end
|
110
|
-
|
111
|
-
if !title_filter.nil?
|
112
|
-
title = article.title.to_s
|
98
|
+
|
99
|
+
if !title_filter.nil?
|
100
|
+
title = article.title.to_s
|
113
101
|
title = Util.unspace_web_str(title) if title_filter[:unspace]
|
114
|
-
title = title.downcase
|
115
|
-
|
102
|
+
title = title.downcase if title_filter[:uncase]
|
103
|
+
|
116
104
|
return true unless title.include?(title_filter[:filter])
|
117
105
|
end
|
118
|
-
|
119
|
-
if !url_filter.nil?
|
120
|
-
url = article.url.to_s
|
106
|
+
|
107
|
+
if !url_filter.nil?
|
108
|
+
url = article.url.to_s
|
121
109
|
url = Util.unspace_web_str(url) if url_filter[:unspace]
|
122
|
-
url = url.downcase
|
123
|
-
|
110
|
+
url = url.downcase if url_filter[:uncase]
|
111
|
+
|
124
112
|
return true unless url.include?(url_filter[:filter])
|
125
113
|
end
|
126
|
-
|
114
|
+
|
127
115
|
return false
|
128
116
|
end
|
129
|
-
|
117
|
+
|
130
118
|
def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
|
131
|
-
if !datetime_filter.nil?
|
132
|
-
if datetime_filter.respond_to?(:
|
119
|
+
if !datetime_filter.nil?
|
120
|
+
if datetime_filter.respond_to?(:[])
|
133
121
|
# If out-of-bounds, just nil.
|
134
|
-
from = datetime_filter[0] if from.nil?
|
135
|
-
to = datetime_filter[1] if to.nil?
|
122
|
+
from = datetime_filter[0] if from.nil?
|
123
|
+
to = datetime_filter[1] if to.nil?
|
136
124
|
else
|
137
|
-
from = datetime_filter if from.nil?
|
138
|
-
to = datetime_filter if to.nil?
|
125
|
+
from = datetime_filter if from.nil?
|
126
|
+
to = datetime_filter if to.nil?
|
139
127
|
end
|
140
128
|
end
|
141
|
-
|
142
|
-
from = to if from.nil?
|
143
|
-
to = from if to.nil?
|
144
|
-
|
145
|
-
from = Util.jst_time(from) unless from.nil?
|
146
|
-
to = Util.jst_time(to) unless to.nil?
|
147
|
-
|
129
|
+
|
130
|
+
from = to if from.nil?
|
131
|
+
to = from if to.nil?
|
132
|
+
|
133
|
+
from = Util.jst_time(from) unless from.nil?
|
134
|
+
to = Util.jst_time(to) unless to.nil?
|
135
|
+
|
148
136
|
datetime_filter = [from,to]
|
149
|
-
|
150
|
-
return self if datetime_filter.flatten
|
151
|
-
|
137
|
+
|
138
|
+
return self if datetime_filter.flatten.compact.empty?
|
139
|
+
|
152
140
|
@filters[:datetime] = {from: from,to: to}
|
153
|
-
|
141
|
+
|
154
142
|
return self
|
155
143
|
end
|
156
|
-
|
144
|
+
|
157
145
|
def filter_by_title(title_filter,uncase: true,unspace: true)
|
158
146
|
title_filter = Util.unspace_web_str(title_filter) if unspace
|
159
|
-
title_filter = title_filter.downcase
|
160
|
-
|
147
|
+
title_filter = title_filter.downcase if uncase
|
148
|
+
|
161
149
|
@filters[:title] = {filter: title_filter,uncase: uncase,unspace: unspace}
|
162
|
-
|
150
|
+
|
163
151
|
return self
|
164
152
|
end
|
165
|
-
|
153
|
+
|
166
154
|
def filter_by_url(url_filter,uncase: true,unspace: true)
|
167
155
|
url_filter = Util.unspace_web_str(url_filter) if unspace
|
168
|
-
url_filter = url_filter.downcase
|
169
|
-
|
156
|
+
url_filter = url_filter.downcase if uncase
|
157
|
+
|
170
158
|
@filters[:url] = {filter: url_filter,uncase: uncase,unspace: unspace}
|
171
|
-
|
159
|
+
|
172
160
|
return self
|
173
161
|
end
|
174
|
-
|
162
|
+
|
175
163
|
def ignore(key)
|
176
164
|
@ignores[key] = true
|
177
|
-
|
165
|
+
|
178
166
|
return self
|
179
167
|
end
|
180
|
-
|
168
|
+
|
181
169
|
# This does not output {caption}.
|
182
|
-
def put_csv!
|
170
|
+
def put_csv!
|
183
171
|
require 'csv'
|
184
|
-
|
185
|
-
words = sift
|
186
|
-
|
172
|
+
|
173
|
+
words = sift
|
174
|
+
|
187
175
|
@output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
|
188
|
-
csv << build_header
|
189
|
-
|
190
|
-
words.each
|
176
|
+
csv << build_header
|
177
|
+
|
178
|
+
words.each do |word|
|
191
179
|
csv << build_word_row(word)
|
192
180
|
end
|
193
181
|
end
|
194
|
-
|
182
|
+
|
195
183
|
return @output
|
196
184
|
end
|
197
|
-
|
198
|
-
def put_html!
|
199
|
-
words = sift
|
200
|
-
|
201
|
-
@output = ''.dup
|
202
|
-
|
203
|
-
@output << <<~
|
185
|
+
|
186
|
+
def put_html!
|
187
|
+
words = sift
|
188
|
+
|
189
|
+
@output = ''.dup
|
190
|
+
|
191
|
+
@output << <<~HTML
|
204
192
|
<!DOCTYPE html>
|
205
193
|
<html lang="ja">
|
206
194
|
<head>
|
@@ -249,146 +237,144 @@ module NHKore
|
|
249
237
|
<h1>NHKore</h1>
|
250
238
|
<h2>#{@caption}</h2>
|
251
239
|
<table>
|
252
|
-
|
253
|
-
|
254
|
-
|
240
|
+
HTML
|
241
|
+
|
255
242
|
# If have too few or too many '<col>', invalid HTML.
|
256
|
-
@output << %Q
|
257
|
-
@output << %Q
|
258
|
-
@output << %Q
|
259
|
-
@output << %Q
|
243
|
+
@output << %Q(<col style="width:6em;">\n) unless @ignores[:freq]
|
244
|
+
@output << %Q(<col style="width:17em;">\n) unless @ignores[:word]
|
245
|
+
@output << %Q(<col style="width:17em;">\n) unless @ignores[:kana]
|
246
|
+
@output << %Q(<col style="width:5em;">\n) unless @ignores[:eng]
|
260
247
|
@output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
|
261
|
-
|
248
|
+
|
262
249
|
@output << '<tr>'
|
263
|
-
|
264
|
-
build_header
|
250
|
+
|
251
|
+
build_header.each do |h|
|
265
252
|
@output << "<th>#{h}</th>"
|
266
253
|
end
|
267
|
-
|
254
|
+
|
268
255
|
@output << "</tr>\n"
|
269
|
-
|
270
|
-
words.each
|
256
|
+
|
257
|
+
words.each do |word|
|
271
258
|
@output << '<tr>'
|
272
|
-
|
273
|
-
build_word_row(word).each
|
274
|
-
@output << "<td>#{Util.escape_html(w.to_s
|
259
|
+
|
260
|
+
build_word_row(word).each do |w|
|
261
|
+
@output << "<td>#{Util.escape_html(w.to_s)}</td>"
|
275
262
|
end
|
276
|
-
|
263
|
+
|
277
264
|
@output << "</tr>\n"
|
278
265
|
end
|
279
|
-
|
280
|
-
@output << <<~
|
266
|
+
|
267
|
+
@output << <<~HTML
|
281
268
|
</table>
|
282
269
|
</body>
|
283
270
|
</html>
|
284
|
-
|
285
|
-
|
286
|
-
|
271
|
+
HTML
|
272
|
+
|
287
273
|
return @output
|
288
274
|
end
|
289
|
-
|
290
|
-
def put_json!
|
275
|
+
|
276
|
+
def put_json!
|
291
277
|
require 'json'
|
292
|
-
|
293
|
-
words = sift
|
294
|
-
|
295
|
-
@output = ''.dup
|
296
|
-
|
297
|
-
@output << <<~
|
278
|
+
|
279
|
+
words = sift
|
280
|
+
|
281
|
+
@output = ''.dup
|
282
|
+
|
283
|
+
@output << <<~JSON
|
298
284
|
{
|
299
285
|
"caption": #{JSON.generate(@caption)},
|
300
|
-
"header": #{JSON.generate(build_header
|
286
|
+
"header": #{JSON.generate(build_header)},
|
301
287
|
"words": [
|
302
|
-
|
303
|
-
|
304
|
-
if !words.empty?
|
288
|
+
JSON
|
289
|
+
|
290
|
+
if !words.empty?
|
305
291
|
0.upto(words.length - 2) do |i|
|
306
292
|
@output << " #{JSON.generate(build_word_row(words[i]))},\n"
|
307
293
|
end
|
308
|
-
|
294
|
+
|
309
295
|
@output << " #{JSON.generate(build_word_row(words[-1]))}\n"
|
310
296
|
end
|
311
|
-
|
297
|
+
|
312
298
|
@output << "]\n}\n"
|
313
|
-
|
299
|
+
|
314
300
|
return @output
|
315
301
|
end
|
316
|
-
|
317
|
-
def put_yaml!
|
302
|
+
|
303
|
+
def put_yaml!
|
318
304
|
require 'psychgus'
|
319
|
-
|
320
|
-
words = sift
|
321
|
-
|
305
|
+
|
306
|
+
words = sift
|
307
|
+
|
322
308
|
yaml = {
|
323
309
|
caption: @caption,
|
324
|
-
header: build_header
|
310
|
+
header: build_header,
|
325
311
|
words: build_rows(words),
|
326
312
|
}
|
327
|
-
|
328
|
-
header_styler = Class.new
|
313
|
+
|
314
|
+
header_styler = Class.new do
|
329
315
|
include Psychgus::Styler
|
330
|
-
|
316
|
+
|
331
317
|
def style_sequence(sniffer,node)
|
332
318
|
parent = sniffer.parent
|
333
|
-
|
334
|
-
if !parent.nil?
|
319
|
+
|
320
|
+
if !parent.nil? && parent.node.respond_to?(:value) && parent.value == 'header'
|
335
321
|
node.style = Psychgus::SEQUENCE_FLOW
|
336
322
|
end
|
337
323
|
end
|
338
324
|
end
|
339
|
-
|
325
|
+
|
340
326
|
# Put each Word on one line (flow/inline style).
|
341
|
-
@output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new
|
342
|
-
|
327
|
+
@output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new)
|
328
|
+
|
343
329
|
return @output
|
344
330
|
end
|
345
|
-
|
346
|
-
def sift
|
347
|
-
master_article = Article.new
|
348
|
-
|
349
|
-
@articles.each
|
331
|
+
|
332
|
+
def sift
|
333
|
+
master_article = Article.new
|
334
|
+
|
335
|
+
@articles.each do |article|
|
350
336
|
next if filter?(article)
|
351
|
-
|
352
|
-
article.words.
|
337
|
+
|
338
|
+
article.words.each_value do |word|
|
353
339
|
master_article.add_word(word,use_freq: true)
|
354
340
|
end
|
355
341
|
end
|
356
|
-
|
357
|
-
words = master_article.words.values
|
358
|
-
|
359
|
-
words.sort!
|
342
|
+
|
343
|
+
words = master_article.words.values
|
344
|
+
|
345
|
+
words.sort! do |word1,word2|
|
360
346
|
# Order by freq DESC (most frequent words to top).
|
361
347
|
i = (word2.freq <=> word1.freq)
|
362
|
-
|
348
|
+
|
363
349
|
# Order by !defn.empty, word ASC, !kana.empty, kana ASC, defn.len DESC, defn ASC.
|
364
350
|
i = compare_empty_str(word1.defn,word2.defn) if i == 0 # Favor words that have definitions
|
365
|
-
i = (word1.word.to_s
|
351
|
+
i = (word1.word.to_s <=> word2.word.to_s) if i == 0
|
366
352
|
i = compare_empty_str(word1.kana,word2.kana) if i == 0 # Favor words that have kana
|
367
|
-
i = (word1.kana.to_s
|
368
|
-
i = (word2.defn.to_s
|
369
|
-
i = (word1.defn.to_s
|
370
|
-
|
353
|
+
i = (word1.kana.to_s <=> word2.kana.to_s) if i == 0
|
354
|
+
i = (word2.defn.to_s.length <=> word1.defn.to_s.length) if i == 0 # Favor longer definitions
|
355
|
+
i = (word1.defn.to_s <=> word2.defn.to_s) if i == 0
|
356
|
+
|
371
357
|
i
|
372
358
|
end
|
373
|
-
|
359
|
+
|
374
360
|
return words
|
375
361
|
end
|
376
|
-
|
362
|
+
|
377
363
|
def compare_empty_str(str1,str2)
|
378
364
|
has_str1 = !Util.empty_web_str?(str1)
|
379
365
|
has_str2 = !Util.empty_web_str?(str2)
|
380
|
-
|
366
|
+
|
381
367
|
if has_str1 && !has_str2
|
382
368
|
return -1 # Bubble word1 to top
|
383
369
|
elsif !has_str1 && has_str2
|
384
370
|
return 1 # Bubble word2 to top
|
385
371
|
end
|
386
|
-
|
372
|
+
|
387
373
|
return 0 # Further comparison needed
|
388
374
|
end
|
389
|
-
|
390
|
-
def to_s
|
391
|
-
return @output.to_s
|
375
|
+
|
376
|
+
def to_s
|
377
|
+
return @output.to_s
|
392
378
|
end
|
393
379
|
end
|
394
380
|
end
|
data/lib/nhkore/splitter.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,24 +14,24 @@ require 'nhkore/util'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Splitter
|
33
21
|
def begin_split(str)
|
34
22
|
return str
|
35
23
|
end
|
36
|
-
|
24
|
+
|
37
25
|
def split(str)
|
38
26
|
str = begin_split(str)
|
39
27
|
str = end_split(str)
|
40
|
-
|
28
|
+
|
41
29
|
return str
|
42
30
|
end
|
43
31
|
end
|
44
|
-
|
32
|
+
|
45
33
|
###
|
46
|
-
# @author Jonathan Bradley Whited
|
34
|
+
# @author Jonathan Bradley Whited
|
47
35
|
# @since 0.2.0
|
48
36
|
###
|
49
37
|
class BasicSplitter < Splitter
|
@@ -51,43 +39,43 @@ module NHKore
|
|
51
39
|
return str.split(Util::NORMALIZE_STR_REGEX)
|
52
40
|
end
|
53
41
|
end
|
54
|
-
|
42
|
+
|
55
43
|
###
|
56
44
|
# @since 0.2.0
|
57
45
|
###
|
58
46
|
class BimyouSplitter < Splitter
|
59
47
|
def initialize(*)
|
60
48
|
require 'bimyou_segmenter'
|
61
|
-
|
49
|
+
|
62
50
|
super
|
63
51
|
end
|
64
|
-
|
52
|
+
|
65
53
|
def end_split(str)
|
66
54
|
return BimyouSegmenter.segment(str,symbol: false,white_space: false)
|
67
55
|
end
|
68
56
|
end
|
69
|
-
|
57
|
+
|
70
58
|
###
|
71
59
|
# @since 0.2.0
|
72
60
|
###
|
73
61
|
class TinySplitter < Splitter
|
74
62
|
attr_accessor :tiny
|
75
|
-
|
63
|
+
|
76
64
|
def initialize(*)
|
77
65
|
require 'tiny_segmenter'
|
78
|
-
|
66
|
+
|
79
67
|
super
|
80
|
-
|
81
|
-
@tiny = TinySegmenter.new
|
68
|
+
|
69
|
+
@tiny = TinySegmenter.new
|
82
70
|
end
|
83
|
-
|
71
|
+
|
84
72
|
def end_split(str)
|
85
73
|
return @tiny.segment(str,ignore_punctuation: true)
|
86
74
|
end
|
87
75
|
end
|
88
|
-
|
76
|
+
|
89
77
|
###
|
90
|
-
# @author Jonathan Bradley Whited
|
78
|
+
# @author Jonathan Bradley Whited
|
91
79
|
# @since 0.2.0
|
92
80
|
###
|
93
81
|
class BestSplitter < BimyouSplitter
|