nhkore 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
@@ -0,0 +1,382 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'date'
|
25
|
+
require 'time'
|
26
|
+
|
27
|
+
require 'nhkore/news'
|
28
|
+
require 'nhkore/sifter'
|
29
|
+
require 'nhkore/util'
|
30
|
+
|
31
|
+
|
32
|
+
module NHKore
|
33
|
+
module CLI
|
34
|
+
###
|
35
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
36
|
+
# @since 0.2.0
|
37
|
+
###
|
38
|
+
module SiftCmd
|
39
|
+
# File extension (output format) used when none can be determined from the options.
DEFAULT_SIFT_EXT = :csv
# Templates of the default output file names; in this module they are only
# interpolated into the help text of the commands.
DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}".freeze
DEFAULT_SIFT_YASASHII_FILE = "#{Sifter::DEFAULT_YASASHII_FILE}{search.criteria}{file.ext}".freeze
# Every file extension (output format) accepted by '--ext' / '--out'.
# Frozen so that the shared list cannot be mutated by accident.
SIFT_EXTS = [:csv,:htm,:html,:yaml,:yml].freeze

# Formats tried (top to bottom) when parsing one side of the '--datetime' option.
# Order matters.
#
# NOTE(review): '%y %H:%M' is the only entry using a 2-digit year ('%y'), while
# its hour-only sibling is '%Y %H'; confirm whether '%Y %H:%M' was intended.
SIFT_DATETIME_FMTS = [
  '%Y-%m-%d %H:%M',
  '%Y-%m-%d %H',
  '%Y-%m-%d',
  '%m-%d %H:%M',
  '%Y-%m %H:%M',
  '%m-%d %H',
  '%Y-%m %H',
  '%m-%d',
  '%Y-%m',
  '%d %H:%M',
  '%y %H:%M',
  '%d %H',
  '%Y %H',
  '%H:%M',
  '%d',
  '%Y'
].freeze

# Raw text given to '--datetime'; kept for building the output file name.
attr_accessor :sift_datetime_text
# Human-readable search criteria (nil if none); appended to the caption of the output.
attr_accessor :sift_search_criteria
|
66
|
+
|
67
|
+
# Defines the 'sift' command on @app_cmd, plus its 'easy' and 'regular'
# sub commands, and stores them in @sift_cmd, @sift_easy_cmd, and
# @sift_regular_cmd. Also resets @sift_datetime_text and
# @sift_search_criteria to nil.
#
# The option transforms raise CLIError for an invalid '--ext' value;
# empty-value checks are delegated to check_empty_opt().
def build_sift_cmd()
  # Captured so that the command blocks below can call back into this object.
  app = self

  @sift_datetime_text = nil
  @sift_search_criteria = nil

  @sift_cmd = @app_cmd.define_command() do
    name 'sift'
    usage 'sift [OPTIONS] [COMMAND]...'
    aliases :s
    summary "Sift NHK News Web (Easy) articles data for the frequency of words (aliases: #{app.color_alias('s')})"

    description <<-EOD
      Sift NHK News Web (Easy) articles data for the frequency of words &
      save to folder: #{Sifter::DEFAULT_DIR}
    EOD

    option :d,:datetime,<<-EOD,argument: :required,transform: -> (value) do
      date time to filter on; examples:
      '2020-7-1 13:10...2020-7-31 11:11';
      '2020-12' (2020, December 1st-31st);
      '7-4...7-9' (July 4th-9th of Current Year);
      '7-9' (July 9th of Current Year);
      '9' (9th of Current Year & Month)
    EOD
      app.sift_datetime_text = value # Save the original value for the file name
      value = app.parse_sift_datetime(value)
      value
    end
    option :e,:ext,<<-EOD,argument: :required,default: DEFAULT_SIFT_EXT,transform: -> (value) do
      type of file (extension) to save; valid options: [#{SIFT_EXTS.join(', ')}];
      not needed if you specify a file extension with the '--out' option: '--out sift.html'
    EOD
      value = Util.unspace_web_str(value).downcase().to_sym()

      # The option name is written literally; there is no 'ext' local or method
      # in this scope, so interpolating it would fail with a NameError instead
      # of reporting the invalid value.
      raise CLIError,"invalid ext[#{value}] for option[ext]" unless SIFT_EXTS.include?(value)

      value
    end
    option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
      file of NHK News Web (Easy) articles data to sift (see '#{App::NAME} news';
      defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
    EOD
      app.check_empty_opt(:in,value)
    end
    flag :D,:'no-defn','do not output the definitions for words (which can be quite long)'
    flag :E,:'no-eng','do not output the English translations for words'
    option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
      'directory/file' to save sifted data to; if you only specify a directory or a file, it will attach
      the appropriate default directory/file name
      (defaults: #{DEFAULT_SIFT_YASASHII_FILE}, #{DEFAULT_SIFT_FUTSUU_FILE})
    EOD
      app.check_empty_opt(:out,value)
    end
    option :t,:title,'title to filter on, where search text only needs to be somewhere in the title',
      argument: :required
    option :u,:url,'URL to filter on, where search text only needs to be somewhere in the URL',
      argument: :required

    # Without a sub command, just show the help.
    run do |opts,args,cmd|
      puts cmd.help
    end
  end

  @sift_easy_cmd = @sift_cmd.define_command() do
    name 'easy'
    usage 'easy [OPTIONS] [COMMAND]...'
    aliases :e,:ez
    summary "Sift NHK News Web Easy (Yasashii) articles data (aliases: #{app.color_alias('e ez')})"

    description <<-EOD
      Sift NHK News Web Easy (Yasashii) articles data for the frequency of words &
      save to file: #{DEFAULT_SIFT_YASASHII_FILE}
    EOD

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_sift_cmd(:yasashii)
    end
  end

  @sift_regular_cmd = @sift_cmd.define_command() do
    name 'regular'
    usage 'regular [OPTIONS] [COMMAND]...'
    aliases :r,:reg
    summary "Sift NHK News Web Regular (Futsuu) articles data (aliases: #{app.color_alias('r reg')})"

    description <<-EOD
      Sift NHK News Web Regular (Futsuu) articles data for the frequency of words &
      save to file: #{DEFAULT_SIFT_FUTSUU_FILE}
    EOD

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_sift_cmd(:futsuu)
    end
  end
end
|
165
|
+
|
166
|
+
# Builds the default output file name from the base +filename+, the search
# criteria (date time, title, and URL filters), and the file extension.
#
# Side effects:
# - @sift_search_criteria is set to the non-empty criteria joined by ', ',
#   or to nil if there are none.
# - If no '--ext' was given, @cmd_opts[:ext] is set to the extension of
#   '--out' (when it is one of SIFT_EXTS), else to DEFAULT_SIFT_EXT.
#
# @param filename [String] base name of the file, without an extension
# @return [String] "<filename>[_<cleaned criteria>].<ext>"
def build_sift_filename(filename)
  criteria = [@sift_datetime_text,@cmd_opts[:title],@cmd_opts[:url]]
  criteria = criteria.map() {|text| Util.strip_web_str(text.to_s())}
  criteria = criteria.reject(&:empty?)

  # Only letters, digits, '-', '_', and '.' are kept for the file name.
  suffix = criteria.join().gsub(/[^[[:alnum:]]\-_\.]+/,'')

  # Limit the file name length; slicing a shorter string just returns it whole.
  suffix = suffix[0..32]
  suffix = "_#{suffix}" unless suffix.empty?()

  @sift_search_criteria = criteria.empty?() ? nil : criteria.join(', ')

  file_ext = @cmd_opts[:ext]

  if file_ext.nil?()
    out_file = @cmd_opts[:out]

    # Try to take the extension from '--out' if it was given.
    unless out_file.nil?()
      out_ext = Util.unspace_web_str(File.extname(out_file)).downcase()
      out_ext = out_ext.sub(/\A\./,'').to_sym() # Drop the leading '.'

      file_ext = out_ext if SIFT_EXTS.include?(out_ext)
    end

    file_ext ||= DEFAULT_SIFT_EXT
    @cmd_opts[:ext] = file_ext
  end

  return "#{filename}#{suffix}.#{file_ext}"
end
|
209
|
+
|
210
|
+
# TODO: This should probably be moved into its own class, into Util, or into Sifter?
|
211
|
+
# Parses the text of the '--datetime' option into a range of 2 Time objects.
#
# The text is either a single date time or a 'from...to' range. Each side is
# tried against SIFT_DATETIME_FMTS from top to bottom and the first format
# that parses is used. Parts that are not in the matched format are filled in:
# - "from": Util::MIN_SANE_YEAR, January, the 1st, 00:00.
# - "to": the part given in "from" if any, else Util::MAX_SANE_YEAR,
#   December, the last day of the month, 23:59.
#
# NOTE(review): the help text of '--datetime' describes values without a year
# as being in the "Current Year", but the code falls back to
# MIN_SANE_YEAR/MAX_SANE_YEAR; confirm which one is intended.
#
# @param value [String] text of the option, like '2020-7-1 13:10...2020-7-31 11:11'
# @return [Array<Time>] [from,to]
# @raise [ArgumentError] if a side does not match any of the formats
def parse_sift_datetime(value)
  value = Util.reduce_space(value).strip() # Don't use unspace_web_str(), want spaces for formats
  value = value.split('...',2)

  check_empty_opt(:datetime,nil) if value.empty?() # For ''

  # Make a "to" and a "from" date time range.
  value << value[0].dup() if value.length == 1

  to_day = nil
  to_hour = 23
  to_minute = 59
  to_month = 12
  to_year = Util::MAX_SANE_YEAR

  last_fmt_index = SIFT_DATETIME_FMTS.length - 1

  value.each_with_index() do |v,i|
    v = check_empty_opt(:datetime,v) # For '...', '12-25...', or '...12-25'

    has_day = false
    has_hour = false
    has_minute = false
    has_month = false
    has_year = false

    # The index has its own name so that it does not shadow the outer "i",
    # which is still needed below to tell "from" apart from "to".
    SIFT_DATETIME_FMTS.each_with_index() do |fmt,fmt_index|
      begin
        # If don't do this, "%d" values will be parsed using "%d %H".
        # It seems as though strptime() ignores space.
        raise ArgumentError if !v.include?(' ') && fmt.include?(' ')

        # If don't do this, "%y" values will be parsed using "%d".
        raise ArgumentError if fmt == '%d' && v.length > 2

        v = Time.strptime(v,fmt,&Util.method(:guess_year))

        has_day = fmt.include?('%d')
        has_hour = fmt.include?('%H')
        has_minute = fmt.include?('%M')
        has_month = fmt.include?('%m')
        has_year = fmt.include?('%Y')

        break # No problem; this format worked
      rescue ArgumentError
        # Out of formats, so let the error through.
        raise if fmt_index >= last_fmt_index
      end
    end

    # "From" date time.
    if i == 0
      # Set these so that "2012-7-4...7-9" will use the appropriate year
      # of "2012" for "7-9".
      to_day = v.day if has_day
      to_hour = v.hour if has_hour
      to_minute = v.min if has_minute
      to_month = v.month if has_month
      to_year = v.year if has_year

      v = Time.new(
        has_year ? v.year : Util::MIN_SANE_YEAR,
        has_month ? v.month : 1,
        has_day ? v.day : 1,
        has_hour ? v.hour : 0,
        has_minute ? v.min : 0
      )
    # "To" date time.
    else
      to_hour = v.hour if has_hour
      to_minute = v.min if has_minute
      to_month = v.month if has_month
      to_year = v.year if has_year

      if has_day
        to_day = v.day
      # Nothing passed from the "from" date time?
      elsif to_day.nil?()
        # Last day of month.
        to_day = Date.new(to_year,to_month,-1).day
      end

      v = Time.new(to_year,to_month,to_day,to_hour,to_minute)
    end

    value[i] = v
  end

  return value
end
|
299
|
+
|
300
|
+
# Runs the 'sift' command for one type of news.
#
# Builds & checks the in/out files, loads the news from the in file, applies
# the date time/title/URL filters and the 'no-defn'/'no-eng' flags to a
# Sifter, outputs the data in the format of the file extension, and then
# either prints the result (dry run) or saves it to the out file.
#
# @param type [Symbol] :futsuu (Regular) or :yasashii (Easy)
# @raise [ArgumentError] if +type+ or the file extension is invalid
def run_sift_cmd(type)
  # Pick everything that depends on the type of news in one place.
  news_class,news_name,sift_filename =
    case type
    when :futsuu
      [FutsuuNews,'Regular',Sifter::DEFAULT_FUTSUU_FILENAME]
    when :yasashii
      [YasashiiNews,'Easy',Sifter::DEFAULT_YASASHII_FILENAME]
    else
      raise ArgumentError,"invalid type[#{type}]"
    end

  build_in_file(:in,default_dir: News::DEFAULT_DIR,default_filename: news_class::DEFAULT_FILENAME)
  build_out_file(:out,default_dir: Sifter::DEFAULT_DIR,
    default_filename: build_sift_filename(sift_filename))

  return unless check_in_file(:in,empty_ok: false) && check_out_file(:out)

  datetime_filter = @cmd_opts[:datetime]
  dry_run = @cmd_opts[:dry_run]
  file_ext = @cmd_opts[:ext]
  in_file = @cmd_opts[:in]
  no_defn = @cmd_opts[:no_defn]
  no_eng = @cmd_opts[:no_eng]
  out_file = @cmd_opts[:out]
  title_filter = @cmd_opts[:title]
  url_filter = @cmd_opts[:url]

  start_spin("Sifting NHK News Web #{news_name} data")

  sifter = Sifter.new(news_class.load_file(in_file))

  sifter.filter_by_datetime(datetime_filter) unless datetime_filter.nil?()
  sifter.filter_by_title(title_filter) unless title_filter.nil?()
  sifter.filter_by_url(url_filter) unless url_filter.nil?()
  sifter.ignore(:defn) if no_defn
  sifter.ignore(:eng) if no_eng

  sifter.caption = "NHK News Web #{news_name}".dup()

  # Show what was searched for; only the HTML output gets escaped text.
  unless @sift_search_criteria.nil?()
    case file_ext
    when :htm,:html
      sifter.caption << " — #{Util.escape_html(@sift_search_criteria.to_s())}"
    else
      sifter.caption << " -- #{@sift_search_criteria}"
    end
  end

  case file_ext
  when :csv then sifter.put_csv!()
  when :htm,:html then sifter.put_html!()
  when :yaml,:yml then sifter.put_yaml!()
  else raise ArgumentError,"invalid file ext[#{file_ext}]"
  end

  stop_spin()
  puts

  if dry_run
    puts sifter.to_s()
    return
  end

  start_spin('Saving sifted data to file')

  sifter.save_file(out_file)

  stop_spin()
  puts "> #{out_file}"
end
|
380
|
+
end
|
381
|
+
end
|
382
|
+
end
|
data/lib/nhkore/defn.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nokogiri'
|
25
|
+
|
26
|
+
require 'nhkore/util'
|
27
|
+
require 'nhkore/word'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
|
32
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
33
|
+
# @since 0.2.0
|
34
|
+
###
|
35
|
+
# A definition of a dictionary entry: its text, the words found in the text,
# and its "hyouki" strings.
class Defn
  attr_reader :hyoukis
  attr_accessor :text
  attr_reader :words

  def initialize()
    super()

    @hyoukis = []
    @text = ''.dup() # Mutable, as scrape() appends to it
    @words = []
  end

  # Scrapes a definition from +hash+, using its 'hyouki' (array of strings)
  # and 'def' (HTML string) values.
  #
  # If no data, don't raise errors; don't care if have a definition or not.
  #
  # @return [Defn,nil] nil if there are no hyoukis and no words
  def self.scrape(hash,missingno: nil,url: nil)
    defn = Defn.new()

    hash['hyouki']&.each() do |hyouki|
      unless hyouki.nil?()
        hyouki = Util.strip_web_str(hyouki)

        defn.hyoukis << hyouki unless hyouki.empty?()
      end
    end

    def_str = hash['def']

    if Util.empty_web_str?(def_str)
      return nil if defn.hyoukis.empty?()
      return defn
    end

    # Nokogiri auto-adds the <body> tag.
    body = Nokogiri::HTML(def_str).css('body')

    body.children.each() do |child|
      name = child.respond_to?(:name) ? Util.unspace_web_str(child.name).downcase() : nil
      word = nil

      if name == 'ruby'
        word = Word.scrape_ruby_tag(child,missingno: missingno,url: url)

        next if word.nil?()
      elsif child.respond_to?(:text) # Don't do child.text?(), just want content
        word = Word.scrape_text_node(child,url: url)

        # No word for this text, so only keep its content.
        if word.nil?()
          defn.text << Util.reduce_jpn_space(child.text)
          next
        end
      else
        next
      end

      defn.text << Util.reduce_jpn_space(word.word)
      defn.words << word unless Util.empty_web_str?(word.word)
    end

    return nil if defn.hyoukis.empty?() && defn.words.empty?()

    defn.text = Util.strip_web_str(defn.text)

    return defn
  end

  # @return [String] the text of this definition
  def to_s()
    return @text
  end
end
|
104
|
+
end
|
data/lib/nhkore/dict.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'nhkore/entry'
|
25
|
+
require 'nhkore/error'
|
26
|
+
|
27
|
+
|
28
|
+
module NHKore
|
29
|
+
###
|
30
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
31
|
+
# @since 0.2.0
|
32
|
+
###
|
33
|
+
# A dictionary: a Hash of entries keyed by ID, with a few Hash-like methods.
class Dict
  attr_reader :entries

  def initialize()
    super()

    @entries = {}
  end

  # @return the entry stored for +id+ (whatever the underlying Hash returns if there is none)
  def [](id)
    @entries[id]
  end

  # Stores +entry+ for +id+, replacing any previous entry.
  def []=(id,entry)
    @entries[id] = entry
  end

  # Scrapes every ID/array pair of +hash+ into an entry, skipping the ones
  # for which Entry.scrape() returns nil.
  #
  # @return [Dict] the new dictionary
  # @raise [ScrapeError] if an ID was already added
  def self.scrape(hash,missingno: nil,url: nil)
    hash.each_with_object(Dict.new()) do |(id,array),dict|
      entry = Entry.scrape(id,array,missingno: missingno,url: url)

      next if entry.nil?()

      if dict.key?(id)
        raise ScrapeError,"duplicate ID[#{id}] at URL[#{url}] in hash[#{hash}]"
      end

      dict[id] = entry
    end
  end

  # @return [true,false] whether there is an entry for +id+
  def key?(id)
    @entries.key?(id)
  end

  # @return [String] each ID on its own line, followed by its indented entry
  def to_s()
    @entries.each_with_object(''.dup()) do |(id,entry),s|
      indented_entry = entry.to_s().gsub("\n","\n ").rstrip()

      s << "#{id}:\n"
      s << " #{indented_entry}\n"
    end
  end
end
|
80
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'json'
|
25
|
+
require 'nhkore/dict'
|
26
|
+
require 'nhkore/error'
|
27
|
+
require 'nhkore/scraper'
|
28
|
+
require 'nhkore/util'
|
29
|
+
|
30
|
+
|
31
|
+
module NHKore
|
32
|
+
###
|
33
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
34
|
+
# @since 0.2.0
|
35
|
+
###
|
36
|
+
# Scrapes the JSON of a dictionary file into a Dict.
class DictScraper < Scraper
  attr_accessor :missingno

  # @param url [String] URL (or file) to scrape
  # @param missingno passed along to Dict.scrape()
  # @param parse_url [true,false] true if +url+ is not the dictionary URL itself
  #        and should first be converted with DictScraper.parse_url()
  # @param kargs passed along to the super class
  def initialize(url,missingno: nil,parse_url: true,**kargs)
    url = self.class.parse_url(url) if parse_url

    super(url,**kargs)

    @missingno = missingno
  end

  # Converts a URL (or a file path) to the one of its dictionary, by
  # replacing the extension of the last path part with '.out.dic':
  # 'dir/news.html' => 'dir/news.out.dic'.
  #
  # @param basename [String,nil] name to use instead of the one in +url+
  # @raise [ParseError] if +url+ is empty after stripping
  def self.parse_url(url,basename: nil)
    url = Util.strip_web_str(url)

    raise ParseError,"cannot parse dictionary URL from URL[#{url}]" if url.empty?()

    i = url.rindex(%r{[/\\]}) # Can be a URL or a file
    i = i.nil?() ? 0 : (i + 1) # If no match found, no path

    basename = File.basename(url[i..-1],'.*') if basename.nil?()
    path = url[0...i]

    return "#{path}#{basename}.out.dic"
  end

  # Parses @str_or_io as JSON and scrapes its 'reikai' => 'entries' Hash.
  #
  # @return [Dict] an empty Dict if there is no JSON or no entries
  def scrape()
    # The content comes from outside of this program, so explicitly turn off
    # the creation of arbitrary objects ('json_class' additions); the default
    # of JSON.load() for this has not always been off. Blank content is still
    # loaded as nil.
    json = JSON.load(@str_or_io,nil,create_additions: false)

    return Dict.new() if json.nil?()

    hash = json['reikai']

    return Dict.new() if hash.nil?()

    hash = hash['entries']

    return Dict.new() if hash.nil?()
    return Dict.scrape(hash,missingno: @missingno,url: @url)
  end
end
|
76
|
+
end
|