nhkore 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
@@ -0,0 +1,90 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'bimyou_segmenter'
|
25
|
+
require 'tiny_segmenter'
|
26
|
+
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
|
32
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
33
|
+
# @since 0.2.0
|
34
|
+
###
|
35
|
+
class Splitter
  # Hook that runs before the actual split.
  # The default implementation hands +str+ back unchanged.
  def begin_split(str)
    str
  end

  # Feeds +str+ through #begin_split and then #end_split,
  # returning whatever #end_split produces.
  #
  # #end_split is not defined in this class; a subclass has to provide it.
  def split(str)
    prepared = begin_split(str)

    end_split(prepared)
  end
end
|
47
|
+
|
48
|
+
###
|
49
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
50
|
+
# @since 0.2.0
|
51
|
+
###
|
52
|
+
class BasicSplitter < Splitter
  # Breaks +str+ into pieces wherever Util::NORMALIZE_STR_REGEX matches.
  def end_split(str)
    separator = Util::NORMALIZE_STR_REGEX

    str.split(separator)
  end
end
|
57
|
+
|
58
|
+
###
|
59
|
+
# @since 0.2.0
|
60
|
+
###
|
61
|
+
class BimyouSplitter < Splitter
  # Delegates the split to BimyouSegmenter.segment(), passing
  # +symbol: false+ and +white_space: false+.
  def end_split(str)
    options = {symbol: false, white_space: false}

    BimyouSegmenter.segment(str, **options)
  end
end
|
66
|
+
|
67
|
+
###
|
68
|
+
# @since 0.2.0
|
69
|
+
###
|
70
|
+
class TinySplitter < Splitter
  # The TinySegmenter instance used by #end_split (replaceable).
  attr_accessor :tiny

  # Forwards all arguments to the superclass, then creates the segmenter.
  def initialize(*)
    super

    @tiny = TinySegmenter.new
  end

  # Delegates the split to the segmenter with +ignore_punctuation: true+.
  def end_split(str)
    @tiny.segment(str, ignore_punctuation: true)
  end
end
|
83
|
+
|
84
|
+
###
|
85
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
86
|
+
# @since 0.2.0
|
87
|
+
###
|
88
|
+
# Alias-like subclass: adds nothing of its own, so it behaves exactly like
# BimyouSplitter.
class BestSplitter < BimyouSplitter; end
|
90
|
+
end
|
data/lib/nhkore/util.rb
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'cgi'
|
25
|
+
require 'psychgus'
|
26
|
+
require 'public_suffix'
|
27
|
+
require 'time'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
|
32
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
33
|
+
# @since 0.2.0
|
34
|
+
###
|
35
|
+
module Util
|
36
|
+
CORE_DIR = 'core'
|
37
|
+
WEB_DIR = 'web'
|
38
|
+
|
39
|
+
JST_OFFSET = '+09:00' # Japan Standard Time (JST) time zone offset from UTC
|
40
|
+
JST_OFFSET_HOUR = 9
|
41
|
+
JST_OFFSET_MIN = 0
|
42
|
+
|
43
|
+
HIRAGANA_REGEX = /\p{Hiragana}/
|
44
|
+
JPN_SPACE = "\u3000" # Must be double-quoted for escape chars
|
45
|
+
KANA_REGEX = /\p{Hiragana}|\p{Katakana}/
|
46
|
+
KANJI_REGEX = /\p{Han}/ # Han probably stands for Hanzi?
|
47
|
+
KATAKANA_REGEX = /\p{Katakana}/
|
48
|
+
NORMALIZE_STR_REGEX = /[^[[:alpha:]]]+/
|
49
|
+
STRIP_WEB_STR_REGEX = /(\A[[:space:]]+)|([[:space:]]+\z)/
|
50
|
+
WEB_SPACES_REGEX = /[[:space:]]+/
|
51
|
+
|
52
|
+
# Returns the current time converted to the JST_OFFSET UTC offset.
def self.jst_now
  now = Time.now

  now.getlocal(JST_OFFSET)
end
|
55
|
+
|
56
|
+
JST_YEAR = jst_now().year
|
57
|
+
MAX_SANE_YEAR = JST_YEAR + 1 # +1 Justin Case for time zone differences at the end of the year
|
58
|
+
|
59
|
+
# NHK was founded in 1924/25.
|
60
|
+
# - https://www.nhk.or.jp/bunken/english/about/history.html
|
61
|
+
# - https://en.wikipedia.org/wiki/NHK
|
62
|
+
# However, when was the website first created?
|
63
|
+
MIN_SANE_YEAR = 1924
|
64
|
+
|
65
|
+
# Tells whether +str+ is written as a directory path, i.e., whether it is
# already equal to itself joined with an empty trailing component.
def self.dir_str?(str)
  # Joining with '' makes File.join() append the separator if it is missing.
  with_separator = File.join(str, '')

  with_separator == str
end
|
69
|
+
|
70
|
+
# Extracts the domain of +host+ using PublicSuffix.domain().
#
# @param host [String] the host name to parse
# @param clean [true,false] if true, remove all white space from the domain
#   and lowercase it
# @return [String,nil] the domain; nil if PublicSuffix could not derive one
def self.domain(host, clean: true)
  domain = PublicSuffix.domain(host)

  # PublicSuffix.domain() returns nil (it doesn't raise) for an invalid host,
  # so there is nothing to clean in that case.
  return nil if domain.nil?

  domain = unspace_web_str(domain).downcase if clean

  domain
end
|
76
|
+
|
77
|
+
# Dumps +obj+ to YAML with Psychgus.
#
# @param obj the object to dump
# @param flow_level the level handed to Psychgus::FlowStyler
# @return the result of Psychgus.dump()
def self.dump_yaml(obj, flow_level: 8)
  stylers = [
    Psychgus::FlowStyler.new(flow_level),  # Put extra details on one line (flow/inline style)
    Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
    Psychgus::NoTagStyler.new,             # Remove class names (tags)
  ]

  Psychgus.dump(
    obj,
    deref_aliases: true, # Dereference aliases for load_yaml()
    line_width: 10000,   # Try not to wrap
    stylers: stylers,
  )
end
|
88
|
+
|
89
|
+
# True if +str+ is nil, or if nothing remains of it after strip_web_str().
def self.empty_web_str?(str)
  return true if str.nil?

  strip_web_str(str).empty?
end
|
92
|
+
|
93
|
+
# Escapes +str+ with CGI.escapeHTML() and then turns every newline ("\n")
# into a <br> tag.
def self.escape_html(str)
  CGI.escapeHTML(str).gsub("\n", '<br>')
end
|
99
|
+
|
100
|
+
# True if +str+ is identical to its own File.basename(), i.e., it carries
# no directory part.
def self.filename_str?(str)
  base = File.basename(str)

  base == str
end
|
103
|
+
|
104
|
+
# Expands a year below 100 into a full year, relative to JST_YEAR.
# A +year+ of 100 or more is returned unchanged.
#
# With JST_YEAR == 2021: 21 becomes 2021, while 22 becomes 1922.
def self.guess_year(year)
  return year unless year < 100

  # JST_YEAR rounded down to its century (2021 -> 2000).
  century = JST_YEAR / 100 * 100

  # A value past the current 2-digit year (2021 -> 21) is assumed to belong
  # to the previous century (2000 -> 1900).
  century -= 100 if year > (JST_YEAR % 100)

  century + year
end
|
120
|
+
|
121
|
+
# Tells whether +str+ contains at least one character matching HIRAGANA_REGEX.
#
# @param str [String,nil] the text to test
# @return [true,false] a strict boolean (not a match index); false for nil
def self.hiragana?(str)
  # match?() returns true/false and skips building MatchData.
  HIRAGANA_REGEX.match?(str)
end
|
124
|
+
|
125
|
+
# Builds a new Time from the year/month/day/hour/min/sec of +time+, stamped
# with {JST_OFFSET}.
#
# The clock fields are copied as-is; they are not shifted according to
# {JST_OFFSET}. The offset is simply dropped in.
def self.jst_time(time)
  fields = [time.year, time.month, time.day, time.hour, time.min, time.sec]

  Time.new(*fields, JST_OFFSET)
end
|
130
|
+
|
131
|
+
# Tells whether +str+ contains at least one character matching KANA_REGEX.
#
# @param str [String,nil] the text to test
# @return [true,false] a strict boolean (not a match index); false for nil
def self.kana?(str)
  # match?() returns true/false and skips building MatchData.
  KANA_REGEX.match?(str)
end
|
134
|
+
|
135
|
+
# Tells whether +str+ contains at least one character matching KANJI_REGEX.
#
# @param str [String,nil] the text to test
# @return [true,false] a strict boolean (not a match index); false for nil
def self.kanji?(str)
  # match?() returns true/false and skips building MatchData.
  KANJI_REGEX.match?(str)
end
|
138
|
+
|
139
|
+
# Tells whether +str+ contains at least one character matching KATAKANA_REGEX.
#
# @param str [String,nil] the text to test
# @return [true,false] a strict boolean (not a match index); false for nil
def self.katakana?(str)
  # match?() returns true/false and skips building MatchData.
  KATAKANA_REGEX.match?(str)
end
|
142
|
+
|
143
|
+
# Parses YAML +data+ with Psych.safe_load().
#
# Defaults: aliases disabled, only Symbol permitted as an extra class, and
# names symbolized. Anything in +kargs+ is passed along last, so it takes
# precedence over these defaults.
#
# @param data the YAML to parse
# @param file passed to Psych as +filename+
# @param kargs extra keyword arguments for Psych.safe_load()
def self.load_yaml(data, file: nil, **kargs)
  options = {
    aliases: false,
    filename: file,
    #freeze: true, # Not in this current version of Psych
    permitted_classes: [Symbol],
    symbolize_names: true,
  }
  options = options.merge(kargs)

  Psych.safe_load(data, **options)
end
|
153
|
+
|
154
|
+
# Returns a copy of +str+ with every match of NORMALIZE_STR_REGEX deleted.
def self.normalize_str(str)
  normalized = str.gsub(NORMALIZE_STR_REGEX, '')

  normalized
end
|
157
|
+
|
158
|
+
# Replaces each match of WEB_SPACES_REGEX in +str+ with a single JPN_SPACE.
# The string is not stripped.
def self.reduce_jpn_space(str)
  reduced = str.gsub(WEB_SPACES_REGEX, JPN_SPACE)

  reduced
end
|
162
|
+
|
163
|
+
# Replaces each match of WEB_SPACES_REGEX in +str+ with one ASCII space.
def self.reduce_space(str)
  reduced = str.gsub(WEB_SPACES_REGEX, ' ')

  reduced
end
|
166
|
+
|
167
|
+
# True if +year+ lies within MIN_SANE_YEAR..MAX_SANE_YEAR (both inclusive).
def self.sane_year?(year)
  return false unless year >= MIN_SANE_YEAR

  year <= MAX_SANE_YEAR
end
|
170
|
+
|
171
|
+
# Removes leading and trailing white space from +str+ using
# STRIP_WEB_STR_REGEX and returns the result as a new string.
#
# String's normal strip() method doesn't work with special Unicode/HTML
# white space.
def self.strip_web_str(str)
  # A single regex is used on purpose. According to the original author's
  # Benchmark tests, the two-step variant below is slower:
  #   str = str.gsub(/\A[[:space:]]+/,'')
  #   str = str.gsub(/[[:space:]]+\z/,'')
  str.gsub(STRIP_WEB_STR_REGEX, '')
end
|
181
|
+
|
182
|
+
# Returns a copy of +str+ with every match of WEB_SPACES_REGEX deleted.
def self.unspace_web_str(str)
  unspaced = str.gsub(WEB_SPACES_REGEX, '')

  unspaced
end
|
185
|
+
|
186
|
+
# Forwards +msg+ and +uplevel+ to Kernel.warn().
#
# @param msg the warning message
# @param uplevel passed through as Kernel.warn()'s +uplevel+ (default: 1)
def self.warn(msg, uplevel: 1)
  Kernel.warn(msg, uplevel: uplevel)
end
|
189
|
+
end
|
190
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'japanese_deinflector'
|
25
|
+
|
26
|
+
|
27
|
+
module NHKore
|
28
|
+
###
|
29
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
30
|
+
# @since 0.2.0
|
31
|
+
###
|
32
|
+
class Variator
  # Hook that runs before the actual variation.
  # The default implementation hands +str+ back unchanged.
  def begin_variate(str)
    str
  end

  # Feeds +str+ through #begin_variate and then #end_variate,
  # returning whatever #end_variate produces.
  #
  # #end_variate is not defined in this class; a subclass has to provide it.
  def variate(str)
    prepared = begin_variate(str)

    end_variate(prepared)
  end
end
|
44
|
+
|
45
|
+
###
|
46
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
47
|
+
# @since 0.2.0
|
48
|
+
###
|
49
|
+
class BasicVariator < Variator
  # Produces no variations: always a new empty Array, never nil.
  def end_variate(str)
    []
  end
end
|
54
|
+
|
55
|
+
###
|
56
|
+
# Guesses a word's dictionary/plain form (辞書形).
|
57
|
+
#
|
58
|
+
# It doesn't work very well, but it's better than nothing...
|
59
|
+
#
|
60
|
+
# @since 0.2.0
|
61
|
+
###
|
62
|
+
class DictFormVariator < Variator
  # The JapaneseDeinflector instance used by #end_variate (replaceable).
  attr_accessor :deinflector

  # Forwards all arguments to the superclass, then creates the deinflector.
  def initialize(*)
    super

    @deinflector = JapaneseDeinflector.new
  end

  # Asks the deinflector for guesses and keeps only the first one.
  #
  # @return [Array] a one-element Array holding the first guess's +:word+,
  #   or an empty Array if there are no guesses or the first guess's
  #   +:weight+ is below 0.5
  def end_variate(str)
    guesses = @deinflector.deinflect(str)

    return [] if guesses.length < 1

    first_guess = guesses[0]

    first_guess[:weight] < 0.5 ? [] : [first_guess[:word]]
  end
end
|
80
|
+
|
81
|
+
###
|
82
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
83
|
+
# @since 0.2.0
|
84
|
+
###
|
85
|
+
# Alias-like subclass: adds nothing of its own, so it behaves exactly like
# DictFormVariator.
class BestVariator < DictFormVariator; end
|
87
|
+
end
|
data/lib/nhkore/version.rb
CHANGED
data/lib/nhkore/word.rb
CHANGED
@@ -21,57 +21,174 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
+
require 'nokogiri'
|
25
|
+
|
26
|
+
require 'nhkore/error'
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
24
30
|
module NHKore
|
25
31
|
###
|
26
32
|
# @author Jonathan Bradley Whited (@esotericpig)
|
27
33
|
# @since 0.1.0
|
28
34
|
###
|
29
35
|
class Word
|
36
|
+
attr_accessor :defn
|
37
|
+
attr_accessor :eng
|
30
38
|
attr_accessor :freq
|
31
39
|
attr_reader :kana
|
32
|
-
attr_reader :
|
40
|
+
attr_reader :kanji
|
41
|
+
attr_reader :key
|
33
42
|
|
34
|
-
def initialize(
|
43
|
+
def initialize(defn: nil,eng: nil,freq: 1,kana: nil,kanji: nil,unknown: nil,word: nil,**kargs)
|
35
44
|
super()
|
36
45
|
|
46
|
+
if !word.nil?()
|
47
|
+
defn = word.defn if defn.nil?()
|
48
|
+
eng = word.eng if eng.nil?()
|
49
|
+
freq = word.freq if freq.nil?()
|
50
|
+
kana = word.kana if kana.nil?()
|
51
|
+
kanji = word.kanji if kanji.nil?()
|
52
|
+
end
|
53
|
+
|
54
|
+
raise ArgumentError,"freq[#{freq}] cannot be < 1" if freq < 1
|
55
|
+
|
56
|
+
if !unknown.nil?()
|
57
|
+
# kanji?() only tests if it contains kanji, so don't use kana?().
|
58
|
+
if Util.kanji?(unknown)
|
59
|
+
raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]" unless Util.empty_web_str?(kanji)
|
60
|
+
|
61
|
+
kanji = unknown
|
62
|
+
else
|
63
|
+
raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]" unless Util.empty_web_str?(kana)
|
64
|
+
|
65
|
+
kana = unknown
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
kana = nil if Util.empty_web_str?(kana)
|
70
|
+
kanji = nil if Util.empty_web_str?(kanji)
|
71
|
+
|
72
|
+
raise ArgumentError,'kanji and kana cannot both be empty' if kana.nil?() && kanji.nil?()
|
73
|
+
|
74
|
+
@defn = defn
|
75
|
+
@eng = eng
|
37
76
|
@freq = freq
|
38
77
|
@kana = kana
|
39
|
-
@
|
40
|
-
|
41
|
-
raise ArgumentError,'word and kana cannot both be nil; one must be specified' if @word.nil?()
|
78
|
+
@kanji = kanji
|
79
|
+
@key = "#{kanji}=#{kana}" # nil.to_s() is ''
|
42
80
|
end
|
43
81
|
|
44
82
|
def encode_with(coder)
|
45
|
-
# Ignore @
|
83
|
+
# Ignore @key because it will be the key in the YAML/Hash.
|
46
84
|
# Order matters.
|
85
|
+
|
86
|
+
coder[:kanji] = @kanji
|
47
87
|
coder[:kana] = @kana
|
48
88
|
coder[:freq] = @freq
|
89
|
+
coder[:defn] = @defn
|
90
|
+
coder[:eng] = @eng
|
49
91
|
end
|
50
92
|
|
51
|
-
def self.
|
52
|
-
|
93
|
+
# Rebuilds a Word from a key and its Hash of data.
#
# @param key the key the data was stored under; converted with to_s()
#   (e.g., from a Symbol)
# @param hash [Hash] read for +:defn+, +:eng+, +:kana+, +:kanji+, and +:freq+
# @return [Word] the new word
# @raise [ArgumentError] if +key+ differs from the new word's generated key
def self.load_data(key, hash)
  key = key.to_s

  word = Word.new(
    defn: hash[:defn],
    eng: hash[:eng],
    kana: hash[:kana],
    kanji: hash[:kanji]
  )

  unless key == word.key
    raise ArgumentError, "the key from the hash[#{key}] does not match the generated key[#{word.key}]"
  end

  # nil.to_i() is 0, so a missing/zero freq leaves the word's freq untouched.
  stored_freq = hash[:freq].to_i
  word.freq = stored_freq if stored_freq > 0

  word
end
|
112
|
+
|
113
|
+
# Builds a Word from a <ruby> tag: the kanji comes from the base text and
# the kana from the <rt> tag.
#
# Do not clean and/or strip spaces, as the raw text is important for
# Defn and ArticleScraper.
#
# @param tag the tag to scrape; queried with css() and search()
# @param missingno optional fallback; asked via kana_from_kanji() or
#   kanji_from_kana() when the scraped kana or kanji is empty
# @param url only used in error messages
# @return [Word] the scraped word
# @raise [ScrapeError] if there isn't exactly one kanji node and one kana
#   node, or if the kanji or kana ends up empty
def self.scrape_ruby_tag(tag, missingno: nil, url: nil)
  # Sources for the kanji, in order of preference:
  # 1) <rb> tags.
  kanji_nodes = tag.css('rb')
  # 2) Text nodes.
  kanji_nodes = tag.search('./text()') if kanji_nodes.length < 1
  # 3) Non-<rt> tags, in case of being surrounded by <span>, <b>, etc.
  kanji_nodes = tag.search("./*[not(name()='rt')]") if kanji_nodes.length < 1

  raise ScrapeError, "no kanji at URL[#{url}] in tag[#{tag}]" if kanji_nodes.length < 1
  raise ScrapeError, "too many kanji at URL[#{url}] in tag[#{tag}]" if kanji_nodes.length > 1

  kanji = kanji_nodes[0].text
  kana_nodes = tag.css('rt')

  raise ScrapeError, "no kana at URL[#{url}] in tag[#{tag}]" if kana_nodes.length < 1
  raise ScrapeError, "too many kana at URL[#{url}] in tag[#{tag}]" if kana_nodes.length > 1

  kana = kana_nodes[0].text

  unless missingno.nil?
    # Check kana first, since this is the typical scenario.
    # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
    # - '窓' in '(8)窓を開けて外の空気を入れましょう'
    if Util.empty_web_str?(kana)
      kana = missingno.kana_from_kanji(kanji)

      unless Util.empty_web_str?(kana)
        Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
      end
    elsif Util.empty_web_str?(kanji)
      kanji = missingno.kanji_from_kana(kana)

      unless Util.empty_web_str?(kanji)
        Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
      end
    end
  end

  raise ScrapeError, "empty kanji at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kanji)
  raise ScrapeError, "empty kana at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kana)

  Word.new(kana: kana, kanji: kanji)
end
|
160
|
+
|
161
|
+
# Do not clean and/or strip spaces, as the raw text is important for
|
162
|
+
# Defn and ArticleScraper.
|
163
|
+
def self.scrape_text_node(tag,url: nil)
|
164
|
+
text = tag.text
|
165
|
+
|
166
|
+
# No error; empty text is fine (not strictly kanji/kana only).
|
167
|
+
return nil if Util.empty_web_str?(text)
|
53
168
|
|
54
|
-
|
55
|
-
word.freq = freq if freq >= 0
|
169
|
+
word = Word.new(unknown: text)
|
56
170
|
|
57
171
|
return word
|
58
172
|
end
|
59
173
|
|
60
174
|
def kanji?()
|
61
|
-
return @
|
175
|
+
return !Util.empty_web_str?(@kanji)
|
62
176
|
end
|
63
177
|
|
64
|
-
def
|
65
|
-
return
|
178
|
+
# The text of this word: @kanji if kanji?() is true, else @kana.
def word
  if kanji?
    @kanji
  else
    @kana
  end
end
|
67
181
|
|
68
182
|
def to_s()
|
69
183
|
s = ''.dup()
|
70
184
|
|
71
|
-
s << @
|
72
|
-
s << "
|
73
|
-
s << "
|
74
|
-
s << "
|
185
|
+
s << "'#{@key}': "
|
186
|
+
s << "{ kanji=>'#{@kanji}'"
|
187
|
+
s << ", kana=>'#{@kana}'"
|
188
|
+
s << ", freq=>#{@freq}"
|
189
|
+
s << ", defn=>'#{@defn.to_s().gsub("\n",'\\n')}'"
|
190
|
+
s << ", eng=>'#{@eng}'"
|
191
|
+
s << ' }'
|
75
192
|
|
76
193
|
return s
|
77
194
|
end
|
data/lib/nhkore.rb
CHANGED
@@ -21,60 +21,59 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
-
|
24
|
+
TESTING = ($0 == __FILE__)
|
25
25
|
|
26
|
-
if
|
26
|
+
if TESTING
|
27
27
|
require 'rubygems'
|
28
28
|
require 'bundler/setup'
|
29
29
|
end
|
30
30
|
|
31
|
+
require 'nhkore/app'
|
32
|
+
require 'nhkore/article'
|
33
|
+
require 'nhkore/article_scraper'
|
34
|
+
require 'nhkore/cleaner'
|
35
|
+
require 'nhkore/defn'
|
36
|
+
require 'nhkore/dict'
|
37
|
+
require 'nhkore/dict_scraper'
|
38
|
+
require 'nhkore/entry'
|
39
|
+
require 'nhkore/error'
|
40
|
+
require 'nhkore/fileable'
|
41
|
+
require 'nhkore/missingno'
|
42
|
+
require 'nhkore/news'
|
43
|
+
require 'nhkore/polisher'
|
44
|
+
require 'nhkore/scraper'
|
45
|
+
require 'nhkore/search_link'
|
46
|
+
require 'nhkore/search_scraper'
|
47
|
+
require 'nhkore/sifter'
|
48
|
+
require 'nhkore/splitter'
|
49
|
+
require 'nhkore/util'
|
50
|
+
require 'nhkore/variator'
|
31
51
|
require 'nhkore/version'
|
32
52
|
require 'nhkore/word'
|
33
53
|
|
54
|
+
require 'nhkore/cli/bing_cmd'
|
55
|
+
require 'nhkore/cli/fx_cmd'
|
56
|
+
require 'nhkore/cli/get_cmd'
|
57
|
+
require 'nhkore/cli/news_cmd'
|
58
|
+
require 'nhkore/cli/sift_cmd'
|
34
59
|
|
35
|
-
require 'nokogiri'
|
36
|
-
require 'psychgus'
|
37
|
-
|
38
|
-
# 2 files:
|
39
|
-
# nhk_news_web_easy.yml
|
40
|
-
# (only Dec 2019 as an example; more in release)
|
41
|
-
# (add comment about above in this file)
|
42
|
-
# - 2019-12-25 13:10 JST: (append 1,2,... if duplicate)
|
43
|
-
# - url: <url>
|
44
|
-
# - md5: <md5sum of content only (in case of ads)>
|
45
|
-
# - words:
|
46
|
-
# - word (kanji/kana):
|
47
|
-
# - kana:
|
48
|
-
# - freq:
|
49
|
-
# (only Dec 2019 as an example; more in release)
|
50
|
-
# nhk_news_web_easy_core_<search criteria>.csv
|
51
|
-
# word, kana, freq
|
52
|
-
# (sorted by freq, word, or kana [desc/asc])
|
53
|
-
|
54
|
-
# nhkore --date '2019-12-01...2019-12-11'
|
55
|
-
# nhkore --date '2019-12'
|
56
|
-
# nhkore --date '12' (Dec of this year)
|
57
|
-
# nhkore --date '12-01' (Dec 1 of this year)
|
58
|
-
|
59
|
-
class Article
|
60
|
-
attr_accessor :datetime
|
61
|
-
attr_accessor :futsuu_url
|
62
|
-
attr_accessor :md5
|
63
|
-
attr_accessor :url
|
64
|
-
attr_accessor :words
|
65
|
-
end
|
66
|
-
|
67
|
-
word = NHKore::Word.new(kana: 'banana')
|
68
|
-
puts word.to_yaml
|
69
60
|
|
70
61
|
###
|
71
62
|
# @author Jonathan Bradley Whited (@esotericpig)
|
72
63
|
# @since 0.1.0
|
73
64
|
###
|
74
65
|
module NHKore
  # Creates an App with +args+ and runs it.
  #
  # If running raises a CLIError, "Error: <message>" is printed with puts()
  # and the process exits with status 1. Creating the App is outside of
  # this rescue.
  #
  # @param args the arguments handed to App.new(); defaults to ARGV
  # @return the result of App#run()
  #
  # @since 0.2.0
  def self.run(args = ARGV)
    app = App.new(args)

    begin
      app.run
    rescue CLIError => error
      puts "Error: #{error}"
      exit(1)
    end
  end
end
|
76
78
|
|
77
|
-
|
78
|
-
# https://www.google.com/search?q=site:https://www3.nhk.or.jp/news/easy/&tbas=0&tbs=cdr:1,cd_min:1/1/2019,cd_max:12/31/2019,sbd:1&tbm=nws&sxsrf=ALeKk01oebeT0hWvNro-vDn7WGYkRe0kxw:1582396168988&ei=CHNRXt7dO5KR0QSTnZGYCw&start=20&sa=N&ved=0ahUKEwjega-M5eXnAhWSSJQKHZNOBLM4ChDy0wMIag&biw=1028&bih=672&dpr=1.3
|
79
|
-
|
80
|
-
#NHKore::App.new().run() if IS_TESTING
|
79
|
+
NHKore.run() if TESTING
|