nhkore 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
@@ -0,0 +1,90 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'bimyou_segmenter'
|
25
|
+
require 'tiny_segmenter'
|
26
|
+
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
  ###
  # Base class for all word splitters.
  #
  # {#split} is a two-step pipeline: {#begin_split} followed by +end_split+.
  # Only {#begin_split} is defined here, so each subclass provides its own
  # +end_split+.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class Splitter
    # First stage of {#split}; hands +str+ back unchanged.
    def begin_split(str)
      str
    end

    # Feeds +str+ through {#begin_split} and then through +end_split+,
    # returning whatever +end_split+ produced.
    def split(str)
      end_split(begin_split(str))
    end
  end

  ###
  # Splits on every run of characters matched by {Util::NORMALIZE_STR_REGEX}.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class BasicSplitter < Splitter
    def end_split(str)
      str.split(Util::NORMALIZE_STR_REGEX)
    end
  end

  ###
  # Delegates the splitting to +BimyouSegmenter.segment+, with its
  # +symbol+ and +white_space+ options both switched off.
  #
  # @since 0.2.0
  ###
  class BimyouSplitter < Splitter
    def end_split(str)
      BimyouSegmenter.segment(str, symbol: false, white_space: false)
    end
  end

  ###
  # Delegates the splitting to a +TinySegmenter+ instance (see {#tiny}),
  # with its +ignore_punctuation+ option switched on.
  #
  # @since 0.2.0
  ###
  class TinySplitter < Splitter
    # The +TinySegmenter+ instance used by {#end_split}.
    attr_accessor :tiny

    # Accepts (and forwards to +super+) any positional arguments.
    def initialize(*)
      super

      @tiny = TinySegmenter.new
    end

    def end_split(str)
      @tiny.segment(str, ignore_punctuation: true)
    end
  end

  ###
  # The splitter to use by default; currently identical to {BimyouSplitter}.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class BestSplitter < BimyouSplitter
  end
end
|
data/lib/nhkore/util.rb
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'cgi'
|
25
|
+
require 'psychgus'
|
26
|
+
require 'public_suffix'
|
27
|
+
require 'time'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
  ###
  # Shared constants and stateless helper functions: JST time handling,
  # Japanese character tests, web-string cleaning, YAML dumping/loading,
  # and simple path checks.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  module Util
    CORE_DIR = 'core'
    WEB_DIR = 'web'

    JST_OFFSET = '+09:00' # Japan Standard Time (JST) time zone offset from UTC
    JST_OFFSET_HOUR = 9
    JST_OFFSET_MIN = 0

    HIRAGANA_REGEX = /\p{Hiragana}/
    JPN_SPACE = "\u3000" # Must be double-quoted for escape chars
    KANA_REGEX = /\p{Hiragana}|\p{Katakana}/
    KANJI_REGEX = /\p{Han}/ # Han is the Unicode script name used for kanji
    KATAKANA_REGEX = /\p{Katakana}/
    NORMALIZE_STR_REGEX = /[^[[:alpha:]]]+/
    STRIP_WEB_STR_REGEX = /(\A[[:space:]]+)|([[:space:]]+\z)/
    WEB_SPACES_REGEX = /[[:space:]]+/

    # @return [Time] the current time, converted to {JST_OFFSET}
    def self.jst_now()
      return Time.now().getlocal(JST_OFFSET)
    end

    # NOTE: Both are computed once, when this file is loaded.
    JST_YEAR = jst_now().year
    MAX_SANE_YEAR = JST_YEAR + 1 # +1 Justin Case for time zone differences at the end of the year

    # NHK was founded in 1924/25.
    # - https://www.nhk.or.jp/bunken/english/about/history.html
    # - https://en.wikipedia.org/wiki/NHK
    # However, when was the website first created?
    MIN_SANE_YEAR = 1924

    # @return [true,false] whether +str+ already ends with a directory separator
    def self.dir_str?(str)
      # File.join() will add the appropriate slash.
      return File.join(str,'') == str
    end

    # Extracts the registrable domain from +host+ using PublicSuffix.
    #
    # @param host  [String] the host name to parse
    # @param clean [true,false] if true, remove all white space and downcase the result
    # @return [String,nil] the domain, or +nil+ if PublicSuffix could not parse +host+
    def self.domain(host,clean: true)
      domain = PublicSuffix.domain(host)

      # PublicSuffix.domain() returns nil for an invalid host instead of raising,
      # so don't try to clean nil.
      return nil if domain.nil?()

      domain = unspace_web_str(domain).downcase() if clean

      return domain
    end

    # Dumps +obj+ to a YAML string using Psychgus.
    #
    # @param flow_level [Integer] the level passed to +Psychgus::FlowStyler+
    # @return [String] whatever +Psychgus.dump+ returns
    def self.dump_yaml(obj,flow_level: 8)
      return Psychgus.dump(obj,
        deref_aliases: true, # Dereference aliases for load_yaml()
        line_width: 10000, # Try not to wrap; ichiman!
        stylers: [
          Psychgus::FlowStyler.new(flow_level), # Put extra details on one line (flow/inline style)
          Psychgus::NoSymStyler.new(cap: false), # Remove symbols, don't capitalize
          Psychgus::NoTagStyler.new(), # Remove class names (tags)
        ]
      )
    end

    # @return [true,false] true if +str+ is +nil+ or only white space
    #                      (as defined by {strip_web_str})
    def self.empty_web_str?(str)
      return str.nil?() || strip_web_str(str).empty?()
    end

    # HTML-escapes +str+ and then turns each newline into a +<br>+ tag.
    def self.escape_html(str)
      str = CGI.escapeHTML(str)
      str = str.gsub("\n",'<br>')

      return str
    end

    # @return [true,false] true if +str+ is only a file name (equals its own basename)
    def self.filename_str?(str)
      return File.basename(str) == str
    end

    # Expands a 2-digit year into a 4-digit year, relative to {JST_YEAR}.
    # Years that are >= 100 are returned as is.
    #
    # @param year [Integer] the year to expand
    # @return [Integer] the guessed full year
    def self.guess_year(year)
      if year < 100
        # Start of the current century: 2021 -> 2000.
        century = JST_YEAR / 100 * 100

        # If year <= (2021 -> 21), assume this century.
        if year <= (JST_YEAR % 100)
          year = century + year
        else
          # Assume previous century (2000 -> 1900).
          year = (century - 100) + year
        end
      end

      return year
    end

    # @return [true,false] true if +str+ contains at least one hiragana character
    def self.hiragana?(str)
      return HIRAGANA_REGEX.match?(str)
    end

    # This doesn't modify the hour/minute according to {JST_OFFSET},
    # but instead, it just drops {JST_OFFSET} into it without adjusting it.
    def self.jst_time(time)
      return Time.new(time.year,time.month,time.day,time.hour,time.min,time.sec,JST_OFFSET)
    end

    # @return [true,false] true if +str+ contains at least one hiragana or katakana character
    def self.kana?(str)
      return KANA_REGEX.match?(str)
    end

    # @return [true,false] true if +str+ contains at least one kanji character
    def self.kanji?(str)
      return KANJI_REGEX.match?(str)
    end

    # @return [true,false] true if +str+ contains at least one katakana character
    def self.katakana?(str)
      return KATAKANA_REGEX.match?(str)
    end

    # Safely loads YAML +data+: aliases are disabled, only Symbol is
    # permitted as an extra class, and names are symbolized.
    #
    # @param data  [String] the YAML to parse
    # @param file  [String,nil] passed to Psych as +filename+
    # @param kargs [Hash] extra options passed along to +Psych.safe_load+
    def self.load_yaml(data,file: nil,**kargs)
      return Psych.safe_load(data,
        aliases: false,
        filename: file,
        #freeze: true, # Not in this current version of Psych
        permitted_classes: [Symbol],
        symbolize_names: true,
        **kargs
      )
    end

    # Removes every character matched by {NORMALIZE_STR_REGEX} from +str+.
    def self.normalize_str(str)
      return str.gsub(NORMALIZE_STR_REGEX,'')
    end

    # Replaces each run of white space with a single {JPN_SPACE}.
    def self.reduce_jpn_space(str)
      # Do not strip; use a Japanese space
      return str.gsub(WEB_SPACES_REGEX,JPN_SPACE)
    end

    # Replaces each run of white space with a single ASCII space.
    def self.reduce_space(str)
      return str.gsub(WEB_SPACES_REGEX,' ')
    end

    # @return [true,false] true if +year+ is within {MIN_SANE_YEAR}..{MAX_SANE_YEAR}
    def self.sane_year?(year)
      return year >= MIN_SANE_YEAR && year <= MAX_SANE_YEAR
    end

    # String's normal strip() method doesn't work with special Unicode/HTML white space.
    def self.strip_web_str(str)
      # After testing with Benchmark, this is slower than one regex.
      #str = str.gsub(/\A[[:space:]]+/,'')
      #str = str.gsub(/[[:space:]]+\z/,'')

      str = str.gsub(STRIP_WEB_STR_REGEX,'')

      return str
    end

    # Removes all white space from +str+ (not just leading/trailing).
    def self.unspace_web_str(str)
      return str.gsub(WEB_SPACES_REGEX,'')
    end

    # Thin wrapper around +Kernel.warn+ that passes along +uplevel+.
    def self.warn(msg,uplevel: 1)
      Kernel.warn(msg,uplevel: uplevel)
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'japanese_deinflector'
|
25
|
+
|
26
|
+
|
27
|
+
module NHKore
  ###
  # Base class for producing variations of a word.
  #
  # {#variate} is a two-step pipeline: {#begin_variate} followed by
  # +end_variate+. Only {#begin_variate} is defined here, so each subclass
  # provides its own +end_variate+.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class Variator
    # First stage of {#variate}; hands +str+ back unchanged.
    def begin_variate(str)
      str
    end

    # Feeds +str+ through {#begin_variate} and then through +end_variate+,
    # returning whatever +end_variate+ produced.
    def variate(str)
      end_variate(begin_variate(str))
    end
  end

  ###
  # Produces no variations at all.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class BasicVariator < Variator
    # Always an empty Array (never +nil+); +str+ is ignored.
    def end_variate(str)
      []
    end
  end

  ###
  # Guesses a word's dictionary/plain form (辞書形).
  #
  # It doesn't work very well, but better than nothing...
  #
  # @since 0.2.0
  ###
  class DictFormVariator < Variator
    # The +JapaneseDeinflector+ instance used by {#end_variate}.
    attr_accessor :deinflector

    # Accepts (and forwards to +super+) any positional arguments.
    def initialize(*)
      super

      @deinflector = JapaneseDeinflector.new
    end

    # Asks the deinflector for guesses and keeps only the first one,
    # and only if its +:weight+ is at least 0.5.
    #
    # @return [Array] the first guess's +:word+ in a one-element Array,
    #                 or an empty Array if there is no acceptable guess
    def end_variate(str)
      guesses = @deinflector.deinflect(str)

      return [] if guesses.length < 1

      top = guesses[0]

      top[:weight] < 0.5 ? [] : [top[:word]]
    end
  end

  ###
  # The variator to use by default; currently identical to {DictFormVariator}.
  #
  # @author Jonathan Bradley Whited (@esotericpig)
  # @since  0.2.0
  ###
  class BestVariator < DictFormVariator
  end
end
|
data/lib/nhkore/version.rb
CHANGED
data/lib/nhkore/word.rb
CHANGED
@@ -21,57 +21,174 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
+
require 'nokogiri'
|
25
|
+
|
26
|
+
require 'nhkore/error'
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
24
30
|
module NHKore
|
25
31
|
###
|
26
32
|
# @author Jonathan Bradley Whited (@esotericpig)
|
27
33
|
# @since 0.1.0
|
28
34
|
###
|
29
35
|
class Word
|
36
|
+
attr_accessor :defn
|
37
|
+
attr_accessor :eng
|
30
38
|
attr_accessor :freq
|
31
39
|
attr_reader :kana
|
32
|
-
attr_reader :
|
40
|
+
attr_reader :kanji
|
41
|
+
attr_reader :key
|
33
42
|
|
34
|
-
def initialize(
|
43
|
+
# Builds a word from a kanji and/or kana reading.
#
# @param defn    value stored as-is in @defn
# @param eng     value stored as-is in @eng
# @param freq    frequency count; must be >= 1
# @param kana    kana text; treated as absent if only white space
# @param kanji   kanji text; treated as absent if only white space
# @param unknown text of unknown type: it becomes the kanji if
#                Util.kanji?() matches it, else it becomes the kana
# @param word    another word whose defn/eng/freq/kana/kanji fill in any
#                of those arguments that are +nil+ (note that +freq+ defaults
#                to 1, so it is only copied when +freq: nil+ is passed)
# @param kargs   any other keyword arguments; accepted but not used here
#
# @raise [ArgumentError] if +freq+ is < 1, if +unknown+ would overwrite a
#                        non-empty kanji/kana, or if both kanji and kana
#                        end up empty
def initialize(defn: nil,eng: nil,freq: 1,kana: nil,kanji: nil,unknown: nil,word: nil,**kargs)
  super()

  # Fill in the blanks from the other word.
  unless word.nil?
    defn = word.defn if defn.nil?
    eng = word.eng if eng.nil?
    freq = word.freq if freq.nil?
    kana = word.kana if kana.nil?
    kanji = word.kanji if kanji.nil?
  end

  if freq < 1
    raise ArgumentError,"freq[#{freq}] cannot be < 1"
  end

  unless unknown.nil?
    # kanji?() only tests if it contains kanji, so don't use kana?().
    if Util.kanji?(unknown)
      unless Util.empty_web_str?(kanji)
        raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]"
      end

      kanji = unknown
    else
      unless Util.empty_web_str?(kana)
        raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]"
      end

      kana = unknown
    end
  end

  # Normalize white-space-only text to nil.
  kana = nil if Util.empty_web_str?(kana)
  kanji = nil if Util.empty_web_str?(kanji)

  if kana.nil? && kanji.nil?
    raise ArgumentError,'kanji and kana cannot both be empty'
  end

  @defn = defn
  @eng = eng
  @freq = freq
  @kana = kana
  @kanji = kanji
  @key = "#{kanji}=#{kana}" # nil.to_s() is ''
end
|
43
81
|
|
44
82
|
# Stores this word's fields in +coder+, using Symbol keys.
#
# @key is left out on purpose, because it will be the key in the YAML/Hash.
# The fields are written in this order because order matters:
# kanji, kana, freq, defn, eng.
def encode_with(coder)
  coder[:kanji] = @kanji
  coder[:kana]  = @kana
  coder[:freq]  = @freq
  coder[:defn]  = @defn
  coder[:eng]   = @eng
end
|
50
92
|
|
51
|
-
def self.
|
52
|
-
|
93
|
+
# Rebuilds a Word from a loaded +hash+ and the +key+ it was stored under.
#
# @param key  the stored key (converted with to_s(), since it may be a Symbol)
# @param hash the data, read via the Symbol keys
#             :defn, :eng, :kana, :kanji, and :freq
#
# @return [Word] the new word; its freq is only overwritten if the
#                hash's :freq (converted with to_i()) is > 0
#
# @raise [ArgumentError] if +key+ doesn't equal the key generated by the new word
def self.load_data(key,hash)
  key = key.to_s # Change from a symbol

  word = Word.new(defn: hash[:defn],eng: hash[:eng],kana: hash[:kana],kanji: hash[:kanji])

  unless key == word.key
    raise ArgumentError,"the key from the hash[#{key}] does not match the generated key[#{word.key}]"
  end

  loaded_freq = hash[:freq].to_i # nil.to_i() is 0
  word.freq = loaded_freq if loaded_freq > 0

  word
end
|
112
|
+
|
113
|
+
# Do not clean and/or strip spaces, as the raw text is important for
|
114
|
+
# Defn and ArticleScraper.
|
115
|
+
# Scrapes the kanji and its kana reading out of +tag+ and builds a Word.
#
# The raw text is used as is (see the note above this method).
#
# @param tag       the node to scrape; queried with css() and search()
# @param missingno optional fallback; if given, kana_from_kanji() is used
#                  when the kana is empty, else kanji_from_kana() is used
#                  when the kanji is empty
# @param url       only used in the error messages
#
# @return [Word] the word made from the scraped kana & kanji
#
# @raise [ScrapeError] if there isn't exactly 1 kanji node and exactly 1
#                      <rt> node, or if the final kanji or kana is empty
def self.scrape_ruby_tag(tag,missingno: nil,url: nil)
  # First, try <rb> tags.
  kanji_nodes = tag.css('rb')
  # Second, try text nodes.
  kanji_nodes = tag.search('./text()') if kanji_nodes.length < 1
  # Third, try non-<rt> tags, in case of being surrounded by <span>, <b>, etc.
  kanji_nodes = tag.search("./*[not(name()='rt')]") if kanji_nodes.length < 1

  raise ScrapeError,"no kanji at URL[#{url}] in tag[#{tag}]" if kanji_nodes.length < 1
  raise ScrapeError,"too many kanji at URL[#{url}] in tag[#{tag}]" if kanji_nodes.length > 1

  kanji = kanji_nodes[0].text
  kana_nodes = tag.css('rt')

  raise ScrapeError,"no kana at URL[#{url}] in tag[#{tag}]" if kana_nodes.length < 1
  raise ScrapeError,"too many kana at URL[#{url}] in tag[#{tag}]" if kana_nodes.length > 1

  kana = kana_nodes[0].text

  unless missingno.nil?
    # Check kana first, since this is the typical scenario.
    # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
    # - '窓' in '(8)窓を開けて外の空気を入れましょう'
    if Util.empty_web_str?(kana)
      kana = missingno.kana_from_kanji(kanji)

      # Warn directly from this method (no helper), so that the
      # reported call site stays the same.
      unless Util.empty_web_str?(kana)
        Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
      end
    elsif Util.empty_web_str?(kanji)
      kanji = missingno.kanji_from_kana(kana)

      unless Util.empty_web_str?(kanji)
        Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
      end
    end
  end

  raise ScrapeError,"empty kanji at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kanji)
  raise ScrapeError,"empty kana at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kana)

  Word.new(kana: kana,kanji: kanji)
end
|
160
|
+
|
161
|
+
# Do not clean and/or strip spaces, as the raw text is important for
|
162
|
+
# Defn and ArticleScraper.
|
163
|
+
# Builds a Word from the raw text of +tag+, letting Word decide whether
# the text is kanji or kana (it is passed as +unknown+).
#
# @param tag the node to scrape; only its +text+ is read
# @param url not used by this method
#
# @return [Word,nil] the new word, or +nil+ if the text is empty
def self.scrape_text_node(tag,url: nil)
  raw_text = tag.text

  # No error; empty text is fine (not strictly kanji/kana only).
  if Util.empty_web_str?(raw_text)
    nil
  else
    Word.new(unknown: raw_text)
  end
end
|
59
173
|
|
60
174
|
# @return [true,false] true if @kanji is not empty
#                      (according to Util.empty_web_str?())
def kanji?
  !Util.empty_web_str?(@kanji)
end
|
63
177
|
|
64
|
-
def
|
65
|
-
return
|
178
|
+
# @return the kanji if this word has any (see kanji?()); otherwise, the kana
def word
  return @kanji if kanji?

  @kana
end
|
67
181
|
|
68
182
|
# Formats this word on one line, for reading by humans:
#   '<key>': { kanji=>'..', kana=>'..', freq=>N, defn=>'..', eng=>'..' }
#
# Any newline in the definition is shown as a literal backslash + 'n',
# so that the result stays on one line.
#
# @return [String] a new (unfrozen) String
def to_s
  one_line_defn = @defn.to_s.gsub("\n",'\\n')

  [
    "'#{@key}': ",
    "{ kanji=>'#{@kanji}'",
    ", kana=>'#{@kana}'",
    ", freq=>#{@freq}",
    ", defn=>'#{one_line_defn}'",
    ", eng=>'#{@eng}'",
    ' }',
  ].join # join() makes a new String, so the result can still be appended to
end
|
data/lib/nhkore.rb
CHANGED
@@ -21,60 +21,59 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
|
24
|
-
|
24
|
+
# True only if this file itself is the script being run
# (as opposed to being loaded with require).
TESTING = ($PROGRAM_NAME == __FILE__)

# When run directly, load RubyGems & Bundler's setup before the requires below.
if TESTING
  require 'rubygems'
  require 'bundler/setup'
end
|
30
30
|
|
31
|
+
require 'nhkore/app'
|
32
|
+
require 'nhkore/article'
|
33
|
+
require 'nhkore/article_scraper'
|
34
|
+
require 'nhkore/cleaner'
|
35
|
+
require 'nhkore/defn'
|
36
|
+
require 'nhkore/dict'
|
37
|
+
require 'nhkore/dict_scraper'
|
38
|
+
require 'nhkore/entry'
|
39
|
+
require 'nhkore/error'
|
40
|
+
require 'nhkore/fileable'
|
41
|
+
require 'nhkore/missingno'
|
42
|
+
require 'nhkore/news'
|
43
|
+
require 'nhkore/polisher'
|
44
|
+
require 'nhkore/scraper'
|
45
|
+
require 'nhkore/search_link'
|
46
|
+
require 'nhkore/search_scraper'
|
47
|
+
require 'nhkore/sifter'
|
48
|
+
require 'nhkore/splitter'
|
49
|
+
require 'nhkore/util'
|
50
|
+
require 'nhkore/variator'
|
31
51
|
require 'nhkore/version'
|
32
52
|
require 'nhkore/word'
|
33
53
|
|
54
|
+
require 'nhkore/cli/bing_cmd'
|
55
|
+
require 'nhkore/cli/fx_cmd'
|
56
|
+
require 'nhkore/cli/get_cmd'
|
57
|
+
require 'nhkore/cli/news_cmd'
|
58
|
+
require 'nhkore/cli/sift_cmd'
|
34
59
|
|
35
|
-
require 'nokogiri'
|
36
|
-
require 'psychgus'
|
37
|
-
|
38
|
-
# 2 files:
|
39
|
-
# nhk_news_web_easy.yml
|
40
|
-
# (only Dec 2019 as an example; more in release)
|
41
|
-
# (add comment about above in this file)
|
42
|
-
# - 2019-12-25 13:10 JST: (append 1,2,... if duplicate)
|
43
|
-
# - url: <url>
|
44
|
-
# - md5: <md5sum of content only (in case of ads)>
|
45
|
-
# - words:
|
46
|
-
# - word (kanji/kana):
|
47
|
-
# - kana:
|
48
|
-
# - freq:
|
49
|
-
# (only Dec 2019 as an example; more in release)
|
50
|
-
# nhk_news_web_easy_core_<search criteria>.csv
|
51
|
-
# word, kana, freq
|
52
|
-
# (sorted by freq, word, or kana [desc/asc])
|
53
|
-
|
54
|
-
# nhkore --date '2019-12-01...2019-12-11'
|
55
|
-
# nhkore --date '2019-12'
|
56
|
-
# nhkore --date '12' (Dec of this year)
|
57
|
-
# nhkore --date '12-01' (Dec 1 of this year)
|
58
|
-
|
59
|
-
class Article
|
60
|
-
attr_accessor :datetime
|
61
|
-
attr_accessor :futsuu_url
|
62
|
-
attr_accessor :md5
|
63
|
-
attr_accessor :url
|
64
|
-
attr_accessor :words
|
65
|
-
end
|
66
|
-
|
67
|
-
word = NHKore::Word.new(kana: 'banana')
|
68
|
-
puts word.to_yaml
|
69
60
|
|
70
61
|
###
|
71
62
|
# @author Jonathan Bradley Whited (@esotericpig)
|
72
63
|
# @since 0.1.0
|
73
64
|
###
|
74
65
|
module NHKore
  # Creates an {App} with +args+ and runs it.
  #
  # If running the app raises a {CLIError}, then "Error: <message>" is
  # printed and the process exits with a status of 1. Creating the app is
  # done outside of the rescued code, so an error raised from +App.new+
  # is not handled here.
  #
  # @param args [Array] the command-line arguments; defaults to +ARGV+
  #
  # @return whatever +App#run+ returns (if there's no error)
  #
  # @since 0.2.0
  def self.run(args=ARGV)
    app = App.new(args)

    begin
      app.run
    rescue CLIError => err
      puts "Error: #{err}"
      exit(1)
    end
  end
end
|
76
78
|
|
77
|
-
|
78
|
-
# https://www.google.com/search?q=site:https://www3.nhk.or.jp/news/easy/&tbas=0&tbs=cdr:1,cd_min:1/1/2019,cd_max:12/31/2019,sbd:1&tbm=nws&sxsrf=ALeKk01oebeT0hWvNro-vDn7WGYkRe0kxw:1582396168988&ei=CHNRXt7dO5KR0QSTnZGYCw&start=20&sa=N&ved=0ahUKEwjega-M5eXnAhWSSJQKHZNOBLM4ChDy0wMIag&biw=1028&bih=672&dpr=1.3
|
79
|
-
|
80
|
-
#NHKore::App.new().run() if IS_TESTING
|
79
|
+
# Run the app only if this file was executed directly (see TESTING).
NHKore.run if TESTING
|