nhkore 0.3.6 → 0.3.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +63 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +36 -30
- data/Rakefile +38 -52
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +236 -236
- data/lib/nhkore/article.rb +39 -53
- data/lib/nhkore/article_scraper.rb +301 -287
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +143 -153
- data/lib/nhkore/cli/search_cmd.rb +108 -118
- data/lib/nhkore/cli/sift_cmd.rb +109 -120
- data/lib/nhkore/datetime_parser.rb +89 -103
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +5 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +58 -72
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +63 -75
- data/lib/nhkore/search_scraper.rb +89 -93
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -84
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +54 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +50 -28
data/lib/nhkore/lib.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -50,8 +38,8 @@ module NHKore
|
|
50
38
|
###
|
51
39
|
# Include this file to only require the files needed to use this
|
52
40
|
# Gem as a library (i.e., don't include CLI-related files).
|
53
|
-
#
|
54
|
-
# @author Jonathan Bradley Whited
|
41
|
+
#
|
42
|
+
# @author Jonathan Bradley Whited
|
55
43
|
# @since 0.3.2
|
56
44
|
###
|
57
45
|
module Lib
|
data/lib/nhkore/missingno.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,20 +14,20 @@ require 'nhkore/util'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Missingno
|
33
21
|
attr_reader :kanas
|
34
22
|
attr_reader :kanjis
|
35
|
-
|
23
|
+
|
36
24
|
# @param data [News,Article,Array<Word>]
|
37
25
|
def initialize(data)
|
38
26
|
super()
|
39
|
-
|
27
|
+
|
40
28
|
@kanas = {}
|
41
29
|
@kanjis = {}
|
42
|
-
|
30
|
+
|
43
31
|
# News?
|
44
32
|
if data.respond_to?(:articles)
|
45
33
|
add_news(data)
|
@@ -50,43 +38,43 @@ module NHKore
|
|
50
38
|
add_words(data)
|
51
39
|
end
|
52
40
|
end
|
53
|
-
|
41
|
+
|
54
42
|
def add_article(article)
|
55
|
-
add_words(article.words.values
|
43
|
+
add_words(article.words.values)
|
56
44
|
end
|
57
|
-
|
45
|
+
|
58
46
|
def add_news(news)
|
59
|
-
news.articles.
|
47
|
+
news.articles.each_value do |article|
|
60
48
|
add_article(article)
|
61
49
|
end
|
62
50
|
end
|
63
|
-
|
51
|
+
|
64
52
|
def add_words(words)
|
65
|
-
words.each
|
53
|
+
words.each do |word|
|
66
54
|
# We only want ones that are both filled in because
|
67
55
|
# Word.scrape_ruby_tag() will raise an error if either is empty.
|
68
56
|
next if Util.empty_web_str?(word.kana) || Util.empty_web_str?(word.kanji)
|
69
|
-
|
57
|
+
|
70
58
|
if !kanas.key?(word.kana)
|
71
59
|
kanas[word.kana] = word
|
72
60
|
end
|
73
|
-
|
61
|
+
|
74
62
|
if !kanjis.key?(word.kanji)
|
75
63
|
kanjis[word.kanji] = word
|
76
64
|
end
|
77
65
|
end
|
78
66
|
end
|
79
|
-
|
67
|
+
|
80
68
|
def kana_from_kanji(kanji)
|
81
69
|
word = @kanjis[kanji]
|
82
|
-
|
83
|
-
return word.nil?
|
70
|
+
|
71
|
+
return word.nil? ? nil : word.kana
|
84
72
|
end
|
85
|
-
|
73
|
+
|
86
74
|
def kanji_from_kana(kana)
|
87
75
|
word = @kanas[kana]
|
88
|
-
|
89
|
-
return word.nil?
|
76
|
+
|
77
|
+
return word.nil? ? nil : word.kanji
|
90
78
|
end
|
91
79
|
end
|
92
80
|
end
|
data/lib/nhkore/news.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -29,155 +17,153 @@ require 'nhkore/util'
|
|
29
17
|
|
30
18
|
module NHKore
|
31
19
|
###
|
32
|
-
# @author Jonathan Bradley Whited
|
20
|
+
# @author Jonathan Bradley Whited
|
33
21
|
# @since 0.2.0
|
34
22
|
###
|
35
23
|
class News
|
36
24
|
include Fileable
|
37
|
-
|
25
|
+
|
38
26
|
DEFAULT_DIR = Util::CORE_DIR
|
39
|
-
FAVORED_URL = /https
|
40
|
-
|
27
|
+
FAVORED_URL = /https:/i.freeze
|
28
|
+
|
41
29
|
attr_reader :articles
|
42
30
|
attr_reader :sha256s
|
43
|
-
|
44
|
-
def initialize
|
31
|
+
|
32
|
+
def initialize
|
45
33
|
super()
|
46
|
-
|
34
|
+
|
47
35
|
@articles = {}
|
48
36
|
@sha256s = {}
|
49
37
|
end
|
50
|
-
|
38
|
+
|
51
39
|
def add_article(article,key: nil,overwrite: false)
|
52
40
|
url = article.url
|
53
|
-
url = url.to_s
|
54
|
-
|
55
|
-
key = key.nil?
|
56
|
-
|
41
|
+
url = url.to_s unless url.nil?
|
42
|
+
|
43
|
+
key = key.nil? ? url : key.to_s
|
44
|
+
|
57
45
|
if !overwrite
|
58
46
|
raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
|
59
47
|
raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
|
60
48
|
end
|
61
|
-
|
49
|
+
|
62
50
|
@articles[key] = article
|
63
51
|
@sha256s[article.sha256] = url
|
64
|
-
|
52
|
+
|
65
53
|
return self
|
66
54
|
end
|
67
|
-
|
55
|
+
|
68
56
|
def self.build_file(filename)
|
69
57
|
return File.join(DEFAULT_DIR,filename)
|
70
58
|
end
|
71
|
-
|
59
|
+
|
72
60
|
def encode_with(coder)
|
73
61
|
# Order matters.
|
74
62
|
# Don't output @sha256s.
|
75
|
-
|
63
|
+
|
76
64
|
coder[:articles] = @articles
|
77
65
|
end
|
78
|
-
|
66
|
+
|
79
67
|
def self.load_data(data,article_class: Article,file: nil,news_class: News,overwrite: false,**kargs)
|
80
68
|
data = Util.load_yaml(data,file: file)
|
81
|
-
|
69
|
+
|
82
70
|
articles = data[:articles]
|
83
|
-
|
84
|
-
news = news_class.new
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
90
|
-
end
|
71
|
+
|
72
|
+
news = news_class.new
|
73
|
+
|
74
|
+
articles&.each() do |key,hash|
|
75
|
+
key = key.to_s # Change from a symbol
|
76
|
+
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
91
77
|
end
|
92
|
-
|
78
|
+
|
93
79
|
return news
|
94
80
|
end
|
95
|
-
|
81
|
+
|
96
82
|
def update_article(article,url)
|
97
|
-
url = url.to_s
|
98
|
-
|
83
|
+
url = url.to_s unless url.nil?
|
84
|
+
|
99
85
|
# Favor https.
|
100
|
-
return if article.url.to_s
|
86
|
+
return if article.url.to_s =~ FAVORED_URL
|
101
87
|
return if url !~ FAVORED_URL
|
102
|
-
|
88
|
+
|
103
89
|
@articles.delete(article.url) # Probably no to_s() here
|
104
90
|
@articles[url] = article
|
105
91
|
article.url = url
|
106
92
|
end
|
107
|
-
|
93
|
+
|
108
94
|
def article(key)
|
109
|
-
key = key.to_s
|
110
|
-
|
95
|
+
key = key.to_s unless key.nil?
|
96
|
+
|
111
97
|
return @articles[key]
|
112
98
|
end
|
113
|
-
|
99
|
+
|
114
100
|
def article_with_sha256(sha256)
|
115
101
|
article = nil
|
116
|
-
|
117
|
-
@articles.
|
102
|
+
|
103
|
+
@articles.each_value do |a|
|
118
104
|
if a.sha256 == sha256
|
119
105
|
article = a
|
120
|
-
|
106
|
+
|
121
107
|
break
|
122
108
|
end
|
123
109
|
end
|
124
|
-
|
110
|
+
|
125
111
|
return article
|
126
112
|
end
|
127
|
-
|
113
|
+
|
128
114
|
def article?(key)
|
129
|
-
key = key.to_s
|
130
|
-
|
115
|
+
key = key.to_s unless key.nil?
|
116
|
+
|
131
117
|
return @articles.key?(key)
|
132
118
|
end
|
133
|
-
|
119
|
+
|
134
120
|
def sha256?(sha256)
|
135
121
|
return @sha256s.key?(sha256)
|
136
122
|
end
|
137
|
-
|
138
|
-
def to_s
|
123
|
+
|
124
|
+
def to_s
|
139
125
|
# Put each Word on one line (flow/inline style).
|
140
126
|
return Util.dump_yaml(self,flow_level: 8)
|
141
127
|
end
|
142
128
|
end
|
143
|
-
|
129
|
+
|
144
130
|
###
|
145
|
-
# @author Jonathan Bradley Whited
|
131
|
+
# @author Jonathan Bradley Whited
|
146
132
|
# @since 0.2.0
|
147
133
|
###
|
148
134
|
class FutsuuNews < News
|
149
135
|
DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
|
150
136
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
151
|
-
|
137
|
+
|
152
138
|
def self.load_data(data,**kargs)
|
153
139
|
return News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
|
154
140
|
end
|
155
|
-
|
141
|
+
|
156
142
|
def self.load_file(file=DEFAULT_FILE,**kargs)
|
157
143
|
return News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
|
158
144
|
end
|
159
|
-
|
145
|
+
|
160
146
|
def save_file(file=DEFAULT_FILE,**kargs)
|
161
147
|
super(file,**kargs)
|
162
148
|
end
|
163
149
|
end
|
164
|
-
|
150
|
+
|
165
151
|
###
|
166
|
-
# @author Jonathan Bradley Whited
|
152
|
+
# @author Jonathan Bradley Whited
|
167
153
|
# @since 0.2.0
|
168
154
|
###
|
169
155
|
class YasashiiNews < News
|
170
156
|
DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
|
171
157
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
172
|
-
|
158
|
+
|
173
159
|
def self.load_data(data,**kargs)
|
174
160
|
return News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
|
175
161
|
end
|
176
|
-
|
162
|
+
|
177
163
|
def self.load_file(file=DEFAULT_FILE,**kargs)
|
178
164
|
return News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
|
179
165
|
end
|
180
|
-
|
166
|
+
|
181
167
|
def save_file(file=DEFAULT_FILE,**kargs)
|
182
168
|
super(file,**kargs)
|
183
169
|
end
|
data/lib/nhkore/polisher.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
# NHKore is free software: you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,28 +14,28 @@ require 'nhkore/word'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Polisher
|
33
21
|
def begin_polish(str)
|
34
22
|
return str
|
35
23
|
end
|
36
|
-
|
24
|
+
|
37
25
|
def polish(str)
|
38
26
|
str = begin_polish(str)
|
39
27
|
str = end_polish(str)
|
40
|
-
|
28
|
+
|
41
29
|
return str
|
42
30
|
end
|
43
|
-
|
31
|
+
|
44
32
|
def self.polish_any(obj,polishers)
|
45
|
-
return nil if obj.nil?
|
46
|
-
|
33
|
+
return nil if obj.nil?
|
34
|
+
|
47
35
|
polishers = Array(polishers)
|
48
|
-
|
49
|
-
return obj if polishers.empty?
|
50
|
-
|
36
|
+
|
37
|
+
return obj if polishers.empty?
|
38
|
+
|
51
39
|
if obj.is_a?(Word)
|
52
40
|
obj = Word.new(
|
53
41
|
kana: polish_any(obj.kana,polishers),
|
@@ -55,17 +43,17 @@ module NHKore
|
|
55
43
|
word: obj
|
56
44
|
)
|
57
45
|
else # String
|
58
|
-
polishers.each
|
46
|
+
polishers.each do |polisher|
|
59
47
|
obj = polisher.polish(obj)
|
60
48
|
end
|
61
49
|
end
|
62
|
-
|
50
|
+
|
63
51
|
return obj
|
64
52
|
end
|
65
53
|
end
|
66
|
-
|
54
|
+
|
67
55
|
###
|
68
|
-
# @author Jonathan Bradley Whited
|
56
|
+
# @author Jonathan Bradley Whited
|
69
57
|
# @since 0.2.0
|
70
58
|
###
|
71
59
|
class BasicPolisher < Polisher
|
@@ -74,18 +62,18 @@ module NHKore
|
|
74
62
|
# - Yunibaasaru・Sutajio・Japan
|
75
63
|
# Keep numbers next to kanji/kana, else the below kana won't make sense:
|
76
64
|
# - Word { kanji: 20日, kana: はつか }
|
77
|
-
|
65
|
+
|
78
66
|
str = str.gsub(/[^[[:alnum:]]・]/,'')
|
79
|
-
|
67
|
+
|
80
68
|
# Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
|
81
|
-
str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
|
82
|
-
|
69
|
+
str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
|
70
|
+
|
83
71
|
return str
|
84
72
|
end
|
85
73
|
end
|
86
|
-
|
74
|
+
|
87
75
|
###
|
88
|
-
# @author Jonathan Bradley Whited
|
76
|
+
# @author Jonathan Bradley Whited
|
89
77
|
# @since 0.2.0
|
90
78
|
###
|
91
79
|
class BestPolisher < BasicPolisher
|