nhkore 0.3.3 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +97 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +58 -30
- data/Rakefile +68 -42
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +231 -236
- data/lib/nhkore/article.rb +56 -53
- data/lib/nhkore/article_scraper.rb +308 -289
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +145 -154
- data/lib/nhkore/cli/search_cmd.rb +110 -120
- data/lib/nhkore/cli/sift_cmd.rb +111 -227
- data/lib/nhkore/datetime_parser.rb +328 -0
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +6 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +61 -66
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +85 -78
- data/lib/nhkore/search_scraper.rb +89 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -101
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +58 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +53 -30
data/lib/nhkore/lib.rb
CHANGED
@@ -1,29 +1,18 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
24
12
|
require 'nhkore/article'
|
25
13
|
require 'nhkore/article_scraper'
|
26
14
|
require 'nhkore/cleaner'
|
15
|
+
require 'nhkore/datetime_parser'
|
27
16
|
require 'nhkore/defn'
|
28
17
|
require 'nhkore/dict'
|
29
18
|
require 'nhkore/dict_scraper'
|
@@ -49,8 +38,8 @@ module NHKore
|
|
49
38
|
###
|
50
39
|
# Include this file to only require the files needed to use this
|
51
40
|
# Gem as a library (i.e., don't include CLI-related files).
|
52
|
-
#
|
53
|
-
# @author Jonathan Bradley Whited
|
41
|
+
#
|
42
|
+
# @author Jonathan Bradley Whited
|
54
43
|
# @since 0.3.2
|
55
44
|
###
|
56
45
|
module Lib
|
data/lib/nhkore/missingno.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,20 +14,20 @@ require 'nhkore/util'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Missingno
|
33
21
|
attr_reader :kanas
|
34
22
|
attr_reader :kanjis
|
35
|
-
|
23
|
+
|
36
24
|
# @param data [News,Article,Array<Word>]
|
37
25
|
def initialize(data)
|
38
26
|
super()
|
39
|
-
|
27
|
+
|
40
28
|
@kanas = {}
|
41
29
|
@kanjis = {}
|
42
|
-
|
30
|
+
|
43
31
|
# News?
|
44
32
|
if data.respond_to?(:articles)
|
45
33
|
add_news(data)
|
@@ -50,43 +38,43 @@ module NHKore
|
|
50
38
|
add_words(data)
|
51
39
|
end
|
52
40
|
end
|
53
|
-
|
41
|
+
|
54
42
|
def add_article(article)
|
55
|
-
add_words(article.words.values
|
43
|
+
add_words(article.words.values)
|
56
44
|
end
|
57
|
-
|
45
|
+
|
58
46
|
def add_news(news)
|
59
|
-
news.articles.
|
47
|
+
news.articles.each_value do |article|
|
60
48
|
add_article(article)
|
61
49
|
end
|
62
50
|
end
|
63
|
-
|
51
|
+
|
64
52
|
def add_words(words)
|
65
|
-
words.each
|
53
|
+
words.each do |word|
|
66
54
|
# We only want ones that are both filled in because
|
67
55
|
# Word.scrape_ruby_tag() will raise an error if either is empty.
|
68
56
|
next if Util.empty_web_str?(word.kana) || Util.empty_web_str?(word.kanji)
|
69
|
-
|
57
|
+
|
70
58
|
if !kanas.key?(word.kana)
|
71
59
|
kanas[word.kana] = word
|
72
60
|
end
|
73
|
-
|
61
|
+
|
74
62
|
if !kanjis.key?(word.kanji)
|
75
63
|
kanjis[word.kanji] = word
|
76
64
|
end
|
77
65
|
end
|
78
66
|
end
|
79
|
-
|
67
|
+
|
80
68
|
def kana_from_kanji(kanji)
|
81
69
|
word = @kanjis[kanji]
|
82
|
-
|
83
|
-
return word.nil?
|
70
|
+
|
71
|
+
return word.nil? ? nil : word.kana
|
84
72
|
end
|
85
|
-
|
73
|
+
|
86
74
|
def kanji_from_kana(kana)
|
87
75
|
word = @kanas[kana]
|
88
|
-
|
89
|
-
return word.nil?
|
76
|
+
|
77
|
+
return word.nil? ? nil : word.kanji
|
90
78
|
end
|
91
79
|
end
|
92
80
|
end
|
data/lib/nhkore/news.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -29,146 +17,153 @@ require 'nhkore/util'
|
|
29
17
|
|
30
18
|
module NHKore
|
31
19
|
###
|
32
|
-
# @author Jonathan Bradley Whited
|
20
|
+
# @author Jonathan Bradley Whited
|
33
21
|
# @since 0.2.0
|
34
22
|
###
|
35
23
|
class News
|
36
24
|
include Fileable
|
37
|
-
|
25
|
+
|
38
26
|
DEFAULT_DIR = Util::CORE_DIR
|
39
|
-
FAVORED_URL = /https
|
40
|
-
|
27
|
+
FAVORED_URL = /https:/i.freeze
|
28
|
+
|
41
29
|
attr_reader :articles
|
42
30
|
attr_reader :sha256s
|
43
|
-
|
44
|
-
def initialize
|
31
|
+
|
32
|
+
def initialize
|
45
33
|
super()
|
46
|
-
|
34
|
+
|
47
35
|
@articles = {}
|
48
36
|
@sha256s = {}
|
49
37
|
end
|
50
|
-
|
38
|
+
|
51
39
|
def add_article(article,key: nil,overwrite: false)
|
52
|
-
|
53
|
-
|
40
|
+
url = article.url
|
41
|
+
url = url.to_s unless url.nil?
|
42
|
+
|
43
|
+
key = key.nil? ? url : key.to_s
|
44
|
+
|
54
45
|
if !overwrite
|
55
46
|
raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
|
56
47
|
raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
|
57
48
|
end
|
58
|
-
|
49
|
+
|
59
50
|
@articles[key] = article
|
60
|
-
@sha256s[article.sha256] =
|
61
|
-
|
51
|
+
@sha256s[article.sha256] = url
|
52
|
+
|
62
53
|
return self
|
63
54
|
end
|
64
|
-
|
55
|
+
|
65
56
|
def self.build_file(filename)
|
66
57
|
return File.join(DEFAULT_DIR,filename)
|
67
58
|
end
|
68
|
-
|
59
|
+
|
69
60
|
def encode_with(coder)
|
70
61
|
# Order matters.
|
71
62
|
# Don't output @sha256s.
|
72
|
-
|
63
|
+
|
73
64
|
coder[:articles] = @articles
|
74
65
|
end
|
75
|
-
|
66
|
+
|
76
67
|
def self.load_data(data,article_class: Article,file: nil,news_class: News,overwrite: false,**kargs)
|
77
68
|
data = Util.load_yaml(data,file: file)
|
78
|
-
|
69
|
+
|
79
70
|
articles = data[:articles]
|
80
|
-
|
81
|
-
news = news_class.new
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
87
|
-
end
|
71
|
+
|
72
|
+
news = news_class.new
|
73
|
+
|
74
|
+
articles&.each() do |key,hash|
|
75
|
+
key = key.to_s # Change from a symbol
|
76
|
+
news.add_article(article_class.load_data(key,hash),key: key,overwrite: overwrite)
|
88
77
|
end
|
89
|
-
|
78
|
+
|
90
79
|
return news
|
91
80
|
end
|
92
|
-
|
81
|
+
|
93
82
|
def update_article(article,url)
|
83
|
+
url = url.to_s unless url.nil?
|
84
|
+
|
94
85
|
# Favor https.
|
95
|
-
return if article.url =~ FAVORED_URL
|
86
|
+
return if article.url.to_s =~ FAVORED_URL
|
96
87
|
return if url !~ FAVORED_URL
|
97
|
-
|
98
|
-
@articles.delete(article.url)
|
88
|
+
|
89
|
+
@articles.delete(article.url) # Probably no to_s() here
|
99
90
|
@articles[url] = article
|
100
91
|
article.url = url
|
101
92
|
end
|
102
|
-
|
93
|
+
|
103
94
|
def article(key)
|
95
|
+
key = key.to_s unless key.nil?
|
96
|
+
|
104
97
|
return @articles[key]
|
105
98
|
end
|
106
|
-
|
99
|
+
|
107
100
|
def article_with_sha256(sha256)
|
108
101
|
article = nil
|
109
|
-
|
110
|
-
@articles.
|
102
|
+
|
103
|
+
@articles.each_value do |a|
|
111
104
|
if a.sha256 == sha256
|
112
105
|
article = a
|
113
|
-
|
106
|
+
|
114
107
|
break
|
115
108
|
end
|
116
109
|
end
|
117
|
-
|
110
|
+
|
118
111
|
return article
|
119
112
|
end
|
120
|
-
|
113
|
+
|
121
114
|
def article?(key)
|
115
|
+
key = key.to_s unless key.nil?
|
116
|
+
|
122
117
|
return @articles.key?(key)
|
123
118
|
end
|
124
|
-
|
119
|
+
|
125
120
|
def sha256?(sha256)
|
126
121
|
return @sha256s.key?(sha256)
|
127
122
|
end
|
128
|
-
|
129
|
-
def to_s
|
123
|
+
|
124
|
+
def to_s
|
130
125
|
# Put each Word on one line (flow/inline style).
|
131
126
|
return Util.dump_yaml(self,flow_level: 8)
|
132
127
|
end
|
133
128
|
end
|
134
|
-
|
129
|
+
|
135
130
|
###
|
136
|
-
# @author Jonathan Bradley Whited
|
131
|
+
# @author Jonathan Bradley Whited
|
137
132
|
# @since 0.2.0
|
138
133
|
###
|
139
134
|
class FutsuuNews < News
|
140
135
|
DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
|
141
136
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
142
|
-
|
137
|
+
|
143
138
|
def self.load_data(data,**kargs)
|
144
139
|
return News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
|
145
140
|
end
|
146
|
-
|
141
|
+
|
147
142
|
def self.load_file(file=DEFAULT_FILE,**kargs)
|
148
143
|
return News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
|
149
144
|
end
|
150
|
-
|
145
|
+
|
151
146
|
def save_file(file=DEFAULT_FILE,**kargs)
|
152
147
|
super(file,**kargs)
|
153
148
|
end
|
154
149
|
end
|
155
|
-
|
150
|
+
|
156
151
|
###
|
157
|
-
# @author Jonathan Bradley Whited
|
152
|
+
# @author Jonathan Bradley Whited
|
158
153
|
# @since 0.2.0
|
159
154
|
###
|
160
155
|
class YasashiiNews < News
|
161
156
|
DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
|
162
157
|
DEFAULT_FILE = build_file(DEFAULT_FILENAME)
|
163
|
-
|
158
|
+
|
164
159
|
def self.load_data(data,**kargs)
|
165
160
|
return News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
|
166
161
|
end
|
167
|
-
|
162
|
+
|
168
163
|
def self.load_file(file=DEFAULT_FILE,**kargs)
|
169
164
|
return News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
|
170
165
|
end
|
171
|
-
|
166
|
+
|
172
167
|
def save_file(file=DEFAULT_FILE,**kargs)
|
173
168
|
super(file,**kargs)
|
174
169
|
end
|
data/lib/nhkore/polisher.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -26,28 +14,28 @@ require 'nhkore/word'
|
|
26
14
|
|
27
15
|
module NHKore
|
28
16
|
###
|
29
|
-
# @author Jonathan Bradley Whited
|
17
|
+
# @author Jonathan Bradley Whited
|
30
18
|
# @since 0.2.0
|
31
19
|
###
|
32
20
|
class Polisher
|
33
21
|
def begin_polish(str)
|
34
22
|
return str
|
35
23
|
end
|
36
|
-
|
24
|
+
|
37
25
|
def polish(str)
|
38
26
|
str = begin_polish(str)
|
39
27
|
str = end_polish(str)
|
40
|
-
|
28
|
+
|
41
29
|
return str
|
42
30
|
end
|
43
|
-
|
31
|
+
|
44
32
|
def self.polish_any(obj,polishers)
|
45
|
-
return nil if obj.nil?
|
46
|
-
|
33
|
+
return nil if obj.nil?
|
34
|
+
|
47
35
|
polishers = Array(polishers)
|
48
|
-
|
49
|
-
return obj if polishers.empty?
|
50
|
-
|
36
|
+
|
37
|
+
return obj if polishers.empty?
|
38
|
+
|
51
39
|
if obj.is_a?(Word)
|
52
40
|
obj = Word.new(
|
53
41
|
kana: polish_any(obj.kana,polishers),
|
@@ -55,17 +43,17 @@ module NHKore
|
|
55
43
|
word: obj
|
56
44
|
)
|
57
45
|
else # String
|
58
|
-
polishers.each
|
46
|
+
polishers.each do |polisher|
|
59
47
|
obj = polisher.polish(obj)
|
60
48
|
end
|
61
49
|
end
|
62
|
-
|
50
|
+
|
63
51
|
return obj
|
64
52
|
end
|
65
53
|
end
|
66
|
-
|
54
|
+
|
67
55
|
###
|
68
|
-
# @author Jonathan Bradley Whited
|
56
|
+
# @author Jonathan Bradley Whited
|
69
57
|
# @since 0.2.0
|
70
58
|
###
|
71
59
|
class BasicPolisher < Polisher
|
@@ -74,18 +62,18 @@ module NHKore
|
|
74
62
|
# - Yunibaasaru・Sutajio・Japan
|
75
63
|
# Keep numbers next to kanji/kana, else the below kana won't make sense:
|
76
64
|
# - Word { kanji: 20日, kana: はつか }
|
77
|
-
|
65
|
+
|
78
66
|
str = str.gsub(/[^[[:alnum:]]・]/,'')
|
79
|
-
|
67
|
+
|
80
68
|
# Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
|
81
|
-
str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
|
82
|
-
|
69
|
+
str = '' if str.gsub(/[[[:digit:]]・]+/,'').empty?
|
70
|
+
|
83
71
|
return str
|
84
72
|
end
|
85
73
|
end
|
86
|
-
|
74
|
+
|
87
75
|
###
|
88
|
-
# @author Jonathan Bradley Whited
|
76
|
+
# @author Jonathan Bradley Whited
|
89
77
|
# @since 0.2.0
|
90
78
|
###
|
91
79
|
class BestPolisher < BasicPolisher
|