nhkore 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/defn'
25
+ require 'nhkore/util'
26
+
27
+
28
+ module NHKore
29
+ ###
30
+ # @author Jonathan Bradley Whited (@esotericpig)
31
+ # @since 0.2.0
32
+ ###
33
class Entry
  # Separator written between the spellings joined by {#build_hyouki}.
  HYOUKI_SEP = '・'

  attr_reader :defns
  attr_accessor :id

  # Starts out with no ID and an empty list of definitions.
  def initialize()
    super()

    @id = nil
    @defns = []
  end

  # Numbers every definition (starting at 1) and puts each on its own line.
  #
  # @return [String] the numbered definitions joined by newlines;
  #                  empty if there are no definitions
  def build_defn()
    # NOTE(review): the original comment calls the separator a "Japanese
    #               parenthesis", but the literal in this source is ')' —
    #               confirm the intended character.
    numbered = @defns.each_with_index.map do |defn, index|
      "#{index + 1})#{defn}"
    end

    numbered.join("\n")
  end

  # Collects the spellings (+hyoukis+) of all definitions, without duplicates
  # and in first-seen order, and joins them with {HYOUKI_SEP}.
  #
  # @return [String] the joined spellings; empty if there are none
  def build_hyouki()
    # A Hash is used as an ordered set (Hash keeps insertion order).
    seen = @defns.each_with_object({}) do |defn, set|
      defn.hyoukis.each do |raw|
        spelling = raw.chomp(HYOUKI_SEP)

        set[spelling] = true unless spelling.empty?
      end
    end

    seen.keys.join(HYOUKI_SEP)
  end

  # Builds an entry from an ID and a list of raw definition hashes.
  #
  # @param id [#to_s] cleaned with +Util.unspace_web_str+ and downcased
  # @param array [#each] each element is handed to +Defn.scrape+
  # @param missingno passed through to +Defn.scrape+
  # @param url passed through to +Defn.scrape+
  # @return [Entry,nil] nil if the cleaned ID is empty or no definition
  #                     could be scraped
  def self.scrape(id, array, missingno: nil, url: nil)
    entry = Entry.new()
    entry.id = Util.unspace_web_str(id.to_s).downcase

    return nil if entry.id.empty?

    scraped = array.map do |hash|
      Defn.scrape(hash, missingno: missingno, url: url)
    end

    entry.defns.concat(scraped.compact)

    entry.defns.empty? ? nil : entry
  end

  # @return [String] the spellings line (when not blank) followed by the
  #                  numbered definitions; empty if there are no definitions
  def to_s()
    text = +''

    return text if @defns.empty?

    spellings = build_hyouki

    text << "#{spellings}\n" unless Util.empty_web_str?(spellings)
    text << build_defn

    text
  end
end
104
+ end
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ module NHKore
25
+ ###
26
+ # @author Jonathan Bradley Whited (@esotericpig)
27
+ # @since 0.2.0
28
+ ###
29
# Root of every error class declared below; a StandardError, so a bare
# +rescue+ catches it.
class Error < ::StandardError
end

# @since 0.2.0
class CLIError < Error
end

# @since 0.2.0
class ParseError < Error
end

# @since 0.2.0
class ScrapeError < Error
end

# @since 0.2.0
class ZipError < Error
end
35
+ end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ module NHKore
25
+ ###
26
+ # @author Jonathan Bradley Whited (@esotericpig)
27
+ # @since 0.2.0
28
+ ###
29
# Mixin that adds file persistence: an instance-level {#save_file} that writes
# +to_s+, and a class-level +load_file+ that reads a file and hands the text
# to the including class's +load_data+.
module Fileable
  # Hook run on +include+; also gives the including class the class-level
  # methods from {ClassMethods}.
  def self.included(mod)
    mod.extend ClassMethods
  end

  # Writes +to_s+ to +file+.
  #
  # @param file [String] path of the file to write
  # @param mode [String] open mode; defaults to text-mode write
  # @param kargs extra options forwarded to +File.open+
  # @return whatever the +File.open+ block returns (the result of +write+)
  def save_file(file,mode: 'wt',**kargs)
    # The block parameter used to be named +file+ too, shadowing the method
    # parameter; it has its own name now.
    File.open(file,mode: mode,**kargs) do |io|
      io.write(to_s)
    end
  end

  # Class-level methods added to any class that includes {Fileable}.
  module ClassMethods
    # Reads +file+ and passes its contents to +load_data+, which the
    # including class must provide.
    #
    # @param file [String] path of the file to read
    # @param mode [String] open mode; defaults to text-mode read as UTF-8
    #                      with BOM handling
    # @param kargs forwarded to BOTH +File.read+ and +load_data+
    # @return whatever +load_data+ returns
    def load_file(file,mode: 'rt:BOM|UTF-8',**kargs)
      data = File.read(file,mode: mode,**kargs)

      return load_data(data,file: file,**kargs)
    end
  end
end
48
+ end
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/util'
25
+
26
+
27
+ module NHKore
28
+ ###
29
+ # @author Jonathan Bradley Whited (@esotericpig)
30
+ # @since 0.2.0
31
+ ###
32
class Missingno
  attr_reader :kanas
  attr_reader :kanjis

  # Indexes the words found in +data+ by kana and by kanji.
  #
  # @param data [News,Article,Array<Word>] duck-typed: anything responding to
  #             +articles+ is treated as News, anything responding to +words+
  #             as an Article, and everything else as a list of words
  def initialize(data)
    super()

    @kanas = {}
    @kanjis = {}

    if data.respond_to?(:articles)
      add_news(data)
    elsif data.respond_to?(:words)
      add_article(data)
    else
      add_words(data)
    end
  end

  # Indexes every word of one article (+article.words+ must respond to +values+).
  def add_article(article)
    add_words(article.words.values)
  end

  # Indexes every word of every article in +news+.
  def add_news(news)
    news.articles.values.each { |article| add_article(article) }
  end

  # Indexes each word under its kana and under its kanji.
  # The first word seen for a given kana/kanji wins; later ones are ignored.
  def add_words(words)
    words.each do |word|
      # Only words with both readings filled in are useful, because
      # Word.scrape_ruby_tag() will raise an error if either is empty.
      next if Util.empty_web_str?(word.kana) || Util.empty_web_str?(word.kanji)

      kanas[word.kana] = word unless kanas.key?(word.kana)
      kanjis[word.kanji] = word unless kanjis.key?(word.kanji)
    end
  end

  # @return the kana of the word indexed under +kanji+, or nil if unknown
  def kana_from_kanji(kanji)
    @kanjis[kanji]&.kana
  end

  # @return the kanji of the word indexed under +kana+, or nil if unknown
  def kanji_from_kana(kana)
    @kanas[kana]&.kanji
  end
end
92
+ end
@@ -0,0 +1,176 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/article'
25
+ require 'nhkore/error'
26
+ require 'nhkore/fileable'
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ ###
32
+ # @author Jonathan Bradley Whited (@esotericpig)
33
+ # @since 0.2.0
34
+ ###
35
# A collection of articles keyed by URL (or a caller-supplied key), plus an
# index of the articles' SHA-256 values. Can be saved to and loaded from a
# file through {Fileable}.
class News
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR
  FAVORED_URL = /https\:/i

  attr_reader :articles
  attr_reader :sha256s

  def initialize()
    super()

    @articles = {} # key (URL by default) => article
    @sha256s = {}  # article.sha256 => article URL
  end

  # Stores +article+ and records its SHA-256.
  #
  # @param article must respond to +url+ and +sha256+
  # @param key [Object,nil] key to store under; defaults to +article.url+
  # @param overwrite [true,false] if false, a duplicate key or SHA-256 raises
  # @return [self]
  # @raise [ArgumentError] on a duplicate key or SHA-256 when +overwrite+ is false
  def add_article(article,key: nil,overwrite: false)
    key = article.url if key.nil?

    unless overwrite
      raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
      raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
    end

    @articles[key] = article
    @sha256s[article.sha256] = article.url

    return self
  end

  # @return [String] +filename+ joined onto {DEFAULT_DIR}
  def self.build_file(filename)
    return File.join(DEFAULT_DIR,filename)
  end

  # Serialization hook: only the articles are written out.
  def encode_with(coder)
    # Order matters.
    # Don't output @sha256s.

    coder[:articles] = @articles
  end

  # Builds a news object from serialized text.
  #
  # @param data text handed to +Util.load_yaml+
  # @param article_class class whose +load_data(key,hash)+ builds each article
  # @param file passed to +Util.load_yaml+
  # @param news_class class to instantiate for the result
  # @param kargs accepted but not used here
  # @return an instance of +news_class+ holding the loaded articles
  def self.load_data(data,article_class: Article,file: nil,news_class: News,**kargs)
    data = Util.load_yaml(data,file: file)

    articles = data[:articles]

    news = news_class.new

    unless articles.nil?
      articles.each do |key,hash|
        key = key.to_s # Change from a symbol
        news.add_article(article_class.load_data(key,hash),key: key)
      end
    end

    return news
  end

  # Re-keys +article+ under +url+ when that upgrades it to a favored (https)
  # URL. Does nothing if the article's URL is already favored or +url+ is not.
  #
  # @param article must respond to +url+, +url=+ and +sha256+
  # @param url the candidate replacement URL
  def update_article(article,url)
    # Favor https.
    return if article.url =~ FAVORED_URL
    return if url !~ FAVORED_URL

    @articles.delete(article.url)
    @articles[url] = article

    # Keep the SHA-256 index in step with the new URL; it used to keep
    # pointing at the URL that was just removed from @articles.
    @sha256s[article.sha256] = url

    article.url = url
  end

  # @return the article stored under +key+, or nil
  def article(key)
    return @articles[key]
  end

  # @return the first stored article whose +sha256+ equals +sha256+, or nil
  def article_with_sha256(sha256)
    return @articles.values.find { |a| a.sha256 == sha256 }
  end

  # @return [true,false] whether an article is stored under +key+
  def article?(key)
    return @articles.key?(key)
  end

  # @return [true,false] whether +sha256+ has been recorded
  def sha256?(sha256)
    return @sha256s.key?(sha256)
  end

  def to_s()
    # Put each Word on one line (flow/inline style).
    return Util.dump_yaml(self,flow_level: 8)
  end
end
134
+
135
+ ###
136
+ # @author Jonathan Bradley Whited (@esotericpig)
137
+ # @since 0.2.0
138
+ ###
139
# News stored by default in the "regular" file.
class FutsuuNews < News
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as +News.load_data+, but builds a FutsuuNews filled with Articles.
    def load_data(data, **kargs)
      News.load_data(data, article_class: Article, news_class: FutsuuNews, **kargs)
    end

    # Same as +News.load_file+, but reads {DEFAULT_FILE} unless told otherwise
    # and builds a FutsuuNews filled with Articles.
    def load_file(file = DEFAULT_FILE, **kargs)
      News.load_file(file, article_class: Article, news_class: FutsuuNews, **kargs)
    end
  end

  # Saves to {DEFAULT_FILE} unless another file is given.
  def save_file(file = DEFAULT_FILE, **kargs)
    super(file, **kargs)
  end
end
155
+
156
+ ###
157
+ # @author Jonathan Bradley Whited (@esotericpig)
158
+ # @since 0.2.0
159
+ ###
160
# News stored by default in the "easy" file.
class YasashiiNews < News
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as +News.load_data+, but builds a YasashiiNews filled with Articles.
    def load_data(data, **kargs)
      News.load_data(data, article_class: Article, news_class: YasashiiNews, **kargs)
    end

    # Same as +News.load_file+, but reads {DEFAULT_FILE} unless told otherwise
    # and builds a YasashiiNews filled with Articles.
    def load_file(file = DEFAULT_FILE, **kargs)
      News.load_file(file, article_class: Article, news_class: YasashiiNews, **kargs)
    end
  end

  # Saves to {DEFAULT_FILE} unless another file is given.
  def save_file(file = DEFAULT_FILE, **kargs)
    super(file, **kargs)
  end
end
176
+ end
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/word'
25
+
26
+
27
+ module NHKore
28
+ ###
29
+ # @author Jonathan Bradley Whited (@esotericpig)
30
+ # @since 0.2.0
31
+ ###
32
# Base class for string cleaners. {#polish} runs two hooks in order,
# {#begin_polish} then {#end_polish}; both return the string unchanged here,
# and subclasses override whichever they need.
class Polisher
  # First hook run by {#polish}.
  #
  # @param str [String] the text to clean
  # @return [String] +str+ unchanged
  def begin_polish(str)
    return str
  end

  # Second hook run by {#polish}.
  #
  # The base class used to define only {#begin_polish}, so calling {#polish}
  # on a plain Polisher raised NoMethodError. This pass-through default
  # mirrors {#begin_polish}.
  #
  # @param str [String] the text to clean
  # @return [String] +str+ unchanged
  def end_polish(str)
    return str
  end

  # @param str [String] the text to clean
  # @return [String] +str+ after both hooks have been applied
  def polish(str)
    str = begin_polish(str)
    str = end_polish(str)

    return str
  end

  # Applies one or more polishers to a string or to a Word.
  #
  # @param obj [Word,String,nil] a Word has its +kana+ and +kanji+ polished
  #            and a new Word is built from them (the original is passed
  #            along as +word:+); anything else is treated as a string
  # @param polishers [Polisher,Array<Polisher>,nil] applied in order
  # @return nil if +obj+ is nil; +obj+ itself if there are no polishers;
  #         otherwise the polished Word/string
  def self.polish_any(obj,polishers)
    return nil if obj.nil?

    polishers = Array(polishers)

    return obj if polishers.empty?

    if obj.is_a?(Word)
      return Word.new(
        kana: polish_any(obj.kana,polishers),
        kanji: polish_any(obj.kanji,polishers),
        word: obj
      )
    end

    # String: feed the output of each polisher into the next.
    return polishers.reduce(obj) { |str,polisher| polisher.polish(str) }
  end
end
66
+
67
+ ###
68
+ # @author Jonathan Bradley Whited (@esotericpig)
69
+ # @since 0.2.0
70
+ ###
71
# Polisher that strips everything except alphanumerics and Japanese dots.
class BasicPolisher < Polisher
  # @param str [String] the text to clean
  # @return [String] +str+ without the stripped characters, or an empty
  #                  string if only digits/dots would remain
  def end_polish(str)
    # Keep Japanese dots in names:
    # - Yunibaasaru・Sutajio・Japan
    # Keep numbers next to kanji/kana, else the below kana won't make sense:
    # - Word { kanji: 20日, kana: はつか }
    kept = str.gsub(/[^[[:alnum:]]・]/, '')

    # Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
    leftover = kept.gsub(/[[[:digit:]]・]+/, '')

    return '' if leftover.empty?

    kept
  end
end
86
+
87
+ ###
88
+ # @author Jonathan Bradley Whited (@esotericpig)
89
+ # @since 0.2.0
90
+ ###
91
# Adds nothing of its own; behaves exactly like BasicPolisher.
class BestPolisher < BasicPolisher; end
93
+ end
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nokogiri'
25
+ require 'open-uri'
26
+
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ ###
32
+ # @author Jonathan Bradley Whited (@esotericpig)
33
+ # @since 0.2.0
34
+ ###
35
+ class Scraper
36
+ # Copied from googler (https://github.com/jarun/googler).
37
+ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
38
+
39
+ attr_accessor :is_file
40
+ attr_accessor :str_or_io
41
+ attr_accessor :url
42
+
43
+ alias_method :is_file?,:is_file
44
+
45
+ # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
46
+ #
47
+ # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
48
+ #
49
+ # Pass in +header: {}+ for the default HTTP header fields to be set.
50
+ #
51
+ # @param redirect_rule [nil,:lenient,:strict]
52
+ def initialize(url,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
53
+ super()
54
+
55
+ @is_file = is_file
56
+ @url = url
57
+
58
+ if !header.nil?() && !is_file
59
+ # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
60
+ # If this isn't enough, look at googler for more header fields to set:
61
+ # - https://github.com/jarun/googler
62
+ # If necessary, can use Faraday, HTTParty, or RestClient gem and
63
+ # pass in to str_or_io.
64
+
65
+ header['User-Agent'] = USER_AGENT unless header.key?('User-Agent')
66
+
67
+ kargs.merge!(header) # header will overwrite duplicate kargs entries
68
+ end
69
+
70
+ if str_or_io.nil?()
71
+ if is_file
72
+ # NHK's website tends to always use UTF-8.
73
+ @str_or_io = File.open(url,'rt:UTF-8',**kargs)
74
+ else
75
+ max_redirects = 10000 if max_redirects.nil?() || max_redirects < 0
76
+
77
+ top_uri = URI(url)
78
+ top_domain = Util.domain(top_uri.host)
79
+
80
+ begin
81
+ # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
82
+ @str_or_io = URI.open(url,redirect: false,**kargs)
83
+ @url = url
84
+ rescue OpenURI::HTTPError => e
85
+ raise e.exception("HTTP error[#{e.to_s()}] at URL[#{url}]")
86
+ rescue OpenURI::HTTPRedirect => redirect
87
+ redirect_uri = redirect.uri
88
+
89
+ if (max_redirects -= 1) < 0
90
+ raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
91
+ end
92
+
93
+ case redirect_rule
94
+ when :lenient,:strict
95
+ if redirect_uri.scheme != top_uri.scheme
96
+ raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
97
+ "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
98
+ end
99
+
100
+ if redirect_rule == :strict
101
+ redirect_domain = Util.domain(redirect_uri.host)
102
+
103
+ if redirect_domain != top_domain
104
+ raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
105
+ "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
106
+ end
107
+ end
108
+ end
109
+
110
+ url = redirect_uri
111
+
112
+ retry
113
+ rescue SocketError
114
+ raise if max_retries.nil?() || (max_retries -= 1) < 0
115
+
116
+ retry
117
+ end
118
+ end
119
+ else
120
+ @str_or_io = str_or_io
121
+ end
122
+ end
123
+
124
+ def html_doc()
125
+ return Nokogiri::HTML(@str_or_io)
126
+ end
127
+
128
+ def join_url(relative_url)
129
+ # For a file, don't know what to do.
130
+ # It would be unsafe to return something else;
131
+ # for example, it could return a lot of "../../../" to your root dir.
132
+ return nil if @is_file
133
+
134
+ return URI::join(@url,relative_url)
135
+ end
136
+ end
137
+ end