nhkore 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/defn'
25
+ require 'nhkore/util'
26
+
27
+
28
+ module NHKore
29
+ ###
30
+ # @author Jonathan Bradley Whited (@esotericpig)
31
+ # @since 0.2.0
32
+ ###
33
class Entry
  # Separator used when joining the distinct hyouki of an entry.
  HYOUKI_SEP = '・'

  attr_reader :defns
  attr_accessor :id

  # Creates an entry with no definitions and no ID.
  def initialize
    super()

    @defns = []
    @id = nil
  end

  # Builds one numbered line per definition, using each definition's
  # string form.
  #
  # @return [String] the lines joined with newlines; '' when there are none
  def build_defn
    # NOTE(review): the previous comment called the character after the
    # number a Japanese parenthesis, but the literal here is ')' -- confirm
    # the intended character/encoding.
    numbered = @defns.each_with_index.map do |defn,index|
      "#{index + 1})#{defn}"
    end

    numbered.join("\n")
  end

  # Collects the hyouki of every definition, without duplicates and in
  # first-seen order, after removing one trailing separator from each.
  #
  # @return [String] the distinct, non-empty hyouki joined by HYOUKI_SEP
  def build_hyouki
    # Since Ruby v1.9, Hash preserves order.
    # Ruby v2.7 doc for Set still says no guarantee of order, so don't use.
    seen = @defns.each_with_object({}) do |defn,found|
      defn.hyoukis.each do |raw|
        trimmed = raw.chomp(HYOUKI_SEP)

        found[trimmed] = true unless trimmed.empty?
      end
    end

    seen.keys.join(HYOUKI_SEP)
  end

  # Builds an entry from an ID and a list of hashes, passing each hash
  # to Defn.scrape() and keeping the non-nil results.
  #
  # @return [Entry,nil] nil if the cleaned-up ID is empty or if no
  #                     definition could be scraped
  def self.scrape(id,array,missingno: nil,url: nil)
    entry = Entry.new
    entry.id = Util.unspace_web_str(id.to_s).downcase

    return nil if entry.id.empty?

    array.each do |hash|
      defn = Defn.scrape(hash,missingno: missingno,url: url)

      entry.defns << defn unless defn.nil?
    end

    entry.defns.empty? ? nil : entry
  end

  # @return [String] the hyouki line (only if not empty according to
  #                  Util.empty_web_str?()) followed by the numbered
  #                  definitions; '' when there are no definitions
  def to_s
    text = ''.dup

    return text if @defns.empty?

    hyouki = build_hyouki

    text << "#{hyouki}\n" unless Util.empty_web_str?(hyouki)
    text << build_defn

    text
  end
end
104
+ end
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ module NHKore
25
+ ###
26
+ # @author Jonathan Bradley Whited (@esotericpig)
27
+ # @since 0.2.0
28
+ ###
29
# Base class of the NHKore-specific errors declared below.
class Error < ::StandardError
end

# @since 0.2.0
class CLIError < Error
end

# @since 0.2.0
class ParseError < Error
end

# @since 0.2.0
class ScrapeError < Error
end

# @since 0.2.0
class ZipError < Error
end
35
+ end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ module NHKore
25
+ ###
26
+ # @author Jonathan Bradley Whited (@esotericpig)
27
+ # @since 0.2.0
28
+ ###
29
module Fileable
  # Runs when Fileable is included; also extends the includer with
  # ClassMethods so that it gains +load_file+.
  def self.included(mod)
    mod.extend(ClassMethods)
  end

  # Writes this object's +to_s+ to +file+.
  #
  # @param file [String] path handed to File.open()
  # @param mode [String] open mode; defaults to text-mode write
  # @param kargs [Hash] extra options forwarded to File.open()
  def save_file(file,mode: 'wt',**kargs)
    File.open(file,mode: mode,**kargs) { |io| io.write(to_s) }
  end

  module ClassMethods
    # Reads +file+ and hands its contents to +load_data+, which the
    # extended class must provide.
    #
    # Note that +kargs+ is forwarded to both File.read() and +load_data+.
    #
    # @param file [String] path handed to File.read()
    # @param mode [String] open mode; defaults to text-mode UTF-8 read
    #                      with BOM handling
    # @return whatever +load_data+ returns
    def load_file(file,mode: 'rt:BOM|UTF-8',**kargs)
      contents = File.read(file,mode: mode,**kargs)

      load_data(contents,file: file,**kargs)
    end
  end
end
48
+ end
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/util'
25
+
26
+
27
+ module NHKore
28
+ ###
29
+ # @author Jonathan Bradley Whited (@esotericpig)
30
+ # @since 0.2.0
31
+ ###
32
class Missingno
  attr_reader :kanas
  attr_reader :kanjis

  # Indexes the words of +data+ by kana and by kanji.
  #
  # @param data [News,Article,Array<Word>]
  def initialize(data)
    super()

    @kanas = {}
    @kanjis = {}

    if data.respond_to?(:articles)   # News?
      add_news(data)
    elsif data.respond_to?(:words)   # Article?
      add_article(data)
    else
      add_words(data)
    end
  end

  # Adds every word stored in +article.words+.
  def add_article(article)
    add_words(article.words.values)
  end

  # Adds the words of every article stored in +news.articles+.
  def add_news(news)
    news.articles.values.each { |article| add_article(article) }
  end

  # Indexes each word under its kana and its kanji. The first word seen
  # for a given key wins; later ones don't replace it.
  def add_words(words)
    words.each do |word|
      kana = word.kana
      kanji = word.kanji

      # We only want ones that are both filled in because
      # Word.scrape_ruby_tag() will raise an error if either is empty.
      next if Util.empty_web_str?(kana) || Util.empty_web_str?(kanji)

      kanas[kana] = word unless kanas.key?(kana)
      kanjis[kanji] = word unless kanjis.key?(kanji)
    end
  end

  # @return the kana of the word indexed under +kanji+, else nil
  def kana_from_kanji(kanji)
    @kanjis[kanji]&.kana
  end

  # @return the kanji of the word indexed under +kana+, else nil
  def kanji_from_kana(kana)
    @kanas[kana]&.kanji
  end
end
92
+ end
@@ -0,0 +1,176 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/article'
25
+ require 'nhkore/error'
26
+ require 'nhkore/fileable'
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ ###
32
+ # @author Jonathan Bradley Whited (@esotericpig)
33
+ # @since 0.2.0
34
+ ###
35
class News
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR

  # Matches "https:" (case-insensitive) anywhere in a URL.
  FAVORED_URL = /https\:/i

  attr_reader :articles
  attr_reader :sha256s

  # Creates a news collection with no articles.
  def initialize
    super()

    @articles = {}
    @sha256s = {}
  end

  # Stores +article+ under +key+ and records its SHA-256.
  #
  # @param key [Object,nil] defaults to +article.url+
  # @param overwrite [true,false] if false, a duplicate key or SHA-256 is
  #                               an error
  # @raise [ArgumentError] on a duplicate key or SHA-256, unless +overwrite+
  # @return [self]
  def add_article(article,key: nil,overwrite: false)
    key = article.url if key.nil?

    unless overwrite
      raise ArgumentError,"duplicate article[#{key}] in articles" if @articles.key?(key)
      raise ArgumentError,"duplicate sha256[#{article.sha256}] in articles" if @sha256s.key?(article.sha256)
    end

    @articles[key] = article
    @sha256s[article.sha256] = article.url

    self
  end

  # @return [String] +filename+ joined onto DEFAULT_DIR
  def self.build_file(filename)
    File.join(DEFAULT_DIR,filename)
  end

  # Fills +coder+ with the data to serialize.
  # (Presumably the YAML/Psych serialization hook used by
  # Util.dump_yaml() -- confirm.)
  def encode_with(coder)
    # Order matters.
    # Don't output @sha256s.

    coder[:articles] = @articles
  end

  # Builds a news object from YAML data loaded by Util.load_yaml().
  #
  # @param article_class [Class] its +load_data(key,hash)+ builds each article
  # @param file [String,nil] passed on to Util.load_yaml()
  # @param news_class [Class] class of the object to create
  # @param kargs [Hash] accepted but not used here
  # @return an instance of +news_class+
  def self.load_data(data,article_class: Article,file: nil,news_class: News,**kargs)
    data = Util.load_yaml(data,file: file)
    news = news_class.new

    data[:articles]&.each do |key,hash|
      key = key.to_s # Change from a symbol

      news.add_article(article_class.load_data(key,hash),key: key)
    end

    news
  end

  # Re-keys +article+ under +url+, but only when that upgrades it to a
  # favored (https) URL; otherwise does nothing.
  #
  # @return +url+ if the article was updated, else nil
  def update_article(article,url)
    # Favor https.
    # to_s() so that non-String URLs (e.g., URI objects) can be matched.
    return if article.url.to_s =~ FAVORED_URL
    return if url.to_s !~ FAVORED_URL

    @articles.delete(article.url)
    @articles[url] = article
    article.url = url

    # Keep the SHA-256 index in sync with the new URL, so it doesn't
    # keep pointing at the old one.
    @sha256s[article.sha256] = url if @sha256s.key?(article.sha256)

    url
  end

  # @return the article stored under +key+, else nil
  def article(key)
    @articles[key]
  end

  # @return the first stored article whose SHA-256 equals +sha256+, else nil
  def article_with_sha256(sha256)
    @articles.values.find { |article| article.sha256 == sha256 }
  end

  # @return [true,false] whether an article is stored under +key+
  def article?(key)
    @articles.key?(key)
  end

  # @return [true,false] whether +sha256+ has been recorded
  def sha256?(sha256)
    @sha256s.key?(sha256)
  end

  def to_s
    # Put each Word on one line (flow/inline style).
    Util.dump_yaml(self,flow_level: 8)
  end
end
134
+
135
+ ###
136
+ # @author Jonathan Bradley Whited (@esotericpig)
137
+ # @since 0.2.0
138
+ ###
139
# News stored by default in the file 'nhk_news_web_regular.yml'.
class FutsuuNews < News
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as News.load_data(), but creating a FutsuuNews of Articles
    # (unless overridden through +kargs+).
    def load_data(data,**kargs)
      News.load_data(data,article_class: Article,news_class: FutsuuNews,**kargs)
    end

    # Same as News.load_file(), but reading DEFAULT_FILE by default and
    # creating a FutsuuNews of Articles (unless overridden through +kargs+).
    def load_file(file=DEFAULT_FILE,**kargs)
      News.load_file(file,article_class: Article,news_class: FutsuuNews,**kargs)
    end
  end

  # Same as the inherited +save_file+, but writing to DEFAULT_FILE by default.
  def save_file(file=DEFAULT_FILE,**kargs)
    super(file,**kargs)
  end
end
155
+
156
+ ###
157
+ # @author Jonathan Bradley Whited (@esotericpig)
158
+ # @since 0.2.0
159
+ ###
160
# News stored by default in the file 'nhk_news_web_easy.yml'.
class YasashiiNews < News
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)

  class << self
    # Same as News.load_data(), but creating a YasashiiNews of Articles
    # (unless overridden through +kargs+).
    def load_data(data,**kargs)
      News.load_data(data,article_class: Article,news_class: YasashiiNews,**kargs)
    end

    # Same as News.load_file(), but reading DEFAULT_FILE by default and
    # creating a YasashiiNews of Articles (unless overridden through +kargs+).
    def load_file(file=DEFAULT_FILE,**kargs)
      News.load_file(file,article_class: Article,news_class: YasashiiNews,**kargs)
    end
  end

  # Same as the inherited +save_file+, but writing to DEFAULT_FILE by default.
  def save_file(file=DEFAULT_FILE,**kargs)
    super(file,**kargs)
  end
end
176
+ end
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/word'
25
+
26
+
27
+ module NHKore
28
+ ###
29
+ # @author Jonathan Bradley Whited (@esotericpig)
30
+ # @since 0.2.0
31
+ ###
32
class Polisher
  # First polishing step. Returns +str+ unchanged; subclasses may override.
  def begin_polish(str)
    str
  end

  # Last polishing step. Returns +str+ unchanged; subclasses may override.
  #
  # Defined here (like +begin_polish+) so that +polish+ also works on a
  # class that doesn't override this step, instead of raising NoMethodError.
  def end_polish(str)
    str
  end

  # Runs +str+ through +begin_polish+ and then +end_polish+.
  #
  # @return the polished string
  def polish(str)
    end_polish(begin_polish(str))
  end

  # Polishes a Word or a String with each of +polishers+, in order.
  #
  # For a Word, a new Word is built from the polished kana & kanji, with
  # the original passed in as +word:+. Anything else is treated as a String.
  #
  # @param polishers [Polisher,Array<Polisher>,nil] one or more polishers
  # @return nil if +obj+ is nil; +obj+ itself if there are no polishers;
  #         else the polished result
  def self.polish_any(obj,polishers)
    return nil if obj.nil?

    polishers = Array(polishers)

    return obj if polishers.empty?

    if obj.is_a?(Word)
      Word.new(
        kana: polish_any(obj.kana,polishers),
        kanji: polish_any(obj.kanji,polishers),
        word: obj
      )
    else # String
      polishers.reduce(obj) { |str,polisher| polisher.polish(str) }
    end
  end
end
66
+
67
+ ###
68
+ # @author Jonathan Bradley Whited (@esotericpig)
69
+ # @since 0.2.0
70
+ ###
71
class BasicPolisher < Polisher
  # Removes every character that is neither alphanumeric nor a Japanese
  # dot; if only digits and/or dots would remain, returns '' instead.
  #
  # @return [String] the cleaned string, possibly empty
  def end_polish(str)
    # Keep Japanese dots in names:
    # - Yunibaasaru・Sutajio・Japan
    # Keep numbers next to kanji/kana, else the below kana won't make sense:
    # - Word { kanji: 20日, kana: はつか }
    cleaned = str.gsub(/[^[[:alnum:]]・]/,'')

    # Numbers/dots by themselves (without kanji/kana) should be ignored (empty).
    only_numbers_and_dots = cleaned.gsub(/[[[:digit:]]・]+/,'').empty?

    only_numbers_and_dots ? '' : cleaned
  end
end
86
+
87
+ ###
88
+ # @author Jonathan Bradley Whited (@esotericpig)
89
+ # @since 0.2.0
90
+ ###
91
# Adds nothing of its own; behaves exactly like BasicPolisher.
class BestPolisher < BasicPolisher; end
93
+ end
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nokogiri'
25
+ require 'open-uri'
26
+
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ ###
32
+ # @author Jonathan Bradley Whited (@esotericpig)
33
+ # @since 0.2.0
34
+ ###
35
class Scraper
  # Copied from googler (https://github.com/jarun/googler).
  USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

  attr_accessor :is_file
  attr_accessor :str_or_io
  attr_accessor :url

  alias_method :is_file?,:is_file

  # Opens the source to scrape: +str_or_io+ if given, else the local file
  # at +url+ if +is_file+, else the web page at +url+.
  #
  # +max_redirects+ defaults to 3 for safety (infinite-loop attack).
  #
  # All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
  #
  # Pass in +header: {}+ for the default HTTP header fields to be set.
  #
  # After a followed redirect, +url+ is the URL that was redirected to.
  #
  # @param url [String] URL, or a file path if +is_file+
  # @param header [Hash,nil] HTTP header fields; ignored if +is_file+;
  #                          the passed-in Hash is not modified
  # @param max_redirects [Integer,nil] nil or negative is treated as 10000
  # @param max_retries [Integer,nil] retries on SocketError; nil for none
  # @param redirect_rule [nil,:lenient,:strict] +:lenient+ requires a
  #                      redirect to keep the original scheme; +:strict+
  #                      also requires the same Util.domain()
  # @param str_or_io [String,IO,nil] already-fetched content to use as-is
  # @param kargs [Hash] extra options for File.open() or URI.open()
  # @raise [OpenURI::HTTPError] on an HTTP error or a refused redirect
  # @raise [SocketError] if still failing once out of retries
  def initialize(url,header: nil,is_file: false,max_redirects: 3,max_retries: 3,redirect_rule: :strict,str_or_io: nil,**kargs)
    super()

    @is_file = is_file
    @url = url

    if !header.nil? && !is_file
      # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
      # If this isn't enough, look at googler for more header fields to set:
      # - https://github.com/jarun/googler
      # If necessary, can use Faraday, HTTParty, or RestClient gem and
      # pass in to str_or_io.

      # Merge into a new Hash so that the caller's header isn't mutated.
      # header will overwrite duplicate kargs entries.
      kargs.merge!({'User-Agent' => USER_AGENT}.merge(header))
    end

    @str_or_io =
      if !str_or_io.nil?
        str_or_io
      elsif is_file
        # NHK's website tends to always use UTF-8.
        File.open(url,'rt:UTF-8',**kargs)
      else
        open_url(url,kargs,max_redirects,max_retries,redirect_rule)
      end
  end

  # @return the content parsed with Nokogiri::HTML()
  def html_doc
    Nokogiri::HTML(@str_or_io)
  end

  # @return [URI,nil] +relative_url+ joined onto +url+; nil for a file
  def join_url(relative_url)
    # For a file, don't know what to do.
    # It would be unsafe to return something else;
    # for example, it could return a lot of "../../../" to your root dir.
    return nil if @is_file

    URI.join(@url,relative_url)
  end

  private

  # Opens +url+ with URI.open(), following redirects manually so that
  # each one can be counted and validated, and retrying on SocketError.
  # Sets +@url+ to the URL that was finally opened.
  #
  # @return the opened IO
  def open_url(url,options,max_redirects,max_retries,redirect_rule)
    max_redirects = 10000 if max_redirects.nil? || max_redirects < 0

    top_uri = URI(url)

    begin
      # Use URI.open() instead of (Kernel.)open() for safety (code-injection attack).
      io = URI.open(url,redirect: false,**options)
      @url = url

      io
    # Must come before HTTPError, because HTTPRedirect is a subclass of it;
    # else this clause would never be reached.
    rescue OpenURI::HTTPRedirect => redirect
      redirect_uri = redirect.uri

      if (max_redirects -= 1) < 0
        raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
      end

      validate_redirect(redirect,top_uri,redirect_rule)

      url = redirect_uri

      retry
    rescue OpenURI::HTTPError => e
      raise e.exception("HTTP error[#{e}] at URL[#{url}]")
    rescue SocketError
      raise if max_retries.nil? || (max_retries -= 1) < 0

      retry
    end
  end

  # Raises +redirect+ (with an explanatory message) if it breaks
  # +redirect_rule+; any rule other than :lenient/:strict allows everything.
  def validate_redirect(redirect,top_uri,redirect_rule)
    redirect_uri = redirect.uri

    case redirect_rule
    when :lenient,:strict
      if redirect_uri.scheme != top_uri.scheme
        raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " +
          "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
      end

      if redirect_rule == :strict
        redirect_domain = Util.domain(redirect_uri.host)
        top_domain = Util.domain(top_uri.host)

        if redirect_domain != top_domain
          raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " +
            "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
        end
      end
    end
  end
end
137
+ end