nhkore 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'time'
25
+
26
+ require 'nhkore/fileable'
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ ###
32
+ # @author Jonathan Bradley Whited (@esotericpig)
33
+ # @since 0.2.0
34
+ ###
35
# One search-result link, plus whatever metadata is known about it so far.
class SearchLink
  attr_accessor :datetime
  attr_accessor :futsuurl
  attr_accessor :scraped
  attr_accessor :sha256
  attr_accessor :title
  attr_accessor :url

  alias_method :scraped?,:scraped

  # @param url     the link's URL, stored as given
  # @param scraped whether the page behind +url+ has already been scraped
  def initialize(url,scraped: false)
    super()

    @datetime = nil
    @futsuurl = nil
    @scraped = scraped
    # This used to be +@sha256 = sha256+, which merely read the unset
    # instance variable back through its own reader; start at nil explicitly.
    @sha256 = nil
    @title = nil
    @url = url
  end

  # Stores this link's fields into +coder+ one key at a time.
  #
  # The assignment order is deliberate (it is the order the keys are
  # inserted), so do not re-sort these lines.
  #
  # @param coder object supporting +[]=+ with Symbol keys
  def encode_with(coder)
    # Order matters.

    coder[:url] = @url
    coder[:scraped] = @scraped
    coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
    coder[:title] = @title
    coder[:futsuurl] = @futsuurl
    coder[:sha256] = @sha256
  end

  # Rebuilds a link from a Hash with the same Symbol keys that
  # {#encode_with} writes.
  #
  # @param key  accepted for the caller's convenience; not used here
  # @param hash Hash with +:url+, +:scraped+, +:datetime+, +:title+,
  #             +:futsuurl+ and +:sha256+ entries
  # @return [SearchLink] the populated link
  def self.load_data(key,hash)
    datetime = hash[:datetime]

    slink = SearchLink.new(
      hash[:url],
      scraped: hash[:scraped]
    )

    # A blank datetime stays nil; anything else must parse as ISO 8601.
    slink.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
    slink.futsuurl = hash[:futsuurl]
    slink.sha256 = hash[:sha256]
    slink.title = hash[:title]

    return slink
  end

  # Copies fields from +article+ into this link, but only into fields that
  # are still blank, and marks the link as scraped.
  #
  # @param article object responding to +datetime+, +futsuurl+, +sha256+
  #                and +title+
  def update_from_article(article)
    # Don't update the url, as it may be different (e.g., http vs https).

    @datetime = article.datetime if @datetime.nil?()
    @futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
    @scraped = true # If we have an article, it's been scraped
    @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
    @title = article.title if Util.empty_web_str?(@title)
  end

  # @param mini when true, only the URL and the scraped flag are shown on a
  #             single line; otherwise every field gets its own line
  # @return [String] human-readable summary of this link
  def to_s(mini: false)
    s = ''.dup()

    s << "'#{@url}': "

    if mini
      s << "{ scraped? #{@scraped ? 'yes' : 'NO'} }"
    else
      s << "\n scraped? #{@scraped ? 'yes' : 'NO'}"
      s << "\n datetime: '#{@datetime}'"
      s << "\n title: '#{@title}'"
      s << "\n futsuurl: '#{@futsuurl}'"
      s << "\n sha256: '#{@sha256}'"
    end

    return s
  end
end
111
+
112
+ ###
113
+ # @author Jonathan Bradley Whited (@esotericpig)
114
+ # @since 0.2.0
115
+ ###
116
# URL-keyed collection of SearchLink objects.
class SearchLinks
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR

  DEFAULT_BING_FUTSUU_FILENAME = 'bing_nhk_news_web_regular.yml'
  DEFAULT_BING_YASASHII_FILENAME = 'bing_nhk_news_web_easy.yml'

  # @param filename name to place under {DEFAULT_DIR}
  # @return [String] +filename+ joined onto {DEFAULT_DIR}
  def self.build_file(filename)
    File.join(DEFAULT_DIR, filename)
  end

  DEFAULT_BING_FUTSUU_FILE = build_file(DEFAULT_BING_FUTSUU_FILENAME)
  DEFAULT_BING_YASASHII_FILE = build_file(DEFAULT_BING_YASASHII_FILENAME)

  attr_reader :links

  def initialize
    super()

    @links = {}
  end

  # Registers +link+ under its URL; a URL that is already present keeps its
  # existing entry.
  #
  # @return [self]
  def add_link(link)
    @links[link.url] = link unless @links.key?(link.url)

    self
  end

  # Iterates the underlying Hash (url => link), returning what Hash#each
  # returns.
  def each(&block)
    @links.each(&block)
  end

  # Stores the links Hash into +coder+ under +:links+.
  def encode_with(coder)
    coder[:links] = @links
  end

  # Builds a collection from +data+ after passing it through Util.load_yaml.
  #
  # @param data  whatever Util.load_yaml accepts
  # @param file  forwarded to Util.load_yaml
  # @param kargs accepted but not used here
  # @return [SearchLinks]
  def self.load_data(data, file: nil, **kargs)
    parsed = Util.load_yaml(data, file: file)
    raw_links = parsed[:links]
    result = SearchLinks.new

    unless raw_links.nil?
      raw_links.each do |raw_key, link_hash|
        # Keys may arrive as Symbols; the collection is keyed by String.
        url_key = raw_key.to_s
        result.links[url_key] = SearchLink.load_data(url_key, link_hash)
      end
    end

    result
  end

  # Looks a link up by URL. Anything responding to +url+ may be passed
  # instead of the URL itself.
  def [](url)
    lookup = url.respond_to?(:url) ? url.url : url

    @links[lookup]
  end

  # @return [Integer] number of stored links
  def length
    @links.length
  end

  # @return the result of Util.dump_yaml for this collection
  def to_s
    Util.dump_yaml(self)
  end
end
188
+ end
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'uri'
25
+
26
+ require 'nhkore/error'
27
+ require 'nhkore/scraper'
28
+ require 'nhkore/search_link'
29
+ require 'nhkore/util'
30
+
31
+
32
+ module NHKore
33
+ ###
34
+ # @author Jonathan Bradley Whited (@esotericpig)
35
+ # @since 0.2.0
36
+ ###
37
# Common base for search-engine scrapers: holds the site fragments and the
# URL patterns used to recognise NHK article links.
class SearchScraper < Scraper
  DEFAULT_RESULT_COUNT = 100
  FUTSUU_SITE = 'nhk.or.jp/news/html/'
  YASASHII_SITE = 'nhk.or.jp/news/easy/'

  # Example match:
  # - https://www3.nhk.or.jp/news/html/20200220/k10012294001000.html
  FUTSUU_REGEX = /\A[^\.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i

  # Example matches:
  # - https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html
  # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
  YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i

  # Forwards everything to the superclass unchanged; the only difference is
  # that +header+ defaults to an empty Hash.
  #
  # Pass in +header: {}+ to trigger using the default HTTP header fields.
  def initialize(url, header: {}, **kargs)
    super
  end
end
53
+
54
+ ###
55
+ # @author Jonathan Bradley Whited (@esotericpig)
56
+ # @since 0.2.0
57
+ ###
58
# Scrapes a Bing results page for article links matching a site pattern and
# works out which results page to visit next.
class BingScraper < SearchScraper
  attr_reader :regex
  attr_reader :site

  # @param site  +:futsuu+, +:yasashii+, or any other value, which is then
  #              stringified and stripped and used as the site text
  # @param regex pattern a link must match; defaults per +site+
  # @param url   search URL; built with {build_url} when nil
  # @param kargs +:count+ is consumed by {build_url}; the rest goes to the
  #              superclass
  # @raise [ArgumentError] if the regex is nil or the site is empty
  def initialize(site, regex: nil, url: nil, **kargs)
    case site
    when :futsuu
      site = FUTSUU_SITE
      regex = FUTSUU_REGEX if regex.nil?
    when :yasashii
      site = YASASHII_SITE
      regex = YASASHII_REGEX if regex.nil?
    else
      site = Util.strip_web_str(site.to_s)
      regex = /#{Regexp.quote(site)}/ if regex.nil?
    end

    raise ArgumentError, "empty regex[#{regex}]" if regex.nil?
    raise ArgumentError, "empty site[#{site}]" if site.empty?

    @regex = regex
    @site = site

    url = self.class.build_url(site, **kargs) if url.nil?

    # Delete class-specific args (don't pass to Open-URI).
    kargs.delete(:count)

    super(url, **kargs)
  end

  # @param site  text placed after +site:+ in the query
  # @param count value of the +count+ query parameter
  # @param kargs accepted but not used here
  # @return [String] a new, unfrozen Bing search URL
  def self.build_url(site, count: DEFAULT_RESULT_COUNT, **kargs)
    query = URI.encode_www_form(q: "site:#{site}", count: count)

    "https://www.bing.com/search?#{query}"
  end

  # Walks every anchor of the fetched document. Hrefs matching {#regex} are
  # added to +links+; hrefs carrying a +first=N+ parameter are candidates
  # for the next results page.
  #
  # @param links collection responding to +add_link+
  # @param page  the page just requested; only offsets greater than its
  #              +count+ are considered
  # @return [NextPage] the candidate with the smallest offset beyond
  #         +page.count+, or an empty NextPage if none was found
  def scrape(links, page=NextPage.new)
    doc = html_doc
    next_page = NextPage.new

    # With no anchors the loop simply does nothing and the empty
    # +next_page+ is returned.
    doc.css('a').each do |anchor|
      href = Util.unspace_web_str(anchor['href'].to_s).downcase

      next if href.empty?
      next if href =~ /\/about\.html?/ # https://www3.nhk.or.jp/news/easy/about.html
      next if href =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=...
      next if href =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=...

      first_param = href.match(/first\=(\d+)/)

      if first_param
        offset = first_param[1].to_i
        beyond_current = offset > page.count
        nearest_so_far = next_page.count < 0 || offset < next_page.count

        if beyond_current && nearest_so_far
          next_page.count = offset
          next_page.url = join_url(href)
        end
      elsif href =~ regex
        links.add_link(SearchLink.new(href))
      end
    end

    next_page
  end
end
132
+
133
+ ###
134
+ # @author Jonathan Bradley Whited (@esotericpig)
135
+ # @since 0.2.0
136
+ ###
137
# Pointer to a further page of search results: a result offset and its URL.
class NextPage
  attr_accessor :count, :url

  # Starts out empty: an offset of -1 and no URL.
  def initialize
    super()

    @count = -1
    @url = nil
  end

  # @return [Boolean] false only when a URL is set and the offset is
  #         non-negative
  def empty?
    !(!@url.nil? && @count >= 0)
  end
end
152
+ end
@@ -0,0 +1,339 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'csv'
25
+
26
+ require 'nhkore/article'
27
+ require 'nhkore/fileable'
28
+ require 'nhkore/util'
29
+
30
+
31
+ module NHKore
32
+ ###
33
+ # @author Jonathan Bradley Whited (@esotericpig)
34
+ # @since 0.2.0
35
+ ###
36
# Merges the words of a set of articles, optionally filtered, into one sorted
# list and renders that list as CSV, HTML or YAML.
class Sifter
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR

  DEFAULT_FUTSUU_FILENAME = 'sift_nhk_news_web_regular'
  DEFAULT_YASASHII_FILENAME = 'sift_nhk_news_web_easy'

  # @return [String] +filename+ joined onto {DEFAULT_DIR}
  def self.build_file(filename)
    File.join(DEFAULT_DIR, filename)
  end

  DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
  DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)

  attr_accessor :articles
  attr_accessor :caption
  attr_accessor :filters
  attr_accessor :ignores
  attr_accessor :output

  # @param news object whose +articles+ is a Hash; a copy of its values
  #             Array is what gets sifted
  def initialize(news)
    @articles = news.articles.values.dup
    @caption = nil
    @filters = {}
    @ignores = {}
    @output = nil
  end

  # Decides whether +article+ should be left out.
  #
  # @return [Boolean] true when any configured filter rejects the article;
  #         false when it passes all of them or no filter is configured
  def filter?(article)
    return false if @filters.empty?

    date_range = @filters[:datetime]
    title_rule = @filters[:title]
    url_rule = @filters[:url]

    unless date_range.nil?
      stamp = article.datetime

      # An article without a datetime can never be inside the range.
      return true if stamp.nil?
      return true if stamp < date_range[:from]
      return true if stamp > date_range[:to]
    end

    return true if !title_rule.nil? && !text_passes?(article.title, title_rule)
    return true if !url_rule.nil? && !text_passes?(article.url, url_rule)

    false
  end

  # Configures the datetime filter.
  #
  # @param datetime_filter optional pair; when given, its elements 0 and 1
  #                        replace +from_filter+ and +to_filter+
  # @param from_filter     lower bound; defaults to +to_filter+ when nil
  # @param to_filter       upper bound; defaults to +from_filter+ when nil
  # @return [self] unchanged if no bound was supplied at all
  def filter_by_datetime(datetime_filter=nil, from_filter: nil, to_filter: nil)
    unless datetime_filter.nil?
      # If out-of-bounds, just nil.
      from_filter = datetime_filter[0]
      to_filter = datetime_filter[1]
    end

    # A single bound means "exactly that value" for both ends.
    from_filter = to_filter if from_filter.nil?
    to_filter = from_filter if to_filter.nil?

    from_filter = Util.jst_time(from_filter) unless from_filter.nil?
    to_filter = Util.jst_time(to_filter) unless to_filter.nil?

    return self if [from_filter, to_filter].flatten.compact.empty?

    @filters[:datetime] = {from: from_filter, to: to_filter}

    self
  end

  # Keeps only articles whose title contains +title_filter+.
  #
  # @param uncase  downcase both the filter and the titles before comparing
  # @param unspace run both through Util.unspace_web_str before comparing
  # @return [self]
  def filter_by_title(title_filter, uncase: true, unspace: true)
    @filters[:title] = text_rule(title_filter, uncase: uncase, unspace: unspace)

    self
  end

  # Keeps only articles whose URL contains +url_filter+.
  #
  # @param uncase  downcase both the filter and the URLs before comparing
  # @param unspace run both through Util.unspace_web_str before comparing
  # @return [self]
  def filter_by_url(url_filter, uncase: true, unspace: true)
    @filters[:url] = text_rule(url_filter, uncase: uncase, unspace: unspace)

    self
  end

  # Marks column +key+ to be left out of (or blanked in) the output.
  #
  # @return [self]
  def ignore(key)
    @ignores[key] = true

    self
  end

  # Renders the sifted words as CSV into {#output}.
  #
  # This does not output {caption}.
  #
  # @return [String] the CSV text
  def put_csv!
    words = sift
    columns = shown_columns

    @output = CSV.generate(headers: :first_row, write_headers: true) do |csv|
      csv << columns.map { |_key, label| label }

      words.each do |word|
        csv << columns.map { |key, _label| word.public_send(key) }
      end
    end

    @output
  end

  # Renders the sifted words as a standalone HTML page into {#output}.
  #
  # NOTE(review): {caption} is interpolated as-is, while cell values go
  # through Util.escape_html -- confirm callers pass HTML-safe captions.
  #
  # @return [String] the HTML text
  def put_html!
    words = sift
    columns = shown_columns

    @output = ''.dup

    @output << <<~EOH
      <!DOCTYPE html>
      <html lang="ja">
      <head>
      <meta charset="utf-8">
      <title>NHKore</title>
      <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Noto+Serif+JP&amp;display=fallback">
      <style>
      body {
      background-color: #FCFBF9;
      color: #333333;
      font-family: 'Noto Serif JP',Verdana,sans-serif;
      }
      h1 {
      color: #737373;
      }
      table {
      border-collapse: collapse;
      table-layout: fixed;
      width: 100%;
      }
      tr:nth-child(even) {
      background-color: #A5C7ED;
      }
      tr:hover {
      background-color: #FFDDCA;
      }
      td,th {
      border: 1px solid #333333;
      padding: 8px;
      text-align: left;
      }
      th {
      background-color: #082A8E;
      color: #FCFBF9;
      }
      td {
      vertical-align: top;
      }
      td:nth-child(1) {
      padding-right: 1em;
      text-align: right;
      }
      </style>
      </head>
      <body>
      <h1>NHKore</h1>
      <h2>#{@caption}</h2>
      <table>
    EOH
    #" # Fix for editor

    # If have too few or too many '<col>', invalid HTML.
    col_tags = {
      freq: %Q{<col style="width:6em;">\n},
      word: %Q{<col style="width:17em;">\n},
      kana: %Q{<col style="width:17em;">\n},
      eng: %Q{<col style="width:5em;">\n},
      defn: "<col>\n", # No width for defn, fills rest of page
    }

    columns.each { |key, _label| @output << col_tags[key] }

    @output << '<tr>'
    columns.each { |_key, label| @output << "<th>#{label}</th>" }
    @output << "</tr>\n"

    words.each do |word|
      @output << '<tr>'

      columns.each do |key, _label|
        @output << "<td>#{Util.escape_html(word.public_send(key).to_s)}</td>"
      end

      @output << "</tr>\n"
    end

    @output << <<~EOH
      </table>
      </body>
      </html>
    EOH
    #/ # Fix for editor

    @output
  end

  # Renders the caption and the sifted words as YAML into {#output}.
  #
  # Ignored columns are blanked on the word objects themselves rather than
  # removed; only defn, eng and freq can be blanked this way.
  #
  # @return the result of Util.dump_yaml
  def put_yaml!
    words = sift

    # Just blank out ignores.
    # word/kanji/kana do not have setters/mutators.
    blanked = %i[defn eng freq].select { |key| @ignores[key] }

    words.each do |word|
      blanked.each { |key| word.public_send("#{key}=", nil) }
    end

    # Put each Word on one line (flow/inline style).
    @output = Util.dump_yaml({caption: @caption, words: words}, flow_level: 4)

    @output
  end

  # Collects the words of every article that passes {#filter?} into one
  # Article (adding with +use_freq: true+) and returns them sorted.
  #
  # @return [Array] words ordered by freq DESC, then has-definition, word
  #         ASC, has-kana, kana ASC, definition length DESC, definition ASC
  def sift
    master_article = Article.new

    @articles.each do |article|
      next if filter?(article)

      article.words.values.each do |word|
        master_article.add_word(word, use_freq: true)
      end
    end

    master_article.words.values.sort { |word1, word2| word_order(word1, word2) }
  end

  # Ranks a non-blank string ahead of a blank one.
  #
  # @return [Integer] -1 if only +str1+ is non-blank, 1 if only +str2+ is,
  #         0 when both or neither are (further comparison needed)
  def compare_empty_str(str1, str2)
    has_str1 = !Util.empty_web_str?(str1)
    has_str2 = !Util.empty_web_str?(str2)

    return 0 if has_str1 == has_str2

    has_str1 ? -1 : 1
  end

  # @return [String] the last rendered output, or "" if none yet
  def to_s
    @output.to_s
  end

  private

  # Column keys paired with their header labels, in display order, minus
  # the ignored ones.
  def shown_columns
    [
      [:freq, 'Frequency'],
      [:word, 'Word'],
      [:kana, 'Kana'],
      [:eng, 'English'],
      [:defn, 'Definition'],
    ].reject { |key, _label| @ignores[key] }
  end

  # Normalizes +text+ per the flags and packs it into a filter rule Hash.
  def text_rule(text, uncase:, unspace:)
    text = Util.unspace_web_str(text) if unspace
    text = text.downcase if uncase

    {filter: text, uncase: uncase, unspace: unspace}
  end

  # True when +value+, normalized the same way as the rule's filter text,
  # contains that filter text.
  def text_passes?(value, rule)
    text = value.to_s
    text = Util.unspace_web_str(text) if rule[:unspace]
    text = text.downcase if rule[:uncase]

    text.include?(rule[:filter])
  end

  # Sort comparator for {#sift}; stops at the first criterion that does not
  # compare as equal.
  def word_order(word1, word2)
    # Order by freq DESC (most frequent words to top).
    order = (word2.freq <=> word1.freq)
    return order unless order == 0

    # Favor words that have definitions.
    order = compare_empty_str(word1.defn, word2.defn)
    return order unless order == 0

    order = (word1.word.to_s <=> word2.word.to_s)
    return order unless order == 0

    # Favor words that have kana.
    order = compare_empty_str(word1.kana, word2.kana)
    return order unless order == 0

    order = (word1.kana.to_s <=> word2.kana.to_s)
    return order unless order == 0

    # Favor longer definitions.
    order = (word2.defn.to_s.length <=> word1.defn.to_s.length)
    return order unless order == 0

    word1.defn.to_s <=> word2.defn.to_s
  end
end
339
+ end