nhkore 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
@@ -0,0 +1,188 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'time'
|
25
|
+
|
26
|
+
require 'nhkore/fileable'
|
27
|
+
require 'nhkore/util'
|
28
|
+
|
29
|
+
|
30
|
+
module NHKore
|
31
|
+
###
|
32
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
33
|
+
# @since 0.2.0
|
34
|
+
###
|
35
|
+
# One link found by a search scraper, plus the metadata gathered for it.
#
# Every field is a plain read/write attribute:
# - datetime: a time-like object (must respond to +iso8601+) or nil
# - futsuurl: a second URL copied from an article's +futsuurl+, or nil
# - scraped:  whether the link has already been scraped
# - sha256:   a digest copied from an article's +sha256+, or nil
# - title:    the article's title, or nil
# - url:      the link's own URL
class SearchLink
  attr_accessor :datetime
  attr_accessor :futsuurl
  attr_accessor :scraped
  attr_accessor :sha256
  attr_accessor :title
  attr_accessor :url

  alias_method :scraped?,:scraped

  # @param url [String] URL of the link; stored as given
  # @param scraped [Boolean] whether the link has already been scraped
  def initialize(url,scraped: false)
    super()

    @datetime = nil
    @futsuurl = nil
    @scraped = scraped
    # Explicit nil: the +sha256+ reader must not be consulted here,
    # because nothing has been stored in it yet.
    @sha256 = nil
    @title = nil
    @url = url
  end

  # Writes this link's fields into +coder+ (anything supporting +[]=+).
  #
  # The insertion order is deliberate, as it decides the order of the
  # serialized keys. The datetime is stored as an ISO 8601 string, or nil.
  def encode_with(coder)
    # Order matters.

    coder[:url] = @url
    coder[:scraped] = @scraped
    coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
    coder[:title] = @title
    coder[:futsuurl] = @futsuurl
    coder[:sha256] = @sha256
  end

  # Rebuilds a link from a Hash with Symbol keys
  # (:url, :scraped, :datetime, :futsuurl, :sha256, :title).
  #
  # @param key not used by this method; kept so existing callers still work
  # @param hash [Hash] the previously encoded fields
  # @return [SearchLink] the new link
  def self.load_data(key,hash)
    datetime = hash[:datetime]

    slink = SearchLink.new(
      hash[:url],
      scraped: hash[:scraped]
    )

    # Util.empty_web_str? is defined elsewhere; presumably true for nil/blank
    # text, in which case no time is parsed -- TODO confirm.
    slink.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
    slink.futsuurl = hash[:futsuurl]
    slink.sha256 = hash[:sha256]
    slink.title = hash[:title]

    return slink
  end

  # Copies values from +article+ into the fields that are still unset,
  # and marks this link as scraped.
  #
  # @param article an object responding to +datetime+, +futsuurl+,
  #                +sha256+, and +title+
  def update_from_article(article)
    # Don't update the url, as it may be different (e.g., http vs https).

    @datetime = article.datetime if @datetime.nil?()
    @futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
    @scraped = true # If we have an article, it's been scraped
    @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
    @title = article.title if Util.empty_web_str?(@title)
  end

  # @param mini [Boolean] true for a one-line summary (URL and scraped flag);
  #                       false for one field per line
  # @return [String] a new, unfrozen String
  def to_s(mini: false)
    s = ''.dup()

    s << "'#{@url}': "

    if mini
      s << "{ scraped? #{@scraped ? 'yes' : 'NO'} }"
    else
      s << "\n scraped? #{@scraped ? 'yes' : 'NO'}"
      s << "\n datetime: '#{@datetime}'"
      s << "\n title: '#{@title}'"
      s << "\n futsuurl: '#{@futsuurl}'"
      s << "\n sha256: '#{@sha256}'"
    end

    return s
  end
end
|
111
|
+
|
112
|
+
###
|
113
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
114
|
+
# @since 0.2.0
|
115
|
+
###
|
116
|
+
# A collection of SearchLink objects, stored in a Hash keyed by URL.
#
# Mixes in Fileable (defined elsewhere in the project).
class SearchLinks
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR

  DEFAULT_BING_FUTSUU_FILENAME = 'bing_nhk_news_web_regular.yml'
  DEFAULT_BING_YASASHII_FILENAME = 'bing_nhk_news_web_easy.yml'

  # @param filename [String] a bare file name
  # @return [String] +filename+ joined onto DEFAULT_DIR
  def self.build_file(filename)
    File.join(DEFAULT_DIR, filename)
  end

  DEFAULT_BING_FUTSUU_FILE = build_file(DEFAULT_BING_FUTSUU_FILENAME)
  DEFAULT_BING_YASASHII_FILE = build_file(DEFAULT_BING_YASASHII_FILENAME)

  # @return [Hash] the stored links, keyed by URL
  attr_reader :links

  def initialize
    super()

    @links = {}
  end

  # Stores +link+ under its URL, unless that URL is already present
  # (the existing entry is never overwritten).
  #
  # @return [self] to allow chaining
  def add_link(link)
    url = link.url

    @links[url] = link unless @links.key?(url)

    self
  end

  # Iterates over the underlying Hash, yielding what Hash#each yields.
  def each(&block)
    @links.each(&block)
  end

  # Writes the links Hash into +coder+ under :links.
  def encode_with(coder)
    coder[:links] = @links
  end

  # Builds a collection from YAML handled by Util.load_yaml (defined elsewhere).
  #
  # The parsed data is expected to hold a :links entry mapping each URL to the
  # Hash of one link's fields; a missing :links yields an empty collection.
  # Extra keyword arguments are accepted and not used here.
  #
  # @return [SearchLinks] the new collection
  def self.load_data(data, file: nil, **kargs)
    parsed = Util.load_yaml(data, file: file)
    stored = parsed[:links]
    result = SearchLinks.new

    return result if stored.nil?

    stored.each do |key, hash|
      url = key.to_s # Keys may arrive as Symbols
      result.links[url] = SearchLink.load_data(url, hash)
    end

    result
  end

  # Looks a link up by URL.
  #
  # @param url a URL, or any object responding to +url+ (its URL is used)
  # @return the stored link, or nil if there is none
  def [](url)
    key = url.respond_to?(:url) ? url.url : url

    @links[key]
  end

  # @return [Integer] the number of stored links
  def length
    @links.length
  end

  # @return the result of Util.dump_yaml for this collection
  def to_s
    Util.dump_yaml(self)
  end
end
|
188
|
+
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'uri'
|
25
|
+
|
26
|
+
require 'nhkore/error'
|
27
|
+
require 'nhkore/scraper'
|
28
|
+
require 'nhkore/search_link'
|
29
|
+
require 'nhkore/util'
|
30
|
+
|
31
|
+
|
32
|
+
module NHKore
|
33
|
+
###
|
34
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
35
|
+
# @since 0.2.0
|
36
|
+
###
|
37
|
+
# Base class for scrapers of search-engine result pages.
# It holds the site fragments and URL patterns shared by its subclasses.
class SearchScraper < Scraper
  DEFAULT_RESULT_COUNT = 100

  FUTSUU_SITE = 'nhk.or.jp/news/html/'
  YASASHII_SITE = 'nhk.or.jp/news/easy/'

  # Example of a matching URL:
  #   https://www3.nhk.or.jp/news/html/20200220/k10012294001000.html
  FUTSUU_REGEX = /\A[^\.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i

  # Examples of matching URLs:
  #   https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html
  #   https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
  YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i

  # Forwards everything to the superclass, with +header+ defaulting to an
  # empty Hash.
  #
  # Pass in +header: {}+ to trigger using the default HTTP header fields.
  def initialize(url, header: {}, **kargs)
    super(url, header: header, **kargs)
  end
end
|
53
|
+
|
54
|
+
###
|
55
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
56
|
+
# @since 0.2.0
|
57
|
+
###
|
58
|
+
# Scrapes one page of Bing results for links that match a site's URL pattern.
class BingScraper < SearchScraper
  attr_reader :regex, :site

  # @param site either +:futsuu+ or +:yasashii+ (mapped to the matching site
  #             and regex constants), or any other value, which is turned
  #             into a String and cleaned with Util.strip_web_str
  # @param regex [Regexp,nil] pattern a link must match; when nil, the
  #              constant for +site+ is used, or a regex quoting the site text
  # @param url [String,nil] page to scrape; built with build_url when nil
  # @param kargs remaining options, passed on to the superclass without :count
  # @raise [ArgumentError] if the regex is nil or the site is empty
  def initialize(site, regex: nil, url: nil, **kargs)
    preset_regex = nil

    case site
    when :futsuu
      site, preset_regex = FUTSUU_SITE, FUTSUU_REGEX
    when :yasashii
      site, preset_regex = YASASHII_SITE, YASASHII_REGEX
    else
      site = Util.strip_web_str(site.to_s)
    end

    # Only build a fallback pattern when the caller gave none.
    regex = (preset_regex || /#{Regexp.quote(site)}/) if regex.nil?

    raise ArgumentError, "empty regex[#{regex}]" if regex.nil?
    raise ArgumentError, "empty site[#{site}]" if site.empty?

    @regex = regex
    @site = site

    url = self.class.build_url(site, **kargs) if url.nil?

    # Delete class-specific args (don't pass to Open-URI).
    kargs.delete(:count)

    super(url, **kargs)
  end

  # Builds the Bing search URL for a +site:+ query.
  #
  # @param site [String] the site text placed after +site:+
  # @param count [Integer] value of the +count+ query parameter
  # @return [String] a new, unfrozen URL String
  def self.build_url(site, count: DEFAULT_RESULT_COUNT, **kargs)
    url = 'https://www.bing.com/search?'.dup

    url << URI.encode_www_form(q: "site:#{site}", count: count)

    url
  end

  # Collects matching article links from the current page into +links+ and
  # works out which results page comes next.
  #
  # @param links a collection responding to +add_link+
  # @param page [NextPage] the page being scraped; only pagination anchors
  #             whose +first=+ number exceeds its count are considered
  # @return [NextPage] the pagination anchor with the smallest such number;
  #                    left at its initial values when none was found
  def scrape(links, page=NextPage.new())
    doc = html_doc
    next_page = NextPage.new

    anchors = doc.css('a')

    return next_page if anchors.length < 1

    # Pages that live under the news site but are not articles.
    non_articles = [
      /\/about\.html?/,       # https://www3.nhk.or.jp/news/easy/about.html
      /\/movieplayer\.html?/, # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
      /\/audio\.html?/,       # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
    ]

    anchors.each do |anchor|
      href = Util.unspace_web_str(anchor['href'].to_s).downcase

      next if href.empty?
      next if non_articles.any? { |pattern| href =~ pattern }

      paging = href.match(/first\=(\d+)/)

      if paging
        first = paging[1].to_i

        # Must lie beyond the current page...
        next unless first > page.count
        # ...and be nearer than any candidate found so far.
        next unless next_page.count < 0 || first < next_page.count

        next_page.count = first
        next_page.url = join_url(href)
      elsif href =~ regex
        links.add_link(SearchLink.new(href))
      end
    end

    next_page
  end
end
|
132
|
+
|
133
|
+
###
|
134
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
135
|
+
# @since 0.2.0
|
136
|
+
###
|
137
|
+
# Pointer to the next page of search results.
#
# +count+ starts at -1 and +url+ at nil, both meaning "not found".
class NextPage
  attr_accessor :count, :url

  def initialize
    super()

    @count = -1
    @url = nil
  end

  # @return [Boolean] true if there is no URL or the count is negative
  def empty?
    return true if @url.nil?

    @count < 0
  end
end
|
152
|
+
end
|
@@ -0,0 +1,339 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
#--
|
6
|
+
# This file is part of NHKore.
|
7
|
+
# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
|
8
|
+
#
|
9
|
+
# NHKore is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NHKore is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU Lesser General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU Lesser General Public License
|
20
|
+
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
|
24
|
+
require 'csv'
|
25
|
+
|
26
|
+
require 'nhkore/article'
|
27
|
+
require 'nhkore/fileable'
|
28
|
+
require 'nhkore/util'
|
29
|
+
|
30
|
+
|
31
|
+
module NHKore
|
32
|
+
###
|
33
|
+
# @author Jonathan Bradley Whited (@esotericpig)
|
34
|
+
# @since 0.2.0
|
35
|
+
###
|
36
|
+
# Merges the words of a set of articles into one frequency-sorted list and
# renders that list as CSV, HTML, or YAML.
#
# Articles can be excluded with filters (date/time range, title text, URL
# text), and output columns can be dropped with {#ignore}.
#
# Mixes in Fileable (defined elsewhere in the project).
class Sifter
  include Fileable

  DEFAULT_DIR = Util::CORE_DIR

  DEFAULT_FUTSUU_FILENAME = 'sift_nhk_news_web_regular'
  DEFAULT_YASASHII_FILENAME = 'sift_nhk_news_web_easy'

  # @param filename [String] a bare file name
  # @return [String] +filename+ joined onto DEFAULT_DIR
  def self.build_file(filename)
    File.join(DEFAULT_DIR, filename)
  end

  DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
  DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)

  attr_accessor :articles, :caption, :filters, :ignores, :output

  # @param news an object whose +articles+ responds to +values+;
  #             a copy of that list of articles is kept
  def initialize(news)
    @articles = news.articles.values.dup
    @caption = nil
    @filters = {}
    @ignores = {}
    @output = nil
  end

  # Decides whether +article+ should be left out.
  #
  # The date/time, title, and URL filters are checked in that order, and only
  # those that have been set. An article with no datetime is always left out
  # when a date/time filter is set.
  #
  # @return [Boolean] true to skip the article, false to keep it
  def filter?(article)
    return false if @filters.empty?

    span = @filters[:datetime]
    title_rule = @filters[:title]
    url_rule = @filters[:url]

    unless span.nil?
      stamp = article.datetime

      return true if stamp.nil? || stamp < span[:from] || stamp > span[:to]
    end

    return true if !title_rule.nil? && text_filtered?(article.title, title_rule)
    return true if !url_rule.nil? && text_filtered?(article.url, url_rule)

    false
  end

  # Sets the date/time filter.
  #
  # The bounds come from +datetime_filter+ (its elements 0 and 1) when it is
  # given, otherwise from the keyword arguments. A missing bound takes the
  # value of the other, and each bound is passed through Util.jst_time.
  # When neither bound is available, no filter is set.
  #
  # @return [self] to allow chaining
  def filter_by_datetime(datetime_filter=nil, from_filter: nil, to_filter: nil)
    unless datetime_filter.nil?
      # If out-of-bounds, just nil.
      from_filter, to_filter = datetime_filter[0], datetime_filter[1]
    end

    if from_filter.nil?
      from_filter = to_filter
    elsif to_filter.nil?
      to_filter = from_filter
    end

    from_filter = Util.jst_time(from_filter) unless from_filter.nil?
    to_filter = Util.jst_time(to_filter) unless to_filter.nil?

    return self if [from_filter, to_filter].flatten.compact.empty?

    @filters[:datetime] = {from: from_filter, to: to_filter}

    self
  end

  # Sets the title filter: articles are kept only if their title contains
  # +title_filter+.
  #
  # @param uncase [Boolean] downcase both sides before comparing
  # @param unspace [Boolean] run both sides through Util.unspace_web_str first
  # @return [self] to allow chaining
  def filter_by_title(title_filter, uncase: true, unspace: true)
    title_filter = Util.unspace_web_str(title_filter) if unspace
    title_filter = title_filter.downcase if uncase

    @filters[:title] = {filter: title_filter, uncase: uncase, unspace: unspace}

    self
  end

  # Sets the URL filter: articles are kept only if their URL contains
  # +url_filter+.
  #
  # @param uncase [Boolean] downcase both sides before comparing
  # @param unspace [Boolean] run both sides through Util.unspace_web_str first
  # @return [self] to allow chaining
  def filter_by_url(url_filter, uncase: true, unspace: true)
    url_filter = Util.unspace_web_str(url_filter) if unspace
    url_filter = url_filter.downcase if uncase

    @filters[:url] = {filter: url_filter, uncase: uncase, unspace: unspace}

    self
  end

  # Flags the column +key+ (:freq, :word, :kana, :eng, or :defn) to be left
  # out of the output.
  #
  # @return [self] to allow chaining
  def ignore(key)
    @ignores[key] = true

    self
  end

  # Renders the sifted words as CSV: a header row, then one row per word.
  #
  # This does not output {caption}.
  #
  # @return [String] the CSV text, which is also stored in {output}
  def put_csv!
    words = sift
    columns = shown_columns

    @output = CSV.generate(headers: :first_row, write_headers: true) do |csv|
      csv << columns.map { |_key, header| header }

      words.each do |word|
        csv << columns.map { |key, _header| word.public_send(key) }
      end
    end

    @output
  end

  # Renders the sifted words as a standalone HTML page with one table row
  # per word.
  #
  # NOTE(review): the caption is interpolated as-is, while every table cell
  # goes through Util.escape_html -- confirm callers pass markup-safe text.
  #
  # @return [String] the HTML text, which is also stored in {output}
  def put_html!
    words = sift
    columns = shown_columns

    # Fixed column widths; a column without one (defn) fills the rest of the page.
    widths = {freq: '6em', word: '17em', kana: '17em', eng: '5em'}

    @output = ''.dup

    @output << <<~EOH
      <!DOCTYPE html>
      <html lang="ja">
      <head>
      <meta charset="utf-8">
      <title>NHKore</title>
      <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Noto+Serif+JP&display=fallback">
      <style>
      body {
      background-color: #FCFBF9;
      color: #333333;
      font-family: 'Noto Serif JP',Verdana,sans-serif;
      }
      h1 {
      color: #737373;
      }
      table {
      border-collapse: collapse;
      table-layout: fixed;
      width: 100%;
      }
      tr:nth-child(even) {
      background-color: #A5C7ED;
      }
      tr:hover {
      background-color: #FFDDCA;
      }
      td,th {
      border: 1px solid #333333;
      padding: 8px;
      text-align: left;
      }
      th {
      background-color: #082A8E;
      color: #FCFBF9;
      }
      td {
      vertical-align: top;
      }
      td:nth-child(1) {
      padding-right: 1em;
      text-align: right;
      }
      </style>
      </head>
      <body>
      <h1>NHKore</h1>
      <h2>#{@caption}</h2>
      <table>
    EOH

    # If have too few or too many '<col>', invalid HTML.
    columns.each do |key, _header|
      width = widths[key]

      @output << (width.nil? ? "<col>\n" : %Q{<col style="width:#{width};">\n})
    end

    @output << '<tr>'
    columns.each { |_key, header| @output << "<th>#{header}</th>" }
    @output << "</tr>\n"

    words.each do |word|
      @output << '<tr>'

      columns.each do |key, _header|
        @output << "<td>#{Util.escape_html(word.public_send(key).to_s)}</td>"
      end

      @output << "</tr>\n"
    end

    @output << <<~EOH
      </table>
      </body>
      </html>
    EOH

    @output
  end

  # Renders the caption and the sifted words through Util.dump_yaml.
  #
  # Ignored columns are blanked out (set to nil) on the words themselves
  # instead of being removed; only defn, eng, and freq can be blanked.
  #
  # @return the dumped YAML, which is also stored in {output}
  def put_yaml!
    words = sift

    # Just blank out ignores.
    # word/kanji/kana do not have setters/mutators.
    blanked = %i[defn eng freq].select { |key| @ignores[key] }

    unless blanked.empty?
      words.each do |word|
        blanked.each { |key| word.public_send("#{key}=", nil) }
      end
    end

    # Put each Word on one line (flow/inline style).
    @output = Util.dump_yaml({caption: @caption, words: words}, flow_level: 4)

    @output
  end

  # Adds the words of every article that passes the filters to one new
  # Article (via +add_word+ with +use_freq: true+), then sorts its words.
  #
  # Sort order: freq DESC, has-definition first, word ASC, has-kana first,
  # kana ASC, definition length DESC, definition ASC.
  #
  # @return [Array] the sorted words
  def sift
    merged = Article.new

    @articles.each do |article|
      next if filter?(article)

      article.words.values.each { |word| merged.add_word(word, use_freq: true) }
    end

    merged.words.values.sort do |a, b|
      # Most frequent words to top.
      rank = (b.freq <=> a.freq)

      rank = compare_empty_str(a.defn, b.defn) if rank == 0 # Favor words that have definitions
      rank = (a.word.to_s <=> b.word.to_s) if rank == 0
      rank = compare_empty_str(a.kana, b.kana) if rank == 0 # Favor words that have kana
      rank = (a.kana.to_s <=> b.kana.to_s) if rank == 0
      rank = (b.defn.to_s.length <=> a.defn.to_s.length) if rank == 0 # Favor longer definitions
      rank = (a.defn.to_s <=> b.defn.to_s) if rank == 0

      rank
    end
  end

  # Orders two strings by presence only, as judged by Util.empty_web_str?.
  #
  # @return [Integer] -1 if only +str1+ is present, 1 if only +str2+ is
  #                   present, 0 otherwise (further comparison needed)
  def compare_empty_str(str1, str2)
    present1 = !Util.empty_web_str?(str1)
    present2 = !Util.empty_web_str?(str2)

    return 0 if present1 == present2

    present1 ? -1 : 1
  end

  # @return [String] the last rendered output; empty if nothing was rendered
  def to_s
    @output.to_s
  end

  private

  # Pairs of [column key, table header] in output order, without the columns
  # flagged in +@ignores+. Each key is also the name of the word's reader.
  def shown_columns
    [
      [:freq, 'Frequency'],
      [:word, 'Word'],
      [:kana, 'Kana'],
      [:eng, 'English'],
      [:defn, 'Definition'],
    ].reject { |key, _header| @ignores[key] }
  end

  # True when +value+, normalized as +rule+ requests (:unspace, :uncase),
  # does not contain the rule's :filter text.
  def text_filtered?(value, rule)
    text = value.to_s
    text = Util.unspace_web_str(text) if rule[:unspace]
    text = text.downcase if rule[:uncase]

    !text.include?(rule[:filter])
  end
end
|
339
|
+
end
|