nhkore 0.3.3 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +97 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +58 -30
- data/Rakefile +68 -42
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +231 -236
- data/lib/nhkore/article.rb +56 -53
- data/lib/nhkore/article_scraper.rb +308 -289
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +145 -154
- data/lib/nhkore/cli/search_cmd.rb +110 -120
- data/lib/nhkore/cli/sift_cmd.rb +111 -227
- data/lib/nhkore/datetime_parser.rb +328 -0
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +6 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +61 -66
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +85 -78
- data/lib/nhkore/search_scraper.rb +89 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -101
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +58 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +53 -30
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -31,189 +19,198 @@ require 'nhkore/util'
|
|
31
19
|
|
32
20
|
module NHKore
|
33
21
|
###
|
34
|
-
# @author Jonathan Bradley Whited
|
22
|
+
# @author Jonathan Bradley Whited
|
35
23
|
# @since 0.2.0
|
36
24
|
###
|
37
25
|
class SearchScraper < Scraper
|
38
26
|
DEFAULT_RESULT_COUNT = 100
|
39
27
|
FUTSUU_SITE = 'nhk.or.jp/news/html/'
|
40
28
|
YASASHII_SITE = 'nhk.or.jp/news/easy/'
|
41
|
-
|
29
|
+
|
42
30
|
# https://www3.nhk.or.jp/news/html/20200220/k10012294001000.html
|
43
|
-
FUTSUU_REGEX = /\A[
|
31
|
+
FUTSUU_REGEX = /\A[^.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i.freeze
|
44
32
|
# https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html
|
45
33
|
# - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
|
46
|
-
YASASHII_REGEX = /\A[
|
47
|
-
|
34
|
+
YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
|
35
|
+
|
36
|
+
IGNORE_LINK_REGEX = %r{
|
37
|
+
/about\.html? # https://www3.nhk.or.jp/news/easy/about.html
|
38
|
+
|/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
|
39
|
+
|/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
|
40
|
+
|/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
|
41
|
+
|
42
|
+
# https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
|
43
|
+
# https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
|
44
|
+
|/enqform\.html?
|
45
|
+
}x.freeze
|
46
|
+
|
48
47
|
# Search Engines are strict, so trigger using the default HTTP header fields
|
49
48
|
# with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
|
50
49
|
def initialize(url,eat_cookie: true,header: {},**kargs)
|
51
50
|
super(url,eat_cookie: eat_cookie,header: header,**kargs)
|
52
51
|
end
|
53
|
-
|
52
|
+
|
54
53
|
def ignore_link?(link,cleaned: true)
|
55
|
-
return true if link.nil?
|
56
|
-
|
57
|
-
link = Util.unspace_web_str(link).downcase
|
58
|
-
|
59
|
-
return true if link.empty?
|
60
|
-
|
61
|
-
return true if
|
62
|
-
|
63
|
-
return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
|
64
|
-
|
54
|
+
return true if link.nil?
|
55
|
+
|
56
|
+
link = Util.unspace_web_str(link).downcase unless cleaned
|
57
|
+
|
58
|
+
return true if link.empty?
|
59
|
+
|
60
|
+
return true if IGNORE_LINK_REGEX.match?(link)
|
61
|
+
|
65
62
|
return false
|
66
63
|
end
|
67
64
|
end
|
68
|
-
|
65
|
+
|
69
66
|
###
|
70
|
-
# @author Jonathan Bradley Whited
|
67
|
+
# @author Jonathan Bradley Whited
|
71
68
|
# @since 0.2.0
|
72
69
|
###
|
73
70
|
class BingScraper < SearchScraper
|
74
71
|
attr_reader :regex
|
75
72
|
attr_reader :site
|
76
|
-
|
73
|
+
|
77
74
|
def initialize(site,regex: nil,url: nil,**kargs)
|
78
75
|
case site
|
79
76
|
when :futsuu
|
80
|
-
regex = FUTSUU_REGEX if regex.nil?
|
77
|
+
regex = FUTSUU_REGEX if regex.nil?
|
81
78
|
site = FUTSUU_SITE
|
82
79
|
when :yasashii
|
83
|
-
regex = YASASHII_REGEX if regex.nil?
|
80
|
+
regex = YASASHII_REGEX if regex.nil?
|
84
81
|
site = YASASHII_SITE
|
85
82
|
else
|
86
83
|
raise ArgumentError,"invalid site[#{site}]"
|
87
84
|
end
|
88
|
-
|
89
|
-
raise ArgumentError,"empty regex[#{regex}]" if regex.nil?
|
90
|
-
|
85
|
+
|
86
|
+
raise ArgumentError,"empty regex[#{regex}]" if regex.nil?
|
87
|
+
|
91
88
|
@regex = regex
|
92
89
|
@site = site
|
93
|
-
url = self.class.build_url(site,**kargs) if url.nil?
|
94
|
-
|
90
|
+
url = self.class.build_url(site,**kargs) if url.nil?
|
91
|
+
|
95
92
|
# Delete class-specific args (don't pass to Open-URI).
|
96
93
|
kargs.delete(:count)
|
97
|
-
|
94
|
+
|
98
95
|
super(url,**kargs)
|
99
96
|
end
|
100
|
-
|
97
|
+
|
101
98
|
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
|
102
|
-
url = ''.dup
|
103
|
-
|
99
|
+
url = ''.dup
|
100
|
+
|
104
101
|
url << 'https://www.bing.com/search?'
|
105
102
|
url << URI.encode_www_form(
|
106
103
|
q: "site:#{site}",
|
107
104
|
count: count
|
108
105
|
)
|
109
|
-
|
106
|
+
|
110
107
|
return url
|
111
108
|
end
|
112
|
-
|
109
|
+
|
113
110
|
def scrape(slinks,page=NextPage.new())
|
114
111
|
next_page,link_count = scrape_html(slinks,page)
|
115
|
-
|
112
|
+
|
116
113
|
if link_count <= 0
|
117
114
|
scrape_rss(slinks,page,next_page)
|
118
115
|
end
|
119
|
-
|
116
|
+
|
120
117
|
return next_page
|
121
118
|
end
|
122
|
-
|
119
|
+
|
123
120
|
def scrape_html(slinks,page,next_page=NextPage.new())
|
124
|
-
doc = html_doc
|
121
|
+
doc = html_doc
|
125
122
|
link_count = 0
|
126
|
-
|
123
|
+
|
127
124
|
anchors = doc.css('a')
|
128
|
-
|
129
|
-
anchors.each
|
130
|
-
href = anchor['href'].to_s
|
131
|
-
href = Util.unspace_web_str(href).downcase
|
132
|
-
|
125
|
+
|
126
|
+
anchors.each do |anchor|
|
127
|
+
href = anchor['href'].to_s
|
128
|
+
href = Util.unspace_web_str(href).downcase
|
129
|
+
|
133
130
|
next if ignore_link?(href)
|
134
|
-
|
135
|
-
if (md = href.match(/first
|
136
|
-
count = md[1].to_i
|
137
|
-
|
131
|
+
|
132
|
+
if (md = href.match(/first=(\d+)/))
|
133
|
+
count = md[1].to_i
|
134
|
+
|
138
135
|
if count > page.count && (next_page.count < 0 || count < next_page.count)
|
139
136
|
next_page.count = count
|
140
137
|
next_page.url = join_url(href)
|
141
138
|
end
|
142
139
|
elsif href =~ regex
|
143
140
|
slinks.add_link(SearchLink.new(href))
|
144
|
-
|
141
|
+
|
145
142
|
link_count += 1
|
146
143
|
end
|
147
144
|
end
|
148
|
-
|
145
|
+
|
149
146
|
return [next_page,link_count]
|
150
147
|
end
|
151
|
-
|
148
|
+
|
152
149
|
def scrape_rss(slinks,page,next_page=NextPage.new())
|
153
150
|
link_count = 0
|
154
|
-
|
151
|
+
|
155
152
|
if !@is_file
|
156
153
|
uri = URI(@url)
|
157
|
-
|
154
|
+
|
158
155
|
Util.replace_uri_query!(uri,format: 'rss')
|
159
|
-
open(uri)
|
160
|
-
|
161
|
-
doc = rss_doc
|
156
|
+
self.open(uri)
|
157
|
+
|
158
|
+
doc = rss_doc
|
162
159
|
rss_links = []
|
163
|
-
|
164
|
-
doc.items.each
|
165
|
-
link = item.link.to_s
|
166
|
-
link = Util.unspace_web_str(link).downcase
|
167
|
-
|
160
|
+
|
161
|
+
doc.items.each do |item|
|
162
|
+
link = item.link.to_s
|
163
|
+
link = Util.unspace_web_str(link).downcase
|
164
|
+
|
168
165
|
rss_links << link
|
169
|
-
|
166
|
+
|
170
167
|
next if ignore_link?(link)
|
171
168
|
next if link !~ regex
|
172
|
-
|
169
|
+
|
173
170
|
slinks.add_link(SearchLink.new(link))
|
174
|
-
|
171
|
+
|
175
172
|
link_count += 1
|
176
173
|
end
|
177
|
-
|
174
|
+
|
178
175
|
# For RSS, Bing will keep returning the same links over and over
|
179
176
|
# if it's the last page or the "first=" query is the wrong count.
|
180
177
|
# Therefore, we have to test the previous RSS links (+page.rss_links+).
|
181
|
-
if next_page.empty?
|
178
|
+
if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
|
182
179
|
next_page.count = (page.count < 0) ? 0 : page.count
|
183
180
|
next_page.count += doc.items.length
|
184
181
|
next_page.rss_links = rss_links
|
185
|
-
|
186
|
-
uri = URI(page.url.nil?
|
187
|
-
|
182
|
+
|
183
|
+
uri = URI(page.url.nil? ? @url : page.url)
|
184
|
+
|
188
185
|
Util.replace_uri_query!(uri,first: next_page.count)
|
189
|
-
|
186
|
+
|
190
187
|
next_page.url = uri
|
191
188
|
end
|
192
189
|
end
|
193
|
-
|
190
|
+
|
194
191
|
return [next_page,link_count]
|
195
192
|
end
|
196
193
|
end
|
197
|
-
|
194
|
+
|
198
195
|
###
|
199
|
-
# @author Jonathan Bradley Whited
|
196
|
+
# @author Jonathan Bradley Whited
|
200
197
|
# @since 0.2.0
|
201
198
|
###
|
202
199
|
class NextPage
|
203
200
|
attr_accessor :count
|
204
201
|
attr_accessor :rss_links
|
205
202
|
attr_accessor :url
|
206
|
-
|
207
|
-
def initialize
|
203
|
+
|
204
|
+
def initialize
|
208
205
|
super()
|
209
|
-
|
206
|
+
|
210
207
|
@count = -1
|
211
208
|
@rss_links = nil
|
212
209
|
@url = nil
|
213
210
|
end
|
214
|
-
|
215
|
-
def empty?
|
216
|
-
return @url.nil?
|
211
|
+
|
212
|
+
def empty?
|
213
|
+
return @url.nil? || @count < 0
|
217
214
|
end
|
218
215
|
end
|
219
216
|
end
|
data/lib/nhkore/sifter.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -28,179 +16,179 @@ require 'nhkore/util'
|
|
28
16
|
|
29
17
|
module NHKore
|
30
18
|
###
|
31
|
-
# @author Jonathan Bradley Whited
|
19
|
+
# @author Jonathan Bradley Whited
|
32
20
|
# @since 0.2.0
|
33
21
|
###
|
34
22
|
class Sifter
|
35
23
|
include Fileable
|
36
|
-
|
24
|
+
|
37
25
|
DEFAULT_DIR = Util::CORE_DIR
|
38
|
-
|
26
|
+
|
39
27
|
DEFAULT_FUTSUU_FILENAME = 'sift_nhk_news_web_regular'
|
40
28
|
DEFAULT_YASASHII_FILENAME = 'sift_nhk_news_web_easy'
|
41
|
-
|
29
|
+
|
42
30
|
def self.build_file(filename)
|
43
31
|
return File.join(DEFAULT_DIR,filename)
|
44
32
|
end
|
45
|
-
|
33
|
+
|
46
34
|
DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
|
47
35
|
DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
|
48
|
-
|
36
|
+
|
49
37
|
attr_accessor :articles
|
50
38
|
attr_accessor :caption
|
51
39
|
attr_accessor :filters
|
52
40
|
attr_accessor :ignores
|
53
41
|
attr_accessor :output
|
54
|
-
|
42
|
+
|
55
43
|
def initialize(news)
|
56
|
-
@articles = news.articles.values.dup
|
44
|
+
@articles = news.articles.values.dup
|
57
45
|
@caption = nil
|
58
46
|
@filters = {}
|
59
47
|
@ignores = {}
|
60
48
|
@output = nil
|
61
49
|
end
|
62
|
-
|
63
|
-
def build_header
|
50
|
+
|
51
|
+
def build_header
|
64
52
|
header = []
|
65
|
-
|
53
|
+
|
66
54
|
header << 'Frequency' unless @ignores[:freq]
|
67
55
|
header << 'Word' unless @ignores[:word]
|
68
56
|
header << 'Kana' unless @ignores[:kana]
|
69
57
|
header << 'English' unless @ignores[:eng]
|
70
58
|
header << 'Definition' unless @ignores[:defn]
|
71
|
-
|
59
|
+
|
72
60
|
return header
|
73
61
|
end
|
74
|
-
|
62
|
+
|
75
63
|
def build_rows(words)
|
76
64
|
rows = []
|
77
|
-
|
78
|
-
words.each
|
65
|
+
|
66
|
+
words.each do |word|
|
79
67
|
rows << build_word_row(word)
|
80
68
|
end
|
81
|
-
|
69
|
+
|
82
70
|
return rows
|
83
71
|
end
|
84
|
-
|
72
|
+
|
85
73
|
def build_word_row(word)
|
86
74
|
row = []
|
87
|
-
|
75
|
+
|
88
76
|
row << word.freq unless @ignores[:freq]
|
89
77
|
row << word.word unless @ignores[:word]
|
90
78
|
row << word.kana unless @ignores[:kana]
|
91
79
|
row << word.eng unless @ignores[:eng]
|
92
80
|
row << word.defn unless @ignores[:defn]
|
93
|
-
|
81
|
+
|
94
82
|
return row
|
95
83
|
end
|
96
|
-
|
84
|
+
|
97
85
|
def filter?(article)
|
98
|
-
return false if @filters.empty?
|
99
|
-
|
86
|
+
return false if @filters.empty?
|
87
|
+
|
100
88
|
datetime_filter = @filters[:datetime]
|
101
89
|
title_filter = @filters[:title]
|
102
90
|
url_filter = @filters[:url]
|
103
|
-
|
104
|
-
if !datetime_filter.nil?
|
91
|
+
|
92
|
+
if !datetime_filter.nil?
|
105
93
|
datetime = article.datetime
|
106
|
-
|
107
|
-
return true if datetime.nil?
|
94
|
+
|
95
|
+
return true if datetime.nil? ||
|
108
96
|
datetime < datetime_filter[:from] || datetime > datetime_filter[:to]
|
109
97
|
end
|
110
|
-
|
111
|
-
if !title_filter.nil?
|
112
|
-
title = article.title.to_s
|
98
|
+
|
99
|
+
if !title_filter.nil?
|
100
|
+
title = article.title.to_s
|
113
101
|
title = Util.unspace_web_str(title) if title_filter[:unspace]
|
114
|
-
title = title.downcase
|
115
|
-
|
102
|
+
title = title.downcase if title_filter[:uncase]
|
103
|
+
|
116
104
|
return true unless title.include?(title_filter[:filter])
|
117
105
|
end
|
118
|
-
|
119
|
-
if !url_filter.nil?
|
120
|
-
url = article.url.to_s
|
106
|
+
|
107
|
+
if !url_filter.nil?
|
108
|
+
url = article.url.to_s
|
121
109
|
url = Util.unspace_web_str(url) if url_filter[:unspace]
|
122
|
-
url = url.downcase
|
123
|
-
|
110
|
+
url = url.downcase if url_filter[:uncase]
|
111
|
+
|
124
112
|
return true unless url.include?(url_filter[:filter])
|
125
113
|
end
|
126
|
-
|
114
|
+
|
127
115
|
return false
|
128
116
|
end
|
129
|
-
|
117
|
+
|
130
118
|
def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
|
131
|
-
if !datetime_filter.nil?
|
132
|
-
if datetime_filter.respond_to?(:
|
119
|
+
if !datetime_filter.nil?
|
120
|
+
if datetime_filter.respond_to?(:[])
|
133
121
|
# If out-of-bounds, just nil.
|
134
|
-
from = datetime_filter[0] if from.nil?
|
135
|
-
to = datetime_filter[1] if to.nil?
|
122
|
+
from = datetime_filter[0] if from.nil?
|
123
|
+
to = datetime_filter[1] if to.nil?
|
136
124
|
else
|
137
|
-
from = datetime_filter if from.nil?
|
138
|
-
to = datetime_filter if to.nil?
|
125
|
+
from = datetime_filter if from.nil?
|
126
|
+
to = datetime_filter if to.nil?
|
139
127
|
end
|
140
128
|
end
|
141
|
-
|
142
|
-
from = to if from.nil?
|
143
|
-
to = from if to.nil?
|
144
|
-
|
145
|
-
from = Util.jst_time(from) unless from.nil?
|
146
|
-
to = Util.jst_time(to) unless to.nil?
|
147
|
-
|
129
|
+
|
130
|
+
from = to if from.nil?
|
131
|
+
to = from if to.nil?
|
132
|
+
|
133
|
+
from = Util.jst_time(from) unless from.nil?
|
134
|
+
to = Util.jst_time(to) unless to.nil?
|
135
|
+
|
148
136
|
datetime_filter = [from,to]
|
149
|
-
|
150
|
-
return self if datetime_filter.flatten
|
151
|
-
|
137
|
+
|
138
|
+
return self if datetime_filter.flatten.compact.empty?
|
139
|
+
|
152
140
|
@filters[:datetime] = {from: from,to: to}
|
153
|
-
|
141
|
+
|
154
142
|
return self
|
155
143
|
end
|
156
|
-
|
144
|
+
|
157
145
|
def filter_by_title(title_filter,uncase: true,unspace: true)
|
158
146
|
title_filter = Util.unspace_web_str(title_filter) if unspace
|
159
|
-
title_filter = title_filter.downcase
|
160
|
-
|
147
|
+
title_filter = title_filter.downcase if uncase
|
148
|
+
|
161
149
|
@filters[:title] = {filter: title_filter,uncase: uncase,unspace: unspace}
|
162
|
-
|
150
|
+
|
163
151
|
return self
|
164
152
|
end
|
165
|
-
|
153
|
+
|
166
154
|
def filter_by_url(url_filter,uncase: true,unspace: true)
|
167
155
|
url_filter = Util.unspace_web_str(url_filter) if unspace
|
168
|
-
url_filter = url_filter.downcase
|
169
|
-
|
156
|
+
url_filter = url_filter.downcase if uncase
|
157
|
+
|
170
158
|
@filters[:url] = {filter: url_filter,uncase: uncase,unspace: unspace}
|
171
|
-
|
159
|
+
|
172
160
|
return self
|
173
161
|
end
|
174
|
-
|
162
|
+
|
175
163
|
def ignore(key)
|
176
164
|
@ignores[key] = true
|
177
|
-
|
165
|
+
|
178
166
|
return self
|
179
167
|
end
|
180
|
-
|
168
|
+
|
181
169
|
# This does not output {caption}.
|
182
|
-
def put_csv!
|
170
|
+
def put_csv!
|
183
171
|
require 'csv'
|
184
|
-
|
185
|
-
words = sift
|
186
|
-
|
172
|
+
|
173
|
+
words = sift
|
174
|
+
|
187
175
|
@output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
|
188
|
-
csv << build_header
|
189
|
-
|
190
|
-
words.each
|
176
|
+
csv << build_header
|
177
|
+
|
178
|
+
words.each do |word|
|
191
179
|
csv << build_word_row(word)
|
192
180
|
end
|
193
181
|
end
|
194
|
-
|
182
|
+
|
195
183
|
return @output
|
196
184
|
end
|
197
|
-
|
198
|
-
def put_html!
|
199
|
-
words = sift
|
200
|
-
|
201
|
-
@output = ''.dup
|
202
|
-
|
203
|
-
@output << <<~
|
185
|
+
|
186
|
+
def put_html!
|
187
|
+
words = sift
|
188
|
+
|
189
|
+
@output = ''.dup
|
190
|
+
|
191
|
+
@output << <<~HTML
|
204
192
|
<!DOCTYPE html>
|
205
193
|
<html lang="ja">
|
206
194
|
<head>
|
@@ -249,146 +237,144 @@ module NHKore
|
|
249
237
|
<h1>NHKore</h1>
|
250
238
|
<h2>#{@caption}</h2>
|
251
239
|
<table>
|
252
|
-
|
253
|
-
|
254
|
-
|
240
|
+
HTML
|
241
|
+
|
255
242
|
# If have too few or too many '<col>', invalid HTML.
|
256
|
-
@output << %Q
|
257
|
-
@output << %Q
|
258
|
-
@output << %Q
|
259
|
-
@output << %Q
|
243
|
+
@output << %Q(<col style="width:6em;">\n) unless @ignores[:freq]
|
244
|
+
@output << %Q(<col style="width:17em;">\n) unless @ignores[:word]
|
245
|
+
@output << %Q(<col style="width:17em;">\n) unless @ignores[:kana]
|
246
|
+
@output << %Q(<col style="width:5em;">\n) unless @ignores[:eng]
|
260
247
|
@output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
|
261
|
-
|
248
|
+
|
262
249
|
@output << '<tr>'
|
263
|
-
|
264
|
-
build_header
|
250
|
+
|
251
|
+
build_header.each do |h|
|
265
252
|
@output << "<th>#{h}</th>"
|
266
253
|
end
|
267
|
-
|
254
|
+
|
268
255
|
@output << "</tr>\n"
|
269
|
-
|
270
|
-
words.each
|
256
|
+
|
257
|
+
words.each do |word|
|
271
258
|
@output << '<tr>'
|
272
|
-
|
273
|
-
build_word_row(word).each
|
274
|
-
@output << "<td>#{Util.escape_html(w.to_s
|
259
|
+
|
260
|
+
build_word_row(word).each do |w|
|
261
|
+
@output << "<td>#{Util.escape_html(w.to_s)}</td>"
|
275
262
|
end
|
276
|
-
|
263
|
+
|
277
264
|
@output << "</tr>\n"
|
278
265
|
end
|
279
|
-
|
280
|
-
@output << <<~
|
266
|
+
|
267
|
+
@output << <<~HTML
|
281
268
|
</table>
|
282
269
|
</body>
|
283
270
|
</html>
|
284
|
-
|
285
|
-
|
286
|
-
|
271
|
+
HTML
|
272
|
+
|
287
273
|
return @output
|
288
274
|
end
|
289
|
-
|
290
|
-
def put_json!
|
275
|
+
|
276
|
+
def put_json!
|
291
277
|
require 'json'
|
292
|
-
|
293
|
-
words = sift
|
294
|
-
|
295
|
-
@output = ''.dup
|
296
|
-
|
297
|
-
@output << <<~
|
278
|
+
|
279
|
+
words = sift
|
280
|
+
|
281
|
+
@output = ''.dup
|
282
|
+
|
283
|
+
@output << <<~JSON
|
298
284
|
{
|
299
285
|
"caption": #{JSON.generate(@caption)},
|
300
|
-
"header": #{JSON.generate(build_header
|
286
|
+
"header": #{JSON.generate(build_header)},
|
301
287
|
"words": [
|
302
|
-
|
303
|
-
|
304
|
-
if !words.empty?
|
288
|
+
JSON
|
289
|
+
|
290
|
+
if !words.empty?
|
305
291
|
0.upto(words.length - 2) do |i|
|
306
292
|
@output << " #{JSON.generate(build_word_row(words[i]))},\n"
|
307
293
|
end
|
308
|
-
|
294
|
+
|
309
295
|
@output << " #{JSON.generate(build_word_row(words[-1]))}\n"
|
310
296
|
end
|
311
|
-
|
297
|
+
|
312
298
|
@output << "]\n}\n"
|
313
|
-
|
299
|
+
|
314
300
|
return @output
|
315
301
|
end
|
316
|
-
|
317
|
-
def put_yaml!
|
302
|
+
|
303
|
+
def put_yaml!
|
318
304
|
require 'psychgus'
|
319
|
-
|
320
|
-
words = sift
|
321
|
-
|
305
|
+
|
306
|
+
words = sift
|
307
|
+
|
322
308
|
yaml = {
|
323
309
|
caption: @caption,
|
324
|
-
header: build_header
|
310
|
+
header: build_header,
|
325
311
|
words: build_rows(words),
|
326
312
|
}
|
327
|
-
|
328
|
-
header_styler = Class.new
|
313
|
+
|
314
|
+
header_styler = Class.new do
|
329
315
|
include Psychgus::Styler
|
330
|
-
|
316
|
+
|
331
317
|
def style_sequence(sniffer,node)
|
332
318
|
parent = sniffer.parent
|
333
|
-
|
334
|
-
if !parent.nil?
|
319
|
+
|
320
|
+
if !parent.nil? && parent.node.respond_to?(:value) && parent.value == 'header'
|
335
321
|
node.style = Psychgus::SEQUENCE_FLOW
|
336
322
|
end
|
337
323
|
end
|
338
324
|
end
|
339
|
-
|
325
|
+
|
340
326
|
# Put each Word on one line (flow/inline style).
|
341
|
-
@output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new
|
342
|
-
|
327
|
+
@output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new)
|
328
|
+
|
343
329
|
return @output
|
344
330
|
end
|
345
|
-
|
346
|
-
def sift
|
347
|
-
master_article = Article.new
|
348
|
-
|
349
|
-
@articles.each
|
331
|
+
|
332
|
+
def sift
|
333
|
+
master_article = Article.new
|
334
|
+
|
335
|
+
@articles.each do |article|
|
350
336
|
next if filter?(article)
|
351
|
-
|
352
|
-
article.words.
|
337
|
+
|
338
|
+
article.words.each_value do |word|
|
353
339
|
master_article.add_word(word,use_freq: true)
|
354
340
|
end
|
355
341
|
end
|
356
|
-
|
357
|
-
words = master_article.words.values
|
358
|
-
|
359
|
-
words.sort!
|
342
|
+
|
343
|
+
words = master_article.words.values
|
344
|
+
|
345
|
+
words.sort! do |word1,word2|
|
360
346
|
# Order by freq DESC (most frequent words to top).
|
361
347
|
i = (word2.freq <=> word1.freq)
|
362
|
-
|
348
|
+
|
363
349
|
# Order by !defn.empty, word ASC, !kana.empty, kana ASC, defn.len DESC, defn ASC.
|
364
350
|
i = compare_empty_str(word1.defn,word2.defn) if i == 0 # Favor words that have definitions
|
365
|
-
i = (word1.word.to_s
|
351
|
+
i = (word1.word.to_s <=> word2.word.to_s) if i == 0
|
366
352
|
i = compare_empty_str(word1.kana,word2.kana) if i == 0 # Favor words that have kana
|
367
|
-
i = (word1.kana.to_s
|
368
|
-
i = (word2.defn.to_s
|
369
|
-
i = (word1.defn.to_s
|
370
|
-
|
353
|
+
i = (word1.kana.to_s <=> word2.kana.to_s) if i == 0
|
354
|
+
i = (word2.defn.to_s.length <=> word1.defn.to_s.length) if i == 0 # Favor longer definitions
|
355
|
+
i = (word1.defn.to_s <=> word2.defn.to_s) if i == 0
|
356
|
+
|
371
357
|
i
|
372
358
|
end
|
373
|
-
|
359
|
+
|
374
360
|
return words
|
375
361
|
end
|
376
|
-
|
362
|
+
|
377
363
|
def compare_empty_str(str1,str2)
|
378
364
|
has_str1 = !Util.empty_web_str?(str1)
|
379
365
|
has_str2 = !Util.empty_web_str?(str2)
|
380
|
-
|
366
|
+
|
381
367
|
if has_str1 && !has_str2
|
382
368
|
return -1 # Bubble word1 to top
|
383
369
|
elsif !has_str1 && has_str2
|
384
370
|
return 1 # Bubble word2 to top
|
385
371
|
end
|
386
|
-
|
372
|
+
|
387
373
|
return 0 # Further comparison needed
|
388
374
|
end
|
389
|
-
|
390
|
-
def to_s
|
391
|
-
return @output.to_s
|
375
|
+
|
376
|
+
def to_s
|
377
|
+
return @output.to_s
|
392
378
|
end
|
393
379
|
end
|
394
380
|
end
|