nhkore 0.3.3 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +97 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +58 -30
- data/Rakefile +68 -42
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +231 -236
- data/lib/nhkore/article.rb +56 -53
- data/lib/nhkore/article_scraper.rb +308 -289
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +145 -154
- data/lib/nhkore/cli/search_cmd.rb +110 -120
- data/lib/nhkore/cli/sift_cmd.rb +111 -227
- data/lib/nhkore/datetime_parser.rb +328 -0
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +6 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +61 -66
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +85 -78
- data/lib/nhkore/search_scraper.rb +89 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -101
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +58 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +53 -30
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -30,122 +18,122 @@ require 'nhkore/util'
|
|
30
18
|
module NHKore
|
31
19
|
module CLI
|
32
20
|
###
|
33
|
-
# @author Jonathan Bradley Whited
|
21
|
+
# @author Jonathan Bradley Whited
|
34
22
|
# @since 0.3.0
|
35
23
|
###
|
36
24
|
module SearchCmd
|
37
|
-
def build_search_cmd
|
25
|
+
def build_search_cmd
|
38
26
|
app = self
|
39
|
-
|
40
|
-
@search_cmd = @app_cmd.define_command
|
27
|
+
|
28
|
+
@search_cmd = @app_cmd.define_command do
|
41
29
|
name 'search'
|
42
30
|
usage 'search [OPTIONS] [COMMAND]...'
|
43
31
|
aliases :se,:sea
|
44
32
|
summary "Search for links to NHK News Web (Easy) (aliases: #{app.color_alias('se sea')})"
|
45
|
-
|
46
|
-
description <<-
|
33
|
+
|
34
|
+
description <<-DESC
|
47
35
|
Search for links (using a Search Engine, etc.) to NHK News Web (Easy) &
|
48
36
|
save to folder: #{SearchLinks::DEFAULT_DIR}
|
49
|
-
|
50
|
-
|
51
|
-
option :i,:in,<<-
|
37
|
+
DESC
|
38
|
+
|
39
|
+
option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
|
52
40
|
file to read instead of URL (for offline testing and/or slow internet;
|
53
41
|
see '--show-*' options)
|
54
|
-
|
42
|
+
DESC
|
55
43
|
app.check_empty_opt(:in,value)
|
56
|
-
|
57
|
-
option :o,:out,<<-
|
44
|
+
}
|
45
|
+
option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
|
58
46
|
'directory/file' to save links to; if you only specify a directory or a file, it will attach the
|
59
47
|
appropriate default directory/file name
|
60
48
|
(defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
|
61
|
-
|
49
|
+
DESC
|
62
50
|
app.check_empty_opt(:out,value)
|
63
|
-
|
51
|
+
}
|
64
52
|
option :r,:results,'number of results per page to request from search',argument: :required,
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
option nil,:'show-count',<<-
|
53
|
+
default: SearchScraper::DEFAULT_RESULT_COUNT,transform: lambda { |value|
|
54
|
+
value = value.to_i
|
55
|
+
value = 1 if value < 1
|
56
|
+
value
|
57
|
+
}
|
58
|
+
option nil,:'show-count',<<-DESC
|
71
59
|
show the number of links scraped and exit;
|
72
60
|
useful for manually writing/updating scripts (but not for use in a variable);
|
73
61
|
implies '--dry-run' option
|
74
|
-
|
75
|
-
option nil,:'show-urls',<<-
|
62
|
+
DESC
|
63
|
+
option nil,:'show-urls',<<-DESC
|
76
64
|
show the URLs -- if any -- used when searching & scraping and exit;
|
77
65
|
you can download these for offline testing and/or slow internet
|
78
66
|
(see '--in' option)
|
79
|
-
|
80
|
-
|
67
|
+
DESC
|
68
|
+
|
81
69
|
run do |opts,args,cmd|
|
82
|
-
opts.each
|
83
|
-
key = key.to_s
|
84
|
-
|
70
|
+
opts.each do |key,value|
|
71
|
+
key = key.to_s
|
72
|
+
|
85
73
|
if key.include?('show')
|
86
|
-
raise CLIError
|
74
|
+
raise CLIError,"must specify a sub command for option[#{key}]"
|
87
75
|
end
|
88
76
|
end
|
89
|
-
|
77
|
+
|
90
78
|
puts cmd.help
|
91
79
|
end
|
92
80
|
end
|
93
|
-
|
94
|
-
@search_easy_cmd = @search_cmd.define_command
|
81
|
+
|
82
|
+
@search_easy_cmd = @search_cmd.define_command do
|
95
83
|
name 'easy'
|
96
84
|
usage 'easy [OPTIONS] [COMMAND]...'
|
97
85
|
aliases :e,:ez
|
98
86
|
summary "Search for NHK News Web Easy (Yasashii) links (aliases: #{app.color_alias('e ez')})"
|
99
|
-
|
100
|
-
description <<-
|
87
|
+
|
88
|
+
description <<-DESC
|
101
89
|
Search for NHK News Web Easy (Yasashii) links &
|
102
90
|
save to file: #{SearchLinks::DEFAULT_YASASHII_FILE}
|
103
|
-
|
104
|
-
|
91
|
+
DESC
|
92
|
+
|
105
93
|
run do |opts,args,cmd|
|
106
94
|
app.refresh_cmd(opts,args,cmd)
|
107
|
-
app.run_search_help
|
95
|
+
app.run_search_help
|
108
96
|
end
|
109
97
|
end
|
110
|
-
|
111
|
-
@search_regular_cmd = @search_cmd.define_command
|
98
|
+
|
99
|
+
@search_regular_cmd = @search_cmd.define_command do
|
112
100
|
name 'regular'
|
113
101
|
usage 'regular [OPTIONS] [COMMAND]...'
|
114
102
|
aliases :r,:reg
|
115
103
|
summary "Search for NHK News Web Regular (Futsuu) links (aliases: #{app.color_alias('r reg')})"
|
116
|
-
|
117
|
-
description <<-
|
104
|
+
|
105
|
+
description <<-DESC
|
118
106
|
Search for NHK News Web Regular (Futsuu) links &
|
119
107
|
save to file: #{SearchLinks::DEFAULT_FUTSUU_FILE}
|
120
|
-
|
121
|
-
|
108
|
+
DESC
|
109
|
+
|
122
110
|
run do |opts,args,cmd|
|
123
111
|
app.refresh_cmd(opts,args,cmd)
|
124
|
-
app.run_search_help
|
112
|
+
app.run_search_help
|
125
113
|
end
|
126
114
|
end
|
127
|
-
|
128
|
-
@search_bing_cmd = Cri::Command.define
|
115
|
+
|
116
|
+
@search_bing_cmd = Cri::Command.define do
|
129
117
|
name 'bing'
|
130
118
|
usage 'bing [OPTIONS] [COMMAND]...'
|
131
119
|
aliases :b
|
132
120
|
summary "Search bing.com for links (aliases: #{app.color_alias('b')})"
|
133
|
-
|
134
|
-
description <<-
|
121
|
+
|
122
|
+
description <<-DESC
|
135
123
|
Search bing.com for links & save to folder: #{SearchLinks::DEFAULT_DIR}
|
136
|
-
|
137
|
-
|
124
|
+
DESC
|
125
|
+
|
138
126
|
run do |opts,args,cmd|
|
139
127
|
app.refresh_cmd(opts,args,cmd)
|
140
|
-
app.run_search_cmd(cmd.supercommand.name.to_sym
|
128
|
+
app.run_search_cmd(cmd.supercommand.name.to_sym,:bing)
|
141
129
|
end
|
142
130
|
end
|
143
|
-
|
131
|
+
|
144
132
|
# dup()/clone() must be called for `cmd.supercommand` to work appropriately.
|
145
|
-
@search_easy_cmd.add_command @search_bing_cmd.dup
|
146
|
-
@search_regular_cmd.add_command @search_bing_cmd.dup
|
133
|
+
@search_easy_cmd.add_command @search_bing_cmd.dup
|
134
|
+
@search_regular_cmd.add_command @search_bing_cmd.dup
|
147
135
|
end
|
148
|
-
|
136
|
+
|
149
137
|
def run_search_cmd(nhk_type,search_type)
|
150
138
|
case nhk_type
|
151
139
|
when :easy
|
@@ -153,139 +141,141 @@ module CLI
|
|
153
141
|
when :regular
|
154
142
|
nhk_type = :futsuu
|
155
143
|
end
|
156
|
-
|
144
|
+
|
157
145
|
return if show_search_urls(search_type)
|
158
|
-
|
146
|
+
|
159
147
|
@cmd_opts[:dry_run] = true if @cmd_opts[:show_count]
|
160
|
-
|
148
|
+
|
161
149
|
build_in_file(:in)
|
162
|
-
|
150
|
+
|
163
151
|
case nhk_type
|
164
152
|
when :futsuu
|
165
|
-
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,
|
153
|
+
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,
|
154
|
+
default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
|
166
155
|
when :yasashii
|
167
|
-
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,
|
156
|
+
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,
|
157
|
+
default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
|
168
158
|
else
|
169
159
|
raise ArgumentError,"invalid nhk_type[#{nhk_type}]"
|
170
160
|
end
|
171
|
-
|
161
|
+
|
172
162
|
return unless check_in_file(:in,empty_ok: true)
|
173
163
|
return unless check_out_file(:out)
|
174
|
-
|
164
|
+
|
175
165
|
dry_run = @cmd_opts[:dry_run]
|
176
166
|
in_file = @cmd_opts[:in]
|
177
167
|
out_file = @cmd_opts[:out]
|
178
168
|
result_count = @cmd_opts[:results]
|
179
|
-
result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
|
169
|
+
result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
|
180
170
|
show_count = @cmd_opts[:show_count]
|
181
|
-
|
171
|
+
|
182
172
|
start_spin("Scraping #{search_type}") unless show_count
|
183
|
-
|
184
|
-
is_file = !in_file.nil?
|
173
|
+
|
174
|
+
is_file = !in_file.nil?
|
185
175
|
links = nil
|
186
176
|
new_links = [] # For --dry-run
|
187
|
-
next_page = NextPage.new
|
177
|
+
next_page = NextPage.new
|
188
178
|
page_count = 0
|
189
179
|
page_num = 1
|
190
180
|
url = in_file # nil will use default URL, else a file
|
191
|
-
|
181
|
+
|
192
182
|
# Load previous links for 'scraped?' vars.
|
193
183
|
if File.exist?(out_file)
|
194
184
|
links = SearchLinks.load_file(out_file)
|
195
185
|
else
|
196
|
-
links = SearchLinks.new
|
186
|
+
links = SearchLinks.new
|
197
187
|
end
|
198
|
-
|
188
|
+
|
199
189
|
links_count = links.length
|
200
|
-
|
190
|
+
|
201
191
|
if show_count
|
202
192
|
scraped_count = 0
|
203
|
-
|
204
|
-
links.links.
|
205
|
-
scraped_count += 1 if link.scraped?
|
193
|
+
|
194
|
+
links.links.each_value do |link|
|
195
|
+
scraped_count += 1 if link.scraped?
|
206
196
|
end
|
207
|
-
|
197
|
+
|
208
198
|
puts "#{scraped_count} of #{links_count} links scraped."
|
209
|
-
|
199
|
+
|
210
200
|
return
|
211
201
|
end
|
212
|
-
|
213
|
-
range = (0..
|
214
|
-
|
202
|
+
|
203
|
+
range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
|
204
|
+
|
215
205
|
case search_type
|
216
206
|
# Anything that extends SearchScraper.
|
217
207
|
when :bing
|
218
|
-
range.each
|
208
|
+
range.each do
|
219
209
|
scraper = nil
|
220
|
-
|
210
|
+
|
221
211
|
case search_type
|
222
212
|
when :bing
|
223
213
|
scraper = BingScraper.new(nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs)
|
224
214
|
else
|
225
215
|
raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
|
226
216
|
end
|
227
|
-
|
217
|
+
|
228
218
|
next_page = scraper.scrape(links,next_page)
|
229
|
-
|
219
|
+
|
230
220
|
new_links.concat(links.links.values[links_count..-1])
|
231
221
|
links_count = links.length
|
232
222
|
page_count = next_page.count if next_page.count > 0
|
233
|
-
|
234
|
-
update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length},
|
235
|
-
"new_links=#{new_links.length})")
|
236
|
-
|
237
|
-
break if next_page.empty?
|
238
|
-
|
223
|
+
|
224
|
+
update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}," \
|
225
|
+
" new_links=#{new_links.length})")
|
226
|
+
|
227
|
+
break if next_page.empty?
|
228
|
+
|
239
229
|
page_num += 1
|
240
230
|
url = next_page.url
|
241
|
-
|
242
|
-
sleep_scraper
|
231
|
+
|
232
|
+
sleep_scraper
|
243
233
|
end
|
244
234
|
else
|
245
235
|
raise ArgumentError,"invalid search_type[#{search_type}]"
|
246
236
|
end
|
247
|
-
|
248
|
-
stop_spin
|
237
|
+
|
238
|
+
stop_spin
|
249
239
|
puts
|
250
240
|
puts 'Last URL scraped:'
|
251
241
|
puts "> #{url}"
|
252
242
|
puts
|
253
|
-
|
243
|
+
|
254
244
|
if dry_run
|
255
|
-
new_links.each
|
245
|
+
new_links.each do |link|
|
256
246
|
puts link.to_s(mini: true)
|
257
247
|
end
|
258
248
|
else
|
259
249
|
links.save_file(out_file)
|
260
|
-
|
250
|
+
|
261
251
|
puts 'Saved scraped links to file:'
|
262
252
|
puts "> #{out_file}"
|
263
253
|
end
|
264
254
|
end
|
265
|
-
|
266
|
-
def run_search_help
|
255
|
+
|
256
|
+
def run_search_help
|
267
257
|
if @cmd_opts[:show_count] || @cmd_opts[:show_urls]
|
268
|
-
run_search_cmd(@cmd.name.to_sym
|
258
|
+
run_search_cmd(@cmd.name.to_sym,nil)
|
269
259
|
else
|
270
260
|
puts @cmd.help
|
271
261
|
end
|
272
262
|
end
|
273
|
-
|
263
|
+
|
274
264
|
def show_search_urls(search_type)
|
275
265
|
return false unless @cmd_opts[:show_urls]
|
276
|
-
|
266
|
+
|
277
267
|
count = @cmd_opts[:results]
|
278
|
-
count = SearchScraper::DEFAULT_RESULT_COUNT if count.nil?
|
279
|
-
|
268
|
+
count = SearchScraper::DEFAULT_RESULT_COUNT if count.nil?
|
269
|
+
|
280
270
|
case search_type
|
281
271
|
when :bing
|
282
272
|
puts 'Bing:'
|
283
273
|
puts "> Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE,count: count)}"
|
284
274
|
puts "> Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE,count: count)}"
|
285
275
|
else
|
286
|
-
raise CLIError
|
276
|
+
raise CLIError,'must specify a sub command for option[show-urls]'
|
287
277
|
end
|
288
|
-
|
278
|
+
|
289
279
|
return true
|
290
280
|
end
|
291
281
|
end
|
data/lib/nhkore/cli/sift_cmd.rb
CHANGED
@@ -1,29 +1,18 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
24
12
|
require 'date'
|
25
13
|
require 'time'
|
26
14
|
|
15
|
+
require 'nhkore/datetime_parser'
|
27
16
|
require 'nhkore/news'
|
28
17
|
require 'nhkore/sifter'
|
29
18
|
require 'nhkore/util'
|
@@ -32,298 +21,193 @@ require 'nhkore/util'
|
|
32
21
|
module NHKore
|
33
22
|
module CLI
|
34
23
|
###
|
35
|
-
# @author Jonathan Bradley Whited
|
24
|
+
# @author Jonathan Bradley Whited
|
36
25
|
# @since 0.2.0
|
37
26
|
###
|
38
27
|
module SiftCmd
|
39
28
|
DEFAULT_SIFT_EXT = :csv
|
40
29
|
DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}"
|
41
30
|
DEFAULT_SIFT_YASASHII_FILE = "#{Sifter::DEFAULT_YASASHII_FILE}{search.criteria}{file.ext}"
|
42
|
-
SIFT_EXTS = [
|
43
|
-
|
44
|
-
# Order matters.
|
45
|
-
SIFT_DATETIME_FMTS = [
|
46
|
-
'%Y-%m-%d %H:%M',
|
47
|
-
'%Y-%m-%d %H',
|
48
|
-
'%Y-%m-%d',
|
49
|
-
'%m-%d %H:%M',
|
50
|
-
'%Y-%m %H:%M',
|
51
|
-
'%m-%d %H',
|
52
|
-
'%Y-%m %H',
|
53
|
-
'%m-%d',
|
54
|
-
'%Y-%m',
|
55
|
-
'%d %H:%M',
|
56
|
-
'%y %H:%M',
|
57
|
-
'%d %H',
|
58
|
-
'%Y %H',
|
59
|
-
'%H:%M',
|
60
|
-
'%d',
|
61
|
-
'%Y'
|
62
|
-
]
|
63
|
-
|
31
|
+
SIFT_EXTS = %i[csv htm html json yaml yml].freeze
|
32
|
+
|
64
33
|
attr_accessor :sift_datetime_text
|
65
34
|
attr_accessor :sift_search_criteria
|
66
|
-
|
67
|
-
def build_sift_cmd
|
35
|
+
|
36
|
+
def build_sift_cmd
|
68
37
|
app = self
|
69
|
-
|
38
|
+
|
70
39
|
@sift_datetime_text = nil
|
71
40
|
@sift_search_criteria = nil
|
72
|
-
|
73
|
-
@sift_cmd = @app_cmd.define_command
|
41
|
+
|
42
|
+
@sift_cmd = @app_cmd.define_command do
|
74
43
|
name 'sift'
|
75
44
|
usage 'sift [OPTIONS] [COMMAND]...'
|
76
45
|
aliases :s
|
77
|
-
summary
|
78
|
-
|
79
|
-
|
46
|
+
summary 'Sift NHK News Web (Easy) articles data for the frequency of words' \
|
47
|
+
" (aliases: #{app.color_alias('s')})"
|
48
|
+
|
49
|
+
description(<<-DESC)
|
80
50
|
Sift NHK News Web (Easy) articles data for the frequency of words &
|
81
51
|
save to folder: #{Sifter::DEFAULT_DIR}
|
82
|
-
|
83
|
-
|
84
|
-
option :d,:datetime,<<-
|
52
|
+
DESC
|
53
|
+
|
54
|
+
option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
|
85
55
|
date time to filter on; examples:
|
86
56
|
'2020-7-1 13:10...2020-7-31 11:11';
|
87
57
|
'2020-12' (2020, December 1st-31st);
|
88
58
|
'7-4...7-9' (July 4th-9th of Current Year);
|
89
59
|
'7-9' (July 9th of Current Year);
|
90
60
|
'9' (9th of Current Year & Month)
|
91
|
-
|
61
|
+
DESC
|
92
62
|
app.sift_datetime_text = value # Save the original value for the file name
|
93
|
-
|
63
|
+
|
64
|
+
value = DatetimeParser.parse_range(value)
|
65
|
+
|
66
|
+
app.check_empty_opt(:datetime,value) if value.nil?
|
67
|
+
|
94
68
|
value
|
95
|
-
|
96
|
-
option :e,:ext,<<-
|
69
|
+
}
|
70
|
+
option :e,:ext,<<-DESC,argument: :required,default: DEFAULT_SIFT_EXT,transform: lambda { |value|
|
97
71
|
type of file (extension) to save; valid options: [#{SIFT_EXTS.join(', ')}];
|
98
72
|
not needed if you specify a file extension with the '--out' option: '--out sift.html'
|
99
|
-
|
100
|
-
value = Util.unspace_web_str(value).downcase
|
101
|
-
|
73
|
+
DESC
|
74
|
+
value = Util.unspace_web_str(value).downcase.to_sym
|
75
|
+
|
102
76
|
raise CLIError,"invalid ext[#{value}] for option[#{ext}]" unless SIFT_EXTS.include?(value)
|
103
|
-
|
77
|
+
|
104
78
|
value
|
105
|
-
|
106
|
-
option :i,:in,<<-
|
79
|
+
}
|
80
|
+
option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
|
107
81
|
file of NHK News Web (Easy) articles data to sift (see '#{App::NAME} news';
|
108
82
|
defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
|
109
|
-
|
83
|
+
DESC
|
110
84
|
app.check_empty_opt(:in,value)
|
111
|
-
|
85
|
+
}
|
112
86
|
flag :D,:'no-defn','do not output the definitions for words (which can be quite long)'
|
113
87
|
flag :E,:'no-eng','do not output the English translations for words'
|
114
|
-
option :o,:out,<<-
|
88
|
+
option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
|
115
89
|
'directory/file' to save sifted data to; if you only specify a directory or a file, it will attach
|
116
90
|
the appropriate default directory/file name
|
117
91
|
(defaults: #{DEFAULT_SIFT_YASASHII_FILE}, #{DEFAULT_SIFT_FUTSUU_FILE})
|
118
|
-
|
92
|
+
DESC
|
119
93
|
app.check_empty_opt(:out,value)
|
120
|
-
|
121
|
-
flag :H,'no-sha256',<<-
|
94
|
+
}
|
95
|
+
flag :H,'no-sha256',<<-DESC
|
122
96
|
if you used this option with the 'news' command, then you'll also need this option here
|
123
97
|
to not fail on "duplicate" articles; see '#{App::NAME} news'
|
124
|
-
|
98
|
+
DESC
|
125
99
|
option :t,:title,'title to filter on, where search text only needs to be somewhere in the title',
|
126
100
|
argument: :required
|
127
101
|
option :u,:url,'URL to filter on, where search text only needs to be somewhere in the URL',
|
128
102
|
argument: :required
|
129
|
-
|
103
|
+
|
130
104
|
run do |opts,args,cmd|
|
131
105
|
puts cmd.help
|
132
106
|
end
|
133
107
|
end
|
134
|
-
|
135
|
-
@sift_easy_cmd = @sift_cmd.define_command
|
108
|
+
|
109
|
+
@sift_easy_cmd = @sift_cmd.define_command do
|
136
110
|
name 'easy'
|
137
111
|
usage 'easy [OPTIONS] [COMMAND]...'
|
138
112
|
aliases :e,:ez
|
139
113
|
summary "Sift NHK News Web Easy (Yasashii) articles data (aliases: #{app.color_alias('e ez')})"
|
140
|
-
|
141
|
-
description <<-
|
114
|
+
|
115
|
+
description <<-DESC
|
142
116
|
Sift NHK News Web Easy (Yasashii) articles data for the frequency of words &
|
143
117
|
save to file: #{DEFAULT_SIFT_YASASHII_FILE}
|
144
|
-
|
145
|
-
|
118
|
+
DESC
|
119
|
+
|
146
120
|
run do |opts,args,cmd|
|
147
121
|
app.refresh_cmd(opts,args,cmd)
|
148
122
|
app.run_sift_cmd(:yasashii)
|
149
123
|
end
|
150
124
|
end
|
151
|
-
|
152
|
-
@sift_regular_cmd = @sift_cmd.define_command
|
125
|
+
|
126
|
+
@sift_regular_cmd = @sift_cmd.define_command do
|
153
127
|
name 'regular'
|
154
128
|
usage 'regular [OPTIONS] [COMMAND]...'
|
155
129
|
aliases :r,:reg
|
156
130
|
summary "Sift NHK News Web Regular (Futsuu) articles data (aliases: #{app.color_alias('r reg')})"
|
157
|
-
|
158
|
-
description
|
131
|
+
|
132
|
+
description(<<-DESC)
|
159
133
|
Sift NHK News Web Regular (Futsuu) articles data for the frequency of words &
|
160
134
|
save to file: #{DEFAULT_SIFT_FUTSUU_FILE}
|
161
|
-
|
162
|
-
|
135
|
+
DESC
|
136
|
+
|
163
137
|
run do |opts,args,cmd|
|
164
138
|
app.refresh_cmd(opts,args,cmd)
|
165
139
|
app.run_sift_cmd(:futsuu)
|
166
140
|
end
|
167
141
|
end
|
168
142
|
end
|
169
|
-
|
143
|
+
|
170
144
|
def build_sift_filename(filename)
|
171
145
|
@sift_search_criteria = []
|
172
|
-
|
173
|
-
@sift_search_criteria << Util.strip_web_str(@sift_datetime_text.to_s
|
174
|
-
@sift_search_criteria << Util.strip_web_str(@cmd_opts[:title].to_s
|
175
|
-
@sift_search_criteria << Util.strip_web_str(@cmd_opts[:url].to_s
|
176
|
-
@sift_search_criteria.filter!
|
177
|
-
|
178
|
-
clean_regex = /[^[[:alnum:]]\-_
|
179
|
-
clean_search_criteria = ''.dup
|
180
|
-
|
181
|
-
@sift_search_criteria.each
|
146
|
+
|
147
|
+
@sift_search_criteria << Util.strip_web_str(@sift_datetime_text.to_s)
|
148
|
+
@sift_search_criteria << Util.strip_web_str(@cmd_opts[:title].to_s)
|
149
|
+
@sift_search_criteria << Util.strip_web_str(@cmd_opts[:url].to_s)
|
150
|
+
@sift_search_criteria.filter! { |sc| !sc.empty? }
|
151
|
+
|
152
|
+
clean_regex = /[^[[:alnum:]]\-_.]+/
|
153
|
+
clean_search_criteria = ''.dup
|
154
|
+
|
155
|
+
@sift_search_criteria.each do |sc|
|
182
156
|
clean_search_criteria << sc.gsub(clean_regex,'')
|
183
157
|
end
|
184
|
-
|
185
|
-
@sift_search_criteria = @sift_search_criteria.empty?
|
186
|
-
|
158
|
+
|
159
|
+
@sift_search_criteria = @sift_search_criteria.empty? ? nil : @sift_search_criteria.join(', ')
|
160
|
+
|
187
161
|
# Limit the file name length.
|
188
162
|
# If length is smaller, [..] still works appropriately.
|
189
163
|
clean_search_criteria = clean_search_criteria[0..32]
|
190
|
-
|
191
|
-
clean_search_criteria.prepend('_') unless clean_search_criteria.empty?
|
192
|
-
|
164
|
+
|
165
|
+
clean_search_criteria.prepend('_') unless clean_search_criteria.empty?
|
166
|
+
|
193
167
|
file_ext = @cmd_opts[:ext]
|
194
|
-
|
195
|
-
if file_ext.nil?
|
168
|
+
|
169
|
+
if file_ext.nil?
|
196
170
|
# Try to get from '--out' if it exists.
|
197
|
-
if !@cmd_opts[:out].nil?
|
198
|
-
file_ext = Util.unspace_web_str(File.extname(@cmd_opts[:out])).downcase
|
171
|
+
if !@cmd_opts[:out].nil?
|
172
|
+
file_ext = Util.unspace_web_str(File.extname(@cmd_opts[:out])).downcase
|
199
173
|
file_ext = file_ext.sub(/\A\./,'') # Remove '.'; can't be nil for to_sym()
|
200
|
-
file_ext = file_ext.to_sym
|
201
|
-
|
174
|
+
file_ext = file_ext.to_sym
|
175
|
+
|
202
176
|
file_ext = nil unless SIFT_EXTS.include?(file_ext)
|
203
177
|
end
|
204
|
-
|
205
|
-
file_ext = DEFAULT_SIFT_EXT if file_ext.nil?
|
178
|
+
|
179
|
+
file_ext = DEFAULT_SIFT_EXT if file_ext.nil?
|
206
180
|
@cmd_opts[:ext] = file_ext
|
207
181
|
end
|
208
|
-
|
182
|
+
|
209
183
|
filename = "#{filename}#{clean_search_criteria}.#{file_ext}"
|
210
|
-
|
184
|
+
|
211
185
|
return filename
|
212
186
|
end
|
213
|
-
|
214
|
-
# TODO: This should probably be moved into its own class, into Util, or into Sifter?
|
215
|
-
def parse_sift_datetime(value)
|
216
|
-
value = Util.reduce_space(value).strip() # Don't use unspace_web_str(), want spaces for formats
|
217
|
-
value = value.split('...',2)
|
218
|
-
|
219
|
-
check_empty_opt(:datetime,nil) if value.empty?() # For ''
|
220
|
-
|
221
|
-
# Make a "to" and a "from" date time range.
|
222
|
-
value << value[0].dup() if value.length == 1
|
223
|
-
|
224
|
-
to_day = nil
|
225
|
-
to_hour = 23
|
226
|
-
to_minute = 59
|
227
|
-
to_month = 12
|
228
|
-
to_year = Util::MAX_SANE_YEAR
|
229
|
-
|
230
|
-
value.each_with_index() do |v,i|
|
231
|
-
v = check_empty_opt(:datetime,v) # For '...', '12-25...', or '...12-25'
|
232
|
-
|
233
|
-
has_day = false
|
234
|
-
has_hour = false
|
235
|
-
has_minute = false
|
236
|
-
has_month = false
|
237
|
-
has_year = false
|
238
|
-
|
239
|
-
SIFT_DATETIME_FMTS.each_with_index() do |fmt,i|
|
240
|
-
begin
|
241
|
-
# If don't do this, "%d" values will be parsed using "%d %H".
|
242
|
-
# It seems as though strptime() ignores space.
|
243
|
-
raise ArgumentError if !v.include?(' ') && fmt.include?(' ')
|
244
|
-
|
245
|
-
# If don't do this, "%y" values will be parsed using "%d".
|
246
|
-
raise ArgumentError if fmt == '%d' && v.length > 2
|
247
|
-
|
248
|
-
v = Time.strptime(v,fmt,&Util.method(:guess_year))
|
249
|
-
|
250
|
-
has_day = fmt.include?('%d')
|
251
|
-
has_hour = fmt.include?('%H')
|
252
|
-
has_minute = fmt.include?('%M')
|
253
|
-
has_month = fmt.include?('%m')
|
254
|
-
has_year = fmt.include?('%Y')
|
255
|
-
|
256
|
-
break # No problem; this format worked
|
257
|
-
rescue ArgumentError
|
258
|
-
# Out of formats.
|
259
|
-
raise if i >= (SIFT_DATETIME_FMTS.length - 1)
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
# "From" date time.
|
264
|
-
if i == 0
|
265
|
-
# Set these so that "2012-7-4...7-9" will use the appropriate year
|
266
|
-
# of "2012" for "7-9".
|
267
|
-
to_day = v.day if has_day
|
268
|
-
to_hour = v.hour if has_hour
|
269
|
-
to_minute = v.min if has_minute
|
270
|
-
to_month = v.month if has_month
|
271
|
-
to_year = v.year if has_year
|
272
|
-
|
273
|
-
v = Time.new(
|
274
|
-
has_year ? v.year : Util::MIN_SANE_YEAR,
|
275
|
-
has_month ? v.month : 1,
|
276
|
-
has_day ? v.day : 1,
|
277
|
-
has_hour ? v.hour : 0,
|
278
|
-
has_minute ? v.min : 0
|
279
|
-
)
|
280
|
-
# "To" date time.
|
281
|
-
else
|
282
|
-
to_hour = v.hour if has_hour
|
283
|
-
to_minute = v.min if has_minute
|
284
|
-
to_month = v.month if has_month
|
285
|
-
to_year = v.year if has_year
|
286
|
-
|
287
|
-
if has_day
|
288
|
-
to_day = v.day
|
289
|
-
# Nothing passed from the "from" date time?
|
290
|
-
elsif to_day.nil?()
|
291
|
-
# Last day of month.
|
292
|
-
to_day = Date.new(to_year,to_month,-1).day
|
293
|
-
end
|
294
|
-
|
295
|
-
v = Time.new(to_year,to_month,to_day,to_hour,to_minute)
|
296
|
-
end
|
297
|
-
|
298
|
-
value[i] = v
|
299
|
-
end
|
300
|
-
|
301
|
-
return value
|
302
|
-
end
|
303
|
-
|
187
|
+
|
304
188
|
def run_sift_cmd(type)
|
305
189
|
news_name = nil
|
306
|
-
|
190
|
+
|
307
191
|
case type
|
308
192
|
when :futsuu
|
309
193
|
build_in_file(:in,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
|
310
194
|
build_out_file(:out,default_dir: Sifter::DEFAULT_DIR,
|
311
195
|
default_filename: build_sift_filename(Sifter::DEFAULT_FUTSUU_FILENAME))
|
312
|
-
|
196
|
+
|
313
197
|
news_name = 'Regular'
|
314
198
|
when :yasashii
|
315
199
|
build_in_file(:in,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
|
316
200
|
build_out_file(:out,default_dir: Sifter::DEFAULT_DIR,
|
317
201
|
default_filename: build_sift_filename(Sifter::DEFAULT_YASASHII_FILENAME))
|
318
|
-
|
202
|
+
|
319
203
|
news_name = 'Easy'
|
320
204
|
else
|
321
205
|
raise ArgumentError,"invalid type[#{type}]"
|
322
206
|
end
|
323
|
-
|
207
|
+
|
324
208
|
return unless check_in_file(:in,empty_ok: false)
|
325
209
|
return unless check_out_file(:out)
|
326
|
-
|
210
|
+
|
327
211
|
datetime_filter = @cmd_opts[:datetime]
|
328
212
|
dry_run = @cmd_opts[:dry_run]
|
329
213
|
file_ext = @cmd_opts[:ext]
|
@@ -334,55 +218,55 @@ module CLI
|
|
334
218
|
out_file = @cmd_opts[:out]
|
335
219
|
title_filter = @cmd_opts[:title]
|
336
220
|
url_filter = @cmd_opts[:url]
|
337
|
-
|
221
|
+
|
338
222
|
start_spin("Sifting NHK News Web #{news_name} data")
|
339
|
-
|
223
|
+
|
340
224
|
news = (type == :yasashii) ?
|
341
225
|
YasashiiNews.load_file(in_file,overwrite: no_sha256) :
|
342
226
|
FutsuuNews.load_file(in_file,overwrite: no_sha256)
|
343
|
-
|
227
|
+
|
344
228
|
sifter = Sifter.new(news)
|
345
|
-
|
346
|
-
sifter.filter_by_datetime(datetime_filter) unless datetime_filter.nil?
|
347
|
-
sifter.filter_by_title(title_filter) unless title_filter.nil?
|
348
|
-
sifter.filter_by_url(url_filter) unless url_filter.nil?
|
229
|
+
|
230
|
+
sifter.filter_by_datetime(datetime_filter) unless datetime_filter.nil?
|
231
|
+
sifter.filter_by_title(title_filter) unless title_filter.nil?
|
232
|
+
sifter.filter_by_url(url_filter) unless url_filter.nil?
|
349
233
|
sifter.ignore(:defn) if no_defn
|
350
234
|
sifter.ignore(:eng) if no_eng
|
351
|
-
|
352
|
-
sifter.caption = "NHK News Web #{news_name}".dup
|
353
|
-
|
354
|
-
if !@sift_search_criteria.nil?
|
355
|
-
if [
|
356
|
-
sifter.caption << " — #{Util.escape_html(@sift_search_criteria.to_s
|
235
|
+
|
236
|
+
sifter.caption = "NHK News Web #{news_name}".dup
|
237
|
+
|
238
|
+
if !@sift_search_criteria.nil?
|
239
|
+
if %i[htm html].any?(file_ext)
|
240
|
+
sifter.caption << " — #{Util.escape_html(@sift_search_criteria.to_s)}"
|
357
241
|
else
|
358
242
|
sifter.caption << " -- #{@sift_search_criteria}"
|
359
243
|
end
|
360
244
|
end
|
361
|
-
|
245
|
+
|
362
246
|
case file_ext
|
363
247
|
when :csv
|
364
|
-
sifter.put_csv!
|
248
|
+
sifter.put_csv!
|
365
249
|
when :htm,:html
|
366
|
-
sifter.put_html!
|
250
|
+
sifter.put_html!
|
367
251
|
when :json
|
368
|
-
sifter.put_json!
|
252
|
+
sifter.put_json!
|
369
253
|
when :yaml,:yml
|
370
|
-
sifter.put_yaml!
|
254
|
+
sifter.put_yaml!
|
371
255
|
else
|
372
256
|
raise ArgumentError,"invalid file ext[#{file_ext}]"
|
373
257
|
end
|
374
|
-
|
375
|
-
stop_spin
|
258
|
+
|
259
|
+
stop_spin
|
376
260
|
puts
|
377
|
-
|
261
|
+
|
378
262
|
if dry_run
|
379
|
-
puts sifter.to_s
|
263
|
+
puts sifter.to_s
|
380
264
|
else
|
381
265
|
start_spin('Saving sifted data to file')
|
382
|
-
|
266
|
+
|
383
267
|
sifter.save_file(out_file)
|
384
|
-
|
385
|
-
stop_spin
|
268
|
+
|
269
|
+
stop_spin
|
386
270
|
puts "> #{out_file}"
|
387
271
|
end
|
388
272
|
end
|