nhkore 0.3.4 → 0.3.9

@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -34,164 +22,166 @@ require 'nhkore/util'
  module NHKore
  module CLI
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  module NewsCmd
  DEFAULT_NEWS_SCRAPE = 1
-
- def build_news_cmd()
+
+ def build_news_cmd
  app = self
-
- @news_cmd = @app_cmd.define_command() do
+
+ @news_cmd = @app_cmd.define_command do
  name 'news'
  usage 'news [OPTIONS] [COMMAND]...'
  aliases :n
  summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"
-
- description <<-EOD
+
+ description <<-DESC
  Scrape NHK News Web (Easy) articles &
  save to folder: #{News::DEFAULT_DIR}
- EOD
-
- option :d,:datetime,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+
+ option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
  date time to use as a fallback in cases when an article doesn't have one;
  format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
- EOD
+ DESC
  value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
  value = Util.jst_time(value)
  value
- end
- option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
+ }
+ option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
  HTML file of article to read instead of URL (for offline testing and/or slow internet;
  see '--no-dict' option)
- EOD
+ DESC
  app.check_empty_opt(:in,value)
- end
- flag :L,:lenient,<<-EOD
+ }
+ flag :L,:lenient,<<-DESC
  leniently (not strict) scrape articles:
  body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
  example URLs that need this flag:
  -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
- EOD
- option :k,:like,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
  text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
  text '00123' -- like '*00123*'
- EOD
- value = Util.strip_web_str(value).downcase()
+ DESC
+ value = Util.strip_web_str(value).downcase
  value
- end
- option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
+ }
+ option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
  'directory/file' of article links to scrape (see '#{App::NAME} search';
  defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
- EOD
+ DESC
  app.check_empty_opt(:links,value)
- end
- flag :M,:missingno,<<-EOD
+ }
+ flag :M,:missingno,<<-DESC
  very rarely an article will not have kana or kanji for a Ruby tag;
  to not raise an error, this will use previously scraped data to fill it in;
  example URL:
  -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
- EOD
- flag :D,:'no-dict',<<-EOD
+ DESC
+ flag :D,:'no-dict',<<-DESC
  do not try to parse the dictionary files for the articles; useful in case of errors trying to load
  the dictionaries (or for offline testing)
- EOD
- flag :H,'no-sha256',<<-EOD
+ DESC
+ flag :H,'no-sha256',<<-DESC
  do not check the SHA-256 of the content to see if an article has already been scraped;
  for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;
  this is useful if 2 articles have the same SHA-256, but different content (unlikely)
- EOD
- option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
  'directory/file' to save words to; if you only specify a directory or a file, it will attach
  the appropriate default directory/file name
  (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
- EOD
+ DESC
  app.check_empty_opt(:out,value)
- end
+ }
  flag :r,:redo,'scrape article links even if they have already been scraped'
  option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
- default: DEFAULT_NEWS_SCRAPE,transform: -> (value) do
- value = value.to_i()
- value = 1 if value < 1
- value
- end
- option nil,:'show-dict',<<-EOD
+ default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
+ value = value.to_i
+ value = 1 if value < 1
+ value
+ }
+ option nil,:'show-dict',<<-DESC
  show dictionary URL and contents for the first article and exit;
  useful for debugging dictionary errors (see '--no-dict' option);
  implies '--dry-run' option
- EOD
- option :u,:url,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
  URL of article to scrape, instead of article links file (see '--links' option)
- EOD
+ DESC
  app.check_empty_opt(:url,value)
- end
-
+ }
+
  run do |opts,args,cmd|
  puts cmd.help
  end
  end
-
- @news_easy_cmd = @news_cmd.define_command() do
+
+ @news_easy_cmd = @news_cmd.define_command do
  name 'easy'
  usage 'easy [OPTIONS] [COMMAND]...'
  aliases :e,:ez
  summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"
-
- description <<-EOD
+
+ description <<-DESC
  Search for NHK News Web Easy (Yasashii) links &
  save to file: #{YasashiiNews::DEFAULT_FILE}
- EOD
-
+ DESC
+
  run do |opts,args,cmd|
  app.refresh_cmd(opts,args,cmd)
  app.run_news_cmd(:yasashii)
  end
  end
-
- @news_regular_cmd = @news_cmd.define_command() do
+
+ @news_regular_cmd = @news_cmd.define_command do
  name 'regular'
  usage 'regular [OPTIONS] [COMMAND]...'
  aliases :r,:reg
  summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"
-
- description <<-EOD
+
+ description <<-DESC
  Search for NHK News Web Regular (Futsuu) links &
  save to file: #{FutsuuNews::DEFAULT_FILE}
- EOD
-
+ DESC
+
  run do |opts,args,cmd|
  app.refresh_cmd(opts,args,cmd)
  app.run_news_cmd(:futsuu)
  end
  end
  end
-
+
  def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil
-
+
  build_in_file(:in)
-
+
  case type
  when :futsuu
- build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
+ build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+ default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
  build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
-
+
  news_name = 'Regular'
  when :yasashii
- build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
+ build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+ default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
  build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
-
+
  news_name = 'Easy'
  else
  raise ArgumentError,"invalid type[#{type}]"
  end
-
+
  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)
-
+
  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
@@ -200,39 +190,39 @@ module CLI
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
- max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?()
+ max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]
-
+
  # Favor in_file option over url option.
- url = in_file.nil?() ? Util.strip_web_str(@cmd_opts[:url].to_s()) : in_file
- url = nil if url.empty?()
-
- if url.nil?()
+ url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
+ url = nil if url.empty?
+
+ if url.nil?
  # Then we must have a links file that exists.
  return unless check_in_file(:links,empty_ok: false)
  end
-
+
  start_spin("Scraping NHK News Web #{news_name} articles")
-
- is_file = !in_file.nil?()
+
+ is_file = !in_file.nil?
  link_count = -1
- links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new()
+ links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  news = nil
  scrape_count = 0
-
+
  if File.exist?(out_file)
  news = (type == :yasashii) ?
  YasashiiNews.load_file(out_file,overwrite: no_sha256) :
  FutsuuNews.load_file(out_file,overwrite: no_sha256)
  else
- news = (type == :yasashii) ? YasashiiNews.new() : FutsuuNews.new()
+ news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
  end
-
+
  @news_article_scraper_kargs = @scraper_kargs.merge({
  datetime: datetime,
  dict: dict,
@@ -243,154 +233,154 @@ module CLI
  @news_dict_scraper_kargs = @scraper_kargs.merge({
  is_file: is_file,
  })
-
- if url.nil?()
+
+ if url.nil?
  # Why store each() and do `links_len` instead of `links-len - 1`?
- #
+ #
  # If links contains 5 entries and you scrape all 5, then the output of
  # update_spin_detail() will end on 4, so all of this complexity is so
  # that update_spin_detail() only needs to be written/updated on one line.
-
- links_each = links.links.values.each()
- links_len = links.length()
-
+
+ links_each = links.links.values.each
+ links_len = links.length
+
  0.upto(links_len) do |i|
  update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
-
+
  break if i >= links_len || scrape_count >= max_scrapes
-
- link = links_each.next()
-
- next if !like.nil?() && !link.url.to_s().downcase().include?(like)
+
+ link = links_each.next
+
+ next if !like.nil? && !link.url.to_s.downcase.include?(like)
  next if !redo_scrapes && scraped_news_article?(news,link)
-
+
  url = link.url
-
+
  if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
  # --show-dict
  url = new_url
  scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
  end
-
+
  # Break on next iteration for update_spin_detail().
  next if (scrape_count += 1) >= max_scrapes
-
- sleep_scraper()
+
+ sleep_scraper
  end
  else
  link = links[url]
-
- if link.nil?()
+
+ if link.nil?
  link = SearchLink.new(url)
  links.add_link(link)
  end
-
+
  scrape_news_article(url,link: link,new_articles: new_articles,news: news)
-
+
  scrape_count += 1
  end
-
- stop_spin()
+
+ stop_spin
  puts
-
+
  if scrape_count <= 0
  puts 'Nothing scraped!'
-
+
  if !dry_run && !show_dict
  puts
  start_spin('Saving updated links to file')
-
+
  links.save_file(links_file)
-
- stop_spin()
+
+ stop_spin
  puts "> #{links_file}"
  end
  else
  puts 'Last URL scraped:'
  puts "> #{url}"
  puts
-
+
  if show_dict
  puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
  elsif dry_run
  if new_articles.length < 1
- raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " +
- "internal code is broken"
+ raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
+ ' internal code is broken'
  elsif new_articles.length == 1
  puts new_articles.first
  else
  # Don't show the words (mini), too verbose for more than 1.
- new_articles.each() do |article|
+ new_articles.each do |article|
  puts article.to_s(mini: true)
  end
  end
  else
  start_spin('Saving scraped data to files')
-
+
  links.save_file(links_file)
  news.save_file(out_file)
-
- stop_spin()
+
+ stop_spin
  puts "> #{out_file}"
  puts "> #{links_file}"
  end
  end
  end
-
+
  def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]
-
+
  if show_dict
  scraper = DictScraper.new(url,**@news_dict_scraper_kargs)
-
- @cmd_opts[:show_dict] = scraper.scrape().to_s()
-
+
+ @cmd_opts[:show_dict] = scraper.scrape.to_s
+
  return scraper.url
  end
-
+
  scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
- article = scraper.scrape()
-
+ article = scraper.scrape
+
  # run_news_cmd() handles overwriting with --redo or not
  # using scraped_news_article?().
  news.add_article(article,overwrite: true)
-
+
  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)
-
+
  new_articles << article
-
+
  return false # No --show-dict
  end
-
+
  def scraped_news_article?(news,link)
- return true if link.scraped?()
-
+ return true if link.scraped?
+
  no_sha256 = @cmd_opts[:no_sha256]
-
+
  article = news.article(link.url)
-
- if !no_sha256 && article.nil?()
+
+ if !no_sha256 && article.nil?
  if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
  article = news.article_with_sha256(link.sha256)
  end
-
- if article.nil?()
+
+ if article.nil?
  scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
-
- sha256 = scraper.scrape_sha256_only()
-
+
+ sha256 = scraper.scrape_sha256_only
+
  article = news.article_with_sha256(sha256) if news.sha256?(sha256)
  end
  end
-
+
  if article
  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)
-
+
  return true
  end
-
+
  return false
  end
  end