nhkore 0.3.3 → 0.3.8

@@ -1,28 +1,17 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


  require 'time'

+ require 'nhkore/datetime_parser'
  require 'nhkore/error'
  require 'nhkore/missingno'
  require 'nhkore/news'
@@ -33,164 +22,166 @@ require 'nhkore/util'
  module NHKore
  module CLI
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  module NewsCmd
  DEFAULT_NEWS_SCRAPE = 1
-
- def build_news_cmd()
+
+ def build_news_cmd
  app = self
-
- @news_cmd = @app_cmd.define_command() do
+
+ @news_cmd = @app_cmd.define_command do
  name 'news'
  usage 'news [OPTIONS] [COMMAND]...'
  aliases :n
  summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"
-
- description <<-EOD
+
+ description <<-DESC
  Scrape NHK News Web (Easy) articles &
  save to folder: #{News::DEFAULT_DIR}
- EOD
-
- option :d,:datetime,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+
+ option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
  date time to use as a fallback in cases when an article doesn't have one;
  format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
- EOD
- value = Time.strptime(value,'%Y-%m-%d %H:%M',&Util.method(:guess_year))
+ DESC
+ value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
  value = Util.jst_time(value)
  value
- end
- option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
+ }
+ option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
  HTML file of article to read instead of URL (for offline testing and/or slow internet;
  see '--no-dict' option)
- EOD
+ DESC
  app.check_empty_opt(:in,value)
- end
- flag :L,:lenient,<<-EOD
+ }
+ flag :L,:lenient,<<-DESC
  leniently (not strict) scrape articles:
  body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
  example URLs that need this flag:
  -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
- EOD
- option :k,:like,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
  text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
  text '00123' -- like '*00123*'
- EOD
- value = Util.strip_web_str(value).downcase()
+ DESC
+ value = Util.strip_web_str(value).downcase
  value
- end
- option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
+ }
+ option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
  'directory/file' of article links to scrape (see '#{App::NAME} search';
  defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
- EOD
+ DESC
  app.check_empty_opt(:links,value)
- end
- flag :M,:missingno,<<-EOD
+ }
+ flag :M,:missingno,<<-DESC
  very rarely an article will not have kana or kanji for a Ruby tag;
  to not raise an error, this will use previously scraped data to fill it in;
  example URL:
  -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
- EOD
- flag :D,:'no-dict',<<-EOD
+ DESC
+ flag :D,:'no-dict',<<-DESC
  do not try to parse the dictionary files for the articles; useful in case of errors trying to load
  the dictionaries (or for offline testing)
- EOD
- flag :H,'no-sha256',<<-EOD
+ DESC
+ flag :H,'no-sha256',<<-DESC
  do not check the SHA-256 of the content to see if an article has already been scraped;
  for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;
  this is useful if 2 articles have the same SHA-256, but different content (unlikely)
- EOD
- option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
  'directory/file' to save words to; if you only specify a directory or a file, it will attach
  the appropriate default directory/file name
  (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
- EOD
+ DESC
  app.check_empty_opt(:out,value)
- end
+ }
  flag :r,:redo,'scrape article links even if they have already been scraped'
  option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
- default: DEFAULT_NEWS_SCRAPE,transform: -> (value) do
- value = value.to_i()
- value = 1 if value < 1
- value
- end
- option nil,:'show-dict',<<-EOD
+ default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
+ value = value.to_i
+ value = 1 if value < 1
+ value
+ }
+ option nil,:'show-dict',<<-DESC
  show dictionary URL and contents for the first article and exit;
  useful for debugging dictionary errors (see '--no-dict' option);
  implies '--dry-run' option
- EOD
- option :u,:url,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
  URL of article to scrape, instead of article links file (see '--links' option)
- EOD
+ DESC
  app.check_empty_opt(:url,value)
- end
-
+ }
+
  run do |opts,args,cmd|
  puts cmd.help
  end
  end
-
- @news_easy_cmd = @news_cmd.define_command() do
+
+ @news_easy_cmd = @news_cmd.define_command do
  name 'easy'
  usage 'easy [OPTIONS] [COMMAND]...'
  aliases :e,:ez
  summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"
-
- description <<-EOD
+
+ description <<-DESC
  Search for NHK News Web Easy (Yasashii) links &
  save to file: #{YasashiiNews::DEFAULT_FILE}
- EOD
-
+ DESC
+
  run do |opts,args,cmd|
  app.refresh_cmd(opts,args,cmd)
  app.run_news_cmd(:yasashii)
  end
  end
-
- @news_regular_cmd = @news_cmd.define_command() do
+
+ @news_regular_cmd = @news_cmd.define_command do
  name 'regular'
  usage 'regular [OPTIONS] [COMMAND]...'
  aliases :r,:reg
  summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"
-
- description <<-EOD
+
+ description <<-DESC
  Search for NHK News Web Regular (Futsuu) links &
  save to file: #{FutsuuNews::DEFAULT_FILE}
- EOD
-
+ DESC
+
  run do |opts,args,cmd|
  app.refresh_cmd(opts,args,cmd)
  app.run_news_cmd(:futsuu)
  end
  end
  end
-
+
  def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil
-
+
  build_in_file(:in)
-
+
  case type
  when :futsuu
- build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
+ build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+ default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
  build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
-
+
  news_name = 'Regular'
  when :yasashii
- build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
+ build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+ default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
  build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
-
+
  news_name = 'Easy'
  else
  raise ArgumentError,"invalid type[#{type}]"
  end
-
+
  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)
-
+
  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
@@ -199,39 +190,39 @@ module CLI
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
- max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?()
+ max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]
-
+
  # Favor in_file option over url option.
- url = in_file.nil?() ? Util.strip_web_str(@cmd_opts[:url].to_s()) : in_file
- url = nil if url.empty?()
-
- if url.nil?()
+ url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
+ url = nil if url.empty?
+
+ if url.nil?
  # Then we must have a links file that exists.
  return unless check_in_file(:links,empty_ok: false)
  end
-
+
  start_spin("Scraping NHK News Web #{news_name} articles")
-
- is_file = !in_file.nil?()
+
+ is_file = !in_file.nil?
  link_count = -1
- links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new()
+ links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  news = nil
  scrape_count = 0
-
+
  if File.exist?(out_file)
  news = (type == :yasashii) ?
  YasashiiNews.load_file(out_file,overwrite: no_sha256) :
  FutsuuNews.load_file(out_file,overwrite: no_sha256)
  else
- news = (type == :yasashii) ? YasashiiNews.new() : FutsuuNews.new()
+ news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
  end
-
+
  @news_article_scraper_kargs = @scraper_kargs.merge({
  datetime: datetime,
  dict: dict,
@@ -242,154 +233,154 @@ module CLI
  @news_dict_scraper_kargs = @scraper_kargs.merge({
  is_file: is_file,
  })
-
- if url.nil?()
+
+ if url.nil?
  # Why store each() and do `links_len` instead of `links-len - 1`?
- #
+ #
  # If links contains 5 entries and you scrape all 5, then the output of
  # update_spin_detail() will end on 4, so all of this complexity is so
  # that update_spin_detail() only needs to be written/updated on one line.
-
- links_each = links.links.values.each()
- links_len = links.length()
-
+
+ links_each = links.links.values.each
+ links_len = links.length
+
  0.upto(links_len) do |i|
  update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
-
+
  break if i >= links_len || scrape_count >= max_scrapes
-
- link = links_each.next()
-
- next if !like.nil?() && !link.url.to_s().downcase().include?(like)
+
+ link = links_each.next
+
+ next if !like.nil? && !link.url.to_s.downcase.include?(like)
  next if !redo_scrapes && scraped_news_article?(news,link)
-
+
  url = link.url
-
+
  if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
  # --show-dict
  url = new_url
  scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
  end
-
+
  # Break on next iteration for update_spin_detail().
  next if (scrape_count += 1) >= max_scrapes
-
- sleep_scraper()
+
+ sleep_scraper
  end
  else
  link = links[url]
-
- if link.nil?()
+
+ if link.nil?
  link = SearchLink.new(url)
  links.add_link(link)
  end
-
+
  scrape_news_article(url,link: link,new_articles: new_articles,news: news)
-
+
  scrape_count += 1
  end
-
- stop_spin()
+
+ stop_spin
  puts
-
+
  if scrape_count <= 0
  puts 'Nothing scraped!'
-
+
  if !dry_run && !show_dict
  puts
  start_spin('Saving updated links to file')
-
+
  links.save_file(links_file)
-
- stop_spin()
+
+ stop_spin
  puts "> #{links_file}"
  end
  else
  puts 'Last URL scraped:'
  puts "> #{url}"
  puts
-
+
  if show_dict
  puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
  elsif dry_run
  if new_articles.length < 1
- raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " +
- "internal code is broken"
+ raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
+ ' internal code is broken'
  elsif new_articles.length == 1
  puts new_articles.first
  else
  # Don't show the words (mini), too verbose for more than 1.
- new_articles.each() do |article|
+ new_articles.each do |article|
  puts article.to_s(mini: true)
  end
  end
  else
  start_spin('Saving scraped data to files')
-
+
  links.save_file(links_file)
  news.save_file(out_file)
-
- stop_spin()
+
+ stop_spin
  puts "> #{out_file}"
  puts "> #{links_file}"
  end
  end
  end
-
+
  def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]
-
+
  if show_dict
  scraper = DictScraper.new(url,**@news_dict_scraper_kargs)
-
- @cmd_opts[:show_dict] = scraper.scrape().to_s()
-
+
+ @cmd_opts[:show_dict] = scraper.scrape.to_s
+
  return scraper.url
  end
-
+
  scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
- article = scraper.scrape()
-
+ article = scraper.scrape
+
  # run_news_cmd() handles overwriting with --redo or not
  # using scraped_news_article?().
  news.add_article(article,overwrite: true)
-
+
  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)
-
+
  new_articles << article
-
+
  return false # No --show-dict
  end
-
+
  def scraped_news_article?(news,link)
- return true if link.scraped?()
-
+ return true if link.scraped?
+
  no_sha256 = @cmd_opts[:no_sha256]
-
+
  article = news.article(link.url)
-
- if !no_sha256 && article.nil?()
+
+ if !no_sha256 && article.nil?
  if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
  article = news.article_with_sha256(link.sha256)
  end
-
- if article.nil?()
+
+ if article.nil?
  scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
-
- sha256 = scraper.scrape_sha256_only()
-
+
+ sha256 = scraper.scrape_sha256_only
+
  article = news.article_with_sha256(sha256) if news.sha256?(sha256)
  end
  end
-
+
  if article
  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)
-
+
  return true
  end
-
+
  return false
  end
  end
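
For context, a minimal usage sketch of the `news` command defined above. The executable name (`nhkore`) and these exact shell invocations are assumptions for illustration only; the subcommands and flags themselves come straight from the option/flag definitions in this diff.

    # Scrape up to 5 unscraped NHK News Web Easy (Yasashii) article links,
    # with a fallback date time for articles that lack one:
    $ nhkore news easy --scrape 5 --datetime '2020-03-30 15:30'

    # Scrape a single article by URL, leniently (content without the proper HTML/CSS classes/IDs):
    $ nhkore news easy --lenient --url 'https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html'

    # Show the first article's dictionary URL and contents (implies --dry-run):
    $ nhkore news regular --show-dict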