nhkore 0.3.17 → 0.3.19

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. The information is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -3,12 +3,11 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
12
11
  require 'attr_bool'
13
12
  require 'digest'
14
13
 
@@ -24,7 +23,6 @@ require 'nhkore/util'
24
23
  require 'nhkore/variator'
25
24
  require 'nhkore/word'
26
25
 
27
-
28
26
  module NHKore
29
27
  class ArticleScraper < Scraper
30
28
  extend AttrBool::Ext
@@ -47,8 +45,8 @@ module NHKore
47
45
  # instead of raising an error
48
46
  # @param strict [true,false]
49
47
  def initialize(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil,
50
- polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,
51
- variators: [BestVariator.new],year: nil,**kargs)
48
+ polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,
49
+ variators: [BestVariator.new],year: nil,**kargs)
52
50
  super(url,**kargs)
53
51
 
54
52
  @cleaners = Array(cleaners)
@@ -179,13 +177,13 @@ module NHKore
179
177
 
180
178
  def scrape_content(doc,article)
181
179
  tag = doc.css('div#js-article-body')
182
- tag = doc.css('div.article-main__body') if tag.length < 1
183
- tag = doc.css('div.article-body') if tag.length < 1
180
+ tag = doc.css('div.article-main__body') if tag.empty?
181
+ tag = doc.css('div.article-body') if tag.empty?
184
182
 
185
183
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
186
- tag = doc.css('div#main') if tag.length < 1 && !@strict
184
+ tag = doc.css('div#main') if tag.empty? && !@strict
187
185
 
188
- if tag.length > 0
186
+ if !tag.empty?
189
187
  text = Util.unspace_web_str(tag.text.to_s)
190
188
 
191
189
  if !text.empty?
@@ -202,14 +200,14 @@ module NHKore
202
200
  raise ScrapeError,"could not scrape content at URL[#{@url}]"
203
201
  end
204
202
 
205
- def scrape_datetime(doc,futsuurl=nil)
203
+ def scrape_datetime(doc,futsuurl = nil)
206
204
  year = scrape_year(doc,futsuurl)
207
205
 
208
206
  # First, try with the id.
209
207
  tag_name = 'p#js-article-date'
210
208
  tag = doc.css(tag_name)
211
209
 
212
- if tag.length > 0
210
+ if !tag.empty?
213
211
  tag_text = tag[0].text
214
212
 
215
213
  begin
@@ -226,7 +224,7 @@ module NHKore
226
224
  tag_name = 'p.article-main__date'
227
225
  tag = doc.css(tag_name)
228
226
 
229
- if tag.length > 0
227
+ if !tag.empty?
230
228
  tag_text = tag[0].text
231
229
 
232
230
  begin
@@ -244,10 +242,10 @@ module NHKore
244
242
  # - 'news20170331_k10010922481000'
245
243
  tag = doc.css('body')
246
244
 
247
- if tag.length > 0
245
+ if !tag.empty?
248
246
  tag_id = tag[0]['id'].to_s.split('_',2)
249
247
 
250
- if tag_id.length > 0
248
+ if !tag_id.empty?
251
249
  tag_id = tag_id[0].gsub(/[^[[:digit:]]]+/,'')
252
250
 
253
251
  if tag_id.length == 8
@@ -272,8 +270,8 @@ module NHKore
272
270
 
273
271
  begin
274
272
  scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
275
- rescue OpenURI::HTTPError => e
276
- if retries == 0 && e.to_s.include?('404')
273
+ rescue Http404Error => e
274
+ if retries == 0
277
275
  read
278
276
 
279
277
  scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
@@ -283,7 +281,10 @@ module NHKore
283
281
 
284
282
  retry
285
283
  else
286
- raise e.exception("could not scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
284
+ # raise e.exception("failed to scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
285
+ Util.warn("failed to scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
286
+ @dict = nil
287
+ return
287
288
  end
288
289
  end
289
290
 
@@ -297,7 +298,7 @@ module NHKore
297
298
  # - 'news20170331_k10010922481000'
298
299
  tag = doc.css('body')
299
300
 
300
- if tag.length > 0
301
+ if !tag.empty?
301
302
  tag_id = tag[0]['id'].to_s.split('_',2)
302
303
 
303
304
  if tag_id.length == 2
@@ -360,7 +361,7 @@ module NHKore
360
361
  # First, try with the id.
361
362
  tag = doc.css('div#js-regular-news-wrapper')
362
363
 
363
- if tag.length > 0
364
+ if !tag.empty?
364
365
  link = scrape_link(tag[0])
365
366
 
366
367
  return link unless link.nil?
@@ -369,7 +370,7 @@ module NHKore
369
370
  # Second, try with the class.
370
371
  tag = doc.css('div.link-to-normal')
371
372
 
372
- if tag.length > 0
373
+ if !tag.empty?
373
374
  link = scrape_link(tag[0])
374
375
 
375
376
  return link unless link.nil?
@@ -385,7 +386,7 @@ module NHKore
385
386
  def scrape_link(tag)
386
387
  link = tag.css('a')
387
388
 
388
- return nil if link.length < 1
389
+ return nil if link.empty?
389
390
 
390
391
  link = Util.unspace_web_str(link[0]['href'].to_s)
391
392
 
@@ -493,24 +494,24 @@ module NHKore
493
494
  tag = doc.css('h1.article-main__title')
494
495
  tag_name = nil
495
496
 
496
- if tag.length < 1
497
+ if tag.empty?
497
498
  # - https://www3.nhk.or.jp/news/easy/em2024081312029/em2024081312029.html
498
499
  tag = doc.css('h1.article-title') # No warning.
499
500
  end
500
501
 
501
- if tag.length < 1
502
+ if tag.empty?
502
503
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
503
504
  tag_name = 'h1.article-eq__title'
504
505
  tag = doc.css(tag_name)
505
506
  end
506
- if tag.length < 1 && !@strict
507
+ if tag.empty? && !@strict
507
508
  # This shouldn't be used except for select sites.
508
509
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
509
510
  tag_name = 'div#main h2'
510
511
  tag = doc.css(tag_name)
511
512
  end
512
513
 
513
- if tag.length > 0
514
+ if !tag.empty?
514
515
  Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?
515
516
 
516
517
  result = scrape_and_add_words(tag,article)
@@ -548,8 +549,8 @@ module NHKore
548
549
 
549
550
  if klass == 'dicwin' && !id.nil?
550
551
  if dicwin
551
- raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at" \
552
- " URL[#{@url}]"
552
+ raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at " \
553
+ "URL[#{@url}]"
553
554
  end
554
555
 
555
556
  dicwin_id = id
@@ -582,11 +583,11 @@ module NHKore
582
583
  return result
583
584
  end
584
585
 
585
- def scrape_year(doc,futsuurl=nil)
586
+ def scrape_year(doc,futsuurl = nil)
586
587
  # First, try body's id.
587
588
  tag = doc.css('body')
588
589
 
589
- if tag.length > 0
590
+ if !tag.empty?
590
591
  tag_id = tag[0]['id'].to_s.gsub(/[^[[:digit:]]]+/,'')
591
592
 
592
593
  if tag_id.length >= 4
@@ -3,16 +3,14 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
12
11
  require 'nhkore/util'
13
12
  require 'nhkore/word'
14
13
 
15
-
16
14
  module NHKore
17
15
  class Cleaner
18
16
  def begin_clean(str)
@@ -3,12 +3,11 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
12
11
  module NHKore
13
12
  module CLI
14
13
  module FXCmd
@@ -40,13 +39,11 @@ module CLI
40
39
  end
41
40
 
42
41
  def test_fx_progress_bar
43
- bars = nil
44
-
45
- if @cmd_opts[:all]
46
- bars = {default: :default,classic: :classic,no: :no}
47
- else
48
- bars = {user: @progress_bar}
49
- end
42
+ bars = if @cmd_opts[:all]
43
+ {default: :default,classic: :classic,no: :no}
44
+ else
45
+ {user: @progress_bar}
46
+ end
50
47
 
51
48
  bars.each do |name,bar|
52
49
  name = name.to_s.capitalize
@@ -65,19 +62,17 @@ module CLI
65
62
 
66
63
  def test_fx_spinner
67
64
  app_spinner = @spinner
68
- spinners = nil
69
-
70
- if @cmd_opts[:all]
71
- spinners = {
72
- default: App::DEFAULT_SPINNER,
73
- classic: App::CLASSIC_SPINNER,
74
- no: {},
75
- }
76
- else
77
- spinners = {
78
- user: app_spinner
79
- }
80
- end
65
+ spinners = if @cmd_opts[:all]
66
+ {
67
+ default: App::DEFAULT_SPINNER,
68
+ classic: App::CLASSIC_SPINNER,
69
+ no: {},
70
+ }
71
+ else
72
+ {
73
+ user: app_spinner
74
+ }
75
+ end
81
76
 
82
77
  spinners.each do |name,spinner|
83
78
  @spinner = spinner
@@ -3,22 +3,20 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
12
11
  require 'nhkore/util'
13
12
 
14
-
15
13
  module NHKore
16
14
  module CLI
17
15
  module GetCmd
18
16
  DEFAULT_GET_CHUNK_SIZE = 4 * 1024
19
17
  DEFAULT_GET_URL_LENGTH = 11_000_000 # Just a generous estimation used as a fallback; may be outdated.
20
18
  GET_URL_FILENAME = 'nhkore-core.zip'
21
- GET_URL = "https://github.com/esotericpig/nhkore/releases/latest/download/#{GET_URL_FILENAME}"
19
+ GET_URL = "https://github.com/esotericpig/nhkore/releases/latest/download/#{GET_URL_FILENAME}".freeze
22
20
 
23
21
  def build_get_cmd
24
22
  app = self
@@ -27,8 +25,8 @@ module CLI
27
25
  name 'get'
28
26
  usage 'get [OPTIONS] [COMMAND]...'
29
27
  aliases :g
30
- summary "Download NHKore's pre-scraped files from the latest release" \
31
- " (aliases: #{app.color_alias('g')})"
28
+ summary "Download NHKore's pre-scraped files from the latest release " \
29
+ "(aliases: #{app.color_alias('g')})"
32
30
 
33
31
  description(<<-DESC)
34
32
  Download NHKore's pre-scraped files from the latest release &
@@ -41,7 +39,7 @@ module CLI
41
39
  transform: lambda { |value|
42
40
  app.check_empty_opt(:out,value)
43
41
  }
44
- flag nil,:'show-url','show download URL and exit (for downloading manually)' do |value,cmd|
42
+ flag nil,:'show-url','show download URL and exit (for downloading manually)' do |_value,_cmd|
45
43
  puts GET_URL
46
44
  exit
47
45
  end
@@ -3,12 +3,12 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
11
+ require 'fileutils'
12
12
  require 'time'
13
13
 
14
14
  require 'nhkore/datetime_parser'
@@ -18,7 +18,6 @@ require 'nhkore/news'
18
18
  require 'nhkore/search_link'
19
19
  require 'nhkore/util'
20
20
 
21
-
22
21
  module NHKore
23
22
  module CLI
24
23
  module NewsCmd
@@ -112,7 +111,7 @@ module CLI
112
111
  app.check_empty_opt(:url,value)
113
112
  }
114
113
 
115
- run do |opts,args,cmd|
114
+ run do |_opts,_args,cmd|
116
115
  puts cmd.help
117
116
  end
118
117
  end
@@ -197,10 +196,8 @@ module CLI
197
196
  url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
198
197
  url = nil if url.empty?
199
198
 
200
- if url.nil?
201
- # Then we must have a links file that exists.
202
- return unless check_in_file(:links,empty_ok: false)
203
- end
199
+ # Then we must have a links file that exists.
200
+ return if url.nil? && !check_in_file(:links,empty_ok: false)
204
201
 
205
202
  start_spin("Scraping NHK News Web #{news_name} articles")
206
203
 
@@ -208,16 +205,14 @@ module CLI
208
205
  link_count = -1
209
206
  links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
210
207
  new_articles = [] # For --dry-run
211
- news = nil
212
208
  scrape_count = 0
213
209
 
214
- if File.exist?(out_file)
215
- news = (type == :yasashii) ?
216
- YasashiiNews.load_file(out_file,overwrite: no_sha256) :
217
- FutsuuNews.load_file(out_file,overwrite: no_sha256)
218
- else
219
- news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
220
- end
210
+ news = if File.exist?(out_file)
211
+ (type == :yasashii) ? YasashiiNews.load_file(out_file,overwrite: no_sha256)
212
+ : FutsuuNews.load_file(out_file,overwrite: no_sha256)
213
+ else
214
+ (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
215
+ end
221
216
 
222
217
  @news_article_scraper_kargs = @scraper_kargs.merge({
223
218
  datetime: datetime,
@@ -302,9 +297,9 @@ module CLI
302
297
  if show_dict
303
298
  puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
304
299
  elsif dry_run
305
- if new_articles.length < 1
306
- raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
307
- ' internal code is broken'
300
+ if new_articles.empty?
301
+ raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " \
302
+ 'internal code is broken'
308
303
  elsif new_articles.length == 1
309
304
  puts new_articles.first
310
305
  else
@@ -3,18 +3,16 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2022 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
12
11
  require 'nhkore/error'
13
12
  require 'nhkore/search_link'
14
13
  require 'nhkore/search_scraper'
15
14
  require 'nhkore/util'
16
15
 
17
-
18
16
  module NHKore
19
17
  module CLI
20
18
  module SearchCmd
@@ -29,7 +27,7 @@ module CLI
29
27
 
30
28
  description <<-DESC
31
29
  Search for links (using a Search Engine, etc.) to NHK News Web (Easy) &
32
- save to folder: #{SearchLinks::DEFAULT_DIR}
30
+ save to folder: '#{SearchLinks::DEFAULT_DIR}'
33
31
  DESC
34
32
 
35
33
  option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
@@ -40,7 +38,7 @@ module CLI
40
38
  }
41
39
  option :l,:loop,'number of times to repeat the search to ensure results',argument: :required,
42
40
  transform: lambda { |value|
43
- value = value.to_i
41
+ value = value.to_s.strip.to_i
44
42
  value = 1 if value < 1
45
43
  value
46
44
  }
@@ -68,8 +66,8 @@ module CLI
68
66
  (see '--in' option)
69
67
  DESC
70
68
 
71
- run do |opts,args,cmd|
72
- opts.each do |key,value|
69
+ run do |opts,_args,cmd|
70
+ opts.each do |key,_value|
73
71
  key = key.to_s
74
72
 
75
73
  if key.include?('show')
@@ -176,16 +174,15 @@ module CLI
176
174
  start_spin("Scraping #{search_type}") unless show_count
177
175
 
178
176
  is_file = !in_file.nil?
179
- links = nil
180
177
  new_links = [] # For --dry-run
181
178
  url = in_file # nil will use default URL, else a file
182
179
 
183
- # Load previous links for 'scraped?' vars.
184
- if File.exist?(out_file)
185
- links = SearchLinks.load_file(out_file)
186
- else
187
- links = SearchLinks.new
188
- end
180
+ links = if File.exist?(out_file)
181
+ # Load previous links for 'scraped?' vars.
182
+ SearchLinks.load_file(out_file)
183
+ else
184
+ SearchLinks.new
185
+ end
189
186
 
190
187
  links_count = links.length
191
188
 
@@ -3,12 +3,11 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020 Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
10
10
 
11
-
12
11
  require 'date'
13
12
  require 'time'
14
13
 
@@ -17,13 +16,12 @@ require 'nhkore/news'
17
16
  require 'nhkore/sifter'
18
17
  require 'nhkore/util'
19
18
 
20
-
21
19
  module NHKore
22
20
  module CLI
23
21
  module SiftCmd
24
22
  DEFAULT_SIFT_EXT = :csv
25
- DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}"
26
- DEFAULT_SIFT_YASASHII_FILE = "#{Sifter::DEFAULT_YASASHII_FILE}{search.criteria}{file.ext}"
23
+ DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}".freeze
24
+ DEFAULT_SIFT_YASASHII_FILE = "#{Sifter::DEFAULT_YASASHII_FILE}{search.criteria}{file.ext}".freeze
27
25
  SIFT_EXTS = %i[csv htm html json yaml yml].freeze
28
26
 
29
27
  attr_accessor :sift_datetime_text
@@ -39,8 +37,8 @@ module CLI
39
37
  name 'sift'
40
38
  usage 'sift [OPTIONS] [COMMAND]...'
41
39
  aliases :s
42
- summary 'Sift NHK News Web (Easy) articles data for the frequency of words' \
43
- " (aliases: #{app.color_alias('s')})"
40
+ summary 'Sift NHK News Web (Easy) articles data for the frequency of words ' \
41
+ "(aliases: #{app.color_alias('s')})"
44
42
 
45
43
  description(<<-DESC)
46
44
  Sift NHK News Web (Easy) articles data for the frequency of words &
@@ -93,11 +91,11 @@ module CLI
93
91
  to not fail on "duplicate" articles; see '#{App::NAME} news'
94
92
  DESC
95
93
  option :t,:title,'title to filter on, where search text only needs to be somewhere in the title',
96
- argument: :required
94
+ argument: :required
97
95
  option :u,:url,'URL to filter on, where search text only needs to be somewhere in the URL',
98
- argument: :required
96
+ argument: :required
99
97
 
100
- run do |opts,args,cmd|
98
+ run do |_opts,_args,cmd|
101
99
  puts cmd.help
102
100
  end
103
101
  end
@@ -232,11 +230,11 @@ module CLI
232
230
  sifter.caption = "NHK News Web #{news_name}".dup
233
231
 
234
232
  if !@sift_search_criteria.nil?
235
- if %i[htm html].any?(file_ext)
236
- sifter.caption << " &mdash; #{Util.escape_html(@sift_search_criteria.to_s)}"
237
- else
238
- sifter.caption << " -- #{@sift_search_criteria}"
239
- end
233
+ sifter.caption << if %i[htm html].any?(file_ext)
234
+ " &mdash; #{Util.escape_html(@sift_search_criteria.to_s)}"
235
+ else
236
+ " -- #{@sift_search_criteria}"
237
+ end
240
238
  end
241
239
 
242
240
  case file_ext