nhkore 0.3.13 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21663a8ce4850b7f03361289832cfdb6caa5bc64e62af1ff7cd4b91b7fc2329b
4
- data.tar.gz: de45874fc3834c492ea74ad26067504629185db39fff2f42148a24b4be453cc9
3
+ metadata.gz: f23192b04fc6a0c1cf225db4a029e3226346d27c4fe977ee05f1b522c40708bb
4
+ data.tar.gz: 3e71d5ef9eb60327cb9ced89a819de77b5cd8910e755332d5764dc95e8263f71
5
5
  SHA512:
6
- metadata.gz: 0ea9413c534cb11d60764e6dd95473b65e6a76418b28ca69441c18741cd6920992f0289a6d9f5c39ed0124023296992f7a60696099ce056b009cadcf8e863867
7
- data.tar.gz: a223dd3e9416b5487274e3218fe55e6c936019a4faf58343a6c4fc9a3bc69c55805c08f42411337ce672766cd72934c423aff734bce766273e9a5707d356eceb
6
+ metadata.gz: 2cec06bd51e86ddd7b30a052b4078a2932344996bd6d97c4b7d927d0bc8fbcecbd2fb21680957551857d3a152d33409aa7ee8e1dd496a271cb31c63cbe2eb2e9
7
+ data.tar.gz: 186c42412e35567aebe2afee71636fb58a28510bd6ab3e8b65df6adef5373860f25e87be71e5d8499dcf6b6babb0ee3aae796a01f818732bc53ad682e584e701
data/CHANGELOG.md CHANGED
@@ -5,10 +5,26 @@ All notable changes to this project will be documented in this file.
5
5
  Format is based on [Keep a Changelog v1.0.0](https://keepachangelog.com/en/1.0.0),
6
6
  and this project adheres to [Semantic Versioning v2.0.0](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.13...HEAD)
8
+ ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.16...HEAD)
9
9
  -
10
10
 
11
11
 
12
+ ## [v0.3.16] - 2024-08-14
13
+
14
+ ### Fixed
15
+ - Fixed to work with new NHK pages.
16
+ - Updated gems.
17
+
18
+
19
+ ## [v0.3.14] - 2022-07-24
20
+
21
+ ### Added
22
+ - `--loop` option to `search` command so can run web search (search engine) multiple times since this usually doesn't get all results if only do once.
23
+
24
+ ### Fixed
25
+ - Updated gems (`nokogiri`).
26
+
27
+
12
28
  ## [v0.3.13] - 2022-04-27
13
29
 
14
30
  ### Fixed
data/Gemfile.lock CHANGED
@@ -1,19 +1,19 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- nhkore (0.3.13)
4
+ nhkore (0.3.16)
5
5
  attr_bool (~> 0.2)
6
6
  bimyou_segmenter (~> 1.2)
7
7
  cri (~> 2.15)
8
- down (~> 5.3)
9
- highline (~> 2.0)
8
+ down (~> 5.4)
9
+ highline (~> 3.1)
10
10
  http-cookie (~> 1.0)
11
11
  japanese_deinflector (~> 0.0)
12
- nokogiri (~> 1.13)
12
+ nokogiri (~> 1.16)
13
13
  psychgus (~> 1.3)
14
- public_suffix (~> 4.0)
14
+ public_suffix (~> 6.0)
15
15
  rainbow (~> 3.1)
16
- rss (~> 0.2)
16
+ rss (~> 0.3)
17
17
  rubyzip (~> 2.3)
18
18
  tiny_segmenter (~> 0.0)
19
19
  tty-progressbar (~> 0.18)
@@ -22,43 +22,48 @@ PATH
22
22
  GEM
23
23
  remote: https://rubygems.org/
24
24
  specs:
25
- addressable (2.8.0)
26
- public_suffix (>= 2.0.2, < 5.0)
25
+ addressable (2.8.7)
26
+ public_suffix (>= 2.0.2, < 7.0)
27
27
  attr_bool (0.2.2)
28
28
  bimyou_segmenter (1.2.0)
29
- cri (2.15.11)
30
- domain_name (0.5.20190701)
31
- unf (>= 0.0.5, < 1.0.0)
32
- down (5.3.1)
29
+ cri (2.15.12)
30
+ domain_name (0.6.20240107)
31
+ down (5.4.2)
33
32
  addressable (~> 2.8)
34
- highline (2.0.3)
35
- http-cookie (1.0.4)
33
+ highline (3.1.0)
34
+ reline
35
+ http-cookie (1.0.6)
36
36
  domain_name (~> 0.5)
37
+ io-console (0.7.2)
37
38
  japanese_deinflector (0.0.2)
38
- mini_portile2 (2.8.0)
39
- minitest (5.15.0)
40
- nokogiri (1.13.4)
41
- mini_portile2 (~> 2.8.0)
39
+ mini_portile2 (2.8.7)
40
+ minitest (5.25.0)
41
+ nokogiri (1.16.7)
42
+ mini_portile2 (~> 2.8.2)
42
43
  racc (~> 1.4)
43
- psych (4.0.3)
44
+ psych (5.1.2)
44
45
  stringio
45
46
  psychgus (1.3.4)
46
47
  psych (>= 3.0)
47
- public_suffix (4.0.7)
48
- racc (1.6.0)
48
+ public_suffix (6.0.1)
49
+ racc (1.8.1)
49
50
  rainbow (3.1.1)
50
- rake (13.0.6)
51
+ rake (13.2.1)
51
52
  raketeer (0.2.13)
52
53
  rake
53
- rdoc (6.4.0)
54
+ rdoc (6.7.0)
54
55
  psych (>= 4.0.0)
55
- redcarpet (3.5.1)
56
- rexml (3.2.5)
57
- rss (0.2.9)
56
+ redcarpet (3.6.0)
57
+ reline (0.5.9)
58
+ io-console (~> 0.5)
59
+ rexml (3.3.5)
60
+ strscan
61
+ rss (0.3.1)
58
62
  rexml
59
63
  rubyzip (2.3.2)
60
- stringio (3.0.1)
64
+ stringio (3.1.1)
61
65
  strings-ansi (0.2.0)
66
+ strscan (3.1.0)
62
67
  tiny_segmenter (0.0.6)
63
68
  tty-cursor (0.7.1)
64
69
  tty-progressbar (0.18.2)
@@ -66,16 +71,11 @@ GEM
66
71
  tty-cursor (~> 0.7)
67
72
  tty-screen (~> 0.8)
68
73
  unicode-display_width (>= 1.6, < 3.0)
69
- tty-screen (0.8.1)
74
+ tty-screen (0.8.2)
70
75
  tty-spinner (0.9.3)
71
76
  tty-cursor (~> 0.7)
72
- unf (0.1.4)
73
- unf_ext
74
- unf_ext (0.0.8.1)
75
- unicode-display_width (2.1.0)
76
- webrick (1.7.0)
77
- yard (0.9.27)
78
- webrick (~> 1.7.0)
77
+ unicode-display_width (2.5.0)
78
+ yard (0.9.36)
79
79
  yard_ghurt (1.2.1)
80
80
  rake
81
81
  yard
@@ -84,15 +84,15 @@ PLATFORMS
84
84
  ruby
85
85
 
86
86
  DEPENDENCIES
87
- bundler (~> 2.3)
88
- minitest (~> 5.15)
87
+ bundler (~> 2.5)
88
+ minitest (~> 5.25)
89
89
  nhkore!
90
- rake (~> 13.0)
90
+ rake (~> 13.2)
91
91
  raketeer (~> 0.2)
92
- rdoc (~> 6.4)
93
- redcarpet (~> 3.5)
92
+ rdoc (~> 6.7)
93
+ redcarpet (~> 3.6)
94
94
  yard (~> 0.9)
95
95
  yard_ghurt (~> 1.2)
96
96
 
97
97
  BUNDLED WITH
98
- 2.3.12
98
+ 2.5.17
data/README.md CHANGED
@@ -867,12 +867,11 @@ This will update *core/* for you:
867
867
  2. Update *core* package:
868
868
  - `$ bundle exec rake update_core`
869
869
  - `$ bundle exec rake clobber pkg_core`
870
- 3. Create a new tag & release:
870
+ 3. Commit & Push.
871
+ 4. Create a new tag & release:
871
872
  - Note: make sure to add *pkg/nhkore-core.zip*
872
873
  - `$ gh release create v0 pkg/*.gem pkg/*.zip`
873
- - `$ git pull`
874
- 4. Release to *GitHub Packages*:
875
- - With *Raketary*: `$ raketary github_pkg`
874
+ - `$ git pull && git fetch`
876
875
  5. Release to *RubyGems*:
877
876
  - `$ bundle exec rake release`
878
877
 
@@ -885,7 +884,7 @@ Releasing new HTML file for website:
885
884
  [GNU LGPL v3+](LICENSE.txt)
886
885
 
887
886
  > NHKore (<https://github.com/esotericpig/nhkore>)
888
- > Copyright (c) 2020-2021 Jonathan Bradley Whited
887
+ > Copyright (c) 2020-2022 Jonathan Bradley Whited
889
888
  >
890
889
  > NHKore is free software: you can redistribute it and/or modify
891
890
  > it under the terms of the GNU Lesser General Public License as published by
data/Rakefile CHANGED
@@ -56,11 +56,14 @@ task :update_core do |task|
56
56
  cmd = ['ruby','-w','./lib/nhkore.rb','-t','300','-m','10']
57
57
  hl = HighLine.new
58
58
 
59
- next unless sh(*cmd,'se','ez','bing')
59
+ next unless sh(*cmd,'se','--show-count','ez')
60
+ puts
61
+
62
+ next unless sh(*cmd,'se','-l','10','ez','bing')
60
63
  next unless hl.agree(continue_msg)
61
64
  puts
62
65
 
63
- next unless sh(*cmd,'news','-s','500','ez')
66
+ next unless sh(*cmd,'news','-s','1000','ez')
64
67
  next unless hl.agree(continue_msg)
65
68
  puts
66
69
 
@@ -74,7 +77,6 @@ task :update_core do |task|
74
77
  puts
75
78
  end
76
79
 
77
- # @since 0.3.6
78
80
  desc 'Update showcase file for release'
79
81
  task :update_showcase do |task|
80
82
  require 'highline'
data/lib/nhkore/app.rb CHANGED
@@ -3,7 +3,7 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020-2022 Jonathan Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
@@ -27,18 +27,11 @@ require 'nhkore/cli/sift_cmd'
27
27
 
28
28
 
29
29
  module NHKore
30
- ###
31
- # @author Jonathan Bradley Whited
32
- # @since 0.2.0
33
- ###
34
30
  module CLI
35
31
  end
36
32
 
37
33
  ###
38
34
  # For disabling/enabling color output.
39
- #
40
- # @author Jonathan Bradley Whited
41
- # @since 0.2.1
42
35
  ###
43
36
  module CriColorExt
44
37
  @color = true
@@ -52,10 +45,6 @@ module NHKore
52
45
  end
53
46
  end
54
47
 
55
- ###
56
- # @author Jonathan Bradley Whited
57
- # @since 0.2.0
58
- ###
59
48
  class App
60
49
  include CLI::FXCmd
61
50
  include CLI::GetCmd
@@ -537,7 +526,11 @@ module NHKore
537
526
  end
538
527
 
539
528
  def sleep_scraper
540
- sleep(@sleep_time)
529
+ # Do a range to better emulate being a human.
530
+ r = rand(@sleep_time..(@sleep_time + 0.1111))
531
+ s = r.round(3) # Within 1000ms (0.000 - 0.999).
532
+
533
+ sleep(s)
541
534
  end
542
535
 
543
536
  def start_spin(title,detail: '')
@@ -572,10 +565,6 @@ module NHKore
572
565
  end
573
566
  end
574
567
 
575
- ###
576
- # @author Jonathan Bradley Whited
577
- # @since 0.2.0
578
- ###
579
568
  class NoProgressBar
580
569
  MSG = '%{title}... %{percent}%%'
581
570
  PUT_INTERVAL = 100.0 / 6.25
@@ -623,7 +612,7 @@ module NHKore
623
612
 
624
613
  @tokens[:advance] = percent
625
614
 
626
- puts to_s
615
+ puts self
627
616
  end
628
617
 
629
618
  def finish
@@ -631,7 +620,7 @@ module NHKore
631
620
  end
632
621
 
633
622
  def start
634
- puts to_s
623
+ puts self
635
624
  end
636
625
 
637
626
  def to_s
@@ -16,10 +16,6 @@ require 'nhkore/word'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class Article
24
20
  attr_reader :datetime
25
21
  attr_reader :futsuurl
@@ -29,7 +25,7 @@ module NHKore
29
25
  attr_reader :words
30
26
 
31
27
  def initialize
32
- super()
28
+ super
33
29
 
34
30
  @datetime = nil
35
31
  @futsuurl = nil
@@ -101,13 +97,13 @@ module NHKore
101
97
  end
102
98
 
103
99
  def futsuurl=(value)
104
- # Don't store URI, store String.
105
- @futsuurl = value.nil? ? nil : value.to_s
100
+ # Don't store URI, store String or nil.
101
+ @futsuurl = value&.to_s
106
102
  end
107
103
 
108
104
  def url=(value)
109
- # Don't store URI, store String.
110
- @url = value.nil? ? nil : value.to_s
105
+ # Don't store URI, store String or nil.
106
+ @url = value&.to_s
111
107
  end
112
108
 
113
109
  def to_s(mini: false)
@@ -26,10 +26,6 @@ require 'nhkore/word'
26
26
 
27
27
 
28
28
  module NHKore
29
- ###
30
- # @author Jonathan Bradley Whited
31
- # @since 0.2.0
32
- ###
33
29
  class ArticleScraper < Scraper
34
30
  extend AttrBool::Ext
35
31
 
@@ -139,7 +135,13 @@ module NHKore
139
135
  end
140
136
 
141
137
  def parse_dicwin_id(str)
142
- str = str.gsub(/\D+/,'')
138
+ str = str.to_s.strip.downcase
139
+
140
+ if str.start_with?('id-') # 'id-0000'
141
+ str = str.gsub(/\D+/,'')
142
+ else # 'RSHOK-K-003806'
143
+ # Same.
144
+ end
143
145
 
144
146
  return nil if str.empty?
145
147
  return str
@@ -235,8 +237,6 @@ module NHKore
235
237
  # Ignore; try again below.
236
238
  Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
237
239
  end
238
-
239
- return datetime
240
240
  end
241
241
 
242
242
  # Third, try body's id.
@@ -393,7 +393,6 @@ module NHKore
393
393
  return link
394
394
  end
395
395
 
396
- # @since 0.3.8
397
396
  # @see https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
398
397
  def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
399
398
  words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
@@ -489,15 +488,21 @@ module NHKore
489
488
  end
490
489
 
491
490
  def scrape_title(doc,article)
491
+ # Not grabbing `<head><title>` because it doesn't have `<ruby>` tags.
492
+
492
493
  tag = doc.css('h1.article-main__title')
493
494
  tag_name = nil
494
495
 
496
+ if tag.length < 1
497
+ # - https://www3.nhk.or.jp/news/easy/em2024081312029/em2024081312029.html
498
+ tag = doc.css('h1.article-title') # No warning.
499
+ end
500
+
495
501
  if tag.length < 1
496
502
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
497
503
  tag_name = 'h1.article-eq__title'
498
504
  tag = doc.css(tag_name)
499
505
  end
500
-
501
506
  if tag.length < 1 && !@strict
502
507
  # This shouldn't be used except for select sites.
503
508
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
@@ -632,16 +637,12 @@ module NHKore
632
637
  end
633
638
  end
634
639
 
635
- ###
636
- # @author Jonathan Bradley Whited
637
- # @since 0.2.0
638
- ###
639
640
  class ScrapeWordsResult
640
641
  attr_reader :text
641
642
  attr_reader :words
642
643
 
643
644
  def initialize
644
- super()
645
+ super
645
646
 
646
647
  @text = ''.dup
647
648
  @words = []
@@ -14,10 +14,6 @@ require 'nhkore/word'
14
14
 
15
15
 
16
16
  module NHKore
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  class Cleaner
22
18
  def begin_clean(str)
23
19
  return str
@@ -53,10 +49,6 @@ module NHKore
53
49
  end
54
50
  end
55
51
 
56
- ###
57
- # @author Jonathan Bradley Whited
58
- # @since 0.2.0
59
- ###
60
52
  class BasicCleaner < Cleaner
61
53
  def end_clean(str)
62
54
  # This is very simple, as Splitter will split on punctuation,
@@ -70,10 +62,6 @@ module NHKore
70
62
  end
71
63
  end
72
64
 
73
- ###
74
- # @author Jonathan Bradley Whited
75
- # @since 0.2.0
76
- ###
77
65
  class BestCleaner < BasicCleaner
78
66
  end
79
67
  end
@@ -11,10 +11,6 @@
11
11
 
12
12
  module NHKore
13
13
  module CLI
14
- ###
15
- # @author Jonathan Bradley Whited
16
- # @since 0.2.0
17
- ###
18
14
  module FXCmd
19
15
  def build_fx_cmd
20
16
  app = self
@@ -14,10 +14,6 @@ require 'nhkore/util'
14
14
 
15
15
  module NHKore
16
16
  module CLI
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  module GetCmd
22
18
  DEFAULT_GET_CHUNK_SIZE = 4 * 1024
23
19
  DEFAULT_GET_URL_LENGTH = 11_000_000 # Just a generous estimation used as a fallback; may be outdated.
@@ -21,10 +21,6 @@ require 'nhkore/util'
21
21
 
22
22
  module NHKore
23
23
  module CLI
24
- ###
25
- # @author Jonathan Bradley Whited
26
- # @since 0.2.0
27
- ###
28
24
  module NewsCmd
29
25
  DEFAULT_NEWS_SCRAPE = 1
30
26
 
@@ -255,16 +251,20 @@ module CLI
255
251
  next if !redo_scrapes && scraped_news_article?(news,link)
256
252
 
257
253
  url = link.url
254
+ result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
258
255
 
259
- if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
256
+ if result == :scraped
257
+ scrape_count += 1
258
+ elsif result == :unscraped
259
+ next
260
+ else
260
261
  # --show-dict
261
- url = new_url
262
- scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
262
+ url = result
263
+ scrape_count = max_scrapes # Break on next iteration for update_spin_detail().
263
264
  end
264
265
 
265
266
  # Break on next iteration for update_spin_detail().
266
- next if (scrape_count += 1) >= max_scrapes
267
-
267
+ next if scrape_count >= max_scrapes
268
268
  sleep_scraper
269
269
  end
270
270
  else
@@ -275,9 +275,8 @@ module CLI
275
275
  links.add_link(link)
276
276
  end
277
277
 
278
- scrape_news_article(url,link: link,new_articles: new_articles,news: news)
279
-
280
- scrape_count += 1
278
+ result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
279
+ scrape_count += 1 if result != :unscraped
281
280
  end
282
281
 
283
282
  stop_spin
@@ -338,9 +337,17 @@ module CLI
338
337
  return scraper.url
339
338
  end
340
339
 
341
- scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
342
- article = scraper.scrape
340
+ scraper = nil
341
+
342
+ begin
343
+ scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
344
+ rescue Http404Error
345
+ # - https://www3.nhk.or.jp/news/easy/k10014157491000/k10014157491000.html
346
+ Util.warn("Ignoring URL with 404 error: #{url}.")
347
+ return :unscraped
348
+ end
343
349
 
350
+ article = scraper.scrape
344
351
  # run_news_cmd() handles overwriting with --redo or not
345
352
  # using scraped_news_article?().
346
353
  news.add_article(article,overwrite: true)
@@ -350,7 +357,7 @@ module CLI
350
357
 
351
358
  new_articles << article
352
359
 
353
- return false # No --show-dict
360
+ return :scraped # No --show-dict
354
361
  end
355
362
 
356
363
  def scraped_news_article?(news,link)
@@ -366,10 +373,15 @@ module CLI
366
373
  end
367
374
 
368
375
  if article.nil?
369
- scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
376
+ scraper = nil
370
377
 
371
- sha256 = scraper.scrape_sha256_only
378
+ begin
379
+ scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
380
+ rescue Http404Error
381
+ return false
382
+ end
372
383
 
384
+ sha256 = scraper.scrape_sha256_only
373
385
  article = news.article_with_sha256(sha256) if news.sha256?(sha256)
374
386
  end
375
387
  end