nhkore 0.3.13 → 0.3.16

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21663a8ce4850b7f03361289832cfdb6caa5bc64e62af1ff7cd4b91b7fc2329b
4
- data.tar.gz: de45874fc3834c492ea74ad26067504629185db39fff2f42148a24b4be453cc9
3
+ metadata.gz: f23192b04fc6a0c1cf225db4a029e3226346d27c4fe977ee05f1b522c40708bb
4
+ data.tar.gz: 3e71d5ef9eb60327cb9ced89a819de77b5cd8910e755332d5764dc95e8263f71
5
5
  SHA512:
6
- metadata.gz: 0ea9413c534cb11d60764e6dd95473b65e6a76418b28ca69441c18741cd6920992f0289a6d9f5c39ed0124023296992f7a60696099ce056b009cadcf8e863867
7
- data.tar.gz: a223dd3e9416b5487274e3218fe55e6c936019a4faf58343a6c4fc9a3bc69c55805c08f42411337ce672766cd72934c423aff734bce766273e9a5707d356eceb
6
+ metadata.gz: 2cec06bd51e86ddd7b30a052b4078a2932344996bd6d97c4b7d927d0bc8fbcecbd2fb21680957551857d3a152d33409aa7ee8e1dd496a271cb31c63cbe2eb2e9
7
+ data.tar.gz: 186c42412e35567aebe2afee71636fb58a28510bd6ab3e8b65df6adef5373860f25e87be71e5d8499dcf6b6babb0ee3aae796a01f818732bc53ad682e584e701
data/CHANGELOG.md CHANGED
@@ -5,10 +5,26 @@ All notable changes to this project will be documented in this file.
5
5
  Format is based on [Keep a Changelog v1.0.0](https://keepachangelog.com/en/1.0.0),
6
6
  and this project adheres to [Semantic Versioning v2.0.0](https://semver.org/spec/v2.0.0.html).
7
7
 
8
- ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.13...HEAD)
8
+ ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.16...HEAD)
9
9
  -
10
10
 
11
11
 
12
+ ## [v0.3.16] - 2024-08-14
13
+
14
+ ### Fixed
15
+ - Fixed to work with new NHK pages.
16
+ - Updated gems.
17
+
18
+
19
+ ## [v0.3.14] - 2022-07-24
20
+
21
+ ### Added
22
+ - `--loop` option to `search` command so that the web search (search engine) can be run multiple times, since a single run usually doesn't return all results.
23
+
24
+ ### Fixed
25
+ - Updated gems (`nokogiri`).
26
+
27
+
12
28
  ## [v0.3.13] - 2022-04-27
13
29
 
14
30
  ### Fixed
data/Gemfile.lock CHANGED
@@ -1,19 +1,19 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- nhkore (0.3.13)
4
+ nhkore (0.3.16)
5
5
  attr_bool (~> 0.2)
6
6
  bimyou_segmenter (~> 1.2)
7
7
  cri (~> 2.15)
8
- down (~> 5.3)
9
- highline (~> 2.0)
8
+ down (~> 5.4)
9
+ highline (~> 3.1)
10
10
  http-cookie (~> 1.0)
11
11
  japanese_deinflector (~> 0.0)
12
- nokogiri (~> 1.13)
12
+ nokogiri (~> 1.16)
13
13
  psychgus (~> 1.3)
14
- public_suffix (~> 4.0)
14
+ public_suffix (~> 6.0)
15
15
  rainbow (~> 3.1)
16
- rss (~> 0.2)
16
+ rss (~> 0.3)
17
17
  rubyzip (~> 2.3)
18
18
  tiny_segmenter (~> 0.0)
19
19
  tty-progressbar (~> 0.18)
@@ -22,43 +22,48 @@ PATH
22
22
  GEM
23
23
  remote: https://rubygems.org/
24
24
  specs:
25
- addressable (2.8.0)
26
- public_suffix (>= 2.0.2, < 5.0)
25
+ addressable (2.8.7)
26
+ public_suffix (>= 2.0.2, < 7.0)
27
27
  attr_bool (0.2.2)
28
28
  bimyou_segmenter (1.2.0)
29
- cri (2.15.11)
30
- domain_name (0.5.20190701)
31
- unf (>= 0.0.5, < 1.0.0)
32
- down (5.3.1)
29
+ cri (2.15.12)
30
+ domain_name (0.6.20240107)
31
+ down (5.4.2)
33
32
  addressable (~> 2.8)
34
- highline (2.0.3)
35
- http-cookie (1.0.4)
33
+ highline (3.1.0)
34
+ reline
35
+ http-cookie (1.0.6)
36
36
  domain_name (~> 0.5)
37
+ io-console (0.7.2)
37
38
  japanese_deinflector (0.0.2)
38
- mini_portile2 (2.8.0)
39
- minitest (5.15.0)
40
- nokogiri (1.13.4)
41
- mini_portile2 (~> 2.8.0)
39
+ mini_portile2 (2.8.7)
40
+ minitest (5.25.0)
41
+ nokogiri (1.16.7)
42
+ mini_portile2 (~> 2.8.2)
42
43
  racc (~> 1.4)
43
- psych (4.0.3)
44
+ psych (5.1.2)
44
45
  stringio
45
46
  psychgus (1.3.4)
46
47
  psych (>= 3.0)
47
- public_suffix (4.0.7)
48
- racc (1.6.0)
48
+ public_suffix (6.0.1)
49
+ racc (1.8.1)
49
50
  rainbow (3.1.1)
50
- rake (13.0.6)
51
+ rake (13.2.1)
51
52
  raketeer (0.2.13)
52
53
  rake
53
- rdoc (6.4.0)
54
+ rdoc (6.7.0)
54
55
  psych (>= 4.0.0)
55
- redcarpet (3.5.1)
56
- rexml (3.2.5)
57
- rss (0.2.9)
56
+ redcarpet (3.6.0)
57
+ reline (0.5.9)
58
+ io-console (~> 0.5)
59
+ rexml (3.3.5)
60
+ strscan
61
+ rss (0.3.1)
58
62
  rexml
59
63
  rubyzip (2.3.2)
60
- stringio (3.0.1)
64
+ stringio (3.1.1)
61
65
  strings-ansi (0.2.0)
66
+ strscan (3.1.0)
62
67
  tiny_segmenter (0.0.6)
63
68
  tty-cursor (0.7.1)
64
69
  tty-progressbar (0.18.2)
@@ -66,16 +71,11 @@ GEM
66
71
  tty-cursor (~> 0.7)
67
72
  tty-screen (~> 0.8)
68
73
  unicode-display_width (>= 1.6, < 3.0)
69
- tty-screen (0.8.1)
74
+ tty-screen (0.8.2)
70
75
  tty-spinner (0.9.3)
71
76
  tty-cursor (~> 0.7)
72
- unf (0.1.4)
73
- unf_ext
74
- unf_ext (0.0.8.1)
75
- unicode-display_width (2.1.0)
76
- webrick (1.7.0)
77
- yard (0.9.27)
78
- webrick (~> 1.7.0)
77
+ unicode-display_width (2.5.0)
78
+ yard (0.9.36)
79
79
  yard_ghurt (1.2.1)
80
80
  rake
81
81
  yard
@@ -84,15 +84,15 @@ PLATFORMS
84
84
  ruby
85
85
 
86
86
  DEPENDENCIES
87
- bundler (~> 2.3)
88
- minitest (~> 5.15)
87
+ bundler (~> 2.5)
88
+ minitest (~> 5.25)
89
89
  nhkore!
90
- rake (~> 13.0)
90
+ rake (~> 13.2)
91
91
  raketeer (~> 0.2)
92
- rdoc (~> 6.4)
93
- redcarpet (~> 3.5)
92
+ rdoc (~> 6.7)
93
+ redcarpet (~> 3.6)
94
94
  yard (~> 0.9)
95
95
  yard_ghurt (~> 1.2)
96
96
 
97
97
  BUNDLED WITH
98
- 2.3.12
98
+ 2.5.17
data/README.md CHANGED
@@ -867,12 +867,11 @@ This will update *core/* for you:
867
867
  2. Update *core* package:
868
868
  - `$ bundle exec rake update_core`
869
869
  - `$ bundle exec rake clobber pkg_core`
870
- 3. Create a new tag & release:
870
+ 3. Commit & Push.
871
+ 4. Create a new tag & release:
871
872
  - Note: make sure to add *pkg/nhkore-core.zip*
872
873
  - `$ gh release create v0 pkg/*.gem pkg/*.zip`
873
- - `$ git pull`
874
- 4. Release to *GitHub Packages*:
875
- - With *Raketary*: `$ raketary github_pkg`
874
+ - `$ git pull && git fetch`
876
875
  5. Release to *RubyGems*:
877
876
  - `$ bundle exec rake release`
878
877
 
@@ -885,7 +884,7 @@ Releasing new HTML file for website:
885
884
  [GNU LGPL v3+](LICENSE.txt)
886
885
 
887
886
  > NHKore (<https://github.com/esotericpig/nhkore>)
888
- > Copyright (c) 2020-2021 Jonathan Bradley Whited
887
+ > Copyright (c) 2020-2022 Jonathan Bradley Whited
889
888
  >
890
889
  > NHKore is free software: you can redistribute it and/or modify
891
890
  > it under the terms of the GNU Lesser General Public License as published by
data/Rakefile CHANGED
@@ -56,11 +56,14 @@ task :update_core do |task|
56
56
  cmd = ['ruby','-w','./lib/nhkore.rb','-t','300','-m','10']
57
57
  hl = HighLine.new
58
58
 
59
- next unless sh(*cmd,'se','ez','bing')
59
+ next unless sh(*cmd,'se','--show-count','ez')
60
+ puts
61
+
62
+ next unless sh(*cmd,'se','-l','10','ez','bing')
60
63
  next unless hl.agree(continue_msg)
61
64
  puts
62
65
 
63
- next unless sh(*cmd,'news','-s','500','ez')
66
+ next unless sh(*cmd,'news','-s','1000','ez')
64
67
  next unless hl.agree(continue_msg)
65
68
  puts
66
69
 
@@ -74,7 +77,6 @@ task :update_core do |task|
74
77
  puts
75
78
  end
76
79
 
77
- # @since 0.3.6
78
80
  desc 'Update showcase file for release'
79
81
  task :update_showcase do |task|
80
82
  require 'highline'
data/lib/nhkore/app.rb CHANGED
@@ -3,7 +3,7 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020-2022 Jonathan Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
@@ -27,18 +27,11 @@ require 'nhkore/cli/sift_cmd'
27
27
 
28
28
 
29
29
  module NHKore
30
- ###
31
- # @author Jonathan Bradley Whited
32
- # @since 0.2.0
33
- ###
34
30
  module CLI
35
31
  end
36
32
 
37
33
  ###
38
34
  # For disabling/enabling color output.
39
- #
40
- # @author Jonathan Bradley Whited
41
- # @since 0.2.1
42
35
  ###
43
36
  module CriColorExt
44
37
  @color = true
@@ -52,10 +45,6 @@ module NHKore
52
45
  end
53
46
  end
54
47
 
55
- ###
56
- # @author Jonathan Bradley Whited
57
- # @since 0.2.0
58
- ###
59
48
  class App
60
49
  include CLI::FXCmd
61
50
  include CLI::GetCmd
@@ -537,7 +526,11 @@ module NHKore
537
526
  end
538
527
 
539
528
  def sleep_scraper
540
- sleep(@sleep_time)
529
+ # Do a range to better emulate being a human.
530
+ r = rand(@sleep_time..(@sleep_time + 0.1111))
531
+ s = r.round(3) # Within 1000ms (0.000 - 0.999).
532
+
533
+ sleep(s)
541
534
  end
542
535
 
543
536
  def start_spin(title,detail: '')
@@ -572,10 +565,6 @@ module NHKore
572
565
  end
573
566
  end
574
567
 
575
- ###
576
- # @author Jonathan Bradley Whited
577
- # @since 0.2.0
578
- ###
579
568
  class NoProgressBar
580
569
  MSG = '%{title}... %{percent}%%'
581
570
  PUT_INTERVAL = 100.0 / 6.25
@@ -623,7 +612,7 @@ module NHKore
623
612
 
624
613
  @tokens[:advance] = percent
625
614
 
626
- puts to_s
615
+ puts self
627
616
  end
628
617
 
629
618
  def finish
@@ -631,7 +620,7 @@ module NHKore
631
620
  end
632
621
 
633
622
  def start
634
- puts to_s
623
+ puts self
635
624
  end
636
625
 
637
626
  def to_s
@@ -16,10 +16,6 @@ require 'nhkore/word'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class Article
24
20
  attr_reader :datetime
25
21
  attr_reader :futsuurl
@@ -29,7 +25,7 @@ module NHKore
29
25
  attr_reader :words
30
26
 
31
27
  def initialize
32
- super()
28
+ super
33
29
 
34
30
  @datetime = nil
35
31
  @futsuurl = nil
@@ -101,13 +97,13 @@ module NHKore
101
97
  end
102
98
 
103
99
  def futsuurl=(value)
104
- # Don't store URI, store String.
105
- @futsuurl = value.nil? ? nil : value.to_s
100
+ # Don't store URI, store String or nil.
101
+ @futsuurl = value&.to_s
106
102
  end
107
103
 
108
104
  def url=(value)
109
- # Don't store URI, store String.
110
- @url = value.nil? ? nil : value.to_s
105
+ # Don't store URI, store String or nil.
106
+ @url = value&.to_s
111
107
  end
112
108
 
113
109
  def to_s(mini: false)
@@ -26,10 +26,6 @@ require 'nhkore/word'
26
26
 
27
27
 
28
28
  module NHKore
29
- ###
30
- # @author Jonathan Bradley Whited
31
- # @since 0.2.0
32
- ###
33
29
  class ArticleScraper < Scraper
34
30
  extend AttrBool::Ext
35
31
 
@@ -139,7 +135,13 @@ module NHKore
139
135
  end
140
136
 
141
137
  def parse_dicwin_id(str)
142
- str = str.gsub(/\D+/,'')
138
+ str = str.to_s.strip.downcase
139
+
140
+ if str.start_with?('id-') # 'id-0000'
141
+ str = str.gsub(/\D+/,'')
142
+ else # 'RSHOK-K-003806'
143
+ # Same.
144
+ end
143
145
 
144
146
  return nil if str.empty?
145
147
  return str
@@ -235,8 +237,6 @@ module NHKore
235
237
  # Ignore; try again below.
236
238
  Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
237
239
  end
238
-
239
- return datetime
240
240
  end
241
241
 
242
242
  # Third, try body's id.
@@ -393,7 +393,6 @@ module NHKore
393
393
  return link
394
394
  end
395
395
 
396
- # @since 0.3.8
397
396
  # @see https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
398
397
  def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
399
398
  words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
@@ -489,15 +488,21 @@ module NHKore
489
488
  end
490
489
 
491
490
  def scrape_title(doc,article)
491
+ # Not grabbing `<head><title>` because it doesn't have `<ruby>` tags.
492
+
492
493
  tag = doc.css('h1.article-main__title')
493
494
  tag_name = nil
494
495
 
496
+ if tag.length < 1
497
+ # - https://www3.nhk.or.jp/news/easy/em2024081312029/em2024081312029.html
498
+ tag = doc.css('h1.article-title') # No warning.
499
+ end
500
+
495
501
  if tag.length < 1
496
502
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
497
503
  tag_name = 'h1.article-eq__title'
498
504
  tag = doc.css(tag_name)
499
505
  end
500
-
501
506
  if tag.length < 1 && !@strict
502
507
  # This shouldn't be used except for select sites.
503
508
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
@@ -632,16 +637,12 @@ module NHKore
632
637
  end
633
638
  end
634
639
 
635
- ###
636
- # @author Jonathan Bradley Whited
637
- # @since 0.2.0
638
- ###
639
640
  class ScrapeWordsResult
640
641
  attr_reader :text
641
642
  attr_reader :words
642
643
 
643
644
  def initialize
644
- super()
645
+ super
645
646
 
646
647
  @text = ''.dup
647
648
  @words = []
@@ -14,10 +14,6 @@ require 'nhkore/word'
14
14
 
15
15
 
16
16
  module NHKore
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  class Cleaner
22
18
  def begin_clean(str)
23
19
  return str
@@ -53,10 +49,6 @@ module NHKore
53
49
  end
54
50
  end
55
51
 
56
- ###
57
- # @author Jonathan Bradley Whited
58
- # @since 0.2.0
59
- ###
60
52
  class BasicCleaner < Cleaner
61
53
  def end_clean(str)
62
54
  # This is very simple, as Splitter will split on punctuation,
@@ -70,10 +62,6 @@ module NHKore
70
62
  end
71
63
  end
72
64
 
73
- ###
74
- # @author Jonathan Bradley Whited
75
- # @since 0.2.0
76
- ###
77
65
  class BestCleaner < BasicCleaner
78
66
  end
79
67
  end
@@ -11,10 +11,6 @@
11
11
 
12
12
  module NHKore
13
13
  module CLI
14
- ###
15
- # @author Jonathan Bradley Whited
16
- # @since 0.2.0
17
- ###
18
14
  module FXCmd
19
15
  def build_fx_cmd
20
16
  app = self
@@ -14,10 +14,6 @@ require 'nhkore/util'
14
14
 
15
15
  module NHKore
16
16
  module CLI
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  module GetCmd
22
18
  DEFAULT_GET_CHUNK_SIZE = 4 * 1024
23
19
  DEFAULT_GET_URL_LENGTH = 11_000_000 # Just a generous estimation used as a fallback; may be outdated.
@@ -21,10 +21,6 @@ require 'nhkore/util'
21
21
 
22
22
  module NHKore
23
23
  module CLI
24
- ###
25
- # @author Jonathan Bradley Whited
26
- # @since 0.2.0
27
- ###
28
24
  module NewsCmd
29
25
  DEFAULT_NEWS_SCRAPE = 1
30
26
 
@@ -255,16 +251,20 @@ module CLI
255
251
  next if !redo_scrapes && scraped_news_article?(news,link)
256
252
 
257
253
  url = link.url
254
+ result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
258
255
 
259
- if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
256
+ if result == :scraped
257
+ scrape_count += 1
258
+ elsif result == :unscraped
259
+ next
260
+ else
260
261
  # --show-dict
261
- url = new_url
262
- scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
262
+ url = result
263
+ scrape_count = max_scrapes # Break on next iteration for update_spin_detail().
263
264
  end
264
265
 
265
266
  # Break on next iteration for update_spin_detail().
266
- next if (scrape_count += 1) >= max_scrapes
267
-
267
+ next if scrape_count >= max_scrapes
268
268
  sleep_scraper
269
269
  end
270
270
  else
@@ -275,9 +275,8 @@ module CLI
275
275
  links.add_link(link)
276
276
  end
277
277
 
278
- scrape_news_article(url,link: link,new_articles: new_articles,news: news)
279
-
280
- scrape_count += 1
278
+ result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
279
+ scrape_count += 1 if result != :unscraped
281
280
  end
282
281
 
283
282
  stop_spin
@@ -338,9 +337,17 @@ module CLI
338
337
  return scraper.url
339
338
  end
340
339
 
341
- scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
342
- article = scraper.scrape
340
+ scraper = nil
341
+
342
+ begin
343
+ scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
344
+ rescue Http404Error
345
+ # - https://www3.nhk.or.jp/news/easy/k10014157491000/k10014157491000.html
346
+ Util.warn("Ignoring URL with 404 error: #{url}.")
347
+ return :unscraped
348
+ end
343
349
 
350
+ article = scraper.scrape
344
351
  # run_news_cmd() handles overwriting with --redo or not
345
352
  # using scraped_news_article?().
346
353
  news.add_article(article,overwrite: true)
@@ -350,7 +357,7 @@ module CLI
350
357
 
351
358
  new_articles << article
352
359
 
353
- return false # No --show-dict
360
+ return :scraped # No --show-dict
354
361
  end
355
362
 
356
363
  def scraped_news_article?(news,link)
@@ -366,10 +373,15 @@ module CLI
366
373
  end
367
374
 
368
375
  if article.nil?
369
- scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
376
+ scraper = nil
370
377
 
371
- sha256 = scraper.scrape_sha256_only
378
+ begin
379
+ scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
380
+ rescue Http404Error
381
+ return false
382
+ end
372
383
 
384
+ sha256 = scraper.scrape_sha256_only
373
385
  article = news.article_with_sha256(sha256) if news.sha256?(sha256)
374
386
  end
375
387
  end