nhkore 0.3.3 → 0.3.8

@@ -1,28 +1,17 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


  require 'time'

+ require 'nhkore/datetime_parser'
  require 'nhkore/error'
  require 'nhkore/missingno'
  require 'nhkore/news'
@@ -33,164 +22,166 @@ require 'nhkore/util'
  module NHKore
  module CLI
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  module NewsCmd
  DEFAULT_NEWS_SCRAPE = 1
-
- def build_news_cmd()
+
+ def build_news_cmd
  app = self
-
- @news_cmd = @app_cmd.define_command() do
+
+ @news_cmd = @app_cmd.define_command do
  name 'news'
  usage 'news [OPTIONS] [COMMAND]...'
  aliases :n
  summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"
-
- description <<-EOD
+
+ description <<-DESC
  Scrape NHK News Web (Easy) articles &
  save to folder: #{News::DEFAULT_DIR}
- EOD
-
- option :d,:datetime,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+
+ option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
  date time to use as a fallback in cases when an article doesn't have one;
  format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
- EOD
- value = Time.strptime(value,'%Y-%m-%d %H:%M',&Util.method(:guess_year))
+ DESC
+ value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
  value = Util.jst_time(value)
  value
- end
- option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
+ }
+ option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
  HTML file of article to read instead of URL (for offline testing and/or slow internet;
  see '--no-dict' option)
- EOD
+ DESC
  app.check_empty_opt(:in,value)
- end
- flag :L,:lenient,<<-EOD
+ }
+ flag :L,:lenient,<<-DESC
  leniently (not strict) scrape articles:
  body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
  example URLs that need this flag:
  -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
- EOD
- option :k,:like,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
  text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
  text '00123' -- like '*00123*'
- EOD
- value = Util.strip_web_str(value).downcase()
+ DESC
+ value = Util.strip_web_str(value).downcase
  value
- end
- option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
+ }
+ option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
  'directory/file' of article links to scrape (see '#{App::NAME} search';
  defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
- EOD
+ DESC
  app.check_empty_opt(:links,value)
- end
- flag :M,:missingno,<<-EOD
+ }
+ flag :M,:missingno,<<-DESC
  very rarely an article will not have kana or kanji for a Ruby tag;
  to not raise an error, this will use previously scraped data to fill it in;
  example URL:
  -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
- EOD
- flag :D,:'no-dict',<<-EOD
+ DESC
+ flag :D,:'no-dict',<<-DESC
  do not try to parse the dictionary files for the articles; useful in case of errors trying to load
  the dictionaries (or for offline testing)
- EOD
- flag :H,'no-sha256',<<-EOD
+ DESC
+ flag :H,'no-sha256',<<-DESC
  do not check the SHA-256 of the content to see if an article has already been scraped;
  for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;
  this is useful if 2 articles have the same SHA-256, but different content (unlikely)
- EOD
- option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
  'directory/file' to save words to; if you only specify a directory or a file, it will attach
  the appropriate default directory/file name
  (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
- EOD
+ DESC
  app.check_empty_opt(:out,value)
- end
+ }
  flag :r,:redo,'scrape article links even if they have already been scraped'
  option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
- default: DEFAULT_NEWS_SCRAPE,transform: -> (value) do
- value = value.to_i()
- value = 1 if value < 1
- value
- end
- option nil,:'show-dict',<<-EOD
+ default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
+ value = value.to_i
+ value = 1 if value < 1
+ value
+ }
+ option nil,:'show-dict',<<-DESC
  show dictionary URL and contents for the first article and exit;
  useful for debugging dictionary errors (see '--no-dict' option);
  implies '--dry-run' option
- EOD
- option :u,:url,<<-EOD,argument: :required,transform: -> (value) do
+ DESC
+ option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
  URL of article to scrape, instead of article links file (see '--links' option)
- EOD
+ DESC
  app.check_empty_opt(:url,value)
- end
-
+ }
+
  run do |opts,args,cmd|
  puts cmd.help
  end
  end
-
- @news_easy_cmd = @news_cmd.define_command() do
+
+ @news_easy_cmd = @news_cmd.define_command do
  name 'easy'
  usage 'easy [OPTIONS] [COMMAND]...'
  aliases :e,:ez
  summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"
-
- description <<-EOD
+
+ description <<-DESC
  Search for NHK News Web Easy (Yasashii) links &
  save to file: #{YasashiiNews::DEFAULT_FILE}
- EOD
-
+ DESC
+
  run do |opts,args,cmd|
  app.refresh_cmd(opts,args,cmd)
  app.run_news_cmd(:yasashii)
  end
  end
-
- @news_regular_cmd = @news_cmd.define_command() do
+
+ @news_regular_cmd = @news_cmd.define_command do
  name 'regular'
  usage 'regular [OPTIONS] [COMMAND]...'
  aliases :r,:reg
  summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"
-
- description <<-EOD
+
+ description <<-DESC
  Search for NHK News Web Regular (Futsuu) links &
  save to file: #{FutsuuNews::DEFAULT_FILE}
- EOD
-
+ DESC
+
  run do |opts,args,cmd|
  app.refresh_cmd(opts,args,cmd)
  app.run_news_cmd(:futsuu)
  end
  end
  end
-
+
  def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil
-
+
  build_in_file(:in)
-
+
  case type
  when :futsuu
- build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
+ build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+ default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
  build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
-
+
  news_name = 'Regular'
  when :yasashii
- build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
+ build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+ default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
  build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
-
+
  news_name = 'Easy'
  else
  raise ArgumentError,"invalid type[#{type}]"
  end
-
+
  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)
-
+
  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
@@ -199,39 +190,39 @@ module CLI
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
- max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?()
+ max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]
-
+
  # Favor in_file option over url option.
- url = in_file.nil?() ? Util.strip_web_str(@cmd_opts[:url].to_s()) : in_file
- url = nil if url.empty?()
-
- if url.nil?()
+ url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
+ url = nil if url.empty?
+
+ if url.nil?
  # Then we must have a links file that exists.
  return unless check_in_file(:links,empty_ok: false)
  end
-
+
  start_spin("Scraping NHK News Web #{news_name} articles")
-
- is_file = !in_file.nil?()
+
+ is_file = !in_file.nil?
  link_count = -1
- links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new()
+ links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  news = nil
  scrape_count = 0
-
+
  if File.exist?(out_file)
  news = (type == :yasashii) ?
  YasashiiNews.load_file(out_file,overwrite: no_sha256) :
  FutsuuNews.load_file(out_file,overwrite: no_sha256)
  else
- news = (type == :yasashii) ? YasashiiNews.new() : FutsuuNews.new()
+ news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
  end
-
+
  @news_article_scraper_kargs = @scraper_kargs.merge({
  datetime: datetime,
  dict: dict,
@@ -242,154 +233,154 @@ module CLI
  @news_dict_scraper_kargs = @scraper_kargs.merge({
  is_file: is_file,
  })
-
- if url.nil?()
+
+ if url.nil?
  # Why store each() and do `links_len` instead of `links-len - 1`?
- #
+ #
  # If links contains 5 entries and you scrape all 5, then the output of
  # update_spin_detail() will end on 4, so all of this complexity is so
  # that update_spin_detail() only needs to be written/updated on one line.
-
- links_each = links.links.values.each()
- links_len = links.length()
-
+
+ links_each = links.links.values.each
+ links_len = links.length
+
  0.upto(links_len) do |i|
  update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
-
+
  break if i >= links_len || scrape_count >= max_scrapes
-
- link = links_each.next()
-
- next if !like.nil?() && !link.url.to_s().downcase().include?(like)
+
+ link = links_each.next
+
+ next if !like.nil? && !link.url.to_s.downcase.include?(like)
  next if !redo_scrapes && scraped_news_article?(news,link)
-
+
  url = link.url
-
+
  if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
  # --show-dict
  url = new_url
  scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
  end
-
+
  # Break on next iteration for update_spin_detail().
  next if (scrape_count += 1) >= max_scrapes
-
- sleep_scraper()
+
+ sleep_scraper
  end
  else
  link = links[url]
-
- if link.nil?()
+
+ if link.nil?
  link = SearchLink.new(url)
  links.add_link(link)
  end
-
+
  scrape_news_article(url,link: link,new_articles: new_articles,news: news)
-
+
  scrape_count += 1
  end
-
- stop_spin()
+
+ stop_spin
  puts
-
+
  if scrape_count <= 0
  puts 'Nothing scraped!'
-
+
  if !dry_run && !show_dict
  puts
  start_spin('Saving updated links to file')
-
+
  links.save_file(links_file)
-
- stop_spin()
+
+ stop_spin
  puts "> #{links_file}"
  end
  else
  puts 'Last URL scraped:'
  puts "> #{url}"
  puts
-
+
  if show_dict
  puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
  elsif dry_run
  if new_articles.length < 1
- raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " +
- "internal code is broken"
+ raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
+ ' internal code is broken'
  elsif new_articles.length == 1
  puts new_articles.first
  else
  # Don't show the words (mini), too verbose for more than 1.
- new_articles.each() do |article|
+ new_articles.each do |article|
  puts article.to_s(mini: true)
  end
  end
  else
  start_spin('Saving scraped data to files')
-
+
  links.save_file(links_file)
  news.save_file(out_file)
-
- stop_spin()
+
+ stop_spin
  puts "> #{out_file}"
  puts "> #{links_file}"
  end
  end
  end
-
+
  def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]
-
+
  if show_dict
  scraper = DictScraper.new(url,**@news_dict_scraper_kargs)
-
- @cmd_opts[:show_dict] = scraper.scrape().to_s()
-
+
+ @cmd_opts[:show_dict] = scraper.scrape.to_s
+
  return scraper.url
  end
-
+
  scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
- article = scraper.scrape()
-
+ article = scraper.scrape
+
  # run_news_cmd() handles overwriting with --redo or not
  # using scraped_news_article?().
  news.add_article(article,overwrite: true)
-
+
  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)
-
+
  new_articles << article
-
+
  return false # No --show-dict
  end
-
+
  def scraped_news_article?(news,link)
- return true if link.scraped?()
-
+ return true if link.scraped?
+
  no_sha256 = @cmd_opts[:no_sha256]
-
+
  article = news.article(link.url)
-
- if !no_sha256 && article.nil?()
+
+ if !no_sha256 && article.nil?
  if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
  article = news.article_with_sha256(link.sha256)
  end
-
- if article.nil?()
+
+ if article.nil?
  scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
-
- sha256 = scraper.scrape_sha256_only()
-
+
+ sha256 = scraper.scrape_sha256_only
+
  article = news.article_with_sha256(sha256) if news.sha256?(sha256)
  end
  end
-
+
  if article
  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)
-
+
  return true
  end
-
+
  return false
  end
  end
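
For context, a minimal usage sketch of the `news` command defined above. The executable name (`nhkore`) and these exact shell invocations are assumptions for illustration only; the subcommands and flags themselves come straight from the option/flag definitions in this diff.

    # Scrape up to 5 unscraped NHK News Web Easy (Yasashii) article links,
    # with a fallback date time for articles that lack one:
    $ nhkore news easy --scrape 5 --datetime '2020-03-30 15:30'

    # Scrape a single article by URL, leniently (content without the proper HTML/CSS classes/IDs):
    $ nhkore news easy --lenient --url 'https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html'

    # Show the first article's dictionary URL and contents (implies --dry-run):
    $ nhkore news regular --show-dict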