nhkore 0.3.7 → 0.3.11

--- a/lib/nhkore/search_link.rb
+++ b/lib/nhkore/search_link.rb
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -30,22 +18,22 @@ require 'nhkore/util'

  module NHKore
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class SearchLink
      extend AttrBool::Ext
-
+
      attr_reader :datetime
      attr_reader :futsuurl
      attr_accessor? :scraped
      attr_accessor :sha256
      attr_accessor :title
      attr_reader :url
-
+
      def initialize(url,scraped: false)
        super()
-
+
        @datetime = nil
        @futsuurl = nil
        @scraped = scraped
@@ -53,42 +41,42 @@ module NHKore
        @title = nil
        self.url = url
      end
-
+
      def encode_with(coder)
        # Order matters.
-
-       coder[:url] = @url.nil?() ? nil : @url.to_s()
+
+       coder[:url] = @url.nil? ? nil : @url.to_s
        coder[:scraped] = @scraped
-       coder[:datetime] = @datetime.nil?() ? nil : @datetime.iso8601()
+       coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
        coder[:title] = @title
-       coder[:futsuurl] = @futsuurl.nil?() ? nil : @futsuurl.to_s()
+       coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
        coder[:sha256] = @sha256
      end
-
+
      def self.load_data(key,hash)
        slink = SearchLink.new(
          hash[:url],
          scraped: hash[:scraped],
        )
-
+
        slink.datetime = hash[:datetime]
        slink.futsuurl = hash[:futsuurl]
        slink.sha256 = hash[:sha256]
        slink.title = hash[:title]
-
+
        return slink
      end
-
+
      def update_from_article(article)
        # Don't update the url, as it may be different (e.g., http vs https).
-
-       self.datetime = article.datetime if @datetime.nil?()
+
+       self.datetime = article.datetime if @datetime.nil?
        self.futsuurl = article.futsuurl if Util.empty_web_str?(@futsuurl)
        @scraped = true # If we have an article, it's been scraped
        @sha256 = article.sha256 if Util.empty_web_str?(@sha256)
        @title = article.title if Util.empty_web_str?(@title)
      end
-
+
      def datetime=(value)
        if value.is_a?(Time)
          @datetime = value
@@ -96,22 +84,22 @@ module NHKore
          @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
        end
      end
-
+
      def futsuurl=(value)
        # Don't store URI, store String.
-       @futsuurl = value.nil?() ? nil : value.to_s()
+       @futsuurl = value.nil? ? nil : value.to_s
      end
-
+
      def url=(value)
        # Don't store URI, store String.
-       @url = value.nil?() ? nil : value.to_s()
+       @url = value.nil? ? nil : value.to_s
      end
-
+
      def to_s(mini: false)
-       s = ''.dup()
-
+       s = ''.dup
+
        s << "'#{@url}': "
-
+
        if mini
          s << "{ scraped? #{@scraped ? 'yes' : 'NO'} }"
        else
@@ -121,87 +109,85 @@ module NHKore
          s << "\n futsuurl: '#{@futsuurl}'"
          s << "\n sha256: '#{@sha256}'"
        end
-
+
        return s
      end
    end
-
+
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class SearchLinks
      include Fileable
-
+
      DEFAULT_DIR = Util::CORE_DIR
-
+
      DEFAULT_FUTSUU_FILENAME = 'links_nhk_news_web_regular.yml'
      DEFAULT_YASASHII_FILENAME = 'links_nhk_news_web_easy.yml'
-
+
      def self.build_file(filename)
        return File.join(DEFAULT_DIR,filename)
      end
-
+
      DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
      DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
-
+
      attr_reader :links
-
-     def initialize()
+
+     def initialize
        super()
-
+
        @links = {}
      end
-
+
      def add_link(link)
-       url = link.url.nil?() ? nil : link.url.to_s()
-
+       url = link.url.nil? ? nil : link.url.to_s
+
        return self if @links.key?(url)
-
+
        @links[url] = link
-
+
        return self
      end
-
+
      def each(&block)
        return @links.each(&block)
      end
-
+
      def encode_with(coder)
        # Order matters.
-
+
        coder[:links] = @links
      end
-
+
      def self.load_data(data,file: nil,**kargs)
        data = Util.load_yaml(data,file: file)
-
+
        links = data[:links]
-
-       slinks = SearchLinks.new()
-
-       if !links.nil?()
-         links.each() do |key,hash|
-           key = key.to_s() unless key.nil?()
-           slinks.links[key] = SearchLink.load_data(key,hash)
-         end
+
+       slinks = SearchLinks.new
+
+       links&.each() do |key,hash|
+         key = key.to_s unless key.nil?
+         slinks.links[key] = SearchLink.load_data(key,hash)
        end
-
+
        return slinks
      end
-
+
      def [](url)
        url = url.url if url.respond_to?(:url)
-       url = url.to_s() unless url.nil?()
-
+       url = url.to_s unless url.nil?
+
        return @links[url]
      end
-
-     def length()
+
+     def length
        return @links.length
      end
-
-     def to_s()
+
+     def to_s
        return Util.dump_yaml(self)
      end
    end
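
The encode_with / load_data pair above is what round-trips a SearchLinks YAML file (links_nhk_news_web_easy.yml and friends). A minimal sketch of that round trip, assuming the nhkore gem is installed and that Util.load_yaml accepts the dumped string directly (its file: argument defaults to nil):

require 'nhkore/search_link'

slinks = NHKore::SearchLinks.new
slinks.add_link(NHKore::SearchLink.new(
  'https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html'
))

yaml = slinks.to_s # serializes via encode_with (Util.dump_yaml)

# Sketch assumption: load_data parses the raw YAML string.
copy = NHKore::SearchLinks.load_data(yaml)
puts copy.length # => 1
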
--- a/lib/nhkore/search_scraper.rb
+++ b/lib/nhkore/search_scraper.rb
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -31,197 +19,198 @@ require 'nhkore/util'

  module NHKore
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class SearchScraper < Scraper
      DEFAULT_RESULT_COUNT = 100
      FUTSUU_SITE = 'nhk.or.jp/news/html/'
      YASASHII_SITE = 'nhk.or.jp/news/easy/'
-
+
      # https://www3.nhk.or.jp/news/html/20200220/k10012294001000.html
-     FUTSUU_REGEX = /\A[^\.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i
+     FUTSUU_REGEX = /\A[^.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i.freeze
      # https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html
      # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
-     YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i
-
+     YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
+
      IGNORE_LINK_REGEX = %r{
-       /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
+       /about\.html?        # https://www3.nhk.or.jp/news/easy/about.html
        |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
        |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
        |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
+
        # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
        # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
        |/enqform\.html?
-     }x
-
+     }x.freeze
+
      # Search Engines are strict, so trigger using the default HTTP header fields
      # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
      def initialize(url,eat_cookie: true,header: {},**kargs)
        super(url,eat_cookie: eat_cookie,header: header,**kargs)
      end
-
+
      def ignore_link?(link,cleaned: true)
-       return true if link.nil?()
-
-       link = Util.unspace_web_str(link).downcase() unless cleaned
-
-       return true if link.empty?()
-
+       return true if link.nil?
+
+       link = Util.unspace_web_str(link).downcase unless cleaned
+
+       return true if link.empty?
+
        return true if IGNORE_LINK_REGEX.match?(link)
-
+
        return false
      end
    end
-
+
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class BingScraper < SearchScraper
      attr_reader :regex
      attr_reader :site
-
+
      def initialize(site,regex: nil,url: nil,**kargs)
        case site
        when :futsuu
-         regex = FUTSUU_REGEX if regex.nil?()
+         regex = FUTSUU_REGEX if regex.nil?
          site = FUTSUU_SITE
        when :yasashii
-         regex = YASASHII_REGEX if regex.nil?()
+         regex = YASASHII_REGEX if regex.nil?
          site = YASASHII_SITE
        else
          raise ArgumentError,"invalid site[#{site}]"
        end
-
-       raise ArgumentError,"empty regex[#{regex}]" if regex.nil?()
-
+
+       raise ArgumentError,"empty regex[#{regex}]" if regex.nil?
+
        @regex = regex
        @site = site
-       url = self.class.build_url(site,**kargs) if url.nil?()
-
+       url = self.class.build_url(site,**kargs) if url.nil?
+
        # Delete class-specific args (don't pass to Open-URI).
        kargs.delete(:count)
-
+
        super(url,**kargs)
      end
-
+
      def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
-       url = ''.dup()
-
+       url = ''.dup
+
        url << 'https://www.bing.com/search?'
        url << URI.encode_www_form(
          q: "site:#{site}",
          count: count
        )
-
+
        return url
      end
-
+
      def scrape(slinks,page=NextPage.new())
        next_page,link_count = scrape_html(slinks,page)
-
+
        if link_count <= 0
          scrape_rss(slinks,page,next_page)
        end
-
+
        return next_page
      end
-
+
      def scrape_html(slinks,page,next_page=NextPage.new())
-       doc = html_doc()
+       doc = html_doc
        link_count = 0
-
+
        anchors = doc.css('a')
-
-       anchors.each() do |anchor|
-         href = anchor['href'].to_s()
-         href = Util.unspace_web_str(href).downcase()
-
+
+       anchors.each do |anchor|
+         href = anchor['href'].to_s
+         href = Util.unspace_web_str(href).downcase
+
          next if ignore_link?(href)
-
-         if (md = href.match(/first\=(\d+)/))
-           count = md[1].to_i()
-
+
+         if (md = href.match(/first=(\d+)/))
+           count = md[1].to_i
+
            if count > page.count && (next_page.count < 0 || count < next_page.count)
              next_page.count = count
              next_page.url = join_url(href)
            end
          elsif href =~ regex
            slinks.add_link(SearchLink.new(href))
-
            link_count += 1
          end
        end
-
+
        return [next_page,link_count]
      end
-
+
      def scrape_rss(slinks,page,next_page=NextPage.new())
        link_count = 0
-
+
        if !@is_file
          uri = URI(@url)
-
+
          Util.replace_uri_query!(uri,format: 'rss')
-         open(uri)
-
-         doc = rss_doc()
+         self.open(uri)
+
+         doc = rss_doc
          rss_links = []
-
-         doc.items.each() do |item|
-           link = item.link.to_s()
-           link = Util.unspace_web_str(link).downcase()
-
+
+         doc.items.each do |item|
+           link = item.link.to_s
+           link = Util.unspace_web_str(link).downcase
+
            rss_links << link
-
+
            next if ignore_link?(link)
            next if link !~ regex
-
+
            slinks.add_link(SearchLink.new(link))
-
+
            link_count += 1
          end
-
+
          # For RSS, Bing will keep returning the same links over and over
          # if it's the last page or the "first=" query is the wrong count.
          # Therefore, we have to test the previous RSS links (+page.rss_links+).
-         if next_page.empty?() && doc.items.length >= 1 && page.rss_links != rss_links
+         if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
            next_page.count = (page.count < 0) ? 0 : page.count
            next_page.count += doc.items.length
            next_page.rss_links = rss_links
-
-           uri = URI(page.url.nil?() ? @url : page.url)
-
+
+           uri = URI(page.url.nil? ? @url : page.url)
+
            Util.replace_uri_query!(uri,first: next_page.count)
-
+
            next_page.url = uri
          end
        end
-
+
        return [next_page,link_count]
      end
    end
-
+
    ###
-   # @author Jonathan Bradley Whited (@esotericpig)
+   # @author Jonathan Bradley Whited
    # @since 0.2.0
    ###
    class NextPage
      attr_accessor :count
      attr_accessor :rss_links
      attr_accessor :url
-
-     def initialize()
+
+     def initialize
        super()
-
+
        @count = -1
        @rss_links = nil
        @url = nil
      end
-
-     def empty?()
-       return @url.nil?() || @count < 0
+
+     def empty?
+       return @url.nil? || @count < 0
      end
    end
  end
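
For reference, the scrape flow above (HTML first, then the RSS fallback) can be driven like this. A hedged sketch, not part of the diff: it assumes the nhkore gem is installed, network access to Bing, and that Bing's result markup still matches what scrape_html expects.

require 'nhkore/search_link'
require 'nhkore/search_scraper'

slinks = NHKore::SearchLinks.new

# Builds the Bing query URL for site:nhk.or.jp/news/easy/ with count=100.
scraper = NHKore::BingScraper.new(:yasashii)

next_page = scraper.scrape(slinks) # falls back to scrape_rss if 0 HTML links
puts slinks.length
puts next_page.empty? ? '(no next page)' : next_page.url

Paging would repeat with a new BingScraper built from url: next_page.url until next_page.empty? returns true.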