RubyGems - nhkore - Versions diffs - 0.3.4 → 0.3.9 - Mend

nhkore 0.3.4 → 0.3.9

Files changed (44) hide show

checksums.yaml +4 -4
data/.yardopts +3 -0
data/CHANGELOG.md +82 -2
data/Gemfile +0 -18
data/Gemfile.lock +89 -0
data/README.md +23 -19
data/Rakefile +53 -52
data/bin/nhkore +4 -15
data/lib/nhkore.rb +8 -20
data/lib/nhkore/app.rb +237 -236
data/lib/nhkore/article.rb +56 -53
data/lib/nhkore/article_scraper.rb +308 -289
data/lib/nhkore/cleaner.rb +20 -32
data/lib/nhkore/cli/fx_cmd.rb +41 -53
data/lib/nhkore/cli/get_cmd.rb +59 -70
data/lib/nhkore/cli/news_cmd.rb +143 -153
data/lib/nhkore/cli/search_cmd.rb +108 -118
data/lib/nhkore/cli/sift_cmd.rb +109 -120
data/lib/nhkore/datetime_parser.rb +89 -103
data/lib/nhkore/defn.rb +48 -55
data/lib/nhkore/dict.rb +26 -38
data/lib/nhkore/dict_scraper.rb +31 -40
data/lib/nhkore/entry.rb +43 -55
data/lib/nhkore/error.rb +16 -21
data/lib/nhkore/fileable.rb +10 -21
data/lib/nhkore/lib.rb +5 -17
data/lib/nhkore/missingno.rb +21 -33
data/lib/nhkore/news.rb +61 -66
data/lib/nhkore/polisher.rb +22 -34
data/lib/nhkore/scraper.rb +75 -82
data/lib/nhkore/search_link.rb +85 -78
data/lib/nhkore/search_scraper.rb +89 -92
data/lib/nhkore/sifter.rb +157 -171
data/lib/nhkore/splitter.rb +19 -31
data/lib/nhkore/user_agents.rb +28 -32
data/lib/nhkore/util.rb +72 -84
data/lib/nhkore/variator.rb +20 -32
data/lib/nhkore/version.rb +4 -16
data/lib/nhkore/word.rb +105 -99
data/nhkore.gemspec +57 -64
data/samples/looper.rb +71 -0
data/test/nhkore/test_helper.rb +3 -15
data/test/nhkore_test.rb +6 -18
metadata +50 -28

data/lib/nhkore/search_scraper.rb CHANGED Viewed

@@ -1,23 +1,11 @@
-#!/usr/bin/env ruby
 # encoding: UTF-8
 # frozen_string_literal: true
 #--
 # This file is part of NHKore.
-# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
-#
-# NHKore is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# NHKore is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with NHKore.  If not, see <https://www.gnu.org/licenses/>.
+# Copyright (c) 2020-2021 Jonathan Bradley Whited
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
 #++
@@ -31,189 +19,198 @@ require 'nhkore/util'
 module NHKore
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since  0.2.0
   ###
   class SearchScraper < Scraper
     DEFAULT_RESULT_COUNT = 100
     FUTSUU_SITE = 'nhk.or.jp/news/html/'
     YASASHII_SITE = 'nhk.or.jp/news/easy/'
     # https://www3.nhk.or.jp/news/html/20200220/k10012294001000.html
-    FUTSUU_REGEX = /\A[^\.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i
+    FUTSUU_REGEX = /\A[^.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i.freeze
     # https://www3.nhk.or.jp/news/easy/k10012294001000/k10012294001000.html
     # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
-    YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i
+    YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
+    IGNORE_LINK_REGEX = %r{
+      /about\.html?             # https://www3.nhk.or.jp/news/easy/about.html
+      |/movieplayer\.html?      # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
+      |/audio\.html?            # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
+      |/news/easy/index\.html?  # http://www3.nhk.or.jp/news/easy/index.html
+      # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
+      # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
+      |/enqform\.html?
+    }x.freeze
     # Search Engines are strict, so trigger using the default HTTP header fields
     # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
     def initialize(url,eat_cookie: true,header: {},**kargs)
       super(url,eat_cookie: eat_cookie,header: header,**kargs)
     end
     def ignore_link?(link,cleaned: true)
-      return true if link.nil?()
-      link = Util.unspace_web_str(link).downcase() unless cleaned
-      return true if link.empty?()
-      return true if link =~ /\/about\.html?/ # https://www3.nhk.or.jp/news/easy/about.html
-      return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
-      return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
-      return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
+      return true if link.nil?
+      link = Util.unspace_web_str(link).downcase unless cleaned
+      return true if link.empty?
+      return true if IGNORE_LINK_REGEX.match?(link)
       return false
     end
   end
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since  0.2.0
   ###
   class BingScraper < SearchScraper
     attr_reader :regex
     attr_reader :site
     def initialize(site,regex: nil,url: nil,**kargs)
       case site
       when :futsuu
-        regex = FUTSUU_REGEX if regex.nil?()
+        regex = FUTSUU_REGEX if regex.nil?
         site = FUTSUU_SITE
       when :yasashii
-        regex = YASASHII_REGEX if regex.nil?()
+        regex = YASASHII_REGEX if regex.nil?
         site = YASASHII_SITE
       else
         raise ArgumentError,"invalid site[#{site}]"
       end
-      raise ArgumentError,"empty regex[#{regex}]" if regex.nil?()
+      raise ArgumentError,"empty regex[#{regex}]" if regex.nil?
       @regex = regex
       @site = site
-      url = self.class.build_url(site,**kargs) if url.nil?()
+      url = self.class.build_url(site,**kargs) if url.nil?
       # Delete class-specific args (don't pass to Open-URI).
       kargs.delete(:count)
       super(url,**kargs)
     end
     def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
-      url = ''.dup()
+      url = ''.dup
       url << 'https://www.bing.com/search?'
       url << URI.encode_www_form(
         q: "site:#{site}",
         count: count
       )
       return url
     end
     def scrape(slinks,page=NextPage.new())
       next_page,link_count = scrape_html(slinks,page)
       if link_count <= 0
         scrape_rss(slinks,page,next_page)
       end
       return next_page
     end
     def scrape_html(slinks,page,next_page=NextPage.new())
-      doc = html_doc()
+      doc = html_doc
       link_count = 0
       anchors = doc.css('a')
-      anchors.each() do |anchor|
-        href = anchor['href'].to_s()
-        href = Util.unspace_web_str(href).downcase()
+      anchors.each do |anchor|
+        href = anchor['href'].to_s
+        href = Util.unspace_web_str(href).downcase
         next if ignore_link?(href)
-        if (md = href.match(/first\=(\d+)/))
-          count = md[1].to_i()
+        if (md = href.match(/first=(\d+)/))
+          count = md[1].to_i
           if count > page.count && (next_page.count < 0 || count < next_page.count)
             next_page.count = count
             next_page.url = join_url(href)
           end
         elsif href =~ regex
           slinks.add_link(SearchLink.new(href))
           link_count += 1
         end
       end
       return [next_page,link_count]
     end
     def scrape_rss(slinks,page,next_page=NextPage.new())
       link_count = 0
       if !@is_file
         uri = URI(@url)
         Util.replace_uri_query!(uri,format: 'rss')
-        open(uri)
-        doc = rss_doc()
+        self.open(uri)
+        doc = rss_doc
         rss_links = []
-        doc.items.each() do |item|
-          link = item.link.to_s()
-          link = Util.unspace_web_str(link).downcase()
+        doc.items.each do |item|
+          link = item.link.to_s
+          link = Util.unspace_web_str(link).downcase
           rss_links << link
           next if ignore_link?(link)
           next if link !~ regex
           slinks.add_link(SearchLink.new(link))
           link_count += 1
         end
         # For RSS, Bing will keep returning the same links over and over
         # if it's the last page or the "first=" query is the wrong count.
         # Therefore, we have to test the previous RSS links (+page.rss_links+).
-        if next_page.empty?() && doc.items.length >= 1 && page.rss_links != rss_links
+        if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
           next_page.count = (page.count < 0) ? 0 : page.count
           next_page.count += doc.items.length
           next_page.rss_links = rss_links
-          uri = URI(page.url.nil?() ? @url : page.url)
+          uri = URI(page.url.nil? ? @url : page.url)
           Util.replace_uri_query!(uri,first: next_page.count)
           next_page.url = uri
         end
       end
       return [next_page,link_count]
     end
   end
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since  0.2.0
   ###
   class NextPage
     attr_accessor :count
     attr_accessor :rss_links
     attr_accessor :url
-    def initialize()
+    def initialize
       super()
       @count = -1
       @rss_links = nil
       @url = nil
     end
-    def empty?()
-      return @url.nil?() || @count < 0
+    def empty?
+      return @url.nil? || @count < 0
     end
   end
 end

data/lib/nhkore/sifter.rb CHANGED Viewed

@@ -1,23 +1,11 @@
-#!/usr/bin/env ruby
 # encoding: UTF-8
 # frozen_string_literal: true
 #--
 # This file is part of NHKore.
-# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
-#
-# NHKore is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# NHKore is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with NHKore.  If not, see <https://www.gnu.org/licenses/>.
+# Copyright (c) 2020-2021 Jonathan Bradley Whited
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
 #++
@@ -28,179 +16,179 @@ require 'nhkore/util'
 module NHKore
   ###
-  # @author Jonathan Bradley Whited (@esotericpig)
+  # @author Jonathan Bradley Whited
   # @since  0.2.0
   ###
   class Sifter
     include Fileable
     DEFAULT_DIR = Util::CORE_DIR
     DEFAULT_FUTSUU_FILENAME = 'sift_nhk_news_web_regular'
     DEFAULT_YASASHII_FILENAME = 'sift_nhk_news_web_easy'
     def self.build_file(filename)
       return File.join(DEFAULT_DIR,filename)
     end
     DEFAULT_FUTSUU_FILE = build_file(DEFAULT_FUTSUU_FILENAME)
     DEFAULT_YASASHII_FILE = build_file(DEFAULT_YASASHII_FILENAME)
     attr_accessor :articles
     attr_accessor :caption
     attr_accessor :filters
     attr_accessor :ignores
     attr_accessor :output
     def initialize(news)
-      @articles = news.articles.values.dup()
+      @articles = news.articles.values.dup
       @caption = nil
       @filters = {}
       @ignores = {}
       @output = nil
     end
-    def build_header()
+    def build_header
       header = []
       header << 'Frequency' unless @ignores[:freq]
       header << 'Word' unless @ignores[:word]
       header << 'Kana' unless @ignores[:kana]
       header << 'English' unless @ignores[:eng]
       header << 'Definition' unless @ignores[:defn]
       return header
     end
     def build_rows(words)
       rows = []
-      words.each() do |word|
+      words.each do |word|
         rows << build_word_row(word)
       end
       return rows
     end
     def build_word_row(word)
       row = []
       row << word.freq unless @ignores[:freq]
       row << word.word unless @ignores[:word]
       row << word.kana unless @ignores[:kana]
       row << word.eng unless @ignores[:eng]
       row << word.defn unless @ignores[:defn]
       return row
     end
     def filter?(article)
-      return false if @filters.empty?()
+      return false if @filters.empty?
       datetime_filter = @filters[:datetime]
       title_filter = @filters[:title]
       url_filter = @filters[:url]
-      if !datetime_filter.nil?()
+      if !datetime_filter.nil?
         datetime = article.datetime
-        return true if datetime.nil?() ||
+        return true if datetime.nil? ||
           datetime < datetime_filter[:from] || datetime > datetime_filter[:to]
       end
-      if !title_filter.nil?()
-        title = article.title.to_s()
+      if !title_filter.nil?
+        title = article.title.to_s
         title = Util.unspace_web_str(title) if title_filter[:unspace]
-        title = title.downcase() if title_filter[:uncase]
+        title = title.downcase if title_filter[:uncase]
         return true unless title.include?(title_filter[:filter])
       end
-      if !url_filter.nil?()
-        url = article.url.to_s()
+      if !url_filter.nil?
+        url = article.url.to_s
         url = Util.unspace_web_str(url) if url_filter[:unspace]
-        url = url.downcase() if url_filter[:uncase]
+        url = url.downcase if url_filter[:uncase]
         return true unless url.include?(url_filter[:filter])
       end
       return false
     end
     def filter_by_datetime(datetime_filter=nil,from: nil,to: nil)
-      if !datetime_filter.nil?()
-        if datetime_filter.respond_to?(:'[]')
+      if !datetime_filter.nil?
+        if datetime_filter.respond_to?(:[])
           # If out-of-bounds, just nil.
-          from = datetime_filter[0] if from.nil?()
-          to = datetime_filter[1] if to.nil?()
+          from = datetime_filter[0] if from.nil?
+          to = datetime_filter[1] if to.nil?
         else
-          from = datetime_filter if from.nil?()
-          to = datetime_filter if to.nil?()
+          from = datetime_filter if from.nil?
+          to = datetime_filter if to.nil?
         end
       end
-      from = to if from.nil?()
-      to = from if to.nil?()
-      from = Util.jst_time(from) unless from.nil?()
-      to = Util.jst_time(to) unless to.nil?()
+      from = to if from.nil?
+      to = from if to.nil?
+      from = Util.jst_time(from) unless from.nil?
+      to = Util.jst_time(to) unless to.nil?
       datetime_filter = [from,to]
-      return self if datetime_filter.flatten().compact().empty?()
+      return self if datetime_filter.flatten.compact.empty?
       @filters[:datetime] = {from: from,to: to}
       return self
     end
     def filter_by_title(title_filter,uncase: true,unspace: true)
       title_filter = Util.unspace_web_str(title_filter) if unspace
-      title_filter = title_filter.downcase() if uncase
+      title_filter = title_filter.downcase if uncase
       @filters[:title] = {filter: title_filter,uncase: uncase,unspace: unspace}
       return self
     end
     def filter_by_url(url_filter,uncase: true,unspace: true)
       url_filter = Util.unspace_web_str(url_filter) if unspace
-      url_filter = url_filter.downcase() if uncase
+      url_filter = url_filter.downcase if uncase
       @filters[:url] = {filter: url_filter,uncase: uncase,unspace: unspace}
       return self
     end
     def ignore(key)
       @ignores[key] = true
       return self
     end
     # This does not output {caption}.
-    def put_csv!()
+    def put_csv!
       require 'csv'
-      words = sift()
+      words = sift
       @output = CSV.generate(headers: :first_row,write_headers: true) do |csv|
-        csv << build_header()
-        words.each() do |word|
+        csv << build_header
+        words.each do |word|
           csv << build_word_row(word)
         end
       end
       return @output
     end
-    def put_html!()
-      words = sift()
-      @output = ''.dup()
-      @output << <<~EOH
+    def put_html!
+      words = sift
+      @output = ''.dup
+      @output << <<~HTML
         <!DOCTYPE html>
         <html lang="ja">
         <head>
@@ -249,146 +237,144 @@ module NHKore
         <h1>NHKore</h1>
         <h2>#{@caption}</h2>
         <table>
-      EOH
-      #"
+      HTML
       # If have too few or too many '<col>', invalid HTML.
-      @output << %Q{<col style="width:6em;">\n} unless @ignores[:freq]
-      @output << %Q{<col style="width:17em;">\n} unless @ignores[:word]
-      @output << %Q{<col style="width:17em;">\n} unless @ignores[:kana]
-      @output << %Q{<col style="width:5em;">\n} unless @ignores[:eng]
+      @output << %Q(<col style="width:6em;">\n) unless @ignores[:freq]
+      @output << %Q(<col style="width:17em;">\n) unless @ignores[:word]
+      @output << %Q(<col style="width:17em;">\n) unless @ignores[:kana]
+      @output << %Q(<col style="width:5em;">\n) unless @ignores[:eng]
       @output << "<col>\n" unless @ignores[:defn] # No width for defn, fills rest of page
       @output << '<tr>'
-      build_header().each() do |h|
+      build_header.each do |h|
         @output << "<th>#{h}</th>"
       end
       @output << "</tr>\n"
-      words.each() do |word|
+      words.each do |word|
         @output << '<tr>'
-        build_word_row(word).each() do |w|
-          @output << "<td>#{Util.escape_html(w.to_s())}</td>"
+        build_word_row(word).each do |w|
+          @output << "<td>#{Util.escape_html(w.to_s)}</td>"
         end
         @output << "</tr>\n"
       end
-      @output << <<~EOH
+      @output << <<~HTML
         </table>
         </body>
         </html>
-      EOH
-      #/
+      HTML
       return @output
     end
-    def put_json!()
+    def put_json!
       require 'json'
-      words = sift()
-      @output = ''.dup()
-      @output << <<~EOJ
+      words = sift
+      @output = ''.dup
+      @output << <<~JSON
         {
         "caption": #{JSON.generate(@caption)},
-        "header": #{JSON.generate(build_header())},
+        "header": #{JSON.generate(build_header)},
         "words": [
-      EOJ
-      if !words.empty?()
+      JSON
+      if !words.empty?
         0.upto(words.length - 2) do |i|
           @output << "  #{JSON.generate(build_word_row(words[i]))},\n"
         end
         @output << "  #{JSON.generate(build_word_row(words[-1]))}\n"
       end
       @output << "]\n}\n"
       return @output
     end
-    def put_yaml!()
+    def put_yaml!
       require 'psychgus'
-      words = sift()
+      words = sift
       yaml = {
         caption: @caption,
-        header: build_header(),
+        header: build_header,
         words: build_rows(words),
       }
-      header_styler = Class.new() do
+      header_styler = Class.new do
         include Psychgus::Styler
         def style_sequence(sniffer,node)
           parent = sniffer.parent
-          if !parent.nil?() && parent.node.respond_to?(:value) && parent.value == 'header'
+          if !parent.nil? && parent.node.respond_to?(:value) && parent.value == 'header'
             node.style = Psychgus::SEQUENCE_FLOW
           end
         end
       end
       # Put each Word on one line (flow/inline style).
-      @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new())
+      @output = Util.dump_yaml(yaml,flow_level: 4,stylers: header_styler.new)
       return @output
     end
-    def sift()
-      master_article = Article.new()
-      @articles.each() do |article|
+    def sift
+      master_article = Article.new
+      @articles.each do |article|
         next if filter?(article)
-        article.words.values().each() do |word|
+        article.words.each_value do |word|
           master_article.add_word(word,use_freq: true)
         end
       end
-      words = master_article.words.values()
-      words.sort!() do |word1,word2|
+      words = master_article.words.values
+      words.sort! do |word1,word2|
         # Order by freq DESC (most frequent words to top).
         i = (word2.freq <=> word1.freq)
         # Order by !defn.empty, word ASC, !kana.empty, kana ASC, defn.len DESC, defn ASC.
         i = compare_empty_str(word1.defn,word2.defn) if i == 0 # Favor words that have definitions
-        i = (word1.word.to_s() <=> word2.word.to_s()) if i == 0
+        i = (word1.word.to_s <=> word2.word.to_s) if i == 0
         i = compare_empty_str(word1.kana,word2.kana) if i == 0 # Favor words that have kana
-        i = (word1.kana.to_s() <=> word2.kana.to_s()) if i == 0
-        i = (word2.defn.to_s().length <=> word1.defn.to_s().length) if i == 0 # Favor longer definitions
-        i = (word1.defn.to_s() <=> word2.defn.to_s()) if i == 0
+        i = (word1.kana.to_s <=> word2.kana.to_s) if i == 0
+        i = (word2.defn.to_s.length <=> word1.defn.to_s.length) if i == 0 # Favor longer definitions
+        i = (word1.defn.to_s <=> word2.defn.to_s) if i == 0
         i
       end
       return words
     end
     def compare_empty_str(str1,str2)
       has_str1 = !Util.empty_web_str?(str1)
       has_str2 = !Util.empty_web_str?(str2)
       if has_str1 && !has_str2
         return -1 # Bubble word1 to top
       elsif !has_str1 && has_str2
         return 1 # Bubble word2 to top
       end
       return 0 # Further comparison needed
     end
-    def to_s()
-      return @output.to_s()
+    def to_s
+      return @output.to_s
     end
   end
 end