nhkore 0.3.14 → 0.3.16

Sign up to get free protection for your applications and to get access to all the features.
data/lib/nhkore/error.rb CHANGED
@@ -10,21 +10,11 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  class Error < ::StandardError; end
18
14
 
19
- # @since 0.2.0
20
15
  class CLIError < Error; end
21
-
22
- # @since 0.2.0
16
+ class Http404Error < Error; end
23
17
  class ParseError < Error; end
24
-
25
- # @since 0.2.0
26
18
  class ScrapeError < Error; end
27
-
28
- # @since 0.2.0
29
19
  class ZipError < Error; end
30
20
  end
@@ -10,10 +10,6 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  module Fileable
18
14
  def self.included(mod)
19
15
  mod.extend ClassMethods
data/lib/nhkore/lib.rb CHANGED
@@ -38,9 +38,6 @@ module NHKore
38
38
  ###
39
39
  # Include this file to only require the files needed to use this
40
40
  # Gem as a library (i.e., don't include CLI-related files).
41
- #
42
- # @author Jonathan Bradley Whited
43
- # @since 0.3.2
44
41
  ###
45
42
  module Lib
46
43
  end
@@ -13,10 +13,6 @@ require 'nhkore/util'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Missingno
21
17
  attr_reader :kanas
22
18
  attr_reader :kanjis
@@ -68,13 +64,13 @@ module NHKore
68
64
  def kana_from_kanji(kanji)
69
65
  word = @kanjis[kanji]
70
66
 
71
- return word.nil? ? nil : word.kana
67
+ return word&.kana
72
68
  end
73
69
 
74
70
  def kanji_from_kana(kana)
75
71
  word = @kanas[kana]
76
72
 
77
- return word.nil? ? nil : word.kanji
73
+ return word&.kanji
78
74
  end
79
75
  end
80
76
  end
data/lib/nhkore/news.rb CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/util'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class News
24
20
  include Fileable
25
21
 
@@ -30,7 +26,7 @@ module NHKore
30
26
  attr_reader :sha256s
31
27
 
32
28
  def initialize
33
- super()
29
+ super
34
30
 
35
31
  @articles = {}
36
32
  @sha256s = {}
@@ -127,10 +123,6 @@ module NHKore
127
123
  end
128
124
  end
129
125
 
130
- ###
131
- # @author Jonathan Bradley Whited
132
- # @since 0.2.0
133
- ###
134
126
  class FutsuuNews < News
135
127
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
136
128
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
@@ -144,14 +136,10 @@ module NHKore
144
136
  end
145
137
 
146
138
  def save_file(file=DEFAULT_FILE,**kargs)
147
- super(file,**kargs)
139
+ super
148
140
  end
149
141
  end
150
142
 
151
- ###
152
- # @author Jonathan Bradley Whited
153
- # @since 0.2.0
154
- ###
155
143
  class YasashiiNews < News
156
144
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
157
145
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
@@ -165,7 +153,7 @@ module NHKore
165
153
  end
166
154
 
167
155
  def save_file(file=DEFAULT_FILE,**kargs)
168
- super(file,**kargs)
156
+ super
169
157
  end
170
158
  end
171
159
  end
@@ -13,10 +13,6 @@ require 'nhkore/word'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Polisher
21
17
  def begin_polish(str)
22
18
  return str
@@ -52,10 +48,6 @@ module NHKore
52
48
  end
53
49
  end
54
50
 
55
- ###
56
- # @author Jonathan Bradley Whited
57
- # @since 0.2.0
58
- ###
59
51
  class BasicPolisher < Polisher
60
52
  def end_polish(str)
61
53
  # Keep Japanese dots in names:
@@ -72,10 +64,6 @@ module NHKore
72
64
  end
73
65
  end
74
66
 
75
- ###
76
- # @author Jonathan Bradley Whited
77
- # @since 0.2.0
78
- ###
79
67
  class BestPolisher < BasicPolisher
80
68
  end
81
69
  end
@@ -13,15 +13,12 @@ require 'attr_bool'
13
13
  require 'nokogiri'
14
14
  require 'open-uri'
15
15
 
16
+ require 'nhkore/error'
16
17
  require 'nhkore/user_agents'
17
18
  require 'nhkore/util'
18
19
 
19
20
 
20
21
  module NHKore
21
- ###
22
- # @author Jonathan Bradley Whited
23
- # @since 0.2.0
24
- ###
25
22
  class Scraper
26
23
  extend AttrBool::Ext
27
24
 
@@ -177,7 +174,13 @@ module NHKore
177
174
  retry
178
175
  # Must come after HTTPRedirect since a subclass of HTTPError.
179
176
  rescue OpenURI::HTTPError => e
180
- raise e.exception("HTTP error[#{e}] at URL[#{url}]")
177
+ msg = "HTTP error[#{e}] at URL[#{url}]"
178
+
179
+ if e.to_s.include?('404 Not Found')
180
+ raise Http404Error,msg
181
+ else
182
+ raise e.exception(msg)
183
+ end
181
184
  rescue SocketError => e
182
185
  if (max_retries -= 1) < 0
183
186
  raise e.exception("Socket error[#{e}] at URL[#{url}]")
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
 
19
19
  module NHKore
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.2.0
23
- ###
24
20
  class SearchLink
25
21
  extend AttrBool::Ext
26
22
 
@@ -45,11 +41,11 @@ module NHKore
45
41
  def encode_with(coder)
46
42
  # Order matters.
47
43
 
48
- coder[:url] = @url.nil? ? nil : @url.to_s
44
+ coder[:url] = @url&.to_s
49
45
  coder[:scraped] = @scraped
50
- coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
46
+ coder[:datetime] = @datetime&.iso8601
51
47
  coder[:title] = @title
52
- coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
48
+ coder[:futsuurl] = @futsuurl&.to_s
53
49
  coder[:sha256] = @sha256
54
50
  end
55
51
 
@@ -86,13 +82,13 @@ module NHKore
86
82
  end
87
83
 
88
84
  def futsuurl=(value)
89
- # Don't store URI, store String.
90
- @futsuurl = value.nil? ? nil : value.to_s
85
+ # Don't store URI, store String or nil.
86
+ @futsuurl = value&.to_s
91
87
  end
92
88
 
93
89
  def url=(value)
94
- # Don't store URI, store String.
95
- @url = value.nil? ? nil : value.to_s
90
+ # Don't store URI, store String or nil.
91
+ @url = value&.to_s
96
92
  end
97
93
 
98
94
  def to_s(mini: false)
@@ -114,10 +110,6 @@ module NHKore
114
110
  end
115
111
  end
116
112
 
117
- ###
118
- # @author Jonathan Bradley Whited
119
- # @since 0.2.0
120
- ###
121
113
  class SearchLinks
122
114
  include Fileable
123
115
 
@@ -136,13 +128,13 @@ module NHKore
136
128
  attr_reader :links
137
129
 
138
130
  def initialize
139
- super()
131
+ super
140
132
 
141
133
  @links = {}
142
134
  end
143
135
 
144
136
  def add_link(link)
145
- url = link.url.nil? ? nil : link.url.to_s
137
+ url = link.url&.to_s
146
138
 
147
139
  return self if @links.key?(url)
148
140
 
@@ -9,6 +9,7 @@
9
9
  #++
10
10
 
11
11
 
12
+ require 'net/http'
12
13
  require 'uri'
13
14
 
14
15
  require 'nhkore/error'
@@ -18,10 +19,6 @@ require 'nhkore/util'
18
19
 
19
20
 
20
21
  module NHKore
21
- ###
22
- # @author Jonathan Bradley Whited
23
- # @since 0.2.0
24
- ###
25
22
  class SearchScraper < Scraper
26
23
  DEFAULT_RESULT_COUNT = 100
27
24
  FUTSUU_SITE = 'nhk.or.jp/news/html/'
@@ -34,10 +31,11 @@ module NHKore
34
31
  YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
35
32
 
36
33
  IGNORE_LINK_REGEX = %r{
37
- /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
38
- |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
39
- |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
40
- |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
34
+ /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
35
+ |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
36
+ |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
37
+ |/news/easy/index\.html? # https://www3.nhk.or.jp/news/easy/index.html
38
+ |/disaster_earthquake.html # https://www3.nhk.or.jp/news/easy/article/disaster_earthquake.html
41
39
 
42
40
  # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
43
41
  # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
@@ -47,7 +45,7 @@ module NHKore
47
45
  # Search Engines are strict, so trigger using the default HTTP header fields
48
46
  # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
49
47
  def initialize(url,eat_cookie: true,header: {},**kargs)
50
- super(url,eat_cookie: eat_cookie,header: header,**kargs)
48
+ super
51
49
  end
52
50
 
53
51
  def ignore_link?(link,cleaned: true)
@@ -56,17 +54,35 @@ module NHKore
56
54
  link = Util.unspace_web_str(link).downcase unless cleaned
57
55
 
58
56
  return true if link.empty?
59
-
60
57
  return true if IGNORE_LINK_REGEX.match?(link)
61
-
62
58
  return false
63
59
  end
60
+
61
+ # Example: https://www3.nhk.or.jp/news/easy/k10014150691000/k10014150691000.html
62
+ def fetch_valid_link?(link)
63
+ uri = begin
64
+ URI(link)
65
+ rescue StandardError
66
+ return false # Bad URL.
67
+ end
68
+
69
+ begin
70
+ ssl = uri.scheme.to_s.strip.downcase.include?('https')
71
+
72
+ Net::HTTP.start(uri.host,uri.port,use_ssl: ssl) do |http|
73
+ resp = http.head(uri.request_uri)
74
+ code = resp.code
75
+
76
+ return code != '404'
77
+ end
78
+ rescue StandardError
79
+ # Ignore; try actually scraping the article anyway.
80
+ end
81
+
82
+ return true
83
+ end
64
84
  end
65
85
 
66
- ###
67
- # @author Jonathan Bradley Whited
68
- # @since 0.2.0
69
- ###
70
86
  class BingScraper < SearchScraper
71
87
  attr_reader :regex
72
88
  attr_reader :site
@@ -136,9 +152,8 @@ module NHKore
136
152
  next_page.count = count
137
153
  next_page.url = join_url(href)
138
154
  end
139
- elsif href =~ regex
155
+ elsif href =~ regex && fetch_valid_link?(href)
140
156
  slinks.add_link(SearchLink.new(href))
141
-
142
157
  link_count += 1
143
158
  end
144
159
  end
@@ -165,10 +180,9 @@ module NHKore
165
180
  rss_links << link
166
181
 
167
182
  next if ignore_link?(link)
168
- next if link !~ regex
183
+ next if link !~ regex || !fetch_valid_link?(link)
169
184
 
170
185
  slinks.add_link(SearchLink.new(link))
171
-
172
186
  link_count += 1
173
187
  end
174
188
 
@@ -192,17 +206,13 @@ module NHKore
192
206
  end
193
207
  end
194
208
 
195
- ###
196
- # @author Jonathan Bradley Whited
197
- # @since 0.2.0
198
- ###
199
209
  class NextPage
200
210
  attr_accessor :count
201
211
  attr_accessor :rss_links
202
212
  attr_accessor :url
203
213
 
204
214
  def initialize
205
- super()
215
+ super
206
216
 
207
217
  @count = -1
208
218
  @rss_links = nil
data/lib/nhkore/sifter.rb CHANGED
@@ -15,10 +15,6 @@ require 'nhkore/util'
15
15
 
16
16
 
17
17
  module NHKore
18
- ###
19
- # @author Jonathan Bradley Whited
20
- # @since 0.2.0
21
- ###
22
18
  class Sifter
23
19
  include Fileable
24
20
 
@@ -61,10 +57,8 @@ module NHKore
61
57
  end
62
58
 
63
59
  def build_rows(words)
64
- rows = []
65
-
66
- words.each do |word|
67
- rows << build_word_row(word)
60
+ rows = words.map do |word|
61
+ build_word_row(word)
68
62
  end
69
63
 
70
64
  return rows
@@ -336,6 +330,11 @@ module NHKore
336
330
  next if filter?(article)
337
331
 
338
332
  article.words.each_value do |word|
333
+ # TODO: Try to remove garbage data better.
334
+ next if word.word.length < 2
335
+ next if word.freq <= 1
336
+ next if word.word =~ /\p{Latin}|[[:digit:]]/
337
+
339
338
  master_article.add_word(word,use_freq: true)
340
339
  end
341
340
  end
@@ -13,10 +13,6 @@ require 'nhkore/util'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Splitter
21
17
  def begin_split(str)
22
18
  return str
@@ -30,19 +26,12 @@ module NHKore
30
26
  end
31
27
  end
32
28
 
33
- ###
34
- # @author Jonathan Bradley Whited
35
- # @since 0.2.0
36
- ###
37
29
  class BasicSplitter < Splitter
38
30
  def end_split(str)
39
31
  return str.split(Util::NORMALIZE_STR_REGEX)
40
32
  end
41
33
  end
42
34
 
43
- ###
44
- # @since 0.2.0
45
- ###
46
35
  class BimyouSplitter < Splitter
47
36
  def initialize(*)
48
37
  require 'bimyou_segmenter'
@@ -55,9 +44,6 @@ module NHKore
55
44
  end
56
45
  end
57
46
 
58
- ###
59
- # @since 0.2.0
60
- ###
61
47
  class TinySplitter < Splitter
62
48
  attr_accessor :tiny
63
49
 
@@ -74,10 +60,6 @@ module NHKore
74
60
  end
75
61
  end
76
62
 
77
- ###
78
- # @author Jonathan Bradley Whited
79
- # @since 0.2.0
80
- ###
81
63
  class BestSplitter < BimyouSplitter
82
64
  end
83
65
  end
@@ -38,9 +38,6 @@ module NHKore
38
38
  #
39
39
  # The gem is really old and had a lot of warnings, so decided to make this class.
40
40
  # Maybe I'll fork the gem and maintain a new version in the future...
41
- #
42
- # @author Jonathan Bradley Whited
43
- # @since 0.2.1
44
41
  ###
45
42
  class UserAgents
46
43
  attr_accessor :data
@@ -53,7 +50,7 @@ module NHKore
53
50
  # because we don't need all of the data in memory after getting just 1
54
51
  # sample, even though it's slower.
55
52
  def initialize
56
- super()
53
+ super
57
54
 
58
55
  # rubocop:disable all
59
56
  @data = [
data/lib/nhkore/util.rb CHANGED
@@ -16,10 +16,6 @@ require 'uri'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  module Util
24
20
  CORE_DIR = 'core'
25
21
  WEB_DIR = 'web'
@@ -10,10 +10,6 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  class Variator
18
14
  def begin_variate(str)
19
15
  return str
@@ -27,10 +23,6 @@ module NHKore
27
23
  end
28
24
  end
29
25
 
30
- ###
31
- # @author Jonathan Bradley Whited
32
- # @since 0.2.0
33
- ###
34
26
  class BasicVariator < Variator
35
27
  def end_variate(str)
36
28
  return [] # No variations; don't return nil
@@ -41,8 +33,6 @@ module NHKore
41
33
  # Guesses a word's dictionary/plain form (辞書形).
42
34
  #
43
35
  # It doesn't work very well,but better than nothing...
44
- #
45
- # @since 0.2.0
46
36
  ###
47
37
  class DictFormVariator < Variator
48
38
  attr_accessor :deinflector
@@ -66,10 +56,6 @@ module NHKore
66
56
  end
67
57
  end
68
58
 
69
- ###
70
- # @author Jonathan Bradley Whited
71
- # @since 0.2.0
72
- ###
73
59
  class BestVariator < DictFormVariator
74
60
  end
75
61
  end
@@ -10,5 +10,5 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- VERSION = '0.3.14'
13
+ VERSION = '0.3.16'
14
14
  end
data/lib/nhkore/word.rb CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/util'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.1.0
22
- ###
23
19
  class Word
24
20
  attr_accessor :defn
25
21
  attr_accessor :eng
data/lib/nhkore.rb CHANGED
@@ -26,12 +26,7 @@ require 'nhkore/cli/search_cmd'
26
26
  require 'nhkore/cli/sift_cmd'
27
27
 
28
28
 
29
- ###
30
- # @author Jonathan Bradley Whited
31
- # @since 0.1.0
32
- ###
33
29
  module NHKore
34
- # @since 0.2.0
35
30
  def self.run(args=ARGV)
36
31
  app = App.new(args)
37
32
 
data/nhkore.gemspec CHANGED
@@ -54,27 +54,27 @@ Gem::Specification.new do |spec|
54
54
  run_dep[ 'attr_bool' ,'~> 0.2' ] # attr_accessor?/attr_reader?.
55
55
  run_dep[ 'bimyou_segmenter' ,'~> 1.2' ] # Splitting Japanese sentences into words.
56
56
  run_dep[ 'cri' ,'~> 2.15' ] # CLI commands/options.
57
- run_dep[ 'down' ,'~> 5.3' ] # Downloading files (GetCmd).
58
- run_dep[ 'highline' ,'~> 2.0' ] # CLI input/output.
57
+ run_dep[ 'down' ,'~> 5.4' ] # Downloading files (GetCmd).
58
+ run_dep[ 'highline' ,'~> 3.1' ] # CLI input/output.
59
59
  run_dep[ 'http-cookie' ,'~> 1.0' ] # Parsing/Setting cookies [(Bing)Scraper].
60
60
  run_dep[ 'japanese_deinflector','~> 0.0' ] # Unconjugating Japanese words (dictionary form).
61
- run_dep[ 'nokogiri' ,'~> 1.13' ] # Scraping/Hacking.
61
+ run_dep[ 'nokogiri' ,'~> 1.16' ] # Scraping/Hacking.
62
62
  run_dep[ 'psychgus' ,'~> 1.3' ] # Styling Psych YAML.
63
- run_dep[ 'public_suffix' ,'~> 4.0' ] # Parsing URL domain names.
63
+ run_dep[ 'public_suffix' ,'~> 6.0' ] # Parsing URL domain names.
64
64
  run_dep[ 'rainbow' ,'~> 3.1' ] # CLI color output.
65
- run_dep[ 'rss' ,'~> 0.2' ] # Scraping [(Bing)Scraper].
65
+ run_dep[ 'rss' ,'~> 0.3' ] # Scraping [(Bing)Scraper].
66
66
  run_dep[ 'rubyzip' ,'~> 2.3' ] # Extracting Zip files (GetCmd).
67
67
  run_dep[ 'tiny_segmenter' ,'~> 0.0' ] # Splitting Japanese sentences into words.
68
68
  run_dep[ 'tty-progressbar' ,'~> 0.18' ] # CLI progress bars.
69
69
  run_dep[ 'tty-spinner' ,'~> 0.9' ] # CLI spinning progress.
70
70
 
71
71
  dev_dep = spec.method(:add_development_dependency)
72
- dev_dep[ 'bundler' ,'~> 2.3' ]
73
- dev_dep[ 'minitest' ,'~> 5.16' ]
74
- dev_dep[ 'rake' ,'~> 13.0' ]
72
+ dev_dep[ 'bundler' ,'~> 2.5' ]
73
+ dev_dep[ 'minitest' ,'~> 5.25' ]
74
+ dev_dep[ 'rake' ,'~> 13.2' ]
75
75
  dev_dep[ 'raketeer' ,'~> 0.2' ] # Extra Rake tasks.
76
- dev_dep[ 'rdoc' ,'~> 6.4' ] # YARDoc RDoc (*.rb).
77
- dev_dep[ 'redcarpet' ,'~> 3.5' ] # YARDoc Markdown (*.md).
76
+ dev_dep[ 'rdoc' ,'~> 6.7' ] # YARDoc RDoc (*.rb).
77
+ dev_dep[ 'redcarpet' ,'~> 3.6' ] # YARDoc Markdown (*.md).
78
78
  dev_dep[ 'yard' ,'~> 0.9' ] # Doc.
79
79
  dev_dep[ 'yard_ghurt','~> 1.2' ] # Extra YARDoc Rake tasks.
80
80
 
data/samples/looper.rb CHANGED
@@ -13,9 +13,6 @@
13
13
  ###
14
14
  # If you run this script, be aware that it uses the +-F+ force option
15
15
  # (which overwrites files without prompting).
16
- #
17
- # @author Jonathan Bradley Whited
18
- # @since 0.3.2
19
16
  ###
20
17
 
21
18
  case ARGV[0]