nhkore 0.3.13 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020-2022 Jonathan Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
  module NHKore
19
19
  module CLI
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.3.0
23
- ###
24
20
  module SearchCmd
25
21
  def build_search_cmd
26
22
  app = self
@@ -42,6 +38,12 @@ module CLI
42
38
  DESC
43
39
  app.check_empty_opt(:in,value)
44
40
  }
41
+ option :l,:loop,'number of times to repeat the search to ensure results',argument: :required,
42
+ transform: lambda { |value|
43
+ value = value.to_i
44
+ value = 1 if value < 1
45
+ value
46
+ }
45
47
  option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
46
48
  'directory/file' to save links to; if you only specify a directory or a file, it will attach the
47
49
  appropriate default directory/file name
@@ -164,6 +166,8 @@ module CLI
164
166
 
165
167
  dry_run = @cmd_opts[:dry_run]
166
168
  in_file = @cmd_opts[:in]
169
+ loop_times = @cmd_opts[:loop]
170
+ loop_times = 1 if loop_times.nil? || loop_times < 1
167
171
  out_file = @cmd_opts[:out]
168
172
  result_count = @cmd_opts[:results]
169
173
  result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
@@ -174,9 +178,6 @@ module CLI
174
178
  is_file = !in_file.nil?
175
179
  links = nil
176
180
  new_links = [] # For --dry-run
177
- next_page = NextPage.new
178
- page_count = 0
179
- page_num = 1
180
181
  url = in_file # nil will use default URL, else a file
181
182
 
182
183
  # Load previous links for 'scraped?' vars.
@@ -196,43 +197,52 @@ module CLI
196
197
  end
197
198
 
198
199
  puts "#{scraped_count} of #{links_count} links scraped."
199
-
200
200
  return
201
201
  end
202
202
 
203
- range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
204
-
205
- case search_type
206
- # Anything that extends SearchScraper.
207
- when :bing
208
- range.each do
209
- scraper = nil
210
-
211
- case search_type
212
- when :bing
213
- scraper = BingScraper.new(nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs)
214
- else
215
- raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
216
- end
203
+ 1.upto(loop_times) do |loop_i|
204
+ page_range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
205
+
206
+ next_page = NextPage.new
207
+ page_count = 0
208
+ page_num = 1
209
+
210
+ case search_type
211
+ # Anything that extends SearchScraper.
212
+ when :bing
213
+ page_range.each do
214
+ scraper = nil
215
+
216
+ case search_type
217
+ when :bing
218
+ scraper = BingScraper.new(
219
+ nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs
220
+ )
221
+ else
222
+ raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
223
+ end
217
224
 
218
- next_page = scraper.scrape(links,next_page)
225
+ next_page = scraper.scrape(links,next_page)
219
226
 
220
- new_links.concat(links.links.values[links_count..-1])
221
- links_count = links.length
222
- page_count = next_page.count if next_page.count > 0
227
+ new_links.concat(links.links.values[links_count..])
228
+ links_count = links.length
229
+ page_count = next_page.count if next_page.count > 0
223
230
 
224
- update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}," \
225
- " new_links=#{new_links.length})")
231
+ update_spin_detail(
232
+ format(' (%d/%d, page=%d, count=%d, links=%d, new_links=%d)',
233
+ loop_i,loop_times,page_num,page_count,links.length,new_links.length)
234
+ )
226
235
 
227
- break if next_page.empty?
236
+ break if next_page.empty?
228
237
 
229
- page_num += 1
230
- url = next_page.url
238
+ page_num += 1
239
+ url = next_page.url
231
240
 
232
- sleep_scraper
241
+ sleep_scraper
242
+ end
243
+ else
244
+ raise ArgumentError,"invalid search_type[#{search_type}]"
233
245
  end
234
- else
235
- raise ArgumentError,"invalid search_type[#{search_type}]"
236
246
  end
237
247
 
238
248
  stop_spin
@@ -20,10 +20,6 @@ require 'nhkore/util'
20
20
 
21
21
  module NHKore
22
22
  module CLI
23
- ###
24
- # @author Jonathan Bradley Whited
25
- # @since 0.2.0
26
- ###
27
23
  module SiftCmd
28
24
  DEFAULT_SIFT_EXT = :csv
29
25
  DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}"
@@ -260,7 +256,7 @@ module CLI
260
256
  puts
261
257
 
262
258
  if dry_run
263
- puts sifter.to_s
259
+ puts sifter
264
260
  else
265
261
  start_spin('Saving sifted data to file')
266
262
 
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
 
19
19
  module NHKore
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.3.4
23
- ###
24
20
  class DatetimeParser
25
21
  extend AttrBool::Ext
26
22
 
@@ -181,7 +177,7 @@ module NHKore
181
177
  return self if @min_or_max
182
178
 
183
179
  has_small = false
184
- jst_now = Util.jst_now()
180
+ jst_now = Util.jst_now
185
181
 
186
182
  # Must be from smallest to biggest.
187
183
 
data/lib/nhkore/defn.rb CHANGED
@@ -16,17 +16,13 @@ require 'nhkore/word'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class Defn
24
20
  attr_reader :hyoukis
25
21
  attr_accessor :text
26
22
  attr_reader :words
27
23
 
28
24
  def initialize
29
- super()
25
+ super
30
26
 
31
27
  @hyoukis = []
32
28
  @text = ''.dup
data/lib/nhkore/dict.rb CHANGED
@@ -14,15 +14,11 @@ require 'nhkore/error'
14
14
 
15
15
 
16
16
  module NHKore
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  class Dict
22
18
  attr_reader :entries
23
19
 
24
20
  def initialize
25
- super()
21
+ super
26
22
 
27
23
  @entries = {}
28
24
  end
@@ -39,6 +35,7 @@ module NHKore
39
35
  dict = Dict.new
40
36
 
41
37
  hash.each do |id,array|
38
+ id = id.to_s.strip.downcase # 'RSHOK-K-003806', '0000'
42
39
  entry = Entry.scrape(id,array,missingno: missingno,url: url)
43
40
 
44
41
  next if entry.nil?
@@ -3,7 +3,7 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020-2022 Jonathan Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
@@ -16,10 +16,6 @@ require 'nhkore/util'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class DictScraper < Scraper
24
20
  attr_accessor :missingno
25
21
 
@@ -39,7 +35,7 @@ module NHKore
39
35
  i = url.rindex(%r{[/\\]}) # Can be a URL or a file
40
36
  i = i.nil? ? 0 : (i + 1) # If no match found, no path
41
37
 
42
- basename = File.basename(url[i..-1],'.*') if basename.nil?
38
+ basename = File.basename(url[i..],'.*') if basename.nil?
43
39
  path = url[0...i]
44
40
 
45
41
  return "#{path}#{basename}.out.dic"
data/lib/nhkore/entry.rb CHANGED
@@ -14,10 +14,6 @@ require 'nhkore/util'
14
14
 
15
15
 
16
16
  module NHKore
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  class Entry
22
18
  HYOUKI_SEP = '・'
23
19
 
@@ -25,18 +21,16 @@ module NHKore
25
21
  attr_accessor :id
26
22
 
27
23
  def initialize
28
- super()
24
+ super
29
25
 
30
26
  @defns = []
31
27
  @id = nil
32
28
  end
33
29
 
34
30
  def build_defn
35
- defns = []
36
31
  i = 0
37
-
38
- @defns.each do |defn|
39
- defns << "#{i += 1})#{defn}" # Japanese parenthesis
32
+ defns = @defns.map do |defn|
33
+ "#{i += 1})#{defn}" # Japanese parenthesis
40
34
  end
41
35
 
42
36
  return defns.join("\n")
data/lib/nhkore/error.rb CHANGED
@@ -10,21 +10,11 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  class Error < ::StandardError; end
18
14
 
19
- # @since 0.2.0
20
15
  class CLIError < Error; end
21
-
22
- # @since 0.2.0
16
+ class Http404Error < Error; end
23
17
  class ParseError < Error; end
24
-
25
- # @since 0.2.0
26
18
  class ScrapeError < Error; end
27
-
28
- # @since 0.2.0
29
19
  class ZipError < Error; end
30
20
  end
@@ -10,10 +10,6 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  module Fileable
18
14
  def self.included(mod)
19
15
  mod.extend ClassMethods
data/lib/nhkore/lib.rb CHANGED
@@ -38,9 +38,6 @@ module NHKore
38
38
  ###
39
39
  # Include this file to only require the files needed to use this
40
40
  # Gem as a library (i.e., don't include CLI-related files).
41
- #
42
- # @author Jonathan Bradley Whited
43
- # @since 0.3.2
44
41
  ###
45
42
  module Lib
46
43
  end
@@ -13,10 +13,6 @@ require 'nhkore/util'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Missingno
21
17
  attr_reader :kanas
22
18
  attr_reader :kanjis
@@ -68,13 +64,13 @@ module NHKore
68
64
  def kana_from_kanji(kanji)
69
65
  word = @kanjis[kanji]
70
66
 
71
- return word.nil? ? nil : word.kana
67
+ return word&.kana
72
68
  end
73
69
 
74
70
  def kanji_from_kana(kana)
75
71
  word = @kanas[kana]
76
72
 
77
- return word.nil? ? nil : word.kanji
73
+ return word&.kanji
78
74
  end
79
75
  end
80
76
  end
data/lib/nhkore/news.rb CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/util'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class News
24
20
  include Fileable
25
21
 
@@ -30,7 +26,7 @@ module NHKore
30
26
  attr_reader :sha256s
31
27
 
32
28
  def initialize
33
- super()
29
+ super
34
30
 
35
31
  @articles = {}
36
32
  @sha256s = {}
@@ -127,10 +123,6 @@ module NHKore
127
123
  end
128
124
  end
129
125
 
130
- ###
131
- # @author Jonathan Bradley Whited
132
- # @since 0.2.0
133
- ###
134
126
  class FutsuuNews < News
135
127
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
136
128
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
@@ -144,14 +136,10 @@ module NHKore
144
136
  end
145
137
 
146
138
  def save_file(file=DEFAULT_FILE,**kargs)
147
- super(file,**kargs)
139
+ super
148
140
  end
149
141
  end
150
142
 
151
- ###
152
- # @author Jonathan Bradley Whited
153
- # @since 0.2.0
154
- ###
155
143
  class YasashiiNews < News
156
144
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
157
145
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
@@ -165,7 +153,7 @@ module NHKore
165
153
  end
166
154
 
167
155
  def save_file(file=DEFAULT_FILE,**kargs)
168
- super(file,**kargs)
156
+ super
169
157
  end
170
158
  end
171
159
  end
@@ -13,10 +13,6 @@ require 'nhkore/word'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Polisher
21
17
  def begin_polish(str)
22
18
  return str
@@ -52,10 +48,6 @@ module NHKore
52
48
  end
53
49
  end
54
50
 
55
- ###
56
- # @author Jonathan Bradley Whited
57
- # @since 0.2.0
58
- ###
59
51
  class BasicPolisher < Polisher
60
52
  def end_polish(str)
61
53
  # Keep Japanese dots in names:
@@ -72,10 +64,6 @@ module NHKore
72
64
  end
73
65
  end
74
66
 
75
- ###
76
- # @author Jonathan Bradley Whited
77
- # @since 0.2.0
78
- ###
79
67
  class BestPolisher < BasicPolisher
80
68
  end
81
69
  end
@@ -13,15 +13,12 @@ require 'attr_bool'
13
13
  require 'nokogiri'
14
14
  require 'open-uri'
15
15
 
16
+ require 'nhkore/error'
16
17
  require 'nhkore/user_agents'
17
18
  require 'nhkore/util'
18
19
 
19
20
 
20
21
  module NHKore
21
- ###
22
- # @author Jonathan Bradley Whited
23
- # @since 0.2.0
24
- ###
25
22
  class Scraper
26
23
  extend AttrBool::Ext
27
24
 
@@ -177,7 +174,13 @@ module NHKore
177
174
  retry
178
175
  # Must come after HTTPRedirect since a subclass of HTTPError.
179
176
  rescue OpenURI::HTTPError => e
180
- raise e.exception("HTTP error[#{e}] at URL[#{url}]")
177
+ msg = "HTTP error[#{e}] at URL[#{url}]"
178
+
179
+ if e.to_s.include?('404 Not Found')
180
+ raise Http404Error,msg
181
+ else
182
+ raise e.exception(msg)
183
+ end
181
184
  rescue SocketError => e
182
185
  if (max_retries -= 1) < 0
183
186
  raise e.exception("Socket error[#{e}] at URL[#{url}]")
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
 
19
19
  module NHKore
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.2.0
23
- ###
24
20
  class SearchLink
25
21
  extend AttrBool::Ext
26
22
 
@@ -45,11 +41,11 @@ module NHKore
45
41
  def encode_with(coder)
46
42
  # Order matters.
47
43
 
48
- coder[:url] = @url.nil? ? nil : @url.to_s
44
+ coder[:url] = @url&.to_s
49
45
  coder[:scraped] = @scraped
50
- coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
46
+ coder[:datetime] = @datetime&.iso8601
51
47
  coder[:title] = @title
52
- coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
48
+ coder[:futsuurl] = @futsuurl&.to_s
53
49
  coder[:sha256] = @sha256
54
50
  end
55
51
 
@@ -86,13 +82,13 @@ module NHKore
86
82
  end
87
83
 
88
84
  def futsuurl=(value)
89
- # Don't store URI, store String.
90
- @futsuurl = value.nil? ? nil : value.to_s
85
+ # Don't store URI, store String or nil.
86
+ @futsuurl = value&.to_s
91
87
  end
92
88
 
93
89
  def url=(value)
94
- # Don't store URI, store String.
95
- @url = value.nil? ? nil : value.to_s
90
+ # Don't store URI, store String or nil.
91
+ @url = value&.to_s
96
92
  end
97
93
 
98
94
  def to_s(mini: false)
@@ -114,10 +110,6 @@ module NHKore
114
110
  end
115
111
  end
116
112
 
117
- ###
118
- # @author Jonathan Bradley Whited
119
- # @since 0.2.0
120
- ###
121
113
  class SearchLinks
122
114
  include Fileable
123
115
 
@@ -136,13 +128,13 @@ module NHKore
136
128
  attr_reader :links
137
129
 
138
130
  def initialize
139
- super()
131
+ super
140
132
 
141
133
  @links = {}
142
134
  end
143
135
 
144
136
  def add_link(link)
145
- url = link.url.nil? ? nil : link.url.to_s
137
+ url = link.url&.to_s
146
138
 
147
139
  return self if @links.key?(url)
148
140
 
@@ -9,6 +9,7 @@
9
9
  #++
10
10
 
11
11
 
12
+ require 'net/http'
12
13
  require 'uri'
13
14
 
14
15
  require 'nhkore/error'
@@ -18,10 +19,6 @@ require 'nhkore/util'
18
19
 
19
20
 
20
21
  module NHKore
21
- ###
22
- # @author Jonathan Bradley Whited
23
- # @since 0.2.0
24
- ###
25
22
  class SearchScraper < Scraper
26
23
  DEFAULT_RESULT_COUNT = 100
27
24
  FUTSUU_SITE = 'nhk.or.jp/news/html/'
@@ -34,10 +31,11 @@ module NHKore
34
31
  YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
35
32
 
36
33
  IGNORE_LINK_REGEX = %r{
37
- /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
38
- |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
39
- |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
40
- |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
34
+ /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
35
+ |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
36
+ |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
37
+ |/news/easy/index\.html? # https://www3.nhk.or.jp/news/easy/index.html
38
+ |/disaster_earthquake.html # https://www3.nhk.or.jp/news/easy/article/disaster_earthquake.html
41
39
 
42
40
  # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
43
41
  # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
@@ -47,7 +45,7 @@ module NHKore
47
45
  # Search Engines are strict, so trigger using the default HTTP header fields
48
46
  # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
49
47
  def initialize(url,eat_cookie: true,header: {},**kargs)
50
- super(url,eat_cookie: eat_cookie,header: header,**kargs)
48
+ super
51
49
  end
52
50
 
53
51
  def ignore_link?(link,cleaned: true)
@@ -56,17 +54,35 @@ module NHKore
56
54
  link = Util.unspace_web_str(link).downcase unless cleaned
57
55
 
58
56
  return true if link.empty?
59
-
60
57
  return true if IGNORE_LINK_REGEX.match?(link)
61
-
62
58
  return false
63
59
  end
60
+
61
+ # Example: https://www3.nhk.or.jp/news/easy/k10014150691000/k10014150691000.html
62
+ def fetch_valid_link?(link)
63
+ uri = begin
64
+ URI(link)
65
+ rescue StandardError
66
+ return false # Bad URL.
67
+ end
68
+
69
+ begin
70
+ ssl = uri.scheme.to_s.strip.downcase.include?('https')
71
+
72
+ Net::HTTP.start(uri.host,uri.port,use_ssl: ssl) do |http|
73
+ resp = http.head(uri.request_uri)
74
+ code = resp.code
75
+
76
+ return code != '404'
77
+ end
78
+ rescue StandardError
79
+ # Ignore; try actually scraping the article anyway.
80
+ end
81
+
82
+ return true
83
+ end
64
84
  end
65
85
 
66
- ###
67
- # @author Jonathan Bradley Whited
68
- # @since 0.2.0
69
- ###
70
86
  class BingScraper < SearchScraper
71
87
  attr_reader :regex
72
88
  attr_reader :site
@@ -136,9 +152,8 @@ module NHKore
136
152
  next_page.count = count
137
153
  next_page.url = join_url(href)
138
154
  end
139
- elsif href =~ regex
155
+ elsif href =~ regex && fetch_valid_link?(href)
140
156
  slinks.add_link(SearchLink.new(href))
141
-
142
157
  link_count += 1
143
158
  end
144
159
  end
@@ -165,10 +180,9 @@ module NHKore
165
180
  rss_links << link
166
181
 
167
182
  next if ignore_link?(link)
168
- next if link !~ regex
183
+ next if link !~ regex || !fetch_valid_link?(link)
169
184
 
170
185
  slinks.add_link(SearchLink.new(link))
171
-
172
186
  link_count += 1
173
187
  end
174
188
 
@@ -192,17 +206,13 @@ module NHKore
192
206
  end
193
207
  end
194
208
 
195
- ###
196
- # @author Jonathan Bradley Whited
197
- # @since 0.2.0
198
- ###
199
209
  class NextPage
200
210
  attr_accessor :count
201
211
  attr_accessor :rss_links
202
212
  attr_accessor :url
203
213
 
204
214
  def initialize
205
- super()
215
+ super
206
216
 
207
217
  @count = -1
208
218
  @rss_links = nil