nhkore 0.3.13 → 0.3.16

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,7 +3,7 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020-2022 Jonathan Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
  module NHKore
19
19
  module CLI
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.3.0
23
- ###
24
20
  module SearchCmd
25
21
  def build_search_cmd
26
22
  app = self
@@ -42,6 +38,12 @@ module CLI
42
38
  DESC
43
39
  app.check_empty_opt(:in,value)
44
40
  }
41
+ option :l,:loop,'number of times to repeat the search to ensure results',argument: :required,
42
+ transform: lambda { |value|
43
+ value = value.to_i
44
+ value = 1 if value < 1
45
+ value
46
+ }
45
47
  option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
46
48
  'directory/file' to save links to; if you only specify a directory or a file, it will attach the
47
49
  appropriate default directory/file name
@@ -164,6 +166,8 @@ module CLI
164
166
 
165
167
  dry_run = @cmd_opts[:dry_run]
166
168
  in_file = @cmd_opts[:in]
169
+ loop_times = @cmd_opts[:loop]
170
+ loop_times = 1 if loop_times.nil? || loop_times < 1
167
171
  out_file = @cmd_opts[:out]
168
172
  result_count = @cmd_opts[:results]
169
173
  result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
@@ -174,9 +178,6 @@ module CLI
174
178
  is_file = !in_file.nil?
175
179
  links = nil
176
180
  new_links = [] # For --dry-run
177
- next_page = NextPage.new
178
- page_count = 0
179
- page_num = 1
180
181
  url = in_file # nil will use default URL, else a file
181
182
 
182
183
  # Load previous links for 'scraped?' vars.
@@ -196,43 +197,52 @@ module CLI
196
197
  end
197
198
 
198
199
  puts "#{scraped_count} of #{links_count} links scraped."
199
-
200
200
  return
201
201
  end
202
202
 
203
- range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
204
-
205
- case search_type
206
- # Anything that extends SearchScraper.
207
- when :bing
208
- range.each do
209
- scraper = nil
210
-
211
- case search_type
212
- when :bing
213
- scraper = BingScraper.new(nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs)
214
- else
215
- raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
216
- end
203
+ 1.upto(loop_times) do |loop_i|
204
+ page_range = (0..10_000) # Do a range to prevent an infinite loop; ichiman!
205
+
206
+ next_page = NextPage.new
207
+ page_count = 0
208
+ page_num = 1
209
+
210
+ case search_type
211
+ # Anything that extends SearchScraper.
212
+ when :bing
213
+ page_range.each do
214
+ scraper = nil
215
+
216
+ case search_type
217
+ when :bing
218
+ scraper = BingScraper.new(
219
+ nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs
220
+ )
221
+ else
222
+ raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
223
+ end
217
224
 
218
- next_page = scraper.scrape(links,next_page)
225
+ next_page = scraper.scrape(links,next_page)
219
226
 
220
- new_links.concat(links.links.values[links_count..-1])
221
- links_count = links.length
222
- page_count = next_page.count if next_page.count > 0
227
+ new_links.concat(links.links.values[links_count..])
228
+ links_count = links.length
229
+ page_count = next_page.count if next_page.count > 0
223
230
 
224
- update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}," \
225
- " new_links=#{new_links.length})")
231
+ update_spin_detail(
232
+ format(' (%d/%d, page=%d, count=%d, links=%d, new_links=%d)',
233
+ loop_i,loop_times,page_num,page_count,links.length,new_links.length)
234
+ )
226
235
 
227
- break if next_page.empty?
236
+ break if next_page.empty?
228
237
 
229
- page_num += 1
230
- url = next_page.url
238
+ page_num += 1
239
+ url = next_page.url
231
240
 
232
- sleep_scraper
241
+ sleep_scraper
242
+ end
243
+ else
244
+ raise ArgumentError,"invalid search_type[#{search_type}]"
233
245
  end
234
- else
235
- raise ArgumentError,"invalid search_type[#{search_type}]"
236
246
  end
237
247
 
238
248
  stop_spin
@@ -20,10 +20,6 @@ require 'nhkore/util'
20
20
 
21
21
  module NHKore
22
22
  module CLI
23
- ###
24
- # @author Jonathan Bradley Whited
25
- # @since 0.2.0
26
- ###
27
23
  module SiftCmd
28
24
  DEFAULT_SIFT_EXT = :csv
29
25
  DEFAULT_SIFT_FUTSUU_FILE = "#{Sifter::DEFAULT_FUTSUU_FILE}{search.criteria}{file.ext}"
@@ -260,7 +256,7 @@ module CLI
260
256
  puts
261
257
 
262
258
  if dry_run
263
- puts sifter.to_s
259
+ puts sifter
264
260
  else
265
261
  start_spin('Saving sifted data to file')
266
262
 
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
 
19
19
  module NHKore
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.3.4
23
- ###
24
20
  class DatetimeParser
25
21
  extend AttrBool::Ext
26
22
 
@@ -181,7 +177,7 @@ module NHKore
181
177
  return self if @min_or_max
182
178
 
183
179
  has_small = false
184
- jst_now = Util.jst_now()
180
+ jst_now = Util.jst_now
185
181
 
186
182
  # Must be from smallest to biggest.
187
183
 
data/lib/nhkore/defn.rb CHANGED
@@ -16,17 +16,13 @@ require 'nhkore/word'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class Defn
24
20
  attr_reader :hyoukis
25
21
  attr_accessor :text
26
22
  attr_reader :words
27
23
 
28
24
  def initialize
29
- super()
25
+ super
30
26
 
31
27
  @hyoukis = []
32
28
  @text = ''.dup
data/lib/nhkore/dict.rb CHANGED
@@ -14,15 +14,11 @@ require 'nhkore/error'
14
14
 
15
15
 
16
16
  module NHKore
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  class Dict
22
18
  attr_reader :entries
23
19
 
24
20
  def initialize
25
- super()
21
+ super
26
22
 
27
23
  @entries = {}
28
24
  end
@@ -39,6 +35,7 @@ module NHKore
39
35
  dict = Dict.new
40
36
 
41
37
  hash.each do |id,array|
38
+ id = id.to_s.strip.downcase # 'RSHOK-K-003806', '0000'
42
39
  entry = Entry.scrape(id,array,missingno: missingno,url: url)
43
40
 
44
41
  next if entry.nil?
@@ -3,7 +3,7 @@
3
3
 
4
4
  #--
5
5
  # This file is part of NHKore.
6
- # Copyright (c) 2020-2021 Jonathan Bradley Whited
6
+ # Copyright (c) 2020-2022 Jonathan Bradley Whited
7
7
  #
8
8
  # SPDX-License-Identifier: LGPL-3.0-or-later
9
9
  #++
@@ -16,10 +16,6 @@ require 'nhkore/util'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class DictScraper < Scraper
24
20
  attr_accessor :missingno
25
21
 
@@ -39,7 +35,7 @@ module NHKore
39
35
  i = url.rindex(%r{[/\\]}) # Can be a URL or a file
40
36
  i = i.nil? ? 0 : (i + 1) # If no match found, no path
41
37
 
42
- basename = File.basename(url[i..-1],'.*') if basename.nil?
38
+ basename = File.basename(url[i..],'.*') if basename.nil?
43
39
  path = url[0...i]
44
40
 
45
41
  return "#{path}#{basename}.out.dic"
data/lib/nhkore/entry.rb CHANGED
@@ -14,10 +14,6 @@ require 'nhkore/util'
14
14
 
15
15
 
16
16
  module NHKore
17
- ###
18
- # @author Jonathan Bradley Whited
19
- # @since 0.2.0
20
- ###
21
17
  class Entry
22
18
  HYOUKI_SEP = '・'
23
19
 
@@ -25,18 +21,16 @@ module NHKore
25
21
  attr_accessor :id
26
22
 
27
23
  def initialize
28
- super()
24
+ super
29
25
 
30
26
  @defns = []
31
27
  @id = nil
32
28
  end
33
29
 
34
30
  def build_defn
35
- defns = []
36
31
  i = 0
37
-
38
- @defns.each do |defn|
39
- defns << "#{i += 1})#{defn}" # Japanese parenthesis
32
+ defns = @defns.map do |defn|
33
+ "#{i += 1})#{defn}" # Japanese parenthesis
40
34
  end
41
35
 
42
36
  return defns.join("\n")
data/lib/nhkore/error.rb CHANGED
@@ -10,21 +10,11 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  class Error < ::StandardError; end
18
14
 
19
- # @since 0.2.0
20
15
  class CLIError < Error; end
21
-
22
- # @since 0.2.0
16
+ class Http404Error < Error; end
23
17
  class ParseError < Error; end
24
-
25
- # @since 0.2.0
26
18
  class ScrapeError < Error; end
27
-
28
- # @since 0.2.0
29
19
  class ZipError < Error; end
30
20
  end
@@ -10,10 +10,6 @@
10
10
 
11
11
 
12
12
  module NHKore
13
- ###
14
- # @author Jonathan Bradley Whited
15
- # @since 0.2.0
16
- ###
17
13
  module Fileable
18
14
  def self.included(mod)
19
15
  mod.extend ClassMethods
data/lib/nhkore/lib.rb CHANGED
@@ -38,9 +38,6 @@ module NHKore
38
38
  ###
39
39
  # Include this file to only require the files needed to use this
40
40
  # Gem as a library (i.e., don't include CLI-related files).
41
- #
42
- # @author Jonathan Bradley Whited
43
- # @since 0.3.2
44
41
  ###
45
42
  module Lib
46
43
  end
@@ -13,10 +13,6 @@ require 'nhkore/util'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Missingno
21
17
  attr_reader :kanas
22
18
  attr_reader :kanjis
@@ -68,13 +64,13 @@ module NHKore
68
64
  def kana_from_kanji(kanji)
69
65
  word = @kanjis[kanji]
70
66
 
71
- return word.nil? ? nil : word.kana
67
+ return word&.kana
72
68
  end
73
69
 
74
70
  def kanji_from_kana(kana)
75
71
  word = @kanas[kana]
76
72
 
77
- return word.nil? ? nil : word.kanji
73
+ return word&.kanji
78
74
  end
79
75
  end
80
76
  end
data/lib/nhkore/news.rb CHANGED
@@ -16,10 +16,6 @@ require 'nhkore/util'
16
16
 
17
17
 
18
18
  module NHKore
19
- ###
20
- # @author Jonathan Bradley Whited
21
- # @since 0.2.0
22
- ###
23
19
  class News
24
20
  include Fileable
25
21
 
@@ -30,7 +26,7 @@ module NHKore
30
26
  attr_reader :sha256s
31
27
 
32
28
  def initialize
33
- super()
29
+ super
34
30
 
35
31
  @articles = {}
36
32
  @sha256s = {}
@@ -127,10 +123,6 @@ module NHKore
127
123
  end
128
124
  end
129
125
 
130
- ###
131
- # @author Jonathan Bradley Whited
132
- # @since 0.2.0
133
- ###
134
126
  class FutsuuNews < News
135
127
  DEFAULT_FILENAME = 'nhk_news_web_regular.yml'
136
128
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
@@ -144,14 +136,10 @@ module NHKore
144
136
  end
145
137
 
146
138
  def save_file(file=DEFAULT_FILE,**kargs)
147
- super(file,**kargs)
139
+ super
148
140
  end
149
141
  end
150
142
 
151
- ###
152
- # @author Jonathan Bradley Whited
153
- # @since 0.2.0
154
- ###
155
143
  class YasashiiNews < News
156
144
  DEFAULT_FILENAME = 'nhk_news_web_easy.yml'
157
145
  DEFAULT_FILE = build_file(DEFAULT_FILENAME)
@@ -165,7 +153,7 @@ module NHKore
165
153
  end
166
154
 
167
155
  def save_file(file=DEFAULT_FILE,**kargs)
168
- super(file,**kargs)
156
+ super
169
157
  end
170
158
  end
171
159
  end
@@ -13,10 +13,6 @@ require 'nhkore/word'
13
13
 
14
14
 
15
15
  module NHKore
16
- ###
17
- # @author Jonathan Bradley Whited
18
- # @since 0.2.0
19
- ###
20
16
  class Polisher
21
17
  def begin_polish(str)
22
18
  return str
@@ -52,10 +48,6 @@ module NHKore
52
48
  end
53
49
  end
54
50
 
55
- ###
56
- # @author Jonathan Bradley Whited
57
- # @since 0.2.0
58
- ###
59
51
  class BasicPolisher < Polisher
60
52
  def end_polish(str)
61
53
  # Keep Japanese dots in names:
@@ -72,10 +64,6 @@ module NHKore
72
64
  end
73
65
  end
74
66
 
75
- ###
76
- # @author Jonathan Bradley Whited
77
- # @since 0.2.0
78
- ###
79
67
  class BestPolisher < BasicPolisher
80
68
  end
81
69
  end
@@ -13,15 +13,12 @@ require 'attr_bool'
13
13
  require 'nokogiri'
14
14
  require 'open-uri'
15
15
 
16
+ require 'nhkore/error'
16
17
  require 'nhkore/user_agents'
17
18
  require 'nhkore/util'
18
19
 
19
20
 
20
21
  module NHKore
21
- ###
22
- # @author Jonathan Bradley Whited
23
- # @since 0.2.0
24
- ###
25
22
  class Scraper
26
23
  extend AttrBool::Ext
27
24
 
@@ -177,7 +174,13 @@ module NHKore
177
174
  retry
178
175
  # Must come after HTTPRedirect since a subclass of HTTPError.
179
176
  rescue OpenURI::HTTPError => e
180
- raise e.exception("HTTP error[#{e}] at URL[#{url}]")
177
+ msg = "HTTP error[#{e}] at URL[#{url}]"
178
+
179
+ if e.to_s.include?('404 Not Found')
180
+ raise Http404Error,msg
181
+ else
182
+ raise e.exception(msg)
183
+ end
181
184
  rescue SocketError => e
182
185
  if (max_retries -= 1) < 0
183
186
  raise e.exception("Socket error[#{e}] at URL[#{url}]")
@@ -17,10 +17,6 @@ require 'nhkore/util'
17
17
 
18
18
 
19
19
  module NHKore
20
- ###
21
- # @author Jonathan Bradley Whited
22
- # @since 0.2.0
23
- ###
24
20
  class SearchLink
25
21
  extend AttrBool::Ext
26
22
 
@@ -45,11 +41,11 @@ module NHKore
45
41
  def encode_with(coder)
46
42
  # Order matters.
47
43
 
48
- coder[:url] = @url.nil? ? nil : @url.to_s
44
+ coder[:url] = @url&.to_s
49
45
  coder[:scraped] = @scraped
50
- coder[:datetime] = @datetime.nil? ? nil : @datetime.iso8601
46
+ coder[:datetime] = @datetime&.iso8601
51
47
  coder[:title] = @title
52
- coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
48
+ coder[:futsuurl] = @futsuurl&.to_s
53
49
  coder[:sha256] = @sha256
54
50
  end
55
51
 
@@ -86,13 +82,13 @@ module NHKore
86
82
  end
87
83
 
88
84
  def futsuurl=(value)
89
- # Don't store URI, store String.
90
- @futsuurl = value.nil? ? nil : value.to_s
85
+ # Don't store URI, store String or nil.
86
+ @futsuurl = value&.to_s
91
87
  end
92
88
 
93
89
  def url=(value)
94
- # Don't store URI, store String.
95
- @url = value.nil? ? nil : value.to_s
90
+ # Don't store URI, store String or nil.
91
+ @url = value&.to_s
96
92
  end
97
93
 
98
94
  def to_s(mini: false)
@@ -114,10 +110,6 @@ module NHKore
114
110
  end
115
111
  end
116
112
 
117
- ###
118
- # @author Jonathan Bradley Whited
119
- # @since 0.2.0
120
- ###
121
113
  class SearchLinks
122
114
  include Fileable
123
115
 
@@ -136,13 +128,13 @@ module NHKore
136
128
  attr_reader :links
137
129
 
138
130
  def initialize
139
- super()
131
+ super
140
132
 
141
133
  @links = {}
142
134
  end
143
135
 
144
136
  def add_link(link)
145
- url = link.url.nil? ? nil : link.url.to_s
137
+ url = link.url&.to_s
146
138
 
147
139
  return self if @links.key?(url)
148
140
 
@@ -9,6 +9,7 @@
9
9
  #++
10
10
 
11
11
 
12
+ require 'net/http'
12
13
  require 'uri'
13
14
 
14
15
  require 'nhkore/error'
@@ -18,10 +19,6 @@ require 'nhkore/util'
18
19
 
19
20
 
20
21
  module NHKore
21
- ###
22
- # @author Jonathan Bradley Whited
23
- # @since 0.2.0
24
- ###
25
22
  class SearchScraper < Scraper
26
23
  DEFAULT_RESULT_COUNT = 100
27
24
  FUTSUU_SITE = 'nhk.or.jp/news/html/'
@@ -34,10 +31,11 @@ module NHKore
34
31
  YASASHII_REGEX = /\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
35
32
 
36
33
  IGNORE_LINK_REGEX = %r{
37
- /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
38
- |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
39
- |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
40
- |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
34
+ /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
35
+ |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
36
+ |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
37
+ |/news/easy/index\.html? # https://www3.nhk.or.jp/news/easy/index.html
38
+ |/disaster_earthquake.html # https://www3.nhk.or.jp/news/easy/article/disaster_earthquake.html
41
39
 
42
40
  # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
43
41
  # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
@@ -47,7 +45,7 @@ module NHKore
47
45
  # Search Engines are strict, so trigger using the default HTTP header fields
48
46
  # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
49
47
  def initialize(url,eat_cookie: true,header: {},**kargs)
50
- super(url,eat_cookie: eat_cookie,header: header,**kargs)
48
+ super
51
49
  end
52
50
 
53
51
  def ignore_link?(link,cleaned: true)
@@ -56,17 +54,35 @@ module NHKore
56
54
  link = Util.unspace_web_str(link).downcase unless cleaned
57
55
 
58
56
  return true if link.empty?
59
-
60
57
  return true if IGNORE_LINK_REGEX.match?(link)
61
-
62
58
  return false
63
59
  end
60
+
61
+ # Example: https://www3.nhk.or.jp/news/easy/k10014150691000/k10014150691000.html
62
+ def fetch_valid_link?(link)
63
+ uri = begin
64
+ URI(link)
65
+ rescue StandardError
66
+ return false # Bad URL.
67
+ end
68
+
69
+ begin
70
+ ssl = uri.scheme.to_s.strip.downcase.include?('https')
71
+
72
+ Net::HTTP.start(uri.host,uri.port,use_ssl: ssl) do |http|
73
+ resp = http.head(uri.request_uri)
74
+ code = resp.code
75
+
76
+ return code != '404'
77
+ end
78
+ rescue StandardError
79
+ # Ignore; try actually scraping the article anyway.
80
+ end
81
+
82
+ return true
83
+ end
64
84
  end
65
85
 
66
- ###
67
- # @author Jonathan Bradley Whited
68
- # @since 0.2.0
69
- ###
70
86
  class BingScraper < SearchScraper
71
87
  attr_reader :regex
72
88
  attr_reader :site
@@ -136,9 +152,8 @@ module NHKore
136
152
  next_page.count = count
137
153
  next_page.url = join_url(href)
138
154
  end
139
- elsif href =~ regex
155
+ elsif href =~ regex && fetch_valid_link?(href)
140
156
  slinks.add_link(SearchLink.new(href))
141
-
142
157
  link_count += 1
143
158
  end
144
159
  end
@@ -165,10 +180,9 @@ module NHKore
165
180
  rss_links << link
166
181
 
167
182
  next if ignore_link?(link)
168
- next if link !~ regex
183
+ next if link !~ regex || !fetch_valid_link?(link)
169
184
 
170
185
  slinks.add_link(SearchLink.new(link))
171
-
172
186
  link_count += 1
173
187
  end
174
188
 
@@ -192,17 +206,13 @@ module NHKore
192
206
  end
193
207
  end
194
208
 
195
- ###
196
- # @author Jonathan Bradley Whited
197
- # @since 0.2.0
198
- ###
199
209
  class NextPage
200
210
  attr_accessor :count
201
211
  attr_accessor :rss_links
202
212
  attr_accessor :url
203
213
 
204
214
  def initialize
205
- super()
215
+ super
206
216
 
207
217
  @count = -1
208
218
  @rss_links = nil