nhkore 0.3.4 → 0.3.9

--- lib/nhkore/article.rb
+++ lib/nhkore/article.rb
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -29,20 +17,20 @@ require 'nhkore/word'

  module NHKore
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class Article
- attr_accessor :datetime
- attr_accessor :futsuurl
+ attr_reader :datetime
+ attr_reader :futsuurl
  attr_accessor :sha256
  attr_accessor :title
- attr_accessor :url
+ attr_reader :url
  attr_reader :words
-
- def initialize()
+
+ def initialize
  super()
-
+
  @datetime = nil
  @futsuurl = nil
  @sha256 = nil
@@ -50,7 +38,7 @@ module NHKore
  @url = nil
  @words = {}
  end
-
+
  # Why does this not look up the kanji/kana only and then update the other
  # kana/kanji part appropriately?
  # - There are some words like +行って+. Without the kana, it's difficult to
@@ -60,70 +48,85 @@ module NHKore
  # try to populate the other value.
  def add_word(word,use_freq: false)
  curr_word = words[word.key]
-
- if curr_word.nil?()
+
+ if curr_word.nil?
  words[word.key] = word
  curr_word = word
  else
  curr_word.freq += (use_freq ? word.freq : 1)
-
- curr_word.defn = word.defn if word.defn.to_s().length > curr_word.defn.to_s().length
- curr_word.eng = word.eng if word.eng.to_s().length > curr_word.eng.to_s().length
+
+ curr_word.defn = word.defn if word.defn.to_s.length > curr_word.defn.to_s.length
+ curr_word.eng = word.eng if word.eng.to_s.length > curr_word.eng.to_s.length
  end
-
+
  return curr_word
  end
-
+
  def encode_with(coder)
  # Order matters.
-
- coder[:datetime] = @datetime.nil?() ? @datetime : @datetime.iso8601()
+
+ coder[:datetime] = @datetime.nil? ? @datetime : @datetime.iso8601
  coder[:title] = @title
- coder[:url] = @url
- coder[:futsuurl] = @futsuurl
+ coder[:url] = @url.nil? ? nil : @url.to_s
+ coder[:futsuurl] = @futsuurl.nil? ? nil : @futsuurl.to_s
  coder[:sha256] = @sha256
  coder[:words] = @words
  end
-
+
  def self.load_data(key,hash)
- datetime = hash[:datetime]
  words = hash[:words]
-
- article = Article.new()
-
- article.datetime = Util.empty_web_str?(datetime) ? nil : Time.iso8601(datetime)
+
+ article = Article.new
+
+ article.datetime = hash[:datetime]
  article.futsuurl = hash[:futsuurl]
  article.sha256 = hash[:sha256]
  article.title = hash[:title]
  article.url = hash[:url]
-
- if !words.nil?()
- words.each() do |k,h|
- k = k.to_s() # Change from a symbol
- article.words[k] = Word.load_data(k,h)
- end
+
+ words&.each() do |k,h|
+ k = k.to_s # Change from a symbol
+ article.words[k] = Word.load_data(k,h)
  end
-
+
  return article
  end
-
+
+ def datetime=(value)
+ if value.is_a?(Time)
+ @datetime = value
+ else
+ @datetime = Util.empty_web_str?(value) ? nil : Time.iso8601(value)
+ end
+ end
+
+ def futsuurl=(value)
+ # Don't store URI, store String.
+ @futsuurl = value.nil? ? nil : value.to_s
+ end
+
+ def url=(value)
+ # Don't store URI, store String.
+ @url = value.nil? ? nil : value.to_s
+ end
+
  def to_s(mini: false)
- s = ''.dup()
-
+ s = ''.dup
+
  s << "'#{@url}':"
  s << "\n datetime: '#{@datetime}'"
  s << "\n title: '#{@title}'"
  s << "\n url: '#{@url}'"
  s << "\n futsuurl: '#{@futsuurl}'"
  s << "\n sha256: '#{@sha256}'"
-
+
  if !mini
  s << "\n words:"
- @words.each() do |key,word|
+ @words.each do |key,word|
  s << "\n #{word}"
  end
  end
-
+
  return s
  end
  end
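Note on the Article diff above: datetime, url, and futsuurl move from attr_accessor to attr_reader with custom setters, so values are normalized at assignment time. datetime= accepts either a Time or an ISO-8601 string (parsed via Time.iso8601), while url= and futsuurl= store plain Strings instead of URI objects; this is also why load_data no longer parses hash[:datetime] itself. A minimal sketch of the new setter behavior (the sample values are illustrative, not taken from the diff):

    require 'nhkore/article'
    require 'uri'

    article = NHKore::Article.new

    # A Time value passes through unchanged; a String is parsed with
    # Time.iso8601; an empty/nil web string becomes nil.
    article.datetime = '2020-04-01T09:30:00+09:00'
    article.datetime.class # => Time

    # URI objects (or anything else) are stored as Strings via to_s.
    article.url = URI('https://www3.nhk.or.jp/news/easy/')
    article.url.class # => String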
--- lib/nhkore/article_scraper.rb
+++ lib/nhkore/article_scraper.rb
@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -39,10 +27,12 @@ require 'nhkore/word'

  module NHKore
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class ArticleScraper < Scraper
+ extend AttrBool::Ext
+
  attr_reader :cleaners
  attr_accessor :datetime
  attr_accessor :dict
@@ -53,18 +43,20 @@ module NHKore
  attr_accessor? :strict
  attr_reader :variators
  attr_accessor :year
-
+
  # @param dict [Dict,:scrape,nil] the {Dict} (dictionary) to use for {Word#defn} (definitions)
  # [+:scrape+] auto-scrape it using {DictScraper}
  # [+nil+] don't scrape/use it
  # @param missingno [Missingno] data to use as a fallback for Ruby words without kana/kanji,
  # instead of raising an error
  # @param strict [true,false]
- def initialize(url,cleaners: [BestCleaner.new()],datetime: nil,dict: :scrape,missingno: nil,polishers: [BestPolisher.new()],splitter: BestSplitter.new(),strict: true,variators: [BestVariator.new()],year: nil,**kargs)
+ def initialize(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil,
+ polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,
+ variators: [BestVariator.new],year: nil,**kargs)
  super(url,**kargs)
-
+
  @cleaners = Array(cleaners)
- @datetime = datetime.nil?() ? nil : Util.jst_time(datetime)
+ @datetime = datetime.nil? ? nil : Util.jst_time(datetime)
  @dict = dict
  @kargs = kargs
  @missingno = missingno
@@ -74,20 +66,20 @@ module NHKore
  @variators = Array(variators)
  @year = year
  end
-
+
  def add_words(article,words,text)
- words.each() do |word|
+ words.each do |word|
  # Words should have already been cleaned.
  # If we don't check this, Word.new() could raise an error in polish().
- next if polish(word.word).empty?()
-
+ next if polish(word.word).empty?
+
  article.add_word(polish(word))
-
- variate(word.word).each() do |v|
+
+ variate(word.word).each do |v|
  v = polish(clean(v))
-
- next if v.empty?()
-
+
+ next if v.empty?
+
  # Do not pass in "word: word". We only want defn & eng.
  # If we pass in kanji/kana & unknown, it will raise an error.
  article.add_word(Word.new(
@@ -97,513 +89,540 @@ module NHKore
  ))
  end
  end
-
- split(text).each() do |t|
+
+ split(text).each do |t|
  t = polish(clean(t))
-
- next if t.empty?()
-
+
+ next if t.empty?
+
  article.add_word(Word.new(unknown: t))
-
- variate(t).each() do |v|
+
+ variate(t).each do |v|
  v = polish(clean(v))
-
- next if v.empty?()
-
+
+ next if v.empty?
+
  article.add_word(Word.new(unknown: v))
  end
  end
  end
-
+
  def clean(obj)
  return Cleaner.clean_any(obj,@cleaners)
  end
-
- def fix_bad_html()
+
+ def fix_bad_html
  # Fixes:
  # - '<「<' without escaping '<' as '&lt;'
  # - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
  # - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'
-
- read()
-
- # To add a new one, simply add '|(...)' on a newline and test $#.
+
+ read
+
+ # To add a new one, simply add '|(...)' on a newline and test Regexp.last_match().
  @str_or_io = @str_or_io.gsub(/
- (\<「\<)
+ (?<cane><「<)
  /x) do |match|
- if !$1.nil?()
+ if !Regexp.last_match(:cane).nil?
  match = match.sub('<','&lt;')
  end
-
+
  match
  end
  end
-
+
  def parse_datetime(str,year)
  str = str.gsub(/[\[\][[:space:]]]+/,'') # Remove: [ ] \s
  str = "#{year}年 #{str} #{Util::JST_OFFSET}"
-
+
  return Time.strptime(str,'%Y年 %m月%d日%H時%M分 %:z')
  end
-
+
  def parse_dicwin_id(str)
  str = str.gsub(/\D+/,'')
-
- return nil if str.empty?()
+
+ return nil if str.empty?
  return str
  end
-
+
  def polish(obj)
  return Polisher.polish_any(obj,@polishers)
  end
-
- def scrape()
- scrape_dict()
- fix_bad_html()
-
- article = Article.new()
- doc = html_doc()
-
+
+ def scrape
+ scrape_dict
+ fix_bad_html
+
+ article = Article.new
+ doc = html_doc
+
  article.futsuurl = scrape_futsuurl(doc)
-
+
  article.datetime = scrape_datetime(doc,article.futsuurl)
  article.sha256 = scrape_content(doc,article)
  article.title = scrape_title(doc,article)
  article.url = @url
-
+
  return article
  end
-
- def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new())
+
+ def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new)
  result = scrape_words(tag,result: result)
- result.polish!()
-
+ result.polish!
+
  add_words(article,result.words,result.text)
-
+
  return result
  end
-
+
  def scrape_content(doc,article)
  tag = doc.css('div#js-article-body')
  tag = doc.css('div.article-main__body') if tag.length < 1
  tag = doc.css('div.article-body') if tag.length < 1
-
+
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
  tag = doc.css('div#main') if tag.length < 1 && !@strict
-
+
  if tag.length > 0
- text = Util.unspace_web_str(tag.text.to_s())
-
- if !text.empty?()
+ text = Util.unspace_web_str(tag.text.to_s)
+
+ if !text.empty?
  hexdigest = Digest::SHA256.hexdigest(text)
-
- return hexdigest if article.nil?() # For scrape_sha256_only()
-
+
+ return hexdigest if article.nil? # For scrape_sha256_only()
+
  result = scrape_and_add_words(tag,article)
-
- return hexdigest if result.words?()
+
+ return hexdigest if result.words?
  end
  end
-
+
  raise ScrapeError,"could not scrape content at URL[#{@url}]"
  end
-
+
  def scrape_datetime(doc,futsuurl=nil)
  year = scrape_year(doc,futsuurl)
-
+
  # First, try with the id.
  tag_name = 'p#js-article-date'
  tag = doc.css(tag_name)
-
+
  if tag.length > 0
  tag_text = tag[0].text
-
+
  begin
  datetime = parse_datetime(tag_text,year)
-
+
  return datetime
  rescue ArgumentError => e
  # Ignore; try again below.
  Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
  end
  end
-
+
  # Second, try with the class.
  tag_name = 'p.article-main__date'
  tag = doc.css(tag_name)
-
+
  if tag.length > 0
  tag_text = tag[0].text
-
+
  begin
  datetime = parse_datetime(tag_text,year)
-
+
  return datetime
  rescue ArgumentError => e
  # Ignore; try again below.
  Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
  end
-
+
  return datetime
  end
-
+
  # Third, try body's id.
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  # - 'news20170331_k10010922481000'
  tag = doc.css('body')
-
+
  if tag.length > 0
- tag_id = tag[0]['id'].to_s().split('_',2)
-
+ tag_id = tag[0]['id'].to_s.split('_',2)
+
  if tag_id.length > 0
  tag_id = tag_id[0].gsub(/[^[[:digit:]]]+/,'')
-
+
  if tag_id.length == 8
  datetime = Time.strptime(tag_id,'%Y%m%d')
-
+
  return datetime
  end
  end
  end
-
+
  # As a last resort, use our user-defined fallback (if specified).
- return @datetime unless @datetime.nil?()
-
+ return @datetime unless @datetime.nil?
+
  raise ScrapeError,"could not scrape date time at URL[#{@url}]"
  end
-
- def scrape_dict()
+
+ def scrape_dict
  return if @dict != :scrape
-
+
  dict_url = DictScraper.parse_url(@url)
  retries = 0
-
+
  begin
  scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
  rescue OpenURI::HTTPError => e
- if retries == 0 && e.to_s().include?('404')
- read()
-
+ if retries == 0 && e.to_s.include?('404')
+ read
+
  scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
-
- dict_url = scraper.scrape_dict_url_only()
+
+ dict_url = scraper.scrape_dict_url_only
  retries += 1
-
+
  retry
  else
- raise e.exception("could not scrape dictionary at URL[#{dict_url}]: #{e}")
+ raise e.exception("could not scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
  end
  end
-
- @dict = scraper.scrape()
+
+ @dict = scraper.scrape
  end
-
- def scrape_dict_url_only()
- doc = html_doc()
-
+
+ def scrape_dict_url_only
+ doc = html_doc
+
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  # - 'news20170331_k10010922481000'
  tag = doc.css('body')
-
+
  if tag.length > 0
- tag_id = tag[0]['id'].to_s().split('_',2)
-
+ tag_id = tag[0]['id'].to_s.split('_',2)
+
  if tag_id.length == 2
  dict_url = Util.strip_web_str(tag_id[1])
-
- if !dict_url.empty?()
+
+ if !dict_url.empty?
  return DictScraper.parse_url(@url,basename: dict_url)
  end
  end
  end
-
+
  raise ScrapeError,"could not scrape dictionary URL at URL[#{@url}]"
  end
-
- def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new())
+
+ def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new)
  dicwin_result = scrape_words(tag,dicwin: true)
-
- return nil unless dicwin_result.words?()
-
- kana = ''.dup()
- kanji = ''.dup()
-
- dicwin_result.words.each() do |word|
- kana << word.kana unless word.kana.nil?()
-
- if kanji.empty?()
- kanji << word.kanji unless word.kanji.nil?()
+
+ return nil unless dicwin_result.words?
+
+ kana = ''.dup
+ kanji = ''.dup
+
+ dicwin_result.words.each do |word|
+ kana << word.kana unless word.kana.nil?
+
+ if kanji.empty?
+ kanji << word.kanji unless word.kanji.nil?
  else
  kanji << word.word # Add trailing kana (or kanji) to kanji
  end
  end
-
+
  entry = nil
  kana = clean(kana)
  kanji = clean(kanji)
-
- raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty?() && kanji.empty?()
-
- if !@dict.nil?()
+
+ raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty? && kanji.empty?
+
+ if !@dict.nil?
  entry = @dict[id]
-
- raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?()
-
- entry = entry.to_s()
+
+ raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?
+
+ entry = entry.to_s
  end
-
+
  word = Word.new(
  defn: entry,
  kana: kana,
  kanji: kanji
  )
-
+
  result.add_text(dicwin_result.text) # Don't call dicwin_result.polish!()
  result.add_word(word)
-
+
  return word
  end
-
+
  def scrape_futsuurl(doc)
  # First, try with the id.
  tag = doc.css('div#js-regular-news-wrapper')
-
+
  if tag.length > 0
  link = scrape_link(tag[0])
-
- return link unless link.nil?()
+
+ return link unless link.nil?
  end
-
+
  # Second, try with the class.
  tag = doc.css('div.link-to-normal')
-
+
  if tag.length > 0
  link = scrape_link(tag[0])
-
- return link unless link.nil?()
+
+ return link unless link.nil?
  end
-
+
  # Some sites don't have a futsuurl and need a lenient mode.
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  warn_or_error(ScrapeError,"could not scrape futsuurl at URL[#{@url}]")
-
+
  return nil
  end
-
+
  def scrape_link(tag)
  link = tag.css('a')
-
+
  return nil if link.length < 1
-
- link = Util.unspace_web_str(link[0]['href'].to_s())
-
- return nil if link.empty?()
+
+ link = Util.unspace_web_str(link[0]['href'].to_s)
+
+ return nil if link.empty?
  return link
  end
-
- def scrape_ruby_word(tag,result: ScrapeWordsResult.new())
- word = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
-
- return nil if word.nil?()
-
+
+ # @since 0.3.8
+ # @see https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
+ def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
+ words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
+ final_words = []
+
+ return final_words if words.nil?
+
+ words.each do |word|
+ final_words << scrape_ruby_word(word,result: result)
+ end
+
+ return final_words
+ end
+
+ def scrape_ruby_word(word,result: ScrapeWordsResult.new)
  # No cleaning; raw text.
  # Do not add kana to the text.
  result.add_text(word.kanji)
-
+
  kanji = clean(word.kanji)
  kana = clean(word.kana)
-
- if !@missingno.nil?()
+
+ # Even though Word.scrape_ruby_tag() also does this,
+ # check it again after cleaning above.
+ if !@missingno.nil?
  # Check kana first, since this is the typical scenario.
  # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
  # - '窓' in '(8)窓を開けて外の空気を入れましょう'
- if kana.empty?()
+ if kana.empty?
  kana = @missingno.kana_from_kanji(kanji)
- kana = kana.nil?() ? '' : clean(kana)
-
- if !kana.empty?()
+ kana = kana.nil? ? '' : clean(kana)
+
+ if !kana.empty?
  Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
  end
- elsif kanji.empty?()
+ elsif kanji.empty?
  kanji = @missingno.kanji_from_kana(kana)
- kanji = kanji.nil?() ? '' : clean(kanji)
-
- if !kanji.empty?()
+ kanji = kanji.nil? ? '' : clean(kanji)
+
+ if !kanji.empty?
  Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
  end
  end
  end
-
- raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?()
- raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?()
-
+
+ raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?
+ raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?
+
  word = Word.new(
  kana: kana,
  kanji: kanji,
  word: word
  )
-
+
  return word
  end
-
- def scrape_sha256_only()
- doc = html_doc()
-
+
+ def scrape_sha256_only
+ doc = html_doc
+
  sha256 = scrape_content(doc,nil)
-
+
  return sha256
  end
-
- def scrape_text_word(tag,result: ScrapeWordsResult.new())
+
+ def scrape_text_word(tag,result: ScrapeWordsResult.new)
  word = Word.scrape_text_node(tag,url: @url)
-
- if word.nil?()
- result.add_text(tag.text.to_s()) # Raw spaces for output
-
+
+ if word.nil?
+ result.add_text(tag.text.to_s) # Raw spaces for output
+
  return nil
  end
-
- text = word.kana # Should be kana only
-
+
+ # Kanji only for:
+ # - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
+ # - '第3のビール'
+ text = word.word # Should usually be kana only
+
  result.add_text(text) # No cleaning; raw text
-
+
  text = clean(text)
-
- return nil if text.empty?() # No error; empty text is fine here
-
+
+ return nil if text.empty? # No error; empty text is fine here
+
  word = Word.new(
- kana: text,
- word: word
+ kana: clean(word.kana),
+ kanji: clean(word.kanji),
+ word: word,
  )
-
+
  return word
  end
-
+
  def scrape_title(doc,article)
  tag = doc.css('h1.article-main__title')
-
+ tag_name = nil
+
+ if tag.length < 1
+ # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
+ tag_name = 'h1.article-eq__title'
+ tag = doc.css(tag_name)
+ end
+
  if tag.length < 1 && !@strict
  # This shouldn't be used except for select sites.
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
-
  tag_name = 'div#main h2'
-
- Util.warn("using [#{tag_name}] for title at URL[#{@url}]")
-
  tag = doc.css(tag_name)
  end
-
+
  if tag.length > 0
+ Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?
+
  result = scrape_and_add_words(tag,article)
  title = result.text
-
- return title unless title.empty?()
+
+ return title unless title.empty?
  end
-
+
  raise ScrapeError,"could not scrape title at URL[#{@url}]"
  end
-
- def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new())
- children = tag.children.to_a().reverse() # A faster stack?
-
- while !children.empty?()
- child = children.pop()
+
+ def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new)
+ children = tag.children.to_a.reverse # A faster stack?
+
+ while !children.empty?
+ child = children.pop
  name = nil
- word = nil
-
- name = Util.unspace_web_str(child.name.to_s()).downcase() if child.respond_to?(:name)
-
+ words = []
+
+ name = Util.unspace_web_str(child.name.to_s).downcase if child.respond_to?(:name)
+
  if name == 'ruby'
- word = scrape_ruby_word(child,result: result)
- elsif child.text?()
- word = scrape_text_word(child,result: result)
+ # Returns an array.
+ words = scrape_ruby_words(child,result: result)
+ elsif child.text?
+ words << scrape_text_word(child,result: result)
  elsif name == 'rt'
  raise ScrapeError,"invalid rt tag[#{child}] without a ruby tag at URL[#{@url}]"
  else
  dicwin_id = nil
-
+
  if name == 'a'
- id = parse_dicwin_id(child['id'].to_s())
- klass = Util.unspace_web_str(child['class'].to_s()).downcase()
-
- if klass == 'dicwin' && !id.nil?()
+ id = parse_dicwin_id(child['id'].to_s)
+ klass = Util.unspace_web_str(child['class'].to_s).downcase
+
+ if klass == 'dicwin' && !id.nil?
  if dicwin
- raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at URL[#{@url}]"
+ raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at" \
+ " URL[#{@url}]"
  end
-
+
  dicwin_id = id
  end
  end
-
- if dicwin_id.nil?()
- grand_children = child.children.to_a()
-
- (grand_children.length() - 1).downto(0).each() do |i|
- children.push(grand_children[i])
- end
-
+
+ if dicwin_id.nil?
  # I originally didn't use a stack-like Array and did a constant insert,
  # but I think this is slower (moving all elements down every time).
  # However, if it's using C-like code for moving memory, then maybe it
  # is faster?
- #children.insert(i + 1,*child.children.to_a())
+ # Old code:
+ # children.insert(i + 1,*child.children.to_a())
+ grand_children = child.children.to_a
+
+ (grand_children.length - 1).downto(0).each do |i|
+ children.push(grand_children[i])
+ end
  else
- word = scrape_dicwin_word(child,dicwin_id,result: result)
+ words << scrape_dicwin_word(child,dicwin_id,result: result)
  end
  end
-
- result.add_word(word) unless word.nil?()
+
+ words&.each do |word|
+ # All word-scraping methods can return nil.
+ result.add_word(word) unless word.nil?
+ end
  end
-
+
  return result
  end
-
+
  def scrape_year(doc,futsuurl=nil)
  # First, try body's id.
  tag = doc.css('body')
-
+
  if tag.length > 0
- tag_id = tag[0]['id'].to_s().gsub(/[^[[:digit:]]]+/,'')
-
+ tag_id = tag[0]['id'].to_s.gsub(/[^[[:digit:]]]+/,'')
+
  if tag_id.length >= 4
- year = tag_id[0..3].to_i()
-
+ year = tag_id[0..3].to_i
+
  return year if Util.sane_year?(year)
  end
  end
-
+
  # Second, try futsuurl.
- if !futsuurl.nil?()
+ if !futsuurl.nil?
  m = futsuurl.match(/([[:digit:]]{4,})/)
-
- if !m.nil?() && (m = m[0].to_s()).length >= 4
- year = m[0..3].to_i()
-
+
+ if !m.nil? && (m = m[0].to_s).length >= 4
+ year = m[0..3].to_i
+
  return year if Util.sane_year?(year)
  end
  end
-
+
  # As a last resort, use our user-defined fallbacks (if specified).
- return @year.to_i() unless @year.nil?()
- return @datetime.year if !@datetime.nil?() && Util.sane_year?(@datetime.year)
-
+ return @year.to_i unless @year.nil?
+ return @datetime.year if !@datetime.nil? && Util.sane_year?(@datetime.year)
+
  raise ScrapeError,"could not scrape year at URL[#{@url}]"
  end
-
+
  def split(str)
  return @splitter.split(str)
  end
-
+
  def variate(str)
  variations = []
-
- @variators.each() do |variator|
+
+ @variators.each do |variator|
  variations.push(*variator.variate(str))
  end
-
+
  return variations
  end
-
+
  def warn_or_error(klass,msg)
  if @strict
  raise klass,msg
@@ -612,42 +631,42 @@ module NHKore
  end
  end
  end
-
+
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class ScrapeWordsResult
  attr_reader :text
  attr_reader :words
-
- def initialize()
+
+ def initialize
  super()
-
- @text = ''.dup()
+
+ @text = ''.dup
  @words = []
  end
-
+
  def add_text(text)
  @text << Util.reduce_jpn_space(text)
-
+
  return self
  end
-
+
  def add_word(word)
  @words << word
-
+
  return self
  end
-
- def polish!()
+
+ def polish!
  @text = Util.strip_web_str(@text)
-
+
  return self
  end
-
- def words?()
- return !@words.empty?()
+
+ def words?
+ return !@words.empty?
  end
  end
  end
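Beyond the cosmetic changes (dropping () on no-argument calls, normalizing blank lines), the ArticleScraper diff makes a few behavioral changes: scrape_ruby_words() (since 0.3.8) handles ruby tags that contain multiple words, scrape_text_word() now keeps kanji-only text like '第3のビール', and scrape_title() tries h1.article-eq__title as a fallback. A rough usage sketch of the public entry point, assuming network access; the URL is the example one cited in the diff:

    require 'nhkore/article_scraper'

    url = 'https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html'

    # strict: false enables the lenient fallbacks seen above
    # (div#main for content, div#main h2 for the title, missing futsuurl allowed).
    scraper = NHKore::ArticleScraper.new(url, dict: :scrape, strict: false)

    article = scraper.scrape # returns an NHKore::Article
    puts article.to_s(mini: true)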