nhkore 0.3.7 → 0.3.11

@@ -1,23 +1,11 @@
- #!/usr/bin/env ruby
  # encoding: UTF-8
  # frozen_string_literal: true

  #--
  # This file is part of NHKore.
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
- #
- # NHKore is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # NHKore is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
+ #
+ # SPDX-License-Identifier: LGPL-3.0-or-later
  #++


@@ -39,12 +27,12 @@ require 'nhkore/word'

  module NHKore
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class ArticleScraper < Scraper
  extend AttrBool::Ext
-
+
  attr_reader :cleaners
  attr_accessor :datetime
  attr_accessor :dict
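
A quick aside on `extend AttrBool::Ext` above: it is what provides the `attr_accessor?` macro used for `:strict` in the next hunk, defining a query-style reader (`strict?`) alongside the normal writer. A minimal hand-rolled sketch of that idea (the `AttrBoolSketch`/`ScraperSketch` names are made up for illustration; this is not the attr_bool gem's actual implementation):

    module AttrBoolSketch
      # Defines name? (query reader) and name= (writer) for each given name.
      def attr_accessor?(*names)
        names.each do |name|
          define_method(:"#{name}?") { instance_variable_get(:"@#{name}") }
          define_method(:"#{name}=") { |value| instance_variable_set(:"@#{name}",value) }
        end
      end
    end

    class ScraperSketch
      extend AttrBoolSketch

      attr_accessor? :strict

      def initialize(strict: true)
        @strict = strict
      end
    end

    ScraperSketch.new.strict?                 # => true
    ScraperSketch.new(strict: false).strict?  # => false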
@@ -55,18 +43,20 @@ module NHKore
  attr_accessor? :strict
  attr_reader :variators
  attr_accessor :year
-
+
  # @param dict [Dict,:scrape,nil] the {Dict} (dictionary) to use for {Word#defn} (definitions)
  # [+:scrape+] auto-scrape it using {DictScraper}
  # [+nil+] don't scrape/use it
  # @param missingno [Missingno] data to use as a fallback for Ruby words without kana/kanji,
  # instead of raising an error
  # @param strict [true,false]
- def initialize(url,cleaners: [BestCleaner.new()],datetime: nil,dict: :scrape,missingno: nil,polishers: [BestPolisher.new()],splitter: BestSplitter.new(),strict: true,variators: [BestVariator.new()],year: nil,**kargs)
+ def initialize(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil,
+ polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,
+ variators: [BestVariator.new],year: nil,**kargs)
  super(url,**kargs)
-
+
  @cleaners = Array(cleaners)
- @datetime = datetime.nil?() ? nil : Util.jst_time(datetime)
+ @datetime = datetime.nil? ? nil : Util.jst_time(datetime)
  @dict = dict
  @kargs = kargs
  @missingno = missingno
@@ -76,20 +66,20 @@ module NHKore
  @variators = Array(variators)
  @year = year
  end
-
+
  def add_words(article,words,text)
- words.each() do |word|
+ words.each do |word|
  # Words should have already been cleaned.
  # If we don't check this, Word.new() could raise an error in polish().
- next if polish(word.word).empty?()
-
+ next if polish(word.word).empty?
+
  article.add_word(polish(word))
-
- variate(word.word).each() do |v|
+
+ variate(word.word).each do |v|
  v = polish(clean(v))
-
- next if v.empty?()
-
+
+ next if v.empty?
+
  # Do not pass in "word: word". We only want defn & eng.
  # If we pass in kanji/kana & unknown, it will raise an error.
  article.add_word(Word.new(
@@ -99,522 +89,540 @@ module NHKore
  ))
  end
  end
-
- split(text).each() do |t|
+
+ split(text).each do |t|
  t = polish(clean(t))
-
- next if t.empty?()
-
+
+ next if t.empty?
+
  article.add_word(Word.new(unknown: t))
-
- variate(t).each() do |v|
+
+ variate(t).each do |v|
  v = polish(clean(v))
-
- next if v.empty?()
-
+
+ next if v.empty?
+
  article.add_word(Word.new(unknown: v))
  end
  end
  end
-
+
  def clean(obj)
  return Cleaner.clean_any(obj,@cleaners)
  end
-
- def fix_bad_html()
+
+ def fix_bad_html
  # Fixes:
  # - '<「<' without escaping '<' as '&lt;'
  # - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
  # - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'
-
- read()
-
- # To add a new one, simply add '|(...)' on a newline and test $#.
+
+ read
+
+ # To add a new one, simply add '|(...)' on a newline and test Regexp.last_match().
  @str_or_io = @str_or_io.gsub(/
- (\<「\<)
+ (?<cane><「<)
  /x) do |match|
- if !$1.nil?()
+ if !Regexp.last_match(:cane).nil?
  match = match.sub('<','&lt;')
  end
-
+
  match
  end
  end
-
+
  def parse_datetime(str,year)
  str = str.gsub(/[\[\][[:space:]]]+/,'') # Remove: [ ] \s
  str = "#{year}年 #{str} #{Util::JST_OFFSET}"
-
+
  return Time.strptime(str,'%Y年 %m月%d日%H時%M分 %:z')
  end
-
+
  def parse_dicwin_id(str)
  str = str.gsub(/\D+/,'')
-
- return nil if str.empty?()
+
+ return nil if str.empty?
  return str
  end
-
+
  def polish(obj)
  return Polisher.polish_any(obj,@polishers)
  end
-
- def scrape()
- scrape_dict()
- fix_bad_html()
-
- article = Article.new()
- doc = html_doc()
-
+
+ def scrape
+ scrape_dict
+ fix_bad_html
+
+ article = Article.new
+ doc = html_doc
+
  article.futsuurl = scrape_futsuurl(doc)
-
+
  article.datetime = scrape_datetime(doc,article.futsuurl)
  article.sha256 = scrape_content(doc,article)
  article.title = scrape_title(doc,article)
  article.url = @url
-
+
  return article
  end
-
- def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new())
+
+ def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new)
  result = scrape_words(tag,result: result)
- result.polish!()
-
+ result.polish!
+
  add_words(article,result.words,result.text)
-
+
  return result
  end
-
+
  def scrape_content(doc,article)
  tag = doc.css('div#js-article-body')
  tag = doc.css('div.article-main__body') if tag.length < 1
  tag = doc.css('div.article-body') if tag.length < 1
-
+
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
  tag = doc.css('div#main') if tag.length < 1 && !@strict
-
+
  if tag.length > 0
- text = Util.unspace_web_str(tag.text.to_s())
-
- if !text.empty?()
+ text = Util.unspace_web_str(tag.text.to_s)
+
+ if !text.empty?
  hexdigest = Digest::SHA256.hexdigest(text)
-
- return hexdigest if article.nil?() # For scrape_sha256_only()
-
+
+ return hexdigest if article.nil? # For scrape_sha256_only()
+
  result = scrape_and_add_words(tag,article)
-
- return hexdigest if result.words?()
+
+ return hexdigest if result.words?
  end
  end
-
+
  raise ScrapeError,"could not scrape content at URL[#{@url}]"
  end
-
+
  def scrape_datetime(doc,futsuurl=nil)
  year = scrape_year(doc,futsuurl)
-
+
  # First, try with the id.
  tag_name = 'p#js-article-date'
  tag = doc.css(tag_name)
-
+
  if tag.length > 0
  tag_text = tag[0].text
-
+
  begin
  datetime = parse_datetime(tag_text,year)
-
+
  return datetime
  rescue ArgumentError => e
  # Ignore; try again below.
  Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
  end
  end
-
+
  # Second, try with the class.
  tag_name = 'p.article-main__date'
  tag = doc.css(tag_name)
-
+
  if tag.length > 0
  tag_text = tag[0].text
-
+
  begin
  datetime = parse_datetime(tag_text,year)
-
+
  return datetime
  rescue ArgumentError => e
  # Ignore; try again below.
  Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
  end
-
+
  return datetime
  end
-
+
  # Third, try body's id.
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  # - 'news20170331_k10010922481000'
  tag = doc.css('body')
-
+
  if tag.length > 0
- tag_id = tag[0]['id'].to_s().split('_',2)
-
+ tag_id = tag[0]['id'].to_s.split('_',2)
+
  if tag_id.length > 0
  tag_id = tag_id[0].gsub(/[^[[:digit:]]]+/,'')
-
+
  if tag_id.length == 8
  datetime = Time.strptime(tag_id,'%Y%m%d')
-
+
  return datetime
  end
  end
  end
-
+
  # As a last resort, use our user-defined fallback (if specified).
- return @datetime unless @datetime.nil?()
-
+ return @datetime unless @datetime.nil?
+
  raise ScrapeError,"could not scrape date time at URL[#{@url}]"
  end
-
- def scrape_dict()
+
+ def scrape_dict
  return if @dict != :scrape
-
+
  dict_url = DictScraper.parse_url(@url)
  retries = 0
-
+
  begin
  scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
  rescue OpenURI::HTTPError => e
- if retries == 0 && e.to_s().include?('404')
- read()
-
+ if retries == 0 && e.to_s.include?('404')
+ read
+
  scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
-
- dict_url = scraper.scrape_dict_url_only()
+
+ dict_url = scraper.scrape_dict_url_only
  retries += 1
-
+
  retry
  else
  raise e.exception("could not scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
  end
  end
-
- @dict = scraper.scrape()
+
+ @dict = scraper.scrape
  end
-
- def scrape_dict_url_only()
- doc = html_doc()
-
+
+ def scrape_dict_url_only
+ doc = html_doc
+
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  # - 'news20170331_k10010922481000'
  tag = doc.css('body')
-
+
  if tag.length > 0
- tag_id = tag[0]['id'].to_s().split('_',2)
-
+ tag_id = tag[0]['id'].to_s.split('_',2)
+
  if tag_id.length == 2
  dict_url = Util.strip_web_str(tag_id[1])
-
- if !dict_url.empty?()
+
+ if !dict_url.empty?
  return DictScraper.parse_url(@url,basename: dict_url)
  end
  end
  end
-
+
  raise ScrapeError,"could not scrape dictionary URL at URL[#{@url}]"
  end
-
- def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new())
+
+ def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new)
  dicwin_result = scrape_words(tag,dicwin: true)
-
- return nil unless dicwin_result.words?()
-
- kana = ''.dup()
- kanji = ''.dup()
-
- dicwin_result.words.each() do |word|
- kana << word.kana unless word.kana.nil?()
-
- if kanji.empty?()
- kanji << word.kanji unless word.kanji.nil?()
+
+ return nil unless dicwin_result.words?
+
+ kana = ''.dup
+ kanji = ''.dup
+
+ dicwin_result.words.each do |word|
+ kana << word.kana unless word.kana.nil?
+
+ if kanji.empty?
+ kanji << word.kanji unless word.kanji.nil?
  else
  kanji << word.word # Add trailing kana (or kanji) to kanji
  end
  end
-
+
  entry = nil
  kana = clean(kana)
  kanji = clean(kanji)
-
- raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty?() && kanji.empty?()
-
- if !@dict.nil?()
+
+ raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty? && kanji.empty?
+
+ if !@dict.nil?
  entry = @dict[id]
-
- raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?()
-
- entry = entry.to_s()
+
+ raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?
+
+ entry = entry.to_s
  end
-
+
  word = Word.new(
  defn: entry,
  kana: kana,
  kanji: kanji
  )
-
+
  result.add_text(dicwin_result.text) # Don't call dicwin_result.polish!()
  result.add_word(word)
-
+
  return word
  end
-
+
  def scrape_futsuurl(doc)
  # First, try with the id.
  tag = doc.css('div#js-regular-news-wrapper')
-
+
  if tag.length > 0
  link = scrape_link(tag[0])
-
- return link unless link.nil?()
+
+ return link unless link.nil?
  end
-
+
  # Second, try with the class.
  tag = doc.css('div.link-to-normal')
-
+
  if tag.length > 0
  link = scrape_link(tag[0])
-
- return link unless link.nil?()
+
+ return link unless link.nil?
  end
-
+
  # Some sites don't have a futsuurl and need a lenient mode.
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  warn_or_error(ScrapeError,"could not scrape futsuurl at URL[#{@url}]")
-
+
  return nil
  end
-
+
  def scrape_link(tag)
  link = tag.css('a')
-
+
  return nil if link.length < 1
-
- link = Util.unspace_web_str(link[0]['href'].to_s())
-
- return nil if link.empty?()
+
+ link = Util.unspace_web_str(link[0]['href'].to_s)
+
+ return nil if link.empty?
  return link
  end
-
- def scrape_ruby_word(tag,result: ScrapeWordsResult.new())
- word = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
-
- return nil if word.nil?()
-
+
+ # @since 0.3.8
+ # @see https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
+ def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
+ words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
+ final_words = []
+
+ return final_words if words.nil?
+
+ words.each do |word|
+ final_words << scrape_ruby_word(word,result: result)
+ end
+
+ return final_words
+ end
+
+ def scrape_ruby_word(word,result: ScrapeWordsResult.new)
  # No cleaning; raw text.
  # Do not add kana to the text.
  result.add_text(word.kanji)
-
+
  kanji = clean(word.kanji)
  kana = clean(word.kana)
-
- if !@missingno.nil?()
+
+ # Even though Word.scrape_ruby_tag() also does this,
+ # check it again after cleaning above.
+ if !@missingno.nil?
  # Check kana first, since this is the typical scenario.
  # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
  # - '窓' in '(8)窓を開けて外の空気を入れましょう'
- if kana.empty?()
+ if kana.empty?
  kana = @missingno.kana_from_kanji(kanji)
- kana = kana.nil?() ? '' : clean(kana)
-
- if !kana.empty?()
+ kana = kana.nil? ? '' : clean(kana)
+
+ if !kana.empty?
  Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
  end
- elsif kanji.empty?()
+ elsif kanji.empty?
  kanji = @missingno.kanji_from_kana(kana)
- kanji = kanji.nil?() ? '' : clean(kanji)
-
- if !kanji.empty?()
+ kanji = kanji.nil? ? '' : clean(kanji)
+
+ if !kanji.empty?
  Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
  end
  end
  end
-
- raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?()
- raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?()
-
+
+ raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?
+ raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?
+
  word = Word.new(
  kana: kana,
  kanji: kanji,
  word: word
  )
-
+
  return word
  end
-
- def scrape_sha256_only()
- doc = html_doc()
-
+
+ def scrape_sha256_only
+ doc = html_doc
+
  sha256 = scrape_content(doc,nil)
-
+
  return sha256
  end
-
- def scrape_text_word(tag,result: ScrapeWordsResult.new())
+
+ def scrape_text_word(tag,result: ScrapeWordsResult.new)
  word = Word.scrape_text_node(tag,url: @url)
-
- if word.nil?()
- result.add_text(tag.text.to_s()) # Raw spaces for output
-
+
+ if word.nil?
+ result.add_text(tag.text.to_s) # Raw spaces for output
+
  return nil
  end
-
+
  # Kanji only for:
  # - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
  # - '第3のビール'
  text = word.word # Should usually be kana only
-
+
  result.add_text(text) # No cleaning; raw text
-
+
  text = clean(text)
-
- return nil if text.empty?() # No error; empty text is fine here
-
+
+ return nil if text.empty? # No error; empty text is fine here
+
  word = Word.new(
  kana: clean(word.kana),
  kanji: clean(word.kanji),
  word: word,
  )
-
+
  return word
  end
-
+
  def scrape_title(doc,article)
  tag = doc.css('h1.article-main__title')
  tag_name = nil
-
+
  if tag.length < 1
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
  tag_name = 'h1.article-eq__title'
  tag = doc.css(tag_name)
  end
-
+
  if tag.length < 1 && !@strict
  # This shouldn't be used except for select sites.
  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
  tag_name = 'div#main h2'
  tag = doc.css(tag_name)
  end
-
+
  if tag.length > 0
- Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?()
-
+ Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?
+
  result = scrape_and_add_words(tag,article)
  title = result.text
-
- return title unless title.empty?()
+
+ return title unless title.empty?
  end
-
+
  raise ScrapeError,"could not scrape title at URL[#{@url}]"
  end
-
- def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new())
- children = tag.children.to_a().reverse() # A faster stack?
-
- while !children.empty?()
- child = children.pop()
+
+ def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new)
+ children = tag.children.to_a.reverse # A faster stack?
+
+ while !children.empty?
+ child = children.pop
  name = nil
- word = nil
-
- name = Util.unspace_web_str(child.name.to_s()).downcase() if child.respond_to?(:name)
-
+ words = []
+
+ name = Util.unspace_web_str(child.name.to_s).downcase if child.respond_to?(:name)
+
  if name == 'ruby'
- word = scrape_ruby_word(child,result: result)
- elsif child.text?()
- word = scrape_text_word(child,result: result)
+ # Returns an array.
+ words = scrape_ruby_words(child,result: result)
+ elsif child.text?
+ words << scrape_text_word(child,result: result)
  elsif name == 'rt'
  raise ScrapeError,"invalid rt tag[#{child}] without a ruby tag at URL[#{@url}]"
  else
  dicwin_id = nil
-
+
  if name == 'a'
- id = parse_dicwin_id(child['id'].to_s())
- klass = Util.unspace_web_str(child['class'].to_s()).downcase()
-
- if klass == 'dicwin' && !id.nil?()
+ id = parse_dicwin_id(child['id'].to_s)
+ klass = Util.unspace_web_str(child['class'].to_s).downcase
+
+ if klass == 'dicwin' && !id.nil?
  if dicwin
- raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at URL[#{@url}]"
+ raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at" \
+ " URL[#{@url}]"
  end
-
+
  dicwin_id = id
  end
  end
-
- if dicwin_id.nil?()
- grand_children = child.children.to_a()
-
- (grand_children.length() - 1).downto(0).each() do |i|
- children.push(grand_children[i])
- end
-
+
+ if dicwin_id.nil?
  # I originally didn't use a stack-like Array and did a constant insert,
  # but I think this is slower (moving all elements down every time).
  # However, if it's using C-like code for moving memory, then maybe it
  # is faster?
- #children.insert(i + 1,*child.children.to_a())
+ # Old code:
+ # children.insert(i + 1,*child.children.to_a())
+ grand_children = child.children.to_a
+
+ (grand_children.length - 1).downto(0).each do |i|
+ children.push(grand_children[i])
+ end
  else
- word = scrape_dicwin_word(child,dicwin_id,result: result)
+ words << scrape_dicwin_word(child,dicwin_id,result: result)
  end
  end
-
- result.add_word(word) unless word.nil?()
+
+ words&.each do |word|
+ # All word-scraping methods can return nil.
+ result.add_word(word) unless word.nil?
+ end
  end
-
+
  return result
  end
-
+
  def scrape_year(doc,futsuurl=nil)
  # First, try body's id.
  tag = doc.css('body')
-
+
  if tag.length > 0
- tag_id = tag[0]['id'].to_s().gsub(/[^[[:digit:]]]+/,'')
-
+ tag_id = tag[0]['id'].to_s.gsub(/[^[[:digit:]]]+/,'')
+
  if tag_id.length >= 4
- year = tag_id[0..3].to_i()
-
+ year = tag_id[0..3].to_i
+
  return year if Util.sane_year?(year)
  end
  end
-
+
  # Second, try futsuurl.
- if !futsuurl.nil?()
+ if !futsuurl.nil?
  m = futsuurl.match(/([[:digit:]]{4,})/)
-
- if !m.nil?() && (m = m[0].to_s()).length >= 4
- year = m[0..3].to_i()
-
+
+ if !m.nil? && (m = m[0].to_s).length >= 4
+ year = m[0..3].to_i
+
  return year if Util.sane_year?(year)
  end
  end
-
+
  # As a last resort, use our user-defined fallbacks (if specified).
- return @year.to_i() unless @year.nil?()
- return @datetime.year if !@datetime.nil?() && Util.sane_year?(@datetime.year)
-
+ return @year.to_i unless @year.nil?
+ return @datetime.year if !@datetime.nil? && Util.sane_year?(@datetime.year)
+
  raise ScrapeError,"could not scrape year at URL[#{@url}]"
  end
-
+
  def split(str)
  return @splitter.split(str)
  end
-
+
  def variate(str)
  variations = []
-
- @variators.each() do |variator|
+
+ @variators.each do |variator|
  variations.push(*variator.variate(str))
  end
-
+
  return variations
  end
-
+
  def warn_or_error(klass,msg)
  if @strict
  raise klass,msg
@@ -623,42 +631,42 @@ module NHKore
  end
  end
  end
-
+
  ###
- # @author Jonathan Bradley Whited (@esotericpig)
+ # @author Jonathan Bradley Whited
  # @since 0.2.0
  ###
  class ScrapeWordsResult
  attr_reader :text
  attr_reader :words
-
- def initialize()
+
+ def initialize
  super()
-
- @text = ''.dup()
+
+ @text = ''.dup
  @words = []
  end
-
+
  def add_text(text)
  @text << Util.reduce_jpn_space(text)
-
+
  return self
  end
-
+
  def add_word(word)
  @words << word
-
+
  return self
  end
-
- def polish!()
+
+ def polish!
  @text = Util.strip_web_str(@text)
-
+
  return self
  end
-
- def words?()
- return !@words.empty?()
+
+ def words?
+ return !@words.empty?
  end
  end
  end
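
One note on the fix_bad_html() change above, since it is the least mechanical edit in this diff: inside a block-form gsub, Ruby sets the match globals on every iteration, so Regexp.last_match(:cane) reads the named group for the current match; unlike positional $1, named groups keep working when more alternatives like |(?<other>...) are appended, which is exactly what the updated comment suggests. A standalone sketch of the technique (the sample string comes from the comment in the method; the variable names are illustrative):

    html = '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'

    fixed = html.gsub(/
      (?<cane><「<)   # unescaped '<' before 「; add new cases as |(?<name>...)
    /x) do |match|
      # last_match(:cane) is non-nil only when this alternative matched.
      match = match.sub('<','&lt;') unless Regexp.last_match(:cane).nil?
      match
    end

    fixed # => '</p><br>&lt;「<ruby>台風<rt>たいふう</rt></ruby>'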