nhkore 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,653 @@
+ #!/usr/bin/env ruby
+ # encoding: UTF-8
+ # frozen_string_literal: true
+
+ #--
+ # This file is part of NHKore.
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
+ #
+ # NHKore is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NHKore is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU Lesser General Public License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ #++
+
+
+ require 'digest'
+
+ require 'nhkore/article'
+ require 'nhkore/cleaner'
+ require 'nhkore/dict'
+ require 'nhkore/dict_scraper'
+ require 'nhkore/error'
+ require 'nhkore/polisher'
+ require 'nhkore/scraper'
+ require 'nhkore/splitter'
+ require 'nhkore/util'
+ require 'nhkore/variator'
+ require 'nhkore/word'
+
+
+ module NHKore
+   ###
+   # @author Jonathan Bradley Whited (@esotericpig)
+   # @since 0.2.0
+   ###
+   class ArticleScraper < Scraper
+     attr_reader :cleaners
+     attr_accessor :datetime
+     attr_accessor :dict
+     attr_reader :kargs
+     attr_accessor :missingno
+     attr_accessor :mode
+     attr_reader :polishers
+     attr_accessor :splitter
+     attr_reader :variators
+     attr_accessor :year
+
+     # @param dict [Dict,:scrape,nil] the {Dict} (dictionary) to use for {Word#defn} (definitions)
+     #             [+:scrape+] auto-scrape it using {DictScraper}
+     #             [+nil+] don't scrape/use it
+     # @param missingno [Missingno] data to use as a fallback for ruby words without kana/kanji,
+     #                  instead of raising an error
+     # @param mode [nil,:lenient]
+     def initialize(url,cleaners: [BestCleaner.new()],datetime: nil,dict: :scrape,missingno: nil,mode: nil,polishers: [BestPolisher.new()],splitter: BestSplitter.new(),variators: [BestVariator.new()],year: nil,**kargs)
+       super(url,**kargs)
+
+       @cleaners = Array(cleaners)
+       @datetime = datetime.nil?() ? nil : Util.jst_time(datetime)
+       @dict = dict
+       @kargs = kargs
+       @missingno = missingno
+       @mode = mode
+       @polishers = Array(polishers)
+       @splitter = splitter
+       @variators = Array(variators)
+       @year = year
+     end
+
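+     # Adds words (already scraped) and the unknown words split from text
+     # to article, along with their variations.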
+     def add_words(article,words,text)
+       words.each() do |word|
+         # Words should have already been cleaned.
+         # If we don't check this, Word.new() could raise an error in polish().
+         next if polish(word.word).empty?()
+
+         article.add_word(polish(word))
+
+         variate(word.word).each() do |v|
+           v = polish(clean(v))
+
+           next if v.empty?()
+
+           # Do not pass in "word: word". We only want defn & eng.
+           # If we pass in kanji/kana & unknown, it will raise an error.
+           article.add_word(Word.new(
+             defn: word.defn,
+             eng: word.eng,
+             unknown: v
+           ))
+         end
+       end
+
+       split(text).each() do |t|
+         t = polish(clean(t))
+
+         next if t.empty?()
+
+         article.add_word(Word.new(unknown: t))
+
+         variate(t).each() do |v|
+           v = polish(clean(v))
+
+           next if v.empty?()
+
+           article.add_word(Word.new(unknown: v))
+         end
+       end
+     end
+
+     def clean(obj)
+       return Cleaner.clean_any(obj,@cleaners)
+     end
+
+     def fix_bad_html()
+       # Fixes:
+       # - '<「<' without escaping '<' as '&lt;'
+       #   - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
+       #   - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'
+
+       @str_or_io = @str_or_io.read() if @str_or_io.respond_to?(:read)
+
+       # To add a new one, simply add '|(...)' on a newline and test $#.
+       @str_or_io = @str_or_io.gsub(/
+         (\<「\<)
+       /x) do |match|
+         if !$1.nil?()
+           match = match.sub('<','&lt;')
+         end
+
+         match
+       end
+     end
+
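+     # Parses a Japanese date-time string (e.g., '3月31日 11時30分', possibly
+     # bracketed) into a JST Time, with year supplying the missing year.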
+     def parse_datetime(str,year)
+       str = str.gsub(/[\[\][[:space:]]]+/,'') # Remove: [ ] \s
+       str = "#{year}年 #{str} #{Util::JST_OFFSET}"
+
+       return Time.strptime(str,'%Y年 %m月%d日%H時%M分 %:z')
+     end
+
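+     # Strips the non-digits from str (a dicWin link's id attribute) and
+     # returns the digits, or nil if there are none.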
+     def parse_dicwin_id(str)
+       str = str.gsub(/\D+/,'')
+
+       return nil if str.empty?()
+       return str
+     end
+
+     def polish(obj)
+       return Polisher.polish_any(obj,@polishers)
+     end
+
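+     # Scrapes the article at the URL passed to initialize() and returns a
+     # fully-populated Article.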
+     def scrape()
+       scrape_dict()
+       fix_bad_html()
+
+       article = Article.new()
+       doc = html_doc()
+
+       article.futsuurl = scrape_futsuurl(doc)
+
+       article.datetime = scrape_datetime(doc,article.futsuurl)
+       article.sha256 = scrape_content(doc,article)
+       article.title = scrape_title(doc,article)
+       article.url = @url
+
+       return article
+     end
+
+     def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new())
+       result = scrape_words(tag,result: result)
+       result.polish!()
+
+       add_words(article,result.words,result.text)
+
+       return result
+     end
+
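+     # Scrapes the article body, adds its words to article (unless article is
+     # nil), and returns the SHA-256 hex digest of the body text.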
+     def scrape_content(doc,article)
+       tag = doc.css('div#js-article-body')
+       tag = doc.css('div.article-main__body') if tag.length < 1
+       tag = doc.css('div.article-body') if tag.length < 1
+
+       # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
+       tag = doc.css('div#main') if tag.length < 1 && @mode == :lenient
+
+       if tag.length > 0
+         text = Util.unspace_web_str(tag.text.to_s())
+
+         if !text.empty?()
+           hexdigest = Digest::SHA256.hexdigest(text)
+
+           return hexdigest if article.nil?() # For scrape_sha256_only()
+
+           result = scrape_and_add_words(tag,article)
+
+           return hexdigest if result.words?()
+         end
+       end
+
+       raise ScrapeError,"could not scrape content at URL[#{@url}]"
+     end
+
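+     # Scrapes the article's date-time, trying the date tag's id, then its
+     # class, then the digits in body's id, and finally the @datetime fallback.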
+     def scrape_datetime(doc,futsuurl=nil)
+       year = scrape_year(doc,futsuurl)
+
+       # First, try with the id.
+       tag_name = 'p#js-article-date'
+       tag = doc.css(tag_name)
+
+       if tag.length > 0
+         tag_text = tag[0].text
+
+         begin
+           datetime = parse_datetime(tag_text,year)
+
+           return datetime
+         rescue ArgumentError => e
+           # Ignore; try again below.
+           Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
+         end
+       end
+
+       # Second, try with the class.
+       tag_name = 'p.article-main__date'
+       tag = doc.css(tag_name)
+
+       if tag.length > 0
+         tag_text = tag[0].text
+
+         begin
+           datetime = parse_datetime(tag_text,year)
+
+           return datetime
+         rescue ArgumentError => e
+           # Ignore; try again below.
+           Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
+         end
+       end
+
+       # Third, try body's id.
+       # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
+       #   - 'news20170331_k10010922481000'
+       tag = doc.css('body')
+
+       if tag.length > 0
+         tag_id = tag[0]['id'].to_s().split('_',2)
+
+         if tag_id.length > 0
+           tag_id = tag_id[0].gsub(/[^[[:digit:]]]+/,'')
+
+           if tag_id.length == 8
+             datetime = Time.strptime(tag_id,'%Y%m%d')
+
+             return datetime
+           end
+         end
+       end
+
+       # As a last resort, use our user-defined fallback (if specified).
+       return @datetime unless @datetime.nil?()
+
+       raise ScrapeError,"could not scrape date time at URL[#{@url}]"
+     end
+
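+     # Scrapes the dicWin dictionary (when @dict is :scrape). On a 404, the
+     # real dictionary URL is scraped from the article itself and tried once more.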
+     def scrape_dict()
+       return if @dict != :scrape
+
+       dict_url = DictScraper.parse_url(@url)
+       retries = 0
+
+       begin
+         scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
+       rescue OpenURI::HTTPError => e
+         if retries == 0 && e.to_s().include?('404')
+           @str_or_io = @str_or_io.read() if @str_or_io.respond_to?(:read)
+
+           scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
+
+           dict_url = scraper.scrape_dict_url_only()
+           retries += 1
+
+           retry
+         else
+           raise e.exception("could not scrape dictionary at URL[#{dict_url}]: #{e}")
+         end
+       end
+
+       @dict = scraper.scrape()
+     end
+
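+     # Scrapes the dictionary URL's basename out of body's id (the part after
+     # the underscore) and builds the full dictionary URL from it.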
+     def scrape_dict_url_only()
+       doc = html_doc()
+
+       # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
+       #   - 'news20170331_k10010922481000'
+       tag = doc.css('body')
+
+       if tag.length > 0
+         tag_id = tag[0]['id'].to_s().split('_',2)
+
+         if tag_id.length == 2
+           dict_url = Util.strip_web_str(tag_id[1])
+
+           if !dict_url.empty?()
+             return DictScraper.parse_url(@url,basename: dict_url)
+           end
+         end
+       end
+
+       raise ScrapeError,"could not scrape dictionary URL at URL[#{@url}]"
+     end
+
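+     # Scrapes a dicWin word (a linked dictionary entry) from tag, looks up its
+     # definition in @dict by id, and adds the resulting Word to result.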
+     def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new())
+       dicwin_result = scrape_words(tag,dicwin: true)
+
+       return nil unless dicwin_result.words?()
+
+       kana = ''.dup()
+       kanji = ''.dup()
+
+       dicwin_result.words.each() do |word|
+         kana << word.kana unless word.kana.nil?()
+
+         if kanji.empty?()
+           kanji << word.kanji unless word.kanji.nil?()
+         else
+           kanji << word.word # Add trailing kana (or kanji) to kanji
+         end
+       end
+
+       entry = nil
+       kana = clean(kana)
+       kanji = clean(kanji)
+
+       raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty?() && kanji.empty?()
+
+       if !@dict.nil?()
+         entry = @dict[id]
+
+         raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?()
+
+         entry = entry.to_s()
+       end
+
+       word = Word.new(
+         defn: entry,
+         kana: kana,
+         kanji: kanji
+       )
+
+       result.add_text(dicwin_result.text) # Don't call dicwin_result.polish!()
+       result.add_word(word)
+
+       return word
+     end
+
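+     # Scrapes the link to the futsuurl (the regular, non-Easy version of the
+     # article), or returns nil in :lenient mode if there isn't one.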
+     def scrape_futsuurl(doc)
+       # First, try with the id.
+       tag = doc.css('div#js-regular-news-wrapper')
+
+       if tag.length > 0
+         link = scrape_link(tag[0])
+
+         return link unless link.nil?()
+       end
+
+       # Second, try with the class.
+       tag = doc.css('div.link-to-normal')
+
+       if tag.length > 0
+         link = scrape_link(tag[0])
+
+         return link unless link.nil?()
+       end
+
+       # Some sites don't have a futsuurl and need a lenient mode.
+       # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
+       warn_or_error(ScrapeError,"could not scrape futsuurl at URL[#{@url}]")
+
+       return nil
+     end
+
+     def scrape_link(tag)
+       link = tag.css('a')
+
+       return nil if link.length < 1
+
+       link = Util.unspace_web_str(link[0]['href'].to_s())
+
+       return nil if link.empty?()
+       return link
+     end
+
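+     # Scrapes a ruby tag into a Word with kanji (the base text) and kana (the
+     # furigana), using @missingno (if set) to fill in whichever is missing.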
+     def scrape_ruby_word(tag,result: ScrapeWordsResult.new())
+       word = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
+
+       return nil if word.nil?()
+
+       # No cleaning; raw text.
+       # Do not add kana to the text.
+       result.add_text(word.kanji)
+
+       kanji = clean(word.kanji)
+       kana = clean(word.kana)
+
+       if !@missingno.nil?()
+         # Check kana first, since this is the typical scenario.
+         # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
+         #   - '窓' in '(8)窓を開けて外の空気を入れましょう'
+         if kana.empty?()
+           kana = @missingno.kana_from_kanji(kanji)
+           kana = kana.nil?() ? '' : clean(kana)
+
+           if !kana.empty?()
+             Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
+           end
+         elsif kanji.empty?()
+           kanji = @missingno.kanji_from_kana(kana)
+           kanji = kanji.nil?() ? '' : clean(kanji)
+
+           if !kanji.empty?()
+             Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
+           end
+         end
+       end
+
+       raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?()
+       raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?()
+
+       word = Word.new(
+         kana: kana,
+         kanji: kanji,
+         word: word
+       )
+
+       return word
+     end
+
+     def scrape_sha256_only()
+       doc = html_doc()
+
+       sha256 = scrape_content(doc,nil)
+
+       return sha256
+     end
+
+     def scrape_text_word(tag,result: ScrapeWordsResult.new())
+       word = Word.scrape_text_node(tag,url: @url)
+
+       if word.nil?()
+         result.add_text(tag.text.to_s()) # Raw spaces for output
+
+         return nil
+       end
+
+       text = word.kana # Should be kana only
+
+       result.add_text(text) # No cleaning; raw text
+
+       text = clean(text)
+
+       return nil if text.empty?() # No error; empty text is fine here
+
+       word = Word.new(
+         kana: text,
+         word: word
+       )
+
+       return word
+     end
+
+     def scrape_title(doc,article)
+       tag = doc.css('h1.article-main__title')
+
+       if tag.length < 1 && @mode == :lenient
+         # This shouldn't be used except for select sites.
+         # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
+
+         tag_name = 'div#main h2'
+
+         Util.warn("using [#{tag_name}] for title at URL[#{@url}]")
+
+         tag = doc.css(tag_name)
+       end
+
+       if tag.length > 0
+         result = scrape_and_add_words(tag,article)
+         title = result.text
+
+         return title unless title.empty?()
+       end
+
+       raise ScrapeError,"could not scrape title at URL[#{@url}]"
+     end
+
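+     # Walks tag's children (depth-first, treating the Array as a stack) and
+     # scrapes each ruby tag, text node, and dicWin link into result.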
+     def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new())
+       children = tag.children.to_a().reverse() # A faster stack?
+
+       while !children.empty?()
+         child = children.pop()
+         name = nil
+         word = nil
+
+         name = Util.unspace_web_str(child.name.to_s()).downcase() if child.respond_to?(:name)
+
+         if name == 'ruby'
+           word = scrape_ruby_word(child,result: result)
+         elsif child.text?()
+           word = scrape_text_word(child,result: result)
+         elsif name == 'rt'
+           raise ScrapeError,"invalid rt tag[#{child}] without a ruby tag at URL[#{@url}]"
+         else
+           dicwin_id = nil
+
+           if name == 'a'
+             id = parse_dicwin_id(child['id'].to_s())
+             klass = Util.unspace_web_str(child['class'].to_s()).downcase()
+
+             if klass == 'dicwin' && !id.nil?()
+               if dicwin
+                 raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at URL[#{@url}]"
+               end
+
+               dicwin_id = id
+             end
+           end
+
+           if dicwin_id.nil?()
+             grand_children = child.children.to_a()
+
+             (grand_children.length() - 1).downto(0).each() do |i|
+               children.push(grand_children[i])
+             end
+
+             # I originally didn't use a stack-like Array and did a constant insert,
+             # but I think this is slower (moving all elements down every time).
+             # However, if it's using C-like code for moving memory, then maybe it
+             # is faster?
+             #children.insert(i + 1,*child.children.to_a())
+           else
+             word = scrape_dicwin_word(child,dicwin_id,result: result)
+           end
+         end
+
+         result.add_word(word) unless word.nil?()
+       end
+
+       return result
+     end
+
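+     # Scrapes the article's year from body's id, then from the digits in
+     # futsuurl, and finally from the @year/@datetime fallbacks.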
+     def scrape_year(doc,futsuurl=nil)
+       # First, try body's id.
+       tag = doc.css('body')
+
+       if tag.length > 0
+         tag_id = tag[0]['id'].to_s().gsub(/[^[[:digit:]]]+/,'')
+
+         if tag_id.length >= 4
+           year = tag_id[0..3].to_i()
+
+           return year if Util.sane_year?(year)
+         end
+       end
+
+       # Second, try futsuurl.
+       if !futsuurl.nil?()
+         m = futsuurl.match(/([[:digit:]]{4,})/)
+
+         if !m.nil?() && (m = m[0].to_s()).length >= 4
+           year = m[0..3].to_i()
+
+           return year if Util.sane_year?(year)
+         end
+       end
+
+       # As a last resort, use our user-defined fallbacks (if specified).
+       return @year unless Util.empty_web_str?(@year)
+       return @datetime.year if !@datetime.nil?() && Util.sane_year?(@datetime.year)
+
+       raise ScrapeError,"could not scrape year at URL[#{@url}]"
+     end
+
+     def split(str)
+       return @splitter.split(str)
+     end
+
+     def variate(str)
+       variations = []
+
+       @variators.each() do |variator|
+         variations.push(*variator.variate(str))
+       end
+
+       return variations
+     end
+
+     def warn_or_error(klass,msg)
+       case @mode
+       when :lenient
+         Util.warn(msg)
+       else
+         raise klass,msg
+       end
+     end
+   end
+
+   ###
+   # @author Jonathan Bradley Whited (@esotericpig)
+   # @since 0.2.0
+   ###
+   class ScrapeWordsResult
+     attr_reader :text
+     attr_reader :words
+
+     def initialize()
+       super()
+
+       @text = ''.dup()
+       @words = []
+     end
+
+     def add_text(text)
+       @text << Util.reduce_jpn_space(text)
+
+       return self
+     end
+
+     def add_word(word)
+       @words << word
+
+       return self
+     end
+
+     def polish!()
+       @text = Util.strip_web_str(@text)
+
+       return self
+     end
+
+     def words?()
+       return !@words.empty?()
+     end
+   end
+ end
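
For reference, a minimal usage sketch of the new ArticleScraper (not part of
the diff; the require path is assumed from the file's own requires, and the
URL is one of the example URLs cited in the comments above):

    require 'nhkore/article_scraper'

    # mode: :lenient warns instead of raising when optional pieces (e.g., the
    # futsuurl) are missing; dict: :scrape (the default) auto-scrapes the
    # dicWin dictionary for word definitions.
    scraper = NHKore::ArticleScraper.new(
      'https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html',
      mode: :lenient
    )

    article = scraper.scrape()

    puts article.title
    puts article.datetime
    puts article.sha256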