nhkore 0.3.7 → 0.3.11
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +36 -33
- data/README.md +36 -30
- data/Rakefile +38 -52
- data/bin/nhkore +4 -15
- data/lib/nhkore/app.rb +235 -234
- data/lib/nhkore/article.rb +39 -53
- data/lib/nhkore/article_scraper.rb +293 -285
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +143 -153
- data/lib/nhkore/cli/search_cmd.rb +108 -118
- data/lib/nhkore/cli/sift_cmd.rb +109 -120
- data/lib/nhkore/datetime_parser.rb +88 -104
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +5 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +58 -72
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +74 -83
- data/lib/nhkore/search_link.rb +62 -76
- data/lib/nhkore/search_scraper.rb +81 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -84
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +99 -97
- data/lib/nhkore.rb +8 -20
- data/nhkore.gemspec +30 -51
- data/samples/looper.rb +18 -29
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +33 -24
data/lib/nhkore/article_scraper.rb

@@ -1,23 +1,11 @@
-#!/usr/bin/env ruby
 # encoding: UTF-8
 # frozen_string_literal: true

 #--
 # This file is part of NHKore.
-# Copyright (c) 2020 Jonathan Bradley Whited
-#
-# NHKore is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# NHKore is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+# Copyright (c) 2020-2021 Jonathan Bradley Whited
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
 #++


@@ -39,12 +27,12 @@ require 'nhkore/word'

 module NHKore
   ###
-  # @author Jonathan Bradley Whited
+  # @author Jonathan Bradley Whited
   # @since 0.2.0
   ###
   class ArticleScraper < Scraper
     extend AttrBool::Ext
-
+
     attr_reader :cleaners
     attr_accessor :datetime
     attr_accessor :dict
@@ -55,18 +43,20 @@ module NHKore
     attr_accessor? :strict
     attr_reader :variators
     attr_accessor :year
-
+
     # @param dict [Dict,:scrape,nil] the {Dict} (dictionary) to use for {Word#defn} (definitions)
     #             [+:scrape+] auto-scrape it using {DictScraper}
     #             [+nil+] don't scrape/use it
     # @param missingno [Missingno] data to use as a fallback for Ruby words without kana/kanji,
     #                  instead of raising an error
     # @param strict [true,false]
-    def initialize(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil,polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,variators: [BestVariator.new],year: nil,**kargs)
+    def initialize(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil,
+                   polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,
+                   variators: [BestVariator.new],year: nil,**kargs)
       super(url,**kargs)
-
+
       @cleaners = Array(cleaners)
-      @datetime = datetime.nil? ? nil : Util.jst_time(datetime)
+      @datetime = datetime.nil? ? nil : Util.jst_time(datetime)
       @dict = dict
       @kargs = kargs
       @missingno = missingno
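The hunk above restores the full keyword-argument signature of ArticleScraper#initialize. A minimal usage sketch based only on that signature; the URL and option values are illustrative, not from the diff:

```ruby
require 'nhkore'

# Illustrative values; dict: nil and strict: follow the @param docs above.
url = 'https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html'

scraper = NHKore::ArticleScraper.new(
  url,
  dict: nil,                       # nil => don't scrape/use a dictionary
  datetime: Time.new(2019,10,12),  # fallback when the page has no parsable date
  strict: true                     # raise ScrapeError instead of just warning
)

article = scraper.scrape
puts article.title
```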
@@ -76,20 +66,20 @@ module NHKore
       @variators = Array(variators)
       @year = year
     end
-
+
     def add_words(article,words,text)
-      words.each() do |word|
+      words.each do |word|
         # Words should have already been cleaned.
         # If we don't check this, Word.new() could raise an error in polish().
-        next if polish(word.word).empty?
-
+        next if polish(word.word).empty?
+
         article.add_word(polish(word))
-
-        variate(word.word).each() do |v|
+
+        variate(word.word).each do |v|
           v = polish(clean(v))
-
-          next if v.empty?
-
+
+          next if v.empty?
+
           # Do not pass in "word: word". We only want defn & eng.
           # If we pass in kanji/kana & unknown, it will raise an error.
           article.add_word(Word.new(
@@ -99,522 +89,540 @@ module NHKore
           ))
         end
       end
-
-      split(text).each() do |t|
+
+      split(text).each do |t|
         t = polish(clean(t))
-
-        next if t.empty?
-
+
+        next if t.empty?
+
         article.add_word(Word.new(unknown: t))
-
-        variate(t).each() do |v|
+
+        variate(t).each do |v|
           v = polish(clean(v))
-
-          next if v.empty?
-
+
+          next if v.empty?
+
           article.add_word(Word.new(unknown: v))
         end
       end
     end
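Every token handled by add_words flows through the same clean → polish → skip-if-empty filter before it is counted. A condensed, self-contained sketch of that idiom; the lambdas are hypothetical stand-ins for the gem's Cleaner and Polisher chains:

```ruby
# Hypothetical stand-ins for Cleaner.clean_any and Polisher.polish_any.
clean  = ->(s) { s.strip }
polish = ->(s) { s.gsub(/[[:punct:]]+/,'') }

def filter_tokens(tokens,clean,polish)
  tokens.filter_map do |t|
    t = polish.call(clean.call(t))
    t unless t.empty? # same "next if ...empty?" guard as add_words above
  end
end

p filter_tokens([' 台風 ','!!','です'],clean,polish)
# => ["台風", "です"]
```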
-
+
     def clean(obj)
       return Cleaner.clean_any(obj,@cleaners)
     end
-
-    def fix_bad_html
+
+    def fix_bad_html
       # Fixes:
       # - '<「<' without escaping '<' as '&lt;'
       #   - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
       #   - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'
-
-      read
-
-      # To add a new one, simply add '|(...)' on a newline and test
+
+      read
+
+      # To add a new one, simply add '|(...)' on a newline and test Regexp.last_match().
       @str_or_io = @str_or_io.gsub(/
-        (<「<)
+        (?<cane><「<)
       /x) do |match|
-        if !$1.nil?
+        if !Regexp.last_match(:cane).nil?
           match = match.sub('<','&lt;')
         end
-
+
         match
       end
     end
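The rewritten fix_bad_html keys each fix to a named capture group, so a new fix is just another `|(?<name>...)` alternation plus a Regexp.last_match check. A self-contained sketch of the same technique outside the class:

```ruby
html = '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'

fixed = html.gsub(/
  (?<cane><「<)      # unescaped '<' before a bracket quote
  # |(?<other>...)   # add new fixes as more named groups
/x) do |match|
  match = match.sub('<','&lt;') unless Regexp.last_match(:cane).nil?
  match
end

puts fixed # => </p><br>&lt;「<ruby>台風<rt>たいふう</rt></ruby>
```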
-
+
     def parse_datetime(str,year)
       str = str.gsub(/[\[\][[:space:]]]+/,'') # Remove: [ ] \s
       str = "#{year}年 #{str} #{Util::JST_OFFSET}"
-
+
       return Time.strptime(str,'%Y年 %m月%d日%H時%M分 %:z')
     end
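parse_datetime strips the brackets and whitespace NHK puts around its dates, then prepends the scraped year and a JST offset so a single strptime format can parse the result. A standalone check of that format, assuming Util::JST_OFFSET is the '+09:00' JST offset string:

```ruby
require 'time'

year = 2020
str  = '[10月12日 11時30分]'

str = str.gsub(/[\[\][[:space:]]]+/,'') # remove: [ ] \s
str = "#{year}年 #{str} +09:00"         # Util::JST_OFFSET assumed to be '+09:00'

p Time.strptime(str,'%Y年 %m月%d日%H時%M分 %:z')
# => 2020-10-12 11:30:00 +0900
```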
-
+
     def parse_dicwin_id(str)
       str = str.gsub(/\D+/,'')
-
-      return nil if str.empty?
+
+      return nil if str.empty?
       return str
     end
-
+
     def polish(obj)
       return Polisher.polish_any(obj,@polishers)
     end
-
-    def scrape
-      scrape_dict
-      fix_bad_html
-
-      article = Article.new
-      doc = html_doc
-
+
+    def scrape
+      scrape_dict
+      fix_bad_html
+
+      article = Article.new
+      doc = html_doc
+
       article.futsuurl = scrape_futsuurl(doc)
-
+
       article.datetime = scrape_datetime(doc,article.futsuurl)
       article.sha256 = scrape_content(doc,article)
       article.title = scrape_title(doc,article)
       article.url = @url
-
+
       return article
     end
-
-    def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new)
+
+    def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new)
       result = scrape_words(tag,result: result)
-      result.polish!
-
+      result.polish!
+
       add_words(article,result.words,result.text)
-
+
       return result
     end
-
+
     def scrape_content(doc,article)
       tag = doc.css('div#js-article-body')
       tag = doc.css('div.article-main__body') if tag.length < 1
       tag = doc.css('div.article-body') if tag.length < 1
-
+
       # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
       tag = doc.css('div#main') if tag.length < 1 && !@strict
-
+
       if tag.length > 0
-        text = Util.unspace_web_str(tag.text.to_s)
-
-        if !text.empty?
+        text = Util.unspace_web_str(tag.text.to_s)
+
+        if !text.empty?
           hexdigest = Digest::SHA256.hexdigest(text)
-
-          return hexdigest if article.nil?
-
+
+          return hexdigest if article.nil? # For scrape_sha256_only()
+
           result = scrape_and_add_words(tag,article)
-
-          return hexdigest if result.words?
+
+          return hexdigest if result.words?
         end
       end
-
+
       raise ScrapeError,"could not scrape content at URL[#{@url}]"
     end
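scrape_content fingerprints the unspaced article text with SHA-256 (stored as article.sha256), which is used elsewhere in the gem to detect the same article content reached via different URLs. A sketch of that fingerprinting; the gsub only approximates Util.unspace_web_str:

```ruby
require 'digest'

def fingerprint(text)
  # approximates Util.unspace_web_str: strip all whitespace before hashing
  Digest::SHA256.hexdigest(text.gsub(/[[:space:]]+/,''))
end

a = fingerprint('台風が 近づいて います')
b = fingerprint('台風が近づいています')
p a == b # => true: same content, different spacing
```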
-
+
     def scrape_datetime(doc,futsuurl=nil)
       year = scrape_year(doc,futsuurl)
-
+
       # First, try with the id.
       tag_name = 'p#js-article-date'
       tag = doc.css(tag_name)
-
+
       if tag.length > 0
         tag_text = tag[0].text
-
+
         begin
           datetime = parse_datetime(tag_text,year)
-
+
           return datetime
         rescue ArgumentError => e
           # Ignore; try again below.
           Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
         end
       end
-
+
       # Second, try with the class.
       tag_name = 'p.article-main__date'
       tag = doc.css(tag_name)
-
+
       if tag.length > 0
         tag_text = tag[0].text
-
+
         begin
           datetime = parse_datetime(tag_text,year)
-
+
           return datetime
         rescue ArgumentError => e
           # Ignore; try again below.
           Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
         end
-
+
         return datetime
       end
-
+
       # Third, try body's id.
       # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
       # - 'news20170331_k10010922481000'
       tag = doc.css('body')
-
+
       if tag.length > 0
-        tag_id = tag[0]['id'].to_s.split('_',2)
-
+        tag_id = tag[0]['id'].to_s.split('_',2)
+
         if tag_id.length > 0
           tag_id = tag_id[0].gsub(/[^[[:digit:]]]+/,'')
-
+
           if tag_id.length == 8
             datetime = Time.strptime(tag_id,'%Y%m%d')
-
+
             return datetime
           end
         end
       end
-
+
       # As a last resort, use our user-defined fallback (if specified).
-      return @datetime unless @datetime.nil?
-
+      return @datetime unless @datetime.nil?
+
       raise ScrapeError,"could not scrape date time at URL[#{@url}]"
     end
-
-    def scrape_dict
+
+    def scrape_dict
       return if @dict != :scrape
-
+
       dict_url = DictScraper.parse_url(@url)
       retries = 0
-
+
       begin
         scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
       rescue OpenURI::HTTPError => e
-        if retries == 0 && e.to_s.include?('404')
-          read
-
+        if retries == 0 && e.to_s.include?('404')
+          read
+
           scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
-
-          dict_url = scraper.scrape_dict_url_only
+
+          dict_url = scraper.scrape_dict_url_only
           retries += 1
-
+
           retry
         else
           raise e.exception("could not scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
         end
       end
-
-      @dict = scraper.scrape
+
+      @dict = scraper.scrape
     end
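scrape_dict retries exactly once: if the guessed dictionary URL 404s, it re-derives the URL from the article body (scrape_dict_url_only) and tries again. A generic, self-contained sketch of that single-retry pattern, with hypothetical stand-ins for the scraper calls:

```ruby
# Stand-ins: fetch ~ DictScraper.new, the derived URL ~ scrape_dict_url_only.
def fetch(url)
  raise '404 Not Found' if url.end_with?('/guessed')
  "dict data from #{url}"
end

url = 'https://example.com/guessed'
retries = 0

begin
  data = fetch(url)
rescue => e
  if retries == 0 && e.to_s.include?('404')
    url = 'https://example.com/derived' # re-derived fallback URL
    retries += 1
    retry
  else
    raise
  end
end

puts data # => dict data from https://example.com/derived
```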
-
-    def scrape_dict_url_only
-      doc = html_doc
-
+
+    def scrape_dict_url_only
+      doc = html_doc
+
       # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
       # - 'news20170331_k10010922481000'
       tag = doc.css('body')
-
+
       if tag.length > 0
-        tag_id = tag[0]['id'].to_s.split('_',2)
-
+        tag_id = tag[0]['id'].to_s.split('_',2)
+
         if tag_id.length == 2
           dict_url = Util.strip_web_str(tag_id[1])
-
-          if !dict_url.empty?
+
+          if !dict_url.empty?
             return DictScraper.parse_url(@url,basename: dict_url)
           end
         end
       end
-
+
       raise ScrapeError,"could not scrape dictionary URL at URL[#{@url}]"
     end
-
-    def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new)
+
+    def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new)
       dicwin_result = scrape_words(tag,dicwin: true)
-
-      return nil unless dicwin_result.words?
-
-      kana = ''.dup
-      kanji = ''.dup
-
-      dicwin_result.words.each() do |word|
-        kana << word.kana unless word.kana.nil?
-
-        if kanji.empty?
-          kanji << word.kanji unless word.kanji.nil?
+
+      return nil unless dicwin_result.words?
+
+      kana = ''.dup
+      kanji = ''.dup
+
+      dicwin_result.words.each do |word|
+        kana << word.kana unless word.kana.nil?
+
+        if kanji.empty?
+          kanji << word.kanji unless word.kanji.nil?
         else
           kanji << word.word # Add trailing kana (or kanji) to kanji
         end
       end
-
+
       entry = nil
       kana = clean(kana)
       kanji = clean(kanji)
-
-      raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty?
-
-      if !@dict.nil?
+
+      raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty? && kanji.empty?
+
+      if !@dict.nil?
         entry = @dict[id]
-
-        raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?
-
-        entry = entry.to_s
+
+        raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?
+
+        entry = entry.to_s
       end
-
+
       word = Word.new(
         defn: entry,
         kana: kana,
         kanji: kanji
       )
-
+
       result.add_text(dicwin_result.text) # Don't call dicwin_result.polish!()
       result.add_word(word)
-
+
       return word
     end
-
+
     def scrape_futsuurl(doc)
       # First, try with the id.
       tag = doc.css('div#js-regular-news-wrapper')
-
+
       if tag.length > 0
         link = scrape_link(tag[0])
-
-        return link unless link.nil?
+
+        return link unless link.nil?
       end
-
+
       # Second, try with the class.
       tag = doc.css('div.link-to-normal')
-
+
       if tag.length > 0
         link = scrape_link(tag[0])
-
-        return link unless link.nil?
+
+        return link unless link.nil?
       end
-
+
       # Some sites don't have a futsuurl and need a lenient mode.
       # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
       warn_or_error(ScrapeError,"could not scrape futsuurl at URL[#{@url}]")
-
+
       return nil
     end
-
+
     def scrape_link(tag)
       link = tag.css('a')
-
+
       return nil if link.length < 1
-
-      link = Util.unspace_web_str(link[0]['href'].to_s)
-
-      return nil if link.empty?
+
+      link = Util.unspace_web_str(link[0]['href'].to_s)
+
+      return nil if link.empty?
       return link
     end
-
-
-
-
-
-
+
+    # @since 0.3.8
+    # @see https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
+    def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
+      words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
+      final_words = []
+
+      return final_words if words.nil?
+
+      words.each do |word|
+        final_words << scrape_ruby_word(word,result: result)
+      end
+
+      return final_words
+    end
+
+    def scrape_ruby_word(word,result: ScrapeWordsResult.new)
       # No cleaning; raw text.
       # Do not add kana to the text.
       result.add_text(word.kanji)
-
+
       kanji = clean(word.kanji)
       kana = clean(word.kana)
-
-
+
+      # Even though Word.scrape_ruby_tag() also does this,
+      # check it again after cleaning above.
+      if !@missingno.nil?
         # Check kana first, since this is the typical scenario.
         # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
         # - '窓' in '(8)窓を開けて外の空気を入れましょう'
-        if kana.empty?
+        if kana.empty?
           kana = @missingno.kana_from_kanji(kanji)
-          kana = kana.nil? ? '' : clean(kana)
-
-          if !kana.empty?
+          kana = kana.nil? ? '' : clean(kana)
+
+          if !kana.empty?
             Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
           end
-        elsif kanji.empty?
+        elsif kanji.empty?
           kanji = @missingno.kanji_from_kana(kana)
-          kanji = kanji.nil? ? '' : clean(kanji)
-
-          if !kanji.empty?
+          kanji = kanji.nil? ? '' : clean(kanji)
+
+          if !kanji.empty?
             Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
           end
         end
       end
-
-      raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?
-      raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?
-
+
+      raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?
+      raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?
+
       word = Word.new(
         kana: kana,
         kanji: kanji,
         word: word
       )
-
+
       return word
     end
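When a ruby tag yields only kanji or only kana, scrape_ruby_word consults the optional Missingno data before raising. A sketch of that fallback, with a hypothetical lookup hash standing in for the Missingno class:

```ruby
KANA_FROM_KANJI = { '窓' => 'まど' }.freeze # hypothetical missingno data

def resolve(kanji,kana)
  kana = KANA_FROM_KANJI[kanji].to_s if kana.empty? # ~ missingno.kana_from_kanji
  raise 'empty kana'  if kana.empty?
  raise 'empty kanji' if kanji.empty?
  [kanji,kana]
end

p resolve('窓','') # => ["窓", "まど"]
```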
-
-    def scrape_sha256_only
-      doc = html_doc
-
+
+    def scrape_sha256_only
+      doc = html_doc
+
       sha256 = scrape_content(doc,nil)
-
+
       return sha256
     end
-
-    def scrape_text_word(tag,result: ScrapeWordsResult.new)
+
+    def scrape_text_word(tag,result: ScrapeWordsResult.new)
       word = Word.scrape_text_node(tag,url: @url)
-
-      if word.nil?
-        result.add_text(tag.text.to_s)
-
+
+      if word.nil?
+        result.add_text(tag.text.to_s) # Raw spaces for output
+
         return nil
       end
-
+
       # Kanji only for:
       # - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
       # - '第3のビール'
       text = word.word # Should usually be kana only
-
+
       result.add_text(text) # No cleaning; raw text
-
+
       text = clean(text)
-
-      return nil if text.empty?
-
+
+      return nil if text.empty? # No error; empty text is fine here
+
       word = Word.new(
         kana: clean(word.kana),
         kanji: clean(word.kanji),
         word: word,
       )
-
+
       return word
     end
-
+
     def scrape_title(doc,article)
       tag = doc.css('h1.article-main__title')
       tag_name = nil
-
+
       if tag.length < 1
         # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
         tag_name = 'h1.article-eq__title'
         tag = doc.css(tag_name)
       end
-
+
       if tag.length < 1 && !@strict
         # This shouldn't be used except for select sites.
         # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
         tag_name = 'div#main h2'
         tag = doc.css(tag_name)
       end
-
+
       if tag.length > 0
-        Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?
-
+        Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?
+
         result = scrape_and_add_words(tag,article)
         title = result.text
-
-        return title unless title.empty?
+
+        return title unless title.empty?
       end
-
+
       raise ScrapeError,"could not scrape title at URL[#{@url}]"
     end
-
-    def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new)
-      children = tag.children.to_a
-
-      while !children.empty?
-        child = children.pop
+
+    def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new)
+      children = tag.children.to_a.reverse # A faster stack?
+
+      while !children.empty?
+        child = children.pop
         name = nil
-
-
-        name = Util.unspace_web_str(child.name.to_s).downcase if child.respond_to?(:name)
-
+        words = []
+
+        name = Util.unspace_web_str(child.name.to_s).downcase if child.respond_to?(:name)
+
         if name == 'ruby'
-
-
-
+          # Returns an array.
+          words = scrape_ruby_words(child,result: result)
+        elsif child.text?
+          words << scrape_text_word(child,result: result)
         elsif name == 'rt'
           raise ScrapeError,"invalid rt tag[#{child}] without a ruby tag at URL[#{@url}]"
         else
           dicwin_id = nil
-
+
           if name == 'a'
-            id = parse_dicwin_id(child['id'].to_s)
-            klass = Util.unspace_web_str(child['class'].to_s).downcase
-
-            if klass == 'dicwin' && !id.nil?
+            id = parse_dicwin_id(child['id'].to_s)
+            klass = Util.unspace_web_str(child['class'].to_s).downcase
+
+            if klass == 'dicwin' && !id.nil?
               if dicwin
-                raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at URL[#{@url}]"
+                raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at" \
+                                  " URL[#{@url}]"
               end
-
+
               dicwin_id = id
             end
           end
-
-          if dicwin_id.nil?
-            grand_children = child.children.to_a()
-
-            (grand_children.length() - 1).downto(0).each() do |i|
-              children.push(grand_children[i])
-            end
-
+
+          if dicwin_id.nil?
             # I originally didn't use a stack-like Array and did a constant insert,
             # but I think this is slower (moving all elements down every time).
             # However, if it's using C-like code for moving memory, then maybe it
             # is faster?
-            #
+            # Old code:
+            # children.insert(i + 1,*child.children.to_a())
+            grand_children = child.children.to_a
+
+            (grand_children.length - 1).downto(0).each do |i|
+              children.push(grand_children[i])
+            end
           else
-
+            words << scrape_dicwin_word(child,dicwin_id,result: result)
          end
        end
-
-
+
+        words&.each do |word|
+          # All word-scraping methods can return nil.
+          result.add_word(word) unless word.nil?
+        end
      end
-
+
      return result
    end
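scrape_words walks the subtree iteratively rather than recursively: reversing the child list turns Array#pop into a document-order stack, and a non-word node's grandchildren are pushed back in reverse so they also pop in document order. A runnable sketch of just that traversal (Nokogiri assumed, since nhkore parses HTML with it):

```ruby
require 'nokogiri'

doc   = Nokogiri::HTML('<p>A<ruby>台風<rt>たいふう</rt></ruby>B</p>')
stack = doc.css('p')[0].children.to_a.reverse # pop now yields document order

while !stack.empty?
  node = stack.pop

  if node.text?
    print node.text,' '
  else
    grand = node.children.to_a

    # push grandchildren in reverse so grand[0] pops first
    (grand.length - 1).downto(0).each { |i| stack.push(grand[i]) }
  end
end
puts
# Prints: A 台風 たいふう B
```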
-
+
     def scrape_year(doc,futsuurl=nil)
       # First, try body's id.
       tag = doc.css('body')
-
+
       if tag.length > 0
-        tag_id = tag[0]['id'].to_s
-
+        tag_id = tag[0]['id'].to_s.gsub(/[^[[:digit:]]]+/,'')
+
         if tag_id.length >= 4
-          year = tag_id[0..3].to_i
-
+          year = tag_id[0..3].to_i
+
           return year if Util.sane_year?(year)
         end
       end
-
+
       # Second, try futsuurl.
-      if !futsuurl.nil?
+      if !futsuurl.nil?
         m = futsuurl.match(/([[:digit:]]{4,})/)
-
-        if !m.nil?
-          year = m[0..3].to_i
-
+
+        if !m.nil? && (m = m[0].to_s).length >= 4
+          year = m[0..3].to_i
+
           return year if Util.sane_year?(year)
         end
       end
-
+
       # As a last resort, use our user-defined fallbacks (if specified).
-      return @year.to_i
-      return @datetime.year if !@datetime.nil?
-
+      return @year.to_i unless @year.nil?
+      return @datetime.year if !@datetime.nil? && Util.sane_year?(@datetime.year)
+
       raise ScrapeError,"could not scrape year at URL[#{@url}]"
     end
-
+
     def split(str)
       return @splitter.split(str)
     end
-
+
     def variate(str)
       variations = []
-
-      @variators.each() do |variator|
+
+      @variators.each do |variator|
         variations.push(*variator.variate(str))
       end
-
+
       return variations
     end
-
+
     def warn_or_error(klass,msg)
       if @strict
         raise klass,msg
@@ -623,42 +631,42 @@ module NHKore
       end
     end
   end
-
+
   ###
-  # @author Jonathan Bradley Whited
+  # @author Jonathan Bradley Whited
   # @since 0.2.0
   ###
   class ScrapeWordsResult
     attr_reader :text
     attr_reader :words
-
-    def initialize
+
+    def initialize
       super()
-
-      @text = ''.dup
+
+      @text = ''.dup
       @words = []
     end
-
+
     def add_text(text)
       @text << Util.reduce_jpn_space(text)
-
+
       return self
     end
-
+
     def add_word(word)
       @words << word
-
+
       return self
    end
-
-    def polish!
+
+    def polish!
      @text = Util.strip_web_str(@text)
-
+
      return self
    end
-
-    def words?
-      return !@words.empty?
+
+    def words?
+      return !@words.empty?
    end
  end
end
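ScrapeWordsResult is a small accumulator with a chainable interface: add_text and add_word both return self. Hypothetical usage, assuming the nhkore gem is loaded:

```ruby
require 'nhkore'

result = NHKore::ScrapeWordsResult.new

result.add_text('台風が ').add_text('近づく ') # chainable; returns self
result.add_word('台風')

result.polish! # strips leading/trailing web whitespace from the text

puts result.text  # accumulated (space-reduced) text
p result.words?   # => true
```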