nhkore 0.1.0 → 0.2.0
This diff shows the changes between publicly released versions of the package, as they appear in its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -1
- data/README.md +18 -6
- data/Rakefile +11 -16
- data/bin/nhkore +1 -3
- data/lib/nhkore/app.rb +616 -0
- data/lib/nhkore/article.rb +130 -0
- data/lib/nhkore/article_scraper.rb +653 -0
- data/lib/nhkore/cleaner.rb +91 -0
- data/lib/nhkore/cli/bing_cmd.rb +220 -0
- data/lib/nhkore/cli/fx_cmd.rb +116 -0
- data/lib/nhkore/cli/get_cmd.rb +153 -0
- data/lib/nhkore/cli/news_cmd.rb +375 -0
- data/lib/nhkore/cli/sift_cmd.rb +382 -0
- data/lib/nhkore/defn.rb +104 -0
- data/lib/nhkore/dict.rb +80 -0
- data/lib/nhkore/dict_scraper.rb +76 -0
- data/lib/nhkore/entry.rb +104 -0
- data/lib/nhkore/error.rb +35 -0
- data/lib/nhkore/fileable.rb +48 -0
- data/lib/nhkore/missingno.rb +92 -0
- data/lib/nhkore/news.rb +176 -0
- data/lib/nhkore/polisher.rb +93 -0
- data/lib/nhkore/scraper.rb +137 -0
- data/lib/nhkore/search_link.rb +188 -0
- data/lib/nhkore/search_scraper.rb +152 -0
- data/lib/nhkore/sifter.rb +339 -0
- data/lib/nhkore/splitter.rb +90 -0
- data/lib/nhkore/util.rb +190 -0
- data/lib/nhkore/variator.rb +87 -0
- data/lib/nhkore/version.rb +1 -1
- data/lib/nhkore/word.rb +134 -17
- data/lib/nhkore.rb +39 -40
- data/nhkore.gemspec +23 -8
- data/test/{nhkore_tester.rb → nhkore/test_helper.rb} +3 -1
- data/test/nhkore_test.rb +8 -6
- metadata +204 -11
data/lib/nhkore/cli/news_cmd.rb (new file)
@@ -0,0 +1,375 @@
+#!/usr/bin/env ruby
+# encoding: UTF-8
+# frozen_string_literal: true
+
+#--
+# This file is part of NHKore.
+# Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
+#
+# NHKore is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NHKore is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+#++
+
+
+require 'time'
+
+require 'nhkore/error'
+require 'nhkore/missingno'
+require 'nhkore/news'
+require 'nhkore/search_link'
+require 'nhkore/util'
+
+
+module NHKore
+module CLI
+  ###
+  # @author Jonathan Bradley Whited (@esotericpig)
+  # @since 0.2.0
+  ###
+  module NewsCmd
+    DEFAULT_NEWS_SCRAPE = 1
+
+    def build_news_cmd()
+      app = self
+
+      @news_cmd = @app_cmd.define_command() do
+        name 'news'
+        usage 'news [OPTIONS] [COMMAND]...'
+        aliases :n
+        summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"
+
+        description <<-EOD
+          Scrape NHK News Web (Easy) articles &
+          save to folder: #{News::DEFAULT_DIR}
+        EOD
+
+        option :d,:datetime,<<-EOD,argument: :required,transform: -> (value) do
+          date time to use as a fallback in cases when an article doesn't have one;
+          format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
+        EOD
+          value = Time.strptime(value,'%Y-%m-%d %H:%M',&Util.method(:guess_year))
+          value = Util.jst_time(value)
+          value
+        end
+        option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
+          HTML file of article to read instead of URL (for offline testing and/or slow internet;
+          see '--no-dict' option)
+        EOD
+          app.check_empty_opt(:in,value)
+        end
+        flag :L,:lenient,<<-EOD
+          leniently (not strict) scrape articles:
+          body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
+          example URLs that need this flag:
+          - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
+          - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
+        EOD
+        option :k,:like,<<-EOD,argument: :required,transform: -> (value) do
+          text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
+          text '00123' -- like '*00123*'
+        EOD
+          value = Util.strip_web_str(value).downcase()
+          value
+        end
+        option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
+          'directory/file' of article links (from a Search Engine) to scrape (see '#{App::NAME} bing';
+          defaults: #{SearchLinks::DEFAULT_BING_YASASHII_FILE}, #{SearchLinks::DEFAULT_BING_FUTSUU_FILE})
+        EOD
+          app.check_empty_opt(:links,value)
+        end
+        flag :M,:missingno,<<-EOD
+          very rarely an article will not have kana or kanji for a Ruby tag;
+          to not raise an error, this will use previously scraped data to fill it in;
+          example URL:
+          - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
+        EOD
+        flag :D,:'no-dict',<<-EOD
+          do not try to parse the dictionary files for the articles; useful in case of errors trying to load
+          the dictionaries (or for offline testing)
+        EOD
+        option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
+          'directory/file' to save words to; if you only specify a directory or a file, it will attach
+          the appropriate default directory/file name
+          (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
+        EOD
+          app.check_empty_opt(:out,value)
+        end
+        flag :r,:redo,'scrape article links even if they have already been scraped'
+        option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
+               default: DEFAULT_NEWS_SCRAPE,transform: -> (value) do
+          value = value.to_i()
+          value = 1 if value < 1
+          value
+        end
+        option nil,:'show-dict',<<-EOD
+          show dictionary URL and contents for the first article and exit;
+          useful for debugging dictionary errors (see '--no-dict' option);
+          implies '--dry-run' option
+        EOD
+        option :u,:url,<<-EOD,argument: :required,transform: -> (value) do
+          URL of article to scrape, instead of article links file (see '--links' option)
+        EOD
+          app.check_empty_opt(:url,value)
+        end
+
+        run do |opts,args,cmd|
+          puts cmd.help
+        end
+      end
+
+      @news_easy_cmd = @news_cmd.define_command() do
+        name 'easy'
+        usage 'easy [OPTIONS] [COMMAND]...'
+        aliases :e,:ez
+        summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"
+
+        description <<-EOD
+          Search for NHK News Web Easy (Yasashii) links &
+          save to file: #{YasashiiNews::DEFAULT_FILE}
+        EOD
+
+        run do |opts,args,cmd|
+          app.refresh_cmd(opts,args,cmd)
+          app.run_news_cmd(:yasashii)
+        end
+      end
+
+      @news_regular_cmd = @news_cmd.define_command() do
+        name 'regular'
+        usage 'regular [OPTIONS] [COMMAND]...'
+        aliases :r,:reg
+        summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"
+
+        description <<-EOD
+          Search for NHK News Web Regular (Futsuu) links &
+          save to file: #{FutsuuNews::DEFAULT_FILE}
+        EOD
+
+        run do |opts,args,cmd|
+          app.refresh_cmd(opts,args,cmd)
+          app.run_news_cmd(:futsuu)
+        end
+      end
+    end
+
+    def run_news_cmd(type)
+      @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
+      news_name = nil
+
+      build_in_file(:in)
+
+      case type
+      when :futsuu
+        build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_BING_FUTSUU_FILENAME)
+        build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
+
+        news_name = 'Regular'
+      when :yasashii
+        build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_BING_YASASHII_FILENAME)
+        build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
+
+        news_name = 'Easy'
+      else
+        raise ArgumentError,"invalid type[#{type}]"
+      end
+
+      return unless check_in_file(:in,empty_ok: true)
+      return unless check_out_file(:out)
+
+      datetime = @cmd_opts[:datetime]
+      dict = @cmd_opts[:no_dict] ? nil : :scrape
+      dry_run = @cmd_opts[:dry_run]
+      in_file = @cmd_opts[:in]
+      lenient = @cmd_opts[:lenient]
+      like = @cmd_opts[:like]
+      links_file = @cmd_opts[:links]
+      max_scrapes = @cmd_opts[:scrape]
+      max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?()
+      missingno = @cmd_opts[:missingno]
+      out_file = @cmd_opts[:out]
+      redo_scrapes = @cmd_opts[:redo]
+      show_dict = @cmd_opts[:show_dict]
+
+      # Favor in_file option over url option.
+      url = in_file.nil?() ? Util.strip_web_str(@cmd_opts[:url].to_s()) : in_file
+      url = nil if url.empty?()
+
+      if url.nil?()
+        # Then we must have a links file that exists.
+        return unless check_in_file(:links,empty_ok: false)
+      end
+
+      start_spin("Scraping NHK News Web #{news_name} articles")
+
+      is_file = !in_file.nil?()
+      link_count = -1
+      links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new()
+      new_articles = [] # For --dry-run
+      news = nil
+      scrape_count = 0
+
+      if File.exist?(out_file)
+        news = (type == :yasashii) ? YasashiiNews.load_file(out_file) : FutsuuNews.load_file(out_file)
+      else
+        news = (type == :yasashii) ? YasashiiNews.new() : FutsuuNews.new()
+      end
+
+      @news_article_scraper_kargs = @scraper_kargs.merge({
+        datetime: datetime,
+        dict: dict,
+        is_file: is_file,
+        missingno: missingno ? Missingno.new(news) : nil,
+        mode: lenient ? :lenient : nil,
+      })
+      @news_dict_scraper_kargs = @scraper_kargs.merge({
+        is_file: is_file,
+      })
+
+      if url.nil?()
+        links.each() do |key,link|
+          update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
+
+          break if scrape_count >= max_scrapes
+          next if !like.nil?() && !link.url.to_s().downcase().include?(like)
+          next if !redo_scrapes && scraped_news_article?(news,link)
+
+          url = link.url
+
+          if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
+            # --show-dict
+            url = new_url
+            scrape_count = max_scrapes - 1
+          end
+
+          # Break on next iteration for update_spin_detail().
+          next if (scrape_count += 1) >= max_scrapes
+
+          sleep_scraper()
+        end
+      else
+        link = links[url]
+
+        if link.nil?()
+          link = SearchLink.new(url)
+          links.add_link(link)
+        end
+
+        scrape_news_article(url,link: link,new_articles: new_articles,news: news)
+
+        scrape_count += 1
+      end
+
+      stop_spin()
+      puts
+
+      if scrape_count <= 0
+        puts 'Nothing scraped!'
+
+        if !dry_run && !show_dict
+          puts
+          start_spin('Saving updated links to file')
+
+          links.save_file(links_file)
+
+          stop_spin()
+          puts "> #{links_file}"
+        end
+      else
+        puts 'Last URL scraped:'
+        puts "> #{url}"
+        puts
+
+        if show_dict
+          puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
+        elsif dry_run
+          if new_articles.length < 1
+            raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " +
+              "internal code is broken"
+          elsif new_articles.length == 1
+            puts new_articles.first
+          else
+            # Don't show the words (mini), too verbose for more than 1.
+            new_articles.each() do |article|
+              puts article.to_s(mini: true)
+            end
+          end
+        else
+          start_spin('Saving scraped data to files')
+
+          links.save_file(links_file)
+          news.save_file(out_file)
+
+          stop_spin()
+          puts "> #{out_file}"
+          puts "> #{links_file}"
+        end
+      end
+    end
+
+    def scrape_news_article(url,link:,new_articles:,news:)
+      show_dict = @cmd_opts[:show_dict]
+
+      if show_dict
+        scraper = DictScraper.new(url,**@news_dict_scraper_kargs)
+
+        @cmd_opts[:show_dict] = scraper.scrape().to_s()
+
+        return scraper.url
+      end
+
+      scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
+      article = scraper.scrape()
+
+      # run_news_cmd() handles overwriting with --redo or not
+      # using scraped_news_article?().
+      news.add_article(article,overwrite: true)
+
+      news.update_article(article,link.url) # Favors https
+      link.update_from_article(article)
+
+      new_articles << article
+
+      return false # No --show-dict
+    end
+
+    def scraped_news_article?(news,link)
+      return true if link.scraped?()
+
+      article = news.article(link.url)
+
+      if article.nil?()
+        if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
+          article = news.article_with_sha256(link.sha256)
+        end
+
+        if article.nil?()
+          scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
+
+          sha256 = scraper.scrape_sha256_only()
+
+          article = news.article_with_sha256(sha256) if news.sha256?(sha256)
+        end
+      end
+
+      if article
+        news.update_article(article,link.url) # Favors https
+        link.update_from_article(article)
+
+        return true
+      end
+
+      return false
+    end
+  end
+end
+end
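
The command definitions in this new file appear to follow the Cri gem's command DSL (`define_command`, `option`/`flag` with `transform:`, and `run` blocks). Below is a minimal, self-contained sketch of that pattern, assuming only the `cri` gem; the `greet` command, `--shout` flag, and `--times` option are hypothetical names for illustration and are not part of nhkore:

```ruby
require 'cri'

# Hypothetical command illustrating the same Cri DSL pattern used above.
cmd = Cri::Command.define do
  name    'greet'
  usage   'greet [OPTIONS] NAME'
  summary 'print a greeting'

  flag :s,:shout,'print the greeting in uppercase'
  # Like the '--scrape' option above, transform coerces the raw string value.
  option :t,:times,'number of times to greet',argument: :required,
         default: 1,transform: -> (value) { [value.to_i,1].max }

  run do |opts,args,_cmd|
    msg = "Hello, #{args.first}!"
    msg = msg.upcase if opts[:shout]
    opts[:times].times { puts msg }
  end
end

cmd.run(ARGV) # e.g.: ruby greet.rb --shout --times 2 World
```

Based on the options defined in the diff, a typical invocation of the new subcommands would look something like `nhkore news easy --scrape 5` or `nhkore news regular --url URL`.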