nhkore 0.1.0 → 0.2.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
+ #!/usr/bin/env ruby
+ # encoding: UTF-8
+ # frozen_string_literal: true
+
+ #--
+ # This file is part of NHKore.
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
+ #
+ # NHKore is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NHKore is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU Lesser General Public License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+ #++
+
+
+ require 'time'
+
+ require 'nhkore/error'
+ require 'nhkore/missingno'
+ require 'nhkore/news'
+ require 'nhkore/search_link'
+ require 'nhkore/util'
+
+
+ module NHKore
+ module CLI
+   ###
+   # @author Jonathan Bradley Whited (@esotericpig)
+   # @since 0.2.0
+   ###
+   module NewsCmd
+     DEFAULT_NEWS_SCRAPE = 1
+
+     def build_news_cmd()
+       app = self
+
+       @news_cmd = @app_cmd.define_command() do
+         name 'news'
+         usage 'news [OPTIONS] [COMMAND]...'
+         aliases :n
+         summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"
+
+         description <<-EOD
+           Scrape NHK News Web (Easy) articles &
+           save to folder: #{News::DEFAULT_DIR}
+         EOD
+
+         option :d,:datetime,<<-EOD,argument: :required,transform: -> (value) do
+           date time to use as a fallback in cases when an article doesn't have one;
+           format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
+         EOD
+           value = Time.strptime(value,'%Y-%m-%d %H:%M',&Util.method(:guess_year))
+           value = Util.jst_time(value)
+           value
+         end
+         option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
+           HTML file of article to read instead of URL (for offline testing and/or slow internet;
+           see '--no-dict' option)
+         EOD
+           app.check_empty_opt(:in,value)
+         end
+         flag :L,:lenient,<<-EOD
+           leniently (not strict) scrape articles:
+           body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
+           example URLs that need this flag:
+           -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
+           -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
+         EOD
+         option :k,:like,<<-EOD,argument: :required,transform: -> (value) do
+           text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
+           text '00123' -- like '*00123*'
+         EOD
+           value = Util.strip_web_str(value).downcase()
+           value
+         end
+         option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
+           'directory/file' of article links (from a Search Engine) to scrape (see '#{App::NAME} bing';
+           defaults: #{SearchLinks::DEFAULT_BING_YASASHII_FILE}, #{SearchLinks::DEFAULT_BING_FUTSUU_FILE})
+         EOD
+           app.check_empty_opt(:links,value)
+         end
+         flag :M,:missingno,<<-EOD
+           very rarely an article will not have kana or kanji for a Ruby tag;
+           to not raise an error, this will use previously scraped data to fill it in;
+           example URL:
+           -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
+         EOD
+         flag :D,:'no-dict',<<-EOD
+           do not try to parse the dictionary files for the articles; useful in case of errors trying to load
+           the dictionaries (or for offline testing)
+         EOD
+         option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
+           'directory/file' to save words to; if you only specify a directory or a file, it will attach
+           the appropriate default directory/file name
+           (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
+         EOD
+           app.check_empty_opt(:out,value)
+         end
+         flag :r,:redo,'scrape article links even if they have already been scraped'
+         option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
+           default: DEFAULT_NEWS_SCRAPE,transform: -> (value) do
+           value = value.to_i()
+           value = 1 if value < 1
+           value
+         end
+         option nil,:'show-dict',<<-EOD
+           show dictionary URL and contents for the first article and exit;
+           useful for debugging dictionary errors (see '--no-dict' option);
+           implies '--dry-run' option
+         EOD
+         option :u,:url,<<-EOD,argument: :required,transform: -> (value) do
+           URL of article to scrape, instead of article links file (see '--links' option)
+         EOD
+           app.check_empty_opt(:url,value)
+         end
+
+         run do |opts,args,cmd|
+           puts cmd.help
+         end
+       end
+
+       @news_easy_cmd = @news_cmd.define_command() do
+         name 'easy'
+         usage 'easy [OPTIONS] [COMMAND]...'
+         aliases :e,:ez
+         summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"
+
+         description <<-EOD
+           Search for NHK News Web Easy (Yasashii) links &
+           save to file: #{YasashiiNews::DEFAULT_FILE}
+         EOD
+
+         run do |opts,args,cmd|
+           app.refresh_cmd(opts,args,cmd)
+           app.run_news_cmd(:yasashii)
+         end
+       end
+
+       @news_regular_cmd = @news_cmd.define_command() do
+         name 'regular'
+         usage 'regular [OPTIONS] [COMMAND]...'
+         aliases :r,:reg
+         summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"
+
+         description <<-EOD
+           Search for NHK News Web Regular (Futsuu) links &
+           save to file: #{FutsuuNews::DEFAULT_FILE}
+         EOD
+
+         run do |opts,args,cmd|
+           app.refresh_cmd(opts,args,cmd)
+           app.run_news_cmd(:futsuu)
+         end
+       end
+     end
+
+     def run_news_cmd(type)
+       @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
+       news_name = nil
+
+       build_in_file(:in)
+
+       case type
+       when :futsuu
+         build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_BING_FUTSUU_FILENAME)
+         build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
+
+         news_name = 'Regular'
+       when :yasashii
+         build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_BING_YASASHII_FILENAME)
+         build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
+
+         news_name = 'Easy'
+       else
+         raise ArgumentError,"invalid type[#{type}]"
+       end
+
+       return unless check_in_file(:in,empty_ok: true)
+       return unless check_out_file(:out)
+
+       datetime = @cmd_opts[:datetime]
+       dict = @cmd_opts[:no_dict] ? nil : :scrape
+       dry_run = @cmd_opts[:dry_run]
+       in_file = @cmd_opts[:in]
+       lenient = @cmd_opts[:lenient]
+       like = @cmd_opts[:like]
+       links_file = @cmd_opts[:links]
+       max_scrapes = @cmd_opts[:scrape]
+       max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?()
+       missingno = @cmd_opts[:missingno]
+       out_file = @cmd_opts[:out]
+       redo_scrapes = @cmd_opts[:redo]
+       show_dict = @cmd_opts[:show_dict]
+
+       # Favor in_file option over url option.
+       url = in_file.nil?() ? Util.strip_web_str(@cmd_opts[:url].to_s()) : in_file
+       url = nil if url.empty?()
+
+       if url.nil?()
+         # Then we must have a links file that exists.
+         return unless check_in_file(:links,empty_ok: false)
+       end
+
+       start_spin("Scraping NHK News Web #{news_name} articles")
+
+       is_file = !in_file.nil?()
+       link_count = -1
+       links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new()
+       new_articles = [] # For --dry-run
+       news = nil
+       scrape_count = 0
+
+       if File.exist?(out_file)
+         news = (type == :yasashii) ? YasashiiNews.load_file(out_file) : FutsuuNews.load_file(out_file)
+       else
+         news = (type == :yasashii) ? YasashiiNews.new() : FutsuuNews.new()
+       end
+
+       @news_article_scraper_kargs = @scraper_kargs.merge({
+         datetime: datetime,
+         dict: dict,
+         is_file: is_file,
+         missingno: missingno ? Missingno.new(news) : nil,
+         mode: lenient ? :lenient : nil,
+       })
+       @news_dict_scraper_kargs = @scraper_kargs.merge({
+         is_file: is_file,
+       })
+
+       if url.nil?()
+         links.each() do |key,link|
+           update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
+
+           break if scrape_count >= max_scrapes
+           next if !like.nil?() && !link.url.to_s().downcase().include?(like)
+           next if !redo_scrapes && scraped_news_article?(news,link)
+
+           url = link.url
+
+           if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
+             # --show-dict
+             url = new_url
+             scrape_count = max_scrapes - 1
+           end
+
+           # Break on next iteration for update_spin_detail().
+           next if (scrape_count += 1) >= max_scrapes
+
+           sleep_scraper()
+         end
+       else
+         link = links[url]
+
+         if link.nil?()
+           link = SearchLink.new(url)
+           links.add_link(link)
+         end
+
+         scrape_news_article(url,link: link,new_articles: new_articles,news: news)
+
+         scrape_count += 1
+       end
+
+       stop_spin()
+       puts
+
+       if scrape_count <= 0
+         puts 'Nothing scraped!'
+
+         if !dry_run && !show_dict
+           puts
+           start_spin('Saving updated links to file')
+
+           links.save_file(links_file)
+
+           stop_spin()
+           puts "> #{links_file}"
+         end
+       else
+         puts 'Last URL scraped:'
+         puts "> #{url}"
+         puts
+
+         if show_dict
+           puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
+         elsif dry_run
+           if new_articles.length < 1
+             raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " +
+               "internal code is broken"
+           elsif new_articles.length == 1
+             puts new_articles.first
+           else
+             # Don't show the words (mini), too verbose for more than 1.
+             new_articles.each() do |article|
+               puts article.to_s(mini: true)
+             end
+           end
+         else
+           start_spin('Saving scraped data to files')
+
+           links.save_file(links_file)
+           news.save_file(out_file)
+
+           stop_spin()
+           puts "> #{out_file}"
+           puts "> #{links_file}"
+         end
+       end
+     end
+
+     def scrape_news_article(url,link:,new_articles:,news:)
+       show_dict = @cmd_opts[:show_dict]
+
+       if show_dict
+         scraper = DictScraper.new(url,**@news_dict_scraper_kargs)
+
+         @cmd_opts[:show_dict] = scraper.scrape().to_s()
+
+         return scraper.url
+       end
+
+       scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
+       article = scraper.scrape()
+
+       # run_news_cmd() handles overwriting with --redo or not
+       # using scraped_news_article?().
+       news.add_article(article,overwrite: true)
+
+       news.update_article(article,link.url) # Favors https
+       link.update_from_article(article)
+
+       new_articles << article
+
+       return false # No --show-dict
+     end
+
+     def scraped_news_article?(news,link)
+       return true if link.scraped?()
+
+       article = news.article(link.url)
+
+       if article.nil?()
+         if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
+           article = news.article_with_sha256(link.sha256)
+         end
+
+         if article.nil?()
+           scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
+
+           sha256 = scraper.scrape_sha256_only()
+
+           article = news.article_with_sha256(sha256) if news.sha256?(sha256)
+         end
+       end
+
+       if article
+         news.update_article(article,link.url) # Favors https
+         link.update_from_article(article)
+
+         return true
+       end
+
+       return false
+     end
+   end
+ end
+ end
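
Much of the behavior in the new file lives in the option transform lambdas and in run_news_cmd(). As a quick reference, here is a minimal sketch, not code from the gem, of what the --datetime and --scrape transforms do in isolation; it assumes nhkore 0.2.0 is installed so that NHKore::Util.guess_year and NHKore::Util.jst_time, both called in the transforms above, are available.

    # Minimal illustrative sketch, not part of nhkore itself.
    require 'time'
    require 'nhkore/util'

    # --datetime transform: parse 'YYYY-mm-dd H:M' with Util.guess_year as the
    # year block, then convert the result to JST (the same two calls used in
    # the transform above).
    raw = '2020-03-30 15:30'
    datetime = Time.strptime(raw,'%Y-%m-%d %H:%M',&NHKore::Util.method(:guess_year))
    datetime = NHKore::Util.jst_time(datetime)
    puts datetime

    # --scrape transform: coerce the value to an integer and clamp it to at least 1.
    scrape = '0'.to_i()
    scrape = 1 if scrape < 1
    puts scrape #=> 1

Clamping to 1 mirrors DEFAULT_NEWS_SCRAPE, so an invalid --scrape value falls back to scraping a single article rather than scraping nothing.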