nhkore 0.3.3 → 0.3.8
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +97 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +58 -30
- data/Rakefile +68 -42
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +231 -236
- data/lib/nhkore/article.rb +56 -53
- data/lib/nhkore/article_scraper.rb +308 -289
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +145 -154
- data/lib/nhkore/cli/search_cmd.rb +110 -120
- data/lib/nhkore/cli/sift_cmd.rb +111 -227
- data/lib/nhkore/datetime_parser.rb +328 -0
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +6 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +61 -66
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +85 -78
- data/lib/nhkore/search_scraper.rb +89 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -101
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +58 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +53 -30
data/lib/nhkore/cli/news_cmd.rb
CHANGED
@@ -1,28 +1,17 @@
-#!/usr/bin/env ruby
 # encoding: UTF-8
 # frozen_string_literal: true

 #--
 # This file is part of NHKore.
-# Copyright (c) 2020 Jonathan Bradley Whited
-#
-#
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# NHKore is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
+# Copyright (c) 2020-2021 Jonathan Bradley Whited
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
 #++


 require 'time'

+require 'nhkore/datetime_parser'
 require 'nhkore/error'
 require 'nhkore/missingno'
 require 'nhkore/news'
@@ -33,164 +22,166 @@ require 'nhkore/util'
 module NHKore
 module CLI
 ###
-# @author Jonathan Bradley Whited
+# @author Jonathan Bradley Whited
 # @since 0.2.0
 ###
 module NewsCmd
 DEFAULT_NEWS_SCRAPE = 1
-
-def build_news_cmd
+
+def build_news_cmd
 app = self
-
-@news_cmd = @app_cmd.define_command
+
+@news_cmd = @app_cmd.define_command do
 name 'news'
 usage 'news [OPTIONS] [COMMAND]...'
 aliases :n
 summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"
-
-description <<-
+
+description <<-DESC
 Scrape NHK News Web (Easy) articles &
 save to folder: #{News::DEFAULT_DIR}
-
-
-option :d,:datetime,<<-
+DESC
+
+option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
 date time to use as a fallback in cases when an article doesn't have one;
 format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
-
-value = Time.strptime(value,'%Y-%m-%d %H:%M',&
+DESC
+value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
 value = Util.jst_time(value)
 value
-
-option :i,:in,<<-
+}
+option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
 HTML file of article to read instead of URL (for offline testing and/or slow internet;
 see '--no-dict' option)
-
+DESC
 app.check_empty_opt(:in,value)
-
-flag :L,:lenient,<<-
+}
+flag :L,:lenient,<<-DESC
 leniently (not strict) scrape articles:
 body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
 example URLs that need this flag:
 -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
 -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
-
-option :k,:like,<<-
+DESC
+option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
 text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
 text '00123' -- like '*00123*'
-
-value = Util.strip_web_str(value).downcase
+DESC
+value = Util.strip_web_str(value).downcase
 value
-
-option :l,:links,<<-
+}
+option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
 'directory/file' of article links to scrape (see '#{App::NAME} search';
 defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
-
+DESC
 app.check_empty_opt(:links,value)
-
-flag :M,:missingno,<<-
+}
+flag :M,:missingno,<<-DESC
 very rarely an article will not have kana or kanji for a Ruby tag;
 to not raise an error, this will use previously scraped data to fill it in;
 example URL:
 -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
-
-flag :D,:'no-dict',<<-
+DESC
+flag :D,:'no-dict',<<-DESC
 do not try to parse the dictionary files for the articles; useful in case of errors trying to load
 the dictionaries (or for offline testing)
-
-flag :H,'no-sha256',<<-
+DESC
+flag :H,'no-sha256',<<-DESC
 do not check the SHA-256 of the content to see if an article has already been scraped;
 for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;
 this is useful if 2 articles have the same SHA-256, but different content (unlikely)
-
-option :o,:out,<<-
+DESC
+option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
 'directory/file' to save words to; if you only specify a directory or a file, it will attach
 the appropriate default directory/file name
 (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
-
+DESC
 app.check_empty_opt(:out,value)
-
+}
 flag :r,:redo,'scrape article links even if they have already been scraped'
 option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
-
-
-
-
-
-option nil,:'show-dict',<<-
+default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
+value = value.to_i
+value = 1 if value < 1
+value
+}
+option nil,:'show-dict',<<-DESC
 show dictionary URL and contents for the first article and exit;
 useful for debugging dictionary errors (see '--no-dict' option);
 implies '--dry-run' option
-
-option :u,:url,<<-
+DESC
+option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
 URL of article to scrape, instead of article links file (see '--links' option)
-
+DESC
 app.check_empty_opt(:url,value)
-
-
+}
+
 run do |opts,args,cmd|
 puts cmd.help
 end
 end
-
-@news_easy_cmd = @news_cmd.define_command
+
+@news_easy_cmd = @news_cmd.define_command do
 name 'easy'
 usage 'easy [OPTIONS] [COMMAND]...'
 aliases :e,:ez
 summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"
-
-description <<-
+
+description <<-DESC
 Search for NHK News Web Easy (Yasashii) links &
 save to file: #{YasashiiNews::DEFAULT_FILE}
-
-
+DESC
+
 run do |opts,args,cmd|
 app.refresh_cmd(opts,args,cmd)
 app.run_news_cmd(:yasashii)
 end
 end
-
-@news_regular_cmd = @news_cmd.define_command
+
+@news_regular_cmd = @news_cmd.define_command do
 name 'regular'
 usage 'regular [OPTIONS] [COMMAND]...'
 aliases :r,:reg
 summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"
-
-description <<-
+
+description <<-DESC
 Search for NHK News Web Regular (Futsuu) links &
 save to file: #{FutsuuNews::DEFAULT_FILE}
-
-
+DESC
+
 run do |opts,args,cmd|
 app.refresh_cmd(opts,args,cmd)
 app.run_news_cmd(:futsuu)
 end
 end
 end
-
+
 def run_news_cmd(type)
 @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
 news_name = nil
-
+
 build_in_file(:in)
-
+
 case type
 when :futsuu
-build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
 build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
-
+
 news_name = 'Regular'
 when :yasashii
-build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
+default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
 build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
-
+
 news_name = 'Easy'
 else
 raise ArgumentError,"invalid type[#{type}]"
 end
-
+
 return unless check_in_file(:in,empty_ok: true)
 return unless check_out_file(:out)
-
+
 datetime = @cmd_opts[:datetime]
 dict = @cmd_opts[:no_dict] ? nil : :scrape
 dry_run = @cmd_opts[:dry_run]
@@ -199,39 +190,39 @@ module CLI
 like = @cmd_opts[:like]
 links_file = @cmd_opts[:links]
 max_scrapes = @cmd_opts[:scrape]
-max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
+max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
 missingno = @cmd_opts[:missingno]
 no_sha256 = @cmd_opts[:no_sha256]
 out_file = @cmd_opts[:out]
 redo_scrapes = @cmd_opts[:redo]
 show_dict = @cmd_opts[:show_dict]
-
+
 # Favor in_file option over url option.
-url = in_file.nil?
-url = nil if url.empty?
-
-if url.nil?
+url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
+url = nil if url.empty?
+
+if url.nil?
 # Then we must have a links file that exists.
 return unless check_in_file(:links,empty_ok: false)
 end
-
+
 start_spin("Scraping NHK News Web #{news_name} articles")
-
-is_file = !in_file.nil?
+
+is_file = !in_file.nil?
 link_count = -1
-links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
+links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
 new_articles = [] # For --dry-run
 news = nil
 scrape_count = 0
-
+
 if File.exist?(out_file)
 news = (type == :yasashii) ?
 YasashiiNews.load_file(out_file,overwrite: no_sha256) :
 FutsuuNews.load_file(out_file,overwrite: no_sha256)
 else
-news = (type == :yasashii) ? YasashiiNews.new
+news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
 end
-
+
 @news_article_scraper_kargs = @scraper_kargs.merge({
 datetime: datetime,
 dict: dict,
@@ -242,154 +233,154 @@ module CLI
 @news_dict_scraper_kargs = @scraper_kargs.merge({
 is_file: is_file,
 })
-
-if url.nil?
+
+if url.nil?
 # Why store each() and do `links_len` instead of `links-len - 1`?
-#
+#
 # If links contains 5 entries and you scrape all 5, then the output of
 # update_spin_detail() will end on 4, so all of this complexity is so
 # that update_spin_detail() only needs to be written/updated on one line.
-
-links_each = links.links.values.each
-links_len = links.length
-
+
+links_each = links.links.values.each
+links_len = links.length
+
 0.upto(links_len) do |i|
 update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
-
+
 break if i >= links_len || scrape_count >= max_scrapes
-
-link = links_each.next
-
-next if !like.nil?
+
+link = links_each.next
+
+next if !like.nil? && !link.url.to_s.downcase.include?(like)
 next if !redo_scrapes && scraped_news_article?(news,link)
-
+
 url = link.url
-
+
 if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
 # --show-dict
 url = new_url
 scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
 end
-
+
 # Break on next iteration for update_spin_detail().
 next if (scrape_count += 1) >= max_scrapes
-
-sleep_scraper
+
+sleep_scraper
 end
 else
 link = links[url]
-
-if link.nil?
+
+if link.nil?
 link = SearchLink.new(url)
 links.add_link(link)
 end
-
+
 scrape_news_article(url,link: link,new_articles: new_articles,news: news)
-
+
 scrape_count += 1
 end
-
-stop_spin
+
+stop_spin
 puts
-
+
 if scrape_count <= 0
 puts 'Nothing scraped!'
-
+
 if !dry_run && !show_dict
 puts
 start_spin('Saving updated links to file')
-
+
 links.save_file(links_file)
-
-stop_spin
+
+stop_spin
 puts "> #{links_file}"
 end
 else
 puts 'Last URL scraped:'
 puts "> #{url}"
 puts
-
+
 if show_dict
 puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
 elsif dry_run
 if new_articles.length < 1
-raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];
-
+raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
+' internal code is broken'
 elsif new_articles.length == 1
 puts new_articles.first
 else
 # Don't show the words (mini), too verbose for more than 1.
-new_articles.each
+new_articles.each do |article|
 puts article.to_s(mini: true)
 end
 end
 else
 start_spin('Saving scraped data to files')
-
+
 links.save_file(links_file)
 news.save_file(out_file)
-
-stop_spin
+
+stop_spin
 puts "> #{out_file}"
 puts "> #{links_file}"
 end
 end
 end
-
+
 def scrape_news_article(url,link:,new_articles:,news:)
 show_dict = @cmd_opts[:show_dict]
-
+
 if show_dict
 scraper = DictScraper.new(url,**@news_dict_scraper_kargs)
-
-@cmd_opts[:show_dict] = scraper.scrape
-
+
+@cmd_opts[:show_dict] = scraper.scrape.to_s
+
 return scraper.url
 end
-
+
 scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
-article = scraper.scrape
-
+article = scraper.scrape
+
 # run_news_cmd() handles overwriting with --redo or not
 # using scraped_news_article?().
 news.add_article(article,overwrite: true)
-
+
 news.update_article(article,link.url) # Favors https
 link.update_from_article(article)
-
+
 new_articles << article
-
+
 return false # No --show-dict
 end
-
+
 def scraped_news_article?(news,link)
-return true if link.scraped?
-
+return true if link.scraped?
+
 no_sha256 = @cmd_opts[:no_sha256]
-
+
 article = news.article(link.url)
-
-if !no_sha256 && article.nil?
+
+if !no_sha256 && article.nil?
 if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
 article = news.article_with_sha256(link.sha256)
 end
-
-if article.nil?
+
+if article.nil?
 scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
-
-sha256 = scraper.scrape_sha256_only
-
+
+sha256 = scraper.scrape_sha256_only
+
 article = news.article_with_sha256(sha256) if news.sha256?(sha256)
 end
 end
-
+
 if article
 news.update_article(article,link.url) # Favors https
 link.update_from_article(article)
-
+
 return true
 end
-
+
 return false
 end
 end