nhkore 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -1
- data/README.md +305 -17
- data/Rakefile +10 -13
- data/lib/nhkore.rb +2 -1
- data/lib/nhkore/app.rb +66 -43
- data/lib/nhkore/article_scraper.rb +2 -2
- data/lib/nhkore/cli/fx_cmd.rb +1 -1
- data/lib/nhkore/cli/get_cmd.rb +27 -12
- data/lib/nhkore/cli/news_cmd.rb +19 -7
- data/lib/nhkore/cli/{bing_cmd.rb → search_cmd.rb} +125 -52
- data/lib/nhkore/scraper.rb +123 -59
- data/lib/nhkore/search_link.rb +4 -4
- data/lib/nhkore/search_scraper.rb +70 -15
- data/lib/nhkore/user_agents.rb +1179 -0
- data/lib/nhkore/util.rb +36 -1
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +30 -18
- metadata +22 -4
data/lib/nhkore.rb
CHANGED
@@ -46,15 +46,16 @@ require 'nhkore/search_link'
|
|
46
46
|
require 'nhkore/search_scraper'
|
47
47
|
require 'nhkore/sifter'
|
48
48
|
require 'nhkore/splitter'
|
49
|
+
require 'nhkore/user_agents'
|
49
50
|
require 'nhkore/util'
|
50
51
|
require 'nhkore/variator'
|
51
52
|
require 'nhkore/version'
|
52
53
|
require 'nhkore/word'
|
53
54
|
|
54
|
-
require 'nhkore/cli/bing_cmd'
|
55
55
|
require 'nhkore/cli/fx_cmd'
|
56
56
|
require 'nhkore/cli/get_cmd'
|
57
57
|
require 'nhkore/cli/news_cmd'
|
58
|
+
require 'nhkore/cli/search_cmd'
|
58
59
|
require 'nhkore/cli/sift_cmd'
|
59
60
|
|
60
61
|
|
data/lib/nhkore/app.rb
CHANGED
@@ -24,6 +24,7 @@
|
|
24
24
|
require 'cri'
|
25
25
|
require 'highline'
|
26
26
|
require 'rainbow'
|
27
|
+
require 'set'
|
27
28
|
require 'tty-progressbar'
|
28
29
|
require 'tty-spinner'
|
29
30
|
|
@@ -31,10 +32,10 @@ require 'nhkore/error'
|
|
31
32
|
require 'nhkore/util'
|
32
33
|
require 'nhkore/version'
|
33
34
|
|
34
|
-
require 'nhkore/cli/bing_cmd'
|
35
35
|
require 'nhkore/cli/fx_cmd'
|
36
36
|
require 'nhkore/cli/get_cmd'
|
37
37
|
require 'nhkore/cli/news_cmd'
|
38
|
+
require 'nhkore/cli/search_cmd'
|
38
39
|
require 'nhkore/cli/sift_cmd'
|
39
40
|
|
40
41
|
|
@@ -47,30 +48,20 @@ module NHKore
|
|
47
48
|
end
|
48
49
|
|
49
50
|
###
|
50
|
-
# For disabling color output.
|
51
|
+
# For disabling/enabling color output.
|
51
52
|
#
|
52
53
|
# @author Jonathan Bradley Whited (@esotericpig)
|
53
|
-
# @since 0.2.
|
54
|
+
# @since 0.2.1
|
54
55
|
###
|
55
|
-
module
|
56
|
-
|
57
|
-
return str
|
58
|
-
end
|
59
|
-
|
60
|
-
def bold(str)
|
61
|
-
return str
|
62
|
-
end
|
56
|
+
module CriColorExt
|
57
|
+
@@color = true
|
63
58
|
|
64
|
-
def
|
65
|
-
|
59
|
+
def color=(color)
|
60
|
+
@@color = color
|
66
61
|
end
|
67
62
|
|
68
|
-
def
|
69
|
-
return
|
70
|
-
end
|
71
|
-
|
72
|
-
def yellow(str)
|
73
|
-
return str
|
63
|
+
def color?(io)
|
64
|
+
return @@color
|
74
65
|
end
|
75
66
|
end
|
76
67
|
|
@@ -79,14 +70,19 @@ module NHKore
|
|
79
70
|
# @since 0.2.0
|
80
71
|
###
|
81
72
|
class App
|
82
|
-
include CLI::BingCmd
|
83
73
|
include CLI::FXCmd
|
84
74
|
include CLI::GetCmd
|
85
75
|
include CLI::NewsCmd
|
76
|
+
include CLI::SearchCmd
|
86
77
|
include CLI::SiftCmd
|
87
78
|
|
88
79
|
NAME = 'nhkore'
|
89
80
|
|
81
|
+
DEFAULT_SLEEP_TIME = 0.1 # So that sites don't ban us (i.e., think we are human)
|
82
|
+
|
83
|
+
COLOR_OPTS = [:c,:color]
|
84
|
+
NO_COLOR_OPTS = [:C,:'no-color']
|
85
|
+
|
90
86
|
SPINNER_MSG = '[:spinner] :title:detail...'
|
91
87
|
CLASSIC_SPINNER = TTY::Spinner.new(SPINNER_MSG,format: :classic)
|
92
88
|
DEFAULT_SPINNER = TTY::Spinner.new(SPINNER_MSG,interval: 5,
|
@@ -94,8 +90,9 @@ module NHKore
|
|
94
90
|
NO_SPINNER = {} # Still outputs status & stores tokens
|
95
91
|
NO_SPINNER_MSG = '%{title}%{detail}...'
|
96
92
|
|
97
|
-
|
98
|
-
|
93
|
+
attr_reader :cmd
|
94
|
+
attr_reader :cmd_args
|
95
|
+
attr_reader :cmd_opts
|
99
96
|
attr_accessor :progress_bar
|
100
97
|
attr_accessor :scraper_kargs
|
101
98
|
attr_accessor :sleep_time
|
@@ -119,10 +116,10 @@ module NHKore
|
|
119
116
|
|
120
117
|
build_app_cmd()
|
121
118
|
|
122
|
-
build_bing_cmd()
|
123
119
|
build_fx_cmd()
|
124
120
|
build_get_cmd()
|
125
121
|
build_news_cmd()
|
122
|
+
build_search_cmd()
|
126
123
|
build_sift_cmd()
|
127
124
|
build_version_cmd()
|
128
125
|
|
@@ -130,18 +127,24 @@ module NHKore
|
|
130
127
|
end
|
131
128
|
|
132
129
|
def autodetect_color()
|
133
|
-
|
130
|
+
Cri::Platform.singleton_class.prepend(CriColorExt)
|
131
|
+
|
132
|
+
color = nil # Must be nil, not true/false
|
134
133
|
|
135
|
-
if
|
136
|
-
disable = true
|
137
|
-
elsif !@args.empty?()
|
134
|
+
if !@args.empty?()
|
138
135
|
# Kind of hacky, but necessary for Rainbow.
|
139
136
|
|
140
|
-
|
137
|
+
color_opts = opts_to_set(COLOR_OPTS)
|
138
|
+
no_color_opts = opts_to_set(NO_COLOR_OPTS)
|
141
139
|
|
142
140
|
@args.each() do |arg|
|
143
|
-
if
|
144
|
-
|
141
|
+
if color_opts.include?(arg)
|
142
|
+
color = true
|
143
|
+
break
|
144
|
+
end
|
145
|
+
|
146
|
+
if no_color_opts.include?(arg)
|
147
|
+
color = false
|
145
148
|
break
|
146
149
|
end
|
147
150
|
|
@@ -149,11 +152,11 @@ module NHKore
|
|
149
152
|
end
|
150
153
|
end
|
151
154
|
|
152
|
-
if
|
153
|
-
|
154
|
-
else
|
155
|
-
@rainbow.enabled = true # Force it in case Rainbow auto-disabled it
|
155
|
+
if color.nil?()
|
156
|
+
color = ($stdout.tty?() && ENV['TERM'] != 'dumb')
|
156
157
|
end
|
158
|
+
|
159
|
+
enable_color(color)
|
157
160
|
end
|
158
161
|
|
159
162
|
def build_app_cmd()
|
@@ -171,12 +174,15 @@ module NHKore
|
|
171
174
|
This is similar to a core word/vocabulary list.
|
172
175
|
EOD
|
173
176
|
|
174
|
-
flag :
|
177
|
+
flag :s,:'classic-fx',<<-EOD do |value,cmd|
|
175
178
|
use classic spinner/progress special effects (in case of no Unicode support) when running long tasks
|
176
179
|
EOD
|
177
180
|
app.progress_bar = :classic
|
178
181
|
app.spinner = CLASSIC_SPINNER
|
179
182
|
end
|
183
|
+
flag COLOR_OPTS[0],COLOR_OPTS[1],%q{force color output (for commands like '| less -R')} do |value,cmd|
|
184
|
+
app.enable_color(true)
|
185
|
+
end
|
180
186
|
flag :n,:'dry-run',<<-EOD
|
181
187
|
do a dry run without making changes; do not write to files, create directories, etc.
|
182
188
|
EOD
|
@@ -194,8 +200,8 @@ module NHKore
|
|
194
200
|
|
195
201
|
app.scraper_kargs[:max_retries] = value
|
196
202
|
end
|
197
|
-
flag
|
198
|
-
app.
|
203
|
+
flag NO_COLOR_OPTS[0],NO_COLOR_OPTS[1],'disable color output' do |value,cmd|
|
204
|
+
app.enable_color(false)
|
199
205
|
end
|
200
206
|
flag :X,:'no-fx','disable spinner/progress special effects when running long tasks' do |value,cmd|
|
201
207
|
app.progress_bar = :no
|
@@ -223,7 +229,7 @@ module NHKore
|
|
223
229
|
app.sleep_time = value.to_f()
|
224
230
|
app.sleep_time = 0.0 if app.sleep_time < 0.0
|
225
231
|
end
|
226
|
-
option :t,:
|
232
|
+
option :t,:timeout,<<-EOD,argument: :required do |value,cmd|
|
227
233
|
seconds for all URL timeouts: [open, read] (-1 or decimal >= 0)
|
228
234
|
EOD
|
229
235
|
value = value.to_f()
|
@@ -232,6 +238,14 @@ module NHKore
|
|
232
238
|
app.scraper_kargs[:open_timeout] = value
|
233
239
|
app.scraper_kargs[:read_timeout] = value
|
234
240
|
end
|
241
|
+
option :u,:'user-agent',<<-EOD,argument: :required do |value,cmd|
|
242
|
+
HTTP header field 'User-Agent' to use instead of a random one
|
243
|
+
EOD
|
244
|
+
value = app.check_empty_opt(:'user-agent',value)
|
245
|
+
|
246
|
+
app.scraper_kargs[:header] ||= {}
|
247
|
+
app.scraper_kargs[:header]['user-agent'] = value
|
248
|
+
end
|
235
249
|
# Big V, not small.
|
236
250
|
flag :V,:version,'show the version and exit' do |value,cmd|
|
237
251
|
app.show_version()
|
@@ -399,8 +413,8 @@ module NHKore
|
|
399
413
|
|
400
414
|
force = @cmd_opts[:force]
|
401
415
|
|
402
|
-
if !force && Dir.exist?(out_dir)
|
403
|
-
puts 'Warning: output directory already exists!'
|
416
|
+
if !force && Dir.exist?(out_dir) && !Dir.empty?(out_dir)
|
417
|
+
puts 'Warning: output directory already exists with files!'
|
404
418
|
puts ' : Files inside of this directory may be overwritten!'
|
405
419
|
puts "> '#{out_dir}'"
|
406
420
|
|
@@ -478,9 +492,18 @@ module NHKore
|
|
478
492
|
return color(str).green
|
479
493
|
end
|
480
494
|
|
481
|
-
def
|
482
|
-
Cri::
|
483
|
-
@rainbow.enabled =
|
495
|
+
def enable_color(enabled)
|
496
|
+
Cri::Platform.color = enabled
|
497
|
+
@rainbow.enabled = enabled
|
498
|
+
end
|
499
|
+
|
500
|
+
def opts_to_set(ary)
|
501
|
+
set = Set.new()
|
502
|
+
|
503
|
+
set.add("-#{ary[0].to_s()}") unless ary[0].nil?()
|
504
|
+
set.add("--#{ary[1].to_s()}") unless ary[1].nil?()
|
505
|
+
|
506
|
+
return set
|
484
507
|
end
|
485
508
|
|
486
509
|
def refresh_cmd(opts,args,cmd)
|
@@ -124,7 +124,7 @@ module NHKore
|
|
124
124
|
# - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
|
125
125
|
# - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'
|
126
126
|
|
127
|
-
|
127
|
+
read()
|
128
128
|
|
129
129
|
# To add a new one, simply add '|(...)' on a new line and test $#.
|
130
130
|
@str_or_io = @str_or_io.gsub(/
|
@@ -281,7 +281,7 @@ module NHKore
|
|
281
281
|
scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
|
282
282
|
rescue OpenURI::HTTPError => e
|
283
283
|
if retries == 0 && e.to_s().include?('404')
|
284
|
-
|
284
|
+
read()
|
285
285
|
|
286
286
|
scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
|
287
287
|
|
data/lib/nhkore/cli/fx_cmd.rb
CHANGED
data/lib/nhkore/cli/get_cmd.rb
CHANGED
@@ -99,15 +99,14 @@ module CLI
|
|
99
99
|
|
100
100
|
return if dry_run
|
101
101
|
|
102
|
-
Tempfile.create([App::NAME,'.zip'],binmode: true) do |file|
|
102
|
+
Tempfile.create(["#{App::NAME}_",'.zip'],binmode: true) do |file|
|
103
103
|
puts
|
104
|
-
puts
|
104
|
+
puts "Downloading #{GET_URL_FILENAME} to temp file:"
|
105
105
|
puts "> #{file.path}"
|
106
|
-
puts
|
107
106
|
|
108
107
|
len = down.size
|
109
|
-
len = DEFAULT_GET_LENGTH if len.nil?()
|
110
|
-
bar = build_progress_bar(
|
108
|
+
len = DEFAULT_GET_LENGTH if len.nil?() || len < 1
|
109
|
+
bar = build_progress_bar('> Downloading',download: true,total: len)
|
111
110
|
|
112
111
|
bar.start()
|
113
112
|
|
@@ -120,9 +119,12 @@ module CLI
|
|
120
119
|
file.close()
|
121
120
|
bar.finish()
|
122
121
|
|
123
|
-
|
122
|
+
puts
|
123
|
+
puts "Extracting #{GET_URL_FILENAME}..."
|
124
124
|
|
125
|
-
|
125
|
+
# We manually ask the user whether to overwrite each file, so set this to
|
126
|
+
# true so that Zip extract() will force overwrites and not raise an error.
|
127
|
+
Zip.on_exists_proc = true
|
126
128
|
|
127
129
|
Zip::File.open(file) do |zip_file|
|
128
130
|
zip_file.each() do |entry|
|
@@ -130,17 +132,30 @@ module CLI
|
|
130
132
|
raise ZipError,"unsafe entry name[#{entry.name}] in Zip file"
|
131
133
|
end
|
132
134
|
|
133
|
-
name = File.basename(entry.name)
|
135
|
+
name = Util.strip_web_str(File.basename(entry.name))
|
136
|
+
|
137
|
+
next if name.empty?()
|
138
|
+
|
139
|
+
out_file = File.join(out_dir,name)
|
134
140
|
|
135
|
-
|
141
|
+
puts "> #{name}"
|
136
142
|
|
137
|
-
|
143
|
+
if !force && File.exist?(out_file)
|
144
|
+
puts
|
145
|
+
puts 'Warning: output file already exists!'
|
146
|
+
puts "> '#{out_file}'"
|
147
|
+
|
148
|
+
overwrite = @high.agree('Overwrite this file (yes/no)? ')
|
149
|
+
puts
|
150
|
+
|
151
|
+
next unless overwrite
|
152
|
+
end
|
153
|
+
|
154
|
+
entry.extract(out_file)
|
138
155
|
end
|
139
156
|
end
|
140
157
|
|
141
|
-
stop_spin()
|
142
158
|
puts
|
143
|
-
|
144
159
|
puts "Extracted #{GET_URL_FILENAME} to directory:"
|
145
160
|
puts "> #{out_dir}"
|
146
161
|
end
|
data/lib/nhkore/cli/news_cmd.rb
CHANGED
@@ -82,8 +82,8 @@ module CLI
|
|
82
82
|
value
|
83
83
|
end
|
84
84
|
option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
|
85
|
-
'directory/file' of article links
|
86
|
-
defaults: #{SearchLinks::
|
85
|
+
'directory/file' of article links to scrape (see '#{App::NAME} search';
|
86
|
+
defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
|
87
87
|
EOD
|
88
88
|
app.check_empty_opt(:links,value)
|
89
89
|
end
|
@@ -170,12 +170,12 @@ module CLI
|
|
170
170
|
|
171
171
|
case type
|
172
172
|
when :futsuu
|
173
|
-
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
173
|
+
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
|
174
174
|
build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
|
175
175
|
|
176
176
|
news_name = 'Regular'
|
177
177
|
when :yasashii
|
178
|
-
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
178
|
+
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
|
179
179
|
build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
|
180
180
|
|
181
181
|
news_name = 'Easy'
|
@@ -236,10 +236,22 @@ module CLI
|
|
236
236
|
})
|
237
237
|
|
238
238
|
if url.nil?()
|
239
|
-
|
239
|
+
# Why store each() and do `links_len` instead of `links_len - 1`?
|
240
|
+
#
|
241
|
+
# If links contains 5 entries and you scrape all 5, then the output of
|
242
|
+
# update_spin_detail() will end on 4, so all of this complexity is so
|
243
|
+
# that update_spin_detail() only needs to be written/updated on one line.
|
244
|
+
|
245
|
+
links_each = links.links.values.each()
|
246
|
+
links_len = links.length()
|
247
|
+
|
248
|
+
0.upto(links_len) do |i|
|
240
249
|
update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
|
241
250
|
|
242
|
-
break if scrape_count >= max_scrapes
|
251
|
+
break if i >= links_len || scrape_count >= max_scrapes
|
252
|
+
|
253
|
+
link = links_each.next()
|
254
|
+
|
243
255
|
next if !like.nil?() && !link.url.to_s().downcase().include?(like)
|
244
256
|
next if !redo_scrapes && scraped_news_article?(news,link)
|
245
257
|
|
@@ -248,7 +260,7 @@ module CLI
|
|
248
260
|
if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
|
249
261
|
# --show-dict
|
250
262
|
url = new_url
|
251
|
-
scrape_count = max_scrapes - 1
|
263
|
+
scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
|
252
264
|
end
|
253
265
|
|
254
266
|
# Break on next iteration for update_spin_detail().
|
@@ -31,37 +31,37 @@ module NHKore
|
|
31
31
|
module CLI
|
32
32
|
###
|
33
33
|
# @author Jonathan Bradley Whited (@esotericpig)
|
34
|
-
# @since 0.
|
34
|
+
# @since 0.3.0
|
35
35
|
###
|
36
|
-
module
|
37
|
-
def
|
36
|
+
module SearchCmd
|
37
|
+
def build_search_cmd()
|
38
38
|
app = self
|
39
39
|
|
40
|
-
@
|
41
|
-
name '
|
42
|
-
usage '
|
43
|
-
aliases :
|
44
|
-
summary "Search
|
40
|
+
@search_cmd = @app_cmd.define_command() do
|
41
|
+
name 'search'
|
42
|
+
usage 'search [OPTIONS] [COMMAND]...'
|
43
|
+
aliases :se,:sea
|
44
|
+
summary "Search for links to NHK News Web (Easy) (aliases: #{app.color_alias('se sea')})"
|
45
45
|
|
46
46
|
description <<-EOD
|
47
|
-
Search
|
47
|
+
Search for links (using a Search Engine, etc.) to NHK News Web (Easy) &
|
48
48
|
save to folder: #{SearchLinks::DEFAULT_DIR}
|
49
49
|
EOD
|
50
50
|
|
51
51
|
option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
|
52
|
-
|
53
|
-
see '--show
|
52
|
+
file to read instead of URL (for offline testing and/or slow internet;
|
53
|
+
see '--show-*' options)
|
54
54
|
EOD
|
55
55
|
app.check_empty_opt(:in,value)
|
56
56
|
end
|
57
57
|
option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
|
58
58
|
'directory/file' to save links to; if you only specify a directory or a file, it will attach the
|
59
59
|
appropriate default directory/file name
|
60
|
-
(defaults: #{SearchLinks::
|
60
|
+
(defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
|
61
61
|
EOD
|
62
62
|
app.check_empty_opt(:out,value)
|
63
63
|
end
|
64
|
-
option :r,:results,'number of results per page to request from
|
64
|
+
option :r,:results,'number of results per page to request from search',argument: :required,
|
65
65
|
default: SearchScraper::DEFAULT_RESULT_COUNT,transform: -> (value) do
|
66
66
|
value = value.to_i()
|
67
67
|
value = 1 if value < 1
|
@@ -72,21 +72,26 @@ module CLI
|
|
72
72
|
useful for manually writing/updating scripts (but not for use in a variable);
|
73
73
|
implies '--dry-run' option
|
74
74
|
EOD
|
75
|
-
option nil,:'show-urls',<<-EOD
|
76
|
-
show the URLs used when scraping and exit;
|
77
|
-
|
75
|
+
option nil,:'show-urls',<<-EOD
|
76
|
+
show the URLs -- if any -- used when searching & scraping and exit;
|
77
|
+
you can download these for offline testing and/or slow internet
|
78
|
+
(see '--in' option)
|
78
79
|
EOD
|
79
|
-
puts "Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE)}"
|
80
|
-
puts "Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE)}"
|
81
|
-
exit
|
82
|
-
end
|
83
80
|
|
84
81
|
run do |opts,args,cmd|
|
82
|
+
opts.each() do |key,value|
|
83
|
+
key = key.to_s()
|
84
|
+
|
85
|
+
if key.include?('show')
|
86
|
+
raise CLIError.new("must specify a sub command for option[#{key}]")
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
85
90
|
puts cmd.help
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
89
|
-
@
|
94
|
+
@search_easy_cmd = @search_cmd.define_command() do
|
90
95
|
name 'easy'
|
91
96
|
usage 'easy [OPTIONS] [COMMAND]...'
|
92
97
|
aliases :e,:ez
|
@@ -94,16 +99,16 @@ module CLI
|
|
94
99
|
|
95
100
|
description <<-EOD
|
96
101
|
Search for NHK News Web Easy (Yasashii) links &
|
97
|
-
save to file: #{SearchLinks::
|
102
|
+
save to file: #{SearchLinks::DEFAULT_YASASHII_FILE}
|
98
103
|
EOD
|
99
104
|
|
100
105
|
run do |opts,args,cmd|
|
101
106
|
app.refresh_cmd(opts,args,cmd)
|
102
|
-
app.
|
107
|
+
app.run_search_help()
|
103
108
|
end
|
104
109
|
end
|
105
110
|
|
106
|
-
@
|
111
|
+
@search_regular_cmd = @search_cmd.define_command() do
|
107
112
|
name 'regular'
|
108
113
|
usage 'regular [OPTIONS] [COMMAND]...'
|
109
114
|
aliases :r,:reg
|
@@ -111,28 +116,57 @@ module CLI
|
|
111
116
|
|
112
117
|
description <<-EOD
|
113
118
|
Search for NHK News Web Regular (Futsuu) links &
|
114
|
-
save to file: #{SearchLinks::
|
119
|
+
save to file: #{SearchLinks::DEFAULT_FUTSUU_FILE}
|
115
120
|
EOD
|
116
121
|
|
117
122
|
run do |opts,args,cmd|
|
118
123
|
app.refresh_cmd(opts,args,cmd)
|
119
|
-
app.
|
124
|
+
app.run_search_help()
|
120
125
|
end
|
121
126
|
end
|
127
|
+
|
128
|
+
@search_bing_cmd = Cri::Command.define() do
|
129
|
+
name 'bing'
|
130
|
+
usage 'bing [OPTIONS] [COMMAND]...'
|
131
|
+
aliases :b
|
132
|
+
summary "Search bing.com for links (aliases: #{app.color_alias('b')})"
|
133
|
+
|
134
|
+
description <<-EOD
|
135
|
+
Search bing.com for links & save to folder: #{SearchLinks::DEFAULT_DIR}
|
136
|
+
EOD
|
137
|
+
|
138
|
+
run do |opts,args,cmd|
|
139
|
+
app.refresh_cmd(opts,args,cmd)
|
140
|
+
app.run_search_cmd(cmd.supercommand.name.to_sym(),:bing)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
# dup()/clone() must be called for `cmd.supercommand` to work appropriately.
|
145
|
+
@search_easy_cmd.add_command @search_bing_cmd.dup()
|
146
|
+
@search_regular_cmd.add_command @search_bing_cmd.dup()
|
122
147
|
end
|
123
148
|
|
124
|
-
def
|
149
|
+
def run_search_cmd(nhk_type,search_type)
|
150
|
+
case nhk_type
|
151
|
+
when :easy
|
152
|
+
nhk_type = :yasashii
|
153
|
+
when :regular
|
154
|
+
nhk_type = :futsuu
|
155
|
+
end
|
156
|
+
|
157
|
+
return if show_search_urls(search_type)
|
158
|
+
|
125
159
|
@cmd_opts[:dry_run] = true if @cmd_opts[:show_count]
|
126
160
|
|
127
161
|
build_in_file(:in)
|
128
162
|
|
129
|
-
case
|
163
|
+
case nhk_type
|
130
164
|
when :futsuu
|
131
|
-
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
165
|
+
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
|
132
166
|
when :yasashii
|
133
|
-
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
167
|
+
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
|
134
168
|
else
|
135
|
-
raise ArgumentError,"invalid
|
169
|
+
raise ArgumentError,"invalid nhk_type[#{nhk_type}]"
|
136
170
|
end
|
137
171
|
|
138
172
|
return unless check_in_file(:in,empty_ok: true)
|
@@ -145,7 +179,7 @@ module CLI
|
|
145
179
|
result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?()
|
146
180
|
show_count = @cmd_opts[:show_count]
|
147
181
|
|
148
|
-
start_spin(
|
182
|
+
start_spin("Scraping #{search_type}") unless show_count
|
149
183
|
|
150
184
|
is_file = !in_file.nil?()
|
151
185
|
links = nil
|
@@ -176,30 +210,43 @@ module CLI
|
|
176
210
|
return
|
177
211
|
end
|
178
212
|
|
179
|
-
# Do a range to prevent an infinite loop
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
213
|
+
range = (0..10000) # Do a range to prevent an infinite loop; ichiman!
|
214
|
+
|
215
|
+
case search_type
|
216
|
+
# Anything that extends SearchScraper.
|
217
|
+
when :bing
|
218
|
+
range.each() do
|
219
|
+
scraper = nil
|
220
|
+
|
221
|
+
case search_type
|
222
|
+
when :bing
|
223
|
+
scraper = BingScraper.new(nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs)
|
224
|
+
else
|
225
|
+
raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
|
226
|
+
end
|
227
|
+
|
228
|
+
next_page = scraper.scrape(links,next_page)
|
229
|
+
|
230
|
+
new_links.concat(links.links.values[links_count..-1])
|
231
|
+
links_count = links.length
|
232
|
+
page_count = next_page.count if next_page.count > 0
|
233
|
+
|
234
|
+
update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}, " +
|
235
|
+
"new_links=#{new_links.length})")
|
236
|
+
|
237
|
+
break if next_page.empty?()
|
238
|
+
|
239
|
+
page_num += 1
|
240
|
+
url = next_page.url
|
241
|
+
|
242
|
+
sleep_scraper()
|
243
|
+
end
|
244
|
+
else
|
245
|
+
raise ArgumentError,"invalid search_type[#{search_type}]"
|
198
246
|
end
|
199
247
|
|
200
248
|
stop_spin()
|
201
249
|
puts
|
202
|
-
|
203
250
|
puts 'Last URL scraped:'
|
204
251
|
puts "> #{url}"
|
205
252
|
puts
|
@@ -215,6 +262,32 @@ module CLI
|
|
215
262
|
puts "> #{out_file}"
|
216
263
|
end
|
217
264
|
end
|
265
|
+
|
266
|
+
def run_search_help()
|
267
|
+
if @cmd_opts[:show_count] || @cmd_opts[:show_urls]
|
268
|
+
run_search_cmd(@cmd.name.to_sym(),nil)
|
269
|
+
else
|
270
|
+
puts @cmd.help
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
def show_search_urls(search_type)
|
275
|
+
return false unless @cmd_opts[:show_urls]
|
276
|
+
|
277
|
+
count = @cmd_opts[:results]
|
278
|
+
count = SearchScraper::DEFAULT_RESULT_COUNT if count.nil?()
|
279
|
+
|
280
|
+
case search_type
|
281
|
+
when :bing
|
282
|
+
puts 'Bing:'
|
283
|
+
puts "> Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE,count: count)}"
|
284
|
+
puts "> Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE,count: count)}"
|
285
|
+
else
|
286
|
+
raise CLIError.new('must specify a sub command for option[show-urls]')
|
287
|
+
end
|
288
|
+
|
289
|
+
return true
|
290
|
+
end
|
218
291
|
end
|
219
292
|
end
|
220
293
|
end
|