nhkore 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +35 -1
- data/README.md +305 -17
- data/Rakefile +10 -13
- data/lib/nhkore.rb +2 -1
- data/lib/nhkore/app.rb +66 -43
- data/lib/nhkore/article_scraper.rb +2 -2
- data/lib/nhkore/cli/fx_cmd.rb +1 -1
- data/lib/nhkore/cli/get_cmd.rb +27 -12
- data/lib/nhkore/cli/news_cmd.rb +19 -7
- data/lib/nhkore/cli/{bing_cmd.rb → search_cmd.rb} +125 -52
- data/lib/nhkore/scraper.rb +123 -59
- data/lib/nhkore/search_link.rb +4 -4
- data/lib/nhkore/search_scraper.rb +70 -15
- data/lib/nhkore/user_agents.rb +1179 -0
- data/lib/nhkore/util.rb +36 -1
- data/lib/nhkore/version.rb +1 -1
- data/nhkore.gemspec +30 -18
- metadata +22 -4
data/lib/nhkore.rb
CHANGED
@@ -46,15 +46,16 @@ require 'nhkore/search_link'
|
|
46
46
|
require 'nhkore/search_scraper'
|
47
47
|
require 'nhkore/sifter'
|
48
48
|
require 'nhkore/splitter'
|
49
|
+
require 'nhkore/user_agents'
|
49
50
|
require 'nhkore/util'
|
50
51
|
require 'nhkore/variator'
|
51
52
|
require 'nhkore/version'
|
52
53
|
require 'nhkore/word'
|
53
54
|
|
54
|
-
require 'nhkore/cli/bing_cmd'
|
55
55
|
require 'nhkore/cli/fx_cmd'
|
56
56
|
require 'nhkore/cli/get_cmd'
|
57
57
|
require 'nhkore/cli/news_cmd'
|
58
|
+
require 'nhkore/cli/search_cmd'
|
58
59
|
require 'nhkore/cli/sift_cmd'
|
59
60
|
|
60
61
|
|
data/lib/nhkore/app.rb
CHANGED
@@ -24,6 +24,7 @@
|
|
24
24
|
require 'cri'
|
25
25
|
require 'highline'
|
26
26
|
require 'rainbow'
|
27
|
+
require 'set'
|
27
28
|
require 'tty-progressbar'
|
28
29
|
require 'tty-spinner'
|
29
30
|
|
@@ -31,10 +32,10 @@ require 'nhkore/error'
|
|
31
32
|
require 'nhkore/util'
|
32
33
|
require 'nhkore/version'
|
33
34
|
|
34
|
-
require 'nhkore/cli/bing_cmd'
|
35
35
|
require 'nhkore/cli/fx_cmd'
|
36
36
|
require 'nhkore/cli/get_cmd'
|
37
37
|
require 'nhkore/cli/news_cmd'
|
38
|
+
require 'nhkore/cli/search_cmd'
|
38
39
|
require 'nhkore/cli/sift_cmd'
|
39
40
|
|
40
41
|
|
@@ -47,30 +48,20 @@ module NHKore
|
|
47
48
|
end
|
48
49
|
|
49
50
|
###
|
50
|
-
# For disabling color output.
|
51
|
+
# For disabling/enabling color output.
|
51
52
|
#
|
52
53
|
# @author Jonathan Bradley Whited (@esotericpig)
|
53
|
-
# @since 0.2.
|
54
|
+
# @since 0.2.1
|
54
55
|
###
|
55
|
-
module
|
56
|
-
|
57
|
-
return str
|
58
|
-
end
|
59
|
-
|
60
|
-
def bold(str)
|
61
|
-
return str
|
62
|
-
end
|
56
|
+
module CriColorExt
|
57
|
+
@@color = true
|
63
58
|
|
64
|
-
def
|
65
|
-
|
59
|
+
def color=(color)
|
60
|
+
@@color = color
|
66
61
|
end
|
67
62
|
|
68
|
-
def
|
69
|
-
return
|
70
|
-
end
|
71
|
-
|
72
|
-
def yellow(str)
|
73
|
-
return str
|
63
|
+
def color?(io)
|
64
|
+
return @@color
|
74
65
|
end
|
75
66
|
end
|
76
67
|
|
@@ -79,14 +70,19 @@ module NHKore
|
|
79
70
|
# @since 0.2.0
|
80
71
|
###
|
81
72
|
class App
|
82
|
-
include CLI::BingCmd
|
83
73
|
include CLI::FXCmd
|
84
74
|
include CLI::GetCmd
|
85
75
|
include CLI::NewsCmd
|
76
|
+
include CLI::SearchCmd
|
86
77
|
include CLI::SiftCmd
|
87
78
|
|
88
79
|
NAME = 'nhkore'
|
89
80
|
|
81
|
+
DEFAULT_SLEEP_TIME = 0.1 # So that sites don't ban us (i.e., think we are human)
|
82
|
+
|
83
|
+
COLOR_OPTS = [:c,:color]
|
84
|
+
NO_COLOR_OPTS = [:C,:'no-color']
|
85
|
+
|
90
86
|
SPINNER_MSG = '[:spinner] :title:detail...'
|
91
87
|
CLASSIC_SPINNER = TTY::Spinner.new(SPINNER_MSG,format: :classic)
|
92
88
|
DEFAULT_SPINNER = TTY::Spinner.new(SPINNER_MSG,interval: 5,
|
@@ -94,8 +90,9 @@ module NHKore
|
|
94
90
|
NO_SPINNER = {} # Still outputs status & stores tokens
|
95
91
|
NO_SPINNER_MSG = '%{title}%{detail}...'
|
96
92
|
|
97
|
-
|
98
|
-
|
93
|
+
attr_reader :cmd
|
94
|
+
attr_reader :cmd_args
|
95
|
+
attr_reader :cmd_opts
|
99
96
|
attr_accessor :progress_bar
|
100
97
|
attr_accessor :scraper_kargs
|
101
98
|
attr_accessor :sleep_time
|
@@ -119,10 +116,10 @@ module NHKore
|
|
119
116
|
|
120
117
|
build_app_cmd()
|
121
118
|
|
122
|
-
build_bing_cmd()
|
123
119
|
build_fx_cmd()
|
124
120
|
build_get_cmd()
|
125
121
|
build_news_cmd()
|
122
|
+
build_search_cmd()
|
126
123
|
build_sift_cmd()
|
127
124
|
build_version_cmd()
|
128
125
|
|
@@ -130,18 +127,24 @@ module NHKore
|
|
130
127
|
end
|
131
128
|
|
132
129
|
def autodetect_color()
|
133
|
-
|
130
|
+
Cri::Platform.singleton_class.prepend(CriColorExt)
|
131
|
+
|
132
|
+
color = nil # Must be nil, not true/false
|
134
133
|
|
135
|
-
if
|
136
|
-
disable = true
|
137
|
-
elsif !@args.empty?()
|
134
|
+
if !@args.empty?()
|
138
135
|
# Kind of hacky, but necessary for Rainbow.
|
139
136
|
|
140
|
-
|
137
|
+
color_opts = opts_to_set(COLOR_OPTS)
|
138
|
+
no_color_opts = opts_to_set(NO_COLOR_OPTS)
|
141
139
|
|
142
140
|
@args.each() do |arg|
|
143
|
-
if
|
144
|
-
|
141
|
+
if color_opts.include?(arg)
|
142
|
+
color = true
|
143
|
+
break
|
144
|
+
end
|
145
|
+
|
146
|
+
if no_color_opts.include?(arg)
|
147
|
+
color = false
|
145
148
|
break
|
146
149
|
end
|
147
150
|
|
@@ -149,11 +152,11 @@ module NHKore
|
|
149
152
|
end
|
150
153
|
end
|
151
154
|
|
152
|
-
if
|
153
|
-
|
154
|
-
else
|
155
|
-
@rainbow.enabled = true # Force it in case Rainbow auto-disabled it
|
155
|
+
if color.nil?()
|
156
|
+
color = ($stdout.tty?() && ENV['TERM'] != 'dumb')
|
156
157
|
end
|
158
|
+
|
159
|
+
enable_color(color)
|
157
160
|
end
|
158
161
|
|
159
162
|
def build_app_cmd()
|
@@ -171,12 +174,15 @@ module NHKore
|
|
171
174
|
This is similar to a core word/vocabulary list.
|
172
175
|
EOD
|
173
176
|
|
174
|
-
flag :
|
177
|
+
flag :s,:'classic-fx',<<-EOD do |value,cmd|
|
175
178
|
use classic spinner/progress special effects (in case of no Unicode support) when running long tasks
|
176
179
|
EOD
|
177
180
|
app.progress_bar = :classic
|
178
181
|
app.spinner = CLASSIC_SPINNER
|
179
182
|
end
|
183
|
+
flag COLOR_OPTS[0],COLOR_OPTS[1],%q{force color output (for commands like '| less -R')} do |value,cmd|
|
184
|
+
app.enable_color(true)
|
185
|
+
end
|
180
186
|
flag :n,:'dry-run',<<-EOD
|
181
187
|
do a dry run without making changes; do not write to files, create directories, etc.
|
182
188
|
EOD
|
@@ -194,8 +200,8 @@ module NHKore
|
|
194
200
|
|
195
201
|
app.scraper_kargs[:max_retries] = value
|
196
202
|
end
|
197
|
-
flag
|
198
|
-
app.
|
203
|
+
flag NO_COLOR_OPTS[0],NO_COLOR_OPTS[1],'disable color output' do |value,cmd|
|
204
|
+
app.enable_color(false)
|
199
205
|
end
|
200
206
|
flag :X,:'no-fx','disable spinner/progress special effects when running long tasks' do |value,cmd|
|
201
207
|
app.progress_bar = :no
|
@@ -223,7 +229,7 @@ module NHKore
|
|
223
229
|
app.sleep_time = value.to_f()
|
224
230
|
app.sleep_time = 0.0 if app.sleep_time < 0.0
|
225
231
|
end
|
226
|
-
option :t,:
|
232
|
+
option :t,:timeout,<<-EOD,argument: :required do |value,cmd|
|
227
233
|
seconds for all URL timeouts: [open, read] (-1 or decimal >= 0)
|
228
234
|
EOD
|
229
235
|
value = value.to_f()
|
@@ -232,6 +238,14 @@ module NHKore
|
|
232
238
|
app.scraper_kargs[:open_timeout] = value
|
233
239
|
app.scraper_kargs[:read_timeout] = value
|
234
240
|
end
|
241
|
+
option :u,:'user-agent',<<-EOD,argument: :required do |value,cmd|
|
242
|
+
HTTP header field 'User-Agent' to use instead of a random one
|
243
|
+
EOD
|
244
|
+
value = app.check_empty_opt(:'user-agent',value)
|
245
|
+
|
246
|
+
app.scraper_kargs[:header] ||= {}
|
247
|
+
app.scraper_kargs[:header]['user-agent'] = value
|
248
|
+
end
|
235
249
|
# Big V, not small.
|
236
250
|
flag :V,:version,'show the version and exit' do |value,cmd|
|
237
251
|
app.show_version()
|
@@ -399,8 +413,8 @@ module NHKore
|
|
399
413
|
|
400
414
|
force = @cmd_opts[:force]
|
401
415
|
|
402
|
-
if !force && Dir.exist?(out_dir)
|
403
|
-
puts 'Warning: output directory already exists!'
|
416
|
+
if !force && Dir.exist?(out_dir) && !Dir.empty?(out_dir)
|
417
|
+
puts 'Warning: output directory already exists with files!'
|
404
418
|
puts ' : Files inside of this directory may be overwritten!'
|
405
419
|
puts "> '#{out_dir}'"
|
406
420
|
|
@@ -478,9 +492,18 @@ module NHKore
|
|
478
492
|
return color(str).green
|
479
493
|
end
|
480
494
|
|
481
|
-
def
|
482
|
-
Cri::
|
483
|
-
@rainbow.enabled =
|
495
|
+
def enable_color(enabled)
|
496
|
+
Cri::Platform.color = enabled
|
497
|
+
@rainbow.enabled = enabled
|
498
|
+
end
|
499
|
+
|
500
|
+
def opts_to_set(ary)
|
501
|
+
set = Set.new()
|
502
|
+
|
503
|
+
set.add("-#{ary[0].to_s()}") unless ary[0].nil?()
|
504
|
+
set.add("--#{ary[1].to_s()}") unless ary[1].nil?()
|
505
|
+
|
506
|
+
return set
|
484
507
|
end
|
485
508
|
|
486
509
|
def refresh_cmd(opts,args,cmd)
|
@@ -124,7 +124,7 @@ module NHKore
|
|
124
124
|
# - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
|
125
125
|
# - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'
|
126
126
|
|
127
|
-
|
127
|
+
read()
|
128
128
|
|
129
129
|
# To add a new one, simply add '|(...)' on a newline and test $#.
|
130
130
|
@str_or_io = @str_or_io.gsub(/
|
@@ -281,7 +281,7 @@ module NHKore
|
|
281
281
|
scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
|
282
282
|
rescue OpenURI::HTTPError => e
|
283
283
|
if retries == 0 && e.to_s().include?('404')
|
284
|
-
|
284
|
+
read()
|
285
285
|
|
286
286
|
scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)
|
287
287
|
|
data/lib/nhkore/cli/fx_cmd.rb
CHANGED
data/lib/nhkore/cli/get_cmd.rb
CHANGED
@@ -99,15 +99,14 @@ module CLI
|
|
99
99
|
|
100
100
|
return if dry_run
|
101
101
|
|
102
|
-
Tempfile.create([App::NAME,'.zip'],binmode: true) do |file|
|
102
|
+
Tempfile.create(["#{App::NAME}_",'.zip'],binmode: true) do |file|
|
103
103
|
puts
|
104
|
-
puts
|
104
|
+
puts "Downloading #{GET_URL_FILENAME} to temp file:"
|
105
105
|
puts "> #{file.path}"
|
106
|
-
puts
|
107
106
|
|
108
107
|
len = down.size
|
109
|
-
len = DEFAULT_GET_LENGTH if len.nil?()
|
110
|
-
bar = build_progress_bar(
|
108
|
+
len = DEFAULT_GET_LENGTH if len.nil?() || len < 1
|
109
|
+
bar = build_progress_bar('> Downloading',download: true,total: len)
|
111
110
|
|
112
111
|
bar.start()
|
113
112
|
|
@@ -120,9 +119,12 @@ module CLI
|
|
120
119
|
file.close()
|
121
120
|
bar.finish()
|
122
121
|
|
123
|
-
|
122
|
+
puts
|
123
|
+
puts "Extracting #{GET_URL_FILENAME}..."
|
124
124
|
|
125
|
-
|
125
|
+
# We manually ask the user whether to overwrite each file, so set this to
|
126
|
+
# true so that Zip extract() will force overwrites and not raise an error.
|
127
|
+
Zip.on_exists_proc = true
|
126
128
|
|
127
129
|
Zip::File.open(file) do |zip_file|
|
128
130
|
zip_file.each() do |entry|
|
@@ -130,17 +132,30 @@ module CLI
|
|
130
132
|
raise ZipError,"unsafe entry name[#{entry.name}] in Zip file"
|
131
133
|
end
|
132
134
|
|
133
|
-
name = File.basename(entry.name)
|
135
|
+
name = Util.strip_web_str(File.basename(entry.name))
|
136
|
+
|
137
|
+
next if name.empty?()
|
138
|
+
|
139
|
+
out_file = File.join(out_dir,name)
|
134
140
|
|
135
|
-
|
141
|
+
puts "> #{name}"
|
136
142
|
|
137
|
-
|
143
|
+
if !force && File.exist?(out_file)
|
144
|
+
puts
|
145
|
+
puts 'Warning: output file already exists!'
|
146
|
+
puts "> '#{out_file}'"
|
147
|
+
|
148
|
+
overwrite = @high.agree('Overwrite this file (yes/no)? ')
|
149
|
+
puts
|
150
|
+
|
151
|
+
next unless overwrite
|
152
|
+
end
|
153
|
+
|
154
|
+
entry.extract(out_file)
|
138
155
|
end
|
139
156
|
end
|
140
157
|
|
141
|
-
stop_spin()
|
142
158
|
puts
|
143
|
-
|
144
159
|
puts "Extracted #{GET_URL_FILENAME} to directory:"
|
145
160
|
puts "> #{out_dir}"
|
146
161
|
end
|
data/lib/nhkore/cli/news_cmd.rb
CHANGED
@@ -82,8 +82,8 @@ module CLI
|
|
82
82
|
value
|
83
83
|
end
|
84
84
|
option :l,:links,<<-EOD,argument: :required,transform: -> (value) do
|
85
|
-
'directory/file' of article links
|
86
|
-
defaults: #{SearchLinks::
|
85
|
+
'directory/file' of article links to scrape (see '#{App::NAME} search';
|
86
|
+
defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
|
87
87
|
EOD
|
88
88
|
app.check_empty_opt(:links,value)
|
89
89
|
end
|
@@ -170,12 +170,12 @@ module CLI
|
|
170
170
|
|
171
171
|
case type
|
172
172
|
when :futsuu
|
173
|
-
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
173
|
+
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
|
174
174
|
build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)
|
175
175
|
|
176
176
|
news_name = 'Regular'
|
177
177
|
when :yasashii
|
178
|
-
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
178
|
+
build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
|
179
179
|
build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)
|
180
180
|
|
181
181
|
news_name = 'Easy'
|
@@ -236,10 +236,22 @@ module CLI
|
|
236
236
|
})
|
237
237
|
|
238
238
|
if url.nil?()
|
239
|
-
|
239
|
+
# Why store each() and do `links_len` instead of `links_len - 1`?
|
240
|
+
#
|
241
|
+
# If links contains 5 entries and you scrape all 5, then the output of
|
242
|
+
# update_spin_detail() will end on 4, so all of this complexity is so
|
243
|
+
# that update_spin_detail() only needs to be written/updated on one line.
|
244
|
+
|
245
|
+
links_each = links.links.values.each()
|
246
|
+
links_len = links.length()
|
247
|
+
|
248
|
+
0.upto(links_len) do |i|
|
240
249
|
update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")
|
241
250
|
|
242
|
-
break if scrape_count >= max_scrapes
|
251
|
+
break if i >= links_len || scrape_count >= max_scrapes
|
252
|
+
|
253
|
+
link = links_each.next()
|
254
|
+
|
243
255
|
next if !like.nil?() && !link.url.to_s().downcase().include?(like)
|
244
256
|
next if !redo_scrapes && scraped_news_article?(news,link)
|
245
257
|
|
@@ -248,7 +260,7 @@ module CLI
|
|
248
260
|
if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
|
249
261
|
# --show-dict
|
250
262
|
url = new_url
|
251
|
-
scrape_count = max_scrapes - 1
|
263
|
+
scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
|
252
264
|
end
|
253
265
|
|
254
266
|
# Break on next iteration for update_spin_detail().
|
@@ -31,37 +31,37 @@ module NHKore
|
|
31
31
|
module CLI
|
32
32
|
###
|
33
33
|
# @author Jonathan Bradley Whited (@esotericpig)
|
34
|
-
# @since 0.
|
34
|
+
# @since 0.3.0
|
35
35
|
###
|
36
|
-
module
|
37
|
-
def
|
36
|
+
module SearchCmd
|
37
|
+
def build_search_cmd()
|
38
38
|
app = self
|
39
39
|
|
40
|
-
@
|
41
|
-
name '
|
42
|
-
usage '
|
43
|
-
aliases :
|
44
|
-
summary "Search
|
40
|
+
@search_cmd = @app_cmd.define_command() do
|
41
|
+
name 'search'
|
42
|
+
usage 'search [OPTIONS] [COMMAND]...'
|
43
|
+
aliases :se,:sea
|
44
|
+
summary "Search for links to NHK News Web (Easy) (aliases: #{app.color_alias('se sea')})"
|
45
45
|
|
46
46
|
description <<-EOD
|
47
|
-
Search
|
47
|
+
Search for links (using a Search Engine, etc.) to NHK News Web (Easy) &
|
48
48
|
save to folder: #{SearchLinks::DEFAULT_DIR}
|
49
49
|
EOD
|
50
50
|
|
51
51
|
option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
|
52
|
-
|
53
|
-
see '--show
|
52
|
+
file to read instead of URL (for offline testing and/or slow internet;
|
53
|
+
see '--show-*' options)
|
54
54
|
EOD
|
55
55
|
app.check_empty_opt(:in,value)
|
56
56
|
end
|
57
57
|
option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
|
58
58
|
'directory/file' to save links to; if you only specify a directory or a file, it will attach the
|
59
59
|
appropriate default directory/file name
|
60
|
-
(defaults: #{SearchLinks::
|
60
|
+
(defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
|
61
61
|
EOD
|
62
62
|
app.check_empty_opt(:out,value)
|
63
63
|
end
|
64
|
-
option :r,:results,'number of results per page to request from
|
64
|
+
option :r,:results,'number of results per page to request from search',argument: :required,
|
65
65
|
default: SearchScraper::DEFAULT_RESULT_COUNT,transform: -> (value) do
|
66
66
|
value = value.to_i()
|
67
67
|
value = 1 if value < 1
|
@@ -72,21 +72,26 @@ module CLI
|
|
72
72
|
useful for manually writing/updating scripts (but not for use in a variable);
|
73
73
|
implies '--dry-run' option
|
74
74
|
EOD
|
75
|
-
option nil,:'show-urls',<<-EOD
|
76
|
-
show the URLs used when scraping and exit;
|
77
|
-
|
75
|
+
option nil,:'show-urls',<<-EOD
|
76
|
+
show the URLs -- if any -- used when searching & scraping and exit;
|
77
|
+
you can download these for offline testing and/or slow internet
|
78
|
+
(see '--in' option)
|
78
79
|
EOD
|
79
|
-
puts "Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE)}"
|
80
|
-
puts "Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE)}"
|
81
|
-
exit
|
82
|
-
end
|
83
80
|
|
84
81
|
run do |opts,args,cmd|
|
82
|
+
opts.each() do |key,value|
|
83
|
+
key = key.to_s()
|
84
|
+
|
85
|
+
if key.include?('show')
|
86
|
+
raise CLIError.new("must specify a sub command for option[#{key}]")
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
85
90
|
puts cmd.help
|
86
91
|
end
|
87
92
|
end
|
88
93
|
|
89
|
-
@
|
94
|
+
@search_easy_cmd = @search_cmd.define_command() do
|
90
95
|
name 'easy'
|
91
96
|
usage 'easy [OPTIONS] [COMMAND]...'
|
92
97
|
aliases :e,:ez
|
@@ -94,16 +99,16 @@ module CLI
|
|
94
99
|
|
95
100
|
description <<-EOD
|
96
101
|
Search for NHK News Web Easy (Yasashii) links &
|
97
|
-
save to file: #{SearchLinks::
|
102
|
+
save to file: #{SearchLinks::DEFAULT_YASASHII_FILE}
|
98
103
|
EOD
|
99
104
|
|
100
105
|
run do |opts,args,cmd|
|
101
106
|
app.refresh_cmd(opts,args,cmd)
|
102
|
-
app.
|
107
|
+
app.run_search_help()
|
103
108
|
end
|
104
109
|
end
|
105
110
|
|
106
|
-
@
|
111
|
+
@search_regular_cmd = @search_cmd.define_command() do
|
107
112
|
name 'regular'
|
108
113
|
usage 'regular [OPTIONS] [COMMAND]...'
|
109
114
|
aliases :r,:reg
|
@@ -111,28 +116,57 @@ module CLI
|
|
111
116
|
|
112
117
|
description <<-EOD
|
113
118
|
Search for NHK News Web Regular (Futsuu) links &
|
114
|
-
save to file: #{SearchLinks::
|
119
|
+
save to file: #{SearchLinks::DEFAULT_FUTSUU_FILE}
|
115
120
|
EOD
|
116
121
|
|
117
122
|
run do |opts,args,cmd|
|
118
123
|
app.refresh_cmd(opts,args,cmd)
|
119
|
-
app.
|
124
|
+
app.run_search_help()
|
120
125
|
end
|
121
126
|
end
|
127
|
+
|
128
|
+
@search_bing_cmd = Cri::Command.define() do
|
129
|
+
name 'bing'
|
130
|
+
usage 'bing [OPTIONS] [COMMAND]...'
|
131
|
+
aliases :b
|
132
|
+
summary "Search bing.com for links (aliases: #{app.color_alias('b')})"
|
133
|
+
|
134
|
+
description <<-EOD
|
135
|
+
Search bing.com for links & save to folder: #{SearchLinks::DEFAULT_DIR}
|
136
|
+
EOD
|
137
|
+
|
138
|
+
run do |opts,args,cmd|
|
139
|
+
app.refresh_cmd(opts,args,cmd)
|
140
|
+
app.run_search_cmd(cmd.supercommand.name.to_sym(),:bing)
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
# dup()/clone() must be called for `cmd.supercommand` to work appropriately.
|
145
|
+
@search_easy_cmd.add_command @search_bing_cmd.dup()
|
146
|
+
@search_regular_cmd.add_command @search_bing_cmd.dup()
|
122
147
|
end
|
123
148
|
|
124
|
-
def
|
149
|
+
def run_search_cmd(nhk_type,search_type)
|
150
|
+
case nhk_type
|
151
|
+
when :easy
|
152
|
+
nhk_type = :yasashii
|
153
|
+
when :regular
|
154
|
+
nhk_type = :futsuu
|
155
|
+
end
|
156
|
+
|
157
|
+
return if show_search_urls(search_type)
|
158
|
+
|
125
159
|
@cmd_opts[:dry_run] = true if @cmd_opts[:show_count]
|
126
160
|
|
127
161
|
build_in_file(:in)
|
128
162
|
|
129
|
-
case
|
163
|
+
case nhk_type
|
130
164
|
when :futsuu
|
131
|
-
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
165
|
+
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
|
132
166
|
when :yasashii
|
133
|
-
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::
|
167
|
+
build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
|
134
168
|
else
|
135
|
-
raise ArgumentError,"invalid
|
169
|
+
raise ArgumentError,"invalid nhk_type[#{nhk_type}]"
|
136
170
|
end
|
137
171
|
|
138
172
|
return unless check_in_file(:in,empty_ok: true)
|
@@ -145,7 +179,7 @@ module CLI
|
|
145
179
|
result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?()
|
146
180
|
show_count = @cmd_opts[:show_count]
|
147
181
|
|
148
|
-
start_spin(
|
182
|
+
start_spin("Scraping #{search_type}") unless show_count
|
149
183
|
|
150
184
|
is_file = !in_file.nil?()
|
151
185
|
links = nil
|
@@ -176,30 +210,43 @@ module CLI
|
|
176
210
|
return
|
177
211
|
end
|
178
212
|
|
179
|
-
# Do a range to prevent an infinite loop
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
213
|
+
range = (0..10000) # Do a range to prevent an infinite loop; ichiman!
|
214
|
+
|
215
|
+
case search_type
|
216
|
+
# Anything that extends SearchScraper.
|
217
|
+
when :bing
|
218
|
+
range.each() do
|
219
|
+
scraper = nil
|
220
|
+
|
221
|
+
case search_type
|
222
|
+
when :bing
|
223
|
+
scraper = BingScraper.new(nhk_type,count: result_count,is_file: is_file,url: url,**@scraper_kargs)
|
224
|
+
else
|
225
|
+
raise NHKore::Error,"internal code broken; add missing search_type[#{search_type}]"
|
226
|
+
end
|
227
|
+
|
228
|
+
next_page = scraper.scrape(links,next_page)
|
229
|
+
|
230
|
+
new_links.concat(links.links.values[links_count..-1])
|
231
|
+
links_count = links.length
|
232
|
+
page_count = next_page.count if next_page.count > 0
|
233
|
+
|
234
|
+
update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}, " +
|
235
|
+
"new_links=#{new_links.length})")
|
236
|
+
|
237
|
+
break if next_page.empty?()
|
238
|
+
|
239
|
+
page_num += 1
|
240
|
+
url = next_page.url
|
241
|
+
|
242
|
+
sleep_scraper()
|
243
|
+
end
|
244
|
+
else
|
245
|
+
raise ArgumentError,"invalid search_type[#{search_type}]"
|
198
246
|
end
|
199
247
|
|
200
248
|
stop_spin()
|
201
249
|
puts
|
202
|
-
|
203
250
|
puts 'Last URL scraped:'
|
204
251
|
puts "> #{url}"
|
205
252
|
puts
|
@@ -215,6 +262,32 @@ module CLI
|
|
215
262
|
puts "> #{out_file}"
|
216
263
|
end
|
217
264
|
end
|
265
|
+
|
266
|
+
def run_search_help()
|
267
|
+
if @cmd_opts[:show_count] || @cmd_opts[:show_urls]
|
268
|
+
run_search_cmd(@cmd.name.to_sym(),nil)
|
269
|
+
else
|
270
|
+
puts @cmd.help
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
def show_search_urls(search_type)
|
275
|
+
return false unless @cmd_opts[:show_urls]
|
276
|
+
|
277
|
+
count = @cmd_opts[:results]
|
278
|
+
count = SearchScraper::DEFAULT_RESULT_COUNT if count.nil?()
|
279
|
+
|
280
|
+
case search_type
|
281
|
+
when :bing
|
282
|
+
puts 'Bing:'
|
283
|
+
puts "> Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE,count: count)}"
|
284
|
+
puts "> Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE,count: count)}"
|
285
|
+
else
|
286
|
+
raise CLIError.new('must specify a sub command for option[show-urls]')
|
287
|
+
end
|
288
|
+
|
289
|
+
return true
|
290
|
+
end
|
218
291
|
end
|
219
292
|
end
|
220
293
|
end
|