nhkore 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/util'
25
+ require 'nhkore/word'
26
+
27
+
28
+ module NHKore
29
+ ###
30
+ # @author Jonathan Bradley Whited (@esotericpig)
31
+ # @since 0.2.0
32
+ ###
33
class Cleaner
  # Hook that runs first in {#clean}. The base implementation is a
  # pass-through; subclasses may override it.
  #
  # @param str [String] the text to clean
  # @return [String] +str+ unchanged
  def begin_clean(str)
    return str
  end

  # Cleans +str+ by running {#begin_clean} and then {#end_clean} on it.
  #
  # @param str [String] the text to clean
  # @return [String] the result of both hooks applied in order
  def clean(str)
    return end_clean(begin_clean(str))
  end

  # Hook that runs last in {#clean}. The base implementation is a
  # pass-through (mirroring {#begin_clean}) so that a plain Cleaner can be
  # used without raising NoMethodError; subclasses override it.
  #
  # @param str [String] the text to clean
  # @return [String] +str+ unchanged
  def end_clean(str)
    return str
  end

  # Runs every cleaner over +obj+.
  #
  # @param obj [Word,String,nil] the object to clean; a Word has its kana and
  #   kanji cleaned recursively and is rebuilt with +word: obj+, anything
  #   else is passed through each cleaner's +clean+ in order
  # @param cleaners [Array<Cleaner>,Cleaner,nil] one or more cleaners
  #   (coerced with +Array()+)
  # @return [Word,String,nil] +nil+ if +obj+ is +nil+, +obj+ itself if there
  #   are no cleaners, else the cleaned object
  def self.clean_any(obj,cleaners)
    return nil if obj.nil?

    cleaners = Array(cleaners)

    return obj if cleaners.empty?

    if obj.is_a?(Word)
      return Word.new(
        kana: clean_any(obj.kana,cleaners),
        kanji: clean_any(obj.kanji,cleaners),
        word: obj
      )
    end

    # String: feed the output of each cleaner into the next one.
    return cleaners.reduce(obj) { |str,cleaner| cleaner.clean(str) }
  end
end
67
+
68
+ ###
69
+ # @author Jonathan Bradley Whited (@esotericpig)
70
+ # @since 0.2.0
71
+ ###
72
class BasicCleaner < Cleaner
  # Final cleaning step: hands +str+ to Util.unspace_web_str and returns
  # whatever that yields (per the original author: "Who needs space in
  # Japanese?").
  #
  # Kept deliberately simple. According to the original author's notes,
  # Splitter splits on punctuation and Polisher removes the leftover
  # punctuation, digits, etc.; making this stricter causes errors to be
  # raised in ArticleScraper's scrape_dicwin_word() & scrape_ruby_word().
  #
  # @param str [String] the text to clean
  # @return the value returned by Util.unspace_web_str
  def end_clean(str)
    Util.unspace_web_str(str)
  end
end
84
+
85
+ ###
86
+ # @author Jonathan Bradley Whited (@esotericpig)
87
+ # @since 0.2.0
88
+ ###
89
# Adds nothing of its own; every method is inherited from BasicCleaner.
class BestCleaner < BasicCleaner; end
91
+ end
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/error'
25
+ require 'nhkore/search_link'
26
+ require 'nhkore/search_scraper'
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ module CLI
32
+ ###
33
+ # @author Jonathan Bradley Whited (@esotericpig)
34
+ # @since 0.2.0
35
+ ###
36
+ module BingCmd
37
# Defines the 'bing' command on @app_cmd (stored in @bing_cmd) together with
# its 'easy' and 'regular' sub-commands (stored in @bing_easy_cmd and
# @bing_regular_cmd).
#
# The sub-commands refresh the app's command state and then call
# run_bing_cmd() with :yasashii or :futsuu; the bare 'bing' command only
# prints its help.
def build_bing_cmd
  # The DSL blocks below reference the app through this local.
  app = self

  @bing_cmd = @app_cmd.define_command do
    name    'bing'
    usage   'bing [OPTIONS] [COMMAND]...'
    aliases :b
    summary "Search bing.com for links to NHK News Web (Easy) (aliases: #{app.color_alias('b')})"

    description <<-EOD
      Search bing.com for links to NHK News Web (Easy) &
      save to folder: #{SearchLinks::DEFAULT_DIR}
    EOD

    option :i, :in, <<-EOD, argument: :required, transform: ->(value) { app.check_empty_opt(:in, value) }
      HTML file to read instead of URL (for offline testing and/or slow internet;
      see '--show-urls' option)
    EOD
    option :o, :out, <<-EOD, argument: :required, transform: ->(value) { app.check_empty_opt(:out, value) }
      'directory/file' to save links to; if you only specify a directory or a file, it will attach the
      appropriate default directory/file name
      (defaults: #{SearchLinks::DEFAULT_BING_YASASHII_FILE}, #{SearchLinks::DEFAULT_BING_FUTSUU_FILE})
    EOD
    # Anything below 1 (including non-numeric input, which to_i turns into 0)
    # is bumped up to 1.
    option :r, :results, 'number of results per page to request from Bing', argument: :required,
           default: SearchScraper::DEFAULT_RESULT_COUNT, transform: ->(value) { [value.to_i, 1].max }
    option nil, :'show-count', <<-EOD
      show the number of links scraped and exit;
      useful for manually writing/updating scripts (but not for use in a variable);
      implies '--dry-run' option
    EOD
    option nil, :'show-urls', <<-EOD do |_value, _cmd|
      show the URLs used when scraping and exit; you can download these for offline testing and/or
      slow internet (see '--in' option)
    EOD
      puts "Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE)}"
      puts "Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE)}"
      exit
    end

    run do |_opts, _args, cmd|
      puts cmd.help
    end
  end

  @bing_easy_cmd = @bing_cmd.define_command do
    name    'easy'
    usage   'easy [OPTIONS] [COMMAND]...'
    aliases :e, :ez
    summary "Search for NHK News Web Easy (Yasashii) links (aliases: #{app.color_alias('e ez')})"

    description <<-EOD
      Search for NHK News Web Easy (Yasashii) links &
      save to file: #{SearchLinks::DEFAULT_BING_YASASHII_FILE}
    EOD

    run do |opts, args, cmd|
      app.refresh_cmd(opts, args, cmd)
      app.run_bing_cmd(:yasashii)
    end
  end

  @bing_regular_cmd = @bing_cmd.define_command do
    name    'regular'
    usage   'regular [OPTIONS] [COMMAND]...'
    aliases :r, :reg
    summary "Search for NHK News Web Regular (Futsuu) links (aliases: #{app.color_alias('r reg')})"

    description <<-EOD
      Search for NHK News Web Regular (Futsuu) links &
      save to file: #{SearchLinks::DEFAULT_BING_FUTSUU_FILE}
    EOD

    run do |opts, args, cmd|
      app.refresh_cmd(opts, args, cmd)
      app.run_bing_cmd(:futsuu)
    end
  end
end
123
+
124
# Scrapes Bing result pages with BingScraper, accumulating links into a
# SearchLinks object, then either prints the new links (dry run) or saves
# all links to the output file.
#
# With the :show_count option set, only prints how many of the previously
# saved links are scraped and returns.
#
# @param type [Symbol] :futsuu or :yasashii; selects the default output
#   file name and is passed on to BingScraper
# @raise [ArgumentError] if +type+ is neither :futsuu nor :yasashii
def run_bing_cmd(type)
  # '--show-count' implies '--dry-run'.
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_count]

  build_in_file(:in)

  default_filename =
    case type
    when :futsuu   then SearchLinks::DEFAULT_BING_FUTSUU_FILENAME
    when :yasashii then SearchLinks::DEFAULT_BING_YASASHII_FILENAME
    else raise ArgumentError, "invalid type[#{type}]"
    end

  build_out_file(:out, default_dir: SearchLinks::DEFAULT_DIR, default_filename: default_filename)

  return unless check_in_file(:in, empty_ok: true)
  return unless check_out_file(:out)

  dry_run      = @cmd_opts[:dry_run]
  in_file      = @cmd_opts[:in]
  out_file     = @cmd_opts[:out]
  show_count   = @cmd_opts[:show_count]
  result_count = @cmd_opts[:results]
  result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?

  start_spin('Scraping bing.com') unless show_count

  is_file    = !in_file.nil?
  new_links  = [] # For --dry-run
  next_page  = NextPage.new
  page_count = 0
  page_num   = 1
  url        = in_file # nil will use default URL, else a file

  # Load previous links for 'scraped?' vars.
  links = File.exist?(out_file) ? SearchLinks.load_file(out_file) : SearchLinks.new
  links_count = links.length

  if show_count
    scraped_count = links.links.values.count(&:scraped?)

    puts "#{scraped_count} of #{links_count} links scraped."

    return
  end

  # Bounded iteration (10,001 pages at most) to prevent an infinite loop.
  10_001.times do
    scraper = BingScraper.new(type, count: result_count, is_file: is_file, url: url, **@scraper_kargs)

    next_page = scraper.scrape(links, next_page)

    # Whatever was appended past the previous count is new on this page.
    new_links.concat(links.links.values[links_count..-1])
    links_count = links.length
    page_count = next_page.count if next_page.count > 0

    update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}, " +
      "new_links=#{new_links.length})")

    break if next_page.empty?

    page_num += 1
    url = next_page.url

    sleep_scraper
  end

  stop_spin
  puts

  puts 'Last URL scraped:'
  puts "> #{url}"
  puts

  if dry_run
    new_links.each { |link| puts link.to_s(mini: true) }
  else
    links.save_file(out_file)

    puts 'Saved scraped links to file:'
    puts "> #{out_file}"
  end
end
218
+ end
219
+ end
220
+ end
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ module NHKore
25
+ module CLI
26
+ ###
27
+ # @author Jonathan Bradley Whited (@esotericpig)
28
+ # @since 0.2.0
29
+ ###
30
+ module FXCmd
31
# Defines the 'fx' command on @app_cmd and stores it in @fx_cmd. When run,
# the command refreshes the app's command state and calls run_fx_cmd().
def build_fx_cmd
  # The DSL block below references the app through this local.
  app = self

  @fx_cmd = @app_cmd.define_command do
    name    'fx'
    usage   'fx [OPTIONS] [COMMAND]...'
    summary 'Test spinner/progress special effects (for running long tasks)'

    description <<-EOD
      Test if the special effects work on your command line:\n
      - #{App::NAME} [-c/-X] fx
    EOD

    flag :a, :all, 'test all special effects regardless of global options'

    run do |opts, args, cmd|
      app.refresh_cmd(opts, args, cmd)
      app.run_fx_cmd
    end
  end
end
52
+
53
# Runs the special-effects demos: the progress bar(s) first, then the
# spinner(s).
def run_fx_cmd
  test_fx_progress_bar
  test_fx_spinner
end
57
+
58
# Demonstrates progress bars by advancing each one 100 times with a 0.05s
# pause between steps.
#
# With the :all option, the :default, :classic, and :no types are shown;
# otherwise only the type stored in @progress_bar is shown, labeled 'User'.
def test_fx_progress_bar
  types = @cmd_opts[:all] ? [:default, :classic, :no] : [@progress_bar]

  types.each do |type|
    label = (types.length == 1) ? 'User' : type.to_s.capitalize
    bar = build_progress_bar("Testing #{label} progress", download: false, type: type)

    bar.start

    100.times do
      sleep(0.05)
      bar.advance
    end

    bar.finish
  end
end
81
+
82
# Demonstrates spinners by temporarily swapping @spinner, spinning for three
# 1.1s steps while updating the detail text, and finally restoring the
# spinner that was set on entry.
#
# With the :all option, App::DEFAULT_SPINNER, App::CLASSIC_SPINNER, and
# App::NO_SPINNER are shown; otherwise only the current @spinner is shown,
# labeled 'User'.
def test_fx_spinner
  user_spinner = @spinner

  spinners =
    if @cmd_opts[:all]
      {
        default: App::DEFAULT_SPINNER,
        classic: App::CLASSIC_SPINNER,
        no: App::NO_SPINNER
      }
    else
      {user: user_spinner}
    end

  spinners.each do |label, spinner|
    # start_spin()/stop_spin() are project helpers; swapping @spinner first
    # presumably selects which spinner they drive.
    @spinner = spinner

    start_spin("Testing #{label.to_s.capitalize} spinner")

    (1..3).each do |step|
      sleep(1.1)
      update_spin_detail(" (#{step}/3)")
    end

    stop_spin
  end

  # Put the user's own spinner back.
  @spinner = user_spinner
end
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,153 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'down/net_http'
25
+ require 'tempfile'
26
+ require 'zip'
27
+
28
+ require 'nhkore/util'
29
+
30
+
31
+ module NHKore
32
+ module CLI
33
+ ###
34
+ # @author Jonathan Bradley Whited (@esotericpig)
35
+ # @since 0.2.0
36
+ ###
37
+ module GetCmd
38
# Number of bytes requested per read while downloading (4 KiB).
DEFAULT_GET_CHUNK_SIZE = 4 * 1024
# Just a generous estimation used as a fallback; may be outdated.
DEFAULT_GET_URL_LENGTH = 5_000_000
# File name of the release asset; also used in progress/spinner messages.
GET_URL_FILENAME = 'nhkore-core.zip'.freeze
# Download URL of the asset from the latest GitHub release.
# Frozen explicitly: since Ruby 3.0, interpolated strings are no longer
# frozen by the 'frozen_string_literal' magic comment.
GET_URL = "https://github.com/esotericpig/nhkore/releases/latest/download/#{GET_URL_FILENAME}".freeze
42
+
43
# Defines the 'get' command on @app_cmd and stores it in @get_cmd. When run,
# the command refreshes the app's command state and calls run_get_cmd().
def build_get_cmd
  # The DSL block below references the app through this local.
  app = self

  @get_cmd = @app_cmd.define_command do
    name    'get'
    usage   'get [OPTIONS] [COMMAND]...'
    aliases :g
    summary "Download NHKore's pre-scraped files from the latest release (aliases: #{app.color_alias('g')})"

    description <<-EOD
      Download NHKore's pre-scraped files from the latest release &
      save to folder: #{Util::CORE_DIR}

      Note: the latest NHK articles may not have been scraped yet.
    EOD

    option :o, :out, 'directory to save downloaded files to', argument: :required, default: Util::CORE_DIR,
           transform: ->(value) { app.check_empty_opt(:out, value) }
    # Prints the URL and exits immediately.
    flag nil, :'show-url', 'show download URL and exit (for downloading manually)' do |_value, _cmd|
      puts GET_URL
      exit
    end

    run do |opts, args, cmd|
      app.refresh_cmd(opts, args, cmd)
      app.run_get_cmd
    end
  end
end
74
+
75
# Downloads GET_URL to a temp file, showing a progress bar, and then
# extracts every Zip entry into the output directory.
#
# Entries are extracted by base name only, so any directory structure inside
# the Zip file is flattened into the output directory. Existing files are
# only overwritten if the :force option is set.
#
# With the :dry_run option, the URL is opened (to check the connection) but
# nothing is downloaded or extracted.
#
# @raise [Down::ConnectionError] if opening the URL still fails after the
#   retries are used up (@scraper_kargs[:max_retries], or 3 if that is nil)
# @raise [ZipError] if an entry in the Zip file has an unsafe name
def run_get_cmd
  build_out_dir(:out, default_dir: Util::CORE_DIR)

  return unless check_out_dir(:out)

  chunk_size  = DEFAULT_GET_CHUNK_SIZE
  down        = nil
  dry_run     = @cmd_opts[:dry_run]
  force       = @cmd_opts[:force]
  max_retries = @scraper_kargs[:max_retries]
  max_retries = 3 if max_retries.nil?
  out_dir     = @cmd_opts[:out]

  begin
    start_spin('Opening URL')

    begin
      down = Down::NetHttp.open(GET_URL, rewindable: false, **@scraper_kargs)
    rescue Down::ConnectionError
      # Bounded retry: re-raise once the retries are used up.
      raise if (max_retries -= 1) < 0
      retry
    end

    stop_spin

    return if dry_run

    Tempfile.create([App::NAME, '.zip'], binmode: true) do |file|
      puts
      puts 'Downloading to temp file:'
      puts "> #{file.path}"
      puts

      # The size may be unknown, so fall back to an estimation for the
      # progress bar's total.
      len = down.size
      len = DEFAULT_GET_URL_LENGTH if len.nil?
      bar = build_progress_bar("Downloading #{GET_URL_FILENAME}", download: true, total: len)

      bar.start

      until down.eof?
        chunk = down.read(chunk_size)

        break if chunk.nil?

        file.write(chunk)

        # The final read is usually shorter than chunk_size, so advance by
        # what was actually read.
        bar.advance(chunk.bytesize) unless chunk.empty?
      end

      down.close
      file.close
      bar.finish

      start_spin("Extracting #{GET_URL_FILENAME}")

      Zip.on_exists_proc = force # true will force overwriting files on extract()

      Zip::File.open(file) do |zip_file|
        zip_file.each do |entry|
          unless entry.name_safe?
            # NOTE(review): ZipError is not defined or required in this file;
            # presumably it is NHKore::ZipError from 'nhkore/error' -- confirm
            # that it is loaded before this runs.
            raise ZipError, "unsafe entry name[#{entry.name}] in Zip file"
          end

          name = File.basename(entry.name)

          update_spin_detail(" (file=#{name})")

          entry.extract(File.join(out_dir, name))
        end
      end

      stop_spin
      puts

      puts "Extracted #{GET_URL_FILENAME} to directory:"
      puts "> #{out_dir}"
    end
  ensure
    # Also covers the dry-run return and any error raised above.
    down.close if !down.nil? && !down.closed?
  end
end
151
+ end
152
+ end
153
+ end