nhkore 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/util'
25
+ require 'nhkore/word'
26
+
27
+
28
+ module NHKore
29
+ ###
30
+ # @author Jonathan Bradley Whited (@esotericpig)
31
+ # @since 0.2.0
32
+ ###
33
# Base class for string cleaners.
#
# Cleaning is a two-step template: {#begin_clean} runs first, then {#end_clean}.
# Both hooks return the string unchanged here, so a subclass only needs to
# override the step(s) it cares about.
class Cleaner
  # First cleaning hook.
  #
  # @param str [String] text to clean
  # @return [String] +str+ unchanged (override in a subclass to do real work)
  def begin_clean(str)
    str
  end

  # Runs +str+ through {#begin_clean} and then {#end_clean}.
  #
  # @param str [String] text to clean
  # @return [String] the cleaned text
  def clean(str)
    end_clean(begin_clean(str))
  end

  # Second cleaning hook.
  #
  # Defined on the base class so that {#clean} works for every cleaner, even
  # one that only overrides {#begin_clean}; previously such a cleaner raised
  # NoMethodError from {#clean}.
  #
  # @param str [String] text to clean
  # @return [String] +str+ unchanged (override in a subclass to do real work)
  def end_clean(str)
    str
  end

  # Cleans a String or a Word with one or more cleaners, applied in order.
  #
  # @param obj [String,Word,nil] the value to clean
  # @param cleaners [Cleaner,Array<Cleaner>,nil] a single cleaner or a list of them
  # @return [String,Word,nil] +nil+ if +obj+ is +nil+; +obj+ itself if there are
  #   no cleaners; a new Word built from the cleaned kana & kanji if +obj+ is a
  #   Word; else the result of running +obj+ through every cleaner
  def self.clean_any(obj, cleaners)
    return nil if obj.nil?

    cleaners = Array(cleaners)

    return obj if cleaners.empty?

    if obj.is_a?(Word)
      # The original Word is passed along as word: so the new Word can be built from it.
      Word.new(
        kana: clean_any(obj.kana, cleaners),
        kanji: clean_any(obj.kanji, cleaners),
        word: obj
      )
    else # String
      cleaners.reduce(obj) { |str, cleaner| cleaner.clean(str) }
    end
  end
end
67
+
68
+ ###
69
+ # @author Jonathan Bradley Whited (@esotericpig)
70
+ # @since 0.2.0
71
+ ###
72
class BasicCleaner < Cleaner
  # Final cleaning step: hands +str+ to Util.unspace_web_str
  # (who needs space in Japanese?).
  #
  # Deliberately minimal: Splitter will split on punctuation, and Polisher
  # will remove the leftover punctuation, digits, etc. Making this stricter
  # causes errors to be raised in ArticleScraper's scrape_dicwin_word() &
  # scrape_ruby_word().
  #
  # @param str [String] text to clean
  # @return whatever Util.unspace_web_str returns for +str+
  def end_clean(str)
    Util.unspace_web_str(str)
  end
end
84
+
85
+ ###
86
+ # @author Jonathan Bradley Whited (@esotericpig)
87
+ # @since 0.2.0
88
+ ###
89
# Defines no methods of its own; all behavior is inherited from BasicCleaner.
+ class BestCleaner < BasicCleaner
90
+ end
91
+ end
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'nhkore/error'
25
+ require 'nhkore/search_link'
26
+ require 'nhkore/search_scraper'
27
+ require 'nhkore/util'
28
+
29
+
30
+ module NHKore
31
+ module CLI
32
+ ###
33
+ # @author Jonathan Bradley Whited (@esotericpig)
34
+ # @since 0.2.0
35
+ ###
36
+ module BingCmd
37
# Defines the 'bing' command on @app_cmd together with its 'easy' and 'regular'
# sub-commands, storing them in @bing_cmd, @bing_easy_cmd and @bing_regular_cmd.
#
# - 'bing' on its own only prints its help text.
# - 'easy' calls refresh_cmd and then run_bing_cmd(:yasashii).
# - 'regular' calls refresh_cmd and then run_bing_cmd(:futsuu).
#
# Options declared on 'bing': --in and --out (both passed through
# check_empty_opt), --results (converted with to_i and raised to at least 1),
# --show-count, and --show-urls (its block prints both search URLs and calls exit).
+ def build_bing_cmd()
38
+ app = self
# The command blocks below call this object's helpers through the local 'app'.
# NOTE(review): presumably define_command evaluates its block with a different
# self, which is why 'self' is captured here -- confirm against the command library.
39
+
40
+ @bing_cmd = @app_cmd.define_command() do
41
+ name 'bing'
42
+ usage 'bing [OPTIONS] [COMMAND]...'
43
+ aliases :b
44
+ summary "Search bing.com for links to NHK News Web (Easy) (aliases: #{app.color_alias('b')})"
45
+
46
+ description <<-EOD
47
+ Search bing.com for links to NHK News Web (Easy) &
48
+ save to folder: #{SearchLinks::DEFAULT_DIR}
49
+ EOD
50
+
51
+ option :i,:in,<<-EOD,argument: :required,transform: -> (value) do
52
+ HTML file to read instead of URL (for offline testing and/or slow internet;
53
+ see '--show-urls' option)
54
+ EOD
55
+ app.check_empty_opt(:in,value)
56
+ end
57
+ option :o,:out,<<-EOD,argument: :required,transform: -> (value) do
58
+ 'directory/file' to save links to; if you only specify a directory or a file, it will attach the
59
+ appropriate default directory/file name
60
+ (defaults: #{SearchLinks::DEFAULT_BING_YASASHII_FILE}, #{SearchLinks::DEFAULT_BING_FUTSUU_FILE})
61
+ EOD
62
+ app.check_empty_opt(:out,value)
63
+ end
64
+ option :r,:results,'number of results per page to request from Bing',argument: :required,
65
+ default: SearchScraper::DEFAULT_RESULT_COUNT,transform: -> (value) do
66
+ value = value.to_i()
67
+ value = 1 if value < 1
68
+ value
69
+ end
70
+ option nil,:'show-count',<<-EOD
71
+ show the number of links scraped and exit;
72
+ useful for manually writing/updating scripts (but not for use in a variable);
73
+ implies '--dry-run' option
74
+ EOD
75
+ option nil,:'show-urls',<<-EOD do |value,cmd|
76
+ show the URLs used when scraping and exit; you can download these for offline testing and/or
77
+ slow internet (see '--in' option)
78
+ EOD
79
+ puts "Easy: #{BingScraper.build_url(SearchScraper::YASASHII_SITE)}"
80
+ puts "Regular: #{BingScraper.build_url(SearchScraper::FUTSUU_SITE)}"
81
+ exit
82
+ end
83
+
84
+ run do |opts,args,cmd|
85
+ puts cmd.help
86
+ end
87
+ end
88
+
89
+ @bing_easy_cmd = @bing_cmd.define_command() do
90
+ name 'easy'
91
+ usage 'easy [OPTIONS] [COMMAND]...'
92
+ aliases :e,:ez
93
+ summary "Search for NHK News Web Easy (Yasashii) links (aliases: #{app.color_alias('e ez')})"
94
+
95
+ description <<-EOD
96
+ Search for NHK News Web Easy (Yasashii) links &
97
+ save to file: #{SearchLinks::DEFAULT_BING_YASASHII_FILE}
98
+ EOD
99
+
100
+ run do |opts,args,cmd|
101
+ app.refresh_cmd(opts,args,cmd)
102
+ app.run_bing_cmd(:yasashii)
103
+ end
104
+ end
105
+
106
+ @bing_regular_cmd = @bing_cmd.define_command() do
107
+ name 'regular'
108
+ usage 'regular [OPTIONS] [COMMAND]...'
109
+ aliases :r,:reg
110
+ summary "Search for NHK News Web Regular (Futsuu) links (aliases: #{app.color_alias('r reg')})"
111
+
112
+ description <<-EOD
113
+ Search for NHK News Web Regular (Futsuu) links &
114
+ save to file: #{SearchLinks::DEFAULT_BING_FUTSUU_FILE}
115
+ EOD
116
+
117
+ run do |opts,args,cmd|
118
+ app.refresh_cmd(opts,args,cmd)
119
+ app.run_bing_cmd(:futsuu)
120
+ end
121
+ end
122
+ end
123
+
124
# Scrapes Bing result pages for NHK News Web links of the given +type+ and
# merges them into the links file selected by the +:out+ option.
#
# * With +:show_count+ set, only reports how many of the stored links have
#   already been scraped, then returns (this also turns on +:dry_run+).
# * With +:dry_run+ set, prints the newly found links instead of saving.
#
# @param type [Symbol] +:futsuu+ or +:yasashii+
# @raise [ArgumentError] if +type+ is anything else
def run_bing_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_count]

  build_in_file(:in)

  default_filename =
    case type
    when :futsuu then SearchLinks::DEFAULT_BING_FUTSUU_FILENAME
    when :yasashii then SearchLinks::DEFAULT_BING_YASASHII_FILENAME
    else raise ArgumentError,"invalid type[#{type}]"
    end

  build_out_file(:out,default_dir: SearchLinks::DEFAULT_DIR,default_filename: default_filename)

  return unless check_in_file(:in,empty_ok: true) && check_out_file(:out)

  dry_run = @cmd_opts[:dry_run]
  in_file = @cmd_opts[:in]
  out_file = @cmd_opts[:out]
  result_count = @cmd_opts[:results]
  result_count = SearchScraper::DEFAULT_RESULT_COUNT if result_count.nil?
  show_count = @cmd_opts[:show_count]

  start_spin('Scraping bing.com') unless show_count

  from_file = !in_file.nil?
  fresh_links = [] # Only printed for --dry-run
  next_page = NextPage.new
  page_count = 0
  page_num = 1
  url = in_file # nil will use the default URL, else a file

  # Start from the previously saved links (if any) so their 'scraped?' state is kept.
  links = File.exist?(out_file) ? SearchLinks.load_file(out_file) : SearchLinks.new
  known_count = links.length

  if show_count
    scraped_count = links.links.values.count(&:scraped?)

    puts "#{scraped_count} of #{known_count} links scraped."

    return
  end

  # A bounded number of rounds prevents an infinite loop of "next" pages. Ichiman!
  10_001.times do
    scraper = BingScraper.new(type,count: result_count,is_file: from_file,url: url,**@scraper_kargs)

    next_page = scraper.scrape(links,next_page)

    # Everything past the previously known length was added by this page.
    fresh_links.concat(links.links.values[known_count..-1])
    known_count = links.length
    page_count = next_page.count if next_page.count > 0

    update_spin_detail(" (page=#{page_num}, count=#{page_count}, links=#{links.length}, " +
      "new_links=#{fresh_links.length})")

    break if next_page.empty?

    page_num += 1
    url = next_page.url

    sleep_scraper
  end

  stop_spin
  puts

  puts 'Last URL scraped:'
  puts "> #{url}"
  puts

  if dry_run
    fresh_links.each do |link|
      puts link.to_s(mini: true)
    end
  else
    links.save_file(out_file)

    puts 'Saved scraped links to file:'
    puts "> #{out_file}"
  end
end
218
+ end
219
+ end
220
+ end
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ module NHKore
25
+ module CLI
26
+ ###
27
+ # @author Jonathan Bradley Whited (@esotericpig)
28
+ # @since 0.2.0
29
+ ###
30
+ module FXCmd
31
# Defines the 'fx' command on @app_cmd and stores it in @fx_cmd.
#
# The command has a single flag, --all, and its run block calls refresh_cmd
# followed by run_fx_cmd on this object.
+ def build_fx_cmd()
32
+ app = self
# The command block below calls this object's helpers through the local 'app'.
# NOTE(review): presumably define_command evaluates its block with a different
# self, which is why 'self' is captured here -- confirm against the command library.
33
+
34
+ @fx_cmd = @app_cmd.define_command() do
35
+ name 'fx'
36
+ usage 'fx [OPTIONS] [COMMAND]...'
37
+ summary 'Test spinner/progress special effects (for running long tasks)'
38
+
39
+ description <<-EOD
40
+ Test if the special effects work on your command line:\n
41
+ - #{App::NAME} [-c/-X] fx
42
+ EOD
43
+
44
+ flag :a,:all,'test all special effects regardless of global options'
45
+
46
+ run do |opts,args,cmd|
47
+ app.refresh_cmd(opts,args,cmd)
48
+ app.run_fx_cmd()
49
+ end
50
+ end
51
+ end
52
+
53
# Runs both special-effect demos: the progress bar first, then the spinner.
#
# @return whatever test_fx_spinner returns
def run_fx_cmd
  test_fx_progress_bar
  test_fx_spinner
end
57
+
58
# Demonstrates the progress bar effect by filling each bar in 100 steps,
# pausing 0.05 seconds before every step.
#
# With the +:all+ option, the +:default+, +:classic+ and +:no+ bar types are
# each shown and labelled by type; otherwise only @progress_bar is shown,
# labelled 'User'.
#
# @return [Array] the bar types that were demonstrated
def test_fx_progress_bar
  types = @cmd_opts[:all] ? [:default,:classic,:no] : [@progress_bar]

  types.each do |type|
    label = (types.length == 1) ? 'User' : type.to_s.capitalize
    progress = build_progress_bar("Testing #{label} progress",download: false,type: type)

    progress.start

    100.times do
      sleep(0.05)
      progress.advance
    end

    progress.finish
  end
end
81
+
82
# Demonstrates the spinner effect: each spinner is started, given three
# detail updates (one every 1.1 seconds), and then stopped.
#
# With the +:all+ option, App's default, classic and "no" spinners are each
# shown; otherwise only the spinner currently in @spinner is shown,
# labelled 'User'.
#
# @spinner is swapped for each demo and is always put back afterwards, even
# if a demo is interrupted (e.g. Ctrl+C during a sleep) or raises.
#
# @return the spinner that was in @spinner when this method was called
def test_fx_spinner
  app_spinner = @spinner

  spinners =
    if @cmd_opts[:all]
      {
        default: App::DEFAULT_SPINNER,
        classic: App::CLASSIC_SPINNER,
        no: App::NO_SPINNER
      }
    else
      {user: app_spinner}
    end

  spinners.each do |name,spinner|
    # start_spin and friends are called without a spinner argument, so the
    # spinner under test is handed over through @spinner.
    @spinner = spinner

    start_spin("Testing #{name.to_s.capitalize} spinner")

    1.upto(3) do |i|
      sleep(1.1)
      update_spin_detail(" (#{i}/3)")
    end

    stop_spin
  end

  app_spinner
ensure
  # Reset back to the user's spinner, on errors too.
  @spinner = app_spinner
end
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,153 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ require 'down/net_http'
25
+ require 'tempfile'
26
+ require 'zip'
27
+
28
+ require 'nhkore/util'
29
+
30
+
31
+ module NHKore
32
+ module CLI
33
+ ###
34
+ # @author Jonathan Bradley Whited (@esotericpig)
35
+ # @since 0.2.0
36
+ ###
37
+ module GetCmd
38
# 4 KiB; run_get_cmd uses this as the number of bytes to request per read.
+ DEFAULT_GET_CHUNK_SIZE = 4 * 1024
39
+ DEFAULT_GET_URL_LENGTH = 5_000_000 # Just a generous estimation used as a fallback; may be outdated
# File name of the Zip to download; also appears in run_get_cmd's progress messages.
40
+ GET_URL_FILENAME = 'nhkore-core.zip'
# Download URL of that file under the project's "latest" GitHub release.
41
+ GET_URL = "https://github.com/esotericpig/nhkore/releases/latest/download/#{GET_URL_FILENAME}"
42
+
43
# Defines the 'get' command on @app_cmd and stores it in @get_cmd.
#
# Options declared: --out (defaults to Util::CORE_DIR and is passed through
# check_empty_opt) and --show-url (its block prints GET_URL and calls exit).
# The run block calls refresh_cmd followed by run_get_cmd on this object.
+ def build_get_cmd()
44
+ app = self
# The command block below calls this object's helpers through the local 'app'.
# NOTE(review): presumably define_command evaluates its block with a different
# self, which is why 'self' is captured here -- confirm against the command library.
45
+
46
+ @get_cmd = @app_cmd.define_command() do
47
+ name 'get'
48
+ usage 'get [OPTIONS] [COMMAND]...'
49
+ aliases :g
50
+ summary "Download NHKore's pre-scraped files from the latest release (aliases: #{app.color_alias('g')})"
51
+
52
+ description <<-EOD
53
+ Download NHKore's pre-scraped files from the latest release &
54
+ save to folder: #{Util::CORE_DIR}
55
+
56
+ Note: the latest NHK articles may not have been scraped yet.
57
+ EOD
58
+
59
+ option :o,:out,'directory to save downloaded files to',argument: :required,default: Util::CORE_DIR,
60
+ transform: -> (value) do
61
+ app.check_empty_opt(:out,value)
62
+ end
63
+ flag nil,:'show-url','show download URL and exit (for downloading manually)' do |value,cmd|
64
+ puts GET_URL
65
+ exit
66
+ end
67
+
68
+ run do |opts,args,cmd|
69
+ app.refresh_cmd(opts,args,cmd)
70
+ app.run_get_cmd()
71
+ end
72
+ end
73
+ end
74
+
75
# Downloads the Zip file at GET_URL to a temp file and extracts every entry,
# flattened to its base name, into the directory given by the +:out+ option.
#
# Reads +:dry_run+, +:force+ and +:out+ from @cmd_opts and passes
# @scraper_kargs on to Down::NetHttp.open. Opening the URL is retried on
# Down::ConnectionError up to +:max_retries+ times (3 if not set).
#
# Returns early without downloading if check_out_dir fails, or -- after the
# URL has been opened successfully -- on a dry run.
#
# @raise [Down::ConnectionError] once the retries are used up
# @raise [ZipError] if an entry in the Zip file has an unsafe name
def run_get_cmd
  build_out_dir(:out,default_dir: Util::CORE_DIR)

  return unless check_out_dir(:out)

  chunk_size = DEFAULT_GET_CHUNK_SIZE
  down = nil
  dry_run = @cmd_opts[:dry_run]
  force = @cmd_opts[:force]
  max_retries = @scraper_kargs[:max_retries]
  max_retries = 3 if max_retries.nil?
  out_dir = @cmd_opts[:out]

  begin
    start_spin('Opening URL')

    begin
      down = Down::NetHttp.open(GET_URL,rewindable: false,**@scraper_kargs)
    rescue Down::ConnectionError
      # Bounded retry: give up and re-raise once the counter drops below zero.
      raise if (max_retries -= 1) < 0
      retry
    end

    stop_spin

    return if dry_run

    Tempfile.create([App::NAME,'.zip'],binmode: true) do |file|
      puts
      puts 'Downloading to temp file:'
      puts "> #{file.path}"
      puts

      # The size may be unknown; fall back to the estimate so the progress bar
      # still has a total. (This used to reference an undefined constant,
      # DEFAULT_GET_LENGTH, and so raised NameError instead of falling back.)
      len = down.size
      len = DEFAULT_GET_URL_LENGTH if len.nil?
      bar = build_progress_bar("Downloading #{GET_URL_FILENAME}",download: true,total: len)

      bar.start

      until down.eof?
        # Advance by the bytes actually written; the final chunk is usually
        # shorter than chunk_size.
        written = file.write(down.read(chunk_size))
        bar.advance(written)
      end

      down.close
      file.close
      bar.finish

      start_spin("Extracting #{GET_URL_FILENAME}")

      Zip.on_exists_proc = force # true will force overwriting files on extract()

      Zip::File.open(file) do |zip_file|
        zip_file.each do |entry|
          unless entry.name_safe?
            raise ZipError,"unsafe entry name[#{entry.name}] in Zip file"
          end

          # Only the base name is used, so every entry lands directly in out_dir.
          name = File.basename(entry.name)

          update_spin_detail(" (file=#{name})")

          entry.extract(File.join(out_dir,name))
        end
      end

      stop_spin
      puts

      puts "Extracted #{GET_URL_FILENAME} to directory:"
      puts "> #{out_dir}"
    end
  ensure
    # Covers the dry-run return and any error raised before the explicit close above.
    down.close if !down.nil? && !down.closed?
  end
end
151
+ end
152
+ end
153
+ end