nhkore 0.3.3 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +97 -2
- data/Gemfile +0 -18
- data/Gemfile.lock +89 -0
- data/README.md +58 -30
- data/Rakefile +68 -42
- data/bin/nhkore +4 -15
- data/lib/nhkore.rb +8 -20
- data/lib/nhkore/app.rb +231 -236
- data/lib/nhkore/article.rb +56 -53
- data/lib/nhkore/article_scraper.rb +308 -289
- data/lib/nhkore/cleaner.rb +20 -32
- data/lib/nhkore/cli/fx_cmd.rb +41 -53
- data/lib/nhkore/cli/get_cmd.rb +59 -70
- data/lib/nhkore/cli/news_cmd.rb +145 -154
- data/lib/nhkore/cli/search_cmd.rb +110 -120
- data/lib/nhkore/cli/sift_cmd.rb +111 -227
- data/lib/nhkore/datetime_parser.rb +328 -0
- data/lib/nhkore/defn.rb +48 -55
- data/lib/nhkore/dict.rb +26 -38
- data/lib/nhkore/dict_scraper.rb +31 -40
- data/lib/nhkore/entry.rb +43 -55
- data/lib/nhkore/error.rb +16 -21
- data/lib/nhkore/fileable.rb +10 -21
- data/lib/nhkore/lib.rb +6 -17
- data/lib/nhkore/missingno.rb +21 -33
- data/lib/nhkore/news.rb +61 -66
- data/lib/nhkore/polisher.rb +22 -34
- data/lib/nhkore/scraper.rb +75 -82
- data/lib/nhkore/search_link.rb +85 -78
- data/lib/nhkore/search_scraper.rb +89 -92
- data/lib/nhkore/sifter.rb +157 -171
- data/lib/nhkore/splitter.rb +19 -31
- data/lib/nhkore/user_agents.rb +28 -32
- data/lib/nhkore/util.rb +72 -101
- data/lib/nhkore/variator.rb +20 -32
- data/lib/nhkore/version.rb +4 -16
- data/lib/nhkore/word.rb +105 -99
- data/nhkore.gemspec +58 -65
- data/samples/looper.rb +71 -0
- data/test/nhkore/test_helper.rb +3 -15
- data/test/nhkore_test.rb +6 -18
- metadata +53 -30
data/lib/nhkore/cleaner.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -27,28 +15,28 @@ require 'nhkore/word'
|
|
27
15
|
|
28
16
|
module NHKore
|
29
17
|
###
|
30
|
-
# @author Jonathan Bradley Whited
|
18
|
+
# @author Jonathan Bradley Whited
|
31
19
|
# @since 0.2.0
|
32
20
|
###
|
33
21
|
class Cleaner
|
34
22
|
def begin_clean(str)
|
35
23
|
return str
|
36
24
|
end
|
37
|
-
|
25
|
+
|
38
26
|
def clean(str)
|
39
27
|
str = begin_clean(str)
|
40
28
|
str = end_clean(str)
|
41
|
-
|
29
|
+
|
42
30
|
return str
|
43
31
|
end
|
44
|
-
|
32
|
+
|
45
33
|
def self.clean_any(obj,cleaners)
|
46
|
-
return nil if obj.nil?
|
47
|
-
|
34
|
+
return nil if obj.nil?
|
35
|
+
|
48
36
|
cleaners = Array(cleaners)
|
49
|
-
|
50
|
-
return obj if cleaners.empty?
|
51
|
-
|
37
|
+
|
38
|
+
return obj if cleaners.empty?
|
39
|
+
|
52
40
|
if obj.is_a?(Word)
|
53
41
|
obj = Word.new(
|
54
42
|
kana: clean_any(obj.kana,cleaners),
|
@@ -56,17 +44,17 @@ module NHKore
|
|
56
44
|
word: obj
|
57
45
|
)
|
58
46
|
else # String
|
59
|
-
cleaners.each
|
47
|
+
cleaners.each do |cleaner|
|
60
48
|
obj = cleaner.clean(obj)
|
61
49
|
end
|
62
50
|
end
|
63
|
-
|
51
|
+
|
64
52
|
return obj
|
65
53
|
end
|
66
54
|
end
|
67
|
-
|
55
|
+
|
68
56
|
###
|
69
|
-
# @author Jonathan Bradley Whited
|
57
|
+
# @author Jonathan Bradley Whited
|
70
58
|
# @since 0.2.0
|
71
59
|
###
|
72
60
|
class BasicCleaner < Cleaner
|
@@ -75,15 +63,15 @@ module NHKore
|
|
75
63
|
# and Polisher will remove the leftover punctuation, digits, etc.
|
76
64
|
# If this is stricter, then errors will be raised in ArticleScraper's
|
77
65
|
# scrape_dicwin_word() & scrape_ruby_word().
|
78
|
-
|
66
|
+
|
79
67
|
str = Util.unspace_web_str(str) # Who needs space in Japanese?
|
80
|
-
|
68
|
+
|
81
69
|
return str
|
82
70
|
end
|
83
71
|
end
|
84
|
-
|
72
|
+
|
85
73
|
###
|
86
|
-
# @author Jonathan Bradley Whited
|
74
|
+
# @author Jonathan Bradley Whited
|
87
75
|
# @since 0.2.0
|
88
76
|
###
|
89
77
|
class BestCleaner < BasicCleaner
|
data/lib/nhkore/cli/fx_cmd.rb
CHANGED
@@ -1,113 +1,101 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
24
12
|
module NHKore
|
25
13
|
module CLI
|
26
14
|
###
|
27
|
-
# @author Jonathan Bradley Whited
|
15
|
+
# @author Jonathan Bradley Whited
|
28
16
|
# @since 0.2.0
|
29
17
|
###
|
30
18
|
module FXCmd
|
31
|
-
def build_fx_cmd
|
19
|
+
def build_fx_cmd
|
32
20
|
app = self
|
33
|
-
|
34
|
-
@fx_cmd = @app_cmd.define_command
|
21
|
+
|
22
|
+
@fx_cmd = @app_cmd.define_command do
|
35
23
|
name 'fx'
|
36
24
|
usage 'fx [OPTIONS] [COMMAND]...'
|
37
25
|
summary 'Test spinner/progress special effects (for running long tasks)'
|
38
|
-
|
39
|
-
description <<-
|
26
|
+
|
27
|
+
description <<-DESC
|
40
28
|
Test if the special effects work on your command line:\n
|
41
29
|
- #{App::NAME} [-s/-X] fx
|
42
|
-
|
43
|
-
|
30
|
+
DESC
|
31
|
+
|
44
32
|
flag :a,:all,'test all special effects regardless of global options'
|
45
|
-
|
33
|
+
|
46
34
|
run do |opts,args,cmd|
|
47
35
|
app.refresh_cmd(opts,args,cmd)
|
48
|
-
app.run_fx_cmd
|
36
|
+
app.run_fx_cmd
|
49
37
|
end
|
50
38
|
end
|
51
39
|
end
|
52
|
-
|
53
|
-
def run_fx_cmd
|
54
|
-
test_fx_progress_bar
|
55
|
-
test_fx_spinner
|
40
|
+
|
41
|
+
def run_fx_cmd
|
42
|
+
test_fx_progress_bar
|
43
|
+
test_fx_spinner
|
56
44
|
end
|
57
|
-
|
58
|
-
def test_fx_progress_bar
|
45
|
+
|
46
|
+
def test_fx_progress_bar
|
59
47
|
bars = nil
|
60
|
-
|
48
|
+
|
61
49
|
if @cmd_opts[:all]
|
62
50
|
bars = {default: :default,classic: :classic,no: :no}
|
63
51
|
else
|
64
52
|
bars = {user: @progress_bar}
|
65
53
|
end
|
66
|
-
|
67
|
-
bars.each
|
68
|
-
name = name.to_s
|
54
|
+
|
55
|
+
bars.each do |name,bar|
|
56
|
+
name = name.to_s.capitalize
|
69
57
|
bar = build_progress_bar("Testing #{name} progress",download: false,type: bar)
|
70
|
-
|
71
|
-
bar.start
|
72
|
-
|
58
|
+
|
59
|
+
bar.start
|
60
|
+
|
73
61
|
0.upto(99) do
|
74
62
|
sleep(0.05)
|
75
|
-
bar.advance
|
63
|
+
bar.advance
|
76
64
|
end
|
77
|
-
|
78
|
-
bar.finish
|
65
|
+
|
66
|
+
bar.finish
|
79
67
|
end
|
80
68
|
end
|
81
|
-
|
82
|
-
def test_fx_spinner
|
69
|
+
|
70
|
+
def test_fx_spinner
|
83
71
|
app_spinner = @spinner
|
84
72
|
spinners = nil
|
85
|
-
|
73
|
+
|
86
74
|
if @cmd_opts[:all]
|
87
75
|
spinners = {
|
88
76
|
default: App::DEFAULT_SPINNER,
|
89
77
|
classic: App::CLASSIC_SPINNER,
|
90
|
-
no:
|
78
|
+
no: {},
|
91
79
|
}
|
92
80
|
else
|
93
81
|
spinners = {
|
94
82
|
user: app_spinner
|
95
83
|
}
|
96
84
|
end
|
97
|
-
|
98
|
-
spinners.each
|
85
|
+
|
86
|
+
spinners.each do |name,spinner|
|
99
87
|
@spinner = spinner
|
100
|
-
|
101
|
-
start_spin("Testing #{name.to_s
|
102
|
-
|
88
|
+
|
89
|
+
start_spin("Testing #{name.to_s.capitalize} spinner")
|
90
|
+
|
103
91
|
1.upto(3) do |i|
|
104
92
|
sleep(1.1)
|
105
93
|
update_spin_detail(" (#{i}/3)")
|
106
94
|
end
|
107
|
-
|
108
|
-
stop_spin
|
95
|
+
|
96
|
+
stop_spin
|
109
97
|
end
|
110
|
-
|
98
|
+
|
111
99
|
# Reset back to users'.
|
112
100
|
@spinner = app_spinner
|
113
101
|
end
|
data/lib/nhkore/cli/get_cmd.rb
CHANGED
@@ -1,23 +1,11 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
1
|
# encoding: UTF-8
|
3
2
|
# frozen_string_literal: true
|
4
3
|
|
5
4
|
#--
|
6
5
|
# This file is part of NHKore.
|
7
|
-
# Copyright (c) 2020 Jonathan Bradley Whited
|
8
|
-
#
|
9
|
-
#
|
10
|
-
# it under the terms of the GNU Lesser General Public License as published by
|
11
|
-
# the Free Software Foundation, either version 3 of the License, or
|
12
|
-
# (at your option) any later version.
|
13
|
-
#
|
14
|
-
# NHKore is distributed in the hope that it will be useful,
|
15
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
-
# GNU Lesser General Public License for more details.
|
18
|
-
#
|
19
|
-
# You should have received a copy of the GNU Lesser General Public License
|
20
|
-
# along with NHKore. If not, see <https://www.gnu.org/licenses/>.
|
6
|
+
# Copyright (c) 2020-2021 Jonathan Bradley Whited
|
7
|
+
#
|
8
|
+
# SPDX-License-Identifier: LGPL-3.0-or-later
|
21
9
|
#++
|
22
10
|
|
23
11
|
|
@@ -27,140 +15,141 @@ require 'nhkore/util'
|
|
27
15
|
module NHKore
|
28
16
|
module CLI
|
29
17
|
###
|
30
|
-
# @author Jonathan Bradley Whited
|
18
|
+
# @author Jonathan Bradley Whited
|
31
19
|
# @since 0.2.0
|
32
20
|
###
|
33
21
|
module GetCmd
|
34
22
|
DEFAULT_GET_CHUNK_SIZE = 4 * 1024
|
35
|
-
DEFAULT_GET_URL_LENGTH =
|
23
|
+
DEFAULT_GET_URL_LENGTH = 11_000_000 # Just a generous estimation used as a fallback; may be outdated.
|
36
24
|
GET_URL_FILENAME = 'nhkore-core.zip'
|
37
25
|
GET_URL = "https://github.com/esotericpig/nhkore/releases/latest/download/#{GET_URL_FILENAME}"
|
38
|
-
|
39
|
-
def build_get_cmd
|
26
|
+
|
27
|
+
def build_get_cmd
|
40
28
|
app = self
|
41
|
-
|
42
|
-
@get_cmd = @app_cmd.define_command
|
29
|
+
|
30
|
+
@get_cmd = @app_cmd.define_command do
|
43
31
|
name 'get'
|
44
32
|
usage 'get [OPTIONS] [COMMAND]...'
|
45
33
|
aliases :g
|
46
|
-
summary "Download NHKore's pre-scraped files from the latest release
|
47
|
-
|
48
|
-
|
34
|
+
summary "Download NHKore's pre-scraped files from the latest release" \
|
35
|
+
" (aliases: #{app.color_alias('g')})"
|
36
|
+
|
37
|
+
description(<<-DESC)
|
49
38
|
Download NHKore's pre-scraped files from the latest release &
|
50
39
|
save to folder: #{Util::CORE_DIR}
|
51
|
-
|
40
|
+
|
52
41
|
Note: the latest NHK articles may not have been scraped yet.
|
53
|
-
|
54
|
-
|
42
|
+
DESC
|
43
|
+
|
55
44
|
option :o,:out,'directory to save downloaded files to',argument: :required,default: Util::CORE_DIR,
|
56
|
-
|
57
|
-
|
58
|
-
|
45
|
+
transform: lambda { |value|
|
46
|
+
app.check_empty_opt(:out,value)
|
47
|
+
}
|
59
48
|
flag nil,:'show-url','show download URL and exit (for downloading manually)' do |value,cmd|
|
60
49
|
puts GET_URL
|
61
50
|
exit
|
62
51
|
end
|
63
|
-
|
52
|
+
|
64
53
|
run do |opts,args,cmd|
|
65
54
|
app.refresh_cmd(opts,args,cmd)
|
66
|
-
app.run_get_cmd
|
55
|
+
app.run_get_cmd
|
67
56
|
end
|
68
57
|
end
|
69
58
|
end
|
70
|
-
|
71
|
-
def run_get_cmd
|
59
|
+
|
60
|
+
def run_get_cmd
|
72
61
|
require 'down/net_http'
|
73
62
|
require 'tempfile'
|
74
63
|
require 'zip'
|
75
|
-
|
64
|
+
|
76
65
|
build_out_dir(:out,default_dir: Util::CORE_DIR)
|
77
|
-
|
66
|
+
|
78
67
|
return unless check_out_dir(:out)
|
79
|
-
|
68
|
+
|
80
69
|
chunk_size = DEFAULT_GET_CHUNK_SIZE
|
81
70
|
down = nil
|
82
71
|
dry_run = @cmd_opts[:dry_run]
|
83
72
|
force = @cmd_opts[:force]
|
84
73
|
max_retries = @scraper_kargs[:max_retries]
|
85
|
-
max_retries = 3 if max_retries.nil?
|
74
|
+
max_retries = 3 if max_retries.nil?
|
86
75
|
out_dir = @cmd_opts[:out]
|
87
|
-
|
76
|
+
|
88
77
|
begin
|
89
78
|
start_spin('Opening URL')
|
90
|
-
|
79
|
+
|
91
80
|
begin
|
92
81
|
down = Down::NetHttp.open(GET_URL,rewindable: false,**@scraper_kargs)
|
93
82
|
rescue Down::ConnectionError
|
94
83
|
raise if (max_retries -= 1) < 0
|
95
84
|
retry
|
96
85
|
end
|
97
|
-
|
98
|
-
stop_spin
|
99
|
-
|
86
|
+
|
87
|
+
stop_spin
|
88
|
+
|
100
89
|
return if dry_run
|
101
|
-
|
90
|
+
|
102
91
|
Tempfile.create(["#{App::NAME}_",'.zip'],binmode: true) do |file|
|
103
92
|
puts
|
104
93
|
puts "Downloading #{GET_URL_FILENAME} to temp file:"
|
105
94
|
puts "> #{file.path}"
|
106
|
-
|
95
|
+
|
107
96
|
len = down.size
|
108
|
-
len = DEFAULT_GET_LENGTH if len.nil?
|
97
|
+
len = DEFAULT_GET_LENGTH if len.nil? || len < 1
|
109
98
|
bar = build_progress_bar('> Downloading',download: true,total: len)
|
110
|
-
|
111
|
-
bar.start
|
112
|
-
|
113
|
-
while !down.eof?
|
99
|
+
|
100
|
+
bar.start
|
101
|
+
|
102
|
+
while !down.eof?
|
114
103
|
file.write(down.read(chunk_size))
|
115
104
|
bar.advance(chunk_size)
|
116
105
|
end
|
117
|
-
|
118
|
-
down.close
|
119
|
-
file.close
|
120
|
-
bar.finish
|
121
|
-
|
106
|
+
|
107
|
+
down.close
|
108
|
+
file.close
|
109
|
+
bar.finish
|
110
|
+
|
122
111
|
puts
|
123
112
|
puts "Extracting #{GET_URL_FILENAME}..."
|
124
|
-
|
113
|
+
|
125
114
|
# We manually ask the user whether to overwrite each file, so set this to
|
126
115
|
# true so that Zip extract() will force overwrites and not raise an error.
|
127
116
|
Zip.on_exists_proc = true
|
128
|
-
|
117
|
+
|
129
118
|
Zip::File.open(file) do |zip_file|
|
130
|
-
zip_file.each
|
131
|
-
if !entry.name_safe?
|
119
|
+
zip_file.each do |entry|
|
120
|
+
if !entry.name_safe?
|
132
121
|
raise ZipError,"unsafe entry name[#{entry.name}] in Zip file"
|
133
122
|
end
|
134
|
-
|
123
|
+
|
135
124
|
name = Util.strip_web_str(File.basename(entry.name))
|
136
|
-
|
137
|
-
next if name.empty?
|
138
|
-
|
125
|
+
|
126
|
+
next if name.empty?
|
127
|
+
|
139
128
|
out_file = File.join(out_dir,name)
|
140
|
-
|
129
|
+
|
141
130
|
puts "> #{name}"
|
142
|
-
|
131
|
+
|
143
132
|
if !force && File.exist?(out_file)
|
144
133
|
puts
|
145
134
|
puts 'Warning: output file already exists!'
|
146
135
|
puts "> '#{out_file}'"
|
147
|
-
|
136
|
+
|
148
137
|
overwrite = @high.agree('Overwrite this file (yes/no)? ')
|
149
138
|
puts
|
150
|
-
|
139
|
+
|
151
140
|
next unless overwrite
|
152
141
|
end
|
153
|
-
|
142
|
+
|
154
143
|
entry.extract(out_file)
|
155
144
|
end
|
156
145
|
end
|
157
|
-
|
146
|
+
|
158
147
|
puts
|
159
148
|
puts "Extracted #{GET_URL_FILENAME} to directory:"
|
160
149
|
puts "> #{out_dir}"
|
161
150
|
end
|
162
151
|
ensure
|
163
|
-
down.close
|
152
|
+
down.close if !down.nil? && !down.closed?
|
164
153
|
end
|
165
154
|
end
|
166
155
|
end
|