nhkore 0.3.7 → 0.3.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,46 +1,34 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
24
12
  module NHKore
25
13
  ###
26
- # @author Jonathan Bradley Whited (@esotericpig)
14
+ # @author Jonathan Bradley Whited
27
15
  # @since 0.2.0
28
16
  ###
29
17
  class Variator
30
18
  def begin_variate(str)
31
19
  return str
32
20
  end
33
-
21
+
34
22
  def variate(str)
35
23
  str = begin_variate(str)
36
24
  str = end_variate(str)
37
-
25
+
38
26
  return str
39
27
  end
40
28
  end
41
-
29
+
42
30
  ###
43
- # @author Jonathan Bradley Whited (@esotericpig)
31
+ # @author Jonathan Bradley Whited
44
32
  # @since 0.2.0
45
33
  ###
46
34
  class BasicVariator < Variator
@@ -48,38 +36,38 @@ module NHKore
48
36
  return [] # No variations; don't return nil
49
37
  end
50
38
  end
51
-
39
+
52
40
  ###
53
41
  # Guesses a word's dictionary/plain form (辞書形).
54
- #
42
+ #
55
43
  # It doesn't work very well, but better than nothing...
56
- #
44
+ #
57
45
  # @since 0.2.0
58
46
  ###
59
47
  class DictFormVariator < Variator
60
48
  attr_accessor :deinflector
61
-
49
+
62
50
  def initialize(*)
63
51
  require 'set' # Must require manually because JapaneseDeinflector is old
64
52
  require 'japanese_deinflector'
65
-
53
+
66
54
  super
67
-
68
- @deinflector = JapaneseDeinflector.new()
55
+
56
+ @deinflector = JapaneseDeinflector.new
69
57
  end
70
-
58
+
71
59
  def end_variate(str)
72
60
  guess = @deinflector.deinflect(str)
73
-
61
+
74
62
  return [] if guess.length < 1
75
63
  return [] if (guess = guess[0])[:weight] < 0.5
76
-
64
+
77
65
  return [guess[:word]]
78
66
  end
79
67
  end
80
-
68
+
81
69
  ###
82
- # @author Jonathan Bradley Whited (@esotericpig)
70
+ # @author Jonathan Bradley Whited
83
71
  # @since 0.2.0
84
72
  ###
85
73
  class BestVariator < DictFormVariator
@@ -1,26 +1,14 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
24
12
  module NHKore
25
- VERSION = '0.3.7'
13
+ VERSION = '0.3.8'
26
14
  end
data/lib/nhkore/word.rb CHANGED
@@ -1,23 +1,11 @@
1
- #!/usr/bin/env ruby
2
1
  # encoding: UTF-8
3
2
  # frozen_string_literal: true
4
3
 
5
4
  #--
6
5
  # This file is part of NHKore.
7
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
- #
9
- # NHKore is free software: you can redistribute it and/or modify
10
- # it under the terms of the GNU Lesser General Public License as published by
11
- # the Free Software Foundation, either version 3 of the License, or
12
- # (at your option) any later version.
13
- #
14
- # NHKore is distributed in the hope that it will be useful,
15
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
- # GNU Lesser General Public License for more details.
18
- #
19
- # You should have received a copy of the GNU Lesser General Public License
20
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
6
+ # Copyright (c) 2020-2021 Jonathan Bradley Whited
7
+ #
8
+ # SPDX-License-Identifier: LGPL-3.0-or-later
21
9
  #++
22
10
 
23
11
 
@@ -29,7 +17,7 @@ require 'nhkore/util'
29
17
 
30
18
  module NHKore
31
19
  ###
32
- # @author Jonathan Bradley Whited (@esotericpig)
20
+ # @author Jonathan Bradley Whited
33
21
  # @since 0.1.0
34
22
  ###
35
23
  class Word
@@ -39,42 +27,42 @@ module NHKore
39
27
  attr_reader :kana
40
28
  attr_reader :kanji
41
29
  attr_reader :key
42
-
30
+
43
31
  def initialize(defn: nil,eng: nil,freq: 1,kana: nil,kanji: nil,unknown: nil,word: nil,**kargs)
44
32
  super()
45
-
46
- if !word.nil?()
47
- defn = word.defn if defn.nil?()
48
- eng = word.eng if eng.nil?()
49
- freq = word.freq if freq.nil?()
50
- kana = word.kana if kana.nil?()
51
- kanji = word.kanji if kanji.nil?()
33
+
34
+ if !word.nil?
35
+ defn = word.defn if defn.nil?
36
+ eng = word.eng if eng.nil?
37
+ freq = word.freq if freq.nil?
38
+ kana = word.kana if kana.nil?
39
+ kanji = word.kanji if kanji.nil?
52
40
  end
53
-
41
+
54
42
  raise ArgumentError,"freq[#{freq}] cannot be < 1" if freq < 1
55
-
56
- if !unknown.nil?()
43
+
44
+ if !unknown.nil?
57
45
  # kanji?() only tests if it contains kanji, so don't use kana?().
58
46
  if Util.kanji?(unknown)
59
47
  if !Util.empty_web_str?(kanji)
60
48
  raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]"
61
49
  end
62
-
50
+
63
51
  kanji = unknown
64
52
  else
65
53
  if !Util.empty_web_str?(kana)
66
54
  raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]"
67
55
  end
68
-
56
+
69
57
  kana = unknown
70
58
  end
71
59
  end
72
-
60
+
73
61
  kana = nil if Util.empty_web_str?(kana)
74
62
  kanji = nil if Util.empty_web_str?(kanji)
75
-
76
- raise ArgumentError,'kanji and kana cannot both be empty' if kana.nil?() && kanji.nil?()
77
-
63
+
64
+ raise ArgumentError,'kanji and kana cannot both be empty' if kana.nil? && kanji.nil?
65
+
78
66
  @defn = defn
79
67
  @eng = eng
80
68
  @freq = freq
@@ -82,118 +70,132 @@ module NHKore
82
70
  @kanji = kanji
83
71
  @key = "#{kanji}=#{kana}" # nil.to_s() is ''
84
72
  end
85
-
73
+
86
74
  def encode_with(coder)
87
75
  # Ignore @key because it will be the key in the YAML/Hash.
88
76
  # Order matters.
89
-
77
+
90
78
  coder[:kanji] = @kanji
91
79
  coder[:kana] = @kana
92
80
  coder[:freq] = @freq
93
81
  coder[:defn] = @defn
94
82
  coder[:eng] = @eng
95
83
  end
96
-
84
+
97
85
  def self.load_data(key,hash)
98
- key = key.to_s() # Change from a symbol
99
-
86
+ key = key.to_s # Change from a symbol
87
+
100
88
  word = Word.new(
101
89
  defn: hash[:defn],
102
90
  eng: hash[:eng],
103
91
  kana: hash[:kana],
104
92
  kanji: hash[:kanji]
105
93
  )
106
-
94
+
107
95
  if key != word.key
108
96
  raise ArgumentError,"the key from the hash[#{key}] does not match the generated key[#{word.key}]"
109
97
  end
110
-
111
- freq = hash[:freq].to_i() # nil.to_i() is 0
98
+
99
+ freq = hash[:freq].to_i # nil.to_i() is 0
112
100
  word.freq = freq if freq > 0
113
-
101
+
114
102
  return word
115
103
  end
116
-
104
+
117
105
  # Do not clean and/or strip spaces, as the raw text is important for
118
- # Defn and ArticleScraper.
106
+ # Defn and ArticleScraper.
107
+ #
108
+ # This originally only scraped 1 word, but multiple words were added
109
+ # after seeing this link for 産業能率大学, which is valid HTML:
110
+ # https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
111
+ #
112
+ # @return [Array<Word>] the scraped {Word}(s)
119
113
  def self.scrape_ruby_tag(tag,missingno: nil,url: nil)
120
114
  # First, try <rb> tags.
121
- kanji = tag.css('rb')
115
+ kanjis = tag.css('rb')
122
116
  # Second, try text nodes.
123
- kanji = tag.search('./text()') if kanji.length < 1
117
+ kanjis = tag.search('./text()') if kanjis.length < 1
124
118
  # Third, try non-<rt> tags, in case of being surrounded by <span>, <b>, etc.
125
- kanji = tag.search("./*[not(name()='rt')]") if kanji.length < 1
126
-
127
- raise ScrapeError,"no kanji at URL[#{url}] in tag[#{tag}]" if kanji.length < 1
128
- raise ScrapeError,"too many kanji at URL[#{url}] in tag[#{tag}]" if kanji.length > 1
129
-
130
- kanji = kanji[0].text
131
- kana = tag.css('rt')
132
-
133
- raise ScrapeError,"no kana at URL[#{url}] in tag[#{tag}]" if kana.length < 1
134
- raise ScrapeError,"too many kana at URL[#{url}] in tag[#{tag}]" if kana.length > 1
135
-
136
- kana = kana[0].text
137
-
138
- if !missingno.nil?()
139
- # Check kana first, since this is the typical scenario.
140
- # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
141
- # - '窓' in '(8)窓を開けて外の空気を入れましょう'
142
- if Util.empty_web_str?(kana)
143
- kana = missingno.kana_from_kanji(kanji)
144
-
145
- if !Util.empty_web_str?(kana)
146
- Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
147
- end
148
- elsif Util.empty_web_str?(kanji)
149
- kanji = missingno.kanji_from_kana(kana)
150
-
151
- if !Util.empty_web_str?(kanji)
152
- Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
119
+ kanjis = tag.search("./*[not(name()='rt')]") if kanjis.length < 1
120
+
121
+ kanas = tag.css('rt')
122
+
123
+ raise ScrapeError,"no kanji at URL[#{url}] in tag[#{tag}]" if kanjis.length < 1
124
+ raise ScrapeError,"no kana at URL[#{url}] in tag[#{tag}]" if kanas.length < 1
125
+
126
+ if kanjis.length != kanas.length
127
+ raise ScrapeError,"number of kanji & kana mismatch at URL[#{url}] in tag[#{tag}]"
128
+ end
129
+
130
+ words = []
131
+
132
+ (0...kanjis.length).each do |i|
133
+ kanji = kanjis[i].text
134
+ kana = kanas[i].text
135
+
136
+ # Uncomment for debugging; really need a logger.
137
+ #puts "Word[#{i}]: #{kanji} => #{kana}"
138
+
139
+ if !missingno.nil?
140
+ # Check kana first, since this is the typical scenario.
141
+ # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
142
+ # - '窓' in '(8)窓を開けて外の空気を入れましょう'
143
+ if Util.empty_web_str?(kana)
144
+ kana = missingno.kana_from_kanji(kanji)
145
+
146
+ if !Util.empty_web_str?(kana)
147
+ Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
148
+ end
149
+ elsif Util.empty_web_str?(kanji)
150
+ kanji = missingno.kanji_from_kana(kana)
151
+
152
+ if !Util.empty_web_str?(kanji)
153
+ Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
154
+ end
153
155
  end
154
156
  end
157
+
158
+ raise ScrapeError,"empty kanji at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kanji)
159
+ raise ScrapeError,"empty kana at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kana)
160
+
161
+ words << Word.new(kanji: kanji,kana: kana)
155
162
  end
156
-
157
- raise ScrapeError,"empty kanji at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kanji)
158
- raise ScrapeError,"empty kana at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kana)
159
-
160
- word = Word.new(kana: kana,kanji: kanji)
161
-
162
- return word
163
+
164
+ return words
163
165
  end
164
-
166
+
165
167
  # Do not clean and/or strip spaces, as the raw text is important for
166
- # Defn and ArticleScraper.
168
+ # Defn and ArticleScraper.
167
169
  def self.scrape_text_node(tag,url: nil)
168
170
  text = tag.text
169
-
171
+
170
172
  # No error; empty text is fine (not strictly kanji/kana only).
171
173
  return nil if Util.empty_web_str?(text)
172
-
174
+
173
175
  word = Word.new(unknown: text)
174
-
176
+
175
177
  return word
176
178
  end
177
-
178
- def kanji?()
179
+
180
+ def kanji?
179
181
  return !Util.empty_web_str?(@kanji)
180
182
  end
181
-
182
- def word()
183
- return kanji?() ? @kanji : @kana
183
+
184
+ def word
185
+ return kanji? ? @kanji : @kana
184
186
  end
185
-
186
- def to_s()
187
- s = ''.dup()
188
-
187
+
188
+ def to_s
189
+ s = ''.dup
190
+
189
191
  s << "'#{@key}': "
190
192
  s << "{ kanji=>'#{@kanji}'"
191
193
  s << ", kana=>'#{@kana}'"
192
194
  s << ", freq=>#{@freq}"
193
- s << ", defn=>'#{@defn.to_s().gsub("\n",'\\n')}'"
195
+ s << ", defn=>'#{@defn.to_s.gsub("\n",'\\n')}'"
194
196
  s << ", eng=>'#{@eng}'"
195
197
  s << ' }'
196
-
198
+
197
199
  return s
198
200
  end
199
201
  end
data/nhkore.gemspec CHANGED
@@ -1,42 +1,21 @@
1
1
  # encoding: UTF-8
2
2
  # frozen_string_literal: true
3
3
 
4
- #--
5
- # This file is part of NHKore.
6
- # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
7
- #
8
- # NHKore is free software: you can redistribute it and/or modify
9
- # it under the terms of the GNU Lesser General Public License as published by
10
- # the Free Software Foundation, either version 3 of the License, or
11
- # (at your option) any later version.
12
- #
13
- # NHKore is distributed in the hope that it will be useful,
14
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
- # GNU Lesser General Public License for more details.
17
- #
18
- # You should have received a copy of the GNU Lesser General Public License
19
- # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
20
- #++
21
-
22
-
23
- lib = File.expand_path(File.join('..','lib'),__FILE__)
24
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
25
-
26
- require 'nhkore/version'
27
-
28
- Gem::Specification.new() do |spec|
4
+
5
+ require_relative 'lib/nhkore/version'
6
+
7
+ Gem::Specification.new do |spec|
29
8
  spec.name = 'nhkore'
30
9
  spec.version = NHKore::VERSION
31
- spec.authors = ['Jonathan Bradley Whited (@esotericpig)']
32
- spec.email = ['bradley@esotericpig.com']
10
+ spec.authors = ['Jonathan Bradley Whited']
11
+ spec.email = ['code@esotericpig.com']
33
12
  spec.licenses = ['LGPL-3.0-or-later']
34
13
  spec.homepage = 'https://github.com/esotericpig/nhkore'
35
14
  spec.summary = 'NHK News Web (Easy) word frequency (core) scraper for Japanese language learners.'
36
15
  spec.description =
37
16
  'Scrapes NHK News Web (Easy) for the word frequency (core list) for Japanese language learners.' \
38
17
  ' Includes a CLI app and a scraper library.'
39
-
18
+
40
19
  spec.metadata = {
41
20
  'homepage_uri' => 'https://github.com/esotericpig/nhkore',
42
21
  'source_code_uri' => 'https://github.com/esotericpig/nhkore',
@@ -46,66 +25,66 @@ Gem::Specification.new() do |spec|
46
25
  #'wiki_uri' => '',
47
26
  #'mailing_list_uri' => '',
48
27
  }
49
-
28
+
50
29
  spec.requirements = [
51
30
  'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html',
52
31
  ]
53
-
54
- spec.required_ruby_version = '>= 2.4'
32
+
33
+ spec.required_ruby_version = '>= 2.5'
55
34
  spec.require_paths = ['lib']
56
35
  spec.bindir = 'bin'
57
36
  spec.executables = [spec.name]
58
-
37
+
59
38
  spec.files = [
60
39
  Dir.glob(File.join("{#{spec.require_paths.join(',')}}",'**','*.{erb,rb}')),
61
40
  Dir.glob(File.join(spec.bindir,'*')),
62
41
  Dir.glob(File.join('{samples,test,yard}','**','*.{erb,rb}')),
63
42
  %W[ Gemfile Gemfile.lock #{spec.name}.gemspec Rakefile .yardopts ],
64
43
  %w[ CHANGELOG.md LICENSE.txt README.md ],
65
- ].flatten()
66
-
44
+ ].flatten
45
+
67
46
  spec.add_runtime_dependency 'attr_bool' ,'~> 0.2' # For attr_accessor?/attr_reader?
68
47
  spec.add_runtime_dependency 'bimyou_segmenter' ,'~> 1.2' # For splitting Japanese sentences into words
69
48
  spec.add_runtime_dependency 'cri' ,'~> 2.15' # For CLI commands/options
70
- spec.add_runtime_dependency 'down' ,'~> 5.1' # For downloading files (GetCmd)
49
+ spec.add_runtime_dependency 'down' ,'~> 5.2' # For downloading files (GetCmd)
71
50
  spec.add_runtime_dependency 'highline' ,'~> 2.0' # For CLI input/output
72
51
  spec.add_runtime_dependency 'http-cookie' ,'~> 1.0' # For parsing/setting cookies (BingScraper/Scraper)
73
52
  spec.add_runtime_dependency 'japanese_deinflector' ,'~> 0.0' # For unconjugating Japanese words (plain/dictionary form)
74
- spec.add_runtime_dependency 'nokogiri' ,'~> 1.10' # For scraping/hacking
53
+ spec.add_runtime_dependency 'nokogiri' ,'~> 1.11' # For scraping/hacking
75
54
  spec.add_runtime_dependency 'psychgus' ,'~> 1.3' # For styling Psych YAML
76
55
  spec.add_runtime_dependency 'public_suffix' ,'~> 4.0' # For parsing URL domain names
77
56
  spec.add_runtime_dependency 'rainbow' ,'~> 3.0' # For CLI color output
78
57
  spec.add_runtime_dependency 'rubyzip' ,'~> 2.3' # For extracting Zip files (GetCmd)
79
58
  spec.add_runtime_dependency 'tiny_segmenter' ,'~> 0.0' # For splitting Japanese sentences into words
80
- spec.add_runtime_dependency 'tty-progressbar' ,'~> 0.17' # For CLI progress bars
59
+ spec.add_runtime_dependency 'tty-progressbar' ,'~> 0.18' # For CLI progress bars
81
60
  spec.add_runtime_dependency 'tty-spinner' ,'~> 0.9' # For CLI spinning progress
82
-
83
- spec.add_development_dependency 'bundler' ,'~> 2.1'
61
+
62
+ spec.add_development_dependency 'bundler' ,'~> 2.2'
84
63
  spec.add_development_dependency 'minitest' ,'~> 5.14'
85
64
  spec.add_development_dependency 'rake' ,'~> 13.0'
86
65
  spec.add_development_dependency 'raketeer' ,'~> 0.2' # For extra Rake tasks
87
- spec.add_development_dependency 'rdoc' ,'~> 6.2' # For YARDoc RDoc (*.rb)
66
+ spec.add_development_dependency 'rdoc' ,'~> 6.3' # For YARDoc RDoc (*.rb)
88
67
  spec.add_development_dependency 'redcarpet' ,'~> 3.5' # For YARDoc Markdown (*.md)
89
68
  spec.add_development_dependency 'yard' ,'~> 0.9' # For documentation
90
69
  spec.add_development_dependency 'yard_ghurt','~> 1.2' # For extra YARDoc Rake tasks
91
-
92
- spec.post_install_message = <<-EOM
93
-
70
+
71
+ spec.post_install_message = <<-MSG
72
+
94
73
  NHKore v#{NHKore::VERSION}
95
-
74
+
96
75
  You can now use [#{spec.executables.join(', ')}] on the command line.
97
-
76
+
98
77
  Homepage: #{spec.homepage}
99
-
78
+
100
79
  Code: #{spec.metadata['source_code_uri']}
101
80
  Bugs: #{spec.metadata['bug_tracker_uri']}
102
-
81
+
103
82
  Changelog: #{spec.metadata['changelog_uri']}
104
-
105
- EOM
106
-
83
+
84
+ MSG
85
+
107
86
  spec.extra_rdoc_files = %w[ CHANGELOG.md LICENSE.txt README.md ]
108
-
87
+
109
88
  spec.rdoc_options = [
110
89
  '--hyperlink-all','--show-hash',
111
90
  '--title',"NHKore v#{NHKore::VERSION} Doc",