tataki 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aa8fec9bc8527b014ade8528d1cd9cefa2223c59
4
- data.tar.gz: 39f9543bbcdf66bb83fc3062a019c126064cbde2
3
+ metadata.gz: c75759d9be482c52fc40c1b59d543fa709877f9d
4
+ data.tar.gz: d2f4bc1514cd2eb3d4a8b57286509b2e91442db3
5
5
  SHA512:
6
- metadata.gz: d1b330894a4b2bd8d159b6ebabb3be71486ba820f87feee4a744682a83e3a93936acbeacd48bfd44cf80ecec99f197b2da6d354e97e2048e067fe1bfe73492e0
7
- data.tar.gz: ff9e1b5643bb7d748bfe0fadfcd191ebf270103198662d5e2f6c2d4aa3171d8c29bc4bebbe8a58ce4cbf67853ea9a8d87a5f2a43d4c1c5de2cbdff158627c4a5
6
+ metadata.gz: 6b1cc2f50a0302cb959b6414d56a8c3e565286b9284ec07ecf328caefb6735a23d1aa83f09e1abdf5be208c5811a0b5936cb422beb192119209678a2c45ef599
7
+ data.tar.gz: 315cf59f199dfa3532293ae19252083ce58a2e305ea0ee3421bebd1fb66f99c0d89c75ea3df93943e7fb3f0060a732c1c4519dbb10aeb76d6313a6d3b5743224
data/README.md CHANGED
@@ -36,11 +36,12 @@ require "tataki/base"
36
36
  alphabet_converter = Tataki::Converter::Alphabet.new
37
37
  alphabet_converter.to_kana("abcde") # => "えーびーしーでぃーいー"
38
38
 
39
- roman_alphabet_converter = Tataki::Converter::Combine.new(Tataki::Converter::Roman.new, Tataki::Converter::Alphabet.new)
40
- roman_alphabet_converter.to_kana("robottotaisennf") # => "ろぼっとたいせんえふ"
41
-
42
39
  skk_converter = Tataki::Converter::SkkJisyo.new
43
40
  skk_converter.to_kana("研究者") # => "けんきゅうしゃ"
41
+
42
+ alphabet_skk_converter = Tataki::Converter::Combine.new(Tataki::Converter::Alphabet.new, Tataki::Converter::SkkJisyo.new)
43
+ alphabet_skk_converter.to_kana("X線研究者") # => "えっくすせんけんきゅうしゃ"
44
+
44
45
  ```
45
46
 
46
47
  ## TODO
@@ -0,0 +1,17 @@
1
+ require "benchmark"
2
+
3
+ N = 1000
4
+
5
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
6
+
7
+ require 'tataki/base'
8
+ converter = Tataki::Converter::SkkJisyo.new(%w[M jinmei])
9
+
10
+ source = "かな漢字変換" * 100
11
+
12
+ puts Benchmark::CAPTION
13
+ puts Benchmark.measure {
14
+ N.times do
15
+ converter.to_kana(source)
16
+ end
17
+ }
@@ -2,7 +2,6 @@
2
2
  require "yaml"
3
3
  require "time"
4
4
  require "skk/jisyo"
5
- require "trie"
6
5
 
7
6
  module Tataki
8
7
  module Converter
@@ -12,30 +11,34 @@ module Tataki
12
11
 
13
12
  def initialize(jisyo_types = DEFAULT_JISYO_SUFFIXES)
14
13
  @jisyo_paths = jisyo_types.map{|suffix| Skk::Jisyo.path(suffix) }
15
- @trie_cache_path = trie_cache_path(jisyo_types.join("_"))
14
+ @table_cache_path = table_cache_path(jisyo_types.join("_"))
16
15
 
17
16
  config_file = File.expand_path(DEFAULT_CONFIG_PATH, __FILE__)
18
17
  config_data = YAML.load_file(config_file)
19
18
  @roman_data = config_data["roman_table"]
20
19
  @ignore_kana = config_data["ignore_kana"]
21
- @trie = setup_jisyo.freeze
20
+ tables = setup_jisyo
21
+ @match_table = tables[0].freeze
22
+ @okurigana_table = tables[1].freeze
22
23
  end
23
24
 
24
25
  def setup_jisyo
25
- if File.exist?(@trie_cache_path)
26
- trie = Marshal.load(File.read(@trie_cache_path))
26
+ if File.exist?(@table_cache_path)
27
+ tables = Marshal.load(File.read(@table_cache_path))
27
28
  else
28
- trie = Trie.new
29
+ match_table = {}
30
+ okurigana_table = {}
29
31
  @jisyo_paths.each do |jisyo_path|
30
- add_jisyo(trie, jisyo_path)
32
+ add_jisyo(match_table, okurigana_table, jisyo_path)
31
33
  end
32
- File.binwrite(@trie_cache_path, Marshal.dump(trie))
33
- File.write("#{@trie_cache_path}.timestamp", Time.now.to_s)
34
+ tables = [match_table, okurigana_table]
35
+ File.binwrite(@table_cache_path, Marshal.dump(tables))
36
+ File.write("#{@table_cache_path}.timestamp", Time.now.to_s)
34
37
  end
35
- trie
38
+ tables
36
39
  end
37
40
 
38
- def add_jisyo(trie, jisyo_path)
41
+ def add_jisyo(match_table, okurigana_table, jisyo_path)
39
42
  File.open(jisyo_path, "rb:euc-jp") do |jisyo_file|
40
43
  jisyo_file.each_line do |line|
41
44
  next if line.empty? || line[0] == ";" || line.include?("#")
@@ -44,8 +47,14 @@ module Tataki
44
47
  kana.gsub!(/[^ぁ-んa-z]/, "")
45
48
  next if kana.empty? || !(kana =~ /^[ぁ-ん]+[a-z]?/) || @ignore_kana.include?(kana)
46
49
  kanji_part.gsub!(/^\/|;.+|\/$/, "")
50
+
51
+ table = kana =~ /^(.+)([a-z])$/ ? okurigana_table : match_table
47
52
  kanji_part.split("/").each do |kanji|
48
- trie.insert(kanji, kana)
53
+ kanji_prefix = kanji[0]
54
+ table_entry = table[kanji_prefix]
55
+ table[kanji_prefix] = table_entry = [] unless table_entry
56
+ table_entry.push($2 ? [kanji, $1, $2] : [kanji, kana])
57
+ table_entry.sort_by!{|entry| - (entry[0].size) }
49
58
  end
50
59
  end
51
60
  end
@@ -55,8 +64,8 @@ module Tataki
55
64
  File.expand_path("../../../../data/jisyo", __FILE__)
56
65
  end
57
66
 
58
- def trie_cache_path(name)
59
- File.join(jisyo_path, "SKK-JISYO.#{name}.trie.cache")
67
+ def table_cache_path(name)
68
+ File.join(jisyo_path, "SKK-JISYO.#{name}.table.cache")
60
69
  end
61
70
 
62
71
  def jisyo_timestamp(path)
@@ -64,61 +73,49 @@ module Tataki
64
73
  end
65
74
 
66
75
  def to_kana(sentence)
67
- _to_kana(sentence, "", "", @trie)
76
+ _to_kana(sentence, "")
68
77
  end
69
78
 
70
79
  private
71
80
 
72
- def _to_kana(sentence, kana, prefix, trie, through_alphabet = true)
73
- return if trie.empty?
81
+ def _to_kana(sentence, kana)
74
82
  return kana if sentence.empty?
75
83
 
76
- next_ch = sentence[0]
77
- next_sentence = sentence[1..-1]
78
- next_trie = trie.find_prefix(next_ch)
79
- next_trie_values = next_trie.values
80
- next_trie_values.reject!{|value| value =~ /[a-z]/ }
81
- next_set = next_trie.find([])
82
- next_set_values = next_set.values
83
- okurigana = find_okurigana(next_set_values, next_sentence)
84
- next_set_values.reject!{|value| value =~ /[a-z]/ }
85
- if okurigana
86
- return _to_kana(next_sentence, kana + okurigana, "", @trie)
87
- elsif next_set_values.size > 0 && next_set_values.size == next_trie_values.size
88
- return _to_kana(next_sentence, kana + next_set_values.sample, "", @trie)
89
- end
90
-
91
- if next_sentence.empty?
92
- if next_set_values.size > 0
93
- return kana + next_set_values.sample
94
- elsif through_alphabet
95
- return kana + prefix + next_ch
96
- end
84
+ table_entry = find_okurigana_entry(sentence) || find_match_entry(sentence)
85
+ if table_entry
86
+ next_kanji = table_entry[0]
87
+ next_kana = table_entry[1]
88
+ next_sentence = sentence[next_kanji.size .. -1]
89
+ return _to_kana(next_sentence, kana + next_kana)
97
90
  end
98
91
 
99
- next_kana = _to_kana(next_sentence, kana, prefix + next_ch, next_trie, false)
100
-
101
- if next_kana
102
- return next_kana
103
- end
92
+ return _to_kana(sentence[1 .. -1], kana + sentence[0])
93
+ end
104
94
 
105
- if next_set_values.size > 0
106
- return _to_kana(next_sentence, kana + next_set_values.sample, "", @trie)
107
- elsif through_alphabet
108
- return _to_kana(next_sentence, kana + prefix + next_ch, "", @trie)
109
- else
110
- return nil
95
+ def find_okurigana_entry(sentence)
96
+ entries = @okurigana_table[sentence[0]]
97
+ return unless entries
98
+
99
+ entries.each do |entry|
100
+ kanji, yomi, alphabet = *entry
101
+ next unless sentence.start_with?(kanji)
102
+ next_ch = sentence[kanji.size]
103
+ okurigana_candidates = @roman_data[alphabet]
104
+ next unless okurigana_candidates
105
+ okurigana_candidates.each do |okurigana|
106
+ return entry if okurigana == next_ch
107
+ end
111
108
  end
109
+ nil
112
110
  end
113
111
 
114
- def find_okurigana(yomi_candidates, next_sentence)
115
- yomi_candidates.each do |yomi|
116
- next unless yomi =~ /.+([a-z])$/
117
- okurigana_yomi = @roman_data[$1]
118
- next unless okurigana_yomi
119
- okurigana_yomi.each do |okurigana|
120
- return yomi.gsub(/[a-z]$/, "") if next_sentence.start_with?(okurigana)
121
- end
112
+ def find_match_entry(sentence)
113
+ entries = @match_table[sentence[0]]
114
+ return unless entries
115
+
116
+ entries.each do |entry|
117
+ kanji, yomi = *entry
118
+ return entry if sentence.start_with?(kanji)
122
119
  end
123
120
  nil
124
121
  end
@@ -1,3 +1,3 @@
1
1
  module Tataki
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -1,2 +1,3 @@
1
+ require 'pry'
1
2
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
- require 'tataki'
3
+ require 'tataki/base'
@@ -3,7 +3,6 @@ require "spec_helper"
3
3
 
4
4
  describe Tataki::Converter::Combine do
5
5
  let(:skk_converter) { Tataki::Converter::SkkJisyo.new }
6
- let(:roman_converter) { Tataki::Converter::Roman.new }
7
6
  let(:alphabet_converter) { Tataki::Converter::Alphabet.new }
8
7
 
9
8
  describe ".to_kana" do
@@ -13,14 +12,6 @@ describe Tataki::Converter::Combine do
13
12
  end
14
13
  end
15
14
 
16
- context "when roman + alphabet" do
17
- let(:converter) do
18
- Tataki::Converter::Combine.new(roman_converter, alphabet_converter)
19
- end
20
-
21
- include_examples "converts_kana", "robottotaisennf", "ろぼっとたいせんえふ"
22
- end
23
-
24
15
  context "when skk-jisyo + alphabet" do
25
16
  let(:converter) do
26
17
  Tataki::Converter::Combine.new(skk_converter, alphabet_converter)
@@ -2,7 +2,6 @@
2
2
  require "spec_helper"
3
3
 
4
4
  describe Tataki::Converter::SkkJisyo do
5
-
6
5
  describe ".to_kana" do
7
6
  shared_examples "converts_kana" do |sentence, kana|
8
7
  it "converts #{sentence.inspect} to #{kana.inspect}" do
@@ -29,5 +28,15 @@ describe Tataki::Converter::SkkJisyo do
29
28
  include_examples "converts_kana", "漢字", "漢字"
30
29
  include_examples "converts_kana", "半澤直樹", "はんざわなおき"
31
30
  end
31
+
32
+ context "with M, jinmei jisyo" do
33
+ let(:converter) { Tataki::Converter::SkkJisyo.new(%w[M jinmei]) }
34
+
35
+ include_examples "converts_kana", "", ""
36
+ include_examples "converts_kana", "漢字", "かんじ"
37
+ include_examples "converts_kana", "半澤直樹", "はんざわなおき"
38
+ include_examples "converts_kana", "半澤直樹倍返し", "はんざわなおきばいかえし"
39
+ include_examples "converts_kana", "半澤直樹、銀行を買う", "はんざわなおき、ぎんこうをかう"
40
+ end
32
41
  end
33
42
  end
@@ -9,7 +9,6 @@ describe Tataki do
9
9
  describe ".converters" do
10
10
  it "returns converters" do
11
11
  expect(Tataki.converters).to match_array([
12
- Tataki::Converter::Roman,
13
12
  Tataki::Converter::Alphabet,
14
13
  Tataki::Converter::Combine,
15
14
  Tataki::Converter::SkkJisyo,
@@ -18,6 +17,8 @@ describe Tataki do
18
17
  end
19
18
 
20
19
  describe "String.to_kana" do
20
+ before { require "tataki" }
21
+
21
22
  it "converts to kana" do
22
23
  expect("X線研究者".to_kana).to eq("えっくすせんけんきゅうしゃ")
23
24
  end
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ["lib"]
20
20
 
21
21
  spec.add_dependency "skk-jisyo", "~> 0.0.5"
22
- spec.add_dependency "trie"
23
22
 
24
23
  spec.add_development_dependency "bundler", "~> 1.3"
25
24
  spec.add_development_dependency "rake"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tataki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - hogelog
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: 0.0.5
27
- - !ruby/object:Gem::Dependency
28
- name: trie
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - '>='
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - '>='
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: bundler
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -137,22 +123,20 @@ files:
137
123
  - LICENSE.txt
138
124
  - README.md
139
125
  - Rakefile
126
+ - benchmark/001-M_jinmei.rb
140
127
  - data/alphabet.yml
141
128
  - data/jisyo/.gitignore
142
- - data/roman.yml
143
129
  - data/skk-jisyo.yml
144
130
  - lib/tataki.rb
145
131
  - lib/tataki/base.rb
146
132
  - lib/tataki/converters.rb
147
133
  - lib/tataki/converters/alphabet.rb
148
134
  - lib/tataki/converters/combine.rb
149
- - lib/tataki/converters/roman.rb
150
135
  - lib/tataki/converters/skk_jisyo.rb
151
136
  - lib/tataki/version.rb
152
137
  - spec/spec_helper.rb
153
138
  - spec/tataki/converters/alphabet_spec.rb
154
139
  - spec/tataki/converters/combine_spec.rb
155
- - spec/tataki/converters/roman_spec.rb
156
140
  - spec/tataki/converters/skk_jisyo_spec.rb
157
141
  - spec/tataki_spec.rb
158
142
  - tataki.gemspec
@@ -184,7 +168,6 @@ test_files:
184
168
  - spec/spec_helper.rb
185
169
  - spec/tataki/converters/alphabet_spec.rb
186
170
  - spec/tataki/converters/combine_spec.rb
187
- - spec/tataki/converters/roman_spec.rb
188
171
  - spec/tataki/converters/skk_jisyo_spec.rb
189
172
  - spec/tataki_spec.rb
190
173
  has_rdoc:
@@ -1,142 +0,0 @@
1
- table:
2
- "a": あ
3
- "i": い
4
- "u": う
5
- "e": え
6
- "o": お
7
- "ka": か
8
- "ki": き
9
- "ku": く
10
- "ke": け
11
- "ko": こ
12
- "ga": が
13
- "gi": ぎ
14
- "gu": ぐ
15
- "ge": げ
16
- "go": ご
17
- "sa": さ
18
- "si": し
19
- "shi": し
20
- "su": す
21
- "se": せ
22
- "so": そ
23
- "za": ざ
24
- "zi": じ
25
- "ji": じ
26
- "zu": ず
27
- "ze": ぜ
28
- "zo": ぞ
29
- "ta": た
30
- "ti": ち
31
- "chi": ち
32
- "tu": つ
33
- "tsu": つ
34
- "te": て
35
- "to": と
36
- "da": だ
37
- "di": ぢ
38
- "du": づ
39
- "de": で
40
- "do": ど
41
- "na": な
42
- "ni": に
43
- "nu": ぬ
44
- "ne": ね
45
- "no": の
46
- "ha": は
47
- "hi": ひ
48
- "hu": ふ
49
- "fu": ふ
50
- "he": へ
51
- "ho": ほ
52
- "ba": ば
53
- "bi": び
54
- "bu": ぶ
55
- "be": べ
56
- "bo": ぼ
57
- "pa": ぱ
58
- "pi": ぴ
59
- "pu": ぷ
60
- "pe": ぺ
61
- "po": ぽ
62
- "ma": ま
63
- "mi": み
64
- "mu": む
65
- "me": め
66
- "mo": も
67
- "ya": や
68
- "yu": ゆ
69
- "yo": よ
70
- "ra": ら
71
- "ri": り
72
- "ru": る
73
- "re": れ
74
- "ro": ろ
75
- "wa": わ
76
- "wo": を
77
- "n": ん
78
- "nn": ん
79
- "xa": ぁ
80
- "la": ぁ
81
- "xi": ぃ
82
- "li": ぃ
83
- "xu": ぅ
84
- "lu": ぅ
85
- "xe": ぇ
86
- "le": ぇ
87
- "xo": ぉ
88
- "lo": ぉ
89
- "kya": きゃ
90
- "kyu": きゅ
91
- "kyo": きょ
92
- "gya": ぎゃ
93
- "gyu": ぎゅ
94
- "gyo": ぎょ
95
- "zya": じゃ
96
- "sya": しゃ
97
- "sha": しゃ
98
- "syu": しゅ
99
- "shu": しゅ
100
- "syo": しょ
101
- "sho": しょ
102
- "ja": じゃ
103
- "zyu": じゅ
104
- "ju": じゅ
105
- "zyo": じょ
106
- "jo": じょ
107
- "tya": ちゃ
108
- "cha": ちゃ
109
- "tyu": ちゅ
110
- "chu": ちゅ
111
- "tyo": ちょ
112
- "cho": ちょ
113
- "dya": ぢゃ
114
- "dyu": ぢゅ
115
- "dyo": ぢょ
116
- "nya": にゃ
117
- "nyu": にゅ
118
- "nyo": にょ
119
- "hya": ひゃ
120
- "hyu": ひゅ
121
- "hyo": ひょ
122
- "bya": びゃ
123
- "byu": びゅ
124
- "byo": びょ
125
- "pya": ぴゃ
126
- "pyu": ぴゅ
127
- "pyo": ぴょ
128
- "mya": みゃ
129
- "myu": みゅ
130
- "myo": みょ
131
- "xya": ゃ
132
- "lya": ゃ
133
- "xyu": ゅ
134
- "lyu": ゅ
135
- "xyo": ょ
136
- "lyo": ょ
137
- "rya": りゃ
138
- "ryu": りゅ
139
- "ryo": りょ
140
- "xwa": ゎ
141
- "lwa": ゎ
142
- consonant: [k, g, s, j, t, c, d, n, h, f, b, p, m, y, r, w, x, l]
@@ -1,67 +0,0 @@
1
- # coding: utf-8
2
- require "trie"
3
- require "yaml"
4
-
5
- module Tataki
6
- module Converter
7
- class Roman < Base
8
- SOKUON = "っ"
9
-
10
- def initialize
11
- @trie = Trie.new
12
- roman_file = File.expand_path("../../../../data/roman.yml", __FILE__)
13
- roman_data = YAML.load_file(roman_file)
14
- roman_data["table"].each do |roman, kana|
15
- @trie.insert(roman, kana)
16
- end
17
- @consonant = roman_data["consonant"]
18
- @trie.freeze
19
- end
20
-
21
- def to_kana(sentence)
22
- _to_kana(sentence.downcase, "", "", @trie)
23
- end
24
-
25
- private
26
-
27
- def _to_kana(sentence, kana, prefix, trie, through_alphabet = true)
28
- return if trie.empty?
29
- return kana if sentence.empty?
30
-
31
- next_ch = sentence[0]
32
- next_sentence = sentence[1..-1]
33
- next_trie = trie.find_prefix(next_ch)
34
- next_set = next_trie.find([])
35
- if next_set.size > 0 && next_set.size == next_trie.size
36
- return _to_kana(next_sentence, kana + next_set.values.first, "", @trie)
37
- end
38
-
39
- if next_sentence.empty?
40
- if next_set.size > 0
41
- return kana + prefix + next_set.values.first
42
- else
43
- return kana + prefix + next_ch
44
- end
45
- end
46
-
47
- next_kana = _to_kana(next_sentence, kana, prefix + next_ch, next_trie, false)
48
-
49
- if next_kana
50
- return next_kana
51
- end
52
-
53
- if next_set.size > 0
54
- return _to_kana(next_sentence, kana + next_set.values.first, "", @trie)
55
- elsif @consonant.include?(next_ch) && next_sentence.start_with?(next_ch)
56
- return _to_kana(next_sentence, kana + SOKUON, "", @trie)
57
- elsif through_alphabet
58
- return _to_kana(next_sentence, kana + prefix + next_ch, "", @trie)
59
- else
60
- return nil
61
- end
62
- end
63
- end
64
- end
65
-
66
- Tataki::CONVERTERS << Converter::Roman
67
- end
@@ -1,30 +0,0 @@
1
- # coding: utf-8
2
- require "spec_helper"
3
-
4
- describe Tataki::Converter::Roman do
5
- let(:converter) { Tataki::Converter::Roman.new }
6
-
7
- describe ".to_kana" do
8
- shared_examples "converts_kana" do |sentence, kana|
9
- it "converts #{sentence.inspect} to #{kana.inspect}" do
10
- expect(converter.to_kana(sentence)).to eq(kana)
11
- end
12
- end
13
-
14
- include_examples "converts_kana", "", ""
15
- include_examples "converts_kana", "hoge", "ほげ"
16
- include_examples "converts_kana", "hogelog", "ほげぉg"
17
- include_examples "converts_kana", "hogge", "ほっげ"
18
- include_examples "converts_kana", "hogs", "ほgs"
19
- include_examples "converts_kana", "nanka", "なんか"
20
- include_examples "converts_kana", "nannnan", "なんなん"
21
- include_examples "converts_kana", "nannnann", "なんなん"
22
- include_examples "converts_kana", "nannnannsei", "なんなんせい"
23
- include_examples "converts_kana", "kukkingu", "くっきんぐ"
24
- include_examples "converts_kana", "kukkingu papa", "くっきんぐ ぱぱ"
25
- include_examples "converts_kana", "toukyoutokkyokyokakyoku", "とうきょうとっきょきょかきょく"
26
-
27
- include_examples "converts_kana", "kku", "っく"
28
- include_examples "converts_kana", ",,", ",,"
29
- end
30
- end