tataki 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aa8fec9bc8527b014ade8528d1cd9cefa2223c59
4
- data.tar.gz: 39f9543bbcdf66bb83fc3062a019c126064cbde2
3
+ metadata.gz: c75759d9be482c52fc40c1b59d543fa709877f9d
4
+ data.tar.gz: d2f4bc1514cd2eb3d4a8b57286509b2e91442db3
5
5
  SHA512:
6
- metadata.gz: d1b330894a4b2bd8d159b6ebabb3be71486ba820f87feee4a744682a83e3a93936acbeacd48bfd44cf80ecec99f197b2da6d354e97e2048e067fe1bfe73492e0
7
- data.tar.gz: ff9e1b5643bb7d748bfe0fadfcd191ebf270103198662d5e2f6c2d4aa3171d8c29bc4bebbe8a58ce4cbf67853ea9a8d87a5f2a43d4c1c5de2cbdff158627c4a5
6
+ metadata.gz: 6b1cc2f50a0302cb959b6414d56a8c3e565286b9284ec07ecf328caefb6735a23d1aa83f09e1abdf5be208c5811a0b5936cb422beb192119209678a2c45ef599
7
+ data.tar.gz: 315cf59f199dfa3532293ae19252083ce58a2e305ea0ee3421bebd1fb66f99c0d89c75ea3df93943e7fb3f0060a732c1c4519dbb10aeb76d6313a6d3b5743224
data/README.md CHANGED
@@ -36,11 +36,12 @@ require "tataki/base"
36
36
  alphabet_converter = Tataki::Converter::Alphabet.new
37
37
  alphabet_converter.to_kana("abcde") # => "えーびーしーでぃーいー"
38
38
 
39
- roman_alphabet_converter = Tataki::Converter::Combine.new(Tataki::Converter::Roman.new, Tataki::Converter::Alphabet.new)
40
- roman_alphabet_converter.to_kana("robottotaisennf") # => "ろぼっとたいせんえふ"
41
-
42
39
  skk_converter = Tataki::Converter::SkkJisyo.new
43
40
  skk_converter.to_kana("研究者") # => "けんきゅうしゃ"
41
+
42
+ alphabet_skk_converter = Tataki::Converter::Combine.new(Tataki::Converter::Alphabet.new, Tataki::Converter::SkkJisyo.new)
43
+ alphabet_skk_converter.to_kana("X線研究者") # => "えっくすせんけんきゅうしゃ"
44
+
44
45
  ```
45
46
 
46
47
  ## TODO
@@ -0,0 +1,17 @@
1
+ require "benchmark"
2
+
3
+ N = 1000
4
+
5
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
6
+
7
+ require 'tataki/base'
8
+ converter = Tataki::Converter::SkkJisyo.new(%w[M jinmei])
9
+
10
+ source = "かな漢字変換" * 100
11
+
12
+ puts Benchmark::CAPTION
13
+ puts Benchmark.measure {
14
+ N.times do
15
+ converter.to_kana(source)
16
+ end
17
+ }
@@ -2,7 +2,6 @@
2
2
  require "yaml"
3
3
  require "time"
4
4
  require "skk/jisyo"
5
- require "trie"
6
5
 
7
6
  module Tataki
8
7
  module Converter
@@ -12,30 +11,34 @@ module Tataki
12
11
 
13
12
  def initialize(jisyo_types = DEFAULT_JISYO_SUFFIXES)
14
13
  @jisyo_paths = jisyo_types.map{|suffix| Skk::Jisyo.path(suffix) }
15
- @trie_cache_path = trie_cache_path(jisyo_types.join("_"))
14
+ @table_cache_path = table_cache_path(jisyo_types.join("_"))
16
15
 
17
16
  config_file = File.expand_path(DEFAULT_CONFIG_PATH, __FILE__)
18
17
  config_data = YAML.load_file(config_file)
19
18
  @roman_data = config_data["roman_table"]
20
19
  @ignore_kana = config_data["ignore_kana"]
21
- @trie = setup_jisyo.freeze
20
+ tables = setup_jisyo
21
+ @match_table = tables[0].freeze
22
+ @okurigana_table = tables[1].freeze
22
23
  end
23
24
 
24
25
  def setup_jisyo
25
- if File.exist?(@trie_cache_path)
26
- trie = Marshal.load(File.read(@trie_cache_path))
26
+ if File.exist?(@table_cache_path)
27
+ tables = Marshal.load(File.read(@table_cache_path))
27
28
  else
28
- trie = Trie.new
29
+ match_table = {}
30
+ okurigana_table = {}
29
31
  @jisyo_paths.each do |jisyo_path|
30
- add_jisyo(trie, jisyo_path)
32
+ add_jisyo(match_table, okurigana_table, jisyo_path)
31
33
  end
32
- File.binwrite(@trie_cache_path, Marshal.dump(trie))
33
- File.write("#{@trie_cache_path}.timestamp", Time.now.to_s)
34
+ tables = [match_table, okurigana_table]
35
+ File.binwrite(@table_cache_path, Marshal.dump(tables))
36
+ File.write("#{@table_cache_path}.timestamp", Time.now.to_s)
34
37
  end
35
- trie
38
+ tables
36
39
  end
37
40
 
38
- def add_jisyo(trie, jisyo_path)
41
+ def add_jisyo(match_table, okurigana_table, jisyo_path)
39
42
  File.open(jisyo_path, "rb:euc-jp") do |jisyo_file|
40
43
  jisyo_file.each_line do |line|
41
44
  next if line.empty? || line[0] == ";" || line.include?("#")
@@ -44,8 +47,14 @@ module Tataki
44
47
  kana.gsub!(/[^ぁ-んa-z]/, "")
45
48
  next if kana.empty? || !(kana =~ /^[ぁ-ん]+[a-z]?/) || @ignore_kana.include?(kana)
46
49
  kanji_part.gsub!(/^\/|;.+|\/$/, "")
50
+
51
+ table = kana =~ /^(.+)([a-z])$/ ? okurigana_table : match_table
47
52
  kanji_part.split("/").each do |kanji|
48
- trie.insert(kanji, kana)
53
+ kanji_prefix = kanji[0]
54
+ table_entry = table[kanji_prefix]
55
+ table[kanji_prefix] = table_entry = [] unless table_entry
56
+ table_entry.push($2 ? [kanji, $1, $2] : [kanji, kana])
57
+ table_entry.sort_by!{|entry| - (entry[0].size) }
49
58
  end
50
59
  end
51
60
  end
@@ -55,8 +64,8 @@ module Tataki
55
64
  File.expand_path("../../../../data/jisyo", __FILE__)
56
65
  end
57
66
 
58
- def trie_cache_path(name)
59
- File.join(jisyo_path, "SKK-JISYO.#{name}.trie.cache")
67
+ def table_cache_path(name)
68
+ File.join(jisyo_path, "SKK-JISYO.#{name}.table.cache")
60
69
  end
61
70
 
62
71
  def jisyo_timestamp(path)
@@ -64,61 +73,49 @@ module Tataki
64
73
  end
65
74
 
66
75
  def to_kana(sentence)
67
- _to_kana(sentence, "", "", @trie)
76
+ _to_kana(sentence, "")
68
77
  end
69
78
 
70
79
  private
71
80
 
72
- def _to_kana(sentence, kana, prefix, trie, through_alphabet = true)
73
- return if trie.empty?
81
+ def _to_kana(sentence, kana)
74
82
  return kana if sentence.empty?
75
83
 
76
- next_ch = sentence[0]
77
- next_sentence = sentence[1..-1]
78
- next_trie = trie.find_prefix(next_ch)
79
- next_trie_values = next_trie.values
80
- next_trie_values.reject!{|value| value =~ /[a-z]/ }
81
- next_set = next_trie.find([])
82
- next_set_values = next_set.values
83
- okurigana = find_okurigana(next_set_values, next_sentence)
84
- next_set_values.reject!{|value| value =~ /[a-z]/ }
85
- if okurigana
86
- return _to_kana(next_sentence, kana + okurigana, "", @trie)
87
- elsif next_set_values.size > 0 && next_set_values.size == next_trie_values.size
88
- return _to_kana(next_sentence, kana + next_set_values.sample, "", @trie)
89
- end
90
-
91
- if next_sentence.empty?
92
- if next_set_values.size > 0
93
- return kana + next_set_values.sample
94
- elsif through_alphabet
95
- return kana + prefix + next_ch
96
- end
84
+ table_entry = find_okurigana_entry(sentence) || find_match_entry(sentence)
85
+ if table_entry
86
+ next_kanji = table_entry[0]
87
+ next_kana = table_entry[1]
88
+ next_sentence = sentence[next_kanji.size .. -1]
89
+ return _to_kana(next_sentence, kana + next_kana)
97
90
  end
98
91
 
99
- next_kana = _to_kana(next_sentence, kana, prefix + next_ch, next_trie, false)
100
-
101
- if next_kana
102
- return next_kana
103
- end
92
+ return _to_kana(sentence[1 .. -1], kana + sentence[0])
93
+ end
104
94
 
105
- if next_set_values.size > 0
106
- return _to_kana(next_sentence, kana + next_set_values.sample, "", @trie)
107
- elsif through_alphabet
108
- return _to_kana(next_sentence, kana + prefix + next_ch, "", @trie)
109
- else
110
- return nil
95
+ def find_okurigana_entry(sentence)
96
+ entries = @okurigana_table[sentence[0]]
97
+ return unless entries
98
+
99
+ entries.each do |entry|
100
+ kanji, yomi, alphabet = *entry
101
+ next unless sentence.start_with?(kanji)
102
+ next_ch = sentence[kanji.size]
103
+ okurigana_candidates = @roman_data[alphabet]
104
+ next unless okurigana_candidates
105
+ okurigana_candidates.each do |okurigana|
106
+ return entry if okurigana == next_ch
107
+ end
111
108
  end
109
+ nil
112
110
  end
113
111
 
114
- def find_okurigana(yomi_candidates, next_sentence)
115
- yomi_candidates.each do |yomi|
116
- next unless yomi =~ /.+([a-z])$/
117
- okurigana_yomi = @roman_data[$1]
118
- next unless okurigana_yomi
119
- okurigana_yomi.each do |okurigana|
120
- return yomi.gsub(/[a-z]$/, "") if next_sentence.start_with?(okurigana)
121
- end
112
+ def find_match_entry(sentence)
113
+ entries = @match_table[sentence[0]]
114
+ return unless entries
115
+
116
+ entries.each do |entry|
117
+ kanji, yomi = *entry
118
+ return entry if sentence.start_with?(kanji)
122
119
  end
123
120
  nil
124
121
  end
@@ -1,3 +1,3 @@
1
1
  module Tataki
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -1,2 +1,3 @@
1
+ require 'pry'
1
2
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
- require 'tataki'
3
+ require 'tataki/base'
@@ -3,7 +3,6 @@ require "spec_helper"
3
3
 
4
4
  describe Tataki::Converter::Combine do
5
5
  let(:skk_converter) { Tataki::Converter::SkkJisyo.new }
6
- let(:roman_converter) { Tataki::Converter::Roman.new }
7
6
  let(:alphabet_converter) { Tataki::Converter::Alphabet.new }
8
7
 
9
8
  describe ".to_kana" do
@@ -13,14 +12,6 @@ describe Tataki::Converter::Combine do
13
12
  end
14
13
  end
15
14
 
16
- context "when roman + alphabet" do
17
- let(:converter) do
18
- Tataki::Converter::Combine.new(roman_converter, alphabet_converter)
19
- end
20
-
21
- include_examples "converts_kana", "robottotaisennf", "ろぼっとたいせんえふ"
22
- end
23
-
24
15
  context "when skk-jisyo + alphabet" do
25
16
  let(:converter) do
26
17
  Tataki::Converter::Combine.new(skk_converter, alphabet_converter)
@@ -2,7 +2,6 @@
2
2
  require "spec_helper"
3
3
 
4
4
  describe Tataki::Converter::SkkJisyo do
5
-
6
5
  describe ".to_kana" do
7
6
  shared_examples "converts_kana" do |sentence, kana|
8
7
  it "converts #{sentence.inspect} to #{kana.inspect}" do
@@ -29,5 +28,15 @@ describe Tataki::Converter::SkkJisyo do
29
28
  include_examples "converts_kana", "漢字", "漢字"
30
29
  include_examples "converts_kana", "半澤直樹", "はんざわなおき"
31
30
  end
31
+
32
+ context "with M, jinmei jisyo" do
33
+ let(:converter) { Tataki::Converter::SkkJisyo.new(%w[M jinmei]) }
34
+
35
+ include_examples "converts_kana", "", ""
36
+ include_examples "converts_kana", "漢字", "かんじ"
37
+ include_examples "converts_kana", "半澤直樹", "はんざわなおき"
38
+ include_examples "converts_kana", "半澤直樹倍返し", "はんざわなおきばいかえし"
39
+ include_examples "converts_kana", "半澤直樹、銀行を買う", "はんざわなおき、ぎんこうをかう"
40
+ end
32
41
  end
33
42
  end
@@ -9,7 +9,6 @@ describe Tataki do
9
9
  describe ".converters" do
10
10
  it "returns converters" do
11
11
  expect(Tataki.converters).to match_array([
12
- Tataki::Converter::Roman,
13
12
  Tataki::Converter::Alphabet,
14
13
  Tataki::Converter::Combine,
15
14
  Tataki::Converter::SkkJisyo,
@@ -18,6 +17,8 @@ describe Tataki do
18
17
  end
19
18
 
20
19
  describe "String.to_kana" do
20
+ before { require "tataki" }
21
+
21
22
  it "converts to kana" do
22
23
  expect("X線研究者".to_kana).to eq("えっくすせんけんきゅうしゃ")
23
24
  end
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ["lib"]
20
20
 
21
21
  spec.add_dependency "skk-jisyo", "~> 0.0.5"
22
- spec.add_dependency "trie"
23
22
 
24
23
  spec.add_development_dependency "bundler", "~> 1.3"
25
24
  spec.add_development_dependency "rake"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tataki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - hogelog
@@ -24,20 +24,6 @@ dependencies:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: 0.0.5
27
- - !ruby/object:Gem::Dependency
28
- name: trie
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - '>='
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - '>='
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: bundler
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -137,22 +123,20 @@ files:
137
123
  - LICENSE.txt
138
124
  - README.md
139
125
  - Rakefile
126
+ - benchmark/001-M_jinmei.rb
140
127
  - data/alphabet.yml
141
128
  - data/jisyo/.gitignore
142
- - data/roman.yml
143
129
  - data/skk-jisyo.yml
144
130
  - lib/tataki.rb
145
131
  - lib/tataki/base.rb
146
132
  - lib/tataki/converters.rb
147
133
  - lib/tataki/converters/alphabet.rb
148
134
  - lib/tataki/converters/combine.rb
149
- - lib/tataki/converters/roman.rb
150
135
  - lib/tataki/converters/skk_jisyo.rb
151
136
  - lib/tataki/version.rb
152
137
  - spec/spec_helper.rb
153
138
  - spec/tataki/converters/alphabet_spec.rb
154
139
  - spec/tataki/converters/combine_spec.rb
155
- - spec/tataki/converters/roman_spec.rb
156
140
  - spec/tataki/converters/skk_jisyo_spec.rb
157
141
  - spec/tataki_spec.rb
158
142
  - tataki.gemspec
@@ -184,7 +168,6 @@ test_files:
184
168
  - spec/spec_helper.rb
185
169
  - spec/tataki/converters/alphabet_spec.rb
186
170
  - spec/tataki/converters/combine_spec.rb
187
- - spec/tataki/converters/roman_spec.rb
188
171
  - spec/tataki/converters/skk_jisyo_spec.rb
189
172
  - spec/tataki_spec.rb
190
173
  has_rdoc:
@@ -1,142 +0,0 @@
1
- table:
2
- "a": あ
3
- "i": い
4
- "u": う
5
- "e": え
6
- "o": お
7
- "ka": か
8
- "ki": き
9
- "ku": く
10
- "ke": け
11
- "ko": こ
12
- "ga": が
13
- "gi": ぎ
14
- "gu": ぐ
15
- "ge": げ
16
- "go": ご
17
- "sa": さ
18
- "si": し
19
- "shi": し
20
- "su": す
21
- "se": せ
22
- "so": そ
23
- "za": ざ
24
- "zi": じ
25
- "ji": じ
26
- "zu": ず
27
- "ze": ぜ
28
- "zo": ぞ
29
- "ta": た
30
- "ti": ち
31
- "chi": ち
32
- "tu": つ
33
- "tsu": つ
34
- "te": て
35
- "to": と
36
- "da": だ
37
- "di": ぢ
38
- "du": づ
39
- "de": で
40
- "do": ど
41
- "na": な
42
- "ni": に
43
- "nu": ぬ
44
- "ne": ね
45
- "no": の
46
- "ha": は
47
- "hi": ひ
48
- "hu": ふ
49
- "fu": ふ
50
- "he": へ
51
- "ho": ほ
52
- "ba": ば
53
- "bi": び
54
- "bu": ぶ
55
- "be": べ
56
- "bo": ぼ
57
- "pa": ぱ
58
- "pi": ぴ
59
- "pu": ぷ
60
- "pe": ぺ
61
- "po": ぽ
62
- "ma": ま
63
- "mi": み
64
- "mu": む
65
- "me": め
66
- "mo": も
67
- "ya": や
68
- "yu": ゆ
69
- "yo": よ
70
- "ra": ら
71
- "ri": り
72
- "ru": る
73
- "re": れ
74
- "ro": ろ
75
- "wa": わ
76
- "wo": を
77
- "n": ん
78
- "nn": ん
79
- "xa": ぁ
80
- "la": ぁ
81
- "xi": ぃ
82
- "li": ぃ
83
- "xu": ぅ
84
- "lu": ぅ
85
- "xe": ぇ
86
- "le": ぇ
87
- "xo": ぉ
88
- "lo": ぉ
89
- "kya": きゃ
90
- "kyu": きゅ
91
- "kyo": きょ
92
- "gya": ぎゃ
93
- "gyu": ぎゅ
94
- "gyo": ぎょ
95
- "zya": じゃ
96
- "sya": しゃ
97
- "sha": しゃ
98
- "syu": しゅ
99
- "shu": しゅ
100
- "syo": しょ
101
- "sho": しょ
102
- "ja": じゃ
103
- "zyu": じゅ
104
- "ju": じゅ
105
- "zyo": じょ
106
- "jo": じょ
107
- "tya": ちゃ
108
- "cha": ちゃ
109
- "tyu": ちゅ
110
- "chu": ちゅ
111
- "tyo": ちょ
112
- "cho": ちょ
113
- "dya": ぢゃ
114
- "dyu": ぢゅ
115
- "dyo": ぢょ
116
- "nya": にゃ
117
- "nyu": にゅ
118
- "nyo": にょ
119
- "hya": ひゃ
120
- "hyu": ひゅ
121
- "hyo": ひょ
122
- "bya": びゃ
123
- "byu": びゅ
124
- "byo": びょ
125
- "pya": ぴゃ
126
- "pyu": ぴゅ
127
- "pyo": ぴょ
128
- "mya": みゃ
129
- "myu": みゅ
130
- "myo": みょ
131
- "xya": ゃ
132
- "lya": ゃ
133
- "xyu": ゅ
134
- "lyu": ゅ
135
- "xyo": ょ
136
- "lyo": ょ
137
- "rya": りゃ
138
- "ryu": りゅ
139
- "ryo": りょ
140
- "xwa": ゎ
141
- "lwa": ゎ
142
- consonant: [k, g, s, j, t, c, d, n, h, f, b, p, m, y, r, w, x, l]
@@ -1,67 +0,0 @@
1
- # coding: utf-8
2
- require "trie"
3
- require "yaml"
4
-
5
- module Tataki
6
- module Converter
7
- class Roman < Base
8
- SOKUON = "っ"
9
-
10
- def initialize
11
- @trie = Trie.new
12
- roman_file = File.expand_path("../../../../data/roman.yml", __FILE__)
13
- roman_data = YAML.load_file(roman_file)
14
- roman_data["table"].each do |roman, kana|
15
- @trie.insert(roman, kana)
16
- end
17
- @consonant = roman_data["consonant"]
18
- @trie.freeze
19
- end
20
-
21
- def to_kana(sentence)
22
- _to_kana(sentence.downcase, "", "", @trie)
23
- end
24
-
25
- private
26
-
27
- def _to_kana(sentence, kana, prefix, trie, through_alphabet = true)
28
- return if trie.empty?
29
- return kana if sentence.empty?
30
-
31
- next_ch = sentence[0]
32
- next_sentence = sentence[1..-1]
33
- next_trie = trie.find_prefix(next_ch)
34
- next_set = next_trie.find([])
35
- if next_set.size > 0 && next_set.size == next_trie.size
36
- return _to_kana(next_sentence, kana + next_set.values.first, "", @trie)
37
- end
38
-
39
- if next_sentence.empty?
40
- if next_set.size > 0
41
- return kana + prefix + next_set.values.first
42
- else
43
- return kana + prefix + next_ch
44
- end
45
- end
46
-
47
- next_kana = _to_kana(next_sentence, kana, prefix + next_ch, next_trie, false)
48
-
49
- if next_kana
50
- return next_kana
51
- end
52
-
53
- if next_set.size > 0
54
- return _to_kana(next_sentence, kana + next_set.values.first, "", @trie)
55
- elsif @consonant.include?(next_ch) && next_sentence.start_with?(next_ch)
56
- return _to_kana(next_sentence, kana + SOKUON, "", @trie)
57
- elsif through_alphabet
58
- return _to_kana(next_sentence, kana + prefix + next_ch, "", @trie)
59
- else
60
- return nil
61
- end
62
- end
63
- end
64
- end
65
-
66
- Tataki::CONVERTERS << Converter::Roman
67
- end
@@ -1,30 +0,0 @@
1
- # coding: utf-8
2
- require "spec_helper"
3
-
4
- describe Tataki::Converter::Roman do
5
- let(:converter) { Tataki::Converter::Roman.new }
6
-
7
- describe ".to_kana" do
8
- shared_examples "converts_kana" do |sentence, kana|
9
- it "converts #{sentence.inspect} to #{kana.inspect}" do
10
- expect(converter.to_kana(sentence)).to eq(kana)
11
- end
12
- end
13
-
14
- include_examples "converts_kana", "", ""
15
- include_examples "converts_kana", "hoge", "ほげ"
16
- include_examples "converts_kana", "hogelog", "ほげぉg"
17
- include_examples "converts_kana", "hogge", "ほっげ"
18
- include_examples "converts_kana", "hogs", "ほgs"
19
- include_examples "converts_kana", "nanka", "なんか"
20
- include_examples "converts_kana", "nannnan", "なんなん"
21
- include_examples "converts_kana", "nannnann", "なんなん"
22
- include_examples "converts_kana", "nannnannsei", "なんなんせい"
23
- include_examples "converts_kana", "kukkingu", "くっきんぐ"
24
- include_examples "converts_kana", "kukkingu papa", "くっきんぐ ぱぱ"
25
- include_examples "converts_kana", "toukyoutokkyokyokakyoku", "とうきょうとっきょきょかきょく"
26
-
27
- include_examples "converts_kana", "kku", "っく"
28
- include_examples "converts_kana", ",,", ",,"
29
- end
30
- end