okura 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ #!/bin/env ruby
2
+ # -*- filetype:ruby -*-
3
+
4
+ require 'rubygems'
5
+
6
+ commands={
7
+ 'console' => lambda {|args|
8
+ require 'okura/console'
9
+ raise IllegalOption.new('Usage: console dict_dir') unless args.length==1
10
+ dict_dir=args[0]
11
+ Okura::Console.new.run_console dict_dir
12
+ },
13
+ 'compile' => lambda {|args|
14
+ require 'okura/serializer'
15
+ require 'optparse'
16
+ format_info=Okura::Serializer::FormatInfo.new
17
+ opt=OptionParser.new
18
+ dry=false
19
+ opt.on('--word_dic=TYPE'){|t| format_info.word_dic=t.to_sym}
20
+ opt.on('--dry'){|d|dry=d}
21
+ opt.on('--encoding=ENCODING'){|encoding| format_info.encoding=encoding}
22
+ opt.parse! args
23
+ raise IllegalOption.new('Usage: compile dict_dir dest_dir') unless args.length==2
24
+ dict_dir=args[0]
25
+ bin_dir=args[1]
26
+ puts <<-EOS
27
+ Settings
28
+ - File Encoding: #{format_info.encoding}
29
+ - Format:
30
+ - word_dic: #{format_info.word_dic}
31
+ - unk_dic: #{format_info.unk_dic}
32
+ - features: #{format_info.features}
33
+ - char_types: #{format_info.char_types}
34
+ - matrix: #{format_info.matrix}
35
+ EOS
36
+ next if dry
37
+ Dir.mkdir bin_dir unless File.exists? bin_dir
38
+ format_info.compile_dict(dict_dir,bin_dir)
39
+ },
40
+ 'help' => lambda {|args|
41
+ puts <<-EOS
42
+ USAGE:
43
+ okura command [args]
44
+ okura --version
45
+ command:
46
+ compile src_dir dest_dir
47
+ console dict_dir
48
+ help
49
+ EOS
50
+ }
51
+ }
52
+
53
+ class IllegalOption < StandardError
54
+ def initialize msg
55
+ @message=msg
56
+ end
57
+ attr_reader :message
58
+ end
59
+
60
+ if $*.empty?
61
+ commands['help'].call []
62
+ else
63
+ command=$*.first
64
+ if command == '--version'
65
+ require 'okura/version'
66
+ puts "Okura #{Okura::Version}"
67
+ exit 0
68
+ end
69
+ unless commands.has_key? command
70
+ puts "unknown command: #{command}"
71
+ commands[:help].call []
72
+ exit 1
73
+ end
74
+ begin
75
+ commands[command].call $*[1..-1]
76
+ exit 0
77
+ rescue IllegalOption
78
+ $stderr.puts $!.message
79
+ exit 1
80
+ end
81
+ end
82
+
83
+
@@ -113,8 +113,8 @@ module Okura
113
113
  def get id
114
114
  raise 'bad id' unless id < @indices.length
115
115
  from=@indices[id]
116
- to=(id+1 < @indices.length) ? @indices[id+1] : @str.length
117
- @str[from...to]
116
+ to=(id+1 < @indices.length) ? @indices[id+1] : @str.bytesize
117
+ (from...to).map{|i|@str.getbyte(i)}.pack('C*').force_encoding 'UTF-8'
118
118
  end
119
119
  def [](id)
120
120
  get id
@@ -132,7 +132,7 @@ module Okura
132
132
  id=@indices.length
133
133
  @indices.push @size
134
134
  @surfaces.push surface
135
- @size+=surface.size
135
+ @size+=surface.bytesize
136
136
  id
137
137
  end
138
138
  end
@@ -258,7 +258,7 @@ module Okura
258
258
  end
259
259
  # Integer -> Feature
260
260
  def from_id id
261
- @map_id[id]
261
+ @map_id[id] || (raise "Features: ID undefined (#{id})")
262
262
  end
263
263
  def [](id)
264
264
  from_id id
@@ -30,11 +30,17 @@ module Okura
30
30
  }
31
31
  }
32
32
 
33
+ features_r=open_src(src_dir,'right-id.def'){|src|
34
+ open_dest(bin_dir,'right-id.bin'){|dest|
35
+ serializer_for('Features',features).compile(src,dest)
36
+ }
37
+ }
38
+
33
39
  word_src_files=
34
40
  Dir.chdir(src_dir){ Dir.glob('*.csv') }.
35
41
  map{|file|File.join(src_dir,file)}
36
42
  open_dest(bin_dir,'word_dic.bin'){|dest|
37
- serializer_for('WordDic',word_dic).compile(features_l,word_src_files,encoding,dest)
43
+ serializer_for('WordDic',word_dic).compile(features_l,features_r,word_src_files,encoding,dest)
38
44
  }
39
45
 
40
46
  char_types=open_src(src_dir,'char.def'){|src|
@@ -45,7 +51,7 @@ module Okura
45
51
 
46
52
  open_src(src_dir,'unk.def'){|src|
47
53
  open_dest(bin_dir,'unk_dic.bin'){|dest|
48
- serializer_for('UnkDic',unk_dic).compile(char_types,features_l,src,dest)
54
+ serializer_for('UnkDic',unk_dic).compile(char_types,features_l,features_r,src,dest)
49
55
  }
50
56
  }
51
57
 
@@ -64,6 +70,9 @@ module Okura
64
70
  features_l=open_bin(bin_dir,'left-id.bin'){|bin|
65
71
  serializer_for('Features',features).load(bin)
66
72
  }
73
+ features_r=open_bin(bin_dir,'right-id.bin'){|bin|
74
+ serializer_for('Features',features).load(bin)
75
+ }
67
76
  wd=open_bin(bin_dir,'word_dic.bin'){|f|
68
77
  serializer_for('WordDic',word_dic).load(f)
69
78
  }
@@ -142,15 +151,15 @@ module Okura
142
151
  }
143
152
  end
144
153
  class Naive
145
- def compile(features,inputs,encoding,output)
154
+ def compile(features_l,features_r,inputs,encoding,output)
146
155
  dic=Okura::WordDic::Naive.new
147
156
  Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
148
157
  parser=Okura::Parser::Word.new(input)
149
158
  parser.each{|surface,lid,rid,cost|
150
159
  word=Okura::Word.new(
151
160
  surface,
152
- features.from_id(lid),
153
- features.from_id(rid),
161
+ features_l.from_id(lid),
162
+ features_r.from_id(rid),
154
163
  cost
155
164
  )
156
165
  dic.define word
@@ -163,16 +172,16 @@ module Okura
163
172
  end
164
173
  end
165
174
  class DoubleArray
166
- def compile(features,inputs,encoding,output)
167
- puts 'loading'
175
+ def compile(features_l,features_r,inputs,encoding,output)
176
+ puts 'loading...'
168
177
  dic=Okura::WordDic::DoubleArray::Builder.new
169
178
  Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
170
179
  parser=Okura::Parser::Word.new(input)
171
180
  parser.each{|surface,lid,rid,cost|
172
181
  word=Okura::Word.new(
173
182
  surface,
174
- features.from_id(lid),
175
- features.from_id(rid),
183
+ features_l.from_id(lid),
184
+ features_r.from_id(rid),
176
185
  cost
177
186
  )
178
187
  dic.define word
@@ -182,7 +191,7 @@ module Okura
182
191
  writer=Okura::Serializer::BinaryWriter.new output
183
192
  words,base,check=dic.data_for_serialize
184
193
  raise 'base.length!=check.length' if base.length!=check.length
185
- puts 'serialize words'
194
+ puts 'writing words...'
186
195
  words.instance_eval do
187
196
  writer.write_object @groups
188
197
  writer.write_object @left_features
@@ -191,13 +200,12 @@ module Okura
191
200
  writer.write_int32_array @right_ids
192
201
  writer.write_int32_array @costs
193
202
  writer.write_int32_array @surface_ids
194
- puts 'serialize surfaces'
195
203
  @surfaces.instance_eval do
196
204
  writer.write_object @str
197
205
  writer.write_int32_array @indices
198
206
  end
199
207
  end
200
- puts 'serialize DAT indices'
208
+ puts 'writing word index...'
201
209
  writer.write_int32_array base
202
210
  writer.write_int32_array check
203
211
  end
@@ -255,11 +263,11 @@ module Okura
255
263
  end
256
264
  module UnkDic
257
265
  class Marshal
258
- def compile(char_types,features,input,output)
266
+ def compile(char_types,features_l,features_r,input,output)
259
267
  unk=Okura::UnkDic.new char_types
260
268
  parser=Okura::Parser::UnkDic.new input
261
269
  parser.each{|type_name,lid,rid,cost|
262
- unk.define type_name,features.from_id(lid),features.from_id(rid),cost
270
+ unk.define type_name,features_l.from_id(lid),features_r.from_id(rid),cost
263
271
  }
264
272
  ::Marshal.dump(unk,output)
265
273
  end
@@ -191,16 +191,12 @@ Z,9,10,5244,記号,空白,*,*,*,*,*
191
191
  tagger.dic.word_dic.word_size.should == 3
192
192
  tagger.mat.cost(0,1).should == 5
193
193
 
194
- pending {
195
- w2=tagger.dic.word_dic.possible_words('w2',0)[0]
196
- w2.left.name.should == 'F5'
197
- w2.right.name.should == 'F6'
198
- }
199
- penging {
200
- u1=tagger.dic.unk_dic.word_templates_for('A')
201
- u1.left.name.should == 'F5'
202
- u1.right.name.should == 'F6'
203
- }
194
+ w2=tagger.dic.word_dic.possible_words('w2',0)[0]
195
+ w2.left.text.should == 'F5'
196
+ w2.right.text.should == 'F6'
197
+ u1=tagger.dic.unk_dic.word_templates_for('A')[0]
198
+ u1.left.text.should == 'F5'
199
+ u1.right.text.should == 'F6'
204
200
  }
205
201
  end
206
202
  end
@@ -272,17 +268,18 @@ TYPE3 0 1 3
272
268
  # subject : Serializer class
273
269
  it 'コンパイルして復元できる' do
274
270
  serializer=subject.new
275
- features=Okura::Features.new
276
- features.add 854,f(854)
277
- features.add 458,f(458)
278
- features.add 645,f(645)
279
- features.add 546,f(546)
271
+ features_l=Okura::Features.new
272
+ features_l.add 854,f(854)
273
+ features_l.add 645,f(645)
274
+ features_r=Okura::Features.new
275
+ features_r.add 458,f(458)
276
+ features_r.add 546,f(546)
280
277
  out=StringIO.new
281
278
  src=<<-EOS
282
279
  あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
283
280
  あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
284
281
  EOS
285
- serializer.compile(features,[as_io(src)],'UTF-8',out)
282
+ serializer.compile(features_l,features_r,[as_io(src)],'UTF-8',out)
286
283
  out.rewind
287
284
  wd=serializer.load(out)
288
285
 
@@ -307,13 +304,14 @@ TYPE3 0 1 3
307
304
  cts.define_type 'Z',false,true,0
308
305
  cts.define_map 0x0001,cts.named('A'),[]
309
306
  cts.define_map 0x0002,cts.named('Z'),[]
310
- features=Okura::Features.new
311
- features.add 5,'F5'
312
- features.add 6,'F6'
313
- features.add 9,'F9'
314
- features.add 10,'F10'
307
+ features_l=Okura::Features.new
308
+ features_l.add 5,'F5'
309
+ features_l.add 9,'F9'
310
+ features_r=Okura::Features.new
311
+ features_r.add 6,'F6'
312
+ features_r.add 10,'F10'
315
313
  out=StringIO.new
316
- serializer.compile(cts,features,as_io(<<-EOS),out)
314
+ serializer.compile(cts,features_l,features_r,as_io(<<-EOS),out)
317
315
  A,5,6,3274,記号,一般,*,*,*,*,*
318
316
  Z,9,10,5244,記号,空白,*,*,*,*,*
319
317
  EOS
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: okura
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -45,7 +45,8 @@ dependencies:
45
45
  version: 0.5.4
46
46
  description: Pure ruby morpheme analyzer, using MeCab format dic
47
47
  email: discommunicative@gmail.com
48
- executables: []
48
+ executables:
49
+ - okura
49
50
  extensions: []
50
51
  extra_rdoc_files: []
51
52
  files:
@@ -58,6 +59,7 @@ files:
58
59
  - test/okura_spec.rb
59
60
  - test/spec_helper.rb
60
61
  - test/words_spec.rb
62
+ - bin/okura
61
63
  homepage: https://github.com/todesking/okura
62
64
  licenses: []
63
65
  post_install_message: