okura 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,83 @@
1
+ #!/bin/env ruby
2
+ # -*- filetype:ruby -*-
3
+
4
+ require 'rubygems'
5
+
6
+ commands={
7
+ 'console' => lambda {|args|
8
+ require 'okura/console'
9
+ raise IllegalOption.new('Usage: console dict_dir') unless args.length==1
10
+ dict_dir=args[0]
11
+ Okura::Console.new.run_console dict_dir
12
+ },
13
+ 'compile' => lambda {|args|
14
+ require 'okura/serializer'
15
+ require 'optparse'
16
+ format_info=Okura::Serializer::FormatInfo.new
17
+ opt=OptionParser.new
18
+ dry=false
19
+ opt.on('--word_dic=TYPE'){|t| format_info.word_dic=t.to_sym}
20
+ opt.on('--dry'){|d|dry=d}
21
+ opt.on('--encoding=ENCODING'){|encoding| format_info.encoding=encoding}
22
+ opt.parse! args
23
+ raise IllegalOption.new('Usage: compile dict_dir dest_dir') unless args.length==2
24
+ dict_dir=args[0]
25
+ bin_dir=args[1]
26
+ puts <<-EOS
27
+ Settings
28
+ - File Encoding: #{format_info.encoding}
29
+ - Format:
30
+ - word_dic: #{format_info.word_dic}
31
+ - unk_dic: #{format_info.unk_dic}
32
+ - features: #{format_info.features}
33
+ - char_types: #{format_info.char_types}
34
+ - matrix: #{format_info.matrix}
35
+ EOS
36
+ next if dry
37
+ Dir.mkdir bin_dir unless File.exists? bin_dir
38
+ format_info.compile_dict(dict_dir,bin_dir)
39
+ },
40
+ 'help' => lambda {|args|
41
+ puts <<-EOS
42
+ USAGE:
43
+ okura command [args]
44
+ okura --version
45
+ command:
46
+ compile src_dir dest_dir
47
+ console dict_dir
48
+ help
49
+ EOS
50
+ }
51
+ }
52
+
53
+ class IllegalOption < StandardError
54
+ def initialize msg
55
+ @message=msg
56
+ end
57
+ attr_reader :message
58
+ end
59
+
60
+ if $*.empty?
61
+ commands['help'].call []
62
+ else
63
+ command=$*.first
64
+ if command == '--version'
65
+ require 'okura/version'
66
+ puts "Okura #{Okura::Version}"
67
+ exit 0
68
+ end
69
+ unless commands.has_key? command
70
+ puts "unknown command: #{command}"
71
+ commands[:help].call []
72
+ exit 1
73
+ end
74
+ begin
75
+ commands[command].call $*[1..-1]
76
+ exit 0
77
+ rescue IllegalOption
78
+ $stderr.puts $!.message
79
+ exit 1
80
+ end
81
+ end
82
+
83
+
@@ -113,8 +113,8 @@ module Okura
113
113
  def get id
114
114
  raise 'bad id' unless id < @indices.length
115
115
  from=@indices[id]
116
- to=(id+1 < @indices.length) ? @indices[id+1] : @str.length
117
- @str[from...to]
116
+ to=(id+1 < @indices.length) ? @indices[id+1] : @str.bytesize
117
+ (from...to).map{|i|@str.getbyte(i)}.pack('C*').force_encoding 'UTF-8'
118
118
  end
119
119
  def [](id)
120
120
  get id
@@ -132,7 +132,7 @@ module Okura
132
132
  id=@indices.length
133
133
  @indices.push @size
134
134
  @surfaces.push surface
135
- @size+=surface.size
135
+ @size+=surface.bytesize
136
136
  id
137
137
  end
138
138
  end
@@ -258,7 +258,7 @@ module Okura
258
258
  end
259
259
  # Integer -> Feature
260
260
  def from_id id
261
- @map_id[id]
261
+ @map_id[id] || (raise "Features: ID undefined (#{id})")
262
262
  end
263
263
  def [](id)
264
264
  from_id id
@@ -30,11 +30,17 @@ module Okura
30
30
  }
31
31
  }
32
32
 
33
+ features_r=open_src(src_dir,'right-id.def'){|src|
34
+ open_dest(bin_dir,'right-id.bin'){|dest|
35
+ serializer_for('Features',features).compile(src,dest)
36
+ }
37
+ }
38
+
33
39
  word_src_files=
34
40
  Dir.chdir(src_dir){ Dir.glob('*.csv') }.
35
41
  map{|file|File.join(src_dir,file)}
36
42
  open_dest(bin_dir,'word_dic.bin'){|dest|
37
- serializer_for('WordDic',word_dic).compile(features_l,word_src_files,encoding,dest)
43
+ serializer_for('WordDic',word_dic).compile(features_l,features_r,word_src_files,encoding,dest)
38
44
  }
39
45
 
40
46
  char_types=open_src(src_dir,'char.def'){|src|
@@ -45,7 +51,7 @@ module Okura
45
51
 
46
52
  open_src(src_dir,'unk.def'){|src|
47
53
  open_dest(bin_dir,'unk_dic.bin'){|dest|
48
- serializer_for('UnkDic',unk_dic).compile(char_types,features_l,src,dest)
54
+ serializer_for('UnkDic',unk_dic).compile(char_types,features_l,features_r,src,dest)
49
55
  }
50
56
  }
51
57
 
@@ -64,6 +70,9 @@ module Okura
64
70
  features_l=open_bin(bin_dir,'left-id.bin'){|bin|
65
71
  serializer_for('Features',features).load(bin)
66
72
  }
73
+ features_r=open_bin(bin_dir,'right-id.bin'){|bin|
74
+ serializer_for('Features',features).load(bin)
75
+ }
67
76
  wd=open_bin(bin_dir,'word_dic.bin'){|f|
68
77
  serializer_for('WordDic',word_dic).load(f)
69
78
  }
@@ -142,15 +151,15 @@ module Okura
142
151
  }
143
152
  end
144
153
  class Naive
145
- def compile(features,inputs,encoding,output)
154
+ def compile(features_l,features_r,inputs,encoding,output)
146
155
  dic=Okura::WordDic::Naive.new
147
156
  Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
148
157
  parser=Okura::Parser::Word.new(input)
149
158
  parser.each{|surface,lid,rid,cost|
150
159
  word=Okura::Word.new(
151
160
  surface,
152
- features.from_id(lid),
153
- features.from_id(rid),
161
+ features_l.from_id(lid),
162
+ features_r.from_id(rid),
154
163
  cost
155
164
  )
156
165
  dic.define word
@@ -163,16 +172,16 @@ module Okura
163
172
  end
164
173
  end
165
174
  class DoubleArray
166
- def compile(features,inputs,encoding,output)
167
- puts 'loading'
175
+ def compile(features_l,features_r,inputs,encoding,output)
176
+ puts 'loading...'
168
177
  dic=Okura::WordDic::DoubleArray::Builder.new
169
178
  Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
170
179
  parser=Okura::Parser::Word.new(input)
171
180
  parser.each{|surface,lid,rid,cost|
172
181
  word=Okura::Word.new(
173
182
  surface,
174
- features.from_id(lid),
175
- features.from_id(rid),
183
+ features_l.from_id(lid),
184
+ features_r.from_id(rid),
176
185
  cost
177
186
  )
178
187
  dic.define word
@@ -182,7 +191,7 @@ module Okura
182
191
  writer=Okura::Serializer::BinaryWriter.new output
183
192
  words,base,check=dic.data_for_serialize
184
193
  raise 'base.length!=check.length' if base.length!=check.length
185
- puts 'serialize words'
194
+ puts 'writing words...'
186
195
  words.instance_eval do
187
196
  writer.write_object @groups
188
197
  writer.write_object @left_features
@@ -191,13 +200,12 @@ module Okura
191
200
  writer.write_int32_array @right_ids
192
201
  writer.write_int32_array @costs
193
202
  writer.write_int32_array @surface_ids
194
- puts 'serialize surfaces'
195
203
  @surfaces.instance_eval do
196
204
  writer.write_object @str
197
205
  writer.write_int32_array @indices
198
206
  end
199
207
  end
200
- puts 'serialize DAT indices'
208
+ puts 'writing word index...'
201
209
  writer.write_int32_array base
202
210
  writer.write_int32_array check
203
211
  end
@@ -255,11 +263,11 @@ module Okura
255
263
  end
256
264
  module UnkDic
257
265
  class Marshal
258
- def compile(char_types,features,input,output)
266
+ def compile(char_types,features_l,features_r,input,output)
259
267
  unk=Okura::UnkDic.new char_types
260
268
  parser=Okura::Parser::UnkDic.new input
261
269
  parser.each{|type_name,lid,rid,cost|
262
- unk.define type_name,features.from_id(lid),features.from_id(rid),cost
270
+ unk.define type_name,features_l.from_id(lid),features_r.from_id(rid),cost
263
271
  }
264
272
  ::Marshal.dump(unk,output)
265
273
  end
@@ -191,16 +191,12 @@ Z,9,10,5244,記号,空白,*,*,*,*,*
191
191
  tagger.dic.word_dic.word_size.should == 3
192
192
  tagger.mat.cost(0,1).should == 5
193
193
 
194
- pending {
195
- w2=tagger.dic.word_dic.possible_words('w2',0)[0]
196
- w2.left.name.should == 'F5'
197
- w2.right.name.should == 'F6'
198
- }
199
- penging {
200
- u1=tagger.dic.unk_dic.word_templates_for('A')
201
- u1.left.name.should == 'F5'
202
- u1.right.name.should == 'F6'
203
- }
194
+ w2=tagger.dic.word_dic.possible_words('w2',0)[0]
195
+ w2.left.text.should == 'F5'
196
+ w2.right.text.should == 'F6'
197
+ u1=tagger.dic.unk_dic.word_templates_for('A')[0]
198
+ u1.left.text.should == 'F5'
199
+ u1.right.text.should == 'F6'
204
200
  }
205
201
  end
206
202
  end
@@ -272,17 +268,18 @@ TYPE3 0 1 3
272
268
  # subject : Serializer class
273
269
  it 'コンパイルして復元できる' do
274
270
  serializer=subject.new
275
- features=Okura::Features.new
276
- features.add 854,f(854)
277
- features.add 458,f(458)
278
- features.add 645,f(645)
279
- features.add 546,f(546)
271
+ features_l=Okura::Features.new
272
+ features_l.add 854,f(854)
273
+ features_l.add 645,f(645)
274
+ features_r=Okura::Features.new
275
+ features_r.add 458,f(458)
276
+ features_r.add 546,f(546)
280
277
  out=StringIO.new
281
278
  src=<<-EOS
282
279
  あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
283
280
  あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
284
281
  EOS
285
- serializer.compile(features,[as_io(src)],'UTF-8',out)
282
+ serializer.compile(features_l,features_r,[as_io(src)],'UTF-8',out)
286
283
  out.rewind
287
284
  wd=serializer.load(out)
288
285
 
@@ -307,13 +304,14 @@ TYPE3 0 1 3
307
304
  cts.define_type 'Z',false,true,0
308
305
  cts.define_map 0x0001,cts.named('A'),[]
309
306
  cts.define_map 0x0002,cts.named('Z'),[]
310
- features=Okura::Features.new
311
- features.add 5,'F5'
312
- features.add 6,'F6'
313
- features.add 9,'F9'
314
- features.add 10,'F10'
307
+ features_l=Okura::Features.new
308
+ features_l.add 5,'F5'
309
+ features_l.add 9,'F9'
310
+ features_r=Okura::Features.new
311
+ features_r.add 6,'F6'
312
+ features_r.add 10,'F10'
315
313
  out=StringIO.new
316
- serializer.compile(cts,features,as_io(<<-EOS),out)
314
+ serializer.compile(cts,features_l,features_r,as_io(<<-EOS),out)
317
315
  A,5,6,3274,記号,一般,*,*,*,*,*
318
316
  Z,9,10,5244,記号,空白,*,*,*,*,*
319
317
  EOS
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: okura
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -45,7 +45,8 @@ dependencies:
45
45
  version: 0.5.4
46
46
  description: Pure ruby morpheme analyzer, using MeCab format dic
47
47
  email: discommunicative@gmail.com
48
- executables: []
48
+ executables:
49
+ - okura
49
50
  extensions: []
50
51
  extra_rdoc_files: []
51
52
  files:
@@ -58,6 +59,7 @@ files:
58
59
  - test/okura_spec.rb
59
60
  - test/spec_helper.rb
60
61
  - test/words_spec.rb
62
+ - bin/okura
61
63
  homepage: https://github.com/todesking/okura
62
64
  licenses: []
63
65
  post_install_message: