crfpp 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,13 @@
1
1
 
2
+ require 'forwardable'
3
+ require 'tempfile'
4
+
2
5
  require 'crfpp/version'
6
+ require 'crfpp/errors'
7
+ require 'crfpp/filelike'
8
+ require 'crfpp/macro'
9
+ require 'crfpp/template'
10
+ require 'crfpp/feature'
11
+ require 'crfpp/model'
3
12
  require 'crfpp/native'
4
13
  require 'crfpp/utilities'
@@ -0,0 +1,16 @@
1
module CRFPP

  # Base class for the errors defined by the CRFPP library.
  #
  # Besides the usual message, every instance remembers another exception
  # (the +original+), which makes it possible to wrap a lower-level failure
  # without losing it.
  class Error < StandardError

    # The wrapped exception (may be nil).
    attr_accessor :original

    # message  - text handed on to StandardError (default: nil).
    # original - the exception to remember; defaults to $!, i.e. whatever
    #            exception is being handled at the time of the call.
    def initialize(message = nil, original = $!)
      @original = original
      super(message)
    end

  end

  # Error subclass raised by CRFPP.learn when the native call reports failure.
  NativeError = Class.new(Error)

end
@@ -0,0 +1,29 @@
1
module CRFPP

  # A single feature line of a template, printed as
  # "<TYPE><ID>:<CONTENT>", e.g. "U01:%x[0,1]".
  #
  # content - the feature body (a Macro by default; anything joinable works).
  # type    - the feature type as a Symbol (:U by default; parse accepts U/B).
  # id      - optional identifier; Numeric ids are zero-padded to two digits
  #           when printed, any other value is printed as is.
  class Feature < Struct.new(:content, :type, :id)

    # Type letter, optional digits, a colon and the content. The content is
    # matched lazily so that trailing whitespace (spaces, "\r", "\n") is
    # consumed by the final \s* instead of becoming part of the content.
    PATTERN = /^([UB])(\d*):(.+?)\s*$/

    class << self
      # Parses a template line.
      #
      # Returns a Feature if the line looks like a feature definition;
      # otherwise the argument is returned untouched (e.g. comments or
      # blank lines).
      def parse(string)
        match = PATTERN.match(string.to_s)
        return string unless match

        Feature.new(match[3], match[1].to_sym, match[2])
      end
    end

    def initialize(content = Macro.new, type = :U, id = nil)
      super
    end

    # Formats an identifier: Numerics become two-digit, zero-padded strings,
    # everything else (strings, nil) is passed through unchanged.
    def identifier(base = id)
      base.is_a?(Numeric) ? ('%02d' % base) : base
    end

    # Returns the feature as a template line. A different identifier can be
    # supplied via +number+; a nil identifier is simply left out.
    def to_s(number = id)
      [type.to_s.upcase, identifier(number), ':', content].compact.join
    end

  end

end
@@ -0,0 +1,27 @@
1
module CRFPP

  # Mixin for objects that are backed by a file.
  #
  # The including class is expected to implement #to_s, which supplies the
  # default content for #write. Files are read and written as UTF-8.
  module Filelike

    attr_writer :path

    # Returns the path of the backing file. If no path was assigned, a
    # temporary file is created on first access.
    def path
      @path ||= begin
        # Keep a reference to the Tempfile: it deletes its file once it is
        # garbage collected, which would invalidate the returned path.
        @tempfile = Tempfile.new('filelike')
        @tempfile.close
        @tempfile.path
      end
    end

    # Writes +content+ (by default the object's string representation) to
    # the backing file, replacing whatever was there before.
    def write(content = to_s)
      File.open(path, 'w:UTF-8') { |file| file.write(content) }
    end

    # Returns the content of the backing file as a String. Errors raised
    # while opening the file (e.g. Errno::ENOENT) propagate unchanged.
    def read
      File.open(path, 'r:UTF-8') { |file| file.read }
    end

  end

end
@@ -0,0 +1,17 @@
1
module CRFPP
  # A template macro addressing a cell by row and column; rendered as
  # "%x[row,column]". Both coordinates default to 0.
  class Macro < Struct.new(:row, :column)

    # Format string for #to_s ("%%" produces a literal percent sign).
    TEMPLATE = '%%x[%d,%d]'.freeze

    alias_method :col, :column

    def initialize(row = 0, column = 0)
      super(row, column)
    end

    # Returns the macro in template notation, e.g. "%x[2,3]".
    def to_s
      format(TEMPLATE, row, column)
    end

  end
end
@@ -0,0 +1,23 @@
1
module CRFPP
  # A model file. The content is kept as a String in #data and is moved
  # from and to disk with #open and #save (file access comes from Filelike).
  class Model

    include Filelike

    # The model content as last loaded by #open (initially empty).
    attr_reader :data

    # path - location of the model file; when nil, Filelike#path supplies
    #        a temporary file on demand.
    def initialize(path = nil)
      @data = ''
      @path = path
    end

    # Loads the file content into #data. Returns self.
    def open
      @data = read
      self
    end

    # Writes #data back to the file. Returns self.
    #
    # Filelike#write serialises the object via #to_s, so it is called
    # without arguments here and #to_s is defined to return the data.
    def save
      write
      self
    end

    # Returns the model content; this is what #save writes to disk.
    def to_s
      data
    end

  end
end
@@ -0,0 +1,32 @@
1
module CRFPP
  # A template file, held in memory as an ordered list of entries: Feature
  # objects for lines Feature.parse recognises, the raw line otherwise.
  class Template

    extend Forwardable

    include Filelike

    # The parsed entries, in file order.
    attr_reader :features

    def_delegators :@features, :<<, :length

    # Loads the template found at +path+ right away. With a nil path,
    # Filelike#path supplies a temporary file.
    def initialize(path = nil)
      @path = path
      open
    end

    # (Re-)reads the file and parses every line. Returns self.
    def open
      @features = read.each_line.map { |line| Feature.parse(line) }
      self
    end

    # Drops all entries.
    def clear
      @features = []
    end

    # Renders the template, one entry per line. Features are numbered
    # consecutively starting at 0, in order of appearance; other entries
    # are emitted as they are, minus their line ending.
    def to_s
      counter = 0

      rendered = features.map do |entry|
        next entry.chomp unless entry.is_a?(Feature)

        line = entry.to_s(counter)
        counter += 1
        line
      end

      rendered.join("\n")
    end

  end
end
@@ -1,12 +1,71 @@
1
1
 
2
2
  module CRFPP
3
3
 
4
- def learn(template, data, model, options = {})
5
- Native.learn(['--thread=2', template, data, model].join(' '))
4
+ # Creates a new Model based on a template and training data.
5
+ #
6
+ # The data parameter can either be an array of strings or a filename. The
7
+ # possible options are:
8
+ #
9
+ # :threads: False or the number of threads to us (default is 2).
10
+ #
11
+ # :algorithm: L1 or L2 (default)
12
+ #
13
+ # :cost: With this option, you can change the hyper-parameter for the CRFs.
14
+ # With larger C value, CRF tends to overfit to the give training
15
+ # corpus. This parameter trades the balance between overfitting and
16
+ # underfitting. The results will significantly be influenced by this
17
+ # parameter. You can find an optimal value by using held-out data or
18
+ # more general model selection method such as cross validation.
19
+ #
20
+ # :frequency: This parameter sets the cut-off threshold for the features. CRF++
21
+ # uses the features that occurs no less than NUM times in the given training
22
+ # data. The default value is 1. When you apply CRF++ to large data, the
23
+ # number of unique features would amount to several millions. This option is
24
+ # useful in such cases.
25
+ #
26
+ def learn(template, data, options = {})
27
+ options = { :threads => 2, :algorithm => :L2, :cost => 1.0, :frequency => 1}.merge(options)
28
+
29
+ unless File.exists?(data)
30
+ data = save_data_to_tempfile([data].flatten)
31
+ temporary = true
32
+ end
33
+
34
+ template = Template.new(template) unless template.is_a?(Template)
35
+ model = Model.new
36
+
37
+ arguments = []
38
+
39
+ # TODO check algorithm names
40
+ # arguments << "--algorithm=#{options[:algorithm]}"
41
+
42
+ arguments << "--cost=#{options[:cost]}"
43
+ arguments << "--thread=#{options[:threads]}"
44
+ arguments << "--freq=#{options[:frequency]}"
45
+
46
+ arguments << template.path
47
+ arguments << data
48
+ arguments << model.path
49
+
50
+ success = Native.learn(arguments.join(' '))
51
+ raise NativeError, 'crfpp learn failed' unless success
52
+
53
+ model
54
+ ensure
55
+ data.unlink if temporary
6
56
  end
7
57
 
8
58
  alias train learn
9
59
 
10
60
  module_function :train, :learn
11
61
 
62
+ private
63
+
64
+ # Saves data to temporary file and returns the path
65
+ def save_data_to_tempfile(data)
66
+ tmp = Tempfile.new(data)
67
+ tmp.write(data.join("\n"))
68
+ tmp.path
69
+ end
70
+
12
71
  end
@@ -1,3 +1,3 @@
1
1
  module CRFPP
2
- VERSION = '0.0.1'.freeze
2
+ VERSION = '0.0.2'.freeze
3
3
  end
@@ -0,0 +1,25 @@
1
+ require 'helper'
2
+
3
module CRFPP

  # Unit tests for Feature: constructor arity, rendering and parsing.
  class TestFeature < Test::Unit::TestCase

    # Feature takes at most three constructor arguments.
    def test_assignment_of_constructor_arguments
      assert_raise(ArgumentError) do
        Feature.new(1, 2, 3, 4)
      end
    end

    # Rendering with the default macro, a custom macro and explicit numbers.
    def test_to_s
      assert_equal 'U:%x[0,0]', Feature.new.to_s

      feature = Feature.new(Macro.new(2, 3))

      assert_equal 'U:%x[2,3]', feature.to_s
      assert_equal 'U01:%x[2,3]', feature.to_s(1)
      assert_equal 'U123:%x[2,3]', feature.to_s(123)
    end

    # A parsed feature line renders back to the original text.
    def test_parse_feature_strings
      line = 'U:%x[42,23]/AB'
      assert_equal line, Feature.parse(line).to_s
    end

  end

end
@@ -0,0 +1,19 @@
1
+ require 'helper'
2
+
3
module CRFPP
  # Unit tests for the Filelike mixin, using an anonymous host class.
  class TestFilelike < Test::Unit::TestCase

    # Builds a fresh anonymous class that mixes in Filelike.
    def setup
      @klass = Class.new { include Filelike }
    end

    # A new instance of the host class.
    def subject
      @klass.new
    end

    # Without an assigned path, Filelike#path still returns something truthy.
    def test_has_a_path_by_default
      assert subject.path
    end

  end
end
@@ -0,0 +1,20 @@
1
+ require 'helper'
2
+
3
module CRFPP

  # Unit tests for Macro: constructor defaults and string rendering.
  class TestMacro < Test::Unit::TestCase

    # Missing coordinates default to 0; more than two arguments are rejected.
    def test_assignment_of_constructor_arguments
      expectations = {
        []     => [0, 0],
        [1, 1] => [1, 1],
        [1]    => [1, 0]
      }

      expectations.each do |arguments, values|
        assert_equal values, Macro.new(*arguments).values
      end

      assert_raise(ArgumentError) { Macro.new(1, 2, 3) }
    end

    # The default macro renders as "%x[0,0]".
    def test_to_s
      macro = Macro.new
      assert_equal '%x[0,0]', macro.to_s
    end

  end

end
@@ -3,12 +3,10 @@ require 'helper'
3
3
  module CRFPP
4
4
  class TestNative < Test::Unit::TestCase
5
5
 
6
- FIXTURES_ROOT = File.expand_path('../../fixtures', __FILE__)
7
-
8
6
  def test_create_a_new_model_file_through_training
9
7
  model = Tempfile.new('model')
10
- assert Native.learn("#{FIXTURES_ROOT}/template #{FIXTURES_ROOT}/train.data #{model.path}")
11
- assert model.length > 0
8
+ assert Native.learn("#{FixturesRoot}/template #{FixturesRoot}/train.data #{model.path}"), 'Native.learn returned with error exit status'
9
+ assert model.length > 0, 'nothing was written to model file'
12
10
  ensure
13
11
  model.close
14
12
  model.unlink
@@ -0,0 +1,23 @@
1
+ require 'helper'
2
+
3
module CRFPP

  # Unit tests for Template: loading from an empty file and from the
  # template fixture.
  class TestTemplate < Test::Unit::TestCase

    # An empty file yields an empty template.
    def test_load_template_from_empty_file
      # Created outside of begin/ensure so the cleanup never runs on nil.
      file = Tempfile.new('template')
      begin
        assert Template.new(file.path).to_s.empty?
      ensure
        file.close
        file.unlink
      end
    end

    # Loading and rendering the fixture reproduces the file content
    # (ignoring a trailing line break).
    def test_load_template_from_file
      path = "#{FixturesRoot}/template"
      # File.read closes the file again; the expected value goes first so
      # that failure messages label both sides correctly.
      assert_equal File.read(path).chomp, Template.new(path).to_s.chomp
    end

  end

end
@@ -0,0 +1,12 @@
1
+ require 'helper'
2
+
3
module CRFPP
  # Integration test for the high-level CRFPP.learn helper.
  class TestUtilities < Test::Unit::TestCase

    # Training on the fixtures must produce a model file with content.
    def test_create_a_new_model_file_through_training
      template = "#{FixturesRoot}/template"
      training = "#{FixturesRoot}/train.data"

      model = CRFPP.learn(template, training)

      assert !model.open.data.empty?
    end

  end
end
@@ -1,4 +1,6 @@
1
1
  require 'test/unit'
2
2
  require 'crfpp'
3
3
 
4
- require 'tempfile'
4
module CRFPP
  # Absolute path of the "fixtures" directory next to this helper file.
  FixturesRoot = File.expand_path('fixtures', File.dirname(__FILE__))
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crfpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-08-16 00:00:00.000000000Z
12
+ date: 2011-08-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &2157069020 !ruby/object:Gem::Requirement
16
+ requirement: &2156849560 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.9'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2157069020
24
+ version_requirements: *2156849560
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake-compiler
27
- requirement: &2157068480 !ruby/object:Gem::Requirement
27
+ requirement: &2156849000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.7'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2157068480
35
+ version_requirements: *2156849000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ZenTest
38
- requirement: &2157067980 !ruby/object:Gem::Requirement
38
+ requirement: &2156848460 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '4.6'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2157067980
46
+ version_requirements: *2156848460
47
47
  description: A Ruby extension to interface with CRF++, the Conditional Random Fields
48
48
  library written in C++. You need to install libcrfpp to use this gem.
49
49
  email:
@@ -67,10 +67,21 @@ files:
67
67
  - ext/crfpp/tagger.cpp
68
68
  - ext/crfpp/tagger.hpp
69
69
  - lib/crfpp.rb
70
+ - lib/crfpp/errors.rb
71
+ - lib/crfpp/feature.rb
72
+ - lib/crfpp/filelike.rb
73
+ - lib/crfpp/macro.rb
74
+ - lib/crfpp/model.rb
75
+ - lib/crfpp/template.rb
70
76
  - lib/crfpp/utilities.rb
71
77
  - lib/crfpp/version.rb
78
+ - test/crfpp/test_feature.rb
79
+ - test/crfpp/test_filelike.rb
80
+ - test/crfpp/test_macro.rb
72
81
  - test/crfpp/test_native.rb
73
82
  - test/crfpp/test_tagger.rb
83
+ - test/crfpp/test_template.rb
84
+ - test/crfpp/test_utilities.rb
74
85
  - test/fixtures/template
75
86
  - test/fixtures/test.data
76
87
  - test/fixtures/train.data
@@ -108,8 +119,13 @@ signing_key:
108
119
  specification_version: 3
109
120
  summary: Conditional Random Fields for Ruby.
110
121
  test_files:
122
+ - test/crfpp/test_feature.rb
123
+ - test/crfpp/test_filelike.rb
124
+ - test/crfpp/test_macro.rb
111
125
  - test/crfpp/test_native.rb
112
126
  - test/crfpp/test_tagger.rb
127
+ - test/crfpp/test_template.rb
128
+ - test/crfpp/test_utilities.rb
113
129
  - test/fixtures/template
114
130
  - test/fixtures/test.data
115
131
  - test/fixtures/train.data