crfpp 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,13 @@
1
1
 
2
+ require 'forwardable'
3
+ require 'tempfile'
4
+
2
5
  require 'crfpp/version'
6
+ require 'crfpp/errors'
7
+ require 'crfpp/filelike'
8
+ require 'crfpp/macro'
9
+ require 'crfpp/template'
10
+ require 'crfpp/feature'
11
+ require 'crfpp/model'
3
12
  require 'crfpp/native'
4
13
  require 'crfpp/utilities'
@@ -0,0 +1,16 @@
1
module CRFPP

  # Base class for all errors raised by this library.
  #
  # Besides the usual message, an Error remembers the exception that was being
  # handled when it was created (the value of $! at construction time, unless
  # a different object is passed explicitly), so the underlying cause can be
  # inspected later through #original.
  class Error < StandardError

    # The exception that led to this error, or nil if there was none.
    attr_accessor :original

    # message  - text handed to StandardError (nil keeps the default message).
    # original - the causing exception; defaults to the exception currently
    #            being handled.
    def initialize(message = nil, original = $!)
      @original = original
      super(message)
    end

  end

  # Error type used for failures reported by the native extension.
  class NativeError < Error
  end

end
@@ -0,0 +1,29 @@
1
module CRFPP

  # One feature line of a template: a type letter (:U or :B), an optional id
  # and the content that follows the colon (a Macro by default, or any object
  # that responds to to_s).
  class Feature < Struct.new(:content, :type, :id)

    # "<U|B><optional digits>:<content>". Anchored to the whole string and
    # with a lazy content group, so trailing whitespace (spaces, "\r", "\n")
    # is not captured as part of the content.
    PATTERN = /\A([UB])(\d*):(.+?)\s*\z/

    class << self
      # Parses a single template line.
      #
      # Returns a Feature when the line has the form described by PATTERN;
      # any other input (comments, blank lines, non-strings) is returned
      # unchanged so that callers can keep it verbatim.
      def parse(string)
        match = string.is_a?(String) && PATTERN.match(string)
        return string unless match

        # The id is kept as the matched digit string ('' when absent).
        Feature.new(match[3], match[1].to_sym, match[2])
      end
    end

    # content - the feature body (defaults to a Macro pointing at [0,0]).
    # type    - :U or :B.
    # id      - optional identifier, numeric or string.
    def initialize(content = Macro.new, type = :U, id = nil)
      super
    end

    # Formats an identifier: numbers are zero-padded to at least two digits,
    # everything else (strings, nil) is passed through untouched.
    def identifier(base = id)
      base.is_a?(Numeric) ? ('%02d' % base) : base
    end

    # Renders the feature as a template line, e.g. "U01:%x[2,3]".
    # number overrides the stored id for this rendering only; a nil
    # identifier is simply left out.
    def to_s(number = id)
      [type.to_s.upcase, identifier(number), ':', content].compact.join
    end

  end

end
@@ -0,0 +1,27 @@
1
module CRFPP

  # Mixin for objects that are backed by a file on disk.
  #
  # The including class is expected to provide #to_s, which #write uses as
  # the default content. If no path has been assigned, a temporary file is
  # created on first access to #path.
  module Filelike

    attr_writer :path

    # Returns the path of the backing file, creating a temporary file the
    # first time it is needed.
    def path
      @path ||= begin
        # Keep a reference to the Tempfile: once the Tempfile object is
        # garbage collected its finalizer removes the file from disk, which
        # would leave @path pointing at nothing.
        @tempfile = Tempfile.new('filelike')
        @tempfile.close
        @tempfile.path
      end
    end

    # Writes content (by default the object's to_s) to the backing file as
    # UTF-8, replacing whatever was there. The block form of File.open
    # closes the handle even if writing fails.
    #
    # Returns the number of bytes written.
    def write(content = to_s)
      File.open(path, 'w:UTF-8') { |file| file.write(content) }
    end

    # Reads and returns the whole backing file as a UTF-8 string.
    #
    # Raises the original Errno error (e.g. Errno::ENOENT) if the file
    # cannot be opened.
    def read
      File.open(path, 'r:UTF-8') { |file| file.read }
    end

  end

end
@@ -0,0 +1,17 @@
1
module CRFPP
  # A position reference inside a template, rendered as "%x[row,column]".
  # Both coordinates default to 0.
  class Macro < Struct.new(:row, :column)

    # Format string for #to_s; "%%" yields a literal percent sign.
    TEMPLATE = '%%x[%d,%d]'.freeze

    # Short name for #column.
    alias_method :col, :column

    # row, column - the two coordinates of the macro (each defaults to 0).
    def initialize(row = 0, column = 0)
      super(row, column)
    end

    # Returns the macro in template notation, e.g. "%x[2,3]".
    def to_s
      format(TEMPLATE, *to_a)
    end

  end
end
@@ -0,0 +1,23 @@
1
module CRFPP
  # A model file together with its raw content.
  #
  # File access comes from Filelike: without an explicit path a temporary
  # file is used.
  class Model

    include Filelike

    # Raw content of the model as last loaded by #open ('' initially).
    attr_reader :data

    # path - location of the model file, or nil to let Filelike pick a
    #        temporary file on demand.
    def initialize(path = nil)
      @data = ''
      @path = path
    end

    # Loads the file content into #data. Returns self.
    def open
      @data = read
      self
    end

    # Writes #data back to the file. Returns self.
    #
    # Filelike#write serialises the object through #to_s, so it is called
    # without arguments here; passing the data explicitly raised an
    # ArgumentError because that method accepted no parameters.
    def save
      write
      self
    end

    # The string written by #save: the raw model data.
    def to_s
      @data
    end

  end
end
@@ -0,0 +1,32 @@
1
module CRFPP
  # A template file, held as a list of entries: Feature objects for lines
  # that Feature.parse recognises, and the raw line strings for the rest.
  class Template

    extend Forwardable

    include Filelike

    # The parsed entries, in file order.
    attr_reader :features

    # Appending and counting go straight to the entry list.
    def_delegators :@features, :<<, :length

    # path - the template file to load, or nil to start from the (empty)
    #        temporary file that Filelike provides.
    def initialize(path = nil)
      @path = path
      open
    end

    # (Re)loads the entries from the backing file. Returns self.
    def open
      @features = read.each_line.map do |line|
        Feature.parse(line)
      end
      self
    end

    # Drops all entries.
    def clear
      @features = []
    end

    # Renders the template, one entry per line. Features are numbered
    # consecutively starting at 0, counting only Feature entries; all other
    # entries are emitted as they are, minus their line terminator.
    def to_s
      counter = 0
      rendered = features.map do |entry|
        if entry.is_a?(Feature)
          line = entry.to_s(counter)
          counter += 1
          line
        else
          entry.chomp
        end
      end
      rendered.join("\n")
    end

  end
end
@@ -1,12 +1,71 @@
1
1
 
2
2
  module CRFPP
3
3
 
4
- def learn(template, data, model, options = {})
5
- Native.learn(['--thread=2', template, data, model].join(' '))
4
+ # Creates a new Model based on a template and training data.
5
+ #
6
+ # The data parameter can either be an array of strings or a filename. The
7
+ # possible options are:
8
+ #
9
+ # :threads: False or the number of threads to us (default is 2).
10
+ #
11
+ # :algorithm: L1 or L2 (default)
12
+ #
13
+ # :cost: With this option, you can change the hyper-parameter for the CRFs.
14
+ # With larger C value, CRF tends to overfit to the give training
15
+ # corpus. This parameter trades the balance between overfitting and
16
+ # underfitting. The results will significantly be influenced by this
17
+ # parameter. You can find an optimal value by using held-out data or
18
+ # more general model selection method such as cross validation.
19
+ #
20
+ # :frequency: This parameter sets the cut-off threshold for the features. CRF++
21
+ # uses the features that occurs no less than NUM times in the given training
22
+ # data. The default value is 1. When you apply CRF++ to large data, the
23
+ # number of unique features would amount to several millions. This option is
24
+ # useful in such cases.
25
+ #
26
+ def learn(template, data, options = {})
27
+ options = { :threads => 2, :algorithm => :L2, :cost => 1.0, :frequency => 1}.merge(options)
28
+
29
+ unless File.exists?(data)
30
+ data = save_data_to_tempfile([data].flatten)
31
+ temporary = true
32
+ end
33
+
34
+ template = Template.new(template) unless template.is_a?(Template)
35
+ model = Model.new
36
+
37
+ arguments = []
38
+
39
+ # TODO check algorithm names
40
+ # arguments << "--algorithm=#{options[:algorithm]}"
41
+
42
+ arguments << "--cost=#{options[:cost]}"
43
+ arguments << "--thread=#{options[:threads]}"
44
+ arguments << "--freq=#{options[:frequency]}"
45
+
46
+ arguments << template.path
47
+ arguments << data
48
+ arguments << model.path
49
+
50
+ success = Native.learn(arguments.join(' '))
51
+ raise NativeError, 'crfpp learn failed' unless success
52
+
53
+ model
54
+ ensure
55
+ data.unlink if temporary
6
56
  end
7
57
 
8
58
  alias train learn
9
59
 
10
60
  module_function :train, :learn
11
61
 
62
+ private
63
+
64
+ # Saves data to temporary file and returns the path
65
+ def save_data_to_tempfile(data)
66
+ tmp = Tempfile.new(data)
67
+ tmp.write(data.join("\n"))
68
+ tmp.path
69
+ end
70
+
12
71
  end
@@ -1,3 +1,3 @@
1
1
  module CRFPP
2
- VERSION = '0.0.1'.freeze
2
+ VERSION = '0.0.2'.freeze
3
3
  end
@@ -0,0 +1,25 @@
1
+ require 'helper'
2
+
3
module CRFPP

  # Tests for Feature: constructor arity, rendering and parsing.
  class TestFeature < Test::Unit::TestCase

    # A Feature takes at most three constructor arguments.
    def test_assignment_of_constructor_arguments
      assert_raise(ArgumentError) do
        Feature.new(1,2,3,4)
      end
    end

    # Rendering with the default macro, a custom macro, and explicit numbers.
    def test_to_s
      assert_equal 'U:%x[0,0]', Feature.new.to_s

      custom = Feature.new(Macro.new(2,3))
      assert_equal 'U:%x[2,3]', custom.to_s
      assert_equal 'U01:%x[2,3]', custom.to_s(1)
      assert_equal 'U123:%x[2,3]', custom.to_s(123)
    end

    # A parsed feature line renders back to the same text.
    def test_parse_feature_strings
      line = 'U:%x[42,23]/AB'
      assert_equal 'U:%x[42,23]/AB', Feature.parse(line).to_s
    end

  end

end
@@ -0,0 +1,19 @@
1
+ require 'helper'
2
+
3
module CRFPP
  # Tests for the Filelike mixin, exercised through an anonymous host class.
  class TestFilelike < Test::Unit::TestCase

    # Builds a fresh class that includes Filelike for every test.
    def setup
      @klass = Class.new { include Filelike }
    end

    # A new instance of the host class.
    def subject
      @klass.new
    end

    # Even without an assigned path, #path returns something truthy.
    def test_has_a_path_by_default
      instance = subject
      assert instance.path
    end

  end
end
@@ -0,0 +1,20 @@
1
+ require 'helper'
2
+
3
module CRFPP

  # Tests for Macro: argument defaults and rendering.
  class TestMacro < Test::Unit::TestCase

    # Missing coordinates default to 0; more than two arguments are rejected.
    def test_assignment_of_constructor_arguments
      expectations = [
        [[0, 0], Macro.new],
        [[1, 1], Macro.new(1,1)],
        [[1, 0], Macro.new(1)]
      ]
      expectations.each do |expected, macro|
        assert_equal expected, macro.values
      end

      assert_raise(ArgumentError) do
        Macro.new(1,2,3)
      end
    end

    # The default macro renders as "%x[0,0]".
    def test_to_s
      default_macro = Macro.new
      assert_equal '%x[0,0]', default_macro.to_s
    end

  end

end
@@ -3,12 +3,10 @@ require 'helper'
3
3
  module CRFPP
4
4
  class TestNative < Test::Unit::TestCase
5
5
 
6
- FIXTURES_ROOT = File.expand_path('../../fixtures', __FILE__)
7
-
8
6
  def test_create_a_new_model_file_through_training
9
7
  model = Tempfile.new('model')
10
- assert Native.learn("#{FIXTURES_ROOT}/template #{FIXTURES_ROOT}/train.data #{model.path}")
11
- assert model.length > 0
8
+ assert Native.learn("#{FixturesRoot}/template #{FixturesRoot}/train.data #{model.path}"), 'Native.learn returned with error exit status'
9
+ assert model.length > 0, 'nothing was written to model file'
12
10
  ensure
13
11
  model.close
14
12
  model.unlink
@@ -0,0 +1,23 @@
1
+ require 'helper'
2
+
3
module CRFPP

  # Tests for Template: loading from an empty file and from the fixture.
  class TestTemplate < Test::Unit::TestCase

    # An empty file yields a template that renders as the empty string.
    def test_load_template_from_empty_file
      file = Tempfile.new('template')
      assert Template.new(file.path).to_s.empty?
    ensure
      file.close
      file.unlink
    end

    # Loading and re-rendering the fixture reproduces its content (ignoring
    # a trailing line terminator).
    def test_load_template_from_file
      path = "#{FixturesRoot}/template"
      # File.read closes the handle; File.open(path).read left it open.
      # assert_equal takes the expected value first, so that failure
      # messages label the two values correctly.
      assert_equal File.read(path).chomp, Template.new(path).to_s.chomp
    end

  end

end
@@ -0,0 +1,12 @@
1
+ require 'helper'
2
+
3
module CRFPP
  # Tests for the module-level helpers.
  class TestUtilities < Test::Unit::TestCase

    # Training on the fixture files yields a model whose file is not empty.
    def test_create_a_new_model_file_through_training
      template_path = "#{FixturesRoot}/template"
      training_path = "#{FixturesRoot}/train.data"

      model = CRFPP.learn(template_path, training_path)
      contents = model.open.data

      assert !contents.empty?
    end

  end
end
@@ -1,4 +1,6 @@
1
1
  require 'test/unit'
2
2
  require 'crfpp'
3
3
 
4
- require 'tempfile'
4
module CRFPP
  # Absolute path of the "fixtures" directory that sits next to this file.
  FixturesRoot = File.expand_path('fixtures', File.dirname(__FILE__))
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crfpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-08-16 00:00:00.000000000Z
12
+ date: 2011-08-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &2157069020 !ruby/object:Gem::Requirement
16
+ requirement: &2156849560 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.9'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2157069020
24
+ version_requirements: *2156849560
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake-compiler
27
- requirement: &2157068480 !ruby/object:Gem::Requirement
27
+ requirement: &2156849000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0.7'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2157068480
35
+ version_requirements: *2156849000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: ZenTest
38
- requirement: &2157067980 !ruby/object:Gem::Requirement
38
+ requirement: &2156848460 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '4.6'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2157067980
46
+ version_requirements: *2156848460
47
47
  description: A Ruby extension to interface with CRF++, the Conditional Random Fields
48
48
  library written in C++. You need to install libcrfpp to use this gem.
49
49
  email:
@@ -67,10 +67,21 @@ files:
67
67
  - ext/crfpp/tagger.cpp
68
68
  - ext/crfpp/tagger.hpp
69
69
  - lib/crfpp.rb
70
+ - lib/crfpp/errors.rb
71
+ - lib/crfpp/feature.rb
72
+ - lib/crfpp/filelike.rb
73
+ - lib/crfpp/macro.rb
74
+ - lib/crfpp/model.rb
75
+ - lib/crfpp/template.rb
70
76
  - lib/crfpp/utilities.rb
71
77
  - lib/crfpp/version.rb
78
+ - test/crfpp/test_feature.rb
79
+ - test/crfpp/test_filelike.rb
80
+ - test/crfpp/test_macro.rb
72
81
  - test/crfpp/test_native.rb
73
82
  - test/crfpp/test_tagger.rb
83
+ - test/crfpp/test_template.rb
84
+ - test/crfpp/test_utilities.rb
74
85
  - test/fixtures/template
75
86
  - test/fixtures/test.data
76
87
  - test/fixtures/train.data
@@ -108,8 +119,13 @@ signing_key:
108
119
  specification_version: 3
109
120
  summary: Conditional Random Fields for Ruby.
110
121
  test_files:
122
+ - test/crfpp/test_feature.rb
123
+ - test/crfpp/test_filelike.rb
124
+ - test/crfpp/test_macro.rb
111
125
  - test/crfpp/test_native.rb
112
126
  - test/crfpp/test_tagger.rb
127
+ - test/crfpp/test_template.rb
128
+ - test/crfpp/test_utilities.rb
113
129
  - test/fixtures/template
114
130
  - test/fixtures/test.data
115
131
  - test/fixtures/train.data