crfpp 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/crfpp.rb +9 -0
- data/lib/crfpp/errors.rb +16 -0
- data/lib/crfpp/feature.rb +29 -0
- data/lib/crfpp/filelike.rb +27 -0
- data/lib/crfpp/macro.rb +17 -0
- data/lib/crfpp/model.rb +23 -0
- data/lib/crfpp/template.rb +32 -0
- data/lib/crfpp/utilities.rb +61 -2
- data/lib/crfpp/version.rb +1 -1
- data/test/crfpp/test_feature.rb +25 -0
- data/test/crfpp/test_filelike.rb +19 -0
- data/test/crfpp/test_macro.rb +20 -0
- data/test/crfpp/test_native.rb +2 -4
- data/test/crfpp/test_template.rb +23 -0
- data/test/crfpp/test_utilities.rb +12 -0
- data/test/helper.rb +3 -1
- metadata +24 -8
data/lib/crfpp.rb
CHANGED
@@ -1,4 +1,13 @@
|
|
1
1
|
|
2
|
+
require 'forwardable'
|
3
|
+
require 'tempfile'
|
4
|
+
|
2
5
|
require 'crfpp/version'
|
6
|
+
require 'crfpp/errors'
|
7
|
+
require 'crfpp/filelike'
|
8
|
+
require 'crfpp/macro'
|
9
|
+
require 'crfpp/template'
|
10
|
+
require 'crfpp/feature'
|
11
|
+
require 'crfpp/model'
|
3
12
|
require 'crfpp/native'
|
4
13
|
require 'crfpp/utilities'
|
data/lib/crfpp/errors.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module CRFPP

  # A single feature line of a CRF++ template, e.g. "U01:%x[0,0]".
  #
  # content:: the part after the colon (defaults to a new Macro)
  # type::    the leading letter as a Symbol (defaults to :U)
  # id::      whatever sits between the type letter and the colon
  class Feature < Struct.new(:content, :type, :id)

    class << self
      # Parses one line of a template.
      #
      # Returns a Feature when the line looks like "<U|B><digits>:<content>".
      # Any other line is handed back unchanged, so callers can keep
      # non-feature lines around verbatim.
      def parse(string)
        # The content group is lazy on purpose: a greedy (.+) would swallow
        # trailing blanks and leave nothing for the \s* that is meant to
        # strip them.
        if string =~ /^([UB])(\d*):(.+?)\s*$/
          Feature.new($3, $1.to_sym, $2)
        else
          string
        end
      end
    end

    def initialize(content = Macro.new, type = :U, id = nil)
      super
    end

    # Formats +base+ for output: numbers are zero-padded to at least two
    # digits, everything else (including nil) is returned as is.
    def identifier(base = id)
      base.is_a?(Numeric) ? ('%02d' % base) : base
    end

    # Renders the feature as a template line. +number+ overrides the stored
    # id; a nil identifier is simply left out.
    def to_s(number = id)
      [type.to_s.upcase, identifier(number), ':', content].compact.join
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module CRFPP

  # Mixin for objects that are persisted to a file.
  #
  # Including classes get a +path+ accessor plus +read+ and +write+ helpers.
  # +write+ stores the object's +to_s+ unless explicit content is given.
  module Filelike

    attr_writer :path

    # Returns the path of the backing file. When none was assigned, a
    # temporary file is created on first access.
    def path
      @path ||= begin
        # Keep a reference to the Tempfile: once the object is garbage
        # collected its finalizer deletes the file, which would leave a
        # dangling path behind.
        @tempfile = Tempfile.new('filelike')
        @tempfile.close
        @tempfile.path
      end
    end

    # Writes +content+ (by default the result of +to_s+) to +path+ as UTF-8,
    # replacing previous contents. Returns the number of bytes written.
    def write(content = to_s)
      File.open(path, 'w:UTF-8') { |f| f.write(content) }
    end

    # Returns the contents of the file at +path+, read as UTF-8.
    # Errors from opening the file (e.g. Errno::ENOENT) propagate unchanged.
    def read
      File.open(path, 'r:UTF-8') { |f| f.read }
    end

  end

end
|
data/lib/crfpp/macro.rb
ADDED
data/lib/crfpp/model.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
module CRFPP
  # A model file. The contents are held in +data+ and moved to and from disk
  # through the Filelike helpers.
  class Model

    include Filelike

    attr_reader :data

    # path:: location of the model file; when nil, Filelike#path decides.
    def initialize(path = nil)
      @data, @path = '', path
    end

    # Loads the file contents into +data+. Returns self.
    def open
      @data = read
      self
    end

    # Writes +data+ to the model file. Returns self.
    def save
      # Filelike#write persists the result of #to_s, so no argument is
      # passed here; #to_s below exposes the model data for that purpose.
      write
      self
    end

    # The string that is persisted by #save.
    def to_s
      data
    end

  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module CRFPP
  # A template file, held in memory as a list of entries: parsed Feature
  # objects, plus the raw text of every line Feature.parse handed back
  # unchanged.
  class Template

    extend Forwardable

    include Filelike

    attr_reader :features

    def_delegators :@features, :<<, :length

    # Remembers +path+ and immediately loads the file behind it.
    def initialize(path = nil)
      @path = path
      open
    end

    # (Re)loads all entries from the file. Returns self.
    def open
      @features = read.lines.map do |line|
        Feature.parse(line)
      end
      self
    end

    # Drops all entries.
    def clear
      @features = []
    end

    # Renders the template, one entry per line. Features are numbered
    # consecutively from zero; other lines are emitted without their
    # line terminator.
    def to_s
      counter = 0
      rendered = features.map do |entry|
        if entry.is_a?(Feature)
          text = entry.to_s(counter)
          counter += 1
          text
        else
          entry.chomp
        end
      end
      rendered.join("\n")
    end

  end
end
|
data/lib/crfpp/utilities.rb
CHANGED
@@ -1,12 +1,71 @@
|
|
1
1
|
|
2
2
|
module CRFPP

  # Creates a new Model based on a template and training data.
  #
  # The template parameter is either a Template or something Template.new
  # accepts. The data parameter can either be an array of strings or a
  # filename; anything that is not the name of an existing file is written
  # to a temporary file that is removed again before the method returns.
  # The possible options are:
  #
  # :threads: False or the number of threads to use (default is 2).
  #
  # :algorithm: L1 or L2 (default)
  #
  # :cost: With this option, you can change the hyper-parameter for the CRFs.
  #   With larger C value, CRF tends to overfit to the given training
  #   corpus. This parameter trades the balance between overfitting and
  #   underfitting. The results will significantly be influenced by this
  #   parameter. You can find an optimal value by using held-out data or
  #   more general model selection method such as cross validation.
  #
  # :frequency: This parameter sets the cut-off threshold for the features. CRF++
  #   uses the features that occur no less than NUM times in the given training
  #   data. The default value is 1. When you apply CRF++ to large data, the
  #   number of unique features would amount to several millions. This option is
  #   useful in such cases.
  #
  # Returns the Model whose path was handed to the native trainer.
  # Raises NativeError when the native call reports failure.
  def learn(template, data, options = {})
    options = { :threads => 2, :algorithm => :L2, :cost => 1.0, :frequency => 1 }.merge(options)

    tempfile = nil

    # Only a String can name a file; arrays of lines must not reach
    # File.exist?, which raises a TypeError for them.
    unless data.is_a?(String) && File.exist?(data)
      tempfile = save_data_to_tempfile([data].flatten)
      data = tempfile.path
    end

    template = Template.new(template) unless template.is_a?(Template)
    model = Model.new

    arguments = []

    # TODO check algorithm names
    # arguments << "--algorithm=#{options[:algorithm]}"

    arguments << "--cost=#{options[:cost]}"
    arguments << "--thread=#{options[:threads]}"
    arguments << "--freq=#{options[:frequency]}"

    arguments << template.path
    arguments << data
    arguments << model.path

    # NOTE(review): arguments are joined with blanks, so paths containing
    # whitespace presumably break the native call - confirm against Native.
    success = Native.learn(arguments.join(' '))
    raise NativeError, 'crfpp learn failed' unless success

    model
  ensure
    # Close and delete the temporary data file, if one was created.
    tempfile.close! if tempfile
  end

  alias train learn

  module_function :train, :learn

  # Saves data (an array of lines) to a temporary file and returns the
  # Tempfile, flushed so that other readers see the contents. The caller
  # is responsible for removing it.
  def save_data_to_tempfile(data)
    tmp = Tempfile.new('crfpp-data')
    tmp.write(data.join("\n"))
    tmp.flush
    tmp
  rescue
    # Do not leave a half-written file behind.
    tmp.close! if tmp
    raise
  end

  # The helper must exist on the module itself, otherwise CRFPP.learn cannot
  # call it; it stays private in both the mixin and the module-level form.
  module_function :save_data_to_tempfile
  private_class_method :save_data_to_tempfile

end
|
data/lib/crfpp/version.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module CRFPP

  # Tests for Feature: constructor arity, rendering and parsing.
  class TestFeature < Test::Unit::TestCase

    # More than three constructor arguments must be rejected.
    def test_assignment_of_constructor_arguments
      assert_raise(ArgumentError) do
        Feature.new(1,2,3,4)
      end
    end

    # Rendering with the default macro, a custom macro and explicit numbers.
    def test_to_s
      build = lambda { Feature.new(Macro.new(2,3)) }

      assert_equal 'U:%x[0,0]', Feature.new.to_s
      assert_equal 'U:%x[2,3]', build.call.to_s
      assert_equal 'U01:%x[2,3]', build.call.to_s(1)
      assert_equal 'U123:%x[2,3]', build.call.to_s(123)
    end

    # A parsed feature line renders back to the same text.
    def test_parse_feature_strings
      line = 'U:%x[42,23]/AB'
      assert_equal 'U:%x[42,23]/AB', Feature.parse(line).to_s
    end

  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module CRFPP
  # Tests for the Filelike mixin, using an anonymous including class.
  class TestFilelike < Test::Unit::TestCase

    # Builds a fresh anonymous class that mixes in Filelike.
    def setup
      @klass = Class.new { include Filelike }
    end

    # A new instance of the anonymous class.
    def subject
      @klass.new
    end

    # Without an assigned path, an instance still reports one.
    def test_has_a_path_by_default
      instance = subject
      assert instance.path
    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module CRFPP

  # Tests for Macro: constructor defaults and rendering.
  class TestMacro < Test::Unit::TestCase

    # Omitted constructor arguments default to 0; a third one is an error.
    def test_assignment_of_constructor_arguments
      expectations = [
        [[],     [0, 0]],
        [[1, 1], [1, 1]],
        [[1],    [1, 0]]
      ]

      expectations.each do |arguments, values|
        assert_equal values, Macro.new(*arguments).values
      end

      assert_raise(ArgumentError) do
        Macro.new(1,2,3)
      end
    end

    # The default macro renders as '%x[0,0]'.
    def test_to_s
      macro = Macro.new
      assert_equal '%x[0,0]', macro.to_s
    end

  end

end
|
data/test/crfpp/test_native.rb
CHANGED
@@ -3,12 +3,10 @@ require 'helper'
|
|
3
3
|
module CRFPP
|
4
4
|
class TestNative < Test::Unit::TestCase
|
5
5
|
|
6
|
-
FIXTURES_ROOT = File.expand_path('../../fixtures', __FILE__)
|
7
|
-
|
8
6
|
def test_create_a_new_model_file_through_training
|
9
7
|
model = Tempfile.new('model')
|
10
|
-
assert Native.learn("#{
|
11
|
-
assert model.length > 0
|
8
|
+
assert Native.learn("#{FixturesRoot}/template #{FixturesRoot}/train.data #{model.path}"), 'Native.learn returned with error exit status'
|
9
|
+
assert model.length > 0, 'nothing was written to model file'
|
12
10
|
ensure
|
13
11
|
model.close
|
14
12
|
model.unlink
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module CRFPP

  # Tests for Template: loading from an empty file and from the fixture.
  class TestTemplate < Test::Unit::TestCase

    # A template loaded from an empty file renders as an empty string.
    def test_load_template_from_empty_file
      file = Tempfile.new('template')
      assert Template.new(file.path).to_s.empty?
    ensure
      file.close
      file.unlink
    end

    # Loading the fixture and rendering it again reproduces the file,
    # ignoring a trailing line terminator.
    def test_load_template_from_file
      path = "#{FixturesRoot}/template"
      # File.read closes the file again; the expected value goes first so
      # that failure messages label both sides correctly.
      assert_equal File.read(path).chomp, Template.new(path).to_s.chomp
    end

  end

end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
module CRFPP
  # Tests for the module-level training helper.
  class TestUtilities < Test::Unit::TestCase

    # Training on the fixtures must yield a model with non-empty data.
    def test_create_a_new_model_file_through_training
      template = "#{FixturesRoot}/template"
      training = "#{FixturesRoot}/train.data"

      model = CRFPP.learn(template, training)
      contents = model.open.data

      assert !contents.empty?
    end

  end
end
|
data/test/helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crfpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-08-
|
12
|
+
date: 2011-08-17 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2156849560 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.9'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2156849560
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake-compiler
|
27
|
-
requirement: &
|
27
|
+
requirement: &2156849000 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0.7'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2156849000
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ZenTest
|
38
|
-
requirement: &
|
38
|
+
requirement: &2156848460 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '4.6'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2156848460
|
47
47
|
description: A Ruby extension to interface with CRF++, the Conditional Random Fields
|
48
48
|
library written in C++. You need to install libcrfpp to use this gem.
|
49
49
|
email:
|
@@ -67,10 +67,21 @@ files:
|
|
67
67
|
- ext/crfpp/tagger.cpp
|
68
68
|
- ext/crfpp/tagger.hpp
|
69
69
|
- lib/crfpp.rb
|
70
|
+
- lib/crfpp/errors.rb
|
71
|
+
- lib/crfpp/feature.rb
|
72
|
+
- lib/crfpp/filelike.rb
|
73
|
+
- lib/crfpp/macro.rb
|
74
|
+
- lib/crfpp/model.rb
|
75
|
+
- lib/crfpp/template.rb
|
70
76
|
- lib/crfpp/utilities.rb
|
71
77
|
- lib/crfpp/version.rb
|
78
|
+
- test/crfpp/test_feature.rb
|
79
|
+
- test/crfpp/test_filelike.rb
|
80
|
+
- test/crfpp/test_macro.rb
|
72
81
|
- test/crfpp/test_native.rb
|
73
82
|
- test/crfpp/test_tagger.rb
|
83
|
+
- test/crfpp/test_template.rb
|
84
|
+
- test/crfpp/test_utilities.rb
|
74
85
|
- test/fixtures/template
|
75
86
|
- test/fixtures/test.data
|
76
87
|
- test/fixtures/train.data
|
@@ -108,8 +119,13 @@ signing_key:
|
|
108
119
|
specification_version: 3
|
109
120
|
summary: Conditional Random Fields for Ruby.
|
110
121
|
test_files:
|
122
|
+
- test/crfpp/test_feature.rb
|
123
|
+
- test/crfpp/test_filelike.rb
|
124
|
+
- test/crfpp/test_macro.rb
|
111
125
|
- test/crfpp/test_native.rb
|
112
126
|
- test/crfpp/test_tagger.rb
|
127
|
+
- test/crfpp/test_template.rb
|
128
|
+
- test/crfpp/test_utilities.rb
|
113
129
|
- test/fixtures/template
|
114
130
|
- test/fixtures/test.data
|
115
131
|
- test/fixtures/train.data
|