feature_set 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/feature_set/builder.rb +35 -8
- data/lib/feature_set/version.rb +1 -1
- data/spec/feature_set/builder_spec.rb +20 -1
- metadata +9 -9
data/lib/feature_set/builder.rb
CHANGED
@@ -11,35 +11,62 @@ module FeatureSet
|
|
11
11
|
BUILTIN_FEATURE_BUILDERS = %w[FeatureSet::FeatureBuilder::Cuss
|
12
12
|
FeatureSet::FeatureBuilder::WordVector].map(&:constantize)
|
13
13
|
|
14
|
-
attr_accessor :options, :feature_builders, :data, :features
|
14
|
+
attr_accessor :options, :feature_builders, :data, :features, :name
|
15
15
|
|
16
16
|
def initialize(options = {})
|
17
17
|
@options = options
|
18
|
+
@name = options[:name]
|
18
19
|
@feature_builders = []
|
19
20
|
@features = []
|
20
21
|
@data = []
|
21
22
|
end
|
22
23
|
|
23
24
|
def add_data(data)
|
24
|
-
clear_features
|
25
25
|
(@data << data).flatten!
|
26
26
|
end
|
27
27
|
|
28
28
|
def clear_data
|
29
29
|
@data = []
|
30
|
-
clear_features
|
31
30
|
end
|
32
31
|
|
33
32
|
def clear_features
|
34
33
|
@features = []
|
35
34
|
end
|
35
|
+
|
36
|
+
def arff
|
37
|
+
relation = Rarff::Relation.new(name || 'Data')
|
38
|
+
keys = features.first.keys
|
39
|
+
instances = features.map do |row|
|
40
|
+
keys.map do |key|
|
41
|
+
value = row[key]
|
42
|
+
if value.is_a?(String)
|
43
|
+
value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
|
44
|
+
elsif value.is_a?(Symbol)
|
45
|
+
value.to_s
|
46
|
+
else
|
47
|
+
value
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
relation.instances = instances
|
52
|
+
keys.each_with_index do |key, index|
|
53
|
+
relation.attributes[index].name = key.to_s
|
54
|
+
end
|
55
|
+
relation
|
56
|
+
end
|
36
57
|
|
37
58
|
def generate_features(opts = {})
|
38
|
-
|
59
|
+
wrapped_data = self.class.wrap_dataset(data)
|
60
|
+
feature_builders.each {|fb| fb.before_generate_features(wrapped_data) }
|
61
|
+
@features = generate_features_for(wrapped_data, opts.merge(:already_wrapped => true))
|
62
|
+
end
|
39
63
|
|
40
|
-
|
41
|
-
|
42
|
-
|
64
|
+
def generate_features_for(data, opts = {})
|
65
|
+
# FYI, we explicitly do not call before_generate_features because this can be used on unknown rows for classification, and
|
66
|
+
# we want our feature generators to keep any cached data from the previous 'generate_features' feature building call. This is
|
67
|
+
# important for Wordvector, for example, since it needs to build the idf mappings beforehand and we want them used on any new data.
|
68
|
+
wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
|
69
|
+
wrapped_data.map do |row|
|
43
70
|
output_row = {}
|
44
71
|
|
45
72
|
row.each do |key, datum|
|
@@ -56,7 +83,7 @@ module FeatureSet
|
|
56
83
|
output_row
|
57
84
|
end
|
58
85
|
end
|
59
|
-
|
86
|
+
|
60
87
|
def add_feature_builders(*builders)
|
61
88
|
builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
|
62
89
|
(@feature_builders << builders).flatten!
|
data/lib/feature_set/version.rb
CHANGED
@@ -68,5 +68,24 @@ describe FeatureSet::Builder do
|
|
68
68
|
@builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
|
69
69
|
@builder.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
|
70
70
|
end
|
71
|
+
|
72
|
+
it "should allow generation of features on new data while leaving the old data intact" do
|
73
|
+
@builder.generate_features
|
74
|
+
num_features = @builder.features.length
|
75
|
+
@builder.generate_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }]).should == [{ :status_cuss_count => 1 }, { :status_cuss_count => 0 }]
|
76
|
+
@builder.features.length.should == num_features
|
77
|
+
end
|
78
|
+
|
79
|
+
describe "outputing an ARFF file" do
|
80
|
+
it "should return a rarff relation object" do
|
81
|
+
@builder.generate_features
|
82
|
+
arff = @builder.arff
|
83
|
+
arff.should be_a(Rarff::Relation)
|
84
|
+
arff.attributes.first.name.should == "status_cuss_count"
|
85
|
+
arff.attributes.last.name.should == "class"
|
86
|
+
arff.to_s.should =~ /Data/
|
87
|
+
arff.to_s.should =~ /status_cuss_count/
|
88
|
+
end
|
89
|
+
end
|
71
90
|
end
|
72
|
-
end
|
91
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feature_set
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-17 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70136561584740 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70136561584740
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: wwood-rarff
|
27
|
-
requirement: &
|
27
|
+
requirement: &70136561584320 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70136561584320
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: activesupport
|
38
|
-
requirement: &
|
38
|
+
requirement: &70136561583900 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70136561583900
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: i18n
|
49
|
-
requirement: &
|
49
|
+
requirement: &70136561583480 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70136561583480
|
58
58
|
description: FeatureSet is a Ruby library for generating feature vectors from textual
|
59
59
|
data. It can output in ARFF format for experimentation with Weka.
|
60
60
|
email:
|