feature_set 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,35 +11,62 @@ module FeatureSet
11
11
  BUILTIN_FEATURE_BUILDERS = %w[FeatureSet::FeatureBuilder::Cuss
12
12
  FeatureSet::FeatureBuilder::WordVector].map(&:constantize)
13
13
 
14
- attr_accessor :options, :feature_builders, :data, :features
14
+ attr_accessor :options, :feature_builders, :data, :features, :name
15
15
 
16
16
  def initialize(options = {})
17
17
  @options = options
18
+ @name = options[:name]
18
19
  @feature_builders = []
19
20
  @features = []
20
21
  @data = []
21
22
  end
22
23
 
23
24
  def add_data(data)
24
- clear_features
25
25
  (@data << data).flatten!
26
26
  end
27
27
 
28
28
  def clear_data
29
29
  @data = []
30
- clear_features
31
30
  end
32
31
 
33
32
  def clear_features
34
33
  @features = []
35
34
  end
35
+
36
+ def arff
37
+ relation = Rarff::Relation.new(name || 'Data')
38
+ keys = features.first.keys
39
+ instances = features.map do |row|
40
+ keys.map do |key|
41
+ value = row[key]
42
+ if value.is_a?(String)
43
+ value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
44
+ elsif value.is_a?(Symbol)
45
+ value.to_s
46
+ else
47
+ value
48
+ end
49
+ end
50
+ end
51
+ relation.instances = instances
52
+ keys.each_with_index do |key, index|
53
+ relation.attributes[index].name = key.to_s
54
+ end
55
+ relation
56
+ end
36
57
 
37
58
  def generate_features(opts = {})
38
- wrapped_data_set = self.class.wrap_dataset(data)
59
+ wrapped_data = self.class.wrap_dataset(data)
60
+ feature_builders.each {|fb| fb.before_generate_features(wrapped_data) }
61
+ @features = generate_features_for(wrapped_data, opts.merge(:already_wrapped => true))
62
+ end
39
63
 
40
- feature_builders.each {|fb| fb.before_generate_features(wrapped_data_set) }
41
-
42
- @features = wrapped_data_set.map do |row|
64
+ def generate_features_for(data, opts = {})
65
+ # FYI, we explicitly do not call before_generate_features because this can be used on unknown rows for classification, and
66
+ # we want our feature generators to keep any cached data from the previous 'generate_features' feature building call. This is
67
+ # important for WordVector, for example, since it needs to build the IDF mappings beforehand and we want them used on any new data.
68
+ wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
69
+ wrapped_data.map do |row|
43
70
  output_row = {}
44
71
 
45
72
  row.each do |key, datum|
@@ -56,7 +83,7 @@ module FeatureSet
56
83
  output_row
57
84
  end
58
85
  end
59
-
86
+
60
87
  def add_feature_builders(*builders)
61
88
  builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
62
89
  (@feature_builders << builders).flatten!
@@ -1,3 +1,3 @@
1
1
  module FeatureSet
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -68,5 +68,24 @@ describe FeatureSet::Builder do
68
68
  @builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
69
69
  @builder.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
70
70
  end
71
+
72
+ it "should allow generation of features on new data while leaving the old data intact" do
73
+ @builder.generate_features
74
+ num_features = @builder.features.length
75
+ @builder.generate_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }]).should == [{ :status_cuss_count => 1 }, { :status_cuss_count => 0 }]
76
+ @builder.features.length.should == num_features
77
+ end
78
+
79
+ describe "outputing an ARFF file" do
80
+ it "should return a rarff relation object" do
81
+ @builder.generate_features
82
+ arff = @builder.arff
83
+ arff.should be_a(Rarff::Relation)
84
+ arff.attributes.first.name.should == "status_cuss_count"
85
+ arff.attributes.last.name.should == "class"
86
+ arff.to_s.should =~ /Data/
87
+ arff.to_s.should =~ /status_cuss_count/
88
+ end
89
+ end
71
90
  end
72
- end
91
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feature_set
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-12-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70284888584540 !ruby/object:Gem::Requirement
16
+ requirement: &70136561584740 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70284888584540
24
+ version_requirements: *70136561584740
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: wwood-rarff
27
- requirement: &70284888584120 !ruby/object:Gem::Requirement
27
+ requirement: &70136561584320 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70284888584120
35
+ version_requirements: *70136561584320
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &70284888583700 !ruby/object:Gem::Requirement
38
+ requirement: &70136561583900 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70284888583700
46
+ version_requirements: *70136561583900
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: i18n
49
- requirement: &70284888583280 !ruby/object:Gem::Requirement
49
+ requirement: &70136561583480 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70284888583280
57
+ version_requirements: *70136561583480
58
58
  description: FeatureSet is a Ruby library for generating feature vectors from textual
59
59
  data. It can output in ARFF format for experimentation with Weka.
60
60
  email: