feature_set 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,35 +11,62 @@ module FeatureSet
11
11
  BUILTIN_FEATURE_BUILDERS = %w[FeatureSet::FeatureBuilder::Cuss
12
12
  FeatureSet::FeatureBuilder::WordVector].map(&:constantize)
13
13
 
14
- attr_accessor :options, :feature_builders, :data, :features
14
+ attr_accessor :options, :feature_builders, :data, :features, :name
15
15
 
16
16
  def initialize(options = {})
17
17
  @options = options
18
+ @name = options[:name]
18
19
  @feature_builders = []
19
20
  @features = []
20
21
  @data = []
21
22
  end
22
23
 
23
24
  def add_data(data)
24
- clear_features
25
25
  (@data << data).flatten!
26
26
  end
27
27
 
28
28
  def clear_data
29
29
  @data = []
30
- clear_features
31
30
  end
32
31
 
33
32
  def clear_features
34
33
  @features = []
35
34
  end
35
+
36
+ def arff
37
+ relation = Rarff::Relation.new(name || 'Data')
38
+ keys = features.first.keys
39
+ instances = features.map do |row|
40
+ keys.map do |key|
41
+ value = row[key]
42
+ if value.is_a?(String)
43
+ value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
44
+ elsif value.is_a?(Symbol)
45
+ value.to_s
46
+ else
47
+ value
48
+ end
49
+ end
50
+ end
51
+ relation.instances = instances
52
+ keys.each_with_index do |key, index|
53
+ relation.attributes[index].name = key.to_s
54
+ end
55
+ relation
56
+ end
36
57
 
37
58
  def generate_features(opts = {})
38
- wrapped_data_set = self.class.wrap_dataset(data)
59
+ wrapped_data = self.class.wrap_dataset(data)
60
+ feature_builders.each {|fb| fb.before_generate_features(wrapped_data) }
61
+ @features = generate_features_for(wrapped_data, opts.merge(:already_wrapped => true))
62
+ end
39
63
 
40
- feature_builders.each {|fb| fb.before_generate_features(wrapped_data_set) }
41
-
42
- @features = wrapped_data_set.map do |row|
64
+ def generate_features_for(data, opts = {})
65
+ # FYI, we explicitly do not call before_generate_features because this can be used on unknown rows for classification, and
66
+ # we want our feature generators to keep any cached data from the previous 'generate_features' feature building call. This is
67
+ # important for WordVector, for example, since it needs to build the IDF mappings beforehand and we want them used on any new data.
68
+ wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
69
+ wrapped_data.map do |row|
43
70
  output_row = {}
44
71
 
45
72
  row.each do |key, datum|
@@ -56,7 +83,7 @@ module FeatureSet
56
83
  output_row
57
84
  end
58
85
  end
59
-
86
+
60
87
  def add_feature_builders(*builders)
61
88
  builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
62
89
  (@feature_builders << builders).flatten!
@@ -1,3 +1,3 @@
1
1
  module FeatureSet
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -68,5 +68,24 @@ describe FeatureSet::Builder do
68
68
  @builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
69
69
  @builder.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
70
70
  end
71
+
72
+ it "should allow generation of features on new data while leaving the old data intact" do
73
+ @builder.generate_features
74
+ num_features = @builder.features.length
75
+ @builder.generate_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }]).should == [{ :status_cuss_count => 1 }, { :status_cuss_count => 0 }]
76
+ @builder.features.length.should == num_features
77
+ end
78
+
79
+ describe "outputing an ARFF file" do
80
+ it "should return a rarff relation object" do
81
+ @builder.generate_features
82
+ arff = @builder.arff
83
+ arff.should be_a(Rarff::Relation)
84
+ arff.attributes.first.name.should == "status_cuss_count"
85
+ arff.attributes.last.name.should == "class"
86
+ arff.to_s.should =~ /Data/
87
+ arff.to_s.should =~ /status_cuss_count/
88
+ end
89
+ end
71
90
  end
72
- end
91
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feature_set
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2011-12-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70284888584540 !ruby/object:Gem::Requirement
16
+ requirement: &70136561584740 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70284888584540
24
+ version_requirements: *70136561584740
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: wwood-rarff
27
- requirement: &70284888584120 !ruby/object:Gem::Requirement
27
+ requirement: &70136561584320 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70284888584120
35
+ version_requirements: *70136561584320
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &70284888583700 !ruby/object:Gem::Requirement
38
+ requirement: &70136561583900 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70284888583700
46
+ version_requirements: *70136561583900
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: i18n
49
- requirement: &70284888583280 !ruby/object:Gem::Requirement
49
+ requirement: &70136561583480 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70284888583280
57
+ version_requirements: *70136561583480
58
58
  description: FeatureSet is a Ruby library for generating feature vectors from textual
59
59
  data. It can output in ARFF format for experimentation with Weka.
60
60
  email: