feature_set 0.0.1 → 0.0.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/lib/feature_set/builder.rb +35 -8
- data/lib/feature_set/version.rb +1 -1
- data/spec/feature_set/builder_spec.rb +20 -1
- metadata +9 -9
    
        data/lib/feature_set/builder.rb
    CHANGED
    
    | @@ -11,35 +11,62 @@ module FeatureSet | |
| 11 11 | 
             
                BUILTIN_FEATURE_BUILDERS = %w[FeatureSet::FeatureBuilder::Cuss 
         | 
| 12 12 | 
             
                                              FeatureSet::FeatureBuilder::WordVector].map(&:constantize)
         | 
| 13 13 |  | 
| 14 | 
            -
                attr_accessor :options, :feature_builders, :data, :features
         | 
| 14 | 
            +
                attr_accessor :options, :feature_builders, :data, :features, :name
         | 
| 15 15 |  | 
| 16 16 | 
             
                def initialize(options = {})
         | 
| 17 17 | 
             
                  @options = options
         | 
| 18 | 
            +
                  @name = options[:name]
         | 
| 18 19 | 
             
                  @feature_builders = []
         | 
| 19 20 | 
             
                  @features = []
         | 
| 20 21 | 
             
                  @data = []
         | 
| 21 22 | 
             
                end
         | 
| 22 23 |  | 
| 23 24 | 
             
                def add_data(data)
         | 
| 24 | 
            -
                  clear_features
         | 
| 25 25 | 
             
                  (@data << data).flatten!
         | 
| 26 26 | 
             
                end
         | 
| 27 27 |  | 
| 28 28 | 
             
                def clear_data
         | 
| 29 29 | 
             
                  @data = []
         | 
| 30 | 
            -
                  clear_features
         | 
| 31 30 | 
             
                end
         | 
| 32 31 |  | 
| 33 32 | 
             
                def clear_features
         | 
| 34 33 | 
             
                  @features = []
         | 
| 35 34 | 
             
                end
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                def arff
         | 
| 37 | 
            +
                  relation = Rarff::Relation.new(name || 'Data')
         | 
| 38 | 
            +
                  keys = features.first.keys
         | 
| 39 | 
            +
                  instances = features.map do |row|
         | 
| 40 | 
            +
                    keys.map do |key|
         | 
| 41 | 
            +
                      value = row[key]
         | 
| 42 | 
            +
                      if value.is_a?(String)
         | 
| 43 | 
            +
                        value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
         | 
| 44 | 
            +
                      elsif value.is_a?(Symbol)
         | 
| 45 | 
            +
                        value.to_s
         | 
| 46 | 
            +
                      else
         | 
| 47 | 
            +
                        value
         | 
| 48 | 
            +
                      end
         | 
| 49 | 
            +
                    end
         | 
| 50 | 
            +
                  end
         | 
| 51 | 
            +
                  relation.instances = instances
         | 
| 52 | 
            +
                  keys.each_with_index do |key, index|
         | 
| 53 | 
            +
                    relation.attributes[index].name = key.to_s
         | 
| 54 | 
            +
                  end
         | 
| 55 | 
            +
                  relation
         | 
| 56 | 
            +
                end
         | 
| 36 57 |  | 
| 37 58 | 
             
                def generate_features(opts = {})
         | 
| 38 | 
            -
                   | 
| 59 | 
            +
                  wrapped_data = self.class.wrap_dataset(data)
         | 
| 60 | 
            +
                  feature_builders.each {|fb| fb.before_generate_features(wrapped_data) }
         | 
| 61 | 
            +
                  @features = generate_features_for(wrapped_data, opts.merge(:already_wrapped => true))
         | 
| 62 | 
            +
                end
         | 
| 39 63 |  | 
| 40 | 
            -
             | 
| 41 | 
            -
                  
         | 
| 42 | 
            -
                   | 
| 64 | 
            +
                def generate_features_for(data, opts = {})
         | 
| 65 | 
            +
                  # FYI, we explicitly do not call before_generate_features because this can be used on unknown rows for classification, and
         | 
| 66 | 
            +
                  # we want our feature generators to keep any cached data from the previous 'generate_features' feature building call.  This is
         | 
| 67 | 
            +
                  # important for Wordvector, for example, since it needs to build the idf mappings beforehand and we want them used on any new data.
         | 
| 68 | 
            +
                  wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
         | 
| 69 | 
            +
                  wrapped_data.map do |row|
         | 
| 43 70 | 
             
                    output_row = {}
         | 
| 44 71 |  | 
| 45 72 | 
             
                    row.each do |key, datum|
         | 
| @@ -56,7 +83,7 @@ module FeatureSet | |
| 56 83 | 
             
                    output_row
         | 
| 57 84 | 
             
                  end
         | 
| 58 85 | 
             
                end
         | 
| 59 | 
            -
             | 
| 86 | 
            +
             | 
| 60 87 | 
             
                def add_feature_builders(*builders)
         | 
| 61 88 | 
             
                  builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
         | 
| 62 89 | 
             
                  (@feature_builders << builders).flatten!
         | 
    
        data/lib/feature_set/version.rb
    CHANGED
    
    
        data/spec/feature_set/builder_spec.rb
    CHANGED
    
| @@ -68,5 +68,24 @@ describe FeatureSet::Builder do | |
| 68 68 | 
             
                  @builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
         | 
| 69 69 | 
             
                  @builder.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
         | 
| 70 70 | 
             
                end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                it "should allow generation of features on new data while leaving the old data intact" do
         | 
| 73 | 
            +
                  @builder.generate_features
         | 
| 74 | 
            +
                  num_features = @builder.features.length
         | 
| 75 | 
            +
                  @builder.generate_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }]).should == [{ :status_cuss_count => 1 }, { :status_cuss_count => 0 }]
         | 
| 76 | 
            +
                  @builder.features.length.should == num_features
         | 
| 77 | 
            +
                end
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                describe "outputing an ARFF file" do
         | 
| 80 | 
            +
                  it "should return a rarff relation object" do
         | 
| 81 | 
            +
                    @builder.generate_features
         | 
| 82 | 
            +
                    arff = @builder.arff
         | 
| 83 | 
            +
                    arff.should be_a(Rarff::Relation)
         | 
| 84 | 
            +
                    arff.attributes.first.name.should == "status_cuss_count"
         | 
| 85 | 
            +
                    arff.attributes.last.name.should == "class"
         | 
| 86 | 
            +
                    arff.to_s.should =~ /Data/
         | 
| 87 | 
            +
                    arff.to_s.should =~ /status_cuss_count/
         | 
| 88 | 
            +
                  end
         | 
| 89 | 
            +
                end
         | 
| 71 90 | 
             
              end
         | 
| 72 | 
            -
            end
         | 
| 91 | 
            +
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: feature_set
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0.1
         | 
| 4 | 
            +
              version: 0.0.2
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -13,7 +13,7 @@ date: 2011-12-17 00:00:00.000000000Z | |
| 13 13 | 
             
            dependencies:
         | 
| 14 14 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 15 15 | 
             
              name: rspec
         | 
| 16 | 
            -
              requirement: & | 
| 16 | 
            +
              requirement: &70136561584740 !ruby/object:Gem::Requirement
         | 
| 17 17 | 
             
                none: false
         | 
| 18 18 | 
             
                requirements:
         | 
| 19 19 | 
             
                - - ! '>='
         | 
| @@ -21,10 +21,10 @@ dependencies: | |
| 21 21 | 
             
                    version: '0'
         | 
| 22 22 | 
             
              type: :development
         | 
| 23 23 | 
             
              prerelease: false
         | 
| 24 | 
            -
              version_requirements: * | 
| 24 | 
            +
              version_requirements: *70136561584740
         | 
| 25 25 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 26 26 | 
             
              name: wwood-rarff
         | 
| 27 | 
            -
              requirement: & | 
| 27 | 
            +
              requirement: &70136561584320 !ruby/object:Gem::Requirement
         | 
| 28 28 | 
             
                none: false
         | 
| 29 29 | 
             
                requirements:
         | 
| 30 30 | 
             
                - - ! '>='
         | 
| @@ -32,10 +32,10 @@ dependencies: | |
| 32 32 | 
             
                    version: '0'
         | 
| 33 33 | 
             
              type: :runtime
         | 
| 34 34 | 
             
              prerelease: false
         | 
| 35 | 
            -
              version_requirements: * | 
| 35 | 
            +
              version_requirements: *70136561584320
         | 
| 36 36 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 37 37 | 
             
              name: activesupport
         | 
| 38 | 
            -
              requirement: & | 
| 38 | 
            +
              requirement: &70136561583900 !ruby/object:Gem::Requirement
         | 
| 39 39 | 
             
                none: false
         | 
| 40 40 | 
             
                requirements:
         | 
| 41 41 | 
             
                - - ! '>='
         | 
| @@ -43,10 +43,10 @@ dependencies: | |
| 43 43 | 
             
                    version: '0'
         | 
| 44 44 | 
             
              type: :runtime
         | 
| 45 45 | 
             
              prerelease: false
         | 
| 46 | 
            -
              version_requirements: * | 
| 46 | 
            +
              version_requirements: *70136561583900
         | 
| 47 47 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 48 48 | 
             
              name: i18n
         | 
| 49 | 
            -
              requirement: & | 
| 49 | 
            +
              requirement: &70136561583480 !ruby/object:Gem::Requirement
         | 
| 50 50 | 
             
                none: false
         | 
| 51 51 | 
             
                requirements:
         | 
| 52 52 | 
             
                - - ! '>='
         | 
| @@ -54,7 +54,7 @@ dependencies: | |
| 54 54 | 
             
                    version: '0'
         | 
| 55 55 | 
             
              type: :runtime
         | 
| 56 56 | 
             
              prerelease: false
         | 
| 57 | 
            -
              version_requirements: * | 
| 57 | 
            +
              version_requirements: *70136561583480
         | 
| 58 58 | 
             
            description: FeatureSet is a Ruby library for generating feature vectors from textual
         | 
| 59 59 | 
             
              data.  It can output in ARFF format for experimentation with Weka.
         | 
| 60 60 | 
             
            email:
         |