feature_set 0.0.2 → 0.0.4

@@ -1,8 +1,36 @@
1
- ## FeatureSet
1
+ # This library is alpha and is not yet finished.
2
2
 
3
+ # FeatureSet
3
4
 
4
- Helpful tasks:
5
+ A Ruby library for building machine learning datasets.
5
6
 
6
- - rake build
7
- - rake install
8
- - rake release
7
+ In machine learning, feature selection is often more difficult than algorithm selection. For many classes of problems, any reasonably modern algorithm will do (e.g., an SVM, a decision tree, etc.). However, all of these algorithms need information-rich features to learn from, and finding and constructing those features is often its own engineering challenge. FeatureSet is a library that makes it easy to construct features from your data as a pre-processing step before applying a machine learning library such as Weka or libsvm.
8
+
9
+ FeatureSet takes a dataset consisting of hashes, with any object as the value of each key, and builds features from these values as appropriate. For example, a string value could be expanded into a number of new features: a count of cuss words in the string, a count of slang terms, a sentiment score, and/or a complete word vector with TF-IDF values.
10
+
11
+ FeatureSet is extensible: anyone can write new FeatureBuilders that know which datatypes they apply to (a sketch of a custom builder follows the example code below). The set of included feature builders expands as the community submits new ones.
12
+
13
+ ## FeatureBuilders
14
+
15
+ ## Example Code
16
+
17
+ data_set = FeatureSet::DataSet.new
18
+ data_set.add_feature_builder FeatureSet::FeatureBuilders::WordVector.new(:word_limit => 2000, :idf_cutoff => 8.0)
19
+ data_set.add_feature_builder FeatureSet::FeatureBuilders::Cuss.new
20
+ data_set.add_data :status => "This is a spam email", :class => :spam
21
+ data_set.add_data :status => "This is a not spam", :class => :not_spam
22
+ data_set.build_features_from_data!(:include_original => false) # do not include :status as its own column in the output
23
+
24
+ # The following ARFF can be imported into Weka
25
+ puts data_set.to_rarff.to_s
26
+
27
+ serialized_builders = data_set.dump_feature_builders
28
+
29
+ ... later ...
30
+
31
+ data_set = FeatureSet::DataSet.new
32
+ data_set.load_feature_builders(serialized_builders)
33
+ features = data_set.build_features_for({ :status => "Is this spam?" })
34
+
35
+
36
+ See the specs for more usage examples.
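The feature builders shipped in this release are Cuss (distinct cuss-word counts), Emoticon (happy/sad/humor emoticon counts), and WordVector (TF-IDF word vectors). To add your own, subclass FeatureSet::FeatureBuilders::Base and implement build_features (plus before_build_features if you need corpus-wide statistics). A minimal sketch of a hypothetical builder that counts exclamation marks in string values (the Exclamation name and :exclamation_count feature are illustrative, not part of the gem):

    require "feature_set/feature_builders/base"

    module FeatureSet
      module FeatureBuilders
        class Exclamation < Base
          # Return a hash of feature name => value; DataSet prefixes each
          # name with the column key (e.g. :status_exclamation_count).
          def build_features(datum, key, row)
            return {} unless datum.value.is_a?(String)
            { :exclamation_count => datum.value.count("!") }
          end
        end
      end
    end

    data_set = FeatureSet::DataSet.new
    data_set.add_feature_builder FeatureSet::FeatureBuilders::Exclamation.new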
@@ -20,7 +20,7 @@ Gem::Specification.new do |s|
20
20
 
21
21
  # specify any dependencies here; for example:
22
22
  s.add_development_dependency "rspec"
23
- s.add_runtime_dependency "wwood-rarff"
23
+ s.add_runtime_dependency "iterationlabs-rarff"
24
24
  s.add_runtime_dependency "activesupport"
25
25
  s.add_runtime_dependency "i18n"
26
26
  end
@@ -1,4 +1,4 @@
1
+ require "rubygems"
1
2
  require "rarff"
2
-
3
3
  require "feature_set/version"
4
- require "feature_set/builder"
4
+ require "feature_set/data_set"
@@ -0,0 +1,135 @@
1
+ require 'active_support'
2
+ require 'active_support/inflector'
3
+
4
+ require "feature_set/feature_builders/word_vector"
5
+ require "feature_set/feature_builders/cuss"
6
+ require "feature_set/feature_builders/emoticon"
7
+
8
+ require "feature_set/datum"
9
+
10
+ module FeatureSet
11
+ class DataSet
12
+ BUILTIN_FEATURE_BUILDERS = %w[FeatureSet::FeatureBuilders::Cuss
13
+ FeatureSet::FeatureBuilders::Emoticon
14
+ FeatureSet::FeatureBuilders::WordVector].map(&:constantize)
15
+
16
+ attr_accessor :options, :feature_builders, :data, :features, :name
17
+
18
+ def initialize(options = {})
19
+ @options = options
20
+ @name = options[:name]
21
+ @feature_builders = []
22
+ @features = []
23
+ @data = []
24
+ end
25
+
26
+ def add_data(data)
27
+ (@data << data).flatten!
28
+ end
29
+
30
+ def clear_data
31
+ @data = []
32
+ end
33
+
34
+ def clear_features
35
+ @features = []
36
+ end
37
+
38
+ def to_rarff
39
+ relation = Rarff::Relation.new(name || 'Data')
40
+ keys = features.first.keys
41
+ instances = features.map do |row|
42
+ keys.map do |key|
43
+ value = row[key]
44
+ if value.is_a?(String)
45
+ value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
46
+ elsif value.is_a?(Symbol)
47
+ value.to_s
48
+ else
49
+ value
50
+ end
51
+ end
52
+ end
53
+ relation.instances = instances
54
+ keys.each_with_index do |key, index|
55
+ relation.attributes[index].name = key.to_s
56
+ end
57
+ relation
58
+ end
59
+
60
+ # This only knows how to output ARFFs with true/false classes and all-numeric attributes.
61
+ # Additionally, every row must have the same attributes.
62
+ def output_numeric_arff(io)
63
+ keys = features.first.keys
64
+ io.puts "@RELATION Data"
65
+ keys.each do |key|
66
+ io.puts "@ATTRIBUTE #{key} NUMERIC" unless key == :class
67
+ end
68
+ io.puts "@ATTRIBUTE class {false,true}"
69
+ io.puts "@DATA"
70
+ features.each do |feature|
71
+ io.puts keys.map { |k| k == :class ? feature[k].to_s : feature[k].to_f }.join(",")
72
+ end
73
+ end
74
+
75
+ def build_features_from_data!(opts = {})
76
+ wrapped_data = self.class.wrap_dataset(data)
77
+ feature_builders.each {|fb| fb.before_build_features(wrapped_data) }
78
+ @features = build_features_for(wrapped_data, opts.merge(:already_wrapped => true))
79
+ end
80
+
81
+ def build_features_for(data, opts = {})
82
+ # FYI, we explicitly do not call before_build_features because this can be used on unknown rows for classification, and
83
+ # we want our feature builders to keep any cached data from the previous 'build_features_from_data!' call. This is important for
84
+ # WordVector, for example, since it needs to build the idf mappings beforehand and must re-use them on any new data.
85
+ wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
86
+ wrapped_data.map.with_index do |row, index|
87
+ output_row = {}
88
+
89
+ row.each do |key, datum|
90
+ if key == :class
91
+ output_row[:class] = datum
92
+ next
93
+ end
94
+
95
+ if opts[:include_original] && (opts[:include_original].is_a?(TrueClass) || ![opts[:include_original][:except]].flatten.include?(key))
96
+ output_row[key] = datum.value
97
+ end
98
+
99
+ feature_builders.each do |builder|
100
+ builder.build_features(datum, key, row).each do |feature, value|
101
+ output_row["#{key}_#{feature}".to_sym] = value
102
+ end
103
+ end
104
+ end
105
+
106
+ if index % 10 == 0
107
+ STDERR.print "."; STDERR.flush
108
+ end
109
+
110
+ output_row
111
+ end
112
+ end
113
+
114
+ def add_feature_builders(*builders)
115
+ builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
116
+ (@feature_builders << builders).flatten!
117
+ end
118
+ alias_method :add_feature_builder, :add_feature_builders
119
+
120
+
121
+ def dump_feature_builders
122
+ Marshal.dump(feature_builders)
123
+ end
124
+
125
+ def load_feature_builders(serialized_builders)
126
+ clear_features
127
+ self.feature_builders = Marshal.load(serialized_builders)
128
+ end
129
+
130
+ def self.wrap_dataset(dataset)
131
+ dataset = [dataset] unless dataset.is_a?(Array)
132
+ dataset.map { |row| row.inject({}) { |m, (k, v)| m[k] = (k == :class ? v : Datum.new(v)) ; m } }
133
+ end
134
+ end
135
+ end
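A data set built this way can be exported for Weka either through to_rarff (which handles string, symbol, and numeric values) or through output_numeric_arff, which expects all-numeric features and a true/false class. A rough usage sketch under those constraints (the data.arff filename is just an example):

    data_set = FeatureSet::DataSet.new
    data_set.add_feature_builder FeatureSet::FeatureBuilders::Cuss.new
    data_set.add_data :status => "this is some text", :class => false
    data_set.add_data :status => "this is some shitty text", :class => true
    data_set.build_features_from_data!(:include_original => false)

    # Full ARFF via rarff:
    puts data_set.to_rarff.to_s

    # Numeric-only ARFF written to any IO object:
    File.open("data.arff", "w") { |io| data_set.output_numeric_arff(io) }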
@@ -0,0 +1,18 @@
1
+ module FeatureSet
2
+ module FeatureBuilders
3
+ class Base
4
+ attr_accessor :options
5
+
6
+ def initialize(options = {})
7
+ @options = options
8
+ end
9
+
10
+ def build_features(datum, key, row)
11
+ raise "Please implement 'build_features' in your subclass of FeatureBuilders::Base."
12
+ end
13
+
14
+ def before_build_features(dataset)
15
+ end
16
+ end
17
+ end
18
+ end
@@ -1,11 +1,11 @@
1
- require "feature_set/feature_builder/base"
1
+ require "feature_set/feature_builders/base"
2
2
 
3
3
  module FeatureSet
4
- module FeatureBuilder
4
+ module FeatureBuilders
5
5
  class Cuss < Base
6
6
  CUSS_WORDS = File.read(File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'cusswords.txt'))).split("\n").map {|i| i.strip.downcase }
7
7
 
8
- def generate_features(datum, key, row)
8
+ def build_features(datum, key, row)
9
9
  return {} unless datum.value.is_a?(String)
10
10
  { :cuss_count => (datum.tokens & CUSS_WORDS).length }
11
11
  end
@@ -0,0 +1,19 @@
1
+ require "feature_set/feature_builders/base"
2
+
3
+ module FeatureSet
4
+ module FeatureBuilders
5
+ class Emoticon < Base
6
+ HAPPY = [">:]", ":-)", ":)", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}", ":^)", ">:D", ":-D", ":D", "8-D", "8D", "x-D", "xD", "X-D", "XD", "=-D", "=D", "=-3", "=3"]
7
+ SAD = [":'(", ";*(", ":_(", "T.T", "T_T", "Y.Y", "Y_Y", ">:[", ":-(", ":(", ":-c", ":c", ":-<", ":<", ":-[", ":[", ":{", ">.>", "<.<", ">.<", "D:<", "D:", "D8", "D;", "D=", "DX", "v.v", "D-':"]
8
+ HUMOR = [">;]", ";-)", ";)", "*-)", "*)", ";-]", ";]", ";D", ">:P", ":-P", ":P", "X-P", "x-p", "xp", "XP", ":-p", ":p", "=p", ":-b", ":b"]
9
+
10
+ def build_features(datum, key, row)
11
+ return {} unless datum.value.is_a?(String)
12
+ tokens = datum.value.split(/\s+/)
13
+ { :happy_emoticon_count => (tokens & HAPPY).length,
14
+ :sad_emoticon_count => (tokens & SAD).length,
15
+ :humor_emoticon_count => (tokens & HUMOR).length }
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,85 @@
1
+ require "feature_set/feature_builders/base"
2
+
3
+ module FeatureSet
4
+ module FeatureBuilders
5
+ class WordVector < Base
6
+ attr_accessor :idfs
7
+
8
+ # Options:
9
+ # :tf_only => true|false, default is false
10
+ # :idf_cutoff => <cutoff>, default is 10
11
+ # :word_limit => <word limit>, default is 2000
12
+ def initialize(options = {})
13
+ super
14
+ @idfs = {}
15
+ end
16
+
17
+ def before_build_features(dataset)
18
+ @idfs = {}
19
+ dataset.each do |row|
20
+ row.each do |key, datum|
21
+ next if key == :class
22
+ if datum.value.is_a?(String)
23
+ idfs[key] ||= {}
24
+ datum.token_counts.keys.each do |token|
25
+ idfs[key][token] ||= 0
26
+ idfs[key][token] += 1
27
+ end
28
+ end
29
+ end
30
+ end
31
+
32
+ num_docs = dataset.length
33
+ idf_cutoff = (options[:idf_cutoff] || 10).to_f
34
+ word_limit = options[:word_limit] || 2000
35
+ STDERR.puts "Done building df counts. The dataset has #{num_docs} documents."
36
+
37
+ idfs.each do |feature, freqs|
38
+ pruned = 0
39
+ if options[:tf_only]
40
+ new_freqs = freqs
41
+ else
42
+ new_freqs = {}
43
+ freqs.each do |key, value|
44
+ log = Math.log(num_docs / value.to_f)
45
+ if log < idf_cutoff
46
+ new_freqs[key] = log
47
+ else
48
+ pruned += 1
49
+ end
50
+ end
51
+ end
52
+ if options[:word_limit]
53
+ new_freqs = if options[:tf_only]
54
+ new_freqs.to_a.sort {|a, b| b.last <=> a.last }
55
+ else
56
+ new_freqs.to_a.sort {|a, b| a.last <=> b.last }
57
+ end
58
+ new_freqs = new_freqs[0...word_limit].inject({}) { |m, (k, v)| m[k] = v; m }
59
+ end
60
+ idfs[feature] = new_freqs
61
+ STDERR.puts "Done calculating idfs for #{feature}. Pruned #{pruned} rare values, leaving #{idfs[feature].length} values."
62
+ end
63
+ end
64
+
65
+ def build_features(datum, key, row)
66
+ return {} unless datum.value.is_a?(String)
67
+ num_words = datum.tokens.length.to_f
68
+ unless idfs[key]
69
+ STDERR.puts "WARNING: build_features called on untrained data in WordVector. Are you calling 'data_set.build_features_for' without calling 'data_set.build_features_from_data!' first?"
70
+ end
71
+ if options[:tf_only]
72
+ (idfs[key] || {}).inject({}) do |memo, (word, idf)|
73
+ memo["wv_#{word}"] = ((datum.token_counts[word] || 0) / num_words)
74
+ memo
75
+ end
76
+ else
77
+ (idfs[key] || {}).inject({}) do |memo, (word, idf)|
78
+ memo["wv_#{word}"] = ((datum.token_counts[word] || 0) / num_words) * idf
79
+ memo
80
+ end
81
+ end
82
+ end
83
+ end
84
+ end
85
+ end
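To recap how WordVector arrives at its weights: before_build_features counts, per column, how many documents contain each token and stores idf = Math.log(num_docs / doc_count), dropping words whose idf is at or above :idf_cutoff and keeping at most :word_limit words per column; build_features then emits a wv_<word> feature equal to (token count / tokens in the document) * idf, or the raw term frequency when :tf_only is set. A small sketch using the same two-document dataset as the specs, so the values in the comments follow directly from that formula:

    builder = FeatureSet::FeatureBuilders::WordVector.new(:word_limit => 2000, :idf_cutoff => 8.0)
    rows = FeatureSet::DataSet.wrap_dataset([
      { :m1 => "hello world. hello!", :class => :yes },
      { :m1 => "foo world",           :class => :no }
    ])
    builder.before_build_features(rows)   # builds document-frequency counts and idfs

    # "hello" appears in 1 of 2 documents, so its idf is Math.log(2/1.0);
    # in the first row its term frequency is 2/3, giving
    # features["wv_hello"] == (2/3.0) * Math.log(2/1.0)
    features = builder.build_features(rows.first[:m1], :m1, rows.first)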
@@ -1,3 +1,3 @@
1
1
  module FeatureSet
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -0,0 +1,135 @@
1
+ require 'spec_helper'
2
+
3
+ describe FeatureSet::DataSet do
4
+ describe "adding feature builders" do
5
+ it "can add all known feature builders" do
6
+ data_set = FeatureSet::DataSet.new
7
+ data_set.add_feature_builders :all
8
+ data_set.feature_builders.map {|i| i.class}.should include(FeatureSet::FeatureBuilders::WordVector)
9
+ data_set.feature_builders.length.should == Dir[File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "lib", "feature_set", "feature_builders", "*.rb"))].length - 1
10
+ end
11
+
12
+ it "can add individual feature builders" do
13
+ data_set = FeatureSet::DataSet.new
14
+ data_set.add_feature_builders FeatureSet::FeatureBuilders::WordVector.new
15
+ data_set.feature_builders.length.should == 1
16
+ end
17
+
18
+ it "can add arrays of feature builders" do
19
+ data_set = FeatureSet::DataSet.new
20
+ data_set.add_feature_builders [FeatureSet::FeatureBuilders::WordVector.new, FeatureSet::FeatureBuilders::Cuss.new]
21
+ data_set.feature_builders.length.should == 2
22
+ end
23
+ end
24
+
25
+ describe "adding data" do
26
+ it "should accept mappings between one or more strings and their classifications" do
27
+ data_set = FeatureSet::DataSet.new
28
+ data_set.add_data [ { :status => "I am happy!", :class => :happy },
29
+ { :status => "I am sad." , :class => :sad } ]
30
+ data_set.data.should == [ { :status => "I am happy!", :class => :happy },
31
+ { :status => "I am sad." , :class => :sad } ]
32
+ data_set.add_data :status => "Something", :another_feature => "Something else", :class => :awesome
33
+ data_set.data.should == [ { :status => "I am happy!", :class => :happy },
34
+ { :status => "I am sad." , :class => :sad },
35
+ { :status => "Something", :another_feature => "Something else", :class => :awesome } ]
36
+ data_set.clear_data
37
+ data_set.data.should == []
38
+ data_set.data = [ { :status => "I am happy!", :class => :happy },
39
+ { :status => "I am sad." , :class => :sad } ]
40
+ data_set.data.should == [ { :status => "I am happy!", :class => :happy },
41
+ { :status => "I am sad." , :class => :sad } ]
42
+ end
43
+ end
44
+
45
+ describe "generating features" do
46
+ before do
47
+ @data_set = FeatureSet::DataSet.new
48
+ @data_set.add_feature_builders FeatureSet::FeatureBuilders::Cuss.new
49
+ @data_set.add_data :status => "this is some text", :class => :awesome
50
+ @data_set.add_data :status => "this is some shitty text", :class => :less_awesome
51
+ end
52
+
53
+ it "should output a row of features for every line of data" do
54
+ @data_set.build_features_from_data!
55
+ @data_set.features[0].should == { :status_cuss_count => 0, :class => :awesome }
56
+ @data_set.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
57
+ end
58
+
59
+ it "should make it easy to keep the original data" do
60
+ @data_set.build_features_from_data!(:include_original => true)
61
+ @data_set.features[0].should == { :status => "this is some text", :status_cuss_count => 0, :class => :awesome }
62
+ @data_set.features[1].should == { :status => "this is some shitty text", :status_cuss_count => 1, :class => :less_awesome }
63
+ end
64
+
65
+ it "should generate features for every string" do
66
+ @data_set.add_data :status => "text", :foo => "more shitty text", :class => :awesome
67
+ @data_set.build_features_from_data!
68
+ @data_set.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
69
+ @data_set.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
70
+ end
71
+
72
+ it "should allow generation of features on new data while leaving the old data intact" do
73
+ @data_set.build_features_from_data!
74
+ num_features = @data_set.features.length
75
+ @data_set.build_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }]).should == [{ :status_cuss_count => 1 }, { :status_cuss_count => 0 }]
76
+ @data_set.features.length.should == num_features
77
+ end
78
+ end
79
+
80
+ describe "serialization" do
81
+ it "should be able to serialize, saving all trained builders, but not the dataset" do
82
+ data_set = FeatureSet::DataSet.new
83
+ data_set.add_feature_builder FeatureSet::FeatureBuilders::WordVector.new
84
+ data_set.add_data :status => "this is some text", :class => :awesome
85
+ data_set.add_data :status => "this is some shitty text", :class => :less_awesome
86
+ data_set.build_features_from_data!
87
+ trained_rows = data_set.build_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }])
88
+ serialized_builders = data_set.dump_feature_builders
89
+
90
+ data_set = FeatureSet::DataSet.new
91
+ data_set.add_feature_builder FeatureSet::FeatureBuilders::WordVector.new
92
+ untrained_rows = data_set.build_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }])
93
+
94
+ data_set2 = FeatureSet::DataSet.new
95
+ data_set2.load_feature_builders(serialized_builders)
96
+ data_set2.data.should == []
97
+ rows_from_dump = data_set2.build_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }])
98
+ rows_from_dump.should == trained_rows
99
+ rows_from_dump.should_not == untrained_rows
100
+ end
101
+ end
102
+
103
+ describe "outputing an ARFF file" do
104
+ before do
105
+ @data_set = FeatureSet::DataSet.new
106
+ @data_set.add_feature_builders FeatureSet::FeatureBuilders::Cuss.new
107
+ @data_set.add_data :status => "this is some text", :foo => 2, :class => :awesome
108
+ @data_set.add_data :status => "this is some shitty text", :foo => 5, :class => :less_awesome
109
+ end
110
+
111
+ describe "as an rarff relation" do
112
+ it "should return a rarff relation object" do
113
+ @data_set.build_features_from_data!(:include_original => { :except => :status })
114
+ arff = @data_set.to_rarff
115
+ arff.should be_a(Rarff::Relation)
116
+ arff.attributes.map(&:name).should =~ ["status_cuss_count", "class", "foo"]
117
+ arff.attributes.last.name.should == "class"
118
+ arff.to_s.should =~ /Data/
119
+ arff.to_s.should =~ /status_cuss_count/
120
+ end
121
+ end
122
+
123
+ describe "as a numeric arff" do
124
+ it "should output an arff to an IO object" do
125
+ @data_set.build_features_from_data!(:include_original => { :except => :status })
126
+ io = StringIO.new
127
+ @data_set.output_numeric_arff(io)
128
+ io.rewind
129
+ str = io.read
130
+ str.should =~ /@ATTRIBUTE status_cuss_count NUMERIC/
131
+ str.scan(/@ATTRIBUTE class /).length.should == 1
132
+ end
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,16 @@
1
+ require 'spec_helper'
2
+
3
+ describe FeatureSet::FeatureBuilders::Cuss do
4
+ before do
5
+ @builder = FeatureSet::FeatureBuilders::Cuss.new
6
+ end
7
+
8
+ it "should output :cuss_count as the number of distinct cuss words found" do
9
+ @builder.build_features(FeatureSet::Datum.new("this fucking shit"), nil, nil).should == { :cuss_count => 2 }
10
+ @builder.build_features(FeatureSet::Datum.new("this fucking fucking fucking shit"), nil, nil).should == { :cuss_count => 2 }
11
+ end
12
+
13
+ it "should ignore non-string features" do
14
+ @builder.build_features(FeatureSet::Datum.new(2), nil, nil).should == {}
15
+ end
16
+ end
@@ -0,0 +1,16 @@
1
+ require 'spec_helper'
2
+
3
+ describe FeatureSet::FeatureBuilders::Emoticon do
4
+ before do
5
+ @builder = FeatureSet::FeatureBuilders::Emoticon.new
6
+ end
7
+
8
+ it "should output counts of the number of distinct emoticons of each type" do
9
+ @builder.build_features(FeatureSet::Datum.new("blah :) XP"), nil, nil).should == { :happy_emoticon_count => 1, :humor_emoticon_count => 1, :sad_emoticon_count => 0 }
10
+ @builder.build_features(FeatureSet::Datum.new("blah ;) :("), nil, nil).should == { :happy_emoticon_count => 0, :humor_emoticon_count => 1, :sad_emoticon_count => 1 }
11
+ end
12
+
13
+ it "should ignore non-string features" do
14
+ @builder.build_features(FeatureSet::Datum.new(2), nil, nil).should == {}
15
+ end
16
+ end
@@ -0,0 +1,87 @@
1
+ require 'spec_helper'
2
+
3
+ describe FeatureSet::FeatureBuilders::WordVector do
4
+ it "should output a named feature for every word in the dataset, after performing tfidf" do
5
+ builder = FeatureSet::FeatureBuilders::WordVector.new
6
+ dataset = [
7
+ { :m1 => "hello world. hello!", :m2 => "how goes?", :class => :yes },
8
+ { :m1 => "foo world", :m2 => "how?", :class => :no }
9
+ ]
10
+ wrapped_dataset = FeatureSet::DataSet.wrap_dataset(dataset)
11
+ builder.before_build_features(wrapped_dataset)
12
+
13
+ builder.idfs.should == {
14
+ :m1 => { "hello" => Math.log(2/1.0), "world" => Math.log(2/2.0), "foo" => Math.log(2/1.0) },
15
+ :m2 => { "how" => Math.log(2/2.0), "goes" => Math.log(2/1.0) }
16
+ }
17
+
18
+ builder.build_features(wrapped_dataset.first[:m1], :m1, wrapped_dataset.first).should == { "wv_hello" => (2/3.0) * Math.log(2/1.0), "wv_world" => (1/3.0) * Math.log(2/2.0), "wv_foo" => 0 }
19
+ builder.build_features(wrapped_dataset.first[:m2], :m2, wrapped_dataset.first).should == { "wv_how" => (1/2.0) * Math.log(2/2.0), "wv_goes" => (1/2.0) * Math.log(2/1.0) }
20
+
21
+ builder.build_features(wrapped_dataset.last[:m1], :m1, wrapped_dataset.last).should == { "wv_hello" => 0, "wv_world" => (1/2.0) * Math.log(2/2.0), "wv_foo" => (1/2.0) * Math.log(2/1.0) }
22
+ builder.build_features(wrapped_dataset.last[:m2], :m2, wrapped_dataset.last).should == { "wv_how" => (1/1.0) * Math.log(2/2.0), "wv_goes" => 0 }
23
+ end
24
+
25
+ it "should ignore non-string features" do
26
+ builder = FeatureSet::FeatureBuilders::WordVector.new
27
+ builder.before_build_features([{ :something => FeatureSet::Datum.new(2), :class => false }, { :something => FeatureSet::Datum.new(1), :class => true }])
28
+ builder.build_features(FeatureSet::Datum.new(2), :something, { :something => FeatureSet::Datum.new(2), :class => false }).should == {}
29
+ end
30
+
31
+ it "should allow specifying the idf cutoff" do
32
+ builder = FeatureSet::FeatureBuilders::WordVector.new(:idf_cutoff => 2.0)
33
+ dataset = [{ :m1 => "hello world. hello!", :class => true }] * 10
34
+ dataset << { :m1 => "foo", :class => false }
35
+ wrapped_dataset = FeatureSet::DataSet.wrap_dataset(dataset)
36
+ builder.before_build_features(wrapped_dataset)
37
+ builder.idfs.should == {
38
+ :m1 => { "hello" => Math.log(11/10.0), "world" => Math.log(11/10.0) }
39
+ }
40
+ end
41
+
42
+ it "should allow specifying a word-count threshold" do
43
+ builder = FeatureSet::FeatureBuilders::WordVector.new(:word_limit => 2)
44
+ dataset = [{ :m1 => "hello world. hello!", :class => true }] * 10
45
+ dataset << { :m1 => "foo", :class => false }
46
+ dataset << { :m1 => "hello", :class => false }
47
+ dataset << { :m1 => "hello", :class => false }
48
+ wrapped_dataset = FeatureSet::DataSet.wrap_dataset(dataset)
49
+ builder.before_build_features(wrapped_dataset)
50
+ builder.idfs.should == {
51
+ :m1 => { "hello" => Math.log(13/12.0), "world" => Math.log(13/10.0) }
52
+ }
53
+
54
+ builder = FeatureSet::FeatureBuilders::WordVector.new(:word_limit => 1)
55
+ dataset = [{ :m1 => "hello world. hello!", :class => true }] * 10
56
+ dataset << { :m1 => "foo", :class => false }
57
+ dataset << { :m1 => "world", :class => false }
58
+ dataset << { :m1 => "world", :class => false }
59
+ wrapped_dataset = FeatureSet::DataSet.wrap_dataset(dataset)
60
+ builder.before_build_features(wrapped_dataset)
61
+ builder.idfs.should == {
62
+ :m1 => { "world" => Math.log(13/12.0) }
63
+ }
64
+ end
65
+
66
+ it "should allow use of TF-only without IDF" do
67
+ builder = FeatureSet::FeatureBuilders::WordVector.new(:tf_only => true, :word_limit => 2)
68
+ dataset = [
69
+ { :m1 => "hello world. hello!", :m2 => "how goes?", :class => :yes },
70
+ { :m1 => "foo world", :m2 => "how?", :class => :no },
71
+ { :m1 => "hello world!", :m2 => "how goes it?", :class => :no }
72
+ ]
73
+ wrapped_dataset = FeatureSet::DataSet.wrap_dataset(dataset)
74
+ builder.before_build_features(wrapped_dataset)
75
+
76
+ builder.idfs.should == {
77
+ :m1 => { "hello" => 2, "world" => 3 },
78
+ :m2 => { "how" => 3, "goes" => 2 }
79
+ }
80
+
81
+ builder.build_features(wrapped_dataset.first[:m1], :m1, wrapped_dataset.first).should == { "wv_hello" => (2/3.0), "wv_world" => (1/3.0) }
82
+ builder.build_features(wrapped_dataset.first[:m2], :m2, wrapped_dataset.first).should == { "wv_how" => (1/2.0), "wv_goes" => (1/2.0) }
83
+
84
+ builder.build_features(wrapped_dataset[1][:m1], :m1, wrapped_dataset[1]).should == { "wv_hello" => 0, "wv_world" => (1/2.0) }
85
+ builder.build_features(wrapped_dataset[1][:m2], :m2, wrapped_dataset[1]).should == { "wv_how" => (1/1.0) , "wv_goes" => 0 }
86
+ end
87
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feature_set
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-12-17 00:00:00.000000000Z
12
+ date: 2012-01-23 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70136561584740 !ruby/object:Gem::Requirement
16
+ requirement: &70355551090860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70136561584740
24
+ version_requirements: *70355551090860
25
25
  - !ruby/object:Gem::Dependency
26
- name: wwood-rarff
27
- requirement: &70136561584320 !ruby/object:Gem::Requirement
26
+ name: iterationlabs-rarff
27
+ requirement: &70355551090340 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70136561584320
35
+ version_requirements: *70355551090340
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: activesupport
38
- requirement: &70136561583900 !ruby/object:Gem::Requirement
38
+ requirement: &70355551089920 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70136561583900
46
+ version_requirements: *70355551089920
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: i18n
49
- requirement: &70136561583480 !ruby/object:Gem::Requirement
49
+ requirement: &70355551075900 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70136561583480
57
+ version_requirements: *70355551075900
58
58
  description: FeatureSet is a Ruby library for generating feature vectors from textual
59
59
  data. It can output in ARFF format for experimentation with Weka.
60
60
  email:
@@ -70,17 +70,19 @@ files:
70
70
  - Rakefile
71
71
  - feature_set.gemspec
72
72
  - lib/feature_set.rb
73
- - lib/feature_set/builder.rb
74
73
  - lib/feature_set/data/cusswords.txt
74
+ - lib/feature_set/data_set.rb
75
75
  - lib/feature_set/datum.rb
76
- - lib/feature_set/feature_builder/base.rb
77
- - lib/feature_set/feature_builder/cuss.rb
78
- - lib/feature_set/feature_builder/word_vector.rb
76
+ - lib/feature_set/feature_builders/base.rb
77
+ - lib/feature_set/feature_builders/cuss.rb
78
+ - lib/feature_set/feature_builders/emoticon.rb
79
+ - lib/feature_set/feature_builders/word_vector.rb
79
80
  - lib/feature_set/version.rb
80
- - spec/feature_set/builder_spec.rb
81
+ - spec/feature_set/data_set_spec.rb
81
82
  - spec/feature_set/datum_spec.rb
82
- - spec/feature_set/feature/cuss_spec.rb
83
- - spec/feature_set/feature/word_vector_spec.rb
83
+ - spec/feature_set/feature_builders/cuss_spec.rb
84
+ - spec/feature_set/feature_builders/emoticon_spec.rb
85
+ - spec/feature_set/feature_builders/word_vector_spec.rb
84
86
  - spec/spec.opts
85
87
  - spec/spec_helper.rb
86
88
  homepage: https://github.com/iterationlabs/feature_set
@@ -108,9 +110,10 @@ signing_key:
108
110
  specification_version: 3
109
111
  summary: Generate feature vectors from textual data
110
112
  test_files:
111
- - spec/feature_set/builder_spec.rb
113
+ - spec/feature_set/data_set_spec.rb
112
114
  - spec/feature_set/datum_spec.rb
113
- - spec/feature_set/feature/cuss_spec.rb
114
- - spec/feature_set/feature/word_vector_spec.rb
115
+ - spec/feature_set/feature_builders/cuss_spec.rb
116
+ - spec/feature_set/feature_builders/emoticon_spec.rb
117
+ - spec/feature_set/feature_builders/word_vector_spec.rb
115
118
  - spec/spec.opts
116
119
  - spec/spec_helper.rb
@@ -1,97 +0,0 @@
1
- require 'active_support'
2
- require 'active_support/inflector'
3
-
4
- require "feature_set/feature_builder/word_vector"
5
- require "feature_set/feature_builder/cuss"
6
-
7
- require "feature_set/datum"
8
-
9
- module FeatureSet
10
- class Builder
11
- BUILTIN_FEATURE_BUILDERS = %w[FeatureSet::FeatureBuilder::Cuss
12
- FeatureSet::FeatureBuilder::WordVector].map(&:constantize)
13
-
14
- attr_accessor :options, :feature_builders, :data, :features, :name
15
-
16
- def initialize(options = {})
17
- @options = options
18
- @name = options[:name]
19
- @feature_builders = []
20
- @features = []
21
- @data = []
22
- end
23
-
24
- def add_data(data)
25
- (@data << data).flatten!
26
- end
27
-
28
- def clear_data
29
- @data = []
30
- end
31
-
32
- def clear_features
33
- @features = []
34
- end
35
-
36
- def arff
37
- relation = Rarff::Relation.new(name || 'Data')
38
- keys = features.first.keys
39
- instances = features.map do |row|
40
- keys.map do |key|
41
- value = row[key]
42
- if value.is_a?(String)
43
- value.gsub(/\\/, "\\\\\\\\").gsub(/"/, "\\\\\"").gsub(/'/, '\\\\\'')
44
- elsif value.is_a?(Symbol)
45
- value.to_s
46
- else
47
- value
48
- end
49
- end
50
- end
51
- relation.instances = instances
52
- keys.each_with_index do |key, index|
53
- relation.attributes[index].name = key.to_s
54
- end
55
- relation
56
- end
57
-
58
- def generate_features(opts = {})
59
- wrapped_data = self.class.wrap_dataset(data)
60
- feature_builders.each {|fb| fb.before_generate_features(wrapped_data) }
61
- @features = generate_features_for(wrapped_data, opts.merge(:already_wrapped => true))
62
- end
63
-
64
- def generate_features_for(data, opts = {})
65
- # FYI, we explicitly do not call before_generate_features because this can be used on unknown rows for classification, and
66
- # we want our feature generators to keep any cached data from the previous 'generate_features' feature building call. This is
67
- # important for Wordvector, for example, since it needs to build the idf mappings beforehand and we want them used on any new data.
68
- wrapped_data = opts[:already_wrapped] ? data : self.class.wrap_dataset(data)
69
- wrapped_data.map do |row|
70
- output_row = {}
71
-
72
- row.each do |key, datum|
73
- (output_row[:class] = datum) and next if key == :class
74
- output_row[key] = datum.value if opts[:include_original]
75
-
76
- feature_builders.each do |builder|
77
- builder.generate_features(datum, key, row).each do |feature, value|
78
- output_row["#{key}_#{feature}".to_sym] = value
79
- end
80
- end
81
- end
82
-
83
- output_row
84
- end
85
- end
86
-
87
- def add_feature_builders(*builders)
88
- builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
89
- (@feature_builders << builders).flatten!
90
- end
91
- alias_method :add_feature_builder, :add_feature_builders
92
-
93
- def self.wrap_dataset(dataset)
94
- dataset.map { |row| row.inject({}) { |m, (k, v)| m[k] = (k == :class ? v : Datum.new(v)) ; m } }
95
- end
96
- end
97
- end
@@ -1,18 +0,0 @@
1
- module FeatureSet
2
- module FeatureBuilder
3
- class Base
4
- attr_accessor :options
5
-
6
- def initialize(options = {})
7
- @options = options
8
- end
9
-
10
- def generate_features(datum, key, row)
11
- raise "Please implement 'generate_features' in your subclass of FeatureBuilder::Base."
12
- end
13
-
14
- def before_generate_features(dataset)
15
- end
16
- end
17
- end
18
- end
@@ -1,45 +0,0 @@
1
- require "feature_set/feature_builder/base"
2
-
3
- module FeatureSet
4
- module FeatureBuilder
5
- class WordVector < Base
6
- attr_accessor :idfs
7
-
8
- def initialize(options = {})
9
- super
10
- end
11
-
12
- def before_generate_features(dataset)
13
- @idfs = {}
14
- dataset.each do |row|
15
- row.each do |key, datum|
16
- next if key == :class
17
- if datum.value.is_a?(String)
18
- idfs[key] ||= {}
19
- datum.token_counts.keys.each do |token|
20
- idfs[key][token] ||= 0
21
- idfs[key][token] += 1
22
- end
23
- end
24
- end
25
- end
26
-
27
- num_docs = dataset.length
28
- idfs.each do |feature, freqs|
29
- freqs.each do |key, value|
30
- idfs[feature][key] = Math.log(num_docs / value.to_f)
31
- end
32
- end
33
-
34
- def generate_features(datum, key, row)
35
- return {} unless datum.value.is_a?(String)
36
- num_words = datum.tokens.length.to_f
37
- idfs[key].inject({}) do |memo, (word, idf)|
38
- memo[word] = ((datum.token_counts[word] || 0) / num_words) * idf
39
- memo
40
- end
41
- end
42
- end
43
- end
44
- end
45
- end
@@ -1,91 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe FeatureSet::Builder do
4
- describe "adding feature builders" do
5
- it "can add all known feature builders" do
6
- builder = FeatureSet::Builder.new
7
- builder.add_feature_builders :all
8
- builder.feature_builders.map {|i| i.class}.should include(FeatureSet::FeatureBuilder::WordVector)
9
- builder.feature_builders.length.should == Dir[File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "lib", "feature_set", "feature_builder", "*.rb"))].length - 1
10
- end
11
-
12
- it "can add individual feature builders" do
13
- builder = FeatureSet::Builder.new
14
- builder.add_feature_builder FeatureSet::FeatureBuilder::WordVector.new
15
- builder.feature_builders.length.should == 1
16
- end
17
-
18
- it "can add arrays of feature builders" do
19
- builder = FeatureSet::Builder.new
20
- builder.add_feature_builders [FeatureSet::FeatureBuilder::WordVector.new, FeatureSet::FeatureBuilder::Cuss.new]
21
- builder.feature_builders.length.should == 2
22
- end
23
- end
24
-
25
- describe "adding data" do
26
- it "should accept mappings between one or more strings and their classifications" do
27
- builder = FeatureSet::Builder.new
28
- builder.add_data [ { :status => "I am happy!", :class => :happy },
29
- { :status => "I am sad." , :class => :sad } ]
30
- builder.data.should == [ { :status => "I am happy!", :class => :happy },
31
- { :status => "I am sad." , :class => :sad } ]
32
- builder.add_data :status => "Something", :another_feature => "Something else", :class => :awesome
33
- builder.data.should == [ { :status => "I am happy!", :class => :happy },
34
- { :status => "I am sad." , :class => :sad },
35
- { :status => "Something", :another_feature => "Something else", :class => :awesome } ]
36
- builder.clear_data
37
- builder.data.should == []
38
- builder.data = [ { :status => "I am happy!", :class => :happy },
39
- { :status => "I am sad." , :class => :sad } ]
40
- builder.data.should == [ { :status => "I am happy!", :class => :happy },
41
- { :status => "I am sad." , :class => :sad } ]
42
- end
43
- end
44
-
45
- describe "generating features" do
46
- before do
47
- @builder = FeatureSet::Builder.new
48
- @builder.add_feature_builder FeatureSet::FeatureBuilder::Cuss.new
49
- @builder.add_data :status => "this is some text", :class => :awesome
50
- @builder.add_data :status => "this is some shitty text", :class => :less_awesome
51
- end
52
-
53
- it "should output a row of features for every line of data" do
54
- @builder.generate_features
55
- @builder.features[0].should == { :status_cuss_count => 0, :class => :awesome }
56
- @builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
57
- end
58
-
59
- it "should make it easy to keep the original data" do
60
- @builder.generate_features(:include_original => true)
61
- @builder.features[0].should == { :status => "this is some text", :status_cuss_count => 0, :class => :awesome }
62
- @builder.features[1].should == { :status => "this is some shitty text", :status_cuss_count => 1, :class => :less_awesome }
63
- end
64
-
65
- it "should generate features for every string" do
66
- @builder.add_data :status => "text", :foo => "more shitty text", :class => :awesome
67
- @builder.generate_features
68
- @builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
69
- @builder.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
70
- end
71
-
72
- it "should allow generation of features on new data while leaving the old data intact" do
73
- @builder.generate_features
74
- num_features = @builder.features.length
75
- @builder.generate_features_for([{ :status => "is this shitty text?" }, { :status => "foo bar" }]).should == [{ :status_cuss_count => 1 }, { :status_cuss_count => 0 }]
76
- @builder.features.length.should == num_features
77
- end
78
-
79
- describe "outputing an ARFF file" do
80
- it "should return a rarff relation object" do
81
- @builder.generate_features
82
- arff = @builder.arff
83
- arff.should be_a(Rarff::Relation)
84
- arff.attributes.first.name.should == "status_cuss_count"
85
- arff.attributes.last.name.should == "class"
86
- arff.to_s.should =~ /Data/
87
- arff.to_s.should =~ /status_cuss_count/
88
- end
89
- end
90
- end
91
- end
@@ -1,16 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe FeatureSet::FeatureBuilder::Cuss do
4
- before do
5
- @builder = FeatureSet::FeatureBuilder::Cuss.new
6
- end
7
-
8
- it "should output :cuss_count as the number of distinct cuss words found" do
9
- @builder.generate_features(FeatureSet::Datum.new("this fucking shit"), nil, nil).should == { :cuss_count => 2 }
10
- @builder.generate_features(FeatureSet::Datum.new("this fucking fucking fucking shit"), nil, nil).should == { :cuss_count => 2 }
11
- end
12
-
13
- it "should ignore non-string features" do
14
- @builder.generate_features(FeatureSet::Datum.new(2), nil, nil).should == {}
15
- end
16
- end
@@ -1,30 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe FeatureSet::FeatureBuilder::WordVector do
4
- it "should output a named feature for every word in the dataset, after performing tfidf" do
5
- builder = FeatureSet::FeatureBuilder::WordVector.new
6
- dataset = [
7
- { :m1 => "hello world. hello!", :m2 => "how goes?", :class => :yes },
8
- { :m1 => "foo world", :m2 => "how?", :class => :no }
9
- ]
10
- wrapped_dataset = FeatureSet::Builder.wrap_dataset(dataset)
11
- builder.before_generate_features(wrapped_dataset)
12
-
13
- builder.idfs.should == {
14
- :m1 => { "hello" => Math.log(2/1.0), "world" => Math.log(2/2.0), "foo" => Math.log(2/1.0) },
15
- :m2 => { "how" => Math.log(2/2.0), "goes" => Math.log(2/1.0) }
16
- }
17
-
18
- builder.generate_features(wrapped_dataset.first[:m1], :m1, wrapped_dataset.first).should == { "hello" => (2/3.0) * Math.log(2/1.0), "world" => (1/3.0) * Math.log(2/2.0), "foo" => 0 }
19
- builder.generate_features(wrapped_dataset.first[:m2], :m2, wrapped_dataset.first).should == { "how" => (1/2.0) * Math.log(2/2.0), "goes" => (1/2.0) * Math.log(2/1.0) }
20
-
21
- builder.generate_features(wrapped_dataset.last[:m1], :m1, wrapped_dataset.last).should == { "hello" => 0, "world" => (1/2.0) * Math.log(2/2.0), "foo" => (1/2.0) * Math.log(2/1.0) }
22
- builder.generate_features(wrapped_dataset.last[:m2], :m2, wrapped_dataset.last).should == { "how" => (1/1.0) * Math.log(2/2.0), "goes" => 0 }
23
- end
24
-
25
- it "should ignore non-string features" do
26
- builder = FeatureSet::FeatureBuilder::WordVector.new
27
- builder.before_generate_features([{ :something => FeatureSet::Datum.new(2), :class => false }, { :something => FeatureSet::Datum.new(1), :class => true }])
28
- builder.generate_features(FeatureSet::Datum.new(2), :something, { :something => FeatureSet::Datum.new(2), :class => false }).should == {}
29
- end
30
- end