feature_set 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/.rvmrc +1 -0
- data/Gemfile +4 -0
- data/README.markdown +8 -0
- data/Rakefile +6 -0
- data/feature_set.gemspec +26 -0
- data/lib/feature_set/builder.rb +70 -0
- data/lib/feature_set/data/cusswords.txt +351 -0
- data/lib/feature_set/datum.rb +24 -0
- data/lib/feature_set/feature_builder/base.rb +18 -0
- data/lib/feature_set/feature_builder/cuss.rb +14 -0
- data/lib/feature_set/feature_builder/word_vector.rb +45 -0
- data/lib/feature_set/version.rb +3 -0
- data/lib/feature_set.rb +4 -0
- data/spec/feature_set/builder_spec.rb +72 -0
- data/spec/feature_set/datum_spec.rb +31 -0
- data/spec/feature_set/feature/cuss_spec.rb +16 -0
- data/spec/feature_set/feature/word_vector_spec.rb +30 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +5 -0
- metadata +116 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.9.2@feature_set --create
|
data/Gemfile
ADDED
data/README.markdown
ADDED
data/Rakefile
ADDED
data/feature_set.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "feature_set/version"
|
4
|
+
|
5
|
+
# Gem specification for feature_set. File lists are taken from whatever git
# currently tracks, so the gem must be built from inside a git checkout.
Gem::Specification.new do |spec|
  # --- identity -------------------------------------------------------------
  spec.name        = "feature_set"
  spec.version     = FeatureSet::VERSION
  spec.authors     = ["Andrew Cantino"]
  spec.email       = ["andrew@iterationlabs.com"]
  spec.homepage    = "https://github.com/iterationlabs/feature_set"
  spec.summary     = %q{Generate feature vectors from textual data}
  spec.description = %q{FeatureSet is a Ruby library for generating feature vectors from textual data.  It can output in ARFF format for experimentation with Weka.}

  spec.rubyforge_project = "feature_set"

  # --- packaged files -------------------------------------------------------
  tracked_files     = `git ls-files`
  tracked_tests     = `git ls-files -- {test,spec,features}/*`
  tracked_binaries  = `git ls-files -- bin/*`

  spec.files         = tracked_files.split("\n")
  spec.test_files    = tracked_tests.split("\n")
  # Executables are listed by bare file name, not by path.
  spec.executables   = tracked_binaries.split("\n").map do |path|
    File.basename(path)
  end
  spec.require_paths = ["lib"]

  # --- dependencies ---------------------------------------------------------
  spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "wwood-rarff"
  spec.add_runtime_dependency "activesupport"
  spec.add_runtime_dependency "i18n"
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/inflector'
|
3
|
+
|
4
|
+
require "feature_set/feature_builder/word_vector"
|
5
|
+
require "feature_set/feature_builder/cuss"
|
6
|
+
|
7
|
+
require "feature_set/datum"
|
8
|
+
|
9
|
+
module FeatureSet
  # Collects rows of raw data and a list of feature builders, then turns every
  # row into a Hash of generated features.
  #
  # A row is a Hash of column name => raw value. The special :class key holds
  # the row's label; it is never wrapped in a Datum and is copied to the
  # output row untouched.
  class Builder
    # Builder classes instantiated by `add_feature_builders :all`.
    BUILTIN_FEATURE_BUILDERS = [FeatureSet::FeatureBuilder::Cuss,
                                FeatureSet::FeatureBuilder::WordVector]

    attr_accessor :options, :feature_builders, :data, :features

    # @param options [Hash] stored on the instance; not read by this class.
    def initialize(options = {})
      @options = options
      @feature_builders = []
      @features = []
      @data = []
    end

    # Appends one row (a Hash) or an Array of rows and discards any
    # previously generated features.
    #
    # @return [Array<Hash>] all rows collected so far
    def add_data(data)
      clear_features
      (@data << data).flatten!
      # flatten! returns nil when nothing was nested (a single Hash row), so
      # return the collection explicitly to keep the result consistent.
      @data
    end

    # Drops all rows and all generated features.
    def clear_data
      @data = []
      clear_features
    end

    # Drops generated features but keeps the rows.
    def clear_features
      @features = []
    end

    # Runs every feature builder over every non-:class column of every row.
    #
    # Generated features are stored under the Symbol "<column>_<feature>".
    #
    # @param opts [Hash] :include_original => true also copies each raw
    #   column value into the output row under its original key.
    # @return [Array<Hash>] one output row per data row (also kept in #features)
    def generate_features(opts = {})
      wrapped_data_set = self.class.wrap_dataset(data)

      # Give builders a chance to inspect the whole dataset first.
      feature_builders.each { |fb| fb.before_generate_features(wrapped_data_set) }

      @features = wrapped_data_set.map do |row|
        output_row = {}

        row.each do |key, datum|
          if key == :class
            # Copy the label as-is and skip feature generation, even when the
            # label is false or nil (it is not a Datum).
            output_row[:class] = datum
            next
          end

          output_row[key] = datum.value if opts[:include_original]

          feature_builders.each do |builder|
            builder.generate_features(datum, key, row).each do |feature, value|
              output_row["#{key}_#{feature}".to_sym] = value
            end
          end
        end

        output_row
      end
    end

    # Registers feature builder instances. Accepts individual builders,
    # Arrays of builders, or :all / "all" as the first argument to register
    # a fresh instance of every builtin builder.
    #
    # @return [Array] all registered builders
    def add_feature_builders(*builders)
      builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
      (@feature_builders << builders).flatten!
      @feature_builders
    end
    alias_method :add_feature_builder, :add_feature_builders

    # Wraps every non-:class value of every row in a Datum.
    #
    # @param dataset [Array<Hash>] raw rows
    # @return [Array<Hash>] new rows with the same keys
    def self.wrap_dataset(dataset)
      dataset.map do |row|
        row.inject({}) do |wrapped, (key, value)|
          wrapped[key] = (key == :class ? value : Datum.new(value))
          wrapped
        end
      end
    end
  end
end
|
@@ -0,0 +1,351 @@
|
|
1
|
+
anus
|
2
|
+
arse
|
3
|
+
arsehole
|
4
|
+
ass
|
5
|
+
ass-hat
|
6
|
+
asshat
|
7
|
+
ass-jabber
|
8
|
+
assjabber
|
9
|
+
ass-pirate
|
10
|
+
asspirate
|
11
|
+
assbag
|
12
|
+
assbandit
|
13
|
+
assbanger
|
14
|
+
assbite
|
15
|
+
assclown
|
16
|
+
asscock
|
17
|
+
asscracker
|
18
|
+
asses
|
19
|
+
assface
|
20
|
+
assfuck
|
21
|
+
assfucker
|
22
|
+
assgoblin
|
23
|
+
asshead
|
24
|
+
asshole
|
25
|
+
asshopper
|
26
|
+
assjacker
|
27
|
+
asslick
|
28
|
+
asslicker
|
29
|
+
assmonkey
|
30
|
+
assmunch
|
31
|
+
assmuncher
|
32
|
+
assnigger
|
33
|
+
assshit
|
34
|
+
assshole
|
35
|
+
asssucker
|
36
|
+
asswad
|
37
|
+
asswipe
|
38
|
+
bampot
|
39
|
+
bastard
|
40
|
+
beaner
|
41
|
+
bitch
|
42
|
+
bitchass
|
43
|
+
bitches
|
44
|
+
bitchtits
|
45
|
+
bitchy
|
46
|
+
blow job
|
47
|
+
blowjob
|
48
|
+
bollocks
|
49
|
+
bollox
|
50
|
+
boner
|
51
|
+
brotherfucker
|
52
|
+
bullshit
|
53
|
+
bumblefuck
|
54
|
+
butt plug
|
55
|
+
buttplug
|
56
|
+
butt-pirate
|
57
|
+
buttpirate
|
58
|
+
buttfucka
|
59
|
+
buttfucker
|
60
|
+
camel toe
|
61
|
+
cameltoe
|
62
|
+
carpetmuncher
|
63
|
+
chinc
|
64
|
+
chink
|
65
|
+
choad
|
66
|
+
chode
|
67
|
+
clit
|
68
|
+
clitface
|
69
|
+
clitfuck
|
70
|
+
clusterfuck
|
71
|
+
cock
|
72
|
+
cockass
|
73
|
+
cockbite
|
74
|
+
cockburger
|
75
|
+
cockface
|
76
|
+
cockfucker
|
77
|
+
cockhead
|
78
|
+
cockjockey
|
79
|
+
cockknoker
|
80
|
+
cockmaster
|
81
|
+
cockmongler
|
82
|
+
cockmongruel
|
83
|
+
cockmonkey
|
84
|
+
cockmuncher
|
85
|
+
cocknose
|
86
|
+
cocknugget
|
87
|
+
cockshit
|
88
|
+
cocksmith
|
89
|
+
cocksmoke
|
90
|
+
cocksmoker
|
91
|
+
cocksniffer
|
92
|
+
cocksucker
|
93
|
+
cockwaffle
|
94
|
+
coochie
|
95
|
+
coochy
|
96
|
+
coon
|
97
|
+
cooter
|
98
|
+
cracker
|
99
|
+
cum
|
100
|
+
cumbubble
|
101
|
+
cumdumpster
|
102
|
+
cumguzzler
|
103
|
+
cumjockey
|
104
|
+
cumslut
|
105
|
+
cumtart
|
106
|
+
cunnie
|
107
|
+
cunnilingus
|
108
|
+
cunt
|
109
|
+
cuntass
|
110
|
+
cuntface
|
111
|
+
cunthole
|
112
|
+
cuntlicker
|
113
|
+
cuntrag
|
114
|
+
cuntslut
|
115
|
+
dago
|
116
|
+
damn
|
117
|
+
deggo
|
118
|
+
dick
|
119
|
+
dickbag
|
120
|
+
dickbeaters
|
121
|
+
dickface
|
122
|
+
dickfuck
|
123
|
+
dickfucker
|
124
|
+
dickhead
|
125
|
+
dickhole
|
126
|
+
dickjuice
|
127
|
+
dickmilk
|
128
|
+
dickmonger
|
129
|
+
dicks
|
130
|
+
dickslap
|
131
|
+
dicksucker
|
132
|
+
dicksucking
|
133
|
+
dickwad
|
134
|
+
dickweasel
|
135
|
+
dickweed
|
136
|
+
dickwod
|
137
|
+
dike
|
138
|
+
dildo
|
139
|
+
dipshit
|
140
|
+
doochbag
|
141
|
+
dookie
|
142
|
+
douche
|
143
|
+
douche-fag
|
144
|
+
douchefag
|
145
|
+
douchebag
|
146
|
+
douchewaffle
|
147
|
+
dumass
|
148
|
+
dumb ass
|
149
|
+
dumbass
|
150
|
+
dumbfuck
|
151
|
+
dumbshit
|
152
|
+
dumshit
|
153
|
+
dyke
|
154
|
+
fag
|
155
|
+
fagbag
|
156
|
+
fagfucker
|
157
|
+
faggit
|
158
|
+
faggot
|
159
|
+
faggotcock
|
160
|
+
fagtard
|
161
|
+
fatass
|
162
|
+
fellatio
|
163
|
+
feltch
|
164
|
+
flamer
|
165
|
+
fuck
|
166
|
+
fuckass
|
167
|
+
fuckbag
|
168
|
+
fuckboy
|
169
|
+
fuckbrain
|
170
|
+
fuckbutt
|
171
|
+
fucked
|
172
|
+
fucker
|
173
|
+
fuckersucker
|
174
|
+
fuckface
|
175
|
+
fuckhead
|
176
|
+
fuckhole
|
177
|
+
fuckin
|
178
|
+
fucking
|
179
|
+
fucknut
|
180
|
+
fucknutt
|
181
|
+
fuckoff
|
182
|
+
fucks
|
183
|
+
fuckstick
|
184
|
+
fucktard
|
185
|
+
fucktart
|
186
|
+
fuckup
|
187
|
+
fuckwad
|
188
|
+
fuckwit
|
189
|
+
fuckwitt
|
190
|
+
fudgepacker
|
191
|
+
gay
|
192
|
+
gayass
|
193
|
+
gaybob
|
194
|
+
gaydo
|
195
|
+
gayfuck
|
196
|
+
gayfuckist
|
197
|
+
gaylord
|
198
|
+
gaytard
|
199
|
+
gaywad
|
200
|
+
goddamn
|
201
|
+
goddamnit
|
202
|
+
gooch
|
203
|
+
gook
|
204
|
+
gringo
|
205
|
+
guido
|
206
|
+
handjob
|
207
|
+
hard on
|
208
|
+
hardon
|
209
|
+
heeb
|
210
|
+
hell
|
211
|
+
ho
|
212
|
+
hoe
|
213
|
+
homo
|
214
|
+
homodumbshit
|
215
|
+
honkey
|
216
|
+
humping
|
217
|
+
jackass
|
218
|
+
jap
|
219
|
+
jerk off
|
220
|
+
jerkoff
|
221
|
+
jigaboo
|
222
|
+
jizz
|
223
|
+
jungle bunny
|
224
|
+
junglebunny
|
225
|
+
kike
|
226
|
+
kooch
|
227
|
+
kootch
|
228
|
+
kraut
|
229
|
+
kunt
|
230
|
+
kyke
|
231
|
+
lameass
|
232
|
+
lesbian
|
233
|
+
lesbo
|
234
|
+
lezzie
|
235
|
+
mcfagget
|
236
|
+
mick
|
237
|
+
minge
|
238
|
+
mothafucka
|
239
|
+
mothafuckin'
|
240
|
+
mothafuckin
|
241
|
+
motherfucker
|
242
|
+
motherfucking
|
243
|
+
muff
|
244
|
+
muffdiver
|
245
|
+
munging
|
246
|
+
negro
|
247
|
+
nigaboo
|
248
|
+
nigga
|
249
|
+
nigger
|
250
|
+
niggers
|
251
|
+
niglet
|
252
|
+
nut sack
|
253
|
+
nutsack
|
254
|
+
paki
|
255
|
+
panooch
|
256
|
+
pecker
|
257
|
+
peckerhead
|
258
|
+
penis
|
259
|
+
penisbanger
|
260
|
+
penisfucker
|
261
|
+
penispuffer
|
262
|
+
piss
|
263
|
+
pissed
|
264
|
+
pissed off
|
265
|
+
pissedoff
|
266
|
+
pissflaps
|
267
|
+
polesmoker
|
268
|
+
pollock
|
269
|
+
poon
|
270
|
+
poonani
|
271
|
+
poonany
|
272
|
+
poontang
|
273
|
+
porch monkey
|
274
|
+
porchmonkey
|
275
|
+
prick
|
276
|
+
punanny
|
277
|
+
punta
|
278
|
+
pussies
|
279
|
+
pussy
|
280
|
+
pussylicking
|
281
|
+
puto
|
282
|
+
queef
|
283
|
+
queer
|
284
|
+
queerbait
|
285
|
+
queerhole
|
286
|
+
renob
|
287
|
+
rimjob
|
288
|
+
ruski
|
289
|
+
sand nigger
|
290
|
+
sandnigger
|
291
|
+
schlong
|
292
|
+
scrote
|
293
|
+
shit
|
294
|
+
shitass
|
295
|
+
shitbag
|
296
|
+
shitbagger
|
297
|
+
shitbrains
|
298
|
+
shitbreath
|
299
|
+
shitcanned
|
300
|
+
shitcunt
|
301
|
+
shitdick
|
302
|
+
shitface
|
303
|
+
shitfaced
|
304
|
+
shithead
|
305
|
+
shithole
|
306
|
+
shithouse
|
307
|
+
shitspitter
|
308
|
+
shitstain
|
309
|
+
shitter
|
310
|
+
shittiest
|
311
|
+
shitting
|
312
|
+
shitty
|
313
|
+
shiz
|
314
|
+
shiznit
|
315
|
+
skank
|
316
|
+
skeet
|
317
|
+
skullfuck
|
318
|
+
slut
|
319
|
+
slutbag
|
320
|
+
smeg
|
321
|
+
snatch
|
322
|
+
spic
|
323
|
+
spick
|
324
|
+
splooge
|
325
|
+
spook
|
326
|
+
suckass
|
327
|
+
tard
|
328
|
+
testicle
|
329
|
+
thundercunt
|
330
|
+
tit
|
331
|
+
titfuck
|
332
|
+
tits
|
333
|
+
tittyfuck
|
334
|
+
twat
|
335
|
+
twatlips
|
336
|
+
twats
|
337
|
+
twatwaffle
|
338
|
+
unclefucker
|
339
|
+
va-j-j
|
340
|
+
vajj
|
341
|
+
vag
|
342
|
+
vagina
|
343
|
+
vajayjay
|
344
|
+
vjayjay
|
345
|
+
wank
|
346
|
+
wankjob
|
347
|
+
wetback
|
348
|
+
whore
|
349
|
+
whorebag
|
350
|
+
whoreface
|
351
|
+
wop
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module FeatureSet
  # Wraps one raw value from a data row and lazily tokenizes it.
  #
  # Both #tokens and #token_counts are memoized on first use; assigning a new
  # #value afterwards does not refresh them.
  class Datum
    # Tokens are separated by runs of whitespace and/or forward slashes.
    TOKEN_REGEX = /[\s\/]+/
    # Despite the name, this matches every character that is NOT an ASCII
    # letter, digit, underscore or hyphen; such characters become spaces.
    NON_ASCII_REGEX = /[^a-zA-Z0-9_-]/

    attr_accessor :value

    def initialize(v)
      self.value = v
    end

    # Lower-cased tokens of the value, in order of appearance.
    #
    # The value must respond to #strip (i.e. be a String).
    #
    # @return [Array<String>] never contains empty strings
    def tokens
      @tokens ||= begin
        cleaned = value.strip.downcase.gsub(NON_ASCII_REGEX, ' ')
        # A regex split keeps a leading "" when the text starts with a
        # separator (e.g. "!hello" -> " hello"), so drop empty fields.
        cleaned.split(TOKEN_REGEX).reject(&:empty?)
      end
    end

    # Number of occurrences of each token.
    #
    # @return [Hash{String => Integer}]
    def token_counts
      @token_counts ||= begin
        tokens.inject({}) do |counts, token|
          counts[token] = (counts[token] || 0) + 1
          counts
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module FeatureSet
  module FeatureBuilder
    # Parent class for feature builders. Subclasses must override
    # #generate_features and may override #before_generate_features.
    class Base
      attr_accessor :options

      # @param options [Hash] kept on the instance for subclasses to read
      def initialize(options = {})
        @options = options
      end

      # Produces the features for one value of one row. Must be overridden;
      # this default always raises.
      #
      # @param datum the wrapped value being processed
      # @param key   the column the value came from
      # @param row   the full row the value belongs to
      # @raise [RuntimeError] always
      def generate_features(datum, key, row)
        raise RuntimeError, "Please implement 'generate_features' in your subclass of FeatureBuilder::Base."
      end

      # Hook that receives the whole dataset; a no-op unless overridden.
      def before_generate_features(dataset); end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require "feature_set/feature_builder/base"
|
2
|
+
|
3
|
+
module FeatureSet
  module FeatureBuilder
    # Feature builder that reports how many distinct cuss words a String
    # value contains.
    class Cuss < Base
      # Entries of ../data/cusswords.txt (relative to this file), one per
      # line, stripped and lower-cased. Loaded once when the class is defined.
      CUSS_WORDS = begin
        word_list = File.expand_path(File.join('..', 'data', 'cusswords.txt'), File.dirname(__FILE__))
        File.read(word_list).split("\n").map { |entry| entry.strip.downcase }
      end

      # @param datum wrapped value; only String values produce a feature
      # @param key   unused
      # @param row   unused
      # @return [Hash] { :cuss_count => Integer } for Strings, {} otherwise.
      #   The count is an array intersection, so repeated words count once.
      def generate_features(datum, key, row)
        if datum.value.is_a?(String)
          matched = datum.tokens & CUSS_WORDS
          { :cuss_count => matched.length }
        else
          {}
        end
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require "feature_set/feature_builder/base"
|
2
|
+
|
3
|
+
module FeatureSet
  module FeatureBuilder
    # Feature builder that emits one TF-IDF weighted feature per word seen in
    # a String column of the dataset.
    class WordVector < Base
      # Hash of column key => { token => inverse document frequency },
      # populated by #before_generate_features.
      attr_accessor :idfs

      # Computes, per String column, the inverse document frequency of every
      # token: log(number of rows / number of rows containing the token).
      # The :class column and non-String values are ignored.
      #
      # @param dataset [Array<Hash>] rows whose non-:class values respond to
      #   #value and #token_counts
      def before_generate_features(dataset)
        # column key => { token => number of rows containing the token }
        document_counts = {}

        dataset.each do |row|
          row.each do |key, datum|
            next if key == :class
            next unless datum.value.is_a?(String)

            counts = (document_counts[key] ||= {})
            datum.token_counts.keys.each do |token|
              counts[token] = (counts[token] || 0) + 1
            end
          end
        end

        num_docs = dataset.length
        @idfs = {}
        document_counts.each do |key, counts|
          @idfs[key] = {}
          counts.each do |token, count|
            @idfs[key][token] = Math.log(num_docs / count.to_f)
          end
        end
        @idfs
      end

      # Produces the TF-IDF weight of every known word of column `key` for
      # one value: (occurrences in value / tokens in value) * idf.
      #
      # @param datum wrapped value; only String values produce features
      # @param key   column whose vocabulary is used
      # @param row   unused
      # @return [Hash{String => Numeric}] {} for non-String values or when
      #   no vocabulary was collected for `key`
      def generate_features(datum, key, row)
        return {} unless datum.value.is_a?(String)

        word_idfs = (idfs && idfs[key]) || {}
        num_words = datum.tokens.length.to_f

        word_idfs.inject({}) do |memo, (word, idf)|
          # A value without tokens has term frequency 0 rather than 0/0 (NaN).
          term_frequency = num_words.zero? ? 0.0 : (datum.token_counts[word] || 0) / num_words
          memo[word] = term_frequency * idf
          memo
        end
      end
    end
  end
end
|
data/lib/feature_set.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for FeatureSet::Builder: registering feature builders, collecting
# data rows, and generating feature rows.
describe FeatureSet::Builder do
  describe "adding feature builders" do
    it "can add all known feature builders" do
      builder = FeatureSet::Builder.new
      builder.add_feature_builders :all

      registered_classes = builder.feature_builders.map { |fb| fb.class }
      registered_classes.should include(FeatureSet::FeatureBuilder::WordVector)

      # Every file in lib/feature_set/feature_builder except one is expected
      # to be registered by :all.
      builder_glob = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "lib", "feature_set", "feature_builder", "*.rb"))
      builder.feature_builders.length.should == Dir[builder_glob].length - 1
    end

    it "can add individual feature builders" do
      builder = FeatureSet::Builder.new
      builder.add_feature_builder FeatureSet::FeatureBuilder::WordVector.new
      builder.feature_builders.length.should == 1
    end

    it "can add arrays of feature builders" do
      builder = FeatureSet::Builder.new
      builder.add_feature_builders [FeatureSet::FeatureBuilder::WordVector.new, FeatureSet::FeatureBuilder::Cuss.new]
      builder.feature_builders.length.should == 2
    end
  end

  describe "adding data" do
    it "should accept mappings between one or more strings and their classifications" do
      happy_row = { :status => "I am happy!", :class => :happy }
      sad_row   = { :status => "I am sad.", :class => :sad }
      extra_row = { :status => "Something", :another_feature => "Something else", :class => :awesome }

      builder = FeatureSet::Builder.new

      # An array of rows is appended row by row.
      builder.add_data [ { :status => "I am happy!", :class => :happy },
                         { :status => "I am sad.", :class => :sad } ]
      builder.data.should == [happy_row, sad_row]

      # A single row may be passed as a bare hash.
      builder.add_data :status => "Something", :another_feature => "Something else", :class => :awesome
      builder.data.should == [happy_row, sad_row, extra_row]

      builder.clear_data
      builder.data.should == []

      # Direct assignment is also supported.
      builder.data = [ { :status => "I am happy!", :class => :happy },
                       { :status => "I am sad.", :class => :sad } ]
      builder.data.should == [happy_row, sad_row]
    end
  end

  describe "generating features" do
    before do
      @builder = FeatureSet::Builder.new
      @builder.add_feature_builder FeatureSet::FeatureBuilder::Cuss.new
      @builder.add_data :status => "this is some text", :class => :awesome
      @builder.add_data :status => "this is some shitty text", :class => :less_awesome
    end

    it "should output a row of features for every line of data" do
      @builder.generate_features

      clean_features, cussing_features = @builder.features[0], @builder.features[1]
      clean_features.should == { :status_cuss_count => 0, :class => :awesome }
      cussing_features.should == { :status_cuss_count => 1, :class => :less_awesome }
    end

    it "should make it easy to keep the original data" do
      @builder.generate_features(:include_original => true)

      clean_features, cussing_features = @builder.features[0], @builder.features[1]
      clean_features.should == { :status => "this is some text", :status_cuss_count => 0, :class => :awesome }
      cussing_features.should == { :status => "this is some shitty text", :status_cuss_count => 1, :class => :less_awesome }
    end

    it "should generate features for every string" do
      @builder.add_data :status => "text", :foo => "more shitty text", :class => :awesome
      @builder.generate_features

      @builder.features[1].should == { :status_cuss_count => 1, :class => :less_awesome }
      # Each string column gets its own prefixed feature.
      @builder.features[2].should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for FeatureSet::Datum tokenization and token counting, including the
# fact that both results are memoized.
describe FeatureSet::Datum do
  describe "tokenize" do
    it "should return an array of tokens" do
      datum = FeatureSet::Datum.new("hello world sup?")
      datum.tokens.should =~ ["hello", "world", "sup"]
    end

    it "should memoize" do
      expected_tokens = ["hello", "world", "sup"]

      datum = FeatureSet::Datum.new("hello world sup?")
      datum.tokens.should =~ expected_tokens

      # Changing the value afterwards must not re-tokenize.
      datum.value = "hello"
      datum.tokens.should =~ expected_tokens
    end
  end

  describe "#token_counts" do
    it "should provide counts for each token" do
      datum = FeatureSet::Datum.new("hello world sup? hello!")
      datum.token_counts.should == { "hello" => 2, "world" => 1, "sup" => 1 }
    end

    it "should memoize" do
      expected_counts = { "hello" => 2, "world" => 1, "sup" => 1 }

      datum = FeatureSet::Datum.new("hello world sup? hello!")
      datum.token_counts.should == expected_counts

      # Neither a new value nor replaced tokens may change cached counts.
      datum.value = "hello"
      datum.instance_variable_set(:@tokens, ["hello"])
      datum.token_counts.should == expected_counts
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the Cuss feature builder.
describe FeatureSet::FeatureBuilder::Cuss do
  before do
    @builder = FeatureSet::FeatureBuilder::Cuss.new
  end

  it "should output :cuss_count as the number of distinct cuss words found" do
    once     = FeatureSet::Datum.new("this fucking shit")
    repeated = FeatureSet::Datum.new("this fucking fucking fucking shit")

    # Repeating a word does not raise the count.
    @builder.generate_features(once, nil, nil).should == { :cuss_count => 2 }
    @builder.generate_features(repeated, nil, nil).should == { :cuss_count => 2 }
  end

  it "should ignore non-string features" do
    numeric = FeatureSet::Datum.new(2)
    @builder.generate_features(numeric, nil, nil).should == {}
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the WordVector feature builder: IDF computation over a dataset
# and per-value TF-IDF feature generation.
describe FeatureSet::FeatureBuilder::WordVector do
  it "should output a named feature for every word in the dataset, after performing tfidf" do
    builder = FeatureSet::FeatureBuilder::WordVector.new
    dataset = [
      { :m1 => "hello world.  hello!", :m2 => "how goes?", :class => :yes },
      { :m1 => "foo world", :m2 => "how?", :class => :no }
    ]
    wrapped_dataset = FeatureSet::Builder.wrap_dataset(dataset)
    first_row = wrapped_dataset.first
    last_row  = wrapped_dataset.last

    builder.before_generate_features(wrapped_dataset)

    # idf = log(rows / rows containing the word), tracked per column.
    builder.idfs.should == {
      :m1 => { "hello" => Math.log(2/1.0), "world" => Math.log(2/2.0), "foo" => Math.log(2/1.0) },
      :m2 => { "how" => Math.log(2/2.0), "goes" => Math.log(2/1.0) }
    }

    builder.generate_features(first_row[:m1], :m1, first_row).should == { "hello" => (2/3.0) * Math.log(2/1.0), "world" => (1/3.0) * Math.log(2/2.0), "foo" => 0 }
    builder.generate_features(first_row[:m2], :m2, first_row).should == { "how" => (1/2.0) * Math.log(2/2.0), "goes" => (1/2.0) * Math.log(2/1.0) }

    builder.generate_features(last_row[:m1], :m1, last_row).should == { "hello" => 0, "world" => (1/2.0) * Math.log(2/2.0), "foo" => (1/2.0) * Math.log(2/1.0) }
    builder.generate_features(last_row[:m2], :m2, last_row).should == { "how" => (1/1.0) * Math.log(2/2.0), "goes" => 0 }
  end

  it "should ignore non-string features" do
    builder = FeatureSet::FeatureBuilder::WordVector.new
    rows = [{ :something => FeatureSet::Datum.new(2), :class => false }, { :something => FeatureSet::Datum.new(1), :class => true }]

    builder.before_generate_features(rows)
    builder.generate_features(FeatureSet::Datum.new(2), :something, { :something => FeatureSet::Datum.new(2), :class => false }).should == {}
  end
end
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feature_set
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andrew Cantino
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-17 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70284888584540 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70284888584540
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: wwood-rarff
|
27
|
+
requirement: &70284888584120 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70284888584120
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: activesupport
|
38
|
+
requirement: &70284888583700 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70284888583700
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: i18n
|
49
|
+
requirement: &70284888583280 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70284888583280
|
58
|
+
description: FeatureSet is a Ruby library for generating feature vectors from textual
|
59
|
+
data. It can output in ARFF format for experimentation with Weka.
|
60
|
+
email:
|
61
|
+
- andrew@iterationlabs.com
|
62
|
+
executables: []
|
63
|
+
extensions: []
|
64
|
+
extra_rdoc_files: []
|
65
|
+
files:
|
66
|
+
- .gitignore
|
67
|
+
- .rvmrc
|
68
|
+
- Gemfile
|
69
|
+
- README.markdown
|
70
|
+
- Rakefile
|
71
|
+
- feature_set.gemspec
|
72
|
+
- lib/feature_set.rb
|
73
|
+
- lib/feature_set/builder.rb
|
74
|
+
- lib/feature_set/data/cusswords.txt
|
75
|
+
- lib/feature_set/datum.rb
|
76
|
+
- lib/feature_set/feature_builder/base.rb
|
77
|
+
- lib/feature_set/feature_builder/cuss.rb
|
78
|
+
- lib/feature_set/feature_builder/word_vector.rb
|
79
|
+
- lib/feature_set/version.rb
|
80
|
+
- spec/feature_set/builder_spec.rb
|
81
|
+
- spec/feature_set/datum_spec.rb
|
82
|
+
- spec/feature_set/feature/cuss_spec.rb
|
83
|
+
- spec/feature_set/feature/word_vector_spec.rb
|
84
|
+
- spec/spec.opts
|
85
|
+
- spec/spec_helper.rb
|
86
|
+
homepage: https://github.com/iterationlabs/feature_set
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project: feature_set
|
106
|
+
rubygems_version: 1.8.10
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Generate feature vectors from textual data
|
110
|
+
test_files:
|
111
|
+
- spec/feature_set/builder_spec.rb
|
112
|
+
- spec/feature_set/datum_spec.rb
|
113
|
+
- spec/feature_set/feature/cuss_spec.rb
|
114
|
+
- spec/feature_set/feature/word_vector_spec.rb
|
115
|
+
- spec/spec.opts
|
116
|
+
- spec/spec_helper.rb
|