feature_set 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.rvmrc +1 -0
- data/Gemfile +4 -0
- data/README.markdown +8 -0
- data/Rakefile +6 -0
- data/feature_set.gemspec +26 -0
- data/lib/feature_set/builder.rb +70 -0
- data/lib/feature_set/data/cusswords.txt +351 -0
- data/lib/feature_set/datum.rb +24 -0
- data/lib/feature_set/feature_builder/base.rb +18 -0
- data/lib/feature_set/feature_builder/cuss.rb +14 -0
- data/lib/feature_set/feature_builder/word_vector.rb +45 -0
- data/lib/feature_set/version.rb +3 -0
- data/lib/feature_set.rb +4 -0
- data/spec/feature_set/builder_spec.rb +72 -0
- data/spec/feature_set/datum_spec.rb +31 -0
- data/spec/feature_set/feature/cuss_spec.rb +16 -0
- data/spec/feature_set/feature/word_vector_spec.rb +30 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +5 -0
- metadata +116 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.9.2@feature_set --create
|
data/Gemfile
ADDED
data/README.markdown
ADDED
data/Rakefile
ADDED
data/feature_set.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "feature_set/version"
|
4
|
+
|
5
|
+
# Gem specification for feature_set: package identity, the files shipped in
# the gem, and its development/runtime dependencies.
Gem::Specification.new do |spec|
  # Identity and authorship.
  spec.name        = "feature_set"
  spec.version     = FeatureSet::VERSION
  spec.authors     = ["Andrew Cantino"]
  spec.email       = ["andrew@iterationlabs.com"]
  spec.homepage    = "https://github.com/iterationlabs/feature_set"
  spec.summary     = %q{Generate feature vectors from textual data}
  spec.description = %q{FeatureSet is a Ruby library for generating feature vectors from textual data.  It can output in ARFF format for experimentation with Weka.}

  spec.rubyforge_project = "feature_set"

  # The packaged file lists are whatever git reports as tracked when the
  # gemspec is evaluated; each list comes from its own `git ls-files` call.
  tracked_files = `git ls-files`.split("\n")
  spec.files = tracked_files

  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.test_files = tracked_tests

  # Executables are registered by bare file name, not by their bin/ path.
  tracked_bins = `git ls-files -- bin/*`.split("\n")
  spec.executables = tracked_bins.map { |path| File.basename(path) }

  spec.require_paths = ["lib"]

  # Dependencies (none of them pin a version).
  spec.add_development_dependency "rspec"
  %w[wwood-rarff activesupport i18n].each do |gem_name|
    spec.add_runtime_dependency gem_name
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/inflector'
|
3
|
+
|
4
|
+
require "feature_set/feature_builder/word_vector"
|
5
|
+
require "feature_set/feature_builder/cuss"
|
6
|
+
|
7
|
+
require "feature_set/datum"
|
8
|
+
|
9
|
+
module FeatureSet
  # Collects raw data rows and feature builders, then converts every row into
  # a flat Hash of generated features.
  #
  # A data row is a Hash of feature name => raw value. The :class key, when
  # present, holds the row's label: it is never wrapped in a Datum and is
  # copied to the output row unchanged. Every other value is wrapped in a
  # Datum before being handed to the feature builders.
  class Builder
    # Builder classes instantiated by add_feature_builders(:all).
    BUILTIN_FEATURE_BUILDERS = [FeatureSet::FeatureBuilder::Cuss,
                                FeatureSet::FeatureBuilder::WordVector]

    attr_accessor :options, :feature_builders, :data, :features

    # @param options [Hash] stored as-is and exposed through #options
    def initialize(options = {})
      @options = options
      @feature_builders = []
      @features = []
      @data = []
    end

    # Appends one row (a Hash) or an Array of rows to the data set and
    # discards any previously generated features, which are now stale.
    #
    # @param data [Hash, Array<Hash>]
    def add_data(data)
      clear_features
      # Pushing and then flattening lets callers pass either a single row or
      # an arbitrarily nested array of rows.
      (@data << data).flatten!
    end

    # Removes all data rows and all generated features.
    def clear_data
      @data = []
      clear_features
    end

    # Removes the generated features but keeps the data rows.
    def clear_features
      @features = []
    end

    # Runs every registered feature builder over every non-class value of
    # every row and stores the result in #features.
    #
    # Each generated feature is stored under the Symbol "<key>_<feature>",
    # where <key> is the name of the source value in the row.
    #
    # @param opts [Hash] when opts[:include_original] is truthy, the raw value
    #   of each source feature is also copied into the output row under its
    #   original key
    # @return [Array<Hash>] one output row per data row
    def generate_features(opts = {})
      wrapped_data_set = self.class.wrap_dataset(data)

      # Give each builder a chance to precompute data-set-wide statistics.
      feature_builders.each { |fb| fb.before_generate_features(wrapped_data_set) }

      @features = wrapped_data_set.map do |row|
        row.each_with_object({}) do |(key, datum), output_row|
          # The class label is copied through untouched. This must be an
          # explicit branch: chaining the assignment with `and next` would
          # fall through for false/nil labels and pass the bare label to the
          # feature builders as if it were a Datum.
          if key == :class
            output_row[:class] = datum
            next
          end

          output_row[key] = datum.value if opts[:include_original]

          feature_builders.each do |builder|
            builder.generate_features(datum, key, row).each do |feature, value|
              output_row["#{key}_#{feature}".to_sym] = value
            end
          end
        end
      end
    end

    # Registers feature builder instances. Accepts any mix of instances and
    # arrays of instances; passing :all or "all" as the first argument
    # registers a fresh instance of every builtin builder instead.
    def add_feature_builders(*builders)
      builders = BUILTIN_FEATURE_BUILDERS.map(&:new) if [:all, "all"].include?(builders.first)
      (@feature_builders << builders).flatten!
    end
    alias_method :add_feature_builder, :add_feature_builders

    # Returns a copy of the data set in which every value except the :class
    # label is wrapped in a Datum.
    #
    # @param dataset [Array<Hash>]
    # @return [Array<Hash>]
    def self.wrap_dataset(dataset)
      dataset.map do |row|
        row.each_with_object({}) do |(k, v), wrapped|
          wrapped[k] = (k == :class ? v : Datum.new(v))
        end
      end
    end
  end
end
|
@@ -0,0 +1,351 @@
|
|
1
|
+
anus
|
2
|
+
arse
|
3
|
+
arsehole
|
4
|
+
ass
|
5
|
+
ass-hat
|
6
|
+
asshat
|
7
|
+
ass-jabber
|
8
|
+
assjabber
|
9
|
+
ass-pirate
|
10
|
+
asspirate
|
11
|
+
assbag
|
12
|
+
assbandit
|
13
|
+
assbanger
|
14
|
+
assbite
|
15
|
+
assclown
|
16
|
+
asscock
|
17
|
+
asscracker
|
18
|
+
asses
|
19
|
+
assface
|
20
|
+
assfuck
|
21
|
+
assfucker
|
22
|
+
assgoblin
|
23
|
+
asshead
|
24
|
+
asshole
|
25
|
+
asshopper
|
26
|
+
assjacker
|
27
|
+
asslick
|
28
|
+
asslicker
|
29
|
+
assmonkey
|
30
|
+
assmunch
|
31
|
+
assmuncher
|
32
|
+
assnigger
|
33
|
+
assshit
|
34
|
+
assshole
|
35
|
+
asssucker
|
36
|
+
asswad
|
37
|
+
asswipe
|
38
|
+
bampot
|
39
|
+
bastard
|
40
|
+
beaner
|
41
|
+
bitch
|
42
|
+
bitchass
|
43
|
+
bitches
|
44
|
+
bitchtits
|
45
|
+
bitchy
|
46
|
+
blow job
|
47
|
+
blowjob
|
48
|
+
bollocks
|
49
|
+
bollox
|
50
|
+
boner
|
51
|
+
brotherfucker
|
52
|
+
bullshit
|
53
|
+
bumblefuck
|
54
|
+
butt plug
|
55
|
+
buttplug
|
56
|
+
butt-pirate
|
57
|
+
buttpirate
|
58
|
+
buttfucka
|
59
|
+
buttfucker
|
60
|
+
camel toe
|
61
|
+
cameltoe
|
62
|
+
carpetmuncher
|
63
|
+
chinc
|
64
|
+
chink
|
65
|
+
choad
|
66
|
+
chode
|
67
|
+
clit
|
68
|
+
clitface
|
69
|
+
clitfuck
|
70
|
+
clusterfuck
|
71
|
+
cock
|
72
|
+
cockass
|
73
|
+
cockbite
|
74
|
+
cockburger
|
75
|
+
cockface
|
76
|
+
cockfucker
|
77
|
+
cockhead
|
78
|
+
cockjockey
|
79
|
+
cockknoker
|
80
|
+
cockmaster
|
81
|
+
cockmongler
|
82
|
+
cockmongruel
|
83
|
+
cockmonkey
|
84
|
+
cockmuncher
|
85
|
+
cocknose
|
86
|
+
cocknugget
|
87
|
+
cockshit
|
88
|
+
cocksmith
|
89
|
+
cocksmoke
|
90
|
+
cocksmoker
|
91
|
+
cocksniffer
|
92
|
+
cocksucker
|
93
|
+
cockwaffle
|
94
|
+
coochie
|
95
|
+
coochy
|
96
|
+
coon
|
97
|
+
cooter
|
98
|
+
cracker
|
99
|
+
cum
|
100
|
+
cumbubble
|
101
|
+
cumdumpster
|
102
|
+
cumguzzler
|
103
|
+
cumjockey
|
104
|
+
cumslut
|
105
|
+
cumtart
|
106
|
+
cunnie
|
107
|
+
cunnilingus
|
108
|
+
cunt
|
109
|
+
cuntass
|
110
|
+
cuntface
|
111
|
+
cunthole
|
112
|
+
cuntlicker
|
113
|
+
cuntrag
|
114
|
+
cuntslut
|
115
|
+
dago
|
116
|
+
damn
|
117
|
+
deggo
|
118
|
+
dick
|
119
|
+
dickbag
|
120
|
+
dickbeaters
|
121
|
+
dickface
|
122
|
+
dickfuck
|
123
|
+
dickfucker
|
124
|
+
dickhead
|
125
|
+
dickhole
|
126
|
+
dickjuice
|
127
|
+
dickmilk
|
128
|
+
dickmonger
|
129
|
+
dicks
|
130
|
+
dickslap
|
131
|
+
dicksucker
|
132
|
+
dicksucking
|
133
|
+
dickwad
|
134
|
+
dickweasel
|
135
|
+
dickweed
|
136
|
+
dickwod
|
137
|
+
dike
|
138
|
+
dildo
|
139
|
+
dipshit
|
140
|
+
doochbag
|
141
|
+
dookie
|
142
|
+
douche
|
143
|
+
douche-fag
|
144
|
+
douchefag
|
145
|
+
douchebag
|
146
|
+
douchewaffle
|
147
|
+
dumass
|
148
|
+
dumb ass
|
149
|
+
dumbass
|
150
|
+
dumbfuck
|
151
|
+
dumbshit
|
152
|
+
dumshit
|
153
|
+
dyke
|
154
|
+
fag
|
155
|
+
fagbag
|
156
|
+
fagfucker
|
157
|
+
faggit
|
158
|
+
faggot
|
159
|
+
faggotcock
|
160
|
+
fagtard
|
161
|
+
fatass
|
162
|
+
fellatio
|
163
|
+
feltch
|
164
|
+
flamer
|
165
|
+
fuck
|
166
|
+
fuckass
|
167
|
+
fuckbag
|
168
|
+
fuckboy
|
169
|
+
fuckbrain
|
170
|
+
fuckbutt
|
171
|
+
fucked
|
172
|
+
fucker
|
173
|
+
fuckersucker
|
174
|
+
fuckface
|
175
|
+
fuckhead
|
176
|
+
fuckhole
|
177
|
+
fuckin
|
178
|
+
fucking
|
179
|
+
fucknut
|
180
|
+
fucknutt
|
181
|
+
fuckoff
|
182
|
+
fucks
|
183
|
+
fuckstick
|
184
|
+
fucktard
|
185
|
+
fucktart
|
186
|
+
fuckup
|
187
|
+
fuckwad
|
188
|
+
fuckwit
|
189
|
+
fuckwitt
|
190
|
+
fudgepacker
|
191
|
+
gay
|
192
|
+
gayass
|
193
|
+
gaybob
|
194
|
+
gaydo
|
195
|
+
gayfuck
|
196
|
+
gayfuckist
|
197
|
+
gaylord
|
198
|
+
gaytard
|
199
|
+
gaywad
|
200
|
+
goddamn
|
201
|
+
goddamnit
|
202
|
+
gooch
|
203
|
+
gook
|
204
|
+
gringo
|
205
|
+
guido
|
206
|
+
handjob
|
207
|
+
hard on
|
208
|
+
hardon
|
209
|
+
heeb
|
210
|
+
hell
|
211
|
+
ho
|
212
|
+
hoe
|
213
|
+
homo
|
214
|
+
homodumbshit
|
215
|
+
honkey
|
216
|
+
humping
|
217
|
+
jackass
|
218
|
+
jap
|
219
|
+
jerk off
|
220
|
+
jerkoff
|
221
|
+
jigaboo
|
222
|
+
jizz
|
223
|
+
jungle bunny
|
224
|
+
junglebunny
|
225
|
+
kike
|
226
|
+
kooch
|
227
|
+
kootch
|
228
|
+
kraut
|
229
|
+
kunt
|
230
|
+
kyke
|
231
|
+
lameass
|
232
|
+
lesbian
|
233
|
+
lesbo
|
234
|
+
lezzie
|
235
|
+
mcfagget
|
236
|
+
mick
|
237
|
+
minge
|
238
|
+
mothafucka
|
239
|
+
mothafuckin\'
|
240
|
+
mothafuckin
|
241
|
+
motherfucker
|
242
|
+
motherfucking
|
243
|
+
muff
|
244
|
+
muffdiver
|
245
|
+
munging
|
246
|
+
negro
|
247
|
+
nigaboo
|
248
|
+
nigga
|
249
|
+
nigger
|
250
|
+
niggers
|
251
|
+
niglet
|
252
|
+
nut sack
|
253
|
+
nutsack
|
254
|
+
paki
|
255
|
+
panooch
|
256
|
+
pecker
|
257
|
+
peckerhead
|
258
|
+
penis
|
259
|
+
penisbanger
|
260
|
+
penisfucker
|
261
|
+
penispuffer
|
262
|
+
piss
|
263
|
+
pissed
|
264
|
+
pissed off
|
265
|
+
pissedoff
|
266
|
+
pissflaps
|
267
|
+
polesmoker
|
268
|
+
pollock
|
269
|
+
poon
|
270
|
+
poonani
|
271
|
+
poonany
|
272
|
+
poontang
|
273
|
+
porch monkey
|
274
|
+
porchmonkey
|
275
|
+
prick
|
276
|
+
punanny
|
277
|
+
punta
|
278
|
+
pussies
|
279
|
+
pussy
|
280
|
+
pussylicking
|
281
|
+
puto
|
282
|
+
queef
|
283
|
+
queer
|
284
|
+
queerbait
|
285
|
+
queerhole
|
286
|
+
renob
|
287
|
+
rimjob
|
288
|
+
ruski
|
289
|
+
sand nigger
|
290
|
+
sandnigger
|
291
|
+
schlong
|
292
|
+
scrote
|
293
|
+
shit
|
294
|
+
shitass
|
295
|
+
shitbag
|
296
|
+
shitbagger
|
297
|
+
shitbrains
|
298
|
+
shitbreath
|
299
|
+
shitcanned
|
300
|
+
shitcunt
|
301
|
+
shitdick
|
302
|
+
shitface
|
303
|
+
shitfaced
|
304
|
+
shithead
|
305
|
+
shithole
|
306
|
+
shithouse
|
307
|
+
shitspitter
|
308
|
+
shitstain
|
309
|
+
shitter
|
310
|
+
shittiest
|
311
|
+
shitting
|
312
|
+
shitty
|
313
|
+
shiz
|
314
|
+
shiznit
|
315
|
+
skank
|
316
|
+
skeet
|
317
|
+
skullfuck
|
318
|
+
slut
|
319
|
+
slutbag
|
320
|
+
smeg
|
321
|
+
snatch
|
322
|
+
spic
|
323
|
+
spick
|
324
|
+
splooge
|
325
|
+
spook
|
326
|
+
suckass
|
327
|
+
tard
|
328
|
+
testicle
|
329
|
+
thundercunt
|
330
|
+
tit
|
331
|
+
titfuck
|
332
|
+
tits
|
333
|
+
tittyfuck
|
334
|
+
twat
|
335
|
+
twatlips
|
336
|
+
twats
|
337
|
+
twatwaffle
|
338
|
+
unclefucker
|
339
|
+
va-j-j
|
340
|
+
vajj
|
341
|
+
vag
|
342
|
+
vagina
|
343
|
+
vajayjay
|
344
|
+
vjayjay
|
345
|
+
wank
|
346
|
+
wankjob
|
347
|
+
wetback
|
348
|
+
whore
|
349
|
+
whorebag
|
350
|
+
whoreface
|
351
|
+
wop
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module FeatureSet
  # Wraps a single raw feature value and lazily derives token information
  # from it.
  class Datum
    # Separator pattern used to split the normalized text into tokens.
    TOKEN_REGEX = /[\s\/]+/
    # Matches every character that is not an ASCII letter, digit, underscore
    # or hyphen; such characters are treated as token separators.
    NON_ASCII_REGEX = /[^a-zA-Z0-9_-]/

    attr_accessor :value

    # @param v [Object] the raw value; #tokens and #token_counts require it
    #   to be a String
    def initialize(v)
      self.value = v
    end

    # Lower-cased word tokens of the value, in order of appearance.
    #
    # The result is memoized on first call, so assigning a new value
    # afterwards does not change the tokens returned.
    #
    # @return [Array<String>]
    def tokens
      @tokens ||= begin
        # Strip only AFTER punctuation has been turned into spaces; otherwise
        # a value that starts with punctuation (e.g. "?hello") yields a
        # leading empty-string token.
        value.downcase.gsub(NON_ASCII_REGEX, ' ').strip.split(TOKEN_REGEX)
      end
    end

    # Number of occurrences of each token. Memoized like #tokens.
    #
    # @return [Hash{String => Integer}]
    def token_counts
      @token_counts ||= tokens.each_with_object({}) do |word, counts|
        counts[word] = counts.fetch(word, 0) + 1
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module FeatureSet
  module FeatureBuilder
    # Parent class for feature builders. Subclasses must override
    # #generate_features and may override #before_generate_features.
    class Base
      attr_accessor :options

      # @param options [Hash] stored as-is and exposed through #options
      def initialize(options = {})
        @options = options
      end

      # Produces the features for one value of one row. This default
      # implementation always raises, because every subclass has to supply
      # its own.
      #
      # @raise [RuntimeError] always
      def generate_features(datum, key, row)
        message = "Please implement 'generate_features' in your subclass of FeatureBuilder::Base."
        raise RuntimeError, message
      end

      # Hook that receives the data set ahead of feature generation.
      # Does nothing by default and returns nil.
      def before_generate_features(dataset)
        nil
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require "feature_set/feature_builder/base"
|
2
|
+
|
3
|
+
module FeatureSet
  module FeatureBuilder
    # Feature builder that reports how many distinct words from the bundled
    # cuss-word list occur in a string value.
    class Cuss < Base
      # Location of the word list, resolved relative to this source file.
      word_list_path = File.expand_path(File.join(File.dirname(__FILE__), '..', 'data', 'cusswords.txt'))

      # One entry per line of the word list, stripped and lower-cased.
      CUSS_WORDS = File.read(word_list_path).split("\n").map { |line| line.strip.downcase }

      # @return [Hash] { :cuss_count => n } for String values, where n is the
      #   number of distinct tokens that appear in CUSS_WORDS; an empty Hash
      #   for any other kind of value
      def generate_features(datum, key, row)
        if datum.value.is_a?(String)
          # Array intersection de-duplicates, so repeated words count once.
          distinct_hits = datum.tokens & CUSS_WORDS
          { :cuss_count => distinct_hits.length }
        else
          {}
        end
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require "feature_set/feature_builder/base"
|
2
|
+
|
3
|
+
module FeatureSet
  module FeatureBuilder
    # Feature builder that emits one TF-IDF weighted feature per word seen in
    # the data set, separately for each string-valued feature key.
    class WordVector < Base
      # Hash of feature key => { token => inverse document frequency },
      # rebuilt by every call to #before_generate_features.
      attr_accessor :idfs

      def initialize(options = {})
        super
        # Start with an empty table so #generate_features is usable (and
        # returns no features) before any data set has been analysed.
        @idfs = {}
      end

      # Computes the inverse document frequency of every token for every
      # string-valued feature key in the data set and stores it in #idfs.
      #
      # The IDF of a token is log(rows in data set / rows containing the
      # token under that key). Non-String values and the :class label are
      # ignored.
      #
      # @param dataset [Array<Hash>] rows whose non-class values respond to
      #   #value and #token_counts
      def before_generate_features(dataset)
        document_counts = {}
        dataset.each do |row|
          row.each do |key, datum|
            next if key == :class
            next unless datum.value.is_a?(String)

            counts_for_key = (document_counts[key] ||= {})
            # Each distinct token counts once per row (document frequency).
            datum.token_counts.each_key do |token|
              counts_for_key[token] = counts_for_key.fetch(token, 0) + 1
            end
          end
        end

        num_docs = dataset.length
        @idfs = document_counts.each_with_object({}) do |(feature, freqs), table|
          table[feature] = freqs.each_with_object({}) do |(token, doc_count), weights|
            weights[token] = Math.log(num_docs / doc_count.to_f)
          end
        end
      end

      # Returns the TF-IDF weight of every known word for one value.
      #
      # This is a regular instance method; it must not be nested inside
      # #before_generate_features, or it would only exist after that hook
      # had run once.
      #
      # @return [Hash{String => Float}] one entry per token in idfs[key];
      #   an empty Hash for non-String values or when no IDF table exists
      #   for key
      def generate_features(datum, key, row)
        return {} unless datum.value.is_a?(String)

        weights = idfs && idfs[key]
        return {} unless weights

        counts = datum.token_counts
        num_words = datum.tokens.length.to_f

        weights.each_with_object({}) do |(word, idf), memo|
          # A value without any tokens has term frequency 0, not 0/0 (NaN).
          term_frequency = num_words.zero? ? 0.0 : counts.fetch(word, 0) / num_words
          memo[word] = term_frequency * idf
        end
      end
    end
  end
end
|
data/lib/feature_set.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for FeatureSet::Builder: registering feature builders, accumulating
# data rows and generating feature rows.
describe FeatureSet::Builder do
  describe "adding feature builders" do
    let(:builder) { FeatureSet::Builder.new }

    it "can add all known feature builders" do
      builder.add_feature_builders :all

      registered_classes = builder.feature_builders.map { |fb| fb.class }
      registered_classes.should include(FeatureSet::FeatureBuilder::WordVector)

      # One builder is expected per file in lib/feature_set/feature_builder,
      # minus one (presumably base.rb, which is not a concrete builder).
      builder_glob = File.expand_path(File.join(File.dirname(__FILE__), "..", "..", "lib", "feature_set", "feature_builder", "*.rb"))
      builder.feature_builders.length.should == Dir[builder_glob].length - 1
    end

    it "can add individual feature builders" do
      builder.add_feature_builder FeatureSet::FeatureBuilder::WordVector.new
      builder.feature_builders.length.should == 1
    end

    it "can add arrays of feature builders" do
      both = [FeatureSet::FeatureBuilder::WordVector.new, FeatureSet::FeatureBuilder::Cuss.new]
      builder.add_feature_builders both
      builder.feature_builders.length.should == 2
    end
  end

  describe "adding data" do
    # Returns freshly built rows on every call so expectations never share
    # objects with the builder under test.
    def mood_rows
      [ { :status => "I am happy!", :class => :happy },
        { :status => "I am sad."  , :class => :sad } ]
    end

    it "should accept mappings between one or more strings and their classifications" do
      builder = FeatureSet::Builder.new

      # An array of rows is appended as individual rows.
      builder.add_data mood_rows
      builder.data.should == mood_rows

      # A single row passed as a bare hash is appended too.
      builder.add_data :status => "Something", :another_feature => "Something else", :class => :awesome
      expected_rows = mood_rows + [ { :status => "Something", :another_feature => "Something else", :class => :awesome } ]
      builder.data.should == expected_rows

      builder.clear_data
      builder.data.should == []

      # The data set can also be assigned directly.
      builder.data = mood_rows
      builder.data.should == mood_rows
    end
  end

  describe "generating features" do
    before do
      @builder = FeatureSet::Builder.new
      @builder.add_feature_builder FeatureSet::FeatureBuilder::Cuss.new
      @builder.add_data :status => "this is some text", :class => :awesome
      @builder.add_data :status => "this is some shitty text", :class => :less_awesome
    end

    it "should output a row of features for every line of data" do
      @builder.generate_features

      clean_row, cussing_row = @builder.features[0], @builder.features[1]
      clean_row.should == { :status_cuss_count => 0, :class => :awesome }
      cussing_row.should == { :status_cuss_count => 1, :class => :less_awesome }
    end

    it "should make it easy to keep the original data" do
      @builder.generate_features(:include_original => true)

      clean_row, cussing_row = @builder.features[0], @builder.features[1]
      clean_row.should == { :status => "this is some text", :status_cuss_count => 0, :class => :awesome }
      cussing_row.should == { :status => "this is some shitty text", :status_cuss_count => 1, :class => :less_awesome }
    end

    it "should generate features for every string" do
      @builder.add_data :status => "text", :foo => "more shitty text", :class => :awesome
      @builder.generate_features

      second_row, third_row = @builder.features[1], @builder.features[2]
      second_row.should == { :status_cuss_count => 1, :class => :less_awesome }
      third_row.should == { :status_cuss_count => 0, :foo_cuss_count => 1, :class => :awesome }
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for FeatureSet::Datum tokenization and token counting.
describe FeatureSet::Datum do
  describe "tokenize" do
    let(:greeting_tokens) { ["hello", "world", "sup"] }

    it "should return an array of tokens" do
      datum = FeatureSet::Datum.new("hello world sup?")
      datum.tokens.should =~ greeting_tokens
    end

    it "should memoize" do
      datum = FeatureSet::Datum.new("hello world sup?")
      datum.tokens.should =~ greeting_tokens

      # Changing the value afterwards must not change the cached tokens.
      datum.value = "hello"
      datum.tokens.should =~ greeting_tokens
    end
  end

  describe "#token_counts" do
    let(:greeting_counts) { { "hello" => 2, "world" => 1, "sup" => 1} }

    it "should provide counts for each token" do
      datum = FeatureSet::Datum.new("hello world sup? hello!")
      datum.token_counts.should == greeting_counts
    end

    it "should memoize" do
      datum = FeatureSet::Datum.new("hello world sup? hello!")
      datum.token_counts.should == greeting_counts

      # Neither a new value nor replaced tokens may change the cached counts.
      datum.value = "hello"
      datum.instance_variable_set(:@tokens, ["hello"])
      datum.token_counts.should == greeting_counts
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the cuss-word counting feature builder.
describe FeatureSet::FeatureBuilder::Cuss do
  let(:builder) { FeatureSet::FeatureBuilder::Cuss.new }

  # Generates features for a raw value; key and row are passed as nil.
  def features_for(raw_value)
    builder.generate_features(FeatureSet::Datum.new(raw_value), nil, nil)
  end

  it "should output :cuss_count as the number of distinct cuss words found" do
    features_for("this fucking shit").should == { :cuss_count => 2 }
    # Repeated words are only counted once.
    features_for("this fucking fucking fucking shit").should == { :cuss_count => 2 }
  end

  it "should ignore non-string features" do
    features_for(2).should == {}
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the TF-IDF word vector feature builder.
describe FeatureSet::FeatureBuilder::WordVector do
  let(:builder) { FeatureSet::FeatureBuilder::WordVector.new }

  it "should output a named feature for every word in the dataset, after performing tfidf" do
    raw_rows = [
      { :m1 => "hello world. hello!", :m2 => "how goes?", :class => :yes },
      { :m1 => "foo world", :m2 => "how?", :class => :no }
    ]
    wrapped_dataset = FeatureSet::Builder.wrap_dataset(raw_rows)
    builder.before_generate_features(wrapped_dataset)

    # IDF = log(number of rows / number of rows containing the word).
    expected_idfs = {
      :m1 => { "hello" => Math.log(2/1.0), "world" => Math.log(2/2.0), "foo" => Math.log(2/1.0) },
      :m2 => { "how" => Math.log(2/2.0), "goes" => Math.log(2/1.0) }
    }
    builder.idfs.should == expected_idfs

    first_row = wrapped_dataset.first
    last_row = wrapped_dataset.last

    # Feature value = (occurrences in value / words in value) * IDF.
    builder.generate_features(first_row[:m1], :m1, first_row).should == { "hello" => (2/3.0) * Math.log(2/1.0), "world" => (1/3.0) * Math.log(2/2.0), "foo" => 0 }
    builder.generate_features(first_row[:m2], :m2, first_row).should == { "how" => (1/2.0) * Math.log(2/2.0), "goes" => (1/2.0) * Math.log(2/1.0) }

    builder.generate_features(last_row[:m1], :m1, last_row).should == { "hello" => 0, "world" => (1/2.0) * Math.log(2/2.0), "foo" => (1/2.0) * Math.log(2/1.0) }
    builder.generate_features(last_row[:m2], :m2, last_row).should == { "how" => (1/1.0) * Math.log(2/2.0), "goes" => 0 }
  end

  it "should ignore non-string features" do
    numeric_rows = [{ :something => FeatureSet::Datum.new(2), :class => false }, { :something => FeatureSet::Datum.new(1), :class => true }]
    builder.before_generate_features(numeric_rows)

    row = { :something => FeatureSet::Datum.new(2), :class => false }
    builder.generate_features(FeatureSet::Datum.new(2), :something, row).should == {}
  end
end
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feature_set
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Andrew Cantino
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-17 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70284888584540 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70284888584540
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: wwood-rarff
|
27
|
+
requirement: &70284888584120 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70284888584120
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: activesupport
|
38
|
+
requirement: &70284888583700 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70284888583700
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: i18n
|
49
|
+
requirement: &70284888583280 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70284888583280
|
58
|
+
description: FeatureSet is a Ruby library for generating feature vectors from textual
|
59
|
+
data. It can output in ARFF format for experimentation with Weka.
|
60
|
+
email:
|
61
|
+
- andrew@iterationlabs.com
|
62
|
+
executables: []
|
63
|
+
extensions: []
|
64
|
+
extra_rdoc_files: []
|
65
|
+
files:
|
66
|
+
- .gitignore
|
67
|
+
- .rvmrc
|
68
|
+
- Gemfile
|
69
|
+
- README.markdown
|
70
|
+
- Rakefile
|
71
|
+
- feature_set.gemspec
|
72
|
+
- lib/feature_set.rb
|
73
|
+
- lib/feature_set/builder.rb
|
74
|
+
- lib/feature_set/data/cusswords.txt
|
75
|
+
- lib/feature_set/datum.rb
|
76
|
+
- lib/feature_set/feature_builder/base.rb
|
77
|
+
- lib/feature_set/feature_builder/cuss.rb
|
78
|
+
- lib/feature_set/feature_builder/word_vector.rb
|
79
|
+
- lib/feature_set/version.rb
|
80
|
+
- spec/feature_set/builder_spec.rb
|
81
|
+
- spec/feature_set/datum_spec.rb
|
82
|
+
- spec/feature_set/feature/cuss_spec.rb
|
83
|
+
- spec/feature_set/feature/word_vector_spec.rb
|
84
|
+
- spec/spec.opts
|
85
|
+
- spec/spec_helper.rb
|
86
|
+
homepage: https://github.com/iterationlabs/feature_set
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project: feature_set
|
106
|
+
rubygems_version: 1.8.10
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Generate feature vectors from textual data
|
110
|
+
test_files:
|
111
|
+
- spec/feature_set/builder_spec.rb
|
112
|
+
- spec/feature_set/datum_spec.rb
|
113
|
+
- spec/feature_set/feature/cuss_spec.rb
|
114
|
+
- spec/feature_set/feature/word_vector_spec.rb
|
115
|
+
- spec/spec.opts
|
116
|
+
- spec/spec_helper.rb
|