nlp_backpack 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +22 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/nlp_backpack.rb +10 -0
- data/lib/nlp_backpack/chunker.rb +5 -0
- data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
- data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
- data/lib/nlp_backpack/classifier.rb +5 -0
- data/lib/nlp_backpack/classifier/base.rb +28 -0
- data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
- data/lib/nlp_backpack/evaluation.rb +6 -0
- data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
- data/lib/nlp_backpack/evaluation/base.rb +12 -0
- data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
- data/lib/nlp_backpack/frequency_distribution.rb +47 -0
- data/lib/nlp_backpack/pos.rb +5 -0
- data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
- data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
- data/lib/nlp_backpack/pos/pos_array.rb +32 -0
- data/lib/nlp_backpack/stop_words.rb +17 -0
- data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
- data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
- data/lib/nlp_backpack/tokenizers/line.rb +13 -0
- data/lib/nlp_backpack/tokenizers/space.rb +13 -0
- data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
- data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
- data/lib/nlp_backpack/tokenizers/word.rb +13 -0
- data/nlp_backpack.gemspec +109 -0
- data/spec/chunkers/regex_chunker_spec.rb +46 -0
- data/spec/chunkers/tag_pattern_spec.rb +40 -0
- data/spec/classifiers/naive_bayes_spec.rb +68 -0
- data/spec/evaluation/accuracy_spec.rb +29 -0
- data/spec/evaluation/confusion_matrix_spec.rb +29 -0
- data/spec/frequency_distribution_spec.rb +53 -0
- data/spec/nlp_backpack_spec.rb +4 -0
- data/spec/pos/brill_tagger_spec.rb +24 -0
- data/spec/pos/pos_array_spec.rb +45 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/stop_words_spec.rb +15 -0
- data/spec/test_saves/naive.nb +1 -0
- data/spec/tokenizers/custom_spec.rb +24 -0
- data/spec/tokenizers/line_spec.rb +15 -0
- data/spec/tokenizers/space_spec.rb +15 -0
- data/spec/tokenizers/tab_spec.rb +15 -0
- data/spec/tokenizers/whitespace_spec.rb +16 -0
- data/spec/tokenizers/word_spec.rb +15 -0
- metadata +141 -0
@@ -0,0 +1,32 @@
|
|
1
|
+
# A POSArray holds the words and pos tags for a sentence
|
2
|
+
|
3
|
+
module NLPBackpack
  module POS
    # Raised when an entry pushed onto a POSArray is not a two-element
    # [word, pos] pair.
    #
    # Inherits from StandardError (not Exception) so that a plain `rescue`
    # catches it like any other application-level error.
    class InvalidSentence < StandardError; end

    # An Array of [word, pos] pairs describing a tagged sentence.
    #
    # Entries are validated as they are added through #<< or #append;
    # other Array mutators (push, unshift, ...) are not overridden here
    # and therefore bypass the validation.
    class POSArray < Array
      # Adds a [word, pos] pair to the end of the array.
      #
      # @param values [Array] a two-element pair: [word, pos]
      # @return [POSArray] self, so calls can be chained like Array#<<
      # @raise [InvalidSentence] if values does not have exactly two elements
      def <<(values)
        validate_sentence(values)
        super
      end

      # Convenience wrapper around #<< taking the word and tag separately.
      #
      # @param word [Object] the token
      # @param pos [Object] the part-of-speech tag for the token
      # @return [POSArray] self
      def append(word, pos)
        self << [word, pos]
      end

      # Renders the sentence as space-separated "word/pos" tokens.
      #
      # @return [String]
      def to_s
        map { |pair| pair.join("/") }.join(" ")
      end

      private

      # Ensures the entry looks like [word, pos].
      #
      # Objects that do not respond to #size (e.g. nil) are rejected with
      # InvalidSentence rather than leaking a NoMethodError.
      #
      # @param value [Object] candidate entry
      # @raise [InvalidSentence] unless value has exactly two elements
      def validate_sentence(value)
        return if value.respond_to?(:size) && value.size == 2

        # `raise Klass, message` -- the original `InvalidSentence("...")`
        # form invoked a non-existent method and raised NoMethodError.
        raise InvalidSentence, "Adding words needs to be structured like: [word, pos]"
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module NLPBackpack
  # Gives access to the stop word list stored in
  # stop_words/stop_words.txt next to this source file.
  class StopWords
    # Reads the stop word file and returns its contents split on newlines.
    # The file is read on every call.
    #
    # @return [Array<String>] one entry per line of the file
    def self.all
      contents = File.read(stop_word_path)
      contents.split("\n")
    end

    # Absolute path of the stop word file, resolved relative to the
    # directory containing this source file.
    #
    # @return [String]
    def self.stop_word_path
      base_dir = File.dirname(__FILE__)
      File.expand_path("#{base_dir}/stop_words/stop_words.txt")
    end
    private_class_method :stop_word_path
  end
end
|
@@ -0,0 +1,429 @@
|
|
1
|
+
a
|
2
|
+
about
|
3
|
+
above
|
4
|
+
across
|
5
|
+
after
|
6
|
+
again
|
7
|
+
against
|
8
|
+
all
|
9
|
+
almost
|
10
|
+
alone
|
11
|
+
along
|
12
|
+
already
|
13
|
+
also
|
14
|
+
although
|
15
|
+
always
|
16
|
+
among
|
17
|
+
an
|
18
|
+
and
|
19
|
+
another
|
20
|
+
any
|
21
|
+
anybody
|
22
|
+
anyone
|
23
|
+
anything
|
24
|
+
anywhere
|
25
|
+
are
|
26
|
+
area
|
27
|
+
areas
|
28
|
+
around
|
29
|
+
as
|
30
|
+
ask
|
31
|
+
asked
|
32
|
+
asking
|
33
|
+
asks
|
34
|
+
at
|
35
|
+
away
|
36
|
+
b
|
37
|
+
back
|
38
|
+
backed
|
39
|
+
backing
|
40
|
+
backs
|
41
|
+
be
|
42
|
+
became
|
43
|
+
because
|
44
|
+
become
|
45
|
+
becomes
|
46
|
+
been
|
47
|
+
before
|
48
|
+
began
|
49
|
+
behind
|
50
|
+
being
|
51
|
+
beings
|
52
|
+
best
|
53
|
+
better
|
54
|
+
between
|
55
|
+
big
|
56
|
+
both
|
57
|
+
but
|
58
|
+
by
|
59
|
+
c
|
60
|
+
came
|
61
|
+
can
|
62
|
+
cannot
|
63
|
+
case
|
64
|
+
cases
|
65
|
+
certain
|
66
|
+
certainly
|
67
|
+
clear
|
68
|
+
clearly
|
69
|
+
come
|
70
|
+
could
|
71
|
+
d
|
72
|
+
did
|
73
|
+
differ
|
74
|
+
different
|
75
|
+
differently
|
76
|
+
do
|
77
|
+
does
|
78
|
+
done
|
79
|
+
down
|
80
|
+
down
|
81
|
+
downed
|
82
|
+
downing
|
83
|
+
downs
|
84
|
+
during
|
85
|
+
e
|
86
|
+
each
|
87
|
+
early
|
88
|
+
either
|
89
|
+
end
|
90
|
+
ended
|
91
|
+
ending
|
92
|
+
ends
|
93
|
+
enough
|
94
|
+
even
|
95
|
+
evenly
|
96
|
+
ever
|
97
|
+
every
|
98
|
+
everybody
|
99
|
+
everyone
|
100
|
+
everything
|
101
|
+
everywhere
|
102
|
+
f
|
103
|
+
face
|
104
|
+
faces
|
105
|
+
fact
|
106
|
+
facts
|
107
|
+
far
|
108
|
+
felt
|
109
|
+
few
|
110
|
+
find
|
111
|
+
finds
|
112
|
+
first
|
113
|
+
for
|
114
|
+
four
|
115
|
+
from
|
116
|
+
full
|
117
|
+
fully
|
118
|
+
further
|
119
|
+
furthered
|
120
|
+
furthering
|
121
|
+
furthers
|
122
|
+
g
|
123
|
+
gave
|
124
|
+
general
|
125
|
+
generally
|
126
|
+
get
|
127
|
+
gets
|
128
|
+
give
|
129
|
+
given
|
130
|
+
gives
|
131
|
+
go
|
132
|
+
going
|
133
|
+
good
|
134
|
+
goods
|
135
|
+
got
|
136
|
+
great
|
137
|
+
greater
|
138
|
+
greatest
|
139
|
+
group
|
140
|
+
grouped
|
141
|
+
grouping
|
142
|
+
groups
|
143
|
+
h
|
144
|
+
had
|
145
|
+
has
|
146
|
+
have
|
147
|
+
having
|
148
|
+
he
|
149
|
+
her
|
150
|
+
here
|
151
|
+
herself
|
152
|
+
high
|
153
|
+
high
|
154
|
+
high
|
155
|
+
higher
|
156
|
+
highest
|
157
|
+
him
|
158
|
+
himself
|
159
|
+
his
|
160
|
+
how
|
161
|
+
however
|
162
|
+
i
|
163
|
+
if
|
164
|
+
important
|
165
|
+
in
|
166
|
+
interest
|
167
|
+
interested
|
168
|
+
interesting
|
169
|
+
interests
|
170
|
+
into
|
171
|
+
is
|
172
|
+
it
|
173
|
+
its
|
174
|
+
itself
|
175
|
+
j
|
176
|
+
just
|
177
|
+
k
|
178
|
+
keep
|
179
|
+
keeps
|
180
|
+
kind
|
181
|
+
knew
|
182
|
+
know
|
183
|
+
known
|
184
|
+
knows
|
185
|
+
l
|
186
|
+
large
|
187
|
+
largely
|
188
|
+
last
|
189
|
+
later
|
190
|
+
latest
|
191
|
+
least
|
192
|
+
less
|
193
|
+
let
|
194
|
+
lets
|
195
|
+
like
|
196
|
+
likely
|
197
|
+
long
|
198
|
+
longer
|
199
|
+
longest
|
200
|
+
m
|
201
|
+
made
|
202
|
+
make
|
203
|
+
making
|
204
|
+
man
|
205
|
+
many
|
206
|
+
may
|
207
|
+
me
|
208
|
+
member
|
209
|
+
members
|
210
|
+
men
|
211
|
+
might
|
212
|
+
more
|
213
|
+
most
|
214
|
+
mostly
|
215
|
+
mr
|
216
|
+
mrs
|
217
|
+
much
|
218
|
+
must
|
219
|
+
my
|
220
|
+
myself
|
221
|
+
n
|
222
|
+
necessary
|
223
|
+
need
|
224
|
+
needed
|
225
|
+
needing
|
226
|
+
needs
|
227
|
+
never
|
228
|
+
new
|
229
|
+
new
|
230
|
+
newer
|
231
|
+
newest
|
232
|
+
next
|
233
|
+
no
|
234
|
+
nobody
|
235
|
+
non
|
236
|
+
noone
|
237
|
+
not
|
238
|
+
nothing
|
239
|
+
now
|
240
|
+
nowhere
|
241
|
+
number
|
242
|
+
numbers
|
243
|
+
o
|
244
|
+
of
|
245
|
+
off
|
246
|
+
often
|
247
|
+
old
|
248
|
+
older
|
249
|
+
oldest
|
250
|
+
on
|
251
|
+
once
|
252
|
+
one
|
253
|
+
only
|
254
|
+
open
|
255
|
+
opened
|
256
|
+
opening
|
257
|
+
opens
|
258
|
+
or
|
259
|
+
order
|
260
|
+
ordered
|
261
|
+
ordering
|
262
|
+
orders
|
263
|
+
other
|
264
|
+
others
|
265
|
+
our
|
266
|
+
out
|
267
|
+
over
|
268
|
+
p
|
269
|
+
part
|
270
|
+
parted
|
271
|
+
parting
|
272
|
+
parts
|
273
|
+
per
|
274
|
+
perhaps
|
275
|
+
place
|
276
|
+
places
|
277
|
+
point
|
278
|
+
pointed
|
279
|
+
pointing
|
280
|
+
points
|
281
|
+
possible
|
282
|
+
present
|
283
|
+
presented
|
284
|
+
presenting
|
285
|
+
presents
|
286
|
+
problem
|
287
|
+
problems
|
288
|
+
put
|
289
|
+
puts
|
290
|
+
q
|
291
|
+
quite
|
292
|
+
r
|
293
|
+
rather
|
294
|
+
really
|
295
|
+
right
|
296
|
+
right
|
297
|
+
room
|
298
|
+
rooms
|
299
|
+
s
|
300
|
+
said
|
301
|
+
same
|
302
|
+
saw
|
303
|
+
say
|
304
|
+
says
|
305
|
+
second
|
306
|
+
seconds
|
307
|
+
see
|
308
|
+
seem
|
309
|
+
seemed
|
310
|
+
seeming
|
311
|
+
seems
|
312
|
+
sees
|
313
|
+
several
|
314
|
+
shall
|
315
|
+
she
|
316
|
+
should
|
317
|
+
show
|
318
|
+
showed
|
319
|
+
showing
|
320
|
+
shows
|
321
|
+
side
|
322
|
+
sides
|
323
|
+
since
|
324
|
+
small
|
325
|
+
smaller
|
326
|
+
smallest
|
327
|
+
so
|
328
|
+
some
|
329
|
+
somebody
|
330
|
+
someone
|
331
|
+
something
|
332
|
+
somewhere
|
333
|
+
state
|
334
|
+
states
|
335
|
+
still
|
336
|
+
still
|
337
|
+
such
|
338
|
+
sure
|
339
|
+
t
|
340
|
+
take
|
341
|
+
taken
|
342
|
+
than
|
343
|
+
that
|
344
|
+
the
|
345
|
+
their
|
346
|
+
them
|
347
|
+
then
|
348
|
+
there
|
349
|
+
therefore
|
350
|
+
these
|
351
|
+
they
|
352
|
+
thing
|
353
|
+
things
|
354
|
+
think
|
355
|
+
thinks
|
356
|
+
this
|
357
|
+
those
|
358
|
+
though
|
359
|
+
thought
|
360
|
+
thoughts
|
361
|
+
three
|
362
|
+
through
|
363
|
+
thus
|
364
|
+
to
|
365
|
+
today
|
366
|
+
together
|
367
|
+
too
|
368
|
+
took
|
369
|
+
toward
|
370
|
+
turn
|
371
|
+
turned
|
372
|
+
turning
|
373
|
+
turns
|
374
|
+
two
|
375
|
+
u
|
376
|
+
under
|
377
|
+
until
|
378
|
+
up
|
379
|
+
upon
|
380
|
+
us
|
381
|
+
use
|
382
|
+
used
|
383
|
+
uses
|
384
|
+
v
|
385
|
+
very
|
386
|
+
w
|
387
|
+
want
|
388
|
+
wanted
|
389
|
+
wanting
|
390
|
+
wants
|
391
|
+
was
|
392
|
+
way
|
393
|
+
ways
|
394
|
+
we
|
395
|
+
well
|
396
|
+
wells
|
397
|
+
went
|
398
|
+
were
|
399
|
+
what
|
400
|
+
when
|
401
|
+
where
|
402
|
+
whether
|
403
|
+
which
|
404
|
+
while
|
405
|
+
who
|
406
|
+
whole
|
407
|
+
whose
|
408
|
+
why
|
409
|
+
will
|
410
|
+
with
|
411
|
+
within
|
412
|
+
without
|
413
|
+
work
|
414
|
+
worked
|
415
|
+
working
|
416
|
+
works
|
417
|
+
would
|
418
|
+
x
|
419
|
+
y
|
420
|
+
year
|
421
|
+
years
|
422
|
+
yet
|
423
|
+
you
|
424
|
+
young
|
425
|
+
younger
|
426
|
+
youngest
|
427
|
+
your
|
428
|
+
yours
|
429
|
+
z
|