nlp_backpack 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,32 @@
# A POSArray holds the words and pos tags for a sentence.
#
# Each element is a two-item pair of the form [word, pos]. Pairs are
# validated as they are added through #<< or #append.

module NLPBackpack

  module POS
    # Raised when a value added to a POSArray is not a [word, pos] pair.
    # NOTE(review): this inherits from Exception (not StandardError), so a
    # bare `rescue` will not catch it; kept as-is for backward compatibility.
    class InvalidSentence < Exception; end

    class POSArray < Array

      # Append a [word, pos] pair after validating its shape.
      #
      # values - a two-element collection, e.g. ["dog", "NN"].
      #
      # Returns self (via Array#<<).
      # Raises InvalidSentence when values is not a two-element collection.
      def <<(values)
        validate_sentence(values)
        super
      end

      # Convenience wrapper around #<< taking the word and tag separately.
      #
      # word - the token.
      # pos  - the part-of-speech tag for the token.
      #
      # Returns self.
      def append(word, pos)
        self << [word, pos]
      end

      # Turn into word/pos, pairs separated by single spaces,
      # e.g. "the/DT dog/NN".
      def to_s
        map { |pair| pair.join("/") }.join(" ")
      end

      private

      # Ensure value looks like [word, pos].
      #
      # `raise InvalidSentence, "..."` is required here: the previous
      # `raise InvalidSentence("...")` form is parsed as a call to an
      # undefined *method* named InvalidSentence and produced NoMethodError
      # instead of the intended exception.
      def validate_sentence(value)
        valid = value.respond_to?(:size) && value.size == 2
        raise InvalidSentence, "Adding words needs to be structured like: [word, pos]" unless valid
      end
    end
  end

end
@@ -0,0 +1,17 @@
module NLPBackpack

  # Access to the stop-word list bundled next to this file.
  class StopWords

    # Read the bundled stop-word file and return its contents as an
    # Array of Strings, one entry per newline-separated line.
    def self.all
      contents = File.read(stop_word_path)
      contents.split("\n")
    end

    # Absolute path of stop_words/stop_words.txt, resolved relative to the
    # directory containing this source file.
    def self.stop_word_path
      base_dir = File.dirname(__FILE__)
      File.expand_path('stop_words/stop_words.txt', base_dir)
    end
    private_class_method :stop_word_path

  end

end
@@ -0,0 +1,429 @@
1
+ a
2
+ about
3
+ above
4
+ across
5
+ after
6
+ again
7
+ against
8
+ all
9
+ almost
10
+ alone
11
+ along
12
+ already
13
+ also
14
+ although
15
+ always
16
+ among
17
+ an
18
+ and
19
+ another
20
+ any
21
+ anybody
22
+ anyone
23
+ anything
24
+ anywhere
25
+ are
26
+ area
27
+ areas
28
+ around
29
+ as
30
+ ask
31
+ asked
32
+ asking
33
+ asks
34
+ at
35
+ away
36
+ b
37
+ back
38
+ backed
39
+ backing
40
+ backs
41
+ be
42
+ became
43
+ because
44
+ become
45
+ becomes
46
+ been
47
+ before
48
+ began
49
+ behind
50
+ being
51
+ beings
52
+ best
53
+ better
54
+ between
55
+ big
56
+ both
57
+ but
58
+ by
59
+ c
60
+ came
61
+ can
62
+ cannot
63
+ case
64
+ cases
65
+ certain
66
+ certainly
67
+ clear
68
+ clearly
69
+ come
70
+ could
71
+ d
72
+ did
73
+ differ
74
+ different
75
+ differently
76
+ do
77
+ does
78
+ done
79
+ down
80
+ down
81
+ downed
82
+ downing
83
+ downs
84
+ during
85
+ e
86
+ each
87
+ early
88
+ either
89
+ end
90
+ ended
91
+ ending
92
+ ends
93
+ enough
94
+ even
95
+ evenly
96
+ ever
97
+ every
98
+ everybody
99
+ everyone
100
+ everything
101
+ everywhere
102
+ f
103
+ face
104
+ faces
105
+ fact
106
+ facts
107
+ far
108
+ felt
109
+ few
110
+ find
111
+ finds
112
+ first
113
+ for
114
+ four
115
+ from
116
+ full
117
+ fully
118
+ further
119
+ furthered
120
+ furthering
121
+ furthers
122
+ g
123
+ gave
124
+ general
125
+ generally
126
+ get
127
+ gets
128
+ give
129
+ given
130
+ gives
131
+ go
132
+ going
133
+ good
134
+ goods
135
+ got
136
+ great
137
+ greater
138
+ greatest
139
+ group
140
+ grouped
141
+ grouping
142
+ groups
143
+ h
144
+ had
145
+ has
146
+ have
147
+ having
148
+ he
149
+ her
150
+ here
151
+ herself
152
+ high
153
+ high
154
+ high
155
+ higher
156
+ highest
157
+ him
158
+ himself
159
+ his
160
+ how
161
+ however
162
+ i
163
+ if
164
+ important
165
+ in
166
+ interest
167
+ interested
168
+ interesting
169
+ interests
170
+ into
171
+ is
172
+ it
173
+ its
174
+ itself
175
+ j
176
+ just
177
+ k
178
+ keep
179
+ keeps
180
+ kind
181
+ knew
182
+ know
183
+ known
184
+ knows
185
+ l
186
+ large
187
+ largely
188
+ last
189
+ later
190
+ latest
191
+ least
192
+ less
193
+ let
194
+ lets
195
+ like
196
+ likely
197
+ long
198
+ longer
199
+ longest
200
+ m
201
+ made
202
+ make
203
+ making
204
+ man
205
+ many
206
+ may
207
+ me
208
+ member
209
+ members
210
+ men
211
+ might
212
+ more
213
+ most
214
+ mostly
215
+ mr
216
+ mrs
217
+ much
218
+ must
219
+ my
220
+ myself
221
+ n
222
+ necessary
223
+ need
224
+ needed
225
+ needing
226
+ needs
227
+ never
228
+ new
229
+ new
230
+ newer
231
+ newest
232
+ next
233
+ no
234
+ nobody
235
+ non
236
+ noone
237
+ not
238
+ nothing
239
+ now
240
+ nowhere
241
+ number
242
+ numbers
243
+ o
244
+ of
245
+ off
246
+ often
247
+ old
248
+ older
249
+ oldest
250
+ on
251
+ once
252
+ one
253
+ only
254
+ open
255
+ opened
256
+ opening
257
+ opens
258
+ or
259
+ order
260
+ ordered
261
+ ordering
262
+ orders
263
+ other
264
+ others
265
+ our
266
+ out
267
+ over
268
+ p
269
+ part
270
+ parted
271
+ parting
272
+ parts
273
+ per
274
+ perhaps
275
+ place
276
+ places
277
+ point
278
+ pointed
279
+ pointing
280
+ points
281
+ possible
282
+ present
283
+ presented
284
+ presenting
285
+ presents
286
+ problem
287
+ problems
288
+ put
289
+ puts
290
+ q
291
+ quite
292
+ r
293
+ rather
294
+ really
295
+ right
296
+ right
297
+ room
298
+ rooms
299
+ s
300
+ said
301
+ same
302
+ saw
303
+ say
304
+ says
305
+ second
306
+ seconds
307
+ see
308
+ seem
309
+ seemed
310
+ seeming
311
+ seems
312
+ sees
313
+ several
314
+ shall
315
+ she
316
+ should
317
+ show
318
+ showed
319
+ showing
320
+ shows
321
+ side
322
+ sides
323
+ since
324
+ small
325
+ smaller
326
+ smallest
327
+ so
328
+ some
329
+ somebody
330
+ someone
331
+ something
332
+ somewhere
333
+ state
334
+ states
335
+ still
336
+ still
337
+ such
338
+ sure
339
+ t
340
+ take
341
+ taken
342
+ than
343
+ that
344
+ the
345
+ their
346
+ them
347
+ then
348
+ there
349
+ therefore
350
+ these
351
+ they
352
+ thing
353
+ things
354
+ think
355
+ thinks
356
+ this
357
+ those
358
+ though
359
+ thought
360
+ thoughts
361
+ three
362
+ through
363
+ thus
364
+ to
365
+ today
366
+ together
367
+ too
368
+ took
369
+ toward
370
+ turn
371
+ turned
372
+ turning
373
+ turns
374
+ two
375
+ u
376
+ under
377
+ until
378
+ up
379
+ upon
380
+ us
381
+ use
382
+ used
383
+ uses
384
+ v
385
+ very
386
+ w
387
+ want
388
+ wanted
389
+ wanting
390
+ wants
391
+ was
392
+ way
393
+ ways
394
+ we
395
+ well
396
+ wells
397
+ went
398
+ were
399
+ what
400
+ when
401
+ where
402
+ whether
403
+ which
404
+ while
405
+ who
406
+ whole
407
+ whose
408
+ why
409
+ will
410
+ with
411
+ within
412
+ without
413
+ work
414
+ worked
415
+ working
416
+ works
417
+ would
418
+ x
419
+ y
420
+ year
421
+ years
422
+ yet
423
+ you
424
+ young
425
+ younger
426
+ youngest
427
+ your
428
+ yours
429
+ z