nlp_backpack 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/.document +5 -0
  2. data/.gitignore +21 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +22 -0
  5. data/Rakefile +45 -0
  6. data/VERSION +1 -0
  7. data/lib/nlp_backpack.rb +10 -0
  8. data/lib/nlp_backpack/chunker.rb +5 -0
  9. data/lib/nlp_backpack/chunker/regex_chunker.rb +107 -0
  10. data/lib/nlp_backpack/chunker/tag_pattern.rb +31 -0
  11. data/lib/nlp_backpack/classifier.rb +5 -0
  12. data/lib/nlp_backpack/classifier/base.rb +28 -0
  13. data/lib/nlp_backpack/classifier/naive_bayes.rb +83 -0
  14. data/lib/nlp_backpack/evaluation.rb +6 -0
  15. data/lib/nlp_backpack/evaluation/accuracy.rb +46 -0
  16. data/lib/nlp_backpack/evaluation/base.rb +12 -0
  17. data/lib/nlp_backpack/evaluation/confusion_matrix.rb +66 -0
  18. data/lib/nlp_backpack/frequency_distribution.rb +47 -0
  19. data/lib/nlp_backpack/pos.rb +5 -0
  20. data/lib/nlp_backpack/pos/brill_tagger.rb +142 -0
  21. data/lib/nlp_backpack/pos/brill_tagger/lexicon.txt +93696 -0
  22. data/lib/nlp_backpack/pos/pos_array.rb +32 -0
  23. data/lib/nlp_backpack/stop_words.rb +17 -0
  24. data/lib/nlp_backpack/stop_words/stop_words.txt +429 -0
  25. data/lib/nlp_backpack/tokenizers/custom.rb +13 -0
  26. data/lib/nlp_backpack/tokenizers/line.rb +13 -0
  27. data/lib/nlp_backpack/tokenizers/space.rb +13 -0
  28. data/lib/nlp_backpack/tokenizers/tab.rb +13 -0
  29. data/lib/nlp_backpack/tokenizers/whitespace.rb +13 -0
  30. data/lib/nlp_backpack/tokenizers/word.rb +13 -0
  31. data/nlp_backpack.gemspec +109 -0
  32. data/spec/chunkers/regex_chunker_spec.rb +46 -0
  33. data/spec/chunkers/tag_pattern_spec.rb +40 -0
  34. data/spec/classifiers/naive_bayes_spec.rb +68 -0
  35. data/spec/evaluation/accuracy_spec.rb +29 -0
  36. data/spec/evaluation/confusion_matrix_spec.rb +29 -0
  37. data/spec/frequency_distribution_spec.rb +53 -0
  38. data/spec/nlp_backpack_spec.rb +4 -0
  39. data/spec/pos/brill_tagger_spec.rb +24 -0
  40. data/spec/pos/pos_array_spec.rb +45 -0
  41. data/spec/spec.opts +1 -0
  42. data/spec/spec_helper.rb +18 -0
  43. data/spec/stop_words_spec.rb +15 -0
  44. data/spec/test_saves/naive.nb +1 -0
  45. data/spec/tokenizers/custom_spec.rb +24 -0
  46. data/spec/tokenizers/line_spec.rb +15 -0
  47. data/spec/tokenizers/space_spec.rb +15 -0
  48. data/spec/tokenizers/tab_spec.rb +15 -0
  49. data/spec/tokenizers/whitespace_spec.rb +16 -0
  50. data/spec/tokenizers/word_spec.rb +15 -0
  51. metadata +141 -0
@@ -0,0 +1,32 @@
# A POSArray holds the words and pos tags for a sentence

module NLPBackpack

  module POS
    # Raised when an entry added to a POSArray is not a two-element
    # [word, pos] pair.
    #
    # Inherits from StandardError (not Exception) so that a plain `rescue`
    # catches it; `rescue InvalidSentence` and `rescue Exception` still match.
    class InvalidSentence < StandardError; end

    # An Array whose entries are [word, pos] pairs.
    #
    # Only #<< and #append validate what is added; the other Array mutators
    # are inherited unchanged.
    class POSArray < Array

      # Appends one [word, pos] pair after validating it.
      #
      # values - an object whose #size is 2, normally [word, pos].
      #
      # Returns whatever Array#<< returns, so calls can be chained.
      # Raises InvalidSentence if values does not have exactly two elements.
      def <<(values)
        validate_sentence(values)
        super
      end

      # Convenience wrapper around #<< taking the word and tag separately.
      #
      # word - the token.
      # pos  - the part-of-speech tag for the token.
      def append(word, pos)
        self << [word, pos]
      end

      # Turn into word/pos
      #
      # Each pair is joined with "/" and the pairs are joined with a single
      # space, e.g. "The/DT dog/NN".
      def to_s
        map { |pair| pair.join("/") }.join(" ")
      end

      private

      # Ensures the value being added has exactly two elements.
      #
      # The respond_to? guard turns values without #size (such as nil) into an
      # InvalidSentence instead of a NoMethodError.
      def validate_sentence(value)
        return if value.respond_to?(:size) && value.size == 2

        # `raise Klass, message` is required here: `InvalidSentence("...")`
        # would be parsed as a call to a method named InvalidSentence.
        raise InvalidSentence, "Adding words needs to be structured like: [word, pos]"
      end
    end
  end

end
@@ -0,0 +1,17 @@
module NLPBackpack

  # Gives access to the stop-word list stored in
  # stop_words/stop_words.txt next to this file.
  class StopWords

    # Reads the stop-word file and returns its contents split on "\n",
    # one array element per line. The file is read again on every call.
    def self.all
      contents = File.read(stop_word_path)
      contents.split("\n")
    end

    # Absolute path of the stop-word file, resolved relative to the
    # directory containing this source file.
    def self.stop_word_path
      relative_location = File.dirname(__FILE__) + '/stop_words/stop_words.txt'
      File.expand_path(relative_location)
    end
    private_class_method :stop_word_path
  end

end
@@ -0,0 +1,429 @@
1
+ a
2
+ about
3
+ above
4
+ across
5
+ after
6
+ again
7
+ against
8
+ all
9
+ almost
10
+ alone
11
+ along
12
+ already
13
+ also
14
+ although
15
+ always
16
+ among
17
+ an
18
+ and
19
+ another
20
+ any
21
+ anybody
22
+ anyone
23
+ anything
24
+ anywhere
25
+ are
26
+ area
27
+ areas
28
+ around
29
+ as
30
+ ask
31
+ asked
32
+ asking
33
+ asks
34
+ at
35
+ away
36
+ b
37
+ back
38
+ backed
39
+ backing
40
+ backs
41
+ be
42
+ became
43
+ because
44
+ become
45
+ becomes
46
+ been
47
+ before
48
+ began
49
+ behind
50
+ being
51
+ beings
52
+ best
53
+ better
54
+ between
55
+ big
56
+ both
57
+ but
58
+ by
59
+ c
60
+ came
61
+ can
62
+ cannot
63
+ case
64
+ cases
65
+ certain
66
+ certainly
67
+ clear
68
+ clearly
69
+ come
70
+ could
71
+ d
72
+ did
73
+ differ
74
+ different
75
+ differently
76
+ do
77
+ does
78
+ done
79
+ down
80
+ down
81
+ downed
82
+ downing
83
+ downs
84
+ during
85
+ e
86
+ each
87
+ early
88
+ either
89
+ end
90
+ ended
91
+ ending
92
+ ends
93
+ enough
94
+ even
95
+ evenly
96
+ ever
97
+ every
98
+ everybody
99
+ everyone
100
+ everything
101
+ everywhere
102
+ f
103
+ face
104
+ faces
105
+ fact
106
+ facts
107
+ far
108
+ felt
109
+ few
110
+ find
111
+ finds
112
+ first
113
+ for
114
+ four
115
+ from
116
+ full
117
+ fully
118
+ further
119
+ furthered
120
+ furthering
121
+ furthers
122
+ g
123
+ gave
124
+ general
125
+ generally
126
+ get
127
+ gets
128
+ give
129
+ given
130
+ gives
131
+ go
132
+ going
133
+ good
134
+ goods
135
+ got
136
+ great
137
+ greater
138
+ greatest
139
+ group
140
+ grouped
141
+ grouping
142
+ groups
143
+ h
144
+ had
145
+ has
146
+ have
147
+ having
148
+ he
149
+ her
150
+ here
151
+ herself
152
+ high
153
+ high
154
+ high
155
+ higher
156
+ highest
157
+ him
158
+ himself
159
+ his
160
+ how
161
+ however
162
+ i
163
+ if
164
+ important
165
+ in
166
+ interest
167
+ interested
168
+ interesting
169
+ interests
170
+ into
171
+ is
172
+ it
173
+ its
174
+ itself
175
+ j
176
+ just
177
+ k
178
+ keep
179
+ keeps
180
+ kind
181
+ knew
182
+ know
183
+ known
184
+ knows
185
+ l
186
+ large
187
+ largely
188
+ last
189
+ later
190
+ latest
191
+ least
192
+ less
193
+ let
194
+ lets
195
+ like
196
+ likely
197
+ long
198
+ longer
199
+ longest
200
+ m
201
+ made
202
+ make
203
+ making
204
+ man
205
+ many
206
+ may
207
+ me
208
+ member
209
+ members
210
+ men
211
+ might
212
+ more
213
+ most
214
+ mostly
215
+ mr
216
+ mrs
217
+ much
218
+ must
219
+ my
220
+ myself
221
+ n
222
+ necessary
223
+ need
224
+ needed
225
+ needing
226
+ needs
227
+ never
228
+ new
229
+ new
230
+ newer
231
+ newest
232
+ next
233
+ no
234
+ nobody
235
+ non
236
+ noone
237
+ not
238
+ nothing
239
+ now
240
+ nowhere
241
+ number
242
+ numbers
243
+ o
244
+ of
245
+ off
246
+ often
247
+ old
248
+ older
249
+ oldest
250
+ on
251
+ once
252
+ one
253
+ only
254
+ open
255
+ opened
256
+ opening
257
+ opens
258
+ or
259
+ order
260
+ ordered
261
+ ordering
262
+ orders
263
+ other
264
+ others
265
+ our
266
+ out
267
+ over
268
+ p
269
+ part
270
+ parted
271
+ parting
272
+ parts
273
+ per
274
+ perhaps
275
+ place
276
+ places
277
+ point
278
+ pointed
279
+ pointing
280
+ points
281
+ possible
282
+ present
283
+ presented
284
+ presenting
285
+ presents
286
+ problem
287
+ problems
288
+ put
289
+ puts
290
+ q
291
+ quite
292
+ r
293
+ rather
294
+ really
295
+ right
296
+ right
297
+ room
298
+ rooms
299
+ s
300
+ said
301
+ same
302
+ saw
303
+ say
304
+ says
305
+ second
306
+ seconds
307
+ see
308
+ seem
309
+ seemed
310
+ seeming
311
+ seems
312
+ sees
313
+ several
314
+ shall
315
+ she
316
+ should
317
+ show
318
+ showed
319
+ showing
320
+ shows
321
+ side
322
+ sides
323
+ since
324
+ small
325
+ smaller
326
+ smallest
327
+ so
328
+ some
329
+ somebody
330
+ someone
331
+ something
332
+ somewhere
333
+ state
334
+ states
335
+ still
336
+ still
337
+ such
338
+ sure
339
+ t
340
+ take
341
+ taken
342
+ than
343
+ that
344
+ the
345
+ their
346
+ them
347
+ then
348
+ there
349
+ therefore
350
+ these
351
+ they
352
+ thing
353
+ things
354
+ think
355
+ thinks
356
+ this
357
+ those
358
+ though
359
+ thought
360
+ thoughts
361
+ three
362
+ through
363
+ thus
364
+ to
365
+ today
366
+ together
367
+ too
368
+ took
369
+ toward
370
+ turn
371
+ turned
372
+ turning
373
+ turns
374
+ two
375
+ u
376
+ under
377
+ until
378
+ up
379
+ upon
380
+ us
381
+ use
382
+ used
383
+ uses
384
+ v
385
+ very
386
+ w
387
+ want
388
+ wanted
389
+ wanting
390
+ wants
391
+ was
392
+ way
393
+ ways
394
+ we
395
+ well
396
+ wells
397
+ went
398
+ were
399
+ what
400
+ when
401
+ where
402
+ whether
403
+ which
404
+ while
405
+ who
406
+ whole
407
+ whose
408
+ why
409
+ will
410
+ with
411
+ within
412
+ without
413
+ work
414
+ worked
415
+ working
416
+ works
417
+ would
418
+ x
419
+ y
420
+ year
421
+ years
422
+ yet
423
+ you
424
+ young
425
+ younger
426
+ youngest
427
+ your
428
+ yours
429
+ z