treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,18 @@
1
+ silently { require 'english' }
2
+
module Treat
  module Inflectors
    module Declensors
      # Declension of English words by delegating to the top-level
      # ::English module (loaded by the require at the top of this file).
      module En
        # Return the plural or singular form of an entity's text.
        #
        # entity  - any object; only its #to_s value is used.
        # options - Hash; options[:count] selects the form
        #           (:plural or :singular). Defaults to {} so the
        #           signature agrees with Declensors::Linguistics.declense.
        #
        # Returns the value of ::English.plural or ::English.singular,
        # or nil when options[:count] is neither :plural nor :singular.
        def self.declense(entity, options = {})
          string = entity.to_s
          case options[:count]
          when :plural   then ::English.plural(string)
          when :singular then ::English.singular(string)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
module Treat
  module Inflectors
    module Declensors
      silently { require 'linguistics' }
      # Obtain word declensions using the ruby 'linguistics' gem.
      class Linguistics
        # Pluralize the text of an entity through the Linguistics
        # language module matching the entity's language.
        #
        # entity  - object responding to #language, #to_s, #has? and
        #           #category.
        # options - Hash; only options[:count] == :plural is handled.
        #
        # Returns the value produced by the delegate's plural method
        # (as passed back by `silently`), or nil when options[:count]
        # is not :plural.
        # Raises RuntimeError when ::Linguistics has no constant named
        # after the upcased language.
        def self.declense(entity, options = {})
          language = entity.language
          delegate = nil
          begin
            silently do
              delegate = ::Linguistics.const_get(language.to_s.upcase)
            end
          rescue NameError, RuntimeError
            # const_get signals a missing (or malformed) constant name
            # with NameError; RuntimeError is kept for compatibility.
            raise "Ruby Linguistics does not have a module " +
                  "installed for the #{language} language."
          end
          return nil unless options[:count] == :plural
          string = entity.to_s
          if entity.has?(:category) &&
             [:noun, :adjective, :verb].include?(entity.category)
            # Use the category-specific pluralizer, e.g. plural_noun.
            silently { delegate.send(:"plural_#{entity.category}", string) }
          else
            silently { delegate.plural(string) }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
module Treat
  module Inflectors
    module Lemmatizers
      # Lemmatizer that delegates to the native ::ELemma extension.
      class ELemma
        # The extension sources live in the e_lemma/ directory next to
        # this file, so that is where the compiled library is required from.
        silently { require 'treat/inflectors/lemmatizers/e_lemma/elemma' }

        # Look up the lemma(s) of an entity.
        #
        # entity  - object responding to #to_s (the word) and #tag.
        # options - unused.
        #
        # Returns whatever ::ELemma.parse returns for the word and tag.
        # NOTE(review): the extension only accepts the strings "noun",
        # "verb", "adj" or "adv" as its second argument - confirm that
        # entity.tag is expressed in that vocabulary.
        def self.lemma(entity, options = nil)
          ::ELemma::parse(entity.to_s, entity.tag)
        end
      end
    end
  end
end
@@ -0,0 +1,213 @@
# Build rules for the `elemma` Ruby C extension: compiles elemma.c and
# links it against WordNet (-lwn) into elemma.bundle.
# NOTE(review): every path, compiler name and architecture below is fixed
# to one particular machine (Ruby 1.9.1 under /usr/local, gcc-4.2,
# x86_64-darwin11.0.0). This looks like output generated from extconf.rb;
# confirm whether it should be regenerated per machine instead of shipped.

SHELL = /bin/sh

# V=0 quiet, V=1 verbose. other values don't work.
V = 0
Q1 = $(V:1=)
Q = $(Q1:0=@)
n=$(NULLCMD)
ECHO1 = $(V:1=@$n)
ECHO = $(ECHO1:0=@echo)

#### Start of system configuration section. ####

srcdir = .
topdir = /usr/local/include/ruby-1.9.1
hdrdir = /usr/local/include/ruby-1.9.1
arch_hdrdir = /usr/local/include/ruby-1.9.1/$(arch)
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
prefix = $(DESTDIR)/usr/local
exec_prefix = $(prefix)
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
bindir = $(exec_prefix)/bin
sbindir = $(exec_prefix)/sbin
libexecdir = $(exec_prefix)/libexec
datarootdir = $(prefix)/share
datadir = $(datarootdir)
sysconfdir = $(prefix)/etc
sharedstatedir = $(prefix)/com
localstatedir = $(prefix)/var
includedir = $(prefix)/include
oldincludedir = $(DESTDIR)/usr/include
docdir = $(datarootdir)/doc/$(PACKAGE)
infodir = $(datarootdir)/info
htmldir = $(docdir)
dvidir = $(docdir)
pdfdir = $(docdir)
psdir = $(docdir)
libdir = $(exec_prefix)/lib
localedir = $(datarootdir)/locale
mandir = $(datarootdir)/man
ridir = $(datarootdir)/$(RI_BASE_NAME)
sitedir = $(rubylibprefix)/site_ruby
vendordir = $(rubylibprefix)/vendor_ruby
rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
sitehdrdir = $(rubyhdrdir)/site_ruby
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
rubylibdir = $(rubylibprefix)/$(ruby_version)
archdir = $(rubylibdir)/$(arch)
sitelibdir = $(sitedir)/$(ruby_version)
sitearchdir = $(sitelibdir)/$(sitearch)
vendorlibdir = $(vendordir)/$(ruby_version)
vendorarchdir = $(vendorlibdir)/$(sitearch)

NULLCMD = :

CC = gcc-4.2
CXX = g++-4.2
LIBRUBY = $(LIBRUBY_SO)
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
OUTFLAG = -o
COUTFLAG = -o

RUBY_EXTCONF_H =
cflags = $(optflags) $(debugflags) $(warnflags)
optflags = -O3
debugflags = -ggdb
warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration
# Note: CFLAGS does not reference $(cflags), so the optimisation, debug
# and warning flags defined just above are not passed to the compiler.
CFLAGS = -fno-common -Wall -I/usr/local/WordNet-2.1/include/
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
DEFS =
CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
CXXFLAGS = $(CFLAGS) $(cxxflags)
ldflags = -L. -L/usr/local/lib
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
ARCH_FLAG =
DLDFLAGS = $(ldflags) $(dldflags)
LDSHARED = $(CC) -dynamic -bundle
LDSHAREDXX = $(CXX) -dynamic -bundle
AR = ar
EXEEXT =

RUBY_BASE_NAME = ruby
RUBY_INSTALL_NAME = ruby19
RUBY_SO_NAME = ruby.1.9.1
arch = x86_64-darwin11.0.0
sitearch = $(arch)
ruby_version = 1.9.1
ruby = /usr/local/bin/ruby19
RUBY = $(ruby)
RM = rm -f
RM_RF = $(RUBY) -run -e rm -- -rf
RMDIRS = rmdir -p
MAKEDIRS = mkdir -p
INSTALL = /usr/bin/install -c
INSTALL_PROG = $(INSTALL) -m 0755
INSTALL_DATA = $(INSTALL) -m 644
COPY = cp

#### End of system configuration section. ####

preload =

libpath = . $(libdir)
LIBPATH = -L. -L$(libdir)
DEFFILE =

CLEANFILES = mkmf.log
DISTCLEANFILES =
DISTCLEANDIRS =

extout =
extout_prefix =
target_prefix =
# WordNet is linked through LOCAL_LIBS; the Ruby runtime through LIBS.
LOCAL_LIBS = -L/usr/local/WordNet-2.1/lib -lwn
LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lobjc
SRCS = elemma.c
OBJS = elemma.o
TARGET = elemma
DLLIB = $(TARGET).bundle
EXTSTATIC =
STATIC_LIB =

BINDIR = $(bindir)
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)

TARGET_SO = $(DLLIB)
CLEANLIBS = $(TARGET).bundle
CLEANOBJS = *.o *.bak

all: $(DLLIB)
static: $(STATIC_LIB)
.PHONY: all install static install-so install-rb
.PHONY: clean clean-so clean-rb
# The remaining command-style targets are declared phony as well, so a
# stray file with the same name cannot stop them from running.
.PHONY: distclean distclean-so distclean-rb distclean-rb-default realclean
.PHONY: clean-rb-default install-rb-default pre-install-rb pre-install-rb-default
.PHONY: site-install site-install-so site-install-rb

clean-rb-default::
clean-rb::
clean-so::
clean: clean-so clean-rb-default clean-rb
	@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)

distclean-rb-default::
distclean-rb::
distclean-so::
distclean: clean distclean-so distclean-rb-default distclean-rb
	@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
	@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
	@-$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true

realclean: distclean
install: install-so install-rb

# Install the compiled bundle into the site arch directory.
install-so: $(RUBYARCHDIR)
install-so: $(RUBYARCHDIR)/$(DLLIB)
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
	@-$(MAKEDIRS) $(@D)
	$(INSTALL_PROG) $(DLLIB) $(@D)
install-rb: pre-install-rb install-rb-default
install-rb-default: pre-install-rb-default
pre-install-rb: Makefile
pre-install-rb-default: Makefile
pre-install-rb-default:
	$(ECHO) installing default elemma libraries
$(RUBYARCHDIR):
	$(Q) $(MAKEDIRS) $@

site-install: site-install-so site-install-rb
site-install-so: install-so
site-install-rb: install-rb

.SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o

# Suffix rules: C++-style sources are compiled with $(CXX), C and
# Objective-C sources with $(CC).
.cc.o:
	$(ECHO) compiling $(<)
	$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<

.mm.o:
	$(ECHO) compiling $(<)
	$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<

.cxx.o:
	$(ECHO) compiling $(<)
	$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<

.cpp.o:
	$(ECHO) compiling $(<)
	$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<

.C.o:
	$(ECHO) compiling $(<)
	$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<

.c.o:
	$(ECHO) compiling $(<)
	$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<

.m.o:
	$(ECHO) compiling $(<)
	$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<

# Link the extension; the old bundle is removed first.
$(DLLIB): $(OBJS) Makefile
	$(ECHO) linking shared-object $(DLLIB)
	@-$(RM) $(@)
	$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)



$(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
@@ -0,0 +1,68 @@
1
+ #include "wn.h"
2
+ #include "wnconsts.h"
3
+ #include "ruby.h"
4
+
5
+ /*
6
+
7
+ Copyright (C) 2004 UTIYAMA Masao <mutiyama@crl.go.jp>
8
+
9
+ This program is free software; you can redistribute it and/or modify
10
+ it under the terms of the GNU General Public License as published by
11
+ the Free Software Foundation; either version 2 of the License, or
12
+ (at your option) any later version.
13
+
14
+ This program is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ GNU General Public License for more details.
18
+
19
+ You should have received a copy of the GNU General Public License
20
+ along with this program; if not, write to the Free Software
21
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
+ */
23
+
24
+ static VALUE
25
+ parse(VALUE klass, VALUE rb_word, VALUE rb_pos)
26
+ {
27
+ char *word;
28
+ char *POS = STR2CSTR(rb_pos);
29
+ char *lemma;
30
+ int pos;
31
+ VALUE retval = rb_ary_new();
32
+
33
+ word = malloc(strlen(STR2CSTR(rb_word))+1);
34
+ if(!word){rb_raise(rb_eStandardError, "malloc failed.\n");}
35
+ strcpy(word, STR2CSTR(rb_word));
36
+
37
+ if(strcmp(POS,"noun")==0){pos = NOUN;}
38
+ else if(strcmp(POS,"verb")==0){pos = VERB;}
39
+ else if(strcmp(POS,"adj")==0){pos = ADJ;}
40
+ else if(strcmp(POS,"adv")==0){pos = ADV;}
41
+ else{
42
+ rb_raise(rb_eStandardError, "%s should be (noun|verb|adj|adv)\n", POS);
43
+ }
44
+ if(is_defined(word, pos)){
45
+ /*printf("* %s found as is.\n", word);*/
46
+ rb_ary_push(retval, rb_str_new2(word));
47
+ }
48
+ if((lemma=morphstr(word, pos))!=NULL){
49
+ do {
50
+ if(is_defined(lemma, pos)){
51
+ /*printf("* %s => %s found.\n", word, lemma);*/
52
+ rb_ary_push(retval, rb_str_new2(lemma));
53
+ }
54
+ } while((lemma=morphstr(NULL, pos))!=NULL);
55
+ }
56
+ free(word);
57
+ return retval;
58
+ }
59
+
60
+ void
61
+ Init_elemma()
62
+ {
63
+ VALUE mod = rb_define_module("ELemma");
64
+ rb_define_module_function(mod, "parse", parse, 2);
65
+ if(wninit()){
66
+ rb_raise(rb_eStandardError, "Cannot open WordNet database\n");
67
+ }
68
+ }
@@ -0,0 +1,6 @@
# Generates the Makefile for the `elemma` extension, which needs the
# WordNet headers (wn.h) and library (libwn).
require 'mkmf'

# Location of WordNet. The historical install paths stay the defaults,
# but they can now be overridden with --with-wordnet-dir,
# --with-wordnet-include or --with-wordnet-lib.
dir_config('wordnet',
           '/usr/local/WordNet-2.1/include',
           '/usr/local/WordNet-2.1/lib')

# Append to the flags mkmf already computed instead of replacing them,
# so the default optimisation/debug/warning flags are kept.
$CFLAGS = "#{$CFLAGS} -Wall".strip
$LOCAL_LIBS = "#{$LOCAL_LIBS} -lwn".strip

create_makefile("elemma")
@@ -0,0 +1,21 @@
module Treat
  module Inflectors
    module OrdinalWords
      # Ordinal form of numbers using the ruby 'linguistics' gem.
      class Linguistics
        silently { require 'linguistics' }

        # Convert a number entity to its ordinal form through the
        # Linguistics language module matching the entity's language.
        #
        # number  - object responding to #language and #to_s.
        # options - unused.
        #
        # Returns the value of the delegate's #ordinate call (as passed
        # back by `silently`).
        # Raises RuntimeError when ::Linguistics has no constant named
        # after the upcased language.
        def self.ordinal_words(number, options = {})
          language = number.language
          delegate = nil
          begin
            silently do
              delegate = ::Linguistics.const_get(language.to_s.upcase)
            end
          rescue NameError, RuntimeError
            # const_get signals a missing (or malformed) constant name
            # with NameError; RuntimeError is kept for compatibility.
            lang = Treat::Resources::Language.describe(language)
            raise "Ruby Linguistics does not have a module " +
                  "installed for the #{lang} language."
          end
          silently { delegate.ordinate(number.to_s) }
        end
      end
    end
  end
end
@@ -0,0 +1,158 @@
module Treat
  module Inflectors
    module Stemmers
      # Stem a word using a native Ruby implementation of the
      # Porter stemming algorithm, ported to Ruby from the
      # version coded up in Perl.
      #
      # Authored by Ray Pereda (raypereda@hotmail.com).
      #
      # Original paper: Porter, 1980. An algorithm for suffix stripping,
      # Program, Vol. 14, no. 3, pp 130-137,
      # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
      class Porter
        # Regexp fragments and the "measure" patterns built from them.
        C = "[^aeiou]"             # consonant
        V = "[aeiouy]"             # vowel
        CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
        VV = "#{V}(?>[aeiou]*)"    # vowel sequence
        MGR0 = /^(#{CC})?#{VV}#{CC}/              # [cc]vvcc... is m>0
        MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/     # [cc]vvcc[vv] is m=1
        MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/    # [cc]vvccvvcc... is m>1
        VOWEL_IN_STEM = /^(#{CC})?#{V}/           # vowel in stem
        # consonant-vowel-consonant stem whose last letter is not w, x or y
        CVC = /^#{CC}#{V}[^aeiouwxy]$/

        # Step 2 suffix replacements (suffix => replacement).
        STEP_2_LIST = {
          'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
          'izer'=>'ize', 'bli'=>'ble',
          'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
          'ization'=>'ize', 'ation'=>'ate',
          'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
          'ousness'=>'ous', 'aliti'=>'al',
          'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
        }
        # Step 3 suffix replacements (suffix => replacement).
        STEP_3_LIST = {
          'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
          'ical'=>'ic', 'ful'=>'', 'ness'=>''
        }
        # Matches any suffix that is a key of STEP_2_LIST.
        SUFFIX_1_REGEXP = /(
          ational | tional | enci | anci | izer | bli |
          alli | entli | eli | ousli | ization | ation |
          ator | alism | iveness | fulness | ousness | aliti |
          iviti | biliti | logi)$/x
        # Suffixes removed outright in step 4.
        SUFFIX_2_REGEXP = /(
          al | ance | ence | er | ic | able | ible | ant |
          ement | ment | ent | ou | ism | ate | iti | ous |
          ive | ize)$/x

        # Returns the stem of a word using a native Porter stemmer.
        #
        # word    - any object; its #to_s value is stemmed. The argument
        #           itself is never modified.
        # options - unused.
        #
        # Returns a String. Words shorter than three characters are
        # returned unchanged.
        def self.stem(word, options = {})
          # String#to_s returns the receiver itself, so duplicate it:
          # several steps below modify the working string in place.
          w = word.to_s.dup
          return w if w.length < 3
          # Map initial y to Y so that the patterns
          # never treat it as vowel.
          w[0] = 'Y' if w[0] == ?y
          # Step 1a
          if w =~ /(ss|i)es$/
            w = $` + $1
          elsif w =~ /([^s])s$/
            w = $` + $1
          end
          # Step 1b
          if w =~ /eed$/
            w.chop! if $` =~ MGR0
          elsif w =~ /(ed|ing)$/
            stem = $`
            if stem =~ VOWEL_IN_STEM
              w = stem
              case w
              when /(at|bl|iz)$/ then w << "e"
              when /([^aeiouylsz])\1$/ then w.chop!
              when CVC then w << "e"
              end
            end
          end
          # Terminal y becomes i when the stem contains a vowel.
          if w =~ /y$/
            stem = $`
            w = stem + "i" if stem =~ VOWEL_IN_STEM
          end
          # Step 2
          if w =~ SUFFIX_1_REGEXP
            stem = $`
            suffix = $1
            w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
          end
          # Step 3
          if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
            stem = $`
            suffix = $1
            w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
          end
          # Step 4
          if w =~ SUFFIX_2_REGEXP
            stem = $`
            w = stem if stem =~ MGR1
          elsif w =~ /(s|t)(ion)$/
            stem = $` + $1
            w = stem if stem =~ MGR1
          end
          # Step 5
          if w =~ /e$/
            stem = $`
            if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ CVC)
              w = stem
            end
          end
          w.chop! if w =~ /ll$/ && w =~ MGR1
          # and turn initial Y back to y
          w[0] = 'y' if w[0] == ?Y
          w
        end
      end
    end
  end
end