treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,18 @@
1
+ silently { require 'english' }
2
+
module Treat
  module Inflectors
    module Declensors
      # English declension that delegates to ::English.plural and
      # ::English.singular.
      module En
        # Inflect the string form of +entity+ for grammatical number.
        #
        # options[:count] selects the form: :plural or :singular.
        # Any other value (including a missing key) yields nil.
        def self.declense(entity, options)
          text = entity.to_s
          case options[:count]
          when :plural   then ::English.plural(text)
          when :singular then ::English.singular(text)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
module Treat
  module Inflectors
    module Declensors
      silently { require 'linguistics' }
      # Obtain word declensions in English using the
      # ruby 'linguistics' gem.
      class Linguistics
        # Return the plural form of +entity+ when options[:count] is
        # :plural; any other value of options[:count] yields nil.
        #
        # The language module is looked up as a constant of ::Linguistics
        # named after the upcased entity language. When the entity has a
        # category of :noun, :adjective or :verb, the matching
        # plural_<category> method is used; otherwise the generic
        # plural method is called.
        #
        # Raises RuntimeError when no Linguistics module exists for the
        # entity's language.
        def self.declense(entity, options = {})
          begin
            l = entity.language.to_s.upcase
            delegate = nil
            silently { delegate = ::Linguistics.const_get(l) }
          rescue RuntimeError, NameError => e
            # const_get signals a missing module with NameError, which the
            # previous "rescue RuntimeError" never caught. NoMethodError is
            # a NameError subclass, so let genuine programming errors through.
            raise if e.is_a?(NoMethodError)
            raise "Ruby Linguistics does not have a module " +
                  "installed for the #{entity.language} language."
          end
          string = entity.to_s
          if options[:count] == :plural
            if entity.has?(:category) &&
               [:noun, :adjective, :verb].include?(entity.category)
              silently { delegate.send(:"plural_#{entity.category}", string) }
            else
              silently { delegate.plural(string) }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
module Treat
  module Inflectors
    module Lemmatizers
      # Lemmatizer that delegates to the top-level ::ELemma module
      # provided by the compiled elemma extension.
      class ELemma
        # The extension sources ship in the e_lemma/ directory.
        silently { require 'treat/inflectors/lemmatizers/e_lemma/elemma' }
        # Return the result of ::ELemma.parse for the string form of
        # +entity+ and its tag. +options+ is unused.
        #
        # NOTE(review): the C side accepts only "noun", "verb", "adj" or
        # "adv" as the second argument; confirm that entity.tag yields
        # one of those strings.
        def self.lemma(entity, options = nil)
          # The original passed an undefined local named `word` here.
          ::ELemma::parse(entity.to_s, entity.tag)
        end
      end
    end
  end
end
@@ -0,0 +1,213 @@
1
+ # Build file for the "elemma" C extension. NOTE(review): the layout and the mkmf.log references suggest it was generated by Ruby's mkmf rather than written by hand - confirm before editing.
2
+ SHELL = /bin/sh
3
+ # Output control: Q expands to "@" when V=0 (hides command lines) and to nothing when V=1; ECHO expands to "@echo" when V=0 and to the no-op "@:" when V=1.
4
+ # Verbosity switch: V=0 is quiet, V=1 is verbose; no other value is supported.
5
+ V = 0
6
+ Q1 = $(V:1=)
7
+ Q = $(Q1:0=@)
8
+ n=$(NULLCMD)
9
+ ECHO1 = $(V:1=@$n)
10
+ ECHO = $(ECHO1:0=@echo)
11
+
12
+ #### Start of system configuration section. ####
13
+ # The values in this section are hard-coded for one installation: Ruby 1.9.1 headers under /usr/local, gcc-4.2 and arch x86_64-darwin11.0.0.
14
+ srcdir = .
15
+ topdir = /usr/local/include/ruby-1.9.1
16
+ hdrdir = /usr/local/include/ruby-1.9.1
17
+ arch_hdrdir = /usr/local/include/ruby-1.9.1/$(arch)
18
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
19
+ prefix = $(DESTDIR)/usr/local
20
+ exec_prefix = $(prefix)
21
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
22
+ bindir = $(exec_prefix)/bin
23
+ sbindir = $(exec_prefix)/sbin
24
+ libexecdir = $(exec_prefix)/libexec
25
+ datarootdir = $(prefix)/share
26
+ datadir = $(datarootdir)
27
+ sysconfdir = $(prefix)/etc
28
+ sharedstatedir = $(prefix)/com
29
+ localstatedir = $(prefix)/var
30
+ includedir = $(prefix)/include
31
+ oldincludedir = $(DESTDIR)/usr/include
32
+ docdir = $(datarootdir)/doc/$(PACKAGE)
33
+ infodir = $(datarootdir)/info
34
+ htmldir = $(docdir)
35
+ dvidir = $(docdir)
36
+ pdfdir = $(docdir)
37
+ psdir = $(docdir)
38
+ libdir = $(exec_prefix)/lib
39
+ localedir = $(datarootdir)/locale
40
+ mandir = $(datarootdir)/man
41
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
42
+ sitedir = $(rubylibprefix)/site_ruby
43
+ vendordir = $(rubylibprefix)/vendor_ruby
44
+ rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
45
+ sitehdrdir = $(rubyhdrdir)/site_ruby
46
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
47
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
48
+ archdir = $(rubylibdir)/$(arch)
49
+ sitelibdir = $(sitedir)/$(ruby_version)
50
+ sitearchdir = $(sitelibdir)/$(sitearch)
51
+ vendorlibdir = $(vendordir)/$(ruby_version)
52
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
53
+
54
+ NULLCMD = :
55
+
56
+ CC = gcc-4.2
57
+ CXX = g++-4.2
58
+ LIBRUBY = $(LIBRUBY_SO)
59
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
60
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
61
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
62
+ OUTFLAG = -o
63
+ COUTFLAG = -o
64
+ # Compiler and linker flags. NOTE(review): CFLAGS below is a literal value that does not reference $(cflags), so optflags/debugflags/warnflags are not applied to compiles by this file.
65
+ RUBY_EXTCONF_H =
66
+ cflags = $(optflags) $(debugflags) $(warnflags)
67
+ optflags = -O3
68
+ debugflags = -ggdb
69
+ warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration
70
+ CFLAGS = -fno-common -Wall -I/usr/local/WordNet-2.1/include/
71
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
72
+ DEFS =
73
+ CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
74
+ CXXFLAGS = $(CFLAGS) $(cxxflags)
75
+ ldflags = -L. -L/usr/local/lib
76
+ dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
77
+ ARCH_FLAG =
78
+ DLDFLAGS = $(ldflags) $(dldflags)
79
+ LDSHARED = $(CC) -dynamic -bundle
80
+ LDSHAREDXX = $(CXX) -dynamic -bundle
81
+ AR = ar
82
+ EXEEXT =
83
+
84
+ RUBY_BASE_NAME = ruby
85
+ RUBY_INSTALL_NAME = ruby19
86
+ RUBY_SO_NAME = ruby.1.9.1
87
+ arch = x86_64-darwin11.0.0
88
+ sitearch = $(arch)
89
+ ruby_version = 1.9.1
90
+ ruby = /usr/local/bin/ruby19
91
+ RUBY = $(ruby)
92
+ RM = rm -f
93
+ RM_RF = $(RUBY) -run -e rm -- -rf
94
+ RMDIRS = rmdir -p
95
+ MAKEDIRS = mkdir -p
96
+ INSTALL = /usr/bin/install -c
97
+ INSTALL_PROG = $(INSTALL) -m 0755
98
+ INSTALL_DATA = $(INSTALL) -m 644
99
+ COPY = cp
100
+
101
+ #### End of system configuration section. ####
102
+ # Extension-specific settings: search paths, files to clean, the source/object lists, the bundle name and the install directories.
103
+ preload =
104
+
105
+ libpath = . $(libdir)
106
+ LIBPATH = -L. -L$(libdir)
107
+ DEFFILE =
108
+
109
+ CLEANFILES = mkmf.log
110
+ DISTCLEANFILES =
111
+ DISTCLEANDIRS =
112
+ # LOCAL_LIBS links the extension against libwn from /usr/local/WordNet-2.1/lib; TARGET/DLLIB name the resulting elemma.bundle.
113
+ extout =
114
+ extout_prefix =
115
+ target_prefix =
116
+ LOCAL_LIBS = -L/usr/local/WordNet-2.1/lib -lwn
117
+ LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lobjc
118
+ SRCS = elemma.c
119
+ OBJS = elemma.o
120
+ TARGET = elemma
121
+ DLLIB = $(TARGET).bundle
122
+ EXTSTATIC =
123
+ STATIC_LIB =
124
+
125
+ BINDIR = $(bindir)
126
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
127
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
128
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
129
+ HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
130
+ ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
131
+
132
+ TARGET_SO = $(DLLIB)
133
+ CLEANLIBS = $(TARGET).bundle
134
+ CLEANOBJS = *.o *.bak
135
+ # Default target "all" builds the loadable bundle $(DLLIB); "static" depends on $(STATIC_LIB), which is empty in this file.
136
+ all: $(DLLIB)
137
+ static: $(STATIC_LIB)
138
+ .PHONY: all install static install-so install-rb
139
+ .PHONY: clean clean-so clean-rb
140
+ # Cleaning targets. The double-colon rules have no recipes here; "clean" removes the bundle, *.o, *.bak and mkmf.log, and "distclean" additionally removes the Makefile and configure leftovers.
141
+ clean-rb-default::
142
+ clean-rb::
143
+ clean-so::
144
+ clean: clean-so clean-rb-default clean-rb
145
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
146
+
147
+ distclean-rb-default::
148
+ distclean-rb::
149
+ distclean-so::
150
+ distclean: clean distclean-so distclean-rb-default distclean-rb
151
+ @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
152
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
153
+ @-$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
154
+
155
+ realclean: distclean
156
+ install: install-so install-rb
157
+ # Installation: the bundle is copied into $(RUBYARCHDIR) with mode 0755 (INSTALL_PROG), creating the directory first; the install-rb targets only print a message.
158
+ install-so: $(RUBYARCHDIR)
159
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
160
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
161
+ @-$(MAKEDIRS) $(@D)
162
+ $(INSTALL_PROG) $(DLLIB) $(@D)
163
+ install-rb: pre-install-rb install-rb-default
164
+ install-rb-default: pre-install-rb-default
165
+ pre-install-rb: Makefile
166
+ pre-install-rb-default: Makefile
167
+ pre-install-rb-default:
168
+ $(ECHO) installing default elemma libraries
169
+ $(RUBYARCHDIR):
170
+ $(Q) $(MAKEDIRS) $@
171
+
172
+ site-install: site-install-so site-install-rb
173
+ site-install-so: install-so
174
+ site-install-rb: install-rb
175
+
176
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
177
+ # Suffix rules: .cc, .mm, .cxx, .cpp and .C sources are compiled with $(CXX) and $(CXXFLAGS); .c and .m sources are compiled with $(CC) and $(CFLAGS).
178
+ .cc.o:
179
+ $(ECHO) compiling $(<)
180
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
181
+
182
+ .mm.o:
183
+ $(ECHO) compiling $(<)
184
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
185
+
186
+ .cxx.o:
187
+ $(ECHO) compiling $(<)
188
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
189
+
190
+ .cpp.o:
191
+ $(ECHO) compiling $(<)
192
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
193
+
194
+ .C.o:
195
+ $(ECHO) compiling $(<)
196
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
197
+
198
+ .c.o:
199
+ $(ECHO) compiling $(<)
200
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
201
+
202
+ .m.o:
203
+ $(ECHO) compiling $(<)
204
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
205
+ # Link step: removes any existing bundle, then links $(OBJS) with $(LDSHARED) against $(LOCAL_LIBS) and $(LIBS).
206
+ $(DLLIB): $(OBJS) Makefile
207
+ $(ECHO) linking shared-object $(DLLIB)
208
+ @-$(RM) $(@)
209
+ $(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
210
+
211
+
212
+ # Objects are rebuilt whenever one of the listed Ruby headers changes.
213
+ $(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
@@ -0,0 +1,68 @@
1
+ #include "wn.h"
2
+ #include "wnconsts.h"
3
+ #include "ruby.h"
4
+
5
+ /*
6
+
7
+ Copyright (C) 2004 UTIYAMA Masao <mutiyama@crl.go.jp>
8
+
9
+ This program is free software; you can redistribute it and/or modify
10
+ it under the terms of the GNU General Public License as published by
11
+ the Free Software Foundation; either version 2 of the License, or
12
+ (at your option) any later version.
13
+
14
+ This program is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ GNU General Public License for more details.
18
+
19
+ You should have received a copy of the GNU General Public License
20
+ along with this program; if not, write to the Free Software
21
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
+ */
23
+
24
+ static VALUE
25
+ parse(VALUE klass, VALUE rb_word, VALUE rb_pos)
26
+ {
27
+ char *word;
28
+ char *POS = STR2CSTR(rb_pos);
29
+ char *lemma;
30
+ int pos;
31
+ VALUE retval = rb_ary_new();
32
+
33
+ word = malloc(strlen(STR2CSTR(rb_word))+1);
34
+ if(!word){rb_raise(rb_eStandardError, "malloc failed.\n");}
35
+ strcpy(word, STR2CSTR(rb_word));
36
+
37
+ if(strcmp(POS,"noun")==0){pos = NOUN;}
38
+ else if(strcmp(POS,"verb")==0){pos = VERB;}
39
+ else if(strcmp(POS,"adj")==0){pos = ADJ;}
40
+ else if(strcmp(POS,"adv")==0){pos = ADV;}
41
+ else{
42
+ rb_raise(rb_eStandardError, "%s should be (noun|verb|adj|adv)\n", POS);
43
+ }
44
+ if(is_defined(word, pos)){
45
+ /*printf("* %s found as is.\n", word);*/
46
+ rb_ary_push(retval, rb_str_new2(word));
47
+ }
48
+ if((lemma=morphstr(word, pos))!=NULL){
49
+ do {
50
+ if(is_defined(lemma, pos)){
51
+ /*printf("* %s => %s found.\n", word, lemma);*/
52
+ rb_ary_push(retval, rb_str_new2(lemma));
53
+ }
54
+ } while((lemma=morphstr(NULL, pos))!=NULL);
55
+ }
56
+ free(word);
57
+ return retval;
58
+ }
59
+
60
+ void
61
+ Init_elemma()
62
+ {
63
+ VALUE mod = rb_define_module("ELemma");
64
+ rb_define_module_function(mod, "parse", parse, 2);
65
+ if(wninit()){
66
+ rb_raise(rb_eStandardError, "Cannot open WordNet database\n");
67
+ }
68
+ }
@@ -0,0 +1,6 @@
# Build configuration for the elemma C extension, which needs the
# WordNet header wn.h and library libwn.
require 'mkmf'

# WordNet location. The defaults match the previously hard-coded paths;
# override them with --with-wordnet-dir=PATH, or with
# --with-wordnet-include=PATH and --with-wordnet-lib=PATH.
dir_config('wordnet',
           '/usr/local/WordNet-2.1/include',
           '/usr/local/WordNet-2.1/lib')

# Append to the flags Ruby was configured with instead of replacing
# them, so the interpreter's own compiler settings are kept.
$CFLAGS << ' -Wall'

# Stop at configure time with a clear message rather than failing
# later in the compile or link step.
unless have_header('wn.h') && have_library('wn', 'wninit')
  abort 'WordNet (wn.h / libwn) was not found; ' \
        'pass --with-wordnet-dir=PATH to point at your installation.'
end

create_makefile('elemma')
@@ -0,0 +1,21 @@
module Treat
  module Inflectors
    module OrdinalWords
      # Spell out ordinals through the ruby 'linguistics' gem.
      class Linguistics
        silently { require 'linguistics' }
        # Return the result of the language module's ordinate method for
        # the string form of +number+. +options+ is unused.
        #
        # The language module is looked up as a constant of ::Linguistics
        # named after the upcased language of +number+.
        #
        # Raises RuntimeError when no Linguistics module exists for that
        # language.
        def self.ordinal_words(number, options = {})
          begin
            l = number.language.to_s.upcase
            delegate = nil
            silently { delegate = ::Linguistics.const_get(l) }
          rescue RuntimeError, NameError => e
            # const_get signals a missing module with NameError, which the
            # previous "rescue RuntimeError" never caught. NoMethodError is
            # a NameError subclass, so let genuine programming errors through.
            raise if e.is_a?(NoMethodError)
            # NOTE(review): the gem ships resources/languages.rb; confirm
            # that Treat::Resources::Language is the right constant name.
            lang = Treat::Resources::Language.describe(number.language)
            raise "Ruby Linguistics does not have a module " +
                  "installed for the #{lang} language."
          end
          silently { delegate.ordinate(number.to_s) }
        end
      end
    end
  end
end
@@ -0,0 +1,158 @@
module Treat
  module Inflectors
    module Stemmers
      # Stem a word using a native Ruby implementation of the
      # Porter stemming algorithm, ported to Ruby from the
      # version coded up in Perl.
      #
      # Authored by Ray Pereda (raypereda@hotmail.com).
      #
      # Original paper: Porter, 1980. An algorithm for suffix stripping,
      # Program, Vol. 14, no. 3, pp 130-137,
      # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
      class Porter
        # Returns the stem of a word using a native Porter stemmer.
        #
        # word    - any object responding to #to_s; it is never modified.
        # options - unused.
        #
        # Words shorter than three characters are returned unchanged.
        def self.stem(word, options = {})
          # Work on a private copy: String#to_s returns the receiver
          # itself, and several steps below edit the buffer in place.
          w = word.to_s.dup
          return w if w.length < 3
          # Map initial y to Y so that the patterns
          # never treat it as vowel.
          w[0] = 'Y' if w[0] == ?y
          # Step 1a
          if w =~ /(ss|i)es$/
            w = $` + $1
          elsif w =~ /([^s])s$/
            w = $` + $1
          end
          # Step 1b
          if w =~ /eed$/
            w.chop! if $` =~ MGR0
          elsif w =~ /(ed|ing)$/
            stem = $`
            if stem =~ VOWEL_IN_STEM
              w = stem
              case w
              when /(at|bl|iz)$/ then w << "e"
              when /([^aeiouylsz])\1$/ then w.chop!
              when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
              end
            end
          end
          # Step 1c: a final y becomes i when the stem contains a vowel.
          if w =~ /y$/
            stem = $`
            w = stem + "i" if stem =~ VOWEL_IN_STEM
          end
          # Step 2
          if w =~ SUFFIX_1_REGEXP
            stem = $`
            suffix = $1
            if stem =~ MGR0
              w = stem + STEP_2_LIST[suffix]
            end
          end
          # Step 3
          if w =~
            /(icate|ative|alize|iciti|ical|ful|ness)$/
            stem = $`
            suffix = $1
            if stem =~ MGR0
              w = stem + STEP_3_LIST[suffix]
            end
          end
          # Step 4
          if w =~ SUFFIX_2_REGEXP
            stem = $`
            if stem =~ MGR1
              w = stem
            end
          elsif w =~ /(s|t)(ion)$/
            stem = $` + $1
            if stem =~ MGR1
              w = stem
            end
          end
          # Step 5
          if w =~ /e$/
            stem = $`
            if (stem =~ MGR1) ||
               (stem =~ MEQ1 && stem !~
                /^#{CC}#{V}[^aeiouwxy]$/o)
              w = stem
            end
          end
          if w =~ /ll$/ && w =~ MGR1
            w.chop!
          end
          # and turn initial Y back to y
          w[0] = 'y' if w[0] == ?Y
          w
        end

        # Step 2 suffix => replacement. The keys must stay in sync with
        # the alternatives of SUFFIX_1_REGEXP. 'aliti' and 'biliti' are
        # the suffixes defined by the algorithm; they had been corrupted
        # to 'anati' and 'binati'.
        STEP_2_LIST = {
          'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
          'izer'=>'ize', 'bli'=>'ble',
          'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
          'ization'=>'ize', 'ation'=>'ate',
          'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
          'ousness'=>'ous', 'aliti'=>'al',
          'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
        }
        # Step 3 suffix => replacement.
        STEP_3_LIST = {
          'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
          'ical'=>'ic', 'ful'=>'', 'ness'=>''
        }
        # Suffixes handled by step 2 (extended syntax: whitespace ignored).
        SUFFIX_1_REGEXP = /(
          ational |
          tional |
          enci |
          anci |
          izer |
          bli |
          alli |
          entli |
          eli |
          ousli |
          ization |
          ation |
          ator |
          alism |
          iveness |
          fulness |
          ousness |
          aliti |
          iviti |
          biliti |
          logi)$/x
        # Suffixes removed by step 4 (extended syntax: whitespace ignored).
        SUFFIX_2_REGEXP = /(
          al |
          ance |
          ence |
          er |
          ic |
          able |
          ible |
          ant |
          ement |
          ment |
          ent |
          ou |
          ism |
          ate |
          iti |
          ous |
          ive |
          ize)$/x
        C = "[^aeiou]" # consonant
        V = "[aeiouy]" # vowel
        CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
        VV = "#{V}(?>[aeiou]*)" # vowel sequence
        MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
        MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
        MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
        VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
      end
    end
  end
end