treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
silently { require 'english' }
|
2
|
+
|
3
|
+
module Treat
  module Inflectors
    module Declensors
      # Declension of English words, delegating to the ::English
      # module for the actual pluralization/singularization.
      module En
        # Return the declined form of an entity's string value.
        #
        # entity  - any object; its #to_s value is the word to decline.
        # options - Hash; :count selects the target form
        #           (:plural or :singular). Defaults to {} so the
        #           signature matches the other declensors.
        #
        # Returns the result of ::English.plural or ::English.singular,
        # or nil when options[:count] is neither :plural nor :singular.
        def self.declense(entity, options = {})
          string = entity.to_s
          case options[:count]
          when :plural then ::English.plural(string)
          when :singular then ::English.singular(string)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Treat
  module Inflectors
    module Declensors
      silently { require 'linguistics' }
      # Obtain word declensions in English using the
      # ruby 'linguistics' gem.
      class Linguistics
        # Return the plural form of an entity's string value.
        #
        # entity  - object responding to #language, #to_s, #has? and
        #           #category.
        # options - Hash; only :count => :plural is handled.
        #
        # The language module is looked up as a constant of
        # ::Linguistics named after the upcased entity language.
        # When the entity has a :category feature that is :noun,
        # :adjective or :verb, the matching plural_<category> method
        # is called on that module; otherwise its generic #plural is.
        #
        # Returns the value of the pluralization call, or nil when
        # options[:count] is not :plural.
        # Raises RuntimeError when no language module can be found.
        def self.declense(entity, options = {})
          begin
            l = entity.language.to_s.upcase
            delegate = nil
            silently { delegate = ::Linguistics.const_get(l) }
          # Module#const_get signals a missing constant with NameError,
          # which is not a RuntimeError; rescue both so the descriptive
          # message below is actually reachable.
          rescue NameError, RuntimeError
            raise "Ruby Linguistics does not have a module " +
            " installed for the #{entity.language} language."
          end
          string = entity.to_s
          if options[:count] == :plural
            if entity.has?(:category) &&
              [:noun, :adjective, :verb].include?(entity.category)
              silently { delegate.send(:"plural_#{entity.category}", string) }
            else
              silently { delegate.plural(string) }
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,213 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
Q1 = $(V:1=)
|
7
|
+
Q = $(Q1:0=@)
|
8
|
+
n=$(NULLCMD)
|
9
|
+
ECHO1 = $(V:1=@$n)
|
10
|
+
ECHO = $(ECHO1:0=@echo)
|
11
|
+
|
12
|
+
#### Start of system configuration section. ####
|
13
|
+
|
14
|
+
srcdir = .
|
15
|
+
topdir = /usr/local/include/ruby-1.9.1
|
16
|
+
hdrdir = /usr/local/include/ruby-1.9.1
|
17
|
+
arch_hdrdir = /usr/local/include/ruby-1.9.1/$(arch)
|
18
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
19
|
+
prefix = $(DESTDIR)/usr/local
|
20
|
+
exec_prefix = $(prefix)
|
21
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
22
|
+
bindir = $(exec_prefix)/bin
|
23
|
+
sbindir = $(exec_prefix)/sbin
|
24
|
+
libexecdir = $(exec_prefix)/libexec
|
25
|
+
datarootdir = $(prefix)/share
|
26
|
+
datadir = $(datarootdir)
|
27
|
+
sysconfdir = $(prefix)/etc
|
28
|
+
sharedstatedir = $(prefix)/com
|
29
|
+
localstatedir = $(prefix)/var
|
30
|
+
includedir = $(prefix)/include
|
31
|
+
oldincludedir = $(DESTDIR)/usr/include
|
32
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
33
|
+
infodir = $(datarootdir)/info
|
34
|
+
htmldir = $(docdir)
|
35
|
+
dvidir = $(docdir)
|
36
|
+
pdfdir = $(docdir)
|
37
|
+
psdir = $(docdir)
|
38
|
+
libdir = $(exec_prefix)/lib
|
39
|
+
localedir = $(datarootdir)/locale
|
40
|
+
mandir = $(datarootdir)/man
|
41
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
42
|
+
sitedir = $(rubylibprefix)/site_ruby
|
43
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
44
|
+
rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
|
45
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
46
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
47
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
48
|
+
archdir = $(rubylibdir)/$(arch)
|
49
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
50
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
51
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
52
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
53
|
+
|
54
|
+
NULLCMD = :
|
55
|
+
|
56
|
+
CC = gcc-4.2
|
57
|
+
CXX = g++-4.2
|
58
|
+
LIBRUBY = $(LIBRUBY_SO)
|
59
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
60
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
61
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
|
62
|
+
OUTFLAG = -o
|
63
|
+
COUTFLAG = -o
|
64
|
+
|
65
|
+
RUBY_EXTCONF_H =
|
66
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
67
|
+
optflags = -O3
|
68
|
+
debugflags = -ggdb
|
69
|
+
warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Wpointer-arith -Wwrite-strings -Wdeclaration-after-statement -Wshorten-64-to-32 -Wimplicit-function-declaration
|
70
|
+
CFLAGS = -fno-common -Wall -I/usr/local/WordNet-2.1/include/
|
71
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
72
|
+
DEFS =
|
73
|
+
CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
|
74
|
+
CXXFLAGS = $(CFLAGS) $(cxxflags)
|
75
|
+
ldflags = -L. -L/usr/local/lib
|
76
|
+
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
|
77
|
+
ARCH_FLAG =
|
78
|
+
DLDFLAGS = $(ldflags) $(dldflags)
|
79
|
+
LDSHARED = $(CC) -dynamic -bundle
|
80
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
81
|
+
AR = ar
|
82
|
+
EXEEXT =
|
83
|
+
|
84
|
+
RUBY_BASE_NAME = ruby
|
85
|
+
RUBY_INSTALL_NAME = ruby19
|
86
|
+
RUBY_SO_NAME = ruby.1.9.1
|
87
|
+
arch = x86_64-darwin11.0.0
|
88
|
+
sitearch = $(arch)
|
89
|
+
ruby_version = 1.9.1
|
90
|
+
ruby = /usr/local/bin/ruby19
|
91
|
+
RUBY = $(ruby)
|
92
|
+
RM = rm -f
|
93
|
+
RM_RF = $(RUBY) -run -e rm -- -rf
|
94
|
+
RMDIRS = rmdir -p
|
95
|
+
MAKEDIRS = mkdir -p
|
96
|
+
INSTALL = /usr/bin/install -c
|
97
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
98
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
99
|
+
COPY = cp
|
100
|
+
|
101
|
+
#### End of system configuration section. ####
|
102
|
+
|
103
|
+
preload =
|
104
|
+
|
105
|
+
libpath = . $(libdir)
|
106
|
+
LIBPATH = -L. -L$(libdir)
|
107
|
+
DEFFILE =
|
108
|
+
|
109
|
+
CLEANFILES = mkmf.log
|
110
|
+
DISTCLEANFILES =
|
111
|
+
DISTCLEANDIRS =
|
112
|
+
|
113
|
+
extout =
|
114
|
+
extout_prefix =
|
115
|
+
target_prefix =
|
116
|
+
LOCAL_LIBS = -L/usr/local/WordNet-2.1/lib -lwn
|
117
|
+
LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lobjc
|
118
|
+
SRCS = elemma.c
|
119
|
+
OBJS = elemma.o
|
120
|
+
TARGET = elemma
|
121
|
+
DLLIB = $(TARGET).bundle
|
122
|
+
EXTSTATIC =
|
123
|
+
STATIC_LIB =
|
124
|
+
|
125
|
+
BINDIR = $(bindir)
|
126
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
127
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
128
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
129
|
+
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
130
|
+
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
131
|
+
|
132
|
+
TARGET_SO = $(DLLIB)
|
133
|
+
CLEANLIBS = $(TARGET).bundle
|
134
|
+
CLEANOBJS = *.o *.bak
|
135
|
+
|
136
|
+
all: $(DLLIB)
|
137
|
+
static: $(STATIC_LIB)
|
138
|
+
.PHONY: all install static install-so install-rb
|
139
|
+
.PHONY: clean clean-so clean-rb
|
140
|
+
|
141
|
+
clean-rb-default::
|
142
|
+
clean-rb::
|
143
|
+
clean-so::
|
144
|
+
clean: clean-so clean-rb-default clean-rb
|
145
|
+
@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
|
146
|
+
|
147
|
+
distclean-rb-default::
|
148
|
+
distclean-rb::
|
149
|
+
distclean-so::
|
150
|
+
distclean: clean distclean-so distclean-rb-default distclean-rb
|
151
|
+
@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
152
|
+
@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
153
|
+
@-$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
154
|
+
|
155
|
+
realclean: distclean
|
156
|
+
install: install-so install-rb
|
157
|
+
|
158
|
+
install-so: $(RUBYARCHDIR)
|
159
|
+
install-so: $(RUBYARCHDIR)/$(DLLIB)
|
160
|
+
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
|
161
|
+
@-$(MAKEDIRS) $(@D)
|
162
|
+
$(INSTALL_PROG) $(DLLIB) $(@D)
|
163
|
+
install-rb: pre-install-rb install-rb-default
|
164
|
+
install-rb-default: pre-install-rb-default
|
165
|
+
pre-install-rb: Makefile
|
166
|
+
pre-install-rb-default: Makefile
|
167
|
+
pre-install-rb-default:
|
168
|
+
$(ECHO) installing default elemma libraries
|
169
|
+
$(RUBYARCHDIR):
|
170
|
+
$(Q) $(MAKEDIRS) $@
|
171
|
+
|
172
|
+
site-install: site-install-so site-install-rb
|
173
|
+
site-install-so: install-so
|
174
|
+
site-install-rb: install-rb
|
175
|
+
|
176
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .C .o
|
177
|
+
|
178
|
+
.cc.o:
|
179
|
+
$(ECHO) compiling $(<)
|
180
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
181
|
+
|
182
|
+
.mm.o:
|
183
|
+
$(ECHO) compiling $(<)
|
184
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
185
|
+
|
186
|
+
.cxx.o:
|
187
|
+
$(ECHO) compiling $(<)
|
188
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
189
|
+
|
190
|
+
.cpp.o:
|
191
|
+
$(ECHO) compiling $(<)
|
192
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
193
|
+
|
194
|
+
.C.o:
|
195
|
+
$(ECHO) compiling $(<)
|
196
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
|
197
|
+
|
198
|
+
.c.o:
|
199
|
+
$(ECHO) compiling $(<)
|
200
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
201
|
+
|
202
|
+
.m.o:
|
203
|
+
$(ECHO) compiling $(<)
|
204
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
|
205
|
+
|
206
|
+
$(DLLIB): $(OBJS) Makefile
|
207
|
+
$(ECHO) linking shared-object $(DLLIB)
|
208
|
+
@-$(RM) $(@)
|
209
|
+
$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
210
|
+
|
211
|
+
|
212
|
+
|
213
|
+
$(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#include "wn.h"
|
2
|
+
#include "wnconsts.h"
|
3
|
+
#include "ruby.h"
|
4
|
+
|
5
|
+
/*
|
6
|
+
|
7
|
+
Copyright (C) 2004 UTIYAMA Masao <mutiyama@crl.go.jp>
|
8
|
+
|
9
|
+
This program is free software; you can redistribute it and/or modify
|
10
|
+
it under the terms of the GNU General Public License as published by
|
11
|
+
the Free Software Foundation; either version 2 of the License, or
|
12
|
+
(at your option) any later version.
|
13
|
+
|
14
|
+
This program is distributed in the hope that it will be useful,
|
15
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
GNU General Public License for more details.
|
18
|
+
|
19
|
+
You should have received a copy of the GNU General Public License
|
20
|
+
along with this program; if not, write to the Free Software
|
21
|
+
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
22
|
+
*/
|
23
|
+
|
24
|
+
static VALUE
|
25
|
+
parse(VALUE klass, VALUE rb_word, VALUE rb_pos)
|
26
|
+
{
|
27
|
+
char *word;
|
28
|
+
char *POS = STR2CSTR(rb_pos);
|
29
|
+
char *lemma;
|
30
|
+
int pos;
|
31
|
+
VALUE retval = rb_ary_new();
|
32
|
+
|
33
|
+
word = malloc(strlen(STR2CSTR(rb_word))+1);
|
34
|
+
if(!word){rb_raise(rb_eStandardError, "malloc failed.\n");}
|
35
|
+
strcpy(word, STR2CSTR(rb_word));
|
36
|
+
|
37
|
+
if(strcmp(POS,"noun")==0){pos = NOUN;}
|
38
|
+
else if(strcmp(POS,"verb")==0){pos = VERB;}
|
39
|
+
else if(strcmp(POS,"adj")==0){pos = ADJ;}
|
40
|
+
else if(strcmp(POS,"adv")==0){pos = ADV;}
|
41
|
+
else{
|
42
|
+
rb_raise(rb_eStandardError, "%s should be (noun|verb|adj|adv)\n", POS);
|
43
|
+
}
|
44
|
+
if(is_defined(word, pos)){
|
45
|
+
/*printf("* %s found as is.\n", word);*/
|
46
|
+
rb_ary_push(retval, rb_str_new2(word));
|
47
|
+
}
|
48
|
+
if((lemma=morphstr(word, pos))!=NULL){
|
49
|
+
do {
|
50
|
+
if(is_defined(lemma, pos)){
|
51
|
+
/*printf("* %s => %s found.\n", word, lemma);*/
|
52
|
+
rb_ary_push(retval, rb_str_new2(lemma));
|
53
|
+
}
|
54
|
+
} while((lemma=morphstr(NULL, pos))!=NULL);
|
55
|
+
}
|
56
|
+
free(word);
|
57
|
+
return retval;
|
58
|
+
}
|
59
|
+
|
60
|
+
void
|
61
|
+
Init_elemma()
|
62
|
+
{
|
63
|
+
VALUE mod = rb_define_module("ELemma");
|
64
|
+
rb_define_module_function(mod, "parse", parse, 2);
|
65
|
+
if(wninit()){
|
66
|
+
rb_raise(rb_eStandardError, "Cannot open WordNet database\n");
|
67
|
+
}
|
68
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Treat
  module Inflectors
    module OrdinalWords
      # Obtain the ordinal form of a number through the ruby
      # 'linguistics' gem.
      class Linguistics
        silently { require 'linguistics' }
        # Return the ordinal form of a number entity.
        #
        # number  - object responding to #language and #to_s.
        # options - Hash; not read by this method.
        #
        # The language module is looked up as a constant of
        # ::Linguistics named after the upcased number language, and
        # its #ordinate method is called with number.to_s.
        #
        # Raises RuntimeError when no language module can be found.
        def self.ordinal_words(number, options = {})
          begin
            l = number.language.to_s.upcase
            delegate = nil
            silently { delegate = ::Linguistics.const_get(l) }
          # Module#const_get signals a missing constant with NameError,
          # which is not a RuntimeError; rescue both so the descriptive
          # message below is actually reachable.
          rescue NameError, RuntimeError
            # NOTE(review): the resources directory ships a languages.rb
            # file; confirm that the constant is really
            # Treat::Resources::Language and not ...::Languages.
            lang = Treat::Resources::Language.describe(number.language)
            raise "Ruby Linguistics does not have a module " +
            " installed for the #{lang} language."
          end
          silently { delegate.ordinate(number.to_s) }
        end
      end
    end
  end
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
module Treat
  module Inflectors
    module Stemmers
      # Stem a word using a native Ruby implementation of the
      # Porter stemming algorithm, ported to Ruby from the
      # version coded up in Perl.
      #
      # Authored by Ray Pereda (raypereda@hotmail.com).
      #
      # Original paper: Porter, 1980. An algorithm for suffix stripping,
      # Program, Vol. 14, no. 3, pp 130-137,
      # Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
      class Porter
        # Returns the stem of a word using a native Porter stemmer.
        #
        # word    - any object; its #to_s value is stemmed. The
        #           caller's object is never modified.
        # options - Hash; none are read.
        #
        # Returns a new String. Words shorter than three characters
        # are returned unchanged.
        def self.stem(word, options = {})
          # Work on a private copy: String#to_s returns the receiver
          # itself, and the steps below modify the string in place.
          w = word.to_s.dup
          return w if w.length < 3
          # Map initial y to Y so that the patterns
          # never treat it as vowel.
          w[0] = 'Y' if w[0] == ?y
          # Step 1a
          if w =~ /(ss|i)es$/
            w = $` + $1
          elsif w =~ /([^s])s$/
            w = $` + $1
          end
          # Step 1b
          if w =~ /eed$/
            w.chop! if $` =~ MGR0
          elsif w =~ /(ed|ing)$/
            stem = $`
            if stem =~ VOWEL_IN_STEM
              w = stem
              case w
              when /(at|bl|iz)$/ then w << "e"
              when /([^aeiouylsz])\1$/ then w.chop!
              when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
              end
            end
          end
          # Step 1c: terminal y becomes i when the stem has a vowel.
          if w =~ /y$/
            stem = $`
            w = stem + "i" if stem =~ VOWEL_IN_STEM
          end
          # Step 2
          if w =~ SUFFIX_1_REGEXP
            stem = $`
            suffix = $1
            if stem =~ MGR0
              w = stem + STEP_2_LIST[suffix]
            end
          end
          # Step 3
          if w =~
            /(icate|ative|alize|iciti|ical|ful|ness)$/
            stem = $`
            suffix = $1
            if stem =~ MGR0
              w = stem + STEP_3_LIST[suffix]
            end
          end
          # Step 4
          if w =~ SUFFIX_2_REGEXP
            stem = $`
            if stem =~ MGR1
              w = stem
            end
          elsif w =~ /(s|t)(ion)$/
            stem = $` + $1
            if stem =~ MGR1
              w = stem
            end
          end
          # Step 5
          if w =~ /e$/
            stem = $`
            if (stem =~ MGR1) ||
              (stem =~ MEQ1 && stem !~
              /^#{CC}#{V}[^aeiouwxy]$/o)
              w = stem
            end
          end
          if w =~ /ll$/ && w =~ MGR1
            w.chop!
          end
          # and turn initial Y back to y
          w[0] = 'y' if w[0] == ?Y
          w
        end

        # Step 2 suffix => replacement. Terminal y has already been
        # turned into i by step 1c, hence 'aliti', 'biliti', etc.
        STEP_2_LIST = {
          'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
          'izer'=>'ize', 'bli'=>'ble',
          'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
          'ization'=>'ize', 'ation'=>'ate',
          'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
          'ousness'=>'ous', 'aliti'=>'al',
          'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
        }
        # Step 3 suffix => replacement.
        STEP_3_LIST = {
          'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
          'ical'=>'ic', 'ful'=>'', 'ness'=>''
        }
        # Matches any step 2 suffix; must list exactly the keys
        # of STEP_2_LIST.
        SUFFIX_1_REGEXP = /(
          ational |
          tional |
          enci |
          anci |
          izer |
          bli |
          alli |
          entli |
          eli |
          ousli |
          ization |
          ation |
          ator |
          alism |
          iveness |
          fulness |
          ousness |
          aliti |
          iviti |
          biliti |
          logi)$/x
        # Matches the suffixes removed by step 4.
        SUFFIX_2_REGEXP = /(
          al |
          ance |
          ence |
          er |
          ic |
          able |
          ible |
          ant |
          ement |
          ment |
          ent |
          ou |
          ism |
          ate |
          iti |
          ous |
          ive |
          ize)$/x
        C = "[^aeiou]" # consonant
        V = "[aeiouy]" # vowel
        CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
        VV = "#{V}(?>[aeiou]*)" # vowel sequence
        MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
        MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
        MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
        VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
      end
    end
  end
end
|