thera 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,56 @@
1
+ *.bundle
2
+ *.o
3
+ *.a
4
+ output
5
+ Makefile
6
+ obj/
7
+ mkmf.log
8
+
9
+ # rcov generated
10
+ coverage
11
+
12
+ # rdoc generated
13
+ rdoc
14
+
15
+ # yard generated
16
+ doc
17
+ .yardoc
18
+
19
+ # bundler
20
+ .bundle
21
+
22
+ # jeweler generated
23
+ pkg
24
+
25
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
26
+ #
27
+ # * Create a file at ~/.gitignore
28
+ # * Include files you want ignored
29
+ # * Run: git config --global core.excludesfile ~/.gitignore
30
+ #
31
+ # After doing this, these files will be ignored in all your git projects,
32
+ # saving you from having to 'pollute' every project you touch with them
33
+ #
34
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line)
35
+ #
36
+ # For MacOS:
37
+ #
38
+ #.DS_Store
39
+
40
+ # For TextMate
41
+ #*.tmproj
42
+ #tmtags
43
+
44
+ # For emacs:
45
+ #*~
46
+ #\#*
47
+ #.\#*
48
+
49
+ # For vim:
50
+ #*.swp
51
+
52
+ # For redcar:
53
+ #.redcar
54
+
55
+ # For rubinius:
56
+ #*.rbc
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'http://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,20 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.6.4)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.9.2)
10
+ rcov (0.9.10)
11
+ shoulda (2.11.3)
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ bundler (~> 1.0.0)
18
+ jeweler (~> 1.6.4)
19
+ rcov
20
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1 @@
1
+ Public Domain
data/README.rdoc ADDED
@@ -0,0 +1,8 @@
1
+ = thera
2
+
3
+ Ruby Data Mining Library
4
+
5
+
6
+ == Copyright
7
+
8
+ None, this library is in the Public Domain.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
data/ext/Makefile ADDED
@@ -0,0 +1,225 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = quarry
7
+ topdir = /Users/will/.rvm/rubies/ruby-1.9.2-p290/include/ruby-1.9.1
8
+ hdrdir = /Users/will/.rvm/rubies/ruby-1.9.2-p290/include/ruby-1.9.1
9
+ arch_hdrdir = /Users/will/.rvm/rubies/ruby-1.9.2-p290/include/ruby-1.9.1/$(arch)
10
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
11
+
12
+ prefix = $(DESTDIR)/Users/will/.rvm/rubies/ruby-1.9.2-p290
13
+
14
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
15
+
16
+ exec_prefix = $(prefix)
17
+
18
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
19
+
20
+ sitehdrdir = $(rubyhdrdir)/site_ruby
21
+
22
+ rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
23
+
24
+ vendordir = $(rubylibprefix)/vendor_ruby
25
+
26
+ sitedir = $(rubylibprefix)/site_ruby
27
+
28
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
29
+
30
+ mandir = $(datarootdir)/man
31
+
32
+ localedir = $(datarootdir)/locale
33
+
34
+ libdir = $(exec_prefix)/lib
35
+
36
+ psdir = $(docdir)
37
+
38
+ pdfdir = $(docdir)
39
+
40
+ dvidir = $(docdir)
41
+
42
+ htmldir = $(docdir)
43
+
44
+ infodir = $(datarootdir)/info
45
+
46
+ docdir = $(datarootdir)/doc/$(PACKAGE)
47
+
48
+ oldincludedir = $(DESTDIR)/usr/include
49
+
50
+ includedir = $(prefix)/include
51
+
52
+ localstatedir = $(prefix)/var
53
+
54
+ sharedstatedir = $(prefix)/com
55
+
56
+ sysconfdir = $(prefix)/etc
57
+
58
+ datadir = $(datarootdir)
59
+
60
+ datarootdir = $(prefix)/share
61
+
62
+ libexecdir = $(exec_prefix)/libexec
63
+
64
+ sbindir = $(exec_prefix)/sbin
65
+
66
+ bindir = $(exec_prefix)/bin
67
+
68
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
69
+
70
+ archdir = $(rubylibdir)/$(arch)
71
+
72
+ sitelibdir = $(sitedir)/$(ruby_version)
73
+
74
+ sitearchdir = $(sitelibdir)/$(sitearch)
75
+
76
+ vendorlibdir = $(vendordir)/$(ruby_version)
77
+
78
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
79
+
80
+
81
+ CC = /usr/bin/gcc-4.2
82
+ CXX = g++
83
+ LIBRUBY = $(LIBRUBY_SO)
84
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
85
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
86
+ LIBRUBYARG_STATIC = -lruby.1.9.1-static
87
+ OUTFLAG = -o
88
+ COUTFLAG = -o
89
+
90
+ RUBY_EXTCONF_H =
91
+ cflags = $(optflags) $(debugflags) $(warnflags)
92
+ optflags = -O3
93
+ debugflags = -ggdb
94
+ warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long
95
+ CFLAGS = -fno-common -x c++
96
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir) -I./../lib/quarry/src
97
+ DEFS =
98
+ CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags) -I/Users/will/.rvm/gems/ruby-1.9.2-p290/gems/rice-1.4.3/ruby/lib/include
99
+ CXXFLAGS = $(CFLAGS) -Wall -g
100
+ ldflags = -L. -L/usr/local/lib -L/Users/will/.rvm/gems/ruby-1.9.2-p290/gems/rice-1.4.3/ruby/lib/lib
101
+ dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
102
+ ARCH_FLAG =
103
+ DLDFLAGS = $(ldflags) $(dldflags)
104
+ LDSHARED = g++ -dynamic -bundle
105
+ LDSHAREDXX = $(CXX) -dynamic -bundle
106
+ AR = ar
107
+ EXEEXT =
108
+
109
+ RUBY_BASE_NAME = ruby
110
+ RUBY_INSTALL_NAME = ruby
111
+ RUBY_SO_NAME = ruby.1.9.1
112
+ arch = x86_64-darwin11.1.0
113
+ sitearch = $(arch)
114
+ ruby_version = 1.9.1
115
+ ruby = /Users/will/.rvm/rubies/ruby-1.9.2-p290/bin/ruby
116
+ RUBY = $(ruby)
117
+ RM = rm -f
118
+ RM_RF = $(RUBY) -run -e rm -- -rf
119
+ RMDIRS = $(RUBY) -run -e rmdir -- -p
120
+ MAKEDIRS = mkdir -p
121
+ INSTALL = /usr/bin/install -c
122
+ INSTALL_PROG = $(INSTALL) -m 0755
123
+ INSTALL_DATA = $(INSTALL) -m 644
124
+ COPY = cp
125
+
126
+ #### End of system configuration section. ####
127
+
128
+ preload =
129
+
130
+
131
+ CXX = g++
132
+
133
+ libpath = . $(libdir) ./../lib/quarry/obj
134
+ LIBPATH = -L. -L$(libdir) -L./../lib/quarry/obj
135
+ DEFFILE =
136
+
137
+ CLEANFILES = mkmf.log
138
+ DISTCLEANFILES =
139
+ DISTCLEANDIRS =
140
+
141
+ extout =
142
+ extout_prefix =
143
+ target_prefix =
144
+ LOCAL_LIBS =
145
+ LIBS = -lquarry -lrice -lruby.1.9.1 -lpthread -ldl -lobjc
146
+ SRCS = quarry_toolkit.cpp
147
+ OBJS = quarry_toolkit.o
148
+ TARGET = quarry_toolkit
149
+ DLLIB = $(TARGET).bundle
150
+ EXTSTATIC =
151
+ STATIC_LIB =
152
+
153
+ BINDIR = $(bindir)
154
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
155
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
156
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
157
+ HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
158
+ ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
159
+
160
+ TARGET_SO = $(DLLIB)
161
+ CLEANLIBS = $(TARGET).bundle
162
+ CLEANOBJS = *.o *.bak
163
+
164
+ all: $(DLLIB)
165
+ static: $(STATIC_LIB)
166
+ .PHONY: all install static install-so install-rb
167
+ .PHONY: clean clean-so clean-rb
168
+
169
+ clean-rb-default::
170
+ clean-rb::
171
+ clean-so::
172
+ clean: clean-so clean-rb-default clean-rb
173
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
174
+
175
+ distclean-rb-default::
176
+ distclean-rb::
177
+ distclean-so::
178
+ distclean: clean distclean-so distclean-rb-default distclean-rb
179
+ @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
180
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
181
+ @-$(RMDIRS) $(DISTCLEANDIRS)
182
+
183
+ realclean: distclean
184
+ install: install-so install-rb
185
+
186
+ install-so: $(RUBYARCHDIR)
187
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
188
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
189
+ @-$(MAKEDIRS) $(@D)
190
+ $(INSTALL_PROG) $(DLLIB) $(@D)
191
+ install-rb: pre-install-rb install-rb-default
192
+ install-rb-default: pre-install-rb-default
193
+ pre-install-rb: Makefile
194
+ pre-install-rb-default: Makefile
195
+ $(RUBYARCHDIR):
196
+ $(MAKEDIRS) $@
197
+
198
+ site-install: site-install-so site-install-rb
199
+ site-install-so: install-so
200
+ site-install-rb: install-rb
201
+
202
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
203
+
204
+ .cc.o:
205
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
206
+
207
+ .cxx.o:
208
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
209
+
210
+ .cpp.o:
211
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
212
+
213
+ .C.o:
214
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
215
+
216
+ .c.o:
217
+ $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
218
+
219
+ $(DLLIB): $(OBJS) Makefile
220
+ @-$(RM) $(@)
221
+ $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
222
+
223
+
224
+
225
+ $(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
data/ext/extconf.rb ADDED
@@ -0,0 +1,29 @@
1
+ require 'mkmf-rice'
2
+
3
+ QUARRY_H = 'quarry.h'
4
+ MARKER = 'mkmf_marker'
5
+ quarry_dir = File.join(File.dirname(__FILE__), '..', 'lib', 'quarry')
6
+ quarry_obj = File.join(quarry_dir, 'obj')
7
+ quarry_src = File.join(quarry_dir, 'src')
8
+ quarry_header = File.join(quarry_src, QUARRY_H)
9
+
10
+ # compile quarry as a static lib
11
+ Dir.chdir(quarry_dir) do
12
+ if RUBY_PLATFORM =~ /darwin/
13
+ puts "Compiling quarry (OS X mode)"
14
+ `make -f Makefile.osx clean`
15
+ `make -f Makefile.osx`
16
+ else
17
+ puts "Compiling quarry (linux mode)"
18
+ `make -f Makefile.linux clean`
19
+ `make -f Makefile.linux`
20
+ end
21
+ end
22
+
23
+ # the cflags are required to make mkmf compile in c++ mode
24
+ with_cflags("-x c++") do
25
+ find_header(QUARRY_H, quarry_src)
26
+ $LIBPATH << quarry_obj
27
+ have_library('quarry', MARKER, quarry_header)
28
+ create_makefile('quarry_toolkit', 'quarry')
29
+ end
@@ -0,0 +1,148 @@
1
+ #include "rice/Constructor.hpp"
2
+ #include "rice/Data_Type.hpp"
3
+ #include "rice/Module.hpp"
4
+ #include "rice/Array.hpp"
5
+ #include "quarry.h"
6
+ using namespace Rice;
7
+
8
+ Object model_rank(Object self, Object ex) {
9
+ // Rank one example with the wrapped model and return its category indexes in rank order.
10
+ Model::Model *wrapped_model = from_ruby<Model::Model *>(self);
11
+ vector<Classifier::Score> *ranked = wrapped_model->rank(from_ruby<DataSet::Example *>(ex));
12
+ Array categories;
13
+
14
+ for(vector<Classifier::Score>::iterator it = ranked->begin(); it != ranked->end(); ++it)
15
+ categories.push(it->category);
16
+
17
+ delete ranked; // the vector returned by rank() is released here
18
+ return categories;
19
+ }
20
+
21
+ Object model_rank_text(Object self, Object text) {
22
+ // Rank a piece of raw text with the wrapped model and return its category indexes in rank order.
23
+ Model::Model *wrapped_model = from_ruby<Model::Model *>(self);
24
+ string raw_text = from_ruby<string>(text);
25
+ vector<Classifier::Score> *ranked = wrapped_model->rank_text(raw_text);
26
+ Array categories;
27
+ for(vector<Classifier::Score>::iterator it = ranked->begin(); it != ranked->end(); ++it)
28
+ categories.push(it->category);
29
+
30
+ delete ranked; // the vector returned by rank_text() is released here
31
+ return categories;
32
+ }
33
+
34
+
35
+ extern "C" {
36
+
37
+ void Init_quarry_toolkit() { // registers the Quarry modules and the Impl* wrapper classes with Ruby through Rice
38
+ Module rb_mQuarry = define_module("Quarry");
39
+ Module rb_mDataSet = define_module_under(rb_mQuarry, "DataSet");
40
+ Module rb_mClassifier = define_module_under(rb_mQuarry, "Classifier");
41
+ Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
42
+ Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
43
+
44
+
45
+ // text pipeline
46
+ rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
47
+ Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
48
+ .define_constructor(Constructor<Preprocessing::Text::TextPipeline>())
49
+ .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
50
+
51
+ // storage
52
+ Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
53
+
54
+ Data_Type<Storage::ARFF> rb_cARFF = define_class_under<Storage::ARFF, Storage::Storage>(rb_mQuarry, "ImplARFF")
55
+ .define_constructor(Constructor<Storage::ARFF, string>())
56
+ .define_method("read", &Storage::ARFF::read);
57
+
58
+ Data_Type<Storage::Binary> rb_cBinary = define_class_under<Storage::Binary, Storage::Storage>(rb_mQuarry, "ImplBinary")
59
+ .define_constructor(Constructor<Storage::Binary, string>())
60
+ .define_method("read", &Storage::Binary::read)
61
+ .define_method("write", &Storage::Binary::write)
62
+ .define_method("read_model", &Storage::Binary::read_model)
63
+ .define_method("write_model", &Storage::Binary::write_model)
64
+ .define_method("get_write_examples", &Storage::Binary::get_write_examples)
65
+ .define_method("set_write_examples", &Storage::Binary::set_write_examples);
66
+
67
+ Data_Type<Storage::Folders> rb_cFolders = define_class_under<Storage::Folders, Storage::Storage>(rb_mQuarry, "ImplFolders")
68
+ .define_constructor(Constructor<Storage::Folders, string, Preprocessing::Text::TextPipeline *>())
69
+ .define_method("read", &Storage::Folders::read);
70
+
71
+
72
+ // model
73
+ Data_Type<Model::Model> rb_cModel = define_class_under<Model::Model>(rb_mQuarry, "ImplModel")
74
+ .define_constructor(Constructor<Model::Model>())
75
+ .define_method("train", &Model::Model::train)
76
+ .define_method("train_text", &Model::Model::train_text)
77
+ .define_method("classify", &Model::Model::classify)
78
+ .define_method("classify_text", &Model::Model::classify_text)
79
+ .define_method("set_data_set", &Model::Model::set_data_set)
80
+ .define_method("get_data_set", &Model::Model::get_data_set)
81
+ .define_method("set_classifier", &Model::Model::set_classifier)
82
+ .define_method("get_classifier", &Model::Model::get_classifier)
83
+ .define_method("set_text_pipeline", &Model::Model::set_text_pipeline)
84
+ .define_method("get_text_pipeline", &Model::Model::get_text_pipeline)
85
+ .define_method("rank", &model_rank)
86
+ .define_method("rank_text", &model_rank_text);
87
+
88
+
89
+
90
+ // data set
91
+ Data_Type<DataSet::Feature> rb_cDataSetFeature = define_class_under<DataSet::Feature>(rb_mDataSet, "ImplFeature")
92
+ .define_method("get_name", &DataSet::Feature::get_name)
93
+ .define_method("set_name", &DataSet::Feature::set_name)
94
+ .define_constructor(Constructor<DataSet::Feature, string, int>());
95
+
96
+ Data_Type<DataSet::Example> rb_cDataSetExample = define_class_under<DataSet::Example>(rb_mDataSet, "ImplExample")
97
+ .define_method("category_index", &DataSet::Example::category_index)
98
+ .define_method("get_value", &DataSet::Example::get_value)
99
+ .define_method("set_value", &DataSet::Example::set_value)
100
+ .define_constructor(Constructor<DataSet::Example, int>());
101
+
102
+ Data_Type<DataSet::DataSet> rb_cDataSet = define_class_under<DataSet::DataSet>(rb_mDataSet, "ImplDataSet")
103
+ .define_constructor(Constructor<DataSet::DataSet>())
104
+ .define_method("get_name", &DataSet::DataSet::get_name)
105
+ .define_method("set_name", &DataSet::DataSet::set_name)
106
+ .define_method("features_size", &DataSet::DataSet::features_size)
107
+ .define_method("examples_size", &DataSet::DataSet::examples_size)
108
+ .define_method("get_feature_by_index", &DataSet::DataSet::get_feature_by_index)
109
+ .define_method("get_feature_by_name", &DataSet::DataSet::get_feature_by_name)
110
+ .define_method("get_example_by_index", &DataSet::DataSet::get_example_by_index)
111
+ .define_method("stratify", &DataSet::DataSet::stratify)
112
+ .define_method("cross_fold_validation", &DataSet::DataSet::cross_fold_validation);
113
+
114
+
115
+ // abstract classifier
116
+ Data_Type<Classifier::Classifier> rb_cClassifierClassifier = define_class_under<Classifier::Classifier>(rb_mClassifier, "ImplClassifier")
117
+ .define_method("prepare", &Classifier::Classifier::prepare)
118
+ .define_method("classify", &Classifier::Classifier::classify);
119
+
120
+
121
+ // bayesian classifiers
122
+ Data_Type<Classifier::NaiveBayesClassifier> rb_cClassifierNaiveBayesClassifier = define_class_under<Classifier::NaiveBayesClassifier, Classifier::Classifier>(rb_mClassifier, "ImplNaiveBayesClassifier")
123
+ .define_constructor(Constructor<Classifier::NaiveBayesClassifier, DataSet::DataSet *>());
124
+
125
+
126
+ // confusion matrix: each Ruby method is bound to the C++ member of the same name (fp/tn/fn/precision/recall/fscore are assumed to be declared in ConfusionMatrix like their avg_* counterparts - TODO confirm against confusion_matrix.h)
127
+ Data_Type<ConfusionMatrix> rb_cConfusionMatrix = define_class_under<ConfusionMatrix>(rb_mQuarry, "ImplConfusionMatrix")
128
+ .define_constructor(Constructor<ConfusionMatrix, DataSet::DataSet *>())
129
+ .define_method("accuracy", &ConfusionMatrix::accuracy)
130
+ .define_method("error", &ConfusionMatrix::error)
131
+ .define_method("print_summary", &ConfusionMatrix::print_summary)
132
+ .define_method("avg_tp", &ConfusionMatrix::avg_tp)
133
+ .define_method("avg_fp", &ConfusionMatrix::avg_fp)
134
+ .define_method("avg_tn", &ConfusionMatrix::avg_tn)
135
+ .define_method("avg_fn", &ConfusionMatrix::avg_fn)
136
+ .define_method("avg_precision", &ConfusionMatrix::avg_precision)
137
+ .define_method("avg_recall", &ConfusionMatrix::avg_recall)
138
+ .define_method("avg_fscore", &ConfusionMatrix::avg_fscore)
139
+ .define_method("add", &ConfusionMatrix::add)
140
+ .define_method("tp", &ConfusionMatrix::tp)
141
+ .define_method("fp", &ConfusionMatrix::fp)
142
+ .define_method("tn", &ConfusionMatrix::tn)
143
+ .define_method("fn", &ConfusionMatrix::fn)
144
+ .define_method("precision", &ConfusionMatrix::precision)
145
+ .define_method("recall", &ConfusionMatrix::recall)
146
+ .define_method("fscore", &ConfusionMatrix::fscore);
147
+ }
148
+ }
@@ -0,0 +1,2 @@
1
+ STATIC_LIB = ar rcs
2
+ include Makefile.targets
@@ -0,0 +1,6 @@
1
+ STATIC_LIB = libtool -o
2
+ CXX = clang++
3
+ CPPFLAGS = -Wno-deprecated-writable-strings
4
+ #CXX = g++
5
+ #CPPFLAGS = -ggdb
6
+ include Makefile.targets
@@ -0,0 +1,23 @@
1
+ SRCS = src/quarry.cpp src/data_set/data_set.cpp src/data_set/example.cpp src/data_set/sparse/sparse_example.cpp src/data_set/features/nominal_feature.cpp src/data_set/features/numeric_feature.cpp src/classifier/classifier.cpp src/classifier/naive_bayes/naive_bayes_classifier.cpp src/metrics/confusion_matrix.cpp src/storage/arff.cpp src/storage/folders.cpp src/storage/binary.cpp src/preprocessing/text/text_pipeline.cpp src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp src/preprocessing/text/token_selector/stop_words.cpp src/preprocessing/text/tokeniser/simple_tokeniser.cpp src/model/model.cpp
2
+ OBJS = ${SRCS:.cpp=.o}
3
+ INCFLAGS = -Isrc
4
+ CPPFLAGS += -O3
5
+ LIB = libquarry.a
6
+
7
+ .SUFFIXES:
8
+ .SUFFIXES: .cpp .o
9
+
10
+ .cpp.o :
11
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) -o $@ -c $<
12
+
13
+ all : $(LIB)
14
+
15
+ $(LIB) : $(OBJS)
16
+ $(STATIC_LIB) obj/$(LIB) $(OBJS)
17
+
18
+ test : $(LIB)
19
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) -o obj/test obj/$(LIB) src/test.cpp
20
+
21
+ clean :
22
+ rm -f $(OBJS)
23
+ rm -f obj/*
File without changes
File without changes
@@ -0,0 +1,32 @@
1
+ #include "classifier.h"
2
+
3
+ int Classifier::Classifier::classify(DataSet::Example *example) { // returns the 1-based category whose score() is highest
4
+ double max_score = 0.0, category_score = 0.0;
5
+ int max_category = 1;
6
+ // The maximum is seeded from category 1's score: a fixed 0.0 floor would ignore every non-positive score (e.g. log-probabilities).
7
+ for(int category = 1; category <= data_set->categories_size(); category++) {
8
+ category_score = score(category, example);
9
+ if(category == 1 || category_score > max_score) {
10
+ max_score = category_score;
11
+ max_category = category;
12
+ }
13
+ }
14
+ // With no categories the loop never runs and category 1 is returned.
15
+ return max_category;
16
+ }
17
+
18
+ vector<Classifier::Score> *Classifier::Classifier::rank(DataSet::Example *example) { // all category scores, ordered by Score::compare
19
+ vector<Score> *ordered = score_all(example);
20
+ sort(ordered->begin(), ordered->end(), Score::compare); // compare() places the higher score first
21
+ return ordered;
22
+ }
23
+
24
+ vector<Classifier::Score> *Classifier::Classifier::score_all(DataSet::Example *example) { // one Score per category, in category order
25
+ vector<Score> *results = new vector<Score>();
26
+ results->reserve(data_set->categories_size());
27
+ // categories are numbered from 1, so the upper bound is inclusive
28
+ for(int cat = 1; cat <= data_set->categories_size(); ++cat)
29
+ results->push_back(Score(cat, score(cat, example)));
30
+
31
+ return results; // heap-allocated with new; not freed in this function
32
+ }
@@ -0,0 +1,59 @@
1
+ #ifndef __classifier__
2
+ #define __classifier__
3
+ #include "data_set/data_set.h"
4
+ #include <algorithm>
5
+ #include <utility>
6
+ #include <vector>
7
+ #include <iostream>
8
+ #include <typeinfo>
9
+ using namespace std;
10
+
11
+ namespace Storage {
12
+ class Binary;
13
+ }
14
+
15
+ namespace Classifier {
16
+
17
+ class Score {
18
+ public:
19
+ int category;
20
+ double score;
21
+
22
+ Score(int category, double score) : category(category), score(score) {}
23
+ static bool compare (Score a, Score b) {
24
+ return b.score < a.score; // descending
25
+ }
26
+ };
27
+
28
+
29
+ class Classifier {
30
+ public:
31
+ DataSet::DataSet *data_set;
32
+ vector<bool> numeric_features;
33
+ vector<bool> nominal_features;
34
+
35
+ Classifier(DataSet::DataSet *data_set) : data_set(data_set), numeric_features(data_set->features_size(), 0), nominal_features(data_set->features_size(), 0) {
36
+ DataSet::Feature *feature = NULL;
37
+
38
+ for(unsigned int i = 0; i < data_set->features.size(); i++) {
39
+ feature = data_set->features[i];
40
+ if(typeid(*feature) == typeid(DataSet::NumericFeature))
41
+ numeric_features[i] = true;
42
+ else
43
+ nominal_features[i] = true;
44
+ }
45
+ }
46
+
47
+ virtual void prepare() {};
48
+ virtual Classifier *clone(DataSet::DataSet *new_data_set) = 0;
49
+ virtual double score(int category, DataSet::Example *example) = 0;
50
+ virtual int classify(DataSet::Example *example);
51
+ virtual vector<Score> *rank(DataSet::Example *example);
52
+ virtual vector<Score> *score_all(DataSet::Example *example);
53
+ virtual void write_binary(Storage::Binary *file) {}
54
+ virtual void read_binary(Storage::Binary *file) {}
55
+ virtual uint32_t mark() = 0;
56
+ };
57
+ }
58
+
59
+ #endif
File without changes
File without changes