thera 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. data/.document +5 -0
  2. data/.gitignore +56 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +20 -0
  5. data/LICENSE.txt +1 -0
  6. data/README.rdoc +8 -0
  7. data/Rakefile +1 -0
  8. data/ext/Makefile +225 -0
  9. data/ext/extconf.rb +29 -0
  10. data/ext/quarry/quarry_toolkit.cpp +148 -0
  11. data/lib/quarry/Makefile.linux +2 -0
  12. data/lib/quarry/Makefile.osx +6 -0
  13. data/lib/quarry/Makefile.targets +23 -0
  14. data/lib/quarry/obj/.gitkeep +0 -0
  15. data/lib/quarry/src/classifier/aode/aode_classifier.cpp +0 -0
  16. data/lib/quarry/src/classifier/aode/aode_classifier.h +0 -0
  17. data/lib/quarry/src/classifier/centroid/centroid_classifier.cpp +0 -0
  18. data/lib/quarry/src/classifier/centroid/centroid_classifier.h +0 -0
  19. data/lib/quarry/src/classifier/classifier.cpp +32 -0
  20. data/lib/quarry/src/classifier/classifier.h +59 -0
  21. data/lib/quarry/src/classifier/knn/knn_classifier.cpp +0 -0
  22. data/lib/quarry/src/classifier/knn/knn_classifier.h +0 -0
  23. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.cpp +40 -0
  24. data/lib/quarry/src/classifier/multinomial_bayes/multinomial_bayes_classifier.h +18 -0
  25. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.cpp +80 -0
  26. data/lib/quarry/src/classifier/naive_bayes/naive_bayes_classifier.h +52 -0
  27. data/lib/quarry/src/data_set/data_set.cpp +130 -0
  28. data/lib/quarry/src/data_set/data_set.h +78 -0
  29. data/lib/quarry/src/data_set/dense/dense_data_set.h +39 -0
  30. data/lib/quarry/src/data_set/dense/dense_example.h +44 -0
  31. data/lib/quarry/src/data_set/example.cpp +10 -0
  32. data/lib/quarry/src/data_set/example.h +23 -0
  33. data/lib/quarry/src/data_set/feature.h +36 -0
  34. data/lib/quarry/src/data_set/features/nominal_feature.cpp +57 -0
  35. data/lib/quarry/src/data_set/features/nominal_feature.h +76 -0
  36. data/lib/quarry/src/data_set/features/numeric_feature.cpp +69 -0
  37. data/lib/quarry/src/data_set/features/numeric_feature.h +78 -0
  38. data/lib/quarry/src/data_set/sparse/sparse_data_set.h +40 -0
  39. data/lib/quarry/src/data_set/sparse/sparse_example.cpp +82 -0
  40. data/lib/quarry/src/data_set/sparse/sparse_example.h +38 -0
  41. data/lib/quarry/src/metrics/confusion_matrix.cpp +129 -0
  42. data/lib/quarry/src/metrics/confusion_matrix.h +82 -0
  43. data/lib/quarry/src/model/model.cpp +29 -0
  44. data/lib/quarry/src/model/model.h +50 -0
  45. data/lib/quarry/src/preprocessing/examples/example_preprocessor.h +20 -0
  46. data/lib/quarry/src/preprocessing/examples/weights/binary_weight.h +20 -0
  47. data/lib/quarry/src/preprocessing/examples/weights/local_weight.h +29 -0
  48. data/lib/quarry/src/preprocessing/text/example_generator/example_generator.h +19 -0
  49. data/lib/quarry/src/preprocessing/text/example_generator/token_counter.h +59 -0
  50. data/lib/quarry/src/preprocessing/text/inplace_processor/downcase.h +26 -0
  51. data/lib/quarry/src/preprocessing/text/inplace_processor/inplace_processor.h +17 -0
  52. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer.h +44 -0
  53. data/lib/quarry/src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp +375 -0
  54. data/lib/quarry/src/preprocessing/text/text_pipeline.cpp +29 -0
  55. data/lib/quarry/src/preprocessing/text/text_pipeline.h +37 -0
  56. data/lib/quarry/src/preprocessing/text/token_selector/pos_tag_selector.h +21 -0
  57. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.cpp +82 -0
  58. data/lib/quarry/src/preprocessing/text/token_selector/stop_words.h +20 -0
  59. data/lib/quarry/src/preprocessing/text/token_selector/token_selector.h +17 -0
  60. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.cpp +29 -0
  61. data/lib/quarry/src/preprocessing/text/tokeniser/simple_tokeniser.h +20 -0
  62. data/lib/quarry/src/preprocessing/text/tokeniser/tokeniser.h +19 -0
  63. data/lib/quarry/src/quarry.cpp +1 -0
  64. data/lib/quarry/src/quarry.h +29 -0
  65. data/lib/quarry/src/storage/arff.cpp +198 -0
  66. data/lib/quarry/src/storage/arff.h +26 -0
  67. data/lib/quarry/src/storage/binary.cpp +457 -0
  68. data/lib/quarry/src/storage/binary.h +79 -0
  69. data/lib/quarry/src/storage/folders.cpp +98 -0
  70. data/lib/quarry/src/storage/folders.h +25 -0
  71. data/lib/quarry/src/storage/storage.h +19 -0
  72. data/lib/quarry/src/test.cpp +6 -0
  73. data/lib/quarry_rb/classifier/classifier.rb +22 -0
  74. data/lib/quarry_rb/classifier/naive_bayes_classifier.rb +10 -0
  75. data/lib/quarry_rb/confusion_matrix.rb +58 -0
  76. data/lib/quarry_rb/data_set/data_set.rb +42 -0
  77. data/lib/quarry_rb/data_set/example.rb +33 -0
  78. data/lib/quarry_rb/data_set/feature.rb +28 -0
  79. data/lib/quarry_rb/enumerable_helper.rb +32 -0
  80. data/lib/quarry_rb/model/model.rb +56 -0
  81. data/lib/quarry_rb/storage/arff.rb +11 -0
  82. data/lib/quarry_rb/storage/binary.rb +23 -0
  83. data/lib/quarry_rb/storage/folders.rb +11 -0
  84. data/lib/quarry_rb/text_pipeline.rb +16 -0
  85. data/lib/thera.rb +20 -0
  86. data/test/helper.rb +19 -0
  87. data/test/test_quarry.rb +33 -0
  88. data/thera.gemspec +21 -0
  89. metadata +148 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,56 @@
1
+ *.bundle
2
+ *.o
3
+ *.a
4
+ output
5
+ Makefile
6
+ obj/
7
+ mkmf.log
8
+
9
+ # rcov generated
10
+ coverage
11
+
12
+ # rdoc generated
13
+ rdoc
14
+
15
+ # yard generated
16
+ doc
17
+ .yardoc
18
+
19
+ # bundler
20
+ .bundle
21
+
22
+ # jeweler generated
23
+ pkg
24
+
25
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
26
+ #
27
+ # * Create a file at ~/.gitignore
28
+ # * Include files you want ignored
29
+ # * Run: git config --global core.excludesfile ~/.gitignore
30
+ #
31
+ # After doing this, these files will be ignored in all your git projects,
32
+ # saving you from having to 'pollute' every project you touch with them
33
+ #
34
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # from the line.)
35
+ #
36
+ # For MacOS:
37
+ #
38
+ #.DS_Store
39
+
40
+ # For TextMate
41
+ #*.tmproj
42
+ #tmtags
43
+
44
+ # For emacs:
45
+ #*~
46
+ #\#*
47
+ #.\#*
48
+
49
+ # For vim:
50
+ #*.swp
51
+
52
+ # For redcar:
53
+ #.redcar
54
+
55
+ # For rubinius:
56
+ #*.rbc
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'http://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,20 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.6.4)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.9.2)
10
+ rcov (0.9.10)
11
+ shoulda (2.11.3)
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ bundler (~> 1.0.0)
18
+ jeweler (~> 1.6.4)
19
+ rcov
20
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1 @@
1
+ Public Domain
data/README.rdoc ADDED
@@ -0,0 +1,8 @@
1
+ = thera
2
+
3
+ Ruby Data Mining Library
4
+
5
+
6
+ == Copyright
7
+
8
+ None, this library is in the Public Domain.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
data/ext/Makefile ADDED
@@ -0,0 +1,225 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = quarry
7
+ topdir = /Users/will/.rvm/rubies/ruby-1.9.2-p290/include/ruby-1.9.1
8
+ hdrdir = /Users/will/.rvm/rubies/ruby-1.9.2-p290/include/ruby-1.9.1
9
+ arch_hdrdir = /Users/will/.rvm/rubies/ruby-1.9.2-p290/include/ruby-1.9.1/$(arch)
10
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
11
+
12
+ prefix = $(DESTDIR)/Users/will/.rvm/rubies/ruby-1.9.2-p290
13
+
14
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
15
+
16
+ exec_prefix = $(prefix)
17
+
18
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
19
+
20
+ sitehdrdir = $(rubyhdrdir)/site_ruby
21
+
22
+ rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
23
+
24
+ vendordir = $(rubylibprefix)/vendor_ruby
25
+
26
+ sitedir = $(rubylibprefix)/site_ruby
27
+
28
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
29
+
30
+ mandir = $(datarootdir)/man
31
+
32
+ localedir = $(datarootdir)/locale
33
+
34
+ libdir = $(exec_prefix)/lib
35
+
36
+ psdir = $(docdir)
37
+
38
+ pdfdir = $(docdir)
39
+
40
+ dvidir = $(docdir)
41
+
42
+ htmldir = $(docdir)
43
+
44
+ infodir = $(datarootdir)/info
45
+
46
+ docdir = $(datarootdir)/doc/$(PACKAGE)
47
+
48
+ oldincludedir = $(DESTDIR)/usr/include
49
+
50
+ includedir = $(prefix)/include
51
+
52
+ localstatedir = $(prefix)/var
53
+
54
+ sharedstatedir = $(prefix)/com
55
+
56
+ sysconfdir = $(prefix)/etc
57
+
58
+ datadir = $(datarootdir)
59
+
60
+ datarootdir = $(prefix)/share
61
+
62
+ libexecdir = $(exec_prefix)/libexec
63
+
64
+ sbindir = $(exec_prefix)/sbin
65
+
66
+ bindir = $(exec_prefix)/bin
67
+
68
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
69
+
70
+ archdir = $(rubylibdir)/$(arch)
71
+
72
+ sitelibdir = $(sitedir)/$(ruby_version)
73
+
74
+ sitearchdir = $(sitelibdir)/$(sitearch)
75
+
76
+ vendorlibdir = $(vendordir)/$(ruby_version)
77
+
78
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
79
+
80
+
81
+ CC = /usr/bin/gcc-4.2
82
+ CXX = g++
83
+ LIBRUBY = $(LIBRUBY_SO)
84
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
85
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
86
+ LIBRUBYARG_STATIC = -lruby.1.9.1-static
87
+ OUTFLAG = -o
88
+ COUTFLAG = -o
89
+
90
+ RUBY_EXTCONF_H =
91
+ cflags = $(optflags) $(debugflags) $(warnflags)
92
+ optflags = -O3
93
+ debugflags = -ggdb
94
+ warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long
95
+ CFLAGS = -fno-common -x c++
96
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir) -I./../lib/quarry/src
97
+ DEFS =
98
+ CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags) -I/Users/will/.rvm/gems/ruby-1.9.2-p290/gems/rice-1.4.3/ruby/lib/include
99
+ CXXFLAGS = $(CFLAGS) -Wall -g
100
+ ldflags = -L. -L/usr/local/lib -L/Users/will/.rvm/gems/ruby-1.9.2-p290/gems/rice-1.4.3/ruby/lib/lib
101
+ dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
102
+ ARCH_FLAG =
103
+ DLDFLAGS = $(ldflags) $(dldflags)
104
+ LDSHARED = g++ -dynamic -bundle
105
+ LDSHAREDXX = $(CXX) -dynamic -bundle
106
+ AR = ar
107
+ EXEEXT =
108
+
109
+ RUBY_BASE_NAME = ruby
110
+ RUBY_INSTALL_NAME = ruby
111
+ RUBY_SO_NAME = ruby.1.9.1
112
+ arch = x86_64-darwin11.1.0
113
+ sitearch = $(arch)
114
+ ruby_version = 1.9.1
115
+ ruby = /Users/will/.rvm/rubies/ruby-1.9.2-p290/bin/ruby
116
+ RUBY = $(ruby)
117
+ RM = rm -f
118
+ RM_RF = $(RUBY) -run -e rm -- -rf
119
+ RMDIRS = $(RUBY) -run -e rmdir -- -p
120
+ MAKEDIRS = mkdir -p
121
+ INSTALL = /usr/bin/install -c
122
+ INSTALL_PROG = $(INSTALL) -m 0755
123
+ INSTALL_DATA = $(INSTALL) -m 644
124
+ COPY = cp
125
+
126
+ #### End of system configuration section. ####
127
+
128
+ preload =
129
+
130
+
131
+ CXX = g++
132
+
133
+ libpath = . $(libdir) ./../lib/quarry/obj
134
+ LIBPATH = -L. -L$(libdir) -L./../lib/quarry/obj
135
+ DEFFILE =
136
+
137
+ CLEANFILES = mkmf.log
138
+ DISTCLEANFILES =
139
+ DISTCLEANDIRS =
140
+
141
+ extout =
142
+ extout_prefix =
143
+ target_prefix =
144
+ LOCAL_LIBS =
145
+ LIBS = -lquarry -lrice -lruby.1.9.1 -lpthread -ldl -lobjc
146
+ SRCS = quarry_toolkit.cpp
147
+ OBJS = quarry_toolkit.o
148
+ TARGET = quarry_toolkit
149
+ DLLIB = $(TARGET).bundle
150
+ EXTSTATIC =
151
+ STATIC_LIB =
152
+
153
+ BINDIR = $(bindir)
154
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
155
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
156
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
157
+ HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
158
+ ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
159
+
160
+ TARGET_SO = $(DLLIB)
161
+ CLEANLIBS = $(TARGET).bundle
162
+ CLEANOBJS = *.o *.bak
163
+
164
+ all: $(DLLIB)
165
+ static: $(STATIC_LIB)
166
+ .PHONY: all install static install-so install-rb
167
+ .PHONY: clean clean-so clean-rb
168
+
169
+ clean-rb-default::
170
+ clean-rb::
171
+ clean-so::
172
+ clean: clean-so clean-rb-default clean-rb
173
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
174
+
175
+ distclean-rb-default::
176
+ distclean-rb::
177
+ distclean-so::
178
+ distclean: clean distclean-so distclean-rb-default distclean-rb
179
+ @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
180
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
181
+ @-$(RMDIRS) $(DISTCLEANDIRS)
182
+
183
+ realclean: distclean
184
+ install: install-so install-rb
185
+
186
+ install-so: $(RUBYARCHDIR)
187
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
188
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
189
+ @-$(MAKEDIRS) $(@D)
190
+ $(INSTALL_PROG) $(DLLIB) $(@D)
191
+ install-rb: pre-install-rb install-rb-default
192
+ install-rb-default: pre-install-rb-default
193
+ pre-install-rb: Makefile
194
+ pre-install-rb-default: Makefile
195
+ $(RUBYARCHDIR):
196
+ $(MAKEDIRS) $@
197
+
198
+ site-install: site-install-so site-install-rb
199
+ site-install-so: install-so
200
+ site-install-rb: install-rb
201
+
202
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
203
+
204
+ .cc.o:
205
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
206
+
207
+ .cxx.o:
208
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
209
+
210
+ .cpp.o:
211
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
212
+
213
+ .C.o:
214
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
215
+
216
+ .c.o:
217
+ $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
218
+
219
+ $(DLLIB): $(OBJS) Makefile
220
+ @-$(RM) $(@)
221
+ $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
222
+
223
+
224
+
225
+ $(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
data/ext/extconf.rb ADDED
@@ -0,0 +1,29 @@
1
+ require 'mkmf-rice'
2
+
3
+ QUARRY_H = 'quarry.h'
4
+ MARKER = 'mkmf_marker'
5
+ quarry_dir = File.join(File.dirname(__FILE__), '..', 'lib', 'quarry')
6
+ quarry_obj = File.join(quarry_dir, 'obj')
7
+ quarry_src = File.join(quarry_dir, 'src')
8
+ quarry_header = File.join(quarry_src, QUARRY_H)
9
+
10
+ # compile quarry as a static lib
11
+ Dir.chdir(quarry_dir) do
12
+ if RUBY_PLATFORM =~ /darwin/
13
+ puts "Compiling quarry (OS X mode)"
14
+ `make -f Makefile.osx clean`
15
+ `make -f Makefile.osx`
16
+ else
17
+ puts "Compiling quarry (linux mode)"
18
+ `make -f Makefile.linux clean`
19
+ `make -f Makefile.linux`
20
+ end
21
+ end
22
+
23
+ # the cflags are required to make mkmf compile in c++ mode
24
+ with_cflags("-x c++") do
25
+ find_header(QUARRY_H, quarry_src)
26
+ $LIBPATH << quarry_obj
27
+ have_library('quarry', MARKER, quarry_header)
28
+ create_makefile('quarry_toolkit', 'quarry')
29
+ end
@@ -0,0 +1,148 @@
1
+ #include "rice/Constructor.hpp"
2
+ #include "rice/Data_Type.hpp"
3
+ #include "rice/Module.hpp"
4
+ #include "rice/Array.hpp"
5
+ #include "quarry.h"
6
+ using namespace Rice;
7
+
8
+ Object model_rank(Object self, Object ex) {
9
+   // Ranks `ex` with the wrapped Model::Model and returns the category of every resulting Score as a Ruby Array.
10
+   Model::Model *model = from_ruby<Model::Model *>(self);
11
+   DataSet::Example *example = from_ruby<DataSet::Example *>(ex);
12
+   vector<Classifier::Score> *ranks = model->rank(example);
13
+   Array categories;
14
+   for(vector<Classifier::Score>::iterator it = ranks->begin(); it != ranks->end(); ++it)
15
+     categories.push(it->category);  // same order as the vector returned by rank()
16
+   // The vector returned by rank() is released here once its categories have been copied out.
17
+   delete ranks;
18
+   return categories;
19
+ }
20
+
21
+ Object model_rank_text(Object self, Object text) {
22
+   // Ranks `text` with the wrapped Model::Model and returns the category of every resulting Score as a Ruby Array.
23
+   Model::Model *model = from_ruby<Model::Model *>(self);
24
+   string example_text = from_ruby<string>(text);
25
+   vector<Classifier::Score> *ranks = model->rank_text(example_text);
26
+   Array categories;
27
+   for(vector<Classifier::Score>::iterator it = ranks->begin(); it != ranks->end(); ++it)
28
+     categories.push(it->category);  // same order as the vector returned by rank_text()
29
+   // The vector returned by rank_text() is released here once its categories have been copied out.
30
+   delete ranks;
31
+   return categories;
32
+ }
33
+
34
+
35
+ extern "C" {
36
+   // Entry point Ruby calls when the quarry_toolkit extension is loaded: defines the Quarry modules and the Impl* wrapper classes through Rice.
37
+   void Init_quarry_toolkit() {
38
+     Module rb_mQuarry = define_module("Quarry");
39
+     Module rb_mDataSet = define_module_under(rb_mQuarry, "DataSet");
40
+     Module rb_mClassifier = define_module_under(rb_mQuarry, "Classifier");
41
+     Module rb_mPreprocessing = define_module_under(rb_mQuarry, "Preprocessing");
42
+     Module rb_mText = define_module_under(rb_mPreprocessing, "Text");
43
+
44
+
45
+     // text pipeline
46
+     rb_mText.define_module_function("standard_pipeline", &Preprocessing::Text::StandardPipeline);
47
+     Data_Type<Preprocessing::Text::TextPipeline> rb_cTextPipeline = define_class_under<Preprocessing::Text::TextPipeline>(rb_mQuarry, "ImplTextPipeline")
48
+       .define_constructor(Constructor<Preprocessing::Text::TextPipeline>())
49
+       .define_method("process_text", &Preprocessing::Text::TextPipeline::process_text);
50
+
51
+     // storage
52
+     Data_Type<Storage::Storage> rb_cStorage = define_class_under<Storage::Storage>(rb_mQuarry, "ImplStorage");
53
+
54
+     Data_Type<Storage::ARFF> rb_cARFF = define_class_under<Storage::ARFF, Storage::Storage>(rb_mQuarry, "ImplARFF")
55
+       .define_constructor(Constructor<Storage::ARFF, string>())
56
+       .define_method("read", &Storage::ARFF::read);
57
+
58
+     Data_Type<Storage::Binary> rb_cBinary = define_class_under<Storage::Binary, Storage::Storage>(rb_mQuarry, "ImplBinary")
59
+       .define_constructor(Constructor<Storage::Binary, string>())
60
+       .define_method("read", &Storage::Binary::read)
61
+       .define_method("write", &Storage::Binary::write)
62
+       .define_method("read_model", &Storage::Binary::read_model)
63
+       .define_method("write_model", &Storage::Binary::write_model)
64
+       .define_method("get_write_examples", &Storage::Binary::get_write_examples)
65
+       .define_method("set_write_examples", &Storage::Binary::set_write_examples);
66
+
67
+     Data_Type<Storage::Folders> rb_cFolders = define_class_under<Storage::Folders, Storage::Storage>(rb_mQuarry, "ImplFolders")
68
+       .define_constructor(Constructor<Storage::Folders, string, Preprocessing::Text::TextPipeline *>())
69
+       .define_method("read", &Storage::Folders::read);
70
+
71
+
72
+     // model ("rank" and "rank_text" go through the model_rank / model_rank_text wrappers defined earlier in this file)
73
+     Data_Type<Model::Model> rb_cModel = define_class_under<Model::Model>(rb_mQuarry, "ImplModel")
74
+       .define_constructor(Constructor<Model::Model>())
75
+       .define_method("train", &Model::Model::train)
76
+       .define_method("train_text", &Model::Model::train_text)
77
+       .define_method("classify", &Model::Model::classify)
78
+       .define_method("classify_text", &Model::Model::classify_text)
79
+       .define_method("set_data_set", &Model::Model::set_data_set)
80
+       .define_method("get_data_set", &Model::Model::get_data_set)
81
+       .define_method("set_classifier", &Model::Model::set_classifier)
82
+       .define_method("get_classifier", &Model::Model::get_classifier)
83
+       .define_method("set_text_pipeline", &Model::Model::set_text_pipeline)
84
+       .define_method("get_text_pipeline", &Model::Model::get_text_pipeline)
85
+       .define_method("rank", &model_rank)
86
+       .define_method("rank_text", &model_rank_text);
87
+
88
+
89
+
90
+     // data set
91
+     Data_Type<DataSet::Feature> rb_cDataSetFeature = define_class_under<DataSet::Feature>(rb_mDataSet, "ImplFeature")
92
+       .define_method("get_name", &DataSet::Feature::get_name)
93
+       .define_method("set_name", &DataSet::Feature::set_name)
94
+       .define_constructor(Constructor<DataSet::Feature, string, int>());
95
+
96
+     Data_Type<DataSet::Example> rb_cDataSetExample = define_class_under<DataSet::Example>(rb_mDataSet, "ImplExample")
97
+       .define_method("category_index", &DataSet::Example::category_index)
98
+       .define_method("get_value", &DataSet::Example::get_value)
99
+       .define_method("set_value", &DataSet::Example::set_value)
100
+       .define_constructor(Constructor<DataSet::Example, int>());
101
+
102
+     Data_Type<DataSet::DataSet> rb_cDataSet = define_class_under<DataSet::DataSet>(rb_mDataSet, "ImplDataSet")
103
+       .define_constructor(Constructor<DataSet::DataSet>())
104
+       .define_method("get_name", &DataSet::DataSet::get_name)
105
+       .define_method("set_name", &DataSet::DataSet::set_name)
106
+       .define_method("features_size", &DataSet::DataSet::features_size)
107
+       .define_method("examples_size", &DataSet::DataSet::examples_size)
108
+       .define_method("get_feature_by_index", &DataSet::DataSet::get_feature_by_index)
109
+       .define_method("get_feature_by_name", &DataSet::DataSet::get_feature_by_name)
110
+       .define_method("get_example_by_index", &DataSet::DataSet::get_example_by_index)
111
+       .define_method("stratify", &DataSet::DataSet::stratify)
112
+       .define_method("cross_fold_validation", &DataSet::DataSet::cross_fold_validation);
113
+
114
+
115
+     // abstract classifier
116
+     Data_Type<Classifier::Classifier> rb_cClassifierClassifier = define_class_under<Classifier::Classifier>(rb_mClassifier, "ImplClassifier")
117
+       .define_method("prepare", &Classifier::Classifier::prepare)
118
+       .define_method("classify", &Classifier::Classifier::classify);
119
+
120
+
121
+     // bayesian classifiers
122
+     Data_Type<Classifier::NaiveBayesClassifier> rb_cClassifierNaiveBayesClassifier = define_class_under<Classifier::NaiveBayesClassifier, Classifier::Classifier>(rb_mClassifier, "ImplNaiveBayesClassifier")
123
+       .define_constructor(Constructor<Classifier::NaiveBayesClassifier, DataSet::DataSet *>());
124
+
125
+     // NOTE(review): fp/tn/fn/precision/recall/fscore were all bound to ConfusionMatrix::tp; they now bind to the same-named members, assumed to be declared in confusion_matrix.h - confirm.
126
+     // confusion matrix
127
+     Data_Type<ConfusionMatrix> rb_cConfusionMatrix = define_class_under<ConfusionMatrix>(rb_mQuarry, "ImplConfusionMatrix")
128
+       .define_constructor(Constructor<ConfusionMatrix, DataSet::DataSet *>())
129
+       .define_method("accuracy", &ConfusionMatrix::accuracy)
130
+       .define_method("error", &ConfusionMatrix::error)
131
+       .define_method("print_summary", &ConfusionMatrix::print_summary)
132
+       .define_method("avg_tp", &ConfusionMatrix::avg_tp)
133
+       .define_method("avg_fp", &ConfusionMatrix::avg_fp)
134
+       .define_method("avg_tn", &ConfusionMatrix::avg_tn)
135
+       .define_method("avg_fn", &ConfusionMatrix::avg_fn)
136
+       .define_method("avg_precision", &ConfusionMatrix::avg_precision)
137
+       .define_method("avg_recall", &ConfusionMatrix::avg_recall)
138
+       .define_method("avg_fscore", &ConfusionMatrix::avg_fscore)
139
+       .define_method("add", &ConfusionMatrix::add)
140
+       .define_method("tp", &ConfusionMatrix::tp)
141
+       .define_method("fp", &ConfusionMatrix::fp)
142
+       .define_method("tn", &ConfusionMatrix::tn)
143
+       .define_method("fn", &ConfusionMatrix::fn)
144
+       .define_method("precision", &ConfusionMatrix::precision)
145
+       .define_method("recall", &ConfusionMatrix::recall)
146
+       .define_method("fscore", &ConfusionMatrix::fscore);
147
+   }
148
+ }
@@ -0,0 +1,2 @@
1
+ STATIC_LIB = ar rcs
2
+ include Makefile.targets
@@ -0,0 +1,6 @@
1
+ STATIC_LIB = libtool -o
2
+ CXX = clang++
3
+ CPPFLAGS = -Wno-deprecated-writable-strings
4
+ #CXX = g++
5
+ #CPPFLAGS = -ggdb
6
+ include Makefile.targets
@@ -0,0 +1,23 @@
1
+ SRCS = src/quarry.cpp src/data_set/data_set.cpp src/data_set/example.cpp src/data_set/sparse/sparse_example.cpp src/data_set/features/nominal_feature.cpp src/data_set/features/numeric_feature.cpp src/classifier/classifier.cpp src/classifier/naive_bayes/naive_bayes_classifier.cpp src/metrics/confusion_matrix.cpp src/storage/arff.cpp src/storage/folders.cpp src/storage/binary.cpp src/preprocessing/text/text_pipeline.cpp src/preprocessing/text/inplace_processor/porter_stemmer_original.cpp src/preprocessing/text/token_selector/stop_words.cpp src/preprocessing/text/tokeniser/simple_tokeniser.cpp src/model/model.cpp
2
+ OBJS = ${SRCS:.cpp=.o}
3
+ INCFLAGS = -Isrc
4
+ CPPFLAGS += -O3
5
+ LIB = libquarry.a
6
+ # Clear make's built-in suffix list, then register only the .cpp -> .o pair used below.
7
+ .SUFFIXES:
8
+ .SUFFIXES: .cpp .o
9
+ # Suffix rule: compile each .cpp into a .o with the same path stem. CXX, CPPFLAGS and STATIC_LIB may be set by the Makefile that includes this one.
10
+ .cpp.o :
11
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) -o $@ -c $<
12
+
13
+ all : $(LIB)
14
+ # NOTE(review): the target is $(LIB) but the archive is written to obj/$(LIB), so make never finds the target file and re-runs this recipe every time - confirm intended.
15
+ $(LIB) : $(OBJS)
16
+ $(STATIC_LIB) obj/$(LIB) $(OBJS)
17
+ # NOTE(review): the archive is listed before src/test.cpp; linkers that scan archives left to right may need it after the source - confirm.
18
+ test : $(LIB)
19
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) -o obj/test obj/$(LIB) src/test.cpp
20
+ # Remove the compiled objects and every file matched by obj/*.
21
+ clean :
22
+ rm -f $(OBJS)
23
+ rm -f obj/*
File without changes
File without changes
@@ -0,0 +1,32 @@
1
+ #include "classifier.h"
2
+
3
+ int Classifier::Classifier::classify(DataSet::Example *example) {
4
+   // Returns the 1-based category whose score() is highest for `example`; on ties the lowest-numbered category wins.
5
+   int max_category = 1;
6
+   if(data_set->categories_size() < 1) return max_category;  // no categories: keep the previous fallback of 1 without calling score()
7
+   double max_score = score(max_category, example);  // seed with a real score rather than 0.0 so non-positive scores still compare, matching rank()
8
+   for(int category = 2; category <= data_set->categories_size(); category++) {
9
+     double category_score = score(category, example);
10
+     if(category_score > max_score) {
11
+       max_score = category_score;
12
+       max_category = category;
13
+     }
14
+   }
15
+   return max_category;
16
+ }
17
+
18
+ vector<Classifier::Score> *Classifier::Classifier::rank(DataSet::Example *example) {
19
+   vector<Score> *ranked = score_all(example);  // allocated with new inside score_all() and handed on to the caller
20
+   std::sort(ranked->begin(), ranked->end(), &Score::compare);  // Score::compare puts higher scores first
21
+   return ranked;
22
+ }
23
+
24
+ vector<Classifier::Score> *Classifier::Classifier::score_all(DataSet::Example *example) {
25
+   // Builds a new vector holding one Score per category, visiting categories 1..categories_size() in ascending order.
26
+   vector<Score> *all_scores = new vector<Score>();
27
+   all_scores->reserve(data_set->categories_size());
28
+   for(int c = 1; c <= data_set->categories_size(); ++c) {
29
+     all_scores->push_back(Score(c, score(c, example)));
30
+   }
31
+   return all_scores;
32
+ }
@@ -0,0 +1,59 @@
1
+ #ifndef __classifier__
2
+ #define __classifier__
3
+ #include "data_set/data_set.h"
4
+ #include <algorithm>
5
+ #include <utility>
6
+ #include <vector>
7
+ #include <iostream>
8
+ #include <typeinfo>
9
+ using namespace std;
10
+
11
+ namespace Storage {
12
+ class Binary;
13
+ }
14
+
15
+ namespace Classifier {
16
+
17
+ class Score {
18
+ public:
19
+   int category;   // category number this score was computed for
20
+   double score;   // score value for that category
21
+   // compare() is a sort predicate: true when `a` must come before `b`, i.e. when a's score is strictly higher.
22
+   Score(int category, double score) : category(category), score(score) {}
23
+   static bool compare (Score a, Score b) {
24
+     return a.score > b.score;  // descending by score
25
+   }
26
+ };
27
+
28
+
29
+ class Classifier {
30
+ // Abstract base for classifiers bound to one DataSet; subclasses must provide clone(), score() and mark().
31
+ public:
32
+   DataSet::DataSet *data_set;
33
+   vector<bool> numeric_features;  // [i] is true when feature i's dynamic type is DataSet::NumericFeature
34
+   vector<bool> nominal_features;  // [i] is true for every other feature type
35
+   // Sizes both flag vectors to the feature count (all false), then flags each feature by its dynamic type.
36
+   Classifier(DataSet::DataSet *data_set) : data_set(data_set), numeric_features(data_set->features_size(), false), nominal_features(data_set->features_size(), false) {
37
+     for(unsigned int i = 0; i < data_set->features.size(); i++) {
38
+       DataSet::Feature *feature = data_set->features[i];
39
+       if(typeid(*feature) == typeid(DataSet::NumericFeature))
40
+         numeric_features[i] = true;
41
+       else
42
+         nominal_features[i] = true;
43
+     }
44
+   }
45
+   // Virtual so that deleting a subclass through a Classifier* runs the subclass destructor.
46
+   virtual ~Classifier() {}
47
+   virtual void prepare() {}
48
+   virtual Classifier *clone(DataSet::DataSet *new_data_set) = 0;
49
+   virtual double score(int category, DataSet::Example *example) = 0;
50
+   virtual int classify(DataSet::Example *example);
51
+   virtual vector<Score> *rank(DataSet::Example *example);
52
+   virtual vector<Score> *score_all(DataSet::Example *example);
53
+   virtual void write_binary(Storage::Binary *file) {}
54
+   virtual void read_binary(Storage::Binary *file) {}
55
+   virtual uint32_t mark() = 0;
56
+ };
57
+ }
58
+
59
+ #endif
File without changes
File without changes