treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -1,41 +1,57 @@
1
- # A wrapper for the Stanford parser's
2
- # Penn-Treebank style tokenizer.
1
+ # Tokenization provided by the Stanford Penn-Treebank style
2
+ # tokenizer. Most punctuation is split from adjoining words,
3
+ # verb contractions and the Anglo-Saxon genitive of nouns are
4
+ # split into their component morphemes, and each morpheme is
5
+ # tagged separately. N.B. Contrary to the standard PTB
6
+ # tokenization, double quotes (") are NOT changed to doubled
7
+ # single forward- and backward- quotes (`` and '') by default.
3
8
  class Treat::Workers::Processors::Tokenizers::Stanford
4
9
 
5
- require 'treat/loaders/stanford'
6
10
  Treat::Loaders::Stanford.load
7
11
 
12
+ # Default options for the tokenizer.
13
+ DefaultOptions = {
14
+ directional_quotes: false,
15
+ escape_characters: false
16
+ }
17
+
18
+ # Hold one instance of the tokenizer.
8
19
  @@tokenizer = nil
9
20
 
10
- # Tokenize the entity using a Penn-Treebank
11
- # style tokenizer.
21
+ # Perform tokenization of the entity and add
22
+ # the resulting tokens as its children.
12
23
  #
13
- # Options: none.
24
+ # Options:
25
+ # - (Boolean) :directional_quotes => Whether
26
+ # to attempt to get correct directional quotes,
27
+ # replacing "..." by ``...''. Off by default.
14
28
  def self.tokenize(entity, options = {})
15
-
16
- entity.check_hasnt_children
17
-
18
- s = entity.to_s
19
-
29
+ options = DefaultOptions.merge(options)
20
30
  @@tokenizer ||=
21
31
  ::StanfordCoreNLP.load(:tokenize)
22
- text =
23
- ::StanfordCoreNLP::Text.new(s)
32
+ entity.check_hasnt_children
33
+ text = ::StanfordCoreNLP::
34
+ Text.new(entity.to_s)
24
35
  @@tokenizer.annotate(text)
25
-
26
- add_tokens(entity, text.get(:tokens))
27
-
36
+ add_tokens(entity, text.get(:tokens), options)
28
37
  end
29
38
 
30
39
  # Add the tokens to the entity.
31
- def self.add_tokens(entity, tokens)
40
+ def self.add_tokens(entity, tokens, options)
32
41
  tokens.each do |token|
33
42
  val = token.value
34
- val = '(' if val == '-LRB-' # Fix for other special chars
35
- val = ')' if val == '-RRB'
36
- t = Treat::Entities::Token.
37
- from_string(token.value)
38
- entity << t
43
+ unless options[:escape_characters]
44
+ Treat.tags.ptb.escape_characters.
45
+ each do |key, value|
46
+ val.gsub!(value, key)
47
+ end
48
+ end
49
+ unless options[:directional_quotes]
50
+ val.gsub!(/``/,'"')
51
+ val.gsub!(/''/,'"')
52
+ end
53
+ entity << Treat::Entities::Token.
54
+ from_string(val)
39
55
  end
40
56
  end
41
57
 
@@ -17,9 +17,13 @@ class Treat::Workers::Retrievers::Indexers::Ferret
17
17
  # Annotates the collection with the path to the
18
18
  # index for future use (e.g. in searching).
19
19
  def self.index(collection, options = {})
20
+
21
+ unless collection.get(:folder)
22
+ raise Treat::Exception,
23
+ "Only collections stored on disk " +
24
+ "can currently be indexed with Ferret."
25
+ end
20
26
 
21
- # FIXME - what if the collection is stored
22
- # in a database?
23
27
  path = "#{collection.folder}/.index"
24
28
  return path if FileTest.directory?(path)
25
29
 
@@ -45,7 +49,6 @@ class Treat::Workers::Retrievers::Indexers::Ferret
45
49
  end
46
50
 
47
51
  path
48
-
49
52
  end
50
53
 
51
54
  end
@@ -10,7 +10,7 @@ class Treat::Workers::Retrievers::Searchers::Ferret
10
10
  require 'find'
11
11
 
12
12
  DefaultOptions = {
13
- :q => nil,
13
+ :query => nil,
14
14
  :limit => :all,
15
15
  :callback => nil
16
16
  }
@@ -30,7 +30,7 @@ class Treat::Workers::Retrievers::Searchers::Ferret
30
30
  "This collection must be indexed to be searchable."
31
31
  end
32
32
 
33
- unless options[:q]
33
+ unless options[:query]
34
34
  raise Treat::Exception,
35
35
  'You must set a query by using the :q option.'
36
36
  end
@@ -0,0 +1,6 @@
1
+ module Treat::Workers
2
+ # Require mixins for workers.
3
+ require_relative 'categorizable'
4
+ # Make all workers categorizable.
5
+ extend Treat::Workers::Categorizable
6
+ end
data/lib/treat.rb CHANGED
@@ -1,36 +1,22 @@
1
+ # Treat is a toolkit for natural language
2
+ # processing and computational linguistics
3
+ # in Ruby. The Treat project aims to build
4
+ # a language- and algorithm- agnostic NLP
5
+ # framework for Ruby with support for tasks
6
+ # such as document retrieval, text chunking,
7
+ # segmentation and tokenization, natural
8
+ # language parsing, part-of-speech tagging,
9
+ # keyword mining and named entity recognition.
10
+ #
11
+ # Author: Louis-Antoine Mullie (c) 2010-12.
12
+ #
13
+ # Released under the General Public License.
1
14
  module Treat
2
15
 
3
- # Treat requires Ruby >= 1.9.2
4
- if RUBY_VERSION < '1.9.2'
5
- raise "Treat requires Ruby version 1.9.2 " +
6
- "or higher, but current is #{RUBY_VERSION}."
7
- end
8
-
9
- # Custom exception class.
10
- class Exception < ::Exception; end
11
-
12
- # Load configuration options.
13
- require 'treat/config'
14
- # Load all workers.
15
- require 'treat/helpers'
16
- # Require library loaders.
17
- require 'treat/loaders'
18
- # Require all core classes.
19
- require 'treat/core'
20
- # Require all entity classes.
21
- require 'treat/entities'
22
- # Lazy load worker classes.
23
- require 'treat/workers'
24
- # Require proxies last.
25
- require 'treat/proxies'
26
-
27
- # Turn sugar on.
28
- Treat::Config.sweeten!
29
-
30
- # Install packages for a given language.
31
- def self.install(language = :english)
32
- require 'treat/installer'
33
- Treat::Installer.install(language)
34
- end
16
+ # * Load all the core classes. * #
17
+ require_relative 'treat/version'
18
+ require_relative 'treat/exception'
19
+ require_relative 'treat/autoload'
20
+ require_relative 'treat/modules'
35
21
 
36
22
  end
data/models/MANIFEST ADDED
@@ -0,0 +1 @@
1
+ This is a folder containing the data models used by Treat.
@@ -0,0 +1,174 @@
1
+ describe Treat::Learning::DataSet do
2
+
3
+ before do
4
+ @question = Treat::Learning::Question.new(:is_key_sentence, :sentence, 0, :continuous)
5
+ @feature = Treat::Learning::Feature.new(:word_count, 0)
6
+ @problem = Treat::Learning::Problem.new(@question, @feature)
7
+ @tag = Treat::Learning::Tag.new(:paragraph_length, 0,
8
+ "->(e) { e.parent_paragraph.word_count }")
9
+ @paragraph = Treat::Entities::Paragraph.new(
10
+ "Ranga and I went to the store. Meanwhile, Ryan was sleeping.")
11
+ @paragraph.do :segment, :tokenize
12
+ @sentence = @paragraph.sentences[0]
13
+ @data_set = Treat::Learning::DataSet.new(@problem)
14
+ end
15
+
16
+ describe "#initialize" do
17
+ context "when supplied with a problem" do
18
+ it "should initialize an empty data set" do
19
+ data_set = Treat::Learning::DataSet.new(@problem)
20
+ data_set.items.should eql []
21
+ data_set.problem.should eql @problem
22
+ end
23
+ end
24
+ context "when supplied with an improper argument" do
25
+ it "should raise an error" do
26
+ # The argument to initialize should be a Problem.
27
+ expect { data_set = Treat::Learning::DataSet.new("foo") }.to raise_error
28
+ end
29
+ end
30
+ end
31
+
32
+ describe "#self.build" do
33
+
34
+ end
35
+
36
+ describe "#==(other_data_set)" do
37
+ context "when supplied with an equivalent data set" do
38
+ it "returns true" do
39
+ data_set1 = Treat::Learning::DataSet.new(@problem)
40
+ data_set2 = Treat::Learning::DataSet.new(@problem)
41
+ data_set1.should == data_set2
42
+ data_set1 << @sentence
43
+ data_set2 << @sentence
44
+ data_set1.should == data_set2
45
+ end
46
+ end
47
+
48
+ context "when supplied with a non-equivalent data set" do
49
+ it "returns false" do
50
+ # Get two slightly different problems.
51
+ question1 = Treat::Learning::Question.new(
52
+ :is_key_sentence, :sentence, 0, :continuous)
53
+ question2 = Treat::Learning::Question.new(
54
+ :is_key_word, :sentence, 0, :continuous)
55
+ problem1 = Treat::Learning::Problem.new(question1, @feature)
56
+ problem2 = Treat::Learning::Problem.new(question2, @feature)
57
+ # Then the problems shouldn't be equal anymore.
58
+ problem1.should_not == problem2
59
+ # Create data sets with the different problems.
60
+ data_set1 = Treat::Learning::DataSet.new(problem1)
61
+ data_set2 = Treat::Learning::DataSet.new(problem2)
62
+ # Then the data sets shouldn't be equal anymore.
63
+ data_set1.should_not == data_set2
64
+ # Create two data sets with the same problems.
65
+ data_set1 = Treat::Learning::DataSet.new(@problem)
66
+ data_set2 = Treat::Learning::DataSet.new(@problem)
67
+ # Then these should be equal.
68
+ data_set1.should == data_set2
69
+ # But when different items are added
70
+ data_set1 << Treat::Entities::Sentence.new(
71
+ "This sentence is not the same as the other.").tokenize
72
+ data_set2 << Treat::Entities::Sentence.new(
73
+ "This sentence is similar to the other.").tokenize
74
+ # They shouldn't be equal anymore.
75
+ data_set1.should_not == data_set2
76
+ end
77
+ end
78
+
79
+ end
80
+
81
+ describe "#merge" do
82
+ context "when supplied with two data sets refering to the same problem" do
83
+ it "merges the two together" do
84
+ # Create two data sets with the same problem.
85
+ data_set1 = Treat::Learning::DataSet.new(@problem)
86
+ data_set2 = Treat::Learning::DataSet.new(@problem)
87
+ # Add a sentence to each data set.
88
+ data_set1 << Treat::Entities::Sentence.new(
89
+ "This sentence is not the same as the other.").tokenize
90
+ data_set2 << Treat::Entities::Sentence.new(
91
+ "This sentence is similar to the other.").tokenize
92
+ # Merge the two data sets together.
93
+ data_set1.merge(data_set2)
94
+ # Check if the merge has occurred properly.
95
+ data_set1.items.size.should eql 2
96
+ data_set1.items[1].should eql data_set2.items[0]
97
+ end
98
+ end
99
+
100
+ context "when supplied with two data sets refering to different problems" do
101
+ it "raises an error" do
102
+ # Get two slightly different questions.
103
+ question1 = Treat::Learning::Question.new(
104
+ :is_key_sentence, :sentence, 0, :continuous)
105
+ question2 = Treat::Learning::Question.new(
106
+ :is_key_word, :sentence, 0, :continuous)
107
+ # Create two problems with the different questions.
108
+ problem1 = Treat::Learning::Problem.new(question1, @feature)
109
+ problem2 = Treat::Learning::Problem.new(question2, @feature)
110
+ # Create two data sets with the different problems.
111
+ data_set1 = Treat::Learning::DataSet.new(problem1)
112
+ data_set2 = Treat::Learning::DataSet.new(problem2)
113
+ # Add elements to each of the data sets.
114
+ data_set1 << Treat::Entities::Sentence.new(
115
+ "This sentence is not the same as the other.").tokenize
116
+ data_set2 << Treat::Entities::Sentence.new(
117
+ "This sentence is similar to the other.").tokenize
118
+ # Try to merge them; but this should fail.
119
+ expect { data_set1.merge(data_set2) }.to raise_error
120
+ end
121
+ end
122
+ end
123
+
124
+ describe "#<<(entity)" do
125
+ context "when supplied with a proper entity" do
126
+ it "exports the features and tags and adds them to the data set" do
127
+ problem = Treat::Learning::Problem.new(@question, @feature, @tag)
128
+ data_set = Treat::Learning::DataSet.new(problem)
129
+ data_set << @sentence
130
+ data_set.items.tap { |e| e[0][:id] = 0 }.
131
+ should eql [{:tags=>[11], :features=>[7, 0], :id=>0}]
132
+ end
133
+ end
134
+ end
135
+
136
+ describe "#serialize" do
137
+ context "when asked to use a given adapter" do
138
+ it "calls the corresponding #to_something method" do
139
+
140
+ end
141
+ end
142
+ end
143
+
144
+ describe "#to_marshal, #self.from_marshal" do
145
+ context "when asked to successively serialize and deserialize data" do
146
+ it "completes a round trip without losing information" do
147
+ problem = Treat::Learning::Problem.new(@question, @feature, @tag)
148
+ data_set = Treat::Learning::DataSet.new(problem)
149
+ data_set << @sentence
150
+ data_set.to_marshal(file: 'test.dump')
151
+ Treat::Learning::DataSet.from_marshal(
152
+ file: 'test.dump').should == data_set
153
+ FileUtils.rm('test.dump')
154
+ end
155
+ end
156
+ end
157
+
158
+ describe "#to_mongo" do
159
+
160
+ end
161
+
162
+ describe "#self.unserialize" do
163
+ context "when asked to use a given adapter" do
164
+ it "calls the corresponding #to_something method" do
165
+
166
+ end
167
+ end
168
+ end
169
+
170
+ describe "#self.from_mongo" do
171
+
172
+ end
173
+
174
+ end
@@ -0,0 +1,52 @@
1
+ describe Treat::Learning::Export do
2
+
3
+ describe "#initialize" do
4
+ context "when supplied with acceptable parameters" do
5
+ it "should give access to the parameters" do
6
+ export = Treat::Learning::Export.new(:name, 0, "->(e) { e }")
7
+ export.name.should eql :name
8
+ export.default.should eql 0
9
+ export.proc_string.should eql "->(e) { e }"
10
+ export.proc.should be_instance_of Proc
11
+ export.proc.call('x').should eql 'x'
12
+ end
13
+ end
14
+ context "when supplied with wrong parameters" do
15
+ it "should raise an exception" do
16
+ # First argument should be a symbol representing the name of the export.
17
+ expect { Treat::Learning::Export.new(nil) }.to raise_error
18
+ # Third argument, if supplied, should be a string that
19
+ # evaluates to a proc (NOT a proc/lambda).
20
+ expect { Treat::Learning::Export.new(:name, 0, lambda { x } ) }.to raise_error
21
+ # Third argument should be proper ruby syntax.
22
+ expect { Treat::Learning::Export.new(:name, 0, "->(e) { ") }.to raise_error
23
+ # Third argument should evaluate to a proc.
24
+ expect { Treat::Learning::Export.new(:name, 0, "2") }.to raise_error
25
+ end
26
+ end
27
+ end
28
+
29
+ describe "#==(question)" do
30
+ context "when supplied with an equal question" do
31
+ it "should return true" do
32
+ Treat::Learning::Export.new(:name).
33
+ should == Treat::Learning::Export.new(:name)
34
+ Treat::Learning::Export.new(:name, 0).
35
+ should == Treat::Learning::Export.new(:name, 0)
36
+ Treat::Learning::Export.new(:name, 0, "->(e) { }").
37
+ should == Treat::Learning::Export.new(:name, 0, "->(e) { }")
38
+ end
39
+ end
40
+ context "when supplied with a different question" do
41
+ it "should return false" do
42
+ Treat::Learning::Export.new(:name).
43
+ should_not == Treat::Learning::Export.new(:name2)
44
+ Treat::Learning::Export.new(:name, 0).
45
+ should_not == Treat::Learning::Export.new(:name, 1)
46
+ Treat::Learning::Export.new(:name, 0, "->(e) { }").
47
+ should_not == Treat::Learning::Export.new(:name, 0, "->(e) { x }")
48
+ end
49
+ end
50
+ end
51
+
52
+ end
@@ -0,0 +1,144 @@
1
+ describe Treat::Learning::Problem do
2
+
3
+ before do
4
+ @question = Treat::Learning::Question.new(:is_key_sentence,
5
+ :sentence, 0, :continuous)
6
+ @feature = Treat::Learning::Feature.new(:word_count, 0)
7
+ @tag = Treat::Learning::Tag.new(:paragraph_length, 0,
8
+ "->(e) { e.parent_paragraph.word_count }")
9
+ @paragraph = Treat::Entities::Paragraph.new(
10
+ "Ranga and I went to the store. Meanwhile, Ryan was sleeping.")
11
+ @paragraph.do :segment, :tokenize
12
+ @sentence = @paragraph.sentences[0]
13
+ @hash = {"question"=>{"name"=>:is_key_sentence, "target"=>:sentence,
14
+ "type"=>:continuous, "default"=>0}, "features"=>[
15
+ {"proc_string"=>nil, "default"=>0, "name"=>:word_count, "proc"=>nil}],
16
+ "tags"=>[{"proc_string"=>"->(e) { e.parent_paragraph.word_count }",
17
+ "default"=>0, "name"=>:paragraph_length, "proc"=>nil}], "id"=>0}
18
+ end
19
+
20
+ describe "#initialize" do
21
+ context "when supplied with proper arguments" do
22
+ it "initializes the problem and gives access to parameters" do
23
+ problem = Treat::Learning::Problem.new(@question, @feature, @tag)
24
+ problem.question.should eql @question
25
+ problem.features.should eql [@feature]
26
+ problem.tags.should eql [@tag]
27
+ problem.feature_labels.should eql [@feature.name]
28
+ problem.tag_labels.should eql [@tag.name]
29
+ # ID ??? FIXME
30
+ end
31
+ end
32
+ context "when supplied with unacceptable arguments" do
33
+ it "raises an error" do
34
+ # First argument should be instance of Question.
35
+ expect { Treat::Learning::Problem.new('foo') }.to raise_error
36
+ # Arguments >= 2 should be instances of Export.
37
+ expect { Treat::Learning::Problem.new(@question, 'foo') }.to raise_error
38
+ # Should have at least one Feature in the arguments.
39
+ expect { Treat::Learning::Problem.new(@question, @tag) }.to raise_error
40
+ end
41
+ end
42
+ end
43
+
44
+ describe "#==(problem)" do
45
+ context "when supplied with an equal problem" do
46
+ it "should return true" do
47
+ Treat::Learning::Problem.new(@question, @feature).
48
+ should == Treat::Learning::Problem.new(@question, @feature)
49
+ Treat::Learning::Problem.new(@question, @feature, @tag).
50
+ should == Treat::Learning::Problem.new(@question, @feature, @tag)
51
+ end
52
+ end
53
+ context "when supplied with a different question" do
54
+ it "should return false" do
55
+ question = Treat::Learning::Question.new(:is_key_sentence, :sentence)
56
+ feature = Treat::Learning::Feature.new(:word_count, 999)
57
+ tag = Treat::Learning::Tag.new(:paragraph_length, 999)
58
+ Treat::Learning::Problem.new(@question, @feature).
59
+ should_not == Treat::Learning::Problem.new(question, @feature)
60
+ Treat::Learning::Problem.new(@question, @feature).
61
+ should_not == Treat::Learning::Problem.new(@question, feature)
62
+ Treat::Learning::Problem.new(@question, @feature, @tag).
63
+ should_not == Treat::Learning::Problem.new(@question, @feature, tag)
64
+ end
65
+ end
66
+ end
67
+
68
+ describe "#export_tags(entity)" do
69
+ context "when called on a problem that has tags" do
70
+ context "and called with an entity of the proper type" do
71
+ it "returns an array of the tags" do
72
+ problem = Treat::Learning::Problem.new(@question, @feature, @tag)
73
+ problem.export_tags(@sentence).should eql [11]
74
+ end
75
+ end
76
+ end
77
+ context "when called on a problem that doesn't have tags" do
78
+ it "raises an error" do
79
+ problem = Treat::Learning::Problem.new(@question, @feature)
80
+ expect { problem.export_tags(@sentence) }.to raise_error
81
+ end
82
+ end
83
+ end
84
+
85
+ describe "#export_features(entity, include_answer = true)" do
86
+
87
+ context "when called with an entity of the proper type" do
88
+ context "and include_answer is set to true" do
89
+ context "and the answer is already set on the entity" do
90
+ it "returns an array of the exported features, with the answer" do
91
+ problem = Treat::Learning::Problem.new(@question, @feature)
92
+ @sentence.set :is_key_sentence, 1
93
+ problem.export_features(@sentence).should eql [7, 1]
94
+ end
95
+ end
96
+ context "and the answer is not already set on the entity" do
97
+ it "returns an array of the exported features, with the question's default answer" do
98
+ problem = Treat::Learning::Problem.new(@question, @feature)
99
+ problem.export_features(@sentence).should eql [7, @question.default]
100
+ end
101
+ end
102
+ end
103
+ context "and include_answer is set to false" do
104
+ it "returns an array of the exported features, without the answer" do
105
+ problem = Treat::Learning::Problem.new(@question, @feature)
106
+ problem.export_features(@sentence, false).should eql [7]
107
+ end
108
+ end
109
+ end
110
+ context "when supplied with an entity that is not of the proper type" do
111
+ it "raises an error" do
112
+ problem = Treat::Learning::Problem.new(@question, @feature)
113
+ word = Treat::Entities::Word.new('test')
114
+ expect { problem.export_features(word) }.to raise_error
115
+ end
116
+ end
117
+ end
118
+
119
+ describe "#to_hash" do
120
+ context "when called on a problem" do
121
+ it "returns a hash describing the problem" do
122
+ Treat::Learning::Problem.new(@question, @feature, @tag).
123
+ to_hash.tap { |e| e['id'] = 0 }.should eql @hash
124
+ end
125
+ end
126
+ end
127
+
128
+ describe "#self.from_hash" do
129
+ context "when called with a hash describing a problem" do
130
+ it "returns a problem based on the hash" do
131
+ problem = Treat::Learning::Problem.from_hash(@hash)
132
+ problem.question.name.should eql :is_key_sentence
133
+ problem.question.target.should eql :sentence
134
+ problem.question.type.should eql :continuous
135
+ problem.question.default.should eql 0
136
+ problem.features[0].proc_string.should eql nil
137
+ problem.features[0].default.should eql 0
138
+ problem.features[0].name.should eql :word_count
139
+ problem.features[0].proc.should eql nil
140
+ end
141
+ end
142
+ end
143
+
144
+ end
@@ -0,0 +1,52 @@
1
+ describe Treat::Learning::Question do
2
+
3
+ describe "#initialize" do
4
+ context "when supplied with acceptable parameters" do
5
+ it "should give access to the parameters" do
6
+ question = Treat::Learning::Question.new(
7
+ :is_keyword, :word, 0, :continuous)
8
+ question.name.should eql :is_keyword
9
+ question.target.should eql :word
10
+ question.type.should eql :continuous
11
+ question.default.should eql 0
12
+ end
13
+ end
14
+ context "when supplied with wrong parameters" do
15
+ it "should raise an exception" do
16
+ # Name should be a symbol
17
+ expect { Treat::Learning::Question.new(
18
+ nil, :sentence) }.to raise_error
19
+ # Target should be an actual entity type
20
+ expect { Treat::Learning::Question.new(
21
+ :name, :foo) }.to raise_error
22
+ # Distribution type should be continuous or discrete
23
+ expect { Treat::Learning::Question.new(
24
+ :name, :sentence, 0, :nonsense) }.to raise_error
25
+ end
26
+ end
27
+ end
28
+
29
+ describe "#==(question)" do
30
+ context "when supplied with an equal question" do
31
+ it "should return true" do
32
+ Treat::Learning::Question.new(
33
+ :is_keyword, :word).
34
+ should == Treat::Learning::Question.new(
35
+ :is_keyword, :word)
36
+ end
37
+ end
38
+ context "when supplied with a different question" do
39
+ it "should return false" do
40
+ Treat::Learning::Question.new(
41
+ :is_keyword, :word).
42
+ should_not == Treat::Learning::Question.new(
43
+ :is_keyword, :sentence)
44
+ Treat::Learning::Question.new(
45
+ :is_keyword, :word, 0, :continuous).
46
+ should_not == Treat::Learning::Question.new(
47
+ :is_keyword, :word, 0, :discrete)
48
+ end
49
+ end
50
+ end
51
+
52
+ end