treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -2,7 +2,7 @@
2
2
  # problem as well as data for entities that
3
3
  # have already been classified, complete with
4
4
  # references to these entities.
5
- class Treat::Core::DataSet
5
+ class Treat::Learning::DataSet
6
6
 
7
7
  # The classification problem this
8
8
  # data set holds data for.
@@ -13,26 +13,26 @@ class Treat::Core::DataSet
13
13
 
14
14
  # Initialize the DataSet.
15
15
  def initialize(problem)
16
- unless problem.is_a?(Treat::Core::Problem)
16
+ unless problem.is_a?(Treat::Learning::Problem)
17
17
  raise Treat::Exception, "The first argument " +
18
18
  "to initialize should be an instance of " +
19
- "Treat::Core::Problem."
19
+ "Treat::Learning::Problem."
20
20
  end
21
21
  @problem, @items = problem, []
22
22
  end
23
23
 
24
24
  def self.build(from)
25
25
  if from.is_a?(Hash)
26
- Treat::Core::DataSet.unserialize(
26
+ Treat::Learning::DataSet.unserialize(
27
27
  Treat.databases.default.adapter, from)
28
28
  elsif from.is_a?(String)
29
29
  unless File.readable?(from)
30
30
  raise Treat::Exception,
31
- "Attempting to initialize data set from "
32
- "file #{from}, but it is not readable."
31
+ "Attempting to initialize data set from " +
32
+ "file '#{from}', but it is not readable."
33
33
  end
34
- Treat::Core::DataSet.unserialize(
35
- extension, file: from)
34
+ Treat::Learning::DataSet.unserialize(
35
+ File.extname(from)[1..-1], file: from)
36
36
  end
37
37
  end
38
38
 
@@ -92,7 +92,7 @@ class Treat::Core::DataSet
92
92
  next unless tag.proc_string
93
93
  tag.proc = eval(tag.proc_string)
94
94
  end
95
- data_set = Treat::Core::DataSet.new(problem)
95
+ data_set = Treat::Learning::DataSet.new(problem)
96
96
  data_set.items = items
97
97
  data_set
98
98
  end
@@ -131,7 +131,7 @@ class Treat::Core::DataSet
131
131
  raise Treat::Exception,
132
132
  "Couldn't retrieve problem ID #{options[:problem]}."
133
133
  end
134
- problem = Treat::Core::Problem.from_hash(p_record)
134
+ problem = Treat::Learning::Problem.from_hash(p_record)
135
135
  data = database.collection('data').find(options).to_a
136
136
  items = []
137
137
  data.each do |datum|
@@ -142,7 +142,7 @@ class Treat::Core::DataSet
142
142
  item[:id] = datum['id']
143
143
  items << item
144
144
  end
145
- data_set = Treat::Core::DataSet.new(problem)
145
+ data_set = Treat::Learning::DataSet.new(problem)
146
146
  data_set.items = items
147
147
  data_set
148
148
  end
@@ -1,6 +1,6 @@
1
1
  # Represents a feature to be used
2
2
  # in a classification task.
3
- class Treat::Core::Export
3
+ class Treat::Learning::Export
4
4
 
5
5
  # The name of the feature. If no
6
6
  # proc is supplied, this assumes
@@ -55,5 +55,5 @@ class Treat::Core::Export
55
55
 
56
56
  end
57
57
 
58
- class Treat::Core::Feature < Treat::Core::Export; end
59
- class Treat::Core::Tag < Treat::Core::Export; end
58
+ class Treat::Learning::Feature < Treat::Learning::Export; end
59
+ class Treat::Learning::Tag < Treat::Learning::Export; end
@@ -2,7 +2,7 @@
2
2
  # - What question are we trying to answer?
3
3
  # - What features are we going to look at
4
4
  # to attempt to answer that question?
5
- class Treat::Core::Problem
5
+ class Treat::Learning::Problem
6
6
 
7
7
  # A unique identifier for the problem.
8
8
  attr_accessor :id
@@ -20,21 +20,21 @@ class Treat::Core::Problem
20
20
  # Initialize the problem with a question
21
21
  # and an arbitrary number of features. # FIXME: init with id!?
22
22
  def initialize(question, *exports)
23
- unless question.is_a?(Treat::Core::Question)
23
+ unless question.is_a?(Treat::Learning::Question)
24
24
  raise Treat::Exception,
25
25
  "The first argument to initialize " +
26
26
  "should be an instance of " +
27
- "Treat::Core::Question."
27
+ "Treat::Learning::Question."
28
28
  end
29
- if exports.any? { |f| !f.is_a?(Treat::Core::Export) }
29
+ if exports.any? { |f| !f.is_a?(Treat::Learning::Export) }
30
30
  raise Treat::Exception,
31
31
  "The second argument and all subsequent ones " +
32
32
  "to initialize should be instances of subclasses " +
33
- "of Treat::Core::Export."
33
+ "of Treat::Learning::Export."
34
34
  end
35
35
  @question, @id = question, object_id
36
36
  @features = exports.select do |exp|
37
- exp.is_a?(Treat::Core::Feature)
37
+ exp.is_a?(Treat::Learning::Feature)
38
38
  end
39
39
  if @features.size == 0
40
40
  raise Treat::Exception,
@@ -42,7 +42,7 @@ class Treat::Core::Problem
42
42
  "one feature to work with."
43
43
  end
44
44
  @tags = exports.select do |exp|
45
- exp.is_a?(Treat::Core::Tag)
45
+ exp.is_a?(Treat::Learning::Tag)
46
46
  end
47
47
  @feature_labels = @features.map { |f| f.name }
48
48
  @tag_labels = @tags.map { |t| t.name }
@@ -63,7 +63,7 @@ class Treat::Core::Problem
63
63
  # all of the features.
64
64
  def export_features(e, include_answer = true)
65
65
  features = export(e, @features)
66
- return features unless include_answer
66
+ return features if !include_answer
67
67
  features << (e.has?(@question.name) ?
68
68
  e.get(@question.name) : @question.default)
69
69
  features
@@ -80,9 +80,11 @@ class Treat::Core::Problem
80
80
 
81
81
  def export(entity, exports)
82
82
  unless @question.target == entity.type
83
+ targ, type = @question.target, entity.type
83
84
  raise Treat::Exception,
84
- "This classification problem targets #{@question.target}s, " +
85
- "but a(n) #{entity.type} was passed to export instead."
85
+ "This classification problem targets " +
86
+ "#{targ}s, but a(n) #{type} " +
87
+ "was passed to export instead."
86
88
  end
87
89
  ret = []
88
90
  exports.each do |export|
@@ -103,28 +105,36 @@ class Treat::Core::Problem
103
105
  'id' => @id }
104
106
  end
105
107
 
108
+ def object_to_hash(obj)
109
+ hash = {}
110
+ obj.instance_variables.each do |var|
111
+ val = obj.instance_variable_get(var)
112
+ hash[var.to_s.delete("@")] = val
113
+ end
114
+ hash
115
+ end
116
+
106
117
  def self.from_hash(hash)
107
- question = Treat::Core::Question.new(
118
+ question = Treat::Learning::Question.new(
108
119
  hash['question']['name'],
109
120
  hash['question']['target'],
110
- hash['question']['type'],
111
121
  hash['question']['default'],
112
- hash['question']['labels']
122
+ hash['question']['type']
113
123
  )
114
124
  features = []
115
125
  hash['features'].each do |feature|
116
- features << Treat::Core::Feature.new(
126
+ features << Treat::Learning::Feature.new(
117
127
  feature['name'], feature['default'],
118
128
  feature['proc_string'])
119
129
  end
120
130
  tags = []
121
131
  hash['tags'].each do |tag|
122
- tags << Treat::Core::Tag.new(
132
+ tags << Treat::Learning::Tag.new(
123
133
  tag['name'], tag['default'],
124
134
  tag['proc_string'])
125
135
  end
126
136
  features_and_tags = features + tags
127
- p = Treat::Core::Problem.new(question, *features_and_tags)
137
+ p = Treat::Learning::Problem.new(question, *features_and_tags)
128
138
  p.id = hash['id']
129
139
  p
130
140
  end
@@ -1,6 +1,6 @@
1
1
  # Defines a question to answer in the
2
2
  # context of a classification problem.
3
- class Treat::Core::Question
3
+ class Treat::Learning::Question
4
4
 
5
5
  # Defines an arbitrary label for the
6
6
  # question we are trying to answer
@@ -16,12 +16,9 @@ class Treat::Core::Question
16
16
  attr_reader :type
17
17
  # Default for the answer to the question.
18
18
  attr_reader :default
19
- # A list of possible answers to the question.
20
- attr_reader :labels
21
19
 
22
20
  # Initialize the question.
23
- def initialize(name, target,
24
- type = :continuous, default = nil, labels = [])
21
+ def initialize(name, target, default = nil, type = :continuous)
25
22
  unless name.is_a?(Symbol)
26
23
  raise Treat::Exception,
27
24
  "Question name should be a symbol."
@@ -35,8 +32,8 @@ class Treat::Core::Question
35
32
  raise Treat::Exception, "Type should be " +
36
33
  "continuous or discrete."
37
34
  end
38
- @name, @target, @type, @default, @labels =
39
- name, target, type, default, labels
35
+ @name, @target, @type, @default =
36
+ name, target, type, default
40
37
  end
41
38
 
42
39
  # Custom comparison operator for questions.
@@ -44,8 +41,7 @@ class Treat::Core::Question
44
41
  @name == question.name &&
45
42
  @type == question.type &&
46
43
  @target == question.target &&
47
- @default == question.default &&
48
- @labels = question.labels
44
+ @default == question.default
49
45
  end
50
46
 
51
47
  end
@@ -1,10 +1,8 @@
1
1
  # A helper class to load a language class
2
- # registered with the Linguistics gem.
2
+ # registered with the Linguistics gem, for
3
+ # example Linguistics::EN.
3
4
  class Treat::Loaders::Linguistics
4
5
 
5
- # Linguistics throws warnings; silence them.
6
- silence_warnings { require 'linguistics' }
7
-
8
6
  # Linguistics classes for each language.
9
7
  @@languages = {}
10
8
 
@@ -13,16 +11,17 @@ class Treat::Loaders::Linguistics
13
11
  # if there is no such language class registered.
14
12
  def self.load(language)
15
13
  silence_warnings do
14
+ # Linguistics throws warnings; silence them.
15
+ silence_warnings { require 'linguistics' }
16
+ code = language.to_s[0..1].upcase
16
17
  @@languages[language] ||=
17
- ::Linguistics.const_get(
18
- language.to_s[0..1].upcase)
18
+ ::Linguistics.const_get(code)
19
19
  end
20
20
  return @@languages[language]
21
21
  rescue RuntimeError
22
22
  raise Treat::Exception,
23
- "Ruby Linguistics does " +
24
- "not have a module installed " +
25
- "for the #{language} language."
23
+ "Ruby Linguistics does not have a module " +
24
+ "installed for the #{language} language."
26
25
  end
27
26
 
28
27
  end
@@ -1,30 +1,24 @@
1
1
  # A helper class to load the CoreNLP package.
2
2
  class Treat::Loaders::Stanford
3
-
4
- require 'stanford-core-nlp'
5
3
 
4
+ # Keep track of whether it's loaded or not.
6
5
  @@loaded = false
7
6
 
8
7
  # Load CoreNLP package for a given language.
9
8
  def self.load(language = nil)
10
9
  return if @@loaded
10
+ require 'stanford-core-nlp'
11
11
  language ||= Treat.core.language.default
12
-
13
12
  StanfordCoreNLP.jar_path =
14
13
  Treat.libraries.stanford.jar_path ||
15
14
  Treat.paths.bin + 'stanford/'
16
-
17
15
  StanfordCoreNLP.model_path =
18
16
  Treat.libraries.stanford.model_path ||
19
17
  Treat.paths.models + 'stanford/'
20
-
21
18
  StanfordCoreNLP.use(language)
22
- if Treat.core.verbosity.silence
23
- StanfordCoreNLP.log_file = NULL_DEVICE
24
- end
25
-
26
- StanfordCoreNLP.bind
27
- @@loaded = true
19
+ StanfordCoreNLP.log_file = '/dev/null' if
20
+ Treat.core.verbosity.silence
21
+ StanfordCoreNLP.bind; @@loaded = true
28
22
  end
29
23
 
30
24
  end
@@ -0,0 +1,33 @@
1
+ module Treat
2
+
3
+ # Contains common utility/helper functions.
4
+ module Helpers; include Autoload; end
5
+
6
+ # Contains all the configuration options.
7
+ module Config; include Autoload; end
8
+
9
+ # Import all the configuration options.
10
+ Treat::Config.import!
11
+
12
+ # Contains classes to load external libraries.
13
+ module Loaders; include Autoload; end
14
+
15
+ # Contains machine learning core classes.
16
+ module Learning; include Autoload; end
17
+
18
+ # Contains the document object models.
19
+ module Entities; include Autoload; end
20
+
21
+ # Contains all the worker categories.
22
+ module Workers; include Autoload; end
23
+
24
+ # Make all the worker categories.
25
+ Treat::Workers.categorize!
26
+
27
+ # Installs builders on core Ruby objects.
28
+ module Proxies; include Autoload; end
29
+
30
+ # Core classes (installer, server, etc.)
31
+ module Core; include Autoload; end
32
+
33
+ end
@@ -0,0 +1,27 @@
1
+ module Treat::Proxies
2
+
3
+ module Array
4
+ # Include base proxy functionality.
5
+ include Treat::Proxies::Proxy
6
+ def method_missing(sym, *args, &block)
7
+ if [:do, :apply].include?(sym) ||
8
+ Treat::Workers.lookup(sym)
9
+ map do |el|
10
+ if el.is_a?(Treat::Entities::Entity)
11
+ el.send(sym, *args)
12
+ else
13
+ el.to_entity.send(sym, *args)
14
+ end
15
+ end
16
+ else
17
+ super(sym, *args, &block)
18
+ end
19
+ end
20
+ end
21
+
22
+ # Include Treat methods on arrays.
23
+ ::Array.class_eval do
24
+ include Treat::Proxies::Array
25
+ end
26
+
27
+ end
@@ -0,0 +1,47 @@
1
+ module Treat::Proxies
2
+
3
+ # This is kind of ugly; need to find a
4
+ # better solution eventually (?)
5
+ Treat::Entities::Entity.class_eval do
6
+
7
+ # Rename the true language detection
8
+ # method to :language_proxied, and
9
+ # only call it if language detection
10
+ # is turned on in the configuration.
11
+ alias :language_proxied :language
12
+
13
+ # Proxy the #language method, defined on
14
+ # all textual entities, in order to catch
15
+ # the method call if language detection is
16
+ # turned off and return the default language
17
+ # in that case.
18
+ def language(extractor = nil, options = {})
19
+
20
+ return Treat.core.language.default if
21
+ !Treat.core.language.detect
22
+
23
+ if is_a?(Treat::Entities::Symbol) ||
24
+ is_a?(Treat::Entities::Number) ||
25
+ is_a?(Treat::Entities::Punctuation)
26
+ return Treat.core.language.default
27
+ end
28
+
29
+ dlvl = Treat.core.language.detect_at
30
+ dklass = Treat::Entities.const_get(dlvl.cc)
31
+
32
+ if self.class.compare_with(dklass) < 1
33
+ anc = ancestor_with_type(dlvl)
34
+ return anc.language if anc
35
+ return self.parent.language if has_parent?
36
+ end
37
+
38
+ extractor ||= Treat.workers.
39
+ extractors.language.default
40
+
41
+ language_proxied(extractor, options)
42
+
43
+ end
44
+
45
+ end
46
+
47
+ end
@@ -0,0 +1,18 @@
1
+ module Treat::Proxies
2
+
3
+ # Install Treat functions on Numeric objects.
4
+ module Numeric
5
+ # Include base proxy functionality.
6
+ include Treat::Proxies::Proxy
7
+ # Return the entity corresponding to the number.
8
+ def to_entity(builder = nil)
9
+ Treat::Entities::Number.from_numeric(self)
10
+ end
11
+ end
12
+
13
+ # Include Treat methods on numerics.
14
+ ::Numeric.class_eval do
15
+ include Treat::Proxies::Numeric
16
+ end
17
+
18
+ end
@@ -0,0 +1,25 @@
1
+ # Proxies install builders on core Ruby objects;
2
+ # when a method defined by Treat is called on these
3
+ # objects, the Ruby object is cast to a Treat entity
4
+ # and the method is called on the resultant type.
5
+ module Treat::Proxies
6
+
7
+ # Provides a base functionality for proxies.
8
+ module Proxy
9
+ # Build the entity corresponding to the proxied
10
+ # object and send the method call to the entity.
11
+ def method_missing(sym, *args, &block)
12
+ if [:do, :apply].include?(sym) ||
13
+ Treat::Workers.lookup(sym)
14
+ to_entity.send(sym, *args)
15
+ else
16
+ super(sym, *args, &block)
17
+ end
18
+ end
19
+ # Create an unknown type of entity by default.
20
+ def to_entity(builder = nil)
21
+ Treat::Entities::Unknown(self.to_s)
22
+ end
23
+ end
24
+
25
+ end
@@ -0,0 +1,18 @@
1
+ module Treat::Proxies
2
+
3
+ # Install Treat functions on String objects.
4
+ module String
5
+ # Include base proxy functionality.
6
+ include Treat::Proxies::Proxy
7
+ # Return the entity corresponding to the string.
8
+ def to_entity
9
+ Treat::Entities::Entity.from_string(self)
10
+ end
11
+ end
12
+
13
+ # Include Treat methods on strings.
14
+ ::String.class_eval do
15
+ include Treat::Proxies::String
16
+ end
17
+
18
+ end
data/lib/treat/version.rb CHANGED
@@ -1,3 +1,12 @@
1
1
  module Treat
2
- VERSION = "1.2.0"
2
+
3
+ # The current version of Treat.
4
+ VERSION = "2.0.0rc1"
5
+
6
+ # Treat requires Ruby >= 1.9.2
7
+ if RUBY_VERSION < '1.9.2'
8
+ raise "Treat requires Ruby version 1.9.2 " +
9
+ "or higher, but current is #{RUBY_VERSION}."
10
+ end
11
+
3
12
  end
@@ -1,27 +1,27 @@
1
1
  # This module creates all the worker categories
2
2
  # and the groups within these categories and adds
3
3
  # the relevant hooks on the appropriate entities.
4
- module Treat::Workers
4
+ module Treat::Workers::Categorizable
5
5
 
6
- require 'treat/workers/group'
6
+ require_relative 'groupable'
7
7
 
8
8
  # A lookup table for entity types.
9
9
  @@lookup = {}
10
10
 
11
11
  # Find a worker group based on method.
12
- def self.lookup(method)
12
+ def lookup(method)
13
13
  @@lookup[method]
14
14
  end
15
15
 
16
- def self.create_categories
17
- Treat.workers.list.each do |cat|
18
- create_category(cat.to_s.
16
+ def categorize!
17
+ Treat.workers.members.each do |cat|
18
+ create_category(cat.
19
19
  capitalize.intern,
20
20
  load_category_conf(cat))
21
21
  end
22
22
  end
23
23
 
24
- def self.load_category_conf(name)
24
+ def load_category_conf(name)
25
25
  config = Treat.workers[name]
26
26
  if config.nil?
27
27
  raise Treat::Exception,
@@ -31,10 +31,11 @@ module Treat::Workers
31
31
  config
32
32
  end
33
33
 
34
- def self.create_category(name, conf)
35
- category = self.const_set(name, Module.new)
34
+ def create_category(name, conf)
35
+ category = Treat::Workers.
36
+ const_set(name, Module.new)
36
37
  conf.each_pair do |group, worker|
37
- name = cc(group.to_s).intern
38
+ name = group.to_s.cc.intern
38
39
  category.module_eval do
39
40
  @@methods = []; def methods;
40
41
  @@methods; end; def groups;
@@ -44,7 +45,7 @@ module Treat::Workers
44
45
  end
45
46
  end
46
47
 
47
- def self.create_group(name, conf, category)
48
+ def create_group(name, conf, category)
48
49
  group = category.const_set(name, Module.new)
49
50
  self.set_group_options(group, conf)
50
51
  self.bind_group_targets(group)
@@ -53,17 +54,17 @@ module Treat::Workers
53
54
  @@lookup[group.method] = group
54
55
  end
55
56
 
56
- def self.bind_group_targets(group)
57
+ def bind_group_targets(group)
57
58
  group.targets.each do |entity_type|
58
59
  entity = Treat::Entities.
59
- const_get(cc(entity_type))
60
+ const_get(entity_type.cc)
60
61
  entity.class_eval do
61
62
  add_workers group
62
63
  end
63
64
  end
64
65
  end
65
66
 
66
- def self.register_group_presets(group, conf)
67
+ def register_group_presets(group, conf)
67
68
  return unless conf.respond_to? :presets
68
69
  conf.presets.each do |m|
69
70
  @@methods << m
@@ -71,9 +72,9 @@ module Treat::Workers
71
72
  end
72
73
  end
73
74
 
74
- def self.set_group_options(group, conf)
75
+ def set_group_options(group, conf)
75
76
  group.module_eval do
76
- extend Treat::Workers::Group
77
+ extend Treat::Workers::Groupable
77
78
  self.type = conf.type
78
79
  self.targets = conf.targets
79
80
  if conf.respond_to?(:default)
@@ -90,7 +91,5 @@ module Treat::Workers
90
91
  end
91
92
  end
92
93
  end
93
-
94
- self.create_categories
95
-
94
+
96
95
  end
@@ -1,6 +1,6 @@
1
- # This retrieves a supplied number of keywords
2
- # by selecting the N words with the highest TF*IDF
3
- # for each document.
1
+ # Extracts an arbitrary number of keywords from a
2
+ # document in a collection by selecting its N words
3
+ # with the highest TF*IDF score.
4
4
  class Treat::Workers::Extractors::Keywords::TfIdf
5
5
 
6
6
  # Default options - retrieve 5 keywords.
@@ -8,31 +8,31 @@ class Treat::Workers::Extractors::Keywords::TfIdf
8
8
 
9
9
  # Annotate a document with an array containing
10
10
  # the N words with the highest TF*IDF in that
11
- # document,
11
+ # document.
12
12
  def self.keywords(entity, options = {})
13
13
 
14
14
  options = DefaultOptions.merge(options)
15
15
  tf_idfs = {}
16
16
 
17
17
  entity.each_word do |word|
18
- tf_idfs[word] ||= word.tf_idf
18
+ tf_idf = word.tf_idf
19
+ if tf_idf
20
+ tf_idfs[word] ||= tf_idf
21
+ end
19
22
  end
20
23
 
21
24
  tf_idfs = tf_idfs.
22
25
  sort_by {|k,v| v}.reverse
23
-
24
- if tf_idfs.size <= options[:number]
25
- return tf_idfs
26
- end
27
-
26
+
28
27
  keywords = []
29
28
  i = 0
29
+ max_count = tf_idfs.size < options[:number] ? tf_idfs.size : options[:number]
30
30
 
31
31
  tf_idfs.each do |word|
32
32
 
33
33
  w = word[0].to_s
34
34
  next if keywords.include?(w)
35
- break if i > options[:number]
35
+ break if i > max_count
36
36
  keywords << w
37
37
 
38
38
  i += 1
@@ -1,9 +1,11 @@
1
- # Adaptor for the 'whatlanguage' gem, which
2
- # performs probabilistic language detection.
3
- # The library works by checking for the presence
4
- # of words with bloom filters built from
5
- # dictionaries based upon each source language.
6
- module Treat::Workers::Extractors::Language::WhatLanguage
1
+ # Language detection using a probabilistic algorithm
2
+ # that checks for the presence of words with Bloom
3
+ # filters built from dictionaries for each language.
4
+ #
5
+ # Original paper: Grothoff. 2007. A Quick Introduction to
6
+ # Bloom Filters. Department of Computer Sciences, Purdue
7
+ # University.
8
+ class Treat::Workers::Extractors::Language::WhatLanguage
7
9
 
8
10
  # Require the 'whatlanguage' gem.
9
11
  silence_warnings { require 'whatlanguage' }