treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -0,0 +1,101 @@
1
+ module Treat::Entities
2
+
3
+ # * Collection and document classes * #
4
+
5
+ # Represents a collection.
6
+ class Collection < Entity; end
7
+
8
+ # Represents a document.
9
+ class Document < Entity; end
10
+
11
+ # * Sections and related classes * #
12
+
13
+ # Represents a section.
14
+ class Section < Entity; end
15
+
16
+ # Represents a page of text.
17
+ class Page < Section; end
18
+
19
+ # Represents a block of text.
20
+ class Block < Section; end
21
+
22
+ # Represents a list.
23
+ class List < Section; end
24
+
25
+ # * Zones and related classes * #
26
+
27
+ # Represents a zone of text.
28
+ class Zone < Entity; end
29
+
30
+ # Represents a title, subtitle,
31
+ # or logical header of a text.
32
+ class Title < Zone; end
33
+
34
+ # Represents a paragraph (group
35
+ # of sentences and/or phrases).
36
+ class Paragraph < Zone; end
37
+
38
+ # * Groups and related classes * #
39
+
40
+ # Represents a group of tokens.
41
+ class Group < Entity; end
42
+
43
+ # Represents a group of words
44
+ # with a sentence ender (.!?)
45
+ class Sentence < Group; end
46
+
47
+ # Represents a group of words,
48
+ # with no sentence ender.
49
+ class Phrase < Group; end
50
+
51
+ # Represents a non-linguistic
52
+ # fragment (e.g. stray symbols).
53
+ class Fragment < Group; end
54
+
55
+ # * Tokens and related classes * #
56
+
57
+ # Represents a terminal element
58
+ # (leaf) in the text structure.
59
+ class Token < Entity; end
60
+
61
+ # Represents a word. Strictly,
62
+ # this is /^[[:alpha:]\-']+$/.
63
+ class Word < Token; end
64
+
65
+ # Represents an enclitic.
66
+ # Strictly, this is any of
67
+ # 'll 'm 're 's 't or 've.
68
+ class Enclitic < Token; end
69
+
70
+ # Represents a number. Strictly,
71
+ # this is /^#?([0-9]+)(\.[0-9]+)?$/.
72
+ class Number < Token
73
+ def to_i; to_s.to_i; end
74
+ def to_f; to_s.to_f; end
75
+ end
76
+
77
+ # Represents a punctuation sign.
78
+ # Strictly, this is /^[[:punct:]\$]+$/.
79
+ class Punctuation < Token; end
80
+
81
+ # Represents a character that is neither
82
+ # a word, an enclitic, a number nor a
83
+ # punctuation character (e.g. @#$%&*).
84
+ class Symbol < Token; end
85
+
86
+ # Represents a url. This is (imperfectly)
87
+ # defined as /^(http|https):\/\/[a-z0-9]
88
+ # +([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}
89
+ # (([0-9]{1,5})?\/.*)?$/ix
90
+ class Url < Token; end
91
+
92
+ # Represents a valid RFC822 address.
93
+ # This is (imperfectly) defined as
94
+ # /.+\@.+\..+/ (fixme maybe?)
95
+ class Email < Token; end
96
+
97
+ # Represents a token whose type
98
+ # cannot be identified.
99
+ class Unknown; end
100
+
101
+ end
@@ -1,8 +1,8 @@
1
1
  # Implement support for the functions #do and #do_task.
2
- module Treat::Entities::Abilities::Doable
2
+ module Treat::Entities::Entity::Applicable
3
3
 
4
4
  # Perform the supplied tasks on the entity.
5
- def do(*tasks)
5
+ def apply(*tasks)
6
6
  tasks.each do |task|
7
7
 
8
8
  if task.is_a?(Hash)
@@ -25,6 +25,8 @@ module Treat::Entities::Abilities::Doable
25
25
  end
26
26
  self
27
27
  end
28
+
29
+ alias :do :apply
28
30
 
29
31
  # Perform an individual task on an entity
30
32
  # given a worker and options to pass to it.
@@ -33,7 +35,7 @@ module Treat::Entities::Abilities::Doable
33
35
  entity_types = group.targets
34
36
  f = nil
35
37
  entity_types.each do |t|
36
- f = true if is_a?(Treat::Entities.const_get(cc(t)))
38
+ f = true if is_a?(Treat::Entities.const_get(t.cc))
37
39
  end
38
40
  if f || entity_types.include?(:entity)
39
41
  send(task, worker, options)
@@ -3,7 +3,7 @@
3
3
  # a string or a numeric object. This class
4
4
  # is pretty much self-explanatory.
5
5
  # FIXME how can we make this language independent?
6
- module Treat::Entities::Abilities::Buildable
6
+ module Treat::Entities::Entity::Buildable
7
7
 
8
8
  require 'schiphol'
9
9
  require 'fileutils'
@@ -23,23 +23,40 @@ module Treat::Entities::Abilities::Buildable
23
23
  # Build an entity from anything (can be
24
24
  # a string, numeric,folder, or file name
25
25
  # representing a raw or serialized file).
26
- def build(file_or_value, options = {})
26
+ def build(*args)
27
+
28
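+ # Normalize args: none -> '', a Hash or single value -> used as is (a lone Entity is wrapped in an array), several -> the args array.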
29
+ if args.size == 0
30
+ file_or_value = ''
31
+ elsif args[0].is_a?(Hash)
32
+ file_or_value = args[0]
33
+ elsif args.size == 1
34
+ if args[0].is_a?(Treat::Entities::Entity)
35
+ args[0] = [args[0]]
36
+ end
37
+ file_or_value = args[0]
38
+ else
39
+ file_or_value = args
40
+ end
27
41
 
28
42
  fv = file_or_value.to_s
29
43
 
30
- if file_or_value.is_a?(Hash)
44
+ if fv == ''; self.new
45
+ elsif file_or_value.is_a?(Array)
46
+ from_array(file_or_value)
47
+ elsif file_or_value.is_a?(Hash)
31
48
  from_db(file_or_value)
32
49
  elsif self == Treat::Entities::Document ||
33
50
  (fv.index('yml') || fv.index('yaml') ||
34
51
  fv.index('xml') || fv.index('mongo'))
35
52
  if fv =~ UriRegexp
36
- from_url(fv, options)
53
+ from_url(fv)
37
54
  else
38
- from_file(fv, options)
55
+ from_file(fv)
39
56
  end
40
57
  elsif self == Treat::Entities::Collection
41
58
  if FileTest.directory?(fv)
42
- from_folder(fv, options)
59
+ from_folder(fv)
43
60
  else
44
61
  create_collection(fv)
45
62
  end
@@ -63,27 +80,35 @@ module Treat::Entities::Abilities::Buildable
63
80
  # is user-created (i.e. by calling build
64
81
  # instead of from_string directly).
65
82
  def from_string(string, enforce_type = false)
66
-
83
+ # If calling using the build syntax (i.e. user-
84
+ # called), enforce the type that was supplied.
67
85
  enforce_type = true if caller_method == :build
68
-
69
86
  unless self == Treat::Entities::Entity
70
87
  return self.new(string) if enforce_type
71
88
  end
72
-
73
89
  e = anything_from_string(string)
74
-
75
90
  if enforce_type && !e.is_a?(self)
76
- raise "Asked to build a #{cl(self).downcase} "+
91
+ raise "Asked to build a #{self.mn.downcase} "+
77
92
  "from \"#{string}\" and to enforce type, "+
78
- "but type detected was #{cl(e.class).downcase}."
93
+ "but type detected was #{e.class.mn.downcase}."
79
94
  end
80
-
81
95
  e
82
-
96
+ end
97
+
98
+ # Build a document from an array
99
+ # of builders.
100
+ def from_array(array)
101
+ obj = self.new
102
+ array.each do |el|
103
+ el = el.to_entity unless el.
104
+ is_a?(Treat::Entities::Entity)
105
+ obj << el
106
+ end
107
+ obj
83
108
  end
84
109
 
85
110
  # Build a document from a URL.
86
- def from_url(url, options)
111
+ def from_url(url)
87
112
  unless self ==
88
113
  Treat::Entities::Document
89
114
  raise Treat::Exception,
@@ -91,16 +116,22 @@ module Treat::Entities::Abilities::Buildable
91
116
  'else than a document from a url.'
92
117
  end
93
118
 
94
- f = Schiphol.download(url,
95
- :download_folder => Treat.paths.files,
96
- :show_progress => Treat.core.verbosity.silence,
97
- :rectify_extensions => true,
98
- :max_tries => 3
99
- )
100
-
101
- options[:default_to] ||= 'html'
119
+ begin
120
+ folder = Treat.paths.files
121
+ if folder[-1] == '/'
122
+ folder = folder[0..-2]
123
+ end
124
+ f = Schiphol.download(url,
125
+ download_folder: folder,
126
+ show_progress: !Treat.core.verbosity.silence,
127
+ rectify_extensions: true,
128
+ max_tries: 3)
129
+ rescue
130
+ raise Treat::Exception,
131
+ "Couldn't download file at #{url}."
132
+ end
102
133
 
103
- e = from_file(f, options)
134
+ e = from_file(f,'html')
104
135
  e.set :url, url.to_s
105
136
  e
106
137
 
@@ -123,7 +154,7 @@ module Treat::Entities::Abilities::Buildable
123
154
 
124
155
  # Build an entity from a folder with documents.
125
156
  # Folders will be searched recursively.
126
- def from_folder(folder, options)
157
+ def from_folder(folder)
127
158
 
128
159
  return if Reserved.include?(folder)
129
160
 
@@ -148,40 +179,48 @@ module Treat::Entities::Abilities::Buildable
148
179
 
149
180
  c = Treat::Entities::Collection.new(folder)
150
181
  folder += '/' unless folder[-1] == '/'
151
-
182
+
183
+ if !FileTest.directory?(folder)
184
+ FileUtils.mkdir(folder)
185
+ end
186
+
187
+ c.set :folder, folder
188
+ i = folder + '/.index'
189
+ c.set :index, i if FileTest.directory?(i)
190
+
152
191
  Dir[folder + '*'].each do |f|
153
192
  if FileTest.directory?(f)
154
193
  c2 = Treat::Entities::Collection.
155
- from_folder(f, options)
194
+ from_folder(f)
156
195
  c.<<(c2, false) if c2
157
196
  else
158
197
  c.<<(Treat::Entities::Document.
159
- from_file(f, options), false)
198
+ from_file(f), false)
160
199
  end
161
200
  end
162
- c
201
+
202
+ return c
163
203
 
164
204
  end
165
205
 
166
206
  # Build a document from a raw or serialized file.
167
- def from_file(file, options)
207
+ def from_file(file,def_fmt=nil)
168
208
 
169
209
  if file.index('yml') ||
170
210
  file.index('yaml') ||
171
211
  file.index('xml') ||
172
212
  file.index('mongo')
173
- from_serialized_file(file, options)
213
+ from_serialized_file(file)
174
214
  else
175
- fmt = Treat::Workers::Formatters::Readers::Autoselect.
176
- detect_format(file, options[:default_to])
177
- options[:_format] = fmt
178
- from_raw_file(file, options)
215
+ fmt = Treat::Workers::Formatters::
216
+ Readers::Autoselect.detect_format(file,def_fmt)
217
+ from_raw_file(file, fmt)
179
218
  end
180
219
 
181
220
  end
182
221
 
183
222
  # Build a document from a raw file.
184
- def from_raw_file(file, options)
223
+ def from_raw_file(file, def_fmt='txt')
185
224
 
186
225
  unless self ==
187
226
  Treat::Entities::Document
@@ -195,31 +234,36 @@ module Treat::Entities::Abilities::Buildable
195
234
  "Path '#{file}' does not "+
196
235
  "point to a readable file."
197
236
  end
198
-
199
- d = Treat::Entities::Document.new(file)
200
-
237
+ options = {default_format: def_fmt}
238
+ d = Treat::Entities::Document.new
239
+ d.set :file, file
201
240
  d.read(:autoselect, options)
202
241
 
203
242
  end
204
243
 
205
244
  # Build an entity from a serialized file.
206
- def from_serialized_file(file, options)
207
-
208
- if file.index('mongo')
209
- options[:id] = file.scan( # Consolidate this
210
- /([0-9]+)\.mongo/).first.first
211
- from_db(:mongo, options)
245
+ def from_serialized_file(file)
246
+
247
+ unless File.readable?(file)
248
+ raise Treat::Exception,
249
+ "Path '#{file}' does not "+
250
+ "point to a readable file."
251
+ end
252
+ doc = Treat::Entities::Document.new
253
+ doc.set :file, file
254
+ format = nil
255
+ if file.index('yml') ||
256
+ file.index('yaml')
257
+ format = :yaml
258
+ elsif file.index('xml')
259
+ f = :xml
212
260
  else
213
- unless File.readable?(file)
214
- raise Treat::Exception,
215
- "Path '#{file}' does not "+
216
- "point to a readable file."
217
- end
218
- d = Treat::Entities::Document.new(file)
219
- d.unserialize(:autoselect, options)
220
- d.children[0].set_as_root! # Fix this
221
- d.children[0]
261
+ raise Treat::Exception,
262
+ "Unreadable serialized format for #{file}."
222
263
  end
264
+ doc.unserialize(format)
265
+ doc.children[0].set_as_root! # Fix this
266
+ doc.children[0]
223
267
 
224
268
  end
225
269
 
@@ -238,15 +282,28 @@ module Treat::Entities::Abilities::Buildable
238
282
 
239
283
  # Build any kind of entity from a string.
240
284
  def anything_from_string(string)
285
+ case self.mn.downcase.intern
286
+ when :document
287
+ folder = Treat.paths.files
288
+ if folder[-1] == '/'
289
+ folder = folder[0..-2]
290
+ end
291
+
292
+ now = Time.now.to_f
293
+ doc_file = folder+ "/#{now}.txt"
294
+ string.force_encoding('UTF-8')
295
+ File.open(doc_file, 'w') do |f|
296
+ f.puts string
297
+ end
241
298
 
242
- case cl(self).downcase.intern
243
- when :document, :collection
299
+ from_raw_file(doc_file)
300
+ when :collection
244
301
  raise Treat::Exception,
245
- "Cannot create a document or " +
302
+ "Cannot create a " +
246
303
  "collection from a string " +
247
304
  "(need a readable file/folder)."
248
305
  when :phrase
249
- sentence_or_phrase_from_string(string)
306
+ group_from_string(string)
250
307
  when :token
251
308
  token_from_string(string)
252
309
  when :zone
@@ -258,7 +315,7 @@ module Treat::Entities::Abilities::Buildable
258
315
  if string.gsub(/[\.\!\?]+/,
259
316
  '.').count('.') <= 1 &&
260
317
  string.count("\n") == 0
261
- sentence_or_phrase_from_string(string)
318
+ group_from_string(string)
262
319
  else
263
320
  zone_from_string(string)
264
321
  end
@@ -269,15 +326,14 @@ module Treat::Entities::Abilities::Buildable
269
326
 
270
327
  end
271
328
 
329
+ # This should be improved on.
272
330
  def check_encoding(string)
273
331
  string.encode("UTF-8", undef: :replace) # Fix
274
332
  end
275
333
 
276
334
  # Build a phrase from a string.
277
- def sentence_or_phrase_from_string(string)
278
-
335
+ def group_from_string(string)
279
336
  check_encoding(string)
280
-
281
337
  if !(string =~ /[a-zA-Z]+/)
282
338
  Treat::Entities::Fragment.new(string)
283
339
  elsif string.count('.!?') >= 1
@@ -285,7 +341,6 @@ module Treat::Entities::Abilities::Buildable
285
341
  else
286
342
  Treat::Entities::Phrase.new(string)
287
343
  end
288
-
289
344
  end
290
345
 
291
346
  # Build the right type of token
@@ -1,7 +1,7 @@
1
1
  # This module implements methods that are used
2
2
  # by workers to determine if an entity is properly
3
3
  # formatted before working on it.
4
- module Treat::Entities::Abilities::Checkable
4
+ module Treat::Entities::Entity::Checkable
5
5
 
6
6
  # Check if the entity has the given feature,
7
7
  # and if so return it. If not, calculate the
@@ -15,7 +15,7 @@ module Treat::Entities::Abilities::Checkable
15
15
  g2 = Treat::Workers.lookup(feature)
16
16
 
17
17
  raise Treat::Exception,
18
- "#{g1.type.to_s.capitalize} #{task} " +
18
+ "#{g1.type.to_s.capitalize} " +
19
19
  "requires #{g2.type} #{g2.method}."
20
20
  end
21
21
 
@@ -1,21 +1,21 @@
1
- module Treat::Entities::Abilities::Comparable
1
+ # Allow comparison of entity hierarchy in DOM.
2
+ module Treat::Entities::Entity::Comparable
2
3
 
4
+ # Determines whether the receiving class
5
+ # is smaller, equal or greater in the DOM
6
+ # hierarchy compared to the supplied one.
3
7
  def compare_with(klass)
4
-
5
8
  i = 0; rank_a = nil; rank_b = nil
6
-
7
9
  Treat.core.entities.order.each do |type|
8
- klass2 = Treat::Entities.const_get(cc(type))
10
+ klass2 = Treat::Entities.const_get(type.cc)
9
11
  rank_a = i if self <= klass2
10
12
  rank_b = i if klass <= klass2
11
13
  next if rank_a && rank_b
12
14
  i += 1
13
15
  end
14
-
15
16
  return -1 if rank_a < rank_b
16
17
  return 0 if rank_a == rank_b
17
18
  return 1 if rank_a > rank_b
18
-
19
19
  end
20
20
 
21
21
  end
@@ -1,4 +1,4 @@
1
- module Treat::Entities::Abilities::Countable
1
+ module Treat::Entities::Entity::Countable
2
2
 
3
3
  # Find the position of the current entity
4
4
  # inside the parent entity, starting at 1.
@@ -41,6 +41,7 @@ module Treat::Entities::Abilities::Countable
41
41
  # Returns the frequency of the given value
42
42
  # in this entity.
43
43
  def frequency_of(value)
44
+ value = value.downcase
44
45
  if is_a?(Treat::Entities::Token)
45
46
  raise Treat::Exception,
46
47
  "Cannot get the frequency " +
@@ -0,0 +1,86 @@
1
+ # When Treat.debug is set to true, each call to
2
+ # #call_worker will result in a debug message being
3
+ # printed by the #print_debug function.
4
+ module Treat::Entities::Entity::Debuggable
5
+
6
+ # Previous state and counter.
7
+ @@prev, @@i = nil, 0
8
+
9
+ # Explains what Treat is currently doing.
10
+ # Fixme: last call will never get shown.
11
+ def print_debug(entity, task, worker, group, options)
12
+ # Get a list of the worker's targets.
13
+ targets = group.targets.map(&:to_s)
14
+
15
+ # List the worker's targets as either
16
+ # a single target or an and/or form
17
+ # (since it would be too costly to
18
+ # actually determine what target types
19
+ # were processed at runtime for each call).
20
+ t = targets.size == 1 ? targets[0] : targets[
21
+ 0..-2].join(', ') + ' and/or ' + targets[-1]
22
+
23
+ # Add genitive for annotations (sing./plural)
24
+ genitive = targets.size > 1 ? 'their' : 'its'
25
+
26
+ # Set up an empty string and humanize task name.
27
+ doing, human_task = '', task.to_s.gsub('_', ' ')
28
+
29
+ # Base is "{task}-ed {a(n)|N} {target(s)}"
30
+ if [:transformer, :computer].include?(group.type)
31
+ tt = human_task
32
+ tt = tt[0..-2] if tt[-1] == 'e'
33
+ ed = tt[-1] == 'd' ? '' : 'ed'
34
+ doing = "#{tt.capitalize}#{ed} #{t}"
35
+ # Base is "Annotated {a(n)|N} {target(s)}"
36
+ elsif group.type == :annotator
37
+ if group.preset_option
38
+ opt = options[group.preset_option]
39
+ form = opt.to_s.gsub('_', ' ')
40
+ human_task[-1] = ''
41
+ human_task = form + ' ' + human_task
42
+ end
43
+ doing = "Annotated #{t} with " +
44
+ "#{genitive} #{human_task}"
45
+ end
46
+
47
+ # Form is '{base} in format {worker}'.
48
+ if group.to_s.index('Formatters')
49
+ curr = doing + ' in format ' + worker.to_s
50
+ # Form is '{base} using {worker}'.
51
+ else
52
+ curr = doing + ' using ' + worker.to_s.gsub('_', ' ')
53
+ end
54
+
55
+ # Remove any double pluralization that may happen.
56
+ curr.gsub!('ss', 's') unless curr.index('class')
57
+
58
+ # Accumulate repeated tasks.
59
+ @@i += 1 if curr == @@prev
60
+
61
+ # Change tasks, so output.
62
+ if curr != @@prev && @@prev
63
+ # Pluralize entity names if necessary.
64
+ if @@i > 1
65
+ Treat.core.entities.list.each do |e|
66
+ @@prev.gsub!(e.to_s, e.to_s + 's')
67
+ end
68
+ @@prev.gsub!('its', 'their')
69
+ @@prev = @@prev.split(' ').
70
+ insert(1, @@i.to_s).join(' ')
71
+ # Add determiner if singular.
72
+ else
73
+ @@prev = @@prev.split(' ').
74
+ insert(1, 'a').join(' ')
75
+ end
76
+ # Reset counter.
77
+ @@i = 0
78
+ # Write to stdout.
79
+ puts @@prev + '.'
80
+ end
81
+
82
+ @@prev = curr
83
+
84
+ end
85
+
86
+ end
@@ -1,7 +1,7 @@
1
1
  # Makes a class delegatable, allowing calls
2
2
  # on it to be forwarded to a worker class
3
3
  # able to perform the appropriate task.
4
- module Treat::Entities::Abilities::Delegatable
4
+ module Treat::Entities::Entity::Delegatable
5
5
 
6
6
  # Add preset methods to an entity class.
7
7
  def add_presets(group)
@@ -10,27 +10,25 @@ module Treat::Entities::Abilities::Delegatable
10
10
  return unless opt
11
11
 
12
12
  self.class_eval do
13
- group.presets.each do |preset|
14
- define_method(preset) do |worker=nil, options={}|
15
- return get(preset) if has?(preset)
16
- options = {opt => preset}.merge(options)
17
- m = group.method
18
- send(m, worker, options)
19
- f = unset(m)
20
- features[preset] = f if f
13
+ group.presets.each do |preset|
14
+ define_method(preset) do |worker=nil, options={}|
15
+ return get(preset) if has?(preset)
16
+ options = {opt => preset}.merge(options)
17
+ m = group.method
18
+ send(m, worker, options)
19
+ f = unset(m)
20
+ features[preset] = f if f
21
+ end
21
22
  end
22
23
  end
23
- end
24
24
 
25
25
  end
26
26
 
27
27
  # Add the workers to perform a task on an entity class.
28
28
  def add_workers(group)
29
29
  self.class_eval do
30
-
31
30
  task = group.method
32
31
  add_presets(group)
33
-
34
32
  define_method(task) do |worker=nil, options={}|
35
33
  if worker.is_a?(Hash)
36
34
  options, worker =
@@ -64,7 +62,7 @@ module Treat::Entities::Abilities::Delegatable
64
62
  worker_not_found(worker, group)
65
63
  end
66
64
 
67
- worker = group.const_get(cc(worker.to_s).intern)
65
+ worker = group.const_get(worker.to_s.cc.intern)
68
66
  result = worker.send(group.method, entity, options)
69
67
 
70
68
  if group.type == :annotator && result
@@ -90,40 +88,32 @@ module Treat::Entities::Abilities::Delegatable
90
88
  # Get the default worker for that language
91
89
  # inside the given group.
92
90
  def find_worker_for_language(language, group)
93
-
94
91
  lang = Treat.languages[language]
95
92
  cat = group.to_s.split('::')[2].downcase.intern
96
- group = ucc(cl(group)).intern
97
-
93
+ group = group.mn.ucc.intern
98
94
  if lang.nil?
99
95
  raise Treat::Exception,
100
96
  "No configuration file loaded for language #{language}."
101
97
  end
102
-
103
98
  workers = lang.workers
104
-
105
99
  if !workers.respond_to?(cat) ||
106
100
  !workers[cat].respond_to?(group)
107
101
  workers = Treat.languages.agnostic.workers
108
102
  end
109
-
110
103
  if !workers.respond_to?(cat) ||
111
104
  !workers[cat].respond_to?(group)
112
105
  raise Treat::Exception,
113
106
  "No #{group} is/are available for the " +
114
107
  "#{language.to_s.capitalize} language."
115
108
  end
116
-
117
-
118
109
  workers[cat][group].first
119
-
120
110
  end
121
111
 
122
112
  # Return an error message and suggest possible typos.
123
- def worker_not_found(klass, group)
124
- "Algorithm '#{ucc(cl(klass))}' couldn't be "+
125
- "found in group #{group}." + did_you_mean?(
126
- group.list.map { |c| ucc(c) }, ucc(klass))
113
+ def worker_not_found(worker, group)
114
+ "Worker with name '#{worker}' couldn't be "+
115
+ "found in group #{group}." + Treat::Helpers::Help.
116
+ did_you_mean?(group.list.map { |c| c.ucc }, worker)
127
117
  end
128
118
 
129
119
  end