treat 1.0.6 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (210) hide show
  1. data/LICENSE +2 -4
  2. data/README.md +13 -12
  3. data/bin/MANIFEST +1 -0
  4. data/bin/stanford/bridge.jar +0 -0
  5. data/bin/stanford/joda-time.jar +0 -0
  6. data/bin/stanford/stanford-corenlp.jar +0 -0
  7. data/bin/stanford/stanford-parser.jar +0 -0
  8. data/bin/stanford/xom.jar +0 -0
  9. data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
  10. data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
  11. data/files/{INFO → MANIFEST} +0 -0
  12. data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
  13. data/files/weather-central-canada-heat-wave.html +1370 -0
  14. data/lib/treat/config/core/acronyms.rb +4 -0
  15. data/lib/treat/config/core/encodings.rb +8 -0
  16. data/lib/treat/config/core/entities.rb +2 -0
  17. data/lib/treat/config/core/language.rb +3 -0
  18. data/lib/treat/config/core/paths.rb +8 -0
  19. data/lib/treat/config/core/syntax.rb +1 -0
  20. data/lib/treat/config/core/verbosity.rb +1 -0
  21. data/lib/treat/config/databases/mongo.rb +3 -0
  22. data/lib/treat/config/languages/agnostic.rb +34 -0
  23. data/lib/treat/config/languages/arabic.rb +13 -0
  24. data/lib/treat/config/languages/chinese.rb +13 -0
  25. data/lib/treat/config/languages/dutch.rb +12 -0
  26. data/lib/treat/config/languages/english.rb +60 -0
  27. data/lib/treat/config/languages/french.rb +18 -0
  28. data/lib/treat/config/languages/german.rb +18 -0
  29. data/lib/treat/config/languages/greek.rb +12 -0
  30. data/lib/treat/config/languages/italian.rb +12 -0
  31. data/lib/treat/config/languages/polish.rb +12 -0
  32. data/lib/treat/config/languages/portuguese.rb +12 -0
  33. data/lib/treat/config/languages/russian.rb +12 -0
  34. data/lib/treat/config/languages/spanish.rb +12 -0
  35. data/lib/treat/config/languages/swedish.rb +12 -0
  36. data/lib/treat/config/libraries/stanford.rb +1 -0
  37. data/lib/treat/config/linguistics/categories.rb +4 -0
  38. data/lib/treat/config/linguistics/punctuation.rb +33 -0
  39. data/lib/treat/config/tags/aligned.rb +221 -0
  40. data/lib/treat/config/tags/enju.rb +71 -0
  41. data/lib/treat/config/tags/paris7.rb +17 -0
  42. data/lib/treat/config/tags/ptb.rb +15 -0
  43. data/lib/treat/config/workers/extractors.rb +39 -0
  44. data/lib/treat/config/workers/formatters.rb +20 -0
  45. data/lib/treat/config/workers/inflectors.rb +27 -0
  46. data/lib/treat/config/workers/learners.rb +6 -0
  47. data/lib/treat/config/workers/lexicalizers.rb +18 -0
  48. data/lib/treat/config/workers/list.rb +1 -0
  49. data/lib/treat/config/workers/processors.rb +19 -0
  50. data/lib/treat/config/workers/retrievers.rb +12 -0
  51. data/lib/treat/config.rb +125 -0
  52. data/lib/treat/{classification.rb → core/classification.rb} +1 -1
  53. data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
  54. data/lib/treat/{tree.rb → core/node.rb} +5 -5
  55. data/lib/treat/core/server.rb +3 -0
  56. data/lib/treat/core.rb +5 -0
  57. data/lib/treat/entities/abilities/buildable.rb +61 -56
  58. data/lib/treat/entities/abilities/checkable.rb +2 -2
  59. data/lib/treat/entities/abilities/comparable.rb +21 -0
  60. data/lib/treat/entities/abilities/copyable.rb +2 -0
  61. data/lib/treat/entities/abilities/countable.rb +1 -1
  62. data/lib/treat/entities/abilities/debuggable.rb +1 -1
  63. data/lib/treat/entities/abilities/delegatable.rb +42 -36
  64. data/lib/treat/entities/abilities/doable.rb +2 -2
  65. data/lib/treat/entities/abilities/exportable.rb +1 -1
  66. data/lib/treat/entities/abilities/iterable.rb +21 -33
  67. data/lib/treat/entities/abilities/magical.rb +8 -8
  68. data/lib/treat/entities/abilities/registrable.rb +0 -38
  69. data/lib/treat/entities/abilities/stringable.rb +19 -19
  70. data/lib/treat/entities/collection.rb +31 -0
  71. data/lib/treat/entities/document.rb +10 -0
  72. data/lib/treat/entities/entity.rb +18 -13
  73. data/lib/treat/entities/group.rb +15 -0
  74. data/lib/treat/entities/section.rb +13 -0
  75. data/lib/treat/entities/token.rb +35 -0
  76. data/lib/treat/entities/zone.rb +11 -0
  77. data/lib/treat/entities.rb +5 -75
  78. data/lib/treat/helpers/didyoumean.rb +57 -0
  79. data/lib/treat/helpers/escaping.rb +15 -0
  80. data/lib/treat/helpers/formatting.rb +41 -0
  81. data/lib/treat/helpers/platform.rb +15 -0
  82. data/lib/treat/helpers/reflection.rb +17 -0
  83. data/lib/treat/helpers/temporary.rb +27 -0
  84. data/lib/treat/helpers/verbosity.rb +19 -0
  85. data/lib/treat/helpers.rb +5 -0
  86. data/lib/treat/installer.rb +46 -165
  87. data/lib/treat/loaders/linguistics.rb +22 -27
  88. data/lib/treat/loaders/stanford.rb +23 -41
  89. data/lib/treat/loaders.rb +10 -0
  90. data/lib/treat/proxies.rb +73 -24
  91. data/lib/treat/version.rb +3 -0
  92. data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
  93. data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
  94. data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
  95. data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
  96. data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
  97. data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
  98. data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
  99. data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
  100. data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
  101. data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
  102. data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
  103. data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
  104. data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
  105. data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
  106. data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
  107. data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
  108. data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
  109. data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
  110. data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
  111. data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
  112. data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
  113. data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
  114. data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
  115. data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
  116. data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
  117. data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
  118. data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
  119. data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
  120. data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
  121. data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
  122. data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
  123. data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
  124. data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
  125. data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
  126. data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
  127. data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
  128. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
  129. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
  130. data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
  131. data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
  132. data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
  133. data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
  134. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
  135. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
  136. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
  137. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
  138. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
  139. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
  140. data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
  141. data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
  142. data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
  143. data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
  144. data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
  145. data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
  146. data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
  147. data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
  148. data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
  149. data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
  150. data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
  151. data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
  152. data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
  153. data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
  154. data/lib/treat/workers.rb +96 -0
  155. data/lib/treat.rb +23 -49
  156. data/spec/collection.rb +4 -4
  157. data/spec/document.rb +5 -5
  158. data/spec/entity.rb +33 -32
  159. data/spec/{tree.rb → node.rb} +5 -5
  160. data/spec/phrase.rb +5 -39
  161. data/spec/sandbox.rb +212 -6
  162. data/spec/token.rb +12 -9
  163. data/spec/treat.rb +12 -9
  164. data/spec/word.rb +10 -9
  165. data/spec/zone.rb +6 -2
  166. data/tmp/{INFO → MANIFEST} +0 -0
  167. data/tmp/english.yaml +10340 -0
  168. metadata +149 -139
  169. data/lib/treat/ai.rb +0 -12
  170. data/lib/treat/categories.rb +0 -90
  171. data/lib/treat/categorizable.rb +0 -44
  172. data/lib/treat/configurable.rb +0 -115
  173. data/lib/treat/dependencies.rb +0 -25
  174. data/lib/treat/downloader.rb +0 -87
  175. data/lib/treat/entities/abilities.rb +0 -10
  176. data/lib/treat/entities/entities.rb +0 -102
  177. data/lib/treat/exception.rb +0 -7
  178. data/lib/treat/extractors.rb +0 -79
  179. data/lib/treat/formatters/serializers/mongo.rb +0 -64
  180. data/lib/treat/formatters.rb +0 -41
  181. data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
  182. data/lib/treat/inflectors.rb +0 -52
  183. data/lib/treat/kernel.rb +0 -208
  184. data/lib/treat/languages/arabic.rb +0 -16
  185. data/lib/treat/languages/chinese.rb +0 -16
  186. data/lib/treat/languages/dutch.rb +0 -16
  187. data/lib/treat/languages/english.rb +0 -63
  188. data/lib/treat/languages/french.rb +0 -20
  189. data/lib/treat/languages/german.rb +0 -20
  190. data/lib/treat/languages/greek.rb +0 -16
  191. data/lib/treat/languages/italian.rb +0 -17
  192. data/lib/treat/languages/language.rb +0 -10
  193. data/lib/treat/languages/list.txt +0 -504
  194. data/lib/treat/languages/polish.rb +0 -16
  195. data/lib/treat/languages/portuguese.rb +0 -16
  196. data/lib/treat/languages/russian.rb +0 -16
  197. data/lib/treat/languages/spanish.rb +0 -16
  198. data/lib/treat/languages/swedish.rb +0 -16
  199. data/lib/treat/languages.rb +0 -132
  200. data/lib/treat/lexicalizers.rb +0 -37
  201. data/lib/treat/object.rb +0 -7
  202. data/lib/treat/processors/chunkers/autoselect.rb +0 -16
  203. data/lib/treat/processors/chunkers/txt.rb +0 -21
  204. data/lib/treat/processors.rb +0 -38
  205. data/lib/treat/retrievers.rb +0 -27
  206. data/lib/treat/server.rb +0 -26
  207. data/lib/treat/universalisation/encodings.rb +0 -12
  208. data/lib/treat/universalisation/tags.rb +0 -453
  209. data/lib/treat/universalisation.rb +0 -9
  210. data/spec/languages.rb +0 -25
@@ -1,87 +0,0 @@
1
- # Download a file without storing it entirely in memory.
2
- class Treat::Downloader
3
-
4
- require 'net/http'
5
- require 'fileutils'
6
-
7
- class << self
8
- attr_accessor :show_progress
9
- end
10
-
11
- self.show_progress = false
12
-
13
- MaxTries = 3
14
-
15
- # Download a file into destination, and return
16
- # the path to the downloaded file. If the filename
17
- # is nil, it will set the default filename to 'top'.
18
- def self.download(protocol, server, dir, file = nil, target_base = nil, target_dir = nil)
19
-
20
- require 'progressbar' if self.show_progress
21
-
22
- target_base ||= Treat.files
23
- target_dir ||= server
24
-
25
- dir += '/' if dir && dir[-1] != '/'
26
- resource = "#{dir}#{file}"
27
- resource = "/#{resource}" unless resource[0] == '/'
28
- url = "#{server}#{resource}"
29
- path = File.join(target_base, target_dir)
30
-
31
- unless FileTest.directory?(path)
32
- FileUtils.mkdir(path)
33
- end
34
-
35
-
36
- file = File.open("#{path}/#{file}", 'w')
37
- tries = 0
38
- begin
39
-
40
- Net::HTTP.start(server) do |http|
41
-
42
- http.use_ssl = true if protocol == 'https'
43
-
44
- http.request_get(resource) do |response|
45
-
46
- if response.content_length
47
- length = response.content_length
48
- else
49
- warn 'Unknown file size; ETR unknown.'
50
- length = 10000
51
- end
52
-
53
- pbar = self.show_progress ?
54
- ProgressBar.new(url, length) : nil
55
-
56
- unless response.code == '200'
57
- raise Treat::Exception,
58
- "response code was not 200 "+
59
- "OK, but was #{response.code}. "
60
- end
61
-
62
- response.read_body do |segment|
63
- pbar.inc(segment.length) if pbar
64
- file.write(segment)
65
- end
66
-
67
- pbar.finish if pbar
68
-
69
- end
70
-
71
- end
72
-
73
- file.path.to_s
74
-
75
- rescue Exception => error
76
- tries += 1
77
- retry if tries < MaxTries
78
- raise Treat::Exception,
79
- "Couldn't download #{url}. (#{error.message})"
80
- file.delete
81
- ensure
82
- file.close
83
- end
84
-
85
- end
86
-
87
- end
@@ -1,10 +0,0 @@
1
- # Requires the -able mixins for the Entity class.
2
- module Treat::Entities::Abilities
3
-
4
- p = 'treat/entities/abilities/*.rb'
5
-
6
- Dir[Treat.lib + p].each do |f|
7
- require f
8
- end
9
-
10
- end
@@ -1,102 +0,0 @@
1
- module Treat::Entities
2
-
3
- # Require the generic entity class.
4
- require 'treat/entities/entity'
5
-
6
- # Represents a collection of texts.
7
- class Collection < Entity
8
-
9
- # Initialize the collection with a folder
10
- # containing the texts of the collection.
11
- def initialize(folder = nil, id = nil)
12
- super('', id)
13
- set :folder, folder
14
- i = folder + '/.index'
15
- set :index, i if FileTest.directory?(i)
16
- end
17
-
18
- # Works like the default <<, but if the
19
- # file being added is a collection or a
20
- # document, then copy that collection or
21
- # document into this collection's folder.
22
- def <<(entities, copy = true)
23
- unless entities.is_a? Array
24
- entities = [entities]
25
- end
26
- entities.each do |entity|
27
- if [:document, :collection].
28
- include?(entity.type) && copy
29
- entity = entity.copy_into(self)
30
- end
31
- end
32
- super(entities)
33
- end
34
-
35
- end
36
-
37
- # Represents a document.
38
- class Document < Entity
39
-
40
- def initialize(file = nil, id = nil)
41
- super('', id)
42
- set :file, file
43
- end
44
-
45
- end
46
-
47
- # Represents a section, usually with a title
48
- # and at least one paragraph.
49
- class Section < Entity; end
50
-
51
- # Represents a zone of text
52
- # (Title, Paragraph, List, Quote).
53
- class Zone < Entity; end
54
-
55
- # Represents a title, subtitle, logical header.
56
- class Title < Zone; end
57
-
58
- # Represents a paragraph.
59
- class Paragraph < Zone; end
60
-
61
- # Represents a list.
62
- class List < Zone; end
63
-
64
- # Represents a group of words.
65
- class Phrase < Entity; end
66
-
67
- # Represents a group of words with a sentence ender.
68
- class Sentence < Phrase; end
69
-
70
- # Represents a terminal element in the text structure.
71
- class Token < Entity; end
72
-
73
- # Represents a word.
74
- class Word < Token; end
75
-
76
- # Represents a clitic ('s).
77
- class Enclitic < Token; end
78
-
79
- # Represents a number.
80
- class Number < Token
81
- def to_i; to_s.to_i; end
82
- def to_f; to_s.to_f; end
83
- end
84
-
85
- # Represents a punctuation sign.
86
- class Punctuation < Token; end
87
-
88
- # Represents a character that is neither
89
- # alphabetical, numerical or a punctuation
90
- # character (e.g. @#$%&*).
91
- class Symbol < Token; end
92
-
93
- # Represents a url.
94
- class Url < Token; end
95
-
96
- # Represents a valid RFC822 address.
97
- class Email < Token; end
98
-
99
- # Represents an entity of unknown type.
100
- class Unknown; end
101
-
102
- end
@@ -1,7 +0,0 @@
1
- module Treat
2
- # Custom exception class for the Treat toolkit.
3
- # Used to distinguish between errors raised by
4
- # gems/Ruby from errors raised by the toolkit.
5
- class Exception < ::Exception; end
6
- class InvalidInputException < Exception; end
7
- end
@@ -1,79 +0,0 @@
1
- # Extractors extract information out of texts.
2
- module Treat::Extractors
3
-
4
- # Extracts the language from an entity.
5
- module Language
6
- extend Treat::Groupable
7
- self.type = :annotator
8
- self.targets = [:entity]
9
- self.default = :what_language
10
- end
11
-
12
- # Extracts the date/time of a phrase.
13
- module Time
14
- extend Treat::Groupable
15
- self.type = :annotator
16
- self.targets = [:phrase]
17
- end
18
-
19
- # Extract the topic from a document or zone.
20
- module Topics
21
- extend Treat::Groupable
22
- self.type = :annotator
23
- self.targets = [:document, :section, :zone]
24
- end
25
-
26
- # Extract the keywords from a text.
27
- module Keywords
28
- extend Treat::Groupable
29
- self.type = :annotator
30
- self.targets = [:document, :section, :zone]
31
- end
32
-
33
- # Extract clusters of topic words from a collection.
34
- module TopicWords
35
- extend Treat::Groupable
36
- self.type = :annotator
37
- self.targets = [:collection]
38
- end
39
-
40
- # Extract named entities from phrases.
41
- module NameTag
42
- extend Treat::Groupable
43
- self.type = :annotator
44
- self.targets = [:phrase, :word]
45
- end
46
-
47
- # Extract coreferences from a zone.
48
- module Coreferences
49
- extend Treat::Groupable
50
- self.type = :annotator
51
- self.targets = [:zone]
52
- end
53
-
54
- # Retrieve the main grammatical roles
55
- # in the phrase (subject, verb, object).
56
- module Roles
57
- extend Treat::Groupable
58
- self.type = :annotator
59
- self.targets = [:phrase]
60
- end
61
-
62
- module TfIdf
63
- extend Treat::Groupable
64
- self.type = :annotator
65
- self.targets = [:word]
66
- self.default = :native
67
- end
68
-
69
- module Summary
70
- extend Treat::Groupable
71
- self.type = :annotator
72
- self.targets = [:document]
73
- self.default = :keyword_count
74
- end
75
-
76
- # Make Extractors categorizable.
77
- extend Treat::Categorizable
78
-
79
- end
@@ -1,64 +0,0 @@
1
- # Stores an entity in a Mongo collection.
2
- class Treat::Formatters::Serializers::Mongo
3
-
4
- # Require the Mongo DB library.
5
- require 'mongo'
6
-
7
- # Serialize an entity into a Mongo database.
8
- #
9
- # Options:
10
- # - (String) :database => the name of the database to use (required).
11
- def self.serialize(entity, options = {})
12
-
13
- unless options[:database]
14
- raise Treat::Exception,
15
- 'Must supply the database name.'
16
- end
17
-
18
- @@conn ||= Mongo::Connection.new
19
- @@db ||= @@conn[options[:database]]
20
-
21
- path = []
22
-
23
- entity.each_ancestor do |ancestor|
24
- path << [ancestor.type, ancestor.id]
25
- end
26
-
27
- path = path.reverse
28
-
29
- target = @@db
30
-
31
- path.each do |type_id|
32
- coll = @@db[type_id[0]][type_id[1]]
33
- end
34
-
35
- # Store path
36
-
37
- Treat::Entities.list.each do |type|
38
-
39
- type = entity.type.to_s
40
- type = (type == 'entity') ? 'entities' : (type + 's')
41
- doc = coll[type]
42
-
43
- features = {}
44
- features['id'] = entity.id
45
- features['value'] = entity.value
46
-
47
- entity.features.each_pair do |feature, value|
48
- if value.is_a? Treat::Entities::Entity
49
- value = value.id
50
- elsif value.is_a?(Array) || value.is_a?(Hash)
51
- value = value.inspect
52
- else
53
- value = value.to_s
54
- end
55
- features[feature.to_s] = value
56
- end
57
-
58
- doc.insert(features)
59
-
60
- end
61
-
62
- end
63
-
64
- end
@@ -1,41 +0,0 @@
1
- # Formatters handle conversion of Entities to and from
2
- # external file formats.
3
- module Treat::Formatters
4
-
5
- # Readers read a document's content.
6
- module Readers
7
- extend Treat::Groupable
8
- self.type = :computer
9
- self.targets = [:document]
10
- end
11
-
12
- # Unserializers recreate entities
13
- # from a serialized format.
14
- module Unserializers
15
- extend Treat::Groupable
16
- self.type = :computer
17
- self.targets = [:entity]
18
- end
19
-
20
- # Serializers transform entities
21
- # into a storable format.
22
- module Serializers
23
- extend Treat::Groupable
24
- self.type = :computer
25
- self.targets = [:entity]
26
- self.default = :yaml
27
- end
28
-
29
- # Visualizers transform entities
30
- # into a visualizable format.
31
- module Visualizers
32
- extend Treat::Groupable
33
- self.type = :computer
34
- self.targets = [:entity]
35
- self.default = :tree
36
- end
37
-
38
- # Make Formatters categorizable.
39
- extend Treat::Categorizable
40
-
41
- end
@@ -1,22 +0,0 @@
1
- module Treat::Helpers
2
-
3
- class DecimalPointEscaper
4
-
5
- EscapeChar = '^^'
6
- EscapedEscapeChar = '\^\^'
7
-
8
- def self.escape!(s)
9
- s.gsub!(/([0-9]+)\.([0-9]+)/) do
10
- $1 + EscapeChar + $2
11
- end
12
- end
13
-
14
- def self.unescape!(s)
15
- s.gsub!(/([0-9]+)#{EscapedEscapeChar}([0-9]+)/) do
16
- $1 + '.' + $2
17
- end
18
- end
19
-
20
- end
21
-
22
- end
@@ -1,52 +0,0 @@
1
- # Category of worker groups that retrieve
2
- # the inflections of a word.
3
- module Treat::Inflectors
4
-
5
- # Return the stem (*not root form*) of a word.
6
- module Stemmers
7
- extend Treat::Groupable
8
- self.type = :annotator
9
- self.targets = [:word]
10
- end
11
-
12
- # Retrieve the different declensions of a
13
- # noun (singular, plural).
14
- module Declensors
15
- extend Treat::Groupable
16
- self.type = :annotator
17
- self.targets = [:word]
18
- self.preset_option = :count
19
- self.presets = [:plural, :singular]
20
- end
21
-
22
- # Retrieve the different conjugations of a word
23
- # given a mode, tense, person, and/or number.
24
- module Conjugators
25
- extend Treat::Groupable
26
- self.type = :annotator
27
- self.targets = [:word]
28
- self.preset_option = :form
29
- self.presets = [:infinitive, :present_participle,
30
- :plural_verb, :singular_verb]
31
- end
32
-
33
- # Retrieve the full text description of a
34
- # cardinal number.
35
- module Cardinalizers
36
- extend Treat::Groupable
37
- self.type = :annotator
38
- self.targets = [:number]
39
- end
40
-
41
- # Retrieve the full text description of an
42
- # ordinal number.
43
- module Ordinalizers
44
- extend Treat::Groupable
45
- self.type = :annotator
46
- self.targets = [:number]
47
- end
48
-
49
- # Make Inflectors categorizable.
50
- extend Treat::Categorizable
51
-
52
- end
data/lib/treat/kernel.rb DELETED
@@ -1,208 +0,0 @@
1
- # Extends the core Kernel module to provide
2
- # easy access to utility functions used across
3
- # the library.
4
- module Kernel
5
-
6
- # Require file utilities for creating and
7
- # deleting temporary files.
8
- require 'fileutils'
9
-
10
- # A list of acronyms used in class names within
11
- # the program. These do not CamelCase; they
12
- # CAMELCase.
13
- Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo mlp]
14
-
15
- # A cache to optimize camel casing.
16
- @@cc_cache = {}
17
-
18
- # A cache to optimize un camel casing.
19
- @@ucc_cache = {}
20
-
21
- # Runs a block of code without warnings.
22
- def silence_warnings(&block)
23
- warn_level = $VERBOSE
24
- $VERBOSE = nil
25
- result = block.call
26
- $VERBOSE = warn_level
27
- result
28
- end
29
-
30
- # Runs a block of code while blocking stdout.
31
- def silence_stdout(log = NULL_DEVICE)
32
- unless Treat.silence
33
- yield; return
34
- end
35
- old = $stdout.dup
36
- $stdout.reopen(File.new(log, 'w'))
37
- yield
38
- $stdout = old
39
- end
40
-
41
- # Create a temporary file which is deleted
42
- # after execution of the block.
43
- def create_temp_file(ext, value = nil, &block)
44
- fname = Treat.tmp +
45
- "#{Random.rand(10000000).to_s}.#{ext}"
46
- File.open(fname, 'w') do |f|
47
- f.write(value) if value
48
- block.call(f.path)
49
- end
50
- ensure
51
- File.delete(fname)
52
- end
53
-
54
- # Create a temporary directory, which is
55
- # deleted after execution of the block.
56
- def create_temp_dir(&block)
57
- dname = "#{Treat.lib}/../tmp/"+
58
- "#{Random.rand(10000000).to_s}"
59
- Dir.mkdir(dname)
60
- block.call(dname)
61
- ensure
62
- FileUtils.rm_rf(dname)
63
- end
64
-
65
- # Convert un_camel_case to CamelCase.
66
- def camel_case(o_phrase)
67
- phrase = o_phrase.to_s.dup
68
- return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
69
-
70
- if Acronyms.include?(phrase)
71
- phrase = phrase.upcase
72
- else
73
- phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
74
- phrase.gsub!('_', '')
75
- end
76
- @@cc_cache[o_phrase] = phrase
77
- end
78
-
79
- alias :cc :camel_case
80
-
81
- # Convert CamelCase to un_camel_case.
82
- def un_camel_case(o_phrase)
83
- phrase = o_phrase.to_s.dup
84
- return @@ucc_cache[o_phrase] if @@ucc_cache[o_phrase]
85
- if Acronyms.include?(phrase.downcase)
86
- phrase = phrase.downcase
87
- else
88
- phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
89
- phrase = phrase[1..-1] if phrase[0] == '_'
90
- end
91
- @@ucc_cache[o_phrase] = phrase
92
- end
93
-
94
- alias :ucc :un_camel_case
95
-
96
- # Retrieve the Class from a Module::Class.
97
- def class_name(n); n.to_s.split('::')[-1]; end
98
-
99
- alias :cl :class_name
100
-
101
- # Search the list to see if there are words similar to #name
102
- # in the #list. If yes, return a string saying "Did you mean
103
- # ... ?" with the names.
104
- def did_you_mean?(list, name)
105
- return '' # Fix
106
- list = list.map { |e| e.to_s }
107
- name = name.to_s
108
- sugg = []
109
- list.each do |element|
110
- l = levenshtein(element,name)
111
- if l > 0 && l < 2
112
- sugg << element
113
- end
114
- end
115
- unless sugg.size == 0
116
- if sugg.size == 1
117
- msg += " Perhaps you meant '#{sugg[0]}' ?"
118
- else
119
- sugg_quote = sugg[0..-2].map do
120
- |x| '\'' + x + '\''
121
- end
122
- msg += " Perhaps you meant " +
123
- "#{sugg_quote.join(', ')}," +
124
- " or '#{sugg[-1]}' ?"
125
- end
126
- end
127
- msg
128
- end
129
-
130
- alias :dym? :did_you_mean?
131
-
132
- # Return the name of the method that called the method
133
- # that calls this method.
134
- def caller_method(n = 3)
135
- at = caller(n).first
136
- /^(.+?):(\d+)(?::in `(.*)')?/ =~ at
137
- Regexp.last_match[3].gsub('block in ', '').intern
138
- end
139
-
140
- alias :cm :caller_method
141
-
142
- # Detect the platform we're running on.
143
- def detect_platform
144
- p = RUBY_PLATFORM.downcase
145
- return :mac if p.include?("darwin")
146
- return :windows if p.include?("mswin")
147
- return :linux if p.include?("linux")
148
- return :unknown
149
- end
150
-
151
- # Return the Levenshtein distance between two strings,
152
- # taking into account the costs of insertion, deletion,
153
- # and substitution. Stolen from:
154
- # http://ruby-snippets.heroku.com/string/levenshtein-distance
155
- # Used by did_you_mean?
156
- def levenshtein(first, other, ins=1, del=1, sub=1)
157
- return nil if first.nil? || other.nil?
158
- dm = []
159
- dm[0] = (0..first.length).collect { |i| i * ins}
160
- fill = [0] * (first.length - 1).abs
161
- for i in 1..other.length
162
- dm[i] = [i * del, fill.flatten]
163
- end
164
- for i in 1..other.length
165
- for j in 1..first.length
166
- dm[i][j] = [
167
- dm[i-1][j-1] +
168
- (first[i-1] ==
169
- other[i-1] ? 0 : sub),
170
- dm[i][j-1] + ins,
171
- dm[i-1][j] + del
172
- ].min
173
- end
174
- end
175
- dm[other.length][first.length]
176
- end
177
-
178
- if detect_platform == :windows
179
- NULL_DEVICE = 'NUL'
180
- else
181
- NULL_DEVICE = '/dev/null'
182
- end
183
-
184
- def prompt(msg, valid_answers)
185
-
186
- msg = msg
187
- n = msg.include?("\n") ? ":\n" : ''
188
- q = msg.include?("\n") ? '' : '?'
189
-
190
- s = "\nPlease enter one of #{valid_answers.join(', ')}: "
191
- puts "Do you want to #{n}#{msg}#{q} \n#{s}"
192
-
193
- begin
194
- answer = STDIN.gets.strip
195
- unless valid_answers.include?(answer)
196
- puts "Invalid input."
197
- puts s
198
- raise Treat::InvalidInputException
199
- end
200
- puts
201
- answer
202
- rescue Treat::InvalidInputException
203
- retry
204
- end
205
-
206
- end
207
-
208
- end
@@ -1,16 +0,0 @@
1
- class Treat::Languages::Arabic
2
-
3
- RequiredDependencies = []
4
- OptionalDependencies = []
5
-
6
- Extractors = {}
7
- Inflectors = {}
8
- Lexicalizers = {
9
- :taggers => [:stanford]
10
- }
11
- Processors = {
12
- :parsers => [:stanford]
13
- }
14
- Retrievers = {}
15
-
16
- end
@@ -1,16 +0,0 @@
1
- class Treat::Languages::Chinese
2
-
3
- RequiredDependencies = []
4
- OptionalDependencies = []
5
-
6
- Extractors = {}
7
- Inflectors = {}
8
- Lexicalizers = {
9
- :taggers => [:stanford]
10
- }
11
- Processors = {
12
- :parsers => [:stanford]
13
- }
14
- Retrievers = {}
15
-
16
- end