treat 0.2.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Treat - Text Retrieval and Annotation Toolkit
1
+ Treat - Text Retrieval, Extraction and Annotation Toolkit
2
2
 
3
3
  This program is free software: you can redistribute it and/or modify
4
4
  it under the terms of the GNU General Public License as published by
@@ -11,9 +11,9 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
  GNU General Public License for more details.
12
12
 
13
13
  You should have received a copy of the GNU General Public License
14
- along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
16
+ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
17
17
 
18
18
  Non-trivial amount of code has been incorporated and modified from other libraries:
19
19
 
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ ![Build Status](https://secure.travis-ci.org/louismullie/treat.png) ![Dependency Status](https://gemnasium.com/louismullie/treat.png)
2
+
3
+ Treat is a toolkit for natural language processing and computational linguistics in Ruby. It provides a common API for a number of gems and external libraries for document retrieval, parsing, annotation, and information extraction.
4
+
5
+ **Current features**
6
+
7
+ * Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus)
8
+ * Text retrieval with indexation and full-text search (Ferret)
9
+ * Text chunkers, sentence segmenters, tokenizers, and parsers for several languages (Stanford & Enju)
10
+ * Word inflectors, including stemmers, conjugators, declensors, and number inflection
11
+ * Lexical resources (WordNet interface, several POS taggers for English, Stanford taggers for several languages)
12
+ * Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
13
+ * Simple text statistics (frequency, TF*IDF)
14
+ * Serialization of annotated entities to YAML or XML format
15
+ * Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats
16
+ * Linguistic resources, including full ISO-639-1 and ISO-639-2 support, and tag alignments for several treebanks
17
+
18
+ <br>
19
+
20
+ **Resources**
21
+
22
+ * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/master/frames).
23
+ * See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
24
+ * Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
25
+ * Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
26
+ * View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
27
+ * Open an [issue](https://github.com/louismullie/treat/issues).
28
+
29
+ <br>
30
+
31
+ **License**
32
+
33
+ This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
data/files/INFO ADDED
@@ -0,0 +1 @@
1
+ This is a folder containing the files downloaded by Treat.
data/lib/treat.rb CHANGED
@@ -1,127 +1,62 @@
1
- # Main namespace for Treat modules.
2
- #
3
- # === Entities
4
- #
5
- # Entities are Tree structures that represent textual entities
6
- # (from a collection of texts down to an individual word), with
7
- # a value, features, children and dependencies linking it to other
8
- # textual entities.
9
- #
10
- # Here are some example of how to create entities:
11
- #
12
- # Treat.sweeten!
13
- #
14
- # c = Collection 'folder_with_documents'
15
- # d = Document 'filename.txt'
16
- # p = Paragraph 'A short story. The end.'
17
- # s = Sentence 'That is not a sentence.'
18
- # w = Word 'fox'
19
- #
20
- # Here is a list of entities and their description:
21
- #
22
- # - A Collection represents a folder with different textual documents.
23
- # - A Document represents a file with a textual content.
24
- # - A Zone can be a Section, Title, a Paragraph or a List and represents an intra-section division of content.
25
- # - A Sentence represents just that.
26
- # - A Phrase is a group of words; a Sentence is a Phrase with a sentence ender (.!?)
27
- # - A Token can be a Word, a Number, a Punctuation or a Symbol (non-punctuation, non-alphanumeric character).
28
- #
29
- # === Functions
30
- #
31
- # A worker class is defined for each implemented algorithm performing a given
32
- # task. These classes are clustered into workers performing the same given task
33
- # differently (Group), and the groups are clustered into Categories
34
- # of groups of workers that perform related tasks.
35
- #
36
- # Here are the different Categories and their description:
37
- #
38
- # - Processors perform the building of tree of entities representing texts (chunking, segmenting, tokenizing, parsing).
39
- # - Lexicalizers give lexical information about words (synsets, semantic relationships, tag, word category).
40
- # - Extractors extract semantic information about an entity (topic, date, time, named entity).
41
- # - Inflectors allow to retrieve the different inflections of a word (declensors, conjugators, stemmers, lemmatizers).
42
- # - Formatters handle the conversion of entities to and from different formats (readers, serializers, unserializers, visualizers).
43
- # - Retrievers allow to index and search collections of documents.
44
- #
45
- # === Linguistic Resources
46
- #
47
- # The Languages module contains linguistic information about
48
- # languages (full ISO-639-1 and 2 language list, tag alignments
49
- # for three treebanks, word categories, etc.)
50
- #
51
- # === Exception Class.
52
- #
53
- # Treat::Exception defines a custom exception class for the Treat module.
54
- #
55
1
  module Treat
2
+
3
+ # Require custom exception class.
4
+ require 'treat/exception'
56
5
 
57
- # Make sure that we are running on Ruby 1.9 or higher.
6
+ # Treat requires Ruby 1.9 or higher.
58
7
  if RUBY_VERSION <= '1.9'
59
- raise 'Treat requires Ruby 1.9 or higher.'
8
+ raise Treat::Exception,
9
+ 'Treat requires Ruby 1.9 or higher.'
60
10
  end
61
11
 
62
12
  # The current version of Treat.
63
- VERSION = "0.2.5"
64
-
65
- #$LOAD_PATH << '/ruby/gems/treat/lib/' # Remove for release
66
-
67
- # Create class variables for the Treat module.
13
+ VERSION = "1.0.0"
14
+
15
+ # Add methods to handle syntactic sugar,
16
+ # language configuration options, and paths.
17
+ require 'treat/configurable'
18
+ extend Treat::Configurable
19
+
20
+ # The folders in the library and descriptions.
21
+ Paths = {
22
+ :tmp => 'temporary files',
23
+ :lib => 'class and module definitions',
24
+ :bin => 'binary files',
25
+ :files => 'user-saved files',
26
+ :data => 'data set files',
27
+ :models => 'model files',
28
+ :spec => 'spec test files'
29
+ }
30
+
31
+ # Add methods to provide access to common paths.
68
32
  class << self
69
- # Boolean - output debug information.
70
- attr_accessor :debug
71
- # Symbol - default language to use when detect_language is false.
72
- attr_accessor :default_language
73
- # Symbol - default encoding to use.
74
- attr_accessor :default_encoding
75
- # Boolean - detect language or use default?
76
- attr_accessor :detect_language
77
- # Symbol - the ideal entity level to detect language at
78
- # (e.g., :entity, :sentence, :zone, :section, :document)
79
- attr_accessor :language_detection_level
80
- # String - folder of this file.
81
- attr_accessor :lib
82
- # String - folder for tests.
83
- attr_accessor :test
33
+ Paths.each do |path, _|
34
+ define_method(path) do
35
+ (File.dirname(__FILE__).
36
+ split('/')[0..-2].join('/') +
37
+ '/' + path.to_s + '/').gsub(
38
+ 'lib/../', '')
39
+ end
40
+ end
84
41
  end
85
42
 
86
- # Turn off debug by default.
87
- self.debug = false
88
- # Set the default language to english.
89
- self.default_language = :eng
90
- # Set the default encoding to utf-8.
91
- self.default_encoding = :utf_8
92
- # Turn language detection off by default.
93
- self.detect_language = false
94
- # Detect the language once per text by default.
95
- self.language_detection_level = :zone
96
- # Set the lib path to that of this file.
97
- self.lib = File.dirname(__FILE__)
98
- # Set the paths to the test folder.
99
- self.test = self.lib + '/../test'
100
-
101
- # Require inline C
102
- # require 'inline'
103
-
104
- # Require modified core classes.
105
43
  require 'treat/object'
106
44
  require 'treat/kernel'
107
- # Require all files for the Treat library.
108
- require 'treat/exception'
45
+ require 'treat/downloader'
109
46
  require 'treat/languages'
47
+ require 'treat/linguistics'
110
48
  require 'treat/entities'
111
49
  require 'treat/categories'
50
+ require 'treat/data_set'
112
51
  require 'treat/proxies'
113
- require 'treat/sugar'
114
-
115
- # Make sugar available when needed.
116
- extend Treat::Sugar
117
52
 
118
53
  # Install packages for a given language.
119
54
  def self.install(language = :english)
120
- require 'treat/install'
55
+ require 'treat/installer'
121
56
  Treat::Installer.install(language)
122
57
  end
123
58
 
124
- # Turn on detect language.
125
- def self.detect!; self.detect_language = true; end
126
-
59
+ # Enable syntactic sugar by default.
60
+ Treat.sweeten!
61
+
127
62
  end
data/lib/treat/ai.rb ADDED
@@ -0,0 +1,12 @@
1
+ module Treat::AI
2
+
3
+ module Classifiers
4
+ extend Treat::Groupable
5
+ self.type = :annotator
6
+ self.targets = [:entity]
7
+ self.default = :id3
8
+ end
9
+
10
+ extend Treat::Categorizable
11
+
12
+ end
@@ -0,0 +1,27 @@
1
+ class Treat::AI::Classifiers::ID3
2
+
3
+ require 'decisiontree'
4
+
5
+ @@classifiers = {}
6
+
7
+ def self.classify(entity, options = {})
8
+
9
+ set = options[:training]
10
+ cl = set.classification
11
+
12
+ if !@@classifiers[cl]
13
+ dec_tree = DecisionTree::ID3Tree.new(
14
+ set.labels, set.items,
15
+ cl.default, :continuous)
16
+ dec_tree.train
17
+ else
18
+ dec_tree = @@classifiers[cl]
19
+ end
20
+
21
+ dec_tree.predict(
22
+ cl.export_item(entity, false)
23
+ )[0]
24
+
25
+ end
26
+
27
+ end
@@ -1,43 +1,90 @@
1
- module Treat
2
- # This module keeps track of all categories that
3
- # exist and the methods they implement.
4
- module Categories
5
- class << self
6
- # A list of all categories.
7
- attr_accessor :list
1
+ # This module keeps track of all the Treat::Categorizable
2
+ # modules that exist and the methods they define.
3
+ #
4
+ #
5
+ # - Processors build the tree of
6
+ # entities representing texts (chunking,
7
+ # segmenting, tokenizing, parsing).
8
+ # - Lexicalizers give lexical information about
9
+ # words (synsets, semantic relationships,
10
+ # tag, word category).
11
+ # - Extractors extract semantic information about
12
+ # an entity (language, topic, date, time, named
13
+ # entity, coreferences).
14
+ # - Inflectors retrieve the different
15
+ # inflections of a word (declensors, conjugators,
16
+ # stemmers, lemmatizers).
17
+ # - Formatters handle the conversion of entities to
18
+ # and from different formats (readers, serializers,
19
+ # unserializers, visualizers).
20
+ # - Retrievers index and search collections
21
+ # of documents.
22
+ module Treat::Categories
23
+
24
+ class << self
25
+ # A list of all categories.
26
+ attr_accessor :list
27
+ end
28
+
29
+ # Array - list of all categories.
30
+ self.list = []
31
+ # A lookup table for entity types.
32
+ @@lookup = {}
33
+
34
+ # Require all categories.
35
+ require 'treat/categorizable'
36
+ require 'treat/formatters'
37
+ require 'treat/processors'
38
+ require 'treat/lexicalizers'
39
+ require 'treat/inflectors'
40
+ require 'treat/extractors'
41
+ require 'treat/retrievers'
42
+ require 'treat/ai'
43
+
44
+ # Create the lookup table.
45
+ self.list.each do |category|
46
+ category.groups.each do |group|
47
+ group = category.const_get(group)
48
+ @@lookup[group.method] = group
49
+ group.presets.each do |x,y|
50
+ @@lookup[x] = group
51
+ end if group.presets
8
52
  end
9
- # Array - list of all categories.
10
- self.list = []
11
- @@lookup = nil
12
- # Find the class of a group given its method.
13
- def self.lookup(method)
14
- return @@lookup[method] if @@lookup
15
- @@lookup = {}
53
+ end
54
+
55
+ # Find the class of a group given its method.
56
+ def self.lookup(method)
57
+ @@lookup[method]
58
+ end
59
+
60
+ # Fix -- This must be moved urgently.
61
+ Treat::Entities::Entity.class_eval do
62
+
63
+ alias :true_language :language
64
+
65
+ def language(extractor = nil, options = {})
16
66
 
17
- self.list.each do |category|
18
- category.groups.each do |group|
19
- group = category.const_get(group)
20
- @@lookup[group.method] = group
21
- methods = group.presets.merge(
22
- group.preprocessors.merge(
23
- group.postprocessors
24
- )
25
- )
26
- methods.each do |x,y|
27
- @@lookup[x] = group
28
- end
67
+ if is_a?(Treat::Entities::Symbol) ||
68
+ is_a?(Treat::Entities::Number)
69
+ return Treat.default_language
70
+ end
71
+
72
+ if !Treat.detect_language
73
+ return Treat.default_language
74
+ else
75
+ dlvl = Treat.language_detection_level
76
+ if (Treat::Entities.rank(type) <
77
+ Treat::Entities.rank(dlvl)) &&
78
+ has_parent?
79
+ anc = ancestor_with_type(dlvl)
80
+ return anc.language if anc
29
81
  end
30
82
  end
31
83
 
32
- @@lookup[method]
84
+ true_language(extractor, options)
85
+
33
86
  end
34
- # Require all categories.
35
- require 'treat/category'
36
- require 'treat/formatters'
37
- require 'treat/processors'
38
- require 'treat/lexicalizers'
39
- require 'treat/inflectors'
40
- require 'treat/extractors'
41
- require 'treat/retrievers'
87
+
42
88
  end
89
+
43
90
  end
@@ -0,0 +1,44 @@
1
+ # A categorizable module brings together groups
2
+ # of algorithms that perform similar functions.
3
+ module Treat::Categorizable
4
+
5
+ # The contents of each categorizable
6
+ # module are groupable.
7
+ require 'treat/groupable'
8
+
9
+ # Add workers to the Entities based on the
10
+ # configuration for a given category.
11
+ def self.extended(category)
12
+ Treat::Categories.list << category
13
+ category.module_eval do
14
+ groups.each do |group|
15
+ group = const_get(group)
16
+ group.targets.each do |entity_type|
17
+ entity = Treat::Entities.
18
+ const_get(cc(entity_type))
19
+ entity.class_eval do
20
+ add_workers group
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ # Get the list of groups defined
28
+ # under this module.
29
+ @@groups = self.constants
30
+
31
+ # Populate a list of methods.
32
+ @@methods = []
33
+ @@groups.each do |group|
34
+ @@methods << const_get(group).method
35
+ end
36
+
37
+ # Provide a list of methods implemented in
38
+ # the groups contained within this category.
39
+ def methods; @@methods; end
40
+
41
+ # Provides a list of groups within this category.
42
+ def groups; self.constants; end
43
+
44
+ end
@@ -0,0 +1,61 @@
1
+ class Treat::Classification
2
+
3
+ attr_reader :types
4
+ attr_reader :features
5
+ attr_reader :question
6
+ attr_reader :labels
7
+ attr_reader :default
8
+
9
+ def initialize(type_or_types, feature_or_features, question, default = false)
10
+
11
+ @types, @features,
12
+ @question, @default =
13
+ [*type_or_types],
14
+ [*feature_or_features],
15
+ question, default
16
+
17
+ @labels = []
18
+
19
+ @features.each do |cmd|
20
+ if cmd.is_a?(Array)
21
+ @labels << cmd[0]
22
+ else
23
+ @labels << cmd
24
+ end
25
+ end
26
+
27
+ end
28
+
29
+ def export_item(e, include_question = true)
30
+
31
+ line = []
32
+
33
+ @features.each do |cmd|
34
+ begin
35
+ if cmd.is_a?(Array)
36
+ line << cmd[1].call(e)
37
+ else
38
+ line << e.send(cmd)
39
+ end
40
+ rescue Treat::Exception
41
+ dflt = (
42
+ (cmd.is_a?(Array) && cmd[2]) ?
43
+ cmd[2] : nil
44
+ )
45
+ line << dflt
46
+ end
47
+ end
48
+
49
+ begin
50
+ if include_question
51
+ line << e.send(@question)
52
+ end
53
+ rescue Treat::Exception
54
+ line << @default
55
+ end
56
+ line[-1] = '' if line[-1].nil?
57
+ line
58
+
59
+ end
60
+
61
+ end