treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Treat - Text Retrieval and Annotation Toolkit
1
+ Treat - Text Retrieval, Extraction and Annotation Toolkit
2
2
 
3
3
  This program is free software: you can redistribute it and/or modify
4
4
  it under the terms of the GNU General Public License as published by
@@ -11,9 +11,9 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
11
  GNU General Public License for more details.
12
12
 
13
13
  You should have received a copy of the GNU General Public License
14
- along with this program. If not, see <http://www.gnu.org/licenses/>.
14
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
15
15
 
16
- Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2012.
16
+ Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
17
17
 
18
18
  Non-trivial amount of code has been incorporated and modified from other libraries:
19
19
 
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ ![Build Status](https://secure.travis-ci.org/louismullie/treat.png) ![Dependency Status](https://gemnasium.com/louismullie/treat.png)
2
+
3
+ Treat is a toolkit for natural language processing and computational linguistics in Ruby. It provides a common API for a number of gems and external libraries for document retrieval, parsing, annotation, and information extraction.
4
+
5
+ **Current features**
6
+
7
+ * Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus)
8
+ * Text retrieval with indexation and full-text search (Ferret)
9
+ * Text chunkers, sentence segmenters, tokenizers, and parsers for several languages (Stanford & Enju)
10
+ * Word inflectors, including stemmers, conjugators, declensors, and number inflection
11
+ * Lexical resources (WordNet interface, several POS taggers for English, Stanford taggers for several languages)
12
+ * Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
13
+ * Simple text statistics (frequency, TF*IDF)
14
+ * Serialization of annotated entities to YAML or XML format
15
+ * Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats
16
+ * Linguistic resources, including full ISO-639-1 and ISO-639-2 support, and tag alignments for several treebanks
17
+
18
+ <br>
19
+
20
+ **Resources**
21
+
22
+ * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/master/frames).
23
+ * See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
24
+ * Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
25
+ * Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
26
+ * View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
27
+ * Open an [issue](https://github.com/louismullie/treat/issues).
28
+
29
+ <br>
30
+
31
+ **License**
32
+
33
+ This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
data/files/INFO ADDED
@@ -0,0 +1 @@
1
+ This is a folder containing the files downloaded by Treat.
data/lib/treat.rb CHANGED
@@ -1,127 +1,62 @@
1
- # Main namespace for Treat modules.
2
- #
3
- # === Entities
4
- #
5
- # Entities are Tree structures that represent textual entities
6
- # (from a collection of texts down to an individual word), with
7
- # a value, features, children and dependencies linking it to other
8
- # textual entities.
9
- #
10
- # Here are some example of how to create entities:
11
- #
12
- # Treat.sweeten!
13
- #
14
- # c = Collection 'folder_with_documents'
15
- # d = Document 'filename.txt'
16
- # p = Paragraph 'A short story. The end.'
17
- # s = Sentence 'That is not a sentence.'
18
- # w = Word 'fox'
19
- #
20
- # Here is a list of entities and their description:
21
- #
22
- # - A Collection represents a folder with different textual documents.
23
- # - A Document represents a file with a textual content.
24
- # - A Zone can be a Section, Title, a Paragraph or a List and represents an intra-section division of content.
25
- # - A Sentence represents just that.
26
- # - A Phrase is a group of words; a Sentence is a Phrase with a sentence ender (.!?)
27
- # - A Token can be a Word, a Number, a Punctuation or a Symbol (non-punctuation, non-alphanumeric character).
28
- #
29
- # === Functions
30
- #
31
- # A worker class is defined for each implemented algorithm performing a given
32
- # task. These classes are clustered into workers performing the same given task
33
- # differently (Group), and the groups are clustered into Categories
34
- # of groups of workers that perform related tasks.
35
- #
36
- # Here are the different Categories and their description:
37
- #
38
- # - Processors perform the building of tree of entities representing texts (chunking, segmenting, tokenizing, parsing).
39
- # - Lexicalizers give lexical information about words (synsets, semantic relationships, tag, word category).
40
- # - Extractors extract semantic information about an entity (topic, date, time, named entity).
41
- # - Inflectors allow to retrieve the different inflections of a word (declensors, conjugators, stemmers, lemmatizers).
42
- # - Formatters handle the conversion of entities to and from different formats (readers, serializers, unserializers, visualizers).
43
- # - Retrievers allow to index and search collections of documents.
44
- #
45
- # === Linguistic Resources
46
- #
47
- # The Languages module contains linguistic information about
48
- # languages (full ISO-639-1 and 2 language list, tag alignments
49
- # for three treebanks, word categories, etc.)
50
- #
51
- # === Exception Class.
52
- #
53
- # Treat::Exception defines a custom exception class for the Treat module.
54
- #
55
1
  module Treat
2
+
3
+ # Require custom exception class.
4
+ require 'treat/exception'
56
5
 
57
- # Make sure that we are running on Ruby 1.9 or higher.
6
+ # Treat requires Ruby 1.9 or higher.
58
7
  if RUBY_VERSION <= '1.9'
59
- raise 'Treat requires Ruby 1.9 or higher.'
8
+ raise Treat::Exception,
9
+ 'Treat requires Ruby 1.9 or higher.'
60
10
  end
61
11
 
62
12
  # The current version of Treat.
63
- VERSION = "0.2.5"
64
-
65
- #$LOAD_PATH << '/ruby/gems/treat/lib/' # Remove for release
66
-
67
- # Create class variables for the Treat module.
13
+ VERSION = "1.0.0"
14
+
15
+ # Add methods to handle syntactic sugar,
16
+ # language configuration options, and paths.
17
+ require 'treat/configurable'
18
+ extend Treat::Configurable
19
+
20
+ # The folders in the library and descriptions.
21
+ Paths = {
22
+ :tmp => 'temporary files',
23
+ :lib => 'class and module definitions',
24
+ :bin => 'binary files',
25
+ :files => 'user-saved files',
26
+ :data => 'data set files',
27
+ :models => 'model files',
28
+ :spec => 'spec test files'
29
+ }
30
+
31
+ # Add methods to provide access to common paths.
68
32
  class << self
69
- # Boolean - output debug information.
70
- attr_accessor :debug
71
- # Symbol - default language to use when detect_language is false.
72
- attr_accessor :default_language
73
- # Symbol - default encoding to use.
74
- attr_accessor :default_encoding
75
- # Boolean - detect language or use default?
76
- attr_accessor :detect_language
77
- # Symbol - the ideal entity level to detect language at
78
- # (e.g., :entity, :sentence, :zone, :section, :document)
79
- attr_accessor :language_detection_level
80
- # String - folder of this file.
81
- attr_accessor :lib
82
- # String - folder for tests.
83
- attr_accessor :test
33
+ Paths.each do |path, _|
34
+ define_method(path) do
35
+ (File.dirname(__FILE__).
36
+ split('/')[0..-2].join('/') +
37
+ '/' + path.to_s + '/').gsub(
38
+ 'lib/../', '')
39
+ end
40
+ end
84
41
  end
85
42
 
86
- # Turn off debug by default.
87
- self.debug = false
88
- # Set the default language to english.
89
- self.default_language = :eng
90
- # Set the default encoding to utf-8.
91
- self.default_encoding = :utf_8
92
- # Turn language detection off by default.
93
- self.detect_language = false
94
- # Detect the language once per text by default.
95
- self.language_detection_level = :zone
96
- # Set the lib path to that of this file.
97
- self.lib = File.dirname(__FILE__)
98
- # Set the paths to the test folder.
99
- self.test = self.lib + '/../test'
100
-
101
- # Require inline C
102
- # require 'inline'
103
-
104
- # Require modified core classes.
105
43
  require 'treat/object'
106
44
  require 'treat/kernel'
107
- # Require all files for the Treat library.
108
- require 'treat/exception'
45
+ require 'treat/downloader'
109
46
  require 'treat/languages'
47
+ require 'treat/linguistics'
110
48
  require 'treat/entities'
111
49
  require 'treat/categories'
50
+ require 'treat/data_set'
112
51
  require 'treat/proxies'
113
- require 'treat/sugar'
114
-
115
- # Make sugar available when needed.
116
- extend Treat::Sugar
117
52
 
118
53
  # Install packages for a given language.
119
54
  def self.install(language = :english)
120
- require 'treat/install'
55
+ require 'treat/installer'
121
56
  Treat::Installer.install(language)
122
57
  end
123
58
 
124
- # Turn on detect language.
125
- def self.detect!; self.detect_language = true; end
126
-
59
+ # Enable syntactic sugar by default.
60
+ Treat.sweeten!
61
+
127
62
  end
data/lib/treat/ai.rb ADDED
@@ -0,0 +1,12 @@
1
+ module Treat::AI
2
+
3
+ module Classifiers
4
+ extend Treat::Groupable
5
+ self.type = :annotator
6
+ self.targets = [:entity]
7
+ self.default = :id3
8
+ end
9
+
10
+ extend Treat::Categorizable
11
+
12
+ end
data/lib/treat/ai/classifiers/id3.rb ADDED
@@ -0,0 +1,27 @@
1
+ class Treat::AI::Classifiers::ID3
2
+
3
+ require 'decisiontree'
4
+
5
+ @@classifiers = {}
6
+
7
+ def self.classify(entity, options = {})
8
+
9
+ set = options[:training]
10
+ cl = set.classification
11
+
12
+ if !@@classifiers[cl]
13
+ dec_tree = DecisionTree::ID3Tree.new(
14
+ set.labels, set.items,
15
+ cl.default, :continuous)
16
+ dec_tree.train
17
+ else
18
+ dec_tree = @@classifiers[cl]
19
+ end
20
+
21
+ dec_tree.predict(
22
+ cl.export_item(entity, false)
23
+ )[0]
24
+
25
+ end
26
+
27
+ end
data/lib/treat/categories.rb CHANGED
@@ -1,43 +1,90 @@
1
- module Treat
2
- # This module keeps track of all categories that
3
- # exist and the methods they implement.
4
- module Categories
5
- class << self
6
- # A list of all categories.
7
- attr_accessor :list
1
+ # This module keeps track of all the Treat::Categorizable
2
+ # modules that exist and the methods they define.
3
+ #
4
+ #
5
+ # - Processors perform the building of tree of
6
+ # entities representing texts (chunking,
7
+ # segmenting, tokenizing, parsing).
8
+ # - Lexicalizers give lexical information about
9
+ # words (synsets, semantic relationships,
10
+ # tag, word category).
11
+ # - Extractors extract semantic information about
12
+ # an entity (language, topic, date, time, named
13
+ # entity, coreferences).
14
+ # - Inflectors allow to retrieve the different
15
+ # inflections of a word (declensors, conjugators,
16
+ # stemmers, lemmatizers).
17
+ # - Formatters handle the conversion of entities to
18
+ # and from different formats (readers, serializers,
19
+ # unserializers, visualizers).
20
+ # - Retrievers allow to index and search collections
21
+ # of documents.
22
+ module Treat::Categories
23
+
24
+ class << self
25
+ # A list of all categories.
26
+ attr_accessor :list
27
+ end
28
+
29
+ # Array - list of all categories.
30
+ self.list = []
31
+ # A lookup table for entity types.
32
+ @@lookup = {}
33
+
34
+ # Require all categories.
35
+ require 'treat/categorizable'
36
+ require 'treat/formatters'
37
+ require 'treat/processors'
38
+ require 'treat/lexicalizers'
39
+ require 'treat/inflectors'
40
+ require 'treat/extractors'
41
+ require 'treat/retrievers'
42
+ require 'treat/ai'
43
+
44
+ # Create the lookup table.
45
+ self.list.each do |category|
46
+ category.groups.each do |group|
47
+ group = category.const_get(group)
48
+ @@lookup[group.method] = group
49
+ group.presets.each do |x,y|
50
+ @@lookup[x] = group
51
+ end if group.presets
8
52
  end
9
- # Array - list of all categories.
10
- self.list = []
11
- @@lookup = nil
12
- # Find the class of a group given its method.
13
- def self.lookup(method)
14
- return @@lookup[method] if @@lookup
15
- @@lookup = {}
53
+ end
54
+
55
+ # Find the class of a group given its method.
56
+ def self.lookup(method)
57
+ @@lookup[method]
58
+ end
59
+
60
+ # Fix -- This must be moved urgently.
61
+ Treat::Entities::Entity.class_eval do
62
+
63
+ alias :true_language :language
64
+
65
+ def language(extractor = nil, options = {})
16
66
 
17
- self.list.each do |category|
18
- category.groups.each do |group|
19
- group = category.const_get(group)
20
- @@lookup[group.method] = group
21
- methods = group.presets.merge(
22
- group.preprocessors.merge(
23
- group.postprocessors
24
- )
25
- )
26
- methods.each do |x,y|
27
- @@lookup[x] = group
28
- end
67
+ if is_a?(Treat::Entities::Symbol) ||
68
+ is_a?(Treat::Entities::Number)
69
+ return Treat.default_language
70
+ end
71
+
72
+ if !Treat.detect_language
73
+ return Treat.default_language
74
+ else
75
+ dlvl = Treat.language_detection_level
76
+ if (Treat::Entities.rank(type) <
77
+ Treat::Entities.rank(dlvl)) &&
78
+ has_parent?
79
+ anc = ancestor_with_type(dlvl)
80
+ return anc.language if anc
29
81
  end
30
82
  end
31
83
 
32
- @@lookup[method]
84
+ true_language(extractor, options)
85
+
33
86
  end
34
- # Require all categories.
35
- require 'treat/category'
36
- require 'treat/formatters'
37
- require 'treat/processors'
38
- require 'treat/lexicalizers'
39
- require 'treat/inflectors'
40
- require 'treat/extractors'
41
- require 'treat/retrievers'
87
+
42
88
  end
89
+
43
90
  end
data/lib/treat/categorizable.rb ADDED
@@ -0,0 +1,44 @@
1
+ # A categorizable module brings together groups
2
+ # of algorithms that perform similar functions.
3
+ module Treat::Categorizable
4
+
5
+ # The contents of each categorizable
6
+ # module are groupable.
7
+ require 'treat/groupable'
8
+
9
+ # Add workers to the Entities based on the
10
+ # configuration for a given category.
11
+ def self.extended(category)
12
+ Treat::Categories.list << category
13
+ category.module_eval do
14
+ groups.each do |group|
15
+ group = const_get(group)
16
+ group.targets.each do |entity_type|
17
+ entity = Treat::Entities.
18
+ const_get(cc(entity_type))
19
+ entity.class_eval do
20
+ add_workers group
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ # Get the list of groups defined
28
+ # under this module.
29
+ @@groups = self.constants
30
+
31
+ # Populate a list of methods.
32
+ @@methods = []
33
+ @@groups.each do |group|
34
+ @@methods << const_get(group).method
35
+ end
36
+
37
+ # Provide a list of methods implemented in
38
+ # the groups contained within this category.
39
+ def methods; @@methods; end
40
+
41
+ # Provides a list of groups within this category.
42
+ def groups; self.constants; end
43
+
44
+ end
data/lib/treat/classification.rb ADDED
@@ -0,0 +1,61 @@
1
+ class Treat::Classification
2
+
3
+ attr_reader :types
4
+ attr_reader :features
5
+ attr_reader :question
6
+ attr_reader :labels
7
+ attr_reader :default
8
+
9
+ def initialize(type_or_types, feature_or_features, question, default = false)
10
+
11
+ @types, @features,
12
+ @question, @default =
13
+ [*type_or_types],
14
+ [*feature_or_features],
15
+ question, default
16
+
17
+ @labels = []
18
+
19
+ @features.each do |cmd|
20
+ if cmd.is_a?(Array)
21
+ @labels << cmd[0]
22
+ else
23
+ @labels << cmd
24
+ end
25
+ end
26
+
27
+ end
28
+
29
+ def export_item(e, include_question = true)
30
+
31
+ line = []
32
+
33
+ @features.each do |cmd|
34
+ begin
35
+ if cmd.is_a?(Array)
36
+ line << cmd[1].call(e)
37
+ else
38
+ line << e.send(cmd)
39
+ end
40
+ rescue Treat::Exception
41
+ dflt = (
42
+ (cmd.is_a?(Array) && cmd[2]) ?
43
+ cmd[2] : nil
44
+ )
45
+ line << dflt
46
+ end
47
+ end
48
+
49
+ begin
50
+ if include_question
51
+ line << e.send(@question)
52
+ end
53
+ rescue Treat::Exception
54
+ line << @default
55
+ end
56
+ line[-1] = '' if line[-1].nil?
57
+ line
58
+
59
+ end
60
+
61
+ end