treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,15 @@
1
+ To Infinity and Beyond
2
+
3
+ THE annual Consumer Electronics Show (CES) feels right at home amid the blazing neon excess of Las Vegas. For four days each January, thousands of companies gather to showcase their flashiest technologies at America's biggest trade show. This year over 20,000 brand new gadgets competed for attendees' attention. Each has its own power cord or battery, and almost every one is either bigger or faster—and thus more power-hungry—than last year's model.
4
+
5
+ The International Energy Agency expects consumer electronics' appetite for electricity to double over the next decade, from 15% to 30% of residential consumption worldwide. Even supposedly energy-saving innovations, such as the organic-LED (OLED) screen in the massive 55-inch television proudly displayed by South Korea's LG, consume oodles of power, just by dint of being so big.
6
+
7
+ But this year's show also sees a welcome counter-trend. Several companies launched products labelled as having "infinite power". Such devices are meant to generate at least as much power as they consume. Buy one of these gizmos, the theory goes, and you need never connect it to a wall socket.
8
+
9
+ Eton Corporation announced Rukus, a portable boom box that can stream music from Bluetooth devices such as smartphones and tablets. For every hour that the Rukus is in full sunlight, it can play an hour of music, harvesting solar energy from 40 square inches (260 square centimetres) of photovoltaic panels. It has an internal battery which stores sunshine for cloudy picnics and can also be tapped to recharge other mobile gadgets.
10
+
11
+ Similarly impressive is a high-tech cover for Amazon's Kindle ebook reader by Solar Focus (see picture). A solar panel on the outer face gives 90 minutes of reading time for every hour of sunlight. Surplus energy is stored in a small lithium-ion battery and allows an LED reading lamp to run for up to 50 hours without drawing on the Kindle's own battery.
12
+
13
+ Both the Kindle and the Rukus have frugal monochrome E Ink displays that consume a fraction of the power of the colourful LCD screens found on most gadgets. Even the best solar-panel case for Apple's iPhone, for instance, can do no more than slow the rate at which the smartphone runs down. You might think, then, that something as large as a practical solar-powered electric car would be utterly impossible. Not so, says Ford. At CES, the carmaker showed a domestic solar panel kit it claims will offset all the electricity used over the lifetime of its new Focus Electric plug-in vehicle. The 150 square-foot (14 square-metre) array, to be installed on owners' houses, should feed as much power to the grid as the car draws to recharge its battery—enough to account for every mile a typical motorist drives. Panels on roofs in rainy Seattle might allow 12,000 miles (19,300km) of driving each year; denizens of sunny Tucson may squeeze out 15,000 miles or more.
14
+
15
+ The solar kit will set a Ford owner back around $10,000 (the price would be higher were it not for American federal incentives). It comes with a 25-year guarantee—22 years more than the warranty on the car. This is still some way from the dream of a self-contained solar runabout, but it is a start. The increasingly power-hungry electronics industry will no doubt need to steer itself in a similar direction.
@@ -0,0 +1,91 @@
1
+ # This file requires all source code files for the Treat module.
2
+
3
+ #
4
+ # Main Treat namespace.
5
+ #
6
+ # Textual model:
7
+ #
8
+ # - Tree - Contains abstract tree node and leaf structures.
9
+ # - Entities - Contains concrete node and leaf structures
10
+ # that represent textual entities.
11
+ #
12
+ # Algorithm namespaces:
13
+ #
14
+ # - Detectors - Namespace for language, encoding, and format
15
+ # detectors.
16
+ # - Extractors - Namespace for algorithms that extract
17
+ # information from entities.
18
+ # - Formatters - Namespace for algorithms that handle
19
+ # conversion to and from different formats.
20
+ # - Inflectors - Namespace for algorithms that supply
21
+ # the base form, inflections and declensions of a word.
22
+ # - Lexicalizers - Namespace for algorithms that supply
23
+ # lexical information about a word (part of speech,
24
+ # synsets, etc.)
25
+ # - Processors - Namespace for algorithms that process an
26
+ # entity into a tree of sub-entities.
27
+ #
28
+ # Other modules:
29
+ #
30
+ # - Group - Creates functions for algorithm groups.
31
+ # - Proxies - Provide proxies for Treat functions on String,
32
+ # Numeric and Array classes.
33
+ # - Utilities - Supply utility functions used across the library.
34
+ #
35
+ module Treat
36
+
37
+ # Make sure that we are running on Ruby 1.9 or higher.
38
+ if RUBY_VERSION <= '1.9'
39
+ raise 'Treat requires Ruby 1.9 or higher.'
40
+ end
41
+
42
+ # The current version of Treat.
43
+ VERSION = "0.1.1"
44
+
45
+ # Require all files for the Treat library.
46
+ require 'treat/exception'
47
+ require 'treat/utilities'
48
+ require 'treat/resources'
49
+ require 'treat/entities'
50
+ require 'treat/categories'
51
+ require 'treat/proxies'
52
+
53
+ # Provides syntactic sugar.
54
+ require 'treat/sugar'
55
+ extend Sugar
56
+
57
+ # Create class variables for the Treat module.
58
+ class << self
59
+ # Default language to use when detect_language is false
60
+ attr_accessor :default_language
61
+ # Default encoding to use.
62
+ attr_accessor :default_encoding
63
+ # Boolean - detect language or use default?
64
+ attr_accessor :detect_language
65
+ # Identifier - the ideal entity level to detect language at
66
+ # (:entity, :sentence, :zone, :text, :document, etc.)
67
+ attr_accessor :language_detection_level
68
+ # String - main folder for executable files.
69
+ attr_accessor :bin
70
+ end
71
+
72
+ # Folder paths.
73
+ @@lib = File.dirname(__FILE__)
74
+ @@test = @@lib + '/../test/'
75
+ @@tmp = @@lib + '/../tmp/'
76
+ @@bin = @@lib + '/../bin'
77
+ def self.lib; @@lib; end
78
+ def self.test; @@test; end
79
+ def self.tmp; @@tmp; end
80
+
81
+ # Set the default language to English.
82
+ self.default_language = :eng
83
+ # Set the default encoding to utf-8.
84
+ self.default_encoding = :utf_8
85
+ # Turn language detection off by default.
86
+ self.detect_language = false
87
+ # Detect the language once per text by default.
88
+ self.language_detection_level = :text
89
+ # Set the bin path to the gem's bin folder by default.
90
+ self.bin = @@bin
91
+ end
@@ -0,0 +1,115 @@
1
+ module Treat
2
+ module Buildable
3
+
4
+ def from_anything(file_or_value, id)
5
+ if File.readable?(file_or_value.to_s)
6
+ from_file(file_or_value)
7
+ elsif file_or_value.is_a?(String)
8
+ from_string(file_or_value)
9
+ elsif file_or_value.is_a?(Numeric)
10
+ from_numeric(file_or_value)
11
+ else
12
+ raise "Unrecognizable input #{file_or_value}. "+
13
+ "Use filename, folder, text or a number."
14
+ end
15
+ end
16
+
17
+ def from_string(string)
18
+ if self == Treat::Entities::Document ||
19
+ self == Treat::Entities::Collection
20
+ raise Treat::Exception,
21
+ "Cannot create a document or collection from " +
22
+ "a string (need a readable file/folder)."
23
+ end
24
+ string = string.to_s
25
+ dot = string.count('.') + string.count('!') + string.count('?')
26
+ return Treat::Entities::Text.new(string) if dot > 1 ||
27
+ (string.count("\n") > 0 && dot == 1)
28
+ return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
29
+ if string.count(' ') == 0
30
+ return Treat::Entities::Clitic.new(string) if string == "'s"
31
+ return Treat::Entities::Word.new(string) if string =~ /^[[:alpha:]\-']+$/
32
+ return Treat::Entities::Number.new(string) if string =~ /^[[:digit:]]+$/
33
+ return Treat::Entities::Punctuation.new(string) if string =~ /^[[:punct:]]+$/
34
+ return Treat::Entities::Symbol.new(string)
35
+ else
36
+ return Treat::Entities::Phrase.new(string)
37
+ end
38
+ return Treat::Entities::Unknown.new(string)
39
+ end
40
+
41
+ def from_numeric(numeric)
42
+ unless self == Treat::Entities::Number
43
+ raise Treat::Exception,
44
+ "Cannot create something else than a " +
45
+ " number from a numeric object."
46
+ end
47
+ Treat::Entities::Number.new(numeric.to_s)
48
+ end
49
+
50
+ def from_folder(folder)
51
+ unless FileTest.directory?(folder)
52
+ raise Treat::Exception,
53
+ "Path '#{folder}' does not point to a folder."
54
+ end
55
+ unless File.readable?(folder)
56
+ raise Treat::Exception,
57
+ "Folder '#{folder}' is not readable."
58
+ end
59
+ unless self == Treat::Entities::Collection
60
+ raise Treat::Exception,
61
+ "Cannot create something else than a " +
62
+ "collection from folder '#{folder}'."
63
+ end
64
+ c = Treat::Entities::Collection.new
65
+ folder += '/' unless folder[-1] == '/'
66
+ Dir[folder + '*'].each do |f|
67
+ next if FileTest.directory?(f)
68
+ c << Treat::Entities::Document.from_file(f)
69
+ end
70
+ c
71
+ end
72
+
73
+ def from_file(file)
74
+ unless File.readable?(file)
75
+ raise Treat::Exception,
76
+ "Path '#{file}' does not point to a readable file."
77
+ end
78
+ if FileTest.directory?(file)
79
+ from_folder(file)
80
+ else
81
+ ext = file.split('.')[-1]
82
+ ext = 'yaml' if ext == 'yml' # Humanize the yml extension.
83
+ if Treat::Formatters::Unserializers.list.
84
+ include?(ext.downcase.intern)
85
+ from_serialized_file(file)
86
+ else
87
+ from_raw_file(file)
88
+ end
89
+ end
90
+ end
91
+
92
+ def from_raw_file(file)
93
+ unless self == Treat::Entities::Document
94
+ raise Treat::Exception,
95
+ "Cannot create something else than a " +
96
+ "document from raw file '#{file}'."
97
+ end
98
+ d = Treat::Entities::Document.new(file)
99
+ d.read
100
+ end
101
+
102
+ def from_serialized_file(file)
103
+ unless [Treat::Entities::Document, Treat::Entities::Collection].include?(self)
104
+ raise Treat::Exception,
105
+ "Cannot create something else than a " +
106
+ "document from raw file '#{file}'."
107
+ end
108
+ d = Treat::Entities::Document.new(file)
109
+ d.unserialize
110
+ d.children[0].set_as_root!
111
+ d.children[0]
112
+ end
113
+
114
+ end
115
+ end
@@ -0,0 +1,29 @@
1
+ module Treat
2
+ module Categories
3
+ # Modify the module that includes Category to
4
+ # setup autoload, delegators and provide a list
5
+ # of methods.
6
+ class << self; attr_accessor :list; end
7
+ self.list = []
8
+ # Boolean - does any of the categories
9
+ # groups respond to the symbol.
10
+ def self.have_method?(sym); methods.include?(sym); end
11
+ # Provide a list of all methods implemented
12
+ # by all Treat categories.
13
+ @@methods = []
14
+ def self.methods
15
+ return @@methods unless @@methods.empty?
16
+ self.list.each do |ns|
17
+ ns.methods.each { |method| @@methods << method }
18
+ end
19
+ @@methods
20
+ end
21
+ require 'treat/category'
22
+ require 'treat/detectors'
23
+ require 'treat/formatters'
24
+ require 'treat/processors'
25
+ require 'treat/lexicalizers'
26
+ require 'treat/extractors'
27
+ require 'treat/inflectors'
28
+ end
29
+ end
@@ -0,0 +1,28 @@
1
+ module Treat
2
+ # Provides functions common to all algorithm categories.
3
+ module Category
4
+ require 'treat/group'
5
+ def self.extended(category)
6
+ Treat::Categories.list << category
7
+ category.module_eval do
8
+ groups.each do |group|
9
+ group = const_get(group)
10
+ group.targets.each do |entity_type|
11
+ entity = Entities.const_get(cc(entity_type))
12
+ entity.class_eval { add_delegators group }
13
+ end
14
+ end
15
+ end
16
+ end
17
+ def groups; self.constants; end
18
+ # Provide a list of methods implemented in
19
+ # the groups contained within that category.
20
+ def methods
21
+ methods = []
22
+ groups.each do |group|
23
+ methods << const_get(group).method
24
+ end
25
+ methods
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,90 @@
1
+ module Treat
2
+ # Makes a class delegatable, allowing calls on it to be forwarded
3
+ # to a delegate class performing the appropriate call.
4
+ module Delegatable
5
+
6
+ # Get the default delegate for that language
7
+ # inside the given group.
8
+ def get_language_delegate(language, group)
9
+ lang = Treat::Resources::Languages.describe(language)
10
+ lclass = cc(lang).intern
11
+ if Treat::Resources::Delegates.constants.include?(lclass)
12
+ cat = group.to_s.split('::')[-2].intern
13
+ lclass = Treat::Resources::Delegates.
14
+ const_get(lclass).const_get(cat)
15
+ g = ucc(cl(group)).intern
16
+ if !lclass[g] || !lclass[g][0]
17
+ d = ucc(cl(group))
18
+ d.gsub!('_', ' ')
19
+ d = d[0..-2] if d[-1] == 's'
20
+ d = 'delegator to find ' + d
21
+ raise Treat::Exception, "No #{d}" +
22
+ " is available for the #{lang} language."
23
+ end
24
+ return lclass[g][0]
25
+ else
26
+ raise Treat::Exception,
27
+ "Language '#{lang}' is not supported (yet)."
28
+ end
29
+ end
30
+
31
+ # Add decorator methods to entities.
32
+ def decorate(group, m)
33
+ decorators = group.methods -
34
+ Object.methods -
35
+ [:type, :type=, :targets, :targets=,
36
+ :default, :default=, :add,
37
+ :has_target?, :list]
38
+ decorators.each do |decorator_m|
39
+ define_method(decorator_m) do |delegate=nil, options={}|
40
+ options[:decorator] = decorator_m
41
+ send(m, delegate, options)
42
+ end
43
+ end
44
+ end
45
+
46
+ # Raise an exception and suggest alternatives.
47
+ def delegate_not_found(klass, group)
48
+ "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
49
+ did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
50
+ end
51
+
52
+ # Add delegator group to all entities of a class.
53
+ def add_delegators(group)
54
+ # Define each method in group.
55
+ self.class_eval do
56
+ m = group.method
57
+ decorate(group, m)
58
+ define_method(m) do |delegate=nil, options={}|
59
+ decorator = options.delete(:decorator)
60
+ puts self.id if !@features
61
+ if !@features[m].nil?
62
+ @features[m]
63
+ else
64
+ if delegate.nil?
65
+ delegate = group.default.nil? ?
66
+ self.class.get_language_delegate(language, group) :
67
+ group.default
68
+ raise "No default delegate for #{group}." if delegate == :none
69
+ end
70
+ if not group.list.include?(delegate)
71
+ raise Treat::Exception,
72
+ self.class.delegate_not_found(delegate, group)
73
+ else
74
+ delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
75
+ result = accept(group, delegate_klass, m, options)
76
+ if decorator
77
+ result = group.send(decorator, self, result)
78
+ end
79
+ if group.type == :annotator
80
+ f = decorator.nil? ? m : decorator
81
+ @features[f] = result
82
+ end
83
+ result
84
+ end
85
+ end
86
+ end
87
+ end
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,28 @@
1
+ module Treat
2
+ # Detectors detect a specific meta-information about
3
+ # an entity, such as encoding, format and language.
4
+ module Detectors
5
+ # Group for algorithms that detect encoding.
6
+ module Encoding
7
+ extend Group
8
+ self.type = :annotator
9
+ self.targets = [:entity]
10
+ self.default = :native
11
+ end
12
+ # Group for algorithms that support format detection.
13
+ module Format
14
+ extend Group
15
+ self.type = :annotator
16
+ self.targets = [:entity]
17
+ self.default = :file
18
+ end
19
+ # Group for algorithms that do language detection.
20
+ module Language
21
+ extend Group
22
+ self.type = :annotator
23
+ self.targets = [:entity]
24
+ self.default = :what_language
25
+ end
26
+ extend Treat::Category
27
+ end
28
+ end
@@ -0,0 +1,12 @@
1
+ module Treat
2
+ module Detectors
3
+ module Encoding
4
+ class Native
5
+ def self.encoding(entity, options={})
6
+ entity.value.encoding.name.
7
+ gsub('-', '_').downcase.intern
8
+ end
9
+ end
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,24 @@
1
+ module Treat
2
+ module Detectors
3
+ module Encoding
4
+ # Require the 'rchardet19' gem.
5
+ silently { require 'rchardet19' }
6
+ # A wrapper for the 'rchardet19' gem, which
7
+ # detects the encoding of a file.
8
+ class RChardet19
9
+ # Returns an Encoding object representing
10
+ # the encoding of the supplied entity's
11
+ # text value.
12
+ #
13
+ # Options: none.
14
+ def self.encoding(entity, options={})
15
+ r = CharDet.detect(entity.to_s)
16
+ Treat::Feature.new({
17
+ r.encoding.
18
+ gsub('-', '_').intern =>
19
+ r.confidence}).best
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,36 @@
1
+ module Treat
2
+ module Detectors
3
+ module Format
4
+ # A wrapper for the *NIX 'file' command,
5
+ # which uses etc/magic to detect the format
6
+ # of a file.
7
+ class File
8
+ # Returns an identifier representing
9
+ # the format of a file using the *NIX
10
+ # 'file' command.
11
+ #
12
+ # Options: none.
13
+ def self.format(entity, options = {})
14
+ format = nil
15
+ create_temp_file(:txt, entity.to_s) do |tmp|
16
+ format = `file #{tmp}`
17
+ end
18
+ if format.scan('text')
19
+ :txt
20
+ elsif format.scan('XML')
21
+ :xml
22
+ elsif format.scan('HTML')
23
+ :html
24
+ elsif format.scan('image')
25
+ :image
26
+ elsif format.scan('PDF')
27
+ :pdf
28
+ else
29
+ raise Treat::Exception,
30
+ "Unsupported text format #{format}."
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end