treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,15 @@
|
|
1
|
+
To Infinity and Beyond
|
2
|
+
|
3
|
+
THE annual Consumer Electronics Show (CES) feels right at home amid the blazing neon excess of Las Vegas. For four days each January, thousands of companies gather to showcase their flashiest technologies at America's biggest trade show. This year over 20,000 brand new gadgets competed for attendees' attention. Each has its own power cord or battery, and almost every one is either bigger or faster—and thus more power-hungry—than last year's model.
|
4
|
+
|
5
|
+
The International Energy Agency expects consumer electronics' appetite for electricity to double over the next decade, from 15% to 30% of residential consumption worldwide. Even supposedly energy-saving innovations, such as the organic-LED (OLED) screen in the massive 55-inch television proudly displayed by South Korea's LG, consume oodles of power, just by dint of being so big.
|
6
|
+
|
7
|
+
But this year's show also sees a welcome counter-trend. Several companies launched products labelled as having "infinite power". Such devices are meant to generate at least as much power as they consume. Buy one of these gizmos, the theory goes, and you need never connect it to a wall socket.
|
8
|
+
|
9
|
+
Eton Corporation announced Rukus, a portable boom box that can stream music from Bluetooth devices such as smartphones and tablets. For every hour that the Rukus is in full sunlight, it can play an hour of music, harvesting solar energy from 40 square inches (260 square centimetres) of photovoltaic panels. It has an internal battery which stores sunshine for cloudy picnics and can also be tapped to recharge other mobile gadgets.
|
10
|
+
|
11
|
+
Similarly impressive is a high-tech cover for Amazon's Kindle ebook reader by Solar Focus (see picture). A solar panel on the outer face gives 90 minutes of reading time for every hour of sunlight. Surplus energy is stored in a small lithium-ion battery and allows an LED reading lamp to run for up to 50 hours without drawing on the Kindle's own battery.
|
12
|
+
|
13
|
+
Both the Kindle and the Rukus have frugal monochrome E Ink displays that consume a fraction of the power of the colourful LCD screens found on most gadgets. Even the best solar-panel case for Apple's iPhone, for instance, can do no more than slow the rate at which the smartphone runs down. You might think, then, that something as large as a practical solar-powered electric car would be utterly impossible. Not so, says Ford. At CES, the carmaker showed a domestic solar panel kit it claims will offset all the electricity used over the lifetime of its new Focus Electric plug-in vehicle. The 150 square-foot (14 square-metre) array, to be installed on owners' houses, should feed as much power to the grid as the car draws to recharge its battery, enough to account for every mile a typical motorist drives. Panels on roofs in rainy Seattle might allow 12,000 miles (19,300km) of driving each year; denizens of sunny Tucson may squeeze out 15,000 miles or more.
|
14
|
+
|
15
|
+
The solar kit will set a Ford owner back around $10,000 (the price would be higher were it not for American federal incentives). It comes with a 25-year guarantee—22 years more than the warranty on the car. This is still some way from the dream of a self-contained solar runabout, but it is a start. The increasingly power-hungry electronics industry will no doubt need to steer itself in a similar direction.
|
data/lib/treat.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
# This file requires all source code files for the Treat module.
|
2
|
+
|
3
|
+
#
|
4
|
+
# Main Treat namespace.
|
5
|
+
#
|
6
|
+
# Textual model:
|
7
|
+
#
|
8
|
+
# - Tree - Contains abstract tree node and leaf structures.
|
9
|
+
# - Entities - Contains concrete node and leaf structures
|
10
|
+
# that represent textual entities.
|
11
|
+
#
|
12
|
+
# Algorithm namespaces:
|
13
|
+
#
|
14
|
+
# - Detectors - Namespace for language, encoding, and format
|
15
|
+
# detectors.
|
16
|
+
# - Extractors - Namespace for algorithms that extract
|
17
|
+
# information from entities.
|
18
|
+
# - Formatters - Namespace for algorithms that handle
|
19
|
+
# conversion to and from different formats.
|
20
|
+
# - Inflectors - Namespace for algorithms that supply
|
21
|
+
# the base form, inflections and declensions of a word.
|
22
|
+
# - Lexicalizers - Namespace for algorithms that supply
|
23
|
+
# lexical information about a word (part of speech,
|
24
|
+
# synsets, etc.)
|
25
|
+
# - Processors - Namespace for algorithms that process an
|
26
|
+
# entity into a tree of sub-entities.
|
27
|
+
#
|
28
|
+
# Other modules:
|
29
|
+
#
|
30
|
+
# - Group - Creates functions for algorithm groups.
|
31
|
+
# - Proxies - Provide proxies for Treat functions on String,
|
32
|
+
# Numeric and Array classes.
|
33
|
+
# - Utilities - Supply utility functions used across the library.
|
34
|
+
#
|
35
|
+
module Treat

  # Make sure that we are running on Ruby 1.9 or higher.
  # The version is compared component by component as integers;
  # comparing the raw strings would order them lexicographically
  # (e.g. '1.10' would sort before '1.9').
  if (RUBY_VERSION.split('.').map { |part| part.to_i } <=> [1, 9]) < 0
    raise 'Treat requires Ruby 1.9 or higher.'
  end

  # The current version of Treat.
  VERSION = "0.1.1"

  # Require all files for the Treat library.
  require 'treat/exception'
  require 'treat/utilities'
  require 'treat/resources'
  require 'treat/entities'
  require 'treat/categories'
  require 'treat/proxies'

  # Provides syntactic sugar.
  require 'treat/sugar'
  extend Sugar

  # Create module-level configuration accessors for Treat.
  class << self
    # Default language to use when detect_language is false.
    attr_accessor :default_language
    # Default encoding to use.
    attr_accessor :default_encoding
    # Boolean - detect language or use default?
    attr_accessor :detect_language
    # Identifier - the ideal entity level to detect language at
    # (:entity, :sentence, :zone, :text, :document, etc.)
    attr_accessor :language_detection_level
    # String - main folder for executable files.
    attr_accessor :bin
  end

  # Folder paths, all derived from the directory holding this file.
  @@lib = File.dirname(__FILE__)
  @@test = @@lib + '/../test/'
  @@tmp = @@lib + '/../tmp/'
  @@bin = @@lib + '/../bin'

  # Read-only accessors for the lib, test and tmp folder paths.
  def self.lib; @@lib; end
  def self.test; @@test; end
  def self.tmp; @@tmp; end

  # Set the default language to English.
  self.default_language = :eng
  # Set the default encoding to UTF-8.
  self.default_encoding = :utf_8
  # Turn language detection off by default.
  self.detect_language = false
  # Detect the language once per text by default.
  self.language_detection_level = :text
  # Set the bin path to the gem's bin folder by default.
  self.bin = @@bin
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
module Treat
  # Factory methods that build entities from files, folders,
  # strings or numbers. Meant to be mixed into entity classes
  # with extend; several methods check which class they were
  # called on (self) and refuse to build an unrelated entity.
  module Buildable

    # Build an entity from whatever was supplied: a readable
    # path (file or folder), a String or a Numeric.
    #
    # The id parameter is accepted but not used by this method.
    #
    # Raises a RuntimeError if the input is none of the above.
    def from_anything(file_or_value, id)
      if File.readable?(file_or_value.to_s)
        from_file(file_or_value)
      elsif file_or_value.is_a?(String)
        from_string(file_or_value)
      elsif file_or_value.is_a?(Numeric)
        from_numeric(file_or_value)
      else
        raise "Unrecognizable input #{file_or_value}. "+
        "Use filename, folder, text or a number."
      end
    end

    # Build the entity whose type best matches the string,
    # based on how many sentence terminators (. ! ?), newlines
    # and spaces it contains.
    #
    # Raises Treat::Exception when called on Document or
    # Collection, which can only be built from a file/folder.
    def from_string(string)
      if self == Treat::Entities::Document ||
        self == Treat::Entities::Collection
        raise Treat::Exception,
        "Cannot create a document or collection from " +
        "a string (need a readable file/folder)."
      end
      string = string.to_s
      dot = string.count('.') + string.count('!') + string.count('?')
      # Several terminators, or one terminator spread over
      # several lines, is treated as a text.
      return Treat::Entities::Text.new(string) if dot > 1 ||
      (string.count("\n") > 0 && dot == 1)
      return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
      # Anything containing a space at this point is a phrase;
      # the remainder is a single token.
      return Treat::Entities::Phrase.new(string) unless string.count(' ') == 0
      return Treat::Entities::Clitic.new(string) if string == "'s"
      return Treat::Entities::Word.new(string) if string =~ /^[[:alpha:]\-']+$/
      return Treat::Entities::Number.new(string) if string =~ /^[[:digit:]]+$/
      return Treat::Entities::Punctuation.new(string) if string =~ /^[[:punct:]]+$/
      Treat::Entities::Symbol.new(string)
    end

    # Build a Number entity from a numeric object.
    #
    # Raises Treat::Exception when called on any class other
    # than Treat::Entities::Number.
    def from_numeric(numeric)
      unless self == Treat::Entities::Number
        raise Treat::Exception,
        "Cannot create something else than a " +
        "number from a numeric object."
      end
      Treat::Entities::Number.new(numeric.to_s)
    end

    # Build a Collection holding one Document per file found
    # directly inside the folder (sub-folders are skipped).
    #
    # Raises Treat::Exception if the path is not a folder, is
    # not readable, or if called on a class other than
    # Treat::Entities::Collection.
    def from_folder(folder)
      unless FileTest.directory?(folder)
        raise Treat::Exception,
        "Path '#{folder}' does not point to a folder."
      end
      unless File.readable?(folder)
        raise Treat::Exception,
        "Folder '#{folder}' is not readable."
      end
      unless self == Treat::Entities::Collection
        raise Treat::Exception,
        "Cannot create something else than a " +
        "collection from folder '#{folder}'."
      end
      c = Treat::Entities::Collection.new
      folder += '/' unless folder[-1] == '/'
      Dir[folder + '*'].each do |f|
        next if FileTest.directory?(f)
        c << Treat::Entities::Document.from_file(f)
      end
      c
    end

    # Build an entity from a path: folders go to from_folder,
    # files whose extension names a known unserializer go to
    # from_serialized_file, everything else to from_raw_file.
    #
    # Raises Treat::Exception if the path is not readable.
    def from_file(file)
      unless File.readable?(file)
        raise Treat::Exception,
        "Path '#{file}' does not point to a readable file."
      end
      return from_folder(file) if FileTest.directory?(file)
      # Normalize case first so that '.YML' is humanized too.
      ext = file.split('.')[-1].downcase
      ext = 'yaml' if ext == 'yml' # Humanize the yml extension.
      if Treat::Formatters::Unserializers.list.include?(ext.intern)
        from_serialized_file(file)
      else
        from_raw_file(file)
      end
    end

    # Build a Document from a raw (non-serialized) file and
    # return the result of calling read on it.
    #
    # Raises Treat::Exception when called on any class other
    # than Treat::Entities::Document.
    def from_raw_file(file)
      unless self == Treat::Entities::Document
        raise Treat::Exception,
        "Cannot create something else than a " +
        "document from raw file '#{file}'."
      end
      d = Treat::Entities::Document.new(file)
      d.read
    end

    # Unserialize a file and return the first child of the
    # temporary document, after marking that child as a root.
    #
    # Raises Treat::Exception when called on any class other
    # than Treat::Entities::Document or Collection.
    def from_serialized_file(file)
      unless [Treat::Entities::Document, Treat::Entities::Collection].include?(self)
        raise Treat::Exception,
        "Cannot create something else than a document " +
        "or collection from serialized file '#{file}'."
      end
      d = Treat::Entities::Document.new(file)
      d.unserialize
      d.children[0].set_as_root!
      d.children[0]
    end

  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Treat
  # Registry of all algorithm categories. Each category is
  # appended to the list elsewhere; this module exposes the
  # combined list of method names that the categories provide.
  module Categories
    # The list of registered categories.
    class << self
      attr_accessor :list
    end
    self.list = []

    # Boolean - is the symbol among the methods reported
    # by the registered categories?
    def self.have_method?(sym)
      methods.include?(sym)
    end

    # Memoized list of all methods implemented by all
    # Treat categories. The list is built on the first call
    # that finds it empty and then reused as-is.
    # NOTE(review): categories registered after the list has
    # been built are not picked up - confirm this is intended.
    @@methods = []
    def self.methods
      if @@methods.empty?
        list.each { |category| @@methods.concat(category.methods) }
      end
      @@methods
    end

    # Load the category helper and every category.
    require 'treat/category'
    require 'treat/detectors'
    require 'treat/formatters'
    require 'treat/processors'
    require 'treat/lexicalizers'
    require 'treat/extractors'
    require 'treat/inflectors'
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Treat
  # Provides functions common to all algorithm categories.
  # A category is a module that extends this one; its
  # constants are taken to be its groups.
  module Category
    require 'treat/group'

    # Hook run when a module extends Category: registers the
    # category, then, for every group and every entity type the
    # group targets, adds the group's delegators to that entity.
    def self.extended(category)
      Treat::Categories.list << category
      category.module_eval do
        groups.each do |group_name|
          group = const_get(group_name)
          group.targets.each do |target|
            Entities.const_get(cc(target)).class_eval { add_delegators group }
          end
        end
      end
    end

    # The names of all constants defined in the category.
    def groups
      constants
    end

    # Provide a list of the methods implemented in
    # the groups contained within that category
    # (one entry per group).
    def methods
      groups.map { |group_name| const_get(group_name).method }
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module Treat
  # Makes a class delegatable, allowing calls on it to be forwarded
  # to a delegate class performing the appropriate call.
  module Delegatable

    # Get the default delegate for that language
    # inside the given group.
    #
    # Raises Treat::Exception if the language has no entry in
    # Treat::Resources::Delegates, or if that entry lists no
    # delegate for the group.
    def get_language_delegate(language, group)
      lang = Treat::Resources::Languages.describe(language)
      lclass = cc(lang).intern
      unless Treat::Resources::Delegates.constants.include?(lclass)
        raise Treat::Exception,
        "Language '#{lang}' is not supported (yet)."
      end
      # The category is the second-to-last part of the group name.
      cat = group.to_s.split('::')[-2].intern
      delegates = Treat::Resources::Delegates.
      const_get(lclass).const_get(cat)
      g = ucc(cl(group)).intern
      if !delegates[g] || !delegates[g][0]
        # Turn the group name into a readable, singular label.
        d = ucc(cl(group)).gsub('_', ' ')
        d = d[0..-2] if d[-1] == 's'
        d = 'delegator to find ' + d
        raise Treat::Exception, "No #{d}" +
        " is available for the #{lang} language."
      end
      delegates[g][0]
    end

    # Add decorator methods to entities: one instance method for
    # each method the group defines beyond Object's own and the
    # standard group accessors. Each decorator calls the group's
    # main method m with options[:decorator] set to its own name.
    def decorate(group, m)
      decorators = group.methods -
      Object.methods -
      [:type, :type=, :targets, :targets=,
        :default, :default=, :add,
        :has_target?, :list]
      decorators.each do |decorator_m|
        define_method(decorator_m) do |delegate=nil, options={}|
          options[:decorator] = decorator_m
          send(m, delegate, options)
        end
      end
    end

    # Build the "delegate not found" error message, including
    # suggested alternatives. Returns the message; the caller
    # is responsible for raising.
    def delegate_not_found(klass, group)
      "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
      did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
    end

    # Add delegator group to all entities of a class: defines the
    # group's main method (plus its decorators) on the class.
    #
    # The generated method resolves a delegate (explicit, group
    # default, or language default), runs it through accept,
    # applies the decorator if one was requested, and - for
    # annotator groups - caches the result in @features.
    def add_delegators(group)
      # Define each method in group.
      self.class_eval do
        m = group.method
        decorate(group, m)
        define_method(m) do |delegate=nil, options={}|
          decorator = options.delete(:decorator)
          # Guard against entities whose feature store was never set up.
          @features ||= {}
          # Results are cached under the decorator's name when one is
          # used, so the cache must be looked up under that same key.
          f = decorator.nil? ? m : decorator
          if !@features[f].nil?
            @features[f]
          else
            if delegate.nil?
              delegate = group.default.nil? ?
              self.class.get_language_delegate(language, group) :
              group.default
              raise "No default delegate for #{group}." if delegate == :none
            end
            unless group.list.include?(delegate)
              raise Treat::Exception,
              self.class.delegate_not_found(delegate, group)
            end
            delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
            result = accept(group, delegate_klass, m, options)
            result = group.send(decorator, self, result) if decorator
            @features[f] = result if group.type == :annotator
            result
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Treat
  # Category of algorithms that detect a specific piece of
  # meta-information about an entity: its encoding, its
  # format or its language.
  module Detectors
    # Encoding detection group; annotates any entity and
    # uses the :native delegate by default.
    module Encoding
      extend Treat::Group
      self.type = :annotator
      self.targets = [:entity]
      self.default = :native
    end
    # Format detection group; annotates any entity and
    # uses the :file delegate by default.
    module Format
      extend Treat::Group
      self.type = :annotator
      self.targets = [:entity]
      self.default = :file
    end
    # Language detection group; annotates any entity and
    # uses the :what_language delegate by default.
    module Language
      extend Treat::Group
      self.type = :annotator
      self.targets = [:entity]
      self.default = :what_language
    end
    # Extended last, once every group above is defined.
    extend Treat::Category
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Treat
  module Detectors
    module Encoding
      # Load the 'rchardet19' gem inside the silently helper.
      silently { require 'rchardet19' }
      # Wrapper around the 'rchardet19' gem, used to detect
      # the encoding of an entity's text.
      class RChardet19
        # Runs CharDet on the entity's string value and wraps
        # the detected encoding name (dashes turned into
        # underscores, as a symbol) together with its confidence
        # in a Treat::Feature, returning that feature's best value.
        #
        # Options: none.
        def self.encoding(entity, options={})
          detection = CharDet.detect(entity.to_s)
          name = detection.encoding.gsub('-', '_').intern
          Treat::Feature.new({ name => detection.confidence }).best
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Treat
  module Detectors
    module Format
      # A wrapper for the *NIX 'file' command,
      # which uses etc/magic to detect the format
      # of a file.
      class File
        # Returns an identifier representing
        # the format of a file using the *NIX
        # 'file' command: one of :xml, :html,
        # :image, :pdf or :txt.
        #
        # The entity's string value is written to a temporary
        # file, which is then handed to 'file'.
        #
        # Raises Treat::Exception if the reported format is
        # none of the supported ones.
        #
        # Options: none.
        def self.format(entity, options = {})
          format = nil
          create_temp_file(:txt, entity.to_s) do |tmp|
            # 'file' echoes the file name before its verdict; drop
            # it so the path itself can never match a keyword.
            format = `file #{tmp}`.sub("#{tmp}:", '')
          end
          classify(format)
        end

        # Map the description printed by 'file' to a format
        # identifier. Specific formats are tested before the
        # generic 'text', because descriptions such as
        # "HTML document, ASCII text" contain both keywords.
        def self.classify(description)
          description = description.to_s
          if description.include?('XML')
            :xml
          elsif description.include?('HTML')
            :html
          elsif description.include?('image')
            :image
          elsif description.include?('PDF')
            :pdf
          elsif description.include?('text')
            :txt
          else
            raise Treat::Exception,
            "Unsupported text format #{description}."
          end
        end
        private_class_method :classify
      end
    end
  end
end
|