treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
data/lib/treat/downloader.rb
DELETED
@@ -1,87 +0,0 @@
|
|
1
|
-
# Download a file without storing it entirely in memory.
class Treat::Downloader

  require 'net/http'
  require 'fileutils'

  class << self
    # When true, the 'progressbar' library is loaded and a
    # progress bar is updated while the body is streamed.
    attr_accessor :show_progress
  end

  self.show_progress = false

  # Maximum number of download attempts before giving up.
  MaxTries = 3

  # Download a file into destination, and return
  # the path to the downloaded file. If the filename
  # is nil, the local file is named 'top'.
  #
  # - protocol: 'https' enables SSL; anything else is plain HTTP.
  # - server: host name to connect to.
  # - dir: remote directory of the resource (may be nil).
  # - file: remote file name (may be nil).
  # - target_base: local base directory (defaults to Treat.files).
  # - target_dir: sub-directory of target_base (defaults to server).
  #
  # Raises Treat::Exception when the download still fails after
  # MaxTries attempts; the partial local file is removed first.
  def self.download(protocol, server, dir, file = nil, target_base = nil, target_dir = nil)

    # Deliberately lazy: the dependency is optional.
    require 'progressbar' if self.show_progress

    target_base ||= Treat.files
    target_dir ||= server

    dir = dir.to_s
    dir += '/' unless dir.empty? || dir.end_with?('/')
    resource = "#{dir}#{file}"
    resource = "/#{resource}" unless resource.start_with?('/')
    url = "#{server}#{resource}"

    path = File.join(target_base, target_dir)
    # mkdir_p also creates missing parents and is a no-op
    # when the directory already exists.
    FileUtils.mkdir_p(path)
    local_path = "#{path}/#{file || 'top'}"

    tries = 0
    begin
      # Re-opening with 'w' on every attempt truncates data left
      # over from a failed try; binary mode avoids newline
      # translation corrupting non-text downloads.
      File.open(local_path, 'wb') do |out|
        # SSL must be requested when the session is started;
        # it cannot be switched on once the connection is open.
        Net::HTTP.start(server, :use_ssl => (protocol == 'https')) do |http|
          http.request_get(resource) do |response|

            unless response.code == '200'
              raise Treat::Exception,
              "response code was not 200 " +
              "OK, but was #{response.code}. "
            end

            length = response.content_length
            unless length
              warn 'Unknown file size; ETR unknown.'
              length = 10000
            end

            pbar = self.show_progress ?
            ProgressBar.new(url, length) : nil

            response.read_body do |segment|
              pbar.inc(segment.length) if pbar
              out.write(segment)
            end

            pbar.finish if pbar
          end
        end
      end

      local_path

    # Treat::Exception is listed explicitly because its superclass
    # is not visible here; signals and exits are no longer swallowed.
    rescue StandardError, Treat::Exception => error
      tries += 1
      retry if tries < MaxTries
      File.delete(local_path) if File.exist?(local_path)
      raise Treat::Exception,
      "Couldn't download #{url}. (#{error.message})"
    end

  end

end
|
@@ -1,102 +0,0 @@
|
|
1
|
-
module Treat::Entities

  # Require the generic entity class.
  require 'treat/entities/entity'

  # Represents a collection of texts.
  class Collection < Entity

    # Initialize the collection with a folder
    # containing the texts of the collection.
    # When the folder holds a '.index' directory,
    # its path is stored as the :index feature.
    def initialize(folder = nil, id = nil)
      super('', id)
      set :folder, folder
      # folder defaults to nil; only look for an
      # index when a folder was actually supplied.
      if folder
        index = "#{folder}/.index"
        set :index, index if FileTest.directory?(index)
      end
    end

    # Works like the default <<, but if the
    # entity being added is a collection or a
    # document, then copy that collection or
    # document into this collection's folder
    # (unless copy is false).
    def <<(entities, copy = true)
      entities = [entities] unless entities.is_a?(Array)
      entities.each do |entity|
        if copy && [:document, :collection].include?(entity.type)
          # NOTE(review): the value returned by copy_into is not
          # used; the entities passed in are the ones added below.
          # Confirm whether the copies were meant to be added instead.
          entity.copy_into(self)
        end
      end
      super(entities)
    end

  end

  # Represents a document.
  class Document < Entity

    # Initialize the document, storing the
    # file it refers to as the :file feature.
    def initialize(file = nil, id = nil)
      super('', id)
      set :file, file
    end

  end

  # Represents a section, usually with a title
  # and at least one paragraph.
  class Section < Entity; end

  # Represents a zone of text
  # (Title, Paragraph, List, Quote).
  class Zone < Entity; end

  # Represents a title, subtitle, logical header.
  class Title < Zone; end

  # Represents a paragraph.
  class Paragraph < Zone; end

  # Represents a list.
  class List < Zone; end

  # Represents a group of words.
  class Phrase < Entity; end

  # Represents a group of words with a sentence ender.
  class Sentence < Phrase; end

  # Represents a terminal element in the text structure.
  class Token < Entity; end

  # Represents a word.
  class Word < Token; end

  # Represents a clitic ('s).
  class Enclitic < Token; end

  # Represents a number.
  class Number < Token
    # Integer value parsed from the token's string form.
    def to_i; to_s.to_i; end
    # Float value parsed from the token's string form.
    def to_f; to_s.to_f; end
  end

  # Represents a punctuation sign.
  class Punctuation < Token; end

  # Represents a character that is neither
  # alphabetical, numerical or a punctuation
  # character (e.g. @#$%&*).
  class Symbol < Token; end

  # Represents a url.
  class Url < Token; end

  # Represents a valid RFC822 address.
  class Email < Token; end

  # Represents an entity of unknown type.
  # Unlike the other classes here, it does
  # not inherit from Entity.
  class Unknown; end

end
|
data/lib/treat/exception.rb
DELETED
data/lib/treat/extractors.rb
DELETED
@@ -1,79 +0,0 @@
|
|
1
|
-
# Extractors pull information out of texts. Every group
# below extends Treat::Groupable and then declares its
# type, the entity types it targets and, for some groups,
# a default worker.
module Treat::Extractors

  # Language detection, applicable to any entity.
  module Language
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:entity]
    self.default = :what_language
  end

  # Date/time extraction from a phrase.
  module Time
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:phrase]
  end

  # Topic extraction from a document, section or zone.
  module Topics
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:document, :section, :zone]
  end

  # Keyword extraction from a document, section or zone.
  module Keywords
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:document, :section, :zone]
  end

  # Extraction of clusters of topic words from a collection.
  module TopicWords
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:collection]
  end

  # Named entity extraction from phrases and words.
  module NameTag
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:phrase, :word]
  end

  # Coreference extraction from a zone.
  module Coreferences
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:zone]
  end

  # Retrieval of the main grammatical roles
  # in a phrase (subject, verb, object).
  module Roles
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:phrase]
  end

  # Annotator group targeting words; defaults to :native.
  module TfIdf
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:word]
    self.default = :native
  end

  # Annotator group targeting documents;
  # defaults to :keyword_count.
  module Summary
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:document]
    self.default = :keyword_count
  end

  # Make Extractors categorizable. This stays last,
  # after all the groups have been defined.
  extend Treat::Categorizable

end
|
@@ -1,64 +0,0 @@
|
|
1
|
-
# Stores an entity in a Mongo collection.
class Treat::Formatters::Serializers::Mongo

  # Require the Mongo DB driver.
  require 'mongo'

  # Serialize an entity into a Mongo database. The entity's
  # id, value and features are inserted as one document.
  #
  # Options:
  # - (String) :database => the name of the database
  #   to write to (required).
  #
  # Raises Treat::Exception when :database is missing.
  def self.serialize(entity, options = {})

    unless options[:database]
      raise Treat::Exception,
      'Must supply the database name.'
    end

    @@conn ||= Mongo::Connection.new
    # Looked up on every call so that a different
    # :database option is honoured.
    db = @@conn[options[:database]]

    # Collect [type, id] pairs for the ancestors, in the
    # order each_ancestor yields them, then reverse.
    path = []
    entity.each_ancestor do |ancestor|
      path << [ancestor.type, ancestor.id]
    end

    # Walk the reversed path, indexing one level deeper at
    # each step. With no ancestors the database itself is used.
    # NOTE(review): chaining the lookups is an assumption; the
    # previous code indexed the database afresh for every
    # ancestor and never kept the result. Confirm the layout.
    target = db
    path.reverse.each do |type_id|
      target = target[type_id[0]][type_id[1]]
    end

    # Pluralize the entity type to name the destination.
    type = entity.type.to_s
    type = (type == 'entity') ? 'entities' : (type + 's')
    doc = target[type]

    features = {}
    features['id'] = entity.id
    features['value'] = entity.value

    # Flatten feature values: entities are stored by id,
    # arrays and hashes by their inspect string, and
    # everything else by its string form.
    entity.features.each_pair do |feature, value|
      if value.is_a? Treat::Entities::Entity
        value = value.id
      elsif value.is_a?(Array) || value.is_a?(Hash)
        value = value.inspect
      else
        value = value.to_s
      end
      features[feature.to_s] = value
    end

    # Insert once; the entity used to be re-inserted
    # for every known entity type.
    doc.insert(features)

  end

end
|
data/lib/treat/formatters.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
# Formatters convert entities to and from external
# file formats. Every group below extends
# Treat::Groupable and then declares its type, the
# entity types it targets and, for some, a default worker.
module Treat::Formatters

  # Readers read the content of a document.
  module Readers
    extend Treat::Groupable
    self.type    = :computer
    self.targets = [:document]
  end

  # Unserializers rebuild entities from
  # a serialized format.
  module Unserializers
    extend Treat::Groupable
    self.type    = :computer
    self.targets = [:entity]
  end

  # Serializers turn entities into a
  # storable format; defaults to :yaml.
  module Serializers
    extend Treat::Groupable
    self.type    = :computer
    self.targets = [:entity]
    self.default = :yaml
  end

  # Visualizers turn entities into a
  # visualizable format; defaults to :tree.
  module Visualizers
    extend Treat::Groupable
    self.type    = :computer
    self.targets = [:entity]
    self.default = :tree
  end

  # Make Formatters categorizable. This stays last,
  # after all the groups have been defined.
  extend Treat::Categorizable

end
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Treat::Helpers
|
2
|
-
|
3
|
-
# Replaces decimal points that sit between two digits with
# a placeholder sequence, and restores them afterwards.
class DecimalPointEscaper

  # Placeholder written in place of a decimal point.
  EscapeChar = '^^'
  # The same placeholder, escaped for use inside a regexp.
  EscapedEscapeChar = '\^\^'

  # Replace, in place, every '.' that has a digit on both
  # sides with EscapeChar. Lookarounds are used so the digits
  # themselves are not consumed: in a dotted run such as
  # "1.2.3" every point is escaped, not every other one.
  #
  # Returns the modified string, or nil when nothing
  # was replaced (the return value of String#gsub!).
  def self.escape!(s)
    s.gsub!(/(?<=[0-9])\.(?=[0-9])/, EscapeChar)
  end

  # Reverse escape!: replace, in place, every EscapeChar
  # that has a digit on both sides with '.'.
  #
  # Returns the modified string, or nil when nothing
  # was replaced (the return value of String#gsub!).
  def self.unescape!(s)
    s.gsub!(/(?<=[0-9])#{EscapedEscapeChar}(?=[0-9])/, '.')
  end

end
|
21
|
-
|
22
|
-
end
|
data/lib/treat/inflectors.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
# Category of worker groups that retrieve the
# inflections of a word. Every group below extends
# Treat::Groupable and then declares its type, the
# entity types it targets and, for some, preset options.
module Treat::Inflectors

  # Stemmers return the stem (*not root form*) of a word.
  module Stemmers
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:word]
  end

  # Declensors retrieve the declensions of a noun.
  # The :count option is preset with :plural and :singular.
  module Declensors
    extend Treat::Groupable
    self.type          = :annotator
    self.targets       = [:word]
    self.preset_option = :count
    self.presets       = [:plural, :singular]
  end

  # Conjugators retrieve the conjugations of a word
  # given a mode, tense, person, and/or number.
  # The :form option is preset with the four forms below.
  module Conjugators
    extend Treat::Groupable
    self.type          = :annotator
    self.targets       = [:word]
    self.preset_option = :form
    self.presets       = [
      :infinitive, :present_participle,
      :plural_verb, :singular_verb
    ]
  end

  # Cardinalizers retrieve the full text
  # description of a cardinal number.
  module Cardinalizers
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:number]
  end

  # Ordinalizers retrieve the full text
  # description of an ordinal number.
  module Ordinalizers
    extend Treat::Groupable
    self.type    = :annotator
    self.targets = [:number]
  end

  # Make Inflectors categorizable. This stays last,
  # after all the groups have been defined.
  extend Treat::Categorizable

end
|
data/lib/treat/kernel.rb
DELETED
@@ -1,208 +0,0 @@
|
|
1
|
-
# Extends the core Kernel module to provide
# easy access to utility functions used across
# the library.
module Kernel

  # Require file utilities for creating and
  # deleting temporary files.
  require 'fileutils'

  # A list of acronyms used in class names within
  # the program. These do not CamelCase; they
  # CAMELCase.
  Acronyms = %w[xml html txt odt abw doc yaml uea lda pdf ptb dot ai id3 svo mlp]

  # A cache to optimize camel casing.
  @@cc_cache = {}

  # A cache to optimize un camel casing.
  @@ucc_cache = {}

  # Runs a block of code without warnings and returns the
  # block's value. The previous warning level is restored
  # even when the block raises.
  def silence_warnings(&block)
    warn_level = $VERBOSE
    $VERBOSE = nil
    block.call
  ensure
    $VERBOSE = warn_level
  end

  # Runs a block of code while sending stdout to +log+
  # (the null device by default), and returns the block's
  # value. When Treat.silence is false the block simply
  # runs with stdout untouched.
  def silence_stdout(log = NULL_DEVICE)
    return yield unless Treat.silence
    saved = $stdout.dup
    sink = File.new(log, 'w')
    begin
      $stdout.reopen(sink)
      yield
    ensure
      # Point the original stream back at its old target
      # (rather than swapping in the duplicate) and release
      # both temporary handles, also when the block raises.
      $stdout.flush
      $stdout.reopen(saved)
      sink.close
      saved.close
    end
  end

  # Create a temporary file in Treat.tmp, optionally filled
  # with +value+, pass its path to the block and return the
  # block's value. The file is deleted afterwards.
  def create_temp_file(ext, value = nil, &block)
    fname = Treat.tmp +
    "#{Random.rand(10000000)}.#{ext}"
    # Write and close first so the content is on disk
    # before the block (or an external program) reads it.
    File.open(fname, 'w') do |f|
      f.write(value) if value
    end
    block.call(fname)
  ensure
    File.delete(fname) if fname && File.exist?(fname)
  end

  # Create a temporary directory, pass its path to the
  # block and return the block's value. The directory is
  # deleted after execution of the block.
  def create_temp_dir(&block)
    dname = "#{Treat.lib}/../tmp/" +
    "#{Random.rand(10000000)}"
    Dir.mkdir(dname)
    block.call(dname)
  ensure
    FileUtils.rm_rf(dname) if dname
  end

  # Convert un_camel_case to CamelCase. Names listed
  # in Acronyms are upcased as a whole. Results are cached.
  def camel_case(o_phrase)
    cached = @@cc_cache[o_phrase]
    return cached if cached

    phrase = o_phrase.to_s.dup
    if Acronyms.include?(phrase)
      phrase = phrase.upcase
    else
      phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
      phrase.gsub!('_', '')
    end
    @@cc_cache[o_phrase] = phrase
  end

  alias :cc :camel_case

  # Convert CamelCase to un_camel_case. Names listed in
  # Acronyms are downcased as a whole. Results are cached.
  def un_camel_case(o_phrase)
    cached = @@ucc_cache[o_phrase]
    return cached if cached

    phrase = o_phrase.to_s.dup
    if Acronyms.include?(phrase.downcase)
      phrase = phrase.downcase
    else
      phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
      phrase = phrase[1..-1] if phrase[0] == '_'
    end
    @@ucc_cache[o_phrase] = phrase
  end

  alias :ucc :un_camel_case

  # Retrieve the Class from a Module::Class.
  def class_name(n); n.to_s.split('::')[-1]; end

  alias :cl :class_name

  # Search the list to see if there are words similar to
  # +name+ (edit distance of one). If yes, return a string
  # saying " Perhaps you meant ... ?" with the names;
  # otherwise return an empty string.
  def did_you_mean?(list, name)
    name = name.to_s
    sugg = list.map { |e| e.to_s }.select do |element|
      l = levenshtein(element, name)
      l > 0 && l < 2
    end

    return '' if sugg.empty?
    return " Perhaps you meant '#{sugg[0]}' ?" if sugg.size == 1

    sugg_quote = sugg[0..-2].map { |x| '\'' + x + '\'' }
    " Perhaps you meant " +
    "#{sugg_quote.join(', ')}," +
    " or '#{sugg[-1]}' ?"
  end

  alias :dym? :did_you_mean?

  # Return, as a symbol, the method name found in the n-th
  # entry of the call stack as seen from this method, with
  # any "block in" prefix and class qualifier removed.
  # Returns nil when that entry does not exist or carries
  # no method name.
  def caller_method(n = 3)
    at = (caller(n) || []).first
    # Accept both the backtick and the plain quote that
    # different Ruby versions put around the label.
    match = /^(.+?):(\d+)(?::in [`'](.*)')?/.match(at.to_s)
    return nil unless match && match[3]
    label = match[3].gsub(/block (?:\(\d+ levels\) )?in /, '')
    label.sub(/\A.*[#.]/, '').intern
  end

  alias :cm :caller_method

  # Detect the platform we're running on, from RUBY_PLATFORM.
  # Returns :mac, :windows, :linux or :unknown.
  def detect_platform
    platform = RUBY_PLATFORM.downcase
    return :mac if platform.include?("darwin")
    # Windows builds report either "mswin" or "mingw".
    return :windows if platform =~ /mswin|mingw/
    return :linux if platform.include?("linux")
    return :unknown
  end

  # Return the Levenshtein distance between two strings,
  # taking into account the costs of insertion, deletion,
  # and substitution. Returns nil if either string is nil.
  # Used by did_you_mean?
  def levenshtein(first, other, ins=1, del=1, sub=1)
    return nil if first.nil? || other.nil?
    # dm[i][j] is the cost between the first i characters
    # of +other+ and the first j characters of +first+.
    dm = [(0..first.length).map { |j| j * ins }]
    (1..other.length).each do |i|
      row = [i * del]
      (1..first.length).each do |j|
        # Compare the j-th character of +first+ with
        # the i-th character of +other+.
        cost = (first[j-1] == other[i-1]) ? 0 : sub
        row[j] = [
          dm[i-1][j-1] + cost,
          row[j-1] + ins,
          dm[i-1][j] + del
        ].min
      end
      dm[i] = row
    end
    dm[other.length][first.length]
  end

  # Path used by silence_stdout to discard output.
  if detect_platform == :windows
    NULL_DEVICE = 'NUL'
  else
    NULL_DEVICE = '/dev/null'
  end

  # Ask "Do you want to <msg>?" on stdout and read answers
  # from STDIN until one of +valid_answers+ is entered;
  # return that answer. Raises EOFError if STDIN is
  # exhausted before a valid answer is read.
  def prompt(msg, valid_answers)

    multiline = msg.include?("\n")
    n = multiline ? ":\n" : ''
    q = multiline ? '' : '?'

    s = "\nPlease enter one of #{valid_answers.join(', ')}: "
    puts "Do you want to #{n}#{msg}#{q} \n#{s}"

    loop do
      line = STDIN.gets
      if line.nil?
        raise EOFError, 'No more input while waiting for an answer.'
      end
      answer = line.strip
      if valid_answers.include?(answer)
        puts
        return answer
      end
      puts "Invalid input."
      puts s
    end

  end

end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# Language configuration for Arabic: dependency lists
# plus, per worker category, the workers declared for it.
class Treat::Languages::Arabic

  # No dependencies are declared for this language.
  RequiredDependencies = []
  OptionalDependencies = []

  # Categories with no workers declared.
  Extractors = {}
  Inflectors = {}

  # Tagging and parsing both list the :stanford worker.
  Lexicalizers = { taggers: [:stanford] }
  Processors = { parsers: [:stanford] }

  Retrievers = {}

end
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# Language configuration for Chinese: dependency lists
# plus, per worker category, the workers declared for it.
class Treat::Languages::Chinese

  # No dependencies are declared for this language.
  RequiredDependencies = []
  OptionalDependencies = []

  # Categories with no workers declared.
  Extractors = {}
  Inflectors = {}

  # Tagging and parsing both list the :stanford worker.
  Lexicalizers = { taggers: [:stanford] }
  Processors = { parsers: [:stanford] }

  Retrievers = {}

end
|