treat 1.2.0 → 2.0.0rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -2
- data/README.md +12 -21
- data/lib/treat/autoload.rb +44 -0
- data/lib/treat/config/config.rb +38 -0
- data/lib/treat/config/configurable.rb +51 -0
- data/lib/treat/config/data/config.rb +50 -0
- data/lib/treat/config/data/core.rb +52 -0
- data/lib/treat/config/data/databases.rb +10 -0
- data/lib/treat/config/data/entities.rb +15 -0
- data/lib/treat/config/data/languages/agnostic.rb +31 -0
- data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
- data/lib/treat/config/data/languages/english.rb +95 -0
- data/lib/treat/config/data/languages/french.rb +148 -0
- data/lib/treat/config/data/languages/german.rb +135 -0
- data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
- data/lib/treat/config/data/languages/italian.rb +162 -0
- data/lib/treat/config/data/languages/polish.rb +11 -0
- data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
- data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
- data/lib/treat/config/data/languages/spanish.rb +291 -0
- data/lib/treat/config/data/languages/swedish.rb +289 -0
- data/lib/treat/config/data/libraries.rb +12 -0
- data/lib/treat/config/data/linguistics.rb +44 -0
- data/lib/treat/config/data/tags.rb +328 -0
- data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
- data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
- data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
- data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
- data/lib/treat/config/importable.rb +31 -0
- data/lib/treat/config/paths.rb +23 -0
- data/lib/treat/config/tags.rb +37 -0
- data/lib/treat/core/dsl.rb +55 -0
- data/lib/treat/{installer.rb → core/installer.rb} +10 -12
- data/lib/treat/core/server.rb +40 -0
- data/lib/treat/entities/entities.rb +101 -0
- data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
- data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
- data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
- data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
- data/lib/treat/entities/entity/debuggable.rb +86 -0
- data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
- data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
- data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
- data/lib/treat/entities/entity/registrable.rb +36 -0
- data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
- data/lib/treat/entities/entity.rb +86 -77
- data/lib/treat/exception.rb +3 -0
- data/lib/treat/helpers/hash.rb +29 -0
- data/lib/treat/helpers/help.rb +35 -0
- data/lib/treat/helpers/object.rb +55 -0
- data/lib/treat/helpers/string.rb +124 -0
- data/lib/treat/{core → learning}/data_set.rb +11 -11
- data/lib/treat/{core → learning}/export.rb +3 -3
- data/lib/treat/{core → learning}/problem.rb +26 -16
- data/lib/treat/{core → learning}/question.rb +5 -9
- data/lib/treat/loaders/linguistics.rb +8 -9
- data/lib/treat/loaders/stanford.rb +5 -11
- data/lib/treat/modules.rb +33 -0
- data/lib/treat/proxies/array.rb +27 -0
- data/lib/treat/proxies/language.rb +47 -0
- data/lib/treat/proxies/number.rb +18 -0
- data/lib/treat/proxies/proxy.rb +25 -0
- data/lib/treat/proxies/string.rb +18 -0
- data/lib/treat/version.rb +10 -1
- data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
- data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
- data/lib/treat/workers/extractors/language/what_language.rb +8 -6
- data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
- data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
- data/lib/treat/workers/extractors/time/chronic.rb +2 -4
- data/lib/treat/workers/extractors/time/nickel.rb +19 -20
- data/lib/treat/workers/extractors/time/ruby.rb +2 -1
- data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
- data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
- data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
- data/lib/treat/workers/formatters/readers/image.rb +19 -9
- data/lib/treat/workers/formatters/readers/odt.rb +2 -1
- data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
- data/lib/treat/workers/formatters/readers/xml.rb +0 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
- data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
- data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
- data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
- data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
- data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
- data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
- data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
- data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
- data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
- data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
- data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
- data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
- data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
- data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
- data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
- data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
- data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
- data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
- data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
- data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
- data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
- data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
- data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
- data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
- data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
- data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
- data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
- data/lib/treat/workers/processors/chunkers/html.rb +1 -6
- data/lib/treat/workers/processors/parsers/enju.rb +2 -4
- data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
- data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
- data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
- data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
- data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
- data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
- data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
- data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
- data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
- data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
- data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
- data/lib/treat/workers/workers.rb +6 -0
- data/lib/treat.rb +18 -32
- data/models/MANIFEST +1 -0
- data/spec/core/data_set.rb +174 -0
- data/spec/core/export.rb +52 -0
- data/spec/core/problem.rb +144 -0
- data/spec/core/question.rb +52 -0
- data/spec/{collection.rb → entities/collection.rb} +20 -35
- data/spec/{document.rb → entities/document.rb} +3 -54
- data/spec/{entity.rb → entities/entity.rb} +10 -9
- data/spec/entities/phrase.rb +33 -0
- data/spec/{token.rb → entities/token.rb} +0 -57
- data/spec/entities/word.rb +3 -0
- data/spec/{zone.rb → entities/zone.rb} +0 -26
- data/spec/helper.rb +116 -32
- data/spec/sandbox.rb +258 -25
- data/spec/treat.rb +26 -34
- data/spec/workers/agnostic.rb +137 -0
- data/spec/workers/english.rb +194 -0
- data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
- data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
- data/spec/workers/examples/english/phrase.xml +5 -0
- data/spec/workers/examples/english/test.txt +1 -0
- data/spec/workers/language.rb +280 -0
- data/spec/workers.rb +28 -0
- metadata +122 -105
- data/lib/treat/config/core/acronyms.rb +0 -5
- data/lib/treat/config/core/encodings.rb +0 -8
- data/lib/treat/config/core/entities.rb +0 -2
- data/lib/treat/config/core/language.rb +0 -3
- data/lib/treat/config/core/paths.rb +0 -8
- data/lib/treat/config/core/syntax.rb +0 -1
- data/lib/treat/config/core/verbosity.rb +0 -1
- data/lib/treat/config/databases/default.rb +0 -1
- data/lib/treat/config/databases/mongo.rb +0 -1
- data/lib/treat/config/languages/agnostic.rb +0 -34
- data/lib/treat/config/languages/english.rb +0 -60
- data/lib/treat/config/languages/french.rb +0 -18
- data/lib/treat/config/languages/german.rb +0 -18
- data/lib/treat/config/languages/italian.rb +0 -12
- data/lib/treat/config/languages/polish.rb +0 -12
- data/lib/treat/config/languages/spanish.rb +0 -12
- data/lib/treat/config/languages/swedish.rb +0 -12
- data/lib/treat/config/libraries/punkt.rb +0 -1
- data/lib/treat/config/libraries/reuters.rb +0 -1
- data/lib/treat/config/libraries/stanford.rb +0 -1
- data/lib/treat/config/linguistics/categories.rb +0 -4
- data/lib/treat/config/linguistics/punctuation.rb +0 -33
- data/lib/treat/config/tags/aligned.rb +0 -221
- data/lib/treat/config/tags/enju.rb +0 -71
- data/lib/treat/config/tags/paris7.rb +0 -17
- data/lib/treat/config/tags/ptb.rb +0 -15
- data/lib/treat/config/workers/list.rb +0 -1
- data/lib/treat/config.rb +0 -135
- data/lib/treat/core.rb +0 -5
- data/lib/treat/entities/abilities/copyable.rb +0 -47
- data/lib/treat/entities/abilities/debuggable.rb +0 -83
- data/lib/treat/entities/abilities/registrable.rb +0 -46
- data/lib/treat/entities/collection.rb +0 -40
- data/lib/treat/entities/document.rb +0 -10
- data/lib/treat/entities/group.rb +0 -18
- data/lib/treat/entities/section.rb +0 -13
- data/lib/treat/entities/token.rb +0 -47
- data/lib/treat/entities/zone.rb +0 -12
- data/lib/treat/entities.rb +0 -6
- data/lib/treat/helpers/didyoumean.rb +0 -57
- data/lib/treat/helpers/escaping.rb +0 -15
- data/lib/treat/helpers/formatting.rb +0 -41
- data/lib/treat/helpers/objtohash.rb +0 -8
- data/lib/treat/helpers/platform.rb +0 -15
- data/lib/treat/helpers/reflection.rb +0 -17
- data/lib/treat/helpers/temporary.rb +0 -27
- data/lib/treat/helpers/verbosity.rb +0 -19
- data/lib/treat/helpers.rb +0 -5
- data/lib/treat/loaders.rb +0 -10
- data/lib/treat/proxies.rb +0 -106
- data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
- data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
- data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
- data/spec/core.rb +0 -441
- data/spec/phrase.rb +0 -112
- data/spec/word.rb +0 -111
@@ -0,0 +1,101 @@
|
|
1
|
+
# Declares the entity class hierarchy. Every class below is an
# empty subclass (directly or indirectly) of Entity, except
# Number, which adds numeric conversions, and Unknown (see the
# note on that class).
module Treat::Entities

  # ---- Collection and document classes ----

  # A collection.
  class Collection < Entity
  end

  # A document.
  class Document < Entity
  end

  # ---- Sections and related classes ----

  # A section.
  class Section < Entity
  end

  # A page of text.
  class Page < Section
  end

  # A block of text.
  class Block < Section
  end

  # A list.
  class List < Section
  end

  # ---- Zones and related classes ----

  # A zone of text.
  class Zone < Entity
  end

  # A title, subtitle, or logical header of a text.
  class Title < Zone
  end

  # A paragraph (a group of sentences and/or phrases).
  class Paragraph < Zone
  end

  # ---- Groups and related classes ----

  # A group of tokens.
  class Group < Entity
  end

  # A group of words with a sentence ender (.!?).
  class Sentence < Group
  end

  # A group of words with no sentence ender.
  class Phrase < Group
  end

  # A non-linguistic fragment (e.g. stray symbols).
  class Fragment < Group
  end

  # ---- Tokens and related classes ----

  # A terminal element (leaf) in the text structure.
  class Token < Entity
  end

  # A word. Strictly, this is /^[[:alpha:]\-']+$/.
  class Word < Token
  end

  # An enclitic. Strictly, this is any of
  # 'll 'm 're 's 't or 've.
  class Enclitic < Token
  end

  # A number. Strictly, this is /^#?([0-9]+)(\.[0-9]+)?$/.
  class Number < Token
    # Integer value, obtained from the string form (to_s).
    def to_i
      to_s.to_i
    end

    # Float value, obtained from the string form (to_s).
    def to_f
      to_s.to_f
    end
  end

  # A punctuation sign. Strictly, this is /^[[:punct:]\$]+$/.
  class Punctuation < Token
  end

  # A character that is neither a word, an enclitic, a number
  # nor a punctuation character (e.g. @#$%&*).
  class Symbol < Token
  end

  # A url. This is (imperfectly) defined as
  # /^(http|https):\/\/[a-z0-9]
  # +([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}
  # (([0-9]{1,5})?\/.*)?$/ix
  class Url < Token
  end

  # A valid RFC822 address. This is (imperfectly) defined
  # as /.+\@.+\..+/ (fixme maybe?)
  class Email < Token
  end

  # A token whose type cannot be identified.
  # NOTE(review): unlike the other classes in this section,
  # Unknown is declared without a superclass, so it is not a
  # Token. Confirm whether that is intended.
  class Unknown
  end

end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
# Implement support for the functions #do and #do_task.
|
2
|
-
module Treat::Entities::
|
2
|
+
module Treat::Entities::Entity::Applicable
|
3
3
|
|
4
4
|
# Perform the supplied tasks on the entity.
|
5
|
-
def
|
5
|
+
def apply(*tasks)
|
6
6
|
tasks.each do |task|
|
7
7
|
|
8
8
|
if task.is_a?(Hash)
|
@@ -25,6 +25,8 @@ module Treat::Entities::Abilities::Doable
|
|
25
25
|
end
|
26
26
|
self
|
27
27
|
end
|
28
|
+
|
29
|
+
alias :do :apply
|
28
30
|
|
29
31
|
# Perform an individual task on an entity
|
30
32
|
# given a worker and options to pass to it.
|
@@ -33,7 +35,7 @@ module Treat::Entities::Abilities::Doable
|
|
33
35
|
entity_types = group.targets
|
34
36
|
f = nil
|
35
37
|
entity_types.each do |t|
|
36
|
-
f = true if is_a?(Treat::Entities.const_get(cc
|
38
|
+
f = true if is_a?(Treat::Entities.const_get(t.cc))
|
37
39
|
end
|
38
40
|
if f || entity_types.include?(:entity)
|
39
41
|
send(task, worker, options)
|
@@ -3,7 +3,7 @@
|
|
3
3
|
# a string or a numeric object. This class
|
4
4
|
# is pretty much self-explanatory.
|
5
5
|
# FIXME how can we make this language independent?
|
6
|
-
module Treat::Entities::
|
6
|
+
module Treat::Entities::Entity::Buildable
|
7
7
|
|
8
8
|
require 'schiphol'
|
9
9
|
require 'fileutils'
|
@@ -23,23 +23,40 @@ module Treat::Entities::Abilities::Buildable
|
|
23
23
|
# Build an entity from anything (can be
|
24
24
|
# a string, numeric, folder, or file name
|
25
25
|
# representing a raw or serialized file).
|
26
|
-
def build(
|
26
|
+
def build(*args)
|
27
|
+
|
28
|
+
# This probably needs some doc.
|
29
|
+
if args.size == 0
|
30
|
+
file_or_value = ''
|
31
|
+
elsif args[0].is_a?(Hash)
|
32
|
+
file_or_value = args[0]
|
33
|
+
elsif args.size == 1
|
34
|
+
if args[0].is_a?(Treat::Entities::Entity)
|
35
|
+
args[0] = [args[0]]
|
36
|
+
end
|
37
|
+
file_or_value = args[0]
|
38
|
+
else
|
39
|
+
file_or_value = args
|
40
|
+
end
|
27
41
|
|
28
42
|
fv = file_or_value.to_s
|
29
43
|
|
30
|
-
if
|
44
|
+
if fv == ''; self.new
|
45
|
+
elsif file_or_value.is_a?(Array)
|
46
|
+
from_array(file_or_value)
|
47
|
+
elsif file_or_value.is_a?(Hash)
|
31
48
|
from_db(file_or_value)
|
32
49
|
elsif self == Treat::Entities::Document ||
|
33
50
|
(fv.index('yml') || fv.index('yaml') ||
|
34
51
|
fv.index('xml') || fv.index('mongo'))
|
35
52
|
if fv =~ UriRegexp
|
36
|
-
from_url(fv
|
53
|
+
from_url(fv)
|
37
54
|
else
|
38
|
-
from_file(fv
|
55
|
+
from_file(fv)
|
39
56
|
end
|
40
57
|
elsif self == Treat::Entities::Collection
|
41
58
|
if FileTest.directory?(fv)
|
42
|
-
from_folder(fv
|
59
|
+
from_folder(fv)
|
43
60
|
else
|
44
61
|
create_collection(fv)
|
45
62
|
end
|
@@ -63,27 +80,35 @@ module Treat::Entities::Abilities::Buildable
|
|
63
80
|
# is user-created (i.e. by calling build
|
64
81
|
# instead of from_string directly).
|
65
82
|
# Build an entity from a string.
#
# string       - the text to build the entity from.
# enforce_type - when truthy, the result must be an instance
#                of the receiving class. It is also switched
#                on when caller_method reports :build, i.e.
#                when the user went through the build syntax.
#
# Returns the built entity. Raises a RuntimeError when the
# type is enforced and the detected type does not match.
def from_string(string, enforce_type = false)
  # caller_method is invoked directly from this method body
  # (not from a helper or block) so that the frame it
  # inspects is unchanged.
  user_called = caller_method == :build
  strict = enforce_type || user_called

  # Any concrete class can be instantiated right away when
  # its type is enforced; only the generic Entity class has
  # to go through type detection.
  return new(string) if strict && self != Treat::Entities::Entity

  built = anything_from_string(string)
  return built unless strict && !built.is_a?(self)

  raise "Asked to build a #{self.mn.downcase} "+
  "from \"#{string}\" and to enforce type, "+
  "but type detected was #{built.class.mn.downcase}."
end
|
97
|
+
|
98
|
+
# Build a document from an array
|
99
|
+
# of builders.
|
100
|
+
# Build an instance of the receiving class and append every
# element of the array to it. Elements that are not already
# entities are converted with #to_entity first.
#
# Returns the newly built instance.
def from_array(array)
  container = new
  array.each do |item|
    child = item.is_a?(Treat::Entities::Entity) ? item : item.to_entity
    container << child
  end
  container
end
|
84
109
|
|
85
110
|
# Build a document from a URL.
|
86
|
-
def from_url(url
|
111
|
+
def from_url(url)
|
87
112
|
unless self ==
|
88
113
|
Treat::Entities::Document
|
89
114
|
raise Treat::Exception,
|
@@ -91,16 +116,22 @@ module Treat::Entities::Abilities::Buildable
|
|
91
116
|
'else than a document from a url.'
|
92
117
|
end
|
93
118
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
119
|
+
begin
|
120
|
+
folder = Treat.paths.files
|
121
|
+
if folder[-1] == '/'
|
122
|
+
folder = folder[0..-2]
|
123
|
+
end
|
124
|
+
f = Schiphol.download(url,
|
125
|
+
download_folder: folder,
|
126
|
+
show_progress: !Treat.core.verbosity.silence,
|
127
|
+
rectify_extensions: true,
|
128
|
+
max_tries: 3)
|
129
|
+
rescue
|
130
|
+
raise Treat::Exception,
|
131
|
+
"Couldn't download file at #{url}."
|
132
|
+
end
|
102
133
|
|
103
|
-
e = from_file(f,
|
134
|
+
e = from_file(f,'html')
|
104
135
|
e.set :url, url.to_s
|
105
136
|
e
|
106
137
|
|
@@ -123,7 +154,7 @@ module Treat::Entities::Abilities::Buildable
|
|
123
154
|
|
124
155
|
# Build an entity from a folder with documents.
|
125
156
|
# Folders will be searched recursively.
|
126
|
-
def from_folder(folder
|
157
|
+
def from_folder(folder)
|
127
158
|
|
128
159
|
return if Reserved.include?(folder)
|
129
160
|
|
@@ -148,40 +179,48 @@ module Treat::Entities::Abilities::Buildable
|
|
148
179
|
|
149
180
|
c = Treat::Entities::Collection.new(folder)
|
150
181
|
folder += '/' unless folder[-1] == '/'
|
151
|
-
|
182
|
+
|
183
|
+
if !FileTest.directory?(folder)
|
184
|
+
FileUtils.mkdir(folder)
|
185
|
+
end
|
186
|
+
|
187
|
+
c.set :folder, folder
|
188
|
+
i = folder + '/.index'
|
189
|
+
c.set :index, i if FileTest.directory?(i)
|
190
|
+
|
152
191
|
Dir[folder + '*'].each do |f|
|
153
192
|
if FileTest.directory?(f)
|
154
193
|
c2 = Treat::Entities::Collection.
|
155
|
-
from_folder(f
|
194
|
+
from_folder(f)
|
156
195
|
c.<<(c2, false) if c2
|
157
196
|
else
|
158
197
|
c.<<(Treat::Entities::Document.
|
159
|
-
from_file(f
|
198
|
+
from_file(f), false)
|
160
199
|
end
|
161
200
|
end
|
162
|
-
|
201
|
+
|
202
|
+
return c
|
163
203
|
|
164
204
|
end
|
165
205
|
|
166
206
|
# Build a document from a raw or serialized file.
|
167
|
-
def from_file(file,
|
207
|
+
# Build a document from a raw or serialized file.
#
# file    - path of the file to load.
# def_fmt - fallback format handed to the reader's format
#           detection (defaults to nil).
#
# A path containing 'yml', 'yaml', 'xml' or 'mongo' is treated
# as serialized. The match is a plain substring search over the
# whole path, not an extension check. Anything else is read as
# a raw file in the detected format.
def from_file(file, def_fmt = nil)
  serialized = %w[yml yaml xml mongo].any? { |marker| file.index(marker) }
  return from_serialized_file(file) if serialized

  detected = Treat::Workers::Formatters::
  Readers::Autoselect.detect_format(file, def_fmt)
  from_raw_file(file, detected)
end
|
182
221
|
|
183
222
|
# Build a document from a raw file.
|
184
|
-
def from_raw_file(file,
|
223
|
+
def from_raw_file(file, def_fmt='txt')
|
185
224
|
|
186
225
|
unless self ==
|
187
226
|
Treat::Entities::Document
|
@@ -195,31 +234,36 @@ module Treat::Entities::Abilities::Buildable
|
|
195
234
|
"Path '#{file}' does not "+
|
196
235
|
"point to a readable file."
|
197
236
|
end
|
198
|
-
|
199
|
-
d = Treat::Entities::Document.new
|
200
|
-
|
237
|
+
options = {default_format: def_fmt}
|
238
|
+
d = Treat::Entities::Document.new
|
239
|
+
d.set :file, file
|
201
240
|
d.read(:autoselect, options)
|
202
241
|
|
203
242
|
end
|
204
243
|
|
205
244
|
# Build an entity from a serialized file.
|
206
|
-
def from_serialized_file(file
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
245
|
+
# Build an entity from a serialized file.
#
# file - path of a YAML or XML serialization. The format is
#        chosen by substring match on the path ('yml' or
#        'yaml' for YAML, 'xml' for XML).
#
# Returns the first child of the unserialized document, after
# marking it as root.
# Raises Treat::Exception if the file is not readable or the
# format cannot be determined from the path.
def from_serialized_file(file)

  unless File.readable?(file)
    raise Treat::Exception,
    "Path '#{file}' does not "+
    "point to a readable file."
  end
  doc = Treat::Entities::Document.new
  doc.set :file, file
  format = nil
  if file.index('yml') ||
    file.index('yaml')
    format = :yaml
  elsif file.index('xml')
    # This branch used to assign to an unused local (`f`),
    # leaving format nil for every XML file.
    format = :xml
  else
    raise Treat::Exception,
    "Unreadable serialized format for #{file}."
  end
  doc.unserialize(format)
  doc.children[0].set_as_root!   # Fix this
  doc.children[0]

end
|
225
269
|
|
@@ -238,15 +282,28 @@ module Treat::Entities::Abilities::Buildable
|
|
238
282
|
|
239
283
|
# Build any kind of entity from a string.
|
240
284
|
def anything_from_string(string)
|
285
|
+
case self.mn.downcase.intern
|
286
|
+
when :document
|
287
|
+
folder = Treat.paths.files
|
288
|
+
if folder[-1] == '/'
|
289
|
+
folder = folder[0..-2]
|
290
|
+
end
|
291
|
+
|
292
|
+
now = Time.now.to_f
|
293
|
+
doc_file = folder+ "/#{now}.txt"
|
294
|
+
string.force_encoding('UTF-8')
|
295
|
+
File.open(doc_file, 'w') do |f|
|
296
|
+
f.puts string
|
297
|
+
end
|
241
298
|
|
242
|
-
|
243
|
-
when :
|
299
|
+
from_raw_file(doc_file)
|
300
|
+
when :collection
|
244
301
|
raise Treat::Exception,
|
245
|
-
"Cannot create a
|
302
|
+
"Cannot create a " +
|
246
303
|
"collection from a string " +
|
247
304
|
"(need a readable file/folder)."
|
248
305
|
when :phrase
|
249
|
-
|
306
|
+
group_from_string(string)
|
250
307
|
when :token
|
251
308
|
token_from_string(string)
|
252
309
|
when :zone
|
@@ -258,7 +315,7 @@ module Treat::Entities::Abilities::Buildable
|
|
258
315
|
if string.gsub(/[\.\!\?]+/,
|
259
316
|
'.').count('.') <= 1 &&
|
260
317
|
string.count("\n") == 0
|
261
|
-
|
318
|
+
group_from_string(string)
|
262
319
|
else
|
263
320
|
zone_from_string(string)
|
264
321
|
end
|
@@ -269,15 +326,14 @@ module Treat::Entities::Abilities::Buildable
|
|
269
326
|
|
270
327
|
end
|
271
328
|
|
329
|
+
# This should be improved on.
|
272
330
|
# Return a UTF-8 transcoding of the string in which characters
# undefined in the target encoding are replaced. The receiver
# is not modified; callers must use the return value.
def check_encoding(string)
  replace_undefined = { undef: :replace }
  string.encode("UTF-8", **replace_undefined) # Fix
end
|
275
333
|
|
276
334
|
# Build a phrase from a string.
|
277
|
-
def
|
278
|
-
|
335
|
+
def group_from_string(string)
|
279
336
|
check_encoding(string)
|
280
|
-
|
281
337
|
if !(string =~ /[a-zA-Z]+/)
|
282
338
|
Treat::Entities::Fragment.new(string)
|
283
339
|
elsif string.count('.!?') >= 1
|
@@ -285,7 +341,6 @@ module Treat::Entities::Abilities::Buildable
|
|
285
341
|
else
|
286
342
|
Treat::Entities::Phrase.new(string)
|
287
343
|
end
|
288
|
-
|
289
344
|
end
|
290
345
|
|
291
346
|
# Build the right type of token
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# This module implements methods that are used
|
2
2
|
# by workers to determine if an entity is properly
|
3
3
|
# formatted before working on it.
|
4
|
-
module Treat::Entities::
|
4
|
+
module Treat::Entities::Entity::Checkable
|
5
5
|
|
6
6
|
# Check if the entity has the given feature,
|
7
7
|
# and if so return it. If not, calculate the
|
@@ -15,7 +15,7 @@ module Treat::Entities::Abilities::Checkable
|
|
15
15
|
g2 = Treat::Workers.lookup(feature)
|
16
16
|
|
17
17
|
raise Treat::Exception,
|
18
|
-
"#{g1.type.to_s.capitalize}
|
18
|
+
"#{g1.type.to_s.capitalize} " +
|
19
19
|
"requires #{g2.type} #{g2.method}."
|
20
20
|
end
|
21
21
|
|
@@ -1,21 +1,21 @@
|
|
1
|
-
|
1
|
+
# Allow comparison of entity hierarchy in DOM.
|
2
|
+
module Treat::Entities::Entity::Comparable

  # Compare the receiving class with klass according to the
  # order listed in Treat.core.entities.order.
  #
  # Each class is ranked by the position of the entries it is
  # equal to or descends from. Returns -1, 0 or 1 when the
  # receiver ranks lower than, the same as, or higher than klass.
  #
  # NOTE(review): once both ranks are known the position
  # counter stops advancing, but iteration continues and later
  # matches can still overwrite a rank with the frozen
  # position. Confirm whether stopping the loop was intended.
  def compare_with(klass)
    position = 0
    own_rank = other_rank = nil
    Treat.core.entities.order.each do |type|
      candidate = Treat::Entities.const_get(type.cc)
      own_rank = position if self <= candidate
      other_rank = position if klass <= candidate
      position += 1 unless own_rank && other_rank
    end
    if own_rank < other_rank
      -1
    elsif own_rank == other_rank
      0
    elsif own_rank > other_rank
      1
    end
  end

end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module Treat::Entities::
|
1
|
+
module Treat::Entities::Entity::Countable
|
2
2
|
|
3
3
|
# Find the position of the current entity
|
4
4
|
# inside the parent entity, starting at 1.
|
@@ -41,6 +41,7 @@ module Treat::Entities::Abilities::Countable
|
|
41
41
|
# Returns the frequency of the given value
|
42
42
|
# in this entity.
|
43
43
|
def frequency_of(value)
|
44
|
+
value = value.downcase
|
44
45
|
if is_a?(Treat::Entities::Token)
|
45
46
|
raise Treat::Exception,
|
46
47
|
"Cannot get the frequency " +
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# When Treat.debug is set to true, each call to
|
2
|
+
# #call_worker will result in a debug message being
|
3
|
+
# printed by the #print_debug function.
|
4
|
+
module Treat::Entities::Entity::Debuggable

  # Last message built (not yet printed) and a counter of
  # consecutive repetitions of that message.
  @@prev, @@i = nil, 0

  # Build a human-readable sentence describing the task just
  # run and print the *previous* sentence once the task
  # changes, so that repeated tasks collapse into one line.
  # Fixme: because output lags by one call, the last call
  # never gets shown.
  #
  # entity  - unused here.
  # task    - task name; underscores become spaces.
  # worker  - worker name, appended to the sentence.
  # group   - worker group; its targets, type and
  #           preset_option shape the sentence.
  # options - options hash; read only for the preset option.
  def print_debug(entity, task, worker, group, options)
    names = group.targets.map { |target| target.to_s }

    # Either the single target, or "a, b and/or c". The actual
    # target types processed are not inspected at runtime.
    listed =
      if names.size == 1
        names[0]
      else
        names[0..-2].join(', ') + ' and/or ' + names[-1]
      end

    # Possessive used for annotations (singular/plural).
    possessive = names.size > 1 ? 'their' : 'its'

    readable = task.to_s.gsub('_', ' ')
    summary = ''

    case group.type
    when :transformer, :computer
      # "{task}ed {targets}", avoiding a doubled "e" or "d".
      stem = readable[-1] == 'e' ? readable[0..-2] : readable
      suffix = stem[-1] == 'd' ? '' : 'ed'
      summary = "#{stem.capitalize}#{suffix} #{listed}"
    when :annotator
      # "Annotated {targets} with its/their {task}".
      if group.preset_option
        preset = options[group.preset_option].to_s.gsub('_', ' ')
        # Drop the last character of the task name, then
        # prefix it with the preset's name.
        readable[-1] = ''
        readable = preset + ' ' + readable
      end
      summary = "Annotated #{listed} with " +
      "#{possessive} #{readable}"
    end

    # Formatters read "... in format X"; all others "... using X".
    curr =
      if group.to_s.index('Formatters')
        summary + ' in format ' + worker.to_s
      else
        summary + ' using ' + worker.to_s.gsub('_', ' ')
      end

    # Remove any double pluralization that may happen.
    curr.gsub!('ss', 's') unless curr.index('class')

    # Accumulate repeated tasks.
    @@i += 1 if curr == @@prev

    # The task changed: flush the previous message.
    if @@prev && curr != @@prev
      if @@i > 1
        # Pluralize entity names and the possessive, then
        # insert the repetition count as the second word.
        Treat.core.entities.list.each do |kind|
          singular = kind.to_s
          @@prev.gsub!(singular, singular + 's')
        end
        @@prev.gsub!('its', 'their')
        words = @@prev.split(' ')
        @@prev = words.insert(1, @@i.to_s).join(' ')
      else
        # Singular: insert a determiner as the second word.
        @@prev = @@prev.split(' ').insert(1, 'a').join(' ')
      end
      @@i = 0
      puts @@prev + '.'
    end

    @@prev = curr

  end

end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Makes a class delegatable, allowing calls
|
2
2
|
# on it to be forwarded to a worker class
|
3
3
|
# able to perform the appropriate task.
|
4
|
-
module Treat::Entities::
|
4
|
+
module Treat::Entities::Entity::Delegatable
|
5
5
|
|
6
6
|
# Add preset methods to an entity class.
|
7
7
|
def add_presets(group)
|
@@ -10,27 +10,25 @@ module Treat::Entities::Abilities::Delegatable
|
|
10
10
|
return unless opt
|
11
11
|
|
12
12
|
self.class_eval do
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
13
|
+
group.presets.each do |preset|
|
14
|
+
define_method(preset) do |worker=nil, options={}|
|
15
|
+
return get(preset) if has?(preset)
|
16
|
+
options = {opt => preset}.merge(options)
|
17
|
+
m = group.method
|
18
|
+
send(m, worker, options)
|
19
|
+
f = unset(m)
|
20
|
+
features[preset] = f if f
|
21
|
+
end
|
21
22
|
end
|
22
23
|
end
|
23
|
-
end
|
24
24
|
|
25
25
|
end
|
26
26
|
|
27
27
|
# Add the workers to perform a task on an entity class.
|
28
28
|
def add_workers(group)
|
29
29
|
self.class_eval do
|
30
|
-
|
31
30
|
task = group.method
|
32
31
|
add_presets(group)
|
33
|
-
|
34
32
|
define_method(task) do |worker=nil, options={}|
|
35
33
|
if worker.is_a?(Hash)
|
36
34
|
options, worker =
|
@@ -64,7 +62,7 @@ module Treat::Entities::Abilities::Delegatable
|
|
64
62
|
worker_not_found(worker, group)
|
65
63
|
end
|
66
64
|
|
67
|
-
worker = group.const_get(
|
65
|
+
worker = group.const_get(worker.to_s.cc.intern)
|
68
66
|
result = worker.send(group.method, entity, options)
|
69
67
|
|
70
68
|
if group.type == :annotator && result
|
@@ -90,40 +88,32 @@ module Treat::Entities::Abilities::Delegatable
|
|
90
88
|
# Get the default worker for that language
|
91
89
|
# inside the given group.
|
92
90
|
# Get the default worker for the given language inside the
# given group.
#
# The language's own worker configuration is used when it
# lists the group under its category; otherwise the
# language-agnostic configuration is used.
#
# Returns the first worker listed for the group.
# Raises Treat::Exception when no configuration is loaded for
# the language, or when neither configuration lists the group.
def find_worker_for_language(language, group)
  config = Treat.languages[language]
  # The category is the third segment of the group's name.
  category = group.to_s.split('::')[2].downcase.intern
  group_key = group.mn.ucc.intern

  if config.nil?
    raise Treat::Exception,
    "No configuration file loaded for language #{language}."
  end

  lists_group = lambda do |set|
    set.respond_to?(category) &&
    set[category].respond_to?(group_key)
  end

  pool = config.workers
  pool = Treat.languages.agnostic.workers unless lists_group.call(pool)

  unless lists_group.call(pool)
    raise Treat::Exception,
    "No #{group_key} is/are available for the " +
    "#{language.to_s.capitalize} language."
  end

  pool[category][group_key].first
end
|
121
111
|
|
122
112
|
# Return an error message and suggest possible typos.
|
123
|
-
def worker_not_found(
|
124
|
-
"
|
125
|
-
"found in group #{group}." +
|
126
|
-
group.list.map { |c| ucc
|
113
|
+
# Build (but do not raise) the error message for a worker name
# that does not exist in the group, followed by the suggestion
# text returned by Treat::Helpers::Help.did_you_mean? for the
# group's worker names.
def worker_not_found(worker, group)
  candidates = group.list.map(&:ucc)
  suggestion = Treat::Helpers::Help.did_you_mean?(candidates, worker)
  "Worker with name '#{worker}' couldn't be "+
  "found in group #{group}." + suggestion
end
|
128
118
|
|
129
119
|
end
|