treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,30 @@
|
|
1
|
+
class Treat::Loaders

  # A helper class that looks up, and caches, the
  # language modules registered with the Linguistics
  # gem (the constants defined under ::Linguistics).
  class Linguistics

    silence_warnings { require 'linguistics' }

    # Language modules that were already resolved,
    # keyed by the value that was passed to load.
    @@languages = {}

    # Return the Linguistics module for the supplied
    # language. The module is looked up under the
    # upper-cased name of the language and cached,
    # so later calls return the stored module.
    #
    # Raises a RuntimeError with an explanatory message
    # when Linguistics has no module for the language.
    def self.load(language)
      cached = @@languages[language]
      return cached if cached

      begin
        l = language.to_s.upcase
        silence_warnings do
          @@languages[language] =
          ::Linguistics.const_get(l)
        end
      # const_get signals a missing (or malformed)
      # constant with a NameError, which is not a
      # RuntimeError; rescue both so the message
      # below is actually reported to the caller.
      rescue NameError, RuntimeError
        raise "Ruby Linguistics does " +
        "not have a module installed " +
        "for the #{language} language."
      end

      # Return the module explicitly rather than
      # relying on what silence_warnings returns.
      @@languages[language]
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
class Treat::Loaders

  # A helper class that configures and initializes
  # the StanfordCoreNLP bindings. All of the setup
  # below runs when this class body is evaluated.
  class Stanford

    require 'stanford-core-nlp'

    # Look for the JAR files and the models in the
    # 'stanford/' folder of Treat.bin and Treat.models.
    jar_folder = Treat.bin + 'stanford/'
    StanfordCoreNLP.jar_path = jar_folder

    model_folder = Treat.models + 'stanford/'
    StanfordCoreNLP.model_path = model_folder

    # Select the language from Treat's default
    # language, as described by Treat::Languages.
    language = Treat::Languages.describe(
    Treat.default_language)
    StanfordCoreNLP.use(language)

    # Send the log to the null device when Treat
    # is configured to be silent.
    if Treat.silence
      StanfordCoreNLP.log_file = NULL_DEVICE
    end

    StanfordCoreNLP.init

    # Flag recording that the setup above has run.
    @@loaded = true

  end

end
|
data/lib/treat/object.rb
CHANGED
data/lib/treat/processors.rb
CHANGED
@@ -1,45 +1,38 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
# - Segmenters : split a text or zone into sentence objects.
|
11
|
-
# - Tokenizers : split a sentence into Token objects.
|
12
|
-
# - Parsers: split a sentence into a tree of phrases
|
13
|
-
# containing other phrases and Token objects, representing
|
14
|
-
# the syntactic structure.
|
15
|
-
module Processors
|
16
|
-
# Chunkers split a text into zones.
|
17
|
-
module Chunkers
|
18
|
-
extend Group
|
19
|
-
self.type = :transformer
|
20
|
-
self.targets = [:document, :section]
|
21
|
-
end
|
22
|
-
# Segmenters split a text or zone into sentences.
|
23
|
-
module Segmenters
|
24
|
-
extend Group
|
25
|
-
self.type = :transformer
|
26
|
-
self.targets = [:document, :zone]
|
27
|
-
end
|
28
|
-
# Tokenizers splits a sentence into Token objects.
|
29
|
-
module Tokenizers
|
30
|
-
extend Group
|
31
|
-
self.type = :transformer
|
32
|
-
self.targets = [:document, :zone, :phrase]
|
33
|
-
end
|
34
|
-
# Parsers split a sentence into phrase objects
|
35
|
-
# representing its syntactic structure, with the
|
36
|
-
# Token objects as children of the phrases.
|
37
|
-
module Parsers
|
38
|
-
extend Group
|
39
|
-
self.type = :transformer
|
40
|
-
self.targets = [:document, :zone, :phrase]
|
41
|
-
end
|
42
|
-
# Makes all the groups autoloadable and creates the workers.
|
43
|
-
extend Treat::Category
|
1
|
+
# Processors build the trees that represent textual
# entities. Every group declared below is a set of
# transformers made available through Treat::Groupable.
module Treat::Processors

  # Chunkers split a document into sections and
  # zones. The default chunker is :autoselect.
  module Chunkers
    extend Treat::Groupable
    self.type = :transformer
    self.targets = [:document]
    self.default = :autoselect
  end

  # Segmenters split a zone into sentences.
  module Segmenters
    extend Treat::Groupable
    self.type = :transformer
    self.targets = [:zone]
  end

  # Tokenizers split a phrase (such as a
  # sentence) into Token objects.
  module Tokenizers
    extend Treat::Groupable
    self.type = :transformer
    self.targets = [:phrase]
  end

  # Parsers split a phrase (such as a sentence)
  # into phrase objects representing its syntactic
  # structure, with the Token objects as children
  # of the phrases.
  module Parsers
    extend Treat::Groupable
    self.type = :transformer
    self.targets = [:phrase]
  end

  # Make Processors categorizable; this comes
  # after the declaration of the groups above.
  extend Treat::Categorizable

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Delegates chunking to the chunker that corresponds
# to the format of the entity, with the plain text
# chunker as the fallback.
class Treat::Processors::Chunkers::Autoselect

  # Chunk the entity with the chunker whose constant
  # name is derived from entity.format through the cc
  # helper (defined elsewhere). The entity must have
  # a :format feature.
  #
  # When resolving or running that chunker raises a
  # Treat::Exception, the TXT chunker is used instead.
  def self.chunk(entity, options = {})
    entity.check_has(:format)
    chunkers = Treat::Processors::Chunkers
    begin
      worker = chunkers.const_get(cc(entity.format))
      worker.chunk(entity, options)
    rescue Treat::Exception
      chunkers::TXT.chunk(entity, options)
    end
  end

end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Chunks an HTML document into sections, titles,
# zones and lists by walking its markup.
class Treat::Processors::Chunkers::HTML

  require 'nokogiri'

  # Parse the value of the entity as HTML and add
  # the resulting structure to the entity, which
  # must not have any children yet.
  #
  # Options: none.
  def self.chunk(entity, options = {})
    entity.check_hasnt_children
    recurse(entity, Nokogiri::HTML(entity.value))
  end

  # Walk the children of html_node, adding entities
  # to node; level is the current heading level.
  #
  # - A heading tag (h followed by one digit) opens
  #   a new section at that level and becomes its
  #   title. A 'p' shorter than 45 characters whose
  #   node has a section as parent is treated as a
  #   title as well, without opening a section.
  # - Any other 'p' becomes a zone.
  # - 'ul' and 'ol' open a list; 'li' adds a zone.
  #
  # Text nodes are skipped; every other child that
  # has children of its own is visited recursively.
  def self.recurse(node, html_node, level = 1)
    html_node.children.each do |child|
      tag = child.name
      next if tag == 'text'

      txt = child.inner_text
      heading = /^h([0-9]{1})$/.match(tag)
      short_paragraph = heading.nil? &&
        tag == 'p' && txt.length < 45 &&
        node.parent && node.parent.type == :section

      if heading || short_paragraph
        if heading
          lvl = heading[1].to_i
          if lvl <= level
            # Climb back to the ancestor section that
            # sits one level above the new heading.
            node.ancestors_with_type(:section).
            each do |section|
              section_level = section.has?(:level) ?
                section.level : 1
              node = section if section_level == lvl - 1
            end
          end
          # A new section is opened whether the heading
          # is deeper or shallower than the current one.
          node = node << Treat::Entities::Section.new
          level = lvl
          node.set :level, level
        end
        title = node << Treat::Entities::Title.new(txt)
        title.set :level, level
      elsif tag == 'p'
        node << Treat::Entities::Zone.from_string(txt)
      elsif ['ul', 'ol'].include?(tag)
        node = node << Treat::Entities::List.new
      elsif tag == 'li'
        item = Treat::Entities::Entity.
        zone_from_string(txt)
        node << item
      end

      recurse(node, child, level) if child.children.size > 0
    end
  end

end
|
@@ -1,27 +1,21 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
if zone.length < 60
|
18
|
-
text << Treat::Entities::Title.new(zone)
|
19
|
-
else
|
20
|
-
text << Treat::Entities::Paragraph.new(zone)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
1
|
+
class Treat::Processors::Chunkers::TXT

  # Split the string value of the entity on newlines
  # and add one zone per line to the entity. Each line
  # is stripped of surrounding whitespace, and lines
  # that are empty after stripping are skipped. The
  # entity must not have any children yet.
  #
  # Options: none.
  def self.chunk(entity, options = {})
    entity.check_hasnt_children
    entity.to_s.split("\n").each do |line|
      line.strip!
      next if line.empty?
      entity << Treat::Entities::Zone.from_string(line)
    end
  end

end
|
@@ -1,218 +1,263 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
1
|
+
# This class is a wrapper for the Enju syntactic
|
2
|
+
# parser for English. Given an entity's string value,
|
3
|
+
# the parser formats it, runs it through Enju, and
|
4
|
+
# parses the XML output by Enju using the Nokogiri
|
5
|
+
# XML reader. It creates wrappers for the sentences,
|
6
|
+
# syntactical phrases and tokens that Enju identified.
|
7
|
+
#
|
8
|
+
# Original paper:
|
9
|
+
#
|
10
|
+
# Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
|
11
|
+
# 2007. Efficient HPSG Parsing with Supertagging and
|
12
|
+
# CFG-filtering. In Proceedings of IJCAI 2007.
|
13
|
+
module Treat::Processors::Parsers::Enju
|
14
|
+
|
15
|
+
# Require the 'open3' library to connect
|
16
|
+
# with the background Enju process.
|
17
|
+
require 'open3'
|
18
|
+
|
19
|
+
# Require the Nokogiri XML parser.
|
20
|
+
require 'nokogiri'
|
21
|
+
|
22
|
+
# Create only one process and hold on to it.
|
23
|
+
@@parser = nil
|
24
|
+
|
25
|
+
# A hash of Enju cat tags mapped to word categories.
|
26
|
+
Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
|
27
|
+
|
28
|
+
# A hash of Enju cat/xcat pairs mapped to PTB tags.
|
29
|
+
Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
|
30
|
+
|
31
|
+
# Parse the entity into its syntactical phrases
# using Enju: the text of the entity is written to
# the Enju process, one line of output is read back
# and built into a tree, and the children of that
# tree replace the content of the entity. Heads and
# dependencies are linked afterwards.
#
# The entity must not have children. When the output
# can't be built into a tree, a warning is printed
# and nil is returned.
#
# Options: none.
def self.parse(entity, options = {})
  entity.check_hasnt_children
  val = entity.to_s

  # Start from empty lookup tables for this parse.
  @@id_table = {}
  @@dependencies_table = {}

  stdin, stdout = proc
  text, remove_last = valid_text(val)
  stdin.puts(text + "\n")

  parsed = build(stdout.gets, remove_last)

  unless parsed
    warn "Warning - Enju couldn't " +
    "parse the text '#{entity.short_value}'."
    return
  end

  entity.remove_all!
  parsed.children.each { |child| entity << child }

  # Remove the period we added at the end.
  entity.remove!(entity.punctuations[-1]) if remove_last

  link_heads(entity)
  add_dependencies(entity)
end
|
68
|
+
|
69
|
+
# Return the process running Enju, that is the
# array of handles returned by Open3.popen3. The
# process is started on the first call and the
# same handles are returned on later calls.
#
# Raises Treat::Exception when Enju can't be started.
def self.proc
  begin
    # Start Enju only once: spawning a process on
    # every call would leave each earlier process,
    # and its pipes, open behind it.
    @@parser ||= ::Open3.popen3("enju -xml -i")
  # Interrupts and exit requests are left to
  # propagate instead of being reported as a
  # failure to initialize Enju.
  rescue StandardError, NotImplementedError => e
    raise Treat::Exception,
    "Couldn't initialize Enju: #{e.message}."
  end
  @@parser
end
|
79
|
+
|
80
|
+
# Parses the XML output of Enju using the Nokogiri
# XML reader and converts that structure into a tree
# of wrappers for textual entities. Ids and
# dependencies found along the way are recorded in
# @@id_table and @@dependencies_table.
#
# Returns the value held in the entity variable once
# the whole input has been read. The remove_last
# argument is accepted but not used here.
def self.build(xml, remove_last = false)
  reader = Nokogiri::XML::Reader.from_memory(xml)
  entity = nil
  previous_depth = 0

  # Read the XML node by node.
  while reader.read
    depth = reader.depth

    # Moving to a shallower depth means the children
    # of the current entity are done: go back up.
    entity = entity.parent if previous_depth > depth

    # Nothing to create for an end element; only
    # remember its depth for the next iteration.
    if reader.node_type ==
      Nokogiri::XML::Reader::TYPE_END_ELEMENT
      previous_depth = depth
      next
    end

    # Get and format attributes and dependencies.
    attributes = reader.attributes
    id = attributes.delete('id')
    new_attr, dependencies =
      if attributes.empty?
        [{}, {}]
      else
        cleanup_attributes(reader.name, attributes)
      end

    # Create the appropriate entity for the node.
    case reader.name
    when 'sentence', 'cons'
      if reader.name == 'sentence'
        entity = Treat::Entities::Sentence.new('')
      else
        entity = entity <<
        Treat::Entities::Phrase.new('')
      end
      @@id_table[id] = entity.id
      @@dependencies_table[entity.id] = dependencies
      entity.features = new_attr
    when 'tok'
      # Keep these for the token that is created
      # from the text found inside the element.
      tok_attributes = new_attr
      tok_dependencies = dependencies
    else
      token_text = reader.value.gsub(/\s+/, "")
      unless token_text.empty?
        entity = entity <<
        Treat::Entities::Token.from_string(token_text)
        if entity.is_a?(Treat::Entities::Word)
          entity.features = tok_attributes
          # NOTE(review): id was read from the current
          # node, which in this branch is not the 'tok'
          # element; it looks like it may be nil here,
          # confirm whether the tok id was intended.
          @@id_table[id] = entity.id
          @@dependencies_table[entity.id] = tok_dependencies
        else
          # Do something useful here
          entity.set :tag, 'SYM'
        end
      end
    end

    previous_depth = depth
  end

  entity
end
|
150
|
+
|
151
|
+
# Validate a text - Enju wants a period to parse a
# sentence. Returns the text to send to Enju along
# with a flag telling whether a period was added.
#
# - A text without any period gets one appended,
#   and the flag is true.
# - Otherwise every period is removed and a single
#   one is appended, unless the remaining text ends
#   with '!' or '?'; the flag is false.
def self.valid_text(val)
  remove_last = !val.include?('.')

  if remove_last
    text = val + '.'
  else
    text = val.gsub('.', '')
    text += '.' unless text.end_with?('!', '?')
  end

  [text, remove_last]
end
|
163
|
+
|
164
|
+
# Link the head and sem_head to their entities:
# for each phrase of the entity that has a :head
# or :sem_head feature, the id stored in the
# feature is resolved through @@id_table, the
# phrase is linked to it ('head' or 'sem_head')
# and the feature is removed from the phrase.
def self.link_heads(entity)
  entity.each_phrase do |phrase|
    [:head, :sem_head].each do |feature|
      next unless phrase.has?(feature)
      target = @@id_table[phrase.public_send(feature)]
      phrase.link(target, feature.to_s, true, -1)
      phrase.unset(feature)
    end
  end
end
|
181
|
+
|
182
|
+
# Add dependencies a posteriori to a parsed entity.
#
# Each word and phrase of entity2 is linked to the
# arguments recorded for it in @@dependencies_table
# (keyed by entity id); the ids of the arguments are
# resolved through @@id_table and the recorded
# predicate, as a symbol, is the type of the link.
def self.add_dependencies(entity2)

  entity2.each_entity(:word, :phrase) do |entity|
    # Look up the entry of this entity only. Walking
    # the whole table for every word and phrase
    # would register each link once per entity.
    dependencies = @@dependencies_table[entity.id]
    next if dependencies.nil?
    dependencies.each_pair do |argument, type|
      # Skip this argument if we
      # don't know the target node.
      next if argument == 'unk'
      entity.link(
        @@id_table[argument],
        type.intern
      )
    end
  end

end
|
203
|
+
|
204
|
+
# Helper function to convert Enju attributes to
# Treat attributes. Returns two hashes: the features
# for the entity and its dependencies.
#
# - 'pred' is removed from the supplied attributes
#   and used as the value of each dependency, keyed
#   by the value of 'arg1' and 'arg2'.
# - 'cat' is kept as :cat. For a 'tok', a trailing
#   'P' or 'X' (except in 'PN') is recorded as
#   :saturated and dropped before the category is
#   looked up in Ectc. For other elements, the tag
#   is taken from the first Ecxtp entry matching the
#   cat and xcat, or 'FW' when there is none.
# - Other attributes are copied under their
#   stripped name, as a symbol.
# - 'pos' is renamed to :tag (with :tag_set set to
#   :penn) and 'base' is renamed to :lemma.
def self.cleanup_attributes(name, attributes)

  new_attr = {}
  dependencies = {}
  pred = attributes.delete('pred')

  attributes.each_pair do |raw_name, value|

    case attribute = raw_name.strip
    when 'arg1', 'arg2'
      dependencies[value] = pred
    when 'cat'
      new_attr[:cat] = value
      if name == 'tok'
        if value.length > 1 && value != 'PN' &&
          ['P', 'X'].include?(value[-1])
          new_attr[:saturated] = (value[-1] == 'P')
          value = value[0..-2]
        end
        new_attr[:category] = Ectc[value]
      else
        tags = Ecxtp.select do |m|
          m[0] == value && m[1] == attributes['xcat']
        end
        new_attr[:tag] = tags.empty? ? 'FW' : tags[0][2]
      end
    else
      new_attr[attribute.to_sym] = value
    end

  end

  # Handle naming conventions.
  if attributes.has_key?('pos')
    new_attr[:tag] = new_attr.delete(:pos)
    new_attr[:tag_set] = :penn
  end

  if attributes.has_key?('base')
    new_attr[:lemma] = new_attr.delete(:base)
  end

  [new_attr, dependencies]

end
|
218
|
-
|
262
|
+
|
263
|
+
end
|