treat 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -2,62 +2,54 @@ module Treat
|
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
4
|
class Dot
|
5
|
-
|
6
|
-
BorderColors = {
|
7
|
-
:verb => "#00AABB",
|
8
|
-
:noun => "#FAD4A7",
|
9
|
-
:adverb => '#103585',
|
10
|
-
:adjective => '#D21D54'
|
11
|
-
}
|
5
|
+
DefaultOptions = {colors: {}, :features => :all}
|
12
6
|
# Create the top-most graph structure
|
13
7
|
# and delegate the creation of the graph
|
14
8
|
# nodes to to_dot.
|
15
9
|
def self.visualize(entity, options = {})
|
10
|
+
options = DefaultOptions.merge(options)
|
16
11
|
string = "graph {"
|
17
|
-
string << self.to_dot(entity)
|
12
|
+
string << self.to_dot(entity, options)
|
18
13
|
string << "\n}"
|
19
14
|
end
|
20
15
|
# dot -Tpdf test4.dot > test4.pdf
|
21
|
-
def self.to_dot(entity)
|
16
|
+
def self.to_dot(entity, options)
|
17
|
+
# Id
|
22
18
|
string = ''
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
label = "label=\"#{entity.value.inspect[1..-2]}\","
|
29
|
-
end
|
19
|
+
label = ''
|
20
|
+
string = "\n#{entity.id} ["
|
21
|
+
# Value
|
22
|
+
if entity.is_a?(Treat::Entities::Token)
|
23
|
+
label = entity.to_s
|
30
24
|
else
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
else
|
35
|
-
label = "label=\"#{cc(cl(entity.class))}\","
|
25
|
+
label = entity.type.to_s.capitalize + " "
|
26
|
+
if entity.is_leaf?
|
27
|
+
label = entity.short_value.gsub(' [...]', " [...] \\n")
|
36
28
|
end
|
37
29
|
end
|
38
|
-
|
30
|
+
# Features
|
39
31
|
if entity.has_features?
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
32
|
+
unless options[:features] == :none
|
33
|
+
label << "\\n"
|
34
|
+
entity.features.each do |feature, value|
|
35
|
+
if options[:features] == :all ||
|
36
|
+
options[:features].include?(feature)
|
37
|
+
if value.is_a?(Treat::Entities::Entity)
|
38
|
+
label << "\\n#{feature}=\\\"*#{value.id}\\\","
|
39
|
+
else
|
40
|
+
label << "\\n#{feature}=\\\"#{value}\\\","
|
41
|
+
end
|
42
|
+
end
|
46
43
|
end
|
47
44
|
end
|
48
|
-
string = string[0..-2]
|
49
|
-
string << "]"
|
50
|
-
else
|
51
|
-
string << "#{label[0..-2]}]"
|
52
45
|
end
|
46
|
+
label = label[0..-2] if label[-1] == ','
|
47
|
+
string << "label=\"#{label}\"]"
|
48
|
+
# Parent-child relationships.
|
53
49
|
if entity.has_parent?
|
54
50
|
string << "\n#{entity.parent.id} -- #{entity.id};"
|
55
51
|
end
|
56
|
-
|
57
|
-
entity.each do |child|
|
58
|
-
string << self.to_dot(child)
|
59
|
-
end
|
60
|
-
end
|
52
|
+
# Edges.
|
61
53
|
if entity.has_edges?
|
62
54
|
entity.edges.each_pair do |target, type|
|
63
55
|
string << "\n#{entity.id} -- #{target}"
|
@@ -65,6 +57,12 @@ module Treat
|
|
65
57
|
string << "arrowhead=\"odiamond\"]"
|
66
58
|
end
|
67
59
|
end
|
60
|
+
# Recurse.
|
61
|
+
if entity.has_children?
|
62
|
+
entity.each do |child|
|
63
|
+
string << self.to_dot(child, options)
|
64
|
+
end
|
65
|
+
end
|
68
66
|
string
|
69
67
|
end
|
70
68
|
end
|
@@ -1,7 +1,11 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
|
+
# Handles the call to inspect.
|
4
5
|
class Inspect
|
6
|
+
# Return a terminal-friendly visualization of an entity.
|
7
|
+
#
|
8
|
+
# Options: none.
|
5
9
|
def self.visualize(entity, options = {})
|
6
10
|
s = "#{entity.class.to_s.split('::')[-1]} (#{entity.id.to_s})"
|
7
11
|
unless caller_method == :inspect
|
@@ -2,11 +2,26 @@ module Treat
|
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
4
|
class ShortValue
|
5
|
+
# Default options for the visualizer.
|
6
|
+
DefaultOptions = { max_words: 6, max_length: 30 }
|
7
|
+
# Returns the text value of an entity, shortened
|
8
|
+
# with [...] if the value is longer than :max_words
|
9
|
+
# or longer than :max_length.
|
10
|
+
#
|
11
|
+
# Options:
|
12
|
+
# - (Integer) :max_words => the maximum number
|
13
|
+
# of words in an entity before it is shortened.
|
14
|
+
# - (Integer) :max_length => the maximum number
|
15
|
+
# of characters in an entity before it is shortened.
|
5
16
|
def self.visualize(entity, options = {})
|
6
|
-
options
|
17
|
+
options = DefaultOptions.merge(options)
|
7
18
|
words = entity.to_s.split(' ')
|
8
|
-
|
9
|
-
|
19
|
+
if words.size < options[:max_words] ||
|
20
|
+
entity.to_s.length < options[:max_length]
|
21
|
+
entity.to_s
|
22
|
+
else
|
23
|
+
words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
|
24
|
+
end
|
10
25
|
end
|
11
26
|
end
|
12
27
|
end
|
@@ -5,24 +5,29 @@ module Treat
|
|
5
5
|
# an entity in standoff format; for example:
|
6
6
|
# (S (NP John) (VP has (VP come))).
|
7
7
|
class Standoff
|
8
|
-
|
8
|
+
# Default options for the visualizer.
|
9
|
+
DefaultOptions = { indent: 0 }
|
10
|
+
# A lambda to recursively visualize the children
|
11
|
+
# of an entity.
|
12
|
+
Recurse = lambda do |entity, options|
|
9
13
|
v = ''
|
10
14
|
entity.each { |child| v += visualize(child, options) }
|
11
15
|
v
|
12
16
|
end
|
13
17
|
# Visualize the entity using standoff notation.
|
14
|
-
# This can only be called on sentences
|
15
|
-
# is not a suitable format to
|
16
|
-
#
|
18
|
+
# This can only be called on sentences and smaller
|
19
|
+
# entities, as it is not a suitable format to
|
20
|
+
# represent larger entities.
|
17
21
|
def self.visualize(entity, options = {})
|
18
|
-
options =
|
22
|
+
options = DefaultOptions.merge(options)
|
19
23
|
value = ''; spaces = ''
|
20
24
|
options[:indent].times { spaces << ' '}
|
21
25
|
options[:indent] += 1
|
22
26
|
if entity.is_a?(Treat::Entities::Token)
|
23
27
|
value += "#{spaces}(#{entity.tag} #{entity.value})"
|
24
28
|
elsif entity.is_a?(Treat::Entities::Constituent)
|
25
|
-
|
29
|
+
tag = entity.has?(:tag) ? entity.tag : ''
|
30
|
+
value += ("#{spaces}(#{tag}\n" +
|
26
31
|
"#{Recurse.call(entity, options)})\n")
|
27
32
|
elsif entity.is_a?(Treat::Entities::Sentence)
|
28
33
|
value += ("#{spaces}(S\n" +
|
@@ -1,11 +1,15 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
|
+
# This class generates an ASCII representation
|
5
|
+
# of a tree of entities.
|
4
6
|
class Tree
|
7
|
+
# Default options for the visualizer.
|
8
|
+
DefaultOptions = { indent: 0 }
|
5
9
|
# Obtain a plain text tree representation
|
6
10
|
# of the entity.
|
7
11
|
def self.visualize(entity, options = {})
|
8
|
-
options =
|
12
|
+
options = DefaultOptions.merge(options)
|
9
13
|
string = ''
|
10
14
|
if entity.has_children?
|
11
15
|
spacer = '--'
|
@@ -3,10 +3,15 @@ module Treat
|
|
3
3
|
module Visualizers
|
4
4
|
# Creates a plain text visualization of an entity.
|
5
5
|
class Txt
|
6
|
+
# The default options for the visualizer.
|
7
|
+
DefaultOptions = { sep: ' ' }
|
6
8
|
# Obtain a plain text visualization of the entity,
|
7
9
|
# with no additional information.
|
10
|
+
#
|
11
|
+
# Options:
|
12
|
+
# (String) :sep => the separator to use between words.
|
8
13
|
def self.visualize(entity, options = {})
|
9
|
-
options
|
14
|
+
options = DefaultOptions.merge(options)
|
10
15
|
return entity.value if !entity.has_children?
|
11
16
|
value = ''
|
12
17
|
entity.each do |child|
|
data/lib/treat/formatters.rb
CHANGED
data/lib/treat/group.rb
CHANGED
@@ -61,14 +61,15 @@ module Treat
|
|
61
61
|
end
|
62
62
|
is_target
|
63
63
|
end
|
64
|
+
# Cache the list of adaptors to improve performance.
|
65
|
+
@@list = {}
|
64
66
|
# Populates once the list of the adaptors in the group
|
65
67
|
# by crawling the filesystem.
|
66
|
-
@@list = {}
|
67
68
|
def list
|
68
69
|
mod = ucc(cl(self))
|
69
70
|
if @@list[mod].nil?
|
70
71
|
@@list[mod] = []
|
71
|
-
dirs = Dir
|
72
|
+
dirs = Dir.glob("#{Treat.lib}/treat/*/#{mod}/*.rb")
|
72
73
|
dirs.each do |file|
|
73
74
|
@@list[mod] <<
|
74
75
|
:"#{file.split('/')[-1][0..-4]}"
|
@@ -79,7 +80,7 @@ module Treat
|
|
79
80
|
# Get constants in this module, excluding those
|
80
81
|
# defined by parent modules.
|
81
82
|
def const_get(const); super(const, false); end
|
82
|
-
#
|
83
|
+
# Lazy load the classes in the group.
|
83
84
|
def const_missing(const)
|
84
85
|
bits = self.ancestors[0].to_s.split('::')
|
85
86
|
bits.collect! { |bit| ucc(bit) }
|
@@ -1,43 +1,40 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module CardinalWords
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that allow describing a
|
6
|
+
# number in words in cardinal form.
|
7
|
+
#
|
8
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
4
9
|
class Linguistics
|
10
|
+
# Require the 'linguistics' gem.
|
5
11
|
silence_warnings { require 'linguistics' }
|
12
|
+
# Return the description of a cardinal number in words.
|
6
13
|
#
|
7
14
|
# Options:
|
8
15
|
#
|
9
|
-
# :group => Controls how many numbers at a time are
|
16
|
+
# - :group => Controls how many numbers at a time are
|
10
17
|
# grouped together. Valid values are 0 (normal grouping),
|
11
18
|
# 1 (single-digit grouping, e.g., “one, two, three, four”),
|
12
19
|
# 2 (double-digit grouping, e.g., “twelve, thirty-four”, or
|
13
20
|
# 3 (triple-digit grouping, e.g., “one twenty-three, four”).
|
14
|
-
# :comma => Set the character/s used to separate word groups.
|
21
|
+
# - :comma => Set the character/s used to separate word groups.
|
15
22
|
# Defaults to ", ".
|
16
|
-
# :and => Set the word and/or characters used where ' and '
|
23
|
+
# - :and => Set the word and/or characters used where ' and '
|
17
24
|
# (the default) is normally used. Setting :and to ' ', for
|
18
25
|
# example, will cause 2556 to be returned as “two-thousand,
|
19
26
|
# five hundred fifty-six” instead of “two-thousand, five
|
20
27
|
# hundred and fifty-six”.
|
21
|
-
# :zero => Set the word used to represent the numeral 0 in
|
28
|
+
# - :zero => Set the word used to represent the numeral 0 in
|
22
29
|
# the result. 'zero' is the default.
|
23
|
-
# :decimal => Set the translation of any decimal points in
|
30
|
+
# - :decimal => Set the translation of any decimal points in
|
24
31
|
# the number; the default is 'point'.
|
25
|
-
# :asArray If set to a true value, the number will be returned
|
32
|
+
# - :asArray => If set to a true value, the number will be returned
|
26
33
|
# as an array of word groups instead of a String.
|
27
34
|
#
|
28
35
|
# More specific options when using :type => :ordinal:
|
29
|
-
#
|
30
|
-
#
|
31
36
|
def self.cardinal_words(entity, options = {})
|
32
|
-
|
33
|
-
l = entity.language.to_s.upcase
|
34
|
-
delegate = nil
|
35
|
-
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
36
|
-
rescue RuntimeError
|
37
|
-
raise "Ruby Linguistics does not have a module " +
|
38
|
-
" installed for the #{entity.language} language."
|
39
|
-
end
|
40
|
-
silence_warnings { delegate.numwords(entity.to_s, options) }
|
37
|
+
silence_warnings { ::Linguistics::EN.numwords(entity.to_s, options) }
|
41
38
|
end
|
42
39
|
end
|
43
40
|
end
|
@@ -1,15 +1,28 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module Conjugations
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that allow conjugating verbs.
|
6
|
+
#
|
7
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
4
8
|
class Linguistics
|
5
9
|
silence_warnings { require 'linguistics' }
|
6
|
-
|
10
|
+
# Conjugate a verb using ruby linguistics with the specified
|
11
|
+
# mode, tense, count and person.
|
12
|
+
#
|
13
|
+
# Options:
|
14
|
+
#
|
15
|
+
# - (Symbol) :mode => :infinitive, :indicative, :subjunctive, :participle
|
16
|
+
# - (Symbol) :tense => :past, :present, :future
|
17
|
+
# - (Symbol) :count => :singular, :plural
|
18
|
+
# - (Symbol) :person => :first, :second, :third
|
19
|
+
def self.conjugations(entity, parameters)
|
7
20
|
begin
|
8
21
|
l = entity.language.to_s.upcase
|
9
22
|
delegate = nil
|
10
23
|
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
24
|
rescue RuntimeError
|
12
|
-
raise "Ruby Linguistics does not have a module " +
|
25
|
+
raise "Ruby Linguistics does not have a module " +
|
13
26
|
" installed for the #{entity.language} language."
|
14
27
|
end
|
15
28
|
if parameters[:mode] == :infinitive
|
@@ -27,4 +40,4 @@ module Treat
|
|
27
40
|
end
|
28
41
|
end
|
29
42
|
end
|
30
|
-
end
|
43
|
+
end
|
@@ -1,24 +1,35 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module Declensions
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that allow obtaining the
|
6
|
+
# declensions of a word.
|
7
|
+
#
|
8
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
7
9
|
class Linguistics
|
8
|
-
|
10
|
+
# Require Ruby Linguistics
|
11
|
+
silence_warnings { require 'linguistics' }
|
12
|
+
# Retrieve a declension of a word using the 'linguistics' gem.
|
13
|
+
#
|
14
|
+
# Options:
|
15
|
+
#
|
16
|
+
# - (Identifier) :count => :singular, :plural
|
17
|
+
def self.declensions(entity, options = {})
|
9
18
|
begin
|
10
19
|
l = entity.language.to_s.upcase
|
11
20
|
delegate = nil
|
12
21
|
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
13
22
|
rescue RuntimeError
|
14
|
-
raise "Ruby Linguistics does not have a module " +
|
23
|
+
raise "Ruby Linguistics does not have a module " +
|
15
24
|
" installed for the #{entity.language} language."
|
16
25
|
end
|
17
26
|
string = entity.to_s
|
18
27
|
if options[:count] == :plural
|
19
28
|
if entity.has?(:category) &&
|
20
29
|
[:noun, :adjective, :verb].include?(entity.category)
|
21
|
-
silence_warnings
|
30
|
+
silence_warnings do
|
31
|
+
delegate.send(:"plural_#{entity.category}", string)
|
32
|
+
end
|
22
33
|
else
|
23
34
|
silence_warnings { delegate.plural(string) }
|
24
35
|
end
|
@@ -1,19 +1,18 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module OrdinalWords
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that allow describing a
|
6
|
+
# number in words in ordinal form.
|
7
|
+
#
|
8
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
4
9
|
class Linguistics
|
10
|
+
# Require Ruby Linguistics.
|
5
11
|
silence_warnings { require 'linguistics' }
|
12
|
+
# Describe a number in words in ordinal form, using the
|
13
|
+
# 'linguistics' gem.
|
6
14
|
def self.ordinal_words(number, options = {})
|
7
|
-
|
8
|
-
l = number.language.to_s.upcase
|
9
|
-
delegate = nil
|
10
|
-
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
|
-
rescue RuntimeError
|
12
|
-
lang = Treat::Languages.describe(number.language)
|
13
|
-
raise "Ruby Linguistics does not have a module " +
|
14
|
-
" installed for the #{lang} language."
|
15
|
-
end
|
16
|
-
silence_warnings { delegate.ordinate(number.to_s) }
|
15
|
+
silence_warnings { ::Linguistics::EN.ordinate(number.to_s) }
|
17
16
|
end
|
18
17
|
end
|
19
18
|
end
|
@@ -2,16 +2,20 @@ module Treat
|
|
2
2
|
module Inflectors
|
3
3
|
module Stem
|
4
4
|
# Stem a word using a native Ruby implementation of the
|
5
|
-
# Porter stemming algorithm, ported to Ruby from
|
6
|
-
# version coded up in Perl.
|
5
|
+
# Porter stemming algorithm, ported to Ruby from a
|
6
|
+
# version coded up in Perl. This is a simplified
|
7
|
+
# implementation; for a true and fast Porter stemmer,
|
8
|
+
# see Treat::Inflectors::Stem::PorterC.
|
7
9
|
#
|
8
10
|
# Authored by Ray Pereda (raypereda@hotmail.com).
|
11
|
+
# Unknown license.
|
9
12
|
#
|
10
13
|
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
11
14
|
# Program, Vol. 14, no. 3, pp 130-137,
|
12
15
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
13
16
|
class Porter
|
14
17
|
# Returns the stem of a word using a native Porter stemmer.
|
18
|
+
#
|
15
19
|
# Options: none.
|
16
20
|
def self.stem(word, options = {})
|
17
21
|
# Copy the word and convert it to a string.
|
@@ -9,10 +9,13 @@ module Treat
|
|
9
9
|
# Program, Vol. 14, no. 3, pp 130-137,
|
10
10
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
11
11
|
class PorterC
|
12
|
+
# Require the 'ruby-stemmer' gem.
|
12
13
|
silence_warnings { require 'lingua/stemmer' }
|
14
|
+
# Remove a conflict between this gem and the 'engtagger' gem.
|
13
15
|
::LinguaStemmer = ::Lingua
|
14
16
|
Object.instance_eval { remove_const :Lingua }
|
15
|
-
# Stem the word using
|
17
|
+
# Stem the word using a full-blown Porter stemmer in C.
|
18
|
+
#
|
16
19
|
# Options: none.
|
17
20
|
def self.stem(word, options = {})
|
18
21
|
silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
|
@@ -9,10 +9,10 @@ module Treat
|
|
9
9
|
# groups of rules: the first to clean the tokens, and
|
10
10
|
# the second to alter suffixes."
|
11
11
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
12
|
+
# Project website: https://github.com/ealdent/uea-stemmer
|
13
|
+
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
|
14
|
+
# Conservative stemming for search and indexing, 2005.
|
15
|
+
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
16
16
|
class UEA
|
17
17
|
# Require the 'uea-stemmer' gem.
|
18
18
|
silence_warnings { require 'uea-stemmer' }
|
@@ -183,6 +183,22 @@ module Treat
|
|
183
183
|
['PRT', 'Particle'],
|
184
184
|
['S', 'Sentence']
|
185
185
|
]
|
186
|
+
|
187
|
+
# Maps Enju categories to Treat categories.
|
188
|
+
EnjuCatToCategory = {
|
189
|
+
'ADJ' => :adjective,
|
190
|
+
'ADV' => :adverb,
|
191
|
+
'CONJ' => :conjunction,
|
192
|
+
'COOD' => :conjunction,
|
193
|
+
'C' => :complementizer,
|
194
|
+
'D' => :determiner,
|
195
|
+
'N' => :noun,
|
196
|
+
'P' => :preposition,
|
197
|
+
'PN' => :punctuation,
|
198
|
+
'SC' => :conjunction,
|
199
|
+
'V' => :verb,
|
200
|
+
'PRT' => :particle
|
201
|
+
}
|
186
202
|
|
187
203
|
# Description of the xcat in the Enju output specification.
|
188
204
|
EnjuXCatDescription = [
|
@@ -1,8 +1,10 @@
|
|
1
1
|
module Treat
|
2
2
|
module Languages
|
3
3
|
class English
|
4
|
+
|
4
5
|
require 'treat/languages/english/tags'
|
5
6
|
require 'treat/languages/english/categories'
|
7
|
+
|
6
8
|
Extractors = {
|
7
9
|
time: [:chronic],
|
8
10
|
topics: [:reuters],
|
@@ -11,7 +13,7 @@ module Treat
|
|
11
13
|
}
|
12
14
|
Processors = {
|
13
15
|
chunkers: [:txt],
|
14
|
-
parsers: [:
|
16
|
+
parsers: [:stanford, :enju],
|
15
17
|
segmenters: [:tactful, :punkt, :stanford],
|
16
18
|
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
17
19
|
}
|
@@ -28,6 +30,7 @@ module Treat
|
|
28
30
|
ordinal_words: [:linguistics],
|
29
31
|
cardinal_words: [:linguistics]
|
30
32
|
}
|
33
|
+
|
31
34
|
end
|
32
35
|
end
|
33
36
|
end
|
@@ -4,13 +4,12 @@ module Treat
|
|
4
4
|
# A class that detects the category of a word from its tag,
|
5
5
|
# using the default tagger for the language of the entity.
|
6
6
|
class FromTag
|
7
|
-
DefaultOptions = { tagger: nil }
|
8
7
|
# Find the category of the current entity.
|
8
|
+
#
|
9
9
|
# Options:
|
10
|
-
#
|
11
|
-
#
|
10
|
+
#
|
11
|
+
# - (Symbol) :tagger => force the use of a tagger.
|
12
12
|
def self.category(entity, options = {})
|
13
|
-
options = DefaultOptions.merge(options)
|
14
13
|
tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
|
15
14
|
lang = Treat::Languages.get(entity.language)
|
16
15
|
cat = lang::WordTagToCategory[tag]
|
@@ -21,6 +20,7 @@ module Treat
|
|
21
20
|
if cat.size == 1
|
22
21
|
return cat[0]
|
23
22
|
else
|
23
|
+
entity.set :tag_set, :penn
|
24
24
|
if entity.has?(:tag_set)
|
25
25
|
if cat[entity.tag_set]
|
26
26
|
return cat[entity.tag_set]
|
@@ -27,7 +27,7 @@ module Treat
|
|
27
27
|
end
|
28
28
|
# Return the subject of the sentence|verb.
|
29
29
|
def self.subject(entity, options)
|
30
|
-
verb = entity.category == :verb ?
|
30
|
+
verb = (entity.has?(:category) && entity.category == :verb) ?
|
31
31
|
main_verb(entity) : entity.main_verb
|
32
32
|
args = []
|
33
33
|
main_verb.edges.each_pair do |id,edge|
|
@@ -37,7 +37,7 @@ module Treat
|
|
37
37
|
end
|
38
38
|
# Return the object of the sentence|verb.
|
39
39
|
def self.object(entity, options)
|
40
|
-
verb = entity.category == :verb ?
|
40
|
+
verb = (entity.has?(:category) && entity.category == :verb) ?
|
41
41
|
main_verb(entity) : entity.main_verb
|
42
42
|
if verb.voice == 'passive'
|
43
43
|
return
|
@@ -50,7 +50,7 @@ module Treat
|
|
50
50
|
end
|
51
51
|
# Find the main verb (shallowest verb in the tree).
|
52
52
|
def self.main_verb(entity, options)
|
53
|
-
verbs = entity.
|
53
|
+
verbs = entity.verbs
|
54
54
|
if verbs.empty?
|
55
55
|
return
|
56
56
|
end
|
@@ -52,24 +52,16 @@ module Treat
|
|
52
52
|
@@tagger = nil
|
53
53
|
# Hold the user-set options
|
54
54
|
@@options = {}
|
55
|
-
# Hold the default options.
|
56
|
-
DefaultOptions = {
|
57
|
-
lexicon: nil,
|
58
|
-
lexical_rules: nil,
|
59
|
-
contextual_rules: nil
|
60
|
-
}
|
61
55
|
# Tag words using a native Brill tagger.
|
62
56
|
#
|
63
|
-
#
|
57
|
+
# Options:
|
58
|
+
#
|
64
59
|
# :lexicon => String (Lexicon file to use)
|
65
60
|
# :lexical_rules => String (Lexical rule file to use)
|
66
61
|
# :contextual_rules => String (Contextual rules file to use)
|
67
62
|
def self.tag(entity, options = {})
|
68
63
|
# Reinitialize the tagger if the options have changed.
|
69
|
-
if options != @@options
|
70
|
-
@@options = DefaultOptions.merge(options)
|
71
|
-
@@tagger = nil # Reset the tagger
|
72
|
-
end
|
64
|
+
@@tagger = nil if options != @@options
|
73
65
|
# Create the tagger if necessary
|
74
66
|
@@tagger ||= ::Brill::Tagger.new(options[:lexicon],
|
75
67
|
options[:lexical_rules], options[:contextual_rules])
|
@@ -24,9 +24,8 @@ module Treat
|
|
24
24
|
@@options = {}
|
25
25
|
# Hold the default options.
|
26
26
|
DefaultOptions = {
|
27
|
-
unknown_word_tag: '
|
28
|
-
relax: false
|
29
|
-
debug: false
|
27
|
+
unknown_word_tag: 'FW',
|
28
|
+
relax: false
|
30
29
|
}
|
31
30
|
# Tag the word using a probabilistic model taking
|
32
31
|
# into account known words found in a lexicon and
|
@@ -34,11 +33,10 @@ module Treat
|
|
34
33
|
#
|
35
34
|
# Options:
|
36
35
|
#
|
37
|
-
#
|
36
|
+
# - (Boolean) :relax => Relax the Hidden Markov Model:
|
38
37
|
# this may improve accuracy for uncommon words,
|
39
38
|
# particularly words used polysemously.
|
40
|
-
#
|
41
|
-
# :unknown_word_tag => (String) Tag for unknown words.
|
39
|
+
# - (String) :unknown_word_tag => Tag for unknown words.
|
42
40
|
def self.tag(entity, options = {})
|
43
41
|
# Reinitialize the tagger if the options have changed.
|
44
42
|
if options != @@options
|