treat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +7 -8
- data/TODO +16 -13
- data/examples/keywords.rb +89 -1
- data/lib/treat/buildable.rb +1 -8
- data/lib/treat/categories.rb +3 -4
- data/lib/treat/category.rb +1 -1
- data/lib/treat/delegatable.rb +1 -1
- data/lib/treat/detectors/encoding/native.rb +5 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
- data/lib/treat/detectors/language/language_detector.rb +4 -0
- data/lib/treat/detectors/language/what_language.rb +4 -4
- data/lib/treat/detectors.rb +1 -1
- data/lib/treat/entities/entity.rb +5 -3
- data/lib/treat/entities/tokens.rb +14 -5
- data/lib/treat/entities/zones.rb +4 -0
- data/lib/treat/entities.rb +7 -5
- data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
- data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
- data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
- data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
- data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
- data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
- data/lib/treat/extractors/time/chronic.rb +8 -0
- data/lib/treat/extractors/time/native.rb +6 -0
- data/lib/treat/extractors/time/nickel.rb +31 -23
- data/lib/treat/extractors/topic_words/lda.rb +21 -16
- data/lib/treat/extractors/topics/reuters.rb +6 -4
- data/lib/treat/extractors.rb +7 -7
- data/lib/treat/formatters/readers/abw.rb +32 -0
- data/lib/treat/formatters/readers/autoselect.rb +13 -11
- data/lib/treat/formatters/readers/doc.rb +13 -0
- data/lib/treat/formatters/readers/gocr.rb +2 -0
- data/lib/treat/formatters/readers/html.rb +21 -1
- data/lib/treat/formatters/readers/ocropus.rb +3 -3
- data/lib/treat/formatters/readers/odt.rb +41 -0
- data/lib/treat/formatters/readers/pdf.rb +5 -2
- data/lib/treat/formatters/readers/txt.rb +2 -0
- data/lib/treat/formatters/serializers/xml.rb +3 -2
- data/lib/treat/formatters/serializers/yaml.rb +2 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
- data/lib/treat/formatters/unserializers/xml.rb +6 -1
- data/lib/treat/formatters/unserializers/yaml.rb +5 -1
- data/lib/treat/formatters/visualizers/dot.rb +35 -37
- data/lib/treat/formatters/visualizers/html.rb +1 -0
- data/lib/treat/formatters/visualizers/inspect.rb +4 -0
- data/lib/treat/formatters/visualizers/short_value.rb +18 -3
- data/lib/treat/formatters/visualizers/standoff.rb +11 -6
- data/lib/treat/formatters/visualizers/tree.rb +5 -1
- data/lib/treat/formatters/visualizers/txt.rb +6 -1
- data/lib/treat/formatters.rb +1 -1
- data/lib/treat/group.rb +4 -3
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
- data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
- data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
- data/lib/treat/inflectors/stem/porter.rb +6 -2
- data/lib/treat/inflectors/stem/porter_c.rb +4 -1
- data/lib/treat/inflectors/stem/uea.rb +4 -4
- data/lib/treat/languages/english/tags.rb +16 -0
- data/lib/treat/languages/english.rb +4 -1
- data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
- data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
- data/lib/treat/lexicalizers/tag/brill.rb +3 -11
- data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
- data/lib/treat/lexicalizers.rb +0 -2
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +3 -17
- data/lib/treat/processors/parsers/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/punkt.rb +1 -0
- data/lib/treat/processors/segmenters/stanford.rb +4 -0
- data/lib/treat/processors/segmenters/tactful.rb +4 -1
- data/lib/treat/processors/tokenizers/punkt.rb +1 -2
- data/lib/treat/processors/tokenizers/stanford.rb +4 -0
- data/lib/treat/processors/tokenizers/tactful.rb +1 -1
- data/lib/treat/processors.rb +4 -4
- data/lib/treat/proxies.rb +18 -11
- data/lib/treat/registrable.rb +12 -5
- data/lib/treat/sugar.rb +8 -3
- data/lib/treat/tree.rb +10 -3
- data/lib/treat.rb +55 -55
- data/test/tc_entity.rb +7 -7
- data/test/tc_extractors.rb +6 -4
- data/test/tc_formatters.rb +0 -4
- data/test/tests.rb +2 -0
- data/test/texts.rb +4 -4
- metadata +48 -56
- data/examples/texts/bugged_out.txt +0 -26
- data/examples/texts/half_cocked_basel.txt +0 -16
- data/examples/texts/hedge_funds.txt +0 -24
- data/examples/texts/hose_and_dry.txt +0 -19
- data/examples/texts/hungarys_troubles.txt +0 -46
- data/examples/texts/indias_slowdown.txt +0 -15
- data/examples/texts/merkozy_rides_again.txt +0 -24
- data/examples/texts/prada_is_not_walmart.txt +0 -9
- data/examples/texts/republican_nomination.txt +0 -26
- data/examples/texts/to_infinity_and_beyond.txt +0 -15
- data/lib/treat/entities/text.rb +0 -7
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
- data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -2,62 +2,54 @@ module Treat
|
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
4
|
class Dot
|
5
|
-
|
6
|
-
BorderColors = {
|
7
|
-
:verb => "#00AABB",
|
8
|
-
:noun => "#FAD4A7",
|
9
|
-
:adverb => '#103585',
|
10
|
-
:adjective => '#D21D54'
|
11
|
-
}
|
5
|
+
DefaultOptions = {colors: {}, :features => :all}
|
12
6
|
# Create the top-most graph structure
|
13
7
|
# and delegate the creation of the graph
|
14
8
|
# nodes to to_dot.
|
15
9
|
def self.visualize(entity, options = {})
|
10
|
+
options = DefaultOptions.merge(options)
|
16
11
|
string = "graph {"
|
17
|
-
string << self.to_dot(entity)
|
12
|
+
string << self.to_dot(entity, options)
|
18
13
|
string << "\n}"
|
19
14
|
end
|
20
15
|
# dot -Tpdf test4.dot > test4.pdf
|
21
|
-
def self.to_dot(entity)
|
16
|
+
def self.to_dot(entity, options)
|
17
|
+
# Id
|
22
18
|
string = ''
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
label = "label=\"#{entity.value.inspect[1..-2]}\","
|
29
|
-
end
|
19
|
+
label = ''
|
20
|
+
string = "\n#{entity.id} ["
|
21
|
+
# Value
|
22
|
+
if entity.is_a?(Treat::Entities::Token)
|
23
|
+
label = entity.to_s
|
30
24
|
else
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
else
|
35
|
-
label = "label=\"#{cc(cl(entity.class))}\","
|
25
|
+
label = entity.type.to_s.capitalize + " "
|
26
|
+
if entity.is_leaf?
|
27
|
+
label = entity.short_value.gsub(' [...]', " [...] \\n")
|
36
28
|
end
|
37
29
|
end
|
38
|
-
|
30
|
+
# Features
|
39
31
|
if entity.has_features?
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
32
|
+
unless options[:features] == :none
|
33
|
+
label << "\\n"
|
34
|
+
entity.features.each do |feature, value|
|
35
|
+
if options[:features] == :all ||
|
36
|
+
options[:features].include?(feature)
|
37
|
+
if value.is_a?(Treat::Entities::Entity)
|
38
|
+
label << "\\n#{feature}=\\\"*#{value.id}\\\","
|
39
|
+
else
|
40
|
+
label << "\\n#{feature}=\\\"#{value}\\\","
|
41
|
+
end
|
42
|
+
end
|
46
43
|
end
|
47
44
|
end
|
48
|
-
string = string[0..-2]
|
49
|
-
string << "]"
|
50
|
-
else
|
51
|
-
string << "#{label[0..-2]}]"
|
52
45
|
end
|
46
|
+
label = label[0..-2] if label[-1] == ','
|
47
|
+
string << "label=\"#{label}\"]"
|
48
|
+
# Parent-child relationships.
|
53
49
|
if entity.has_parent?
|
54
50
|
string << "\n#{entity.parent.id} -- #{entity.id};"
|
55
51
|
end
|
56
|
-
|
57
|
-
entity.each do |child|
|
58
|
-
string << self.to_dot(child)
|
59
|
-
end
|
60
|
-
end
|
52
|
+
# Edges.
|
61
53
|
if entity.has_edges?
|
62
54
|
entity.edges.each_pair do |target, type|
|
63
55
|
string << "\n#{entity.id} -- #{target}"
|
@@ -65,6 +57,12 @@ module Treat
|
|
65
57
|
string << "arrowhead=\"odiamond\"]"
|
66
58
|
end
|
67
59
|
end
|
60
|
+
# Recurse.
|
61
|
+
if entity.has_children?
|
62
|
+
entity.each do |child|
|
63
|
+
string << self.to_dot(child, options)
|
64
|
+
end
|
65
|
+
end
|
68
66
|
string
|
69
67
|
end
|
70
68
|
end
|
@@ -1,7 +1,11 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
|
+
# Handles the call to inspect.
|
4
5
|
class Inspect
|
6
|
+
# Return a terminal-friendly visualization of an entity.
|
7
|
+
#
|
8
|
+
# Options: none.
|
5
9
|
def self.visualize(entity, options = {})
|
6
10
|
s = "#{entity.class.to_s.split('::')[-1]} (#{entity.id.to_s})"
|
7
11
|
unless caller_method == :inspect
|
@@ -2,11 +2,26 @@ module Treat
|
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
4
|
class ShortValue
|
5
|
+
# Default options for the visualizer.
|
6
|
+
DefaultOptions = { max_words: 6, max_length: 30 }
|
7
|
+
# Returns the text value of an entity, shortened
|
8
|
+
# with [...] if the value is longer than :max_words
|
9
|
+
# or longer than :max_length.
|
10
|
+
#
|
11
|
+
# Options:
|
12
|
+
# - (Integer) :max_words => the maximum number
|
13
|
+
# of words in an entity before it is shortened.
|
14
|
+
# - (Integer) :max_length => the maximum number
|
15
|
+
# of characters in an entity before it is shortened.
|
5
16
|
def self.visualize(entity, options = {})
|
6
|
-
options
|
17
|
+
options = DefaultOptions.merge(options)
|
7
18
|
words = entity.to_s.split(' ')
|
8
|
-
|
9
|
-
|
19
|
+
if words.size < options[:max_words] ||
|
20
|
+
entity.to_s.length < options[:max_length]
|
21
|
+
entity.to_s
|
22
|
+
else
|
23
|
+
words[0..2].join(' ') + ' [...] ' + words[-3..-1].join(' ')
|
24
|
+
end
|
10
25
|
end
|
11
26
|
end
|
12
27
|
end
|
@@ -5,24 +5,29 @@ module Treat
|
|
5
5
|
# an entity in standoff format; for example:
|
6
6
|
# (S (NP John) (VP has (VP come))).
|
7
7
|
class Standoff
|
8
|
-
|
8
|
+
# Default options for the visualizer.
|
9
|
+
DefaultOptions = { indent: 0 }
|
10
|
+
# A lambda to recursively visualize the children
|
11
|
+
# of an entity.
|
12
|
+
Recurse = lambda do |entity, options|
|
9
13
|
v = ''
|
10
14
|
entity.each { |child| v += visualize(child, options) }
|
11
15
|
v
|
12
16
|
end
|
13
17
|
# Visualize the entity using standoff notation.
|
14
|
-
# This can only be called on sentences
|
15
|
-
# is not a suitable format to
|
16
|
-
#
|
18
|
+
# This can only be called on sentences and smaller
|
19
|
+
# entities, as it is not a suitable format to
|
20
|
+
# represent larger entities.
|
17
21
|
def self.visualize(entity, options = {})
|
18
|
-
options =
|
22
|
+
options = DefaultOptions.merge(options)
|
19
23
|
value = ''; spaces = ''
|
20
24
|
options[:indent].times { spaces << ' '}
|
21
25
|
options[:indent] += 1
|
22
26
|
if entity.is_a?(Treat::Entities::Token)
|
23
27
|
value += "#{spaces}(#{entity.tag} #{entity.value})"
|
24
28
|
elsif entity.is_a?(Treat::Entities::Constituent)
|
25
|
-
|
29
|
+
tag = entity.has?(:tag) ? entity.tag : ''
|
30
|
+
value += ("#{spaces}(#{tag}\n" +
|
26
31
|
"#{Recurse.call(entity, options)})\n")
|
27
32
|
elsif entity.is_a?(Treat::Entities::Sentence)
|
28
33
|
value += ("#{spaces}(S\n" +
|
@@ -1,11 +1,15 @@
|
|
1
1
|
module Treat
|
2
2
|
module Formatters
|
3
3
|
module Visualizers
|
4
|
+
# This class generates an ASCII representation
|
5
|
+
# of a tree of entities.
|
4
6
|
class Tree
|
7
|
+
# Default options for the visualizer.
|
8
|
+
DefaultOptions = { indent: 0 }
|
5
9
|
# Obtain a plain text tree representation
|
6
10
|
# of the entity.
|
7
11
|
def self.visualize(entity, options = {})
|
8
|
-
options =
|
12
|
+
options = DefaultOptions.merge(options)
|
9
13
|
string = ''
|
10
14
|
if entity.has_children?
|
11
15
|
spacer = '--'
|
@@ -3,10 +3,15 @@ module Treat
|
|
3
3
|
module Visualizers
|
4
4
|
# Creates a plain text visualization of an entity.
|
5
5
|
class Txt
|
6
|
+
# The default options for the visualizer.
|
7
|
+
DefaultOptions = { sep: ' ' }
|
6
8
|
# Obtain a plain text visualization of the entity,
|
7
9
|
# with no additional information.
|
10
|
+
#
|
11
|
+
# Options:
|
12
|
+
# (String) :sep => the separator to use between words.
|
8
13
|
def self.visualize(entity, options = {})
|
9
|
-
options
|
14
|
+
options = DefaultOptions.merge(options)
|
10
15
|
return entity.value if !entity.has_children?
|
11
16
|
value = ''
|
12
17
|
entity.each do |child|
|
data/lib/treat/formatters.rb
CHANGED
data/lib/treat/group.rb
CHANGED
@@ -61,14 +61,15 @@ module Treat
|
|
61
61
|
end
|
62
62
|
is_target
|
63
63
|
end
|
64
|
+
# Cache the list of adaptors to improve performance.
|
65
|
+
@@list = {}
|
64
66
|
# Populates once the list of the adaptors in the group
|
65
67
|
# by crawling the filesystem.
|
66
|
-
@@list = {}
|
67
68
|
def list
|
68
69
|
mod = ucc(cl(self))
|
69
70
|
if @@list[mod].nil?
|
70
71
|
@@list[mod] = []
|
71
|
-
dirs = Dir
|
72
|
+
dirs = Dir.glob("#{Treat.lib}/treat/*/#{mod}/*.rb")
|
72
73
|
dirs.each do |file|
|
73
74
|
@@list[mod] <<
|
74
75
|
:"#{file.split('/')[-1][0..-4]}"
|
@@ -79,7 +80,7 @@ module Treat
|
|
79
80
|
# Get constants in this module, excluding those
|
80
81
|
# defined by parent modules.
|
81
82
|
def const_get(const); super(const, false); end
|
82
|
-
#
|
83
|
+
# Lazy load the classes in the group.
|
83
84
|
def const_missing(const)
|
84
85
|
bits = self.ancestors[0].to_s.split('::')
|
85
86
|
bits.collect! { |bit| ucc(bit) }
|
@@ -1,43 +1,40 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module CardinalWords
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that make it possible to describe a
|
6
|
+
# number in words in cardinal form.
|
7
|
+
#
|
8
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
4
9
|
class Linguistics
|
10
|
+
# Require the 'linguistics' gem.
|
5
11
|
silence_warnings { require 'linguistics' }
|
12
|
+
# Return the description of a cardinal number in words.
|
6
13
|
#
|
7
14
|
# Options:
|
8
15
|
#
|
9
|
-
# :group => Controls how many numbers at a time are
|
16
|
+
# - :group => Controls how many numbers at a time are
|
10
17
|
# grouped together. Valid values are 0 (normal grouping),
|
11
18
|
# 1 (single-digit grouping, e.g., “one, two, three, four”),
|
12
19
|
# 2 (double-digit grouping, e.g., “twelve, thirty-four”, or
|
13
20
|
# 3 (triple-digit grouping, e.g., “one twenty-three, four”).
|
14
|
-
# :comma => Set the character/s used to separate word groups.
|
21
|
+
# - :comma => Set the character/s used to separate word groups.
|
15
22
|
# Defaults to ", ".
|
16
|
-
# :and => Set the word and/or characters used where ' and '
|
23
|
+
# - :and => Set the word and/or characters used where ' and '
|
17
24
|
# (the default) is normally used. Setting :and to ' ', for
|
18
25
|
# example, will cause 2556 to be returned as “two-thousand,
|
19
26
|
# five hundred fifty-six” instead of “two-thousand, five
|
20
27
|
# hundred and fifty-six”.
|
21
|
-
# :zero => Set the word used to represent the numeral 0 in
|
28
|
+
# - :zero => Set the word used to represent the numeral 0 in
|
22
29
|
# the result. 'zero' is the default.
|
23
|
-
# :decimal => Set the translation of any decimal points in
|
30
|
+
# - :decimal => Set the translation of any decimal points in
|
24
31
|
# the number; the default is 'point'.
|
25
|
-
# :asArray If set to a true value, the number will be returned
|
32
|
+
# - :asArray => If set to a true value, the number will be returned
|
26
33
|
# as an array of word groups instead of a String.
|
27
34
|
#
|
28
35
|
# More specific options when using :type => :ordinal:
|
29
|
-
#
|
30
|
-
#
|
31
36
|
def self.cardinal_words(entity, options = {})
|
32
|
-
|
33
|
-
l = entity.language.to_s.upcase
|
34
|
-
delegate = nil
|
35
|
-
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
36
|
-
rescue RuntimeError
|
37
|
-
raise "Ruby Linguistics does not have a module " +
|
38
|
-
" installed for the #{entity.language} language."
|
39
|
-
end
|
40
|
-
silence_warnings { delegate.numwords(entity.to_s, options) }
|
37
|
+
silence_warnings { ::Linguistics::EN.numwords(entity.to_s, options) }
|
41
38
|
end
|
42
39
|
end
|
43
40
|
end
|
@@ -1,15 +1,28 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module Conjugations
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that make it possible to conjugate verbs.
|
6
|
+
#
|
7
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
4
8
|
class Linguistics
|
5
9
|
silence_warnings { require 'linguistics' }
|
6
|
-
|
10
|
+
# Conjugate a verb using ruby linguistics with the specified
|
11
|
+
# mode, tense, count and person.
|
12
|
+
#
|
13
|
+
# Options:
|
14
|
+
#
|
15
|
+
# - (Symbol) :mode => :infinitive, :indicative, :subjunctive, :participle
|
16
|
+
# - (Symbol) :tense => :past, :present, :future
|
17
|
+
# - (Symbol) :count => :singular, :plural
|
18
|
+
# - (Symbol) :person => :first, :second, :third
|
19
|
+
def self.conjugations(entity, parameters)
|
7
20
|
begin
|
8
21
|
l = entity.language.to_s.upcase
|
9
22
|
delegate = nil
|
10
23
|
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
24
|
rescue RuntimeError
|
12
|
-
raise "Ruby Linguistics does not have a module " +
|
25
|
+
raise "Ruby Linguistics does not have a module " +
|
13
26
|
" installed for the #{entity.language} language."
|
14
27
|
end
|
15
28
|
if parameters[:mode] == :infinitive
|
@@ -27,4 +40,4 @@ module Treat
|
|
27
40
|
end
|
28
41
|
end
|
29
42
|
end
|
30
|
-
end
|
43
|
+
end
|
@@ -1,24 +1,35 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module Declensions
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that make it possible to obtain the
|
6
|
+
# declensions of a word.
|
7
|
+
#
|
8
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
7
9
|
class Linguistics
|
8
|
-
|
10
|
+
# Require Ruby Linguistics
|
11
|
+
silence_warnings { require 'linguistics' }
|
12
|
+
# Retrieve a declension of a word using the 'linguistics' gem.
|
13
|
+
#
|
14
|
+
# Options:
|
15
|
+
#
|
16
|
+
# - (Identifier) :count => :singular, :plural
|
17
|
+
def self.declensions(entity, options = {})
|
9
18
|
begin
|
10
19
|
l = entity.language.to_s.upcase
|
11
20
|
delegate = nil
|
12
21
|
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
13
22
|
rescue RuntimeError
|
14
|
-
raise "Ruby Linguistics does not have a module " +
|
23
|
+
raise "Ruby Linguistics does not have a module " +
|
15
24
|
" installed for the #{entity.language} language."
|
16
25
|
end
|
17
26
|
string = entity.to_s
|
18
27
|
if options[:count] == :plural
|
19
28
|
if entity.has?(:category) &&
|
20
29
|
[:noun, :adjective, :verb].include?(entity.category)
|
21
|
-
silence_warnings
|
30
|
+
silence_warnings do
|
31
|
+
delegate.send(:"plural_#{entity.category}", string)
|
32
|
+
end
|
22
33
|
else
|
23
34
|
silence_warnings { delegate.plural(string) }
|
24
35
|
end
|
@@ -1,19 +1,18 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
3
|
module OrdinalWords
|
4
|
+
# This class is a wrapper for the functions included
|
5
|
+
# in the 'linguistics' gem that make it possible to describe a
|
6
|
+
# number in words in ordinal form.
|
7
|
+
#
|
8
|
+
# Project website: http://deveiate.org/projects/Linguistics/
|
4
9
|
class Linguistics
|
10
|
+
# Require Ruby Linguistics.
|
5
11
|
silence_warnings { require 'linguistics' }
|
12
|
+
# Describe a number in words in ordinal form, using the
|
13
|
+
# 'linguistics' gem.
|
6
14
|
def self.ordinal_words(number, options = {})
|
7
|
-
|
8
|
-
l = number.language.to_s.upcase
|
9
|
-
delegate = nil
|
10
|
-
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
|
-
rescue RuntimeError
|
12
|
-
lang = Treat::Languages.describe(number.language)
|
13
|
-
raise "Ruby Linguistics does not have a module " +
|
14
|
-
" installed for the #{lang} language."
|
15
|
-
end
|
16
|
-
silence_warnings { delegate.ordinate(number.to_s) }
|
15
|
+
silence_warnings { ::Linguistics::EN.ordinate(number.to_s) }
|
17
16
|
end
|
18
17
|
end
|
19
18
|
end
|
@@ -2,16 +2,20 @@ module Treat
|
|
2
2
|
module Inflectors
|
3
3
|
module Stem
|
4
4
|
# Stem a word using a native Ruby implementation of the
|
5
|
-
# Porter stemming algorithm, ported to Ruby from
|
6
|
-
# version coded up in Perl.
|
5
|
+
# Porter stemming algorithm, ported to Ruby from a
|
6
|
+
# version coded up in Perl. This is a simplified
|
7
|
+
# implementation; for a true and fast Porter stemmer,
|
8
|
+
# see Treat::Inflectors::Stem::PorterC.
|
7
9
|
#
|
8
10
|
# Authored by Ray Pereda (raypereda@hotmail.com).
|
11
|
+
# Unknown license.
|
9
12
|
#
|
10
13
|
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
11
14
|
# Program, Vol. 14, no. 3, pp 130-137,
|
12
15
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
13
16
|
class Porter
|
14
17
|
# Returns the stem of a word using a native Porter stemmer.
|
18
|
+
#
|
15
19
|
# Options: none.
|
16
20
|
def self.stem(word, options = {})
|
17
21
|
# Copy the word and convert it to a string.
|
@@ -9,10 +9,13 @@ module Treat
|
|
9
9
|
# Program, Vol. 14, no. 3, pp 130-137,
|
10
10
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
11
11
|
class PorterC
|
12
|
+
# Require the 'ruby-stemmer' gem.
|
12
13
|
silence_warnings { require 'lingua/stemmer' }
|
14
|
+
# Remove a conflict between this gem and the 'engtagger' gem.
|
13
15
|
::LinguaStemmer = ::Lingua
|
14
16
|
Object.instance_eval { remove_const :Lingua }
|
15
|
-
# Stem the word using
|
17
|
+
# Stem the word using a full-blown Porter stemmer in C.
|
18
|
+
#
|
16
19
|
# Options: none.
|
17
20
|
def self.stem(word, options = {})
|
18
21
|
silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
|
@@ -9,10 +9,10 @@ module Treat
|
|
9
9
|
# groups of rules: the first to clean the tokens, and
|
10
10
|
# the second to alter suffixes."
|
11
11
|
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
12
|
+
# Project website: https://github.com/ealdent/uea-stemmer
|
13
|
+
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
|
14
|
+
# Conservative stemming for search and indexing, 2005.
|
15
|
+
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
16
16
|
class UEA
|
17
17
|
# Require the 'uea-stemmer' gem.
|
18
18
|
silence_warnings { require 'uea-stemmer' }
|
@@ -183,6 +183,22 @@ module Treat
|
|
183
183
|
['PRT', 'Particle'],
|
184
184
|
['S', 'Sentence']
|
185
185
|
]
|
186
|
+
|
187
|
+
# Maps Enju categories to Treat categories.
|
188
|
+
EnjuCatToCategory = {
|
189
|
+
'ADJ' => :adjective,
|
190
|
+
'ADV' => :adverb,
|
191
|
+
'CONJ' => :conjunction,
|
192
|
+
'COOD' => :conjunction,
|
193
|
+
'C' => :complementizer,
|
194
|
+
'D' => :determiner,
|
195
|
+
'N' => :noun,
|
196
|
+
'P' => :preposition,
|
197
|
+
'PN' => :punctuation,
|
198
|
+
'SC' => :conjunction,
|
199
|
+
'V' => :verb,
|
200
|
+
'PRT' => :particle
|
201
|
+
}
|
186
202
|
|
187
203
|
# Description of the xcat in the Enju output specification.
|
188
204
|
EnjuXCatDescription = [
|
@@ -1,8 +1,10 @@
|
|
1
1
|
module Treat
|
2
2
|
module Languages
|
3
3
|
class English
|
4
|
+
|
4
5
|
require 'treat/languages/english/tags'
|
5
6
|
require 'treat/languages/english/categories'
|
7
|
+
|
6
8
|
Extractors = {
|
7
9
|
time: [:chronic],
|
8
10
|
topics: [:reuters],
|
@@ -11,7 +13,7 @@ module Treat
|
|
11
13
|
}
|
12
14
|
Processors = {
|
13
15
|
chunkers: [:txt],
|
14
|
-
parsers: [:
|
16
|
+
parsers: [:stanford, :enju],
|
15
17
|
segmenters: [:tactful, :punkt, :stanford],
|
16
18
|
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
17
19
|
}
|
@@ -28,6 +30,7 @@ module Treat
|
|
28
30
|
ordinal_words: [:linguistics],
|
29
31
|
cardinal_words: [:linguistics]
|
30
32
|
}
|
33
|
+
|
31
34
|
end
|
32
35
|
end
|
33
36
|
end
|
@@ -4,13 +4,12 @@ module Treat
|
|
4
4
|
# A class that detects the category of a word from its tag,
|
5
5
|
# using the default tagger for the language of the entity.
|
6
6
|
class FromTag
|
7
|
-
DefaultOptions = { tagger: nil }
|
8
7
|
# Find the category of the current entity.
|
8
|
+
#
|
9
9
|
# Options:
|
10
|
-
#
|
11
|
-
#
|
10
|
+
#
|
11
|
+
# - (Symbol) :tagger => force the use of a tagger.
|
12
12
|
def self.category(entity, options = {})
|
13
|
-
options = DefaultOptions.merge(options)
|
14
13
|
tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
|
15
14
|
lang = Treat::Languages.get(entity.language)
|
16
15
|
cat = lang::WordTagToCategory[tag]
|
@@ -21,6 +20,7 @@ module Treat
|
|
21
20
|
if cat.size == 1
|
22
21
|
return cat[0]
|
23
22
|
else
|
23
|
+
entity.set :tag_set, :penn
|
24
24
|
if entity.has?(:tag_set)
|
25
25
|
if cat[entity.tag_set]
|
26
26
|
return cat[entity.tag_set]
|
@@ -27,7 +27,7 @@ module Treat
|
|
27
27
|
end
|
28
28
|
# Return the subject of the sentence|verb.
|
29
29
|
def self.subject(entity, options)
|
30
|
-
verb = entity.category == :verb ?
|
30
|
+
verb = (entity.has?(:category) && entity.category == :verb) ?
|
31
31
|
main_verb(entity) : entity.main_verb
|
32
32
|
args = []
|
33
33
|
main_verb.edges.each_pair do |id,edge|
|
@@ -37,7 +37,7 @@ module Treat
|
|
37
37
|
end
|
38
38
|
# Return the object of the sentence|verb.
|
39
39
|
def self.object(entity, options)
|
40
|
-
verb = entity.category == :verb ?
|
40
|
+
verb = (entity.has?(:category) && entity.category == :verb) ?
|
41
41
|
main_verb(entity) : entity.main_verb
|
42
42
|
if verb.voice == 'passive'
|
43
43
|
return
|
@@ -50,7 +50,7 @@ module Treat
|
|
50
50
|
end
|
51
51
|
# Find the main verb (shallowest verb in the tree).
|
52
52
|
def self.main_verb(entity, options)
|
53
|
-
verbs = entity.
|
53
|
+
verbs = entity.verbs
|
54
54
|
if verbs.empty?
|
55
55
|
return
|
56
56
|
end
|
@@ -52,24 +52,16 @@ module Treat
|
|
52
52
|
@@tagger = nil
|
53
53
|
# Hold the user-set options
|
54
54
|
@@options = {}
|
55
|
-
# Hold the default options.
|
56
|
-
DefaultOptions = {
|
57
|
-
lexicon: nil,
|
58
|
-
lexical_rules: nil,
|
59
|
-
contextual_rules: nil
|
60
|
-
}
|
61
55
|
# Tag words using a native Brill tagger.
|
62
56
|
#
|
63
|
-
#
|
57
|
+
# Options:
|
58
|
+
#
|
64
59
|
# :lexicon => String (Lexicon file to use)
|
65
60
|
# :lexical_rules => String (Lexical rule file to use)
|
66
61
|
# :contextual_rules => String (Contextual rules file to use)
|
67
62
|
def self.tag(entity, options = {})
|
68
63
|
# Reinitialize the tagger if the options have changed.
|
69
|
-
if options != @@options
|
70
|
-
@@options = DefaultOptions.merge(options)
|
71
|
-
@@tagger = nil # Reset the tagger
|
72
|
-
end
|
64
|
+
@@tagger = nil if options != @@options
|
73
65
|
# Create the tagger if necessary
|
74
66
|
@@tagger ||= ::Brill::Tagger.new(options[:lexicon],
|
75
67
|
options[:lexical_rules], options[:contextual_rules])
|
@@ -24,9 +24,8 @@ module Treat
|
|
24
24
|
@@options = {}
|
25
25
|
# Hold the default options.
|
26
26
|
DefaultOptions = {
|
27
|
-
unknown_word_tag: '
|
28
|
-
relax: false
|
29
|
-
debug: false
|
27
|
+
unknown_word_tag: 'FW',
|
28
|
+
relax: false
|
30
29
|
}
|
31
30
|
# Tag the word using a probabilistic model taking
|
32
31
|
# into account known words found in a lexicon and
|
@@ -34,11 +33,10 @@ module Treat
|
|
34
33
|
#
|
35
34
|
# Options:
|
36
35
|
#
|
37
|
-
#
|
36
|
+
# - (Boolean) :relax => Relax the Hidden Markov Model:
|
38
37
|
# this may improve accuracy for uncommon words,
|
39
38
|
# particularly words used polysemously.
|
40
|
-
#
|
41
|
-
# :unknown_word_tag => (String) Tag for unknown words.
|
39
|
+
# - (String) :unknown_word_tag => Tag for unknown words.
|
42
40
|
def self.tag(entity, options = {})
|
43
41
|
# Reinitialize the tagger if the options have changed.
|
44
42
|
if options != @@options
|